Back to PyTorch Mastery Series

Scikit-learn ↔ PyTorch Interoperability

May 29, 2026 Wasil Zafar 22 min read

Seamlessly integrate scikit-learn’s rich ecosystem with PyTorch’s deep learning capabilities: wrap PyTorch models as sklearn estimators, use sklearn pipelines, and bridge numpy and tensor data flows.

Table of Contents

  1. Data Type Conversion
  2. PyTorch as sklearn Estimator
  3. Using sklearn Pipelines
  4. Feature Extraction with sklearn
  5. Best Practices
  6. Related Articles

Data Type Conversion

Scikit-learn operates on NumPy arrays; PyTorch operates on tensors. Seamless conversion between them is the foundation of any hybrid workflow. On the CPU, a tensor and a NumPy array can share the same underlying memory buffer, so converting between them is nearly free.

Memory Sharing: torch.from_numpy(arr) and tensor.numpy() share the same underlying memory on CPU. Modifying one modifies the other — use arr.copy() (NumPy) or tensor.clone() (PyTorch) if you need independence.
import numpy as np
import torch

# --- NumPy -> PyTorch ---------------------------------------------------
# from_numpy() wraps the array's existing CPU buffer instead of copying it.
arr = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
tensor = torch.from_numpy(arr)

# Show both views and confirm they point at the same address.
print(
    f"NumPy array: {arr.tolist()}",
    f"Tensor:      {tensor.tolist()}",
    f"Same memory: {tensor.data_ptr() == arr.ctypes.data}",
    sep="\n",
)

# A write through the tensor is visible through the array (shared buffer).
tensor[0][0] = 99.0
print(f"After modifying tensor, arr[0,0]: {arr[0, 0]}")

# Breaking the alias: copy the array first, then wrap the copy.
arr_snapshot = arr.copy()
tensor_independent = torch.from_numpy(arr_snapshot)
tensor_independent[0][0] = -1.0
print(
    f"Independent tensor[0,0]: {tensor_independent[0,0].item()}",
    f"Original arr[0,0]:       {arr[0, 0]}",  # still 99.0: not touched by the copy
    sep="\n",
)
import numpy as np
import torch

# PyTorch -> NumPy
tensor = torch.randn(4, 3)
arr = tensor.numpy()  # CPU only, shares memory with `tensor`
print(f"Tensor shape: {tensor.shape}")
print(f"NumPy shape:  {arr.shape}")
print(f"NumPy dtype:  {arr.dtype}")

# For GPU tensors: must move to CPU first.
# detach() drops any autograd history, cpu() is a no-op for CPU tensors.
tensor_gpu = tensor  # pretend this is on GPU
arr_from_gpu = tensor_gpu.detach().cpu().numpy()

# Type conversion during transfer.
# Tensor.float() casts to torch.float32, so the resulting NumPy array is
# float32 as well (use .double() instead if float64 is required).
tensor_int = torch.randint(0, 10, (5,))
arr_float = tensor_int.float().numpy()
print(f"Int tensor:    {tensor_int.tolist()}")
print(f"Float32 array: {arr_float.tolist()}")

PyTorch Model as sklearn Estimator

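Wrapping a PyTorch model as a scikit-learn estimator unlocks the entire sklearn ecosystem: GridSearchCV, Pipeline, cross_val_score, and all evaluation utilities. To be cloned by those tools, the wrapper must expose its hyperparameters through get_params/set_params (or inherit from sklearn.base.BaseEstimator) in addition to implementing fit and predict.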

import numpy as np
import torch
import torch.nn as nn


class PyTorchClassifier:
    """
    sklearn-compatible wrapper for a one-hidden-layer PyTorch classifier.

    Implements the duck-typed estimator interface: fit / predict /
    predict_proba / score, plus get_params / set_params. The last two are
    what sklearn.base.clone() relies on to copy an estimator, which is how
    GridSearchCV and cross_val_score create a fresh model per fold.

    Parameters
    ----------
    hidden_size : int
        Width of the single hidden layer.
    lr : float
        Learning rate passed to Adam.
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Number of samples per mini-batch.

    Attributes
    ----------
    model_ : nn.Sequential or None
        The trained network; None until fit() has been called.
    classes_ : np.ndarray or None
        Sorted unique labels seen by fit(); None until fit() has been called.
    """

    # Names of the constructor hyperparameters exposed via get_params().
    _param_names = ('hidden_size', 'lr', 'epochs', 'batch_size')

    def __init__(self, hidden_size=32, lr=1e-3, epochs=100, batch_size=32):
        self.hidden_size = hidden_size
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.model_ = None
        self.classes_ = None

    def get_params(self, deep=True):
        """Return the constructor hyperparameters as a dict.

        ``deep`` is accepted for sklearn API compatibility; this estimator
        has no nested estimators, so it has no effect.
        """
        return {name: getattr(self, name) for name in self._param_names}

    def set_params(self, **params):
        """Set hyperparameters by name and return self.

        Raises
        ------
        ValueError
            If a name is not a constructor hyperparameter. Rejecting it
            avoids a typo silently creating an unused attribute.
        """
        for name, value in params.items():
            if name not in self._param_names:
                raise ValueError(
                    f"Invalid parameter {name!r} for {type(self).__name__}. "
                    f"Valid parameters are: {list(self._param_names)}."
                )
            setattr(self, name, value)
        return self

    def _build_model(self, n_features, n_classes):
        """Build the network: Linear -> ReLU -> Linear (one logit per class)."""
        return nn.Sequential(
            nn.Linear(n_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, n_classes)
        )

    def fit(self, X, y):
        """Train on NumPy arrays — sklearn interface.

        X is converted to a float32 tensor and must be 2-D
        (n_samples, n_features). y may hold arbitrary sortable labels.
        A new network is built on every call, so refitting starts from
        scratch. Returns self.
        """
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        # Map labels to 0..n_classes-1: classes_ is sorted, so the insertion
        # index of each label is its class index.
        y_mapped = np.searchsorted(self.classes_, y)
        y_t = torch.from_numpy(y_mapped.astype(np.int64))

        self.model_ = self._build_model(X_t.shape[1], n_classes)
        optimizer = torch.optim.Adam(self.model_.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()

        n = len(X_t)
        for epoch in range(self.epochs):
            # Fresh shuffle each epoch; the last batch may be smaller.
            perm = torch.randperm(n)
            for start in range(0, n, self.batch_size):
                idx = perm[start:start + self.batch_size]
                loss = criterion(self.model_(X_t[idx]), y_t[idx])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        return self  # sklearn convention: return self from fit

    def predict(self, X):
        """Return predicted class labels as a NumPy array of shape (n_samples,)."""
        self.model_.eval()
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        with torch.no_grad():
            logits = self.model_(X_t)
            pred_idx = logits.argmax(dim=1).numpy()
        # Translate class indices back to the original labels.
        return self.classes_[pred_idx]

    def predict_proba(self, X):
        """Return probability matrix (n_samples, n_classes).

        Columns follow the order of ``classes_``.
        """
        self.model_.eval()
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        with torch.no_grad():
            proba = torch.softmax(self.model_(X_t), dim=1).numpy()
        return proba

    def score(self, X, y):
        """Return the mean accuracy of predict(X) against y."""
        return np.mean(self.predict(X) == np.asarray(y))


# Demo with synthetic data
import numpy as np

# Seed both RNGs: NumPy generates the data, while torch initialises the
# network weights and shuffles the mini-batches (torch.randperm) in fit().
# Seeding only NumPy leaves the printed accuracy different on every run.
np.random.seed(42)
torch.manual_seed(42)
X_demo = np.random.randn(300, 5).astype(np.float32)
# Label is 1 when the first two features sum to a positive value.
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(int)

# First 240 rows train the model; the remaining 60 are held out for testing.
n_train = 240
clf = PyTorchClassifier(hidden_size=16, lr=1e-3, epochs=200)
clf.fit(X_demo[:n_train], y_demo[:n_train])

test_acc = clf.score(X_demo[n_train:], y_demo[n_train:])
print(f"Test accuracy: {test_acc:.4f}")

Using sklearn Pipelines

sklearn Pipelines chain preprocessing and modeling into a single object that respects train/test boundaries automatically.

import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline


class PyTorchClassifier:
    """Minimal sklearn-compatible wrapper (re-defined for standalone use).

    Trains a Linear -> ReLU -> Linear network with full-batch Adam updates.
    get_params / set_params are provided so that sklearn.base.clone() can
    copy the estimator, e.g. when a Pipeline containing it is
    cross-validated.

    Parameters
    ----------
    hidden_size : int
        Width of the hidden layer.
    lr : float
        Learning rate passed to Adam.
    epochs : int
        Number of full-batch gradient steps.
    """

    def __init__(self, hidden_size=32, lr=1e-3, epochs=150):
        self.hidden_size = hidden_size
        self.lr = lr
        self.epochs = epochs
        self.model_ = None    # set by fit()
        self.classes_ = None  # sorted unique labels, set by fit()

    def get_params(self, deep=True):
        """Return the constructor hyperparameters (``deep`` has no effect)."""
        return {'hidden_size': self.hidden_size, 'lr': self.lr, 'epochs': self.epochs}

    def set_params(self, **params):
        """Set hyperparameters by name; raise ValueError on unknown names."""
        valid = self.get_params()
        for name, value in params.items():
            if name not in valid:
                raise ValueError(
                    f"Invalid parameter {name!r} for {type(self).__name__}. "
                    f"Valid parameters are: {sorted(valid)}."
                )
            setattr(self, name, value)
        return self

    def fit(self, X, y):
        """Train on 2-D array-like X and label vector y; return self."""
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        self.classes_ = np.unique(y)
        # classes_ is sorted, so searchsorted maps each label to 0..n_classes-1.
        y_t = torch.from_numpy(np.searchsorted(self.classes_, y).astype(np.int64))
        self.model_ = nn.Sequential(
            nn.Linear(X_t.shape[1], self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, len(self.classes_)),
        )
        opt = torch.optim.Adam(self.model_.parameters(), lr=self.lr)
        # Build the loss module once instead of once per epoch.
        criterion = nn.CrossEntropyLoss()
        for _ in range(self.epochs):
            loss = criterion(self.model_(X_t), y_t)
            opt.zero_grad()
            loss.backward()
            opt.step()
        return self

    def predict(self, X):
        """Return predicted labels as a NumPy array of shape (n_samples,)."""
        # Inference mode; a no-op for this architecture but keeps the wrapper
        # correct if dropout or batch-norm layers are ever added.
        self.model_.eval()
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        with torch.no_grad():
            return self.classes_[self.model_(X_t).argmax(1).numpy()]

    def score(self, X, y):
        """Return the mean accuracy of predict(X) against y."""
        return (self.predict(X) == y).mean()


# Build pipeline: Standardize -> Polynomial -> PyTorch model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', PyTorchClassifier(hidden_size=32, epochs=200)),
])

# NumPy's RNG generates the data; torch's RNG initialises the network
# weights. Both must be seeded for the printed accuracy to be repeatable.
np.random.seed(42)
torch.manual_seed(42)
X = np.random.randn(400, 3).astype(np.float32)
# Label is 1 inside a circle of radius sqrt(1.5) in the first two features.
# The boundary is quadratic, which is why polynomial features help.
y = (X[:, 0]**2 + X[:, 1]**2 < 1.5).astype(int)

# First 320 rows train the pipeline; the remaining 80 are held out.
n_train = 320
pipeline.fit(X[:n_train], y[:n_train])
test_acc = pipeline.score(X[n_train:], y[n_train:])
print(f"Pipeline test accuracy: {test_acc:.4f}")
print(f"Features after PolynomialFeatures: {pipeline.named_steps['poly'].n_output_features_}")

GridSearchCV with PyTorch

import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


class PyTorchClassifier:
    """sklearn-compatible wrapper with get_params/set_params for GridSearchCV.

    Trains a Linear -> ReLU -> Linear network with full-batch Adam updates.

    Parameters
    ----------
    hidden_size : int
        Width of the hidden layer.
    lr : float
        Learning rate passed to Adam.
    epochs : int
        Number of full-batch gradient steps.
    """

    def __init__(self, hidden_size=32, lr=1e-3, epochs=100):
        self.hidden_size = hidden_size
        self.lr = lr
        self.epochs = epochs
        self.model_ = None    # set by fit()
        self.classes_ = None  # sorted unique labels, set by fit()

    def get_params(self, deep=True):
        """Required for GridSearchCV parameter exploration.

        Returns the constructor hyperparameters as a dict. ``deep`` is
        accepted for API compatibility and has no effect here.
        """
        return {'hidden_size': self.hidden_size, 'lr': self.lr, 'epochs': self.epochs}

    def set_params(self, **params):
        """Required for GridSearchCV — sets hyperparameters.

        Raises
        ------
        ValueError
            If a name is not one of the hyperparameters reported by
            get_params(). Without this check a misspelled grid key would
            be stored as a new attribute and silently ignored by fit().
        """
        valid = self.get_params()
        for k, v in params.items():
            if k not in valid:
                raise ValueError(
                    f"Invalid parameter {k!r} for {type(self).__name__}. "
                    f"Valid parameters are: {sorted(valid)}."
                )
            setattr(self, k, v)
        return self

    def fit(self, X, y):
        """Train on 2-D array-like X and label vector y; return self."""
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        self.classes_ = np.unique(y)
        # classes_ is sorted, so searchsorted maps each label to 0..n_classes-1.
        y_t = torch.from_numpy(np.searchsorted(self.classes_, y).astype(np.int64))
        self.model_ = nn.Sequential(
            nn.Linear(X_t.shape[1], self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, len(self.classes_)),
        )
        opt = torch.optim.Adam(self.model_.parameters(), lr=self.lr)
        # Build the loss module once instead of once per epoch.
        criterion = nn.CrossEntropyLoss()
        for _ in range(self.epochs):
            loss = criterion(self.model_(X_t), y_t)
            opt.zero_grad()
            loss.backward()
            opt.step()
        return self

    def predict(self, X):
        """Return predicted labels as a NumPy array of shape (n_samples,)."""
        # Inference mode; a no-op for this architecture but keeps the wrapper
        # correct if dropout or batch-norm layers are ever added.
        self.model_.eval()
        X_t = torch.from_numpy(np.asarray(X, dtype=np.float32))
        with torch.no_grad():
            return self.classes_[self.model_(X_t).argmax(1).numpy()]

    def score(self, X, y):
        """Return the mean accuracy of predict(X) against y."""
        return (self.predict(X) == y).mean()


# NumPy's RNG generates the data; torch's RNG initialises the weights of
# every candidate model. Both must be seeded for repeatable CV scores.
np.random.seed(42)
torch.manual_seed(42)
X = np.random.randn(300, 4).astype(np.float32)
# Label is 1 when the first two features sum to a positive value.
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# GridSearchCV over PyTorch hyperparameters:
# 2 hidden sizes x 2 learning rates, each evaluated with 3-fold CV.
param_grid = {'hidden_size': [16, 32], 'lr': [1e-3, 1e-2]}
clf = GridSearchCV(PyTorchClassifier(epochs=100), param_grid, cv=3, scoring='accuracy', n_jobs=1)
clf.fit(X, y)

print(f"Best params:   {clf.best_params_}")
print(f"Best CV score: {clf.best_score_:.4f}")

Feature Extraction with sklearn

Use sklearn to extract features (TF-IDF, PCA, encodings) then feed into a PyTorch model:

import numpy as np
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Combine: PCA for dimensionality reduction -> PyTorch for classification
# Seed both RNGs: NumPy generates the data, torch initialises the weights.
np.random.seed(42)
torch.manual_seed(42)
X_high_dim = np.random.randn(500, 50).astype(np.float32)  # 50-dimensional raw features
y = (X_high_dim[:, 0] + X_high_dim[:, 1] > 0).astype(int)

# Step 1: Reduce to 10 dimensions with PCA (fit only on training data)
train_n = 400
X_train_raw = X_high_dim[:train_n]
X_test_raw = X_high_dim[train_n:]

n_components = 10
pca = PCA(n_components=n_components)
scaler = StandardScaler()

# fit_transform on the training split only; the test split is transformed
# with the statistics learned from training data to avoid leakage.
X_train_pca = pca.fit_transform(scaler.fit_transform(X_train_raw))
X_test_pca = pca.transform(scaler.transform(X_test_raw))

print(f"Original dims: {X_train_raw.shape[1]} -> PCA dims: {X_train_pca.shape[1]}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_[:5].round(3)}")

# Step 2: PyTorch model on reduced features
X_tr = torch.from_numpy(X_train_pca.astype(np.float32))
y_tr = torch.from_numpy(y[:train_n].astype(np.int64))
X_te = torch.from_numpy(X_test_pca.astype(np.float32))
y_te = torch.from_numpy(y[train_n:].astype(np.int64))

# Input width comes from the data, so changing n_components above cannot
# leave the first layer with a mismatched size.
model = nn.Sequential(nn.Linear(X_tr.shape[1], 16), nn.ReLU(), nn.Linear(16, 2))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()  # built once, reused every step
for _ in range(300):
    loss = criterion(model(X_tr), y_tr)
    opt.zero_grad()
    loss.backward()
    opt.step()

model.eval()
with torch.no_grad():
    acc = (model(X_te).argmax(1) == y_te).float().mean().item()
print(f"Test accuracy (PCA -> PyTorch): {acc:.4f}")

Best Practices

Guidelines Integration Best Practices

When to Use Each Ecosystem

  • sklearn preprocessing: StandardScaler, PCA, TF-IDF, OrdinalEncoder — battle-tested, Pipeline-compatible
  • sklearn for small/tabular ML: Random Forests, SVM, Gradient Boosting often outperform neural nets on structured data
  • PyTorch for sequences/images/text: Convolutions, attention, custom architectures
  • Hybrid pipelines: sklearn preprocessing → PyTorch model for the best of both worlds
  • Memory safety: Always use .detach().cpu().numpy() when converting PyTorch outputs; never call .numpy() on a tensor that requires grad
Architecture Best Practices Production
import numpy as np
import torch
import torch.nn as nn

# Rule: drop the autograd graph (detach) before converting to NumPy.
model = nn.Linear(5, 3)
x_input = torch.randn(4, 5)

# The forward pass yields a tensor that carries a grad_fn. Calling
# output.numpy() on it directly would raise RuntimeError.
output = model(x_input)

# Safe route: detach from the graph, make sure it is on CPU, then convert.
detached_output = output.detach()
output_safe = detached_output.cpu().numpy()
print(f"Safe conversion shape: {output_safe.shape}")

# Probabilities: apply softmax on the tensor first, detach afterwards.
proba = torch.softmax(output, dim=1)
detached_proba = proba.detach()
proba_np = detached_proba.cpu().numpy()
print(
    f"Probabilities shape: {proba_np.shape}",
    f"Row sums (should be 1): {proba_np.sum(axis=1).round(4)}",
    sep="\n",
)