Data Type Conversion
Scikit-learn operates on NumPy arrays; PyTorch operates on tensors. Seamless conversion between them is the foundation of any hybrid workflow. On CPU, a PyTorch tensor and a NumPy array can share the same memory, enabling near-zero-cost conversion.
torch.from_numpy(arr) and tensor.numpy() share the same underlying memory on CPU. Modifying one modifies the other — use arr.copy() (NumPy) or tensor.clone() (PyTorch) if you need independence.
import numpy as np
import torch
# NumPy -> PyTorch: from_numpy() wraps the array's existing CPU buffer (no copy).
arr = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
tensor = torch.from_numpy(arr)
print(f"NumPy array: {arr.tolist()}")
print(f"Tensor: {tensor.tolist()}")
# Both objects report the same base address when they alias one buffer.
buffers_match = tensor.data_ptr() == arr.ctypes.data
print(f"Same memory: {buffers_match}")

# A write made through the tensor is visible through the array as well.
tensor[0, 0] = 99.0
aliased_value = arr[0, 0]
print(f"After modifying tensor, arr[0,0]: {aliased_value}")

# Break the aliasing by copying the array before wrapping it
# (tensor.clone() is the equivalent on the PyTorch side).
arr_snapshot = arr.copy()
tensor_independent = torch.from_numpy(arr_snapshot)
tensor_independent[0, 0] = -1.0
print(f"Independent tensor[0,0]: {tensor_independent[0,0].item()}")
print(f"Original arr[0,0]: {arr[0, 0]}")  # Unchanged by the write above
import numpy as np
import torch
# PyTorch -> NumPy
tensor = torch.randn(4, 3)
arr = tensor.numpy()  # CPU only; the array shares the tensor's memory
print(f"Tensor shape: {tensor.shape}")
print(f"NumPy shape: {arr.shape}")
print(f"NumPy dtype: {arr.dtype}")

# For GPU tensors: must move to CPU first
tensor_gpu = tensor  # pretend this is on GPU
arr_from_gpu = tensor_gpu.detach().cpu().numpy()

# Type conversion during transfer
tensor_int = torch.randint(0, 10, (5,))
# .float() casts to torch.float32, so the resulting array is float32
# (use .double() instead if a float64 array is required).
arr_float = tensor_int.float().numpy()
print(f"Int tensor: {tensor_int.tolist()}")
print(f"Float32 array: {arr_float.tolist()}")
PyTorch Model as sklearn Estimator
Wrapping a PyTorch model as a scikit-learn estimator unlocks the entire sklearn ecosystem: GridSearchCV, Pipeline, cross_val_score, and all evaluation utilities.
import numpy as np
import torch
import torch.nn as nn
class PyTorchClassifier:
    """
    sklearn-compatible wrapper for PyTorch classifiers.

    Implements the fit/predict/predict_proba/score interface and exposes
    get_params/set_params so hyperparameters can be read and updated
    through the estimator protocol (e.g. by GridSearchCV).

    Parameters:
        hidden_size: width of the single hidden layer.
        lr: learning rate passed to the Adam optimizer.
        epochs: number of passes over the training data.
        batch_size: number of samples per mini-batch.

    Attributes set by fit():
        model_: the trained nn.Sequential network (None before fit).
        classes_: sorted unique labels seen during fit (None before fit).
    """

    def __init__(self, hidden_size=32, lr=1e-3, epochs=100, batch_size=32):
        self.hidden_size = hidden_size
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.model_ = None
        self.classes_ = None

    def get_params(self, deep=True):
        """Return the constructor hyperparameters as a dict.

        `deep` is accepted for signature compatibility and is unused,
        because none of the parameters is itself an estimator.
        """
        return {
            'hidden_size': self.hidden_size,
            'lr': self.lr,
            'epochs': self.epochs,
            'batch_size': self.batch_size,
        }

    def set_params(self, **params):
        """Set hyperparameters by name and return self.

        Raises:
            ValueError: if a name is not one of the constructor parameters,
                so a misspelled key is reported instead of silently ignored.
        """
        valid = self.get_params()
        for name, value in params.items():
            if name not in valid:
                raise ValueError(
                    f"Invalid parameter {name!r} for PyTorchClassifier; "
                    f"valid parameters are {sorted(valid)}"
                )
            setattr(self, name, value)
        return self

    @staticmethod
    def _to_tensor(X):
        """Convert array-like X to a float32 tensor.

        np.ascontiguousarray is used because torch.from_numpy rejects
        arrays with negative strides (for example the view X[::-1]).
        """
        return torch.from_numpy(np.ascontiguousarray(X, dtype=np.float32))

    def _build_model(self, n_features, n_classes):
        """Build a one-hidden-layer MLP that outputs one logit per class."""
        return nn.Sequential(
            nn.Linear(n_features, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, n_classes),
        )

    def fit(self, X, y):
        """Train on NumPy arrays — sklearn interface.

        Parameters:
            X: 2-D array-like of features, converted to float32.
            y: 1-D array-like of class labels.

        Returns:
            self, following the sklearn convention.
        """
        X_t = self._to_tensor(X)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        # Map labels to 0..n_classes-1 (classes_ is sorted by np.unique)
        y_mapped = np.searchsorted(self.classes_, y)
        y_t = torch.from_numpy(y_mapped.astype(np.int64))

        self.model_ = self._build_model(X_t.shape[1], n_classes)
        optimizer = torch.optim.Adam(self.model_.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()

        n = len(X_t)
        for epoch in range(self.epochs):
            # Fresh shuffle each epoch so mini-batches differ between passes
            perm = torch.randperm(n)
            for start in range(0, n, self.batch_size):
                idx = perm[start:start + self.batch_size]
                loss = criterion(self.model_(X_t[idx]), y_t[idx])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        return self  # sklearn convention: return self from fit

    def predict(self, X):
        """Return class labels as NumPy array."""
        self.model_.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            logits = self.model_(X_t)
        pred_idx = logits.argmax(dim=1).numpy()
        # Translate the 0..n_classes-1 indices back to the original labels
        return self.classes_[pred_idx]

    def predict_proba(self, X):
        """Return probability matrix (n_samples, n_classes)."""
        self.model_.eval()
        X_t = self._to_tensor(X)
        with torch.no_grad():
            proba = torch.softmax(self.model_(X_t), dim=1).numpy()
        return proba

    def score(self, X, y):
        """Return the mean accuracy of predict(X) against y."""
        return (self.predict(X) == np.asarray(y)).mean()
# Demo with synthetic data: 300 samples, 5 features, 240/60 train/test split
import numpy as np

np.random.seed(42)
X_demo = np.random.randn(300, 5).astype(np.float32)
# The label is 1 when the first two features sum to a positive number.
feature_sum = X_demo[:, 0] + X_demo[:, 1]
y_demo = (feature_sum > 0).astype(int)

n_train = 240
X_fit, X_eval = X_demo[:n_train], X_demo[n_train:]
y_fit, y_eval = y_demo[:n_train], y_demo[n_train:]

clf = PyTorchClassifier(hidden_size=16, lr=1e-3, epochs=200)
clf.fit(X_fit, y_fit)
test_acc = clf.score(X_eval, y_eval)
print(f"Test accuracy: {test_acc:.4f}")
Using sklearn Pipelines
sklearn Pipelines chain preprocessing and modeling into a single object that respects train/test boundaries automatically.
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
class PyTorchClassifier:
    """Minimal sklearn-compatible wrapper (re-defined for standalone use).

    Trains a one-hidden-layer MLP with full-batch Adam updates and exposes
    fit/predict/score.

    Parameters:
        hidden_size: width of the hidden layer.
        lr: learning rate passed to the Adam optimizer.
        epochs: number of full-batch gradient steps.
    """

    def __init__(self, hidden_size=32, lr=1e-3, epochs=150):
        self.hidden_size = hidden_size
        self.lr = lr
        self.epochs = epochs
        self.model_ = None
        self.classes_ = None

    def fit(self, X, y):
        """Fit the network on features X and labels y; return self."""
        # ascontiguousarray: torch.from_numpy rejects negative-stride views
        X_t = torch.from_numpy(np.ascontiguousarray(X, dtype=np.float32))
        self.classes_ = np.unique(y)
        # Encode labels as indices into the sorted classes_ array
        y_t = torch.from_numpy(np.searchsorted(self.classes_, y).astype(np.int64))
        self.model_ = nn.Sequential(
            nn.Linear(X_t.shape[1], self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, len(self.classes_)),
        )
        opt = torch.optim.Adam(self.model_.parameters(), lr=self.lr)
        # Build the loss module once instead of once per epoch
        criterion = nn.CrossEntropyLoss()
        for _ in range(self.epochs):
            loss = criterion(self.model_(X_t), y_t)
            opt.zero_grad()
            loss.backward()
            opt.step()
        return self

    def predict(self, X):
        """Return the predicted label for each row of X as a NumPy array."""
        X_t = torch.from_numpy(np.ascontiguousarray(X, dtype=np.float32))
        with torch.no_grad():
            return self.classes_[self.model_(X_t).argmax(1).numpy()]

    def score(self, X, y):
        """Return the mean accuracy of predict(X) against y."""
        return (self.predict(X) == y).mean()
# Build pipeline: Standardize -> Polynomial -> PyTorch model
steps = [
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', PyTorchClassifier(hidden_size=32, epochs=200)),
]
pipeline = Pipeline(steps)

np.random.seed(42)
X = np.random.randn(400, 3).astype(np.float32)
# Positive class: x0^2 + x1^2 below 1.5, which is not linearly separable
# in the raw features.
radius_sq = X[:, 0]**2 + X[:, 1]**2
y = (radius_sq < 1.5).astype(int)

# First 320 rows train the pipeline; the remaining 80 are held out.
n_train = 320
pipeline.fit(X[:n_train], y[:n_train])
test_acc = pipeline.score(X[n_train:], y[n_train:])
print(f"Pipeline test accuracy: {test_acc:.4f}")
poly_step = pipeline.named_steps['poly']
print(f"Features after PolynomialFeatures: {poly_step.n_output_features_}")
GridSearchCV with PyTorch
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
class PyTorchClassifier:
    """sklearn-compatible wrapper with get_params/set_params for GridSearchCV.

    Trains a one-hidden-layer MLP with full-batch Adam updates.

    Parameters:
        hidden_size: width of the hidden layer.
        lr: learning rate passed to the Adam optimizer.
        epochs: number of full-batch gradient steps.
    """

    def __init__(self, hidden_size=32, lr=1e-3, epochs=100):
        self.hidden_size = hidden_size
        self.lr = lr
        self.epochs = epochs
        self.model_ = None
        self.classes_ = None

    def get_params(self, deep=True):
        """Required for GridSearchCV parameter exploration.

        `deep` is accepted for signature compatibility and is unused,
        because none of the parameters is itself an estimator.
        """
        return {'hidden_size': self.hidden_size, 'lr': self.lr, 'epochs': self.epochs}

    def set_params(self, **params):
        """Required for GridSearchCV — sets hyperparameters and returns self.

        Raises:
            ValueError: if a name is not one of the constructor parameters,
                so a misspelled grid key is reported instead of silently
                creating an unused attribute.
        """
        valid = self.get_params()
        for k, v in params.items():
            if k not in valid:
                raise ValueError(
                    f"Invalid parameter {k!r} for PyTorchClassifier; "
                    f"valid parameters are {sorted(valid)}"
                )
            setattr(self, k, v)
        return self

    def fit(self, X, y):
        """Fit the network on features X and labels y; return self."""
        # ascontiguousarray: torch.from_numpy rejects negative-stride views
        X_t = torch.from_numpy(np.ascontiguousarray(X, dtype=np.float32))
        self.classes_ = np.unique(y)
        # Encode labels as indices into the sorted classes_ array
        y_t = torch.from_numpy(np.searchsorted(self.classes_, y).astype(np.int64))
        self.model_ = nn.Sequential(
            nn.Linear(X_t.shape[1], self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, len(self.classes_)),
        )
        opt = torch.optim.Adam(self.model_.parameters(), lr=self.lr)
        # Build the loss module once instead of once per epoch
        criterion = nn.CrossEntropyLoss()
        for _ in range(self.epochs):
            loss = criterion(self.model_(X_t), y_t)
            opt.zero_grad()
            loss.backward()
            opt.step()
        return self

    def predict(self, X):
        """Return the predicted label for each row of X as a NumPy array."""
        X_t = torch.from_numpy(np.ascontiguousarray(X, dtype=np.float32))
        with torch.no_grad():
            return self.classes_[self.model_(X_t).argmax(1).numpy()]

    def score(self, X, y):
        """Return the mean accuracy of predict(X) against y."""
        return (self.predict(X) == y).mean()
# Synthetic binary task: label is 1 when the first two features sum above zero
np.random.seed(42)
X = np.random.randn(300, 4).astype(np.float32)
feature_sum = X[:, 0] + X[:, 1]
y = (feature_sum > 0).astype(int)

# GridSearchCV over PyTorch hyperparameters (2 x 2 grid, 3-fold CV)
param_grid = {'hidden_size': [16, 32], 'lr': [1e-3, 1e-2]}
base_estimator = PyTorchClassifier(epochs=100)
clf = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=1,
)
clf.fit(X, y)
print(f"Best params: {clf.best_params_}")
print(f"Best CV score: {clf.best_score_:.4f}")
Feature Extraction with sklearn
Use sklearn to extract features (TF-IDF, PCA, encodings) then feed into a PyTorch model:
import numpy as np
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Combine: PCA for dimensionality reduction -> PyTorch for classification
np.random.seed(42)
X_high_dim = np.random.randn(500, 50).astype(np.float32)  # 50-dimensional raw features
y = (X_high_dim[:, 0] + X_high_dim[:, 1] > 0).astype(int)

# Step 1: Reduce to 10 dimensions with PCA (fit only on training data)
train_n = 400
X_train_raw, X_test_raw = X_high_dim[:train_n], X_high_dim[train_n:]
pca = PCA(n_components=10)
scaler = StandardScaler()
# Scaler and PCA statistics come from the training rows only; the test rows
# are transformed with those fitted statistics.
X_train_scaled = scaler.fit_transform(X_train_raw)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_scaled = scaler.transform(X_test_raw)
X_test_pca = pca.transform(X_test_scaled)
print(f"Original dims: {X_train_raw.shape[1]} -> PCA dims: {X_train_pca.shape[1]}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_[:5].round(3)}")

# Step 2: PyTorch model on reduced features
y_train, y_test = y[:train_n], y[train_n:]
X_tr = torch.from_numpy(X_train_pca.astype(np.float32))
y_tr = torch.from_numpy(y_train.astype(np.int64))
X_te = torch.from_numpy(X_test_pca.astype(np.float32))
y_te = torch.from_numpy(y_test.astype(np.int64))

model = nn.Sequential(
    nn.Linear(10, 16),
    nn.ReLU(),
    nn.Linear(16, 2),
)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# 300 full-batch gradient steps
for _ in range(300):
    loss = criterion(model(X_tr), y_tr)
    opt.zero_grad()
    loss.backward()
    opt.step()

with torch.no_grad():
    predicted = model(X_te).argmax(1)
    acc = (predicted == y_te).float().mean().item()
print(f"Test accuracy (PCA -> PyTorch): {acc:.4f}")
Best Practices
When to Use Each Ecosystem
- sklearn preprocessing: StandardScaler, PCA, TF-IDF, OrdinalEncoder — battle-tested, Pipeline-compatible
- sklearn for small/tabular ML: Random Forests, SVM, Gradient Boosting often outperform neural nets on structured data
- PyTorch for sequences/images/text: Convolutions, attention, custom architectures
- Hybrid pipelines: sklearn preprocessing → PyTorch model for the best of both worlds
- Memory safety: Always use .detach().cpu().numpy() when converting PyTorch outputs; never call .numpy() on a tensor that requires grad
import numpy as np
import torch
import torch.nn as nn
# CRITICAL: Always detach before calling .numpy()
model = nn.Linear(5, 3)
x_input = torch.randn(4, 5)
output = model(x_input)  # result of a layer with trainable weights, so it has a grad_fn

# BAD: output.numpy() would raise RuntimeError here (tensor has grad_fn).
# GOOD: detach first
output_safe = output.detach().cpu().numpy()
print(f"Safe conversion shape: {output_safe.shape}")

# For probabilities: detach after softmax
proba = output.softmax(dim=1)
proba_np = proba.detach().cpu().numpy()
print(f"Probabilities shape: {proba_np.shape}")
row_totals = np.round(proba_np.sum(axis=1), 4)
print(f"Row sums (should be 1): {row_totals}")