Data Splits
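The most fundamental principle in ML evaluation: never evaluate on data used for training. Use a three-way split: fit the model on the training set, tune hyperparameters and select models on the validation set, and touch the test set only once at the very end, so that your final test accuracy is an unbiased estimate of generalization performance.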
import torch
def train_val_test_split(X, y, train_frac=0.7, val_frac=0.15, seed=42):
    """Shuffle a dataset reproducibly and cut it into train/val/test parts.

    The rows of ``X`` and ``y`` are permuted with the same random permutation
    and then sliced: the first ``train_frac`` of the rows become the training
    set, the next ``val_frac`` the validation set, and whatever remains the
    test set. Split boundaries are truncated to whole rows with ``int()``.

    The permutation is drawn from a private ``torch.Generator`` seeded with
    ``seed``, so calling this function does not reseed or advance the global
    torch RNG.

    Args:
        X: Indexable feature container (e.g. a tensor) with ``len(X)`` rows.
        y: Labels with the same number of rows as ``X``.
        train_frac: Fraction of rows assigned to the training split.
        val_frac: Fraction of rows assigned to the validation split.
        seed: Seed for the shuffle.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, a fraction is
            negative, or ``train_frac + val_frac`` exceeds 1.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length, got {n} and {len(y)}"
        )
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    # A dedicated generator keeps the shuffle reproducible without clobbering
    # the caller's global RNG state.
    rng = torch.Generator().manual_seed(seed)
    perm = torch.randperm(n, generator=rng)
    X, y = X[perm], y[perm]

    train_end = int(n * train_frac)
    val_end = int(n * (train_frac + val_frac))
    return (X[:train_end], y[:train_end],
            X[train_end:val_end], y[train_end:val_end],
            X[val_end:], y[val_end:])
# Demo: split a synthetic binary-classification dataset and inspect the parts.
torch.manual_seed(42)
X = torch.randn(1000, 10)
y = X[:, 0].gt(0).long()  # label is the sign of the first feature
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X, y)

# Report each split's size as a row count and as a share of the full dataset.
for split_name, split_X in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    n_rows = split_X.shape[0]
    print(f"{split_name}: {n_rows} samples ({n_rows/len(X)*100:.0f}%)")

# Check class balance in each split: the fraction of positive labels.
for split_name, split_y in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"{split_name} positive rate: {split_y.float().mean().item():.3f}")
Data Leakage
Data leakage occurs when information from the validation or test set influences training. The most common mistake: computing normalization statistics on the full dataset before splitting.
import torch
torch.manual_seed(42)
X = torch.randn(1000, 10)
y = X[:, 0].gt(0).long()

# WRONG: statistics taken over ALL rows, so validation/test rows influence
# the transformation that the training data goes through.
X_mean_wrong, X_std_wrong = X.mean(dim=0), X.std(dim=0)
X_normalized_wrong = (X - X_mean_wrong) / (X_std_wrong + 1e-8)

# CORRECT: split first, then estimate statistics from the training rows only.
n_train = 700
X_train, X_val, X_test = X[:n_train], X[n_train:850], X[850:]
X_mean, X_std = X_train.mean(dim=0), X_train.std(dim=0)

# Every split goes through the SAME transformation, built from TRAIN stats.
X_train_norm, X_val_norm, X_test_norm = (
    (part - X_mean) / (X_std + 1e-8) for part in (X_train, X_val, X_test)
)

print(f"Train mean (should be ~0): {X_train_norm.mean().item():.4f}")
print(f"Val mean (slightly off): {X_val_norm.mean().item():.4f}")
K-Fold Cross-Validation
K-fold CV provides a more reliable estimate of generalization performance by training K models, each validated on a different held-out fold and trained on the remaining K−1 folds. The final metric is reported as the mean ± standard deviation across folds.
import torch
import torch.nn as nn
def kfold_cv(X, y, model_fn, k=5, seed=42, epochs=100, lr=1e-3):
    """Run K-fold cross-validation for a classifier.

    The data is shuffled once, cut into ``k`` contiguous folds, and for each
    fold a fresh model is trained full-batch with Adam and cross-entropy on
    the other folds and scored on the held-out fold. Features are
    standardized per fold using statistics of that fold's training rows only.

    The last fold absorbs the ``n % k`` leftover rows, so every sample is
    validated exactly once (the same remainder rule that
    ``stratified_kfold_indices`` uses).

    Note: this function seeds the *global* torch RNG with ``seed``. That is
    deliberate: ``model_fn()`` is called after seeding, so model
    initialization is reproducible as well as the shuffle.

    Args:
        X: Feature tensor with one row per sample.
        y: Class-index labels, one per row of ``X``.
        model_fn: Callable that returns a fresh (untrained) model.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        seed: Seed applied to the global torch RNG before shuffling.
        epochs: Number of full-batch optimization steps per fold.
        lr: Adam learning rate.

    Returns:
        List of ``k`` per-fold validation accuracies (Python floats).

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``k`` is out of
            range.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length, got {n} and {len(y)}"
        )
    if not 2 <= k <= n:
        raise ValueError(f"k must satisfy 2 <= k <= len(X) ({n}), got {k}")

    torch.manual_seed(seed)
    perm = torch.randperm(n)
    X, y = X[perm], y[perm]

    fold_size = n // k
    criterion = nn.CrossEntropyLoss()  # stateless, so build it once
    accs = []
    for fold in range(k):
        val_start = fold * fold_size
        # The final fold runs to the end so no sample is left unvalidated.
        val_end = val_start + fold_size if fold < k - 1 else n
        X_val = X[val_start:val_end]
        y_val = y[val_start:val_end]
        X_tr = torch.cat([X[:val_start], X[val_end:]])
        y_tr = torch.cat([y[:val_start], y[val_end:]])

        # Normalize using training-fold statistics only (no leakage).
        mu = X_tr.mean(dim=0)
        sigma = X_tr.std(dim=0) + 1e-8
        X_tr = (X_tr - mu) / sigma
        X_val = (X_val - mu) / sigma

        # Train
        model = model_fn()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        model.train()
        for _ in range(epochs):
            logits = model(X_tr)
            loss = criterion(logits, y_tr)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate in eval mode so layers such as dropout or batch norm
        # behave deterministically.
        model.eval()
        with torch.no_grad():
            preds = model(X_val).argmax(dim=1)
            acc = (preds == y_val).float().mean().item()
        accs.append(acc)
        print(f"Fold {fold+1}/{k}: acc={acc:.4f}")
    return accs
# Demo: 5-fold CV of a small MLP on a linearly separable toy problem.
torch.manual_seed(42)
X = torch.randn(500, 5)
y = (X[:, 0] + X[:, 1]).gt(0).long()


def make_model():
    """Build a fresh, untrained 5 -> 16 -> 2 MLP."""
    layers = [nn.Linear(5, 16), nn.ReLU(), nn.Linear(16, 2)]
    return nn.Sequential(*layers)


fold_accs = kfold_cv(X, y, make_model, k=5)

# Summarize with the mean and the population standard deviation across folds.
mean_acc = sum(fold_accs) / len(fold_accs)
squared_devs = [(a - mean_acc)**2 for a in fold_accs]
std_acc = (sum(squared_devs) / len(fold_accs))**0.5
print(f"\nCV accuracy: {mean_acc:.4f} ± {std_acc:.4f}")
Stratified K-Fold
For imbalanced datasets, stratified K-fold preserves the class ratio in each fold:
import torch
def stratified_kfold_indices(y, k=5, seed=42):
    """Build stratified K-fold train/validation index pairs.

    Each class's sample indices are shuffled and cut into ``k`` contiguous
    chunks of ``len(class) // k`` indices; the last chunk also takes the
    remainder. Fold ``i`` validates on chunk ``i`` of every class and trains
    on the rest, so each fold roughly preserves the class proportions of
    ``y``. Within each returned tensor, indices are grouped by class in the
    order given by ``torch.unique(y)``.

    Shuffling uses a private ``torch.Generator`` seeded with ``seed``, so the
    global torch RNG is neither reseeded nor advanced.

    Args:
        y: 1-D tensor of class labels.
        k: Number of folds; must be at least 2.
        seed: Seed for the per-class shuffles.

    Returns:
        List of ``k`` tuples ``(train_indices, val_indices)`` of ``int64``
        CPU tensors. A part with no indices is an empty ``int64`` tensor, so
        it is always valid for indexing.

    Raises:
        ValueError: If ``k < 2``.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")

    rng = torch.Generator().manual_seed(seed)
    train_parts = [[] for _ in range(k)]
    val_parts = [[] for _ in range(k)]

    for cls in torch.unique(y):
        cls_indices = (y == cls).nonzero(as_tuple=True)[0]
        shuffle = torch.randperm(len(cls_indices), generator=rng)
        cls_indices = cls_indices[shuffle]
        fold_size = len(cls_indices) // k
        for fold in range(k):
            start = fold * fold_size
            # Last fold absorbs the remainder of this class.
            end = start + fold_size if fold < k - 1 else len(cls_indices)
            val_parts[fold].append(cls_indices[start:end])
            train_parts[fold].append(
                torch.cat([cls_indices[:start], cls_indices[end:]])
            )

    def _join(parts):
        # torch.cat rejects an empty list; fall back to an empty int64 tensor.
        if not parts:
            return torch.empty(0, dtype=torch.long)
        return torch.cat(parts).cpu()

    return [(_join(train), _join(val))
            for train, val in zip(train_parts, val_parts)]
# Demo with imbalanced data: 400 negatives and 100 positives, shuffled.
torch.manual_seed(42)
X = torch.randn(500, 5)
y = torch.cat([torch.zeros(400), torch.ones(100)]).long()  # 80/20 imbalance
y = y[torch.randperm(500)]

folds = stratified_kfold_indices(y, k=5)

# Every validation fold should show roughly the same 20% positive rate.
for fold_no, (_, val_idx) in enumerate(folds, start=1):
    print(f"Fold {fold_no}: val size={len(val_idx)}, pos rate={y[val_idx].float().mean().item():.3f}")
Feature Engineering
Standardization & Normalization
import torch
class StandardScaler:
    """z-score standardization using PyTorch tensors.

    ``fit`` learns a per-column mean and (unbiased) standard deviation;
    ``transform`` subtracts the mean and divides by the standard deviation.
    A small epsilon (1e-8) is added to the standard deviation so constant
    columns do not cause division by zero.
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.mean = None
        self.std = None

    def _check_fitted(self):
        """Raise a clear error if the scaler is used before ``fit``."""
        if self.mean is None or self.std is None:
            raise RuntimeError(
                "StandardScaler is not fitted yet; call fit() first."
            )

    def fit(self, X):
        """Learn per-column statistics from ``X`` and return ``self``.

        Raises:
            ValueError: If ``X`` has no rows.
        """
        X = X.float()
        if X.shape[0] == 0:
            raise ValueError("Cannot fit StandardScaler on an empty tensor.")
        self.mean = X.mean(dim=0)
        if X.shape[0] > 1:
            spread = X.std(dim=0)
        else:
            # The unbiased std of a single row is NaN; treat it as zero
            # spread so the epsilon below keeps the transform finite.
            spread = torch.zeros_like(self.mean)
        self.std = spread + 1e-8
        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``."""
        self._check_fitted()
        return (X.float() - self.mean) / self.std

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardized ``X``."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Map standardized values back to the original scale."""
        self._check_fitted()
        return X.float() * self.std + self.mean
class MinMaxScaler:
    """Min-max normalization to [0, 1] range.

    ``fit`` learns the per-column minimum and range (max - min);
    ``transform`` maps the fitted minimum to 0 and the fitted maximum to
    approximately 1. A small epsilon (1e-8) is added to the range so constant
    columns do not cause division by zero. The method set mirrors
    ``StandardScaler`` (``fit``, ``transform``, ``fit_transform``,
    ``inverse_transform``).
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.min = None
        self.range = None

    def _check_fitted(self):
        """Raise a clear error if the scaler is used before ``fit``."""
        if self.min is None or self.range is None:
            raise RuntimeError(
                "MinMaxScaler is not fitted yet; call fit() first."
            )

    def fit(self, X):
        """Learn per-column minimum and range from ``X`` and return ``self``."""
        X = X.float()
        self.min = X.min(dim=0).values
        self.range = X.max(dim=0).values - self.min + 1e-8
        return self

    def transform(self, X):
        """Rescale ``X`` with the minimum and range learned by ``fit``."""
        self._check_fitted()
        return (X.float() - self.min) / self.range

    def fit_transform(self, X):
        """Fit on ``X`` and return the rescaled ``X``."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Map rescaled values back to the original scale."""
        self._check_fitted()
        return X.float() * self.range + self.min
# Demo: standardize features whose columns live on very different scales.
torch.manual_seed(42)
X_raw = torch.randn(200, 3) * torch.tensor([100.0, 0.01, 1.0])  # Very different scales
y = (X_raw[:, 0] / 100 + X_raw[:, 1] * 100 > 0).long()

scaler = StandardScaler()
scaler.fit(X_raw)
X_scaled = scaler.transform(X_raw)

# Per-column statistics before and after scaling.
raw_mean, raw_std = X_raw.mean(dim=0).tolist(), X_raw.std(dim=0).tolist()
scaled_mean = X_scaled.mean(dim=0).round(decimals=3).tolist()
scaled_std = X_scaled.std(dim=0).round(decimals=3).tolist()
print(f"Raw: mean={raw_mean}, std={raw_std}")
print(f"Scaled: mean={scaled_mean}, std={scaled_std}")
Polynomial Features
import torch
from itertools import combinations_with_replacement
def polynomial_features(X, degree=2):
    """Generate polynomial features up to the given degree.

    The output starts with the original columns, followed by every product
    of ``deg`` columns (with repetition) for ``deg = 2 .. degree``, in the
    order produced by ``combinations_with_replacement``. No bias (constant)
    column is added.

    For degree=2 with features [x1, x2]: outputs [x1, x2, x1^2, x1*x2, x2^2]

    Each product is built from slices of ``X`` itself, so the new columns
    share ``X``'s device and dtype rather than being allocated on the default
    device.

    Args:
        X: 2-D tensor of shape (n_samples, n_features); converted to float.
        degree: Highest total degree to generate. Values below 2 return the
            (float) input unchanged.

    Returns:
        Float tensor of shape (n_samples, n_output_features).

    Raises:
        ValueError: If ``X`` is not 2-D.
    """
    X = X.float()
    if X.dim() != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features), got {X.dim()}-D"
        )
    d = X.shape[1]
    feature_list = [X]
    for deg in range(2, degree + 1):
        for combo in combinations_with_replacement(range(d), deg):
            # Start from the first factor instead of a fresh ones-vector.
            term = X[:, combo[0]]
            for idx in combo[1:]:
                term = term * X[:, idx]
            feature_list.append(term.unsqueeze(1))
    return torch.cat(feature_list, dim=1)
# Demo: polynomial features let a linear model fit nonlinear data
torch.manual_seed(42)
X = torch.randn(200, 2)
# Points inside a circle are the positive class, which is not linearly
# separable in the raw (x1, x2) space.
y = (X[:, 0].square() + X[:, 1].square() < 1.5).long()

# Expand the raw features with all degree-2 terms; the circle boundary is
# linear in the expanded space.
X_poly = polynomial_features(X, degree=2)

n_raw, n_poly = X.shape[1], X_poly.shape[1]
print(f"Original features: {n_raw}")
print(f"Polynomial features (degree 2): {n_poly}")
# degree=2 with 2 features: [x1, x2, x1^2, x1*x2, x2^2] = 5 features
Hyperparameter Tuning
Grid Search with K-Fold CV
import torch
import torch.nn as nn
def evaluate_model(X, y, hidden_size, lr, k=3, epochs=200):
    """Evaluate a specific (hidden_size, lr) config via k-fold CV.

    The data is cut into ``k`` contiguous folds in the order given (it is NOT
    shuffled here; shuffle beforehand if the rows are ordered). For each
    fold, a one-hidden-layer ReLU MLP is trained full-batch with Adam and
    cross-entropy on the remaining rows and scored on the held-out fold.
    Features are standardized with training-fold statistics only.

    The last fold absorbs the ``n % k`` leftover rows so every sample is
    validated exactly once.

    Args:
        X: 2-D feature tensor, one row per sample.
        y: Class-index labels (two classes: the model has 2 outputs).
        hidden_size: Width of the hidden layer.
        lr: Adam learning rate.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        epochs: Number of full-batch optimization steps per fold.

    Returns:
        Mean validation accuracy across the ``k`` folds (Python float).

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``k`` is out of
            range.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length, got {n} and {len(y)}"
        )
    if not 2 <= k <= n:
        raise ValueError(f"k must satisfy 2 <= k <= len(X) ({n}), got {k}")

    fold_size = n // k
    criterion = nn.CrossEntropyLoss()  # stateless, so build it once
    accs = []
    for fold in range(k):
        val_s = fold * fold_size
        # The final fold runs to the end so no sample is left unvalidated.
        val_e = val_s + fold_size if fold < k - 1 else n
        X_val = X[val_s:val_e]
        y_val = y[val_s:val_e]
        X_tr = torch.cat([X[:val_s], X[val_e:]])
        y_tr = torch.cat([y[:val_s], y[val_e:]])

        # Standardize with training-fold statistics only (no leakage).
        mu, sigma = X_tr.mean(0), X_tr.std(0) + 1e-8
        X_tr = (X_tr - mu) / sigma
        X_val = (X_val - mu) / sigma

        model = nn.Sequential(
            nn.Linear(X.shape[1], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2),
        )
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        model.train()
        for _ in range(epochs):
            loss = criterion(model(X_tr), y_tr)
            opt.zero_grad()
            loss.backward()
            opt.step()

        model.eval()
        with torch.no_grad():
            acc = (model(X_val).argmax(1) == y_val).float().mean().item()
        accs.append(acc)
    return sum(accs) / len(accs)
# Grid search: score every (hidden_size, lr) pair with 3-fold CV and keep
# the best one.
torch.manual_seed(42)
X = torch.randn(400, 6)
y = ((X[:, 0]**2 + X[:, 1]**2) < 2).long()
# evaluate_model uses contiguous folds, so shuffle once up front.
perm = torch.randperm(400)
X, y = X[perm], y[perm]

hidden_sizes = [8, 16, 32]
lrs = [1e-3, 1e-2]

# Start below any attainable accuracy so the first config is always
# recorded; starting at 0 with a strict ">" would leave best_config as None
# (and crash the final print) if every config scored exactly 0.0.
best_score = float("-inf")
best_config = None
for hs in hidden_sizes:
    for lr in lrs:
        score = evaluate_model(X, y, hs, lr, k=3)
        print(f"hidden={hs}, lr={lr}: acc={score:.4f}")
        if score > best_score:
            best_score = score
            best_config = (hs, lr)
print(f"\nBest config: hidden={best_config[0]}, lr={best_config[1]}, acc={best_score:.4f}")
Model Selection & Metrics
Beyond Accuracy
import torch
def classification_metrics(y_true, y_pred, num_classes=2):
    """Return overall accuracy plus one-vs-rest precision/recall/F1 per class.

    Args:
        y_true: Tensor of ground-truth class indices.
        y_pred: Tensor of predicted class indices.
        num_classes: Classes ``0 .. num_classes - 1`` are reported.

    Returns:
        ``(accuracy, metrics)`` where ``metrics[c]`` is a dict with keys
        ``'precision'``, ``'recall'`` and ``'f1'`` (Python floats). A 1e-8
        term in each denominator makes undefined ratios come out as 0.
    """
    targets = y_true.long()
    guesses = y_pred.long()
    eps = 1e-8

    overall = (guesses == targets).float().mean().item()

    def _one_vs_rest(label):
        # Treat `label` as the positive class and everything else as negative.
        predicted_pos = guesses == label
        actual_pos = targets == label
        hits = (predicted_pos & actual_pos).sum().float()
        false_alarms = (predicted_pos & ~actual_pos).sum().float()
        misses = (~predicted_pos & actual_pos).sum().float()
        p = hits / (hits + false_alarms + eps)
        r = hits / (hits + misses + eps)
        f = 2 * p * r / (p + r + eps)
        return {'precision': p.item(), 'recall': r.item(), 'f1': f.item()}

    per_class = {label: _one_vs_rest(label) for label in range(num_classes)}
    return overall, per_class
# Demo with imbalanced predictions: 80 negatives, 20 positives, and a naive
# classifier that always answers 0.
torch.manual_seed(42)
y_true = torch.cat([torch.zeros(80), torch.ones(20)]).long()
y_pred = torch.zeros(100, dtype=torch.long)  # Naive: always predict 0

acc, metrics = classification_metrics(y_true, y_pred)
print(f"Accuracy: {acc:.3f} (misleading on imbalanced data!)")
# The per-class view exposes what accuracy hides: class 1 is never detected.
for cls in (0, 1):
    stats = metrics[cls]
    print(f"Class {cls}: precision={stats['precision']:.3f}, recall={stats['recall']:.3f}, F1={stats['f1']:.3f}")
Confusion Matrix
import torch
def confusion_matrix(y_true, y_pred, num_classes):
    """Compute confusion matrix as a (num_classes x num_classes) tensor.

    Entry ``[i, j]`` counts the samples whose true class is ``i`` and whose
    predicted class is ``j`` (rows = actual, columns = predicted). Counting
    is vectorized: each (true, pred) pair is encoded as the flat index
    ``true * num_classes + pred`` and tallied with ``torch.bincount``.

    Args:
        y_true: Ground-truth class indices (flattened before counting).
        y_pred: Predicted class indices, same number of elements as
            ``y_true``.
        num_classes: Number of classes; labels must lie in
            ``[0, num_classes)``.

    Returns:
        ``int64`` tensor of shape ``(num_classes, num_classes)``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in size (previously
            the longer input was silently truncated).
        IndexError: If any label is outside ``[0, num_classes)``. Negative
            labels used to wrap around silently; they are now rejected.
    """
    y_true = torch.as_tensor(y_true).reshape(-1).long()
    y_pred = torch.as_tensor(y_pred).reshape(-1).long()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, "
            f"got {y_true.numel()} and {y_pred.numel()}"
        )

    cm = torch.zeros(num_classes, num_classes, dtype=torch.long)
    if y_true.numel() == 0:
        return cm

    lowest = min(int(y_true.min()), int(y_pred.min()))
    highest = max(int(y_true.max()), int(y_pred.max()))
    if lowest < 0 or highest >= num_classes:
        raise IndexError(
            f"labels must be in [0, {num_classes}), "
            f"found values in [{lowest}, {highest}]"
        )

    flat = y_true * num_classes + y_pred
    counts = torch.bincount(flat, minlength=num_classes * num_classes)
    cm += counts.reshape(num_classes, num_classes).to(cm.device)
    return cm
# Demo: three-class labels with 20 of the 100 predictions re-drawn at random.
torch.manual_seed(42)
y_true = torch.randint(0, 3, (100,))
y_pred = y_true.clone()

# Add some errors: overwrite 20 randomly chosen predictions with random labels
# (a re-drawn label may coincide with the true one).
error_idx = torch.randperm(100)[:20]
y_pred[error_idx] = torch.randint(0, 3, (20,))

cm = confusion_matrix(y_true, y_pred, num_classes=3)
print("Confusion Matrix (rows=actual, cols=predicted):")
print(cm)

# Per-class accuracy from diagonal: correct counts over row totals.
correct_per_class = cm.diagonal().float()
total_per_class = cm.sum(dim=1).float()
per_class_acc = correct_per_class / total_per_class
print(f"Per-class accuracy: {per_class_acc.tolist()}")