ML Workflow: Cross-Validation & Feature Engineering

Data Splits

The most fundamental principle in ML evaluation: never evaluate on data used for training. A proper three-way split ensures that your final test accuracy reflects true generalization performance.

                            
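The Three-Way Split: Train (fit model parameters), Validation (tune hyperparameters and architecture), Test (report final performance — touch only once). Evaluating on the test set repeatedly is a subtle form of data leakage: every modeling decision made after looking at a test score fits the model a little more to that particular test set, so the reported score stops being an unbiased estimate of generalization.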
                        

import torch


def train_val_test_split(X, y, train_frac=0.7, val_frac=0.15, seed=42):
    """Shuffle a dataset and split it into train/validation/test partitions.

    The shuffle is driven by a private ``torch.Generator`` seeded with
    ``seed``, so the split is reproducible without resetting the global
    PyTorch RNG state that the caller may be relying on.

    Args:
        X: Features, indexed along the first dimension.
        y: Labels, with the same first-dimension length as ``X``.
        train_frac: Fraction of samples assigned to the training split.
        val_frac: Fraction of samples assigned to the validation split.
            Whatever remains after train and validation is the test split.
        seed: Seed for the shuffling permutation.

    Returns:
        The 6-tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or if the fractions
            are negative or sum to more than 1.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length; got {n} and {len(y)}"
        )
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1; "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    # Local generator: reproducible shuffle without touching global RNG state.
    generator = torch.Generator().manual_seed(seed)
    perm = torch.randperm(n, generator=generator)
    X, y = X[perm], y[perm]

    # Boundaries are truncated, so any rounding remainder lands in the test split.
    train_end = int(n * train_frac)
    val_end = int(n * (train_frac + val_frac))

    return (X[:train_end], y[:train_end],
            X[train_end:val_end], y[train_end:val_end],
            X[val_end:], y[val_end:])


# Demo: split a synthetic binary-classification dataset and inspect each part.
torch.manual_seed(42)
X = torch.randn(1000, 10)
y = X[:, 0].gt(0).long()  # label is 1 exactly when feature 0 is positive

splits = train_val_test_split(X, y)
X_train, y_train, X_val, y_val, X_test, y_test = splits
print(f"Train: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.0f}%)")
print(f"Val:   {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.0f}%)")
print(f"Test:  {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.0f}%)")

# Report the positive-class rate of every split to check class balance.
label_sets = {'Train': y_train, 'Val': y_val, 'Test': y_test}
for name, labels in label_sets.items():
    pos_rate = labels.float().mean().item()
    print(f"{name} positive rate: {pos_rate:.3f}")

Data Leakage

Data leakage occurs when information from the validation or test set influences training. The most common mistake: computing normalization statistics on the full dataset before splitting.

import torch

# Synthetic dataset: 1000 samples, 10 features; the label depends on feature 0.
torch.manual_seed(42)
X = torch.randn(1000, 10)
y = X[:, 0].gt(0).long()

# WRONG: statistics taken over ALL rows, so validation/test rows influence
# the transformation that is later applied to the training rows.
X_mean_wrong, X_std_wrong = X.mean(dim=0), X.std(dim=0)
X_normalized_wrong = (X - X_mean_wrong) / (X_std_wrong + 1e-8)

# CORRECT: split first, then derive the statistics from the training rows only.
n_train = 700
X_train, X_val, X_test = X[:n_train], X[n_train:850], X[850:]
X_mean, X_std = X_train.mean(dim=0), X_train.std(dim=0)

# Every split goes through the SAME transformation, built from TRAIN statistics.
X_train_norm, X_val_norm, X_test_norm = (
    (split - X_mean) / (X_std + 1e-8) for split in (X_train, X_val, X_test)
)

print(f"Train mean (should be ~0): {X_train_norm.mean().item():.4f}")
print(f"Val mean (slightly off):   {X_val_norm.mean().item():.4f}")

K-Fold Cross-Validation

K-fold CV provides a more reliable estimate of generalization performance by training K models, each on a different train/val partition. The final metric is the mean ± std across folds.

import torch
import torch.nn as nn


def kfold_cv(X, y, model_fn, k=5, seed=42):
    """Run K-fold cross-validation and return the per-fold accuracies.

    The data is shuffled once, then cut into ``k`` contiguous validation
    folds. For each fold a fresh model is trained for 100 full-batch Adam
    steps (lr=1e-3) on the remaining samples and scored on the held-out fold.
    Features are standardized with statistics from the training portion of
    each fold only, so no validation information leaks into training.

    Args:
        X: Feature tensor; the first dimension indexes samples.
        y: Integer class labels aligned with ``X``.
        model_fn: Zero-argument callable returning a fresh (untrained) model
            that maps features to class logits.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        seed: Seed applied to the global PyTorch RNG before shuffling.

    Returns:
        List of ``k`` validation accuracies (Python floats), one per fold.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the sample count.
    """
    n = len(X)
    if not 2 <= k <= n:
        raise ValueError(
            f"k must satisfy 2 <= k <= number of samples ({n}); got k={k}"
        )

    # The global RNG is seeded on purpose: besides the shuffle, it also makes
    # the weight initialization performed inside model_fn() reproducible.
    torch.manual_seed(seed)
    perm = torch.randperm(n)
    X, y = X[perm], y[perm]

    fold_size = n // k
    criterion = nn.CrossEntropyLoss()
    accs = []

    for fold in range(k):
        val_start = fold * fold_size
        # The last fold absorbs the n % k leftover samples so that every
        # sample is validated exactly once.
        val_end = val_start + fold_size if fold < k - 1 else n

        X_val = X[val_start:val_end]
        y_val = y[val_start:val_end]
        X_tr = torch.cat([X[:val_start], X[val_end:]])
        y_tr = torch.cat([y[:val_start], y[val_end:]])

        # Normalize using training fold statistics
        mu = X_tr.mean(dim=0)
        sigma = X_tr.std(dim=0) + 1e-8
        X_tr = (X_tr - mu) / sigma
        X_val = (X_val - mu) / sigma

        # Train
        model = model_fn()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        model.train()
        for _ in range(100):
            optimizer.zero_grad()
            loss = criterion(model(X_tr), y_tr)
            loss.backward()
            optimizer.step()

        # Evaluate in eval mode so layers such as dropout or batch norm
        # behave deterministically on the held-out fold.
        model.eval()
        with torch.no_grad():
            preds = model(X_val).argmax(dim=1)
            acc = (preds == y_val).float().mean().item()
        accs.append(acc)
        print(f"Fold {fold+1}/{k}: acc={acc:.4f}")

    return accs


# Demo: 5-fold CV of a small MLP on a linearly separable synthetic problem.
torch.manual_seed(42)
X = torch.randn(500, 5)
y = (X[:, 0] + X[:, 1]).gt(0).long()


def make_model():
    """Return a fresh, untrained 5 -> 16 -> 2 MLP emitting class logits."""
    layers = [nn.Linear(5, 16), nn.ReLU(), nn.Linear(16, 2)]
    return nn.Sequential(*layers)


fold_accs = kfold_cv(X, y, make_model, k=5)

# Summarize the folds: mean and population standard deviation of accuracy.
n_folds = len(fold_accs)
mean_acc = sum(fold_accs) / n_folds
variance = sum((a - mean_acc)**2 for a in fold_accs) / n_folds
std_acc = variance**0.5
print(f"\nCV accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

Stratified K-Fold

For imbalanced datasets, stratified K-fold preserves the class ratio in each fold:

import torch


def stratified_kfold_indices(y, k=5, seed=42):
    """Build stratified K-fold index pairs.

    The indices of each class are shuffled and cut into ``k`` chunks; fold
    ``i`` validates on the ``i``-th chunk of every class and trains on the
    rest. Chunks hold ``len(class) // k`` indices each, with the last fold
    also taking the leftover ``len(class) % k`` indices, so each fold
    approximately mirrors the overall class distribution.

    The shuffle uses a private ``torch.Generator`` seeded with ``seed`` and
    therefore leaves the global PyTorch RNG state untouched.

    Args:
        y: 1-D tensor of class labels.
        k: Number of folds (at least 2).
        seed: Seed for the per-class shuffles.

    Returns:
        List of ``k`` tuples ``(train_indices, val_indices)``. Both entries
        are ``torch.long`` tensors, even when empty, so they can always be
        used to index ``y`` or a feature tensor.

    Raises:
        ValueError: If ``k`` is smaller than 2.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2; got k={k}")

    generator = torch.Generator().manual_seed(seed)
    classes = torch.unique(y)
    all_folds = [[] for _ in range(k)]
    all_val = [[] for _ in range(k)]

    for cls in classes:
        cls_indices = (y == cls).nonzero(as_tuple=True)[0]
        shuffle = torch.randperm(len(cls_indices), generator=generator)
        cls_indices = cls_indices[shuffle]

        fold_size = len(cls_indices) // k
        for fold in range(k):
            start = fold * fold_size
            # The last fold takes whatever is left of this class.
            end = start + fold_size if fold < k - 1 else len(cls_indices)
            val_idx = cls_indices[start:end].tolist()
            train_idx = torch.cat([cls_indices[:start], cls_indices[end:]]).tolist()
            all_val[fold].extend(val_idx)
            all_folds[fold].extend(train_idx)

    # An explicit dtype is required: torch.tensor([]) would otherwise be
    # float32, which cannot be used as an index tensor.
    return [(torch.tensor(train, dtype=torch.long),
             torch.tensor(val, dtype=torch.long))
            for train, val in zip(all_folds, all_val)]


# Demo: 500 samples with an 80/20 class imbalance, labels in random order.
torch.manual_seed(42)
X = torch.randn(500, 5)
ordered_labels = torch.cat([
    torch.zeros(400, dtype=torch.long),
    torch.ones(100, dtype=torch.long),
])
y = ordered_labels[torch.randperm(500)]

# Every validation fold should show a positive rate close to the global 0.2.
folds = stratified_kfold_indices(y, k=5)
for i, (train_idx, val_idx) in enumerate(folds):
    val_labels = y[val_idx]
    val_pos_rate = val_labels.float().mean().item()
    print(f"Fold {i+1}: val size={len(val_idx)}, pos rate={val_pos_rate:.3f}")

Feature Engineering

Standardization & Normalization

import torch


class StandardScaler:
    """z-score standardization using PyTorch tensors.

    ``fit`` records the per-feature mean and (unbiased) standard deviation
    along dimension 0; ``transform`` applies ``(X - mean) / std``. A small
    constant (1e-8) is added to the stored ``std`` so constant features do
    not cause a division by zero.
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.mean = None
        self.std = None

    def fit(self, X):
        """Compute the statistics of ``X`` and return ``self`` for chaining.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        X = X.float()
        if X.shape[0] == 0:
            raise ValueError("cannot fit StandardScaler on an empty tensor")
        self.mean = X.mean(dim=0)
        if X.shape[0] > 1:
            spread = X.std(dim=0)
        else:
            # The unbiased std of a single sample is NaN; treat it as zero
            # spread so that transform() yields 0 instead of NaN.
            spread = torch.zeros_like(self.mean)
        self.std = spread + 1e-8
        return self

    def _check_fitted(self):
        """Raise a clear error when the scaler is used before fit()."""
        if self.mean is None or self.std is None:
            raise RuntimeError(
                "StandardScaler is not fitted yet; call fit() first"
            )

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``."""
        self._check_fitted()
        return (X.float() - self.mean) / self.std

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardized ``X``."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Map standardized values back to the original feature scale."""
        self._check_fitted()
        return X.float() * self.std + self.mean


class MinMaxScaler:
    """Min-max normalization to the [0, 1] range.

    ``fit`` records the per-feature minimum and range (max - min) along
    dimension 0; ``transform`` applies ``(X - min) / range``. A small
    constant (1e-8) is added to the stored range so constant features do
    not cause a division by zero.
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.min = None
        self.range = None

    def fit(self, X):
        """Compute per-feature min and range of ``X``; return ``self``."""
        X = X.float()
        self.min = X.min(dim=0).values
        self.range = X.max(dim=0).values - self.min + 1e-8
        return self

    def _check_fitted(self):
        """Raise a clear error when the scaler is used before fit()."""
        if self.min is None or self.range is None:
            raise RuntimeError(
                "MinMaxScaler is not fitted yet; call fit() first"
            )

    def transform(self, X):
        """Rescale ``X`` with the min and range learned by ``fit``."""
        self._check_fitted()
        return (X.float() - self.min) / self.range

    def fit_transform(self, X):
        """Fit on ``X`` and return the rescaled ``X``."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Map rescaled values back to the original feature scale."""
        self._check_fitted()
        return X.float() * self.range + self.min


# Demo: three features whose scales differ by orders of magnitude.
torch.manual_seed(42)
feature_scales = torch.tensor([100.0, 0.01, 1.0])  # Very different scales
X_raw = torch.randn(200, 3) * feature_scales
decision_score = X_raw[:, 0] / 100 + X_raw[:, 1] * 100
y = (decision_score > 0).long()

# After standardization every feature should have mean ~0 and std ~1.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)

print(f"Raw:    mean={X_raw.mean(dim=0).tolist()}, std={X_raw.std(dim=0).tolist()}")
print(f"Scaled: mean={X_scaled.mean(dim=0).round(decimals=3).tolist()}, "
      f"std={X_scaled.std(dim=0).round(decimals=3).tolist()}")

Polynomial Features

import torch
from itertools import combinations_with_replacement


def polynomial_features(X, degree=2):
    """Generate polynomial features up to the given degree (no bias column).

    The output starts with the original columns, followed by every monomial
    of degree 2, then degree 3, and so on, each group ordered as produced by
    ``combinations_with_replacement`` over the column indices.
    For degree=2 with features [x1, x2]: outputs [x1, x2, x1^2, x1*x2, x2^2]

    Args:
        X: 2-D tensor of shape (n_samples, n_features); converted to float32.
        degree: Highest monomial degree to generate (at least 1).

    Returns:
        float32 tensor with the expanded feature columns, created from ``X``
        so it lives on the same device.

    Raises:
        ValueError: If ``degree`` is smaller than 1.
    """
    if degree < 1:
        raise ValueError(f"degree must be at least 1; got {degree}")

    X = X.float()
    n, d = X.shape
    feature_list = [X]

    for deg in range(2, degree + 1):
        for combo in combinations_with_replacement(range(d), deg):
            # Multiply the selected columns (repeats allowed) in one call.
            # Deriving the result from X keeps it on X's device, unlike a
            # freshly allocated torch.ones(n) accumulator.
            monomial = X[:, list(combo)].prod(dim=1, keepdim=True)
            feature_list.append(monomial)

    return torch.cat(feature_list, dim=1)


# Demo: polynomial features let a linear model fit nonlinear data
torch.manual_seed(42)
X = torch.randn(200, 2)
radius_sq = X[:, 0].pow(2) + X[:, 1].pow(2)
y = radius_sq.lt(1.5).long()  # Circle — not linearly separable

# Expand the two raw features with all degree-2 monomials; the class
# boundary x1^2 + x2^2 = 1.5 is linear in the expanded feature space.
X_poly = polynomial_features(X, degree=2)
print(f"Original features: {X.shape[1]}")
print(f"Polynomial features (degree 2): {X_poly.shape[1]}")
# degree=2 with 2 features: [x1, x2, x1^2, x1*x2, x2^2] = 5 features

Hyperparameter Tuning

Grid Search with K-Fold CV

import torch
import torch.nn as nn


def evaluate_model(X, y, hidden_size, lr, k=3, epochs=200):
    """Evaluate a specific (hidden_size, lr) config via k-fold CV.

    The data is cut into ``k`` contiguous folds in its current order (no
    shuffling is done here, so shuffle beforehand if the rows are ordered).
    For each fold a one-hidden-layer ReLU network with two output logits is
    trained with full-batch Adam and scored on the held-out fold. Features
    are standardized with statistics from the training portion of each fold.

    Args:
        X: 2-D feature tensor of shape (n_samples, n_features).
        y: Integer class labels (0 or 1) aligned with ``X``.
        hidden_size: Width of the hidden layer.
        lr: Adam learning rate.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        epochs: Number of full-batch training steps per fold.

    Returns:
        Mean validation accuracy across the folds, as a Python float.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the sample count.
    """
    n = len(X)
    if not 2 <= k <= n:
        raise ValueError(
            f"k must satisfy 2 <= k <= number of samples ({n}); got k={k}"
        )

    fold_size = n // k
    criterion = nn.CrossEntropyLoss()
    accs = []

    for fold in range(k):
        val_s = fold * fold_size
        # The last fold absorbs the n % k leftover samples so that every
        # sample is validated exactly once.
        val_e = val_s + fold_size if fold < k - 1 else n
        X_val = X[val_s:val_e]
        y_val = y[val_s:val_e]
        X_tr = torch.cat([X[:val_s], X[val_e:]])
        y_tr = torch.cat([y[:val_s], y[val_e:]])

        # Standardize with training-fold statistics only (no leakage).
        mu, sigma = X_tr.mean(0), X_tr.std(0) + 1e-8
        X_tr = (X_tr - mu) / sigma
        X_val = (X_val - mu) / sigma

        model = nn.Sequential(
            nn.Linear(X.shape[1], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2),
        )
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        model.train()
        for _ in range(epochs):
            opt.zero_grad()
            loss = criterion(model(X_tr), y_tr)
            loss.backward()
            opt.step()

        model.eval()
        with torch.no_grad():
            acc = (model(X_val).argmax(1) == y_val).float().mean().item()
        accs.append(acc)

    return sum(accs) / len(accs)


# Grid search: score every (hidden size, learning rate) pair with 3-fold CV.
torch.manual_seed(42)
X = torch.randn(400, 6)
y = ((X[:, 0]**2 + X[:, 1]**2) < 2).long()
# Shuffle once up front; evaluate_model cuts folds in the order it receives.
perm = torch.randperm(400)
X, y = X[perm], y[perm]

hidden_sizes = [8, 16, 32]
lrs = [1e-3, 1e-2]
# Start below any attainable accuracy so that a config is selected even when
# every score is 0.0; starting at 0 with a strict ">" would leave best_config
# as None and crash the summary print below.
best_score = float("-inf")
best_config = None

for hs in hidden_sizes:
    for lr in lrs:
        score = evaluate_model(X, y, hs, lr, k=3)
        print(f"hidden={hs}, lr={lr}: acc={score:.4f}")
        if score > best_score:
            best_score = score
            best_config = (hs, lr)

print(f"\nBest config: hidden={best_config[0]}, lr={best_config[1]}, acc={best_score:.4f}")

Model Selection & Metrics

Beyond Accuracy

import torch


def classification_metrics(y_true, y_pred, num_classes=2):
    """Return overall accuracy plus per-class precision, recall and F1.

    Each class in ``range(num_classes)`` is scored one-vs-rest. A 1e-8 term
    in every denominator turns an undefined ratio (no predictions or no
    true samples for a class) into 0.0 instead of a division by zero.

    Returns:
        ``(accuracy, metrics)`` where ``metrics[c]`` is a dict with the keys
        'precision', 'recall' and 'f1' for class ``c``.
    """
    eps = 1e-8
    truth = y_true.long()
    guess = y_pred.long()

    def _count(mask):
        # Number of True entries, as a float tensor for the ratios below.
        return mask.sum().float()

    per_class = {}
    for label in range(num_classes):
        is_true = truth == label
        is_pred = guess == label

        tp = _count(is_pred & is_true)
        fp = _count(is_pred & ~is_true)
        fn = _count(~is_pred & is_true)

        precision = tp / (tp + fp + eps)
        recall = tp / (tp + fn + eps)
        f1 = 2 * precision * recall / (precision + recall + eps)
        per_class[label] = {
            'precision': precision.item(),
            'recall': recall.item(),
            'f1': f1.item(),
        }

    accuracy = (guess == truth).float().mean().item()
    return accuracy, per_class


# Demo: 80/20 imbalanced labels scored against a classifier that always
# predicts the majority class.
torch.manual_seed(42)
y_true = torch.cat([
    torch.zeros(80, dtype=torch.long),
    torch.ones(20, dtype=torch.long),
])
y_pred = torch.zeros_like(y_true)  # Naive: always predict 0

acc, metrics = classification_metrics(y_true, y_pred)
print(f"Accuracy: {acc:.3f}  (misleading on imbalanced data!)")
print(f"Class 0: precision={metrics[0]['precision']:.3f}, recall={metrics[0]['recall']:.3f}, F1={metrics[0]['f1']:.3f}")
print(f"Class 1: precision={metrics[1]['precision']:.3f}, recall={metrics[1]['recall']:.3f}, F1={metrics[1]['f1']:.3f}")

Confusion Matrix

import torch


def confusion_matrix(y_true, y_pred, num_classes):
    """Compute confusion matrix as a (num_classes x num_classes) tensor.

    Entry ``[i, j]`` counts the samples whose true class is ``i`` and whose
    predicted class is ``j``. The counts are produced with a single
    ``torch.bincount`` over the flattened pair index ``true * C + pred``
    rather than a per-sample Python loop.

    Args:
        y_true: Integer class labels.
        y_pred: Predicted class labels, same number of elements as ``y_true``.
        num_classes: Number of classes ``C``; labels must lie in ``[0, C)``.

    Returns:
        ``torch.long`` CPU tensor of shape ``(num_classes, num_classes)``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
        IndexError: If any label is negative or ``>= num_classes``.
    """
    true_flat = torch.as_tensor(y_true).reshape(-1).long().cpu()
    pred_flat = torch.as_tensor(y_pred).reshape(-1).long().cpu()

    if true_flat.numel() != pred_flat.numel():
        raise ValueError(
            "y_true and y_pred must have the same number of elements; "
            f"got {true_flat.numel()} and {pred_flat.numel()}"
        )

    if true_flat.numel() > 0:
        lowest = min(true_flat.min().item(), pred_flat.min().item())
        highest = max(true_flat.max().item(), pred_flat.max().item())
        # Negative labels would otherwise be miscounted, so reject them
        # together with labels beyond the matrix size.
        if lowest < 0 or highest >= num_classes:
            raise IndexError(
                f"labels must lie in [0, {num_classes}); "
                f"found values in [{lowest}, {highest}]"
            )

    pair_index = true_flat * num_classes + pred_flat
    counts = torch.bincount(pair_index, minlength=num_classes * num_classes)
    return counts.reshape(num_classes, num_classes)


# Demo: 3-class labels where 20 randomly chosen predictions are re-drawn.
torch.manual_seed(42)
y_true = torch.randint(0, 3, (100,))
y_pred = y_true.clone()
# Re-draw 20 predictions at random; a re-draw may still hit the true class.
error_idx = torch.randperm(100)[:20]
replacement_labels = torch.randint(0, 3, (20,))
y_pred[error_idx] = replacement_labels

cm = confusion_matrix(y_true, y_pred, num_classes=3)
print("Confusion Matrix (rows=actual, cols=predicted):")
print(cm)

# Per-class accuracy: correct counts on the diagonal over each row total.
row_totals = cm.sum(dim=1).float()
per_class_acc = cm.diagonal().float() / row_totals
print(f"Per-class accuracy: {per_class_acc.tolist()}")

Cookie Consent

Table of Contents

Data Splits

Data Leakage

K-Fold Cross-Validation

Stratified K-Fold

Feature Engineering

Standardization & Normalization

Polynomial Features

Hyperparameter Tuning

Grid Search with K-Fold CV

Model Selection & Metrics

Beyond Accuracy

Confusion Matrix

Cookie Consent

ML Workflow: Cross-Validation & Feature Engineering

Table of Contents

Data Splits

Data Leakage

K-Fold Cross-Validation

Stratified K-Fold

Feature Engineering

Standardization & Normalization

Polynomial Features

Hyperparameter Tuning

Grid Search with K-Fold CV

Model Selection & Metrics

Beyond Accuracy

Confusion Matrix

Related Articles in This Series

K-Nearest Neighbors in PyTorch

Scikit-learn ↔ PyTorch Interoperability

Part 3: Training & Evaluation