Geometric Intuition
PCA finds the directions of maximum variance in your data. The first principal component is the axis along which the data is most spread out; the second is orthogonal to the first and captures the next most variance; and so on.
PCA via SVD
The numerically stable way to compute PCA is via Singular Value Decomposition. For a mean-centered matrix $X = U \Sigma V^T$, the principal components are the columns of $V$ (right singular vectors).
import torch
class PCA:
"""
Principal Component Analysis using PyTorch SVD.
Numerically stable and GPU-compatible.
"""
def __init__(self, n_components=None):
self.n_components = n_components
self.components_ = None # Principal axes (n_components, n_features)
self.explained_variance_ = None
self.explained_variance_ratio_ = None
self.mean_ = None
def fit(self, X):
"""Compute principal components from data matrix X (n_samples, n_features)."""
X = X.float()
n, d = X.shape
n_components = self.n_components or d
# Mean center
self.mean_ = X.mean(dim=0)
X_centered = X - self.mean_
# SVD: X_centered = U * S * V^T
# Columns of V are principal components
U, S, Vh = torch.linalg.svd(X_centered, full_matrices=False)
# Vh shape: (min(n,d), d) — rows are principal components
# Explained variance from singular values
variance = (S ** 2) / (n - 1)
total_var = variance.sum()
self.components_ = Vh[:n_components] # (n_components, d)
self.explained_variance_ = variance[:n_components]
self.explained_variance_ratio_ = self.explained_variance_ / total_var
return self
def transform(self, X):
"""Project X onto principal components."""
X_centered = X.float() - self.mean_
return X_centered @ self.components_.T # (n, n_components)
def fit_transform(self, X):
return self.fit(X).transform(X)
def inverse_transform(self, X_reduced):
"""Reconstruct from reduced representation (approximate)."""
return X_reduced.float() @ self.components_ + self.mean_
# Demo: reduce 10D to 2D
torch.manual_seed(42)
X_high = torch.randn(300, 10)
# Add correlations
X_high[:, 1] = X_high[:, 0] + 0.2 * torch.randn(300)
X_high[:, 2] = X_high[:, 0] * 0.5
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_high)
print(f"Original shape: {X_high.shape}")
print(f"Reduced shape: {X_2d.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.tolist()[:3]}")
print(f"Cumulative (2 components): {pca.explained_variance_ratio_.sum().item():.3f}")
Choosing the Number of Components
import torch
torch.manual_seed(42)
# 10D data with 3 truly informative dimensions
X = torch.randn(500, 10)
X[:, 1] = X[:, 0] * 2 + torch.randn(500) * 0.1
X[:, 2] = X[:, 0] - X[:, 1] + torch.randn(500) * 0.1
# Full PCA (all components)
X_centered = X - X.mean(0)
U, S, Vh = torch.linalg.svd(X_centered, full_matrices=False)
variance = S**2 / (len(X) - 1)
cumulative = torch.cumsum(variance / variance.sum(), dim=0)
print("Components vs. cumulative explained variance:")
for i, cv in enumerate(cumulative[:8]):
bar = "#" * int(cv.item() * 40)
print(f" {i+1:2d} components: {cv.item()*100:6.1f}% [{bar}]")
# Find minimum components for 95% variance
n_95 = (cumulative < 0.95).sum().item() + 1
print(f"\nComponents needed for 95% variance: {n_95}")
Reconstruction Error
import torch
torch.manual_seed(42)
X = torch.randn(200, 20)
X[:, :5] = X[:, :5] * 3 # High-variance first 5 dims
# Mean center
mean = X.mean(0)
X_c = X - mean
# SVD
U, S, Vh = torch.linalg.svd(X_c, full_matrices=False)
# Reconstruction error vs number of components
print("Components | Reconstruction error | Explained variance")
for k in [1, 2, 5, 10, 15, 20]:
# Project to k dimensions then reconstruct
components_k = Vh[:k] # (k, d)
scores_k = X_c @ components_k.T # (n, k)
X_reconstructed = scores_k @ components_k + mean # (n, d)
error = ((X - X_reconstructed)**2).mean().item()
var_explained = (S[:k]**2).sum() / (S**2).sum()
print(f" {k:3d} | {error:8.4f} | {var_explained:.3f}")
PCA Whitening
Whitening transforms the data so that each principal component has unit variance. This is useful before neural network training and some clustering algorithms.
import torch
torch.manual_seed(42)
X = torch.randn(300, 5)
X[:, 0] *= 10 # Very different scales
X[:, 1] *= 0.1
# Standard PCA
mean = X.mean(0)
X_c = X - mean
U, S, Vh = torch.linalg.svd(X_c, full_matrices=False)
n = len(X)
# PCA projection (without whitening)
X_pca = X_c @ Vh.T # (n, 5)
# PCA whitening: divide each component by its std dev
X_whitened = X_pca / (S / (n - 1)**0.5).unsqueeze(0)
print(f"Original variance per feature: {X.var(0).tolist()}")
print(f"PCA variance per component: {X_pca.var(0).tolist()}")
print(f"Whitened variance per component: {X_whitened.var(0).round(decimals=4).tolist()}")
# Whitened: all variances should be ~1.0
Using torch.pca_lowrank
import torch
# PyTorch built-in: efficient randomized PCA for large data
torch.manual_seed(42)
X = torch.randn(1000, 100)
# torch.pca_lowrank uses randomized SVD — much faster for large matrices
U, S, V = torch.pca_lowrank(X, q=10) # q = number of components
# Explained variance ratio
variance = S**2 / (len(X) - 1)
total_var = X.var(0).sum()
explained_ratio = variance / total_var
print(f"Top 10 components explain: {explained_ratio.sum().item()*100:.1f}% of variance")
print(f"S shape (singular values): {S.shape}")
print(f"V shape (components): {V.shape}")
# Transform: project onto top-10 components
X_reduced = X @ V # (1000, 10)
print(f"Reduced shape: {X_reduced.shape}")
When to Use PCA vs t-SNE
Choosing the Right Method
- PCA: Linear; preserves global structure; fast $O(nd^2)$; use for preprocessing, noise reduction, visualization of global trends
- t-SNE: Non-linear; preserves local neighborhood structure; slow $O(n^2)$; ideal for cluster visualization on datasets <50k points
- UMAP: Non-linear; faster than t-SNE; better global structure; use for large datasets or when t-SNE is too slow
- Autoencoders: Learned non-linear compression; when you need to encode new unseen points after training