Probability Theory
Probability theory gives us the language to talk about uncertainty. In deep learning it underpins maximum likelihood estimation, Bayesian networks, variational autoencoders, and every stochastic element of training.
Distributions in PyTorch
import torch
from torch.distributions import (
Normal, Bernoulli, Categorical, MultivariateNormal, Beta, Dirichlet
)
# ----- Univariate Gaussian -----
mu, sigma = torch.tensor(2.0), torch.tensor(1.5)
dist = Normal(mu, sigma)
x = torch.linspace(-2, 6, 100)
log_probs = dist.log_prob(x)
sample = dist.sample((1000,))
print(f"Normal({mu:.1f}, {sigma:.1f}²):")
print(f" Sample mean: {sample.mean():.4f} (expected: {mu:.1f})")
print(f" Sample std: {sample.std():.4f} (expected: {sigma:.1f})")
print(f" log p(2.0) = {dist.log_prob(torch.tensor(2.0)):.4f}")
print(f" CDF at 2.0 = {dist.cdf(torch.tensor(2.0)):.4f}")
import torch
from torch.distributions import Categorical
# ----- Categorical distribution (softmax output) -----
logits = torch.tensor([2.0, 1.0, -0.5, 3.0])
probs = logits.softmax(dim=-1)
cat = Categorical(probs=probs)
samples = cat.sample((10000,))
print("Categorical distribution:")
print(f" Probs: {probs.tolist()}")
print(f" Sample freqs: {[(samples == k).float().mean().item():.4f for k in range(4)]}")
print(f" Entropy: {cat.entropy():.4f}")
print(f" Max entropy: {torch.log(torch.tensor(4.0)):.4f} (uniform over 4 classes)")
import torch
from torch.distributions import MultivariateNormal
# ----- Multivariate Gaussian -----
mu = torch.tensor([0.0, 0.0])
cov = torch.tensor([[1.0, 0.8], [0.8, 1.0]]) # Correlated
mvn = MultivariateNormal(loc=mu, covariance_matrix=cov)
X = mvn.sample((500,))
print(f"MultivariateNormal:")
print(f" Sample shape: {X.shape}")
print(f" Sample mean: {X.mean(0).tolist()}")
print(f" Sample cov[0,1]: {((X[:,0]-X[:,0].mean())*(X[:,1]-X[:,1].mean())).mean():.4f}")
print(f" (expected ≈ {cov[0,1]:.1f})")
Bayes' Rule
$$P(\theta \mid X) = \frac{P(X \mid \theta)\, P(\theta)}{P(X)}$$
The posterior $P(\theta|X)$ combines the likelihood $P(X|\theta)$ with the prior $P(\theta)$. Maximum Likelihood Estimation (MLE) maximises $P(X|\theta)$ — equivalent to Bayes with a flat prior.
import torch
def log_posterior(theta, X, sigma_likelihood=1.0, sigma_prior=1.0):
"""
Unnormalized log posterior for a Gaussian likelihood and prior.
Model: X_i ~ N(theta, sigma_likelihood^2)
theta ~ N(0, sigma_prior^2)
log p(theta|X) ∝ log p(X|theta) + log p(theta)
"""
# Log likelihood: sum of Gaussian log probs
log_lik = -0.5 * ((X - theta) ** 2).sum() / sigma_likelihood ** 2
# Log prior: Gaussian N(0, sigma_prior)
log_prior = -0.5 * (theta ** 2) / sigma_prior ** 2
return log_lik + log_prior
# True theta = 3.0, observe 20 noisy samples
torch.manual_seed(42)
true_theta = torch.tensor(3.0)
X = true_theta + torch.randn(20)
# Grid search over theta values
thetas = torch.linspace(0, 6, 200)
log_posts = torch.stack([log_posterior(t, X) for t in thetas])
mle_theta = X.mean()
map_theta = thetas[log_posts.argmax()]
print(f"True theta: {true_theta.item():.2f}")
print(f"MLE (sample mean): {mle_theta.item():.4f}")
print(f"MAP (log-posterior peak): {map_theta.item():.4f}")
print(f" MAP is shrunk toward 0 by the prior — this is L2 regularization!")
print(f" With flat prior (sigma_prior→∞), MAP = MLE")
Information Theory
Entropy & Cross-Entropy
Entropy measures how uncertain a distribution is:
$$H(P) = -\sum_x P(x) \log P(x)$$
Cross-entropy — used as the standard classification loss — measures how well $Q$ approximates $P$:
$$H(P, Q) = -\sum_x P(x) \log Q(x)$$
import torch
import torch.nn.functional as F
def entropy(probs, eps=1e-10):
"""Shannon entropy of a probability distribution."""
return -(probs * (probs + eps).log()).sum(dim=-1)
def cross_entropy_manual(p_true, q_pred_log, eps=1e-10):
"""H(P, Q) = -sum(P * log Q)."""
return -(p_true * q_pred_log).sum(dim=-1)
# Binary classification example
probs_true = torch.tensor([
[1.0, 0.0], # Hard label: class 0
[0.9, 0.1], # Soft label: mostly class 0
[0.5, 0.5], # Maximum uncertainty
])
probs_pred = torch.tensor([
[0.95, 0.05], # Good prediction
[0.6, 0.4 ], # Uncertain prediction
[0.3, 0.7 ], # Wrong prediction
])
print("Entropy (measures label uncertainty):")
for i, p in enumerate(probs_true):
print(f" P={p.tolist()}: H = {entropy(p):.4f}")
print("\nCross-entropy loss H(P, Q):")
log_q = probs_pred.log()
for i in range(len(probs_true)):
ce = cross_entropy_manual(probs_true[i], log_q[i])
print(f" P={probs_true[i].tolist()}, Q={probs_pred[i].tolist()}: H(P,Q) = {ce:.4f}")
# Verify matches PyTorch's F.cross_entropy (expects hard integer labels)
hard_labels = torch.argmax(probs_true, dim=1)
pt_ce = F.cross_entropy(probs_pred.log(), hard_labels, reduction='none')
print(f"\nPyTorch F.cross_entropy (hard labels): {pt_ce.tolist()}")
KL Divergence
KL divergence measures how much information is lost when $Q$ approximates $P$:
$$D_{KL}(P \| Q) = \sum_x P(x) \log \frac{P(x)}{Q(x)} = H(P, Q) - H(P)$$
import torch
import torch.nn.functional as F
from torch.distributions import Normal
# KL divergence between two Gaussians has a closed form
def kl_gaussian(mu1, sigma1, mu2, sigma2):
"""
Analytical KL(N(mu1,sigma1²) || N(mu2,sigma2²)).
Used in VAE: KL(q(z|x) || p(z)) where p(z) = N(0,1).
"""
return (
torch.log(sigma2 / sigma1)
+ (sigma1**2 + (mu1 - mu2)**2) / (2 * sigma2**2)
- 0.5
)
# VAE use-case: KL between learned posterior q and standard normal prior
mu_q = torch.tensor([1.5, -0.5, 0.0])
sigma_q = torch.tensor([0.8, 1.2, 1.0])
mu_p = torch.zeros(3)
sigma_p = torch.ones(3)
kl = kl_gaussian(mu_q, sigma_q, mu_p, sigma_p)
print("KL(q(z|x) || p(z)) per latent dimension:")
for i in range(3):
print(f" dim {i}: mu={mu_q[i]:.1f}, sigma={sigma_q[i]:.1f} → KL = {kl[i]:.4f}")
print(f"Total KL: {kl.sum():.4f} (penalizes posterior moving away from N(0,1))")
# Verify with PyTorch distributions
d1 = Normal(mu_q, sigma_q)
d2 = Normal(mu_p, sigma_p)
kl_torch = torch.distributions.kl_divergence(d1, d2)
print(f"\nPyTorch kl_divergence: {kl_torch.tolist()}")
print(f"Match: {torch.allclose(kl, kl_torch, atol=1e-5)}")
Linear Algebra
Matrix Operations
import torch
# Core operations every deep learning practitioner needs
torch.manual_seed(42)
A = torch.randn(4, 3)
B = torch.randn(3, 5)
# Matrix multiplication (the fundamental operation in neural nets)
C = A @ B # Same as torch.matmul(A, B)
print(f"A ({A.shape}) @ B ({B.shape}) = C ({C.shape})")
# Batch matrix multiply (batch_size × seq_len × d_model patterns)
batch_A = torch.randn(8, 4, 3)
batch_B = torch.randn(8, 3, 5)
batch_C = batch_A @ batch_B
print(f"Batch: {batch_A.shape} @ {batch_B.shape} = {batch_C.shape}")
# Norms
x = torch.tensor([3.0, 4.0])
print(f"\nNorms of {x.tolist()}:")
print(f" L1 (||x||_1): {x.norm(p=1):.4f} (sum of abs values)")
print(f" L2 (||x||_2): {x.norm(p=2):.4f} (Euclidean length)")
print(f" Linf: {x.norm(p=float('inf')):.4f} (max abs value)")
# Cosine similarity (attention, embedding comparisons)
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([2.0, 2.0, 1.0])
cos_sim = (a @ b) / (a.norm() * b.norm())
print(f"\nCos similarity: {cos_sim:.4f}")
print(f"Via F.cosine_similarity: {torch.nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item():.4f}")
SVD & Eigendecomposition
SVD decomposes any matrix $A = U \Sigma V^T$ where $U,V$ are orthogonal and $\Sigma$ is diagonal. This underlies PCA, low-rank approximation, and numerical stability analysis of neural network weight matrices.
import torch
torch.manual_seed(42)
A = torch.randn(6, 4)
# Singular Value Decomposition
U, S, Vh = torch.linalg.svd(A, full_matrices=False)
print(f"SVD: A ({A.shape})")
print(f" U: {U.shape}, S: {S.shape}, Vh: {Vh.shape}")
print(f" Singular values: {S.tolist()}")
# Verify reconstruction
A_reconstructed = U @ torch.diag(S) @ Vh
print(f" Max reconstruction error: {(A - A_reconstructed).abs().max():.2e}")
# Low-rank approximation (keep only k singular values)
k = 2
A_low_rank = U[:, :k] @ torch.diag(S[:k]) @ Vh[:k, :]
print(f"\nRank-{k} approximation:")
print(f" Frobenius error: {(A - A_low_rank).norm():.4f}")
print(f" Original rank: {torch.linalg.matrix_rank(A).item()}")
# Nuclear norm (sum of singular values) — used in matrix completion
print(f" Nuclear norm: {S.sum():.4f} (promotes low-rank solutions)")
import torch
# Eigendecomposition for symmetric matrices (covariance, Hessian, etc.)
torch.manual_seed(42)
X = torch.randn(100, 5)
cov = (X.T @ X) / (100 - 1) # 5×5 covariance matrix
eigenvalues, eigenvectors = torch.linalg.eigh(cov) # eigh: faster for symmetric
# Sorted ascending by torch.linalg.eigh
eigenvalues_desc = eigenvalues.flip(0)
print(f"Covariance matrix eigenvalues (descending): {eigenvalues_desc.tolist()}")
print(f"\nConnections to deep learning:")
print(f" - PCA uses top-k eigenvectors of covariance (or SVD of data)")
print(f" - Hessian eigenvalues indicate loss surface curvature")
print(f" - Large eigenvalue ratio → ill-conditioned optimization")
print(f" - Condition number: {eigenvalues_desc[0] / (eigenvalues_desc[-1] + 1e-10):.2f}")
Calculus & Autograd
Chain Rule
Every backpropagation pass is just the multivariable chain rule applied repeatedly through the computational graph:
$$\frac{\partial L}{\partial W_1} = \frac{\partial L}{\partial y} \cdot \frac{\partial y}{\partial h} \cdot \frac{\partial h}{\partial W_1}$$
import torch
# Explicitly trace the chain rule through a 2-layer network
torch.manual_seed(42)
x = torch.randn(3, requires_grad=False)
W1 = torch.randn(4, 3, requires_grad=True)
W2 = torch.randn(2, 4, requires_grad=True)
# Forward pass
h = torch.relu(W1 @ x) # Hidden layer
y = W2 @ h # Output
L = y.pow(2).sum() # Scalar loss
# Autograd backward
L.backward()
# Manual chain rule verification
with torch.no_grad():
dL_dy = 2 * y # dL/dy
dL_dh = W2.T @ dL_dy # dL/dh via chain rule
dL_dW2 = dL_dy.unsqueeze(1) * h.unsqueeze(0) # dL/dW2
relu_mask = (W1 @ x > 0).float() # ReLU derivative
dL_dW1 = (dL_dh * relu_mask).unsqueeze(1) * x.unsqueeze(0)
print("Gradient comparison (autograd vs manual chain rule):")
print(f" W2 max abs diff: {(W2.grad - dL_dW2).abs().max():.2e}")
print(f" W1 max abs diff: {(W1.grad - dL_dW1).abs().max():.2e}")
print(" Both match to numerical precision. Autograd IS the chain rule.")
Jacobian & Hessian
import torch
from torch.func import jacrev, hessian
def f(W, x):
"""Simple linear + ReLU function."""
return torch.relu(W @ x)
torch.manual_seed(42)
W = torch.randn(3, 4)
x = torch.randn(4)
# Jacobian of f w.r.t. W (3×12 matrix: d(output_i)/d(W_jk))
J = jacrev(f)(W, x)
print(f"Jacobian of f w.r.t. W: shape = {J.shape}")
print(f" Outer dims = output shape {f(W,x).shape}")
print(f" Inner dims = W shape {W.shape}")
# Scalar function for Hessian
def scalar_loss(w):
return (w ** 2 - 2 * w + 1).sum() # Quadratic bowl
w = torch.tensor([1.5, 0.5, 2.0])
H = hessian(scalar_loss)(w)
print(f"\nHessian of quadratic loss: shape = {H.shape}")
print(f" Diagonal: {H.diagonal().tolist()} (expected: [2, 2, 2] for d²/dw² of w²)")
print(f" Hessian condition number measures optimization difficulty")
Optimization
Gradient Descent Variants
import torch
def rosenbrock(x, y):
"""
Rosenbrock function: non-convex, curved valley.
Minimum at (1, 1) with f=0.
Classic optimization test function.
"""
return (1 - x)**2 + 100 * (y - x**2)**2
# Compare optimizers on Rosenbrock
torch.manual_seed(42)
start = torch.tensor([-1.0, 1.0])
def run_optimizer(optimizer_fn, n_steps=500):
p = start.clone().requires_grad_(True)
opt = optimizer_fn([p])
history = []
for _ in range(n_steps):
opt.zero_grad()
loss = rosenbrock(p[0], p[1])
loss.backward()
opt.step()
history.append(loss.item())
return p.detach(), history[-1]
configs = {
'SGD (lr=0.001)': lambda p: torch.optim.SGD(p, lr=0.001),
'SGD+momentum': lambda p: torch.optim.SGD(p, lr=0.001, momentum=0.9),
'Adam': lambda p: torch.optim.Adam(p, lr=0.01),
'AdamW (wd=0.01)': lambda p: torch.optim.AdamW(p, lr=0.01, weight_decay=0.01),
'RMSprop': lambda p: torch.optim.RMSprop(p, lr=0.01),
}
print(f"{'Optimizer':<22} | {'Final loss':>10} | {'x':>7} | {'y':>7}")
print("-" * 55)
for name, fn in configs.items():
p_final, final_loss = run_optimizer(fn)
print(f"{name:<22} | {final_loss:10.6f} | {p_final[0]:.5f} | {p_final[1]:.5f}")
print("\nOptimum: x=1.0, y=1.0, f=0")
Convexity & Saddle Points
import torch
# Visualize the loss landscape near a saddle point
# f(x,y) = x^2 - y^2 has a saddle at (0,0)
# The neural network loss has many such points
def saddle(x, y):
return x**2 - y**2
x_vals = torch.linspace(-2, 2, 20)
y_vals = torch.linspace(-2, 2, 20)
# Curvature along each axis
x_point = torch.tensor(0.01, requires_grad=True)
y_point = torch.tensor(0.01, requires_grad=True)
loss = saddle(x_point, y_point)
loss.backward()
print("Saddle point analysis at (0, 0):")
print(f" df/dx = {x_point.grad.item():.4f} (positive: gradient points away)")
print(f" df/dy = {y_point.grad.item():.4f} (negative: gradient points toward)")
print("\nKey concepts for neural network optimization:")
print(" Convex: Every local minimum is the global minimum")
print(" Non-convex: NN loss surfaces — many local minima and saddle points")
print(" Saddle point: Gradient = 0, but not a minimum (curvature mixed)")
print(" Flat region: Very small gradient — training stalls")
print(" Batch norm + skip connections help smooth the loss landscape")
loss.backward() computes Jacobians. The optimizer implements a gradient descent variant on a non-convex landscape. Understanding this math lets you diagnose training failures, choose appropriate regularisation, and debug gradient flow.