Back to PyTorch Mastery Series

Mathematical Foundations for PyTorch

May 29, 2026 Wasil Zafar 35 min read

Probability, information theory, linear algebra, and optimization form the bedrock of every deep learning model. This reference article implements each concept with PyTorch tensors so you can connect the math to the code you write every day.

Table of Contents

  1. Probability Theory
  2. Information Theory
  3. Linear Algebra
  4. Calculus & Autograd
  5. Optimization
  6. Related Articles

Probability Theory

Probability theory gives us the language to talk about uncertainty. In deep learning it underpins maximum likelihood estimation, Bayesian networks, variational autoencoders, and every stochastic element of training.

Distributions in PyTorch

import torch
from torch.distributions import (
    Normal, Bernoulli, Categorical, MultivariateNormal, Beta, Dirichlet
)


# ----- Univariate Gaussian -----
mu, sigma = torch.tensor(2.0), torch.tensor(1.5)
dist = Normal(mu, sigma)

x = torch.linspace(-2, 6, 100)
log_probs = dist.log_prob(x)

sample = dist.sample((1000,))
print(f"Normal({mu:.1f}, {sigma:.1f}²):")
print(f"  Sample mean: {sample.mean():.4f} (expected: {mu:.1f})")
print(f"  Sample std:  {sample.std():.4f}  (expected: {sigma:.1f})")
print(f"  log p(2.0) = {dist.log_prob(torch.tensor(2.0)):.4f}")
print(f"  CDF at 2.0 = {dist.cdf(torch.tensor(2.0)):.4f}")
import torch
from torch.distributions import Categorical


# ----- Categorical distribution (softmax output) -----
logits = torch.tensor([2.0, 1.0, -0.5, 3.0])
probs  = logits.softmax(dim=-1)

cat = Categorical(probs=probs)
samples = cat.sample((10000,))

print("Categorical distribution:")
print(f"  Probs:        {probs.tolist()}")
print(f"  Sample freqs: {[(samples == k).float().mean().item():.4f for k in range(4)]}")
print(f"  Entropy:      {cat.entropy():.4f}")
print(f"  Max entropy:  {torch.log(torch.tensor(4.0)):.4f}  (uniform over 4 classes)")
import torch
from torch.distributions import MultivariateNormal


# ----- Multivariate Gaussian -----
mu    = torch.tensor([0.0, 0.0])
cov   = torch.tensor([[1.0, 0.8], [0.8, 1.0]])   # Correlated

mvn = MultivariateNormal(loc=mu, covariance_matrix=cov)
X = mvn.sample((500,))

print(f"MultivariateNormal:")
print(f"  Sample shape:    {X.shape}")
print(f"  Sample mean:     {X.mean(0).tolist()}")
print(f"  Sample cov[0,1]: {((X[:,0]-X[:,0].mean())*(X[:,1]-X[:,1].mean())).mean():.4f}")
print(f"                   (expected ≈ {cov[0,1]:.1f})")

Bayes' Rule

$$P(\theta \mid X) = \frac{P(X \mid \theta)\, P(\theta)}{P(X)}$$

The posterior $P(\theta|X)$ combines the likelihood $P(X|\theta)$ with the prior $P(\theta)$. Maximum Likelihood Estimation (MLE) maximises $P(X|\theta)$ — equivalent to Bayes with a flat prior.

import torch


def log_posterior(theta, X, sigma_likelihood=1.0, sigma_prior=1.0):
    """
    Unnormalized log posterior for a Gaussian likelihood and prior.

    Model: X_i ~ N(theta, sigma_likelihood^2)
           theta ~ N(0, sigma_prior^2)

    log p(theta|X) ∝ log p(X|theta) + log p(theta)
    """
    # Log likelihood: sum of Gaussian log probs
    log_lik = -0.5 * ((X - theta) ** 2).sum() / sigma_likelihood ** 2

    # Log prior: Gaussian N(0, sigma_prior)
    log_prior = -0.5 * (theta ** 2) / sigma_prior ** 2

    return log_lik + log_prior


# True theta = 3.0, observe 20 noisy samples
torch.manual_seed(42)
true_theta = torch.tensor(3.0)
X = true_theta + torch.randn(20)

# Grid search over theta values
thetas = torch.linspace(0, 6, 200)
log_posts = torch.stack([log_posterior(t, X) for t in thetas])

mle_theta = X.mean()
map_theta = thetas[log_posts.argmax()]

print(f"True theta: {true_theta.item():.2f}")
print(f"MLE (sample mean): {mle_theta.item():.4f}")
print(f"MAP (log-posterior peak): {map_theta.item():.4f}")
print(f"  MAP is shrunk toward 0 by the prior — this is L2 regularization!")
print(f"  With flat prior (sigma_prior→∞), MAP = MLE")

Information Theory

Entropy & Cross-Entropy

Entropy measures how uncertain a distribution is:

$$H(P) = -\sum_x P(x) \log P(x)$$

Cross-entropy — used as the standard classification loss — measures how well $Q$ approximates $P$:

$$H(P, Q) = -\sum_x P(x) \log Q(x)$$

import torch
import torch.nn.functional as F


def entropy(probs, eps=1e-10):
    """Shannon entropy of a probability distribution."""
    return -(probs * (probs + eps).log()).sum(dim=-1)


def cross_entropy_manual(p_true, q_pred_log, eps=1e-10):
    """H(P, Q) = -sum(P * log Q)."""
    return -(p_true * q_pred_log).sum(dim=-1)


# Binary classification example
probs_true = torch.tensor([
    [1.0, 0.0],   # Hard label: class 0
    [0.9, 0.1],   # Soft label: mostly class 0
    [0.5, 0.5],   # Maximum uncertainty
])

probs_pred = torch.tensor([
    [0.95, 0.05],   # Good prediction
    [0.6,  0.4 ],   # Uncertain prediction
    [0.3,  0.7 ],   # Wrong prediction
])

print("Entropy (measures label uncertainty):")
for i, p in enumerate(probs_true):
    print(f"  P={p.tolist()}: H = {entropy(p):.4f}")

print("\nCross-entropy loss H(P, Q):")
log_q = probs_pred.log()
for i in range(len(probs_true)):
    ce = cross_entropy_manual(probs_true[i], log_q[i])
    print(f"  P={probs_true[i].tolist()}, Q={probs_pred[i].tolist()}: H(P,Q) = {ce:.4f}")

# Verify matches PyTorch's F.cross_entropy (expects hard integer labels)
hard_labels = torch.argmax(probs_true, dim=1)
pt_ce = F.cross_entropy(probs_pred.log(), hard_labels, reduction='none')
print(f"\nPyTorch F.cross_entropy (hard labels): {pt_ce.tolist()}")

KL Divergence

KL divergence measures how much information is lost when $Q$ approximates $P$:

$$D_{KL}(P \| Q) = \sum_x P(x) \log \frac{P(x)}{Q(x)} = H(P, Q) - H(P)$$

import torch
import torch.nn.functional as F
from torch.distributions import Normal


# KL divergence between two Gaussians has a closed form
def kl_gaussian(mu1, sigma1, mu2, sigma2):
    """
    Analytical KL(N(mu1,sigma1²) || N(mu2,sigma2²)).
    Used in VAE: KL(q(z|x) || p(z)) where p(z) = N(0,1).
    """
    return (
        torch.log(sigma2 / sigma1)
        + (sigma1**2 + (mu1 - mu2)**2) / (2 * sigma2**2)
        - 0.5
    )


# VAE use-case: KL between learned posterior q and standard normal prior
mu_q    = torch.tensor([1.5, -0.5, 0.0])
sigma_q = torch.tensor([0.8,  1.2,  1.0])
mu_p    = torch.zeros(3)
sigma_p = torch.ones(3)

kl = kl_gaussian(mu_q, sigma_q, mu_p, sigma_p)
print("KL(q(z|x) || p(z)) per latent dimension:")
for i in range(3):
    print(f"  dim {i}: mu={mu_q[i]:.1f}, sigma={sigma_q[i]:.1f} → KL = {kl[i]:.4f}")
print(f"Total KL: {kl.sum():.4f}  (penalizes posterior moving away from N(0,1))")

# Verify with PyTorch distributions
d1 = Normal(mu_q, sigma_q)
d2 = Normal(mu_p, sigma_p)
kl_torch = torch.distributions.kl_divergence(d1, d2)
print(f"\nPyTorch kl_divergence: {kl_torch.tolist()}")
print(f"Match: {torch.allclose(kl, kl_torch, atol=1e-5)}")

Linear Algebra

Matrix Operations

import torch


# Core operations every deep learning practitioner needs
torch.manual_seed(42)

A = torch.randn(4, 3)
B = torch.randn(3, 5)

# Matrix multiplication (the fundamental operation in neural nets)
C = A @ B                          # Same as torch.matmul(A, B)
print(f"A ({A.shape}) @ B ({B.shape}) = C ({C.shape})")

# Batch matrix multiply (batch_size × seq_len × d_model patterns)
batch_A = torch.randn(8, 4, 3)
batch_B = torch.randn(8, 3, 5)
batch_C = batch_A @ batch_B
print(f"Batch: {batch_A.shape} @ {batch_B.shape} = {batch_C.shape}")

# Norms
x = torch.tensor([3.0, 4.0])
print(f"\nNorms of {x.tolist()}:")
print(f"  L1 (||x||_1): {x.norm(p=1):.4f}  (sum of abs values)")
print(f"  L2 (||x||_2): {x.norm(p=2):.4f}  (Euclidean length)")
print(f"  Linf:         {x.norm(p=float('inf')):.4f}  (max abs value)")

# Cosine similarity (attention, embedding comparisons)
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([2.0, 2.0, 1.0])
cos_sim = (a @ b) / (a.norm() * b.norm())
print(f"\nCos similarity: {cos_sim:.4f}")
print(f"Via F.cosine_similarity: {torch.nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item():.4f}")

SVD & Eigendecomposition

SVD decomposes any matrix $A = U \Sigma V^T$ where $U,V$ are orthogonal and $\Sigma$ is diagonal. This underlies PCA, low-rank approximation, and numerical stability analysis of neural network weight matrices.

import torch


torch.manual_seed(42)
A = torch.randn(6, 4)

# Singular Value Decomposition
U, S, Vh = torch.linalg.svd(A, full_matrices=False)
print(f"SVD: A ({A.shape})")
print(f"  U: {U.shape}, S: {S.shape}, Vh: {Vh.shape}")
print(f"  Singular values: {S.tolist()}")

# Verify reconstruction
A_reconstructed = U @ torch.diag(S) @ Vh
print(f"  Max reconstruction error: {(A - A_reconstructed).abs().max():.2e}")

# Low-rank approximation (keep only k singular values)
k = 2
A_low_rank = U[:, :k] @ torch.diag(S[:k]) @ Vh[:k, :]
print(f"\nRank-{k} approximation:")
print(f"  Frobenius error: {(A - A_low_rank).norm():.4f}")
print(f"  Original rank:  {torch.linalg.matrix_rank(A).item()}")

# Nuclear norm (sum of singular values) — used in matrix completion
print(f"  Nuclear norm: {S.sum():.4f}   (promotes low-rank solutions)")
import torch


# Eigendecomposition for symmetric matrices (covariance, Hessian, etc.)
torch.manual_seed(42)
X = torch.randn(100, 5)
cov = (X.T @ X) / (100 - 1)   # 5×5 covariance matrix

eigenvalues, eigenvectors = torch.linalg.eigh(cov)   # eigh: faster for symmetric
# Sorted ascending by torch.linalg.eigh
eigenvalues_desc = eigenvalues.flip(0)

print(f"Covariance matrix eigenvalues (descending): {eigenvalues_desc.tolist()}")
print(f"\nConnections to deep learning:")
print(f"  - PCA uses top-k eigenvectors of covariance (or SVD of data)")
print(f"  - Hessian eigenvalues indicate loss surface curvature")
print(f"  - Large eigenvalue ratio → ill-conditioned optimization")
print(f"  - Condition number: {eigenvalues_desc[0] / (eigenvalues_desc[-1] + 1e-10):.2f}")

Calculus & Autograd

Chain Rule

Every backpropagation pass is just the multivariable chain rule applied repeatedly through the computational graph:

$$\frac{\partial L}{\partial W_1} = \frac{\partial L}{\partial y} \cdot \frac{\partial y}{\partial h} \cdot \frac{\partial h}{\partial W_1}$$

import torch


# Explicitly trace the chain rule through a 2-layer network
torch.manual_seed(42)
x  = torch.randn(3, requires_grad=False)
W1 = torch.randn(4, 3, requires_grad=True)
W2 = torch.randn(2, 4, requires_grad=True)

# Forward pass
h  = torch.relu(W1 @ x)        # Hidden layer
y  = W2 @ h                    # Output
L  = y.pow(2).sum()            # Scalar loss

# Autograd backward
L.backward()

# Manual chain rule verification
with torch.no_grad():
    dL_dy  = 2 * y                              # dL/dy
    dL_dh  = W2.T @ dL_dy                       # dL/dh via chain rule
    dL_dW2 = dL_dy.unsqueeze(1) * h.unsqueeze(0)  # dL/dW2

    relu_mask = (W1 @ x > 0).float()            # ReLU derivative
    dL_dW1 = (dL_dh * relu_mask).unsqueeze(1) * x.unsqueeze(0)

print("Gradient comparison (autograd vs manual chain rule):")
print(f"  W2 max abs diff: {(W2.grad - dL_dW2).abs().max():.2e}")
print(f"  W1 max abs diff: {(W1.grad - dL_dW1).abs().max():.2e}")
print("  Both match to numerical precision. Autograd IS the chain rule.")

Jacobian & Hessian

import torch
from torch.func import jacrev, hessian


def f(W, x):
    """Simple linear + ReLU function."""
    return torch.relu(W @ x)


torch.manual_seed(42)
W = torch.randn(3, 4)
x = torch.randn(4)

# Jacobian of f w.r.t. W (3×12 matrix: d(output_i)/d(W_jk))
J = jacrev(f)(W, x)
print(f"Jacobian of f w.r.t. W: shape = {J.shape}")
print(f"  Outer dims = output shape {f(W,x).shape}")
print(f"  Inner dims = W shape {W.shape}")

# Scalar function for Hessian
def scalar_loss(w):
    return (w ** 2 - 2 * w + 1).sum()   # Quadratic bowl

w = torch.tensor([1.5, 0.5, 2.0])
H = hessian(scalar_loss)(w)
print(f"\nHessian of quadratic loss: shape = {H.shape}")
print(f"  Diagonal: {H.diagonal().tolist()}  (expected: [2, 2, 2] for d²/dw² of w²)")
print(f"  Hessian condition number measures optimization difficulty")

Optimization

Gradient Descent Variants

import torch


def rosenbrock(x, y):
    """
    Rosenbrock function: non-convex, curved valley.
    Minimum at (1, 1) with f=0.
    Classic optimization test function.
    """
    return (1 - x)**2 + 100 * (y - x**2)**2


# Compare optimizers on Rosenbrock
torch.manual_seed(42)
start = torch.tensor([-1.0, 1.0])

def run_optimizer(optimizer_fn, n_steps=500):
    p = start.clone().requires_grad_(True)
    opt = optimizer_fn([p])
    history = []
    for _ in range(n_steps):
        opt.zero_grad()
        loss = rosenbrock(p[0], p[1])
        loss.backward()
        opt.step()
        history.append(loss.item())
    return p.detach(), history[-1]

configs = {
    'SGD (lr=0.001)':      lambda p: torch.optim.SGD(p, lr=0.001),
    'SGD+momentum':        lambda p: torch.optim.SGD(p, lr=0.001, momentum=0.9),
    'Adam':                lambda p: torch.optim.Adam(p, lr=0.01),
    'AdamW (wd=0.01)':     lambda p: torch.optim.AdamW(p, lr=0.01, weight_decay=0.01),
    'RMSprop':             lambda p: torch.optim.RMSprop(p, lr=0.01),
}

print(f"{'Optimizer':<22} | {'Final loss':>10} | {'x':>7} | {'y':>7}")
print("-" * 55)
for name, fn in configs.items():
    p_final, final_loss = run_optimizer(fn)
    print(f"{name:<22} | {final_loss:10.6f} | {p_final[0]:.5f} | {p_final[1]:.5f}")
print("\nOptimum: x=1.0, y=1.0, f=0")

Convexity & Saddle Points

import torch


# Visualize the loss landscape near a saddle point
# f(x,y) = x^2 - y^2 has a saddle at (0,0)
# The neural network loss has many such points

def saddle(x, y):
    return x**2 - y**2


x_vals = torch.linspace(-2, 2, 20)
y_vals = torch.linspace(-2, 2, 20)

# Curvature along each axis
x_point = torch.tensor(0.01, requires_grad=True)
y_point = torch.tensor(0.01, requires_grad=True)

loss = saddle(x_point, y_point)
loss.backward()

print("Saddle point analysis at (0, 0):")
print(f"  df/dx = {x_point.grad.item():.4f}  (positive: gradient points away)")
print(f"  df/dy = {y_point.grad.item():.4f}  (negative: gradient points toward)")

print("\nKey concepts for neural network optimization:")
print("  Convex:      Every local minimum is the global minimum")
print("  Non-convex:  NN loss surfaces — many local minima and saddle points")
print("  Saddle point: Gradient = 0, but not a minimum (curvature mixed)")
print("  Flat region: Very small gradient — training stalls")
print("  Batch norm + skip connections help smooth the loss landscape")
Connecting math to practice: Every PyTorch training loop is rooted in these concepts. The forward pass is function composition (chain rule). loss.backward() computes Jacobians. The optimizer implements a gradient descent variant on a non-convex landscape. Understanding this math lets you diagnose training failures, choose appropriate regularisation, and debug gradient flow.