Bayes’ Theorem Foundations
At the heart of Bayesian classification lies a simple but profound equation that inverts the direction of probabilistic reasoning. Instead of asking “what data would we see given a class?” (likelihood), we ask “what class is most probable given the observed data?” (posterior).
$$P(C_k | x) = \frac{P(x | C_k) \cdot P(C_k)}{P(x)}$$
Prior, Likelihood & Posterior
Let’s implement the components of Bayes’ theorem with tensors:
import torch
# Simulate a 2-class problem with known distributions
torch.manual_seed(42)
# Class priors: P(C_0) = 0.6, P(C_1) = 0.4
priors = torch.tensor([0.6, 0.4])
# Class-conditional means and variances (1D for simplicity)
means = torch.tensor([2.0, 5.0]) # mu_0=2, mu_1=5
variances = torch.tensor([1.0, 1.5]) # sigma^2_0=1, sigma^2_1=1.5
# Compute Gaussian likelihood P(x | C_k) for a query point
x_query = torch.tensor(3.5)
# Gaussian PDF: P(x|C_k) = (1/sqrt(2*pi*sigma^2)) * exp(-(x-mu)^2 / (2*sigma^2))
likelihoods = (1.0 / torch.sqrt(2 * torch.pi * variances)) * \
torch.exp(-(x_query - means)**2 / (2 * variances))
# Posterior: P(C_k|x) ∝ P(x|C_k) * P(C_k)
unnormalized_posterior = likelihoods * priors
posterior = unnormalized_posterior / unnormalized_posterior.sum()
print(f"Query x = {x_query.item()}")
print(f"Likelihoods: P(x|C_0)={likelihoods[0]:.4f}, P(x|C_1)={likelihoods[1]:.4f}")
print(f"Posterior: P(C_0|x)={posterior[0]:.4f}, P(C_1|x)={posterior[1]:.4f}")
print(f"Predicted class: {posterior.argmax().item()}")
Naive Bayes Classifier
The “naive” assumption: all features are conditionally independent given the class. This reduces a multivariate density estimation problem to $d$ univariate problems:
$$P(x | C_k) = \prod_{i=1}^{d} P(x_i | C_k)$$
Gaussian Naive Bayes from Scratch
import torch
class GaussianNaiveBayes:
"""Gaussian Naive Bayes classifier in pure PyTorch."""
def __init__(self):
self.class_priors = None
self.means = None # (num_classes, num_features)
self.variances = None # (num_classes, num_features)
def fit(self, X, y):
"""Estimate class priors and per-feature Gaussian parameters."""
X = X.float()
classes = torch.unique(y)
num_classes = len(classes)
num_features = X.shape[1]
self.means = torch.zeros(num_classes, num_features)
self.variances = torch.zeros(num_classes, num_features)
self.class_priors = torch.zeros(num_classes)
for k in range(num_classes):
X_k = X[y == k]
self.class_priors[k] = len(X_k) / len(X)
self.means[k] = X_k.mean(dim=0)
self.variances[k] = X_k.var(dim=0) + 1e-9 # Smoothing
return self
def _log_likelihood(self, X):
"""Compute log P(x|C_k) for all classes (vectorized)."""
# X: (N, D), means: (K, D), variances: (K, D)
# Expand for broadcasting: X -> (N, 1, D), means -> (1, K, D)
X_exp = X.unsqueeze(1) # (N, 1, D)
mu = self.means.unsqueeze(0) # (1, K, D)
var = self.variances.unsqueeze(0) # (1, K, D)
# Log Gaussian: -0.5 * (log(2*pi*var) + (x-mu)^2/var)
log_probs = -0.5 * (torch.log(2 * torch.pi * var) + (X_exp - mu)**2 / var)
# Sum across features (independence assumption)
return log_probs.sum(dim=2) # (N, K)
def predict(self, X):
"""Predict class labels."""
X = X.float()
log_likelihood = self._log_likelihood(X) # (N, K)
log_prior = torch.log(self.class_priors) # (K,)
log_posterior = log_likelihood + log_prior # (N, K)
return log_posterior.argmax(dim=1)
def predict_proba(self, X):
"""Predict class probabilities."""
X = X.float()
log_likelihood = self._log_likelihood(X)
log_prior = torch.log(self.class_priors)
log_posterior = log_likelihood + log_prior
# Softmax to get proper probabilities
return torch.softmax(log_posterior, dim=1)
# Demo
torch.manual_seed(42)
X_train = torch.cat([
torch.randn(100, 4) + torch.tensor([2, 0, -1, 1]), # Class 0
torch.randn(100, 4) + torch.tensor([-2, 1, 1, -1]), # Class 1
torch.randn(100, 4) + torch.tensor([0, -2, 2, 0]), # Class 2
])
y_train = torch.cat([torch.full((100,), i) for i in range(3)]).long()
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
# Test
X_test = torch.tensor([[2.0, 0.0, -1.0, 1.0], [-2.0, 1.0, 1.0, -1.0], [0.0, 0.0, 0.0, 0.0]])
predictions = gnb.predict(X_test)
probabilities = gnb.predict_proba(X_test)
print(f"Predictions: {predictions.tolist()}")
print(f"Confidences: {probabilities.max(dim=1).values.tolist()}")
Generative vs Discriminative Models
This fundamental distinction shapes how we think about classification:
Generative vs Discriminative
Generative models (Naive Bayes, LDA, HMMs) learn $P(x|C_k)$ and $P(C_k)$ — they model how data is generated. Discriminative models (Logistic Regression, Neural Networks, SVMs) learn $P(C_k|x)$ directly — they only model the decision boundary.
import torch
import torch.nn as nn
# Compare: Generative (Naive Bayes) vs Discriminative (Logistic Regression)
torch.manual_seed(42)
# Create dataset
X_class0 = torch.randn(500, 2) + torch.tensor([1.0, 1.0])
X_class1 = torch.randn(500, 2) + torch.tensor([-1.0, -1.0])
X_train = torch.cat([X_class0, X_class1])
y_train = torch.cat([torch.zeros(500), torch.ones(500)]).long()
# Generative: Gaussian NB (already implemented above)
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
gnb_preds = gnb.predict(X_train)
gnb_acc = (gnb_preds == y_train).float().mean()
# Discriminative: Simple Logistic Regression
model = nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
for epoch in range(100):
logits = model(X_train)
loss = criterion(logits, y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
lr_preds = model(X_train).argmax(dim=1)
lr_acc = (lr_preds == y_train).float().mean()
print(f"Gaussian NB accuracy: {gnb_acc:.4f}")
print(f"Logistic Reg accuracy: {lr_acc:.4f}")
Linear Discriminant Analysis (LDA)
LDA is a generative classifier that assumes all classes share the same covariance matrix $\Sigma$. This shared covariance produces linear decision boundaries between classes.
$$\delta_k(x) = x^T \Sigma^{-1} \mu_k - \frac{1}{2} \mu_k^T \Sigma^{-1} \mu_k + \log P(C_k)$$
import torch
class LDAClassifier:
"""Linear Discriminant Analysis using shared covariance."""
def __init__(self):
self.means = None
self.shared_cov_inv = None
self.log_priors = None
def fit(self, X, y):
X = X.float()
classes = torch.unique(y)
num_classes = len(classes)
n, d = X.shape
self.means = torch.zeros(num_classes, d)
self.log_priors = torch.zeros(num_classes)
shared_cov = torch.zeros(d, d)
for k in range(num_classes):
X_k = X[y == k]
self.means[k] = X_k.mean(dim=0)
self.log_priors[k] = torch.log(torch.tensor(len(X_k) / n))
# Within-class scatter
diff = X_k - self.means[k]
shared_cov += diff.T @ diff
shared_cov /= (n - num_classes)
# Add regularization for numerical stability
shared_cov += 1e-6 * torch.eye(d)
self.shared_cov_inv = torch.linalg.inv(shared_cov)
return self
def predict(self, X):
X = X.float()
# Discriminant function for each class
# delta_k(x) = x @ Sigma_inv @ mu_k - 0.5 * mu_k @ Sigma_inv @ mu_k + log_prior_k
scores = X @ self.shared_cov_inv @ self.means.T # (N, K)
bias = -0.5 * (self.means @ self.shared_cov_inv * self.means).sum(dim=1) # (K,)
scores = scores + bias + self.log_priors
return scores.argmax(dim=1)
# Demo
torch.manual_seed(42)
X_train = torch.cat([
torch.randn(200, 2) @ torch.tensor([[1.0, 0.5], [0.5, 1.0]]) + torch.tensor([3, 0]),
torch.randn(200, 2) @ torch.tensor([[1.0, 0.5], [0.5, 1.0]]) + torch.tensor([-3, 0]),
torch.randn(200, 2) @ torch.tensor([[1.0, 0.5], [0.5, 1.0]]) + torch.tensor([0, 3]),
])
y_train = torch.cat([torch.full((200,), i) for i in range(3)]).long()
lda = LDAClassifier()
lda.fit(X_train, y_train)
preds = lda.predict(X_train)
accuracy = (preds == y_train).float().mean()
print(f"LDA training accuracy: {accuracy:.4f}")
Quadratic Discriminant Analysis (QDA)
QDA relaxes LDA’s shared covariance assumption — each class gets its own covariance matrix $\Sigma_k$. This produces quadratic (curved) decision boundaries:
$$\delta_k(x) = -\frac{1}{2} \log|\Sigma_k| - \frac{1}{2}(x - \mu_k)^T \Sigma_k^{-1} (x - \mu_k) + \log P(C_k)$$
import torch
class QDAClassifier:
"""Quadratic Discriminant Analysis with per-class covariance."""
def __init__(self):
self.means = None
self.cov_invs = None
self.log_dets = None
self.log_priors = None
def fit(self, X, y):
X = X.float()
classes = torch.unique(y)
num_classes = len(classes)
n, d = X.shape
self.means = torch.zeros(num_classes, d)
self.cov_invs = torch.zeros(num_classes, d, d)
self.log_dets = torch.zeros(num_classes)
self.log_priors = torch.zeros(num_classes)
for k in range(num_classes):
X_k = X[y == k]
self.means[k] = X_k.mean(dim=0)
self.log_priors[k] = torch.log(torch.tensor(len(X_k) / n))
diff = X_k - self.means[k]
cov_k = (diff.T @ diff) / (len(X_k) - 1) + 1e-6 * torch.eye(d)
self.cov_invs[k] = torch.linalg.inv(cov_k)
self.log_dets[k] = torch.linalg.slogdet(cov_k).logabsdet
return self
def predict(self, X):
X = X.float()
num_classes = self.means.shape[0]
n = X.shape[0]
scores = torch.zeros(n, num_classes)
for k in range(num_classes):
diff = X - self.means[k] # (N, D)
mahal = (diff @ self.cov_invs[k] * diff).sum(dim=1) # (N,)
scores[:, k] = -0.5 * self.log_dets[k] - 0.5 * mahal + self.log_priors[k]
return scores.argmax(dim=1)
# Demo: QDA with different class shapes
torch.manual_seed(42)
# Class 0: round cluster, Class 1: elongated cluster
X_round = torch.randn(200, 2) + torch.tensor([2, 2])
X_elongated = torch.randn(200, 2) @ torch.tensor([[3.0, 0.0], [0.0, 0.3]]) + torch.tensor([-2, -1])
X_train = torch.cat([X_round, X_elongated])
y_train = torch.cat([torch.zeros(200), torch.ones(200)]).long()
qda = QDAClassifier()
qda.fit(X_train, y_train)
preds = qda.predict(X_train)
print(f"QDA accuracy: {(preds == y_train).float().mean():.4f}")
MNIST with Bayes Classifiers
import torch
import torchvision
import torchvision.transforms as transforms
# Load MNIST
train_data = torchvision.datasets.MNIST(root='./data', train=True, download=True,
transform=transforms.ToTensor())
test_data = torchvision.datasets.MNIST(root='./data', train=False, download=True,
transform=transforms.ToTensor())
X_train = train_data.data.float().reshape(-1, 784) / 255.0
y_train = train_data.targets
X_test = test_data.data.float().reshape(-1, 784) / 255.0
y_test = test_data.targets
# Use subset for QDA (full 784D covariance is expensive)
# PCA first to reduce dimensionality for LDA/QDA
X_mean = X_train.mean(dim=0)
X_centered = X_train - X_mean
U, S, Vt = torch.linalg.svd(X_centered, full_matrices=False)
n_components = 50
X_train_pca = X_centered @ Vt[:n_components].T
X_test_pca = (X_test - X_mean) @ Vt[:n_components].T
print(f"Reduced shape: {X_train_pca.shape}")
# Gaussian Naive Bayes on full features
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
gnb_preds = gnb.predict(X_test)
gnb_acc = (gnb_preds == y_test).float().mean()
# LDA on PCA features
lda = LDAClassifier()
lda.fit(X_train_pca, y_train)
lda_preds = lda.predict(X_test_pca)
lda_acc = (lda_preds == y_test).float().mean()
print(f"Gaussian NB accuracy (784D): {gnb_acc:.4f}")
print(f"LDA accuracy (50D PCA): {lda_acc:.4f}")
Limitations & When to Use
Practical Tips
- Feature independence violation: Naive Bayes still works well even when features are correlated — it’s surprisingly robust despite the “naive” assumption
- Laplace smoothing: Add a small constant to variances to prevent zero-probability issues
- Log-space computation: Always work in log-space to avoid numerical underflow with many features
- Feature scaling: LDA/QDA are sensitive to feature scales — standardize first