Vanilla Autoencoder
An autoencoder learns to compress data into a low-dimensional latent representation and then reconstruct it. The network is forced to learn the most informative features because the bottleneck can’t store everything.
import torch
import torch.nn as nn
class Autoencoder(nn.Module):
    """Fully connected autoencoder whose decoder mirrors its encoder.

    Each half is a two-layer MLP with one ReLU hidden layer. The latent
    code is the raw output of the encoder's last linear layer (no
    activation is applied to the bottleneck).

    Args:
        input_dim: Number of features in each input vector.
        latent_dim: Size of the bottleneck representation.
        hidden_dim: Width of the hidden layer used in both halves.
    """

    def __init__(self, input_dim, latent_dim, hidden_dim=64):
        super().__init__()
        # Layers are created encoder-first so that seeded parameter
        # initialisation consumes the RNG in a fixed order.
        compress = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        ]
        expand = [
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.encoder = nn.Sequential(*compress)
        self.decoder = nn.Sequential(*expand)

    def encode(self, x):
        """Map inputs to their latent codes."""
        return self.encoder(x)

    def decode(self, z):
        """Map latent codes back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decode(self.encode(x))
# Fit the autoencoder to synthetic Gaussian data
torch.manual_seed(42)
X = torch.randn(1000, 20)
X[:, :5] *= 3  # scale the first five features by 3 so they have higher variance

ae = Autoencoder(input_dim=20, latent_dim=3, hidden_dim=32)
optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)
criterion = nn.MSELoss()  # built once and reused for every step

# Full-batch training: every step sees all 1000 samples.
for epoch in range(300):
    optimizer.zero_grad()
    recon = ae(X)
    loss = criterion(recon, X)
    loss.backward()
    optimizer.step()

# `loss` holds the value computed just before the last optimizer step.
print(f"Final reconstruction loss: {loss.item():.6f}")

# Inspect the latent codes produced for the training data
with torch.no_grad():
    z = ae.encode(X)
    print(f"Latent space shape: {z.shape}")
    print(f"Latent space mean: {z.mean(0).round(decimals=3).tolist()}")
    print(f"Latent space std: {z.std(0).round(decimals=3).tolist()}")
Bottleneck Effect
import torch
import torch.nn as nn
torch.manual_seed(42)
X = torch.randn(500, 20)
X[:, :5] *= 3  # first five features are scaled by 3

criterion = nn.MSELoss()  # built once and reused for every model

# Train one autoencoder per bottleneck width and report how well each
# reconstructs the data it was trained on.
for latent_dim in [1, 2, 5, 10, 20]:
    # NOTE: a ReLU follows the bottleneck layer here, so the latent code
    # that reaches the decoder is non-negative.
    layers = [
        nn.Linear(20, 32), nn.ReLU(),
        nn.Linear(32, latent_dim), nn.ReLU(),
        nn.Linear(latent_dim, 32), nn.ReLU(),
        nn.Linear(32, 20),
    ]
    ae = nn.Sequential(*layers)
    opt = torch.optim.Adam(ae.parameters(), lr=1e-3)

    for _ in range(300):
        opt.zero_grad()
        loss = criterion(ae(X), X)
        loss.backward()
        opt.step()

    # Re-evaluate after the last update, without tracking gradients.
    with torch.no_grad():
        final_loss = criterion(ae(X), X).item()
    print(f"Latent dim={latent_dim:3d}: reconstruction loss={final_loss:.4f}")
Denoising Autoencoder
Corrupt the inputs with noise, but train the network to reconstruct the clean originals. Because it can no longer simply copy its input, the autoencoder is forced to learn robust features rather than memorize the data.
import torch
import torch.nn as nn
class DenoisingAutoencoder(nn.Module):
    """Autoencoder that learns to remove noise from corrupted inputs.

    The encoder and decoder are two-layer MLPs with a fixed hidden width
    of 64. During the forward pass the input can be corrupted with
    Gaussian noise before it is encoded; the caller is expected to
    compare the output against the clean input.

    Args:
        input_dim: Number of features in each input vector.
        latent_dim: Size of the bottleneck representation.
        noise_std: Standard deviation of the Gaussian corruption.
    """

    def __init__(self, input_dim, latent_dim, noise_std=0.3):
        super().__init__()
        self.noise_std = noise_std
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(),
            nn.Linear(64, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64), nn.ReLU(),
            nn.Linear(64, input_dim),
        )

    def add_noise(self, x):
        """Return ``x`` plus zero-mean Gaussian noise with std ``noise_std``.

        This method always corrupts its input; whether it is called at
        all is decided in ``forward``.
        """
        return x + torch.randn_like(x) * self.noise_std

    def forward(self, x, training=None):
        """Reconstruct ``x``, optionally corrupting it first.

        Args:
            x: Input batch.
            training: ``True`` corrupts the input before encoding,
                ``False`` encodes it unchanged. When left as ``None``
                the module's own mode is used (``self.training``), so
                ``model.train()`` / ``model.eval()`` control the noise
                like they do for other mode-dependent modules.

        Returns:
            The decoder output, with the same shape as ``x``.
        """
        if training is None:
            # Follow nn.Module's train/eval switch. A freshly constructed
            # module is in training mode, so calling without the flag
            # still adds noise until eval() is called.
            training = self.training
        noisy = self.add_noise(x) if training else x
        z = self.encoder(noisy)
        return self.decoder(z)
torch.manual_seed(42)
X = torch.randn(800, 30)

dae = DenoisingAutoencoder(input_dim=30, latent_dim=5, noise_std=0.5)
optimizer = torch.optim.Adam(dae.parameters(), lr=1e-3)
criterion = nn.MSELoss()  # built once and reused below

for epoch in range(400):
    optimizer.zero_grad()
    # The model corrupts X internally; the target is the clean X.
    recon = dae(X, training=True)
    loss = criterion(recon, X)
    loss.backward()
    optimizer.step()

print(f"Denoising loss: {loss.item():.6f}")

# Evaluate on freshly corrupted data. Noise is added here by hand, so the
# model is called with training=False to avoid corrupting it a second time.
with torch.no_grad():
    X_noisy = X + torch.randn_like(X) * 0.5
    X_denoised = dae(X_noisy, training=False)
    # Error of the corrupted input vs. error after denoising, both
    # measured against the clean data.
    noise_mse = criterion(X_noisy, X).item()
    denoised_mse = criterion(X_denoised, X).item()
    print(f"Noisy input MSE vs clean: {noise_mse:.4f}")
    print(f"Denoised output MSE vs clean: {denoised_mse:.4f}")
Anomaly Detection
Autoencoders trained on normal data will reconstruct normal samples well but fail on anomalies. High reconstruction error = anomaly signal.
import torch
import torch.nn as nn
torch.manual_seed(42)

# Training set consists solely of "normal" samples (standard Gaussian).
X_normal = torch.randn(800, 10)

# Autoencoder fitted to the normal data only
ae = nn.Sequential(*[
    nn.Linear(10, 16), nn.ReLU(),
    nn.Linear(16, 3), nn.ReLU(),
    nn.Linear(3, 16), nn.ReLU(),
    nn.Linear(16, 10),
])
opt = torch.optim.Adam(ae.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for _ in range(500):
    opt.zero_grad()
    loss = criterion(ae(X_normal), X_normal)
    loss.backward()
    opt.step()

# Held-out evaluation: fresh normal samples plus shifted, widened outliers
X_test_normal = torch.randn(100, 10)
X_test_anomaly = torch.randn(100, 10) * 5 + 10  # mean 10, std 5

with torch.no_grad():
    recon_normal = ae(X_test_normal)
    recon_anomaly = ae(X_test_anomaly)

    # One squared-error score per sample (averaged over the 10 features)
    error_normal = (X_test_normal - recon_normal).pow(2).mean(dim=1)
    error_anomaly = (X_test_anomaly - recon_anomaly).pow(2).mean(dim=1)

    print(f"Normal samples — mean error: {error_normal.mean():.4f}, std: {error_normal.std():.4f}")
    print(f"Anomaly samples — mean error: {error_anomaly.mean():.4f}, std: {error_anomaly.std():.4f}")

    # Flag a sample when its error exceeds mean + 3 std of the errors
    # measured on the normal test samples.
    threshold = error_normal.mean() + 3 * error_normal.std()
    detected = (error_anomaly > threshold).float().mean().item()
    print(f"\nThreshold: {threshold:.4f}")
    print(f"Anomaly detection rate: {detected*100:.1f}%")
Variational Autoencoder (VAE)
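The VAE encodes inputs as distributions $q(z|x) = \mathcal{N}(\mu, \sigma^2)$ rather than point estimates. The evidence lower bound (ELBO) combines reconstruction quality with a KL divergence term that regularizes the latent space toward a standard normal prior $p(z) = \mathcal{N}(0, I)$. Training maximizes the ELBO, so the loss minimized in the code below is its negative: reconstruction error plus KL divergence.
$$\mathcal{L}_{\text{ELBO}} = \mathbb{E}_{q(z|x)}[\log p(x|z)] - D_{KL}(q(z|x) \,\|\, p(z))$$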
import torch
import torch.nn as nn
import torch.nn.functional as F
class VAE(nn.Module):
    """Variational autoencoder using the reparameterization trick.

    A shared one-layer encoder trunk feeds two linear heads that predict
    the mean and the log-variance of the latent Gaussian. The decoder is
    a two-layer MLP.

    Args:
        input_dim: Number of features in each input vector.
        latent_dim: Dimensionality of the latent variable.
        hidden_dim: Width of the hidden layer in encoder and decoder.
    """

    def __init__(self, input_dim, latent_dim, hidden_dim=64):
        super().__init__()
        # Shared trunk followed by separate heads for mu and log-variance
        self.encoder_base = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        # Decoder maps a latent sample back to input space
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        hidden = self.encoder_base(x)
        mu = self.fc_mu(hidden)
        logvar = self.fc_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, 1)``.

        Outside training mode no sampling is done and ``mu`` is returned
        unchanged.
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)  # log-variance -> standard deviation
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        recon = self.decode(self.reparameterize(mu, logvar))
        return recon, mu, logvar
def vae_loss(recon, x, mu, logvar, beta=1.0):
    """Per-sample VAE training loss (the negative ELBO, to be minimized).

    The loss is the summed squared reconstruction error plus ``beta``
    times the KL divergence between N(mu, exp(logvar)) and N(0, 1), both
    summed over all elements and then divided by the batch size
    ``x.shape[0]``.

    Args:
        recon: Reconstructed batch.
        x: Target batch.
        mu: Posterior means.
        logvar: Posterior log-variances.
        beta: Weight on the KL term. 1 gives the standard VAE; values
            above 1 give a beta-VAE (used for disentanglement).

    Returns:
        Scalar tensor holding the loss.
    """
    batch_size = x.shape[0]
    squared_error = F.mse_loss(recon, x, reduction='sum')
    # Closed-form KL for diagonal Gaussians:
    # -0.5 * sum(1 + logvar - mu^2 - exp(logvar))
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * kl_terms.sum()
    return (squared_error + beta * kl_divergence) / batch_size
torch.manual_seed(42)
X = torch.randn(1000, 20)

vae = VAE(input_dim=20, latent_dim=3, hidden_dim=32)
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)

# Full-batch training on the negative ELBO with the standard weighting.
for epoch in range(500):
    optimizer.zero_grad()
    recon, mu, logvar = vae(X)
    loss = vae_loss(recon, X, mu, logvar, beta=1.0)
    loss.backward()
    optimizer.step()

print(f"VAE ELBO loss: {loss.item():.4f}")

# Check how close the encoder's posterior is to the N(0, 1) prior that
# the KL term pulls it toward.
with torch.no_grad():
    mu_out, logvar_out = vae.encode(X)
    print(f"Latent mu mean: {mu_out.mean().item():.4f} (should be ~0)")
    print(f"Latent sigma mean: {logvar_out.exp().sqrt().mean().item():.4f} (should be ~1)")
Latent Space Interpolation
import torch
import torch.nn as nn
import torch.nn.functional as F
class VAE(nn.Module):
    """Compact variational autoencoder used for the interpolation demo.

    Args:
        input_dim: Number of features in each input vector.
        latent_dim: Dimensionality of the latent variable.
        hidden_dim: Width of the hidden layer in encoder and decoder.
    """

    def __init__(self, input_dim, latent_dim, hidden_dim=32):
        super().__init__()
        # Shared encoder trunk with separate mean / log-variance heads
        self.enc_base = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        hidden = self.enc_base(x)
        return self.fc_mu(hidden), self.fc_logvar(hidden)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` in training mode; otherwise return ``mu``."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            return mu + torch.randn_like(std) * std
        return mu

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decoder(latent), mu, logvar
torch.manual_seed(42)
X = torch.randn(1000, 20)

vae = VAE(20, 3, 32)
opt = torch.optim.Adam(vae.parameters(), lr=1e-3)

for _ in range(500):
    opt.zero_grad()
    recon, mu, logvar = vae(X)
    # Summed squared error plus the closed-form KL term, per sample
    recon_term = F.mse_loss(recon, X, reduction='sum')
    kl_term = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    loss = (recon_term + kl_term) / len(X)
    loss.backward()
    opt.step()

# Walk along the straight line between two samples' latent means and
# decode each intermediate point.
with torch.no_grad():
    mu_a, _ = vae.encode(X[0:1])      # latent mean of sample A
    mu_b, _ = vae.encode(X[100:101])  # latent mean of sample B
    print("Interpolating A -> B in latent space:")
    for alpha in [0.0, 0.25, 0.5, 0.75, 1.0]:
        # alpha=0 gives A's latent mean, alpha=1 gives B's
        z_interp = (1 - alpha) * mu_a + alpha * mu_b
        x_interp = vae.decoder(z_interp)
        print(f" alpha={alpha:.2f}: decoded norm={x_interp.norm().item():.3f}")