Historical Context
The perceptron is the ancestor of every neural network in existence today. Introduced by Frank Rosenblatt in 1958, it was inspired by Warren McCulloch and Walter Pitts’ 1943 mathematical model of a biological neuron. Understanding the perceptron — especially its famous failure — explains why deep learning is structured the way it is.
The `nn.Linear` layer in PyTorch is a generalization of the perceptron to multiple outputs. The transformation $y = Wx + b$ is exactly the perceptron's weighted sum with the threshold activation removed, so the output is continuous rather than $\pm 1$.
flowchart LR
A["McCulloch-Pitts\n(1943)\nBinary logic"] --> B["Perceptron\n(1958)\nLearnable weights"]
B --> C["MLP + Backprop\n(1986)\nHidden layers"]
C --> D["Deep Learning\n(2012+)\nGPU + Big Data"]
The Perceptron Algorithm
The perceptron computes a weighted sum of inputs and applies a step function:
$$\hat{y} = \text{sign}(w \cdot x + b) = \begin{cases} 1 & \text{if } w \cdot x + b \geq 0 \\ -1 & \text{otherwise} \end{cases}$$
The learning rule: when a prediction is wrong, nudge the weights in the direction that would have produced the correct output:
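$$w \leftarrow w + \eta \cdot (y - \hat{y}) \cdot x$$
With labels in $\{-1, +1\}$, the factor $(y - \hat{y})$ is $0$ for a correct prediction and $2y$ for a mistake, so the implementation below simply adds $\eta \cdot y \cdot x$ on every mistake (the factor of 2 is absorbed into $\eta$) and updates the bias the same way: $b \leftarrow b + \eta \cdot y$.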
import torch
class Perceptron:
    """Rosenblatt's original perceptron with step-function activation.

    The model is a weight vector of length ``n_features`` plus a scalar bias,
    both initialised to zero and updated in place by ``fit``.
    """

    def __init__(self, n_features, lr=0.01):
        """Create an untrained perceptron.

        Args:
            n_features: Number of input features (length of the weight vector).
            lr: Step size applied to every mistake-driven update.
        """
        self.weights = torch.zeros(n_features)
        self.bias = torch.tensor(0.0)
        self.lr = lr

    def predict(self, X):
        """Binary prediction: +1 or -1 using step function.

        Args:
            X: Tensor whose last dimension holds the features, e.g.
                ``(n_samples, n_features)``.

        Returns:
            Tensor of +1.0 / -1.0 values; a score of exactly 0 maps to +1.
        """
        linear = X @ self.weights + self.bias
        return torch.where(linear >= 0, torch.ones_like(linear), -torch.ones_like(linear))

    def fit(self, X, y, max_epochs=100):
        """
        Train using the perceptron update rule.
        y must be +1 or -1 (not 0/1).

        Args:
            X: Training inputs, one sample per row.
            y: Targets, one per sample, each +1 or -1.
            max_epochs: Maximum number of full passes over the data.

        Returns:
            List with the number of misclassified samples in each epoch that
            was run; the last entry is 0 when training converged.

        Raises:
            ValueError: If ``y`` contains anything other than +1 / -1.
        """
        X = X.float()
        y = y.float()
        # A 0 label can never equal a +/-1 prediction and produces a zero-sized
        # update, so 0/1 labels would silently spin for max_epochs. Fail fast.
        if not bool(((y == 1) | (y == -1)).all()):
            raise ValueError("Perceptron labels must be +1 or -1 (not 0/1)")
        history = []
        for epoch in range(max_epochs):
            errors = 0
            for xi, yi in zip(X, y):
                y_hat = self.predict(xi.unsqueeze(0)).squeeze()
                if y_hat != yi:
                    # Perceptron update rule: move the boundary toward
                    # classifying this sample correctly.
                    self.weights += self.lr * yi * xi
                    self.bias += self.lr * yi
                    errors += 1
            history.append(errors)
            if errors == 0:
                # An error-free pass means every sample is on the right side.
                print(f"Converged at epoch {epoch + 1}")
                break
        return history
# Demo: the AND gate is linearly separable, so the perceptron can learn it.
# Rows enumerate the truth table in the order (0,0), (0,1), (1,0), (1,1).
X_and = torch.tensor([
    [0., 0.],
    [0., 1.],
    [1., 0.],
    [1., 1.],
])
# Targets use -1/+1; only the (1, 1) row is positive.
y_and = torch.tensor([-1., -1., -1., 1.])

p = Perceptron(2, lr=0.1)
history = p.fit(X_and, y_and, 50)

# Report what was learned next to the targets.
print(f"Learned weights: {p.weights.tolist()}")
print(f"Learned bias: {p.bias.item():.3f}")
print(f"Predictions: {p.predict(X_and).tolist()}")
print(f"Expected: {y_and.tolist()}")
Perceptron Convergence Theorem
The perceptron convergence theorem guarantees that if the data is linearly separable, the algorithm will converge in a finite number of steps. The number of updates is bounded by:
$$\text{updates} \leq \left(\frac{R}{\gamma}\right)^2$$
where $R$ is the maximum norm of any input vector and $\gamma$ is the margin (distance from the decision boundary to the nearest point).
import torch
# Verify convergence theorem empirically
torch.manual_seed(42)
# Generate linearly separable data with known margin gamma
w_true = torch.tensor([1.0, 2.0])
b_true = -0.5
X = torch.randn(100, 2)
# Unnormalised signed score of every sample under the true separator.
scores = X @ w_true + b_true
y = torch.sign(scores)
# Replace zeros (rare, but possible)
y[y == 0] = 1.0
p = Perceptron(n_features=2, lr=0.1)
history = p.fit(X, y, max_epochs=1000)
# history holds per-epoch mistake counts and every mistake triggers one update.
total_updates = sum(history)
R = X.norm(dim=1).max().item()
# The perceptron also learns a bias, which is the same as learning a weight for
# a constant 1 appended to each input. The theorem is stated for that bias-free
# form, so both the radius and the margin are measured in the augmented space.
R_aug = (R ** 2 + 1.0) ** 0.5
separator_norm = (w_true.norm().item() ** 2 + b_true ** 2) ** 0.5
gamma = (y * scores).min().item() / separator_norm
print(f"Total updates made: {total_updates}")
print(f"Max input norm R: {R:.3f}")
if gamma > 0:
    bound = (R_aug / gamma) ** 2
    print(f"Margin gamma: {gamma:.6f}")
    print(f"Bound (R/gamma)^2: {bound:.1f}")
    print(f"Updates within bound: {total_updates <= bound}")
else:
    # A sample lies exactly on the true boundary, so this separator certifies
    # no positive margin and the bound cannot be evaluated.
    print("Bound (R/gamma)^2 — undefined: a sample lies on the true boundary (gamma = 0)")
Linear Separability
A dataset is linearly separable if there exists a hyperplane that perfectly divides the two classes. The perceptron can only solve linearly separable problems.
AND & OR Gates — Solvable
import torch
class Perceptron:
    """Minimal perceptron with a hard +1/-1 threshold, used for the logic-gate demos."""

    def __init__(self, n_features, lr=0.1):
        """Start from all-zero weights and a zero bias; ``lr`` scales each update."""
        self.lr = lr
        self.bias = torch.tensor(0.0)
        self.weights = torch.zeros(n_features)

    def predict(self, X):
        """Return +1 where ``X @ weights + bias`` is non-negative and -1 elsewhere."""
        scores = X @ self.weights + self.bias
        positive = torch.ones_like(scores)
        return torch.where(scores >= 0, positive, -positive)

    def fit(self, X, y, max_epochs=100):
        """Apply mistake-driven updates until an error-free pass or ``max_epochs``.

        Returns ``self`` so construction and training can be chained.
        """
        X = X.float()
        y = y.float()
        for _ in range(max_epochs):
            mistakes = 0
            for sample, target in zip(X, y):
                guess = torch.squeeze(self.predict(torch.unsqueeze(sample, 0)))
                if guess == target:
                    continue
                # Misclassified: shift weights and bias toward the target side.
                step = self.lr * target
                self.weights += step * sample
                self.bias += step
                mistakes += 1
            if not mistakes:
                break
        return self
# The four rows of the two-input truth table, shared by AND and OR.
X = torch.tensor([
    [0., 0.],
    [0., 1.],
    [1., 0.],
    [1., 1.],
])

# AND: the only positive row is (1, 1).
y_and = torch.tensor([-1., -1., -1., 1.])
p_and = Perceptron(2).fit(X, y_and)
and_solved = (p_and.predict(X) == y_and).all().item()
print("AND gate solved:", and_solved)

# OR: the only negative row is (0, 0).
y_or = torch.tensor([-1., 1., 1., 1.])
p_or = Perceptron(2).fit(X, y_or)
or_solved = (p_or.predict(X) == y_or).all().item()
print("OR gate solved: ", or_solved)

# NOT: a single input whose label is the opposite of its value.
X_not = torch.tensor([[0.], [1.]])
y_not = torch.tensor([1., -1.])
p_not = Perceptron(1).fit(X_not, y_not)
not_solved = (p_not.predict(X_not) == y_not).all().item()
print("NOT gate solved:", not_solved)
The XOR Failure
XOR returns 1 when inputs differ, 0 when they match. No single straight line can separate the two classes — this killed AI funding for a decade (the first “AI winter”).
import torch
class Perceptron:
    """Perceptron whose ``fit`` reports whether training reached an error-free pass."""

    def __init__(self, n_features, lr=0.1):
        """Initialise zero weights, a zero bias, and the update step size ``lr``."""
        self.lr = lr
        self.weights = torch.zeros(n_features)
        self.bias = torch.tensor(0.0)

    def predict(self, X):
        """Threshold the linear score: +1 when it is >= 0, otherwise -1."""
        activation = X @ self.weights + self.bias
        plus = torch.ones_like(activation)
        minus = -torch.ones_like(activation)
        return torch.where(activation >= 0, plus, minus)

    def _sweep(self, X, y):
        """Run one pass of mistake-driven updates; return the number of mistakes."""
        wrong = 0
        for row, label in zip(X, y):
            if self.predict(row.unsqueeze(0)).squeeze() != label:
                self.weights += self.lr * label * row
                self.bias += self.lr * label
                wrong += 1
        return wrong

    def fit(self, X, y, max_epochs=1000):
        """Train for at most ``max_epochs`` passes.

        Returns:
            True as soon as a pass makes no mistakes (converged), otherwise
            False once every allowed pass has been used.
        """
        X, y = X.float(), y.float()
        # any() stops at the first clean sweep, matching an early return.
        return any(self._sweep(X, y) == 0 for _ in range(max_epochs))
# XOR labels: +1 when the two inputs differ, -1 when they are equal.
X = torch.tensor([
    [0., 0.],
    [0., 1.],
    [1., 0.],
    [1., 1.],
])
y_xor = torch.tensor([-1., 1., 1., -1.])

p = Perceptron(n_features=2)
converged = p.fit(X, y_xor, 10000)

# Expected to print False: no single line separates the XOR classes.
print(f"Converged: {converged}")
print(f"Best predictions: {p.predict(X).tolist()}")
print(f"Expected: {y_xor.tolist()}")
Multi-Layer Perceptron: Solving XOR
Adding a hidden layer transforms the feature space, making previously non-separable data separable. This is the insight that unlocked deep learning.
import torch
import torch.nn as nn
# XOR truth table with 0/1 targets, as used with the BCE loss below.
X_xor = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y_xor = torch.tensor([0., 1., 1., 0.])

# MLP with one hidden layer: 2 inputs -> 4 hidden units -> 1 sigmoid output.
hidden_layer = nn.Linear(2, 4)
output_layer = nn.Linear(4, 1)
mlp = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Sigmoid())

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.1)

for epoch in range(500):
    # Clear old gradients, then do the forward pass, backward pass and update.
    optimizer.zero_grad()
    preds = mlp(X_xor).squeeze()
    loss = criterion(preds, y_xor)
    loss.backward()
    optimizer.step()

print(f"Final loss: {loss.item():.6f}")

# Evaluate without tracking gradients and threshold probabilities at 0.5.
with torch.no_grad():
    raw = mlp(X_xor).squeeze()
    binary = torch.gt(raw, 0.5).float()

print(f"MLP predictions: {binary.tolist()}")
print(f"Expected: {y_xor.tolist()}")
print(f"Solved XOR: {(binary == y_xor).all().item()}")
The PyTorch Connection
Every nn.Linear layer is a vectorized, differentiable perceptron. The only differences are:
Original vs Modern
- Activation: Perceptron uses step function (non-differentiable); modern networks use ReLU/sigmoid/tanh (differentiable)
- Learning: Perceptron rule is heuristic; backpropagation computes exact gradients via chain rule
- Multiple outputs: `nn.Linear(in, out)` is `out` perceptrons processing the same input simultaneously
- Batch processing: `nn.Linear` processes batches as a single matrix multiplication $Y = XW^T + b$
import torch
import torch.nn as nn
# Equivalence check: a hand-written perceptron and nn.Linear with the same parameters.
torch.manual_seed(42)
X = torch.randn(5, 3)  # 5 samples, 3 features

# Hand-written single-output perceptron: weighted sum plus bias, then sign.
w = torch.randn(3)
b = torch.randn(1)
manual_output = torch.sign(X @ w + b)
print(f"Manual perceptron output: {manual_output.tolist()}")

# The same neuron as an nn.Linear layer: overwrite its random initial
# parameters with w and b so both versions compute the same score.
linear = nn.Linear(3, 1, bias=True)
with torch.no_grad():
    linear.weight.copy_(w.view(1, 3))
    linear.bias.copy_(b)
    torch_output = torch.sign(linear(X)).squeeze()
print(f"nn.Linear output: {torch_output.tolist()}")

# A wider nn.Linear is several perceptrons reading the same 3 features.
multi = nn.Linear(3, 10)
out = multi(X)
print(f"Multi-neuron output shape: {out.shape}")  # (5, 10)
The Modern Neuron
import torch
import torch.nn as nn
# The complete modern neuron: linear + nonlinear activation
class ModernNeuron(nn.Module):
    """One linear unit followed by a differentiable activation function."""

    def __init__(self, in_features, activation='relu'):
        """Build the neuron.

        Args:
            in_features: Number of inputs feeding the single linear unit.
            activation: One of 'relu', 'sigmoid' or 'tanh'; any other key
                raises KeyError from the lookup.
        """
        super().__init__()
        self.linear = nn.Linear(in_features, 1)
        factories = {'relu': nn.ReLU, 'sigmoid': nn.Sigmoid, 'tanh': nn.Tanh}
        self.activation = factories[activation]()

    def forward(self, x):
        """Apply the linear unit, then the chosen activation."""
        pre_activation = self.linear(x)
        return self.activation(pre_activation)
# Compare: step (perceptron) vs differentiable activations
torch.manual_seed(42)
x_vals = torch.linspace(-3, 3, 50).unsqueeze(1)  # 50 points, 1 feature
weights_data = torch.ones(1, 1)
bias_data = torch.zeros(1)
# Weight 1 and bias 0 make the linear output equal to the input grid.
linear_out = x_vals @ weights_data.T + bias_data
# Step function (original perceptron) — non-differentiable
step_out = (linear_out >= 0).float()
# Sigmoid — smooth approximation of step
sigmoid_out = torch.sigmoid(linear_out)
# ReLU — modern preferred activation
relu_out = torch.relu(linear_out)
# Report the activations at one grid point just above zero. A 50-point grid on
# [-3, 3] has spacing 6/49, so index 25 is x ≈ 0.0612, not 0.1; print the
# actual x so the label matches the values shown.
probe_idx = 25
probe_x = x_vals[probe_idx].item()
print(f"Step at x={probe_x:.4f}: {step_out[probe_idx].item():.4f}")
print(f"Sigmoid at x={probe_x:.4f}: {sigmoid_out[probe_idx].item():.4f}")
print(f"ReLU at x={probe_x:.4f}: {relu_out[probe_idx].item():.4f}")