Core Concepts
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) classifies every point as one of three types based on its neighborhood density:
Three Point Types:
- Core point: Has at least
min_samplesneighbors within distanceeps(including itself) - Border point: Within
epsof a core point but doesn’t have enough neighbors to be core itself - Noise point: Not within
epsof any core point — labeled-1
DBSCAN Algorithm
flowchart TD
A[For each unvisited point] --> B{Has >= min_samples\nneighbors in eps?}
B -->|Yes: Core point| C[Start new cluster\nor expand existing]
B -->|No| D{Within eps of\na core point?}
C --> E[BFS: add all\ndensity-reachable points]
D -->|Yes: Border| F[Assign to nearby cluster]
D -->|No: Noise| G[Label as -1]
DBSCAN Implementation
import torch
from collections import deque
class DBSCAN:
"""
Density-Based Spatial Clustering using PyTorch vectorized operations.
Labels: -1 = noise, 0,1,2... = cluster IDs
"""
def __init__(self, eps=0.5, min_samples=5):
self.eps = eps
self.min_samples = min_samples
self.labels_ = None
self.n_clusters_ = 0
def fit(self, X):
X = X.float()
n = len(X)
# Precompute full distance matrix using cdist (GPU-compatible)
dist_matrix = torch.cdist(X, X) # (n, n)
# Find neighbors: boolean adjacency matrix
neighbors = dist_matrix <= self.eps # (n, n)
neighbor_counts = neighbors.sum(dim=1) # Including self
# Identify core points
is_core = neighbor_counts >= self.min_samples # (n,)
self.labels_ = torch.full((n,), -1, dtype=torch.long) # Start: all noise
cluster_id = 0
for i in range(n):
if self.labels_[i] != -1 or not is_core[i]:
continue # Already visited or not a core point
# BFS: expand cluster from core point i
self.labels_[i] = cluster_id
queue = deque([i])
while queue:
current = queue.popleft()
# All neighbors of current point
nbrs = neighbors[current].nonzero(as_tuple=True)[0]
for nbr in nbrs.tolist():
if self.labels_[nbr] == -1:
self.labels_[nbr] = cluster_id
if is_core[nbr]:
queue.append(nbr)
cluster_id += 1
self.n_clusters_ = cluster_id
return self
# Demo: circular clusters that DBSCAN handles, K-Means doesn't
import torch
import math
torch.manual_seed(42)
# Generate two concentric rings
def make_ring(n, radius, noise=0.1):
angles = torch.linspace(0, 2 * math.pi, n)
x = radius * torch.cos(angles) + torch.randn(n) * noise
y = radius * torch.sin(angles) + torch.randn(n) * noise
return torch.stack([x, y], dim=1)
inner = make_ring(100, 1.0, noise=0.05)
outer = make_ring(150, 2.5, noise=0.05)
X = torch.cat([inner, outer])
db = DBSCAN(eps=0.3, min_samples=5)
db.fit(X)
unique_labels = db.labels_.unique()
n_noise = (db.labels_ == -1).sum().item()
print(f"Found {db.n_clusters_} clusters")
print(f"Noise points: {n_noise}")
print(f"Cluster sizes: {[(db.labels_ == i).sum().item() for i in range(db.n_clusters_)]}")
Parameter Selection
The k-distance graph helps select eps: sort points by their distance to their k-th nearest neighbor. The “knee” of this curve is a good estimate for eps.
import torch
torch.manual_seed(42)
# Clustered data with some noise
X = torch.cat([
torch.randn(80, 2) * 0.4 + torch.tensor([0., 0.]),
torch.randn(80, 2) * 0.4 + torch.tensor([3., 0.]),
torch.randn(80, 2) * 0.4 + torch.tensor([1.5, 2.5]),
torch.randn(20, 2) * 4.0, # Noise
])
# k-distance graph: for each point, find distance to k-th nearest neighbor
k = 5 # Same as min_samples
dist_matrix = torch.cdist(X, X)
# Mask self-distance (diagonal = 0)
dist_matrix.fill_diagonal_(float('inf'))
# k-th nearest neighbor distance for each point
k_dists, _ = dist_matrix.topk(k, dim=1, largest=False)
kth_dist = k_dists[:, -1] # (n,) ā distance to k-th nearest
# Sort for visualization
sorted_dists, _ = kth_dist.sort(descending=True)
# Find approximate "knee": where the curve starts to flatten
# (simplified: look for largest second derivative)
diffs = sorted_dists[:-1] - sorted_dists[1:]
knee_idx = diffs.argmax().item()
suggested_eps = sorted_dists[knee_idx].item()
print(f"Suggested eps (knee of k-distance graph): {suggested_eps:.3f}")
print(f"5th-nearest distance percentiles:")
for p in [10, 25, 50, 75, 90]:
val = kth_dist.quantile(p / 100).item()
print(f" {p}th percentile: {val:.3f}")
import torch
from collections import deque
import math
def make_ring(n, radius, noise=0.05, seed=0):
torch.manual_seed(seed)
angles = torch.linspace(0, 2 * math.pi, n)
x = radius * torch.cos(angles) + torch.randn(n) * noise
y = radius * torch.sin(angles) + torch.randn(n) * noise
return torch.stack([x, y], dim=1)
def dbscan(X, eps, min_samples):
n = len(X)
dist_matrix = torch.cdist(X.float(), X.float())
neighbors = dist_matrix <= eps
is_core = neighbors.sum(1) >= min_samples
labels = torch.full((n,), -1, dtype=torch.long)
cluster_id = 0
for i in range(n):
if labels[i] != -1 or not is_core[i]:
continue
labels[i] = cluster_id
q = deque([i])
while q:
cur = q.popleft()
for nbr in neighbors[cur].nonzero(as_tuple=True)[0].tolist():
if labels[nbr] == -1:
labels[nbr] = cluster_id
if is_core[nbr]:
q.append(nbr)
cluster_id += 1
return labels, cluster_id
# Test different eps values
X = torch.cat([make_ring(100, 1.0), make_ring(150, 2.5)])
print("eps | clusters | noise points")
for eps in [0.1, 0.2, 0.3, 0.5, 1.0]:
labels, n_clusters = dbscan(X, eps=eps, min_samples=5)
n_noise = (labels == -1).sum().item()
print(f"{eps:.1f} | {n_clusters} | {n_noise:3d}")
DBSCAN vs K-Means
When to Choose Each
- DBSCAN wins: Non-spherical shapes (rings, crescents, spirals); unknown number of clusters; datasets with meaningful outliers to flag
- K-Means wins: Large datasets (DBSCAN is O(n²) without spatial indexing); globular clusters; when K is known; when speed matters
- DBSCAN parameters:
epscontrols neighborhood radius;min_samplescontrols minimum cluster density; use the k-distance graph to chooseeps - High dimensions: Both struggle ā the “curse of dimensionality” makes distances near-uniform. Apply PCA first.