DBSCAN & Density-Based Clustering in PyTorch

Core Concepts

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) classifies every point as one of three types based on its neighborhood density:

                            
                            Three Point Types:
                            Core point: Has at least min_samples neighbors within distance eps (including itself)
Border point: Within eps of a core point but doesn’t have enough neighbors to be core itself
Noise point: Not within eps of any core point — labeled -1

DBSCAN Algorithm

flowchart TD
    A[For each unvisited point] --> B{Has >= min_samples\nneighbors in eps?}
    B -->|Yes: Core point| C[Start new cluster\nor expand existing]
    B -->|No| D{Within eps of\na core point?}
    C --> E[BFS: add all\ndensity-reachable points]
    D -->|Yes: Border| F[Assign to nearby cluster]
    D -->|No: Noise| G[Label as -1]

DBSCAN Implementation

import torch
from collections import deque


class DBSCAN:
    """
    Density-Based Spatial Clustering using PyTorch vectorized operations.
    Labels: -1 = noise, 0,1,2... = cluster IDs
    """

    def __init__(self, eps=0.5, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples
        self.labels_ = None
        self.n_clusters_ = 0

    def fit(self, X):
        X = X.float()
        n = len(X)

        # Precompute full distance matrix using cdist (GPU-compatible)
        dist_matrix = torch.cdist(X, X)  # (n, n)

        # Find neighbors: boolean adjacency matrix
        neighbors = dist_matrix <= self.eps  # (n, n)
        neighbor_counts = neighbors.sum(dim=1)  # Including self

        # Identify core points
        is_core = neighbor_counts >= self.min_samples  # (n,)

        self.labels_ = torch.full((n,), -1, dtype=torch.long)  # Start: all noise
        cluster_id = 0

        for i in range(n):
            if self.labels_[i] != -1 or not is_core[i]:
                continue  # Already visited or not a core point

            # BFS: expand cluster from core point i
            self.labels_[i] = cluster_id
            queue = deque([i])

            while queue:
                current = queue.popleft()
                # All neighbors of current point
                nbrs = neighbors[current].nonzero(as_tuple=True)[0]

                for nbr in nbrs.tolist():
                    if self.labels_[nbr] == -1:
                        self.labels_[nbr] = cluster_id
                        if is_core[nbr]:
                            queue.append(nbr)

            cluster_id += 1

        self.n_clusters_ = cluster_id
        return self


# Demo: circular clusters that DBSCAN handles, K-Means doesn't
import torch
import math

torch.manual_seed(42)

# Generate two concentric rings
def make_ring(n, radius, noise=0.1):
    angles = torch.linspace(0, 2 * math.pi, n)
    x = radius * torch.cos(angles) + torch.randn(n) * noise
    y = radius * torch.sin(angles) + torch.randn(n) * noise
    return torch.stack([x, y], dim=1)

inner = make_ring(100, 1.0, noise=0.05)
outer = make_ring(150, 2.5, noise=0.05)
X = torch.cat([inner, outer])

db = DBSCAN(eps=0.3, min_samples=5)
db.fit(X)

unique_labels = db.labels_.unique()
n_noise = (db.labels_ == -1).sum().item()
print(f"Found {db.n_clusters_} clusters")
print(f"Noise points: {n_noise}")
print(f"Cluster sizes: {[(db.labels_ == i).sum().item() for i in range(db.n_clusters_)]}")

Parameter Selection

The k-distance graph helps select eps: sort points by their distance to their k-th nearest neighbor. The “knee” of this curve is a good estimate for eps.

import torch

torch.manual_seed(42)
# Clustered data with some noise
X = torch.cat([
    torch.randn(80, 2) * 0.4 + torch.tensor([0., 0.]),
    torch.randn(80, 2) * 0.4 + torch.tensor([3., 0.]),
    torch.randn(80, 2) * 0.4 + torch.tensor([1.5, 2.5]),
    torch.randn(20, 2) * 4.0,  # Noise
])

# k-distance graph: for each point, find distance to k-th nearest neighbor
k = 5  # Same as min_samples
dist_matrix = torch.cdist(X, X)

# Mask self-distance (diagonal = 0)
dist_matrix.fill_diagonal_(float('inf'))

# k-th nearest neighbor distance for each point
k_dists, _ = dist_matrix.topk(k, dim=1, largest=False)
kth_dist = k_dists[:, -1]  # (n,) — distance to k-th nearest

# Sort for visualization
sorted_dists, _ = kth_dist.sort(descending=True)

# Find approximate "knee": where the curve starts to flatten
# (simplified: look for largest second derivative)
diffs = sorted_dists[:-1] - sorted_dists[1:]
knee_idx = diffs.argmax().item()
suggested_eps = sorted_dists[knee_idx].item()

print(f"Suggested eps (knee of k-distance graph): {suggested_eps:.3f}")
print(f"5th-nearest distance percentiles:")
for p in [10, 25, 50, 75, 90]:
    val = kth_dist.quantile(p / 100).item()
    print(f"  {p}th percentile: {val:.3f}")

import torch
from collections import deque
import math


def make_ring(n, radius, noise=0.05, seed=0):
    torch.manual_seed(seed)
    angles = torch.linspace(0, 2 * math.pi, n)
    x = radius * torch.cos(angles) + torch.randn(n) * noise
    y = radius * torch.sin(angles) + torch.randn(n) * noise
    return torch.stack([x, y], dim=1)


def dbscan(X, eps, min_samples):
    n = len(X)
    dist_matrix = torch.cdist(X.float(), X.float())
    neighbors = dist_matrix <= eps
    is_core = neighbors.sum(1) >= min_samples
    labels = torch.full((n,), -1, dtype=torch.long)
    cluster_id = 0
    for i in range(n):
        if labels[i] != -1 or not is_core[i]:
            continue
        labels[i] = cluster_id
        q = deque([i])
        while q:
            cur = q.popleft()
            for nbr in neighbors[cur].nonzero(as_tuple=True)[0].tolist():
                if labels[nbr] == -1:
                    labels[nbr] = cluster_id
                    if is_core[nbr]:
                        q.append(nbr)
        cluster_id += 1
    return labels, cluster_id


# Test different eps values
X = torch.cat([make_ring(100, 1.0), make_ring(150, 2.5)])

print("eps   | clusters | noise points")
for eps in [0.1, 0.2, 0.3, 0.5, 1.0]:
    labels, n_clusters = dbscan(X, eps=eps, min_samples=5)
    n_noise = (labels == -1).sum().item()
    print(f"{eps:.1f}   |    {n_clusters}     |     {n_noise:3d}")

DBSCAN vs K-Means

Comparison Algorithm Trade-offs

When to Choose Each

DBSCAN wins: Non-spherical shapes (rings, crescents, spirals); unknown number of clusters; datasets with meaningful outliers to flag
K-Means wins: Large datasets (DBSCAN is O(n²) without spatial indexing); globular clusters; when K is known; when speed matters
DBSCAN parameters: eps controls neighborhood radius; min_samples controls minimum cluster density; use the k-distance graph to choose eps
High dimensions: Both struggle — the “curse of dimensionality” makes distances near-uniform. Apply PCA first.

Trade-offs Scalability Outliers

Cookie Consent

Table of Contents

Core Concepts

DBSCAN Implementation

Parameter Selection

DBSCAN vs K-Means

When to Choose Each

Cookie Consent

DBSCAN & Density-Based Clustering in PyTorch

Table of Contents

Core Concepts

DBSCAN Implementation

Parameter Selection

DBSCAN vs K-Means

When to Choose Each

Related Articles in This Series

K-Means Clustering in PyTorch

Gaussian Mixture Models & EM Algorithm

PCA & Dimensionality Reduction