Complete Workflow Examples with Datasets
Now that you've learned preprocessing, classification, regression, clustering, evaluation, pipelines, and hyperparameter tuning, let's see how everything fits together. This section demonstrates complete end-to-end ML workflows using Scikit-learn's built-in datasets.
What You'll See: Each example below walks through the entire process—from loading data and exploration, through preprocessing and model selection, to evaluation and visualization. These are realistic workflows you can adapt for your own projects.
Datasets Covered
- Classification: Iris, Wine, Digits, Breast Cancer—demonstrating Logistic Regression, Random Forest, SVM, and evaluation
- Regression: Diabetes, California Housing—demonstrating Linear Regression, Ridge, feature importance
- Complete Pipeline: Every example shows data splitting, preprocessing, training, evaluation, and visualization
1. Iris Dataset (Multi-class Classification)
About Iris: Classic dataset with 150 samples of iris flowers. Features include sepal length, sepal width, petal length, and petal width. Target: 3 species (setosa, versicolor, virginica). Perfect for learning classification.
# Import necessary libraries
import numpy as np # For numerical operations
import pandas as pd # For data manipulation
import matplotlib.pyplot as plt # For plotting
from sklearn.datasets import load_iris # Load built-in Iris dataset
from sklearn.model_selection import train_test_split, cross_val_score # For data splitting and validation
from sklearn.preprocessing import StandardScaler # For feature scaling
from sklearn.linear_model import LogisticRegression # Linear classifier
from sklearn.ensemble import RandomForestClassifier # Tree-based ensemble classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Evaluation metrics
import seaborn as sns # For advanced visualization
# 1. LOAD DATA
# load_iris() hands back a dict-like Bunch. The fields used below are:
#   data          -> feature matrix
#   target        -> integer class label for every sample
#   feature_names -> column names of the feature matrix
#   target_names  -> species name for each integer label
iris = load_iris()
X = iris.data    # features
y = iris.target  # labels

# Quick sanity check of what was loaded
print(f"Dataset shape: {X.shape}")
print(f"Feature names: {iris.feature_names}")
print(f"Target names: {iris.target_names}")
# np.bincount tallies how many samples carry each integer label
print(f"Sample distribution: {np.bincount(y)}")
# Import libraries for data exploration
import pandas as pd # For DataFrame operations
import matplotlib.pyplot as plt # For visualization
import seaborn as sns # For enhanced plots
from sklearn.datasets import load_iris # Load dataset
# 2. EXPLORE DATA
iris = load_iris()
X = iris.data
y = iris.target

# Tabular view of the features, one named column per measurement
df = pd.DataFrame(X, columns=iris.feature_names)
# Indexing target_names with the label array turns 0/1/2 into species names
df['species'] = iris.target_names[y]

print(df.head())                     # first 5 rows
print(df.describe())                 # summary statistics of the numeric columns
print(df['species'].value_counts())  # number of samples per species

# One histogram panel per feature; within a panel the three species are
# drawn side by side so their value ranges can be compared.
plt.figure(figsize=(12, 4))
for i in range(4):
    plt.subplot(1, 4, i + 1)  # panel i+1 in a 1 x 4 grid
    # Boolean mask y == label selects the rows of one species
    per_species = [X[y == label, i] for label in (0, 1, 2)]
    plt.hist(per_species, label=iris.target_names, alpha=0.7)
    plt.xlabel(iris.feature_names[i])
    plt.ylabel('Frequency')
    plt.legend()
plt.tight_layout()  # keep panels from overlapping
plt.show()
# Import required libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split # For splitting data
from sklearn.preprocessing import StandardScaler # For feature normalization
from sklearn.linear_model import LogisticRegression # Linear classification model
from sklearn.ensemble import RandomForestClassifier # Ensemble tree model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# 3. SPLIT DATA into training and testing sets
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing. The fixed random_state makes the
# split repeatable; stratify=y keeps the species proportions the same in the
# training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# 4. PREPROCESS: standardize features (zero mean, unit variance)
# The scaler statistics are learned from the training partition only and
# then reused for the test partition, so no test information leaks into
# preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. TRAIN MODELS on the training data
# Logistic Regression is fitted on the standardized features
# (fit() returns the estimator itself, so the call can be chained).
log_reg = LogisticRegression(max_iter=200, random_state=42).fit(
    X_train_scaled, y_train
)

# Random Forest with 100 trees is fitted on the raw features: tree splits
# depend only on feature ordering, so scaling is unnecessary.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# 6. EVALUATE models on the held-out test data
# Each model must see the test data in the same representation it was
# trained on (scaled for Logistic Regression, raw for Random Forest).
y_pred_lr = log_reg.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test)

# Overall fraction of correct predictions
print(f"\nLogistic Regression Accuracy: {accuracy_score(y_test, y_pred_lr):.3f}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")

# Per-class precision / recall / F1 for the Logistic Regression model
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, target_names=iris.target_names))
import matplotlib.pyplot as plt
import seaborn as sns # Advanced visualization library built on matplotlib
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix # For error analysis
# 7. VISUALIZE RESULTS with a confusion matrix
iris = load_iris()
X, y = iris.data, iris.target

# Use exactly the same split arguments as the training/evaluation step:
# test_size, random_state AND stratify=y. Dropping stratify=y yields a
# different train/test partition even with the same random_state, so the
# matrix would not describe the model that was evaluated before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train Random Forest model and predict the held-out samples
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# confusion_matrix(): rows = actual classes, columns = predicted classes.
# Diagonal cells count correct predictions, off-diagonal cells count errors.
cm = confusion_matrix(y_test, y_pred_rf)

# Draw the matrix as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,      # print the count inside each cell
    fmt='d',         # counts are integers
    cmap='Blues',    # darker cell = larger count
    xticklabels=iris.target_names,  # columns: predicted species
    yticklabels=iris.target_names,  # rows: actual species
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Iris Classification Confusion Matrix')
plt.show()

# How to read: if cell (row=setosa, column=versicolor) is 2, then 2 setosa
# samples were incorrectly classified as versicolor.
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# 8. FEATURE IMPORTANCE - Which features are most useful for predictions?
iris = load_iris()
X, y = iris.data, iris.target

# Same split arguments as the other Iris steps (including stratify=y) so the
# importances come from a model trained on the same training partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Tree ensembles expose impurity-based importances after fitting
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# feature_importances_: one score per feature; higher = larger contribution
importances = rf.feature_importances_

# argsort() sorts ascending; reversing puts the most important feature first
indices = np.argsort(importances)[::-1]
ranked_names = [iris.feature_names[i] for i in indices]
positions = range(len(indices))

# Bar chart with features ordered from most to least important
plt.figure(figsize=(10, 6))
plt.bar(positions, importances[indices])
plt.xticks(positions, ranked_names, rotation=45)  # rotated for readability
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importance for Iris Classification')
plt.tight_layout()  # prevent rotated labels from being cut off
plt.show()

# Print ranked list of features with importance scores
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {iris.feature_names[idx]}: {importances[idx]:.3f}")
2. Wine Dataset (Multi-class Classification)
About Wine: Chemical analysis of 178 wine samples from Italy. 13 features (alcohol, acidity, phenols, etc.). Target: 3 wine types. Great for classification with multiple continuous features.
# Import necessary libraries
from sklearn.datasets import load_wine # Wine quality dataset
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler # For feature scaling
from sklearn.svm import SVC # Support Vector Machine classifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd # For data manipulation
import numpy as np # For numerical operations
# 1. LOAD & EXPLORE
# load_wine() returns a Bunch with the feature matrix, the integer class
# labels and the corresponding feature/class names.
wine = load_wine()
X = wine.data    # chemical measurements, one row per wine sample
y = wine.target  # integer wine class per sample

print(f"Dataset shape: {X.shape}")
print(f"Features: {len(wine.feature_names)}")
print(f"Classes: {wine.target_names}")
# Samples per class, to spot any class imbalance
print(f"Class distribution: {np.bincount(y)}")

# DataFrame view: named feature columns plus the target as a last column
df_wine = pd.DataFrame(X, columns=wine.feature_names).assign(wine_type=y)

print("\nFirst few rows:")
print(df_wine.head())
# describe() makes the very different value ranges of the features visible,
# which is why scaling matters for scale-sensitive models.
print("\nStatistics:")
print(df_wine.describe())
# Import required libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # For feature normalization
from sklearn.svm import SVC # Support Vector Machine
from sklearn.ensemble import GradientBoostingClassifier # Boosting ensemble
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
# 2. SPLIT & PREPROCESS data
wine = load_wine()
X = wine.data
y = wine.target

# 25% of the samples are held out for testing; the fixed seed makes the
# split repeatable and stratify=y preserves class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Standardize features. Statistics come from the training data only and are
# reused unchanged for the test data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. TRAIN MULTIPLE MODELS for comparison
# SVM with an RBF kernel (non-linear decision boundaries).
#   C=10          -> C is the INVERSE of regularization strength, so a larger
#                    C means weaker regularization and a tighter fit
#   gamma='scale' -> kernel coefficient 1 / (n_features * X.var())
# See: https://scikit-learn.org/stable/modules/svm.html
svm = SVC(kernel='rbf', C=10, gamma='scale', random_state=42).fit(
    X_train_scaled, y_train
)

# Gradient Boosting: 100 trees built sequentially, each one correcting the
# errors of the ensemble so far. Trained on the unscaled features.
# See: https://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# 4. EVALUATE both models on the test set
# Each model gets the test data in the representation it was trained on.
y_pred_svm = svm.predict(X_test_scaled)
y_pred_gb = gb.predict(X_test)

print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.3f}")
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred_gb):.3f}")

# Per-class precision, recall and F1 for the boosting model
print("\nGradient Boosting Report:")
print(classification_report(y_test, y_pred_gb, target_names=wine.target_names))
# Import libraries for cross-validation workflow
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score # For k-fold cross-validation
from sklearn.preprocessing import StandardScaler # Feature scaling
from sklearn.svm import SVC # Classifier
from sklearn.pipeline import Pipeline # Chain preprocessing + model together
import numpy as np
# 5. CROSS-VALIDATION with Pipeline
# Wrapping scaler + classifier in a Pipeline means the scaler is re-fitted
# on the training portion of every fold, so the held-out fold never
# influences preprocessing (no data leakage).
wine = load_wine()
X = wine.data
y = wine.target

# Data flows through the steps in order: scaler -> SVM
# See: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel='rbf', C=10, random_state=42)),
    ]
)

# 5-fold cross-validation: train on 4 folds, score on the remaining one,
# repeated so every fold is the test fold once -> 5 accuracy values.
# See: https://scikit-learn.org/stable/modules/cross_validation.html
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

print(f"Cross-validation scores: {scores}")
# Mean with +/- two standard deviations as a rough indicator of the spread
# across folds
print(f"Mean accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
3. Digits Dataset (Image Classification)
About Digits: 1,797 images of handwritten digits (0-9), each 8x8 pixels (64 features). Perfect for learning image classification and dimensionality reduction techniques.
# Import libraries for visualization and data loading
import matplotlib.pyplot as plt # For plotting
from sklearn.datasets import load_digits # Handwritten digits dataset
import numpy as np # For numerical operations
# 1. LOAD & VISUALIZE handwritten digits
# load_digits() provides every image twice: flattened in .data (one row of
# pixel intensities per image) and as a 2D array in .images.
digits = load_digits()
X = digits.data    # flattened pixel intensities
y = digits.target  # digit label per image

print(f"Dataset shape: {X.shape}")
print(f"Image shape: {digits.images.shape}")
print("Classes: 0-9 (10 classes)")
print(f"Samples per class: {np.bincount(y)}")

# Show the first ten images in a 2 x 5 grid together with their labels
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.ravel()):  # walk the grid row by row
    ax.imshow(digits.images[i], cmap='gray')  # 2D array rendered in grayscale
    ax.set_title(f"Label: {digits.target[i]}")
    ax.axis('off')  # ticks add nothing for tiny images
plt.tight_layout()
plt.show()
# Import libraries for neural network training
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # For feature normalization
from sklearn.neural_network import MLPClassifier # Multi-Layer Perceptron (neural network)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# 2. SPLIT & TRAIN with neural network
digits = load_digits()
X = digits.data
y = digits.target

# 80/20 split, repeatable via random_state, digit proportions preserved in
# both partitions via stratify=y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the pixel features; statistics are learned from the training
# data only and then applied to the test data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multi-Layer Perceptron with two hidden layers (100 units, then 50 units)
# and an iteration cap of 500.
# See: https://scikit-learn.org/stable/modules/neural_networks_supervised.html
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train_scaled, y_train)

# Predict the held-out images (scaled the same way as the training data)
y_pred = mlp.predict(X_test_scaled)

# Overall accuracy followed by per-digit precision / recall / F1
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns # For advanced heatmaps
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix # For error pattern analysis
# 3. CONFUSION MATRIX VISUALIZATION - See where model makes mistakes
digits = load_digits()
X, y = digits.data, digits.target

# Reproduce the train/test split of the training example. stratify=y is part
# of that split: without it the partition differs even though random_state
# is identical, and the matrix would belong to a different model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale with statistics learned from the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train neural network and predict the held-out images
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)

# confusion_matrix(): rows = true digit, columns = predicted digit.
# Diagonal = correct predictions, off-diagonal = errors.
cm = confusion_matrix(y_test, y_pred)

# Visualize as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,    # show the count in each cell
    fmt='d',       # counts are integers
    cmap='Blues',  # darker = more samples
)
plt.xlabel('Predicted Digit')
plt.ylabel('True Digit')
plt.title('Digit Classification Confusion Matrix')
plt.show()

# How to read: if cell (row=8, column=3) is 5, then 5 images of digit 8 were
# incorrectly classified as digit 3.
# Import visualization libraries
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import numpy as np
# 4. VISUALIZE PREDICTIONS - See model's predictions on actual images
digits = load_digits()
X, y = digits.data, digits.target

# Reproduce the train/test split of the training example, including
# stratify=y; omitting it changes the partition despite the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale with statistics learned from the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train neural network and predict all test samples
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)

# Show the first 18 test images (3 rows x 6 columns) with their predictions
fig, axes = plt.subplots(3, 6, figsize=(15, 8))
for i, ax in enumerate(axes.flat):
    # Display the UNSCALED pixels; each row is a flattened 8x8 image
    ax.imshow(X_test[i].reshape(8, 8), cmap='gray')
    ax.axis('off')
    # Set the title once; misclassified images get a red, bold title so they
    # stand out in the grid.
    is_wrong = y_test[i] != y_pred[i]
    title_style = {'color': 'red', 'fontweight': 'bold'} if is_wrong else {}
    ax.set_title(f"True: {y_test[i]}\nPred: {y_pred[i]}", **title_style)
plt.tight_layout()  # prevent the two-line titles from overlapping
plt.show()

# The red titles show at a glance which digits the model confuses.
4. Breast Cancer Dataset (Binary Classification)
About Breast Cancer: 569 samples with 30 features computed from breast mass images. Binary classification: malignant (0) or benign (1). Real medical data—demonstrates importance of precision/recall.
# Import libraries for medical dataset analysis
from sklearn.datasets import load_breast_cancer # Real medical diagnostic data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd # For data manipulation
import numpy as np # For numerical operations
# 1. LOAD & EXPLORE breast cancer diagnostic data
# Binary classification task; label 0 = malignant, label 1 = benign.
cancer = load_breast_cancer()
X = cancer.data    # feature matrix
y = cancer.target  # diagnosis label per sample

print(f"Dataset shape: {X.shape}")
# Only the first five feature names are printed to keep the output short
print(f"Features: {cancer.feature_names[:5]}... (30 total)")
print(f"Classes: {cancer.target_names}")
print(f"Class distribution: {np.bincount(y)}")
# Same counts again, spelled out per class to make any imbalance obvious
print(f"Malignant (0): {(y==0).sum()}, Benign (1): {(y==1).sum()}")

# DataFrame with named columns for summary statistics; describe() exposes
# how widely the feature scales differ.
df_cancer = pd.DataFrame(data=X, columns=cancer.feature_names)
print("\nFeature statistics:")
print(df_cancer.describe())
# Import libraries for medical classification
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression # Linear classifier
from sklearn.ensemble import RandomForestClassifier # Tree ensemble
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import numpy as np
# 2. SPLIT, SCALE & TRAIN multiple models
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Split: 80% train, 20% test, class proportions preserved via stratify=y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features for logistic regression; statistics come from the
# training data only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression with a generous iteration cap so the solver can
# converge.
# See: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lr = LogisticRegression(max_iter=10000, random_state=42)
lr.fit(X_train_scaled, y_train)

# Train Random Forest (comparison model) on the unscaled features
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 3. EVALUATE with MULTIPLE METRICS (critical for medical applications)
y_pred_lr = lr.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test)
y_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]  # probability of class 1 (benign)

# NOTE: precision_score / recall_score / f1_score use pos_label=1 by default,
# and in this dataset label 1 is BENIGN. The default metrics therefore
# describe how well benign cases are recognised. The clinically critical
# question - how many malignant cases are caught - needs pos_label=0, which
# is reported separately as "Recall (malignant)".
print("Logistic Regression:")
print(f" Accuracy: {accuracy_score(y_test, y_pred_lr):.3f}")  # overall correctness
print(f" Precision: {precision_score(y_test, y_pred_lr):.3f}")  # of predicted benign, share truly benign
print(f" Recall: {recall_score(y_test, y_pred_lr):.3f}")  # of actual benign, share recognised
print(f" Recall (malignant): {recall_score(y_test, y_pred_lr, pos_label=0):.3f}")  # of actual cancers, share caught
print(f" F1 Score: {f1_score(y_test, y_pred_lr):.3f}")  # harmonic mean of precision & recall
print(f" ROC-AUC: {roc_auc_score(y_test, y_proba_lr):.3f}")  # area under the ROC curve

print("\nRandom Forest:")
print(f" Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")
print(f" Precision: {precision_score(y_test, y_pred_rf):.3f}")
print(f" Recall: {recall_score(y_test, y_pred_rf):.3f}")
print(f" Recall (malignant): {recall_score(y_test, y_pred_rf, pos_label=0):.3f}")
print(f" F1 Score: {f1_score(y_test, y_pred_rf):.3f}")

# For medical diagnosis, high malignant recall is usually the priority
# (do not miss cancers); high benign recall limits false alarms
# (unnecessary biopsies).
# Import libraries for ROC curve analysis
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score # For ROC analysis
# 4. ROC CURVE - Visualize classifier performance across thresholds
# ROC = Receiver Operating Characteristic: the trade-off between True
# Positive Rate and False Positive Rate as the decision threshold moves.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Reproduce the train/test split of the training example, including
# stratify=y; without it the partition differs despite the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale with statistics learned from the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression
lr = LogisticRegression(max_iter=10000, random_state=42)
lr.fit(X_train_scaled, y_train)

# Probability scores (not hard 0/1 predictions) for class 1 = benign
y_proba = lr.predict_proba(X_test_scaled)[:, 1]

# roc_curve() evaluates every useful threshold on the scores. With benign
# (label 1) as the positive class:
#   fpr: share of MALIGNANT samples wrongly scored as benign (x-axis)
#   tpr: share of BENIGN samples correctly scored as benign (y-axis)
#   thresholds: the score cut-offs that produce each (fpr, tpr) pair
# See: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Area under the ROC curve: 0.5 = random ranking, 1.0 = perfect ranking
auc = roc_auc_score(y_test, y_proba)

# Plot ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')  # AUC = 0.5 reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Breast Cancer Classification')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Ideal curve: hugs the top-left corner (high TPR at low FPR).
# Higher AUC = better ranking of benign above malignant samples overall.
5. Diabetes Dataset (Regression)
About Diabetes: 442 samples with 10 baseline features (age, BMI, blood pressure, etc.). Target: quantitative measure of disease progression one year after baseline. Great for learning regression.
# Import libraries for regression analysis
from sklearn.datasets import load_diabetes # Medical progression prediction dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso # Linear models with regularization
from sklearn.ensemble import RandomForestRegressor # Tree ensemble for regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd # For data analysis
import numpy as np
# 1. LOAD & EXPLORE diabetes progression dataset
# The target is a continuous progression score, so this is a regression
# task rather than classification.
diabetes = load_diabetes()
X = diabetes.data    # baseline measurements
y = diabetes.target  # progression score per patient

print(f"Dataset shape: {X.shape}")
print(f"Features: {diabetes.feature_names}")
# Range and centre of the target give a feel for the scale of later errors
print(f"Target statistics: min={y.min():.1f}, max={y.max():.1f}, mean={y.mean():.1f}")

# DataFrame with the target appended as a 'progression' column so that it
# takes part in the correlation matrix
df_diabetes = pd.DataFrame(X, columns=diabetes.feature_names).assign(progression=y)

# Correlation of every column with the target, strongest positive first
# (the 'progression' entry itself is trivially 1.0)
print("\nCorrelation with target:")
print(
    df_diabetes
    .corr()['progression']
    .sort_values(ascending=False)
)
# Import regression models and metrics
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso # Linear models
from sklearn.ensemble import RandomForestRegressor # Non-linear ensemble
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
# 2. SPLIT & TRAIN MULTIPLE REGRESSION MODELS
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Hold out 20% of the rows for testing; a continuous target needs no stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression - unregularized baseline
# See: https://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares
lr = LinearRegression().fit(X_train, y_train)

# Ridge Regression - L2 penalty on coefficient size
# alpha=1.0 sets the penalty strength (larger alpha = stronger shrinkage)
# See: https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Lasso Regression - L1 penalty, which can drive some coefficients to exactly 0
# alpha=0.5 sets the penalty strength
# See: https://scikit-learn.org/stable/modules/linear_model.html#lasso
lasso = Lasso(alpha=0.5).fit(X_train, y_train)

# Random Forest Regressor - averages the predictions of 100 decision trees,
# so it can model non-linear feature/target relationships
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
# 3. EVALUATE with regression metrics
# Map a display name to each fitted model so all four are scored the same way
models = {
    'Linear Regression': lr,
    'Ridge': ridge,
    'Lasso': lasso,
    'Random Forest': rf_reg,
}
for name, model in models.items():
    y_pred = model.predict(X_test)
    # RMSE: Root Mean Squared Error (same units as target, penalizes large errors).
    # Computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while sqrt(MSE) works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # R² Score: coefficient of determination (higher = better fit).
    # 1.0 = perfect predictions, 0 = no better than always predicting the mean,
    # negative = worse than that mean baseline (possible on held-out data).
    r2 = r2_score(y_test, y_pred)
    # MAE: Mean Absolute Error (average absolute difference; less sensitive
    # to large individual errors than RMSE)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"\n{name}:")
    print(f" RMSE: {rmse:.2f}")  # Lower is better
    print(f" R² Score: {r2:.3f}")  # Higher is better (max 1.0)
    print(f" MAE: {mae:.2f}")  # Lower is better
# Import visualization library
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# 4. VISUALIZE PREDICTIONS - Scatter plot of actual vs predicted values
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Same test_size and random_state as the training step, so the split is identical
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Random Forest and predict progression for the held-out patients
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_reg.predict(X_test)

plt.figure(figsize=(10, 6))

# One point per test patient: x = actual progression, y = predicted progression
# (alpha=0.6 keeps overlapping points visible)
plt.scatter(y_test, y_pred, alpha=0.6)

# Reference diagonal y = x, drawn across the range of the actual values;
# a perfect model would put every point on this red dashed line
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'r--', lw=2)

plt.xlabel('Actual Disease Progression')
plt.ylabel('Predicted Disease Progression')
plt.title('Diabetes Progression: Actual vs Predicted')
plt.grid(alpha=0.3)  # Faint grid for easier reading
plt.show()
# Points near the diagonal = good predictions
# Points far from it = over- or under-estimation
# Import libraries for feature importance analysis
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# 5. FEATURE IMPORTANCE - Which features best predict disease progression?
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
# Reproduce same split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Random Forest
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
# Extract feature importances (how much each feature contributes to predictions)
importances = rf_reg.feature_importances_  # Sum to 1.0
# Feature indices ordered from most to least important
indices = np.argsort(importances)[::-1]
# Create bar chart
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])  # Bars sorted by importance
# Label x-axis with feature names in the same sorted order as the bars
plt.xticks(range(X.shape[1]), [diabetes.feature_names[i] for i in indices], rotation=45)
plt.xlabel('Feature')  # Baseline patient measurements
plt.ylabel('Importance')
plt.title('Feature Importance for Diabetes Progression Prediction')
plt.tight_layout()  # Prevent label cutoff
plt.show()
# Print ranked list with importance scores (rank starts at 1)
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {diabetes.feature_names[idx]}: {importances[idx]:.3f}")
# Typically: bmi (body mass index) and s5 (serum triglycerides) are most important
# This tells us which patient measurements to prioritize in clinical settings
6. California Housing Dataset (Regression)
About California Housing: 20,640 samples from California census data. 8 features (median income, house age, rooms, location, etc.). Target: median house value. Larger dataset ideal for testing model scalability.
# Import libraries for large-scale regression
from sklearn.datasets import fetch_california_housing # NOTE: fetch_, not load_
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd # For data analysis
import numpy as np
# 1. LOAD & EXPLORE California Housing dataset
# fetch_california_housing() downloads the data on first use only
# 20,640 districts from the 1990 California census
housing = fetch_california_housing()
X = housing.data  # 8 features per district
y = housing.target  # Median house value per district
print(f"Dataset shape: {X.shape}")  # (20640, 8)
print(f"Features: {housing.feature_names}")
# MedInc: median income, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
print(f"Target (median house value in $100k): min={y.min():.2f}, max={y.max():.2f}, mean={y.mean():.2f}")
# Target is expressed in $100,000s, e.g. 2.5 means a $250,000 median house value

# Hold features and target in one DataFrame so they can be correlated together
df_housing = pd.DataFrame(X, columns=housing.feature_names)
df_housing['MedHouseVal'] = y

print("\nFirst few rows:")
preview = df_housing.head()
print(preview)

# Rank every column by its correlation with the median house value
print("\nCorrelation with target:")
target_corr = df_housing.corr()['MedHouseVal']
print(target_corr.sort_values(ascending=False))
# MedInc (median income) is typically the strongest predictor of house value
# Import regression models and evaluation metrics
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression # Simple linear model
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor # Powerful ensembles
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# 2. SPLIT, SCALE & TRAIN multiple models
housing = fetch_california_housing()
X = housing.data
y = housing.target

# 80/20 split: 16,512 training districts, 4,128 test districts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

# Standardize features for the linear model. The scaler is fitted on the
# training rows only and then applied to both sets, so no test-set
# statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression - fast baseline, trained on the scaled features
lr = LinearRegression().fit(X_train_scaled, y_train)

# Gradient Boosting - 100 trees built sequentially
# learning_rate=0.1 shrinks each tree's contribution (smaller = more conservative)
# max_depth=5 caps the depth of each tree
# Tree models are fitted on the unscaled features
# See: https://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting
gb = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
).fit(X_train, y_train)

# Random Forest - 100 independently grown trees, allowed to go deeper (max_depth=20)
rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42).fit(X_train, y_train)
# 3. EVALUATE all models on test set
# Each entry pairs a fitted model with the test matrix that matches the
# representation it was trained on (scaled vs. original features)
models = {
    'Linear Regression': (lr, X_test_scaled),  # Needs scaled data
    'Gradient Boosting': (gb, X_test),  # Original data
    'Random Forest': (rf, X_test),  # Original data
}
for name, (model, X_test_data) in models.items():
    y_pred = model.predict(X_test_data)
    # RMSE in $100k units (multiply by 100,000 for dollars).
    # Computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while sqrt(MSE) works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # R²: proportion of variance explained (higher = better, max 1.0);
    # it drops below 0 when a model does worse than predicting the mean
    r2 = r2_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f" RMSE: {rmse:.3f} ($100k)")  # e.g., 0.5 = typical error of about $50,000
    print(f" R² Score: {r2:.3f}")  # e.g., 0.8 = explains 80% of variance
# Import visualization library
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
# 4. VISUALIZE PREDICTIONS with dual plots
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Same test_size and random_state as the training step, so the split is identical
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Gradient Boosting and predict house values for the held-out districts
gb = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
).fit(X_train, y_train)
y_pred = gb.predict(X_test)

# One row, two columns: scatter plot on the left, residual plot on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
scatter_ax, residual_ax = axes

# LEFT PLOT: actual vs predicted (alpha=0.3 because there are many points)
scatter_ax.scatter(y_test, y_pred, alpha=0.3)
# Reference diagonal y = x across the range of actual values (perfect predictions)
diagonal = [y_test.min(), y_test.max()]
scatter_ax.plot(diagonal, diagonal, 'r--', lw=2)
scatter_ax.set(
    xlabel='Actual House Value ($100k)',
    ylabel='Predicted House Value ($100k)',
    title='California Housing: Actual vs Predicted',
)
scatter_ax.grid(alpha=0.3)
# Points above the diagonal are overestimates, points below are underestimates

# RIGHT PLOT: residuals vs predictions
# residual = actual - predicted, so positive means the model underestimated
residuals = y_test - y_pred
residual_ax.scatter(y_pred, residuals, alpha=0.3)
residual_ax.axhline(y=0, color='r', linestyle='--', lw=2)  # Zero-error reference
residual_ax.set(
    xlabel='Predicted House Value ($100k)',
    ylabel='Residuals',
    title='Residual Plot',
)
residual_ax.grid(alpha=0.3)
# Structureless scatter around 0 suggests no systematic bias;
# a visible pattern (e.g., a curve) suggests the model is missing a relationship

plt.tight_layout()  # Keep the two subplots from overlapping
plt.show()
# Import libraries for feature importance analysis
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
# 5. FEATURE IMPORTANCE - Which factors most influence house prices?
housing = fetch_california_housing()
X, y = housing.data, housing.target
# Reproduce same split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Gradient Boosting
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb.fit(X_train, y_train)
# Extract feature importances from trained model
importances = gb.feature_importances_  # Sum to 1.0
# Feature indices ordered from most to least important
indices = np.argsort(importances)[::-1]
# Create bar chart of feature importance
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])  # 8 features
# Label x-axis with feature names in the same sorted order as the bars
plt.xticks(range(X.shape[1]), [housing.feature_names[i] for i in indices], rotation=45)
plt.xlabel('Feature')  # Census and geographic features
plt.ylabel('Importance')
plt.title('Feature Importance for California Housing Price Prediction')
plt.tight_layout()  # Prevent x-label cutoff
plt.show()
# Print ranked list with importance scores (rank starts at 1)
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {housing.feature_names[idx]}: {importances[idx]:.3f}")
# Typically: MedInc (median income) is by far the most important predictor
# Latitude and Longitude also matter (location, location, location!)
# This tells us income and location are key drivers of California house prices
Datasets Summary: You've now seen complete workflows for all major Scikit-learn datasets—from loading and exploring to training, evaluation, and visualization. These patterns apply to any ML project. Use these datasets to experiment with new algorithms and techniques!