Scikit-Learn Cheat Sheet — ML Pipeline Reference
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
Fit → predict pipeline: transform data, train model, evaluate metrics.
Like LEGO blocks: every piece has the same connector interface (fit/predict/transform). You plug them together in any order without caring what's inside. Want to try logistic regression instead of SVM? Swap one block, everything else stays the same.
Core Concepts
Common Workflows
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# NOTE: X (features) and y (labels) are assumed to be defined before this snippet.
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Fit a 100-tree random forest on the training split only.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predict on the held-out rows and compare against their true labels.
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))
Accuracy: 0.95
precision recall f1-score support
0 0.94 0.96 0.95 100
1 0.96 0.94 0.95 100
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Chain scaling and the classifier so they are always fitted together.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=42, max_iter=1000))
])
# Cross-validate on the training split only. The pipeline is re-fitted inside
# every fold, so the scaler only ever sees that fold's training rows, and the
# held-out X_test plays no part in model assessment.
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
print(f"CV scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Fit the final model on the training split. Fitting on the full X here would
# put the X_test rows into the training data and make the prediction on
# X_test that follows look better than it really is.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
CV scores: [0.92 0.94 0.91 0.93 0.92]
Mean: 0.924 (+/- 0.011)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# Search space: 3 x 3 x 3 = 27 parameter combinations.
param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.05, 0.1],
'max_depth': [3, 5, 7]
}
# Every combination is scored by accuracy with 5-fold cross-validation.
grid = GridSearchCV(
GradientBoostingClassifier(random_state=42),
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1 # Use all cores
)
# Tune on the training split only; the test split is kept for the final check.
grid.fit(X_train, y_train)
print(f"Best params: {grid.best_params_}")
print(f"Best CV score: {grid.best_score_:.3f}")
print(f"Test score: {grid.score(X_test, y_test):.3f}")
Best params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Best CV score: 0.918
Test score: 0.912
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
# StandardScaler: (x - mean) / std → mean=0, std=1
# Fitted on the training split only.
scaler_standard = StandardScaler()
X_scaled_std = scaler_standard.fit_transform(X_train)
# MinMaxScaler: (x - min) / (max - min) → range [0, 1]
# Also fitted on the training split only.
scaler_minmax = MinMaxScaler()
X_scaled_minmax = scaler_minmax.fit_transform(X_train)
# Use in pipeline to avoid leakage
# The pipeline owns its own StandardScaler and is fitted on the raw X_train;
# the standalone X_scaled_* arrays above are illustrations and are not used here.
pipe = Pipeline([
('scaler', StandardScaler()),
('svm', SVC(kernel='rbf', gamma='scale'))
])
pipe.fit(X_train, y_train)
print(f"Score: {pipe.score(X_test, y_test):.3f}")
Score: 0.968
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Check imbalance (np.bincount requires non-negative integer labels)
print(f"Class distribution: {np.bincount(y)}")
# Option 1: Auto-balance
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Inverse frequency weighting
    random_state=42
)
# Option 2: Manual weights
# Compute the weights from the labels the model is actually trained on, and
# key the dict by the real class labels. Keying by enumerate() position is
# only correct when the labels happen to be exactly 0..n_classes-1.
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, weights))
model = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weights_dict,
    random_state=42
)
model.fit(X_train, y_train)
print(f"Score: {model.score(X_test, y_test):.3f}")
Class distribution: [950 50]
Score: 0.876
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
# Fit on all training features first so every feature gets an importance score.
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Get feature importances
importances = model.feature_importances_
feature_names = X_train.columns # Assumes pandas DataFrame
# Pair each column name with its importance and rank from most to least important.
fi_df = pd.DataFrame({
'feature': feature_names,
'importance': importances
}).sort_values('importance', ascending=False)
print(fi_df.head(10))
# Top 5 features
top_features = fi_df.head(5)['feature'].tolist()
# Keep only those columns. Selection is based on the training split alone;
# X_test will need the same column subset before it is passed to the refitted model.
X_train_selected = X_train[top_features]
model.fit(X_train_selected, y_train)
feature importance
0 age 0.324
1 income 0.251
2 tenure 0.189
3 spend 0.156
4 visits 0.080
Model Selection Cheatsheet
| Task | Best Starting Model | When to Use | Key Hyperparameters |
|---|---|---|---|
| Binary Classification | LogisticRegression or RandomForestClassifier | Logistic for interpretability + speed; RF for nonlinear + feature importance | LogReg: C, solver; RF: n_estimators, max_depth, min_samples_split |
| Regression | LinearRegression or GradientBoostingRegressor | Linear for interpretability; GB for complex nonlinear relationships | LinearReg: none (fitting only); GB: n_estimators, learning_rate, max_depth |
| Clustering | KMeans or DBSCAN | KMeans for spherical clusters + speed; DBSCAN for arbitrary shapes + density-based | KMeans: n_clusters; DBSCAN: eps, min_samples |
| Dimensionality Reduction | PCA or SelectKBest | PCA for variance; SelectKBest for supervised feature selection | PCA: n_components; SelectKBest: k, score_func |
| Multiclass Classification | RandomForestClassifier or SVC | RF for interpretability + speed; SVC for high-dimensional data | RF: n_estimators, max_depth; SVC: C, kernel, gamma |
| Imbalanced Classification | RandomForestClassifier (class_weight='balanced') | Tree-based models handle imbalance better; use class_weight or SMOTE | class_weight='balanced', n_estimators, max_depth |
Common Errors & Fixes
ValueError: Unable to parse string at position X for column Y
Cause: Non-numeric data in features; categorical columns not encoded
from sklearn.preprocessing import OrdinalEncoder
# For categorical features. LabelEncoder is intended for the target y; the
# feature-side equivalent is OrdinalEncoder, which works on 2-D input and can
# therefore also be used inside a ColumnTransformer / Pipeline.
encoder = OrdinalEncoder()
X[['category']] = encoder.fit_transform(X[['category']])
# Or use one-hot encoding (an alternative to the lines above, not an extra step)
X = pd.get_dummies(X, columns=['category'], drop_first=True)
# Better: use Pipeline with ColumnTransformer
# The encoder is then fitted on training data only and re-applied unchanged at
# predict time.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
])
pipe = Pipeline([('prep', preprocessor), ('model', RandomForestClassifier())])
fit() got an unexpected keyword argument 'learning_rate'
Cause: Wrong hyperparameter name for the model; typo or using XGBoost param names
# Check correct param names
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
# get_params() lists every hyperparameter name this estimator accepts.
print(model.get_params().keys())
# Correct params for GradientBoostingClassifier
# Hyperparameters are passed to the constructor, as here, not to fit().
model = GradientBoostingClassifier(
learning_rate=0.05, # Correct
n_estimators=100,
max_depth=5,
random_state=42
)
# For XGBoost, use xgboost.XGBClassifier() instead
# (xgboost is a separate package from scikit-learn)
import xgboost as xgb
model = xgb.XGBClassifier(learning_rate=0.05, n_estimators=100)
NameError: name 'X_test' is not defined after GridSearchCV
Cause: GridSearchCV.fit() is called on full data, no X_test created; or X_test never scaled
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression  # used below; was missing from this snippet's imports
# Always split BEFORE grid search
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# GridSearchCV only on train data
# The scaler sits inside the pipeline, so it is re-fitted on each CV training
# fold. 'model__C' addresses parameter C of the pipeline step named 'model'.
grid = GridSearchCV(
    Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression())]),
    {'model__C': [0.1, 1, 10]},
    cv=5
)
grid.fit(X_train, y_train) # Not X, y
print(f"Test score: {grid.score(X_test, y_test)}")
StandardScaler not fitted; calling transform before fit
Cause: Calling transform() or predict() on a transformer/model that hasn't been fit yet
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Wrong: transform before fit
# X_scaled = scaler.transform(X) # raises NotFittedError: the scaler has no learned mean/std yet
# Correct: fit then transform
# fit_transform learns the statistics from the training split and applies them in one call.
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Only transform, reuse fitted scaler
# Better: use Pipeline to auto-manage fit/transform order
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression())])
pipe.fit(X_train, y_train) # Pipeline internally calls fit_transform
y_pred = pipe.predict(X_test) # Pipeline internally calls transform
All samples are either positive or negative (in fold X) during cross-validation
Cause: Skewed data + small cv folds → one fold has only one class
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Use StratifiedKFold to preserve class balance in each fold
# shuffle=True with a fixed random_state gives the same shuffled folds on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestClassifier(random_state=42)
# One weighted-F1 score per fold (5 in total).
scores = cross_val_score(
model, X, y, cv=cv, scoring='f1_weighted'
)
print(f"Scores: {scores}")
# For very imbalanced data, use class_weight
model = RandomForestClassifier(
class_weight='balanced_subsample', # Subsample balancing
random_state=42
)
Production Pitfalls
If you fit a scaler or encoder on the full dataset before splitting, information from test data leaks into training. Always use Pipeline or manually fit transformers ONLY on training data. Common mistake: fitting StandardScaler on all of X before train_test_split and then using that scaler to transform both X_train and X_test — its mean and std were computed with the test rows included.
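On imbalanced datasets, a random split can leave the test set (or a CV fold) with very few or even zero minority-class samples, so the class ratio in train and test no longer matches the full data. Use stratify=y to guarantee class distribution is preserved in train and test sets. This matters more for small datasets.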
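Every scikit-learn estimator or splitter that involves randomness accepts random_state for reproducibility. Without it, results vary between runs. Always set random_state=42 (or any fixed int) in train_test_split, model instantiation, and the CV splitter you pass to cross_val_score (e.g. StratifiedKFold(shuffle=True, random_state=42)); cross_val_score itself has no random_state parameter. Production code must be deterministic.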
If you run GridSearchCV on your full dataset (or only check test performance), you're tuning on test data. Always: split → GridSearchCV on train → evaluate on test. Never refit on test data before final evaluation.
A classifier that always predicts the majority class has 95% accuracy on a 95-5 split, but zero recall on the minority class. Use F1, precision, recall, or ROC-AUC instead. classification_report() shows precision, recall, and F1 for each class.
SelectKBest, feature_importances_, and RFE measure importance using the target (y). If you select features before train_test_split, test performance is overestimated. Use Pipeline or select features only on training data.
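Most scikit-learn estimators don't impute NaNs; they raise a ValueError at fit or predict time rather than failing silently (a few, such as HistGradientBoostingClassifier, handle missing values natively). Always check for missing values with X.isnull().sum() and use SimpleImputer in your Pipeline. Don't assume your data is clean.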
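GridSearchCV is exhaustive: it fits every combination in the grid, so the number of fits is the product of the number of values for each parameter, multiplied by the number of CV folds (e.g. 3 × 3 × 3 values with cv=5 → 135 fits). For large datasets or many parameters, use RandomizedSearchCV instead, or set n_jobs=-1 to parallelize. Even then, limit grid size (max 3-4 params × 3-5 values).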
Complete workflow: load → preprocess → split → tune → evaluate → save model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib

# 1. Load and explore
df = pd.read_csv('data.csv')
print(f"Shape: {df.shape}")
print(f"Missing: {df.isnull().sum()}")
print(f"Classes: {df['target'].value_counts()}")

# 2. Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Identify numeric and categorical columns.
# include='number' picks up every numeric width (int32, float32, ...), not just
# int64/float64; 'category' covers columns already cast to pandas categoricals.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# 3. Split data (stratified for class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Build preprocessing pipeline
# Categorical columns are one-hot encoded inside the pipeline so the encoder is
# fitted on training folds only. handle_unknown='ignore' makes a category that
# first appears in a validation fold, the test set, or production data encode
# as all zeros instead of raising. An empty categorical_cols is fine: the
# ColumnTransformer simply skips that transformer.
# NOTE: no imputation is done here; if step 1 reports missing values, add
# SimpleImputer steps in front of the scaler and the encoder.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# 5. Build full pipeline with model
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# 6. Hyperparameter tuning with cross-validation
# 'model__<name>' routes each value to the pipeline step called 'model'.
param_grid = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [5, 10, None],
    'model__min_samples_split': [2, 5]
}
grid = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1_weighted',
    n_jobs=-1
)
grid.fit(X_train, y_train)  # training split only; X_test stays unseen
print(f"\nBest params: {grid.best_params_}")
print(f"Best CV score: {grid.best_score_:.3f}")

# 7. Evaluate on test set
y_pred = grid.predict(X_test)
# Column 1 is the probability of the second class; this and the ROC-AUC call
# below assume a binary target.
y_pred_proba = grid.predict_proba(X_test)[:, 1]
# grid.score uses the scoring given above, i.e. weighted F1.
print(f"\nTest Score: {grid.score(X_test, y_test):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.3f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# 8. Save model for production
# best_estimator_ is the whole pipeline (preprocessing + model) refitted with
# the best parameters, so raw rows can be passed straight to predict().
joblib.dump(grid.best_estimator_, 'model.pkl')
print("\nModel saved to model.pkl")

# 9. Load and use in production
# joblib files are pickles: only load model files from a trusted source.
model = joblib.load('model.pkl')
# new_data.csv must contain the same feature columns the pipeline was trained on.
X_new = pd.read_csv('new_data.csv')
y_new_pred = model.predict(X_new)
print(f"Predictions: {y_new_pred[:5]}")