The Problem with Ad-Hoc ML
Software engineers know version control. Every code change is tracked in Git. You can reproduce any past state, compare diffs, and understand what changed.
ML training runs are different in a critical way: the "artifact" (the model) depends not just on code but on data, hyperparameters, and random seeds. Git tracks code. It doesn't track "what hyperparameters did I use in that run that worked well last Tuesday?"
Experiment tracking tools fill this gap.
What to Track Per Experiment
# For every training run, log:

# 1. Parameters — hyperparameters and config
params = dict(
    model_type="gradient_boosting",
    learning_rate=0.05,
    n_estimators=200,
    max_depth=4,
    feature_set="v3",
    training_date_start="2024-01-01",
    training_date_end="2024-03-31",
    random_seed=42,
)

# 2. Metrics — evaluation results
metrics = {
    # Ranking quality per split
    "train_auc": 0.923, "val_auc": 0.887, "test_auc": 0.881,
    # Thresholded quality on the validation split
    "val_precision@0.5": 0.76, "val_recall@0.5": 0.68,
    # Cost of the run
    "training_time_seconds": 342,
}

# 3. Artifacts — model file, plots, data samples
plot_files = ["confusion_matrix.png", "feature_importance.png"]
artifacts = (
    ["models/model.joblib"]
    + [f"plots/{name}" for name in plot_files]
    + ["data/val_predictions.csv"]
)

# 4. Environment — library versions, hardware
environment = dict(
    python_version="3.11.2",
    sklearn_version="1.4.0",
    cpu="Intel Xeon E5-2690",
    git_commit="abc1234",
)
MLflow: Self-Hosted Experiment Tracking
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
# Start tracking server: mlflow server --port 5000
# Point the client at the tracking server and group runs under one named experiment.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("churn-prediction-v2")

# Everything logged inside this context is attached to a single run.
with mlflow.start_run(run_name="gbm-learning-rate-sweep"):
    # Log hyperparameters
    params = {
        "learning_rate": 0.05,
        "n_estimators": 200,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }
    mlflow.log_params(params)

    # Tag for filtering — tags are searchable via search_runs filter strings.
    mlflow.set_tags({
        "feature_version": "v3",
        "developer": "alice",
        "purpose": "hyperparameter_search",
    })

    # Train
    # NOTE(review): X_train/y_train (and the val/test splits below) are assumed
    # to be defined earlier in the pipeline — not shown in this snippet.
    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    # Log metrics — AUC per split; [:,1] selects the positive-class probability.
    mlflow.log_metrics({
        "train_auc": roc_auc_score(y_train, model.predict_proba(X_train)[:,1]),
        "val_auc": roc_auc_score(y_val, model.predict_proba(X_val)[:,1]),
        "test_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:,1]),
    })

    # Log artifacts
    mlflow.log_artifact("plots/feature_importance.png")

    # Log model (with schema inference, so input schemas can be validated at serving time)
    signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(model, "model", signature=signature)

    run_id = mlflow.active_run().info.run_id
    print(f"Run ID: {run_id}")
    print(f"MLflow UI: http://localhost:5000/#/experiments/...")
Comparing Runs
from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = client.get_experiment_by_name("churn-prediction-v2")

# Get all runs sorted by val AUC (best first), restricted to sweep runs via the tag filter.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="tags.purpose = 'hyperparameter_search'",
    order_by=["metrics.val_auc DESC"],
    max_results=20,
)

# Print the top five: short run id, learning rate, validation AUC.
for run in runs[:5]:
    print(f"Run {run.info.run_id[:8]}: "
          f"lr={run.data.params['learning_rate']}, "
          f"val_auc={run.data.metrics['val_auc']:.4f}")
Logging Metrics Per Step
# For training loops — log metrics at each epoch
# NOTE(review): num_epochs, model and train_one_epoch are assumed to be defined
# elsewhere — this snippet only demonstrates the logging pattern.
with mlflow.start_run():
    for epoch in range(num_epochs):
        train_loss, val_loss = train_one_epoch(model, ...)
        mlflow.log_metrics({
            "train_loss": train_loss,
            "val_loss": val_loss,
        }, step=epoch)  # step= enables time-series view in UI
Weights & Biases: Team-Oriented Tracking
W&B is popular for deep learning projects, especially among teams. Its key advantage: richer visualization and collaboration features.
import wandb
import torch
import torchvision  # was missing: torchvision.models is used below

# Initialize run — config values become searchable/comparable in the W&B UI.
wandb.init(
    project="image-classifier",
    name="resnet50-augmented",
    config={
        "learning_rate": 1e-3,
        "batch_size": 64,
        "epochs": 50,
        "architecture": "ResNet50",
        "augmentation": "heavy",
    }
)

model = torchvision.models.resnet50(pretrained=True)
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate)

# Best validation accuracy seen so far; start below any achievable accuracy.
# (Was read before assignment — NameError on the first epoch.)
best_val_acc = 0.0

# NOTE(review): train/evaluate and the data loaders are assumed to be defined
# elsewhere — this snippet shows only the tracking calls.
for epoch in range(wandb.config.epochs):
    train_loss, train_acc = train(model, train_loader, optimizer)
    val_loss, val_acc = evaluate(model, val_loader)

    # Log metrics — appears in real-time in W&B dashboard
    wandb.log({
        "epoch": epoch,
        "train/loss": train_loss,
        "train/accuracy": train_acc,
        "val/loss": val_loss,
        "val/accuracy": val_acc,
        "learning_rate": optimizer.param_groups[0]["lr"],
    })

    # Log model checkpoint whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc  # was never updated, so the comparison stayed stale
        torch.save(model.state_dict(), "best_model.pt")
        wandb.save("best_model.pt")

# Log summary metrics — the headline number shown in the runs table
wandb.run.summary["best_val_accuracy"] = best_val_acc
wandb.finish()
W&B Sweeps: Hyperparameter Search
# Define sweep configuration: Bayesian search over three hyperparameters,
# maximizing validation accuracy.
sweep_config = dict(
    method="bayes",  # Bayesian optimization
    metric=dict(name="val/accuracy", goal="maximize"),
    parameters=dict(
        learning_rate=dict(distribution="log_uniform_values", min=1e-4, max=1e-2),
        batch_size=dict(values=[32, 64, 128]),
        dropout=dict(distribution="uniform", min=0.1, max=0.5),
    ),
)
def train_sweep():
    """One sweep trial: W&B injects the sampled hyperparameters via run.config."""
    with wandb.init() as run:
        config = run.config
        # NOTE(review): build_model is assumed to be defined elsewhere.
        model = build_model(dropout=config.dropout)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        # ... train and log metrics

# Register the sweep, then have this process act as an agent running trials.
sweep_id = wandb.sweep(sweep_config, project="image-classifier")
wandb.agent(sweep_id, train_sweep, count=50)  # Run 50 trials
Reproducibility: The Real Goal
Tracking is only useful if you can reproduce a run. For full reproducibility:
import random
import numpy as np
import torch
def set_seeds(seed: int = 42):
    """Seed every RNG the training stack uses so runs are repeatable."""
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)
    # For fully deterministic behavior, trade speed for reproducible cuDNN kernels:
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seeds(42)
mlflow.log_param("random_seed", 42)

# Also log data version — a content hash pins the exact training data used.
import hashlib  # was missing: hashlib is used below but never imported

# Read via a context manager so the file handle is not leaked
# (was open(...).read() with no close).
with open("data/train.parquet", "rb") as f:
    data_hash = hashlib.md5(f.read()).hexdigest()
mlflow.log_param("data_hash", data_hash)
MLflow vs. W&B: When to Use Which
| | MLflow | W&B |
|---|---|---|
| Hosting | Self-hosted (free) | SaaS (free tier) |
| Setup | Medium | Easy |
| Visualization | Basic | Rich |
| Team features | Limited | Strong |
| Model registry | Yes | Yes |
| Best for | Compliance-sensitive orgs | ML research teams |
If you can use cloud services: start with W&B. If you need on-premises or self-hosted: MLflow.
Connect experiment tracking to automated retraining in our MLOps guide.