tutorial 2025-03-27 11 min read

Experiment Tracking: Version Control for ML Training Runs

Learn how to track ML experiments with MLflow and Weights & Biases. Compare runs, reproduce results, and build a systematic approach to model iteration as an ML engineer.

MLflow Weights & Biases experiment tracking MLOps reproducibility

The Problem with Ad-Hoc ML

Software engineers know version control. Every code change is tracked in Git. You can reproduce any past state, compare diffs, and understand what changed.

ML training runs are different in a critical way: the "artifact" (the model) depends not just on code but on data, hyperparameters, and random seeds. Git tracks code. It doesn't track "what hyperparameters did I use in that run that worked well last Tuesday?"

Experiment tracking tools fill this gap.

What to Track Per Experiment

# Four categories of metadata to log for every training run.

# 1. Parameters — everything that determines what gets trained
params = dict(
    model_type="gradient_boosting",
    learning_rate=0.05,
    n_estimators=200,
    max_depth=4,
    feature_set="v3",
    training_date_start="2024-01-01",
    training_date_end="2024-03-31",
    random_seed=42,
)

# 2. Metrics — how well the run did (keys with "@" need string-literal form)
metrics = {
    "train_auc": 0.923,
    "val_auc": 0.887,
    "test_auc": 0.881,
    "val_precision@0.5": 0.76,
    "val_recall@0.5": 0.68,
    "training_time_seconds": 342,
}

# 3. Artifacts — files produced by the run
artifacts = [
    "models/model.joblib",          # serialized estimator
    "plots/confusion_matrix.png",   # evaluation plot
    "plots/feature_importance.png", # explainability plot
    "data/val_predictions.csv",     # row-level predictions for debugging
]

# 4. Environment — what the run executed on
environment = dict(
    python_version="3.11.2",
    sklearn_version="1.4.0",
    cpu="Intel Xeon E5-2690",
    git_commit="abc1234",
)

MLflow: Self-Hosted Experiment Tracking

import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Start tracking server: mlflow server --port 5000
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("churn-prediction-v2")

# One tracked training run: params, tags, metrics, artifacts, and the model
# itself all land under a single run ID in the tracking server.
with mlflow.start_run(run_name="gbm-learning-rate-sweep"):
    # Log hyperparameters
    params = {
        "learning_rate": 0.05,
        "n_estimators": 200,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }
    mlflow.log_params(params)

    # Tag for filtering (queried later via filter_string in search_runs)
    mlflow.set_tags({
        "feature_version": "v3",
        "developer": "alice",
        "purpose": "hyperparameter_search",
    })

    # Train — X_train/y_train etc. are assumed to be in scope
    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    # Log metrics: AUC on all three splits to spot overfitting at a glance
    mlflow.log_metrics({
        "train_auc": roc_auc_score(y_train, model.predict_proba(X_train)[:,1]),
        "val_auc": roc_auc_score(y_val, model.predict_proba(X_val)[:,1]),
        "test_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:,1]),
    })

    # Log artifacts
    mlflow.log_artifact("plots/feature_importance.png")

    # Log model (with schema inference)
    signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(model, "model", signature=signature)

    # Print a real, clickable link to this run (the original printed a
    # literal "..." placeholder inside an f-string with no placeholders).
    run = mlflow.active_run()
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")
    print(
        f"MLflow UI: http://localhost:5000/#/experiments/"
        f"{run.info.experiment_id}/runs/{run_id}"
    )

Comparing Runs

from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = client.get_experiment_by_name("churn-prediction-v2")

# Pull the hyperparameter-search runs, best validation AUC first.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="tags.purpose = 'hyperparameter_search'",
    order_by=["metrics.val_auc DESC"],
    max_results=20,
)

# Print the five strongest candidates.
for candidate in runs[:5]:
    short_id = candidate.info.run_id[:8]
    lr = candidate.data.params["learning_rate"]
    auc = candidate.data.metrics["val_auc"]
    print(f"Run {short_id}: lr={lr}, val_auc={auc:.4f}")

Logging Metrics Per Step

# For training loops — log metrics at each epoch
with mlflow.start_run():
    for epoch in range(num_epochs):
        train_loss, val_loss = train_one_epoch(model, ...)

        mlflow.log_metrics({
            "train_loss": train_loss,
            "val_loss": val_loss,
        }, step=epoch)  # step= enables time-series view in UI

Weights & Biases: Team-Oriented Tracking

W&B is popular for deep learning projects, especially among teams. Its key advantage is richer visualization and collaboration features.

import wandb
import torch
import torchvision  # required: torchvision.models is used below but was never imported

# Initialize run — config values become wandb.config.* and are tracked
wandb.init(
    project="image-classifier",
    name="resnet50-augmented",
    config={
        "learning_rate": 1e-3,
        "batch_size": 64,
        "epochs": 50,
        "architecture": "ResNet50",
        "augmentation": "heavy",
    }
)

model = torchvision.models.resnet50(pretrained=True)
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate)

# Initialize before the loop — the original read best_val_acc without ever
# assigning it, which would raise NameError on the first epoch.
best_val_acc = float("-inf")

for epoch in range(wandb.config.epochs):
    train_loss, train_acc = train(model, train_loader, optimizer)
    val_loss, val_acc = evaluate(model, val_loader)

    # Log metrics — appears in real-time in W&B dashboard
    wandb.log({
        "epoch": epoch,
        "train/loss": train_loss,
        "train/accuracy": train_acc,
        "val/loss": val_loss,
        "val/accuracy": val_acc,
        "learning_rate": optimizer.param_groups[0]["lr"],
    })

    # Checkpoint only when validation accuracy improves, and remember the
    # new best so later epochs are compared against it.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        wandb.save("best_model.pt")

# Log summary metrics — shown in the run table instead of the charts
wandb.run.summary["best_val_accuracy"] = best_val_acc
wandb.finish()

W&B Sweeps: Hyperparameter Search

# Define sweep configuration
# Sweep definition: Bayesian optimization over three hyperparameters,
# maximizing validation accuracy.
sweep_config = dict(
    method="bayes",
    metric=dict(name="val/accuracy", goal="maximize"),
    parameters=dict(
        # log-uniform: sample the exponent uniformly between 1e-4 and 1e-2
        learning_rate=dict(distribution="log_uniform_values", min=1e-4, max=1e-2),
        batch_size=dict(values=[32, 64, 128]),
        dropout=dict(distribution="uniform", min=0.1, max=0.5),
    ),
)

def train_sweep():
    """Run one sweep trial: the agent picks hyperparameters, this trains with them.

    wandb.init() inside an agent-launched process receives the trial's
    config from the sweep controller rather than a hand-written dict.
    """
    with wandb.init() as run:
        config = run.config  # hyperparameters chosen by the sweep controller
        model = build_model(dropout=config.dropout)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        # ... train and log metrics

# Register the sweep with the project, then launch an agent that asks the
# controller for configs and calls train_sweep once per trial.
sweep_id = wandb.sweep(sweep_config, project="image-classifier")
wandb.agent(sweep_id, train_sweep, count=50)  # Run 50 trials

Reproducibility: The Real Goal

Tracking is only useful if you can reproduce a run. For full reproducibility:

import random
import numpy as np
import torch

def set_seeds(seed: int = 42):
    """Seed every RNG source (Python, NumPy, Torch CPU and CUDA) with *seed*.

    Also pins cuDNN to deterministic algorithm selection, trading some
    speed for bitwise-reproducible results.
    """
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix the seed and record it with the run so it can be reproduced later.
set_seeds(42)
mlflow.log_param("random_seed", 42)

# Also log data version. The original referenced hashlib without importing
# it (NameError) and left the open() file handle unclosed; a with-block
# closes it deterministically.
import hashlib

with open("data/train.parquet", "rb") as data_file:
    data_hash = hashlib.md5(data_file.read()).hexdigest()
mlflow.log_param("data_hash", data_hash)

MLflow vs. W&B: When to Use Which

|                | MLflow                    | W&B                |
|----------------|---------------------------|--------------------|
| Hosting        | Self-hosted (free)        | SaaS (free tier)   |
| Setup          | Medium                    | Easy               |
| Visualization  | Basic                     | Rich               |
| Team features  | Limited                   | Strong             |
| Model registry | Yes                       | Yes                |
| Best for       | Compliance-sensitive orgs | ML research teams  |

If you can use cloud services: start with W&B. If you need on-premises or self-hosted: MLflow.


Connect experiment tracking to automated retraining in our MLOps guide.

Want to Go Deeper?

This article is part of our comprehensive curriculum on building ML systems at scale. Explore our full courses for hands-on learning.