tutorial 2025-03-27 11 min read

Experiment Tracking: Version Control for ML Training Runs

Learn how to track ML experiments with MLflow and Weights & Biases. Compare runs, reproduce results, and build a systematic approach to model iteration as an ML engineer.

MLflow Weights & Biases experiment tracking MLOps reproducibility

The Problem with Ad-Hoc ML

Software engineers know version control. Every code change is tracked in Git. You can reproduce any past state, compare diffs, and understand what changed.

ML training runs are different in a critical way: the "artifact" (the model) depends not just on code but on data, hyperparameters, and random seeds. Git tracks code. It doesn't track "what hyperparameters did I use in that run that worked well last Tuesday?"

Experiment tracking tools fill this gap.

What to Track Per Experiment

# Four categories of metadata to log for every training run.

# 1. Parameters — everything that determines what gets trained
params = dict(
    model_type="gradient_boosting",
    learning_rate=0.05,
    n_estimators=200,
    max_depth=4,
    feature_set="v3",
    training_date_start="2024-01-01",
    training_date_end="2024-03-31",
    random_seed=42,
)

# 2. Metrics — how well the run did (keys with "@" need string-literal form)
metrics = {
    "train_auc": 0.923,
    "val_auc": 0.887,
    "test_auc": 0.881,
    "val_precision@0.5": 0.76,
    "val_recall@0.5": 0.68,
    "training_time_seconds": 342,
}

# 3. Artifacts — files produced by the run
artifacts = [
    "models/model.joblib",          # serialized estimator
    "plots/confusion_matrix.png",   # evaluation plot
    "plots/feature_importance.png", # explainability plot
    "data/val_predictions.csv",     # row-level predictions for debugging
]

# 4. Environment — what the run executed on
environment = dict(
    python_version="3.11.2",
    sklearn_version="1.4.0",
    cpu="Intel Xeon E5-2690",
    git_commit="abc1234",
)

MLflow: Self-Hosted Experiment Tracking

import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Start tracking server: mlflow server --port 5000
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("churn-prediction-v2")

# One tracked training run: params, tags, metrics, artifacts, and the model
# itself all land under a single run ID in the tracking server.
with mlflow.start_run(run_name="gbm-learning-rate-sweep"):
    # Log hyperparameters
    params = {
        "learning_rate": 0.05,
        "n_estimators": 200,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }
    mlflow.log_params(params)

    # Tag for filtering (queried later via filter_string in search_runs)
    mlflow.set_tags({
        "feature_version": "v3",
        "developer": "alice",
        "purpose": "hyperparameter_search",
    })

    # Train — X_train/y_train etc. are assumed to be in scope
    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    # Log metrics: AUC on all three splits to spot overfitting at a glance
    mlflow.log_metrics({
        "train_auc": roc_auc_score(y_train, model.predict_proba(X_train)[:,1]),
        "val_auc": roc_auc_score(y_val, model.predict_proba(X_val)[:,1]),
        "test_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:,1]),
    })

    # Log artifacts
    mlflow.log_artifact("plots/feature_importance.png")

    # Log model (with schema inference)
    signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(model, "model", signature=signature)

    # Print a real, clickable link to this run (the original printed a
    # literal "..." placeholder inside an f-string with no placeholders).
    run = mlflow.active_run()
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")
    print(
        f"MLflow UI: http://localhost:5000/#/experiments/"
        f"{run.info.experiment_id}/runs/{run_id}"
    )

Comparing Runs

from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = client.get_experiment_by_name("churn-prediction-v2")

# Pull the hyperparameter-search runs, best validation AUC first.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="tags.purpose = 'hyperparameter_search'",
    order_by=["metrics.val_auc DESC"],
    max_results=20,
)

# Print the five strongest candidates.
for candidate in runs[:5]:
    short_id = candidate.info.run_id[:8]
    lr = candidate.data.params["learning_rate"]
    auc = candidate.data.metrics["val_auc"]
    print(f"Run {short_id}: lr={lr}, val_auc={auc:.4f}")

Logging Metrics Per Step

# For training loops — log metrics at each epoch
with mlflow.start_run():
    for epoch in range(num_epochs):
        train_loss, val_loss = train_one_epoch(model, ...)

        mlflow.log_metrics({
            "train_loss": train_loss,
            "val_loss": val_loss,
        }, step=epoch)  # step= enables time-series view in UI

Weights & Biases: Team-Oriented Tracking

W&B is popular for deep learning projects, especially among teams. Its key advantage is richer visualization and collaboration features.

import wandb
import torch
import torchvision  # required: torchvision.models is used below but was never imported

# Initialize run — config values become wandb.config.* and are tracked
wandb.init(
    project="image-classifier",
    name="resnet50-augmented",
    config={
        "learning_rate": 1e-3,
        "batch_size": 64,
        "epochs": 50,
        "architecture": "ResNet50",
        "augmentation": "heavy",
    }
)

model = torchvision.models.resnet50(pretrained=True)
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate)

# Initialize before the loop — the original read best_val_acc without ever
# assigning it, which would raise NameError on the first epoch.
best_val_acc = float("-inf")

for epoch in range(wandb.config.epochs):
    train_loss, train_acc = train(model, train_loader, optimizer)
    val_loss, val_acc = evaluate(model, val_loader)

    # Log metrics — appears in real-time in W&B dashboard
    wandb.log({
        "epoch": epoch,
        "train/loss": train_loss,
        "train/accuracy": train_acc,
        "val/loss": val_loss,
        "val/accuracy": val_acc,
        "learning_rate": optimizer.param_groups[0]["lr"],
    })

    # Checkpoint only when validation accuracy improves, and remember the
    # new best so later epochs are compared against it.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        wandb.save("best_model.pt")

# Log summary metrics — shown in the run table instead of the charts
wandb.run.summary["best_val_accuracy"] = best_val_acc
wandb.finish()

W&B Sweeps: Hyperparameter Search

# Define sweep configuration
# Sweep definition: Bayesian optimization over three hyperparameters,
# maximizing validation accuracy.
sweep_config = dict(
    method="bayes",
    metric=dict(name="val/accuracy", goal="maximize"),
    parameters=dict(
        # log-uniform: sample the exponent uniformly between 1e-4 and 1e-2
        learning_rate=dict(distribution="log_uniform_values", min=1e-4, max=1e-2),
        batch_size=dict(values=[32, 64, 128]),
        dropout=dict(distribution="uniform", min=0.1, max=0.5),
    ),
)

def train_sweep():
    """Run one sweep trial: the agent picks hyperparameters, this trains with them.

    wandb.init() inside an agent-launched process receives the trial's
    config from the sweep controller rather than a hand-written dict.
    """
    with wandb.init() as run:
        config = run.config  # hyperparameters chosen by the sweep controller
        model = build_model(dropout=config.dropout)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        # ... train and log metrics

# Register the sweep with the project, then launch an agent that asks the
# controller for configs and calls train_sweep once per trial.
sweep_id = wandb.sweep(sweep_config, project="image-classifier")
wandb.agent(sweep_id, train_sweep, count=50)  # Run 50 trials

Reproducibility: The Real Goal

Tracking is only useful if you can reproduce a run. For full reproducibility:

import random
import numpy as np
import torch

def set_seeds(seed: int = 42):
    """Seed every RNG source (Python, NumPy, Torch CPU and CUDA) with *seed*.

    Also pins cuDNN to deterministic algorithm selection, trading some
    speed for bitwise-reproducible results.
    """
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix the seed and record it with the run so it can be reproduced later.
set_seeds(42)
mlflow.log_param("random_seed", 42)

# Also log data version. The original referenced hashlib without importing
# it (NameError) and left the open() file handle unclosed; a with-block
# closes it deterministically.
import hashlib

with open("data/train.parquet", "rb") as data_file:
    data_hash = hashlib.md5(data_file.read()).hexdigest()
mlflow.log_param("data_hash", data_hash)

MLflow vs. W&B: When to Use Which

|                | MLflow                    | W&B                |
|----------------|---------------------------|--------------------|
| Hosting        | Self-hosted (free)        | SaaS (free tier)   |
| Setup          | Medium                    | Easy               |
| Visualization  | Basic                     | Rich               |
| Team features  | Limited                   | Strong             |
| Model registry | Yes                       | Yes                |
| Best for       | Compliance-sensitive orgs | ML research teams  |

If you can use cloud services: start with W&B. If you need on-premises or self-hosted: MLflow.


Connect experiment tracking to automated retraining in our MLOps guide.

Want to Go Deeper?

This article is part of our comprehensive curriculum on building ML systems at scale. Explore our full courses for hands-on learning.