End-to-End ML Experiment Tracking with MLflow
A practical, reproducible tutorial you can drop into your project repo.
Why this guide
You already have an MLflow Tracking Server running (great!). This post shows you how to:
- Log data version, hyperparameters, metrics, plots, artifacts, models, and environment to MLflow.
- Compare and visualize runs in the MLflow UI.
- Register and serve models from the Model Registry.
- Bootstrap a clean project structure with a one-command script (Bash & PowerShell provided).
Prerequisites
- MLflow Tracking Server reachable at http://<MLFLOW_HOST>:5000 (adjust host/port as needed).
- Python 3.9+ recommended.
- Git installed (strongly recommended).
- Optional: a remote artifact store (S3/MinIO/…).
Quick Start: Auto-Generate the Project Skeleton
Run one of the scripts below in an empty folder (or your repo root). It will create a minimal, production-ready structure plus starter files.
Option A — Bash (Linux/macOS/WSL)
#!/usr/bin/env bash
set -euo pipefail
PROJECT_NAME="mlflow-demo"
EXPERIMENT_NAME="aicargo_demo"
RUN_NAME="rf-baseline"
mkdir -p data notebooks src .github/workflows
touch data/.gitkeep notebooks/.gitkeep
# .gitignore
cat > .gitignore <<'EOF'
# Python
__pycache__/
*.pyc
.venv/
.env
# Data & artifacts
data/**/*
!data/.gitkeep
mlruns/
# OS
.DS_Store
# VSCode
.vscode/
EOF
# requirements.txt
cat > requirements.txt <<'EOF'
mlflow>=2.12.0
pandas>=2.1.0
numpy>=1.26.0
scikit-learn>=1.3.0
matplotlib>=3.8.0
joblib>=1.3.0
EOF
# optional conda env
cat > conda.yaml <<'EOF'
name: mlflow-demo
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pip
  - pip:
      - -r requirements.txt
EOF
# sample dataset (Iris): 15 rows, so the stratified 80/20 split gets at least one test sample per class
cat > data/iris.csv <<'EOF'
sepal_length,sepal_width,petal_length,petal_width,target
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
6.2,3.4,5.4,2.3,2
6.3,3.3,6.0,2.5,2
5.4,3.9,1.7,0.4,0
6.7,3.1,4.4,1.4,1
6.0,2.9,4.5,1.5,1
4.6,3.2,1.4,0.2,0
7.7,3.8,6.7,2.2,2
5.9,3.0,5.1,1.8,2
4.7,3.2,1.3,0.2,0
7.0,3.2,4.7,1.4,1
6.4,3.2,4.5,1.5,1
5.8,2.7,5.1,1.9,2
7.1,3.0,5.9,2.1,2
EOF
# src/utils.py
cat > src/utils.py <<'EOF'
import hashlib
def file_md5(path: str, chunk_size: int = 2**20) -> str:
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()
EOF
# src/train.py
cat > src/train.py <<'EOF'
import os, json, tempfile
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, confusion_matrix, classification_report)
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from utils import file_md5

EXPERIMENT_NAME = os.getenv("EXPERIMENT_NAME", "aicargo_demo")
RUN_NAME = os.getenv("RUN_NAME", "rf-baseline")
DATA_PATH = os.getenv("DATA_PATH", "data/iris.csv")
TARGET_COL = os.getenv("TARGET_COL", "target")
N_ESTIMATORS = int(os.getenv("N_ESTIMATORS", 200))
MAX_DEPTH = int(os.getenv("MAX_DEPTH", 10))
TEST_SIZE = float(os.getenv("TEST_SIZE", 0.2))
RANDOM_STATE = int(os.getenv("RANDOM_STATE", 42))


def plot_confusion_matrix(cm, labels, out_path):
    fig = plt.figure()
    plt.imshow(cm, interpolation="nearest")
    plt.title("Confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], "d"),
                     ha="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)


def main():
    # 0) Tracking URI from env, if set (otherwise MLflow falls back to its default local store)
    tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)

    # 1) experiment
    mlflow.set_experiment(EXPERIMENT_NAME)

    with mlflow.start_run(run_name=RUN_NAME) as run:
        mlflow.set_tags({
            "project": "mlflow-demo",
            "owner": "jiahong",
            "stage": "dev",
            "feature_set": "demo_v1",
            "git_sha": os.getenv("GIT_COMMIT", "")
        })

        # 2) load data
        if DATA_PATH.endswith(".csv"):
            df = pd.read_csv(DATA_PATH)
        elif DATA_PATH.endswith(".parquet"):
            df = pd.read_parquet(DATA_PATH)
        else:
            raise ValueError("Unsupported data format")

        # 2.1 data version artifact
        dataset_info = {
            "path": os.path.abspath(DATA_PATH),
            "md5": file_md5(DATA_PATH),
            "rows": int(len(df)),
            "cols": int(df.shape[1]),
            "loaded_at": datetime.utcnow().isoformat()
        }
        mlflow.log_dict(dataset_info, "data/dataset_info.json")

        # 3) split
        X = df.drop(columns=[TARGET_COL]).values
        y = df[TARGET_COL].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
        )

        # 4) params
        mlflow.log_params({
            "model": "RandomForestClassifier",
            "n_estimators": N_ESTIMATORS,
            "max_depth": MAX_DEPTH,
            "test_size": TEST_SIZE,
            "random_state": RANDOM_STATE
        })

        # 5) train
        model = RandomForestClassifier(
            n_estimators=N_ESTIMATORS,
            max_depth=MAX_DEPTH,
            random_state=RANDOM_STATE,
            n_jobs=-1
        ).fit(X_train, y_train)

        # 6) eval
        y_pred = model.predict(X_test)
        metrics = {
            "acc": float(accuracy_score(y_test, y_pred)),
            "f1_macro": float(f1_score(y_test, y_pred, average="macro")),
            "precision_macro": float(precision_score(y_test, y_pred, average="macro")),
            "recall_macro": float(recall_score(y_test, y_pred, average="macro")),
        }
        mlflow.log_metrics(metrics)

        # 6.1 report & confusion matrix
        mlflow.log_text(classification_report(y_test, y_pred),
                        "eval/classification_report.txt")
        labels = [str(c) for c in np.unique(y)]
        cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
        with tempfile.TemporaryDirectory() as td:
            out = os.path.join(td, "confusion_matrix.png")
            plot_confusion_matrix(cm, labels, out)
            mlflow.log_artifact(out, artifact_path="eval")

        # 7) model + env
        mlflow.sklearn.log_model(
            model, artifact_path="model",
            input_example=X_train[:2],
            signature=infer_signature(X_train, model.predict(X_train))
        )
        if os.path.exists("requirements.txt"):
            mlflow.log_artifact("requirements.txt", artifact_path="env")
        if os.path.exists("conda.yaml"):
            mlflow.log_artifact("conda.yaml", artifact_path="env")

        print("Run logged:", run.info.run_id)


if __name__ == "__main__":
    main()
EOF
# README quick commands
cat > RUN.sh <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
python -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip
pip install -r requirements.txt
# Set your server first: export MLFLOW_TRACKING_URI="http://<host>:5000"
python src/train.py
EOF
chmod +x RUN.sh
echo "✅ Project skeleton created. Next:
1) git init && git add . && git commit -m 'init'
2) export MLFLOW_TRACKING_URI='http://<YOUR_HOST>:5000'
3) ./RUN.sh
4) Open MLflow UI and compare runs."
Option B — PowerShell (Windows)
# Run in PowerShell (Windows)
$ErrorActionPreference = "Stop"
mkdir data, notebooks, src, ".github/workflows" -ea SilentlyContinue | Out-Null
ni data\.gitkeep -Force | Out-Null
ni notebooks\.gitkeep -Force | Out-Null
@"
__pycache__/
*.pyc
.venv/
.env
data/**/*
!data/.gitkeep
mlruns/
.DS_Store
.vscode/
"@ | Out-File .gitignore -Encoding UTF8
@"
mlflow>=2.12.0
pandas>=2.1.0
numpy>=1.26.0
scikit-learn>=1.3.0
matplotlib>=3.8.0
joblib>=1.3.0
"@ | Out-File requirements.txt -Encoding UTF8
@"
name: mlflow-demo
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pip
  - pip:
      - -r requirements.txt
"@ | Out-File conda.yaml -Encoding UTF8
@"
sepal_length,sepal_width,petal_length,petal_width,target
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
6.2,3.4,5.4,2.3,2
6.3,3.3,6.0,2.5,2
5.4,3.9,1.7,0.4,0
6.7,3.1,4.4,1.4,1
6.0,2.9,4.5,1.5,1
4.6,3.2,1.4,0.2,0
7.7,3.8,6.7,2.2,2
5.9,3.0,5.1,1.8,2
4.7,3.2,1.3,0.2,0
7.0,3.2,4.7,1.4,1
6.4,3.2,4.5,1.5,1
5.8,2.7,5.1,1.9,2
7.1,3.0,5.9,2.1,2
"@ | Out-File data\iris.csv -Encoding UTF8
@"
import hashlib
def file_md5(path: str, chunk_size: int = 2**20) -> str:
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()
"@ | Out-File src\utils.py -Encoding UTF8
# src\train.py has the same content as in Option A (the Bash version). Copy that script into
# src\train.py; this block only warns if it is missing so the rest of the bootstrap still completes.
if (-not (Test-Path "src\train.py")) {
    Write-Warning "src\train.py not found. Copy the train.py content from Option A into src\train.py before running RUN.ps1."
}
@"
python -m venv .venv
.\.venv\Scripts\activate
python -m pip install --upgrade pip
pip install -r requirements.txt
# Set your server first: `$env:MLFLOW_TRACKING_URI = "http://<host>:5000"   (or: setx MLFLOW_TRACKING_URI http://<host>:5000, then open a new shell)
python src\train.py
"@ | Out-File RUN.ps1 -Encoding UTF8
Write-Host "✅ Project skeleton created. Next:
1) git init; git add .; git commit -m 'init'
2) `$env:MLFLOW_TRACKING_URI='http://<YOUR_HOST>:5000'
3) .\RUN.ps1
4) Open MLflow UI and compare runs."
What the Structure Contains
your-project/
├─ data/               # raw/intermediate data (versioned by path/hash)
├─ notebooks/          # EDA & prototypes
├─ src/
│  ├─ train.py         # logging params/metrics/plots/model/environment
│  └─ utils.py         # data hash helper
├─ requirements.txt    # pip dependencies
├─ conda.yaml          # optional conda environment
├─ .gitignore
└─ .github/workflows/  # (empty now) add CI later
Point Your Code to the MLflow Server
Pick one:
Linux/macOS/WSL
export MLFLOW_TRACKING_URI="http://<YOUR_MLFLOW_HOST>:5000"
# If behind basic auth / reverse proxy:
export MLFLOW_TRACKING_USERNAME="your_user"
export MLFLOW_TRACKING_PASSWORD="your_pass"
Windows PowerShell
$env:MLFLOW_TRACKING_URI = "http://<YOUR_MLFLOW_HOST>:5000"
# Optional:
$env:MLFLOW_TRACKING_USERNAME = "your_user"
$env:MLFLOW_TRACKING_PASSWORD = "your_pass"
Using S3/MinIO for artifacts? Also set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and MLFLOW_S3_ENDPOINT_URL.
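Before a full training run, you can sanity-check connectivity with a few lines of Python. This is a minimal sketch, assuming MLFLOW_TRACKING_URI is exported in the same shell:

import os
import mlflow
from mlflow.tracking import MlflowClient

# A bad URI or bad credentials fails fast on this cheap round-trip.
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
for exp in MlflowClient().search_experiments():
    print(exp.experiment_id, exp.name)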
Run a Baseline Experiment
# Linux/macOS/WSL
./RUN.sh
# Windows PowerShell
.\RUN.ps1
You’ll see a new run under the aicargo_demo experiment in the MLflow UI with:
- Params: model type, hyperparams, split seed…
- Metrics: accuracy, F1, precision, recall…
- Artifacts: eval/confusion_matrix.png, eval/classification_report.txt, data/dataset_info.json (path, MD5, row/col count), env/requirements.txt, env/conda.yaml
- Model: logged under model/ (with signature & input example)
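Because src/train.py reads its hyperparameters from environment variables, you can launch a small sweep and get directly comparable runs. A minimal sketch (the value grid below is just an example):

import os
import subprocess
import sys

# Each child process becomes one MLflow run; train.py picks up these env vars as hyperparameters.
for n_estimators, max_depth in [(100, 5), (200, 10), (400, 20)]:
    env = {**os.environ, "N_ESTIMATORS": str(n_estimators), "MAX_DEPTH": str(max_depth)}
    subprocess.run([sys.executable, "src/train.py"], env=env, check=True)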
Record Everything That Matters (What the script already does)
- Data Version
  - Path-based versioning: the dataset path includes the version (e.g., s3://…/2020Q4_v3/).
  - Hashing: dataset_info.json records MD5 + shape + timestamp.
  - Tip: use DVC or date/commit-stamped paths for larger pipelines.
- Hyperparameters
  - mlflow.log_params({...}) stores all tuning knobs.
- Metrics
  - mlflow.log_metrics({...}) for accuracy/F1/…
  - For deep learning, log per-epoch metrics (step-wise); see the sketch after this list.
- Visuals & Reports
  - Confusion matrix PNG, classification report TXT.
  - Add ROC/PR/feature-importance/SHAP as you grow.
- Model & Environment
  - mlflow.sklearn.log_model(...) saves the model + signature.
  - Log requirements.txt and/or conda.yaml for reproducibility.
- Tags
  - project, owner, stage, git_sha, feature_set: great for filtering & comparisons.
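For iterative training loops, the step argument is what turns logged metrics into curves in the UI. A minimal sketch with placeholder values (not from a real model):

import mlflow

with mlflow.start_run(run_name="per-epoch-demo"):
    for epoch, (train_loss, val_acc) in enumerate([(0.9, 0.70), (0.6, 0.81), (0.4, 0.88)]):
        # step= lets the MLflow UI plot each metric over epochs
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_acc", val_acc, step=epoch)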
Compare Runs in the UI
- Go to Experiments → aicargo_demo.
- Select multiple runs → Compare.
- Use:
  - Parallel coordinates to see which params drive metrics.
  - Scatter/Line charts to visualize trade-offs.
  - The Artifacts tab to inspect plots & reports.
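You can also pull the same comparison into a notebook: mlflow.search_runs returns one DataFrame row per run. A sketch using this tutorial's experiment and metric names:

import mlflow

runs = mlflow.search_runs(
    experiment_names=["aicargo_demo"],
    order_by=["metrics.f1_macro DESC"],
)
# params.* and metrics.* become DataFrame columns, so sorting/filtering is plain pandas.
print(runs[["run_id", "params.n_estimators", "params.max_depth", "metrics.f1_macro"]].head())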
(Optional) Autologging Shortcuts
If you use the following frameworks, add one line to capture lots of metadata automatically:
import mlflow, mlflow.sklearn; mlflow.sklearn.autolog()
# or:
# mlflow.tensorflow.autolog()
# mlflow.pytorch.autolog()
# mlflow.xgboost.autolog()
# mlflow.lightgbm.autolog()
Still log data version and custom plots explicitly.
(Optional) Model Registry & Serving
Register a model (either from the UI or in code with registered_model_name), then:
mlflow models serve -m "models:/your_model_name/Production" -p 5001 --env-manager local
Promote versions between Staging and Production in the UI.
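In code, registration is one extra argument to log_model, and a registered version can be loaded back by name. A minimal sketch (the registry name demo_rf and version 1 are hypothetical, and your tracking server needs a database-backed registry):

import mlflow
import mlflow.pyfunc
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)

with mlflow.start_run(run_name="register-demo"):
    # registered_model_name creates the registered model (or adds a new version to it)
    mlflow.sklearn.log_model(model, artifact_path="model", registered_model_name="demo_rf")

# Load a registered version back for batch scoring; the same models:/ URI works with `mlflow models serve`.
loaded = mlflow.pyfunc.load_model("models:/demo_rf/1")
print(loaded.predict(X[:3]))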
Troubleshooting
- No runs showing up: ensure MLFLOW_TRACKING_URI is set in the same shell that runs Python.
- Artifact logging fails: verify the artifact store credentials/endpoint environment variables.
- Plots not visible: make sure you close the figure (we call plt.close(fig)) and log the file path as an artifact.
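When artifact logging misbehaves, it also helps to print where a run is actually trying to write. A quick sketch:

import mlflow

# The artifact URI shows which store (local path, S3/MinIO bucket, …) the server handed this run.
with mlflow.start_run(run_name="artifact-uri-check"):
    print(mlflow.get_artifact_uri())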
Next Steps
- Wire this into CI (GitHub Actions) to train on every PR and tag runs with the GIT_COMMIT.
- Add ROC/PR/SHAP and your domain-specific diagnostics.
- Switch data/iris.csv to your real dataset and keep versioned paths.
That’s it! You now have a repeatable project template, a one-command bootstrapper, and a training script that logs everything useful to MLflow—ready for comparisons, governance, and deployment.