Model Evaluation
This guide covers MLflow's core model evaluation capabilities for classification and regression tasks, showing how to use automated metrics, visualizations, and diagnostic tools to comprehensively assess model performance.
Quick Start: Evaluating a Classification Model
The simplest way to evaluate a model is with MLflow's unified evaluation API:
import mlflow
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from mlflow.models import infer_signature
# Load the UCI Adult Dataset
X, y = shap.datasets.adult()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Train model
model = xgb.XGBClassifier().fit(X_train, y_train)
# Create evaluation dataset
eval_data = X_test.copy()
eval_data["label"] = y_test
with mlflow.start_run():
    # Log model with signature
    signature = infer_signature(X_test, model.predict(X_test))
    mlflow.sklearn.log_model(model, name="model", signature=signature)
    model_uri = mlflow.get_artifact_uri("model")

    # Comprehensive evaluation
    result = mlflow.models.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    print(f"Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(f"F1 Score: {result.metrics['f1_score']:.3f}")
    print(f"ROC AUC: {result.metrics['roc_auc']:.3f}")
This single call automatically generates:
- Performance metrics: accuracy, precision, recall, F1 score, ROC-AUC
- Visualizations: confusion matrix, ROC curve, precision-recall curve
- Feature importance: SHAP values and feature contribution analysis
- Model artifacts: all plots and diagnostics saved to MLflow
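To see exactly what a given run produced, you can inspect the returned result object; a minimal sketch (the exact metric and artifact names depend on your MLflow version):

# Inspect what the evaluation produced (names vary by MLflow version)
print(sorted(result.metrics.keys()))    # e.g. accuracy_score, f1_score, roc_auc, ...
print(sorted(result.artifacts.keys()))  # logged plots and tables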
Supported Model Types
MLflow supports different model types, each with specialized metrics and evaluations:
- classifier: binary and multiclass classification models
- regressor: regression models for continuous target prediction
For classification tasks, MLflow automatically computes a comprehensive set of metrics:
# Binary Classification
result = mlflow.models.evaluate(
model_uri,
eval_data,
targets="label",
model_type="classifier", # Automatically detects binary vs multiclass
evaluators=["default"],
)
# Access classification-specific metrics
metrics = result.metrics
print(f"Precision: {metrics['precision_score']:.3f}")
print(f"Recall: {metrics['recall_score']:.3f}")
print(f"F1 Score: {metrics['f1_score']:.3f}")
print(f"ROC AUC: {metrics['roc_auc']:.3f}")
Automatic classification metrics:
- Accuracy, precision, recall, F1 score
- ROC-AUC and precision-recall AUC
- Log loss and Brier score
- Confusion matrix and classification report
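If you need a quick per-class breakdown beyond the logged metrics, the per-row results table can be fed straight into scikit-learn; a minimal sketch, assuming the table keeps the "label" and "prediction" columns used elsewhere in this guide:

# Rebuild a classification report from the logged per-row results table
# (column names are assumptions based on this guide's examples)
from sklearn.metrics import classification_report

eval_table = result.tables["eval_results_table"]
print(classification_report(eval_table["label"], eval_table["prediction"]))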
For regression tasks, MLflow provides comprehensive error analysis:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
# Load regression dataset
housing = fetch_california_housing(as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, test_size=0.2, random_state=42
)
# Train regression model
reg_model = LinearRegression().fit(X_train, y_train)
# Create evaluation dataset
eval_data = X_test.copy()
eval_data["target"] = y_test
with mlflow.start_run():
    # Log and evaluate regression model
    signature = infer_signature(X_train, reg_model.predict(X_train))
    mlflow.sklearn.log_model(reg_model, name="model", signature=signature)
    model_uri = mlflow.get_artifact_uri("model")

    result = mlflow.models.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="regressor",
        evaluators=["default"],
    )

    print(f"MAE: {result.metrics['mean_absolute_error']:.3f}")
    print(f"RMSE: {result.metrics['root_mean_squared_error']:.3f}")
    print(f"R² Score: {result.metrics['r2_score']:.3f}")
Automatic regression metrics:
- Mean absolute error (MAE)
- Mean squared error (MSE) and root mean squared error (RMSE)
- R² score and adjusted R²
- Mean absolute percentage error (MAPE)
- Residual plots and distribution analysis
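Residuals can also be examined manually from the logged results table; a minimal sketch, assuming the table contains the "target" column and a "prediction" column as in the examples elsewhere in this guide:

# Manual residual analysis from the logged per-row results table
# (the "prediction" column name is an assumption; verify for your version)
eval_table = result.tables["eval_results_table"]
residuals = eval_table["target"] - eval_table["prediction"]
print(residuals.describe())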
Advanced Evaluation Configuration
Specifying Evaluators
Control which evaluators run during evaluation:
# Run only default metrics (fastest)
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    evaluators=["default"],
)
# Include SHAP explainer for feature importance
# Include SHAP explainer for feature importance
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    evaluators=["default"],
    evaluator_config={"log_explainer": True},
)
Configuration Options Reference
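As a quick illustration, the snippet below combines the evaluator_config options used throughout this guide into a single call; treat the keys as illustrative and consult the configuration options reference for the authoritative list in your MLflow version:

# Illustrative only -- these keys are the ones used elsewhere in this guide
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    evaluators=["default"],
    evaluator_config={
        "log_explainer": True,      # log a SHAP explainer with the run
        "explainer_type": "exact",  # SHAP algorithm to use
        "max_error_examples": 50,   # cap the number of logged error examples
    },
)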
Custom Metrics and Artifacts
- Custom metrics
- Custom artifacts
MLflow provides a powerful framework for defining custom evaluation metrics with the make_metric function:
import mlflow
import numpy as np
from mlflow.models import make_metric
def weighted_accuracy(predictions, targets, metrics, sample_weights=None):
    """Custom weighted accuracy metric."""
    if sample_weights is None:
        return (predictions == targets).mean()
    else:
        correct = predictions == targets
        return np.average(correct, weights=sample_weights)
# Create custom metric
custom_accuracy = make_metric(
    eval_fn=weighted_accuracy, greater_is_better=True, name="weighted_accuracy"
)
# Use in evaluation
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    extra_metrics=[custom_accuracy],
)
Create custom visualization and analysis artifacts:
import matplotlib.pyplot as plt
import os
def create_residual_plot(eval_df, builtin_metrics, artifacts_dir):
    """Create custom residual plot for regression models."""
    residuals = eval_df["target"] - eval_df["prediction"]

    plt.figure(figsize=(10, 6))
    plt.scatter(eval_df["prediction"], residuals, alpha=0.6)
    plt.axhline(y=0, color="r", linestyle="--")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")

    plot_path = os.path.join(artifacts_dir, "residual_plot.png")
    plt.savefig(plot_path)
    plt.close()

    return {"residual_plot": plot_path}
# Use custom artifact
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="target",
    model_type="regressor",
    custom_artifacts=[create_residual_plot],
)
Working with Evaluation Results
The evaluation result object provides comprehensive access to all generated metrics and artifacts:
# Run evaluation
result = mlflow.models.evaluate(
    model_uri, eval_data, targets="label", model_type="classifier"
)
# Access metrics
print("All Metrics:")
for metric_name, value in result.metrics.items():
    print(f"  {metric_name}: {value}")
# Access artifacts (plots, tables, etc.)
print("\nGenerated Artifacts:")
for artifact_name, path in result.artifacts.items():
    print(f"  {artifact_name}: {path}")
# Access evaluation dataset
eval_table = result.tables["eval_results_table"]
print(f"\nEvaluation table shape: {eval_table.shape}")
print(f"Columns: {list(eval_table.columns)}")
Model Comparison and Advanced Workflows
- Model comparison
- Cross-validation
- Automated selection
Systematically compare multiple models:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Define models to compare
models = {
    "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "logistic_regression": LogisticRegression(random_state=42),
    "svm": SVC(probability=True, random_state=42),
}
# Evaluate each model
results = {}
for model_name, model in models.items():
    with mlflow.start_run(run_name=f"eval_{model_name}"):
        # Train model
        model.fit(X_train, y_train)

        # Log model
        signature = infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(model, name="model", signature=signature)
        model_uri = mlflow.get_artifact_uri("model")

        # Evaluate model
        result = mlflow.models.evaluate(
            model_uri, eval_data, targets="label", model_type="classifier"
        )
        results[model_name] = result.metrics

        # Log comparison metrics
        mlflow.log_metrics(
            {
                "accuracy": result.metrics["accuracy_score"],
                "f1": result.metrics["f1_score"],
                "roc_auc": result.metrics["roc_auc"],
            }
        )

# Compare results
comparison_df = pd.DataFrame(results).T
print("Model Comparison:")
print(comparison_df[["accuracy_score", "f1_score", "roc_auc"]].round(3))
Combine MLflow evaluation with cross-validation:
from sklearn.model_selection import cross_val_score, StratifiedKFold
def evaluate_with_cv(model, X, y, eval_data, cv_folds=5):
    """Evaluate model with cross-validation and final test evaluation."""
    with mlflow.start_run():
        # Cross-validation scores
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring="f1_weighted")

        # Log CV results
        mlflow.log_metrics(
            {"cv_mean_f1": cv_scores.mean(), "cv_std_f1": cv_scores.std()}
        )

        # Train on full dataset
        model.fit(X, y)

        # Final evaluation
        signature = infer_signature(X, model.predict(X))
        mlflow.sklearn.log_model(model, name="model", signature=signature)
        model_uri = mlflow.get_artifact_uri("model")

        result = mlflow.models.evaluate(
            model_uri, eval_data, targets="label", model_type="classifier"
        )

        # Compare CV and test performance
        test_f1 = result.metrics["f1_score"]
        cv_f1 = cv_scores.mean()

        mlflow.log_metrics(
            {
                "cv_vs_test_diff": abs(cv_f1 - test_f1),
                "potential_overfit": cv_f1 - test_f1 > 0.05,
            }
        )

        return result
# Usage
result = evaluate_with_cv(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train,
    y_train,
    eval_data,
)
Automated model selection based on evaluation metrics:
def evaluate_and_select_best_model(
    models, X_train, y_train, eval_data, metric="f1_score"
):
    """Evaluate multiple models and select the best performer."""
    results = {}
    best_score = -1
    best_model_name = None

    for model_name, model in models.items():
        with mlflow.start_run(run_name=f"candidate_{model_name}"):
            # Train and evaluate
            model.fit(X_train, y_train)

            signature = infer_signature(X_train, model.predict(X_train))
            mlflow.sklearn.log_model(model, name="model", signature=signature)
            model_uri = mlflow.get_artifact_uri("model")

            result = mlflow.models.evaluate(
                model_uri, eval_data, targets="label", model_type="classifier"
            )

            score = result.metrics[metric]
            results[model_name] = score

            # Track best model
            if score > best_score:
                best_score = score
                best_model_name = model_name

            # Log selection metrics
            mlflow.log_metrics(
                {"selection_score": score, "is_best": score == best_score}
            )

    print(f"Best model: {best_model_name} (Score: {best_score:.3f})")
    return best_model_name, results
# Use automated selection
best_model, all_scores = evaluate_and_select_best_model(
    models, X_train, y_train, eval_data, metric="f1_score"
)
Model Validation and Quality Gates
MLflow 2.18.0 moved model validation from the mlflow.models.evaluate() API to the dedicated mlflow.validate_evaluation_results() API. The related parameters (such as baseline_model) are deprecated and will be removed from the legacy API in a future release.
With the mlflow.validate_evaluation_results() API, you can validate the metrics produced during model evaluation to assess model quality against a baseline.
from mlflow.models import MetricThreshold
# Evaluate your model first
result = mlflow.models.evaluate(
    model_uri, eval_data, targets="label", model_type="classifier"
)

# Define static performance thresholds
static_thresholds = {
    "accuracy_score": MetricThreshold(
        threshold=0.85, greater_is_better=True  # Must achieve 85% accuracy
    ),
    "precision_score": MetricThreshold(
        threshold=0.80, greater_is_better=True  # Must achieve 80% precision
    ),
    "recall_score": MetricThreshold(
        threshold=0.75, greater_is_better=True  # Must achieve 75% recall
    ),
}

# Validate against static thresholds
try:
    mlflow.validate_evaluation_results(
        candidate_result=result,
        baseline_result=None,  # No baseline comparison
        validation_thresholds=static_thresholds,
    )
    print("✅ Model meets all static performance thresholds.")
except mlflow.exceptions.ModelValidationFailedException as e:
    print(f"❌ Model failed static validation: {e}")
For more information on model validation behavior and outputs, see the mlflow.validate_evaluation_results() API documentation.
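The same API can also gate a candidate against a baseline model's results, which is the workflow the deprecated baseline_model parameter used to cover. A minimal sketch, assuming a previously logged baseline model at baseline_model_uri (hypothetical) and that MetricThreshold accepts min_absolute_change; verify both against the API reference for your MLflow version:

# Baseline comparison sketch (hedged -- verify parameter names for your version)
baseline_result = mlflow.models.evaluate(
    baseline_model_uri,  # hypothetical URI of a previously logged baseline model
    eval_data,
    targets="label",
    model_type="classifier",
)

comparison_thresholds = {
    "f1_score": MetricThreshold(
        threshold=0.75,            # absolute floor the candidate must reach
        min_absolute_change=0.01,  # and it must beat the baseline by at least 0.01
        greater_is_better=True,
    ),
}

try:
    mlflow.validate_evaluation_results(
        candidate_result=result,
        baseline_result=baseline_result,
        validation_thresholds=comparison_thresholds,
    )
    print("✅ Candidate clears the baseline comparison.")
except mlflow.exceptions.ModelValidationFailedException as e:
    print(f"❌ Candidate did not clear the baseline comparison: {e}")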
Error Analysis and Debugging
- Error investigation
- Feature analysis
Analyze model errors in detail:
def analyze_model_errors(result, eval_data, targets, top_n=20):
    """Analyze model errors in detail."""
    # Load evaluation results
    eval_table = result.tables["eval_results_table"]

    # Identify errors
    errors = eval_table[eval_table["prediction"] != eval_table[targets]]

    if len(errors) > 0:
        print(f"Total errors: {len(errors)} out of {len(eval_table)} predictions")
        print(f"Error rate: {len(errors) / len(eval_table) * 100:.2f}%")

        # Most confident wrong predictions
        if "prediction_score" in errors.columns:
            confident_errors = errors.nlargest(top_n, "prediction_score")
            print(f"\nTop {top_n} most confident errors:")
            print(confident_errors[["prediction", targets, "prediction_score"]].head())

        # Error patterns by true class
        error_by_class = errors.groupby(targets).size()
        print("\nErrors by true class:")
        print(error_by_class)

    return errors
# Usage
errors = analyze_model_errors(result, eval_data, "label")
Analyze how model errors relate to input features:
def analyze_errors_by_features(model_uri, eval_data, targets, feature_columns):
    """Analyze how model errors relate to input features."""
    # Get model predictions
    model = mlflow.pyfunc.load_model(model_uri)
    predictions = model.predict(eval_data[feature_columns])

    # Create analysis dataframe
    analysis_df = eval_data.copy()
    analysis_df["prediction"] = predictions
    analysis_df["is_error"] = analysis_df["prediction"] != analysis_df[targets]

    # Feature statistics for errors vs correct predictions
    feature_stats = {}
    for feature in feature_columns:
        if analysis_df[feature].dtype in ["int64", "float64"]:
            # Numerical features
            correct_mean = analysis_df[~analysis_df["is_error"]][feature].mean()
            error_mean = analysis_df[analysis_df["is_error"]][feature].mean()

            feature_stats[feature] = {
                "correct_mean": correct_mean,
                "error_mean": error_mean,
                "difference": abs(error_mean - correct_mean),
                "relative_difference": (
                    abs(error_mean - correct_mean) / correct_mean
                    if correct_mean != 0
                    else 0
                ),
            }

    # Sort features by impact on errors
    numerical_features = [
        (k, v["relative_difference"])
        for k, v in feature_stats.items()
        if "relative_difference" in v
    ]
    numerical_features.sort(key=lambda x: x[1], reverse=True)

    print("Features most associated with errors:")
    for feature, diff in numerical_features[:5]:
        print(f"  {feature}: {diff:.3f}")

    return feature_stats, analysis_df
# Usage
feature_stats, analysis = analyze_errors_by_features(
    model_uri,
    eval_data,
    "label",
    feature_columns=eval_data.drop(columns=["label"]).columns.tolist(),
)
Best Practices and Optimization
- Best practices
- Performance optimization
- Reproducible evaluation
A complete evaluation workflow that incorporates best practices:
def comprehensive_model_evaluation(
    model, X_train, y_train, eval_data, targets, model_type
):
    """Complete evaluation workflow with best practices."""
    with mlflow.start_run():
        # Train model
        model.fit(X_train, y_train)

        # Log training info
        mlflow.log_params(
            {"model_class": model.__class__.__name__, "training_samples": len(X_train)}
        )

        # Log model with signature
        signature = infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(model, name="model", signature=signature)
        model_uri = mlflow.get_artifact_uri("model")

        # Comprehensive evaluation
        result = mlflow.models.evaluate(
            model_uri,
            eval_data,
            targets=targets,
            model_type=model_type,
            evaluators=["default"],
            evaluator_config={
                "log_explainer": True,
                "explainer_type": "exact",
                "log_model_explanations": True,
            },
        )

        return result
Optimize evaluation for large datasets and complex models:
# Optimize evaluation performance
result = mlflow.models.evaluate(
    model_uri,
    eval_data.sample(n=10000, random_state=42),  # Sample for faster evaluation
    targets="label",
    model_type="classifier",
    evaluators=["default"],
    evaluator_config={
        "log_explainer": False,  # Skip SHAP for speed
        "max_error_examples": 50,  # Reduce error analysis
    },
)
# For very large datasets - evaluate in batches
def evaluate_in_batches(model_uri, large_eval_data, targets, batch_size=1000):
    """Evaluate large datasets in batches to manage memory."""
    # Load the model once, outside the batch loop
    model = mlflow.pyfunc.load_model(model_uri)

    all_predictions = []
    all_targets = []

    for i in range(0, len(large_eval_data), batch_size):
        batch = large_eval_data.iloc[i : i + batch_size]

        # Get predictions for batch
        batch_predictions = model.predict(batch.drop(columns=[targets]))
        all_predictions.extend(batch_predictions)
        all_targets.extend(batch[targets].values)

    # Create final evaluation dataset
    final_eval_data = pd.DataFrame(
        {"prediction": all_predictions, "target": all_targets}
    )

    # Evaluate using static dataset approach
    result = mlflow.models.evaluate(
        data=final_eval_data,
        predictions="prediction",
        targets="target",
        model_type="classifier",
    )

    return result
Ensure consistent evaluation results:
def reproducible_evaluation(model, eval_data, targets, random_seed=42):
    """Ensure reproducible evaluation results."""
    # Set random seeds
    np.random.seed(random_seed)

    with mlflow.start_run():
        # Log evaluation configuration
        mlflow.log_params(
            {
                "eval_random_seed": random_seed,
                "eval_data_size": len(eval_data),
                "eval_timestamp": pd.Timestamp.now().isoformat(),
            }
        )

        # Consistent data ordering
        eval_data_sorted = eval_data.sort_values(
            by=eval_data.columns.tolist()
        ).reset_index(drop=True)

        # Run evaluation
        result = mlflow.models.evaluate(
            model,
            eval_data_sorted,
            targets=targets,
            model_type="classifier",
            evaluator_config={"random_seed": random_seed},
        )

        return result
Conclusion
MLflow's model evaluation capabilities provide a comprehensive framework for assessing model performance on classification and regression tasks. The unified API simplifies complex evaluation workflows while delivering deep insight into model behavior through automated metrics, visualizations, and diagnostic tools.
Key benefits of MLflow model evaluation include:
- Comprehensive evaluation: automatic generation of task-specific metrics and visualizations
- Reproducible workflows: consistent evaluation processes with full tracking and versioning
- Advanced analysis: error investigation, feature impact analysis, and model comparison capabilities
- Production integration: seamless integration with MLflow tracking for experiment organization and reporting
Whether you are evaluating a single model or comparing several candidates, MLflow's evaluation framework provides the tools you need to make informed decisions about model performance and production readiness.