
Model Evaluation

This guide covers MLflow's core model evaluation capabilities for classification and regression tasks, showing how to use automated metrics, visualizations, and diagnostic tools to assess model performance comprehensively.

Quickstart: Evaluating a Classification Model

The simplest way to evaluate a model is with MLflow's unified evaluation API:

import mlflow
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from mlflow.models import infer_signature

# Load the UCI Adult Dataset
X, y = shap.datasets.adult()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Train model
model = xgb.XGBClassifier().fit(X_train, y_train)

# Create evaluation dataset
eval_data = X_test.copy()
eval_data["label"] = y_test

with mlflow.start_run():
    # Log model with signature
    signature = infer_signature(X_test, model.predict(X_test))
    mlflow.sklearn.log_model(model, name="model", signature=signature)
    model_uri = mlflow.get_artifact_uri("model")

    # Comprehensive evaluation
    result = mlflow.models.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    print(f"Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(f"F1 Score: {result.metrics['f1_score']:.3f}")
    print(f"ROC AUC: {result.metrics['roc_auc']:.3f}")

This single call automatically generates:

  • Performance metrics: accuracy, precision, recall, F1 score, ROC-AUC
  • Visualizations: confusion matrix, ROC curve, precision-recall curve
  • Feature importance: SHAP values and feature contribution analysis
  • Model artifacts: all plots and diagnostic information saved to MLflow

Supported Model Types

MLflow supports different model types, each with specialized metrics and evaluations:

  • classifier - binary and multiclass classification models
  • regressor - regression models for continuous target prediction (see the regression sketch at the end of this section)

For classification tasks, MLflow automatically computes comprehensive metrics:

# Binary Classification
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",  # Automatically detects binary vs multiclass
    evaluators=["default"],
)

# Access classification-specific metrics
metrics = result.metrics
print(f"Precision: {metrics['precision_score']:.3f}")
print(f"Recall: {metrics['recall_score']:.3f}")
print(f"F1 Score: {metrics['f1_score']:.3f}")
print(f"ROC AUC: {metrics['roc_auc']:.3f}")

Automatic Classification Metrics

  • Accuracy, precision, recall, F1 score
  • ROC-AUC and precision-recall AUC
  • Log loss and Brier score
  • Confusion matrix and classification report
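
The regressor model type works through the same API. Below is a minimal sketch using scikit-learn's diabetes toy dataset and a random forest; the dataset and model choice are illustrative assumptions rather than part of this guide, and the exact regression metric keys (e.g., mean absolute error, RMSE, R²) may vary by MLflow version:

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.models import infer_signature

# Illustrative regression setup (dataset and model are assumptions)
X_reg, y_reg = load_diabetes(return_X_y=True, as_frame=True)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg, y_reg, test_size=0.33, random_state=42
)
reg_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    Xr_train, yr_train
)

reg_eval_data = Xr_test.copy()
reg_eval_data["target"] = yr_test

with mlflow.start_run():
    signature = infer_signature(Xr_train, reg_model.predict(Xr_train))
    mlflow.sklearn.log_model(reg_model, name="model", signature=signature)
    reg_model_uri = mlflow.get_artifact_uri("model")

    # Regression-specific metrics and plots are generated automatically
    reg_result = mlflow.models.evaluate(
        reg_model_uri,
        reg_eval_data,
        targets="target",
        model_type="regressor",
        evaluators=["default"],
    )

print(sorted(reg_result.metrics.keys()))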

Advanced Evaluation Configuration

Specifying Evaluators

Control which evaluators run during evaluation:

# Run only default metrics (fastest)
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    evaluators=["default"],
)

# Include SHAP explainer for feature importance
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    evaluators=["default"],
    evaluator_config={"log_explainer": True},
)

Configuration Options Reference

SHAP Configuration

  • log_explainer: whether to log the SHAP explainer as a model
  • explainer_type: SHAP explainer type ("exact", "permutation", "partition")
  • max_error_examples: maximum number of error examples to analyze
  • log_model_explanations: whether to log per-prediction explanations

Performance Options

  • pos_label: positive class label for binary classification metrics
  • average: averaging strategy for multiclass metrics ("macro", "micro", "weighted")
  • sample_weights: sample weights for weighted metrics
  • normalize: normalization mode for the confusion matrix ("true", "pred", "all")
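
As a sketch, several options from the two lists above can be combined in a single evaluator_config dictionary; the values shown are illustrative assumptions, and which keys are honored may depend on your MLflow version:

result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    evaluators=["default"],
    evaluator_config={
        "log_explainer": True,  # Log the SHAP explainer as a model
        "explainer_type": "permutation",  # SHAP explainer algorithm
        "log_model_explanations": True,  # Log per-prediction explanations
        "max_error_examples": 50,  # Cap the number of error examples analyzed
        "pos_label": 1,  # Positive class for binary metrics
        "average": "weighted",  # Averaging strategy if the task is multiclass
    },
)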

Custom Metrics and Artifacts

MLflow provides a powerful framework for defining custom evaluation metrics with the make_metric function:

import mlflow
import numpy as np
from mlflow.models import make_metric


def weighted_accuracy(predictions, targets, metrics, sample_weights=None):
"""Custom weighted accuracy metric."""
if sample_weights is None:
return (predictions == targets).mean()
else:
correct = predictions == targets
return np.average(correct, weights=sample_weights)


# Create custom metric
custom_accuracy = make_metric(
    eval_fn=weighted_accuracy, greater_is_better=True, name="weighted_accuracy"
)

# Use in evaluation
result = mlflow.models.evaluate(
    model_uri,
    eval_data,
    targets="label",
    model_type="classifier",
    extra_metrics=[custom_accuracy],
)

Working with Evaluation Results

The evaluation result object provides comprehensive access to all generated metrics and artifacts:

# Run evaluation
result = mlflow.models.evaluate(
    model_uri, eval_data, targets="label", model_type="classifier"
)

# Access metrics
print("All Metrics:")
for metric_name, value in result.metrics.items():
print(f" {metric_name}: {value}")

# Access artifacts (plots, tables, etc.)
print("\nGenerated Artifacts:")
for artifact_name, path in result.artifacts.items():
print(f" {artifact_name}: {path}")

# Access evaluation dataset
eval_table = result.tables["eval_results_table"]
print(f"\nEvaluation table shape: {eval_table.shape}")
print(f"Columns: {list(eval_table.columns)}")

Model Comparison and Advanced Workflows

Systematically compare multiple models:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Define models to compare
models = {
"random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
"logistic_regression": LogisticRegression(random_state=42),
"svm": SVC(probability=True, random_state=42),
}

# Evaluate each model
results = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=f"eval_{model_name}"):
        # Train model
        model.fit(X_train, y_train)

        # Log model
        signature = infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(model, name="model", signature=signature)
        model_uri = mlflow.get_artifact_uri("model")

        # Evaluate model
        result = mlflow.models.evaluate(
            model_uri, eval_data, targets="label", model_type="classifier"
        )

        results[model_name] = result.metrics

        # Log comparison metrics
        mlflow.log_metrics(
            {
                "accuracy": result.metrics["accuracy_score"],
                "f1": result.metrics["f1_score"],
                "roc_auc": result.metrics["roc_auc"],
            }
        )

# Compare results
comparison_df = pd.DataFrame(results).T
print("Model Comparison:")
print(comparison_df[["accuracy_score", "f1_score", "roc_auc"]].round(3))
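
From here, one simple way to pick a winner is to rank the comparison table; the choice of F1 as the ranking metric below is an illustrative assumption:

# Rank candidates by F1 score (illustrative ranking criterion)
best_model_name = comparison_df["f1_score"].idxmax()
print(f"Best model by F1 score: {best_model_name}")
print(comparison_df.sort_values("f1_score", ascending=False).round(3))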

Model Validation and Quality Gates

Note

As of MLflow 2.18.0, model validation functionality has moved from the mlflow.models.evaluate() API to the dedicated mlflow.validate_evaluation_results() API. The related parameters (such as baseline_model) are deprecated and will be removed from the legacy API in a future release.

With the mlflow.validate_evaluation_results() API, you can validate the metrics produced during model evaluation to assess model quality against a baseline:

from mlflow.models import MetricThreshold

# Evaluate your model first
result = mlflow.models.evaluate(
    model_uri, eval_data, targets="label", model_type="classifier"
)

# Define static performance thresholds
static_thresholds = {
"accuracy_score": MetricThreshold(
threshold=0.85, greater_is_better=True # Must achieve 85% accuracy
),
"precision_score": MetricThreshold(
threshold=0.80, greater_is_better=True # Must achieve 80% precision
),
"recall_score": MetricThreshold(
threshold=0.75, greater_is_better=True # Must achieve 75% recall
),
}

# Validate against static thresholds
try:
    mlflow.validate_evaluation_results(
        candidate_result=result,
        baseline_result=None,  # No baseline comparison
        validation_thresholds=static_thresholds,
    )
    print("✅ Model meets all static performance thresholds.")
except mlflow.exceptions.ModelValidationFailedException as e:
    print(f"❌ Model failed static validation: {e}")

For more details on model validation behavior and outputs, see the mlflow.validate_evaluation_results() API documentation.
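
The same API can also compare a candidate against a baseline run. The sketch below assumes baseline_model_uri is a placeholder URI for a previously logged baseline model (not from this guide) and that MetricThreshold's min_relative_change parameter is available in your MLflow version:

# Hypothetical baseline comparison: `baseline_model_uri` points at a
# previously logged baseline model (placeholder name, not from this guide)
baseline_result = mlflow.models.evaluate(
    baseline_model_uri, eval_data, targets="label", model_type="classifier"
)

# Require the candidate to beat the baseline F1 score by at least 2% (relative)
comparison_thresholds = {
    "f1_score": MetricThreshold(min_relative_change=0.02, greater_is_better=True),
}

try:
    mlflow.validate_evaluation_results(
        candidate_result=result,
        baseline_result=baseline_result,
        validation_thresholds=comparison_thresholds,
    )
    print("✅ Candidate model outperforms the baseline.")
except mlflow.exceptions.ModelValidationFailedException as e:
    print(f"❌ Candidate failed baseline validation: {e}")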

Error Analysis and Debugging

Analyze model errors in detail:

def analyze_model_errors(result, eval_data, targets, top_n=20):
"""Analyze model errors in detail."""

# Load evaluation results
eval_table = result.tables["eval_results_table"]

# Identify errors
errors = eval_table[eval_table["prediction"] != eval_table[targets]]

if len(errors) > 0:
print(f"Total errors: {len(errors)} out of {len(eval_table)} predictions")
print(f"Error rate: {len(errors) / len(eval_table) * 100:.2f}%")

# Most confident wrong predictions
if "prediction_score" in errors.columns:
confident_errors = errors.nlargest(top_n, "prediction_score")
print(f"\nTop {top_n} most confident errors:")
print(confident_errors[["prediction", targets, "prediction_score"]].head())

# Error patterns by true class
error_by_class = errors.groupby(targets).size()
print(f"\nErrors by true class:")
print(error_by_class)

return errors


# Usage
errors = analyze_model_errors(result, eval_data, "label")
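
If this is run inside an active MLflow run, a simple follow-up (sketched here as an assumption, not part of the original workflow) is to persist the error table as a run artifact for later inspection:

# Save the misclassified rows and attach them to the current run
errors.to_csv("error_analysis.csv", index=False)
mlflow.log_artifact("error_analysis.csv")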

Best Practices and Optimization

A complete evaluation workflow incorporating best practices:

def comprehensive_model_evaluation(
    model, X_train, y_train, eval_data, targets, model_type
):
    """Complete evaluation workflow with best practices."""

    with mlflow.start_run():
        # Train model
        model.fit(X_train, y_train)

        # Log training info
        mlflow.log_params(
            {"model_class": model.__class__.__name__, "training_samples": len(X_train)}
        )

        # Log model with signature
        signature = infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(model, name="model", signature=signature)
        model_uri = mlflow.get_artifact_uri("model")

        # Comprehensive evaluation
        result = mlflow.models.evaluate(
            model_uri,
            eval_data,
            targets=targets,
            model_type=model_type,
            evaluators=["default"],
            evaluator_config={
                "log_explainer": True,
                "explainer_type": "exact",
                "log_model_explanations": True,
            },
        )

        return result
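
A hypothetical invocation of this workflow on the Adult dataset from the quickstart might look like the following; the model choice is an illustrative assumption:

from sklearn.ensemble import GradientBoostingClassifier

# Run the full train-log-evaluate workflow defined above
result = comprehensive_model_evaluation(
    GradientBoostingClassifier(random_state=42),
    X_train,
    y_train,
    eval_data,
    targets="label",
    model_type="classifier",
)
print(f"F1 Score: {result.metrics['f1_score']:.3f}")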

Conclusion

MLflow's model evaluation capabilities provide a comprehensive framework for assessing model performance on classification and regression tasks. The unified API streamlines complex evaluation workflows while offering deep insight into model behavior through automated metrics, visualizations, and diagnostic tools.

Key benefits of MLflow model evaluation include:

  • Comprehensive evaluation: automatic generation of task-specific metrics and visualizations
  • Reproducible workflows: consistent evaluation processes with full tracking and versioning
  • Advanced analysis: error investigation, feature impact analysis, and model comparison capabilities
  • Production integration: seamless integration with MLflow Tracking for experiment organization and reporting

Whether you are evaluating a single model or comparing multiple candidates, MLflow's evaluation framework provides the tools you need to make informed decisions about model performance and production readiness.