跳到主要内容

自定义指标与可视化

MLflow 的评估框架允许您定义自定义指标并创建专门的可视化,以满足您的特定业务需求。当标准指标无法体现您所在领域特有的成功标准,或者需要定制可视化分析来与利益相关者沟通时,此功能至关重要。

快速入门:创建自定义指标

使用 MLflow 的指标构建器定义领域特定指标

import mlflow
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlflow.models import make_metric
from mlflow.metrics.base import MetricValue

# Synthetic binary-classification problem: 1000 rows, 20 features, fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a 100-tree random forest on the training split.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluation frame: the held-out features plus the true labels in "target".
eval_data = pd.DataFrame(X_test).assign(target=y_test)


# Define a custom business metric
def business_value_metric(predictions, targets, metrics):
    """
    Custom metric calculating business value impact.

    True Positives = $100 value, False Positives = -$20 cost.

    Args:
        predictions: Predicted class labels; compared element-wise against 1.
        targets: Ground-truth class labels, aligned with ``predictions``.
        metrics: Mapping of already-computed metrics. Unused here, but part
            of the ``eval_fn`` signature.

    Returns:
        A ``MetricValue`` whose ``aggregate_results`` contain
        ``total_business_value``, ``value_per_prediction`` and
        ``value_efficiency``. Both ratios fall back to 0 when their
        denominator is 0 (no predictions / no positive targets).
    """
    predicted_positive = predictions == 1
    tp = np.sum(predicted_positive & (targets == 1))
    fp = np.sum(predicted_positive & (targets == 0))

    # Business logic: TP worth $100, FP costs $20
    business_value = (tp * 100) - (fp * 20)
    # Upper bound: every actual positive is caught and nothing is mis-flagged.
    total_possible_value = np.sum(targets == 1) * 100
    n_predictions = len(predictions)

    return MetricValue(
        scores=[business_value],  # Total business value
        aggregate_results={
            "total_business_value": business_value,
            # Guard the empty-dataset case instead of dividing by zero.
            "value_per_prediction": (
                business_value / n_predictions if n_predictions > 0 else 0
            ),
            "value_efficiency": (
                business_value / total_possible_value
                if total_possible_value > 0
                else 0
            ),
        },
    )


# Create the metric
business_metric = make_metric(
    eval_fn=business_value_metric, greater_is_better=True, name="business_value"
)

with mlflow.start_run():
    # Log the model and take its URI from the returned ModelInfo. With the
    # `name=` form of log_model the model is not guaranteed to live under the
    # run's artifact root, so building the URI via get_artifact_uri("model")
    # can point at a location that does not exist.
    model_info = mlflow.sklearn.log_model(model, name="model")
    model_uri = model_info.model_uri

    # Evaluate with custom metric
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[business_metric],
    )

    # Custom aggregate results are read back as "<metric name>/<aggregate key>".
    print(f"Standard Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(
        f"Business Value: ${result.metrics['business_value/total_business_value']:.2f}"
    )
    print(
        f"Value per Prediction: ${result.metrics['business_value/value_per_prediction']:.2f}"
    )

自定义指标模式

创建将模型性能转化为业务术语的指标

def create_profit_loss_metric(cost_per_fp=50, revenue_per_tp=200):
    """Calculate profit/loss impact of model predictions.

    Args:
        cost_per_fp: Cost charged for each false positive.
        revenue_per_tp: Revenue credited for each true positive; also used to
            price each false negative as missed revenue.

    Returns:
        The metric object built by ``make_metric`` (named ``"profit_loss"``),
        whose aggregate results are ``net_profit``, ``total_revenue``,
        ``total_costs``, ``missed_revenue`` and ``roi``.
    """

    def eval_fn(predictions, targets, metrics):
        predicted_positive = predictions == 1
        actual_positive = targets == 1

        tp = np.sum(predicted_positive & actual_positive)
        fp = np.sum(predicted_positive & (targets == 0))
        fn = np.sum((predictions == 0) & actual_positive)

        # Calculate financial impact
        revenue = tp * revenue_per_tp
        costs = fp * cost_per_fp
        missed_opportunity = fn * revenue_per_tp
        net_profit = revenue - costs

        return MetricValue(
            aggregate_results={
                "net_profit": net_profit,
                "total_revenue": revenue,
                "total_costs": costs,
                "missed_revenue": missed_opportunity,
                # max(costs, 1) avoids dividing by zero when there are no
                # false positives; in that case ROI is net_profit * 100.
                "roi": (net_profit / max(costs, 1)) * 100,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="profit_loss")


# Usage: build the metric with scenario-specific prices, then evaluate with it.
profit_metric = create_profit_loss_metric(revenue_per_tp=150, cost_per_fp=30)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        model_type="classifier",
        targets="target",
        extra_metrics=[profit_metric],
    )

    # Aggregate results are exposed as "<metric name>/<aggregate key>".
    print(f"Net Profit: ${result.metrics['profit_loss/net_profit']:.2f}")
    print(f"ROI: {result.metrics['profit_loss/roi']:.1f}%")

自定义可视化

生成超越标准图表的自定义视觉分析

import matplotlib.pyplot as plt
import seaborn as sns
import os


def create_business_impact_visualization(eval_df, builtin_metrics, artifacts_dir):
    """Create custom business impact visualization.

    Draws a 2x2 figure (accuracy per confidence segment, prediction
    histogram, confusion matrix, business value per confidence segment) and
    saves it as ``business_impact_analysis.png`` inside ``artifacts_dir``.

    Args:
        eval_df: DataFrame with "prediction" and "target" columns. It is
            copied before the helper columns are added, so the caller's
            frame is left untouched.
        builtin_metrics: Unused here, but part of the function signature.
        artifacts_dir: Directory the PNG is written to.

    Returns:
        ``{"business_impact_analysis": <path to the saved PNG>}``.
    """
    from sklearn.metrics import confusion_matrix

    # Work on a copy: the derived columns below must not leak into the
    # frame shared with the caller.
    df = eval_df.copy()

    # Calculate business segments
    df["prediction_confidence"] = np.abs(df["prediction"] - 0.5)
    df["confidence_segment"] = pd.cut(
        df["prediction_confidence"],
        bins=[0, 0.1, 0.3, 0.5],
        labels=["Low", "Medium", "High"],
        include_lowest=True,  # keep a confidence of exactly 0 in "Low"
    )
    segments = df["confidence_segment"]
    predicted_positive = df["prediction"] == 1

    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    try:
        # 1. Accuracy by confidence segment
        # observed=False keeps all three segments on the axis even if empty.
        accuracy_by_segment = (
            (df["prediction"] == df["target"])
            .groupby(segments, observed=False)
            .mean()
        )
        axes[0, 0].bar(
            accuracy_by_segment.index, accuracy_by_segment.values, color="skyblue"
        )
        axes[0, 0].set_title("Accuracy by Confidence Segment")
        axes[0, 0].set_ylabel("Accuracy")
        axes[0, 0].set_ylim(0, 1)

        # 2. Prediction distribution
        axes[0, 1].hist(
            df["prediction"],
            bins=20,
            alpha=0.7,
            label="Predictions",
            color="lightgreen",
        )
        axes[0, 1].axvline(
            df["prediction"].mean(), color="red", linestyle="--", label="Mean"
        )
        axes[0, 1].set_title("Prediction Distribution")
        axes[0, 1].legend()

        # 3. Confusion matrix heatmap
        cm = confusion_matrix(df["target"], df["prediction"])
        sns.heatmap(cm, annot=True, fmt="d", ax=axes[1, 0], cmap="Blues")
        axes[1, 0].set_title("Confusion Matrix")
        axes[1, 0].set_xlabel("Predicted")
        axes[1, 0].set_ylabel("Actual")

        # 4. Business value by segment
        # Per-row value: +$100 for a true positive, -$20 for a false positive.
        true_positive = (predicted_positive & (df["target"] == 1)).astype(int)
        false_positive = (predicted_positive & (df["target"] == 0)).astype(int)
        row_value = true_positive * 100 - false_positive * 20
        value_by_segment = row_value.groupby(segments, observed=False).sum()

        colors = [
            "lightcoral" if v < 0 else "lightgreen" for v in value_by_segment.values
        ]
        axes[1, 1].bar(value_by_segment.index, value_by_segment.values, color=colors)
        axes[1, 1].set_title("Business Value by Confidence Segment")
        axes[1, 1].set_ylabel("Business Value ($)")
        axes[1, 1].axhline(y=0, color="black", linestyle="-", alpha=0.3)

        fig.tight_layout()

        # Save visualization
        viz_path = os.path.join(artifacts_dir, "business_impact_analysis.png")
        fig.savefig(viz_path, dpi=300, bbox_inches="tight")
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)

    return {"business_impact_analysis": viz_path}

高级自定义指标

创建组合多个评估标准的复合指标

def create_composite_business_metric():
    """Composite metric combining accuracy, profit, and risk measures.

    Returns:
        The metric object built by ``make_metric`` (named
        ``"composite_business"``). Its aggregate results are
        ``composite_score``, ``accuracy_component``, ``profit_component``,
        ``risk_component`` and ``segment_consistency``.
    """

    def eval_fn(predictions, targets, metrics):
        # Plain arrays make the slicing and comparisons below purely
        # positional, independent of any pandas index labels.
        pred = np.asarray(predictions)
        target = np.asarray(targets)

        # Get standard accuracy
        accuracy = metrics.get("accuracy_score", 0)

        # Calculate profit (reuse previous logic)
        tp = np.sum((pred == 1) & (target == 1))
        fp = np.sum((pred == 1) & (target == 0))
        fn = np.sum((pred == 0) & (target == 1))

        profit = (tp * 100) - (fp * 20) - (fn * 50)  # Include missed opportunity cost

        # Calculate risk (variance in performance across segments)
        n_segments = 5
        n_rows = len(pred)
        segment_size = n_rows // n_segments
        segment_accuracies = []

        for i in range(n_segments):
            start_idx = i * segment_size
            # The last segment absorbs the remainder rows.
            end_idx = start_idx + segment_size if i < n_segments - 1 else n_rows
            if end_idx <= start_idx:
                # Skip empty segments (fewer rows than segments): scoring them
                # as accuracy 0 would inflate the risk measure artificially.
                continue
            segment_accuracies.append(
                np.mean(pred[start_idx:end_idx] == target[start_idx:end_idx])
            )

        # Lower std = lower risk; no data at all means no measurable spread.
        risk_measure = np.std(segment_accuracies) if segment_accuracies else 0.0

        # Composite score: balance accuracy, profit, and risk
        composite_score = (
            (accuracy * 0.4) + (profit / 1000 * 0.4) + ((1 - risk_measure) * 0.2)
        )

        return MetricValue(
            aggregate_results={
                "composite_score": composite_score,
                "accuracy_component": accuracy,
                "profit_component": profit,
                "risk_component": risk_measure,
                "segment_consistency": 1 - risk_measure,
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="composite_business"
    )


# Usage: call the factory once to obtain the composite metric object.
composite_metric = create_composite_business_metric()

最佳实践和指南

指标设计原则

使指标与业务相关 - 使用与实际业务目标一致的领域特定术语和阈值,将技术性能转化为业务影响。

确保指标可靠性 - 处理除以零和空数据集等边缘情况,使用已知测试用例验证计算,并在适当情况下包含置信区间。

优化可解释性 - 提供多个聚合级别,包含中间计算以提高透明度,使用描述性命名,并通过比率和百分比指标添加上下文。

文档和验证 - 记录指标假设和限制,使用已知真实情况的合成数据进行测试,并提供指标值的清晰解释。

可视化最佳实践

  • 针对您的受众:为数据科学家创建技术图表,为业务利益相关者创建执行仪表板
  • 提供交互性:使用交互式可视化进行探索,使用静态可视化进行报告
  • 专注于洞察:突出关键发现和可操作的洞察,而不仅仅是显示数据
  • 一致的样式:在所有可视化中使用一致的配色方案和格式
  • 性能考虑:优化大型可视化以提高渲染速度和文件大小

集成示例

这是一个结合了自定义指标和可视化的综合示例

def comprehensive_custom_evaluation():
    """Complete example of custom metrics and visualizations.

    Evaluates ``model_uri`` on ``eval_data`` inside a new MLflow run with
    three custom metrics and three custom artifact functions, logs a few
    descriptive parameters, and prints the headline results.

    NOTE(review): ``create_threshold_precision_metric``,
    ``create_performance_breakdown_visualization`` and
    ``create_interactive_analysis_artifacts`` are not defined in the visible
    part of this page; they must be defined before this function is called.
    """

    # Define multiple custom metrics
    extra_metrics = [
        create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150),
        create_threshold_precision_metric(threshold=0.8),
        create_composite_business_metric(),
    ]

    # Define custom visualizations
    custom_artifacts = [
        create_business_impact_visualization,
        create_performance_breakdown_visualization,
        create_interactive_analysis_artifacts,
    ]

    with mlflow.start_run(run_name="Comprehensive_Custom_Evaluation"):
        # Evaluate with all custom components
        result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="target",
            model_type="classifier",
            extra_metrics=extra_metrics,
            custom_artifacts=custom_artifacts,
        )

        # Log additional business context; both counts are derived from the
        # lists above so they cannot drift from what was actually evaluated.
        mlflow.log_params(
            {
                "evaluation_focus": "business_impact",
                "custom_metrics_count": len(extra_metrics),
                "custom_visualizations_count": len(custom_artifacts),
            }
        )

        print("Custom Evaluation Results:")
        print(f"Profit/Loss: ${result.metrics.get('profit_loss/net_profit', 0):.2f}")
        print(
            f"High Confidence Precision: {result.metrics.get('threshold_precision/high_confidence_precision', 0):.3f}"
        )
        print(
            f"Composite Score: {result.metrics.get('composite_business/composite_score', 0):.3f}"
        )

        print("\nCustom Artifacts Created:")
        # Only list artifacts whose names match the custom visualizations.
        for name, path in result.artifacts.items():
            if any(
                keyword in name.lower()
                for keyword in ["business", "performance", "interactive"]
            ):
                print(f" {name}: {path}")


# Run comprehensive evaluation
# comprehensive_custom_evaluation()

结论

MLflow 中的自定义指标和可视化使您能够创建完美契合业务需求的评估框架。通过定义领域特定的成功标准并创建有针对性的视觉分析,您可以弥合技术模型性能与业务价值之间的差距。

主要优势包括

  • 业务一致性:直接反映业务影响和优先级的指标
  • 利益相关者沟通:让模型性能一目了然的可视化仪表板
  • 领域专业知识:捕获行业特定要求的评估标准
  • 决策支持:清晰的洞察,帮助您在充分了解情况的基础上进行模型选择和部署

无论您是要优化利润、最小化风险还是满足法规要求,自定义指标和可视化都能确保您的模型评估过程与对您的用例真正重要的因素保持一致。