Custom Metrics and Visualizations
MLflow's evaluation framework allows you to define custom metrics and create specialized visualizations tailored to your specific business needs. This capability is essential when standard metrics cannot capture the success criteria unique to your domain, or when stakeholder communication calls for custom visual analysis.
Quick Start: Creating a Custom Metric
Define a domain-specific metric using MLflow's metric builder:
import mlflow
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlflow.models import make_metric
from mlflow.metrics.base import MetricValue

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Create evaluation dataset (name the feature columns so the visualization
# helpers later on this page, which look for "feature_0", work as written)
eval_data = pd.DataFrame(
    X_test, columns=[f"feature_{i}" for i in range(X_test.shape[1])]
)
eval_data["target"] = y_test


# Define a custom business metric
def business_value_metric(predictions, targets, metrics):
    """
    Custom metric calculating business value impact.
    True Positives = $100 value, False Positives = -$20 cost
    """
    tp = np.sum((predictions == 1) & (targets == 1))
    fp = np.sum((predictions == 1) & (targets == 0))
    tn = np.sum((predictions == 0) & (targets == 0))
    fn = np.sum((predictions == 0) & (targets == 1))

    # Business logic: TP worth $100, FP costs $20
    business_value = (tp * 100) - (fp * 20)
    total_possible_value = np.sum(targets == 1) * 100

    return MetricValue(
        scores=[business_value],  # Total business value
        aggregate_results={
            "total_business_value": business_value,
            "value_per_prediction": business_value / len(predictions),
            "value_efficiency": business_value / total_possible_value
            if total_possible_value > 0
            else 0,
        },
    )


# Create the metric
business_metric = make_metric(
    eval_fn=business_value_metric, greater_is_better=True, name="business_value"
)

with mlflow.start_run():
    # Log model
    mlflow.sklearn.log_model(model, name="model")
    model_uri = mlflow.get_artifact_uri("model")

    # Evaluate with custom metric
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[business_metric],
    )

    print(f"Standard Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(
        f"Business Value: ${result.metrics['business_value/total_business_value']:.2f}"
    )
    print(
        f"Value per Prediction: ${result.metrics['business_value/value_per_prediction']:.2f}"
    )
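Because the eval_fn is a plain Python function, it can be sanity-checked in isolation before being wired into mlflow.evaluate. A minimal sketch, assuming binary labels encoded as 0/1 pandas Series (the expected values below are worked out by hand):

# Quick sanity check of the eval_fn on tiny hand-built inputs
preds = pd.Series([1, 1, 0, 0])
labels = pd.Series([1, 0, 0, 1])

value = business_value_metric(preds, labels, metrics={})
# One TP ($100) and one FP (-$20) -> $80 total, $20 per prediction
assert value.aggregate_results["total_business_value"] == 80
assert value.aggregate_results["value_per_prediction"] == 20.0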
Custom Metric Patterns
- Financial impact metrics
- Threshold-based metrics
- Domain-specific metrics
Create metrics that translate model performance into business terms:
def create_profit_loss_metric(cost_per_fp=50, revenue_per_tp=200):
    """Calculate profit/loss impact of model predictions."""

    def eval_fn(predictions, targets, metrics):
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        tn = np.sum((predictions == 0) & (targets == 0))

        # Calculate financial impact
        revenue = tp * revenue_per_tp
        costs = fp * cost_per_fp
        missed_opportunity = fn * revenue_per_tp
        net_profit = revenue - costs

        return MetricValue(
            aggregate_results={
                "net_profit": net_profit,
                "total_revenue": revenue,
                "total_costs": costs,
                "missed_revenue": missed_opportunity,
                "roi": (net_profit / max(costs, 1)) * 100,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="profit_loss")


# Usage
profit_metric = create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[profit_metric],
    )

    print(f"Net Profit: ${result.metrics['profit_loss/net_profit']:.2f}")
    print(f"ROI: {result.metrics['profit_loss/roi']:.1f}%")
Create metrics that evaluate performance at specific business thresholds:
def create_threshold_precision_metric(threshold=0.8):
    """Precision metric for high-confidence predictions only."""

    def eval_fn(predictions, targets, metrics):
        # This assumes predictions are probabilities; adjust for your use case
        high_confidence_mask = np.abs(predictions - 0.5) >= (threshold - 0.5)

        if np.sum(high_confidence_mask) == 0:
            return MetricValue(aggregate_results={"high_confidence_precision": 0.0})

        hc_predictions = predictions[high_confidence_mask]
        hc_targets = targets[high_confidence_mask]

        # Convert probabilities to binary predictions
        binary_predictions = (hc_predictions > 0.5).astype(int)

        tp = np.sum((binary_predictions == 1) & (hc_targets == 1))
        fp = np.sum((binary_predictions == 1) & (hc_targets == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        coverage = np.sum(high_confidence_mask) / len(predictions)

        return MetricValue(
            aggregate_results={
                "high_confidence_precision": precision,
                "high_confidence_coverage": coverage,
                "high_confidence_count": np.sum(high_confidence_mask),
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="threshold_precision"
    )


# Create threshold-based metric
threshold_metric = create_threshold_precision_metric(threshold=0.8)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[threshold_metric],
    )

    print(
        f"High Confidence Precision: {result.metrics['threshold_precision/high_confidence_precision']:.3f}"
    )
    print(
        f"Coverage: {result.metrics['threshold_precision/high_confidence_coverage']:.3f}"
    )
Create metrics tailored to a specific business domain:
# Example: Healthcare/Medical Domain
def create_medical_safety_metric(false_negative_penalty=10, false_positive_penalty=1):
    """Safety-focused metric for medical predictions where FN is more critical than FP."""

    def eval_fn(predictions, targets, metrics):
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        tn = np.sum((predictions == 0) & (targets == 0))

        # Safety score: heavily penalize missed positive cases
        safety_score = (
            tp - (fn * false_negative_penalty) - (fp * false_positive_penalty)
        )
        max_possible_score = np.sum(
            targets == 1
        )  # All true positives, no false negatives

        # Normalized safety score
        normalized_safety = (
            safety_score / max_possible_score if max_possible_score > 0 else 0
        )

        return MetricValue(
            aggregate_results={
                "safety_score": safety_score,
                "normalized_safety": normalized_safety,
                "missed_critical_cases": fn,
                "false_alarms": fp,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="medical_safety")


# Example: E-commerce/Recommendation Domain
def create_recommendation_diversity_metric():
    """Diversity metric for recommendation systems."""

    def eval_fn(predictions, targets, metrics):
        # Assumes predictions contain recommendation scores or categories
        unique_predictions = len(np.unique(predictions))
        total_predictions = len(predictions)
        diversity_ratio = unique_predictions / total_predictions

        # Calculate entropy as diversity measure
        pred_counts = np.bincount(predictions.astype(int))
        pred_probs = pred_counts / len(predictions)
        entropy = -np.sum(pred_probs * np.log2(pred_probs + 1e-10))

        return MetricValue(
            aggregate_results={
                "diversity_ratio": diversity_ratio,
                "prediction_entropy": entropy,
                "unique_predictions": unique_predictions,
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="recommendation_diversity"
    )


# Usage examples
medical_metric = create_medical_safety_metric(
    false_negative_penalty=5, false_positive_penalty=1
)
diversity_metric = create_recommendation_diversity_metric()
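These domain metrics plug into mlflow.evaluate the same way as the earlier examples. A hedged sketch, assuming the model_uri and eval_data from the quick start are still in scope; running the diversity metric against binary class predictions is purely illustrative, since in practice it would be applied to recommendation outputs:

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[medical_metric, diversity_metric],
    )

    print(f"Normalized Safety: {result.metrics['medical_safety/normalized_safety']:.3f}")
    print(f"Missed Critical Cases: {result.metrics['medical_safety/missed_critical_cases']}")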
Custom Visualizations
- Business impact visualizations
- Advanced performance analysis
- Interactive visualizations
Generate custom visual analyses that go beyond the standard charts:
import matplotlib.pyplot as plt
import seaborn as sns
import os


def create_business_impact_visualization(eval_df, builtin_metrics, artifacts_dir):
    """Create custom business impact visualization."""
    # Calculate business segments
    eval_df["prediction_confidence"] = np.abs(eval_df["prediction"] - 0.5)
    eval_df["confidence_segment"] = pd.cut(
        eval_df["prediction_confidence"],
        bins=[0, 0.1, 0.3, 0.5],
        labels=["Low", "Medium", "High"],
    )

    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Accuracy by confidence segment
    accuracy_by_segment = eval_df.groupby("confidence_segment").apply(
        lambda x: (x["prediction"] == x["target"]).mean()
    )
    axes[0, 0].bar(
        accuracy_by_segment.index, accuracy_by_segment.values, color="skyblue"
    )
    axes[0, 0].set_title("Accuracy by Confidence Segment")
    axes[0, 0].set_ylabel("Accuracy")
    axes[0, 0].set_ylim(0, 1)

    # 2. Prediction distribution
    axes[0, 1].hist(
        eval_df["prediction"],
        bins=20,
        alpha=0.7,
        label="Predictions",
        color="lightgreen",
    )
    axes[0, 1].axvline(
        eval_df["prediction"].mean(), color="red", linestyle="--", label="Mean"
    )
    axes[0, 1].set_title("Prediction Distribution")
    axes[0, 1].legend()

    # 3. Confusion matrix heatmap
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(eval_df["target"], eval_df["prediction"])
    sns.heatmap(cm, annot=True, fmt="d", ax=axes[1, 0], cmap="Blues")
    axes[1, 0].set_title("Confusion Matrix")
    axes[1, 0].set_xlabel("Predicted")
    axes[1, 0].set_ylabel("Actual")

    # 4. Business value by segment
    def calculate_segment_value(segment_data):
        tp = np.sum((segment_data["prediction"] == 1) & (segment_data["target"] == 1))
        fp = np.sum((segment_data["prediction"] == 1) & (segment_data["target"] == 0))
        return (tp * 100) - (fp * 20)  # Business value calculation

    value_by_segment = eval_df.groupby("confidence_segment").apply(
        calculate_segment_value
    )
    colors = ["lightcoral" if v < 0 else "lightgreen" for v in value_by_segment.values]
    axes[1, 1].bar(value_by_segment.index, value_by_segment.values, color=colors)
    axes[1, 1].set_title("Business Value by Confidence Segment")
    axes[1, 1].set_ylabel("Business Value ($)")
    axes[1, 1].axhline(y=0, color="black", linestyle="-", alpha=0.3)

    plt.tight_layout()

    # Save visualization
    viz_path = os.path.join(artifacts_dir, "business_impact_analysis.png")
    plt.savefig(viz_path, dpi=300, bbox_inches="tight")
    plt.close()

    return {"business_impact_analysis": viz_path}
Create detailed performance-breakdown visualizations:
def create_performance_breakdown_visualization(eval_df, builtin_metrics, artifacts_dir):
    """Create detailed performance breakdown visualization."""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Performance by prediction probability
    eval_df["prob_bin"] = pd.cut(eval_df["prediction"], bins=10, labels=False)
    perf_by_prob = eval_df.groupby("prob_bin").apply(
        lambda x: (x["prediction"] == x["target"]).mean() if len(x) > 0 else 0
    )
    axes[0, 0].plot(perf_by_prob.index, perf_by_prob.values, marker="o")
    axes[0, 0].set_title("Accuracy by Prediction Probability Bin")
    axes[0, 0].set_xlabel("Probability Bin")
    axes[0, 0].set_ylabel("Accuracy")

    # 2. Calibration plot
    true_probs = eval_df.groupby("prob_bin")["target"].mean()
    pred_probs = eval_df.groupby("prob_bin")["prediction"].mean()
    axes[0, 1].plot([0, 1], [0, 1], "k--", alpha=0.5, label="Perfect Calibration")
    axes[0, 1].scatter(pred_probs, true_probs, alpha=0.7, label="Model")
    axes[0, 1].set_title("Calibration Plot")
    axes[0, 1].set_xlabel("Mean Predicted Probability")
    axes[0, 1].set_ylabel("Fraction of Positives")
    axes[0, 1].legend()

    # 3. Error distribution
    errors = eval_df["target"] - eval_df["prediction"]
    axes[0, 2].hist(errors, bins=20, alpha=0.7, color="orange")
    axes[0, 2].set_title("Prediction Error Distribution")
    axes[0, 2].set_xlabel("Error (Actual - Predicted)")
    axes[0, 2].set_ylabel("Frequency")

    # 4. Feature importance correlation (if available)
    if "feature_0" in eval_df.columns:
        feature_cols = [col for col in eval_df.columns if col.startswith("feature_")][
            :5
        ]
        corr_with_error = []
        for feature in feature_cols:
            corr = np.corrcoef(eval_df[feature], errors)[0, 1]
            corr_with_error.append(abs(corr))

        axes[1, 0].bar(range(len(corr_with_error)), corr_with_error)
        axes[1, 0].set_title("Feature Correlation with Prediction Errors")
        axes[1, 0].set_xlabel("Feature Index")
        axes[1, 0].set_ylabel("Absolute Correlation")
        axes[1, 0].set_xticks(range(len(feature_cols)))
        axes[1, 0].set_xticklabels([f"F{i}" for i in range(len(feature_cols))])

    # 5. Class distribution
    class_dist = eval_df["target"].value_counts()
    axes[1, 1].pie(
        class_dist.values,
        labels=[f"Class {i}" for i in class_dist.index],
        autopct="%1.1f%%",
    )
    axes[1, 1].set_title("Target Class Distribution")

    # 6. Precision-Recall by threshold
    from sklearn.metrics import precision_recall_curve

    precision, recall, thresholds = precision_recall_curve(
        eval_df["target"], eval_df["prediction"]
    )
    axes[1, 2].plot(recall, precision, marker=".", markersize=2)
    axes[1, 2].set_title("Precision-Recall Curve")
    axes[1, 2].set_xlabel("Recall")
    axes[1, 2].set_ylabel("Precision")
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()

    # Save visualization
    viz_path = os.path.join(artifacts_dir, "performance_breakdown_analysis.png")
    plt.savefig(viz_path, dpi=300, bbox_inches="tight")
    plt.close()

    return {"performance_breakdown_analysis": viz_path}
Create interactive visualizations for deeper analysis:
def create_interactive_analysis_artifacts(eval_df, builtin_metrics, artifacts_dir):
    """Create interactive HTML visualizations using Plotly."""
    try:
        import plotly.graph_objects as go
        import plotly.express as px
        from plotly.subplots import make_subplots
        import plotly.offline as pyo

        # Create subplot figure
        fig = make_subplots(
            rows=2,
            cols=2,
            subplot_titles=(
                "Prediction Distribution",
                "Accuracy by Confidence",
                "ROC Curve",
                "Feature Analysis",
            ),
            specs=[
                [{"secondary_y": False}, {"secondary_y": False}],
                [{"secondary_y": False}, {"secondary_y": False}],
            ],
        )

        # 1. Interactive prediction distribution
        fig.add_trace(
            go.Histogram(x=eval_df["prediction"], name="Predictions", nbinsx=20),
            row=1,
            col=1,
        )

        # 2. Confidence-based accuracy
        eval_df["confidence_level"] = pd.cut(
            np.abs(eval_df["prediction"] - 0.5),
            bins=5,
            labels=["Very Low", "Low", "Medium", "High", "Very High"],
        )
        conf_accuracy = eval_df.groupby("confidence_level").apply(
            lambda x: (x["prediction"] == x["target"]).mean()
        )
        fig.add_trace(
            go.Bar(x=conf_accuracy.index, y=conf_accuracy.values, name="Accuracy"),
            row=1,
            col=2,
        )

        # 3. ROC Curve
        from sklearn.metrics import roc_curve

        fpr, tpr, _ = roc_curve(eval_df["target"], eval_df["prediction"])
        fig.add_trace(
            go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"), row=2, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=[0, 1], y=[0, 1], mode="lines", name="Random", line=dict(dash="dash")
            ),
            row=2,
            col=1,
        )

        # 4. Feature analysis (if features available)
        if "feature_0" in eval_df.columns:
            fig.add_trace(
                go.Scatter(
                    x=eval_df["feature_0"],
                    y=eval_df["prediction"],
                    mode="markers",
                    marker=dict(color=eval_df["target"], colorscale="Viridis"),
                    name="Feature vs Prediction",
                ),
                row=2,
                col=2,
            )

        # Update layout
        fig.update_layout(
            title_text="Interactive Model Performance Analysis",
            showlegend=True,
            height=800,
        )

        # Save interactive plot
        interactive_path = os.path.join(artifacts_dir, "interactive_analysis.html")
        pyo.plot(fig, filename=interactive_path, auto_open=False)

        return {"interactive_analysis": interactive_path}

    except ImportError:
        # Fallback to matplotlib if plotly not available
        print("Plotly not available, creating static visualization instead")
        return create_business_impact_visualization(
            eval_df, builtin_metrics, artifacts_dir
        )
Advanced Custom Metrics
- Multi-metric composition
- Time-aware metrics
Create composite metrics that combine multiple evaluation criteria:
def create_composite_business_metric():
    """Composite metric combining accuracy, profit, and risk measures."""

    def eval_fn(predictions, targets, metrics):
        # Get standard accuracy
        accuracy = metrics.get("accuracy_score", 0)

        # Calculate profit (reuse previous logic)
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        profit = (tp * 100) - (fp * 20) - (fn * 50)  # Include missed opportunity cost

        # Calculate risk (variance in performance across segments)
        n_segments = 5
        segment_size = len(predictions) // n_segments
        segment_accuracies = []

        for i in range(n_segments):
            start_idx = i * segment_size
            end_idx = (
                start_idx + segment_size if i < n_segments - 1 else len(predictions)
            )
            seg_pred = predictions[start_idx:end_idx]
            seg_target = targets[start_idx:end_idx]
            seg_accuracy = np.mean(seg_pred == seg_target) if len(seg_pred) > 0 else 0
            segment_accuracies.append(seg_accuracy)

        risk_measure = np.std(segment_accuracies)  # Lower std = lower risk

        # Composite score: balance accuracy, profit, and risk
        composite_score = (
            (accuracy * 0.4) + (profit / 1000 * 0.4) + ((1 - risk_measure) * 0.2)
        )

        return MetricValue(
            aggregate_results={
                "composite_score": composite_score,
                "accuracy_component": accuracy,
                "profit_component": profit,
                "risk_component": risk_measure,
                "segment_consistency": 1 - risk_measure,
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="composite_business"
    )


# Usage
composite_metric = create_composite_business_metric()
Create metrics that account for the temporal ordering of predictions:
def create_time_decay_metric(decay_rate=0.1):
    """Metric that gives more weight to recent predictions."""

    def eval_fn(predictions, targets, metrics):
        # Assume predictions are ordered by time (most recent last)
        n_predictions = len(predictions)

        # Create time weights (exponential decay)
        time_weights = np.exp(decay_rate * np.arange(n_predictions))
        time_weights = time_weights / np.sum(time_weights)  # Normalize

        # Calculate weighted accuracy
        correct_predictions = (predictions == targets).astype(float)
        weighted_accuracy = np.sum(correct_predictions * time_weights)

        # Calculate recency bias
        recent_accuracy = np.mean(
            correct_predictions[-len(predictions) // 4 :]
        )  # Last 25%
        overall_accuracy = np.mean(correct_predictions)
        recency_bias = recent_accuracy - overall_accuracy

        return MetricValue(
            aggregate_results={
                "time_weighted_accuracy": weighted_accuracy,
                "recent_accuracy": recent_accuracy,
                "overall_accuracy": overall_accuracy,
                "recency_bias": recency_bias,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="time_decay")
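As with the other factories, the returned metric is passed via extra_metrics; keep in mind the eval_fn above assumes rows are already ordered by time, oldest first. A brief usage sketch:

# Usage (assumes eval_data rows are ordered oldest -> newest)
time_decay_metric = create_time_decay_metric(decay_rate=0.1)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[time_decay_metric],
    )

    print(f"Time-Weighted Accuracy: {result.metrics['time_decay/time_weighted_accuracy']:.3f}")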
Best Practices and Guidelines
Metric Design Principles
Make metrics business-relevant - Translate technical performance into business impact by using domain-specific terminology and thresholds that align with real business objectives.
Ensure metric reliability - Handle edge cases such as division by zero and empty datasets, validate calculations against known test cases, and include confidence intervals where appropriate.
Optimize for interpretability - Provide multiple levels of aggregation, expose intermediate calculations for transparency, use descriptive names, and add context through ratio and percentage metrics.
Document and validate - Record each metric's assumptions and limitations, test with synthetic data whose ground truth is known, and provide clear interpretations of metric values.
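To make the reliability and validation principles concrete, here is an illustrative sketch of a defensive eval_fn (the name safe_precision_eval is hypothetical, not part of MLflow): empty inputs and zero denominators are guarded explicitly, and the function is exercised on synthetic data with a known answer.

def safe_precision_eval(predictions, targets, metrics):
    # Guard the empty-dataset edge case explicitly
    if len(predictions) == 0:
        return MetricValue(aggregate_results={"precision": 0.0, "support": 0})

    tp = np.sum((predictions == 1) & (targets == 1))
    fp = np.sum((predictions == 1) & (targets == 0))

    # Guard division by zero when the model predicts no positives
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    return MetricValue(
        aggregate_results={"precision": precision, "support": int(tp + fp)}
    )


# Validate against synthetic data with a known ground truth
check = safe_precision_eval(pd.Series([1, 1, 0]), pd.Series([1, 0, 0]), {})
assert check.aggregate_results["precision"] == 0.5  # 1 TP out of 2 predicted positives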
Visualization Best Practices
- Know your audience: create technical plots for data scientists and executive dashboards for business stakeholders
- Make it interactive: use interactive visualizations for exploration and static ones for reports
- Focus on insights: highlight key findings and actionable insights rather than just displaying data
- Consistent styling: use a consistent color scheme and formatting across all visualizations (a small styling sketch follows this list)
- Mind performance: optimize large visualizations for rendering speed and file size
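One lightweight way to enforce consistent styling is a shared matplotlib setup that each artifact function applies before plotting. A sketch under the assumption that all custom artifacts use matplotlib; the palette and rcParams values below are arbitrary examples, not MLflow defaults:

import matplotlib.pyplot as plt

# Shared style applied once, so every custom artifact renders consistently
EVAL_PALETTE = ["skyblue", "lightgreen", "lightcoral", "orange"]


def apply_eval_style():
    plt.rcParams.update(
        {
            "figure.dpi": 100,  # keep saved file sizes reasonable
            "axes.titlesize": 12,
            "axes.grid": True,
            "grid.alpha": 0.3,
            "axes.prop_cycle": plt.cycler(color=EVAL_PALETTE),
        }
    )


# Call apply_eval_style() at the top of each custom artifact function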
Integration Example
Here is a comprehensive example that combines custom metrics and visualizations:
def comprehensive_custom_evaluation():
    """Complete example of custom metrics and visualizations."""
    # Define multiple custom metrics
    business_metric = create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150)
    threshold_metric = create_threshold_precision_metric(threshold=0.8)
    composite_metric = create_composite_business_metric()

    # Define custom visualizations
    custom_artifacts = [
        create_business_impact_visualization,
        create_performance_breakdown_visualization,
        create_interactive_analysis_artifacts,
    ]

    with mlflow.start_run(run_name="Comprehensive_Custom_Evaluation"):
        # Evaluate with all custom components
        result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="target",
            model_type="classifier",
            extra_metrics=[business_metric, threshold_metric, composite_metric],
            custom_artifacts=custom_artifacts,
        )

        # Log additional business context
        mlflow.log_params(
            {
                "evaluation_focus": "business_impact",
                "custom_metrics_count": 3,
                "custom_visualizations_count": len(custom_artifacts),
            }
        )

        print("Custom Evaluation Results:")
        print(f"Profit/Loss: ${result.metrics.get('profit_loss/net_profit', 0):.2f}")
        print(
            f"High Confidence Precision: {result.metrics.get('threshold_precision/high_confidence_precision', 0):.3f}"
        )
        print(
            f"Composite Score: {result.metrics.get('composite_business/composite_score', 0):.3f}"
        )

        print("\nCustom Artifacts Created:")
        for name, path in result.artifacts.items():
            if any(
                keyword in name.lower()
                for keyword in ["business", "performance", "interactive"]
            ):
                print(f"  {name}: {path}")


# Run comprehensive evaluation
# comprehensive_custom_evaluation()
Conclusion
Custom metrics and visualizations in MLflow let you build an evaluation framework that fits your business needs precisely. By defining domain-specific success criteria and creating targeted visual analyses, you bridge the gap between technical model performance and business value.
Key benefits include:
- Business alignment: metrics that directly reflect business impact and priorities
- Stakeholder communication: visual dashboards that make model performance accessible
- Domain expertise: evaluation criteria that capture industry-specific requirements
- Decision support: clear insights that drive informed model selection and deployment
Whether you are optimizing for profit, minimizing risk, or meeting regulatory requirements, custom metrics and visualizations keep your model evaluation process aligned with what truly matters for your use case.