Custom Metrics and Visualizations
MLflow's evaluation framework allows you to define custom metrics and create specialized visualizations tailored to your specific business needs. This capability is essential when standard metrics cannot capture the success criteria unique to your domain, or when stakeholder communication calls for custom visual analysis.
Quick Start: Creating a Custom Metric
Define a domain-specific metric using MLflow's metric builder:
import mlflow
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlflow.models import make_metric
from mlflow.metrics.base import MetricValue

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Create evaluation dataset (name the feature columns so the visualization
# helpers later on this page, which look for "feature_0", work as written)
eval_data = pd.DataFrame(
    X_test, columns=[f"feature_{i}" for i in range(X_test.shape[1])]
)
eval_data["target"] = y_test


# Define a custom business metric
def business_value_metric(predictions, targets, metrics):
    """
    Custom metric calculating business value impact.
    True Positives = $100 value, False Positives = -$20 cost
    """
    tp = np.sum((predictions == 1) & (targets == 1))
    fp = np.sum((predictions == 1) & (targets == 0))
    tn = np.sum((predictions == 0) & (targets == 0))
    fn = np.sum((predictions == 0) & (targets == 1))

    # Business logic: TP worth $100, FP costs $20
    business_value = (tp * 100) - (fp * 20)
    total_possible_value = np.sum(targets == 1) * 100

    return MetricValue(
        scores=[business_value],  # Total business value
        aggregate_results={
            "total_business_value": business_value,
            "value_per_prediction": business_value / len(predictions),
            "value_efficiency": business_value / total_possible_value
            if total_possible_value > 0
            else 0,
        },
    )


# Create the metric
business_metric = make_metric(
    eval_fn=business_value_metric, greater_is_better=True, name="business_value"
)

with mlflow.start_run():
    # Log model
    mlflow.sklearn.log_model(model, name="model")
    model_uri = mlflow.get_artifact_uri("model")

    # Evaluate with custom metric
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[business_metric],
    )

    print(f"Standard Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(
        f"Business Value: ${result.metrics['business_value/total_business_value']:.2f}"
    )
    print(
        f"Value per Prediction: ${result.metrics['business_value/value_per_prediction']:.2f}"
    )
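Because the eval_fn is a plain Python function, it can be sanity-checked in isolation before being wired into mlflow.evaluate. A minimal sketch, assuming binary labels encoded as 0/1 pandas Series (the expected values below are worked out by hand):

# Quick sanity check of the eval_fn on tiny hand-built inputs
preds = pd.Series([1, 1, 0, 0])
labels = pd.Series([1, 0, 0, 1])

value = business_value_metric(preds, labels, metrics={})
# One TP ($100) and one FP (-$20) -> $80 total, $20 per prediction
assert value.aggregate_results["total_business_value"] == 80
assert value.aggregate_results["value_per_prediction"] == 20.0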
Custom Metric Patterns
- Financial impact metrics
- Threshold-based metrics
- Domain-specific metrics
Create metrics that translate model performance into business terms:
def create_profit_loss_metric(cost_per_fp=50, revenue_per_tp=200):
    """Calculate profit/loss impact of model predictions."""

    def eval_fn(predictions, targets, metrics):
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        tn = np.sum((predictions == 0) & (targets == 0))

        # Calculate financial impact
        revenue = tp * revenue_per_tp
        costs = fp * cost_per_fp
        missed_opportunity = fn * revenue_per_tp
        net_profit = revenue - costs

        return MetricValue(
            aggregate_results={
                "net_profit": net_profit,
                "total_revenue": revenue,
                "total_costs": costs,
                "missed_revenue": missed_opportunity,
                "roi": (net_profit / max(costs, 1)) * 100,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="profit_loss")


# Usage
profit_metric = create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[profit_metric],
    )

    print(f"Net Profit: ${result.metrics['profit_loss/net_profit']:.2f}")
    print(f"ROI: {result.metrics['profit_loss/roi']:.1f}%")
Create metrics that evaluate performance at specific business thresholds:
def create_threshold_precision_metric(threshold=0.8):
    """Precision metric for high-confidence predictions only."""

    def eval_fn(predictions, targets, metrics):
        # This assumes predictions are probabilities; adjust for your use case
        high_confidence_mask = np.abs(predictions - 0.5) >= (threshold - 0.5)

        if np.sum(high_confidence_mask) == 0:
            return MetricValue(aggregate_results={"high_confidence_precision": 0.0})

        hc_predictions = predictions[high_confidence_mask]
        hc_targets = targets[high_confidence_mask]

        # Convert probabilities to binary predictions
        binary_predictions = (hc_predictions > 0.5).astype(int)

        tp = np.sum((binary_predictions == 1) & (hc_targets == 1))
        fp = np.sum((binary_predictions == 1) & (hc_targets == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        coverage = np.sum(high_confidence_mask) / len(predictions)

        return MetricValue(
            aggregate_results={
                "high_confidence_precision": precision,
                "high_confidence_coverage": coverage,
                "high_confidence_count": np.sum(high_confidence_mask),
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="threshold_precision"
    )


# Create threshold-based metric
threshold_metric = create_threshold_precision_metric(threshold=0.8)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[threshold_metric],
    )

    print(
        f"High Confidence Precision: {result.metrics['threshold_precision/high_confidence_precision']:.3f}"
    )
    print(
        f"Coverage: {result.metrics['threshold_precision/high_confidence_coverage']:.3f}"
    )
Create metrics tailored to a specific business domain:
# Example: Healthcare/Medical Domain
def create_medical_safety_metric(false_negative_penalty=10, false_positive_penalty=1):
    """Safety-focused metric for medical predictions where FN is more critical than FP."""

    def eval_fn(predictions, targets, metrics):
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        tn = np.sum((predictions == 0) & (targets == 0))

        # Safety score: heavily penalize missed positive cases
        safety_score = (
            tp - (fn * false_negative_penalty) - (fp * false_positive_penalty)
        )
        max_possible_score = np.sum(
            targets == 1
        )  # All true positives, no false negatives

        # Normalized safety score
        normalized_safety = (
            safety_score / max_possible_score if max_possible_score > 0 else 0
        )

        return MetricValue(
            aggregate_results={
                "safety_score": safety_score,
                "normalized_safety": normalized_safety,
                "missed_critical_cases": fn,
                "false_alarms": fp,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="medical_safety")


# Example: E-commerce/Recommendation Domain
def create_recommendation_diversity_metric():
    """Diversity metric for recommendation systems."""

    def eval_fn(predictions, targets, metrics):
        # Assumes predictions contain recommendation scores or categories
        unique_predictions = len(np.unique(predictions))
        total_predictions = len(predictions)
        diversity_ratio = unique_predictions / total_predictions

        # Calculate entropy as diversity measure
        pred_counts = np.bincount(predictions.astype(int))
        pred_probs = pred_counts / len(predictions)
        entropy = -np.sum(pred_probs * np.log2(pred_probs + 1e-10))

        return MetricValue(
            aggregate_results={
                "diversity_ratio": diversity_ratio,
                "prediction_entropy": entropy,
                "unique_predictions": unique_predictions,
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="recommendation_diversity"
    )


# Usage examples
medical_metric = create_medical_safety_metric(
    false_negative_penalty=5, false_positive_penalty=1
)
diversity_metric = create_recommendation_diversity_metric()
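These domain metrics plug into mlflow.evaluate the same way as the earlier examples. A hedged sketch, assuming the model_uri and eval_data from the quick start are still in scope; running the diversity metric against binary class predictions is purely illustrative, since in practice it would be applied to recommendation outputs:

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[medical_metric, diversity_metric],
    )

    print(f"Normalized Safety: {result.metrics['medical_safety/normalized_safety']:.3f}")
    print(f"Missed Critical Cases: {result.metrics['medical_safety/missed_critical_cases']}")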
Custom Visualizations
- Business impact visualizations
- Advanced performance analysis
- Interactive visualizations
Generate custom visual analyses that go beyond the standard charts:
import matplotlib.pyplot as plt
import seaborn as sns
import os


def create_business_impact_visualization(eval_df, builtin_metrics, artifacts_dir):
    """Create custom business impact visualization."""
    # Calculate business segments
    eval_df["prediction_confidence"] = np.abs(eval_df["prediction"] - 0.5)
    eval_df["confidence_segment"] = pd.cut(
        eval_df["prediction_confidence"],
        bins=[0, 0.1, 0.3, 0.5],
        labels=["Low", "Medium", "High"],
    )

    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Accuracy by confidence segment
    accuracy_by_segment = eval_df.groupby("confidence_segment").apply(
        lambda x: (x["prediction"] == x["target"]).mean()
    )
    axes[0, 0].bar(
        accuracy_by_segment.index, accuracy_by_segment.values, color="skyblue"
    )
    axes[0, 0].set_title("Accuracy by Confidence Segment")
    axes[0, 0].set_ylabel("Accuracy")
    axes[0, 0].set_ylim(0, 1)

    # 2. Prediction distribution
    axes[0, 1].hist(
        eval_df["prediction"],
        bins=20,
        alpha=0.7,
        label="Predictions",
        color="lightgreen",
    )
    axes[0, 1].axvline(
        eval_df["prediction"].mean(), color="red", linestyle="--", label="Mean"
    )
    axes[0, 1].set_title("Prediction Distribution")
    axes[0, 1].legend()

    # 3. Confusion matrix heatmap
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(eval_df["target"], eval_df["prediction"])
    sns.heatmap(cm, annot=True, fmt="d", ax=axes[1, 0], cmap="Blues")
    axes[1, 0].set_title("Confusion Matrix")
    axes[1, 0].set_xlabel("Predicted")
    axes[1, 0].set_ylabel("Actual")

    # 4. Business value by segment
    def calculate_segment_value(segment_data):
        tp = np.sum((segment_data["prediction"] == 1) & (segment_data["target"] == 1))
        fp = np.sum((segment_data["prediction"] == 1) & (segment_data["target"] == 0))
        return (tp * 100) - (fp * 20)  # Business value calculation

    value_by_segment = eval_df.groupby("confidence_segment").apply(
        calculate_segment_value
    )
    colors = ["lightcoral" if v < 0 else "lightgreen" for v in value_by_segment.values]
    axes[1, 1].bar(value_by_segment.index, value_by_segment.values, color=colors)
    axes[1, 1].set_title("Business Value by Confidence Segment")
    axes[1, 1].set_ylabel("Business Value ($)")
    axes[1, 1].axhline(y=0, color="black", linestyle="-", alpha=0.3)

    plt.tight_layout()

    # Save visualization
    viz_path = os.path.join(artifacts_dir, "business_impact_analysis.png")
    plt.savefig(viz_path, dpi=300, bbox_inches="tight")
    plt.close()

    return {"business_impact_analysis": viz_path}
Create detailed performance-breakdown visualizations:
def create_performance_breakdown_visualization(eval_df, builtin_metrics, artifacts_dir):
    """Create detailed performance breakdown visualization."""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Performance by prediction probability
    eval_df["prob_bin"] = pd.cut(eval_df["prediction"], bins=10, labels=False)
    perf_by_prob = eval_df.groupby("prob_bin").apply(
        lambda x: (x["prediction"] == x["target"]).mean() if len(x) > 0 else 0
    )
    axes[0, 0].plot(perf_by_prob.index, perf_by_prob.values, marker="o")
    axes[0, 0].set_title("Accuracy by Prediction Probability Bin")
    axes[0, 0].set_xlabel("Probability Bin")
    axes[0, 0].set_ylabel("Accuracy")

    # 2. Calibration plot
    true_probs = eval_df.groupby("prob_bin")["target"].mean()
    pred_probs = eval_df.groupby("prob_bin")["prediction"].mean()
    axes[0, 1].plot([0, 1], [0, 1], "k--", alpha=0.5, label="Perfect Calibration")
    axes[0, 1].scatter(pred_probs, true_probs, alpha=0.7, label="Model")
    axes[0, 1].set_title("Calibration Plot")
    axes[0, 1].set_xlabel("Mean Predicted Probability")
    axes[0, 1].set_ylabel("Fraction of Positives")
    axes[0, 1].legend()

    # 3. Error distribution
    errors = eval_df["target"] - eval_df["prediction"]
    axes[0, 2].hist(errors, bins=20, alpha=0.7, color="orange")
    axes[0, 2].set_title("Prediction Error Distribution")
    axes[0, 2].set_xlabel("Error (Actual - Predicted)")
    axes[0, 2].set_ylabel("Frequency")

    # 4. Feature importance correlation (if available)
    if "feature_0" in eval_df.columns:
        feature_cols = [col for col in eval_df.columns if col.startswith("feature_")][
            :5
        ]
        corr_with_error = []
        for feature in feature_cols:
            corr = np.corrcoef(eval_df[feature], errors)[0, 1]
            corr_with_error.append(abs(corr))

        axes[1, 0].bar(range(len(corr_with_error)), corr_with_error)
        axes[1, 0].set_title("Feature Correlation with Prediction Errors")
        axes[1, 0].set_xlabel("Feature Index")
        axes[1, 0].set_ylabel("Absolute Correlation")
        axes[1, 0].set_xticks(range(len(feature_cols)))
        axes[1, 0].set_xticklabels([f"F{i}" for i in range(len(feature_cols))])

    # 5. Class distribution
    class_dist = eval_df["target"].value_counts()
    axes[1, 1].pie(
        class_dist.values,
        labels=[f"Class {i}" for i in class_dist.index],
        autopct="%1.1f%%",
    )
    axes[1, 1].set_title("Target Class Distribution")

    # 6. Precision-Recall by threshold
    from sklearn.metrics import precision_recall_curve

    precision, recall, thresholds = precision_recall_curve(
        eval_df["target"], eval_df["prediction"]
    )
    axes[1, 2].plot(recall, precision, marker=".", markersize=2)
    axes[1, 2].set_title("Precision-Recall Curve")
    axes[1, 2].set_xlabel("Recall")
    axes[1, 2].set_ylabel("Precision")
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()

    # Save visualization
    viz_path = os.path.join(artifacts_dir, "performance_breakdown_analysis.png")
    plt.savefig(viz_path, dpi=300, bbox_inches="tight")
    plt.close()

    return {"performance_breakdown_analysis": viz_path}
Create interactive visualizations for deeper analysis:
def create_interactive_analysis_artifacts(eval_df, builtin_metrics, artifacts_dir):
    """Create interactive HTML visualizations using Plotly."""
    try:
        import plotly.graph_objects as go
        import plotly.express as px
        from plotly.subplots import make_subplots
        import plotly.offline as pyo

        # Create subplot figure
        fig = make_subplots(
            rows=2,
            cols=2,
            subplot_titles=(
                "Prediction Distribution",
                "Accuracy by Confidence",
                "ROC Curve",
                "Feature Analysis",
            ),
            specs=[
                [{"secondary_y": False}, {"secondary_y": False}],
                [{"secondary_y": False}, {"secondary_y": False}],
            ],
        )

        # 1. Interactive prediction distribution
        fig.add_trace(
            go.Histogram(x=eval_df["prediction"], name="Predictions", nbinsx=20),
            row=1,
            col=1,
        )

        # 2. Confidence-based accuracy
        eval_df["confidence_level"] = pd.cut(
            np.abs(eval_df["prediction"] - 0.5),
            bins=5,
            labels=["Very Low", "Low", "Medium", "High", "Very High"],
        )
        conf_accuracy = eval_df.groupby("confidence_level").apply(
            lambda x: (x["prediction"] == x["target"]).mean()
        )
        fig.add_trace(
            go.Bar(x=conf_accuracy.index, y=conf_accuracy.values, name="Accuracy"),
            row=1,
            col=2,
        )

        # 3. ROC Curve
        from sklearn.metrics import roc_curve

        fpr, tpr, _ = roc_curve(eval_df["target"], eval_df["prediction"])
        fig.add_trace(
            go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"), row=2, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=[0, 1], y=[0, 1], mode="lines", name="Random", line=dict(dash="dash")
            ),
            row=2,
            col=1,
        )

        # 4. Feature analysis (if features available)
        if "feature_0" in eval_df.columns:
            fig.add_trace(
                go.Scatter(
                    x=eval_df["feature_0"],
                    y=eval_df["prediction"],
                    mode="markers",
                    marker=dict(color=eval_df["target"], colorscale="Viridis"),
                    name="Feature vs Prediction",
                ),
                row=2,
                col=2,
            )

        # Update layout
        fig.update_layout(
            title_text="Interactive Model Performance Analysis",
            showlegend=True,
            height=800,
        )

        # Save interactive plot
        interactive_path = os.path.join(artifacts_dir, "interactive_analysis.html")
        pyo.plot(fig, filename=interactive_path, auto_open=False)

        return {"interactive_analysis": interactive_path}

    except ImportError:
        # Fallback to matplotlib if plotly not available
        print("Plotly not available, creating static visualization instead")
        return create_business_impact_visualization(
            eval_df, builtin_metrics, artifacts_dir
        )
Advanced Custom Metrics
- Multi-metric composition
- Time-aware metrics
Create composite metrics that combine multiple evaluation criteria:
def create_composite_business_metric():
    """Composite metric combining accuracy, profit, and risk measures."""

    def eval_fn(predictions, targets, metrics):
        # Get standard accuracy
        accuracy = metrics.get("accuracy_score", 0)

        # Calculate profit (reuse previous logic)
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        profit = (tp * 100) - (fp * 20) - (fn * 50)  # Include missed opportunity cost

        # Calculate risk (variance in performance across segments)
        n_segments = 5
        segment_size = len(predictions) // n_segments
        segment_accuracies = []

        for i in range(n_segments):
            start_idx = i * segment_size
            end_idx = (
                start_idx + segment_size if i < n_segments - 1 else len(predictions)
            )
            seg_pred = predictions[start_idx:end_idx]
            seg_target = targets[start_idx:end_idx]
            seg_accuracy = np.mean(seg_pred == seg_target) if len(seg_pred) > 0 else 0
            segment_accuracies.append(seg_accuracy)

        risk_measure = np.std(segment_accuracies)  # Lower std = lower risk

        # Composite score: balance accuracy, profit, and risk
        composite_score = (
            (accuracy * 0.4) + (profit / 1000 * 0.4) + ((1 - risk_measure) * 0.2)
        )

        return MetricValue(
            aggregate_results={
                "composite_score": composite_score,
                "accuracy_component": accuracy,
                "profit_component": profit,
                "risk_component": risk_measure,
                "segment_consistency": 1 - risk_measure,
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="composite_business"
    )


# Usage
composite_metric = create_composite_business_metric()
Create metrics that account for the temporal ordering of predictions:
def create_time_decay_metric(decay_rate=0.1):
    """Metric that gives more weight to recent predictions."""

    def eval_fn(predictions, targets, metrics):
        # Assume predictions are ordered by time (most recent last)
        n_predictions = len(predictions)

        # Create time weights (exponential decay)
        time_weights = np.exp(decay_rate * np.arange(n_predictions))
        time_weights = time_weights / np.sum(time_weights)  # Normalize

        # Calculate weighted accuracy
        correct_predictions = (predictions == targets).astype(float)
        weighted_accuracy = np.sum(correct_predictions * time_weights)

        # Calculate recency bias
        recent_accuracy = np.mean(
            correct_predictions[-len(predictions) // 4 :]
        )  # Last 25%
        overall_accuracy = np.mean(correct_predictions)
        recency_bias = recent_accuracy - overall_accuracy

        return MetricValue(
            aggregate_results={
                "time_weighted_accuracy": weighted_accuracy,
                "recent_accuracy": recent_accuracy,
                "overall_accuracy": overall_accuracy,
                "recency_bias": recency_bias,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="time_decay")
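As with the other factories, the returned metric is passed via extra_metrics; keep in mind the eval_fn above assumes rows are already ordered by time, oldest first. A brief usage sketch:

# Usage (assumes eval_data rows are ordered oldest -> newest)
time_decay_metric = create_time_decay_metric(decay_rate=0.1)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[time_decay_metric],
    )

    print(f"Time-Weighted Accuracy: {result.metrics['time_decay/time_weighted_accuracy']:.3f}")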
Best Practices and Guidelines
Metric Design Principles
Make metrics business-relevant - Translate technical performance into business impact by using domain-specific terminology and thresholds that align with real business objectives.
Ensure metric reliability - Handle edge cases such as division by zero and empty datasets, validate calculations against known test cases, and include confidence intervals where appropriate.
Optimize for interpretability - Provide multiple levels of aggregation, expose intermediate calculations for transparency, use descriptive names, and add context through ratio and percentage metrics.
Document and validate - Record each metric's assumptions and limitations, test with synthetic data whose ground truth is known, and provide clear interpretations of metric values.
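To make the reliability and validation principles concrete, here is an illustrative sketch of a defensive eval_fn (the name safe_precision_eval is hypothetical, not part of MLflow): empty inputs and zero denominators are guarded explicitly, and the function is exercised on synthetic data with a known answer.

def safe_precision_eval(predictions, targets, metrics):
    # Guard the empty-dataset edge case explicitly
    if len(predictions) == 0:
        return MetricValue(aggregate_results={"precision": 0.0, "support": 0})

    tp = np.sum((predictions == 1) & (targets == 1))
    fp = np.sum((predictions == 1) & (targets == 0))

    # Guard division by zero when the model predicts no positives
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    return MetricValue(
        aggregate_results={"precision": precision, "support": int(tp + fp)}
    )


# Validate against synthetic data with a known ground truth
check = safe_precision_eval(pd.Series([1, 1, 0]), pd.Series([1, 0, 0]), {})
assert check.aggregate_results["precision"] == 0.5  # 1 TP out of 2 predicted positives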
Visualization Best Practices
- Know your audience: create technical plots for data scientists and executive dashboards for business stakeholders
- Make it interactive: use interactive visualizations for exploration and static ones for reports
- Focus on insights: highlight key findings and actionable insights rather than just displaying data
- Consistent styling: use a consistent color scheme and formatting across all visualizations (a small styling sketch follows this list)
- Mind performance: optimize large visualizations for rendering speed and file size
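One lightweight way to enforce consistent styling is a shared matplotlib setup that each artifact function applies before plotting. A sketch under the assumption that all custom artifacts use matplotlib; the palette and rcParams values below are arbitrary examples, not MLflow defaults:

import matplotlib.pyplot as plt

# Shared style applied once, so every custom artifact renders consistently
EVAL_PALETTE = ["skyblue", "lightgreen", "lightcoral", "orange"]


def apply_eval_style():
    plt.rcParams.update(
        {
            "figure.dpi": 100,  # keep saved file sizes reasonable
            "axes.titlesize": 12,
            "axes.grid": True,
            "grid.alpha": 0.3,
            "axes.prop_cycle": plt.cycler(color=EVAL_PALETTE),
        }
    )


# Call apply_eval_style() at the top of each custom artifact function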
Integration Example
Here is a comprehensive example that combines custom metrics and visualizations:
def comprehensive_custom_evaluation():
    """Complete example of custom metrics and visualizations."""
    # Define multiple custom metrics
    business_metric = create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150)
    threshold_metric = create_threshold_precision_metric(threshold=0.8)
    composite_metric = create_composite_business_metric()

    # Define custom visualizations
    custom_artifacts = [
        create_business_impact_visualization,
        create_performance_breakdown_visualization,
        create_interactive_analysis_artifacts,
    ]

    with mlflow.start_run(run_name="Comprehensive_Custom_Evaluation"):
        # Evaluate with all custom components
        result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="target",
            model_type="classifier",
            extra_metrics=[business_metric, threshold_metric, composite_metric],
            custom_artifacts=custom_artifacts,
        )

        # Log additional business context
        mlflow.log_params(
            {
                "evaluation_focus": "business_impact",
                "custom_metrics_count": 3,
                "custom_visualizations_count": len(custom_artifacts),
            }
        )

        print("Custom Evaluation Results:")
        print(f"Profit/Loss: ${result.metrics.get('profit_loss/net_profit', 0):.2f}")
        print(
            f"High Confidence Precision: {result.metrics.get('threshold_precision/high_confidence_precision', 0):.3f}"
        )
        print(
            f"Composite Score: {result.metrics.get('composite_business/composite_score', 0):.3f}"
        )

        print("\nCustom Artifacts Created:")
        for name, path in result.artifacts.items():
            if any(
                keyword in name.lower()
                for keyword in ["business", "performance", "interactive"]
            ):
                print(f"  {name}: {path}")


# Run comprehensive evaluation
# comprehensive_custom_evaluation()
Conclusion
Custom metrics and visualizations in MLflow let you build an evaluation framework that fits your business needs precisely. By defining domain-specific success criteria and creating targeted visual analyses, you bridge the gap between technical model performance and business value.
Key benefits include:
- Business alignment: metrics that directly reflect business impact and priorities
- Stakeholder communication: visual dashboards that make model performance accessible
- Domain expertise: evaluation criteria that capture industry-specific requirements
- Decision support: clear insights that drive informed model selection and deployment
Whether you are optimizing for profit, minimizing risk, or meeting regulatory requirements, custom metrics and visualizations keep your model evaluation process aligned with what truly matters for your use case.