MLflow XGBoost 集成
简介
XGBoost (eXtreme Gradient Boosting) 是一个流行的用于结构化数据的梯度提升库。MLflow 为实验跟踪、模型管理和部署提供了与 XGBoost 的原生集成。
此集成支持 XGBoost 的原生 API 和与 scikit-learn 兼容的接口,无论您偏好哪种 API,都可以轻松跟踪实验和部署模型。
为什么选择 MLflow + XGBoost?
自动日志记录
只需一行代码(mlflow.xgboost.autolog())即可捕获所有参数、每个提升轮次的指标以及特征重要性,无需手动插桩。
完整的模型记录
记录训练好的模型,包括序列化格式、输入/输出签名、模型依赖项和 Python 环境,以便进行可复现的部署。
超参数调优
自动为 GridSearchCV 和 RandomizedSearchCV 创建子运行,跟踪所有参数组合及其性能指标。
双 API 支持
支持原生 XGBoost API (xgb.train) 和与 scikit-learn 兼容的估计器 (XGBClassifier, XGBRegressor),并提供相同的自动日志记录功能。
开始使用
只需几行代码即可开始使用 XGBoost 和 MLflow
import mlflow
import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Enable autologging - captures params, per-round metrics, feature
# importance, and the trained model automatically.
mlflow.xgboost.autolog()

# Load and prepare data
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Prepare data in XGBoost's native DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train model - MLflow automatically logs everything!
with mlflow.start_run():
    model = xgb.train(
        params={
            "objective": "reg:squarederror",
            "max_depth": 6,
            "learning_rate": 0.1,
        },
        dtrain=dtrain,
        num_boost_round=100,
        # Metrics for each eval set are logged per boosting round.
        evals=[(dtrain, "train"), (dtest, "test")],
    )
自动日志记录可捕获参数、每轮次的指标、特征重要性(带可视化)以及训练好的模型。
在本地运行?MLflow 默认将实验存储在当前目录中。如需团队协作或远程跟踪,请设置跟踪服务器。
自动日志记录
启用自动日志记录,只需一行代码即可自动跟踪 XGBoost 实验
- 原生 XGBoost API
- Scikit-learn API
import mlflow
import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load data
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Enable autologging for the native XGBoost API (patches xgb.train)
mlflow.xgboost.autolog()

# Train with native API
with mlflow.start_run():
    dtrain = xgb.DMatrix(X_train, label=y_train)
    model = xgb.train(
        params={"objective": "reg:squarederror", "max_depth": 6},
        dtrain=dtrain,
        num_boost_round=100,
    )
import mlflow
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Load data
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Enable sklearn autologging (works with XGBoost sklearn-compatible estimators)
mlflow.sklearn.autolog()

# Train with sklearn-compatible API
with mlflow.start_run():
    model = XGBRegressor(n_estimators=100, max_depth=6)
    model.fit(X_train, y_train)
记录内容
启用自动日志记录后,MLflow 会自动捕获
- 参数:所有提升器参数和训练配置
- 指标:每个提升轮次的训练和验证指标
- 特征重要性:多种重要性类型(weight, gain, cover)及可视化
- 模型:带有正确序列化格式的训练好的模型
- 制品:特征重要性图和 JSON 数据
自动日志配置
自定义自动日志记录行为
# Customize autologging behavior via keyword options.
mlflow.xgboost.autolog(
    log_input_examples=True,  # store a sample of training data with the model
    log_model_signatures=True,  # infer and log the input/output signature
    log_models=True,
    log_datasets=True,
    model_format="json",  # Recommended for portability
    registered_model_name="XGBoostModel",  # auto-register each logged model
    extra_tags={"team": "data-science"},
)
超参数调优
网格搜索
MLflow 会自动为超参数调优创建子运行
import mlflow
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier

# Load data
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Enable autologging (sklearn autolog creates child runs for CV search)
mlflow.sklearn.autolog()

# Define parameter grid
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 6, 9],
    "learning_rate": [0.01, 0.1, 0.3],
}

# Run grid search - MLflow logs each combination as a child run
with mlflow.start_run():
    model = XGBClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best score: {grid_search.best_score_}")
Optuna 集成
用于更高级的超参数优化
import mlflow
import optuna
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load data
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

mlflow.xgboost.autolog()


def objective(trial):
    """Optuna objective: train one XGBClassifier and return test accuracy."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
    }
    # Each trial is tracked as a nested child run of the parent run below.
    with mlflow.start_run(nested=True):
        model = XGBClassifier(**params, random_state=42)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
    return score


with mlflow.start_run():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)
    # Record the best trial's params/score on the parent run for easy lookup.
    mlflow.log_params({f"best_{k}": v for k, v in study.best_params.items()})
    mlflow.log_metric("best_score", study.best_value)
模型管理
记录具有特定配置的模型
import mlflow.xgboost
import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load data
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)
dtrain = xgb.DMatrix(X_train, label=y_train)

with mlflow.start_run():
    params = {"objective": "reg:squarederror", "max_depth": 6}
    model = xgb.train(params, dtrain, num_boost_round=100)
    # Explicitly log the model with a specific serialization format and
    # register it in the model registry in one step.
    mlflow.xgboost.log_model(
        xgb_model=model,
        name="model",
        model_format="json",  # Recommended for portability
        registered_model_name="production_model",
    )
使用 model_format="json" 以获得跨 XGBoost 版本最佳的可移植性。json 格式是人类可读且跨平台兼容的。
加载模型进行推理
# Load the native XGBoost model object from a run's artifacts
model = mlflow.xgboost.load_model("runs:/<run_id>/model")
# Load as PyFunc for a generic, framework-agnostic predict() interface
pyfunc_model = mlflow.pyfunc.load_model("runs:/<run_id>/model")
# Load from the model registry using a version alias
model = mlflow.pyfunc.load_model("models:/XGBoostModel@champion")
模型注册表集成
注册和管理模型版本
import mlflow.xgboost
import xgboost as xgb
from mlflow import MlflowClient
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load and prepare data
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)
dtrain = xgb.DMatrix(X_train, label=y_train)

# Register model during training
with mlflow.start_run():
    params = {"objective": "reg:squarederror", "max_depth": 6}
    model = xgb.train(params, dtrain, num_boost_round=100)
    mlflow.xgboost.log_model(
        xgb_model=model,
        name="model",
        registered_model_name="XGBoostModel",
    )

# Set alias for deployment (aliases replace deprecated stage transitions)
client = MlflowClient()
client.set_registered_model_alias(
    name="XGBoostModel",
    alias="champion",
    version=1,
)

# Load model by alias
model = mlflow.pyfunc.load_model("models:/XGBoostModel@champion")
模型服务
在本地服务模型进行测试
# Serve the registered model locally as a REST scoring server on port 5000
mlflow models serve -m "models:/XGBoostModel@champion" -p 5000
通过 REST API 进行预测
import requests
import pandas as pd

# Example payload; column names must match the model's input signature.
data = pd.DataFrame(
    {
        "feature1": [1.2, 2.3],
        "feature2": [0.8, 1.5],
        "feature3": [3.4, 4.2],
    }
)

# POST to the local scoring server started by `mlflow models serve`.
# Fixed: original URL "https://:5000/invocations" was malformed (no host,
# wrong scheme for the local HTTP server).
response = requests.post(
    "http://127.0.0.1:5000/invocations",
    headers={"Content-Type": "application/json"},
    json={"dataframe_split": data.to_dict(orient="split")},
)
predictions = response.json()
部署到云平台
# Deploy to AWS SageMaker (assumes AWS credentials and the SageMaker
# deployment plugin are configured — verify in your environment)
mlflow deployments create \
  -t sagemaker \
  --name xgboost-endpoint \
  -m models:/XGBoostModel@champion

# Deploy to Azure ML (assumes the Azure ML deployment plugin is installed)
mlflow deployments create \
  -t azureml \
  --name xgboost-service \
  -m models:/XGBoostModel@champion