Machine Learning in Practice: XGBoost and LightGBM (Part 10)

1. XGBoost

Anti-Fraud Model

"""
XGBoost案例

案例:金融反欺诈模型
说明:
特征变量:客户换设备次数,支付失败次数,换IP次数,换IP国次数,交易金额
目标变量:本次交易是否存在欺诈,1:代表欺诈,0:代表正常交易
其中400个欺诈样本,600个非欺诈样本
"""

"""
# 需要安装的包
pip3 install pandas
pip3 install matplotlib
pip3 install pyyaml
pip3 install xgboost
pip3 install openpyxl
"""

import pandas as pd
import pickle
import os

import yaml
import json
import datetime
import xgboost as xgb

def save_MLModel(model_url, model_file, feature_list, name="MLmodel"):
    model_yaml = {}

    # lang: the language the model is implemented in, e.g. python or java
    model_yaml['lang'] = {}
    model_yaml['lang']['type'] = 'python'
    model_yaml['lang']['env_file'] = 'conda.yaml'

    # flavors: defines the model's algorithm type and which module should be used to load it
    model_yaml['flavors'] = {'sklearn': {}}
    model_yaml['flavors']['sklearn']['pickled_model'] = model_file
    model_yaml['flavors']['sklearn']['serialization_format'] = 'pickle'
    model_yaml['flavors']['sklearn']['algorithm_name'] = 'gbdt'

    # signature: describes the model's inputs and outputs
    model_yaml['signature'] = {}

    # objective: prediction task type: 1 = binary classification, 2 = regression, 3 = multiclass, 4 = clustering
    model_yaml['objective'] = 1

    inputs_list = []
    for col in feature_list:
        fdict = {'name': col, 'type': 'float'}
        inputs_list.append(fdict)

    model_yaml['signature']['inputs'] = json.dumps(inputs_list)

    outputs_list = [{"type": "float", "name": 'DEFAULT_()'}]
    model_yaml['signature']['outputs'] = json.dumps(outputs_list)
    model_yaml['utc_time_created'] = str(datetime.datetime.now())

    with open(os.path.join(model_url, name), 'w', encoding='utf8') as t:
        yaml.dump(model_yaml, t)

    conda = {
        "channels": [
            "defaults"
        ],
        "dependencies": [
            "python=3.6.4",
            {
                "pip": [
                    "pandas<1.2",
                    "joblib",
                    "xgboost<1.5"
                ]
            }
        ],
        "name": "mlflow-env"
    }

    # Write the conda.yaml file
    with open(os.path.join(model_url, 'conda.yaml'), 'w', encoding='utf8') as t:
        yaml.dump(conda, t)

# Create the model directory if it does not exist
if not os.path.exists('model'):
    os.makedirs('model')

# 1. Read the data ('信用卡交易数据.xlsx' = credit card transaction data)
file = '信用卡交易数据.xlsx'
# data = pd.read_csv(file)
df = pd.read_excel(file)
print(df.head())
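# Fallback sketch (not part of the original article): if '信用卡交易数据.xlsx' is unavailable,
# the read_excel() line above can be replaced by a synthetic dataset matching the schema
# described in the docstring (5 numeric features, 400 fraud / 600 normal samples).
# The feature column names below are assumptions and may differ from the real file.
# import numpy as np
# rng = np.random.default_rng(123)
# df = pd.DataFrame({
#     '换设备次数': rng.integers(0, 6, 1000),             # device changes
#     '支付失败次数': rng.integers(0, 10, 1000),           # failed payments
#     '换IP次数': rng.integers(0, 8, 1000),                # IP changes
#     '换IP国次数': rng.integers(0, 4, 1000),              # IP-country changes
#     '交易金额': rng.uniform(10, 10000, 1000).round(2),   # transaction amount
#     '欺诈标签': [1] * 400 + [0] * 600,                   # fraud label (1 = fraud, 0 = normal)
# })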

# 2. Extract the feature variables and the target variable ('欺诈标签' = fraud label)
X = df.drop(columns='欺诈标签')
y = df['欺诈标签']

print(X.head())
print(y.head())

# 3. Split into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# 4. Build and train the model
from xgboost import XGBClassifier

# Set the maximum number of boosting rounds (the number of weak learners) to 100
clf = XGBClassifier(n_estimators=100, learning_rate=0.05)

print("模型训练中...")

# Train the model
clf.fit(X_train, y_train)

print("模型训练结束...")

# TODO: save the model (one possible approach is sketched below)
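# One way to fill in the TODO above (a sketch, not the original author's code): pickle the
# trained classifier into the 'model' directory and describe it with the save_MLModel()
# helper defined earlier. The file name 'model.pkl' is an assumed choice.
model_file = 'model.pkl'
with open(os.path.join('model', model_file), 'wb') as f:
    pickle.dump(clf, f)
save_MLModel('model', model_file, list(X.columns))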

# 5. Model prediction and evaluation
y_pred = clf.predict(X_test)
print("预测值:", y_pred)

# Put the predictions and actual values side by side for easy comparison
a = pd.DataFrame()  # create an empty DataFrame
a['预测值'] = list(y_pred)   # '预测值' = predicted value
a['实际值'] = list(y_test)   # '实际值' = actual value

print(a.head())

# Check the prediction accuracy on the whole test set
from sklearn.metrics import accuracy_score

score = accuracy_score(y_test, y_pred)
print("Accuracy:", score)

# Use XGBClassifier's built-in score() method to check accuracy
clf_score = clf.score(X_test, y_test)
print("clf准确度:", clf_score)

# Use predict_proba() to get the predicted probability of each class; column 1 is the probability of class 1 (fraud)
y_pred_proba = clf.predict_proba(X_test)
print(y_pred_proba[:, 1])

# 6. Plot the ROC curve to evaluate the model's predictive performance
from sklearn.metrics import roc_curve

fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])
import matplotlib.pyplot as plt

plt.plot(fpr, tpr)
plt.show()  # this call blocks; close the figure window before the rest of the script continues
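# Alternative (an assumption, not in the original script): instead of the blocking plt.show()
# above, the figure could be written to disk so the script keeps running, e.g.:
# plt.savefig('roc_curve.png')   # hypothetical output path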

# 7. Compute the model's AUC
from sklearn.metrics import roc_auc_score

auc_score = roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC:", auc_score)

# 8. Inspect each feature's importance, to identify the variables that matter most for detecting credit card fraud
# f_importance = clf.feature_importances_
# print("Feature importances:", f_importance)

# Collect the feature names and importances
features = X.columns  # feature names
importances = clf.feature_importances_  # feature importances
# Organize into a table, sorted by importance in descending order
importances_df = pd.DataFrame()
importances_df['特征名称'] = features       # '特征名称' = feature name
importances_df['特征重要性'] = importances   # '特征重要性' = feature importance
importances_df = importances_df.sort_values('特征重要性', ascending=False)
print("Feature importance table:\n", importances_df)
