diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md index 899f6fa..b1ee0c6 100644 --- a/docs/source/quickstart.md +++ b/docs/source/quickstart.md @@ -562,32 +562,277 @@ bin_plot(score_table_train, desc="训练集模型评分", figsize=(10, 6), ancho ```python # 查看某个特征的 PSI +score_clip = card.score_clip(train["score"], clip=10) score_table_train = feature_bin_stats(train, "score", desc="训练集模型评分", target=target, rules=score_clip) score_table_test = feature_bin_stats(test, "score", desc="测试集模型评分", target=target, rules=score_clip) train_test_score_psi = psi_plot(score_table_train, score_table_test, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png", result=True) # 查看某个入模特征的 CSI for col in card._feature_names: - rule = combiner[col] - feature_table_train = feature_bin_stats(train, col, target=target, desc="训练集分布", combiner=rule) - feature_table_test = feature_bin_stats(test, col, target=target, desc="测试集分布", combiner=rule) + feature_table_train = feature_bin_stats(train, col, target=target, desc="训练集分布", combiner=combiner) + feature_table_test = feature_bin_stats(test, col, target=target, desc="测试集分布", combiner=combiner) train_test_csi_table = csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=["训练数据集", "测试数据集"], save=f"model_report/csi_{col}.png") ```
- +
- +
+### 模型持久化存储 + +`scorecardpipeline` 提供了几种可选的模型持久化存储方式,可以将训练好的评分卡模型保存为 `pickle` 或 `pmml` 格式的模型文件,供后续生产部署或离线回溯使用,非常方便快捷。 + +```python +# 将评分卡模型保存 pmml 文件 +scorecard_pipeline = card.scorecard2pmml(pmml="model_report/scorecard.pmml", debug=True) +# 将评分卡模型保存 pickle 文件 +save_pickle(card, "model_report/scorecard.pkl") +``` + + +### 评分卡 `pipeline` 建模 + +在 `scorecardpipeline` 中,几乎所有的模型和数据预处理步骤都支持 `pipeline` 式构建模型,同时还可以与 `sklearn` 中其他的 `pipeline` 组件一起构建模型。 + +```python +# 构建 pipeline +model_pipeline = Pipeline([ + ("preprocessing", FeatureSelection(target=target, engine="scorecardpy")), + ("combiner", Combiner(target=target, min_bin_size=0.2)), + ("transform", WOETransformer(target=target)), + ("processing_select", FeatureSelection(target=target, engine="toad")), + ("stepwise", StepwiseSelection(target=target)), + ("logistic", ITLubberLogisticRegression(target=target)), +]) +# 训练 pipeline +model_pipeline.fit(train) +# 转换评分卡 +card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10) +card.fit(model_pipeline[:-1].transform(train)) +card.scorecard_points() +``` + +| 序号 | 变量名称 | 变量分箱 | 对应分数 | +|:--:|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------|-----------:| +| 0 | purpose | radio/television,car (used) | 12.8514 | +| 1 | purpose | business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs | 2.7818 | +| 2 | installment_rate_in_percentage_of_disposable_income | [负无穷 , 4.0) | 7.7878 | +| 3 | installment_rate_in_percentage_of_disposable_income | [4.0 , 正无穷) | 0.9877 | +| 4 | installment_rate_in_percentage_of_disposable_income | 缺失值 | 10.7462 | +| 5 | savings_account_and_bonds | 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM | 5.9834 | +| 6 | savings_account_and_bonds | ... < 100 DM,100 <= ... 
< 500 DM | 12.1585 | +| 7 | savings_account_and_bonds | 缺失值 | 3.2466 | +| 8 | present_employment_since | 4 <= ... < 7 years,... >= 7 years | 9.4896 | +| 9 | present_employment_since | 缺失值,unemployed,1 <= ... < 4 years,... < 1 year | 3.8957 | +| 10 | age_in_years | [负无穷 , 35.0) | 0.8917 | +| 11 | age_in_years | [35.0 , 正无穷) | 10.8457 | +| 12 | age_in_years | 缺失值 | 6.7213 | +| 13 | property | real estate | 11.2541 | +| 14 | property | 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds | 4.0428 | +| 15 | personal_status_and_sex | 缺失值,female : divorced/separated/married | 7.8997 | +| 16 | personal_status_and_sex | male : single,male : married/widowed,male : divorced/separated | 3.7463 | +| 17 | credit_amount | [负无穷 , 2145.0) | 8.0057 | +| 18 | credit_amount | [2145.0 , 3804.0) | 14.2751 | +| 19 | credit_amount | [3804.0 , 正无穷) | -2.6347 | +| 20 | credit_amount | 缺失值 | 2.1778 | +| 21 | status_of_existing_checking_account | no checking account | 22.4653 | +| 22 | status_of_existing_checking_account | 缺失值,... >= 200 DM / salary assignments for at least 1 year | 6.6694 | +| 23 | status_of_existing_checking_account | 0 <= ... < 200 DM | 0.0181 | +| 24 | status_of_existing_checking_account | ... 
< 0 DM | -4.8558 | + + +### 评分卡全流程超参数搜索 + +```python +# 导入超参数搜索方法 +from sklearn.model_selection import GridSearchCV + +# 构建 pipeline +model_pipeline = Pipeline([ + ("preprocessing", FeatureSelection(target=target, engine="scorecardpy")), + ("combiner", Combiner(target=target, min_bin_size=0.2)), + ("transform", WOETransformer(target=target)), + ("processing_select", FeatureSelection(target=target, engine="toad")), + ("stepwise", StepwiseSelection(target=target)), + ("logistic", ITLubberLogisticRegression(target=target)), +]) + +# 定义超参数搜索空间,参数命名: {pipeline名称}__{对应超参数名称} +params_grid = { + "combiner__max_n_bins": [3], + "logistic__C": [np.power(2, i) for i in range(5)], + "logistic__penalty": ["l2"], + "logistic__class_weight": [None, "balanced"] + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10, 2)], + "logistic__max_iter": [10, 50, 100], + "logistic__solver": ["sag"], # ["liblinear", "sag", "lbfgs", "newton-cg"], +} + +pipeline_grid_search = GridSearchCV(model_pipeline, params_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1, return_train_score=True) +pipeline_grid_search.fit(train, train[target]) + +print(pipeline_grid_search.best_params_) + +# 更新模型 +model_pipeline.set_params(**pipeline_grid_search.best_params_) +model_pipeline.fit(train) + +# 转换评分卡 +card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10) +card.fit(model_pipeline[:-1].transform(train)) +``` + ### 模型报告输出 +在 `scorecardpipeline` 中,提供了操作 `excel` 文件的写入器 `ExcelWriter`,支持将文字、表格、图像等过程内容保存至 `excel` 文件中,对相关方法抽象和封装后,能够满足日常大部分数据分析过程中结果保存的需求。 -### 模型持久化存储 +`ExcelWriter`支持调整列宽、调整单元格格式、条件格式、指定位置插入数据、插入图片等功能,且提供了 `dataframe2excel` 来在日常工作中快速保存 `dataframe` 至 `excel` 文件中,并且自动设置样式。 + +```python +# 初始化 Excel 写入器 +writer = sp.ExcelWriter() + +start_row, start_col = 2, 2 + +# ////////////////////////////////////// 样本说明 ///////////////////////////////////// # +worksheet = writer.get_sheet_by_name("汇总信息") + +# 样本总体分布情况 +end_row, end_col = 
writer.insert_value2sheet(worksheet, (start_row, start_col), value="样本总体分布情况", style="header") +end_row, end_col = sp.dataframe2excel(dataset_summary, writer, worksheet, percent_cols=["样本占比", "坏客户占比"], start_row=end_row + 1) + +# 建模样本时间分布情况 +temp = sp.distribution_plot(df, date="date", target=target, save="model_report/all_sample_time_count.png", result=True) +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="建模样本时间分布情况", style="header") +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/all_sample_time_count.png", (end_row, start_col), figsize=(720, 370)) +end_row, end_col = sp.dataframe2excel(temp, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率"], condition_cols=["坏样本率"], start_row=end_row) + +# ////////////////////////////////////// 模型报告 ///////////////////////////////////// # +summary = logistic.summary2(feature_map=feature_map) + +# 逻辑回归拟合情况 +worksheet = writer.get_sheet_by_name("逻辑回归拟合结果") + +end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="逻辑回归拟合效果", style="header") +end_row, end_col = sp.dataframe2excel(summary, writer, worksheet, condition_cols=["Coef."], start_row=end_row + 1) + +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集拟合报告", style="header") +end_row, end_col = sp.dataframe2excel(logistic.report(train_woe_stepwise), writer, worksheet, percent_cols=["precision", "recall", "f1-score"], start_row=end_row + 1) + +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集拟合报告", style="header") +end_row, end_col = sp.dataframe2excel(logistic.report(test_woe_stepwise), writer, worksheet, percent_cols=["precision", "recall", "f1-score"], start_row=end_row + 1) + +# ////////////////////////////////////// 特征概述 ///////////////////////////////////// # +worksheet = writer.get_sheet_by_name("模型变量信息") + +start_row, start_col = 2, 2 +end_row, end_col = 
writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量信息", style="header") +end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={"index": "序号"}), (end_row + 1, start_col)) + +# 变量分布情况 +import toad +data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={"index": "变量名称", "type": "变量类型", "size": "样本个数", "missing": "缺失值", "unique": "唯一值个数"}) +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分布情况", style="header") +end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col)) + +# 变量相关性 +data_corr = train_woe_stepwise.corr() +logistic.corr(train_woe_stepwise, save="model_report/train_corr.png", annot=False) +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量相关性", style="header") +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_corr.png", (end_row + 1, start_col), figsize=(700, 500)) +end_row, end_col = sp.dataframe2excel(data_corr.reset_index().rename(columns={"index": ""}), writer, worksheet, color_cols=list(data_corr.columns), start_row=end_row + 1) + +# 变量分箱信息 +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分箱信息", style="header") + +for col in logistic.feature_names_in_: + feature_table = sp.feature_bin_stats(data, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner) + _ = sp.bin_plot(feature_table, desc=feature_map.get(col, "") or "逻辑回归入模变量", figsize=(8, 4), save=f"model_report/bin_plots/data_{col}.png") + + end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/bin_plots/data_{col}.png", (end_row + 1, start_col), figsize=(700, 400)) + end_row, end_col = sp.dataframe2excel(feature_table, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值"], condition_cols=["坏样本率", "LIFT值"], start_row=end_row) + +# 
////////////////////////////////////// 评分卡说明 ///////////////////////////////////// # +worksheet = writer.get_sheet_by_name("评分卡结果") + +# 评分卡刻度 +scorecard_kedu = card.scorecard_scale() +scorecard_points = card.scorecard_points(feature_map=feature_map) +scorecard_clip = card.score_clip(train["score"], clip=100) + +start_row, start_col = 2, 2 +end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="评分卡刻度", style="header") +end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col)) + +# 评分卡对应分数 +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡分数", style="header") +end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_points, (end_row + 1, start_col), merge_column="变量名称") + +# 评分效果 +score_table_train = sp.feature_bin_stats(train, "score", desc="训练集模型评分", target=target, rules=scorecard_clip) +score_table_test = sp.feature_bin_stats(test, "score", desc="测试集模型评分", target=target, rules=scorecard_clip) + +sp.ks_plot(train["score"], train[target], title="Train \tDataset", save="model_report/train_ksplot.png") +sp.ks_plot(test["score"], test[target], title="Test \tDataset", save="model_report/test_ksplot.png") + +sp.hist_plot(train["score"], train[target], save="model_report/train_scorehist.png", bins=30) +sp.hist_plot(test["score"], test[target], save="model_report/test_scorehist.png", bins=30) + +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集评分模型效果", style="header") +ks_row = end_row +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_ksplot.png", (ks_row, start_col)) +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_scorehist.png", (ks_row, end_col)) +end_row, end_col = sp.dataframe2excel(score_table_train, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"], condition_cols=["坏样本率", "LIFT值", "分档KS值"], start_row=end_row 
+ 1) + +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集评分模型效果", style="header") +ks_row = end_row +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_ksplot.png", (ks_row, start_col)) +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_scorehist.png", (ks_row, end_col)) +end_row, end_col = sp.dataframe2excel(score_table_test, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"], condition_cols=["坏样本率", "LIFT值", "分档KS值"], start_row=end_row + 1) + +# ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// # +worksheet = writer.get_sheet_by_name("模型稳定性") +start_row, start_col = 2, 2 + +# 评分分布稳定性 +train_test_score_psi = sp.psi_plot(score_table_train, score_table_test, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png", result=True) + +end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="模型评分稳定性指标 (Population Stability Index, PSI): 训练数据集 vs 测试数据集", style="header") +end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_test_psiplot.png", (end_row, start_col), figsize=(800, 400)) +end_row, end_col = sp.dataframe2excel(train_test_score_psi, writer, worksheet, percent_cols=["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"], condition_cols=["分档PSI值"], start_row=end_row + 1) + +# 变量 PSI 表 +for col in card._feature_names: + feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner) + feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner) + psi_table = sp.psi_plot(feature_table_train, feature_table_test, desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=["训练数据集", "测试数据集"], save=f"model_report/psi_{col}.png") + + end_row, end_col = writer.insert_pic2sheet(worksheet, 
f"model_report/psi_{col}.png", (end_row, start_col), figsize=(700, 400)) + end_row, end_col = sp.dataframe2excel(psi_table, writer, worksheet, percent_cols=["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率", "测试数据集% - 训练数据集%"], condition_cols=["分档PSI值"], start_row=end_row + 1) + +# 变量 CSI 表 +end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="入模变量稳定性指标 (Characteristic Stability Index, CSI): 训练数据集 vs 测试数据集", style="header") + +for col in card._feature_names: + feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner) + feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner) + train_test_csi_table = sp.csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=["训练数据集", "测试数据集"], save=f"model_report/csi_{col}.png") + + end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/csi_{col}.png", (end_row, start_col), figsize=(700, 400)) + end_row, end_col = sp.dataframe2excel(train_test_csi_table, writer, worksheet, percent_cols=["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率", "测试数据集% - 训练数据集%"], condition_cols=["分档CSI值"], start_row=end_row + 1) + +# 保存结果文件 +writer.save("model_report/评分卡模型报告.xlsx") +``` + +
+ +
## 交流 diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb new file mode 100644 index 0000000..354c616 --- /dev/null +++ b/examples/quickstart.ipynb @@ -0,0 +1,1812 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../\")" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "import scorecardpipeline as sp\n", + "from scorecardpipeline import *" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sp" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "logger = sp.init_setting(seed=6666, logger=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status_of_existing_checking_accountduration_in_monthcredit_historypurposecredit_amountsavings_account_and_bondspresent_employment_sinceinstallment_rate_in_percentage_of_disposable_incomepersonal_status_and_sexother_debtors_or_guarantors...propertyage_in_yearsother_installment_planshousingnumber_of_existing_credits_at_this_bankjobnumber_of_people_being_liable_to_provide_maintenance_fortelephoneforeign_workercreditability
0... < 0 DM6.0000NaNNaN1169.0000unknown/ no savings accountNaN4.0000male : divorced/separatednone...real estate67.0000noneownNaNskilled employee / official1.0000yes, registered under the customers nameyes0
1NaN48.0000existing credits paid back duly till nowNaN5951.0000... < 100 DM1 <= ... < 4 years2.0000male : divorced/separatednone...real estateNaNnoneown1.0000skilled employee / official1.0000noneyes1
2no checking account12.0000critical account/ other credits existing (not at this bank)educationNaNNaN4 <= ... < 7 years2.0000male : divorced/separatedNaN...NaN49.0000NaNown1.0000unskilled - resident2.0000NaNyes0
3... < 0 DMNaNexisting credits paid back duly till nowfurniture/equipment7882.0000... < 100 DMNaNNaNmale : divorced/separatedguarantor...building society savings agreement/ life insurance45.0000NaNfor free1.0000skilled employee / officialNaNnoneNaN0
4... < 0 DM24.0000NaNcar (new)4870.0000NaN1 <= ... < 4 years3.0000male : divorced/separatednone...unknown / no property53.0000nonefor freeNaNNaN2.0000noneyes1
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " status_of_existing_checking_account duration_in_month credit_history purpose credit_amount savings_account_and_bonds present_employment_since installment_rate_in_percentage_of_disposable_income personal_status_and_sex other_debtors_or_guarantors ... property age_in_years other_installment_plans housing number_of_existing_credits_at_this_bank job number_of_people_being_liable_to_provide_maintenance_for telephone foreign_worker creditability\n", + "0 ... < 0 DM 6.0000 NaN NaN 1169.0000 unknown/ no savings account NaN 4.0000 male : divorced/separated none ... real estate 67.0000 none own NaN skilled employee / official 1.0000 yes, registered under the customers name yes 0\n", + "1 NaN 48.0000 existing credits paid back duly till now NaN 5951.0000 ... < 100 DM 1 <= ... < 4 years 2.0000 male : divorced/separated none ... real estate NaN none own 1.0000 skilled employee / official 1.0000 none yes 1\n", + "2 no checking account 12.0000 critical account/ other credits existing (not at this bank) education NaN NaN 4 <= ... < 7 years 2.0000 male : divorced/separated NaN ... NaN 49.0000 NaN own 1.0000 unskilled - resident 2.0000 NaN yes 0\n", + "3 ... < 0 DM NaN existing credits paid back duly till now furniture/equipment 7882.0000 ... < 100 DM NaN NaN male : divorced/separated guarantor ... building society savings agreement/ life insurance 45.0000 NaN for free 1.0000 skilled employee / official NaN none NaN 0\n", + "4 ... < 0 DM 24.0000 NaN car (new) 4870.0000 NaN 1 <= ... < 4 years 3.0000 male : divorced/separated none ... 
unknown / no property 53.0000 none for free NaN NaN 2.0000 none yes 1\n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target = \"creditability\"\n", + "data = germancredit()\n", + "data[target] = data[target].map({\"good\": 0, \"bad\": 1})\n", + "\n", + "# 随机替换 20% 的数据为 np.nan\n", + "for col in data.columns.drop(target):\n", + " for i in range(len(data)):\n", + " if np.random.rand() > 0.8:\n", + " data[col].loc[i] = np.nan\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 2023-12-09 03:04:52,641 ][ INFO ][ 3133036894.py::3 ] 训练集数据: (700, 21), 测试集数据: (300, 21)\n", + "[ 2023-12-09 03:04:52,641 ][ INFO ][ 3133036894.py::3 ] 训练集数据: (700, 21), 测试集数据: (300, 21)\n", + "[ 2023-12-09 03:04:52,641 ][ INFO ][ 3133036894.py::3 ] 训练集数据: (700, 21), 测试集数据: (300, 21)\n" + ] + } + ], + "source": [ + "train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])\n", + "\n", + "logger.info(f\"训练集数据: {train.shape}, 测试集数据: {test.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
数据集开始时间结束时间样本总数样本占比坏客户数坏客户占比备注
0建模样本2022-01-012023-01-3110001.00003000.3000
1训练集2022-01-012023-12-317000.70002100.3000
2测试集2022-01-012023-12-313000.3000900.3000
\n", + "
" + ], + "text/plain": [ + " 数据集 开始时间 结束时间 样本总数 样本占比 坏客户数 坏客户占比 备注\n", + "0 建模样本 2022-01-01 2023-01-31 1000 1.0000 300 0.3000 \n", + "1 训练集 2022-01-01 2023-12-31 700 0.7000 210 0.3000 \n", + "2 测试集 2022-01-01 2023-12-31 300 0.3000 90 0.3000 " + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 模拟实际场景中的数据, date 为数据集中的日期,为 datetime 类型,实际生产过程中可能是 申请时间|放款时间|入催时间|流失时间 等\n", + "\n", + "df = pd.DataFrame()\n", + "df[\"date\"] = pd.date_range(start=\"2021-01-01\", end=\"2021-06-30\", freq=\"5H\")\n", + "df[target] = np.random.randint(0, 2, len(df))\n", + "\n", + "total_count = len(data)\n", + "dataset_summary = pd.DataFrame(\n", + " [\n", + " [\"建模样本\", \"2022-01-01\", \"2023-01-31\", len(data), len(data) / total_count, data[target].sum(), data[target].sum() / len(data), \"\"],\n", + " [\"训练集\", \"2022-01-01\", \"2023-12-31\", len(train), len(train) / total_count, train[target].sum(), train[target].sum() / len(train), \"\"],\n", + " [\"测试集\", \"2022-01-01\", \"2023-12-31\", len(test), len(test) / total_count, test[target].sum(), test[target].sum() / len(test), \"\"],\n", + " ],\n", + " columns=[\"数据集\", \"开始时间\", \"结束时间\", \"样本总数\", \"样本占比\", \"坏客户数\", \"坏客户占比\", \"备注\"],\n", + ")\n", + "\n", + "dataset_summary" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "distribution_plot(df, date=\"date\", target=target)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "select = FeatureSelection(target=target, engine=\"toad\", identical=0.95, empty=0.95, iv=0.02, corr=0.6)\n", + "select.fit(train)\n", + "\n", + "train_select = select.transform(train)\n", + "test_select = select.transform(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "combiner = Combiner(target=target, min_bin_size=0.2, empty_separate=True)\n", + "\n", + 
"combiner.fit(train_select)\n", + "\n", + "train_bins = combiner.transform(train_select)\n", + "test_bins = combiner.transform(test_select)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
指标名称指标含义分箱样本总数样本占比好样本数好样本占比坏样本数坏样本占比坏样本率分档WOE值分档IV值指标IV值LIFT值累积LIFT值累积好样本数累积坏样本数分档KS值
0credit_amount信用额度[负无穷 , 4000.0)4310.61573250.66331060.50480.24590.27310.04330.11580.81980.8198325106-0.1585
1credit_amount信用额度[4000.0 , 正无穷)1390.1986800.1633590.28100.4245-0.54280.06390.11581.41490.9649405165-0.0408
2credit_amount信用额度缺失值1300.1857850.1735450.21430.3462-0.21130.00860.11581.15381.00004902100.0000
\n", + "
" + ], + "text/plain": [ + " 指标名称 指标含义 分箱 样本总数 样本占比 好样本数 好样本占比 坏样本数 坏样本占比 坏样本率 分档WOE值 分档IV值 指标IV值 LIFT值 累积LIFT值 累积好样本数 累积坏样本数 分档KS值\n", + "0 credit_amount 信用额度 [负无穷 , 4000.0) 431 0.6157 325 0.6633 106 0.5048 0.2459 0.2731 0.0433 0.1158 0.8198 0.8198 325 106 -0.1585\n", + "1 credit_amount 信用额度 [4000.0 , 正无穷) 139 0.1986 80 0.1633 59 0.2810 0.4245 -0.5428 0.0639 0.1158 1.4149 0.9649 405 165 -0.0408\n", + "2 credit_amount 信用额度 缺失值 130 0.1857 85 0.1735 45 0.2143 0.3462 -0.2113 0.0086 0.1158 1.1538 1.0000 490 210 0.0000" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combiner.bin_plot(train_select, \"credit_amount\", result=True, desc=\"信用额度\", rule=[4000.0, np.nan])" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "transform = WOETransformer(target=target)\n", + "transform.fit(train_bins)\n", + "\n", + "train_woe = transform.transform(train_bins)\n", + "test_woe = transform.transform(test_bins)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "# 初始化逐步回归特征筛选器\n", + "stepwise = StepwiseSelection(target=target)\n", + "# 训练\n", + "stepwise.fit(train_woe)\n", + "# 应用逐步回归特征筛选器\n", + "train_woe_stepwise = stepwise.transform(train_woe)\n", + "test_woe_stepwise = stepwise.transform(test_woe)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "# 逻辑回归模型构建\n", + "logistic = ITLubberLogisticRegression(target=target)\n", + "# 训练\n", + "logistic.fit(train_woe_stepwise)\n", + "# 预测数据集样本违约概率\n", + "y_pred_train = logistic.predict_proba(train_woe_stepwise.drop(columns=target))[:, 1]\n", + "y_pred_test = logistic.predict_proba(test_woe_stepwise.drop(columns=target))[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeaturesDescribeCoef.Std.ErrzP>|z|[ 0.0250.975 ]VIF
0const截距项-0.84230.0942-8.94070.0000-1.0269-0.65761.0499
1status_of_existing_checking_account现有支票账户的状态0.90180.12956.96530.00000.64801.15551.0828
2purpose目的1.00390.30823.25770.00110.39991.60791.0108
3credit_amount信用额度1.10530.25714.29880.00000.60141.60931.0346
4savings_account_and_bonds0.60720.25162.41300.01580.11401.10041.0635
5present_employment_since现居住地至今0.67390.36671.83790.0661-0.04481.39261.0612
6installment_rate_in_percentage_of_disposable_income分期付款率占可支配收入的百分比1.28560.43822.93380.00330.42672.14441.0178
7personal_status_and_sex个人地位和性别0.80990.52581.54040.1235-0.22061.84051.0106
8property0.82690.38412.15270.03130.07401.57971.0279
9age_in_years年龄0.90570.27493.29460.00100.36691.44451.0459
\n", + "
" + ], + "text/plain": [ + " Features Describe Coef. Std.Err z P>|z| [ 0.025 0.975 ] VIF\n", + "0 const 截距项 -0.8423 0.0942 -8.9407 0.0000 -1.0269 -0.6576 1.0499\n", + "1 status_of_existing_checking_account 现有支票账户的状态 0.9018 0.1295 6.9653 0.0000 0.6480 1.1555 1.0828\n", + "2 purpose 目的 1.0039 0.3082 3.2577 0.0011 0.3999 1.6079 1.0108\n", + "3 credit_amount 信用额度 1.1053 0.2571 4.2988 0.0000 0.6014 1.6093 1.0346\n", + "4 savings_account_and_bonds 0.6072 0.2516 2.4130 0.0158 0.1140 1.1004 1.0635\n", + "5 present_employment_since 现居住地至今 0.6739 0.3667 1.8379 0.0661 -0.0448 1.3926 1.0612\n", + "6 installment_rate_in_percentage_of_disposable_income 分期付款率占可支配收入的百分比 1.2856 0.4382 2.9338 0.0033 0.4267 2.1444 1.0178\n", + "7 personal_status_and_sex 个人地位和性别 0.8099 0.5258 1.5404 0.1235 -0.2206 1.8405 1.0106\n", + "8 property 0.8269 0.3841 2.1527 0.0313 0.0740 1.5797 1.0279\n", + "9 age_in_years 年龄 0.9057 0.2749 3.2946 0.0010 0.3669 1.4445 1.0459" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 数据字典或特征描述信息\n", + "feature_map = {\n", + " \"const\": \"截距项\",\n", + " \"status_of_existing_checking_account\": \"现有支票账户的状态\",\n", + " \"credit_history\": \"信用记录\",\n", + " \"purpose\": \"目的\",\n", + " \"credit_amount\": \"信用额度\",\n", + " \"present_employment_since\": \"现居住地至今\",\n", + " \"installment_rate_in_percentage_of_disposable_income\": \"分期付款率占可支配收入的百分比\",\n", + " \"personal_status_and_sex\": \"个人地位和性别\",\n", + " \"age_in_years\": \"年龄\",\n", + " \"housing\": \"住房情况\",\n", + "}\n", + "# summary 仅支持输出简单的统计信息,使用 summary2 可以输出有特征描述的统计信息表\n", + "logistic.summary2(feature_map=feature_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "logistic.plot_weights(figsize=(10, 6), save=\"model_report/sp_lr_weight.png\");" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "bad_rate = 
train[target].mean()\n", + "# 逻辑回归模型转评分卡\n", + "card = ScoreCard(target=target, combiner=combiner, transer=transform, pretrain_lr=logistic, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)\n", + "# 训练\n", + "card.fit(train_woe_stepwise)\n", + "\n", + "# 预测\n", + "train[\"score\"] = card.predict(train)\n", + "test[\"score\"] = card.predict(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
指标名称指标含义分箱样本总数样本占比好样本数好样本占比坏样本数坏样本占比坏样本率分档WOE值分档IV值指标IV值LIFT值累积LIFT值累积好样本数累积坏样本数分档KS值
0score训练集模型评分[负无穷 , 40)1110.1586410.0837700.33330.6306-1.38220.34510.84212.10212.102141700.2497
1score训练集模型评分[40 , 60)3010.43001950.39801060.50480.3522-0.23770.02540.84211.17391.42392361760.3565
2score训练集模型评分[60 , 80)2270.32431960.4000310.14760.13660.99680.25160.84210.45521.07984322070.1041
3score训练集模型评分[80 , 正无穷)610.0871580.118430.01430.04922.11450.22010.84210.16391.00004902100.0000
\n", + "
" + ], + "text/plain": [ + " 指标名称 指标含义 分箱 样本总数 样本占比 好样本数 好样本占比 坏样本数 坏样本占比 坏样本率 分档WOE值 分档IV值 指标IV值 LIFT值 累积LIFT值 累积好样本数 累积坏样本数 分档KS值\n", + "0 score 训练集模型评分 [负无穷 , 40) 111 0.1586 41 0.0837 70 0.3333 0.6306 -1.3822 0.3451 0.8421 2.1021 2.1021 41 70 0.2497\n", + "1 score 训练集模型评分 [40 , 60) 301 0.4300 195 0.3980 106 0.5048 0.3522 -0.2377 0.0254 0.8421 1.1739 1.4239 236 176 0.3565\n", + "2 score 训练集模型评分 [60 , 80) 227 0.3243 196 0.4000 31 0.1476 0.1366 0.9968 0.2516 0.8421 0.4552 1.0798 432 207 0.1041\n", + "3 score 训练集模型评分 [80 , 正无穷) 61 0.0871 58 0.1184 3 0.0143 0.0492 2.1145 0.2201 0.8421 0.1639 1.0000 490 210 0.0000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# 训练集评分排序性\n", + "score_clip = card.score_clip(train[\"score\"], clip=20)\n", + "score_table_train = feature_bin_stats(train, \"score\", desc=\"训练集模型评分\", target=target, rules=score_clip)\n", + "bin_plot(score_table_train, desc=\"训练集模型评分\", figsize=(10, 6), anchor=0.935, save=\"model_report/train_score_bins.png\")\n", + "\n", + "display(score_table_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"savings_account_and_bonds\" in combiner" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: '500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM',\n", + " 1: '... < 100 DM,100 <= ... 
< 500 DM',\n", + " -1: '缺失值'}" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_bins(np.array(combiner[\"savings_account_and_bonds\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# # 查看某个特征的 PSI\n", + "# score_clip = card.score_clip(train[\"score\"], clip=10)\n", + "# score_table_train = feature_bin_stats(train, \"score\", desc=\"训练集模型评分\", target=target, rules=score_clip)\n", + "# score_table_test = feature_bin_stats(test, \"score\", desc=\"测试集模型评分\", target=target, rules=score_clip)\n", + "# train_test_score_psi = psi_plot(score_table_train, score_table_test, labels=[\"训练数据集\", \"测试数据集\"], save=\"model_report/train_test_psiplot.png\", result=True)\n", + "\n", + "# # 查看某个入模特征的 CSI\n", + "# for col in card._feature_names:\n", + "# feature_table_train = feature_bin_stats(train, col, target=target, desc=\"训练集分布\", combiner=combiner)\n", + "# feature_table_test = feature_bin_stats(test, col, target=target, desc=\"测试集分布\", combiner=combiner)\n", + "# train_test_csi_table = csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=[\"训练数据集\", \"测试数据集\"], save=f\"model_report/csi_{col}.png\")\n", + "# if col == \"savings_account_and_bonds\": break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# for col in card._feature_names:\n", + "# print(col)\n", + "# feature_table = feature_bin_stats(data, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n", + "# _ = sp.bin_plot(feature_table, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", figsize=(8, 4), anchor=0.9)\n", + " \n", + "# display(feature_table)\n", + "# plt.show()" + ] + }, + { + "cell_type": 
"code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'bins': array([35., nan]),\n", + " 'woes': array([ 0.35631058, -0.40546511, -0.08982696]),\n", + " 'weight': 0.9057217143033184,\n", + " 'scores': array([ 0.89168832, 10.84566059, 6.7212793 ])}" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "card[\"age_in_years\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "save_pickle(card, \"model_report/scorecard.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "python: 3.8.13\n", + "sklearn: 1.2.2\n", + "sklearn2pmml: 0.90.4\n", + "joblib: 1.2.0\n", + "sklearn_pandas: 2.2.0\n", + "pandas: 1.5.3\n", + "numpy: 1.22.2\n", + "openjdk: 1.8.0_362\n", + "Executing command:\n", + "java -cp /home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/sklearn2pmml-1.0-SNAPSHOT.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/gson-2.10.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/guava-21.0.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/h2o-genmodel-3.38.0.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/h2o-logger-3.38.0.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/h2o-tree-api-0.3.17.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/istack-commons-runtime-4.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jackson-annotations-2.13.3.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/
jakarta.activation-2.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jakarta.xml.bind-api-3.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jaxb-core-3.0.2.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jaxb-runtime-3.0.2.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jcommander-1.72.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pickle-1.3.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-converter-1.5.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-h2o-1.2.5.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-lightgbm-1.4.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-model-1.6.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-model-metro-1.6.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-python-1.1.11.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-extension-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-h2o-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-lightgbm-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-statsmodels-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-xgboost-
1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-statsmodels-1.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-xgboost-1.7.3.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/serpent-1.40.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/slf4j-api-1.7.36.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/slf4j-jdk14-1.7.36.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/ubjson-0.1.8.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/ubjson-gson-0.1.8.jar com.sklearn2pmml.Main --pkl-pipeline-input /tmp/pipeline-wyc0ltv2.pkl.z --pmml-output model_report/scorecard.pmml\n", + "Standard output is empty\n", + "Standard error:\n", + "十二月 09, 2023 2:40:35 上午 sklearn2pmml.pipeline.PMMLPipeline encodePMML\n", + "警告: Model verification data is not set. Use method 'sklearn2pmml.pipeline.PMMLPipeline.verify(X)' to correct this deficiency\n", + "\n", + "Preserved joblib dump file(s): /tmp/pipeline-wyc0ltv2.pkl.z\n" + ] + } + ], + "source": [ + "scorecard_pipeline = card.scorecard2pmml(pmml=\"model_report/scorecard.pmml\", debug=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predtrue
27074.646774.6467
45579.467879.4678
68941.641741.6417
71154.603154.6031
66346.482446.4824
.........
1688.840688.8406
1551.482551.4825
38460.964960.9649
57261.988161.9881
41373.707973.7079
\n", + "

700 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " pred true\n", + "270 74.6467 74.6467\n", + "455 79.4678 79.4678\n", + "689 41.6417 41.6417\n", + "711 54.6031 54.6031\n", + "663 46.4824 46.4824\n", + ".. ... ...\n", + "16 88.8406 88.8406\n", + "15 51.4825 51.4825\n", + "384 60.9649 60.9649\n", + "572 61.9881 61.9881\n", + "413 73.7079 73.7079\n", + "\n", + "[700 rows x 2 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame({\"pred\": scorecard_pipeline.predict(train), \"true\": train[\"score\"]})" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "# # 导入超参数搜索方法\n", + "# from sklearn.model_selection import GridSearchCV\n", + "\n", + "# # 构建 pipeline\n", + "# model_pipeline = Pipeline([\n", + "# (\"preprocessing\", FeatureSelection(target=target, engine=\"scorecardpy\")),\n", + "# (\"combiner\", Combiner(target=target, min_bin_size=0.2)),\n", + "# (\"transform\", WOETransformer(target=target)),\n", + "# (\"processing_select\", FeatureSelection(target=target, engine=\"toad\")),\n", + "# (\"stepwise\", StepwiseSelection(target=target)),\n", + "# (\"logistic\", ITLubberLogisticRegression(target=target)),\n", + "# ])\n", + "\n", + "# params_grid = {\n", + "# \"combiner__max_n_bins\": [3],\n", + "# \"logistic__C\": [np.power(2, i) for i in range(5)],\n", + "# \"logistic__penalty\": [\"l2\"],\n", + "# \"logistic__class_weight\": [None, \"balanced\"] + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10, 2)],\n", + "# \"logistic__max_iter\": [10, 50, 100],\n", + "# \"logistic__solver\": [\"sag\"], # [\"liblinear\", \"sag\", \"lbfgs\", \"newton-cg\"],\n", + "# }\n", + "\n", + "# pipeline_grid_search = GridSearchCV(model_pipeline, params_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1, return_train_score=True)\n", + "# pipeline_grid_search.fit(train, train[target])\n", + "\n", + "# print(pipeline_grid_search.best_params_)\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] filtering variables ...\n", + "| | 变量名称 | 变量分箱 | 对应分数 |\n", + "|---:|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------|-----------:|\n", + "| 0 | purpose | radio/television,car (used) | 12.8514 |\n", + "| 1 | purpose | business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs | 2.7818 |\n", + "| 2 | installment_rate_in_percentage_of_disposable_income | [负无穷 , 4.0) | 7.7878 |\n", + "| 3 | installment_rate_in_percentage_of_disposable_income | [4.0 , 正无穷) | 0.9877 |\n", + "| 4 | installment_rate_in_percentage_of_disposable_income | 缺失值 | 10.7462 |\n", + "| 5 | savings_account_and_bonds | 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM | 5.9834 |\n", + "| 6 | savings_account_and_bonds | ... < 100 DM,100 <= ... < 500 DM | 12.1585 |\n", + "| 7 | savings_account_and_bonds | 缺失值 | 3.2466 |\n", + "| 8 | present_employment_since | 4 <= ... < 7 years,... >= 7 years | 9.4896 |\n", + "| 9 | present_employment_since | 缺失值,unemployed,1 <= ... < 4 years,... 
< 1 year | 3.8957 |\n", + "| 10 | age_in_years | [负无穷 , 35.0) | 0.8917 |\n", + "| 11 | age_in_years | [35.0 , 正无穷) | 10.8457 |\n", + "| 12 | age_in_years | 缺失值 | 6.7213 |\n", + "| 13 | property | real estate | 11.2541 |\n", + "| 14 | property | 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds | 4.0428 |\n", + "| 15 | personal_status_and_sex | 缺失值,female : divorced/separated/married | 7.8997 |\n", + "| 16 | personal_status_and_sex | male : single,male : married/widowed,male : divorced/separated | 3.7463 |\n", + "| 17 | credit_amount | [负无穷 , 2145.0) | 8.0057 |\n", + "| 18 | credit_amount | [2145.0 , 3804.0) | 14.2751 |\n", + "| 19 | credit_amount | [3804.0 , 正无穷) | -2.6347 |\n", + "| 20 | credit_amount | 缺失值 | 2.1778 |\n", + "| 21 | status_of_existing_checking_account | no checking account | 22.4653 |\n", + "| 22 | status_of_existing_checking_account | 缺失值,... >= 200 DM / salary assignments for at least 1 year | 6.6694 |\n", + "| 23 | status_of_existing_checking_account | 0 <= ... < 200 DM | 0.0181 |\n", + "| 24 | status_of_existing_checking_account | ... 
< 0 DM | -4.8558 |\n" + ] + } + ], + "source": [ + "# 构建 pipeline\n", + "model_pipeline = Pipeline([\n", + " (\"preprocessing\", FeatureSelection(target=target, engine=\"scorecardpy\")),\n", + " (\"combiner\", Combiner(target=target, min_bin_size=0.2)),\n", + " (\"transform\", WOETransformer(target=target)),\n", + " (\"processing_select\", FeatureSelection(target=target, engine=\"toad\")),\n", + " (\"stepwise\", StepwiseSelection(target=target)),\n", + " (\"logistic\", ITLubberLogisticRegression(target=target)),\n", + "])\n", + "# 训练 pipeline\n", + "model_pipeline.fit(train)\n", + "# 转换评分卡\n", + "card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)\n", + "card.fit(model_pipeline[:-1].transform(train))\n", + "print(card.scorecard_points().to_markdown())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "card.scorecard_points()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
变量名称变量分箱对应分数
0purposeradio/television,car (used)12.8514
1purposebusiness,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs2.7818
2installment_rate_in_percentage_of_disposable_income[负无穷 , 4.0)7.7878
3installment_rate_in_percentage_of_disposable_income[4.0 , 正无穷)0.9877
4installment_rate_in_percentage_of_disposable_income缺失值10.7462
5savings_account_and_bonds500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM5.9834
6savings_account_and_bonds... < 100 DM,100 <= ... < 500 DM12.1585
7savings_account_and_bonds缺失值3.2466
8present_employment_since4 <= ... < 7 years,... >= 7 years9.4896
9present_employment_since缺失值,unemployed,1 <= ... < 4 years,... < 1 year3.8957
10age_in_years[负无穷 , 35.0)0.8917
11age_in_years[35.0 , 正无穷)10.8457
12age_in_years缺失值6.7213
13propertyreal estate11.2541
14property缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds4.0428
15personal_status_and_sex缺失值,female : divorced/separated/married7.8997
16personal_status_and_sexmale : single,male : married/widowed,male : divorced/separated3.7463
17credit_amount[负无穷 , 2145.0)8.0057
18credit_amount[2145.0 , 3804.0)14.2751
19credit_amount[3804.0 , 正无穷)-2.6347
20credit_amount缺失值2.1778
21status_of_existing_checking_accountno checking account22.4653
22status_of_existing_checking_account缺失值,... >= 200 DM / salary assignments for at least 1 year6.6694
23status_of_existing_checking_account0 <= ... < 200 DM0.0181
24status_of_existing_checking_account... < 0 DM-4.8558
\n", + "
" + ], + "text/plain": [ + " 变量名称 变量分箱 对应分数\n", + "0 purpose radio/television,car (used) 12.8514\n", + "1 purpose business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs 2.7818\n", + "2 installment_rate_in_percentage_of_disposable_income [负无穷 , 4.0) 7.7878\n", + "3 installment_rate_in_percentage_of_disposable_income [4.0 , 正无穷) 0.9877\n", + "4 installment_rate_in_percentage_of_disposable_income 缺失值 10.7462\n", + "5 savings_account_and_bonds 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM 5.9834\n", + "6 savings_account_and_bonds ... < 100 DM,100 <= ... < 500 DM 12.1585\n", + "7 savings_account_and_bonds 缺失值 3.2466\n", + "8 present_employment_since 4 <= ... < 7 years,... >= 7 years 9.4896\n", + "9 present_employment_since 缺失值,unemployed,1 <= ... < 4 years,... < 1 year 3.8957\n", + "10 age_in_years [负无穷 , 35.0) 0.8917\n", + "11 age_in_years [35.0 , 正无穷) 10.8457\n", + "12 age_in_years 缺失值 6.7213\n", + "13 property real estate 11.2541\n", + "14 property 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds 4.0428\n", + "15 personal_status_and_sex 缺失值,female : divorced/separated/married 7.8997\n", + "16 personal_status_and_sex male : single,male : married/widowed,male : divorced/separated 3.7463\n", + "17 credit_amount [负无穷 , 2145.0) 8.0057\n", + "18 credit_amount [2145.0 , 3804.0) 14.2751\n", + "19 credit_amount [3804.0 , 正无穷) -2.6347\n", + "20 credit_amount 缺失值 2.1778\n", + "21 status_of_existing_checking_account no checking account 22.4653\n", + "22 status_of_existing_checking_account 缺失值,... >= 200 DM / salary assignments for at least 1 year 6.6694\n", + "23 status_of_existing_checking_account 0 <= ... < 200 DM 0.0181\n", + "24 status_of_existing_checking_account ... 
< 0 DM -4.8558" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bad_rate = train[target].mean()\n", + "# 逻辑回归模型转评分卡\n", + "card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)\n", + "# 训练\n", + "card.fit(model_pipeline[:-1].transform(train))\n", + "card.scorecard_points()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# 数据字段[可选]\n", + "feature_describe = pd.DataFrame([\n", + " [\"status_account\", \"支票账户状态\"], [\"duration\", \"借款周期\"], [\"credit_histor\", \"历史信用\"], [\"purpose\", \"借款目的\"], [\"amount\", \"信用额度\"], [\"svaing_account\", \"储蓄账户状态\"], [\"present_emp\", \"当前就业状态\"], [\"income_rate\", \"分期付款占可支配收入百分比\"], [\"personal_status\", \"性别与婚姻状态\"], [\"other_debtors\", \"他人担保信息\"], [\"residence_info\", \"现居住地\"], [\"property\", \"财产状态\"], [\"age\", \"年龄\"], [\"inst_plans\", \"其他分期情况\"], [\"housing\", \"房产状态\"], [\"num_credits\", \"信用卡数量\"], [\"job\", \"工作状态\"], [\"dependents\", \"赡养人数\"], [\"telephone\", \"电话号码注册情况\"], [\"foreign_worke\", \"是否有海外工作经历\"],\n", + "], columns=[\"变量名称\", \"变量含义\"])\n", + "feature_map = dict(zip(feature_describe[\"变量名称\"], feature_describe[\"变量含义\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "writer = sp.ExcelWriter()\n", + "start_row, start_col = 2, 2" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "# # ////////////////////////////////////// 样本说明 ///////////////////////////////////// #\n", + "worksheet = writer.get_sheet_by_name(\"汇总信息\")\n", + "\n", + "# 样本总体分布情况\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"样本总体分布情况\", style=\"header\")\n", + "end_row, end_col = sp.dataframe2excel(dataset_summary, writer, worksheet, percent_cols=[\"样本占比\", \"坏客户占比\"], 
start_row=end_row + 1)\n", + "\n", + "# 建模样本时间分布情况\n", + "temp = sp.distribution_plot(df, date=\"date\", target=target, save=\"model_report/all_sample_time_count.png\", result=True)\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"建模样本时间分布情况\", style=\"header\")\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/all_sample_time_count.png\", (end_row, start_col), figsize=(720, 370))\n", + "end_row, end_col = sp.dataframe2excel(temp, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\"], condition_cols=[\"坏样本率\"], start_row=end_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "# ////////////////////////////////////// 模型报告 ///////////////////////////////////// #\n", + "summary = logistic.summary2(feature_map=feature_map)\n", + "\n", + "# 逻辑回归拟合情况\n", + "worksheet = writer.get_sheet_by_name(\"逻辑回归拟合结果\")\n", + "\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"逻辑回归拟合效果\", style=\"header\")\n", + "end_row, end_col = sp.dataframe2excel(summary, writer, worksheet, condition_cols=[\"Coef.\"], start_row=end_row + 1)\n", + "\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"训练数据集拟合报告\", style=\"header\")\n", + "end_row, end_col = sp.dataframe2excel(logistic.report(train_woe_stepwise), writer, worksheet, percent_cols=[\"precision\", \"recall\", \"f1-score\"], start_row=end_row + 1)\n", + "\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"测试数据集拟合报告\", style=\"header\")\n", + "end_row, end_col = sp.dataframe2excel(logistic.report(test_woe_stepwise), writer, worksheet, percent_cols=[\"precision\", \"recall\", \"f1-score\"], start_row=end_row + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "# 
////////////////////////////////////// 特征概述 ///////////////////////////////////// #\n", + "worksheet = writer.get_sheet_by_name(\"模型变量信息\")\n", + "\n", + "start_row, start_col = 2, 2\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"入模变量信息\", style=\"header\")\n", + "end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={\"index\": \"序号\"}), (end_row + 1, start_col))\n", + "\n", + "# 变量分布情况\n", + "import toad\n", + "data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={\"index\": \"变量名称\", \"type\": \"变量类型\", \"size\": \"样本个数\", \"missing\": \"缺失值\", \"unique\": \"唯一值个数\"})\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"变量分布情况\", style=\"header\")\n", + "end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col))\n", + "\n", + "# 变量相关性\n", + "data_corr = train_woe_stepwise.corr()\n", + "logistic.corr(train_woe_stepwise, save=\"model_report/train_corr.png\", annot=False)\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"变量相关性\", style=\"header\")\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_corr.png\", (end_row + 1, start_col), figsize=(700, 500))\n", + "end_row, end_col = sp.dataframe2excel(data_corr.reset_index().rename(columns={\"index\": \"\"}), writer, worksheet, color_cols=list(data_corr.columns), start_row=end_row + 1)\n", + "\n", + "# 变量分箱信息\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"变量分箱信息\", style=\"header\")\n", + "\n", + "for col in logistic.feature_names_in_:\n", + " feature_table = sp.feature_bin_stats(data, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n", + " _ = sp.bin_plot(feature_table, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", figsize=(8, 4), 
save=f\"model_report/bin_plots/data_{col}.png\")\n", + " \n", + " end_row, end_col = writer.insert_pic2sheet(worksheet, f\"model_report/bin_plots/data_{col}.png\", (end_row + 1, start_col), figsize=(700, 400))\n", + " end_row, end_col = sp.dataframe2excel(feature_table, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\", \"LIFT值\", \"累积LIFT值\"], condition_cols=[\"坏样本率\", \"LIFT值\"], start_row=end_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "# ////////////////////////////////////// 评分卡说明 ///////////////////////////////////// #\n", + "worksheet = writer.get_sheet_by_name(\"评分卡结果\")\n", + "\n", + "# 评分卡刻度\n", + "scorecard_kedu = card.scorecard_scale()\n", + "scorecard_points = card.scorecard_points(feature_map=feature_map)\n", + "scorecard_clip = card.score_clip(train[\"score\"], clip=100)\n", + "\n", + "start_row, start_col = 2, 2\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"评分卡刻度\", style=\"header\")\n", + "end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col))\n", + "\n", + "# 评分卡对应分数\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"评分卡分数\", style=\"header\")\n", + "end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_points, (end_row + 1, start_col), merge_column=\"变量名称\")\n", + "\n", + "# 评分效果\n", + "score_table_train = sp.feature_bin_stats(train, \"score\", desc=\"测试集模型评分\", target=target, rules=scorecard_clip)\n", + "score_table_test = sp.feature_bin_stats(test, \"score\", desc=\"测试集模型评分\", target=target, rules=scorecard_clip)\n", + "\n", + "sp.ks_plot(train[\"score\"], train[target], title=\"Train \\tDataset\", save=\"model_report/train_ksplot.png\")\n", + "sp.ks_plot(test[\"score\"], test[target], title=\"Test \\tDataset\", save=\"model_report/test_ksplot.png\")\n", + "\n", + "sp.hist_plot(train[\"score\"], 
train[target], save=\"model_report/train_scorehist.png\", bins=30)\n", + "sp.hist_plot(test[\"score\"], test[target], save=\"model_report/test_scorehist.png\", bins=30)\n", + "\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"训练数据集评分模型效果\", style=\"header\")\n", + "ks_row = end_row\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_ksplot.png\", (ks_row, start_col))\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_scorehist.png\", (ks_row, end_col))\n", + "end_row, end_col = sp.dataframe2excel(score_table_train, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\", \"LIFT值\", \"累积LIFT值\", \"分档KS值\"], condition_cols=[\"坏样本率\", \"LIFT值\", \"分档KS值\"], start_row=end_row + 1)\n", + "\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"测试数据集评分模型效果\", style=\"header\")\n", + "ks_row = end_row\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/test_ksplot.png\", (ks_row, start_col))\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/test_scorehist.png\", (ks_row, end_col))\n", + "end_row, end_col = sp.dataframe2excel(score_table_test, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\", \"LIFT值\", \"累积LIFT值\", \"分档KS值\"], condition_cols=[\"坏样本率\", \"LIFT值\", \"分档KS值\"], start_row=end_row + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "# ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// #\n", + "worksheet = writer.get_sheet_by_name(\"模型稳定性\")\n", + "start_row, start_col = 2, 2\n", + "\n", + "# 评分分布稳定性\n", + "train_test_score_psi = sp.psi_plot(score_table_train, score_table_test, labels=[\"训练数据集\", \"测试数据集\"], save=\"model_report/train_test_psiplot.png\", result=True)\n", + "\n", + "end_row, end_col = 
writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"模型评分稳定性指标 (Population Stability Index, PSI): 训练数据集 vs 测试数据集\", style=\"header\")\n", + "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_test_psiplot.png\", (end_row, start_col), figsize=(800, 400))\n", + "end_row, end_col = sp.dataframe2excel(train_test_score_psi, writer, worksheet, percent_cols=[\"训练数据集样本占比\", \"训练数据集坏样本率\", \"测试数据集样本占比\", \"测试数据集坏样本率\"], condition_cols=[\"分档PSI值\"], start_row=end_row + 1)\n", + "\n", + "# 变量 PSI 表\n", + "for col in card._feature_names:\n", + " feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n", + " feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n", + " psi_table = sp.psi_plot(feature_table_train, feature_table_test, desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=[\"训练数据集\", \"测试数据集\"], save=f\"model_report/psi_{col}.png\")\n", + " \n", + " end_row, end_col = writer.insert_pic2sheet(worksheet, f\"model_report/psi_{col}.png\", (end_row, start_col), figsize=(700, 400))\n", + " end_row, end_col = sp.dataframe2excel(psi_table, writer, worksheet, percent_cols=[\"训练数据集样本占比\", \"训练数据集坏样本率\", \"测试数据集样本占比\", \"测试数据集坏样本率\", \"测试数据集% - 训练数据集%\"], condition_cols=[\"分档PSI值\"], start_row=end_row + 1)\n", + "\n", + "# 变量 CSI 表\n", + "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"入模变量稳定性指标 (Characteristic Stability Index, CSI): 训练数据集 vs 测试数据集\", style=\"header\")\n", + "\n", + "for col in card._feature_names:\n", + " feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n", + " feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n", + " 
train_test_csi_table = sp.csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=[\"训练数据集\", \"测试数据集\"], save=f\"model_report/csi_{col}.png\")\n", + " \n", + " end_row, end_col = writer.insert_pic2sheet(worksheet, f\"model_report/csi_{col}.png\", (end_row, start_col), figsize=(700, 400))\n", + " end_row, end_col = sp.dataframe2excel(train_test_csi_table, writer, worksheet, percent_cols=[\"训练数据集样本占比\", \"训练数据集坏样本率\", \"测试数据集样本占比\", \"测试数据集坏样本率\", \"测试数据集% - 训练数据集%\"], condition_cols=[\"分档CSI值\"], start_row=end_row + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "writer.save(\"model_report/评分卡模型报告.xlsx\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "scorecard", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scorecardpipeline/model.py b/scorecardpipeline/model.py index 26bb851..6f7e3b7 100644 --- a/scorecardpipeline/model.py +++ b/scorecardpipeline/model.py @@ -439,6 +439,51 @@ def scorecard_scale(self): ) return scorecard_kedu + @classmethod + def format_bins(self, bins, index=False, ellipsis=None, decimal=4): + """分箱转换为标签 + + :param bins: 分箱 + :param index: 是否需要索引 + :param ellipsis: 字符显示最大长度 + + :return: ndarray: 分箱标签 + """ + if len(bins) == 0: + return ["全部样本"] + + if isinstance(bins, list): bins = np.array(bins) + EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1 + + l = [] + if not isinstance(bins[0], (set, list, np.ndarray)): 
+ has_empty = len(bins) > 0 and pd.isnull(bins[-1]) + if has_empty: bins = bins[:-1] + sp_l = ["负无穷"] + [round_float(b, decimal=decimal) for b in bins] + ["正无穷"] + for i in range(len(sp_l) - 1): l.append('[' + str(sp_l[i]) + ' , ' + str(sp_l[i + 1]) + ')') + if has_empty: l.append('缺失值') + else: + for keys in bins: + keys_update = set() + for key in keys: + if pd.isnull(key) or key == "nan": + keys_update.add("缺失值") + elif key.strip() == "": + keys_update.add("空字符串") + else: + keys_update.add(key) + label = ','.join(keys_update) + + if ellipsis is not None: + label = label[:ellipsis] + '..' if len(label) > ellipsis else label + + l.append(label) + + if index: + l = ["{:02}.{}".format(i if b != '缺失值' else EMPTYBINS, b) for i, b in enumerate(l)] + + return np.array(l) + def scorecard_points(self, feature_map={}): """输出评分卡分箱信息及其对应的分数 @@ -476,10 +521,10 @@ def scorecard2pmml(self, pmml: str = 'scorecard.pmml', debug: bool = False): mapping = {} for bins, score in zip(rule['bins'], rule['scores'].tolist()): for _bin in bins: - if _bin == 'nan': + if pd.isnull(_bin) or _bin == 'nan': default_value = float(score) - - mapping[_bin] = float(score) + else: + mapping[_bin] = float(score) mapper.append(( [var], @@ -532,7 +577,12 @@ def scorecard2pmml(self, pmml: str = 'scorecard.pmml', debug: bool = False): pipeline.named_steps['scorecard'].coef_ = np.ones(len(scorecard_mapper.features)) - sklearn2pmml(pipeline, pmml, with_repr=True, debug=debug) + try: + sklearn2pmml(pipeline, pmml, with_repr=True, debug=debug) + except: + import traceback + print(traceback.format_exc()) + return pipeline if debug: return pipeline diff --git a/scorecardpipeline/processing.py b/scorecardpipeline/processing.py index 6029988..4c29255 100644 --- a/scorecardpipeline/processing.py +++ b/scorecardpipeline/processing.py @@ -392,12 +392,12 @@ def catboost_selector(self, x, y, cat_features=None): class Combiner(TransformerMixin, BaseEstimator): - def __init__(self, target="target", method='chi', 
empty_separate=False, min_n_bins=2, max_n_bins=None, max_n_prebins=20, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", adj_rules={}, n_jobs=1): + def __init__(self, target="target", method='chi', empty_separate=True, min_n_bins=2, max_n_bins=None, max_n_prebins=20, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", adj_rules={}, n_jobs=1): """特征分箱封装方法 :param target: 数据集中标签名称,默认 target :param method: 特征分箱方法,可选 "chi", "dt", "quantile", "step", "kmeans", "cart", "mdlp", "uniform", 参考 toad.Combiner: https://github.com/amphibian-dev/toad/blob/master/toad/transform.py#L178-L355 & optbinning.OptimalBinning: https://gnpalencia.org/optbinning/ - :param empty_separate: 是否空值单独一箱, 默认 False,推荐设置为 True + :param empty_separate: 是否空值单独一箱, 默认 True :param min_n_bins: 最小分箱数,默认 2,即最小拆分2箱 :param max_n_bins: 最大分箱数,默认 None,即不限制拆分箱数,推荐设置 3 ~ 5,不宜过多,偶尔使用 optbinning 时不起效 :param max_n_prebins: 使用 optbinning 时预分箱数量 @@ -431,6 +431,10 @@ def update(self, rules): """ self.combiner.update(rules) + # 检查规则内容 + for feature in rules.keys(): + self.check_rules(feature=feature) + def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc"): """基于 optbinning.OptimalBinning 的特征分箱方法,使用 optbinning.OptimalBinning 分箱失败时,使用 toad.transform.Combiner 的卡方分箱处理 @@ -508,14 +512,30 @@ def fit(self, x: pd.DataFrame, y=None): self.update(self.adj_rules) + # 检查类别变量空值是否被转为字符串,如果转为了字符串,强制转回空值,同时检查分箱顺序并调整为正确顺序 + self.check_rules() + return self - - def _check_rules(self): - """检查类别变量空值是否被转为字符串,如果转为了字符串,强制转回空值""" + + def check_rules(self, feature=None): + """检查类别变量空值是否被转为字符串,如果转为了字符串,强制转回空值,同时检查分箱顺序并调整为正确顺序""" for col in self.combiner.rules.keys(): - if not np.issubdtype(self.combiner[col].dtype, np.number): - if sum([sum([1 for b in r if b in ("nan", "None")]) for 
r in self.combiner[col]]) > 0: - self.combiner.update({col: [[np.nan if b in ("nan", "None") else b for b in r] for r in self.combiner[col]]}) + if feature is not None and col != feature: + continue + + _rule = self.combiner[col] + + if not np.issubdtype(_rule.dtype, np.number): + if sum([sum([1 for b in r if b in ("nan", "None")]) for r in _rule]) > 0: + _rule = [[np.nan if b == "nan" else (None if b == "None" else b) for b in r] for r in _rule] + if [np.nan] in _rule: + _rule.remove([np.nan]) + _rule.append([np.nan]) + if [None] in _rule: + _rule.remove([None]) + _rule.append([None]) + + self.combiner.update({col: _rule}) def transform(self, x, y=None, labels=False): """特征分箱转换方法 @@ -589,16 +609,15 @@ def feature_bin_stats(cls, data, feature, target="target", rules=None, method='s else: _combiner = deepcopy(combiner) - if rules and len(rules) > 0: + if rules is not None and len(rules) > 0: if isinstance(rules, (list, np.ndarray)): _combiner.update({feature: rules}) else: _combiner.update(rules) - feature_bin_dict = feature_bins(np.array(_combiner[feature])) + feature_bin_dict = feature_bins(_combiner[feature]) df_bin = _combiner.transform(data[[feature, target]], labels=False) - table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack() table.columns.name = None table = table.rename(columns={0: '好样本数', 1: '坏样本数'}).fillna(0) diff --git a/scorecardpipeline/utils.py b/scorecardpipeline/utils.py index 4a49d49..628a671 100644 --- a/scorecardpipeline/utils.py +++ b/scorecardpipeline/utils.py @@ -572,11 +572,11 @@ def psi_plot(expected, actual, labels=["预期", "实际"], desc="", save=None, ax1.tick_params(axis='x', labelrotation=90) ax2 = ax1.twinx() - ax2.plot(df_psi["分箱"], df_psi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3))) - ax2.plot(df_psi["分箱"], df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3))) + ax2.plot(x, df_psi[f"{labels[0]}坏样本率"], color=colors[0], 
label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3))) + ax2.plot(x, df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3))) - ax2.scatter(df_psi["分箱"], df_psi[f"{labels[0]}坏样本率"], marker=".") - ax2.scatter(df_psi["分箱"], df_psi[f"{labels[1]}坏样本率"], marker=".") + ax2.scatter(x, df_psi[f"{labels[0]}坏样本率"], marker=".") + ax2.scatter(x, df_psi[f"{labels[1]}坏样本率"], marker=".") ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数') @@ -629,7 +629,7 @@ def csi_plot(expected, actual, score_bins, labels=["预期", "实际"], desc="", df_csi["指标名称"] = desc if plot: - x = df_csi['分箱'].apply(lambda l: l if max_len is None or len(str(l)) < max_len else f"{str(l)[:max_len]}...") + x = df_csi['分箱'].apply(lambda l: str(l) if pd.isnull(l) or len(str(l)) < max_len else f"{str(l)[:max_len]}...") x_indexes = np.arange(len(x)) fig, ax1 = plt.subplots(figsize=figsize) @@ -642,11 +642,11 @@ def csi_plot(expected, actual, score_bins, labels=["预期", "实际"], desc="", ax1.tick_params(axis='x', labelrotation=90) ax2 = ax1.twinx() - ax2.plot(df_csi["分箱"], df_csi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3))) - ax2.plot(df_csi["分箱"], df_csi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3))) + ax2.plot(x, df_csi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3))) + ax2.plot(x, df_csi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3))) - ax2.scatter(df_csi["分箱"], df_csi[f"{labels[0]}坏样本率"], marker=".") - ax2.scatter(df_csi["分箱"], df_csi[f"{labels[1]}坏样本率"], marker=".") + ax2.scatter(x, df_csi[f"{labels[0]}坏样本率"], marker=".") + ax2.scatter(x, df_csi[f"{labels[1]}坏样本率"], marker=".") ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数')