From 7248859c13040ded64e5d654ff15a5e45af991ad Mon Sep 17 00:00:00 2001
From: itlubber <1830611168@qq.com>
Date: Sat, 9 Dec 2023 03:23:13 +0800
Subject: [PATCH] add quick start
---
docs/source/quickstart.md | 257 ++++-
examples/quickstart.ipynb | 1812 +++++++++++++++++++++++++++++++
scorecardpipeline/model.py | 58 +-
scorecardpipeline/processing.py | 41 +-
scorecardpipeline/utils.py | 18 +-
5 files changed, 2156 insertions(+), 30 deletions(-)
create mode 100644 examples/quickstart.ipynb
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 899f6fa..b1ee0c6 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -562,32 +562,277 @@ bin_plot(score_table_train, desc="训练集模型评分", figsize=(10, 6), ancho
```python
# 查看某个特征的 PSI
+score_clip = card.score_clip(train["score"], clip=10)
score_table_train = feature_bin_stats(train, "score", desc="训练集模型评分", target=target, rules=score_clip)
score_table_test = feature_bin_stats(test, "score", desc="测试集模型评分", target=target, rules=score_clip)
train_test_score_psi = psi_plot(score_table_train, score_table_test, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png", result=True)
# 查看某个入模特征的 CSI
for col in card._feature_names:
- rule = combiner[col]
- feature_table_train = feature_bin_stats(train, col, target=target, desc="训练集分布", combiner=rule)
- feature_table_test = feature_bin_stats(test, col, target=target, desc="测试集分布", combiner=rule)
+ feature_table_train = feature_bin_stats(train, col, target=target, desc="训练集分布", combiner=combiner)
+ feature_table_test = feature_bin_stats(test, col, target=target, desc="测试集分布", combiner=combiner)
train_test_csi_table = csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=["训练数据集", "测试数据集"], save=f"model_report/csi_{col}.png")
```
-
+
-
+
+### 模型持久化存储
+
+`scorecardpipeline` 提供了几种可选的模型持久化存储方式,可以将训练好的评分卡模型保存为 `pickle` 或 `pmml` 格式的模型文件,供后续生产部署或离线回溯使用,非常方便快捷。
+
+```python
+# 将评分卡模型保存 pmml 文件
+scorecard_pipeline = card.scorecard2pmml(pmml="model_report/scorecard.pmml", debug=True)
+# 将评分卡模型保存 pickle 文件
+save_pickle(card, "model_report/scorecard.pkl")
+```
+
+
+### 评分卡 `pipeline` 建模
+
+在 `scorecardpipeline` 中,几乎所有的模型和数据预处理步骤都支持 `pipeline` 式构建模型,同时还可以与 `sklearn` 中其他的 `pipeline` 组件一起构建模型。
+
+```python
+# 构建 pipeline
+model_pipeline = Pipeline([
+ ("preprocessing", FeatureSelection(target=target, engine="scorecardpy")),
+ ("combiner", Combiner(target=target, min_bin_size=0.2)),
+ ("transform", WOETransformer(target=target)),
+ ("processing_select", FeatureSelection(target=target, engine="toad")),
+ ("stepwise", StepwiseSelection(target=target)),
+ ("logistic", ITLubberLogisticRegression(target=target)),
+])
+# 训练 pipeline
+model_pipeline.fit(train)
+# 转换评分卡
+card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)
+card.fit(model_pipeline[:-1].transform(train))
+card.scorecard_points()
+```
+
+| 序号 | 变量名称 | 变量分箱 | 对应分数 |
+|:--:|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------|-----------:|
+| 0 | purpose | radio/television,car (used) | 12.8514 |
+| 1 | purpose | business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs | 2.7818 |
+| 2 | installment_rate_in_percentage_of_disposable_income | [负无穷 , 4.0) | 7.7878 |
+| 3 | installment_rate_in_percentage_of_disposable_income | [4.0 , 正无穷) | 0.9877 |
+| 4 | installment_rate_in_percentage_of_disposable_income | 缺失值 | 10.7462 |
+| 5 | savings_account_and_bonds | 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM | 5.9834 |
+| 6 | savings_account_and_bonds | ... < 100 DM,100 <= ... < 500 DM | 12.1585 |
+| 7 | savings_account_and_bonds | 缺失值 | 3.2466 |
+| 8 | present_employment_since | 4 <= ... < 7 years,... >= 7 years | 9.4896 |
+| 9 | present_employment_since | 缺失值,unemployed,1 <= ... < 4 years,... < 1 year | 3.8957 |
+| 10 | age_in_years | [负无穷 , 35.0) | 0.8917 |
+| 11 | age_in_years | [35.0 , 正无穷) | 10.8457 |
+| 12 | age_in_years | 缺失值 | 6.7213 |
+| 13 | property | real estate | 11.2541 |
+| 14 | property | 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds | 4.0428 |
+| 15 | personal_status_and_sex | 缺失值,female : divorced/separated/married | 7.8997 |
+| 16 | personal_status_and_sex | male : single,male : married/widowed,male : divorced/separated | 3.7463 |
+| 17 | credit_amount | [负无穷 , 2145.0) | 8.0057 |
+| 18 | credit_amount | [2145.0 , 3804.0) | 14.2751 |
+| 19 | credit_amount | [3804.0 , 正无穷) | -2.6347 |
+| 20 | credit_amount | 缺失值 | 2.1778 |
+| 21 | status_of_existing_checking_account | no checking account | 22.4653 |
+| 22 | status_of_existing_checking_account | 缺失值,... >= 200 DM / salary assignments for at least 1 year | 6.6694 |
+| 23 | status_of_existing_checking_account | 0 <= ... < 200 DM | 0.0181 |
+| 24 | status_of_existing_checking_account | ... < 0 DM | -4.8558 |
+
+
+### 评分卡全流程超参数搜索
+
+```python
+# 导入超参数搜索方法
+from sklearn.model_selection import GridSearchCV
+
+# 构建 pipeline
+model_pipeline = Pipeline([
+ ("preprocessing", FeatureSelection(target=target, engine="scorecardpy")),
+ ("combiner", Combiner(target=target, min_bin_size=0.2)),
+ ("transform", WOETransformer(target=target)),
+ ("processing_select", FeatureSelection(target=target, engine="toad")),
+ ("stepwise", StepwiseSelection(target=target)),
+ ("logistic", ITLubberLogisticRegression(target=target)),
+])
+
+# 定义超参数搜索空间,参数命名: {pipeline名称}__{对应超参数名称}
+params_grid = {
+ "combiner__max_n_bins": [3],
+ "logistic__C": [np.power(2, i) for i in range(5)],
+ "logistic__penalty": ["l2"],
+ "logistic__class_weight": [None, "balanced"] + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10, 2)],
+ "logistic__max_iter": [10, 50, 100],
+ "logistic__solver": ["sag"], # ["liblinear", "sag", "lbfgs", "newton-cg"],
+}
+
+pipeline_grid_search = GridSearchCV(model_pipeline, params_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1, return_train_score=True)
+pipeline_grid_search.fit(train, train[target])
+
+print(pipeline_grid_search.best_params_)
+
+# 更新模型
+model_pipeline.set_params(**pipeline_grid_search.best_params_)
+model_pipeline.fit(train)
+
+# 转换评分卡
+card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)
+card.fit(model_pipeline[:-1].transform(train))
+```
+
### 模型报告输出
+在 `scorecardpipeline` 中,提供了操作 `excel` 文件的写入器 `ExcelWriter`,支持将文字、表格、图像等过程内容保存至 `excel` 文件中,对相关方法抽象和封装后,能够满足日常大部分数据分析过程中结果保存的需求。
-### 模型持久化存储
+`ExcelWriter` 支持调整列宽、调整单元格格式、条件格式、指定位置插入数据、插入图片等功能,同时提供了 `dataframe2excel` 方法,便于在日常工作中快速将 `dataframe` 保存至 `excel` 文件,并自动设置样式。
+
+```python
+# 初始化 Excel 写入器
+writer = sp.ExcelWriter()
+
+start_row, start_col = 2, 2
+
+# ////////////////////////////////////// 样本说明 ///////////////////////////////////// #
+worksheet = writer.get_sheet_by_name("汇总信息")
+
+# 样本总体分布情况
+end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="样本总体分布情况", style="header")
+end_row, end_col = sp.dataframe2excel(dataset_summary, writer, worksheet, percent_cols=["样本占比", "坏客户占比"], start_row=end_row + 1)
+
+# 建模样本时间分布情况
+temp = sp.distribution_plot(df, date="date", target=target, save="model_report/all_sample_time_count.png", result=True)
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="建模样本时间分布情况", style="header")
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/all_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
+end_row, end_col = sp.dataframe2excel(temp, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率"], condition_cols=["坏样本率"], start_row=end_row)
+
+# ////////////////////////////////////// 模型报告 ///////////////////////////////////// #
+summary = logistic.summary2(feature_map=feature_map)
+
+# 逻辑回归拟合情况
+worksheet = writer.get_sheet_by_name("逻辑回归拟合结果")
+
+end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="逻辑回归拟合效果", style="header")
+end_row, end_col = sp.dataframe2excel(summary, writer, worksheet, condition_cols=["Coef."], start_row=end_row + 1)
+
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集拟合报告", style="header")
+end_row, end_col = sp.dataframe2excel(logistic.report(train_woe_stepwise), writer, worksheet, percent_cols=["precision", "recall", "f1-score"], start_row=end_row + 1)
+
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集拟合报告", style="header")
+end_row, end_col = sp.dataframe2excel(logistic.report(test_woe_stepwise), writer, worksheet, percent_cols=["precision", "recall", "f1-score"], start_row=end_row + 1)
+
+# ////////////////////////////////////// 特征概述 ///////////////////////////////////// #
+worksheet = writer.get_sheet_by_name("模型变量信息")
+
+start_row, start_col = 2, 2
+end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量信息", style="header")
+end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={"index": "序号"}), (end_row + 1, start_col))
+
+# 变量分布情况
+import toad
+data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={"index": "变量名称", "type": "变量类型", "size": "样本个数", "missing": "缺失值", "unique": "唯一值个数"})
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分布情况", style="header")
+end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col))
+
+# 变量相关性
+data_corr = train_woe_stepwise.corr()
+logistic.corr(train_woe_stepwise, save="model_report/train_corr.png", annot=False)
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量相关性", style="header")
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_corr.png", (end_row + 1, start_col), figsize=(700, 500))
+end_row, end_col = sp.dataframe2excel(data_corr.reset_index().rename(columns={"index": ""}), writer, worksheet, color_cols=list(data_corr.columns), start_row=end_row + 1)
+
+# 变量分箱信息
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分箱信息", style="header")
+
+for col in logistic.feature_names_in_:
+ feature_table = sp.feature_bin_stats(data, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner)
+ _ = sp.bin_plot(feature_table, desc=feature_map.get(col, "") or "逻辑回归入模变量", figsize=(8, 4), save=f"model_report/bin_plots/data_{col}.png")
+
+ end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/bin_plots/data_{col}.png", (end_row + 1, start_col), figsize=(700, 400))
+ end_row, end_col = sp.dataframe2excel(feature_table, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值"], condition_cols=["坏样本率", "LIFT值"], start_row=end_row)
+
+# ////////////////////////////////////// 评分卡说明 ///////////////////////////////////// #
+worksheet = writer.get_sheet_by_name("评分卡结果")
+
+# 评分卡刻度
+scorecard_kedu = card.scorecard_scale()
+scorecard_points = card.scorecard_points(feature_map=feature_map)
+scorecard_clip = card.score_clip(train["score"], clip=100)
+
+start_row, start_col = 2, 2
+end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="评分卡刻度", style="header")
+end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col))
+
+# 评分卡对应分数
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡分数", style="header")
+end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_points, (end_row + 1, start_col), merge_column="变量名称")
+
+# 评分效果
+score_table_train = sp.feature_bin_stats(train, "score", desc="训练集模型评分", target=target, rules=scorecard_clip)
+score_table_test = sp.feature_bin_stats(test, "score", desc="测试集模型评分", target=target, rules=scorecard_clip)
+
+sp.ks_plot(train["score"], train[target], title="Train \tDataset", save="model_report/train_ksplot.png")
+sp.ks_plot(test["score"], test[target], title="Test \tDataset", save="model_report/test_ksplot.png")
+
+sp.hist_plot(train["score"], train[target], save="model_report/train_scorehist.png", bins=30)
+sp.hist_plot(test["score"], test[target], save="model_report/test_scorehist.png", bins=30)
+
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集评分模型效果", style="header")
+ks_row = end_row
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_ksplot.png", (ks_row, start_col))
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_scorehist.png", (ks_row, end_col))
+end_row, end_col = sp.dataframe2excel(score_table_train, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"], condition_cols=["坏样本率", "LIFT值", "分档KS值"], start_row=end_row + 1)
+
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集评分模型效果", style="header")
+ks_row = end_row
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_ksplot.png", (ks_row, start_col))
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_scorehist.png", (ks_row, end_col))
+end_row, end_col = sp.dataframe2excel(score_table_test, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"], condition_cols=["坏样本率", "LIFT值", "分档KS值"], start_row=end_row + 1)
+
+# ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// #
+worksheet = writer.get_sheet_by_name("模型稳定性")
+start_row, start_col = 2, 2
+
+# 评分分布稳定性
+train_test_score_psi = sp.psi_plot(score_table_train, score_table_test, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png", result=True)
+
+end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="模型评分稳定性指标 (Population Stability Index, PSI): 训练数据集 vs 测试数据集", style="header")
+end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_test_psiplot.png", (end_row, start_col), figsize=(800, 400))
+end_row, end_col = sp.dataframe2excel(train_test_score_psi, writer, worksheet, percent_cols=["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"], condition_cols=["分档PSI值"], start_row=end_row + 1)
+
+# 变量 PSI 表
+for col in card._feature_names:
+ feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner)
+ feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner)
+ psi_table = sp.psi_plot(feature_table_train, feature_table_test, desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=["训练数据集", "测试数据集"], save=f"model_report/psi_{col}.png")
+
+ end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/psi_{col}.png", (end_row, start_col), figsize=(700, 400))
+ end_row, end_col = sp.dataframe2excel(psi_table, writer, worksheet, percent_cols=["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率", "测试数据集% - 训练数据集%"], condition_cols=["分档PSI值"], start_row=end_row + 1)
+
+# 变量 CSI 表
+end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="入模变量稳定性指标 (Characteristic Stability Index, CSI): 训练数据集 vs 测试数据集", style="header")
+
+for col in card._feature_names:
+ feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner)
+ feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, "") or "逻辑回归入模变量", combiner=combiner)
+ train_test_csi_table = sp.csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=["训练数据集", "测试数据集"], save=f"model_report/csi_{col}.png")
+
+ end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/csi_{col}.png", (end_row, start_col), figsize=(700, 400))
+ end_row, end_col = sp.dataframe2excel(train_test_csi_table, writer, worksheet, percent_cols=["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率", "测试数据集% - 训练数据集%"], condition_cols=["分档CSI值"], start_row=end_row + 1)
+
+# 保存结果文件
+writer.save("model_report/评分卡模型报告.xlsx")
+```
+
+
+
+
## 交流
diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb
new file mode 100644
index 0000000..354c616
--- /dev/null
+++ b/examples/quickstart.ipynb
@@ -0,0 +1,1812 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append(\"../\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "import scorecardpipeline as sp\n",
+ "from scorecardpipeline import *"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "logger = sp.init_setting(seed=6666, logger=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " status_of_existing_checking_account | \n",
+ " duration_in_month | \n",
+ " credit_history | \n",
+ " purpose | \n",
+ " credit_amount | \n",
+ " savings_account_and_bonds | \n",
+ " present_employment_since | \n",
+ " installment_rate_in_percentage_of_disposable_income | \n",
+ " personal_status_and_sex | \n",
+ " other_debtors_or_guarantors | \n",
+ " ... | \n",
+ " property | \n",
+ " age_in_years | \n",
+ " other_installment_plans | \n",
+ " housing | \n",
+ " number_of_existing_credits_at_this_bank | \n",
+ " job | \n",
+ " number_of_people_being_liable_to_provide_maintenance_for | \n",
+ " telephone | \n",
+ " foreign_worker | \n",
+ " creditability | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ... < 0 DM | \n",
+ " 6.0000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1169.0000 | \n",
+ " unknown/ no savings account | \n",
+ " NaN | \n",
+ " 4.0000 | \n",
+ " male : divorced/separated | \n",
+ " none | \n",
+ " ... | \n",
+ " real estate | \n",
+ " 67.0000 | \n",
+ " none | \n",
+ " own | \n",
+ " NaN | \n",
+ " skilled employee / official | \n",
+ " 1.0000 | \n",
+ " yes, registered under the customers name | \n",
+ " yes | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 48.0000 | \n",
+ " existing credits paid back duly till now | \n",
+ " NaN | \n",
+ " 5951.0000 | \n",
+ " ... < 100 DM | \n",
+ " 1 <= ... < 4 years | \n",
+ " 2.0000 | \n",
+ " male : divorced/separated | \n",
+ " none | \n",
+ " ... | \n",
+ " real estate | \n",
+ " NaN | \n",
+ " none | \n",
+ " own | \n",
+ " 1.0000 | \n",
+ " skilled employee / official | \n",
+ " 1.0000 | \n",
+ " none | \n",
+ " yes | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " no checking account | \n",
+ " 12.0000 | \n",
+ " critical account/ other credits existing (not at this bank) | \n",
+ " education | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 4 <= ... < 7 years | \n",
+ " 2.0000 | \n",
+ " male : divorced/separated | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " 49.0000 | \n",
+ " NaN | \n",
+ " own | \n",
+ " 1.0000 | \n",
+ " unskilled - resident | \n",
+ " 2.0000 | \n",
+ " NaN | \n",
+ " yes | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ... < 0 DM | \n",
+ " NaN | \n",
+ " existing credits paid back duly till now | \n",
+ " furniture/equipment | \n",
+ " 7882.0000 | \n",
+ " ... < 100 DM | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " male : divorced/separated | \n",
+ " guarantor | \n",
+ " ... | \n",
+ " building society savings agreement/ life insurance | \n",
+ " 45.0000 | \n",
+ " NaN | \n",
+ " for free | \n",
+ " 1.0000 | \n",
+ " skilled employee / official | \n",
+ " NaN | \n",
+ " none | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ... < 0 DM | \n",
+ " 24.0000 | \n",
+ " NaN | \n",
+ " car (new) | \n",
+ " 4870.0000 | \n",
+ " NaN | \n",
+ " 1 <= ... < 4 years | \n",
+ " 3.0000 | \n",
+ " male : divorced/separated | \n",
+ " none | \n",
+ " ... | \n",
+ " unknown / no property | \n",
+ " 53.0000 | \n",
+ " none | \n",
+ " for free | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2.0000 | \n",
+ " none | \n",
+ " yes | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " status_of_existing_checking_account duration_in_month credit_history purpose credit_amount savings_account_and_bonds present_employment_since installment_rate_in_percentage_of_disposable_income personal_status_and_sex other_debtors_or_guarantors ... property age_in_years other_installment_plans housing number_of_existing_credits_at_this_bank job number_of_people_being_liable_to_provide_maintenance_for telephone foreign_worker creditability\n",
+ "0 ... < 0 DM 6.0000 NaN NaN 1169.0000 unknown/ no savings account NaN 4.0000 male : divorced/separated none ... real estate 67.0000 none own NaN skilled employee / official 1.0000 yes, registered under the customers name yes 0\n",
+ "1 NaN 48.0000 existing credits paid back duly till now NaN 5951.0000 ... < 100 DM 1 <= ... < 4 years 2.0000 male : divorced/separated none ... real estate NaN none own 1.0000 skilled employee / official 1.0000 none yes 1\n",
+ "2 no checking account 12.0000 critical account/ other credits existing (not at this bank) education NaN NaN 4 <= ... < 7 years 2.0000 male : divorced/separated NaN ... NaN 49.0000 NaN own 1.0000 unskilled - resident 2.0000 NaN yes 0\n",
+ "3 ... < 0 DM NaN existing credits paid back duly till now furniture/equipment 7882.0000 ... < 100 DM NaN NaN male : divorced/separated guarantor ... building society savings agreement/ life insurance 45.0000 NaN for free 1.0000 skilled employee / official NaN none NaN 0\n",
+ "4 ... < 0 DM 24.0000 NaN car (new) 4870.0000 NaN 1 <= ... < 4 years 3.0000 male : divorced/separated none ... unknown / no property 53.0000 none for free NaN NaN 2.0000 none yes 1\n",
+ "\n",
+ "[5 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "target = \"creditability\"\n",
+ "data = germancredit()\n",
+ "data[target] = data[target].map({\"good\": 0, \"bad\": 1})\n",
+ "\n",
+ "# 随机替换 20% 的数据为 np.nan\n",
+ "for col in data.columns.drop(target):\n",
+ " for i in range(len(data)):\n",
+ " if np.random.rand() > 0.8:\n",
+ " data[col].loc[i] = np.nan\n",
+ "\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ 2023-12-09 03:04:52,641 ][ INFO ][ 3133036894.py::3 ] 训练集数据: (700, 21), 测试集数据: (300, 21)\n",
+ "[ 2023-12-09 03:04:52,641 ][ INFO ][ 3133036894.py::3 ] 训练集数据: (700, 21), 测试集数据: (300, 21)\n",
+ "[ 2023-12-09 03:04:52,641 ][ INFO ][ 3133036894.py::3 ] 训练集数据: (700, 21), 测试集数据: (300, 21)\n"
+ ]
+ }
+ ],
+ "source": [
+ "train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])\n",
+ "\n",
+ "logger.info(f\"训练集数据: {train.shape}, 测试集数据: {test.shape}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 数据集 | \n",
+ " 开始时间 | \n",
+ " 结束时间 | \n",
+ " 样本总数 | \n",
+ " 样本占比 | \n",
+ " 坏客户数 | \n",
+ " 坏客户占比 | \n",
+ " 备注 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 建模样本 | \n",
+ " 2022-01-01 | \n",
+ " 2023-01-31 | \n",
+ " 1000 | \n",
+ " 1.0000 | \n",
+ " 300 | \n",
+ " 0.3000 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 训练集 | \n",
+ " 2022-01-01 | \n",
+ " 2023-12-31 | \n",
+ " 700 | \n",
+ " 0.7000 | \n",
+ " 210 | \n",
+ " 0.3000 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 测试集 | \n",
+ " 2022-01-01 | \n",
+ " 2023-12-31 | \n",
+ " 300 | \n",
+ " 0.3000 | \n",
+ " 90 | \n",
+ " 0.3000 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 数据集 开始时间 结束时间 样本总数 样本占比 坏客户数 坏客户占比 备注\n",
+ "0 建模样本 2022-01-01 2023-01-31 1000 1.0000 300 0.3000 \n",
+ "1 训练集 2022-01-01 2023-12-31 700 0.7000 210 0.3000 \n",
+ "2 测试集 2022-01-01 2023-12-31 300 0.3000 90 0.3000 "
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 模拟实际场景中的数据, date 为数据集中的日期,为 datetime 类型,实际生产过程中可能是 申请时间|放款时间|入催时间|流失时间 等\n",
+ "\n",
+ "df = pd.DataFrame()\n",
+ "df[\"date\"] = pd.date_range(start=\"2021-01-01\", end=\"2021-06-30\", freq=\"5H\")\n",
+ "df[target] = np.random.randint(0, 2, len(df))\n",
+ "\n",
+ "total_count = len(data)\n",
+ "dataset_summary = pd.DataFrame(\n",
+ " [\n",
+ " [\"建模样本\", \"2022-01-01\", \"2023-01-31\", len(data), len(data) / total_count, data[target].sum(), data[target].sum() / len(data), \"\"],\n",
+ " [\"训练集\", \"2022-01-01\", \"2023-12-31\", len(train), len(train) / total_count, train[target].sum(), train[target].sum() / len(train), \"\"],\n",
+ " [\"测试集\", \"2022-01-01\", \"2023-12-31\", len(test), len(test) / total_count, test[target].sum(), test[target].sum() / len(test), \"\"],\n",
+ " ],\n",
+ " columns=[\"数据集\", \"开始时间\", \"结束时间\", \"样本总数\", \"样本占比\", \"坏客户数\", \"坏客户占比\", \"备注\"],\n",
+ ")\n",
+ "\n",
+ "dataset_summary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "distribution_plot(df, date=\"date\", target=target)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "select = FeatureSelection(target=target, engine=\"toad\", identical=0.95, empty=0.95, iv=0.02, corr=0.6)\n",
+ "select.fit(train)\n",
+ "\n",
+ "train_select = select.transform(train)\n",
+ "test_select = select.transform(test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "combiner = Combiner(target=target, min_bin_size=0.2, empty_separate=True)\n",
+ "\n",
+ "combiner.fit(train_select)\n",
+ "\n",
+ "train_bins = combiner.transform(train_select)\n",
+ "test_bins = combiner.transform(test_select)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 指标名称 | \n",
+ " 指标含义 | \n",
+ " 分箱 | \n",
+ " 样本总数 | \n",
+ " 样本占比 | \n",
+ " 好样本数 | \n",
+ " 好样本占比 | \n",
+ " 坏样本数 | \n",
+ " 坏样本占比 | \n",
+ " 坏样本率 | \n",
+ " 分档WOE值 | \n",
+ " 分档IV值 | \n",
+ " 指标IV值 | \n",
+ " LIFT值 | \n",
+ " 累积LIFT值 | \n",
+ " 累积好样本数 | \n",
+ " 累积坏样本数 | \n",
+ " 分档KS值 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " credit_amount | \n",
+ " 信用额度 | \n",
+ " [负无穷 , 4000.0) | \n",
+ " 431 | \n",
+ " 0.6157 | \n",
+ " 325 | \n",
+ " 0.6633 | \n",
+ " 106 | \n",
+ " 0.5048 | \n",
+ " 0.2459 | \n",
+ " 0.2731 | \n",
+ " 0.0433 | \n",
+ " 0.1158 | \n",
+ " 0.8198 | \n",
+ " 0.8198 | \n",
+ " 325 | \n",
+ " 106 | \n",
+ " -0.1585 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " credit_amount | \n",
+ " 信用额度 | \n",
+ " [4000.0 , 正无穷) | \n",
+ " 139 | \n",
+ " 0.1986 | \n",
+ " 80 | \n",
+ " 0.1633 | \n",
+ " 59 | \n",
+ " 0.2810 | \n",
+ " 0.4245 | \n",
+ " -0.5428 | \n",
+ " 0.0639 | \n",
+ " 0.1158 | \n",
+ " 1.4149 | \n",
+ " 0.9649 | \n",
+ " 405 | \n",
+ " 165 | \n",
+ " -0.0408 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " credit_amount | \n",
+ " 信用额度 | \n",
+ " 缺失值 | \n",
+ " 130 | \n",
+ " 0.1857 | \n",
+ " 85 | \n",
+ " 0.1735 | \n",
+ " 45 | \n",
+ " 0.2143 | \n",
+ " 0.3462 | \n",
+ " -0.2113 | \n",
+ " 0.0086 | \n",
+ " 0.1158 | \n",
+ " 1.1538 | \n",
+ " 1.0000 | \n",
+ " 490 | \n",
+ " 210 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 指标名称 指标含义 分箱 样本总数 样本占比 好样本数 好样本占比 坏样本数 坏样本占比 坏样本率 分档WOE值 分档IV值 指标IV值 LIFT值 累积LIFT值 累积好样本数 累积坏样本数 分档KS值\n",
+ "0 credit_amount 信用额度 [负无穷 , 4000.0) 431 0.6157 325 0.6633 106 0.5048 0.2459 0.2731 0.0433 0.1158 0.8198 0.8198 325 106 -0.1585\n",
+ "1 credit_amount 信用额度 [4000.0 , 正无穷) 139 0.1986 80 0.1633 59 0.2810 0.4245 -0.5428 0.0639 0.1158 1.4149 0.9649 405 165 -0.0408\n",
+ "2 credit_amount 信用额度 缺失值 130 0.1857 85 0.1735 45 0.2143 0.3462 -0.2113 0.0086 0.1158 1.1538 1.0000 490 210 0.0000"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "combiner.bin_plot(train_select, \"credit_amount\", result=True, desc=\"信用额度\", rule=[4000.0, np.nan])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transform = WOETransformer(target=target)\n",
+ "transform.fit(train_bins)\n",
+ "\n",
+ "train_woe = transform.transform(train_bins)\n",
+ "test_woe = transform.transform(test_bins)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 初始化逐步回归特征筛选器\n",
+ "stepwise = StepwiseSelection(target=target)\n",
+ "# 训练\n",
+ "stepwise.fit(train_woe)\n",
+ "# 应用逐步回归特征筛选器\n",
+ "train_woe_stepwise = stepwise.transform(train_woe)\n",
+ "test_woe_stepwise = stepwise.transform(test_woe)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 逻辑回归模型构建\n",
+ "logistic = ITLubberLogisticRegression(target=target)\n",
+ "# 训练\n",
+ "logistic.fit(train_woe_stepwise)\n",
+ "# 预测数据集样本违约概率\n",
+ "y_pred_train = logistic.predict_proba(train_woe_stepwise.drop(columns=target))[:, 1]\n",
+ "y_pred_test = logistic.predict_proba(test_woe_stepwise.drop(columns=target))[:, 1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Features | \n",
+ " Describe | \n",
+ " Coef. | \n",
+ " Std.Err | \n",
+ " z | \n",
+ " P>|z| | \n",
+ " [ 0.025 | \n",
+ " 0.975 ] | \n",
+ " VIF | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " const | \n",
+ " 截距项 | \n",
+ " -0.8423 | \n",
+ " 0.0942 | \n",
+ " -8.9407 | \n",
+ " 0.0000 | \n",
+ " -1.0269 | \n",
+ " -0.6576 | \n",
+ " 1.0499 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " status_of_existing_checking_account | \n",
+ " 现有支票账户的状态 | \n",
+ " 0.9018 | \n",
+ " 0.1295 | \n",
+ " 6.9653 | \n",
+ " 0.0000 | \n",
+ " 0.6480 | \n",
+ " 1.1555 | \n",
+ " 1.0828 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " purpose | \n",
+ " 目的 | \n",
+ " 1.0039 | \n",
+ " 0.3082 | \n",
+ " 3.2577 | \n",
+ " 0.0011 | \n",
+ " 0.3999 | \n",
+ " 1.6079 | \n",
+ " 1.0108 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " credit_amount | \n",
+ " 信用额度 | \n",
+ " 1.1053 | \n",
+ " 0.2571 | \n",
+ " 4.2988 | \n",
+ " 0.0000 | \n",
+ " 0.6014 | \n",
+ " 1.6093 | \n",
+ " 1.0346 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " savings_account_and_bonds | \n",
+ " | \n",
+ " 0.6072 | \n",
+ " 0.2516 | \n",
+ " 2.4130 | \n",
+ " 0.0158 | \n",
+ " 0.1140 | \n",
+ " 1.1004 | \n",
+ " 1.0635 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " present_employment_since | \n",
+ " 现居住地至今 | \n",
+ " 0.6739 | \n",
+ " 0.3667 | \n",
+ " 1.8379 | \n",
+ " 0.0661 | \n",
+ " -0.0448 | \n",
+ " 1.3926 | \n",
+ " 1.0612 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " installment_rate_in_percentage_of_disposable_income | \n",
+ " 分期付款率占可支配收入的百分比 | \n",
+ " 1.2856 | \n",
+ " 0.4382 | \n",
+ " 2.9338 | \n",
+ " 0.0033 | \n",
+ " 0.4267 | \n",
+ " 2.1444 | \n",
+ " 1.0178 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " personal_status_and_sex | \n",
+ " 个人地位和性别 | \n",
+ " 0.8099 | \n",
+ " 0.5258 | \n",
+ " 1.5404 | \n",
+ " 0.1235 | \n",
+ " -0.2206 | \n",
+ " 1.8405 | \n",
+ " 1.0106 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " property | \n",
+ " | \n",
+ " 0.8269 | \n",
+ " 0.3841 | \n",
+ " 2.1527 | \n",
+ " 0.0313 | \n",
+ " 0.0740 | \n",
+ " 1.5797 | \n",
+ " 1.0279 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " age_in_years | \n",
+ " 年龄 | \n",
+ " 0.9057 | \n",
+ " 0.2749 | \n",
+ " 3.2946 | \n",
+ " 0.0010 | \n",
+ " 0.3669 | \n",
+ " 1.4445 | \n",
+ " 1.0459 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Features Describe Coef. Std.Err z P>|z| [ 0.025 0.975 ] VIF\n",
+ "0 const 截距项 -0.8423 0.0942 -8.9407 0.0000 -1.0269 -0.6576 1.0499\n",
+ "1 status_of_existing_checking_account 现有支票账户的状态 0.9018 0.1295 6.9653 0.0000 0.6480 1.1555 1.0828\n",
+ "2 purpose 目的 1.0039 0.3082 3.2577 0.0011 0.3999 1.6079 1.0108\n",
+ "3 credit_amount 信用额度 1.1053 0.2571 4.2988 0.0000 0.6014 1.6093 1.0346\n",
+ "4 savings_account_and_bonds 0.6072 0.2516 2.4130 0.0158 0.1140 1.1004 1.0635\n",
+ "5 present_employment_since 现居住地至今 0.6739 0.3667 1.8379 0.0661 -0.0448 1.3926 1.0612\n",
+ "6 installment_rate_in_percentage_of_disposable_income 分期付款率占可支配收入的百分比 1.2856 0.4382 2.9338 0.0033 0.4267 2.1444 1.0178\n",
+ "7 personal_status_and_sex 个人地位和性别 0.8099 0.5258 1.5404 0.1235 -0.2206 1.8405 1.0106\n",
+ "8 property 0.8269 0.3841 2.1527 0.0313 0.0740 1.5797 1.0279\n",
+ "9 age_in_years 年龄 0.9057 0.2749 3.2946 0.0010 0.3669 1.4445 1.0459"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 数据字典或特征描述信息\n",
+ "feature_map = {\n",
+ " \"const\": \"截距项\",\n",
+ " \"status_of_existing_checking_account\": \"现有支票账户的状态\",\n",
+ " \"credit_history\": \"信用记录\",\n",
+ " \"purpose\": \"目的\",\n",
+ " \"credit_amount\": \"信用额度\",\n",
+ " \"present_employment_since\": \"现工作任职至今\",\n",
+ " \"installment_rate_in_percentage_of_disposable_income\": \"分期付款率占可支配收入的百分比\",\n",
+ " \"personal_status_and_sex\": \"个人地位和性别\",\n",
+ " \"age_in_years\": \"年龄\",\n",
+ " \"housing\": \"住房情况\",\n",
+ "}\n",
+ "# summary 仅支持输出简单的统计信息,使用 summary2 可以输出有特征描述的统计信息表\n",
+ "logistic.summary2(feature_map=feature_map)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "logistic.plot_weights(figsize=(10, 6), save=\"model_report/sp_lr_weight.png\");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bad_rate = train[target].mean()\n",
+ "# 逻辑回归模型转评分卡\n",
+ "card = ScoreCard(target=target, combiner=combiner, transer=transform, pretrain_lr=logistic, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)\n",
+ "# 训练\n",
+ "card.fit(train_woe_stepwise)\n",
+ "\n",
+ "# 预测\n",
+ "train[\"score\"] = card.predict(train)\n",
+ "test[\"score\"] = card.predict(test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 指标名称 | \n",
+ " 指标含义 | \n",
+ " 分箱 | \n",
+ " 样本总数 | \n",
+ " 样本占比 | \n",
+ " 好样本数 | \n",
+ " 好样本占比 | \n",
+ " 坏样本数 | \n",
+ " 坏样本占比 | \n",
+ " 坏样本率 | \n",
+ " 分档WOE值 | \n",
+ " 分档IV值 | \n",
+ " 指标IV值 | \n",
+ " LIFT值 | \n",
+ " 累积LIFT值 | \n",
+ " 累积好样本数 | \n",
+ " 累积坏样本数 | \n",
+ " 分档KS值 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " score | \n",
+ " 训练集模型评分 | \n",
+ " [负无穷 , 40) | \n",
+ " 111 | \n",
+ " 0.1586 | \n",
+ " 41 | \n",
+ " 0.0837 | \n",
+ " 70 | \n",
+ " 0.3333 | \n",
+ " 0.6306 | \n",
+ " -1.3822 | \n",
+ " 0.3451 | \n",
+ " 0.8421 | \n",
+ " 2.1021 | \n",
+ " 2.1021 | \n",
+ " 41 | \n",
+ " 70 | \n",
+ " 0.2497 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " score | \n",
+ " 训练集模型评分 | \n",
+ " [40 , 60) | \n",
+ " 301 | \n",
+ " 0.4300 | \n",
+ " 195 | \n",
+ " 0.3980 | \n",
+ " 106 | \n",
+ " 0.5048 | \n",
+ " 0.3522 | \n",
+ " -0.2377 | \n",
+ " 0.0254 | \n",
+ " 0.8421 | \n",
+ " 1.1739 | \n",
+ " 1.4239 | \n",
+ " 236 | \n",
+ " 176 | \n",
+ " 0.3565 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " score | \n",
+ " 训练集模型评分 | \n",
+ " [60 , 80) | \n",
+ " 227 | \n",
+ " 0.3243 | \n",
+ " 196 | \n",
+ " 0.4000 | \n",
+ " 31 | \n",
+ " 0.1476 | \n",
+ " 0.1366 | \n",
+ " 0.9968 | \n",
+ " 0.2516 | \n",
+ " 0.8421 | \n",
+ " 0.4552 | \n",
+ " 1.0798 | \n",
+ " 432 | \n",
+ " 207 | \n",
+ " 0.1041 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " score | \n",
+ " 训练集模型评分 | \n",
+ " [80 , 正无穷) | \n",
+ " 61 | \n",
+ " 0.0871 | \n",
+ " 58 | \n",
+ " 0.1184 | \n",
+ " 3 | \n",
+ " 0.0143 | \n",
+ " 0.0492 | \n",
+ " 2.1145 | \n",
+ " 0.2201 | \n",
+ " 0.8421 | \n",
+ " 0.1639 | \n",
+ " 1.0000 | \n",
+ " 490 | \n",
+ " 210 | \n",
+ " 0.0000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 指标名称 指标含义 分箱 样本总数 样本占比 好样本数 好样本占比 坏样本数 坏样本占比 坏样本率 分档WOE值 分档IV值 指标IV值 LIFT值 累积LIFT值 累积好样本数 累积坏样本数 分档KS值\n",
+ "0 score 训练集模型评分 [负无穷 , 40) 111 0.1586 41 0.0837 70 0.3333 0.6306 -1.3822 0.3451 0.8421 2.1021 2.1021 41 70 0.2497\n",
+ "1 score 训练集模型评分 [40 , 60) 301 0.4300 195 0.3980 106 0.5048 0.3522 -0.2377 0.0254 0.8421 1.1739 1.4239 236 176 0.3565\n",
+ "2 score 训练集模型评分 [60 , 80) 227 0.3243 196 0.4000 31 0.1476 0.1366 0.9968 0.2516 0.8421 0.4552 1.0798 432 207 0.1041\n",
+ "3 score 训练集模型评分 [80 , 正无穷) 61 0.0871 58 0.1184 3 0.0143 0.0492 2.1145 0.2201 0.8421 0.1639 1.0000 490 210 0.0000"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# 训练集评分排序性\n",
+ "score_clip = card.score_clip(train[\"score\"], clip=20)\n",
+ "score_table_train = feature_bin_stats(train, \"score\", desc=\"训练集模型评分\", target=target, rules=score_clip)\n",
+ "bin_plot(score_table_train, desc=\"训练集模型评分\", figsize=(10, 6), anchor=0.935, save=\"model_report/train_score_bins.png\")\n",
+ "\n",
+ "display(score_table_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\"savings_account_and_bonds\" in combiner"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: '500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM',\n",
+ " 1: '... < 100 DM,100 <= ... < 500 DM',\n",
+ " -1: '缺失值'}"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_bins(np.array(combiner[\"savings_account_and_bonds\"]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # 查看某个特征的 PSI\n",
+ "# score_clip = card.score_clip(train[\"score\"], clip=10)\n",
+ "# score_table_train = feature_bin_stats(train, \"score\", desc=\"训练集模型评分\", target=target, rules=score_clip)\n",
+ "# score_table_test = feature_bin_stats(test, \"score\", desc=\"测试集模型评分\", target=target, rules=score_clip)\n",
+ "# train_test_score_psi = psi_plot(score_table_train, score_table_test, labels=[\"训练数据集\", \"测试数据集\"], save=\"model_report/train_test_psiplot.png\", result=True)\n",
+ "\n",
+ "# # 查看某个入模特征的 CSI\n",
+ "# for col in card._feature_names:\n",
+ "# feature_table_train = feature_bin_stats(train, col, target=target, desc=\"训练集分布\", combiner=combiner)\n",
+ "# feature_table_test = feature_bin_stats(test, col, target=target, desc=\"测试集分布\", combiner=combiner)\n",
+ "# train_test_csi_table = csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=[\"训练数据集\", \"测试数据集\"], save=f\"model_report/csi_{col}.png\")\n",
+ "# if col == \"savings_account_and_bonds\": break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# for col in card._feature_names:\n",
+ "# print(col)\n",
+ "# feature_table = feature_bin_stats(data, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n",
+ "# _ = sp.bin_plot(feature_table, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", figsize=(8, 4), anchor=0.9)\n",
+ " \n",
+ "# display(feature_table)\n",
+ "# plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'bins': array([35., nan]),\n",
+ " 'woes': array([ 0.35631058, -0.40546511, -0.08982696]),\n",
+ " 'weight': 0.9057217143033184,\n",
+ " 'scores': array([ 0.89168832, 10.84566059, 6.7212793 ])}"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "card[\"age_in_years\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_pickle(card, \"model_report/scorecard.pkl\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "python: 3.8.13\n",
+ "sklearn: 1.2.2\n",
+ "sklearn2pmml: 0.90.4\n",
+ "joblib: 1.2.0\n",
+ "sklearn_pandas: 2.2.0\n",
+ "pandas: 1.5.3\n",
+ "numpy: 1.22.2\n",
+ "openjdk: 1.8.0_362\n",
+ "Executing command:\n",
+ "java -cp /home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/sklearn2pmml-1.0-SNAPSHOT.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/gson-2.10.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/guava-21.0.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/h2o-genmodel-3.38.0.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/h2o-logger-3.38.0.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/h2o-tree-api-0.3.17.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/istack-commons-runtime-4.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jackson-annotations-2.13.3.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jakarta.activation-2.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jakarta.xml.bind-api-3.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jaxb-core-3.0.2.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jaxb-runtime-3.0.2.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/jcommander-1.72.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pickle-1.3.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-converter-1.5.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-h2o-1.2.5.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-lightgbm-1.4.4.jar:/home/itlubber/anaconda3/envs/scorecar
d/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-model-1.6.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-model-metro-1.6.4.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-python-1.1.11.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-extension-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-h2o-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-lightgbm-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-statsmodels-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-sklearn-xgboost-1.7.24.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-statsmodels-1.0.1.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/pmml-xgboost-1.7.3.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/serpent-1.40.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/slf4j-api-1.7.36.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/slf4j-jdk14-1.7.36.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/ubjson-0.1.8.jar:/home/itlubber/anaconda3/envs/scorecard/lib/python3.8/site-packages/sklearn2pmml/resources/ubjson-gson-0.1.8.jar com.sklearn2pmml.Main --pkl-pipeline-input /tmp/pipeline-wyc0ltv2.pkl.z --pmml-output model_report/scorecard.pmml\n",
+ "Standard output is empty\n",
+ "Standard error:\n",
+ "十二月 09, 2023 2:40:35 上午 sklearn2pmml.pipeline.PMMLPipeline encodePMML\n",
+ "警告: Model verification data is not set. Use method 'sklearn2pmml.pipeline.PMMLPipeline.verify(X)' to correct this deficiency\n",
+ "\n",
+ "Preserved joblib dump file(s): /tmp/pipeline-wyc0ltv2.pkl.z\n"
+ ]
+ }
+ ],
+ "source": [
+ "scorecard_pipeline = card.scorecard2pmml(pmml=\"model_report/scorecard.pmml\", debug=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pred | \n",
+ " true | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 270 | \n",
+ " 74.6467 | \n",
+ " 74.6467 | \n",
+ "
\n",
+ " \n",
+ " 455 | \n",
+ " 79.4678 | \n",
+ " 79.4678 | \n",
+ "
\n",
+ " \n",
+ " 689 | \n",
+ " 41.6417 | \n",
+ " 41.6417 | \n",
+ "
\n",
+ " \n",
+ " 711 | \n",
+ " 54.6031 | \n",
+ " 54.6031 | \n",
+ "
\n",
+ " \n",
+ " 663 | \n",
+ " 46.4824 | \n",
+ " 46.4824 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 88.8406 | \n",
+ " 88.8406 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 51.4825 | \n",
+ " 51.4825 | \n",
+ "
\n",
+ " \n",
+ " 384 | \n",
+ " 60.9649 | \n",
+ " 60.9649 | \n",
+ "
\n",
+ " \n",
+ " 572 | \n",
+ " 61.9881 | \n",
+ " 61.9881 | \n",
+ "
\n",
+ " \n",
+ " 413 | \n",
+ " 73.7079 | \n",
+ " 73.7079 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
700 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pred true\n",
+ "270 74.6467 74.6467\n",
+ "455 79.4678 79.4678\n",
+ "689 41.6417 41.6417\n",
+ "711 54.6031 54.6031\n",
+ "663 46.4824 46.4824\n",
+ ".. ... ...\n",
+ "16 88.8406 88.8406\n",
+ "15 51.4825 51.4825\n",
+ "384 60.9649 60.9649\n",
+ "572 61.9881 61.9881\n",
+ "413 73.7079 73.7079\n",
+ "\n",
+ "[700 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame({\"pred\": scorecard_pipeline.predict(train), \"true\": train[\"score\"]})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # 导入超参数搜索方法\n",
+ "# from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "# # 构建 pipeline\n",
+ "# model_pipeline = Pipeline([\n",
+ "# (\"preprocessing\", FeatureSelection(target=target, engine=\"scorecardpy\")),\n",
+ "# (\"combiner\", Combiner(target=target, min_bin_size=0.2)),\n",
+ "# (\"transform\", WOETransformer(target=target)),\n",
+ "# (\"processing_select\", FeatureSelection(target=target, engine=\"toad\")),\n",
+ "# (\"stepwise\", StepwiseSelection(target=target)),\n",
+ "# (\"logistic\", ITLubberLogisticRegression(target=target)),\n",
+ "# ])\n",
+ "\n",
+ "# params_grid = {\n",
+ "# \"combiner__max_n_bins\": [3],\n",
+ "# \"logistic__C\": [np.power(2, i) for i in range(5)],\n",
+ "# \"logistic__penalty\": [\"l2\"],\n",
+ "# \"logistic__class_weight\": [None, \"balanced\"] + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10, 2)],\n",
+ "# \"logistic__max_iter\": [10, 50, 100],\n",
+ "# \"logistic__solver\": [\"sag\"], # [\"liblinear\", \"sag\", \"lbfgs\", \"newton-cg\"],\n",
+ "# }\n",
+ "\n",
+ "# pipeline_grid_search = GridSearchCV(model_pipeline, params_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1, return_train_score=True)\n",
+ "# pipeline_grid_search.fit(train, train[target])\n",
+ "\n",
+ "# print(pipeline_grid_search.best_params_)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[INFO] filtering variables ...\n",
+ "| | 变量名称 | 变量分箱 | 对应分数 |\n",
+ "|---:|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------|-----------:|\n",
+ "| 0 | purpose | radio/television,car (used) | 12.8514 |\n",
+ "| 1 | purpose | business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs | 2.7818 |\n",
+ "| 2 | installment_rate_in_percentage_of_disposable_income | [负无穷 , 4.0) | 7.7878 |\n",
+ "| 3 | installment_rate_in_percentage_of_disposable_income | [4.0 , 正无穷) | 0.9877 |\n",
+ "| 4 | installment_rate_in_percentage_of_disposable_income | 缺失值 | 10.7462 |\n",
+ "| 5 | savings_account_and_bonds | 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM | 5.9834 |\n",
+ "| 6 | savings_account_and_bonds | ... < 100 DM,100 <= ... < 500 DM | 12.1585 |\n",
+ "| 7 | savings_account_and_bonds | 缺失值 | 3.2466 |\n",
+ "| 8 | present_employment_since | 4 <= ... < 7 years,... >= 7 years | 9.4896 |\n",
+ "| 9 | present_employment_since | 缺失值,unemployed,1 <= ... < 4 years,... < 1 year | 3.8957 |\n",
+ "| 10 | age_in_years | [负无穷 , 35.0) | 0.8917 |\n",
+ "| 11 | age_in_years | [35.0 , 正无穷) | 10.8457 |\n",
+ "| 12 | age_in_years | 缺失值 | 6.7213 |\n",
+ "| 13 | property | real estate | 11.2541 |\n",
+ "| 14 | property | 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds | 4.0428 |\n",
+ "| 15 | personal_status_and_sex | 缺失值,female : divorced/separated/married | 7.8997 |\n",
+ "| 16 | personal_status_and_sex | male : single,male : married/widowed,male : divorced/separated | 3.7463 |\n",
+ "| 17 | credit_amount | [负无穷 , 2145.0) | 8.0057 |\n",
+ "| 18 | credit_amount | [2145.0 , 3804.0) | 14.2751 |\n",
+ "| 19 | credit_amount | [3804.0 , 正无穷) | -2.6347 |\n",
+ "| 20 | credit_amount | 缺失值 | 2.1778 |\n",
+ "| 21 | status_of_existing_checking_account | no checking account | 22.4653 |\n",
+ "| 22 | status_of_existing_checking_account | 缺失值,... >= 200 DM / salary assignments for at least 1 year | 6.6694 |\n",
+ "| 23 | status_of_existing_checking_account | 0 <= ... < 200 DM | 0.0181 |\n",
+ "| 24 | status_of_existing_checking_account | ... < 0 DM | -4.8558 |\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 构建 pipeline\n",
+ "model_pipeline = Pipeline([\n",
+ " (\"preprocessing\", FeatureSelection(target=target, engine=\"scorecardpy\")),\n",
+ " (\"combiner\", Combiner(target=target, min_bin_size=0.2)),\n",
+ " (\"transform\", WOETransformer(target=target)),\n",
+ " (\"processing_select\", FeatureSelection(target=target, engine=\"toad\")),\n",
+ " (\"stepwise\", StepwiseSelection(target=target)),\n",
+ " (\"logistic\", ITLubberLogisticRegression(target=target)),\n",
+ "])\n",
+ "# 训练 pipeline\n",
+ "model_pipeline.fit(train)\n",
+ "# 转换评分卡\n",
+ "card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)\n",
+ "card.fit(model_pipeline[:-1].transform(train))\n",
+ "print(card.scorecard_points().to_markdown())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "card.scorecard_points()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 变量名称 | \n",
+ " 变量分箱 | \n",
+ " 对应分数 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " purpose | \n",
+ " radio/television,car (used) | \n",
+ " 12.8514 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " purpose | \n",
+ " business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs | \n",
+ " 2.7818 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " installment_rate_in_percentage_of_disposable_income | \n",
+ " [负无穷 , 4.0) | \n",
+ " 7.7878 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " installment_rate_in_percentage_of_disposable_income | \n",
+ " [4.0 , 正无穷) | \n",
+ " 0.9877 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " installment_rate_in_percentage_of_disposable_income | \n",
+ " 缺失值 | \n",
+ " 10.7462 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " savings_account_and_bonds | \n",
+ " 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM | \n",
+ " 5.9834 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " savings_account_and_bonds | \n",
+ " ... < 100 DM,100 <= ... < 500 DM | \n",
+ " 12.1585 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " savings_account_and_bonds | \n",
+ " 缺失值 | \n",
+ " 3.2466 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " present_employment_since | \n",
+ " 4 <= ... < 7 years,... >= 7 years | \n",
+ " 9.4896 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " present_employment_since | \n",
+ " 缺失值,unemployed,1 <= ... < 4 years,... < 1 year | \n",
+ " 3.8957 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " age_in_years | \n",
+ " [负无穷 , 35.0) | \n",
+ " 0.8917 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " age_in_years | \n",
+ " [35.0 , 正无穷) | \n",
+ " 10.8457 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " age_in_years | \n",
+ " 缺失值 | \n",
+ " 6.7213 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " property | \n",
+ " real estate | \n",
+ " 11.2541 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " property | \n",
+ " 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds | \n",
+ " 4.0428 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " personal_status_and_sex | \n",
+ " 缺失值,female : divorced/separated/married | \n",
+ " 7.8997 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " personal_status_and_sex | \n",
+ " male : single,male : married/widowed,male : divorced/separated | \n",
+ " 3.7463 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " credit_amount | \n",
+ " [负无穷 , 2145.0) | \n",
+ " 8.0057 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " credit_amount | \n",
+ " [2145.0 , 3804.0) | \n",
+ " 14.2751 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " credit_amount | \n",
+ " [3804.0 , 正无穷) | \n",
+ " -2.6347 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " credit_amount | \n",
+ " 缺失值 | \n",
+ " 2.1778 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " status_of_existing_checking_account | \n",
+ " no checking account | \n",
+ " 22.4653 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " status_of_existing_checking_account | \n",
+ " 缺失值,... >= 200 DM / salary assignments for at least 1 year | \n",
+ " 6.6694 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " status_of_existing_checking_account | \n",
+ " 0 <= ... < 200 DM | \n",
+ " 0.0181 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " status_of_existing_checking_account | \n",
+ " ... < 0 DM | \n",
+ " -4.8558 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 变量名称 变量分箱 对应分数\n",
+ "0 purpose radio/television,car (used) 12.8514\n",
+ "1 purpose business,furniture/equipment,others,education,domestic appliances,retraining,car (new),缺失值,repairs 2.7818\n",
+ "2 installment_rate_in_percentage_of_disposable_income [负无穷 , 4.0) 7.7878\n",
+ "3 installment_rate_in_percentage_of_disposable_income [4.0 , 正无穷) 0.9877\n",
+ "4 installment_rate_in_percentage_of_disposable_income 缺失值 10.7462\n",
+ "5 savings_account_and_bonds 500 <= ... < 1000 DM,unknown/ no savings account,... >= 1000 DM 5.9834\n",
+ "6 savings_account_and_bonds ... < 100 DM,100 <= ... < 500 DM 12.1585\n",
+ "7 savings_account_and_bonds 缺失值 3.2466\n",
+ "8 present_employment_since 4 <= ... < 7 years,... >= 7 years 9.4896\n",
+ "9 present_employment_since 缺失值,unemployed,1 <= ... < 4 years,... < 1 year 3.8957\n",
+ "10 age_in_years [负无穷 , 35.0) 0.8917\n",
+ "11 age_in_years [35.0 , 正无穷) 10.8457\n",
+ "12 age_in_years 缺失值 6.7213\n",
+ "13 property real estate 11.2541\n",
+ "14 property 缺失值,building society savings agreement/ life insurance,unknown / no property,car or other, not in attribute Savings account/bonds 4.0428\n",
+ "15 personal_status_and_sex 缺失值,female : divorced/separated/married 7.8997\n",
+ "16 personal_status_and_sex male : single,male : married/widowed,male : divorced/separated 3.7463\n",
+ "17 credit_amount [负无穷 , 2145.0) 8.0057\n",
+ "18 credit_amount [2145.0 , 3804.0) 14.2751\n",
+ "19 credit_amount [3804.0 , 正无穷) -2.6347\n",
+ "20 credit_amount 缺失值 2.1778\n",
+ "21 status_of_existing_checking_account no checking account 22.4653\n",
+ "22 status_of_existing_checking_account 缺失值,... >= 200 DM / salary assignments for at least 1 year 6.6694\n",
+ "23 status_of_existing_checking_account 0 <= ... < 200 DM 0.0181\n",
+ "24 status_of_existing_checking_account ... < 0 DM -4.8558"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_rate = train[target].mean()\n",
+ "# 逻辑回归模型转评分卡\n",
+ "card = ScoreCard(target=target, pipeline=model_pipeline, base_score=50, base_odds=(1 - bad_rate) / bad_rate, pdo=10)\n",
+ "# 训练\n",
+ "card.fit(model_pipeline[:-1].transform(train))\n",
+ "card.scorecard_points()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 数据字段[可选]\n",
+ "feature_describe = pd.DataFrame([\n",
+ " [\"status_account\", \"支票账户状态\"], [\"duration\", \"借款周期\"], [\"credit_histor\", \"历史信用\"], [\"purpose\", \"借款目的\"], [\"amount\", \"信用额度\"], [\"svaing_account\", \"储蓄账户状态\"], [\"present_emp\", \"当前就业状态\"], [\"income_rate\", \"分期付款占可支配收入百分比\"], [\"personal_status\", \"性别与婚姻状态\"], [\"other_debtors\", \"他人担保信息\"], [\"residence_info\", \"现居住地\"], [\"property\", \"财产状态\"], [\"age\", \"年龄\"], [\"inst_plans\", \"其他分期情况\"], [\"housing\", \"房产状态\"], [\"num_credits\", \"信用卡数量\"], [\"job\", \"工作状态\"], [\"dependents\", \"赡养人数\"], [\"telephone\", \"电话号码注册情况\"], [\"foreign_worke\", \"是否有海外工作经历\"],\n",
+ "], columns=[\"变量名称\", \"变量含义\"])\n",
+ "feature_map = dict(zip(feature_describe[\"变量名称\"], feature_describe[\"变量含义\"]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "writer = sp.ExcelWriter()\n",
+ "start_row, start_col = 2, 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # ////////////////////////////////////// 样本说明 ///////////////////////////////////// #\n",
+ "worksheet = writer.get_sheet_by_name(\"汇总信息\")\n",
+ "\n",
+ "# 样本总体分布情况\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"样本总体分布情况\", style=\"header\")\n",
+ "end_row, end_col = sp.dataframe2excel(dataset_summary, writer, worksheet, percent_cols=[\"样本占比\", \"坏客户占比\"], start_row=end_row + 1)\n",
+ "\n",
+ "# 建模样本时间分布情况\n",
+ "temp = sp.distribution_plot(df, date=\"date\", target=target, save=\"model_report/all_sample_time_count.png\", result=True)\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"建模样本时间分布情况\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/all_sample_time_count.png\", (end_row, start_col), figsize=(720, 370))\n",
+ "end_row, end_col = sp.dataframe2excel(temp, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\"], condition_cols=[\"坏样本率\"], start_row=end_row)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ////////////////////////////////////// 模型报告 ///////////////////////////////////// #\n",
+ "summary = logistic.summary2(feature_map=feature_map)\n",
+ "\n",
+ "# 逻辑回归拟合情况\n",
+ "worksheet = writer.get_sheet_by_name(\"逻辑回归拟合结果\")\n",
+ "\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"逻辑回归拟合效果\", style=\"header\")\n",
+ "end_row, end_col = sp.dataframe2excel(summary, writer, worksheet, condition_cols=[\"Coef.\"], start_row=end_row + 1)\n",
+ "\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"训练数据集拟合报告\", style=\"header\")\n",
+ "end_row, end_col = sp.dataframe2excel(logistic.report(train_woe_stepwise), writer, worksheet, percent_cols=[\"precision\", \"recall\", \"f1-score\"], start_row=end_row + 1)\n",
+ "\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"测试数据集拟合报告\", style=\"header\")\n",
+ "end_row, end_col = sp.dataframe2excel(logistic.report(test_woe_stepwise), writer, worksheet, percent_cols=[\"precision\", \"recall\", \"f1-score\"], start_row=end_row + 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ////////////////////////////////////// 特征概述 ///////////////////////////////////// #\n",
+ "worksheet = writer.get_sheet_by_name(\"模型变量信息\")\n",
+ "\n",
+ "start_row, start_col = 2, 2\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"入模变量信息\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={\"index\": \"序号\"}), (end_row + 1, start_col))\n",
+ "\n",
+ "# 变量分布情况\n",
+ "import toad\n",
+ "data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={\"index\": \"变量名称\", \"type\": \"变量类型\", \"size\": \"样本个数\", \"missing\": \"缺失值\", \"unique\": \"唯一值个数\"})\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"变量分布情况\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col))\n",
+ "\n",
+ "# 变量相关性\n",
+ "data_corr = train_woe_stepwise.corr()\n",
+ "logistic.corr(train_woe_stepwise, save=\"model_report/train_corr.png\", annot=False)\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"变量相关性\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_corr.png\", (end_row + 1, start_col), figsize=(700, 500))\n",
+ "end_row, end_col = sp.dataframe2excel(data_corr.reset_index().rename(columns={\"index\": \"\"}), writer, worksheet, color_cols=list(data_corr.columns), start_row=end_row + 1)\n",
+ "\n",
+ "# 变量分箱信息\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"变量分箱信息\", style=\"header\")\n",
+ "\n",
+ "for col in logistic.feature_names_in_:\n",
+ " feature_table = sp.feature_bin_stats(data, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n",
+ " _ = sp.bin_plot(feature_table, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", figsize=(8, 4), save=f\"model_report/bin_plots/data_{col}.png\")\n",
+ " \n",
+ " end_row, end_col = writer.insert_pic2sheet(worksheet, f\"model_report/bin_plots/data_{col}.png\", (end_row + 1, start_col), figsize=(700, 400))\n",
+ " end_row, end_col = sp.dataframe2excel(feature_table, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\", \"LIFT值\", \"累积LIFT值\"], condition_cols=[\"坏样本率\", \"LIFT值\"], start_row=end_row)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ////////////////////////////////////// 评分卡说明 ///////////////////////////////////// #\n",
+ "worksheet = writer.get_sheet_by_name(\"评分卡结果\")\n",
+ "\n",
+ "# 评分卡刻度\n",
+ "scorecard_kedu = card.scorecard_scale()\n",
+ "scorecard_points = card.scorecard_points(feature_map=feature_map)\n",
+ "scorecard_clip = card.score_clip(train[\"score\"], clip=100)\n",
+ "\n",
+ "start_row, start_col = 2, 2\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"评分卡刻度\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col))\n",
+ "\n",
+ "# 评分卡对应分数\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"评分卡分数\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_points, (end_row + 1, start_col), merge_column=\"变量名称\")\n",
+ "\n",
+ "# 评分效果\n",
+ "score_table_train = sp.feature_bin_stats(train, \"score\", desc=\"训练集模型评分\", target=target, rules=scorecard_clip)\n",
+ "score_table_test = sp.feature_bin_stats(test, \"score\", desc=\"测试集模型评分\", target=target, rules=scorecard_clip)\n",
+ "\n",
+ "sp.ks_plot(train[\"score\"], train[target], title=\"Train \\tDataset\", save=\"model_report/train_ksplot.png\")\n",
+ "sp.ks_plot(test[\"score\"], test[target], title=\"Test \\tDataset\", save=\"model_report/test_ksplot.png\")\n",
+ "\n",
+ "sp.hist_plot(train[\"score\"], train[target], save=\"model_report/train_scorehist.png\", bins=30)\n",
+ "sp.hist_plot(test[\"score\"], test[target], save=\"model_report/test_scorehist.png\", bins=30)\n",
+ "\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"训练数据集评分模型效果\", style=\"header\")\n",
+ "ks_row = end_row\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_ksplot.png\", (ks_row, start_col))\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_scorehist.png\", (ks_row, end_col))\n",
+ "end_row, end_col = sp.dataframe2excel(score_table_train, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\", \"LIFT值\", \"累积LIFT值\", \"分档KS值\"], condition_cols=[\"坏样本率\", \"LIFT值\", \"分档KS值\"], start_row=end_row + 1)\n",
+ "\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"测试数据集评分模型效果\", style=\"header\")\n",
+ "ks_row = end_row\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/test_ksplot.png\", (ks_row, start_col))\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/test_scorehist.png\", (ks_row, end_col))\n",
+ "end_row, end_col = sp.dataframe2excel(score_table_test, writer, worksheet, percent_cols=[\"样本占比\", \"好样本占比\", \"坏样本占比\", \"坏样本率\", \"LIFT值\", \"累积LIFT值\", \"分档KS值\"], condition_cols=[\"坏样本率\", \"LIFT值\", \"分档KS值\"], start_row=end_row + 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// #\n",
+ "worksheet = writer.get_sheet_by_name(\"模型稳定性\")\n",
+ "start_row, start_col = 2, 2\n",
+ "\n",
+ "# 评分分布稳定性\n",
+ "train_test_score_psi = sp.psi_plot(score_table_train, score_table_test, labels=[\"训练数据集\", \"测试数据集\"], save=\"model_report/train_test_psiplot.png\", result=True)\n",
+ "\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value=\"模型评分稳定性指标 (Population Stability Index, PSI): 训练数据集 vs 测试数据集\", style=\"header\")\n",
+ "end_row, end_col = writer.insert_pic2sheet(worksheet, \"model_report/train_test_psiplot.png\", (end_row, start_col), figsize=(800, 400))\n",
+ "end_row, end_col = sp.dataframe2excel(train_test_score_psi, writer, worksheet, percent_cols=[\"训练数据集样本占比\", \"训练数据集坏样本率\", \"测试数据集样本占比\", \"测试数据集坏样本率\"], condition_cols=[\"分档PSI值\"], start_row=end_row + 1)\n",
+ "\n",
+ "# 变量 PSI 表\n",
+ "for col in card._feature_names:\n",
+ " feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n",
+ " feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n",
+ " psi_table = sp.psi_plot(feature_table_train, feature_table_test, desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=[\"训练数据集\", \"测试数据集\"], save=f\"model_report/psi_{col}.png\")\n",
+ " \n",
+ " end_row, end_col = writer.insert_pic2sheet(worksheet, f\"model_report/psi_{col}.png\", (end_row, start_col), figsize=(700, 400))\n",
+ " end_row, end_col = sp.dataframe2excel(psi_table, writer, worksheet, percent_cols=[\"训练数据集样本占比\", \"训练数据集坏样本率\", \"测试数据集样本占比\", \"测试数据集坏样本率\", \"测试数据集% - 训练数据集%\"], condition_cols=[\"分档PSI值\"], start_row=end_row + 1)\n",
+ "\n",
+ "# 变量 CSI 表\n",
+ "end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=\"入模变量稳定性指标 (Characteristic Stability Index, CSI): 训练数据集 vs 测试数据集\", style=\"header\")\n",
+ "\n",
+ "for col in card._feature_names:\n",
+ " feature_table_train = sp.feature_bin_stats(train, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n",
+ " feature_table_test = sp.feature_bin_stats(test, col, target=target, desc=feature_map.get(col, \"\") or \"逻辑回归入模变量\", combiner=combiner)\n",
+ " train_test_csi_table = sp.csi_plot(feature_table_train, feature_table_test, card[col], desc=col, result=True, plot=True, max_len=35, figsize=(10, 6), labels=[\"训练数据集\", \"测试数据集\"], save=f\"model_report/csi_{col}.png\")\n",
+ " \n",
+ " end_row, end_col = writer.insert_pic2sheet(worksheet, f\"model_report/csi_{col}.png\", (end_row, start_col), figsize=(700, 400))\n",
+ " end_row, end_col = sp.dataframe2excel(train_test_csi_table, writer, worksheet, percent_cols=[\"训练数据集样本占比\", \"训练数据集坏样本率\", \"测试数据集样本占比\", \"测试数据集坏样本率\", \"测试数据集% - 训练数据集%\"], condition_cols=[\"分档CSI值\"], start_row=end_row + 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "writer.save(\"model_report/评分卡模型报告.xlsx\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "scorecard",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/scorecardpipeline/model.py b/scorecardpipeline/model.py
index 26bb851..6f7e3b7 100644
--- a/scorecardpipeline/model.py
+++ b/scorecardpipeline/model.py
@@ -439,6 +439,51 @@ def scorecard_scale(self):
)
return scorecard_kedu
+    @classmethod
+    def format_bins(cls, bins, index=False, ellipsis=None, decimal=4):
+        """Convert bin definitions into human-readable bin labels.
+
+        Numeric bins (a flat sequence of split points) become half-open
+        interval labels such as ``[负无穷 , 1.5)``; a trailing null split point
+        produces an extra ``缺失值`` (missing value) label. Categorical bins
+        (a sequence of sets / lists / arrays of values) become comma-joined
+        labels, one per group.
+
+        :param bins: split points, or groups of categorical values
+        :param index: whether to prefix every label with a two-digit bin index
+        :param ellipsis: maximum length of a categorical label; longer labels are cut and suffixed with ``..``, None disables truncation
+        :param decimal: forwarded to ``round_float`` as ``decimal`` when formatting split points
+
+        :return: ndarray of bin labels (a plain one-element list when ``bins`` is empty)
+        """
+        if len(bins) == 0:
+            return ["全部样本"]
+
+        is_categorical = isinstance(bins[0], (set, list, np.ndarray))
+        # Index assigned to the missing-value bin when ``index=True``.
+        empty_index = -1 if is_categorical else len(bins)
+
+        labels = []
+        if is_categorical:
+            # Groups are intentionally NOT converted with np.array: groups of
+            # different lengths are rejected by recent NumPy releases, and a
+            # mixed str/NaN group would have NaN coerced to the string "nan".
+            for keys in bins:
+                parts = []
+                for key in keys:
+                    if pd.isnull(key) or key == "nan":
+                        part = "缺失值"
+                    elif str(key).strip() == "":
+                        part = "空字符串"
+                    else:
+                        # str() keeps non-string categories from breaking the join below.
+                        part = str(key)
+                    # De-duplicate while keeping input order so the label is deterministic.
+                    if part not in parts:
+                        parts.append(part)
+                label = ','.join(parts)
+
+                if ellipsis is not None and len(label) > ellipsis:
+                    label = label[:ellipsis] + '..'
+
+                labels.append(label)
+        else:
+            if isinstance(bins, list):
+                bins = np.array(bins)
+            # A trailing null split point marks a dedicated missing-value bin.
+            has_empty = pd.isnull(bins[-1])
+            if has_empty:
+                bins = bins[:-1]
+            edges = ["负无穷"] + [round_float(b, decimal=decimal) for b in bins] + ["正无穷"]
+            labels = ['[' + str(low) + ' , ' + str(high) + ')' for low, high in zip(edges, edges[1:])]
+            if has_empty:
+                labels.append('缺失值')
+
+        if index:
+            labels = ["{:02}.{}".format(i if label != '缺失值' else empty_index, label) for i, label in enumerate(labels)]
+
+        return np.array(labels)
+
def scorecard_points(self, feature_map={}):
"""输出评分卡分箱信息及其对应的分数
@@ -476,10 +521,10 @@ def scorecard2pmml(self, pmml: str = 'scorecard.pmml', debug: bool = False):
mapping = {}
for bins, score in zip(rule['bins'], rule['scores'].tolist()):
for _bin in bins:
- if _bin == 'nan':
+ if pd.isnull(_bin) or _bin == 'nan':
default_value = float(score)
-
- mapping[_bin] = float(score)
+ else:
+ mapping[_bin] = float(score)
mapper.append((
[var],
@@ -532,7 +577,12 @@ def scorecard2pmml(self, pmml: str = 'scorecard.pmml', debug: bool = False):
pipeline.named_steps['scorecard'].coef_ = np.ones(len(scorecard_mapper.features))
- sklearn2pmml(pipeline, pmml, with_repr=True, debug=debug)
+ try:
+ sklearn2pmml(pipeline, pmml, with_repr=True, debug=debug)
+ except:
+ import traceback
+ print(traceback.format_exc())
+ return pipeline
if debug:
return pipeline
diff --git a/scorecardpipeline/processing.py b/scorecardpipeline/processing.py
index 6029988..4c29255 100644
--- a/scorecardpipeline/processing.py
+++ b/scorecardpipeline/processing.py
@@ -392,12 +392,12 @@ def catboost_selector(self, x, y, cat_features=None):
class Combiner(TransformerMixin, BaseEstimator):
- def __init__(self, target="target", method='chi', empty_separate=False, min_n_bins=2, max_n_bins=None, max_n_prebins=20, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", adj_rules={}, n_jobs=1):
+ def __init__(self, target="target", method='chi', empty_separate=True, min_n_bins=2, max_n_bins=None, max_n_prebins=20, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", adj_rules={}, n_jobs=1):
"""特征分箱封装方法
:param target: 数据集中标签名称,默认 target
:param method: 特征分箱方法,可选 "chi", "dt", "quantile", "step", "kmeans", "cart", "mdlp", "uniform", 参考 toad.Combiner: https://github.com/amphibian-dev/toad/blob/master/toad/transform.py#L178-L355 & optbinning.OptimalBinning: https://gnpalencia.org/optbinning/
- :param empty_separate: 是否空值单独一箱, 默认 False,推荐设置为 True
+ :param empty_separate: 是否空值单独一箱, 默认 True
:param min_n_bins: 最小分箱数,默认 2,即最小拆分2箱
:param max_n_bins: 最大分箱数,默认 None,即不限制拆分箱数,推荐设置 3 ~ 5,不宜过多,偶尔使用 optbinning 时不起效
:param max_n_prebins: 使用 optbinning 时预分箱数量
@@ -431,6 +431,10 @@ def update(self, rules):
"""
self.combiner.update(rules)
+ # 检查规则内容
+ for feature in rules.keys():
+ self.check_rules(feature=feature)
+
def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc"):
"""基于 optbinning.OptimalBinning 的特征分箱方法,使用 optbinning.OptimalBinning 分箱失败时,使用 toad.transform.Combiner 的卡方分箱处理
@@ -508,14 +512,30 @@ def fit(self, x: pd.DataFrame, y=None):
self.update(self.adj_rules)
+ # 检查类别变量空值是否被转为字符串,如果转为了字符串,强制转回空值,同时检查分箱顺序并调整为正确顺序
+ self.check_rules()
+
return self
-
- def _check_rules(self):
- """检查类别变量空值是否被转为字符串,如果转为了字符串,强制转回空值"""
+
+ def check_rules(self, feature=None):
+ """Check whether null values in categorical bin rules were turned into strings and restore them.
+
+ The strings "nan" and "None" found inside a non-numeric rule are replaced by
+ ``np.nan`` and ``None`` respectively, and a bin that consists solely of the
+ null value is moved to the end of the rule before the rule is written back
+ to the underlying combiner.
+
+ :param feature: name of a single feature to check; when None every rule in the combiner is checked
+ """
for col in self.combiner.rules.keys():
- if not np.issubdtype(self.combiner[col].dtype, np.number):
- if sum([sum([1 for b in r if b in ("nan", "None")]) for r in self.combiner[col]]) > 0:
- self.combiner.update({col: [[np.nan if b in ("nan", "None") else b for b in r] for r in self.combiner[col]]})
+ if feature is not None and col != feature:
+ continue
+
+ _rule = self.combiner[col]
+
+ # Only rules whose dtype is not numeric are inspected for stringified nulls.
+ if not np.issubdtype(_rule.dtype, np.number):
+ if sum([sum([1 for b in r if b in ("nan", "None")]) for r in _rule]) > 0:
+ _rule = [[np.nan if b == "nan" else (None if b == "None" else b) for b in r] for r in _rule]
+ # Move a bin made up only of the null value to the end of the rule.
+ if [np.nan] in _rule:
+ _rule.remove([np.nan])
+ _rule.append([np.nan])
+ if [None] in _rule:
+ _rule.remove([None])
+ _rule.append([None])
+
+ self.combiner.update({col: _rule})
def transform(self, x, y=None, labels=False):
"""特征分箱转换方法
@@ -589,16 +609,15 @@ def feature_bin_stats(cls, data, feature, target="target", rules=None, method='s
else:
_combiner = deepcopy(combiner)
- if rules and len(rules) > 0:
+ if rules is not None and len(rules) > 0:
if isinstance(rules, (list, np.ndarray)):
_combiner.update({feature: rules})
else:
_combiner.update(rules)
- feature_bin_dict = feature_bins(np.array(_combiner[feature]))
+ feature_bin_dict = feature_bins(_combiner[feature])
df_bin = _combiner.transform(data[[feature, target]], labels=False)
-
table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack()
table.columns.name = None
table = table.rename(columns={0: '好样本数', 1: '坏样本数'}).fillna(0)
diff --git a/scorecardpipeline/utils.py b/scorecardpipeline/utils.py
index 4a49d49..628a671 100644
--- a/scorecardpipeline/utils.py
+++ b/scorecardpipeline/utils.py
@@ -572,11 +572,11 @@ def psi_plot(expected, actual, labels=["预期", "实际"], desc="", save=None,
ax1.tick_params(axis='x', labelrotation=90)
ax2 = ax1.twinx()
- ax2.plot(df_psi["分箱"], df_psi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3)))
- ax2.plot(df_psi["分箱"], df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3)))
+ ax2.plot(x, df_psi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3)))
+ ax2.plot(x, df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3)))
- ax2.scatter(df_psi["分箱"], df_psi[f"{labels[0]}坏样本率"], marker=".")
- ax2.scatter(df_psi["分箱"], df_psi[f"{labels[1]}坏样本率"], marker=".")
+ ax2.scatter(x, df_psi[f"{labels[0]}坏样本率"], marker=".")
+ ax2.scatter(x, df_psi[f"{labels[1]}坏样本率"], marker=".")
ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数')
@@ -629,7 +629,7 @@ def csi_plot(expected, actual, score_bins, labels=["预期", "实际"], desc="",
df_csi["指标名称"] = desc
if plot:
- x = df_csi['分箱'].apply(lambda l: l if max_len is None or len(str(l)) < max_len else f"{str(l)[:max_len]}...")
+ x = df_csi['分箱'].apply(lambda l: str(l) if pd.isnull(l) or len(str(l)) < max_len else f"{str(l)[:max_len]}...")
x_indexes = np.arange(len(x))
fig, ax1 = plt.subplots(figsize=figsize)
@@ -642,11 +642,11 @@ def csi_plot(expected, actual, score_bins, labels=["预期", "实际"], desc="",
ax1.tick_params(axis='x', labelrotation=90)
ax2 = ax1.twinx()
- ax2.plot(df_csi["分箱"], df_csi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3)))
- ax2.plot(df_csi["分箱"], df_csi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3)))
+ ax2.plot(x, df_csi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3)))
+ ax2.plot(x, df_csi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3)))
- ax2.scatter(df_csi["分箱"], df_csi[f"{labels[0]}坏样本率"], marker=".")
- ax2.scatter(df_csi["分箱"], df_csi[f"{labels[1]}坏样本率"], marker=".")
+ ax2.scatter(x, df_csi[f"{labels[0]}坏样本率"], marker=".")
+ ax2.scatter(x, df_csi[f"{labels[1]}坏样本率"], marker=".")
ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数')