diff --git "a/examples/model_report/\345\206\263\347\255\226\346\240\221\347\273\204\345\220\210\347\255\226\347\225\245\346\214\226\346\216\230.xlsx" "b/examples/model_report/\345\206\263\347\255\226\346\240\221\347\273\204\345\220\210\347\255\226\347\225\245\346\214\226\346\216\230.xlsx" new file mode 100644 index 0000000..0b51582 Binary files /dev/null and "b/examples/model_report/\345\206\263\347\255\226\346\240\221\347\273\204\345\220\210\347\255\226\347\225\245\346\214\226\346\216\230.xlsx" differ diff --git a/examples/rule_extraction.ipynb b/examples/rule_extraction.ipynb new file mode 100644 index 0000000..772f6cf --- /dev/null +++ b/examples/rule_extraction.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../\")\n", + "\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from scorecardpipeline import *\n", + "from scorecardpipeline.rule_extraction import DecisionTreeRuleExtractor\n", + "\n", + "\n", + "logger = init_setting(seed=8888, logger=True)\n", + "\n", + "\n", + "feature_map = {}\n", + "n_samples = 10000\n", + "ab = np.array(list('ABCDEFG'))\n", + "\n", + "data = pd.DataFrame({\n", + " 'A': np.random.randint(10, size = n_samples),\n", + " 'B': ab[np.random.choice(7, n_samples)],\n", + " 'C': ab[np.random.choice(2, n_samples)],\n", + " '时间': np.random.random(size = n_samples),\n", + " 'target': np.random.randint(2, size = n_samples)\n", + "})\n", + "\n", + "\n", + "train, test = train_test_split(data, test_size=0.3, stratify=data[\"target\"])\n", + "\n", + "\n", + "pdtr = DecisionTreeRuleExtractor(target=\"target\", feature_map=feature_map, max_iter=8)\n", + "pdtr.fit(train, lift=0., max_depth=2, max_samples=1., verbose=False, min_samples_split=8, min_samples_leaf=5)\n", + "report = pdtr.report(valid=[test, train, data], save=\"model_report/决策树组合策略挖掘.xlsx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "data": { + "text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 243 0.0347 142 0.0401 101 0.0292 0.4156 0.4941 0.8411\n1 B <= 0.495 & 时间 <= 0.943 3758 0.5369 1933 0.5459 1825 0.5276 0.4856 0.4941 0.9828\n2 B > 0.495 & 时间 <= 0.877 2669 0.3813 1321 0.3731 1348 0.3897 0.5051 0.4941 1.0221\n3 B > 0.495 & 时间 > 0.877 330 0.0471 145 0.0409 185 0.0535 0.5606 0.4941 1.1345\n4 B <= 0.495 & A <= 8.5 3605 0.5150 1889 0.5335 1716 0.4961 0.4760 0.4941 0.9633\n5 B > 0.495 & A > 4.5 1496 0.2137 749 0.2115 747 0.2160 0.4993 0.4941 1.0105\n6 B > 0.495 & A <= 4.5 1503 0.2147 717 0.2025 786 0.2272 0.5230 0.4941 1.0583\n7 B <= 0.495 & A > 8.5 396 0.0566 186 0.0525 210 0.0607 0.5303 0.4941 1.0732\n8 A <= 8.5 & A <= 3.5 2772 0.3960 1432 0.4044 1340 0.3874 0.4834 0.4941 0.9783\n9 A <= 8.5 & A > 3.5 3526 0.5037 1775 0.5013 1751 0.5062 0.4966 0.4941 1.0050\n10 A > 8.5 & C <= 0.494 358 0.0511 174 0.0491 184 0.0532 0.5140 0.4941 1.0401\n11 A > 8.5 & C > 0.494 344 0.0491 160 0.0452 184 0.0532 0.5349 0.4941 1.0824", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
组合策略命中数命中率好样本数好样本占比坏样本数坏样本占比坏率样本整体坏率LIFT值
0B <= 0.495 & 时间 > 0.9432430.03471420.04011010.02920.41560.49410.8411
1B <= 0.495 & 时间 <= 0.94337580.536919330.545918250.52760.48560.49410.9828
2B > 0.495 & 时间 <= 0.87726690.381313210.373113480.38970.50510.49411.0221
3B > 0.495 & 时间 > 0.8773300.04711450.04091850.05350.56060.49411.1345
4B <= 0.495 & A <= 8.536050.515018890.533517160.49610.47600.49410.9633
5B > 0.495 & A > 4.514960.21377490.21157470.21600.49930.49411.0105
6B > 0.495 & A <= 4.515030.21477170.20257860.22720.52300.49411.0583
7B <= 0.495 & A > 8.53960.05661860.05252100.06070.53030.49411.0732
8A <= 8.5 & A <= 3.527720.396014320.404413400.38740.48340.49410.9783
9A <= 8.5 & A > 3.535260.503717750.501317510.50620.49660.49411.0050
10A > 8.5 & C <= 0.4943580.05111740.04911840.05320.51400.49411.0401
11A > 8.5 & C > 0.4943440.04911600.04521840.05320.53490.49411.0824
\n
" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[0]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 96 0.0320 50 0.0329 46 0.0310 0.4792 0.4940 0.9700\n1 B <= 0.495 & 时间 <= 0.943 1604 0.5347 825 0.5435 779 0.5256 0.4857 0.4940 0.9831\n2 B > 0.495 & 时间 <= 0.877 1146 0.3820 570 0.3755 576 0.3887 0.5026 0.4940 1.0174\n3 B > 0.495 & 时间 > 0.877 154 0.0513 73 0.0481 81 0.0547 0.5260 0.4940 1.0647\n4 B <= 0.495 & A <= 8.5 1535 0.5117 791 0.5211 744 0.5020 0.4847 0.4940 0.9812\n5 B > 0.495 & A > 4.5 617 0.2057 306 0.2016 311 0.2099 0.5041 0.4940 1.0203\n6 B > 0.495 & A <= 4.5 683 0.2277 337 0.2220 346 0.2335 0.5066 0.4940 1.0255\n7 B <= 0.495 & A > 8.5 165 0.0550 84 0.0553 81 0.0547 0.4909 0.4940 0.9937\n8 A <= 8.5 & A <= 3.5 1214 0.4047 615 0.4051 599 0.4042 0.4934 0.4940 0.9988\n9 A <= 8.5 & A > 3.5 1487 0.4957 744 0.4901 743 0.5013 0.4997 0.4940 1.0115\n10 A > 8.5 & C <= 0.494 147 0.0490 81 0.0534 66 0.0445 0.4490 0.4940 0.9089\n11 A > 8.5 & C > 0.494 152 0.0507 78 0.0514 74 0.0499 0.4868 0.4940 0.9855", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
组合策略命中数命中率好样本数好样本占比坏样本数坏样本占比坏率样本整体坏率LIFT值
0B <= 0.495 & 时间 > 0.943960.0320500.0329460.03100.47920.49400.9700
1B <= 0.495 & 时间 <= 0.94316040.53478250.54357790.52560.48570.49400.9831
2B > 0.495 & 时间 <= 0.87711460.38205700.37555760.38870.50260.49401.0174
3B > 0.495 & 时间 > 0.8771540.0513730.0481810.05470.52600.49401.0647
4B <= 0.495 & A <= 8.515350.51177910.52117440.50200.48470.49400.9812
5B > 0.495 & A > 4.56170.20573060.20163110.20990.50410.49401.0203
6B > 0.495 & A <= 4.56830.22773370.22203460.23350.50660.49401.0255
7B <= 0.495 & A > 8.51650.0550840.0553810.05470.49090.49400.9937
8A <= 8.5 & A <= 3.512140.40476150.40515990.40420.49340.49400.9988
9A <= 8.5 & A > 3.514870.49577440.49017430.50130.49970.49401.0115
10A > 8.5 & C <= 0.4941470.0490810.0534660.04450.44900.49400.9089
11A > 8.5 & C > 0.4941520.0507780.0514740.04990.48680.49400.9855
\n
" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[1]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 242 0.0346 141 0.0398 101 0.0292 0.4174 0.4941 0.8446\n1 B <= 0.495 & 时间 <= 0.943 3759 0.5370 1934 0.5462 1825 0.5276 0.4855 0.4941 0.9825\n2 B > 0.495 & 时间 <= 0.877 2668 0.3811 1320 0.3728 1348 0.3897 0.5052 0.4941 1.0225\n3 B > 0.495 & 时间 > 0.877 331 0.0473 146 0.0412 185 0.0535 0.5589 0.4941 1.1311\n4 B <= 0.495 & A <= 8.5 3605 0.5150 1889 0.5335 1716 0.4961 0.4760 0.4941 0.9633\n5 B > 0.495 & A > 4.5 1496 0.2137 749 0.2115 747 0.2160 0.4993 0.4941 1.0105\n6 B > 0.495 & A <= 4.5 1503 0.2147 717 0.2025 786 0.2272 0.5230 0.4941 1.0583\n7 B <= 0.495 & A > 8.5 396 0.0566 186 0.0525 210 0.0607 0.5303 0.4941 1.0732\n8 A <= 8.5 & A <= 3.5 2772 0.3960 1432 0.4044 1340 0.3874 0.4834 0.4941 0.9783\n9 A <= 8.5 & A > 3.5 3526 0.5037 1775 0.5013 1751 0.5062 0.4966 0.4941 1.0050\n10 A > 8.5 & C <= 0.494 358 0.0511 174 0.0491 184 0.0532 0.5140 0.4941 1.0401\n11 A > 8.5 & C > 0.494 344 0.0491 160 0.0452 184 0.0532 0.5349 0.4941 1.0824", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
组合策略命中数命中率好样本数好样本占比坏样本数坏样本占比坏率样本整体坏率LIFT值
0B <= 0.495 & 时间 > 0.9432420.03461410.03981010.02920.41740.49410.8446
1B <= 0.495 & 时间 <= 0.94337590.537019340.546218250.52760.48550.49410.9825
2B > 0.495 & 时间 <= 0.87726680.381113200.372813480.38970.50520.49411.0225
3B > 0.495 & 时间 > 0.8773310.04731460.04121850.05350.55890.49411.1311
4B <= 0.495 & A <= 8.536050.515018890.533517160.49610.47600.49410.9633
5B > 0.495 & A > 4.514960.21377490.21157470.21600.49930.49411.0105
6B > 0.495 & A <= 4.515030.21477170.20257860.22720.52300.49411.0583
7B <= 0.495 & A > 8.53960.05661860.05252100.06070.53030.49411.0732
8A <= 8.5 & A <= 3.527720.396014320.404413400.38740.48340.49410.9783
9A <= 8.5 & A > 3.535260.503717750.501317510.50620.49660.49411.0050
10A > 8.5 & C <= 0.4943580.05111740.04911840.05320.51400.49411.0401
11A > 8.5 & C > 0.4943440.04911600.04521840.05320.53490.49411.0824
\n
" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report[2]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/scorecardpipeline/rule_extraction.py b/scorecardpipeline/rule_extraction.py new file mode 100644 index 0000000..1a68b50 --- /dev/null +++ b/scorecardpipeline/rule_extraction.py @@ -0,0 +1,358 @@ +# -*- coding: utf-8 -*- +""" +@Time : 2024/2/29 13:29 +@Author : itlubber +@Site : itlubber.art +""" +import warnings +import os +import re +import graphviz +import dtreeviz +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib import font_manager +from openpyxl.worksheet.worksheet import Worksheet + +import category_encoders as ce +from optbinning import OptimalBinning +from sklearn.tree import DecisionTreeClassifier + +from .utils import init_setting +from .excel_writer import ExcelWriter, dataframe2excel + + +class DecisionTreeRuleExtractor: + def __init__(self, target="target", labels=["positive", "negative"], feature_map={}, nan=-1., max_iter=128, writer=None, combiner=None, seed=None, theme_color="2639E9"): + """决策树自动规则挖掘工具包 + + :param target: 数据集中好坏样本标签列名称,默认 target + :param labels: 好坏样本标签名称,传入一个长度为2的列表,第0个元素为好样本标签,第1个元素为坏样本标签,默认 ["positive", "negative"] + :param feature_map: 变量名称及其含义,在后续输出报告和策略信息时增加可读性,默认 {} + :param nan: 在决策树策略挖掘时,默认空值填充的值,默认 -1 + :param max_iter: 最多支持在数据集上训练多少颗树模型,每次生成一棵树后,会剔除特征重要性最高的特征后,再生成树,默认 128 + :param writer: 在之前程序运行时生成的 ExcelWriter,可以支持传入一个已有的writer,后续所有内容将保存至该workbook中,默认 None + """ + self.seed = seed + self.nan = nan + self.target = target + self.labels = labels + self.theme_color = theme_color + self.feature_map = feature_map + self.decision_trees = [] + self.max_iter = max_iter + self.target_enc = None + self.feature_names = None + self.dt_rules = pd.DataFrame() + self.end_row = 2 + self.start_col = 2 + self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"] + + init_setting() + + if writer: + self.writer = writer + else: + self.writer = ExcelWriter(theme_color=self.theme_color) + + def encode_cat_features(self, X, y): + cat_features = list(set(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns)) + cat_features_index = [i for i, f in enumerate(X.columns) if f in cat_features] + + if len(cat_features) > 0: + if self.target_enc is None: + self.target_enc = ce.TargetEncoder(cols=cat_features) + self.target_enc.fit(X[cat_features], y) + self.target_enc.target_mapping = {} + X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target')) + for col in cat_features: + mapping = X_TE[[col, f"{col}_target"]].drop_duplicates() + self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"])) + else: + X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target')) + + X_TE = X_TE.drop(columns=cat_features) + return X_TE.rename(columns={f"{c}_target": c for c in cat_features}) + else: + return X + + def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count): + tree_ = tree.tree_ + left = tree.tree_.children_left + right = tree.tree_.children_right + feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature] + rules = dict() + + result_dataframe = pd.DataFrame() + + def recurse(node, depth, parent): # 搜每个节点的规则 + nonlocal result_dataframe + + if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则 + name = feature_name[node] + thd = np.round(tree_.threshold[node], 3) + s = "{} <= {} ".format(name, thd, node) + # 左子 + if node == 0: + rules[node] = s + else: + rules[node] = rules[parent] + ' & ' + s + recurse(left[node], depth + 1, node) + s = "{} > {}".format(name, thd) + # 右子 + if node == 0: + rules[node] = s + else: + rules[node] = rules[parent] + ' & ' + s + recurse(right[node], depth + 1, node) + else: + result = pd.DataFrame() + result['组合策略'] = rules[parent], + result['好样本数'] = tree_.value[node][0][0].astype(int) + result['好样本占比'] = result['好样本数'] / (total_count * (1 - total_bad_rate)) + result['坏样本数'] = tree_.value[node][0][1].astype(int) + result['坏样本占比'] = result['坏样本数'] / (total_count * total_bad_rate) + result['命中数'] = result['好样本数'] + result['坏样本数'] + result['命中率'] = result['命中数'] / total_count + result['坏率'] = result['坏样本数'] / result['命中数'] + result['样本整体坏率'] = total_bad_rate + result['LIFT值'] = result['坏率'] / result['样本整体坏率'] + + result_dataframe = pd.concat([result_dataframe, result], axis=0) + + recurse(0, 1, 0) + + return result_dataframe.sort_values("LIFT值", ascending=True)[self.describe_columns].reset_index(drop=True) + + def select_dt_rules(self, decision_tree, x, y, lift=0., max_samples=1., save=None, verbose=False, drop=False): + rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y)) + total_rules = len(rules) + + try: + viz_model = dtreeviz.model(decision_tree, + X_train=x, + y_train=y, + feature_names=x.columns, + target_name=self.target, + class_names=self.labels, + ) + except AttributeError: + raise "请检查 dtreeviz 版本" + + rules = rules.query(f"LIFT值 >= {lift} & 命中率 <= {max_samples}").reset_index(drop=True) + + if len(rules) > 0: + # font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf') + # font_manager.fontManager.addfont(font_path) + # plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name() + # plt.rcParams['axes.unicode_minus'] = False + + decision_tree_viz = viz_model.view( + scale=1.5, + orientation='LR', + colors={ + "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]], + "arrow": "#2639E9", + 'text_wedge': "#F76E6C", + "pie": "#2639E9", + "tile_alpha": 1, + "legend_edge": "#FFFFFF", + }, + ticks_fontsize=10, + label_fontsize=10, + fontname=plt.rcParams['font.family'], + ) + if verbose: + from IPython.core.display_functions import display + if self.feature_map is not None and len(self.feature_map) > 0: + display(rules.replace(self.feature_map, regex=True)) + else: + display(rules) + display(decision_tree_viz) + if save: + if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): + os.makedirs(os.path.dirname(save)) + + try: + decision_tree_viz.save("combine_rules_cache.svg") + except graphviz.backend.execute.ExecutableNotFound: + print("请确保您已安装 graphviz 程序并且正确配置了 PATH 路径。可参考: https://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft") + + try: + import cairosvg + cairosvg.svg2png(url="combine_rules_cache.svg", write_to=save, dpi=240) + except: + from reportlab.graphics import renderPDF + from svglib.svglib import svg2rlg + drawing = svg2rlg("combine_rules_cache.svg") + renderPDF.drawToFile(drawing, save, dpi=240, fmt="PNG") + + if os.path.isfile("combine_rules_cache.svg"): + os.remove("combine_rules_cache.svg") + + if os.path.isfile("combine_rules_cache"): + os.remove("combine_rules_cache") + + if drop: + if len(rules) > 0: + return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))], total_rules + else: + return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(min(decision_tree.feature_importances_))], total_rules + else: + return rules, total_rules + + def query_dt_rules(self, x, y, parsed_rules=None): + total_count = len(y) + total_bad_rate = y.sum() / len(y) + + rules = pd.DataFrame() + + if isinstance(parsed_rules, pd.DataFrame): + parsed_rules = parsed_rules["组合策略"].unique() + + for rule in parsed_rules: + select_index = x.query(rule).index + if len(select_index) > 0: + y_select = y[select_index] + df = pd.Series() + df['组合策略'] = rule + df['好样本数'] = len(y_select) - y_select.sum() + df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate)) + df['坏样本数'] = y_select.sum() + df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate) + df['命中数'] = df['好样本数'] + df['坏样本数'] + df['命中率'] = df['命中数'] / total_count + df['坏率'] = df['坏样本数'] / df['命中数'] + df['样本整体坏率'] = total_bad_rate + df['LIFT值'] = df['坏率'] / df['样本整体坏率'] + else: + df = pd.Series({'组合策略': rule, '好样本数': 0, '好样本占比': 0., '坏样本数': 0, '坏样本占比': 0., '命中数': 0, '命中率': 0., '坏率': 0., '样本整体坏率': total_bad_rate, 'LIFT值': 0., }) + + rules = pd.concat([rules, pd.DataFrame(df).T]).reset_index(drop=True) + + return rules[self.describe_columns] + + def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None, sheet=None, figsize=(500, 350)): + if isinstance(sheet, Worksheet): + worksheet = sheet + else: + worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘") + + end_row, end_col = dataframe2excel(parsed_rules, self.writer, sheet_name=worksheet, start_row=end_row + 1, start_col=start_col, percent_cols=['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值'], condition_cols=["坏率", "LIFT值"]) + + if save is not None: + end_row, end_col = self.writer.insert_pic2sheet(worksheet, save, (end_row + 1, start_col), figsize=figsize) + + return end_row, end_col + + def fit(self, x, y=None, max_depth=2, lift=0., max_samples=1., min_score=None, verbose=False, *args, **kwargs): + """组合策略挖掘 + + :param x: 包含标签的数据集 + :param max_depth: 决策树最大深度,即最多组合的特征个数,默认 2 + :param lift: 组合策略最小的lift值,默认 0.,即全部组合策略 + :param max_samples: 每条组合策略的最大样本占比,默认 1.0,即全部组合策略 + :param min_score: 决策树拟合时最小的auc,如果不满足则停止后续生成决策树 + :param verbose: 是否调试模式,仅在 jupyter 环境有效 + :param kwargs: DecisionTreeClassifier 参数 + """ + worksheet = self.writer.get_sheet_by_name("策略详情") + + y = x[self.target] + X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y) + X_TE = X_TE.fillna(self.nan) + + self.feature_names = list(X_TE.columns) + + for i in range(self.max_iter): + decision_tree = DecisionTreeClassifier(max_depth=max_depth, *args, **kwargs) + decision_tree = decision_tree.fit(X_TE, y) + + if (min_score is not None and decision_tree.score(X_TE, y) < min_score) or len(X_TE.columns) < max_depth: + break + + try: + parsed_rules, remove, total_rules = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, verbose=verbose, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", drop=True) + + if len(parsed_rules) > 0: + self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True) + + if self.writer is not None: + if self.feature_map is not None and len(self.feature_map) > 0: + parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True) + self.end_row, _ = self.insert_dt_rules(parsed_rules, self.end_row, self.start_col, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", figsize=(500, 100 * total_rules), sheet=worksheet) + + X_TE = X_TE.drop(columns=remove) + self.decision_trees.append(decision_tree) + except: + import traceback + traceback.print_exc() + + if len(self.dt_rules) <= 0: + print(f"未挖掘到有效策略, 可以考虑适当调整预设的筛选参数, 降低 lift / 提高 max_samples, 当前筛选标准为: 提取 lift >= {lift} 且 max_samples <= {max_samples} 的策略") + + return self + + def transform(self, x, y=None): + y = x[self.target] + X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y) + X_TE = X_TE.fillna(self.nan) + if self.dt_rules is not None and len(self.dt_rules) > 0: + parsed_rules = self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules) + if self.feature_map is not None and len(self.feature_map) > 0: + parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True) + return parsed_rules + else: + return pd.DataFrame(columns=self.describe_columns) + + def report(self, valid=None, sheet="组合策略汇总", save=None): + """组合策略插入excel文档 + + :param valid: 验证数据集 + :param sheet: 保存组合策略的表格sheet名称 + :param save: 保存报告的文件路径 + + :return: 返回每个数据集组合策略命中情况 + """ + worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘") + + if sheet: + self.writer.workbook.move_sheet(sheet, -1) + + parsed_rules_train = self.dt_rules.copy() + + if self.feature_map is not None and len(self.feature_map) > 0: + parsed_rules_train["组合策略"] = parsed_rules_train["组合策略"].replace(self.feature_map, regex=True) + + self.end_row, _ = self.writer.insert_value2sheet(worksheet, (2 if sheet else self.end_row + 2, self.start_col), value="组合策略: 训练集", style="header_middle", end_space=(2 if sheet else self.end_row + 2, self.start_col + len(parsed_rules_train.columns))) + self.end_row, _ = self.insert_dt_rules(parsed_rules_train, self.end_row, self.start_col, sheet=worksheet) + outputs = (parsed_rules_train,) + + if valid is not None: + if isinstance(valid, pd.DataFrame) and len(valid) > 0: + parsed_rules_val = self.transform(valid) + self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value="组合策略: 验证集", style="header_middle", end_space=(self.end_row + 2, self.start_col + len(parsed_rules_val.columns))) + self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col, sheet=worksheet) + outputs = outputs + (parsed_rules_val,) + + elif isinstance(valid, (list, tuple)): + for i, dataset in enumerate(valid): + if isinstance(dataset, pd.DataFrame) and len(dataset) > 0: + parsed_rules_val = self.transform(dataset) + self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value=f"组合策略: 验证集 {i + 1}", style="header_middle", end_space=(self.end_row + 2, self.start_col + len(parsed_rules_val.columns))) + self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col, sheet=worksheet) + outputs = outputs + (parsed_rules_val,) + + elif isinstance(valid, dict): + for k, dataset in valid.items(): + if isinstance(dataset, pd.DataFrame) and len(dataset) > 0: + parsed_rules_val = self.transform(dataset) + self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value=f"组合策略: {k}", style="header_middle", end_space=(self.end_row + 2, self.start_col + len(parsed_rules_val.columns))) + self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col, sheet=worksheet) + outputs = outputs + (parsed_rules_val,) + + if save: + self.writer.save(save) + + return outputs