diff --git "a/examples/model_report/\345\206\263\347\255\226\346\240\221\347\273\204\345\220\210\347\255\226\347\225\245\346\214\226\346\216\230.xlsx" "b/examples/model_report/\345\206\263\347\255\226\346\240\221\347\273\204\345\220\210\347\255\226\347\225\245\346\214\226\346\216\230.xlsx"
new file mode 100644
index 0000000..0b51582
Binary files /dev/null and "b/examples/model_report/\345\206\263\347\255\226\346\240\221\347\273\204\345\220\210\347\255\226\347\225\245\346\214\226\346\216\230.xlsx" differ
diff --git a/examples/rule_extraction.ipynb b/examples/rule_extraction.ipynb
new file mode 100644
index 0000000..772f6cf
--- /dev/null
+++ b/examples/rule_extraction.ipynb
@@ -0,0 +1,141 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append(\"../\")\n",
+ "\n",
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from scorecardpipeline import *\n",
+ "from scorecardpipeline.rule_extraction import DecisionTreeRuleExtractor\n",
+ "\n",
+ "\n",
+ "logger = init_setting(seed=8888, logger=True)\n",
+ "\n",
+ "\n",
+ "feature_map = {}\n",
+ "n_samples = 10000\n",
+ "ab = np.array(list('ABCDEFG'))\n",
+ "\n",
+ "data = pd.DataFrame({\n",
+ " 'A': np.random.randint(10, size = n_samples),\n",
+ " 'B': ab[np.random.choice(7, n_samples)],\n",
+ " 'C': ab[np.random.choice(2, n_samples)],\n",
+ " '时间': np.random.random(size = n_samples),\n",
+ " 'target': np.random.randint(2, size = n_samples)\n",
+ "})\n",
+ "\n",
+ "\n",
+ "train, test = train_test_split(data, test_size=0.3, stratify=data[\"target\"])\n",
+ "\n",
+ "\n",
+ "pdtr = DecisionTreeRuleExtractor(target=\"target\", feature_map=feature_map, max_iter=8)\n",
+ "pdtr.fit(train, lift=0., max_depth=2, max_samples=1., verbose=False, min_samples_split=8, min_samples_leaf=5)\n",
+ "report = pdtr.report(valid=[test, train, data], save=\"model_report/决策树组合策略挖掘.xlsx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 243 0.0347 142 0.0401 101 0.0292 0.4156 0.4941 0.8411\n1 B <= 0.495 & 时间 <= 0.943 3758 0.5369 1933 0.5459 1825 0.5276 0.4856 0.4941 0.9828\n2 B > 0.495 & 时间 <= 0.877 2669 0.3813 1321 0.3731 1348 0.3897 0.5051 0.4941 1.0221\n3 B > 0.495 & 时间 > 0.877 330 0.0471 145 0.0409 185 0.0535 0.5606 0.4941 1.1345\n4 B <= 0.495 & A <= 8.5 3605 0.5150 1889 0.5335 1716 0.4961 0.4760 0.4941 0.9633\n5 B > 0.495 & A > 4.5 1496 0.2137 749 0.2115 747 0.2160 0.4993 0.4941 1.0105\n6 B > 0.495 & A <= 4.5 1503 0.2147 717 0.2025 786 0.2272 0.5230 0.4941 1.0583\n7 B <= 0.495 & A > 8.5 396 0.0566 186 0.0525 210 0.0607 0.5303 0.4941 1.0732\n8 A <= 8.5 & A <= 3.5 2772 0.3960 1432 0.4044 1340 0.3874 0.4834 0.4941 0.9783\n9 A <= 8.5 & A > 3.5 3526 0.5037 1775 0.5013 1751 0.5062 0.4966 0.4941 1.0050\n10 A > 8.5 & C <= 0.494 358 0.0511 174 0.0491 184 0.0532 0.5140 0.4941 1.0401\n11 A > 8.5 & C > 0.494 344 0.0491 160 0.0452 184 0.0532 0.5349 0.4941 1.0824",
+ "text/html": "
\n\n
\n \n \n | \n 组合策略 | \n 命中数 | \n 命中率 | \n 好样本数 | \n 好样本占比 | \n 坏样本数 | \n 坏样本占比 | \n 坏率 | \n 样本整体坏率 | \n LIFT值 | \n
\n \n \n \n 0 | \n B <= 0.495 & 时间 > 0.943 | \n 243 | \n 0.0347 | \n 142 | \n 0.0401 | \n 101 | \n 0.0292 | \n 0.4156 | \n 0.4941 | \n 0.8411 | \n
\n \n 1 | \n B <= 0.495 & 时间 <= 0.943 | \n 3758 | \n 0.5369 | \n 1933 | \n 0.5459 | \n 1825 | \n 0.5276 | \n 0.4856 | \n 0.4941 | \n 0.9828 | \n
\n \n 2 | \n B > 0.495 & 时间 <= 0.877 | \n 2669 | \n 0.3813 | \n 1321 | \n 0.3731 | \n 1348 | \n 0.3897 | \n 0.5051 | \n 0.4941 | \n 1.0221 | \n
\n \n 3 | \n B > 0.495 & 时间 > 0.877 | \n 330 | \n 0.0471 | \n 145 | \n 0.0409 | \n 185 | \n 0.0535 | \n 0.5606 | \n 0.4941 | \n 1.1345 | \n
\n \n 4 | \n B <= 0.495 & A <= 8.5 | \n 3605 | \n 0.5150 | \n 1889 | \n 0.5335 | \n 1716 | \n 0.4961 | \n 0.4760 | \n 0.4941 | \n 0.9633 | \n
\n \n 5 | \n B > 0.495 & A > 4.5 | \n 1496 | \n 0.2137 | \n 749 | \n 0.2115 | \n 747 | \n 0.2160 | \n 0.4993 | \n 0.4941 | \n 1.0105 | \n
\n \n 6 | \n B > 0.495 & A <= 4.5 | \n 1503 | \n 0.2147 | \n 717 | \n 0.2025 | \n 786 | \n 0.2272 | \n 0.5230 | \n 0.4941 | \n 1.0583 | \n
\n \n 7 | \n B <= 0.495 & A > 8.5 | \n 396 | \n 0.0566 | \n 186 | \n 0.0525 | \n 210 | \n 0.0607 | \n 0.5303 | \n 0.4941 | \n 1.0732 | \n
\n \n 8 | \n A <= 8.5 & A <= 3.5 | \n 2772 | \n 0.3960 | \n 1432 | \n 0.4044 | \n 1340 | \n 0.3874 | \n 0.4834 | \n 0.4941 | \n 0.9783 | \n
\n \n 9 | \n A <= 8.5 & A > 3.5 | \n 3526 | \n 0.5037 | \n 1775 | \n 0.5013 | \n 1751 | \n 0.5062 | \n 0.4966 | \n 0.4941 | \n 1.0050 | \n
\n \n 10 | \n A > 8.5 & C <= 0.494 | \n 358 | \n 0.0511 | \n 174 | \n 0.0491 | \n 184 | \n 0.0532 | \n 0.5140 | \n 0.4941 | \n 1.0401 | \n
\n \n 11 | \n A > 8.5 & C > 0.494 | \n 344 | \n 0.0491 | \n 160 | \n 0.0452 | \n 184 | \n 0.0532 | \n 0.5349 | \n 0.4941 | \n 1.0824 | \n
\n \n
\n
"
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "report[0]"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 96 0.0320 50 0.0329 46 0.0310 0.4792 0.4940 0.9700\n1 B <= 0.495 & 时间 <= 0.943 1604 0.5347 825 0.5435 779 0.5256 0.4857 0.4940 0.9831\n2 B > 0.495 & 时间 <= 0.877 1146 0.3820 570 0.3755 576 0.3887 0.5026 0.4940 1.0174\n3 B > 0.495 & 时间 > 0.877 154 0.0513 73 0.0481 81 0.0547 0.5260 0.4940 1.0647\n4 B <= 0.495 & A <= 8.5 1535 0.5117 791 0.5211 744 0.5020 0.4847 0.4940 0.9812\n5 B > 0.495 & A > 4.5 617 0.2057 306 0.2016 311 0.2099 0.5041 0.4940 1.0203\n6 B > 0.495 & A <= 4.5 683 0.2277 337 0.2220 346 0.2335 0.5066 0.4940 1.0255\n7 B <= 0.495 & A > 8.5 165 0.0550 84 0.0553 81 0.0547 0.4909 0.4940 0.9937\n8 A <= 8.5 & A <= 3.5 1214 0.4047 615 0.4051 599 0.4042 0.4934 0.4940 0.9988\n9 A <= 8.5 & A > 3.5 1487 0.4957 744 0.4901 743 0.5013 0.4997 0.4940 1.0115\n10 A > 8.5 & C <= 0.494 147 0.0490 81 0.0534 66 0.0445 0.4490 0.4940 0.9089\n11 A > 8.5 & C > 0.494 152 0.0507 78 0.0514 74 0.0499 0.4868 0.4940 0.9855",
+ "text/html": "\n\n
\n \n \n | \n 组合策略 | \n 命中数 | \n 命中率 | \n 好样本数 | \n 好样本占比 | \n 坏样本数 | \n 坏样本占比 | \n 坏率 | \n 样本整体坏率 | \n LIFT值 | \n
\n \n \n \n 0 | \n B <= 0.495 & 时间 > 0.943 | \n 96 | \n 0.0320 | \n 50 | \n 0.0329 | \n 46 | \n 0.0310 | \n 0.4792 | \n 0.4940 | \n 0.9700 | \n
\n \n 1 | \n B <= 0.495 & 时间 <= 0.943 | \n 1604 | \n 0.5347 | \n 825 | \n 0.5435 | \n 779 | \n 0.5256 | \n 0.4857 | \n 0.4940 | \n 0.9831 | \n
\n \n 2 | \n B > 0.495 & 时间 <= 0.877 | \n 1146 | \n 0.3820 | \n 570 | \n 0.3755 | \n 576 | \n 0.3887 | \n 0.5026 | \n 0.4940 | \n 1.0174 | \n
\n \n 3 | \n B > 0.495 & 时间 > 0.877 | \n 154 | \n 0.0513 | \n 73 | \n 0.0481 | \n 81 | \n 0.0547 | \n 0.5260 | \n 0.4940 | \n 1.0647 | \n
\n \n 4 | \n B <= 0.495 & A <= 8.5 | \n 1535 | \n 0.5117 | \n 791 | \n 0.5211 | \n 744 | \n 0.5020 | \n 0.4847 | \n 0.4940 | \n 0.9812 | \n
\n \n 5 | \n B > 0.495 & A > 4.5 | \n 617 | \n 0.2057 | \n 306 | \n 0.2016 | \n 311 | \n 0.2099 | \n 0.5041 | \n 0.4940 | \n 1.0203 | \n
\n \n 6 | \n B > 0.495 & A <= 4.5 | \n 683 | \n 0.2277 | \n 337 | \n 0.2220 | \n 346 | \n 0.2335 | \n 0.5066 | \n 0.4940 | \n 1.0255 | \n
\n \n 7 | \n B <= 0.495 & A > 8.5 | \n 165 | \n 0.0550 | \n 84 | \n 0.0553 | \n 81 | \n 0.0547 | \n 0.4909 | \n 0.4940 | \n 0.9937 | \n
\n \n 8 | \n A <= 8.5 & A <= 3.5 | \n 1214 | \n 0.4047 | \n 615 | \n 0.4051 | \n 599 | \n 0.4042 | \n 0.4934 | \n 0.4940 | \n 0.9988 | \n
\n \n 9 | \n A <= 8.5 & A > 3.5 | \n 1487 | \n 0.4957 | \n 744 | \n 0.4901 | \n 743 | \n 0.5013 | \n 0.4997 | \n 0.4940 | \n 1.0115 | \n
\n \n 10 | \n A > 8.5 & C <= 0.494 | \n 147 | \n 0.0490 | \n 81 | \n 0.0534 | \n 66 | \n 0.0445 | \n 0.4490 | \n 0.4940 | \n 0.9089 | \n
\n \n 11 | \n A > 8.5 & C > 0.494 | \n 152 | \n 0.0507 | \n 78 | \n 0.0514 | \n 74 | \n 0.0499 | \n 0.4868 | \n 0.4940 | \n 0.9855 | \n
\n \n
\n
"
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "report[1]"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 242 0.0346 141 0.0398 101 0.0292 0.4174 0.4941 0.8446\n1 B <= 0.495 & 时间 <= 0.943 3759 0.5370 1934 0.5462 1825 0.5276 0.4855 0.4941 0.9825\n2 B > 0.495 & 时间 <= 0.877 2668 0.3811 1320 0.3728 1348 0.3897 0.5052 0.4941 1.0225\n3 B > 0.495 & 时间 > 0.877 331 0.0473 146 0.0412 185 0.0535 0.5589 0.4941 1.1311\n4 B <= 0.495 & A <= 8.5 3605 0.5150 1889 0.5335 1716 0.4961 0.4760 0.4941 0.9633\n5 B > 0.495 & A > 4.5 1496 0.2137 749 0.2115 747 0.2160 0.4993 0.4941 1.0105\n6 B > 0.495 & A <= 4.5 1503 0.2147 717 0.2025 786 0.2272 0.5230 0.4941 1.0583\n7 B <= 0.495 & A > 8.5 396 0.0566 186 0.0525 210 0.0607 0.5303 0.4941 1.0732\n8 A <= 8.5 & A <= 3.5 2772 0.3960 1432 0.4044 1340 0.3874 0.4834 0.4941 0.9783\n9 A <= 8.5 & A > 3.5 3526 0.5037 1775 0.5013 1751 0.5062 0.4966 0.4941 1.0050\n10 A > 8.5 & C <= 0.494 358 0.0511 174 0.0491 184 0.0532 0.5140 0.4941 1.0401\n11 A > 8.5 & C > 0.494 344 0.0491 160 0.0452 184 0.0532 0.5349 0.4941 1.0824",
+ "text/html": "\n\n
\n \n \n | \n 组合策略 | \n 命中数 | \n 命中率 | \n 好样本数 | \n 好样本占比 | \n 坏样本数 | \n 坏样本占比 | \n 坏率 | \n 样本整体坏率 | \n LIFT值 | \n
\n \n \n \n 0 | \n B <= 0.495 & 时间 > 0.943 | \n 242 | \n 0.0346 | \n 141 | \n 0.0398 | \n 101 | \n 0.0292 | \n 0.4174 | \n 0.4941 | \n 0.8446 | \n
\n \n 1 | \n B <= 0.495 & 时间 <= 0.943 | \n 3759 | \n 0.5370 | \n 1934 | \n 0.5462 | \n 1825 | \n 0.5276 | \n 0.4855 | \n 0.4941 | \n 0.9825 | \n
\n \n 2 | \n B > 0.495 & 时间 <= 0.877 | \n 2668 | \n 0.3811 | \n 1320 | \n 0.3728 | \n 1348 | \n 0.3897 | \n 0.5052 | \n 0.4941 | \n 1.0225 | \n
\n \n 3 | \n B > 0.495 & 时间 > 0.877 | \n 331 | \n 0.0473 | \n 146 | \n 0.0412 | \n 185 | \n 0.0535 | \n 0.5589 | \n 0.4941 | \n 1.1311 | \n
\n \n 4 | \n B <= 0.495 & A <= 8.5 | \n 3605 | \n 0.5150 | \n 1889 | \n 0.5335 | \n 1716 | \n 0.4961 | \n 0.4760 | \n 0.4941 | \n 0.9633 | \n
\n \n 5 | \n B > 0.495 & A > 4.5 | \n 1496 | \n 0.2137 | \n 749 | \n 0.2115 | \n 747 | \n 0.2160 | \n 0.4993 | \n 0.4941 | \n 1.0105 | \n
\n \n 6 | \n B > 0.495 & A <= 4.5 | \n 1503 | \n 0.2147 | \n 717 | \n 0.2025 | \n 786 | \n 0.2272 | \n 0.5230 | \n 0.4941 | \n 1.0583 | \n
\n \n 7 | \n B <= 0.495 & A > 8.5 | \n 396 | \n 0.0566 | \n 186 | \n 0.0525 | \n 210 | \n 0.0607 | \n 0.5303 | \n 0.4941 | \n 1.0732 | \n
\n \n 8 | \n A <= 8.5 & A <= 3.5 | \n 2772 | \n 0.3960 | \n 1432 | \n 0.4044 | \n 1340 | \n 0.3874 | \n 0.4834 | \n 0.4941 | \n 0.9783 | \n
\n \n 9 | \n A <= 8.5 & A > 3.5 | \n 3526 | \n 0.5037 | \n 1775 | \n 0.5013 | \n 1751 | \n 0.5062 | \n 0.4966 | \n 0.4941 | \n 1.0050 | \n
\n \n 10 | \n A > 8.5 & C <= 0.494 | \n 358 | \n 0.0511 | \n 174 | \n 0.0491 | \n 184 | \n 0.0532 | \n 0.5140 | \n 0.4941 | \n 1.0401 | \n
\n \n 11 | \n A > 8.5 & C > 0.494 | \n 344 | \n 0.0491 | \n 160 | \n 0.0452 | \n 184 | \n 0.0532 | \n 0.5349 | \n 0.4941 | \n 1.0824 | \n
\n \n
\n
"
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "report[2]"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [],
+ "metadata": {
+ "collapsed": false
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/scorecardpipeline/rule_extraction.py b/scorecardpipeline/rule_extraction.py
new file mode 100644
index 0000000..1a68b50
--- /dev/null
+++ b/scorecardpipeline/rule_extraction.py
@@ -0,0 +1,358 @@
+# -*- coding: utf-8 -*-
+"""
+@Time : 2024/2/29 13:29
+@Author : itlubber
+@Site : itlubber.art
+"""
+import warnings
+import os
+import re
+import graphviz
+import dtreeviz
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib import font_manager
+from openpyxl.worksheet.worksheet import Worksheet
+
+import category_encoders as ce
+from optbinning import OptimalBinning
+from sklearn.tree import DecisionTreeClassifier
+
+from .utils import init_setting
+from .excel_writer import ExcelWriter, dataframe2excel
+
+
+class DecisionTreeRuleExtractor:
+    """Mine combination rules from a sequence of shallow decision trees.
+
+    :meth:`fit` repeatedly trains a ``DecisionTreeClassifier``, turns every leaf
+    into a rule such as ``"A <= 1.5 & B > 0.3"`` and removes one feature before
+    training the next tree.  The collected rules can be re-evaluated on other
+    data sets with :meth:`transform` and written to a workbook with
+    :meth:`report`.
+    """
+
+    def __init__(self, target="target", labels=None, feature_map=None, nan=-1., max_iter=128, writer=None, combiner=None, seed=None, theme_color="2639E9"):
+        """Create the rule extractor.
+
+        :param target: name of the label column in the data sets, default ``"target"``
+        :param labels: list of two class names (element 0: good samples, element 1: bad samples), passed to dtreeviz as ``class_names``; default ``["positive", "negative"]``
+        :param feature_map: mapping of variable name to its meaning, applied as a regex replacement on the rule text to improve readability; default ``{}``
+        :param nan: value used to fill missing values before fitting the trees, default ``-1``
+        :param max_iter: maximum number of trees to train; one feature is removed after each tree, default ``128``
+        :param writer: an existing ``ExcelWriter`` to append to; a new one is created with ``theme_color`` when omitted
+        :param combiner: not used by this class; accepted for backward compatibility
+        :param seed: when not ``None`` it is used as ``random_state`` of every tree unless ``random_state`` is passed to :meth:`fit`
+        :param theme_color: theme colour handed to the ``ExcelWriter`` created by this class
+        """
+        self.seed = seed
+        self.nan = nan
+        self.target = target
+        # Build fresh containers per instance; mutable default arguments would be shared.
+        self.labels = ["positive", "negative"] if labels is None else labels
+        self.theme_color = theme_color
+        self.feature_map = {} if feature_map is None else feature_map
+        self.decision_trees = []
+        self.max_iter = max_iter
+        self.target_enc = None
+        self.feature_names = None
+        self.dt_rules = pd.DataFrame()
+        self.end_row = 2
+        self.start_col = 2
+        self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"]
+
+        init_setting()
+
+        self.writer = writer if writer else ExcelWriter(theme_color=self.theme_color)
+
+    def _apply_feature_map(self, rules):
+        """Return ``rules`` with the rule text rewritten through ``feature_map`` (regex replace)."""
+        if self.feature_map is not None and len(self.feature_map) > 0:
+            rules = rules.copy()
+            rules["组合策略"] = rules["组合策略"].replace(self.feature_map, regex=True)
+        return rules
+
+    @staticmethod
+    def _describe_rule(rule, good, bad, total_count, total_bad_rate):
+        """Build one statistics row for ``rule``.
+
+        :param rule: rule text
+        :param good: number of good samples hit by the rule
+        :param bad: number of bad samples hit by the rule
+        :param total_count: number of samples in the data set
+        :param total_bad_rate: bad rate of the whole data set
+
+        :return: dict keyed by the report column names; a ratio whose denominator is zero is reported as ``0.``
+        """
+        def ratio(numerator, denominator):
+            return numerator / denominator if denominator else 0.
+
+        good, bad = int(good), int(bad)
+        hit = good + bad
+        bad_rate = ratio(bad, hit)
+
+        return {
+            "组合策略": rule,
+            "命中数": hit,
+            "命中率": ratio(hit, total_count),
+            "好样本数": good,
+            "好样本占比": ratio(good, total_count * (1 - total_bad_rate)),
+            "坏样本数": bad,
+            "坏样本占比": ratio(bad, total_count * total_bad_rate),
+            "坏率": bad_rate,
+            "样本整体坏率": total_bad_rate,
+            "LIFT值": ratio(bad_rate, total_bad_rate),
+        }
+
+    def encode_cat_features(self, X, y):
+        """Target-encode the object / categorical columns of ``X``.
+
+        The encoder is fitted on the first call and reused afterwards.  Encoded
+        columns keep their original names and are placed after the other columns.
+
+        :param X: feature frame without the label column
+        :param y: label series, used only when the encoder is fitted
+
+        :return: ``X`` itself when it has no categorical columns, otherwise a new frame
+        """
+        # Keep the frame's column order; going through a set made the order hash dependent.
+        cat_features = list(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns)
+
+        if len(cat_features) == 0:
+            return X
+
+        if self.target_enc is None:
+            self.target_enc = ce.TargetEncoder(cols=cat_features)
+            self.target_enc.fit(X[cat_features], y)
+            X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
+
+            # Remember category -> encoded value for every encoded column.
+            self.target_enc.target_mapping = {}
+            for col in cat_features:
+                mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
+                self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
+        else:
+            X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
+
+        X_TE = X_TE.drop(columns=cat_features)
+        return X_TE.rename(columns={f"{c}_target": c for c in cat_features})
+
+    def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count):
+        """Turn every leaf of a fitted binary classification tree into a rule row.
+
+        :param tree: fitted tree exposing ``tree_``
+        :param feature_names: feature names indexed like the columns the tree was fitted on
+        :param total_bad_rate: bad rate of the training data
+        :param total_count: number of training samples
+
+        :return: DataFrame with ``describe_columns``, sorted by ascending lift; empty when the tree has no split
+        """
+        tree_ = tree.tree_
+        left = tree_.children_left
+        right = tree_.children_right
+        rows = []
+
+        def recurse(node, conditions):
+            if tree_.feature[node] != -2:  # internal node: extend the path on both sides
+                name = feature_names[tree_.feature[node]]
+                thd = np.round(tree_.threshold[node], 3)
+                recurse(left[node], conditions + [f"{name} <= {thd}"])
+                recurse(right[node], conditions + [f"{name} > {thd}"])
+                return
+
+            if not conditions:  # the root is a leaf, so there is no rule to report
+                return
+
+            # NOTE(review): recent scikit-learn releases appear to store class
+            # fractions in ``tree_.value`` instead of counts -- confirm against the
+            # release notes.  Normalising and scaling by the node weight yields
+            # counts under either convention.
+            value = np.asarray(tree_.value[node][0], dtype=float)
+            counts = np.rint(value / value.sum() * tree_.weighted_n_node_samples[node]).astype(int)
+            rows.append(self._describe_rule(" & ".join(conditions), counts[0], counts[1], total_count, total_bad_rate))
+
+        recurse(0, [])
+
+        result = pd.DataFrame(rows, columns=self.describe_columns)
+        return result.sort_values("LIFT值", ascending=True).reset_index(drop=True)
+
+    @staticmethod
+    def _save_tree_figure(decision_tree_viz, save):
+        """Render ``decision_tree_viz`` to the PNG file ``save`` (best effort when graphviz is missing)."""
+        cache = "combine_rules_cache.svg"
+        directory = os.path.dirname(save)
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+
+        # Drop a picture left over from an earlier run so callers never pick up a stale file.
+        if os.path.isfile(save):
+            os.remove(save)
+
+        try:
+            try:
+                decision_tree_viz.save(cache)
+            except graphviz.backend.execute.ExecutableNotFound:
+                print("请确保您已安装 graphviz 程序并且正确配置了 PATH 路径。可参考: https://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft")
+                return
+
+            try:
+                import cairosvg
+                cairosvg.svg2png(url=cache, write_to=save, dpi=240)
+            except Exception:
+                # Fallback renderer; ``renderPM`` is the reportlab module whose
+                # ``drawToFile`` accepts ``fmt`` and ``dpi``.
+                from reportlab.graphics import renderPM
+                from svglib.svglib import svg2rlg
+                renderPM.drawToFile(svg2rlg(cache), save, fmt="PNG", dpi=240)
+        finally:
+            for leftover in (cache, "combine_rules_cache"):
+                if os.path.isfile(leftover):
+                    os.remove(leftover)
+
+    def select_dt_rules(self, decision_tree, x, y, lift=0., max_samples=1., save=None, verbose=False, drop=False):
+        """Extract the rules of one tree, filter them and optionally draw the tree.
+
+        :param decision_tree: fitted ``DecisionTreeClassifier``
+        :param x: feature frame the tree was fitted on
+        :param y: label series the tree was fitted on
+        :param lift: keep rules whose lift is at least this value
+        :param max_samples: keep rules whose hit rate is at most this value
+        :param save: path of the PNG picture of the tree; nothing is drawn when empty or when no rule survives
+        :param verbose: display the rules and the tree (requires IPython)
+        :param drop: also return the name of the feature to remove before the next tree
+
+        :return: ``(rules, total_rules)`` or, with ``drop=True``, ``(rules, feature, total_rules)``; ``feature`` is the most important feature when rules survived, the least important one otherwise
+        :raises RuntimeError: when the installed dtreeviz does not provide the expected API
+        """
+        rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y))
+        total_rules = len(rules)
+
+        try:
+            viz_model = dtreeviz.model(
+                decision_tree,
+                X_train=x,
+                y_train=y,
+                feature_names=x.columns,
+                target_name=self.target,
+                class_names=self.labels,
+            )
+        except AttributeError as err:
+            # Raising a plain string is itself a TypeError in Python 3.
+            raise RuntimeError("请检查 dtreeviz 版本") from err
+
+        rules = rules[(rules["LIFT值"] >= lift) & (rules["命中率"] <= max_samples)].reset_index(drop=True)
+
+        if len(rules) > 0:
+            decision_tree_viz = viz_model.view(
+                scale=1.5,
+                orientation='LR',
+                colors={
+                    "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
+                    "arrow": "#2639E9",
+                    'text_wedge': "#F76E6C",
+                    "pie": "#2639E9",
+                    "tile_alpha": 1,
+                    "legend_edge": "#FFFFFF",
+                },
+                ticks_fontsize=10,
+                label_fontsize=10,
+                fontname=plt.rcParams['font.family'],
+            )
+            if verbose:
+                from IPython.core.display_functions import display
+                if self.feature_map is not None and len(self.feature_map) > 0:
+                    display(rules.replace(self.feature_map, regex=True))
+                else:
+                    display(rules)
+                display(decision_tree_viz)
+            if save:
+                self._save_tree_figure(decision_tree_viz, save)
+
+        if not drop:
+            return rules, total_rules
+
+        importances = decision_tree.feature_importances_
+        position = np.argmax(importances) if len(rules) > 0 else np.argmin(importances)
+        return rules, decision_tree.feature_names_in_[position], total_rules
+
+    def query_dt_rules(self, x, y, parsed_rules=None):
+        """Evaluate rules on a data set.
+
+        :param x: encoded feature frame the rules are evaluated on with ``DataFrame.query``
+        :param y: label series aligned with ``x`` by index
+        :param parsed_rules: iterable of rule strings, or a DataFrame whose ``组合策略`` column holds them; ``None`` means no rules
+
+        :return: DataFrame with ``describe_columns``, one row per rule; rules without hits get all-zero statistics
+        """
+        total_count = len(y)
+        total_bad_rate = y.sum() / len(y)
+
+        if parsed_rules is None:
+            parsed_rules = []
+        elif isinstance(parsed_rules, pd.DataFrame):
+            parsed_rules = parsed_rules["组合策略"].unique()
+
+        rows = []
+        for rule in parsed_rules:
+            y_select = y.loc[x.query(rule).index]
+            bad = y_select.sum()
+            rows.append(self._describe_rule(rule, len(y_select) - bad, bad, total_count, total_bad_rate))
+
+        # Built in one go: concatenating inside the loop copies the frame every iteration.
+        return pd.DataFrame(rows, columns=self.describe_columns)
+
+    def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None, sheet=None, figsize=(500, 350)):
+        """Write a rule table, optionally followed by a picture, into a worksheet.
+
+        :param parsed_rules: rule table to write
+        :param end_row: the table starts on the row after this one
+        :param start_col: first column of the table
+        :param save: path of a picture inserted below the table, skipped when ``None``
+        :param sheet: a ``Worksheet`` or a sheet name; defaults to ``"决策树组合策略挖掘"``
+        :param figsize: size passed to ``insert_pic2sheet``
+
+        :return: ``(end_row, end_col)`` as returned by the last writer call
+        """
+        if isinstance(sheet, Worksheet):
+            worksheet = sheet
+        else:
+            worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘")
+
+        end_row, end_col = dataframe2excel(parsed_rules, self.writer, sheet_name=worksheet, start_row=end_row + 1, start_col=start_col, percent_cols=['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值'], condition_cols=["坏率", "LIFT值"])
+
+        if save is not None:
+            end_row, end_col = self.writer.insert_pic2sheet(worksheet, save, (end_row + 1, start_col), figsize=figsize)
+
+        return end_row, end_col
+
+    def fit(self, x, y=None, max_depth=2, lift=0., max_samples=1., min_score=None, verbose=False, *args, **kwargs):
+        """Mine combination rules.
+
+        :param x: data set including the label column
+        :param y: ignored; the label is read from ``x[self.target]``
+        :param max_depth: maximum tree depth, i.e. the maximum number of combined conditions, default 2
+        :param lift: minimum lift of a kept rule, default 0. (keep all)
+        :param max_samples: maximum hit rate of a kept rule, default 1.0 (keep all)
+        :param min_score: minimum training score (``DecisionTreeClassifier.score``, i.e. mean accuracy) of a tree; mining stops at the first tree below it
+        :param verbose: display intermediate results, only useful in a jupyter environment
+        :param kwargs: ``DecisionTreeClassifier`` parameters
+
+        :return: self
+        """
+        worksheet = self.writer.get_sheet_by_name("策略详情")
+
+        y = x[self.target]
+        X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
+        X_TE = X_TE.fillna(self.nan)
+
+        self.feature_names = list(X_TE.columns)
+
+        if self.seed is not None:
+            kwargs.setdefault("random_state", self.seed)
+
+        for i in range(self.max_iter):
+            # Checked before fitting: a frame without columns cannot be fitted at all.
+            n_features = len(X_TE.columns)
+            if n_features == 0 or (max_depth is not None and n_features < max_depth):
+                break
+
+            decision_tree = DecisionTreeClassifier(max_depth=max_depth, *args, **kwargs)
+            decision_tree = decision_tree.fit(X_TE, y)
+
+            if min_score is not None and decision_tree.score(X_TE, y) < min_score:
+                break
+
+            figure = f"model_report/auto_mining_rules/combiner_rules_{i}.png"
+
+            try:
+                parsed_rules, remove, total_rules = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, verbose=verbose, save=figure, drop=True)
+
+                if len(parsed_rules) > 0:
+                    self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True)
+
+                    if self.writer is not None:
+                        # The picture is missing when graphviz is not installed; still write the table.
+                        picture = figure if os.path.isfile(figure) else None
+                        self.end_row, _ = self.insert_dt_rules(self._apply_feature_map(parsed_rules), self.end_row, self.start_col, save=picture, figsize=(500, 100 * total_rules), sheet=worksheet)
+
+                self.decision_trees.append(decision_tree)
+            except Exception:
+                import traceback
+                traceback.print_exc()
+                # Still remove a feature, otherwise every remaining iteration would
+                # refit the same data and fail in the same way.
+                remove = X_TE.columns[int(np.argmax(decision_tree.feature_importances_))]
+
+            X_TE = X_TE.drop(columns=remove)
+
+        if len(self.dt_rules) <= 0:
+            print(f"未挖掘到有效策略, 可以考虑适当调整预设的筛选参数, 降低 lift / 提高 max_samples, 当前筛选标准为: 提取 lift >= {lift} 且 max_samples <= {max_samples} 的策略")
+
+        return self
+
+    def transform(self, x, y=None):
+        """Evaluate the mined rules on another data set.
+
+        :param x: data set including the label column
+        :param y: ignored; the label is read from ``x[self.target]``
+
+        :return: DataFrame with ``describe_columns``; empty when no rules were mined
+        """
+        y = x[self.target]
+        X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
+        X_TE = X_TE.fillna(self.nan)
+
+        if self.dt_rules is None or len(self.dt_rules) == 0:
+            return pd.DataFrame(columns=self.describe_columns)
+
+        return self._apply_feature_map(self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules))
+
+    def _write_rule_section(self, worksheet, title, rules, row):
+        """Write a header with ``title`` on ``row`` followed by the ``rules`` table; updates ``end_row``."""
+        self.end_row, _ = self.writer.insert_value2sheet(worksheet, (row, self.start_col), value=title, style="header_middle", end_space=(row, self.start_col + len(rules.columns)))
+        self.end_row, _ = self.insert_dt_rules(rules, self.end_row, self.start_col, sheet=worksheet)
+
+    def report(self, valid=None, sheet="组合策略汇总", save=None):
+        """Write the mined rules and their validation statistics into the workbook.
+
+        :param valid: validation data; a DataFrame, a list / tuple of DataFrames or a dict of name -> DataFrame (entries that are empty or not DataFrames are skipped)
+        :param sheet: name of the sheet the summary is written to
+        :param save: path the workbook is saved to, not saved when empty
+
+        :return: tuple with the training rule table followed by one table per evaluated data set
+        """
+        worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘")
+
+        if sheet:
+            self.writer.workbook.move_sheet(sheet, -1)
+
+        if len(self.dt_rules) > 0:
+            parsed_rules_train = self._apply_feature_map(self.dt_rules.copy())
+        else:
+            # Nothing mined yet: keep the report columns so the table still has a header.
+            parsed_rules_train = pd.DataFrame(columns=self.describe_columns)
+
+        self._write_rule_section(worksheet, "组合策略: 训练集", parsed_rules_train, 2 if sheet else self.end_row + 2)
+        outputs = (parsed_rules_train,)
+
+        if isinstance(valid, pd.DataFrame):
+            sections = [("组合策略: 验证集", valid)]
+        elif isinstance(valid, (list, tuple)):
+            sections = [(f"组合策略: 验证集 {i + 1}", dataset) for i, dataset in enumerate(valid)]
+        elif isinstance(valid, dict):
+            sections = [(f"组合策略: {k}", dataset) for k, dataset in valid.items()]
+        else:
+            sections = []
+
+        for title, dataset in sections:
+            if isinstance(dataset, pd.DataFrame) and len(dataset) > 0:
+                parsed_rules_val = self.transform(dataset)
+                self._write_rule_section(worksheet, title, parsed_rules_val, self.end_row + 2)
+                outputs = outputs + (parsed_rules_val,)
+
+        if save:
+            self.writer.save(save)
+
+        return outputs