Skip to content

Commit

Permalink
add rule extractor method
Browse files Browse the repository at this point in the history
  • Loading branch information
itlubber committed Feb 29, 2024
1 parent a59e898 commit 79dee0b
Show file tree
Hide file tree
Showing 3 changed files with 499 additions and 0 deletions.
Binary file not shown.
141 changes: 141 additions & 0 deletions examples/rule_extraction.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../\")\n",
"\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from scorecardpipeline import *\n",
"from scorecardpipeline.rule_extraction import DecisionTreeRuleExtractor\n",
"\n",
"\n",
"# ---- Setup ----\n",
"# The seed and the logger flag are handed to init_setting; its return value is kept as `logger`.\n",
"# NOTE(review): presumably this also seeds numpy's global RNG used below -- confirm in scorecardpipeline.\n",
"logger = init_setting(seed=8888, logger=True)\n",
"\n",
"\n",
"# ---- Synthetic demo data ----\n",
"feature_map = {}\n",
"n_samples = 10000\n",
"ab = np.array(list('ABCDEFG'))\n",
"\n",
"# The columns are drawn in the order they are listed. Keep that order: changing it\n",
"# changes the sequence of random draws and therefore the generated data.\n",
"data = pd.DataFrame(\n",
"    {\n",
"        # integers in [0, 10)\n",
"        'A': np.random.randint(10, size=n_samples),\n",
"        # one of the seven letters in `ab`\n",
"        'B': ab[np.random.choice(7, size=n_samples)],\n",
"        # only the first two letters of `ab` (indices 0 and 1)\n",
"        'C': ab[np.random.choice(2, size=n_samples)],\n",
"        # floats from np.random.random\n",
"        '时间': np.random.random(size=n_samples),\n",
"        # binary label, 0 or 1\n",
"        'target': np.random.randint(2, size=n_samples),\n",
"    }\n",
")\n",
"\n",
"\n",
"# ---- Train / test split ----\n",
"# 30% of the rows go to `test`; the split is stratified on the target column.\n",
"train, test = train_test_split(\n",
"    data,\n",
"    test_size=0.3,\n",
"    stratify=data[\"target\"],\n",
")\n",
"\n",
"\n",
"# ---- Rule extraction ----\n",
"pdtr = DecisionTreeRuleExtractor(\n",
"    target=\"target\",\n",
"    feature_map=feature_map,\n",
"    max_iter=8,\n",
")\n",
"# Fit on the training split only.\n",
"pdtr.fit(\n",
"    train,\n",
"    lift=0.0,\n",
"    max_depth=2,\n",
"    max_samples=1.0,\n",
"    verbose=False,\n",
"    min_samples_split=8,\n",
"    min_samples_leaf=5,\n",
")\n",
"# Extra frames are passed for validation in the order test, train, data.\n",
"# `report` is indexed by the cells that follow this one.\n",
"report = pdtr.report(\n",
"    valid=[test, train, data],\n",
"    save=\"model_report/决策树组合策略挖掘.xlsx\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 243 0.0347 142 0.0401 101 0.0292 0.4156 0.4941 0.8411\n1 B <= 0.495 & 时间 <= 0.943 3758 0.5369 1933 0.5459 1825 0.5276 0.4856 0.4941 0.9828\n2 B > 0.495 & 时间 <= 0.877 2669 0.3813 1321 0.3731 1348 0.3897 0.5051 0.4941 1.0221\n3 B > 0.495 & 时间 > 0.877 330 0.0471 145 0.0409 185 0.0535 0.5606 0.4941 1.1345\n4 B <= 0.495 & A <= 8.5 3605 0.5150 1889 0.5335 1716 0.4961 0.4760 0.4941 0.9633\n5 B > 0.495 & A > 4.5 1496 0.2137 749 0.2115 747 0.2160 0.4993 0.4941 1.0105\n6 B > 0.495 & A <= 4.5 1503 0.2147 717 0.2025 786 0.2272 0.5230 0.4941 1.0583\n7 B <= 0.495 & A > 8.5 396 0.0566 186 0.0525 210 0.0607 0.5303 0.4941 1.0732\n8 A <= 8.5 & A <= 3.5 2772 0.3960 1432 0.4044 1340 0.3874 0.4834 0.4941 0.9783\n9 A <= 8.5 & A > 3.5 3526 0.5037 1775 0.5013 1751 0.5062 0.4966 0.4941 1.0050\n10 A > 8.5 & C <= 0.494 358 0.0511 174 0.0491 184 0.0532 0.5140 0.4941 1.0401\n11 A > 8.5 & C > 0.494 344 0.0491 160 0.0452 184 0.0532 0.5349 0.4941 1.0824",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>组合策略</th>\n <th>命中数</th>\n <th>命中率</th>\n <th>好样本数</th>\n <th>好样本占比</th>\n <th>坏样本数</th>\n <th>坏样本占比</th>\n <th>坏率</th>\n <th>样本整体坏率</th>\n <th>LIFT值</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B &lt;= 0.495 &amp; 时间 &gt; 0.943</td>\n <td>243</td>\n <td>0.0347</td>\n <td>142</td>\n <td>0.0401</td>\n <td>101</td>\n <td>0.0292</td>\n <td>0.4156</td>\n <td>0.4941</td>\n <td>0.8411</td>\n </tr>\n <tr>\n <th>1</th>\n <td>B &lt;= 0.495 &amp; 时间 &lt;= 0.943</td>\n <td>3758</td>\n <td>0.5369</td>\n <td>1933</td>\n <td>0.5459</td>\n <td>1825</td>\n <td>0.5276</td>\n <td>0.4856</td>\n <td>0.4941</td>\n <td>0.9828</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B &gt; 0.495 &amp; 时间 &lt;= 0.877</td>\n <td>2669</td>\n <td>0.3813</td>\n <td>1321</td>\n <td>0.3731</td>\n <td>1348</td>\n <td>0.3897</td>\n <td>0.5051</td>\n <td>0.4941</td>\n <td>1.0221</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B &gt; 0.495 &amp; 时间 &gt; 0.877</td>\n <td>330</td>\n <td>0.0471</td>\n <td>145</td>\n <td>0.0409</td>\n <td>185</td>\n <td>0.0535</td>\n <td>0.5606</td>\n <td>0.4941</td>\n <td>1.1345</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B &lt;= 0.495 &amp; A &lt;= 8.5</td>\n <td>3605</td>\n <td>0.5150</td>\n <td>1889</td>\n <td>0.5335</td>\n <td>1716</td>\n <td>0.4961</td>\n <td>0.4760</td>\n <td>0.4941</td>\n <td>0.9633</td>\n </tr>\n <tr>\n <th>5</th>\n <td>B &gt; 0.495 &amp; A &gt; 4.5</td>\n <td>1496</td>\n <td>0.2137</td>\n <td>749</td>\n <td>0.2115</td>\n <td>747</td>\n <td>0.2160</td>\n <td>0.4993</td>\n <td>0.4941</td>\n <td>1.0105</td>\n </tr>\n <tr>\n <th>6</th>\n <td>B &gt; 0.495 &amp; A &lt;= 4.5</td>\n <td>1503</td>\n <td>0.2147</td>\n <td>717</td>\n 
<td>0.2025</td>\n <td>786</td>\n <td>0.2272</td>\n <td>0.5230</td>\n <td>0.4941</td>\n <td>1.0583</td>\n </tr>\n <tr>\n <th>7</th>\n <td>B &lt;= 0.495 &amp; A &gt; 8.5</td>\n <td>396</td>\n <td>0.0566</td>\n <td>186</td>\n <td>0.0525</td>\n <td>210</td>\n <td>0.0607</td>\n <td>0.5303</td>\n <td>0.4941</td>\n <td>1.0732</td>\n </tr>\n <tr>\n <th>8</th>\n <td>A &lt;= 8.5 &amp; A &lt;= 3.5</td>\n <td>2772</td>\n <td>0.3960</td>\n <td>1432</td>\n <td>0.4044</td>\n <td>1340</td>\n <td>0.3874</td>\n <td>0.4834</td>\n <td>0.4941</td>\n <td>0.9783</td>\n </tr>\n <tr>\n <th>9</th>\n <td>A &lt;= 8.5 &amp; A &gt; 3.5</td>\n <td>3526</td>\n <td>0.5037</td>\n <td>1775</td>\n <td>0.5013</td>\n <td>1751</td>\n <td>0.5062</td>\n <td>0.4966</td>\n <td>0.4941</td>\n <td>1.0050</td>\n </tr>\n <tr>\n <th>10</th>\n <td>A &gt; 8.5 &amp; C &lt;= 0.494</td>\n <td>358</td>\n <td>0.0511</td>\n <td>174</td>\n <td>0.0491</td>\n <td>184</td>\n <td>0.0532</td>\n <td>0.5140</td>\n <td>0.4941</td>\n <td>1.0401</td>\n </tr>\n <tr>\n <th>11</th>\n <td>A &gt; 8.5 &amp; C &gt; 0.494</td>\n <td>344</td>\n <td>0.0491</td>\n <td>160</td>\n <td>0.0452</td>\n <td>184</td>\n <td>0.0532</td>\n <td>0.5349</td>\n <td>0.4941</td>\n <td>1.0824</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 96 0.0320 50 0.0329 46 0.0310 0.4792 0.4940 0.9700\n1 B <= 0.495 & 时间 <= 0.943 1604 0.5347 825 0.5435 779 0.5256 0.4857 0.4940 0.9831\n2 B > 0.495 & 时间 <= 0.877 1146 0.3820 570 0.3755 576 0.3887 0.5026 0.4940 1.0174\n3 B > 0.495 & 时间 > 0.877 154 0.0513 73 0.0481 81 0.0547 0.5260 0.4940 1.0647\n4 B <= 0.495 & A <= 8.5 1535 0.5117 791 0.5211 744 0.5020 0.4847 0.4940 0.9812\n5 B > 0.495 & A > 4.5 617 0.2057 306 0.2016 311 0.2099 0.5041 0.4940 1.0203\n6 B > 0.495 & A <= 4.5 683 0.2277 337 0.2220 346 0.2335 0.5066 0.4940 1.0255\n7 B <= 0.495 & A > 8.5 165 0.0550 84 0.0553 81 0.0547 0.4909 0.4940 0.9937\n8 A <= 8.5 & A <= 3.5 1214 0.4047 615 0.4051 599 0.4042 0.4934 0.4940 0.9988\n9 A <= 8.5 & A > 3.5 1487 0.4957 744 0.4901 743 0.5013 0.4997 0.4940 1.0115\n10 A > 8.5 & C <= 0.494 147 0.0490 81 0.0534 66 0.0445 0.4490 0.4940 0.9089\n11 A > 8.5 & C > 0.494 152 0.0507 78 0.0514 74 0.0499 0.4868 0.4940 0.9855",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>组合策略</th>\n <th>命中数</th>\n <th>命中率</th>\n <th>好样本数</th>\n <th>好样本占比</th>\n <th>坏样本数</th>\n <th>坏样本占比</th>\n <th>坏率</th>\n <th>样本整体坏率</th>\n <th>LIFT值</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B &lt;= 0.495 &amp; 时间 &gt; 0.943</td>\n <td>96</td>\n <td>0.0320</td>\n <td>50</td>\n <td>0.0329</td>\n <td>46</td>\n <td>0.0310</td>\n <td>0.4792</td>\n <td>0.4940</td>\n <td>0.9700</td>\n </tr>\n <tr>\n <th>1</th>\n <td>B &lt;= 0.495 &amp; 时间 &lt;= 0.943</td>\n <td>1604</td>\n <td>0.5347</td>\n <td>825</td>\n <td>0.5435</td>\n <td>779</td>\n <td>0.5256</td>\n <td>0.4857</td>\n <td>0.4940</td>\n <td>0.9831</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B &gt; 0.495 &amp; 时间 &lt;= 0.877</td>\n <td>1146</td>\n <td>0.3820</td>\n <td>570</td>\n <td>0.3755</td>\n <td>576</td>\n <td>0.3887</td>\n <td>0.5026</td>\n <td>0.4940</td>\n <td>1.0174</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B &gt; 0.495 &amp; 时间 &gt; 0.877</td>\n <td>154</td>\n <td>0.0513</td>\n <td>73</td>\n <td>0.0481</td>\n <td>81</td>\n <td>0.0547</td>\n <td>0.5260</td>\n <td>0.4940</td>\n <td>1.0647</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B &lt;= 0.495 &amp; A &lt;= 8.5</td>\n <td>1535</td>\n <td>0.5117</td>\n <td>791</td>\n <td>0.5211</td>\n <td>744</td>\n <td>0.5020</td>\n <td>0.4847</td>\n <td>0.4940</td>\n <td>0.9812</td>\n </tr>\n <tr>\n <th>5</th>\n <td>B &gt; 0.495 &amp; A &gt; 4.5</td>\n <td>617</td>\n <td>0.2057</td>\n <td>306</td>\n <td>0.2016</td>\n <td>311</td>\n <td>0.2099</td>\n <td>0.5041</td>\n <td>0.4940</td>\n <td>1.0203</td>\n </tr>\n <tr>\n <th>6</th>\n <td>B &gt; 0.495 &amp; A &lt;= 4.5</td>\n <td>683</td>\n <td>0.2277</td>\n <td>337</td>\n <td>0.2220</td>\n 
<td>346</td>\n <td>0.2335</td>\n <td>0.5066</td>\n <td>0.4940</td>\n <td>1.0255</td>\n </tr>\n <tr>\n <th>7</th>\n <td>B &lt;= 0.495 &amp; A &gt; 8.5</td>\n <td>165</td>\n <td>0.0550</td>\n <td>84</td>\n <td>0.0553</td>\n <td>81</td>\n <td>0.0547</td>\n <td>0.4909</td>\n <td>0.4940</td>\n <td>0.9937</td>\n </tr>\n <tr>\n <th>8</th>\n <td>A &lt;= 8.5 &amp; A &lt;= 3.5</td>\n <td>1214</td>\n <td>0.4047</td>\n <td>615</td>\n <td>0.4051</td>\n <td>599</td>\n <td>0.4042</td>\n <td>0.4934</td>\n <td>0.4940</td>\n <td>0.9988</td>\n </tr>\n <tr>\n <th>9</th>\n <td>A &lt;= 8.5 &amp; A &gt; 3.5</td>\n <td>1487</td>\n <td>0.4957</td>\n <td>744</td>\n <td>0.4901</td>\n <td>743</td>\n <td>0.5013</td>\n <td>0.4997</td>\n <td>0.4940</td>\n <td>1.0115</td>\n </tr>\n <tr>\n <th>10</th>\n <td>A &gt; 8.5 &amp; C &lt;= 0.494</td>\n <td>147</td>\n <td>0.0490</td>\n <td>81</td>\n <td>0.0534</td>\n <td>66</td>\n <td>0.0445</td>\n <td>0.4490</td>\n <td>0.4940</td>\n <td>0.9089</td>\n </tr>\n <tr>\n <th>11</th>\n <td>A &gt; 8.5 &amp; C &gt; 0.494</td>\n <td>152</td>\n <td>0.0507</td>\n <td>78</td>\n <td>0.0514</td>\n <td>74</td>\n <td>0.0499</td>\n <td>0.4868</td>\n <td>0.4940</td>\n <td>0.9855</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report[1]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": " 组合策略 命中数 命中率 好样本数 好样本占比 坏样本数 坏样本占比 坏率 样本整体坏率 LIFT值\n0 B <= 0.495 & 时间 > 0.943 242 0.0346 141 0.0398 101 0.0292 0.4174 0.4941 0.8446\n1 B <= 0.495 & 时间 <= 0.943 3759 0.5370 1934 0.5462 1825 0.5276 0.4855 0.4941 0.9825\n2 B > 0.495 & 时间 <= 0.877 2668 0.3811 1320 0.3728 1348 0.3897 0.5052 0.4941 1.0225\n3 B > 0.495 & 时间 > 0.877 331 0.0473 146 0.0412 185 0.0535 0.5589 0.4941 1.1311\n4 B <= 0.495 & A <= 8.5 3605 0.5150 1889 0.5335 1716 0.4961 0.4760 0.4941 0.9633\n5 B > 0.495 & A > 4.5 1496 0.2137 749 0.2115 747 0.2160 0.4993 0.4941 1.0105\n6 B > 0.495 & A <= 4.5 1503 0.2147 717 0.2025 786 0.2272 0.5230 0.4941 1.0583\n7 B <= 0.495 & A > 8.5 396 0.0566 186 0.0525 210 0.0607 0.5303 0.4941 1.0732\n8 A <= 8.5 & A <= 3.5 2772 0.3960 1432 0.4044 1340 0.3874 0.4834 0.4941 0.9783\n9 A <= 8.5 & A > 3.5 3526 0.5037 1775 0.5013 1751 0.5062 0.4966 0.4941 1.0050\n10 A > 8.5 & C <= 0.494 358 0.0511 174 0.0491 184 0.0532 0.5140 0.4941 1.0401\n11 A > 8.5 & C > 0.494 344 0.0491 160 0.0452 184 0.0532 0.5349 0.4941 1.0824",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>组合策略</th>\n <th>命中数</th>\n <th>命中率</th>\n <th>好样本数</th>\n <th>好样本占比</th>\n <th>坏样本数</th>\n <th>坏样本占比</th>\n <th>坏率</th>\n <th>样本整体坏率</th>\n <th>LIFT值</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B &lt;= 0.495 &amp; 时间 &gt; 0.943</td>\n <td>242</td>\n <td>0.0346</td>\n <td>141</td>\n <td>0.0398</td>\n <td>101</td>\n <td>0.0292</td>\n <td>0.4174</td>\n <td>0.4941</td>\n <td>0.8446</td>\n </tr>\n <tr>\n <th>1</th>\n <td>B &lt;= 0.495 &amp; 时间 &lt;= 0.943</td>\n <td>3759</td>\n <td>0.5370</td>\n <td>1934</td>\n <td>0.5462</td>\n <td>1825</td>\n <td>0.5276</td>\n <td>0.4855</td>\n <td>0.4941</td>\n <td>0.9825</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B &gt; 0.495 &amp; 时间 &lt;= 0.877</td>\n <td>2668</td>\n <td>0.3811</td>\n <td>1320</td>\n <td>0.3728</td>\n <td>1348</td>\n <td>0.3897</td>\n <td>0.5052</td>\n <td>0.4941</td>\n <td>1.0225</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B &gt; 0.495 &amp; 时间 &gt; 0.877</td>\n <td>331</td>\n <td>0.0473</td>\n <td>146</td>\n <td>0.0412</td>\n <td>185</td>\n <td>0.0535</td>\n <td>0.5589</td>\n <td>0.4941</td>\n <td>1.1311</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B &lt;= 0.495 &amp; A &lt;= 8.5</td>\n <td>3605</td>\n <td>0.5150</td>\n <td>1889</td>\n <td>0.5335</td>\n <td>1716</td>\n <td>0.4961</td>\n <td>0.4760</td>\n <td>0.4941</td>\n <td>0.9633</td>\n </tr>\n <tr>\n <th>5</th>\n <td>B &gt; 0.495 &amp; A &gt; 4.5</td>\n <td>1496</td>\n <td>0.2137</td>\n <td>749</td>\n <td>0.2115</td>\n <td>747</td>\n <td>0.2160</td>\n <td>0.4993</td>\n <td>0.4941</td>\n <td>1.0105</td>\n </tr>\n <tr>\n <th>6</th>\n <td>B &gt; 0.495 &amp; A &lt;= 4.5</td>\n <td>1503</td>\n <td>0.2147</td>\n <td>717</td>\n 
<td>0.2025</td>\n <td>786</td>\n <td>0.2272</td>\n <td>0.5230</td>\n <td>0.4941</td>\n <td>1.0583</td>\n </tr>\n <tr>\n <th>7</th>\n <td>B &lt;= 0.495 &amp; A &gt; 8.5</td>\n <td>396</td>\n <td>0.0566</td>\n <td>186</td>\n <td>0.0525</td>\n <td>210</td>\n <td>0.0607</td>\n <td>0.5303</td>\n <td>0.4941</td>\n <td>1.0732</td>\n </tr>\n <tr>\n <th>8</th>\n <td>A &lt;= 8.5 &amp; A &lt;= 3.5</td>\n <td>2772</td>\n <td>0.3960</td>\n <td>1432</td>\n <td>0.4044</td>\n <td>1340</td>\n <td>0.3874</td>\n <td>0.4834</td>\n <td>0.4941</td>\n <td>0.9783</td>\n </tr>\n <tr>\n <th>9</th>\n <td>A &lt;= 8.5 &amp; A &gt; 3.5</td>\n <td>3526</td>\n <td>0.5037</td>\n <td>1775</td>\n <td>0.5013</td>\n <td>1751</td>\n <td>0.5062</td>\n <td>0.4966</td>\n <td>0.4941</td>\n <td>1.0050</td>\n </tr>\n <tr>\n <th>10</th>\n <td>A &gt; 8.5 &amp; C &lt;= 0.494</td>\n <td>358</td>\n <td>0.0511</td>\n <td>174</td>\n <td>0.0491</td>\n <td>184</td>\n <td>0.0532</td>\n <td>0.5140</td>\n <td>0.4941</td>\n <td>1.0401</td>\n </tr>\n <tr>\n <th>11</th>\n <td>A &gt; 8.5 &amp; C &gt; 0.494</td>\n <td>344</td>\n <td>0.0491</td>\n <td>160</td>\n <td>0.0452</td>\n <td>184</td>\n <td>0.0532</td>\n <td>0.5349</td>\n <td>0.4941</td>\n <td>1.0824</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report[2]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading

0 comments on commit 79dee0b

Please sign in to comment.