
Commit

fix hist plot
itlubber committed May 24, 2024
1 parent 88f80af commit e7839f0
Showing 3 changed files with 69 additions and 49 deletions.
3 changes: 1 addition & 2 deletions requirements.txt
@@ -23,7 +23,6 @@ sklearn2pmml
pypmml
joblib>=0.12
six>=1.15.0
openpyxl==3.0.7
openpyxl>=3.0.7
sweetviz
numexpr
minepy
95 changes: 55 additions & 40 deletions scorecardpipeline/rule.py
Expand Up @@ -131,7 +131,7 @@ def predict(self, X: DataFrame, part=""): # dict预测对应part_dict 、字符

return result

def report(self, datasets, target="target", overdue="overdue", dpd=-1, del_grey=False, desc="", return_cols=None, prior_rules=None) -> pd.DataFrame:
def report(self, datasets: pd.DataFrame, target="target", overdue=None, dpd=None, del_grey=False, desc="", return_cols=None, prior_rules=None) -> pd.DataFrame:
"""规则效果报告表格输出
:param datasets: 数据集,需要包含 目标变量 或 逾期天数,当不包含目标变量时,会通过逾期天数计算目标变量,同时需要传入逾期定义的DPD天数
@@ -150,49 +150,64 @@ def report(self, datasets, target="target", overdue="overdue", dpd=-1, del_grey=
if desc is None or desc == "" and "指标含义" in return_cols:
return_cols.remove("指标含义")

datasets = datasets.copy()
if target not in datasets.columns and overdue in datasets.columns and dpd >= 0:
datasets[target] = (datasets[overdue] > dpd).astype(int)
rule_expr = self.expr

if isinstance(del_grey, bool) and del_grey:
grey = datasets[(datasets[overdue] > 0) & (datasets[overdue] <= dpd)].reset_index(drop=True)
datasets = datasets[(datasets[overdue] == 0) | (datasets[overdue] > dpd)].reset_index(drop=True)
def _report_one_rule(data, target, desc='', prior_rules=None):
if prior_rules:
prior_tables = prior_rules.report(data, target=target, desc=desc, prior_rules=None, return_cols=return_cols)
prior_tables["规则分类"] = "先验规则"
temp = data[~prior_rules.predict(data)]
rule_result = pd.DataFrame({rule_expr: np.where(self.predict(temp), "命中", "未命中"), "target": temp[target].tolist()})
else:
prior_tables = pd.DataFrame(columns=return_cols)
rule_result = pd.DataFrame({rule_expr: np.where(self.predict(datasets), "命中", "未命中"), "target": data[target].tolist()})

combiner = Combiner(target=target)
combiner.load({rule_expr: [["命中"], ["未命中"]]})
table = feature_bin_stats(rule_result, rule_expr, combiner=combiner, desc=desc, return_cols=return_cols)

# accuracy, precision, recall, F1 score
metrics = pd.DataFrame({
"分箱": ["命中", "未命中"],
"准确率": [accuracy_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), accuracy_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
"精确率": [precision_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), precision_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
"召回率": [recall_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), recall_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
"F1分数": [f1_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), f1_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
})
table = table.merge(metrics, on="分箱", how="left")

if prior_rules:
# prior_tables.insert(loc=0, column="规则分类", value=["先验规则"] * len(prior_tables))
table.insert(loc=0, column="规则分类", value=["验证规则"] * len(table))
table = pd.concat([prior_tables, table]).set_index(["规则分类"])
else:
table.insert(loc=0, column="规则分类", value=["验证规则"] * len(table))

rule_expr = self.expr
return table

if overdue is not None:
if not isinstance(overdue, list):
overdue = [overdue]

if not isinstance(dpd, list):
dpd = [dpd]

for i, col in enumerate(overdue):
for j, d in enumerate(dpd):
_datasets = datasets.copy()
_datasets[f"{col}_{d}"] = (_datasets[col] > d).astype(int)

if isinstance(del_grey, bool) and del_grey:
_datasets = _datasets.query(f"({col} > {d}) | ({col} == 0)").reset_index(drop=True)

if prior_rules:
prior_tables = prior_rules.report(datasets, target=target, overdue=overdue, dpd=dpd, del_grey=del_grey, desc=desc, return_cols=return_cols, prior_rules=None)
temp = datasets[~prior_rules.predict(datasets)]
rule_result = pd.DataFrame({rule_expr: np.where(self.predict(temp), "命中", "未命中"), "target": temp[target].tolist()})
if i == 0 and j == 0:
table = _report_one_rule(_datasets, f"{col}_{d}", desc=desc, prior_rules=prior_rules).rename(columns={"坏账改善": f"{col} {d}+改善"})
else:
_table = _report_one_rule(_datasets, f"{col}_{d}", desc=desc, prior_rules=prior_rules).rename(columns={"坏账改善": f"{col} {d}+改善"})
table = table.merge(_table[["规则分类", "分箱", f"{col} {d}+改善"]], on=["规则分类", "分箱"])
else:
prior_tables = pd.DataFrame(columns=return_cols)
rule_result = pd.DataFrame({rule_expr: np.where(self.predict(datasets), "命中", "未命中"), "target": datasets[target].tolist()})

combiner = Combiner(target=target)
combiner.load({rule_expr: [["命中"], ["未命中"]]})
table = feature_bin_stats(rule_result, rule_expr, combiner=combiner, desc=desc, return_cols=return_cols)

# accuracy, precision, recall, F1 score
metrics = pd.DataFrame({
"分箱": ["命中", "未命中"],
"准确率": [accuracy_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), accuracy_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
"精确率": [precision_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), precision_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
"召回率": [recall_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), recall_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
"F1分数": [f1_score(rule_result["target"], rule_result[rule_expr].map({"命中": 1, "未命中": 0})), f1_score(rule_result["target"], rule_result[rule_expr].map({"命中": 0, "未命中": 1}))],
})
table = table.merge(metrics, on="分箱", how="left")

# post-deployment gain evaluation for the rule
# bad-debt rate change: what share of bad customers the rule rejects, the bad-debt level after rejection, and how much the bad-debt rate improves relative to the original data
# total_bad, total = table["坏样本数"].sum(), table["样本总数"].sum()
# total_bad_rate = total_bad / total
# table["坏账改善"] = (total_bad_rate - (total_bad - table["坏样本数"]) / (total - table["样本总数"])) / total_bad_rate

if prior_rules:
prior_tables.insert(loc=0, column="规则分类", value=["先验规则"] * len(prior_tables))
# prior_tables["坏账改善"] = np.nan
table.insert(loc=0, column="规则分类", value=["验证规则"] * len(table))
table = pd.concat([prior_tables, table]).set_index(["规则分类"])
_datasets = datasets.copy()
table = _report_one_rule(_datasets, target, desc=desc, prior_rules=prior_rules)

return table

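For readers skimming this diff: the main behavioural change in report() is that overdue and dpd now accept lists (or None), and a binary target is derived per DPD threshold inside the loop. Below is a minimal, standalone sketch of just that derivation; the sample data and the overdue_days column name are made up for illustration and are not part of this commit.

import pandas as pd

datasets = pd.DataFrame({"overdue_days": [0, 10, 45, 0, 95]})
overdue, dpd = ["overdue_days"], [30, 60]

for col in overdue:
    for d in dpd:
        _datasets = datasets.copy()
        # 1 = bad (overdue beyond the DPD threshold d), 0 = good
        _datasets[f"{col}_{d}"] = (_datasets[col] > d).astype(int)
        # with del_grey=True the method also drops "grey" samples where 0 < overdue <= d
        _datasets = _datasets.query(f"({col} > {d}) | ({col} == 0)").reset_index(drop=True)
        print(_datasets)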
20 changes: 13 additions & 7 deletions scorecardpipeline/utils.py
@@ -20,7 +20,7 @@
from matplotlib import font_manager
from matplotlib.ticker import PercentFormatter, FuncFormatter
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve, auc, roc_auc_score
import toad
from optbinning import OptimalBinning

@@ -379,9 +379,16 @@ def ks_plot(score, target, title="", fontsize=14, figsize=(16, 8), save=None, co
:param anchor: position of the legend; defaults to 0.945. Tune it slightly (around 0.95) to match how the figure actually renders
:return: Figure
"""
if np.mean(score) < 0 or np.mean(score) > 1:
warnings.warn('Since the average of pred is not in [0,1], it is treated as credit score but not probability.')
auc_value = roc_auc_score(target, score)

if auc_value < 0.5:
warnings.warn('Score AUC is below 50%; assuming larger values correspond to a higher positive rate, the values are negated before plotting')
score = -score
auc_value = 1 - auc_value

# if np.mean(score) < 0 or np.mean(score) > 1:
# warnings.warn('Since the average of pred is not in [0,1], it is treated as credit score but not probability.')
# score = -score

df = pd.DataFrame({'label': target, 'pred': score})
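The mean-based heuristic removed above is replaced by an AUC-based orientation check: if the AUC comes out below 0.5, the score is assumed to rank in the opposite direction (for example a credit score where higher means safer) and is negated. A self-contained sketch of that check, using made-up scores:

import numpy as np
from sklearn.metrics import roc_auc_score

target = np.array([0, 0, 1, 0, 1, 1])
score = np.array([720, 690, 540, 660, 500, 480])  # credit-score style: higher = safer

auc_value = roc_auc_score(target, score)
if auc_value < 0.5:
    # larger values imply a lower positive rate, so flip the sign before plotting
    score = -score
    auc_value = 1 - auc_value

print(auc_value)  # >= 0.5 after the flip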

@@ -430,7 +437,6 @@ def n1(x):

# ROC curve
fpr, tpr, thresholds = roc_curve(target, score)
auc_value = toad.metrics.AUC(score, target)

ax[1].plot(fpr, tpr, color=colors[0], label="ROC Curve")
ax[1].stackplot(fpr, tpr, color=colors[0], alpha=0.25)
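The line removed in this hunk (auc_value = toad.metrics.AUC(score, target)) became redundant because the same quantity is now computed up front with roc_auc_score. A quick sketch, assuming toad is installed and using made-up data, showing the two calls agree:

import numpy as np
import toad
from sklearn.metrics import roc_auc_score

target = np.array([0, 1, 0, 1, 1, 0])
score = np.array([0.2, 0.8, 0.4, 0.7, 0.9, 0.1])

# same AUC from both; note the argument order differs between the two APIs
print(roc_auc_score(target, score))
print(toad.metrics.AUC(score, target))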
@@ -487,7 +493,7 @@ def hist_plot(score, y_true=None, figsize=(15, 10), bins=30, save=None, labels=[
:return: Figure
"""
target_unique = 1 if y_true is None else len(np.unique(y_true))

if y_true is not None:
if isinstance(labels, dict):
y_true = y_true.map(labels)
@@ -498,7 +504,7 @@
else:
y_true = None
hue_order = None

fig, ax = plt.subplots(1, 1, figsize=figsize)
palette = sns.diverging_palette(340, 267, n=target_unique, s=100, l=40)

@@ -517,7 +523,7 @@
ax.set_ylabel("样本占比", fontsize=fontsize)

ax.yaxis.set_major_formatter(PercentFormatter(1))

ax.set_title(f"{desc + ' ' if desc else '特征'}分布情况\n\n", fontsize=fontsize)

if y_true is not None:
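Since the commit is titled "fix hist plot", a usage sketch of hist_plot may help. This is a hedged example: the desc keyword and dict-style labels are inferred from the diff above, while the sample data and the save-to-file step are assumptions about the installed scorecardpipeline API rather than part of this commit.

import numpy as np
import pandas as pd
from scorecardpipeline.utils import hist_plot

rng = np.random.default_rng(0)
score = pd.Series(rng.beta(2, 5, size=1000), name="score")   # made-up model scores
y_true = pd.Series(rng.binomial(1, score), name="target")    # made-up labels

# a dict passed as labels is mapped onto y_true (see the isinstance check above)
fig = hist_plot(score, y_true=y_true, bins=30, labels={1: "bad", 0: "good"}, desc="score")
fig.savefig("hist_plot.png")  # the docstring says a Figure is returned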
