Skip to content

Commit

Permalink
fix excel writer
Browse files Browse the repository at this point in the history
  • Loading branch information
itlubber committed Mar 15, 2024
1 parent 77db591 commit 724ba91
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 19 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ matplotlib
seaborn>=0.10.0
scipy>=1.6.0
statsmodels<0.14,>=0.13.2
scikit-learn
scikit-learn>=1.3.1
toad
scorecardpy
ortools
Expand Down
8 changes: 4 additions & 4 deletions scorecardpipeline/auto_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da

worksheet = writer.get_sheet_by_name(sheet)

end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="数据有效性分析报告", style="header_middle", end_space=(start_row, start_col + 17))
end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="数据有效性分析报告", style="header_middle", end_space=(start_row, start_col + 19))

if date is not None and date in data.columns:
if data[date].dtype.name in ["str", "object"]:
Expand Down Expand Up @@ -87,11 +87,11 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
# 变量相关性
if corr:
temp = data[features].select_dtypes(include="number")
corr_plot(temp, save=f"model_report/auto_report_corr_plot.png", annot=True)
end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=["model_report/auto_report_corr_plot.png"], title="数值类变量相关性", figsize=(700, 500), index=True, custom_cols=list(temp.columns), custom_format="0.00")
corr_plot(temp, save=f"model_report/auto_report_corr_plot.png", annot=True if len(temp.columns) <= 10 else False, fontsize=14 if len(temp.columns) <= 10 else 12)
end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=["model_report/auto_report_corr_plot.png"], title="数值类变量相关性", figsize=(min(60 * len(temp.columns), 1080), min(55 * len(temp.columns), 950)), index=True, custom_cols=list(temp.columns), custom_format="0.00")
end_row += 2

end_row, end_col = writer.insert_value2sheet(worksheet, (end_row, start_col), value="数值类特征 OR 评分效果评估", style="header_middle", end_space=(end_row, start_col + 17))
end_row, end_col = writer.insert_value2sheet(worksheet, (end_row, start_col), value="数值类特征 OR 评分效果评估", style="header_middle", end_space=(end_row, start_col + 19))
features_iter = tqdm(features)
for col in features_iter:
features_iter.set_postfix(feature=feature_map.get(col, col))
Expand Down
29 changes: 17 additions & 12 deletions scorecardpipeline/excel_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,13 +317,12 @@ def get_merge_rows(values, start_row):
merge_rows = None

def _iter_rows(df, header=True, index=True):
for i, row in enumerate(dataframe_to_rows(df, header=header, index=index)):
columns = df.columns.tolist()
columns = df.columns.tolist()
indexs = df.index.tolist()
for i, row in enumerate(dataframe_to_rows(df, header=header, index=False)):
if header:
if index:
if i == df.columns.nlevels:
continue
elif i < df.columns.nlevels:
if i < df.columns.nlevels:
if index:
if df.columns.nlevels > 1:
if i == df.columns.nlevels - 1:
yield list(df.index.names) + [c[i] for c in columns]
Expand All @@ -334,14 +333,20 @@ def _iter_rows(df, header=True, index=True):
else:
yield list(df.index.names) + columns
continue
else:
if df.columns.nlevels > 1 and i < df.columns.nlevels:
yield [c[i] for c in columns]
continue
else:
if df.columns.nlevels > 1 and i < df.columns.nlevels:
yield [c[i] for c in columns]
continue
if index:
yield list(indexs[i - df.columns.nlevels]) + row
else:
yield row
else:
if index and i == 0:
continue
yield row
if index:
yield list(indexs[i]) + row
else:
yield row

for i, row in enumerate(_iter_rows(df, header=header, index=index)):
if fill:
Expand Down
3 changes: 1 addition & 2 deletions scorecardpipeline/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,8 +644,6 @@ def feature_bin_stats(cls, data, feature, target="target", rules=None, method='s

table = table.replace(np.inf, 0).replace(-np.inf, 0)

table['指标IV值'] = table['分档IV值'].sum()

table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum())
table["坏账改善"] = (table["坏样本数"].sum() / table["样本总数"].sum() - (table["坏样本数"].sum() - table["坏样本数"]) / (table["样本总数"].sum() - table["样本总数"])) / (table["坏样本数"].sum() / table["样本总数"].sum())

Expand Down Expand Up @@ -688,6 +686,7 @@ def reverse_series(series):

table["分箱"] = table["分箱"].map(feature_bin_dict)
table = table.set_index(['指标名称', '指标含义', '分箱']).reindex([(feature, desc, b) for b in feature_bin_dict.values()]).fillna(0).reset_index()
table['指标IV值'] = table['分档IV值'].sum()

if return_cols:
table = table[[c for c in return_cols if c in table.columns]]
Expand Down
34 changes: 34 additions & 0 deletions scorecardpipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,3 +790,37 @@ def distribution_plot(data, date="date", target="target", save=None, figsize=(10
temp["坏样本率"] = temp["坏样本"] / temp["样本总数"]

return temp[["日期", "样本总数", "样本占比", "好样本", "好样本占比", "坏样本", "坏样本占比", "坏样本率"]]


def sample_lift_transformer(df, rule, target='target', sample_rate=0.7):
    """Compute the lift of the rejected group on the sampled data and on the original data.

    Meant for data that was drawn with a good:bad sampling scheme of
    ``sample_rate``:1. The rule splits ``df`` into rejected rows (rule hit)
    and passed rows (rule not hit); the lift of the rejected group is then
    reported both as observed on ``df`` and as re-scaled to the original,
    un-sampled population.

    No guard is applied for empty groups: if the rule rejects no rows, or
    ``df`` contains no bad samples, the divisions below are performed on zero
    denominators.

    :param df: sampled data; per the original author's note all columns are expected to be numeric
    :param rule: object exposing ``predict(df)`` that returns a boolean mask usable to index ``df`` (True = rejected)
    :param target: name of the target column; its sum is used as the number of bad samples
    :param sample_rate: sampling ratio applied to the good samples
    :return:
        lift_sam: float, lift of the rejected group on the sampled data
        lift_ori: float, lift of the rejected group on the original data
    """
    # Evaluate the rule once and reuse the mask for both partitions; calling
    # predict twice doubles the work and could split the data inconsistently
    # if the prediction were not repeatable.
    rejected_mask = rule.predict(df)
    rj_df = df[rejected_mask]
    ps_df = df[~rejected_mask]

    # Good / bad counts among the rejected rows.
    rj = len(rj_df)
    bad_rj = rj_df[target].sum()
    good_rj = rj - bad_rj

    # Good / bad counts among the passed rows.
    ps = len(ps_df)
    bad_ps = ps_df[target].sum()
    good_ps = ps - bad_ps

    # Lift on the sampled data: bad rate of the rejected group over the overall bad rate.
    lift_sam = (bad_rj / rj) / ((bad_rj + bad_ps) / (rj + ps))

    # Lift on the original data. Algebraically this is the same ratio as
    # above after dividing every good count by sample_rate (i.e. undoing the
    # down-sampling of good samples) and simplifying.
    lift_ori = bad_rj / (bad_rj + bad_ps) * (1 + (sample_rate * bad_ps + good_ps) / (sample_rate * bad_rj + good_rj))

    return lift_sam, lift_ori

0 comments on commit 724ba91

Please sign in to comment.