Skip to content

Commit

Permalink
fix auto report
Browse files: browse the repository at this point in the history
  • Loading branch information
itlubber committed Mar 18, 2024
1 parent 656a7f6 commit 3e56343
Showing 1 changed file with 12 additions and 11 deletions.
23 changes: 12 additions & 11 deletions scorecardpipeline/auto_report.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,9 +20,10 @@
from .excel_writer import ExcelWriter, dataframe2excel


def auto_data_testing_report(data, features=None, target="target", date=None, data_summary_comment="", freq="M", excel_writer=None, sheet="分析报告", start_col=2, start_row=2, writer_params={}, bin_params={}, feature_map={}, corr=False, pictures=["bin", "ks", "hist"]):
def auto_data_testing_report(data, features=None, target="target", date=None, data_summary_comment="", freq="M", excel_writer=None, sheet="分析报告", start_col=2, start_row=2, writer_params={}, bin_params={}, feature_map={}, corr=False, pictures=["bin", "ks", "hist"], suffix=""):
"""自动数据测试报告,用于三方数据评估或自有评分效果评估
:param suffix: 用于避免未保存excel时,同名图片被覆盖的图片后缀名称
:param corr: 是否需要评估数值类变量之间的相关性,默认为 False,设置为 True 后会输出变量相关性图和表
:param pictures: 需要包含的图片,支持 ["ks", "hist", "bin"]
:param data: 需要评估的数据集,需要包含目标变量
Expand Down Expand Up @@ -69,9 +70,9 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
)
end_row, end_col = dataframe2excel(dataset_summary, writer, worksheet, percent_cols=["样本占比", "坏客户占比"], start_row=end_row + 2, title="样本总体分布情况")

distribution = distribution_plot(data, date=date, freq=freq, target=target, save=f"model_report/sample_time_distribution.png", result=True)
distribution = distribution_plot(data, date=date, freq=freq, target=target, save=f"model_report/sample_time_distribution{suffix}.png", result=True)
end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="样本时间分布情况", style="header", end_space=(end_row + 2, start_col + len(distribution.columns) - 1))
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/sample_time_distribution.png", (end_row + 1, start_col), figsize=(720, 370))
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/sample_time_distribution{suffix}.png", (end_row + 1, start_col), figsize=(720, 370))
end_row, end_col = dataframe2excel(distribution, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率"], condition_cols=["坏样本率"], start_row=end_row)
end_row += 2
else:
Expand All @@ -87,8 +88,8 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
# 变量相关性
if corr:
temp = data[features].select_dtypes(include="number")
corr_plot(temp, save=f"model_report/auto_report_corr_plot.png", annot=True if len(temp.columns) <= 10 else False, fontsize=14 if len(temp.columns) <= 10 else 12)
end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=["model_report/auto_report_corr_plot.png"], title="数值类变量相关性", figsize=(min(60 * len(temp.columns), 1080), min(55 * len(temp.columns), 950)), index=True, custom_cols=list(temp.columns), custom_format="0.00")
corr_plot(temp, save=f"model_report/auto_report_corr_plot{suffix}.png", annot=True if len(temp.columns) <= 10 else False, fontsize=14 if len(temp.columns) <= 10 else 12)
end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=[f"model_report/auto_report_corr_plot{suffix}.png"], title="数值类变量相关性", figsize=(min(60 * len(temp.columns), 1080), min(55 * len(temp.columns), 950)), index=True, custom_cols=list(temp.columns), custom_format="0.00")
end_row += 2

end_row, end_col = writer.insert_value2sheet(worksheet, (end_row, start_col), value="数值类特征 OR 评分效果评估", style="header_middle", end_space=(end_row, start_col + 19))
Expand All @@ -100,29 +101,29 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
score_table_train = Combiner.feature_bin_stats(temp, col, desc=f"{feature_map.get(col, col)}", target=target, **bin_params)
if pictures and len(pictures) > 0:
if "bin" in pictures:
bin_plot(score_table_train, desc=f"{feature_map.get(col, col)}", figsize=(10, 5), anchor=0.935, save=f"model_report/feature_bins_plot_{col}.png")
bin_plot(score_table_train, desc=f"{feature_map.get(col, col)}", figsize=(10, 5), anchor=0.935, save=f"model_report/feature_bins_plot_{col}{suffix}.png")
if temp[col].dtypes.name not in ['object', 'str', 'category']:
if "ks" in pictures:
_ = temp.dropna().reset_index(drop=True)
has_ks = len(_) > 0 and _[col].nunique() > 1 and _[target].nunique() > 1
if has_ks:
ks_plot(_[col], _[target], figsize=(10, 5), title=f"{feature_map.get(col, col)}", save=f"model_report/feature_ks_plot_{col}.png")
ks_plot(_[col], _[target], figsize=(10, 5), title=f"{feature_map.get(col, col)}", save=f"model_report/feature_ks_plot_{col}{suffix}.png")
if "hist" in pictures:
_ = temp.dropna().reset_index(drop=True)
if len(_) > 0:
hist_plot(_[col], y_true=_[target], figsize=(10, 6), desc=f"{feature_map.get(col, col)} 好客户 VS 坏客户", bins=30, anchor=1.11, fontsize=14, labels={0: "好客户", 1: "坏客户"}, save=f"model_report/feature_hist_plot_{col}.png")
hist_plot(_[col], y_true=_[target], figsize=(10, 6), desc=f"{feature_map.get(col, col)} 好客户 VS 坏客户", bins=30, anchor=1.11, fontsize=14, labels={0: "好客户", 1: "坏客户"}, save=f"model_report/feature_hist_plot_{col}{suffix}.png")

end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=f"数据字段: {feature_map.get(col, col)}", style="header", end_space=(end_row + 2, start_col + len(score_table_train.columns) - 1))

if pictures and len(pictures) > 0:
ks_row = end_row + 1
if "bin" in pictures:
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_bins_plot_{col}.png", (ks_row, start_col), figsize=(600, 350))
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_bins_plot_{col}{suffix}.png", (ks_row, start_col), figsize=(600, 350))
if temp[col].dtypes.name not in ['object', 'str', 'category'] and temp[col].isnull().sum() != len(temp):
if "ks" in pictures and has_ks:
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_ks_plot_{col}.png", (ks_row, end_col - 1), figsize=(600, 350))
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_ks_plot_{col}{suffix}.png", (ks_row, end_col - 1), figsize=(600, 350))
if "hist" in pictures:
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_hist_plot_{col}.png", (ks_row, end_col - 1), figsize=(600, 350))
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_hist_plot_{col}{suffix}.png", (ks_row, end_col - 1), figsize=(600, 350))

end_row, end_col = dataframe2excel(score_table_train, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "坏账改善", "累积LIFT值", "累积坏账改善"], condition_cols=["坏样本率", "LIFT值"], merge_column=["指标名称", "指标含义"], merge=True, fill=True, start_row=end_row)
except:
Expand Down

0 comments on commit 3e56343

Please sign in to comment.