Skip to content

Commit

Permalink
fix excel writer & auto report method
Browse files (browse the repository at this point in the history)
  • Loading branch information
itlubber committed Mar 1, 2024
1 parent 70d01cb commit 6915a5d
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 21 deletions.
31 changes: 20 additions & 11 deletions scorecardpipeline/auto_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
from .excel_writer import ExcelWriter, dataframe2excel


def auto_data_testing_report(data, features=None, target="target", date=None, data_summary_comment="", freq="M", excel_writer=None, sheet="分析报告", start_col=2, start_row=2, writer_params={}, bin_params={}, feature_map={}, pictures=["bin", "ks", "hist"]):
def auto_data_testing_report(data, features=None, target="target", date=None, data_summary_comment="", freq="M", excel_writer=None, sheet="分析报告", start_col=2, start_row=2, writer_params={}, bin_params={}, feature_map={}, corr=False, pictures=["bin", "ks", "hist"]):
"""自动数据测试报告,用于三方数据评估或自有评分效果评估
:param corr: 是否需要评估数值类变量之间的相关性,默认为 False,设置为 True 后会输出变量相关性图和表
:param pictures: 需要包含的图片,支持 ["ks", "hist", "bin"]
:param data: 需要评估的数据集,需要包含目标变量
:param features: 需要进行分析的特征名称,支持单个字符串传入或列表传入
Expand Down Expand Up @@ -83,6 +84,13 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
end_row, end_col = dataframe2excel(dataset_summary, writer, worksheet, percent_cols=["样本占比", "坏客户占比"], start_row=end_row + 2, title="样本总体分布情况")
end_row += 2

# 变量相关性
if corr:
temp = data[features].select_dtypes(include="number")
corr_plot(temp, save=f"model_report/auto_report_corr_plot.png", annot=True)
end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=["model_report/auto_report_corr_plot.png"], title="数值类变量相关性", figsize=(700, 500), index=True, custom_cols=list(temp.columns), custom_format="0.00")
end_row += 2

end_row, end_col = writer.insert_value2sheet(worksheet, (end_row, start_col), value="数值类特征 OR 评分效果评估", style="header_middle", end_space=(end_row, start_col + 17))
features_iter = tqdm(features)
for col in features_iter:
Expand All @@ -93,23 +101,24 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
if pictures and len(pictures) > 0:
if "bin" in pictures:
bin_plot(score_table_train, desc=f"{feature_map.get(col, col)}", figsize=(10, 5), anchor=0.935, save=f"model_report/feature_bins_plot_{col}.png")
if "ks" in pictures:
_ = temp.dropna().reset_index(drop=True)
has_ks = len(_) > 0 and _[col].nunique() > 1 and _[target].nunique() > 1
if has_ks:
ks_plot(_[col], _[target], figsize=(10, 5), title=f"{feature_map.get(col, col)}", save=f"model_report/feature_ks_plot_{col}.png")
if "hist" in pictures:
_ = temp.dropna().reset_index(drop=True)
if len(_) > 0:
hist_plot(_[col], y_true=_[target], figsize=(10, 6), desc=f"{feature_map.get(col, col)} 好客户 VS 坏客户", bins=30, anchor=1.11, fontsize=14, labels={0: "好客户", 1: "坏客户"}, save=f"model_report/feature_hist_plot_{col}.png")
if temp[col].dtypes.name not in ['object', 'str', 'category']:
if "ks" in pictures:
_ = temp.dropna().reset_index(drop=True)
has_ks = len(_) > 0 and _[col].nunique() > 1 and _[target].nunique() > 1
if has_ks:
ks_plot(_[col], _[target], figsize=(10, 5), title=f"{feature_map.get(col, col)}", save=f"model_report/feature_ks_plot_{col}.png")
if "hist" in pictures:
_ = temp.dropna().reset_index(drop=True)
if len(_) > 0:
hist_plot(_[col], y_true=_[target], figsize=(10, 6), desc=f"{feature_map.get(col, col)} 好客户 VS 坏客户", bins=30, anchor=1.11, fontsize=14, labels={0: "好客户", 1: "坏客户"}, save=f"model_report/feature_hist_plot_{col}.png")

end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=f"数据字段: {feature_map.get(col, col)}", style="header", end_space=(end_row + 2, start_col + len(score_table_train.columns) - 1))

if pictures and len(pictures) > 0:
ks_row = end_row + 1
if "bin" in pictures:
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_bins_plot_{col}.png", (ks_row, start_col), figsize=(600, 350))
if temp[col].isnull().sum() != len(temp):
if temp[col].dtypes.name not in ['object', 'str', 'category'] and temp[col].isnull().sum() != len(temp):
if "ks" in pictures and has_ks:
end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_ks_plot_{col}.png", (ks_row, end_col - 1), figsize=(600, 350))
if "hist" in pictures:
Expand Down
29 changes: 19 additions & 10 deletions scorecardpipeline/excel_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,21 @@ def _iter_rows(df, header=True, index=True):
if index:
if i == df.columns.nlevels:
continue
elif i == df.columns.nlevels - 1:
yield list(df.index.names) + [c[i] for c in columns]
continue
elif i < df.columns.nlevels - 1:
yield [None] * df.index.nlevels + [c[i] for c in columns]
continue
elif i < df.columns.nlevels:
if df.columns.nlevels > 1:
if i == df.columns.nlevels - 1:
yield list(df.index.names) + [c[i] for c in columns]
continue
elif i < df.columns.nlevels - 1:
yield [None] * df.index.nlevels + [c[i] for c in columns]
continue
else:
if i == 0:
yield list(df.index.names) + columns
continue
else:
yield [None] * df.index.nlevels + [c[i] for c in columns]
continue
else:
if df.columns.nlevels > 1 and i < df.columns.nlevels:
yield [c[i] for c in columns]
Expand Down Expand Up @@ -702,24 +711,24 @@ def dataframe2excel(data, excel_writer, sheet_name=None, title=None, header=True

if percent_cols:
for c in [c for c in percent_cols if c in data.columns]:
conditional_column = get_column_letter(start_col + data.columns.get_loc(c))
conditional_column = get_column_letter(start_col + data.columns.get_loc(c) + data.index.nlevels if kwargs.get("index", False) else start_col + data.columns.get_loc(c))
writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(data)}:{conditional_column}{end_row - 1}", "0.00%")

if custom_cols:
for c in [c for c in custom_cols if c in data.columns]:
conditional_column = get_column_letter(start_col + data.columns.get_loc(c))
conditional_column = get_column_letter(start_col + data.columns.get_loc(c) + data.index.nlevels if kwargs.get("index", False) else start_col + data.columns.get_loc(c))
writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(data)}:{conditional_column}{end_row - 1}", custom_format)

if condition_cols:
for c in [c for c in condition_cols if c in data.columns]:
conditional_column = get_column_letter(start_col + data.columns.get_loc(c))
conditional_column = get_column_letter(start_col + data.columns.get_loc(c) + data.index.nlevels if kwargs.get("index", False) else start_col + data.columns.get_loc(c))
writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(data)}', f'{conditional_column}{end_row - 1}')

if color_cols:
for c in [c for c in color_cols if c in data.columns]:
try:
rule = ColorScaleRule(start_type='num', start_value=data[c].min(), start_color=theme_color, mid_type='num', mid_value=0., mid_color='FFFFFF', end_type='num', end_value=data[c].max(), end_color=theme_color)
conditional_column = get_column_letter(start_col + data.columns.get_loc(c))
conditional_column = get_column_letter(start_col + data.columns.get_loc(c) + data.index.nlevels if kwargs.get("index", False) else start_col + data.columns.get_loc(c))
worksheet.conditional_formatting.add(f"{conditional_column}{end_row - len(data)}:{conditional_column}{end_row - 1}", rule)
except:
import traceback
Expand Down

0 comments on commit 6915a5d

Please sign in to comment.