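Fix auto report: add a `suffix` parameter appended to saved image filenames so same-named report images are not overwritten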

itlubber · Mar 18, 2024 · 3e56343 · 3e56343
1 parent 656a7f6
commit 3e56343
Showing 1 changed file with 12 additions and 11 deletions.
diff --git a/scorecardpipeline/auto_report.py b/scorecardpipeline/auto_report.py
@@ -20,9 +20,10 @@
 from .excel_writer import ExcelWriter, dataframe2excel
 
 
-def auto_data_testing_report(data, features=None, target="target", date=None, data_summary_comment="", freq="M", excel_writer=None, sheet="分析报告", start_col=2, start_row=2, writer_params={}, bin_params={}, feature_map={}, corr=False, pictures=["bin", "ks", "hist"]):
+def auto_data_testing_report(data, features=None, target="target", date=None, data_summary_comment="", freq="M", excel_writer=None, sheet="分析报告", start_col=2, start_row=2, writer_params={}, bin_params={}, feature_map={}, corr=False, pictures=["bin", "ks", "hist"], suffix=""):
     """自动数据测试报告，用于三方数据评估或自有评分效果评估
 
+    :param suffix: 用于避免未保存excel时，同名图片被覆盖的图片后缀名称
     :param corr: 是否需要评估数值类变量之间的相关性，默认为 False，设置为 True 后会输出变量相关性图和表
     :param pictures: 需要包含的图片，支持 ["ks", "hist", "bin"]
     :param data: 需要评估的数据集，需要包含目标变量
@@ -69,9 +70,9 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
         )
         end_row, end_col = dataframe2excel(dataset_summary, writer, worksheet, percent_cols=["样本占比", "坏客户占比"], start_row=end_row + 2, title="样本总体分布情况")
 
-        distribution = distribution_plot(data, date=date, freq=freq, target=target, save=f"model_report/sample_time_distribution.png", result=True)
+        distribution = distribution_plot(data, date=date, freq=freq, target=target, save=f"model_report/sample_time_distribution{suffix}.png", result=True)
         end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="样本时间分布情况", style="header", end_space=(end_row + 2, start_col + len(distribution.columns) - 1))
-        end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/sample_time_distribution.png", (end_row + 1, start_col), figsize=(720, 370))
+        end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/sample_time_distribution{suffix}.png", (end_row + 1, start_col), figsize=(720, 370))
         end_row, end_col = dataframe2excel(distribution, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率"], condition_cols=["坏样本率"], start_row=end_row)
         end_row += 2
     else:
@@ -87,8 +88,8 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
     # 变量相关性
     if corr:
         temp = data[features].select_dtypes(include="number")
-        corr_plot(temp, save=f"model_report/auto_report_corr_plot.png", annot=True if len(temp.columns) <= 10 else False, fontsize=14 if len(temp.columns) <= 10 else 12)
-        end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=["model_report/auto_report_corr_plot.png"], title="数值类变量相关性", figsize=(min(60 * len(temp.columns), 1080), min(55 * len(temp.columns), 950)), index=True, custom_cols=list(temp.columns), custom_format="0.00")
+        corr_plot(temp, save=f"model_report/auto_report_corr_plot{suffix}.png", annot=True if len(temp.columns) <= 10 else False, fontsize=14 if len(temp.columns) <= 10 else 12)
+        end_row, end_col = dataframe2excel(temp.corr(), writer, worksheet, color_cols=list(temp.columns), start_row=end_row, figures=[f"model_report/auto_report_corr_plot{suffix}.png"], title="数值类变量相关性", figsize=(min(60 * len(temp.columns), 1080), min(55 * len(temp.columns), 950)), index=True, custom_cols=list(temp.columns), custom_format="0.00")
         end_row += 2
 
     end_row, end_col = writer.insert_value2sheet(worksheet, (end_row, start_col), value="数值类特征 OR 评分效果评估", style="header_middle", end_space=(end_row, start_col + 19))
@@ -100,29 +101,29 @@ def auto_data_testing_report(data, features=None, target="target", date=None, da
             score_table_train = Combiner.feature_bin_stats(temp, col, desc=f"{feature_map.get(col, col)}", target=target, **bin_params)
             if pictures and len(pictures) > 0:
                 if "bin" in pictures:
-                    bin_plot(score_table_train, desc=f"{feature_map.get(col, col)}", figsize=(10, 5), anchor=0.935, save=f"model_report/feature_bins_plot_{col}.png")
+                    bin_plot(score_table_train, desc=f"{feature_map.get(col, col)}", figsize=(10, 5), anchor=0.935, save=f"model_report/feature_bins_plot_{col}{suffix}.png")
                 if temp[col].dtypes.name not in ['object', 'str', 'category']:
                     if "ks" in pictures:
                         _ = temp.dropna().reset_index(drop=True)
                         has_ks = len(_) > 0 and _[col].nunique() > 1 and _[target].nunique() > 1
                         if has_ks:
-                            ks_plot(_[col], _[target], figsize=(10, 5), title=f"{feature_map.get(col, col)}", save=f"model_report/feature_ks_plot_{col}.png")
+                            ks_plot(_[col], _[target], figsize=(10, 5), title=f"{feature_map.get(col, col)}", save=f"model_report/feature_ks_plot_{col}{suffix}.png")
                     if "hist" in pictures:
                         _ = temp.dropna().reset_index(drop=True)
                         if len(_) > 0:
-                            hist_plot(_[col], y_true=_[target], figsize=(10, 6), desc=f"{feature_map.get(col, col)} 好客户 VS 坏客户", bins=30, anchor=1.11, fontsize=14, labels={0: "好客户", 1: "坏客户"}, save=f"model_report/feature_hist_plot_{col}.png")
+                            hist_plot(_[col], y_true=_[target], figsize=(10, 6), desc=f"{feature_map.get(col, col)} 好客户 VS 坏客户", bins=30, anchor=1.11, fontsize=14, labels={0: "好客户", 1: "坏客户"}, save=f"model_report/feature_hist_plot_{col}{suffix}.png")
 
             end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value=f"数据字段: {feature_map.get(col, col)}", style="header", end_space=(end_row + 2, start_col + len(score_table_train.columns) - 1))
 
             if pictures and len(pictures) > 0:
                 ks_row = end_row + 1
                 if "bin" in pictures:
-                    end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_bins_plot_{col}.png", (ks_row, start_col), figsize=(600, 350))
+                    end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_bins_plot_{col}{suffix}.png", (ks_row, start_col), figsize=(600, 350))
                 if temp[col].dtypes.name not in ['object', 'str', 'category'] and temp[col].isnull().sum() != len(temp):
                     if "ks" in pictures and has_ks:
-                        end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_ks_plot_{col}.png", (ks_row, end_col - 1), figsize=(600, 350))
+                        end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_ks_plot_{col}{suffix}.png", (ks_row, end_col - 1), figsize=(600, 350))
                     if "hist" in pictures:
-                        end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_hist_plot_{col}.png", (ks_row, end_col - 1), figsize=(600, 350))
+                        end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/feature_hist_plot_{col}{suffix}.png", (ks_row, end_col - 1), figsize=(600, 350))
 
             end_row, end_col = dataframe2excel(score_table_train, writer, worksheet, percent_cols=["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "坏账改善", "累积LIFT值", "累积坏账改善"], condition_cols=["坏样本率", "LIFT值"], merge_column=["指标名称", "指标含义"], merge=True, fill=True, start_row=end_row)
         except: