diff --git a/src/ccf_tools.py b/src/ccf_tools.py index 10ce6969b..f38fa428a 100644 --- a/src/ccf_tools.py +++ b/src/ccf_tools.py @@ -51,7 +51,11 @@ def no_parent(log_dict, cell_type, row_number): asct_b_tab = json.load(open(path)) as_invalid_terms = set() + as_temp_terms = set() + as_out_ub = set() ct_invalid_terms = set() + ct_temp_terms = set() + ct_out_ct = set() unique_terms = set() as_valid_terms = set() ct_valid_terms = set() @@ -87,16 +91,32 @@ def no_parent(log_dict, cell_type, row_number): as_valid_terms.add(next['id']) else: if not check_id(current['id']) and current['rdfs_label'] != '': + if current['id'] != '': + as_out_ub.add(current['id']) + else: + as_temp_terms.add(current['rdfs_label']) as_invalid_terms.add(current['rdfs_label']) unique_terms.add(current['rdfs_label']) elif not check_id(current['id']) and current['name'] != '': + if current['id'] != '': + as_out_ub.add(current['id']) + else: + as_temp_terms.add(current['name']) as_invalid_terms.add(current['name']) unique_terms.add(current['name']) if not check_id(next['id']) and next['rdfs_label'] != '': + if next['id'] != '': + as_out_ub.add(next['id']) + else: + as_temp_terms.add(next['rdfs_label']) as_invalid_terms.add(next['rdfs_label']) unique_terms.add(next['rdfs_label']) elif not check_id(next['id']) and next['name'] != '': + if next['id'] != '': + as_out_ub.add(next['id']) + else: + as_temp_terms.add(next['name']) as_invalid_terms.add(next['name']) unique_terms.add(next['name']) @@ -125,16 +145,32 @@ def no_parent(log_dict, cell_type, row_number): ct_valid_terms.add(next['id']) else: if not check_id(current['id']) and current['rdfs_label'] != '': + if current['id'] != '': + ct_out_ct.add(current['id']) + else: + ct_temp_terms.add(current['rdfs_label']) ct_invalid_terms.add(current['rdfs_label']) unique_terms.add(current['rdfs_label']) elif not check_id(current['id']) and current['name'] != '': + if current['id'] != '': + ct_out_ct.add(current['id']) + else: + ct_temp_terms.add(current['name']) ct_invalid_terms.add(current['name']) unique_terms.add(current['name']) if not check_id(next['id']) and next['rdfs_label'] != '': + if next['id'] != '': + ct_out_ct.add(next['id']) + else: + ct_temp_terms.add(next['rdfs_label']) ct_invalid_terms.add(next['rdfs_label']) unique_terms.add(next['rdfs_label']) elif not check_id(next['id']) and next['name'] != '': + if next['id'] != '': + ct_out_ct.add(next['id']) + else: + ct_temp_terms.add(next['name']) ct_invalid_terms.add(next['name']) unique_terms.add(next['name']) @@ -171,16 +207,32 @@ def no_parent(log_dict, cell_type, row_number): if check_id(last_ct['id']): ct_valid_terms.add(last_ct['id']) if not check_id(last_as['id']) and last_as['rdfs_label'] != '': + if last_as['id'] != '': + as_out_ub.add(last_as['id']) + else: + as_temp_terms.add(last_as['rdfs_label']) as_invalid_terms.add(last_as['rdfs_label']) unique_terms.add(last_as['rdfs_label']) elif not check_id(last_as['id']) and last_as['name'] != '': + if last_as['id'] != '': + as_out_ub.add(last_as['id']) + else: + as_temp_terms.add(last_as['name']) as_invalid_terms.add(last_as['name']) unique_terms.add(last_as['name']) if not check_id(last_ct['id']) and last_ct['rdfs_label'] != '': + if last_ct['id'] != '': + ct_out_ct.add(last_ct['id']) + else: + ct_temp_terms.add(last_ct['rdfs_label']) ct_invalid_terms.add(last_ct['rdfs_label']) unique_terms.add(last_ct['rdfs_label']) elif not check_id(last_ct['id']) and last_ct['name'] != '': + if last_ct['id'] != '': + ct_out_ct.add(last_ct['id']) + else: + ct_temp_terms.add(last_ct['name']) ct_invalid_terms.add(last_ct['name']) unique_terms.add(last_ct['name']) @@ -218,6 +270,7 @@ def no_parent(log_dict, cell_type, row_number): as_invalid_term_percent = 0 ct_invalid_terms_percent = 0 + invalid_terms_percent = 0 if len(as_valid_terms) + len(ct_invalid_terms) > 0: as_invalid_term_percent = round((len(as_invalid_terms)*100)/(len(as_valid_terms)+len(as_invalid_terms)), 2) if len(ct_valid_terms) + len(ct_invalid_terms) > 0: @@ -227,10 +280,14 @@ def no_parent(log_dict, cell_type, row_number): report_terms = { 'Table': '', - 'AS_valid_term_number': [len(as_valid_terms)], - 'AS_invalid_term_number': [len(as_invalid_terms)], + 'AS_valid_term_number': [len(as_valid_terms)], + 'AS_temp_term_number': [len(as_temp_terms)], + 'AS_out_ub': [len(as_out_ub)], + 'AS_invalid_term_number': [len(as_invalid_terms)], 'AS_invalid_term_percent': [as_invalid_term_percent], 'CT_valid_term_number': [len(ct_valid_terms)], + 'CT_temp_term_number': [len(ct_temp_terms)], + 'CT_out_ub': [len(ct_out_ct)], 'CT_invalid_term_number': [len(ct_invalid_terms)], 'CT_invalid_term_percent': [ct_invalid_terms_percent], 'invalid_terms_percent': [invalid_terms_percent] diff --git a/src/dashboard_generation.py b/src/dashboard_generation.py index 7c210ccd9..175eb0777 100644 --- a/src/dashboard_generation.py +++ b/src/dashboard_generation.py @@ -13,7 +13,8 @@ def clean_up(report): def add_link(report): for row in report.itertuples(): row_table = row.Table - report.at[row.Index, "Table"] = f"[{row_table}]({row_table}/README.md)" + if row_table != "Total": + report.at[row.Index, "Table"] = f"[{row_table}]({row_table}/README.md)" return report @@ -50,24 +51,38 @@ def add_color(report, report_type): def get_reports(date): BASE_PATH = "../reports/report_" - + ter_report = pd.read_csv(f"{BASE_PATH}terms_{date}.tsv", sep='\t') ter_report.sort_values(by=["Table"], inplace=True) + ter_report.loc["Total"] = ter_report.sum() + ter_report.loc[ter_report.index[-1], "Table"] = "Total" + ter_report.loc[ter_report.index[-1], "AS_invalid_term_percent"] = "" + ter_report.loc[ter_report.index[-1], "CT_invalid_term_percent"] = "" + ter_report.loc[ter_report.index[-1], "invalid_terms_percent"] = "" ter_report = add_color(ter_report.reset_index(drop=True), "terms") ter_report.rename(columns={ "AS_valid_term_number": "# VALID AS TERMS", + "AS_temp_term_number": "# AS TEMP TERMS", + "AS_out_ub": "# AS NOT UBERON TERMS", "AS_invalid_term_number": "# INVALID AS TERMS", "AS_invalid_term_percent": "% INVALID AS TERMS", "CT_valid_term_number": "# VALID CT TERMS", + "CT_temp_term_number": "# CT TEMP TERMS", + "CT_out_ub": "# CT NOT CL TERMS", "CT_invalid_term_number": "# INVALID CT TERMS", "CT_invalid_term_percent": "% INVALID CT TERMS", "invalid_terms_percent": "% INVALID TERMS" }, inplace=True) ter_report = add_link(ter_report) ter_report_md = tsv2md(ter_report) - + rel_report = pd.read_csv(f"{BASE_PATH}relationship_{date}.tsv", sep='\t') rel_report.sort_values(by=["Table"], inplace=True) + rel_report.loc["Total"] = rel_report.sum() + rel_report.loc[rel_report.index[-1], "Table"] = "Total" + rel_report.loc[rel_report.index[-1], "percent_invalid_AS-AS_relationship"] = "" + rel_report.loc[rel_report.index[-1], "percent_invalid_CT-CT_relationship"] = "" + rel_report.loc[rel_report.index[-1], "percent_invalid_CT-AS_relationship"] = "" rel_report = clean_up(rel_report.reset_index(drop=True)) rel_report = add_color(rel_report, "relations") rel_report.rename(columns={