From 5a6e2cb6e359540c22fc22b790c7cb181efdf231 Mon Sep 17 00:00:00 2001 From: Siddharth Krishna Date: Fri, 15 Dec 2023 17:36:17 +0000 Subject: [PATCH] Use times-info file for parameter mappings (#148) --------- Co-authored-by: Olexandr Balyk --- xl2times/__main__.py | 48 ++++++++++----------- xl2times/config/times-info.json | 75 +++++++++++++++++---------------- xl2times/datatypes.py | 46 +++++++++++++++++--- xl2times/transforms.py | 12 ++---- 4 files changed, 106 insertions(+), 75 deletions(-) diff --git a/xl2times/__main__.py b/xl2times/__main__.py index ffba993..1153f66 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -193,30 +193,30 @@ def compare( f"WARNING: Table {table_name} header incorrect, was" f" {data_cols}, should be {transformed_gt_cols}" ) - else: - # both are in string form so can be compared without any issues - gt_rows = set(tuple(row) for row in gt_table.to_numpy().tolist()) - data_rows = set(tuple(row) for row in data_table.to_numpy().tolist()) - total_correct_rows += len(gt_rows.intersection(data_rows)) - additional = data_rows - gt_rows - total_additional_rows += len(additional) - missing = gt_rows - data_rows - if len(additional) != 0 or len(missing) != 0: - print( - f"WARNING: Table {table_name} ({data_table.shape[0]} rows," - f" {gt_table.shape[0]} GT rows) contains {len(additional)}" - f" additional rows and is missing {len(missing)} rows" - ) - if len(additional) != 0: - DataFrame(additional).to_csv( - os.path.join(output_dir, table_name + "_additional.csv"), - index=False, - ) - if len(missing) != 0: - DataFrame(missing).to_csv( - os.path.join(output_dir, table_name + "_missing.csv"), - index=False, - ) + + # both are in string form so can be compared without any issues + gt_rows = set(tuple(row) for row in gt_table.to_numpy().tolist()) + data_rows = set(tuple(row) for row in data_table.to_numpy().tolist()) + total_correct_rows += len(gt_rows.intersection(data_rows)) + additional = data_rows - gt_rows + total_additional_rows += len(additional) + missing = gt_rows - data_rows + if len(additional) != 0 or len(missing) != 0: + print( + f"WARNING: Table {table_name} ({data_table.shape[0]} rows," + f" {gt_table.shape[0]} GT rows) contains {len(additional)}" + f" additional rows and is missing {len(missing)} rows" + ) + if len(additional) != 0: + DataFrame(additional).to_csv( + os.path.join(output_dir, table_name + "_additional.csv"), + index=False, + ) + if len(missing) != 0: + DataFrame(missing).to_csv( + os.path.join(output_dir, table_name + "_missing.csv"), + index=False, + ) print( f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present" diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json index e6152b0..a66e799 100644 --- a/xl2times/config/times-info.json +++ b/xl2times/config/times-info.json @@ -137,7 +137,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -155,7 +155,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -298,6 +298,7 @@ { "name": "B", "gams-cat": "parameter", + "type": "derived", "indexes": [ "YEAR" ], @@ -696,7 +697,7 @@ "mapping": [ "region", "commodity", - "commodity_group" + "other_indexes" ] }, { @@ -764,7 +765,7 @@ "COM_GRP" ], "mapping": [ - "commodity_group" + "other_indexes" ] }, { @@ -1031,7 +1032,7 @@ ], "mapping": [ "region", - "commodity_group", + "other_indexes", "commodity" ] }, @@ -1098,7 +1099,7 @@ ], "mapping": [ "region", - "commodity_group" + "other_indexes" ] }, { @@ -1141,7 +1142,7 @@ 
], "mapping": [ "region", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -1428,6 +1429,7 @@ { "name": "E", "gams-cat": "parameter", + "type": "derived", "indexes": [ "YEAR" ], @@ -1468,7 +1470,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice", "limtype" ] @@ -1548,7 +1550,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "commodity", "timeslice" ] @@ -1568,7 +1570,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "commodity", "timeslice" ] @@ -1624,8 +1626,8 @@ "region", "year", "process", - "commodity_group", - "commodity_group", + "other_indexes", + "other_indexes", "timeslice" ] }, @@ -1643,8 +1645,8 @@ "region", "year", "process", - "commodity_group", - "commodity_group" + "other_indexes", + "other_indexes" ] }, { @@ -1700,7 +1702,7 @@ "year", "process", "commodity", - "commodity_group", + "other_indexes", "timeslice", "limtype" ] @@ -1741,9 +1743,9 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "commodity", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -1772,11 +1774,11 @@ "gams-cat": "parameter", "indexes": [ "CUR", - "CUR" + "CUR2" ], "mapping": [ "currency", - "currency" + "other_indexes" ] }, { @@ -1900,7 +1902,7 @@ "region", "year", "commodity", - "commodity_group" + "other_indexes" ] }, { @@ -2206,7 +2208,7 @@ "region", "year", "process", - "commodity_group" + "other_indexes" ] }, { @@ -2223,7 +2225,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -2241,7 +2243,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -2506,7 +2508,7 @@ "region", "year", "process", - "tbd" + "other_indexes" ] }, { @@ -3052,7 +3054,7 @@ "region", "year", "process", - "commodity_group" + "other_indexes" ] }, { @@ -3067,7 +3069,7 @@ "mapping": [ "region", "process", - "commodity_group", + "other_indexes", "tbd" ] }, @@ -3111,7 +3113,7 @@ "mapping": [ "region", "process", - "commodity_group", + "other_indexes", "tbd" ] }, @@ -3747,8 +3749,8 @@ "region", "year", "process", - "commodity_group", - "commodity_group", + "other_indexes", + "other_indexes", "stage", "sow" ] @@ -3925,7 +3927,7 @@ "AGE" ], "mapping": [ - "stage", + "other_indexes", "year" ] }, @@ -4401,7 +4403,7 @@ ], "mapping": [ "region", - "commodity_group", + "other_indexes", "limtype" ] }, @@ -4417,7 +4419,7 @@ "mapping": [ "region", "year", - "commodity_group", + "other_indexes", "limtype" ] }, @@ -4752,7 +4754,7 @@ "region", "year", "process", - "commodity_group" + "other_indexes" ] }, { @@ -5102,6 +5104,7 @@ { "name": "VDA_EMCB", "gams-cat": "parameter", + "type": "special", "indexes": [ "REG", "YEAR", @@ -5111,7 +5114,7 @@ "mapping": [ "region", "year", - "commodity", + "other_indexes", "commodity" ] }, @@ -5129,8 +5132,8 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] } -] +] diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py index e0967f0..340df4d 100644 --- a/xl2times/datatypes.py +++ b/xl2times/datatypes.py @@ -132,7 +132,7 @@ class TimesXlMap: times_name: str times_cols: List[str] - xl_name: str + xl_name: str # TODO once we move away from times_mapping.txt, make this type Tag xl_cols: List[str] col_map: Dict[str, str] filter_rows: Dict[str, str] @@ -163,9 +163,11 @@ def __init__( veda_attr_defaults_file: str, ): self.times_xl_maps = Config._read_mappings(mapping_file) - self.dd_table_order, self.all_attributes = Config._process_times_info( - times_info_file - ) + ( + 
self.dd_table_order, + self.all_attributes, + param_mappings, + ) = Config._process_times_info(times_info_file) ( self.column_aliases, self.row_comment_chars, @@ -174,9 +176,16 @@ def __init__( self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults( veda_attr_defaults_file ) + # Migration in progress: use parameter mappings from times_info_file for now + name_to_map = {m.times_name: m for m in self.times_xl_maps} + for m in param_mappings: + name_to_map[m.times_name] = m + self.times_xl_maps = list(name_to_map.values()) @staticmethod - def _process_times_info(times_info_file: str) -> Tuple[Iterable[str], Set[str]]: + def _process_times_info( + times_info_file: str, + ) -> Tuple[Iterable[str], Set[str], List[TimesXlMap]]: # Read times_info_file and compute dd_table_order: # We output tables in order by categories: set, subset, subsubset, md-set, and parameter with resources.open_text("xl2times.config", times_info_file) as f: @@ -198,7 +207,32 @@ def _process_times_info(times_info_file: str) -> Tuple[Iterable[str], Set[str]]: for item in table_info if item["gams-cat"] == "parameter" } - return dd_table_order, attributes + + # Compute the mapping for attributes / parameters: + def create_mapping(entity): + assert entity["gams-cat"] == "parameter" + times_cols = entity["indexes"] + ["VALUE"] + xl_cols = entity["mapping"] + ["value"] # TODO map in json + col_map = dict(zip(times_cols, xl_cols)) + # If tag starts with UC, then the data is in UC_T, else FI_T + xl_name = Tag.uc_t if entity["name"].lower().startswith("uc") else Tag.fi_t + return TimesXlMap( + times_name=entity["name"], + times_cols=times_cols, + xl_name=xl_name, + xl_cols=xl_cols, + col_map=col_map, + filter_rows={"attribute": entity["name"]}, # TODO value:1? + ) + + param_mappings = [ + create_mapping(x) + for x in table_info + if x["gams-cat"] == "parameter" + and "type" not in x # TODO Generalise derived parameters? + ] + + return dd_table_order, attributes, param_mappings @staticmethod def _read_mappings(filename: str) -> List[TimesXlMap]: diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 14db425..64f27b8 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -248,12 +248,6 @@ def merge_tables( set(t.dataframe.columns) == set(group[0].dataframe.columns) for t in group ): cols = [(",".join(g.dataframe.columns), g) for g in group] - cols_groups = [ - (key, list(group)) - for key, group in groupby( - sorted(cols, key=lambda ct: ct[0]), lambda ct: ct[0] - ) - ] print( f"WARNING: Cannot merge tables with tag {key} as their columns are not identical" ) @@ -535,8 +529,6 @@ def process_user_constraint_table( "uc_desc", # Why is this in the index columns? # TODO remove these? "timeslice", - "commodity", - "process", ] data_columns = [x for x in df.columns if x not in known_columns] @@ -1837,6 +1829,8 @@ def make_str(df): lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 ) + cols_to_drop = [col for col in df.columns if col in query_columns] + df = expand_rows( datatypes.EmbeddedXlTable( tag="", @@ -1844,7 +1838,7 @@ def make_str(df): sheetname="", range="", filename="", - dataframe=df, + dataframe=df.drop(columns=cols_to_drop), ) ).dataframe
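
Note on the core mechanism of this patch: Config._process_times_info now derives one TimesXlMap per "gams-cat": "parameter" entry in times-info.json. The entry's "indexes" (plus VALUE) become the TIMES columns, its "mapping" (plus value) becomes the xl columns, UC_* attribute names are routed to the UC_T tag and everything else to FI_T, and entries carrying a "type" field (the derived B and E, the special VDA_EMCB) are skipped for now. The resulting mappings then override any same-named mappings read from times_mapping.txt. Below is a minimal standalone sketch of that conversion — the ACT_BND entry is illustrative rather than copied from times-info.json, and the plain dict plus the "~UC_T"/"~FI_T" strings stand in for the real TimesXlMap dataclass and the Tag.uc_t / Tag.fi_t enum values.

    # Standalone sketch of the create_mapping logic introduced by this patch.
    # The entry below is illustrative; the real data lives in
    # xl2times/config/times-info.json and is turned into TimesXlMap objects.

    entry = {
        "name": "ACT_BND",  # hypothetical example entry
        "gams-cat": "parameter",
        "indexes": ["REG", "YEAR", "PRC", "TS", "LIM"],
        "mapping": ["region", "year", "process", "timeslice", "limtype"],
    }

    def create_mapping(entity: dict) -> dict:
        assert entity["gams-cat"] == "parameter"
        times_cols = entity["indexes"] + ["VALUE"]
        xl_cols = entity["mapping"] + ["value"]
        # UC_* attributes live in UC_T tables; everything else comes from FI_T
        xl_name = "~UC_T" if entity["name"].lower().startswith("uc") else "~FI_T"
        return {
            "times_name": entity["name"],
            "times_cols": times_cols,
            "xl_name": xl_name,
            "xl_cols": xl_cols,
            "col_map": dict(zip(times_cols, xl_cols)),
            "filter_rows": {"attribute": entity["name"]},
        }

    print(create_mapping(entry))
    # -> {'times_name': 'ACT_BND', 'xl_name': '~FI_T',
    #     'col_map': {'REG': 'region', 'YEAR': 'year', ...}, ...}

The filter_rows={"attribute": name} part means each generated mapping only picks up FI_T/UC_T rows whose attribute column equals the parameter name, so the table extraction behaviour stays per-attribute, as with the hand-written mappings.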