From 5a6e2cb6e359540c22fc22b790c7cb181efdf231 Mon Sep 17 00:00:00 2001 From: Siddharth Krishna Date: Fri, 15 Dec 2023 17:36:17 +0000 Subject: [PATCH] Use times-info file for parameter mappings (#148) --------- Co-authored-by: Olexandr Balyk --- xl2times/__main__.py | 48 ++++++++++----------- xl2times/config/times-info.json | 75 +++++++++++++++++---------------- xl2times/datatypes.py | 46 +++++++++++++++++--- xl2times/transforms.py | 12 ++---- 4 files changed, 106 insertions(+), 75 deletions(-) diff --git a/xl2times/__main__.py b/xl2times/__main__.py index ffba993..1153f66 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -193,30 +193,30 @@ def compare( f"WARNING: Table {table_name} header incorrect, was" f" {data_cols}, should be {transformed_gt_cols}" ) - else: - # both are in string form so can be compared without any issues - gt_rows = set(tuple(row) for row in gt_table.to_numpy().tolist()) - data_rows = set(tuple(row) for row in data_table.to_numpy().tolist()) - total_correct_rows += len(gt_rows.intersection(data_rows)) - additional = data_rows - gt_rows - total_additional_rows += len(additional) - missing = gt_rows - data_rows - if len(additional) != 0 or len(missing) != 0: - print( - f"WARNING: Table {table_name} ({data_table.shape[0]} rows," - f" {gt_table.shape[0]} GT rows) contains {len(additional)}" - f" additional rows and is missing {len(missing)} rows" - ) - if len(additional) != 0: - DataFrame(additional).to_csv( - os.path.join(output_dir, table_name + "_additional.csv"), - index=False, - ) - if len(missing) != 0: - DataFrame(missing).to_csv( - os.path.join(output_dir, table_name + "_missing.csv"), - index=False, - ) + + # both are in string form so can be compared without any issues + gt_rows = set(tuple(row) for row in gt_table.to_numpy().tolist()) + data_rows = set(tuple(row) for row in data_table.to_numpy().tolist()) + total_correct_rows += len(gt_rows.intersection(data_rows)) + additional = data_rows - gt_rows + total_additional_rows += len(additional) + missing = gt_rows - data_rows + if len(additional) != 0 or len(missing) != 0: + print( + f"WARNING: Table {table_name} ({data_table.shape[0]} rows," + f" {gt_table.shape[0]} GT rows) contains {len(additional)}" + f" additional rows and is missing {len(missing)} rows" + ) + if len(additional) != 0: + DataFrame(additional).to_csv( + os.path.join(output_dir, table_name + "_additional.csv"), + index=False, + ) + if len(missing) != 0: + DataFrame(missing).to_csv( + os.path.join(output_dir, table_name + "_missing.csv"), + index=False, + ) print( f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present" diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json index e6152b0..a66e799 100644 --- a/xl2times/config/times-info.json +++ b/xl2times/config/times-info.json @@ -137,7 +137,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -155,7 +155,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -298,6 +298,7 @@ { "name": "B", "gams-cat": "parameter", + "type": "derived", "indexes": [ "YEAR" ], @@ -696,7 +697,7 @@ "mapping": [ "region", "commodity", - "commodity_group" + "other_indexes" ] }, { @@ -764,7 +765,7 @@ "COM_GRP" ], "mapping": [ - "commodity_group" + "other_indexes" ] }, { @@ -1031,7 +1032,7 @@ ], "mapping": [ "region", - "commodity_group", + "other_indexes", "commodity" ] }, @@ -1098,7 +1099,7 @@ ], "mapping": [ "region", - "commodity_group" + "other_indexes" ] }, { @@ -1141,7 +1142,7 @@ 
], "mapping": [ "region", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -1428,6 +1429,7 @@ { "name": "E", "gams-cat": "parameter", + "type": "derived", "indexes": [ "YEAR" ], @@ -1468,7 +1470,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice", "limtype" ] @@ -1548,7 +1550,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "commodity", "timeslice" ] @@ -1568,7 +1570,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "commodity", "timeslice" ] @@ -1624,8 +1626,8 @@ "region", "year", "process", - "commodity_group", - "commodity_group", + "other_indexes", + "other_indexes", "timeslice" ] }, @@ -1643,8 +1645,8 @@ "region", "year", "process", - "commodity_group", - "commodity_group" + "other_indexes", + "other_indexes" ] }, { @@ -1700,7 +1702,7 @@ "year", "process", "commodity", - "commodity_group", + "other_indexes", "timeslice", "limtype" ] @@ -1741,9 +1743,9 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "commodity", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -1772,11 +1774,11 @@ "gams-cat": "parameter", "indexes": [ "CUR", - "CUR" + "CUR2" ], "mapping": [ "currency", - "currency" + "other_indexes" ] }, { @@ -1900,7 +1902,7 @@ "region", "year", "commodity", - "commodity_group" + "other_indexes" ] }, { @@ -2206,7 +2208,7 @@ "region", "year", "process", - "commodity_group" + "other_indexes" ] }, { @@ -2223,7 +2225,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -2241,7 +2243,7 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] }, @@ -2506,7 +2508,7 @@ "region", "year", "process", - "tbd" + "other_indexes" ] }, { @@ -3052,7 +3054,7 @@ "region", "year", "process", - "commodity_group" + "other_indexes" ] }, { @@ -3067,7 +3069,7 @@ "mapping": [ "region", "process", - "commodity_group", + "other_indexes", "tbd" ] }, @@ -3111,7 +3113,7 @@ "mapping": [ "region", "process", - "commodity_group", + "other_indexes", "tbd" ] }, @@ -3747,8 +3749,8 @@ "region", "year", "process", - "commodity_group", - "commodity_group", + "other_indexes", + "other_indexes", "stage", "sow" ] @@ -3925,7 +3927,7 @@ "AGE" ], "mapping": [ - "stage", + "other_indexes", "year" ] }, @@ -4401,7 +4403,7 @@ ], "mapping": [ "region", - "commodity_group", + "other_indexes", "limtype" ] }, @@ -4417,7 +4419,7 @@ "mapping": [ "region", "year", - "commodity_group", + "other_indexes", "limtype" ] }, @@ -4752,7 +4754,7 @@ "region", "year", "process", - "commodity_group" + "other_indexes" ] }, { @@ -5102,6 +5104,7 @@ { "name": "VDA_EMCB", "gams-cat": "parameter", + "type": "special", "indexes": [ "REG", "YEAR", @@ -5111,7 +5114,7 @@ "mapping": [ "region", "year", - "commodity", + "other_indexes", "commodity" ] }, @@ -5129,8 +5132,8 @@ "region", "year", "process", - "commodity_group", + "other_indexes", "timeslice" ] } -] +] diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py index e0967f0..340df4d 100644 --- a/xl2times/datatypes.py +++ b/xl2times/datatypes.py @@ -132,7 +132,7 @@ class TimesXlMap: times_name: str times_cols: List[str] - xl_name: str + xl_name: str # TODO once we move away from times_mapping.txt, make this type Tag xl_cols: List[str] col_map: Dict[str, str] filter_rows: Dict[str, str] @@ -163,9 +163,11 @@ def __init__( veda_attr_defaults_file: str, ): self.times_xl_maps = Config._read_mappings(mapping_file) - self.dd_table_order, self.all_attributes = Config._process_times_info( - times_info_file - ) + ( + 
self.dd_table_order, + self.all_attributes, + param_mappings, + ) = Config._process_times_info(times_info_file) ( self.column_aliases, self.row_comment_chars, @@ -174,9 +176,16 @@ def __init__( self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults( veda_attr_defaults_file ) + # Migration in progress: use parameter mappings from times_info_file for now + name_to_map = {m.times_name: m for m in self.times_xl_maps} + for m in param_mappings: + name_to_map[m.times_name] = m + self.times_xl_maps = list(name_to_map.values()) @staticmethod - def _process_times_info(times_info_file: str) -> Tuple[Iterable[str], Set[str]]: + def _process_times_info( + times_info_file: str, + ) -> Tuple[Iterable[str], Set[str], List[TimesXlMap]]: # Read times_info_file and compute dd_table_order: # We output tables in order by categories: set, subset, subsubset, md-set, and parameter with resources.open_text("xl2times.config", times_info_file) as f: @@ -198,7 +207,32 @@ def _process_times_info(times_info_file: str) -> Tuple[Iterable[str], Set[str]]: for item in table_info if item["gams-cat"] == "parameter" } - return dd_table_order, attributes + + # Compute the mapping for attributes / parameters: + def create_mapping(entity): + assert entity["gams-cat"] == "parameter" + times_cols = entity["indexes"] + ["VALUE"] + xl_cols = entity["mapping"] + ["value"] # TODO map in json + col_map = dict(zip(times_cols, xl_cols)) + # If tag starts with UC, then the data is in UC_T, else FI_T + xl_name = Tag.uc_t if entity["name"].lower().startswith("uc") else Tag.fi_t + return TimesXlMap( + times_name=entity["name"], + times_cols=times_cols, + xl_name=xl_name, + xl_cols=xl_cols, + col_map=col_map, + filter_rows={"attribute": entity["name"]}, # TODO value:1? + ) + + param_mappings = [ + create_mapping(x) + for x in table_info + if x["gams-cat"] == "parameter" + and "type" not in x # TODO Generalise derived parameters? + ] + + return dd_table_order, attributes, param_mappings @staticmethod def _read_mappings(filename: str) -> List[TimesXlMap]: diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 14db425..64f27b8 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -248,12 +248,6 @@ def merge_tables( set(t.dataframe.columns) == set(group[0].dataframe.columns) for t in group ): cols = [(",".join(g.dataframe.columns), g) for g in group] - cols_groups = [ - (key, list(group)) - for key, group in groupby( - sorted(cols, key=lambda ct: ct[0]), lambda ct: ct[0] - ) - ] print( f"WARNING: Cannot merge tables with tag {key} as their columns are not identical" ) @@ -535,8 +529,6 @@ def process_user_constraint_table( "uc_desc", # Why is this in the index columns? # TODO remove these? "timeslice", - "commodity", - "process", ] data_columns = [x for x in df.columns if x not in known_columns] @@ -1837,6 +1829,8 @@ def make_str(df): lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 ) + cols_to_drop = [col for col in df.columns if col in query_columns] + df = expand_rows( datatypes.EmbeddedXlTable( tag="", @@ -1844,7 +1838,7 @@ def make_str(df): sheetname="", range="", filename="", - dataframe=df, + dataframe=df.drop(columns=cols_to_drop), ) ).dataframe
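
Note on the core mechanism of this patch: Config._process_times_info now derives one TimesXlMap per "gams-cat": "parameter" entry in times-info.json. The entry's "indexes" (plus VALUE) become the TIMES columns, its "mapping" (plus value) becomes the xl columns, UC_* attribute names are routed to the UC_T tag and everything else to FI_T, and entries carrying a "type" field (the derived B and E, the special VDA_EMCB) are skipped for now. The resulting mappings then override any same-named mappings read from times_mapping.txt. Below is a minimal standalone sketch of that conversion — the ACT_BND entry is illustrative rather than copied from times-info.json, and the plain dict plus the "~UC_T"/"~FI_T" strings stand in for the real TimesXlMap dataclass and the Tag.uc_t / Tag.fi_t enum values.

    # Standalone sketch of the create_mapping logic introduced by this patch.
    # The entry below is illustrative; the real data lives in
    # xl2times/config/times-info.json and is turned into TimesXlMap objects.

    entry = {
        "name": "ACT_BND",  # hypothetical example entry
        "gams-cat": "parameter",
        "indexes": ["REG", "YEAR", "PRC", "TS", "LIM"],
        "mapping": ["region", "year", "process", "timeslice", "limtype"],
    }

    def create_mapping(entity: dict) -> dict:
        assert entity["gams-cat"] == "parameter"
        times_cols = entity["indexes"] + ["VALUE"]
        xl_cols = entity["mapping"] + ["value"]
        # UC_* attributes live in UC_T tables; everything else comes from FI_T
        xl_name = "~UC_T" if entity["name"].lower().startswith("uc") else "~FI_T"
        return {
            "times_name": entity["name"],
            "times_cols": times_cols,
            "xl_name": xl_name,
            "xl_cols": xl_cols,
            "col_map": dict(zip(times_cols, xl_cols)),
            "filter_rows": {"attribute": entity["name"]},
        }

    print(create_mapping(entry))
    # -> {'times_name': 'ACT_BND', 'xl_name': '~FI_T',
    #     'col_map': {'REG': 'region', 'YEAR': 'year', ...}, ...}

The filter_rows={"attribute": name} part means each generated mapping only picks up FI_T/UC_T rows whose attribute column equals the parameter name, so the table extraction behaviour stays per-attribute, as with the hand-written mappings.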