From f386a36446e65574f749aacb892062e95721230a Mon Sep 17 00:00:00 2001
From: Olexandr Balyk <ob@facilitate.energy>
Date: Thu, 30 Nov 2023 23:42:01 -0500
Subject: [PATCH] Update comments removal transforms (#140)

This PR adds per-tag, per-column comment row indicators (the
`row_ignore_symbol` entries in `veda-tags.json`) to `Config`. They are
then used by `remove_comment_rows` to identify and remove comment rows.

---------

Co-authored-by: Siddharth Krishna <siddharth-krishna@users.noreply.github.com>
---
 times_reader/__main__.py           |  4 +-
 times_reader/config/veda-tags.json | 23 ++++++++
 times_reader/datatypes.py          | 22 +++++--
 times_reader/transforms.py         | 94 +++++++++++++++++-------------
 4 files changed, 96 insertions(+), 47 deletions(-)

diff --git a/times_reader/__main__.py b/times_reader/__main__.py
index 741199d..43dd924 100644
--- a/times_reader/__main__.py
+++ b/times_reader/__main__.py
@@ -57,10 +57,12 @@ def convert_xl_to_times(
         transforms.normalize_tags_columns,
         transforms.remove_fill_tables,
         transforms.remove_empty_tables,
-        lambda config, tables: [transforms.remove_comment_rows(t) for t in tables],
         lambda config, tables: [transforms.remove_comment_cols(t) for t in tables],
         transforms.remove_tables_with_formulas,  # slow
         transforms.normalize_column_aliases,
+        lambda config, tables: [
+            transforms.remove_comment_rows(config, t) for t in tables
+        ],
         transforms.apply_postnormalisation_fixes,
         transforms.generate_dummy_processes,
         transforms.process_transform_insert_variants,
diff --git a/times_reader/config/veda-tags.json b/times_reader/config/veda-tags.json
index d8bc2ec..0a2fe5a 100644
--- a/times_reader/config/veda-tags.json
+++ b/times_reader/config/veda-tags.json
@@ -25,6 +25,29 @@
       ]
     }
   },
+  {
+    "tag_name": "comemi",
+    "tag_allowed_in": [
+      "BY",
+      "SubRES"
+    ],
+    "tag_fields": {
+      "fields_names": [
+        "commname"
+      ],
+      "fields_aliases": [
+        [
+          "commodity"
+        ]
+      ],
+      "row_ignore_symbol": [
+        [
+          "\\I:",
+          "*"
+        ]
+      ]
+    }
+  },
   {
     "tag_name": "currencies",
     "tag_allowed_in": [
diff --git a/times_reader/datatypes.py b/times_reader/datatypes.py
index b58e9d7..b893ca5 100644
--- a/times_reader/datatypes.py
+++ b/times_reader/datatypes.py
@@ -148,6 +148,8 @@ class Config:
     all_attributes: Set[str]
     # For each tag, this dictionary maps each column alias to the normalized name
     column_aliases: Dict[Tag, Dict[str, str]]
+    # For each tag, this dictionary specifies comment row symbols by column name
+    row_comment_chars: Dict[Tag, Dict[str, list]]
     veda_attr_defaults: Dict[str, Dict[str, list]]
 
     def __init__(
@@ -161,7 +163,9 @@ def __init__(
         self.dd_table_order, self.all_attributes = Config._process_times_info(
             times_info_file
         )
-        self.column_aliases = Config._read_veda_tags_info(veda_tags_file)
+        self.column_aliases, self.row_comment_chars = Config._read_veda_tags_info(
+            veda_tags_file
+        )
         self.veda_attr_defaults = Config._read_veda_attr_defaults(
             veda_attr_defaults_file
         )
@@ -263,15 +267,19 @@ def _read_mappings(filename: str) -> List[TimesXlMap]:
         return mappings
 
     @staticmethod
-    def _read_veda_tags_info(veda_tags_file: str) -> Dict[Tag, Dict[str, str]]:
-        # Read veda_tags_file
+    def _read_veda_tags_info(
+        veda_tags_file: str,
+    ) -> Tuple[Dict[Tag, Dict[str, str]], Dict[Tag, Dict[str, list]]]:
         with resources.open_text("times_reader.config", veda_tags_file) as f:
             veda_tags_info = json.load(f)
         column_aliases = {}
+        row_comment_chars = {}
+
         for tag_info in veda_tags_info:
             if "tag_fields" in tag_info:
                 # The file stores the tag name in lowercase, and without the ~
                 tag_name = "~" + tag_info["tag_name"].upper()
+                # Process column aliases:
                 column_aliases[tag_name] = {}
                 names = tag_info["tag_fields"]["fields_names"]
                 aliases = tag_info["tag_fields"]["fields_aliases"]
@@ -279,7 +287,13 @@ def _read_veda_tags_info(veda_tags_file: str) -> Dict[Tag, Dict[str, str]]:
                 for name, aliases in zip(names, aliases):
                     for alias in aliases:
                         column_aliases[tag_name][alias] = name
-        return column_aliases
+                # Process comment chars:
+                row_comment_chars[tag_name] = {}
+                chars = tag_info["tag_fields"]["row_ignore_symbol"]
+                assert len(names) == len(chars)
+                for name, chars_list in zip(names, chars):
+                    row_comment_chars[tag_name][name] = chars_list
+        return column_aliases, row_comment_chars
 
     @staticmethod
     def _read_veda_attr_defaults(
diff --git a/times_reader/transforms.py b/times_reader/transforms.py
index f4ef4a6..57d4bd6 100644
--- a/times_reader/transforms.py
+++ b/times_reader/transforms.py
@@ -39,15 +39,13 @@
 }
 
 
-def remove_comment_rows(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedXlTable:
+def remove_comment_rows(
+    config: datatypes.Config, table: datatypes.EmbeddedXlTable
+) -> datatypes.EmbeddedXlTable:
     """
-    Return a modified copy of 'table' where rows with cells containig '*'
-    or '\I:' in their first or third columns have been deleted. These characters
-    are defined in https://iea-etsap.org/docs/Documentation_for_the_TIMES_Model-Part-IV.pdf
-    as comment identifiers (pag 15).
-    TODO: we believe the deletion of the third column is a bug. We tried deleting that part
-    of the code but we failed to parse a row as a consequence. We need to investigate why,
-    fix that parsing and remove the deletion of the third column.
+    Return a modified copy of 'table' where rows with cells starting with symbols
+    indicating a comment row in any column have been deleted. Comment row symbols
+    are column-name dependent and are specified in the config.
 
     :param table:       Table object in EmbeddedXlTable format.
     :return:            Table object in EmbeddedXlTable format without comment rows.
@@ -56,35 +54,39 @@ def remove_comment_rows(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX
         return table
 
     df = table.dataframe.copy()
-    comment_rows = list(
-        locate(
-            df.iloc[:, 0],
-            lambda cell: isinstance(cell, str)
-            and (cell.startswith("*") or cell.startswith("\\I:")),
-        )
-    )
-    df.drop(index=comment_rows, inplace=True)
-    df.reset_index(drop=True, inplace=True)
 
-    # TODO: the deletion of this third column is a bug. Removing it causes the
-    # program to fail parse all rows. We need to fix the parsing so it can read
-    # all rows and remove this code block.
-    if df.shape[1] > 1:
-        comment_rows = list(
-            locate(
-                df.iloc[:, 1],
-                lambda cell: isinstance(cell, str) and cell.startswith("*"),
+    tag = table.tag.split(":")[0]
+
+    if tag in config.row_comment_chars:
+        chars_by_colname = config.row_comment_chars[tag]
+    else:
+        return table
+
+    comment_rows = set()
+
+    for colname in df.columns:
+        if colname in chars_by_colname.keys():
+            comment_rows.update(
+                list(
+                    locate(
+                        df[colname],
+                        lambda cell: isinstance(cell, str)
+                        and (cell.startswith(tuple(chars_by_colname[colname]))),
+                    )
+                )
             )
-        )
-        df.drop(index=comment_rows, inplace=True)
-        df.reset_index(drop=True, inplace=True)
+
+    df.drop(index=list(comment_rows), inplace=True)
+    df.reset_index(drop=True, inplace=True)
+
     return replace(table, dataframe=df)
 
 
 def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedXlTable:
     """
     Return a modified copy of 'table' where columns with labels starting with '*'
-    have been deleted.
+    have been deleted. Assumes that any leading spaces in the original input table
+    have been removed.
 
     :param table:       Table object in EmbeddedXlTable format.
     :return:            Table object in EmbeddedXlTable format without comment columns.
@@ -92,14 +94,16 @@ def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX
     if table.dataframe.size == 0:
         return table
 
-    comment_cols = list(
-        locate(
-            table.dataframe.columns,
-            lambda cell: isinstance(cell, str) and cell.startswith("*"),
-        )
-    )
-    df = table.dataframe.drop(table.dataframe.columns[comment_cols], axis=1)
+    comment_cols = [
+        colname
+        for colname in table.dataframe.columns
+        if isinstance(colname, str) and colname.startswith("*")
+    ]
+
+    df = table.dataframe.drop(comment_cols, axis=1)
     df.reset_index(drop=True, inplace=True)
+
+    # TODO: should we move the code below to a separate transform?
     seen = set()
     dupes = [x for x in df.columns if x in seen or seen.add(x)]
     if len(dupes) > 0:
@@ -232,6 +236,7 @@ def apply_postnormalisation_fixes(
     config: datatypes.Config, tables: List[datatypes.EmbeddedXlTable]
 ) -> List[datatypes.EmbeddedXlTable]:
     rename_cols_dict = {
+        datatypes.Tag.comemi: {"commname": "commodity"},
         datatypes.Tag.fi_comm: {"commname": "commodity"},
         datatypes.Tag.fi_process: {"techname": "process"},
         datatypes.Tag.tfm_comgrp: {"value": "allregions"},
@@ -460,9 +465,14 @@ def process_flexible_import_table(
                 .loc[veda_process_sets["process"] == process]
                 .unique()
             )
-            df.loc[i & (df["process"] == process), other] = cost_mapping[
-                veda_process_set[0]
-            ]
+            if veda_process_set.shape[0]:
+                df.loc[i & (df["process"] == process), other] = cost_mapping[
+                    veda_process_set[0]
+                ]
+            else:
+                print(
+                    f"WARNING: COST won't be processed as IRE_PRICE for {process}, because it is not in IMP/EXP/MIN"
+                )
 
         # Use CommName to store the active commodity for EXP / IMP
         i = df[attribute].isin(["COST", "IRE_PRICE"])
@@ -1274,8 +1284,6 @@ def process_commodity_emissions(
             result.append(table)
         else:
             df = table.dataframe.copy()
-            # TODO either add ~COMEMI to veda-tags.json or do this somewhere less hacky:
-            df.rename(columns={"commname": "commodity"}, inplace=True)
             index_columns = ["region", "year", "commodity"]
             data_columns = [
                 colname for colname in df.columns if colname not in index_columns
@@ -1287,7 +1295,9 @@ def process_commodity_emissions(
 
             if "region" in df.columns:
                 df = df.astype({"region": "string"})
-                df["region"] = df["region"].map(lambda s: s.split(","))
+                df["region"] = df["region"].map(
+                    lambda s: s.split(",") if isinstance(s, str) else s
+                )
                 df = df.explode("region", ignore_index=True)
                 df = df[df["region"].isin(regions)]