
Make misc fixes (#256)
Introduce various improvements:

- use `model.topology` in `generate_topology_dictionary`
- include column-specific default values
- simplify pattern processing
- do not fill missing values in migration (`~TFM_MIG`) tables
- modify defaults for processing of `FLO_MARK`, `NCAP_AFC`, and `FLO_DELIV`
- apply code review suggestions from #259

---------

Co-authored-by: Siddharth Krishna <[email protected]>
olejandro and siddharth-krishna authored Dec 26, 2024
1 parent 1d853ca commit 5370fcd
Showing 5 changed files with 88 additions and 47 deletions.
28 changes: 23 additions & 5 deletions xl2times/config/veda-attr-defaults.json
@@ -95,6 +95,13 @@
     },
     "AFC": {
         "defaults": {
+            "other_indexes": [
+                "commodity",
+                "commodity-in",
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
+            ],
             "ts-level": "ANNUAL"
         },
         "times-attribute": "NCAP_AFC"
@@ -410,7 +417,9 @@
         "defaults": {
             "commodity": [
                 "commodity-in",
-                "commodity-in-aux"
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
             ],
             "ts-level": "ANNUAL"
         },
@@ -578,7 +587,9 @@
         "defaults": {
             "commodity": [
                 "commodity-in",
-                "commodity-in-aux"
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
             ],
             "ts-level": "ANNUAL"
         }
@@ -619,10 +630,10 @@
     "FLO_MARK": {
         "defaults": {
             "commodity": [
-                "commodity-in",
                 "commodity-out",
-                "commodity-in-aux",
-                "commodity-out-aux"
+                "commodity-in",
+                "commodity-out-aux",
+                "commodity-in-aux"
             ],
             "limtype": "UP"
         }
@@ -781,6 +792,13 @@
     },
     "NCAP_AFC": {
         "defaults": {
+            "other_indexes": [
+                "commodity",
+                "commodity-in",
+                "commodity-out",
+                "commodity-in-aux",
+                "commodity-out-aux"
+            ],
             "ts-level": "ANNUAL"
         }
     },
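
Judging by the deliberate reordering in the `FLO_MARK` hunk, position in these default lists appears to set precedence when several columns could supply a missing `commodity`: outputs now win over inputs. A minimal sketch of how such an ordered fallback list can be applied; the helper name and toy frame are illustrative, not xl2times API:

```python
import pandas as pd

# Ordered fallback columns from the FLO_MARK defaults above.
FLO_MARK_COMMODITY_DEFAULTS = [
    "commodity-out",
    "commodity-in",
    "commodity-out-aux",
    "commodity-in-aux",
]

def fill_commodity(df: pd.DataFrame, candidates: list[str]) -> pd.DataFrame:
    """Fill missing 'commodity' cells from the first candidate column that has a value."""
    df = df.copy()
    for col in candidates:
        if col in df.columns:
            # fillna only touches cells that are still missing, so earlier
            # candidates take precedence over later ones.
            df["commodity"] = df["commodity"].fillna(df[col])
    return df

rows = pd.DataFrame(
    {
        "process": ["P1", "P2"],
        "commodity": [None, "HTH"],
        "commodity-in": ["COA", "GAS"],
        "commodity-out": ["ELC", "ELC"],
    }
)
print(fill_commodity(rows, FLO_MARK_COMMODITY_DEFAULTS))
# P1's missing commodity resolves to ELC (its output), not COA (its input).
```
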
6 changes: 4 additions & 2 deletions xl2times/config/veda-tags.json
@@ -134,7 +134,8 @@
         "query_field": false,
         "inherit_above": true,
         "remove_first_row_if_absent": false,
-        "remove_any_row_if_absent": false
+        "remove_any_row_if_absent": false,
+        "default_to": "NRG"
     },
     {
         "name": "ctslvl",
@@ -270,7 +271,8 @@
         "query_field": false,
         "inherit_above": true,
         "remove_first_row_if_absent": false,
-        "remove_any_row_if_absent": false
+        "remove_any_row_if_absent": false,
+        "default_to": "PRE"
     },
     {
         "name": "tact",
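
The new `default_to` key gives a column a fallback value when a model file leaves it blank — apparently the commodity-set and process-set columns here, judging from the neighbouring `ctslvl` and `tact` fields. A condensed sketch of how such an entry flows through the parser in datatypes.py and is then applied in transforms.py; the JSON excerpt is simplified, not the real file's full entry:

```python
import json
import pandas as pd

# Simplified excerpt of a veda-tags.json entry (field list abbreviated).
veda_tags = json.loads(
    """
    [
      {
        "tag_name": "fi_comm",
        "valid_fields": [
          {"name": "csets", "default_to": "NRG"},
          {"name": "cname"}
        ]
      }
    ]
    """
)

# Collect per-tag column defaults, mirroring _read_veda_tags_info below.
column_default_value: dict[str, dict[str, str]] = {}
for tag_info in veda_tags:
    column_default_value[tag_info["tag_name"]] = {
        field["name"]: field["default_to"]
        for field in tag_info["valid_fields"]
        if "default_to" in field
    }

# Apply them the way fill_in_missing_values does in transforms.py.
df = pd.DataFrame({"cname": ["COA", "DEMHEAT"], "csets": [None, "DEM"]})
for colname, value in column_default_value["fi_comm"].items():
    df[colname] = df[colname].fillna(value)
print(df)  # the COA row's empty csets becomes "NRG"
```
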
24 changes: 23 additions & 1 deletion xl2times/datatypes.py
@@ -294,6 +294,8 @@ class Config:
     attr_aliases: set[str]
     # For each tag, this dictionary maps each column alias to the normalized name
     column_aliases: dict[Tag, dict[str, str]]
+    # For each tag, this dictionary maps each column name to its default value
+    column_default_value: dict[Tag, dict[str, str]]
     # For each tag, this dictionary specifies comment row symbols by column name
     row_comment_chars: dict[Tag, dict[str, list]]
     # List of tags for which empty tables should be discarded
@@ -329,11 +331,13 @@ def __init__(
         self.times_sets = Config._read_times_sets(times_sets_file)
         (
             self.column_aliases,
+            self.column_default_value,
             self.row_comment_chars,
             self.discard_if_empty,
             self.query_columns,
             self.known_columns,
             self.required_columns,
+            self.forward_fill_cols,
         ) = Config._read_veda_tags_info(veda_tags_file)
         self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults(
             veda_attr_defaults_file
@@ -494,12 +498,14 @@ def _read_mappings(filename: str) -> list[TimesXlMap]:
     def _read_veda_tags_info(
         veda_tags_file: str,
     ) -> tuple[
         dict[Tag, dict[str, str]],
+        dict[Tag, dict[str, str]],
         dict[Tag, dict[str, list]],
         Iterable[Tag],
         dict[Tag, set[str]],
         dict[Tag, set[str]],
         dict[Tag, set[str]],
+        dict[Tag, set[str]],
     ]:
         def to_tag(s: str) -> Tag:
             # The file stores the tag name in lowercase, and without the ~
@@ -518,18 +524,20 @@ def to_tag(s: str) -> Tag:
         )

         valid_column_names = {}
+        column_default_value = {}
         row_comment_chars = {}
         discard_if_empty = []
         query_cols = defaultdict(set)
         known_cols = defaultdict(set)
         required_cols = defaultdict(set)
+        forward_fill_cols = defaultdict(set)

         for tag_info in veda_tags_info:
             tag_name = to_tag(tag_info["tag_name"])
             if "valid_fields" in tag_info:
                 discard_if_empty.append(tag_name)
-
                 valid_column_names[tag_name] = {}
+                column_default_value[tag_name] = {}
                 row_comment_chars[tag_name] = {}
                 # Process column aliases and comment chars:
                 for valid_field in tag_info["valid_fields"]:
@@ -543,12 +551,20 @@ def to_tag(s: str) -> Tag:
                 else:
                     field_name = valid_field["name"]

+                if "default_to" in valid_field:
+                    column_default_value[tag_name][field_name] = valid_field[
+                        "default_to"
+                    ]
+
                 if valid_field["query_field"]:
                     query_cols[tag_name].add(field_name)

                 if valid_field["remove_any_row_if_absent"]:
                     required_cols[tag_name].add(field_name)

+                if valid_field["inherit_above"]:
+                    forward_fill_cols[tag_name].add(field_name)
+
                 known_cols[tag_name].add(field_name)

                 for valid_field_name in valid_field_names:
@@ -564,20 +580,26 @@ def to_tag(s: str) -> Tag:
             if base_tag in valid_column_names:
                 valid_column_names[tag_name] = valid_column_names[base_tag]
                 discard_if_empty.append(tag_name)
+            if base_tag in column_default_value:
+                column_default_value[tag_name] = column_default_value[base_tag]
             if base_tag in row_comment_chars:
                 row_comment_chars[tag_name] = row_comment_chars[base_tag]
             if base_tag in query_cols:
                 query_cols[tag_name] = query_cols[base_tag]
             if base_tag in known_cols:
                 known_cols[tag_name] = known_cols[base_tag]
+            if base_tag in forward_fill_cols:
+                forward_fill_cols[tag_name] = forward_fill_cols[base_tag]

         return (
             valid_column_names,
+            column_default_value,
             row_comment_chars,
             discard_if_empty,
             query_cols,
             known_cols,
             required_cols,
+            forward_fill_cols,
         )

     @staticmethod
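
The `base_tag` branch above is how tag variants that declare a `base_tag` instead of their own `valid_fields` pick up already-parsed metadata; the new `column_default_value` and `forward_fill_cols` maps now join that inheritance. A small sketch of the mechanism (tag names and values are hypothetical):

```python
from collections import defaultdict

# Metadata already parsed for the base tag; values are illustrative.
column_default_value = {"fi_process": {"sets": "PRE"}}
forward_fill_cols = defaultdict(set, {"fi_process": {"sets"}})

# A derived tag inherits the base tag's parsed metadata by reference,
# as in _read_veda_tags_info above.
tag_name, base_tag = "fi_process-variant", "fi_process"  # hypothetical names
if base_tag in column_default_value:
    column_default_value[tag_name] = column_default_value[base_tag]
if base_tag in forward_fill_cols:
    forward_fill_cols[tag_name] = forward_fill_cols[base_tag]

assert column_default_value["fi_process-variant"]["sets"] == "PRE"
# Note both tags share one dict object, matching the original code's
# by-reference copy; mutating one would affect the other.
```
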
55 changes: 27 additions & 28 deletions xl2times/transforms.py
@@ -884,10 +884,17 @@ def fill_in_missing_values(

     def fill_in_missing_values_table(table):
         df = table.dataframe.copy()
+        default_values = config.column_default_value.get(table.tag, {})
+
         for colname in df.columns:
             # TODO make this more declarative
-            if colname in ["sets", "csets", "process"]:
+            # Forwards fill values in columns
+            if colname in config.forward_fill_cols[table.tag]:
                 df[colname] = df[colname].ffill()
+            # Apply default values to missing cells
+            col_default_value = default_values.get(colname)
+            if col_default_value is not None:
+                df[colname] = df[colname].fillna(col_default_value)
             elif colname == "limtype" and table.tag == Tag.fi_comm and False:
                 isna = df[colname].isna()
                 ismat = df["csets"] == "MAT"
@@ -944,8 +951,8 @@ def fill_in_missing_values_table(table):
         return replace(table, dataframe=df)

     for table in tables:
-        if table.tag == Tag.tfm_upd:
-            # Missing values in update tables are wildcards and should not be filled in
+        if table.tag in [Tag.tfm_mig, Tag.tfm_upd]:
+            # Missing values in these tables are wildcards and should not be filled in
             result.append(table)
         else:
             result.append(fill_in_missing_values_table(table))
@@ -1194,7 +1201,7 @@ def capitalise_table_values(
     """Ensure that all table entries are uppercase. Strip leading and trailing whitespace."""

     def capitalise_table_entries(table: EmbeddedXlTable):
-        df = table.dataframe.copy()
+        df = table.dataframe
         # Capitalise all entries if column type string
         colnames = df.select_dtypes(include="object").columns
         seen_cols = [colname for colname in colnames if colname in df.columns]
@@ -1203,8 +1210,7 @@ def capitalise_table_entries(table: EmbeddedXlTable):
                 # Index of rows with string entries
                 i = df[seen_col].apply(lambda x: isinstance(x, str))
                 if any(i):
-                    df.loc[i, seen_col] = df[seen_col][i].str.upper()
-                    df.loc[i, seen_col] = df[seen_col][i].str.strip()
+                    df.loc[i, seen_col] = df[seen_col][i].str.upper().str.strip()
             return replace(table, dataframe=df)
         else:
             return table
@@ -2129,22 +2135,13 @@ def process_transform_availability(
     return result


-def filter_by_pattern(df: pd.DataFrame, pattern: str, combined: bool) -> pd.DataFrame:
-    """
-    Filter dataframe index by a regex pattern. Parameter combined indicates whether commas should
-    be treated as a pattern separator or belong to the pattern.
-    """
+def filter_by_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame:
+    """Filter dataframe index by a regex pattern."""
     # Duplicates can be created when a process has multiple commodities that match the pattern
-    df = df.filter(
-        regex=utils.create_regexp(pattern, combined), axis="index"
-    ).drop_duplicates()
-    if combined:
-        exclude = df.filter(
-            regex=utils.create_negative_regexp(pattern), axis="index"
-        ).index
-        return df.drop(exclude)
-    else:
-        return df
+    df = df.filter(regex=utils.create_regexp(pattern), axis="index").drop_duplicates()
+    exclude = df.filter(regex=utils.create_negative_regexp(pattern), axis="index").index
+
+    return df.drop(exclude)


def intersect(acc, df):
@@ -2161,7 +2158,7 @@ def get_matching_processes(
         if col in row.index and row[col] not in {None, ""}:
             proc_set = topology[key]
             pattern = row[col].upper()
-            filtered = filter_by_pattern(proc_set, pattern, col != "pset_pd")
+            filtered = filter_by_pattern(proc_set, pattern)
             matching_processes = intersect(matching_processes, filtered)

     if matching_processes is not None and any(matching_processes.duplicated()):
@@ -2176,7 +2173,7 @@ def get_matching_commodities(row: pd.Series, topology: dict[str, DataFrame]):
         if col in row.index and row[col] not in {None, ""}:
             matching_commodities = intersect(
                 matching_commodities,
-                filter_by_pattern(topology[key], row[col].upper(), col != "cset_cd"),
+                filter_by_pattern(topology[key], row[col].upper()),
             )
     return matching_commodities

@@ -2201,7 +2198,9 @@ def generate_topology_dictionary(
     dictionary = dict()
     pros = model.processes
     coms = model.commodities
-    pros_and_coms = tables[Tag.fi_t]
+    pros_and_coms = model.topology[["process", "commodity", "io"]].drop_duplicates()
+    i_comm_in = pros_and_coms["io"] == "IN"
+    i_comm_out = pros_and_coms["io"] == "OUT"

     dict_info = [
         {"key": "processes_by_name", "df": pros[["process"]], "col": "process"},
@@ -2213,13 +2212,13 @@ def generate_topology_dictionary(
         {"key": "processes_by_sets", "df": pros[["process", "sets"]], "col": "sets"},
         {
             "key": "processes_by_comm_in",
-            "df": pros_and_coms[["process", "commodity-in"]],
-            "col": "commodity-in",
+            "df": pros_and_coms[["process", "commodity"]][i_comm_in],
+            "col": "commodity",
         },
         {
             "key": "processes_by_comm_out",
-            "df": pros_and_coms[["process", "commodity-out"]],
-            "col": "commodity-out",
+            "df": pros_and_coms[["process", "commodity"]][i_comm_out],
+            "col": "commodity",
         },
         {"key": "commodities_by_name", "df": coms[["commodity"]], "col": "commodity"},
         {
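
The rewritten fill loop in this file replaces the hard-coded `["sets", "csets", "process"]` list with configuration: columns flagged `inherit_above` in veda-tags.json are forward-filled, then any still-missing cells receive the column's `default_to` value. A self-contained rendition of just that path; the config dicts are stand-ins for `config.forward_fill_cols` and `config.column_default_value`, and the real function also keeps several legacy special cases:

```python
import pandas as pd

# Stand-ins for config.forward_fill_cols and config.column_default_value.
forward_fill_cols = {"fi_comm": {"csets"}}
column_default_value = {"fi_comm": {"csets": "NRG"}}  # illustrative default

def fill_missing(df: pd.DataFrame, tag: str) -> pd.DataFrame:
    df = df.copy()
    default_values = column_default_value.get(tag, {})
    for colname in df.columns:
        # Forward-fill columns flagged inherit_above in veda-tags.json
        if colname in forward_fill_cols.get(tag, set()):
            df[colname] = df[colname].ffill()
        # Then apply the column's default to cells that are still empty
        col_default_value = default_values.get(colname)
        if col_default_value is not None:
            df[colname] = df[colname].fillna(col_default_value)
    return df

df = pd.DataFrame({"csets": [None, "DEM", None], "commodity": ["C1", "C2", "C3"]})
print(fill_missing(df, "fi_comm"))
# Row 1 has nothing above to inherit, so it falls back to "NRG";
# row 3 inherits "DEM" from the row above.
```
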
22 changes: 11 additions & 11 deletions xl2times/utils.py
@@ -247,17 +247,18 @@ def remove_positive_patterns(pattern: str) -> str:
     return ",".join([word[1:] for word in pattern.split(",") if word[0] == "-"])


+def remove_whitespace(pattern: str) -> str:
+    return ",".join([word.strip() for word in pattern.split(",")])
+
+
 @functools.lru_cache(maxsize=int(1e6))
 def create_regexp(pattern: str, combined: bool = True) -> str:
-    # Distinguish comma-separated list of patterns vs a pattern with a comma(s)
-    if combined:
-        # Remove whitespaces
-        pattern = pattern.replace(" ", "")
-        # Exclude negative patterns
-        if has_negative_patterns(pattern):
-            pattern = remove_negative_patterns(pattern)
-        # Handle comma-separated values
-        pattern = pattern.replace(",", r"$|^")
+    pattern = remove_whitespace(pattern)
+    # Exclude negative patterns
+    if has_negative_patterns(pattern):
+        pattern = remove_negative_patterns(pattern)
+    # Handle comma-separated values
+    pattern = pattern.replace(",", r"$|^")
     if len(pattern) == 0:
         return r".*"  # matches everything
     # Substitute VEDA wildcards with regex patterns
@@ -271,8 +272,7 @@ def create_regexp(pattern: str, combined: bool = True) -> str:

 @functools.lru_cache(maxsize=int(1e6))
 def create_negative_regexp(pattern: str) -> str:
-    # Remove whitespaces
-    pattern = pattern.replace(" ", "")
+    pattern = remove_whitespace(pattern)
     # Exclude positive patterns
     pattern = remove_positive_patterns(pattern)
     if len(pattern) == 0:
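
The point of the new `remove_whitespace` helper is subtle: the old `pattern.replace(" ", "")` deleted every space, so a name containing an internal space could never match, whereas the helper trims whitespace only around the comma separators. It also centralizes the cleanup so `create_regexp` and `create_negative_regexp` stay consistent. A quick illustration; the pattern value is made up:

```python
def remove_whitespace(pattern: str) -> str:
    return ",".join([word.strip() for word in pattern.split(",")])

pattern = "COAL PLANT*, -COAL"  # made-up comma-separated pattern list

old_behaviour = pattern.replace(" ", "")
new_behaviour = remove_whitespace(pattern)

print(old_behaviour)  # 'COALPLANT*,-COAL'  -- internal space destroyed
print(new_behaviour)  # 'COAL PLANT*,-COAL' -- only separator whitespace trimmed
```
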
