From eac45c0d61d5544605e867e47648984652fedce6 Mon Sep 17 00:00:00 2001
From: Olexandr Balyk <ob@facilitate.energy>
Date: Wed, 28 Feb 2024 08:48:31 -0500
Subject: [PATCH] Drop tables / rows without required columns  (#200)

This PR drops tables that do not include the required columns, and rows that have no value specified in a required column.
---
 xl2times/__main__.py   |  1 +
 xl2times/datatypes.py  | 12 ++++++++++++
 xl2times/transforms.py | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index ae2062bb..3fc8bfaa 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -93,6 +93,7 @@ def convert_xl_to_times(
         lambda config, tables, model: [
             transforms.remove_comment_rows(config, t, model) for t in tables
         ],
+        transforms.revalidate_input_tables,
         transforms.process_regions,
         transforms.remove_exreg_cols,
         transforms.generate_dummy_processes,
diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index 79d46339..ab35b3e2 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -188,6 +188,10 @@ class Config:
     veda_attr_defaults: Dict[str, Dict[str, list]]
     # Known columns for each tag
     known_columns: Dict[Tag, Set[str]]
+    # Query columns for each tag
+    query_columns: Dict[Tag, Set[str]]
+    # Required columns for each tag
+    required_columns: Dict[Tag, Set[str]]
     # Names of regions to include in the model; if empty, all regions are included.
     filter_regions: Set[str]
     times_sets: Dict[str, List[str]]
@@ -214,6 +218,7 @@ def __init__(
             self.discard_if_empty,
             self.query_columns,
             self.known_columns,
+            self.required_columns,
         ) = Config._read_veda_tags_info(veda_tags_file)
         self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults(
             veda_attr_defaults_file
@@ -371,6 +376,7 @@ def _read_veda_tags_info(
         Iterable[Tag],
         Dict[Tag, Set[str]],
         Dict[Tag, Set[str]],
+        Dict[Tag, Set[str]],
     ]:
         def to_tag(s: str) -> Tag:
             # The file stores the tag name in lowercase, and without the ~
@@ -393,6 +399,7 @@ def to_tag(s: str) -> Tag:
         discard_if_empty = []
         query_cols = defaultdict(set)
         known_cols = defaultdict(set)
+        required_cols = defaultdict(set)
 
         for tag_info in veda_tags_info:
             tag_name = to_tag(tag_info["tag_name"])
@@ -415,6 +422,10 @@ def to_tag(s: str) -> Tag:
 
                     if valid_field["query_field"]:
                         query_cols[tag_name].add(field_name)
+
+                    if valid_field["remove_any_row_if_absent"]:
+                        required_cols[tag_name].add(field_name)
+
                     known_cols[tag_name].add(field_name)
 
                     for valid_field_name in valid_field_names:
@@ -443,6 +454,7 @@ def to_tag(s: str) -> Tag:
             discard_if_empty,
             query_cols,
             known_cols,
+            required_cols,
         )
 
     @staticmethod
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index 211191cb..2d687a66 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -222,6 +222,47 @@ def discard(table):
     return result
 
 
+def revalidate_input_tables(
+    config: datatypes.Config,
+    tables: List[datatypes.EmbeddedXlTable],
+    model: datatypes.TimesModel,
+) -> List[datatypes.EmbeddedXlTable]:
+    """
+    Return the input tables whose required columns (config.required_columns, looked up by
+    tag) are all present and not entirely NA. Every dropped table is logged as a warning.
+    The model argument is not used.
+    """
+    result = []
+    for table in tables:
+        # Only the part of the tag before the first ":" is used for the lookup
+        tag = datatypes.Tag(table.tag.split(":")[0])
+        required_cols = config.required_columns[tag]
+        df = table.dataframe
+        # Drop table if any column in required columns is missing
+        missing_cols = required_cols - set(df.columns)
+        if missing_cols:
+            logger.warning(
+                f"Dropping {tag.value} table within range {table.range} on sheet {table.sheetname}"
+                f" in file {table.filename} due to missing required columns: {missing_cols}"
+            )
+            continue
+        # A required column is empty when all its values are NA. Reducing the numpy array
+        # also covers a duplicated label, where df[c] is a DataFrame rather than a Series.
+        empty_required_cols = {
+            c for c in required_cols if df[c].isna().to_numpy().all()
+        }
+        if empty_required_cols:
+            logger.warning(
+                f"Dropping {tag.value} table within range {table.range} on sheet {table.sheetname}"
+                f" in file {table.filename} due to empty required columns: {empty_required_cols}"
+            )
+            continue
+        # Append table to the list if reached this far
+        result.append(table)
+
+    return result
+
+
 def normalize_tags_columns(
     config: datatypes.Config,
     tables: List[datatypes.EmbeddedXlTable],