From eac45c0d61d5544605e867e47648984652fedce6 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Wed, 28 Feb 2024 08:48:31 -0500 Subject: [PATCH] Drop tables / rows without required columns (#200) This PR drops tables / rows that either don't include required columns (i.e. tables) or values specified in required columns (rows). --- xl2times/__main__.py | 1 + xl2times/datatypes.py | 12 ++++++++++++ xl2times/transforms.py | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/xl2times/__main__.py b/xl2times/__main__.py index ae2062bb..3fc8bfaa 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -93,6 +93,7 @@ def convert_xl_to_times( lambda config, tables, model: [ transforms.remove_comment_rows(config, t, model) for t in tables ], + transforms.revalidate_input_tables, transforms.process_regions, transforms.remove_exreg_cols, transforms.generate_dummy_processes, diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py index 79d46339..ab35b3e2 100644 --- a/xl2times/datatypes.py +++ b/xl2times/datatypes.py @@ -188,6 +188,10 @@ class Config: veda_attr_defaults: Dict[str, Dict[str, list]] # Known columns for each tag known_columns: Dict[Tag, Set[str]] + # Query columns for each tag + query_columns: Dict[Tag, Set[str]] + # Required columns for each tag + required_columns: Dict[Tag, Set[str]] # Names of regions to include in the model; if empty, all regions are included. 
filter_regions: Set[str] times_sets: Dict[str, List[str]] @@ -214,6 +218,7 @@ def __init__( self.discard_if_empty, self.query_columns, self.known_columns, + self.required_columns, ) = Config._read_veda_tags_info(veda_tags_file) self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults( veda_attr_defaults_file @@ -371,6 +376,7 @@ def _read_veda_tags_info( Iterable[Tag], Dict[Tag, Set[str]], Dict[Tag, Set[str]], + Dict[Tag, Set[str]], ]: def to_tag(s: str) -> Tag: # The file stores the tag name in lowercase, and without the ~ @@ -393,6 +399,7 @@ def to_tag(s: str) -> Tag: discard_if_empty = [] query_cols = defaultdict(set) known_cols = defaultdict(set) + required_cols = defaultdict(set) for tag_info in veda_tags_info: tag_name = to_tag(tag_info["tag_name"]) @@ -415,6 +422,10 @@ def to_tag(s: str) -> Tag: if valid_field["query_field"]: query_cols[tag_name].add(field_name) + + if valid_field["remove_any_row_if_absent"]: + required_cols[tag_name].add(field_name) + known_cols[tag_name].add(field_name) for valid_field_name in valid_field_names: @@ -443,6 +454,7 @@ def to_tag(s: str) -> Tag: discard_if_empty, query_cols, known_cols, + required_cols, ) @staticmethod diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 211191cb..2d687a66 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -222,6 +222,47 @@ def discard(table): return result +def revalidate_input_tables( + config: datatypes.Config, + tables: List[datatypes.EmbeddedXlTable], + model: datatypes.TimesModel, +) -> List[datatypes.EmbeddedXlTable]: + """ + Perform further validation of input tables by checking whether required columns are + present / non-empty. Drop tables with a missing or entirely empty required column. 
+ """ + result = [] + for table in tables: + tag = datatypes.Tag(table.tag.split(":")[0]) + required_cols = config.required_columns[tag] + unique_table_cols = set(table.dataframe.columns) + if required_cols: + # Drop table if any column in required columns is missing + missing_cols = required_cols - unique_table_cols + if missing_cols: + logger.warning( + f"Dropping {tag.value} table within range {table.range} on sheet {table.sheetname}" + f" in file {table.filename} due to missing required columns: {missing_cols}" + ) + # Discard the table + continue + # Check whether any of the required columns is empty + else: + df = table.dataframe + empty_required_cols = {c for c in required_cols if all(df[c].isna())} + if empty_required_cols: + logger.warning( + f"Dropping {tag.value} table within range {table.range} on sheet {table.sheetname}" + f" in file {table.filename} due to empty required columns: {empty_required_cols}" + ) + # Discard the table + continue + # Append table to the list if reached this far + result.append(table) + + return result + + def normalize_tags_columns( config: datatypes.Config, tables: List[datatypes.EmbeddedXlTable],