From f386a36446e65574f749aacb892062e95721230a Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Thu, 30 Nov 2023 23:42:01 -0500 Subject: [PATCH] Update comments removal transforms (#140) This PR adds information on comment row indicators to `config`. The info is then used to identify and remove those rows. --------- Co-authored-by: Siddharth Krishna --- times_reader/__main__.py | 4 +- times_reader/config/veda-tags.json | 23 ++++++++ times_reader/datatypes.py | 22 +++++-- times_reader/transforms.py | 94 +++++++++++++++++------------- 4 files changed, 96 insertions(+), 47 deletions(-) diff --git a/times_reader/__main__.py b/times_reader/__main__.py index 741199d..43dd924 100644 --- a/times_reader/__main__.py +++ b/times_reader/__main__.py @@ -57,10 +57,12 @@ def convert_xl_to_times( transforms.normalize_tags_columns, transforms.remove_fill_tables, transforms.remove_empty_tables, - lambda config, tables: [transforms.remove_comment_rows(t) for t in tables], lambda config, tables: [transforms.remove_comment_cols(t) for t in tables], transforms.remove_tables_with_formulas, # slow transforms.normalize_column_aliases, + lambda config, tables: [ + transforms.remove_comment_rows(config, t) for t in tables + ], transforms.apply_postnormalisation_fixes, transforms.generate_dummy_processes, transforms.process_transform_insert_variants, diff --git a/times_reader/config/veda-tags.json b/times_reader/config/veda-tags.json index d8bc2ec..0a2fe5a 100644 --- a/times_reader/config/veda-tags.json +++ b/times_reader/config/veda-tags.json @@ -25,6 +25,29 @@ ] } }, + { + "tag_name": "comemi", + "tag_allowed_in": [ + "BY", + "SubRES" + ], + "tag_fields": { + "fields_names": [ + "commname" + ], + "fields_aliases": [ + [ + "commodity" + ] + ], + "row_ignore_symbol": [ + [ + "\\I:", + "*" + ] + ] + } + }, { "tag_name": "currencies", "tag_allowed_in": [ diff --git a/times_reader/datatypes.py b/times_reader/datatypes.py index b58e9d7..b893ca5 100644 --- a/times_reader/datatypes.py +++ b/times_reader/datatypes.py @@ -148,6 +148,8 @@ class Config: all_attributes: Set[str] # For each tag, this dictionary maps each column alias to the normalized name column_aliases: Dict[Tag, Dict[str, str]] + # For each tag, this dictionary specifies comment row symbols by column name + row_comment_chars: Dict[Tag, Dict[str, list]] veda_attr_defaults: Dict[str, Dict[str, list]] def __init__( @@ -161,7 +163,9 @@ def __init__( self.dd_table_order, self.all_attributes = Config._process_times_info( times_info_file ) - self.column_aliases = Config._read_veda_tags_info(veda_tags_file) + self.column_aliases, self.row_comment_chars = Config._read_veda_tags_info( + veda_tags_file + ) self.veda_attr_defaults = Config._read_veda_attr_defaults( veda_attr_defaults_file ) @@ -263,15 +267,19 @@ def _read_mappings(filename: str) -> List[TimesXlMap]: return mappings @staticmethod - def _read_veda_tags_info(veda_tags_file: str) -> Dict[Tag, Dict[str, str]]: - # Read veda_tags_file + def _read_veda_tags_info( + veda_tags_file: str, + ) -> Tuple[Dict[Tag, Dict[str, str]], Dict[Tag, Dict[str, list]]]: with resources.open_text("times_reader.config", veda_tags_file) as f: veda_tags_info = json.load(f) column_aliases = {} + row_comment_chars = {} + for tag_info in veda_tags_info: if "tag_fields" in tag_info: # The file stores the tag name in lowercase, and without the ~ tag_name = "~" + tag_info["tag_name"].upper() + # Process column aliases: column_aliases[tag_name] = {} names = tag_info["tag_fields"]["fields_names"] aliases = tag_info["tag_fields"]["fields_aliases"] @@ -279,7 +287,13 @@ def _read_veda_tags_info(veda_tags_file: str) -> Dict[Tag, Dict[str, str]]: for name, aliases in zip(names, aliases): for alias in aliases: column_aliases[tag_name][alias] = name - return column_aliases + # Process comment chars: + row_comment_chars[tag_name] = {} + chars = tag_info["tag_fields"]["row_ignore_symbol"] + assert len(names) == len(chars) + for name, chars_list in zip(names, chars): + row_comment_chars[tag_name][name] = chars_list + return column_aliases, row_comment_chars @staticmethod def _read_veda_attr_defaults( diff --git a/times_reader/transforms.py b/times_reader/transforms.py index f4ef4a6..57d4bd6 100644 --- a/times_reader/transforms.py +++ b/times_reader/transforms.py @@ -39,15 +39,13 @@ } -def remove_comment_rows(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedXlTable: +def remove_comment_rows( + config: datatypes.Config, table: datatypes.EmbeddedXlTable +) -> datatypes.EmbeddedXlTable: """ - Return a modified copy of 'table' where rows with cells containig '*' - or '\I:' in their first or third columns have been deleted. These characters - are defined in https://iea-etsap.org/docs/Documentation_for_the_TIMES_Model-Part-IV.pdf - as comment identifiers (pag 15). - TODO: we believe the deletion of the third column is a bug. We tried deleting that part - of the code but we failed to parse a row as a consequence. We need to investigate why, - fix that parsing and remove the deletion of the third column. + Return a modified copy of 'table' where rows with cells starting with symbols + indicating a comment row in any column have been deleted. Comment row symbols + are column name dependant and are specified in the config. :param table: Table object in EmbeddedXlTable format. :return: Table object in EmbeddedXlTable format without comment rows. @@ -56,35 +54,39 @@ def remove_comment_rows(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX return table df = table.dataframe.copy() - comment_rows = list( - locate( - df.iloc[:, 0], - lambda cell: isinstance(cell, str) - and (cell.startswith("*") or cell.startswith("\\I:")), - ) - ) - df.drop(index=comment_rows, inplace=True) - df.reset_index(drop=True, inplace=True) - # TODO: the deletion of this third column is a bug. Removing it causes the - # program to fail parse all rows. We need to fix the parsing so it can read - # all rows and remove this code block. - if df.shape[1] > 1: - comment_rows = list( - locate( - df.iloc[:, 1], - lambda cell: isinstance(cell, str) and cell.startswith("*"), + tag = table.tag.split(":")[0] + + if tag in config.row_comment_chars: + chars_by_colname = config.row_comment_chars[tag] + else: + return table + + comment_rows = set() + + for colname in df.columns: + if colname in chars_by_colname.keys(): + comment_rows.update( + list( + locate( + df[colname], + lambda cell: isinstance(cell, str) + and (cell.startswith(tuple(chars_by_colname[colname]))), + ) + ) ) - ) - df.drop(index=comment_rows, inplace=True) - df.reset_index(drop=True, inplace=True) + + df.drop(index=list(comment_rows), inplace=True) + df.reset_index(drop=True, inplace=True) + return replace(table, dataframe=df) def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedXlTable: """ Return a modified copy of 'table' where columns with labels starting with '*' - have been deleted. + have been deleted. Assumes that any leading spaces in the original input table + have been removed. :param table: Table object in EmbeddedXlTable format. :return: Table object in EmbeddedXlTable format without comment columns. @@ -92,14 +94,16 @@ def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX if table.dataframe.size == 0: return table - comment_cols = list( - locate( - table.dataframe.columns, - lambda cell: isinstance(cell, str) and cell.startswith("*"), - ) - ) - df = table.dataframe.drop(table.dataframe.columns[comment_cols], axis=1) + comment_cols = [ + colname + for colname in table.dataframe.columns + if isinstance(colname, str) and colname.startswith("*") + ] + + df = table.dataframe.drop(comment_cols, axis=1) df.reset_index(drop=True, inplace=True) + + # TODO: should we move the code below to a separate transform? seen = set() dupes = [x for x in df.columns if x in seen or seen.add(x)] if len(dupes) > 0: @@ -232,6 +236,7 @@ def apply_postnormalisation_fixes( config: datatypes.Config, tables: List[datatypes.EmbeddedXlTable] ) -> List[datatypes.EmbeddedXlTable]: rename_cols_dict = { + datatypes.Tag.comemi: {"commname": "commodity"}, datatypes.Tag.fi_comm: {"commname": "commodity"}, datatypes.Tag.fi_process: {"techname": "process"}, datatypes.Tag.tfm_comgrp: {"value": "allregions"}, @@ -460,9 +465,14 @@ def process_flexible_import_table( .loc[veda_process_sets["process"] == process] .unique() ) - df.loc[i & (df["process"] == process), other] = cost_mapping[ - veda_process_set[0] - ] + if veda_process_set.shape[0]: + df.loc[i & (df["process"] == process), other] = cost_mapping[ + veda_process_set[0] + ] + else: + print( + f"WARNING: COST won't be processed as IRE_PRICE for {process}, because it is not in IMP/EXP/MIN" + ) # Use CommName to store the active commodity for EXP / IMP i = df[attribute].isin(["COST", "IRE_PRICE"]) @@ -1274,8 +1284,6 @@ def process_commodity_emissions( result.append(table) else: df = table.dataframe.copy() - # TODO either add ~COMEMI to veda-tags.json or do this somewhere less hacky: - df.rename(columns={"commname": "commodity"}, inplace=True) index_columns = ["region", "year", "commodity"] data_columns = [ colname for colname in df.columns if colname not in index_columns @@ -1287,7 +1295,9 @@ def process_commodity_emissions( if "region" in df.columns: df = df.astype({"region": "string"}) - df["region"] = df["region"].map(lambda s: s.split(",")) + df["region"] = df["region"].map( + lambda s: s.split(",") if isinstance(s, str) else s + ) df = df.explode("region", ignore_index=True) df = df[df["region"].isin(regions)]