From 5370fcdce08cf65b81c442345ee5a8bcb6d1ce41 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Thu, 26 Dec 2024 00:23:09 -0500 Subject: [PATCH] Make misc fixes (#256) Introduce various improvements - use `model.topology` in `generate_topology_dictionary` - include column-specific default values - simplify pattern processing - do not fill missing values in mig tables - modify defaults for processing of `FLO_MARK`, `NCAP_AFC`, and `FLO_DELIV` - apply code review suggestions from #259 --------- Co-authored-by: Siddharth Krishna --- xl2times/config/veda-attr-defaults.json | 28 ++++++++++--- xl2times/config/veda-tags.json | 6 ++- xl2times/datatypes.py | 24 ++++++++++- xl2times/transforms.py | 55 ++++++++++++------------- xl2times/utils.py | 22 +++++----- 5 files changed, 88 insertions(+), 47 deletions(-) diff --git a/xl2times/config/veda-attr-defaults.json b/xl2times/config/veda-attr-defaults.json index 1bceee3..4ae829c 100644 --- a/xl2times/config/veda-attr-defaults.json +++ b/xl2times/config/veda-attr-defaults.json @@ -95,6 +95,13 @@ }, "AFC": { "defaults": { + "other_indexes": [ + "commodity", + "commodity-in", + "commodity-out", + "commodity-in-aux", + "commodity-out-aux" + ], "ts-level": "ANNUAL" }, "times-attribute": "NCAP_AFC" @@ -410,7 +417,9 @@ "defaults": { "commodity": [ "commodity-in", - "commodity-in-aux" + "commodity-out", + "commodity-in-aux", + "commodity-out-aux" ], "ts-level": "ANNUAL" }, @@ -578,7 +587,9 @@ "defaults": { "commodity": [ "commodity-in", - "commodity-in-aux" + "commodity-out", + "commodity-in-aux", + "commodity-out-aux" ], "ts-level": "ANNUAL" } @@ -619,10 +630,10 @@ "FLO_MARK": { "defaults": { "commodity": [ - "commodity-in", "commodity-out", - "commodity-in-aux", - "commodity-out-aux" + "commodity-in", + "commodity-out-aux", + "commodity-in-aux" ], "limtype": "UP" } @@ -781,6 +792,13 @@ }, "NCAP_AFC": { "defaults": { + "other_indexes": [ + "commodity", + "commodity-in", + "commodity-out", + "commodity-in-aux", + "commodity-out-aux" + ], "ts-level": "ANNUAL" } }, diff --git a/xl2times/config/veda-tags.json b/xl2times/config/veda-tags.json index 08e7226..8eef98d 100644 --- a/xl2times/config/veda-tags.json +++ b/xl2times/config/veda-tags.json @@ -134,7 +134,8 @@ "query_field": false, "inherit_above": true, "remove_first_row_if_absent": false, - "remove_any_row_if_absent": false + "remove_any_row_if_absent": false, + "default_to": "NRG" }, { "name": "ctslvl", @@ -270,7 +271,8 @@ "query_field": false, "inherit_above": true, "remove_first_row_if_absent": false, - "remove_any_row_if_absent": false + "remove_any_row_if_absent": false, + "default_to": "PRE" }, { "name": "tact", diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py index 8f8a181..9dd2f3b 100644 --- a/xl2times/datatypes.py +++ b/xl2times/datatypes.py @@ -294,6 +294,8 @@ class Config: attr_aliases: set[str] # For each tag, this dictionary maps each column alias to the normalized name column_aliases: dict[Tag, dict[str, str]] + # For each tag, this dictionary maps each column name to its default value + column_default_value: dict[Tag, dict[str, str]] # For each tag, this dictionary specifies comment row symbols by column name row_comment_chars: dict[Tag, dict[str, list]] # List of tags for which empty tables should be discarded @@ -329,11 +331,13 @@ def __init__( self.times_sets = Config._read_times_sets(times_sets_file) ( self.column_aliases, + self.column_default_value, self.row_comment_chars, self.discard_if_empty, self.query_columns, self.known_columns, self.required_columns, + self.forward_fill_cols, ) = Config._read_veda_tags_info(veda_tags_file) self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults( veda_attr_defaults_file @@ -494,12 +498,14 @@ def _read_mappings(filename: str) -> list[TimesXlMap]: def _read_veda_tags_info( veda_tags_file: str, ) -> tuple[ + dict[Tag, dict[str, str]], dict[Tag, dict[str, str]], dict[Tag, dict[str, list]], Iterable[Tag], dict[Tag, set[str]], dict[Tag, set[str]], dict[Tag, set[str]], + dict[Tag, set[str]], ]: def to_tag(s: str) -> Tag: # The file stores the tag name in lowercase, and without the ~ @@ -518,18 +524,20 @@ def to_tag(s: str) -> Tag: ) valid_column_names = {} + column_default_value = {} row_comment_chars = {} discard_if_empty = [] query_cols = defaultdict(set) known_cols = defaultdict(set) required_cols = defaultdict(set) + forward_fill_cols = defaultdict(set) for tag_info in veda_tags_info: tag_name = to_tag(tag_info["tag_name"]) if "valid_fields" in tag_info: discard_if_empty.append(tag_name) - valid_column_names[tag_name] = {} + column_default_value[tag_name] = {} row_comment_chars[tag_name] = {} # Process column aliases and comment chars: for valid_field in tag_info["valid_fields"]: @@ -543,12 +551,20 @@ def to_tag(s: str) -> Tag: else: field_name = valid_field["name"] + if "default_to" in valid_field: + column_default_value[tag_name][field_name] = valid_field[ + "default_to" + ] + if valid_field["query_field"]: query_cols[tag_name].add(field_name) if valid_field["remove_any_row_if_absent"]: required_cols[tag_name].add(field_name) + if valid_field["inherit_above"]: + forward_fill_cols[tag_name].add(field_name) + known_cols[tag_name].add(field_name) for valid_field_name in valid_field_names: @@ -564,20 +580,26 @@ def to_tag(s: str) -> Tag: if base_tag in valid_column_names: valid_column_names[tag_name] = valid_column_names[base_tag] discard_if_empty.append(tag_name) + if base_tag in column_default_value: + column_default_value[tag_name] = column_default_value[base_tag] if base_tag in row_comment_chars: row_comment_chars[tag_name] = row_comment_chars[base_tag] if base_tag in query_cols: query_cols[tag_name] = query_cols[base_tag] if base_tag in known_cols: known_cols[tag_name] = known_cols[base_tag] + if base_tag in forward_fill_cols: + forward_fill_cols[tag_name] = forward_fill_cols[base_tag] return ( valid_column_names, + column_default_value, row_comment_chars, discard_if_empty, query_cols, known_cols, required_cols, + forward_fill_cols, ) @staticmethod diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 7720ddb..ece9d66 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -884,10 +884,17 @@ def fill_in_missing_values( def fill_in_missing_values_table(table): df = table.dataframe.copy() + default_values = config.column_default_value.get(table.tag, {}) + for colname in df.columns: # TODO make this more declarative - if colname in ["sets", "csets", "process"]: + # Forwards fill values in columns + if colname in config.forward_fill_cols[table.tag]: df[colname] = df[colname].ffill() + # Apply default values to missing cells + col_default_value = default_values.get(colname) + if col_default_value is not None: + df[colname] = df[colname].fillna(col_default_value) elif colname == "limtype" and table.tag == Tag.fi_comm and False: isna = df[colname].isna() ismat = df["csets"] == "MAT" @@ -944,8 +951,8 @@ def fill_in_missing_values_table(table): return replace(table, dataframe=df) for table in tables: - if table.tag == Tag.tfm_upd: - # Missing values in update tables are wildcards and should not be filled in + if table.tag in [Tag.tfm_mig, Tag.tfm_upd]: + # Missing values in these tables are wildcards and should not be filled in result.append(table) else: result.append(fill_in_missing_values_table(table)) @@ -1194,7 +1201,7 @@ def capitalise_table_values( """Ensure that all table entries are uppercase. Strip leading and trailing whitespace.""" def capitalise_table_entries(table: EmbeddedXlTable): - df = table.dataframe.copy() + df = table.dataframe # Capitalise all entries if column type string colnames = df.select_dtypes(include="object").columns seen_cols = [colname for colname in colnames if colname in df.columns] @@ -1203,8 +1210,7 @@ def capitalise_table_entries(table: EmbeddedXlTable): # Index of rows with string entries i = df[seen_col].apply(lambda x: isinstance(x, str)) if any(i): - df.loc[i, seen_col] = df[seen_col][i].str.upper() - df.loc[i, seen_col] = df[seen_col][i].str.strip() + df.loc[i, seen_col] = df[seen_col][i].str.upper().str.strip() return replace(table, dataframe=df) else: return table @@ -2129,22 +2135,13 @@ def process_transform_availability( return result -def filter_by_pattern(df: pd.DataFrame, pattern: str, combined: bool) -> pd.DataFrame: - """ - Filter dataframe index by a regex pattern. Parameter combined indicates whether commas should - be treated as a pattern separator or belong to the pattern. - """ +def filter_by_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame: + """Filter dataframe index by a regex pattern.""" # Duplicates can be created when a process has multiple commodities that match the pattern - df = df.filter( - regex=utils.create_regexp(pattern, combined), axis="index" - ).drop_duplicates() - if combined: - exclude = df.filter( - regex=utils.create_negative_regexp(pattern), axis="index" - ).index - return df.drop(exclude) - else: - return df + df = df.filter(regex=utils.create_regexp(pattern), axis="index").drop_duplicates() + exclude = df.filter(regex=utils.create_negative_regexp(pattern), axis="index").index + + return df.drop(exclude) def intersect(acc, df): @@ -2161,7 +2158,7 @@ def get_matching_processes( if col in row.index and row[col] not in {None, ""}: proc_set = topology[key] pattern = row[col].upper() - filtered = filter_by_pattern(proc_set, pattern, col != "pset_pd") + filtered = filter_by_pattern(proc_set, pattern) matching_processes = intersect(matching_processes, filtered) if matching_processes is not None and any(matching_processes.duplicated()): @@ -2176,7 +2173,7 @@ def get_matching_commodities(row: pd.Series, topology: dict[str, DataFrame]): if col in row.index and row[col] not in {None, ""}: matching_commodities = intersect( matching_commodities, - filter_by_pattern(topology[key], row[col].upper(), col != "cset_cd"), + filter_by_pattern(topology[key], row[col].upper()), ) return matching_commodities @@ -2201,7 +2198,9 @@ def generate_topology_dictionary( dictionary = dict() pros = model.processes coms = model.commodities - pros_and_coms = tables[Tag.fi_t] + pros_and_coms = model.topology[["process", "commodity", "io"]].drop_duplicates() + i_comm_in = pros_and_coms["io"] == "IN" + i_comm_out = pros_and_coms["io"] == "OUT" dict_info = [ {"key": "processes_by_name", "df": pros[["process"]], "col": "process"}, @@ -2213,13 +2212,13 @@ def generate_topology_dictionary( {"key": "processes_by_sets", "df": pros[["process", "sets"]], "col": "sets"}, { "key": "processes_by_comm_in", - "df": pros_and_coms[["process", "commodity-in"]], - "col": "commodity-in", + "df": pros_and_coms[["process", "commodity"]][i_comm_in], + "col": "commodity", }, { "key": "processes_by_comm_out", - "df": pros_and_coms[["process", "commodity-out"]], - "col": "commodity-out", + "df": pros_and_coms[["process", "commodity"]][i_comm_out], + "col": "commodity", }, {"key": "commodities_by_name", "df": coms[["commodity"]], "col": "commodity"}, { diff --git a/xl2times/utils.py b/xl2times/utils.py index 2c972d9..b548aff 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -247,17 +247,18 @@ def remove_positive_patterns(pattern: str) -> str: return ",".join([word[1:] for word in pattern.split(",") if word[0] == "-"]) +def remove_whitespace(pattern: str) -> str: + return ",".join([word.strip() for word in pattern.split(",")]) + + @functools.lru_cache(maxsize=int(1e6)) def create_regexp(pattern: str, combined: bool = True) -> str: - # Distinguish comma-separated list of patterns vs a pattern with a comma(s) - if combined: - # Remove whitespaces - pattern = pattern.replace(" ", "") - # Exclude negative patterns - if has_negative_patterns(pattern): - pattern = remove_negative_patterns(pattern) - # Handle comma-separated values - pattern = pattern.replace(",", r"$|^") + pattern = remove_whitespace(pattern) + # Exclude negative patterns + if has_negative_patterns(pattern): + pattern = remove_negative_patterns(pattern) + # Handle comma-separated values + pattern = pattern.replace(",", r"$|^") if len(pattern) == 0: return r".*" # matches everything # Handle substite VEDA wildcards with regex patterns @@ -271,8 +272,7 @@ def create_regexp(pattern: str, combined: bool = True) -> str: @functools.lru_cache(maxsize=int(1e6)) def create_negative_regexp(pattern: str) -> str: - # Remove whitespaces - pattern = pattern.replace(" ", "") + pattern = remove_whitespace(pattern) # Exclude positive patterns pattern = remove_positive_patterns(pattern) if len(pattern) == 0: