diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index ccc16a1..28b8993 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -147,10 +147,10 @@ def convert_xl_to_times(
         transforms.process_units,
         transforms.complete_commodity_groups,
         transforms.process_wildcards,
+        transforms.convert_aliases,
         transforms.apply_transform_tables,
         transforms.explode_process_commodity_cols,
         transforms.apply_final_fixup,
-        transforms.convert_aliases,
         transforms.assign_model_attributes,
         transforms.fix_topology,
         transforms.resolve_remaining_cgs,
diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index 4762717..9c9aa89 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -4,6 +4,7 @@
 from collections.abc import Iterable
 from dataclasses import dataclass, field
 from enum import Enum
+from functools import cached_property
 from importlib import resources
 from itertools import chain
 from pathlib import PurePath
@@ -266,6 +267,21 @@ def model_years(self) -> set[int]:
         """
         return self.past_years | set(self.time_periods["m"].values)
 
+    # TODO: Invalidate and recompute the below property when self.topology changes.
+    @cached_property
+    def veda_cgs(self) -> dict[tuple[str, str, str], str]:
+        """A dictionary mapping commodities to their Veda commodity groups."""
+        cols = ["region", "process", "commodity", "csets"]
+        # Exclude auxiliary flows
+        index = self.topology["io"].isin({"IN", "OUT"})
+        veda_cgs = self.topology[cols + ["io"]][index].copy()
+        veda_cgs.drop_duplicates(subset=cols, keep="last", inplace=True)
+        veda_cgs["veda_cg"] = veda_cgs["csets"] + veda_cgs["io"].str[:1]
+        veda_cgs = veda_cgs.set_index(["region", "process", "commodity"])[
+            "veda_cg"
+        ].to_dict()
+        return veda_cgs
+
 
 class Config:
     """Encapsulates all configuration options for a run of the tool, including the
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index 85feddd..83e803b 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -360,9 +360,9 @@ def include_tables_source(
     def include_table_source(table: EmbeddedXlTable):
         df = table.dataframe.copy()
         df["source_filename"] = Path(table.filename).stem
-        df["data_module_type"] = DataModule.module_type(table.filename)
-        df["data_submodule"] = DataModule.submodule(table.filename)
-        df["data_module_name"] = DataModule.module_name(table.filename)
+        df["module_type"] = DataModule.module_type(table.filename)
+        df["submodule"] = DataModule.submodule(table.filename)
+        df["module_name"] = DataModule.module_name(table.filename)
         return replace(table, dataframe=df)
 
     return [include_table_source(table) for table in tables]
@@ -1210,23 +1210,53 @@ def capitalise_attributes_table(table: EmbeddedXlTable):
     return [capitalise_attributes_table(table) for table in tables]
 
 
+def _populate_defaults(
+    tag: Tag,
+    dataframe: DataFrame,
+    col_name: str,
+    config: Config,
+    attr_col_name: str = "attribute",
+):
+    """Fill in some of the missing values in place, based on attribute defaults."""
+    starting_na = (
+        dataframe[attr_col_name]
+        .str.upper()
+        .isin(config.veda_attr_defaults[col_name].keys())
+        & dataframe[col_name].isna()
+    )
+    if any(starting_na):
+        attributes = dataframe[starting_na][attr_col_name].unique()
+        for attr in attributes:
+            i_attr = dataframe[attr_col_name] == attr
+            default_values = config.veda_attr_defaults[col_name][attr.upper()]
+            for default_value in default_values:
+                # Ensure that previously filled values are not overwritten
+                current_na = dataframe[col_name].isna()
+                remaining_na = starting_na & i_attr & current_na
+                if any(remaining_na):
+                    if default_value not in config.known_columns[tag]:
+                        dataframe.loc[remaining_na, [col_name]] = default_value
+                    elif default_value in dataframe.columns:
+                        dataframe.loc[remaining_na, [col_name]] = dataframe[
+                            remaining_na
+                        ][default_value]
+
+
+def _populate_calculated_defaults(df: DataFrame, model: TimesModel):
+    """Determine and fill in index values that are calculated from the model."""
+    if any(df["other_indexes"] == "veda_cg"):
+        i = df["other_indexes"] == "veda_cg"
+        df.loc[i, "other_indexes"] = df[i].apply(
+            lambda x: model.veda_cgs.get((x["region"], x["process"], x["commodity"])),
+            axis=1,
+        )
+
+
 def apply_fixups(
     config: Config,
     tables: list[EmbeddedXlTable],
     model: TimesModel,
 ) -> list[EmbeddedXlTable]:
-
-    # Generate Veda CG info
-    cols = ["region", "process", "commodity", "csets"]
-    # Exclude auxillary flows
-    index = model.topology["io"].isin({"IN", "OUT"})
-    veda_cgs = model.topology[cols + ["io"]][index].copy()
-    veda_cgs.drop_duplicates(subset=cols, keep="last", inplace=True)
-    veda_cgs["veda_cg"] = veda_cgs["csets"] + veda_cgs["io"].str[:1]
-    veda_cgs = veda_cgs.set_index(["region", "process", "commodity"])[
-        "veda_cg"
-    ].to_dict()
-
     def apply_fixups_table(table: EmbeddedXlTable):
         tag = Tag.fi_t
         if not table.tag == tag:
@@ -1238,41 +1268,9 @@ def apply_fixups_table(table: EmbeddedXlTable):
         if "year" in df.columns:
             df["year"] = pd.to_numeric(df["year"], errors="coerce")
 
-        def _populate_defaults(dataframe: DataFrame, col_name: str):
-            """Fill in some of the missing values based on defaults in place."""
-            i_na = (
-                dataframe["attribute"]
-                .str.upper()
-                .isin(config.veda_attr_defaults[col_name].keys())
-                & dataframe[col_name].isna()
-            )
-            if any(i_na):
-                for attr in dataframe[i_na]["attribute"].unique():
-                    i_attr = dataframe["attribute"] == attr
-                    for default_value in config.veda_attr_defaults[col_name][
-                        attr.upper()
-                    ]:
-                        # Ensure that previously filled values are not overwritten
-                        i_fill = i_na & i_attr & dataframe[col_name].isna()
-                        if any(i_fill):
-                            if default_value not in config.known_columns[tag]:
-                                dataframe.loc[i_fill, [col_name]] = default_value
-                            else:
-                                dataframe.loc[i_fill, [col_name]] = dataframe[i_fill][
-                                    default_value
-                                ]
-
         # Populate commodity and other_indexes based on defaults
         for col in ("commodity", "other_indexes"):
-            _populate_defaults(df, col)
-
-        # Determine values of and fill in some indexes
-        if any(df["other_indexes"] == "veda_cg"):
-            i = df["other_indexes"] == "veda_cg"
-            df.loc[i, "other_indexes"] = df[i].apply(
-                lambda x: veda_cgs.get((x["region"], x["process"], x["commodity"])),
-                axis=1,
-            )
+            _populate_defaults(tag, df, col, config)
 
         return replace(table, dataframe=df)
 
@@ -2158,7 +2156,7 @@ def get_matching_processes(
 ) -> pd.Series | None:
     matching_processes = None
     for col, key in process_map.items():
-        if col in row.index and row[col] is not None:
+        if col in row.index and row[col] not in {None, ""}:
             proc_set = topology[key]
             pattern = row[col].upper()
             filtered = filter_by_pattern(proc_set, pattern, col != "pset_pd")
@@ -2173,7 +2171,7 @@
 def get_matching_commodities(row: pd.Series, topology: dict[str, DataFrame]):
     matching_commodities = None
     for col, key in commodity_map.items():
-        if col in row.index and row[col] is not None:
+        if col in row.index and row[col] not in {None, ""}:
             matching_commodities = intersect(
                 matching_commodities,
                 filter_by_pattern(topology[key], row[col].upper(), col != "cset_cd"),
@@ -2257,12 +2255,13 @@ def process_wildcards(
         Tag.uc_t,
     ]
 
+    dictionary = generate_topology_dictionary(tables, model)
+
     for tag in tags:
         if tag in tqdm(tables, desc=f"Processing wildcards in {tag.value} tables"):
             start_time = time.time()
             df = tables[tag]
-            dictionary = generate_topology_dictionary(tables, model)
 
             if set(df.columns).intersection(set(process_map.keys())):
                 df = _match_wildcards(
@@ -2329,7 +2328,7 @@ def _match_wildcards(
     wild_cols = list(col_map.keys())
 
     # drop duplicate sets of wildcard columns to save repeated (slow) regex matching. This makes things much faster.
-    unique_filters = df[wild_cols].drop_duplicates().dropna(axis="rows", how="all")
+    unique_filters = df[wild_cols].drop_duplicates().dropna(axis=0, how="all")
 
     # match all the wildcards columns against the dictionary names
     matches = unique_filters.apply(lambda row: matcher(row, dictionary), axis=1)
@@ -2391,6 +2390,7 @@ def query(
     attribute: str | None,
     region: str | None,
     year: int | None,
+    limtype: str | None,
     val: int | float | None,
     module: str | None,
 ) -> pd.Index:
@@ -2420,10 +2420,12 @@
         qs.append(f"region == '{region}'")
     if year not in [None, ""]:
         qs.append(f"year == {year}")
+    if limtype not in [None, ""]:
+        qs.append(f"limtype == '{limtype}'")
     if val not in [None, ""]:
         qs.append(f"value == {val}")
     if module not in [None, ""]:
-        qs.append(f"data_module_name == '{module}'")
+        qs.append(f"module_name == '{module}'")
     query_str = " and ".join(qs)
     row_idx = table.query(query_str).index
     return row_idx
@@ -2447,7 +2449,7 @@ def _remove_invalid_rows_(
     """Remove rows with invalid process / region combination."""
     df = df.copy()
     # Index of rows that won't be checked
-    index = df[~df["data_module_name"].isin(limit_to_modules)].index
+    index = df[~df["module_name"].isin(limit_to_modules)].index
     # Don't check rows with empty process
     index = index.union(df[df["process"].isin(["", None])].index)
     # Keep only the valid process / region combinations
@@ -2461,7 +2463,8 @@
                 row["region"],
                 None,
                 None,
-                row["data_module_name"],
+                None,
+                row["module_name"],
             )
         )
 
@@ -2475,7 +2478,7 @@ def apply_transform_tables(
 ) -> dict[str, DataFrame]:
     """Include data from transformation tables."""
     if Tag.tfm_ava in tables:
-        modules_with_ava = list(tables[Tag.tfm_ava]["data_module_name"].unique())
+        modules_with_ava = list(tables[Tag.tfm_ava]["module_name"].unique())
         updates = tables[Tag.tfm_ava].explode("process", ignore_index=True)
         # Overwrite with the last value for each process/region pair
         updates = updates.drop_duplicates(
@@ -2516,27 +2519,27 @@
     for data_module in model.data_modules:
         if (
             Tag.tfm_dins in tables
-            and data_module in tables[Tag.tfm_dins]["data_module_name"].unique()
+            and data_module in tables[Tag.tfm_dins]["module_name"].unique()
         ):
             table = tables[Tag.fi_t]
-            index = tables[Tag.tfm_dins]["data_module_name"] == data_module
+            index = tables[Tag.tfm_dins]["module_name"] == data_module
             updates = tables[Tag.tfm_dins][index].filter(table.columns, axis=1)
             tables[Tag.fi_t] = pd.concat([table, updates], ignore_index=True)
 
         if (
             Tag.tfm_ins in tables
-            and data_module in tables[Tag.tfm_ins]["data_module_name"].unique()
+            and data_module in tables[Tag.tfm_ins]["module_name"].unique()
         ):
             table = tables[Tag.fi_t]
-            index = tables[Tag.tfm_ins]["data_module_name"] == data_module
+            index = tables[Tag.tfm_ins]["module_name"] == data_module
             updates = tables[Tag.tfm_ins][index].filter(table.columns, axis=1)
             tables[Tag.fi_t] = pd.concat([table, updates], ignore_index=True)
 
         if (
             Tag.tfm_ins_txt in tables
-            and data_module in tables[Tag.tfm_ins_txt]["data_module_name"].unique()
+            and data_module in tables[Tag.tfm_ins_txt]["module_name"].unique()
         ):
-            index = tables[Tag.tfm_ins_txt]["data_module_name"] == data_module
+            index = tables[Tag.tfm_ins_txt]["module_name"] == data_module
             updates = tables[Tag.tfm_ins_txt][index]
 
             # TFM_INS-TXT: expand row by wildcards, query FI_PROC/COMM for matching rows,
@@ -2563,6 +2566,7 @@
                     None,
                     None,
                     None,
+                    None,
                 )
                 # Overwrite (inplace) the column given by the attribute (translated by attr_prop)
                 # with the value from row
@@ -2578,9 +2582,9 @@
 
         if (
             Tag.tfm_upd in tables
-            and data_module in tables[Tag.tfm_upd]["data_module_name"].unique()
+            and data_module in tables[Tag.tfm_upd]["module_name"].unique()
         ):
-            index = tables[Tag.tfm_upd]["data_module_name"] == data_module
+            index = tables[Tag.tfm_upd]["module_name"] == data_module
             updates = tables[Tag.tfm_upd][index]
             table = tables[Tag.fi_t]
             new_tables = [table]
@@ -2595,8 +2599,8 @@
                 total=len(updates),
                 desc=f"Applying transformations from {Tag.tfm_upd.value} in {data_module}",
             ):
-                if row["data_module_type"] == "trans":
-                    source_module = row["data_module_name"]
+                if row["module_type"] == "trans":
+                    source_module = row["module_name"]
                 else:
                     source_module = row["sourcescen"]
 
@@ -2607,6 +2611,7 @@
                     row["attribute"],
                     row["region"],
                     row["year"],
+                    row["limtype"],
                     row["val_cond"],
                     source_module,
                 )
@@ -2620,19 +2625,19 @@
                 # In case more than one data module is present in the table, select the one with the highest index.
                 # TODO: The below code is commented out because it needs to be more sophisticated.
                 """
-                if new_rows["data_module_name"].nunique() > 1:
+                if new_rows["module_name"].nunique() > 1:
                     indices = {
                         model.data_modules.index(x)
-                        for x in new_rows["data_module_name"].unique()
+                        for x in new_rows["module_name"].unique()
                     }
                     new_rows = new_rows[
-                        new_rows["data_module_name"] == model.data_modules[max(indices)]
+                        new_rows["module_name"] == model.data_modules[max(indices)]
                     ]
                 """
                 new_rows["source_filename"] = row["source_filename"]
-                new_rows["data_module_name"] = row["data_module_name"]
-                new_rows["data_module_type"] = row["data_module_type"]
-                new_rows["data_submodule"] = row["data_submodule"]
+                new_rows["module_name"] = row["module_name"]
+                new_rows["module_type"] = row["module_type"]
+                new_rows["submodule"] = row["submodule"]
                 new_tables.append(new_rows)
 
             # Add new rows to table
@@ -2640,9 +2645,9 @@
 
         if (
             Tag.tfm_mig in tables
-            and data_module in tables[Tag.tfm_mig]["data_module_name"].unique()
+            and data_module in tables[Tag.tfm_mig]["module_name"].unique()
         ):
-            index = tables[Tag.tfm_mig]["data_module_name"] == data_module
+            index = tables[Tag.tfm_mig]["module_name"] == data_module
             updates = tables[Tag.tfm_mig][index]
             table = tables[Tag.fi_t]
             new_tables = []
@@ -2652,11 +2657,11 @@
                 total=len(updates),
                 desc=f"Applying transformations from {Tag.tfm_mig.value} in {data_module}",
             ):
-                if row["data_module_type"] == "trans":
-                    source_module = row["data_module_name"]
+                if row["module_type"] == "trans":
+                    source_module = row["module_name"]
                 else:
                     source_module = row["sourcescen"]
-                # TODO should we also query on limtype?
+
                 rows_to_update = query(
                     table,
                     row["process"],
@@ -2664,6 +2669,7 @@
                     row["attribute"],
                     row["region"],
                     row["year"],
+                    row["limtype"],
                     row["val_cond"],
                     source_module,
                 )
@@ -2682,19 +2688,19 @@
                 # In case more than one data module is present in the table, select the one with the highest index
                 # TODO: The below code is commented out because it needs to be more sophisticated.
                 """
-                if new_rows["data_module_name"].nunique() > 1:
+                if new_rows["module_name"].nunique() > 1:
                     indices = {
                         model.data_modules.index(x)
-                        for x in new_rows["data_module_name"].unique()
+                        for x in new_rows["module_name"].unique()
                     }
                     new_rows = new_rows[
-                        new_rows["data_module_name"] == model.data_modules[max(indices)]
+                        new_rows["module_name"] == model.data_modules[max(indices)]
                     ]
                 """
                 new_rows["source_filename"] = row["source_filename"]
-                new_rows["data_module_name"] = row["data_module_name"]
-                new_rows["data_module_type"] = row["data_module_type"]
-                new_rows["data_submodule"] = row["data_submodule"]
+                new_rows["module_name"] = row["module_name"]
+                new_rows["module_type"] = row["module_type"]
+                new_rows["submodule"] = row["submodule"]
                 new_tables.append(new_rows)
 
             # Add new rows to table
@@ -2724,6 +2730,9 @@
         if "commodity" in df.columns:
             df = df.explode("commodity", ignore_index=True)
 
+        if "other_indexes" in df.columns:
+            df = df.explode("other_indexes", ignore_index=True)
+
         tables[tag] = df
 
     return tables
@@ -2875,7 +2884,7 @@ def convert_aliases(
     tables: dict[str, DataFrame],
     model: TimesModel,
 ) -> dict[str, DataFrame]:
-    # Ensure TIMES names for all attributes
+    """Ensure TIMES names for all attributes."""
    replacement_dict = {}
     for k, v in config.veda_attr_defaults["aliases"].items():
         for alias in v:
@@ -2883,17 +2892,10 @@ def convert_aliases(
 
     for table_type, df in tables.items():
         if "attribute" in df.columns:
+            df["original_attr"] = df["attribute"]
             df.replace({"attribute": replacement_dict}, inplace=True)
             tables[table_type] = df
 
-    # Drop duplicates generated due to renaming
-    # TODO: Clear values in irrelevant columns before doing this
-    # TODO: Do this comprehensively for all relevant tables
-    df = tables[Tag.fi_t]
-    df = df.dropna(subset="value").drop_duplicates(
-        subset=[col for col in df.columns if col != "value"], keep="last"
-    )
-    tables[Tag.fi_t] = df.reset_index(drop=True)
 
     return tables
@@ -3048,9 +3050,12 @@ def apply_final_fixup(
         reg_com_flows.drop_duplicates(inplace=True, ignore_index=True)
 
     df = tables[Tag.fi_t]
+    _populate_defaults(Tag.fi_t, df, "other_indexes", config, "original_attr")
+    _populate_calculated_defaults(df, model)
+
     # Fill other_indexes for COST
     cost_mapping = {"MIN": "IMP", "EXP": "EXP", "IMP": "IMP"}
-    cost_index = (df["attribute"] == "COST") & df["process"].notna()
+    cost_index = (df["original_attr"] == "COST") & df["process"].notna()
     if any(cost_index):
         processes = set(df[cost_index]["process"].unique())
 
@@ -3078,7 +3083,7 @@
         )
 
     # Use CommName to store the active commodity for EXP / IMP
-    index = df["attribute"].isin({"COST", "IRE_PRICE"})
+    index = df["original_attr"].isin({"COST", "IRE_PRICE"})
     if any(index):
         i_exp = index & (df["other_indexes"] == "EXP")
         df.loc[i_exp, "commodity"] = df.loc[i_exp, "commodity-in"]
@@ -3086,19 +3091,19 @@
         df.loc[i_imp, "commodity"] = df.loc[i_imp, "commodity-out"]
 
     # Fill CommName for COST (alias of IRE_PRICE) if missing
-    i_com_na = (df["attribute"] == "COST") & df["commodity"].isna()
+    i_com_na = (df["original_attr"] == "COST") & df["commodity"].isna()
     if any(i_com_na):
         comm_rp = reg_com_flows.groupby(["region", "process"]).agg(set)
         comm_rp["commodity"] = comm_rp["commodity"].str.join(",")
         df.set_index(["region", "process"], inplace=True)
-        i_cost = df["attribute"] == "COST"
+        i_cost = df["original_attr"] == "COST"
         df.loc[i_cost, "commodity"] = df["commodity"][i_cost].fillna(
             comm_rp["commodity"].to_dict()
         )
         df.reset_index(inplace=True)
 
     # Handle STOCK specified for a single year
-    stock_index = (df["attribute"] == "STOCK") & df["process"].notna()
+    stock_index = (df["original_attr"] == "STOCK") & df["process"].notna()
     if any(stock_index):
         # Temporary solution to include only processes defined in BASE
         i_vt = stock_index & (df["source_filename"].str.contains("VT_", case=False))
@@ -3128,7 +3133,7 @@
         if any(i_single_stock):
             default_life = 30
             life_rp = (
-                df[df["attribute"].isin({"NCAP_TLIFE", "LIFE"})]
+                df[df["original_attr"].isin({"NCAP_TLIFE", "LIFE"})]
                 .drop_duplicates(subset=["region", "process"], keep="last")
                 .set_index(["region", "process"])["value"]
             )
@@ -3147,7 +3152,29 @@
 
         df = pd.concat(df_list)
 
-    tables[Tag.fi_t] = df
+    # Clean up
+    # TODO: Do this comprehensively for all relevant tables
+    # TODO: Duplicates should only be removed if in the same file/module
+    keep_cols = {
+        "attribute",
+        "region",
+        "process",
+        "commodity",
+        "other_indexes",
+        "year",
+        "year2",
+        "timeslice",
+        "currency",
+        "limtype",
+        "sow",
+        "stage",
+        "module_name",
+    }
+    df.dropna(subset="value", inplace=True)
+    drop_cols = [col for col in df.columns if col != "value" and col not in keep_cols]
+    df.drop(columns=drop_cols, inplace=True)
+    df = df.drop_duplicates(subset=list(keep_cols), keep="last")
+    tables[Tag.fi_t] = df.reset_index(drop=True)
 
     return tables