Refine application of trans files (#263)
Refine the application of the tfm tables in trans files by removing from
the generated tables any row that:
- for an attribute with process in its indices, includes a process that
  is not declared in the corresponding non-trans file;
- for an attribute with commodity in its indices, includes a commodity
  that is not declared in the corresponding non-trans file, base, or
  syssettings.

This PR should enable #256
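For illustration, here is a minimal sketch of the rule described above, using toy data. All names (`declared`, `generated`, `MODULE_A`, `PRC1`, etc.) are invented for the example and are not part of the codebase:

```python
import pandas as pd

# Hypothetical per-module sets of declared processes (a stand-in for the
# declarations found in the corresponding non-trans file).
declared = {"MODULE_A": {"PRC1", "PRC2"}}

# Rows generated from a trans file for MODULE_A; ACT_COST has process in
# its indices, so each row must reference a declared process.
generated = pd.DataFrame(
    {
        "attribute": ["ACT_COST", "ACT_COST"],
        "process": ["PRC1", "PRC9"],  # PRC9 is not declared in MODULE_A
        "value": [1.0, 2.0],
    }
)

# Keep only rows whose process is declared; the PRC9 row is dropped.
generated = generated[generated["process"].isin(declared["MODULE_A"])]
```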
olejandro authored Dec 26, 2024
1 parent d2bc4da commit 1d853ca
Showing 2 changed files with 53 additions and 14 deletions.
2 changes: 1 addition & 1 deletion xl2times/__main__.py
@@ -136,12 +136,12 @@ def convert_xl_to_times(
transforms.generate_uc_properties,
transforms.expand_rows_parallel, # slow
transforms.remove_invalid_values,
- transforms.include_tables_source,
transforms.internalise_commodities,
transforms.generate_commodity_groups,
transforms.apply_fixups,
transforms.fill_in_missing_pcgs,
transforms.generate_trade,
+ transforms.include_tables_source,
transforms.merge_tables,
transforms.complete_processes,
transforms.process_units,
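The list above is an ordered pipeline: each transform consumes the tables produced by the one before it, so moving `include_tables_source` after `generate_trade` changes which data it sees. A minimal sketch of that pattern follows; the `run_pipeline` helper and the `Tables` alias are hypothetical, not part of xl2times (whose transforms also take additional arguments):

```python
from typing import Callable

# Hypothetical stand-in for the table container passed between transforms.
Tables = dict

def run_pipeline(
    tables: Tables, transforms: list[Callable[[Tables], Tables]]
) -> Tables:
    # Each transform sees the output of its predecessors, so list order
    # determines what data a later step can rely on.
    for transform in transforms:
        tables = transform(tables)
    return tables
```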
65 changes: 52 additions & 13 deletions xl2times/transforms.py
@@ -2493,11 +2493,11 @@ def apply_transform_tables(
# Determine the rows that won't be updated
tables[Tag.fi_t] = _remove_invalid_rows_(
tables[Tag.fi_t], updates, modules_with_ava
- )
+ ).reset_index(drop=True)
# TODO: This should happen much earlier in the process
model.processes = _remove_invalid_rows_(
model.processes, updates, modules_with_ava
- )
+ ).reset_index(drop=True)
# TODO: should be unnecessary if model.processes is updated early enough
# Remove topology rows that are not in the processes
model.topology = pd.merge(
@@ -2507,6 +2507,28 @@
how="inner",
)

+ # Create a dictionary of processes/commodities indexed by module name
+ obj_by_module = dict()
+ obj_by_module["process"] = (
+     model.processes.groupby("module_name")["process"].agg(set).to_dict()
+ )
+ obj_by_module["commodity"] = (
+     model.commodities.groupby("module_name")["commodity"].agg(set).to_dict()
+ )
+ # Create a dictionary of processes/commodities available in addition to those declared in a module
+ obj_suppl = dict()
+ obj_suppl["process"] = set()
+ obj_suppl["commodity"] = (
+     obj_by_module["commodity"]
+     .get("BASE", set())
+     .union(obj_by_module["commodity"].get("SYSSETTINGS", set()))
+ )
+ # Create sets of attributes that require a process/commodity index
+ attr_with_obj = {
+     obj: {attr.times_name for attr in config.times_xl_maps if obj in attr.xl_cols}
+     for obj in ["process", "commodity"]
+ }

if Tag.tfm_comgrp in tables:
table = model.commodity_groups
updates = tables[Tag.tfm_comgrp].filter(table.columns, axis=1)
@@ -2517,23 +2539,23 @@
model.commodity_groups = commodity_groups.dropna()

for data_module in model.data_modules:
+ generated_records = []
if (
Tag.tfm_dins in tables
and data_module in tables[Tag.tfm_dins]["module_name"].unique()
):
table = tables[Tag.fi_t]
index = tables[Tag.tfm_dins]["module_name"] == data_module
updates = tables[Tag.tfm_dins][index].filter(table.columns, axis=1)
- tables[Tag.fi_t] = pd.concat([table, updates], ignore_index=True)

+ generated_records.append(updates)
if (
Tag.tfm_ins in tables
and data_module in tables[Tag.tfm_ins]["module_name"].unique()
):
table = tables[Tag.fi_t]
index = tables[Tag.tfm_ins]["module_name"] == data_module
updates = tables[Tag.tfm_ins][index].filter(table.columns, axis=1)
- tables[Tag.fi_t] = pd.concat([table, updates], ignore_index=True)
+ generated_records.append(updates)

if (
Tag.tfm_ins_txt in tables
@@ -2587,9 +2609,7 @@
index = tables[Tag.tfm_upd]["module_name"] == data_module
updates = tables[Tag.tfm_upd][index]
table = tables[Tag.fi_t]
- new_tables = [table]
- # Reset FI_T index so that queries can determine unique rows to update
- tables[Tag.fi_t].reset_index(inplace=True, drop=True)
+ new_tables = []

# TFM_UPD: expand wildcards in each row, query FI_T to find matching rows,
# evaluate the update formula, and add new rows to FI_T
@@ -2640,8 +2660,7 @@
new_rows["submodule"] = row["submodule"]
new_tables.append(new_rows)

- # Add new rows to table
- tables[Tag.fi_t] = pd.concat(new_tables, ignore_index=True)
+ generated_records.append(pd.concat(new_tables, ignore_index=True))

if (
Tag.tfm_mig in tables
@@ -2703,9 +2722,29 @@
new_rows["submodule"] = row["submodule"]
new_tables.append(new_rows)

- # Add new rows to table
- new_tables.append(tables[Tag.fi_t])
- tables[Tag.fi_t] = pd.concat(new_tables, ignore_index=True)
+ generated_records.append(pd.concat(new_tables, ignore_index=True))

+ if generated_records:
+     module_data = pd.concat(generated_records, ignore_index=True)
+     module_type = module_data["module_type"].iloc[0]
+     # Explode process and commodity columns and remove invalid rows
+     for obj in ["process", "commodity"]:
+         if obj in module_data.columns:
+             module_data = module_data.explode(obj, ignore_index=True)
+             if module_type in {"base", "subres"}:
+                 valid_objs = (
+                     obj_by_module[obj]
+                     .get(data_module, set())
+                     .union(obj_suppl[obj])
+                 )
+                 drop = ~module_data[obj].isin(valid_objs) & module_data[
+                     "attribute"
+                 ].isin(attr_with_obj[obj])
+                 module_data = module_data[~drop]
+     if not module_data.empty:
+         tables[Tag.fi_t] = pd.concat(
+             [tables[Tag.fi_t], module_data], ignore_index=True
+         )

return tables

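The new filtering block relies on two pandas idioms: `explode`, which turns a cell holding a set of objects into one row per object, and `isin`, which builds the boolean mask of rows to drop. Below is a small self-contained sketch under toy data; the frame contents and the `valid_objs`/`attrs_with_obj` values are invented for illustration:

```python
import pandas as pd

# Toy generated records: process cells hold sets after wildcard expansion;
# G_DRATE stands in for an attribute without a process index.
module_data = pd.DataFrame(
    {
        "attribute": ["FLO_SHAR", "FLO_SHAR", "G_DRATE"],
        "process": [{"PRC1", "PRC9"}, {"PRC2"}, None],
    }
)
valid_objs = {"PRC1", "PRC2"}   # objects declared in the module (plus any supplement)
attrs_with_obj = {"FLO_SHAR"}   # attributes whose indices include a process

# One object per row, then drop rows that name an undeclared object for
# an attribute that requires one.
module_data = module_data.explode("process", ignore_index=True)
drop = ~module_data["process"].isin(valid_objs) & module_data["attribute"].isin(
    attrs_with_obj
)
module_data = module_data[~drop]  # only the PRC9 row is removed
```

The G_DRATE row survives even though its process cell is empty, because the mask also requires the attribute to be one that takes a process index; this mirrors why the commit combines the `isin` check on objects with the `attr_with_obj` check on attributes.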
