From 8d066d011322368fdbbee6f481b1274fefc33e41 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Fri, 9 Feb 2024 17:00:51 -0500 Subject: [PATCH] Create trade links (#167) --- xl2times/__main__.py | 4 +- xl2times/config/veda-tags.json | 66 +++++++++++ xl2times/transforms.py | 194 ++++++++++++++++++++++++++++++--- 3 files changed, 250 insertions(+), 14 deletions(-) diff --git a/xl2times/__main__.py b/xl2times/__main__.py index f9545256..ceb2d67e 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -71,6 +71,7 @@ def convert_xl_to_times( transforms.process_time_slices, transforms.process_transform_insert_variants, transforms.process_transform_tables, + transforms.process_tradelinks, transforms.process_processes, transforms.process_topology, transforms.process_flexible_import_tables, # slow @@ -86,9 +87,10 @@ def convert_xl_to_times( transforms.apply_fixups, transforms.generate_commodity_groups, transforms.fill_in_missing_pcgs, - transforms.generate_top_ire, + transforms.generate_trade, transforms.include_tables_source, transforms.merge_tables, + transforms.complete_processes, transforms.apply_more_fixups, transforms.process_units, transforms.process_years, diff --git a/xl2times/config/veda-tags.json b/xl2times/config/veda-tags.json index d94ab8b4..f57d0025 100644 --- a/xl2times/config/veda-tags.json +++ b/xl2times/config/veda-tags.json @@ -2845,6 +2845,72 @@ "SysSettings" ] }, + { + "tag_name": "tradelinks_dins", + "tag_allowed_in": [ + "TradeScen" + ], + "valid_fields": [ + { + "name": "reg1", + "aliases": [], + "use_name": "reg1", + "row_ignore_symbol": [ + "\\I:", + "*" + ] + }, + { + "name": "reg2", + "aliases": [], + "use_name": "reg2", + "row_ignore_symbol": [ + "\\I:", + "*" + ] + }, + { + "name": "comm", + "aliases": [], + "use_name": "comm", + "row_ignore_symbol": [ + "\\I:" + ] + }, + { + "name": "comm1", + "aliases": [], + "use_name": "comm1", + "row_ignore_symbol": [ + "\\I:" + ] + }, + { + "name": "comm2", + "aliases": [], + "use_name": 
"comm2", + "row_ignore_symbol": [ + "\\I:" + ] + }, + { + "name": "tech", + "aliases": [], + "use_name": "process", + "row_ignore_symbol": [ + "\\I:" + ] + }, + { + "name": "tradelink", + "aliases": [], + "use_name": "tradelink", + "row_ignore_symbol": [ + "\\I:" + ] + } + ] + }, { "tag_name": "uc_t", "tag_allowed_in": [ diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 3a269d23..521a9f7f 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -716,6 +716,7 @@ def remove_invalid_values( """ # TODO: This should be table type specific # TODO pull this out + # TODO: This should take into account whether a specific dimension is required # Rules for allowing entries. Each entry of the dictionary designates a rule for a # a given column, and the values that are allowed for that column. constraints = { @@ -1081,7 +1082,7 @@ def complete_commodity_groups( return tables -def generate_top_ire( +def generate_trade( config: datatypes.Config, tables: List[datatypes.EmbeddedXlTable], model: datatypes.TimesModel, @@ -1134,7 +1135,42 @@ def generate_top_ire( top_ire.drop(columns=["region", "region2", "sets", "io"], inplace=True) top_ire.drop_duplicates(keep="first", inplace=True, ignore_index=True) - model.trade = top_ire + cols_list = ["origin", "in", "destination", "out", "process"] + # Include trade between internal regions + for table in tables: + if table.tag == datatypes.Tag.tradelinks_dins: + df = table.dataframe + f_links = df.rename( + columns={ + "reg1": "origin", + "comm1": "in", + "reg2": "destination", + "comm2": "out", + } + ).copy() + top_ire = pd.concat([top_ire, f_links[cols_list]]) + # Check if any of the links are bi-directional + if "b" in df["tradelink"].str.lower().unique(): + b_links = ( + df[df["tradelink"].str.lower() == "b"] + .rename( + columns={ + "reg1": "destination", + "comm1": "out", + "reg2": "origin", + "comm2": "in", + } + ) + .copy() + ) + top_ire = pd.concat([top_ire, b_links[cols_list]]) + + filter_regions = 
def process_tradelinks(
    config: datatypes.Config,
    tables: List[datatypes.EmbeddedXlTable],
    model: datatypes.TimesModel,
) -> List[datatypes.EmbeddedXlTable]:
    """
    Transform ~TradeLinks tables into the ~TradeLinks_DINS format.

    Each ~TradeLinks table is a region-by-region matrix for a single commodity:
    the first column holds the origin region, the remaining columns are
    destination regions, and a cell value of 1 marks a trade link.  The sheet
    name fixes the link direction: "uni" -> unidirectional, "bi" ->
    bidirectional; for any other sheet name the direction is inferred per
    region pair (a pair declared in both directions is bidirectional).

    :param config: Global configuration (unused here; kept for the pipeline
                   transform signature).
    :param tables: List of tables in EmbeddedXlTable format.
    :param model:  TimesModel being built (unused here).
    :return:       List of tables with each ~TradeLinks table replaced by an
                   equivalent ~TradeLinks_DINS table.
    """
    result = []
    for table in tables:
        if table.tag != datatypes.Tag.tradelinks:
            result.append(table)
            continue

        df = table.dataframe
        sheetname = table.sheetname.lower()
        comm = df.columns[0]
        destinations = [c for c in df.columns if c != comm]
        df.rename(columns={comm: "origin"}, inplace=True)
        # Matrix -> long format: one row per (origin, destination) link.
        df = pd.melt(
            df, id_vars=["origin"], value_vars=destinations, var_name="destination"
        )
        df = df[df["value"] == 1].drop(columns=["value"])
        df["destination"] = df["destination"].str.upper()
        df.drop_duplicates(keep="first", inplace=True)

        if sheetname == "uni":
            df["tradelink"] = "u"
        elif sheetname == "bi":
            df["tradelink"] = "b"
        else:
            df["tradelink"] = 1
            # BUGFIX: the directionless pair key must exist *before* grouping
            # on it; previously this branch grouped/merged on a "regions"
            # column that was only created further below, raising a KeyError.
            df["regions"] = df.apply(
                lambda row: tuple(sorted([row["origin"], row["destination"]])),
                axis=1,
            )
            # Determine whether a trade link is bi- or unidirectional: a pair
            # occurring twice was declared in both directions.
            td_type = df.groupby(["regions"])["tradelink"].agg("count").reset_index()
            td_type.replace({"tradelink": {1: "u", 2: "b"}}, inplace=True)
            df.drop(columns=["tradelink"], inplace=True)
            df = df.merge(td_type, how="inner", on="regions")
            # Recreated below with direction-aware semantics.
            df.drop(columns=["regions"], inplace=True)

        # Add a column containing linked regions (directionless for
        # bidirectional links, so the reverse duplicate can be dropped).
        df["regions"] = df.apply(
            lambda row: tuple(sorted([row["origin"], row["destination"]]))
            if row["tradelink"] == "b"
            else tuple([row["origin"], row["destination"]]),
            axis=1,
        )

        # Drop tradelink (bidirectional) duplicates
        df.drop_duplicates(subset=["regions", "tradelink"], keep="last", inplace=True)
        df.drop(columns=["regions"], inplace=True)
        df["comm"] = comm.upper()
        df["comm1"] = df["comm"]
        df["comm2"] = df["comm"]
        df.rename(columns={"origin": "reg1", "destination": "reg2"}, inplace=True)
        # Use Veda approach to naming of trade processes:
        # T<U|B>_<comm>_<reg1>_<reg2>_01
        df["process"] = df.apply(
            lambda row: "T"
            + "_".join(
                [
                    row["tradelink"].upper(),
                    row["comm"],
                    row["reg1"],
                    row["reg2"],
                    "01",
                ]
            ),
            axis=1,
        )
        result.append(
            replace(table, dataframe=df, tag=datatypes.Tag.tradelinks_dins)
        )

    return result
def complete_processes(
    config: datatypes.Config,
    tables: Dict[str, DataFrame],
    model: datatypes.TimesModel,
) -> Dict[str, DataFrame]:
    """Generate process declarations implied by trade links.

    Every trade link requires an IRE process in its origin region (consuming
    the "in" commodity) and in its destination region (producing the "out"
    commodity).  Any such process not already declared in ``model.processes``
    is appended, inheriting its primary commodity group, timeslice level and
    activity unit from the traded commodity.  Only internal regions are
    considered.  ``tables`` is returned unchanged.
    """
    # One row per (region, process, commodity) implied by a trade link:
    # the origin side consumes "in", the destination side produces "out".
    origin_side = model.trade.loc[:, ["origin", "process", "in"]].rename(
        columns={"origin": "region", "in": "commodity"}
    )
    destination_side = model.trade.loc[:, ["destination", "process", "out"]].rename(
        columns={"destination": "region", "out": "commodity"}
    )
    implied = pd.concat(
        [origin_side, destination_side], ignore_index=True, sort=False
    )

    # Keep only internal-region entries whose process is not yet declared.
    flagged = implied.merge(
        model.processes.loc[:, ["region", "process"]], how="left", indicator=True
    )
    keep = flagged["region"].isin(model.internal_regions) & (
        flagged["_merge"] == "left_only"
    )
    missing = flagged.loc[keep, ["region", "process", "commodity"]]

    # Derive process attributes from the traded commodity's attributes.
    missing = missing.merge(
        model.commodities.loc[:, ["region", "commodity", "csets", "ctslvl", "unit"]],
        how="left",
    )
    missing = missing.drop(columns=["commodity"]).rename(
        columns={"csets": "primarycg", "ctslvl": "tslvl", "unit": "tact"}
    )
    missing["sets"] = "IRE"
    missing = missing.drop_duplicates(keep="last")

    # TODO: Handle possible duplicates
    for i in ("primarycg", "tslvl", "tact"):
        dup_mask = missing.loc[:, ["region", "process", i]].duplicated(keep=False)
        if dup_mask.any():
            dups = missing.loc[dup_mask, ["region", "process", i]]
            processes = dups["process"].unique()
            regions = dups["region"].unique()
            print(f"WARNING: Multiple possible {i} for {processes} in {regions}")

    model.processes = pd.concat([model.processes, missing], ignore_index=True)

    return tables