Feature/transforms speedup #186

Merged 32 commits on Feb 15, 2024
Commits
e310134
Modified benchmark run in Windows to use direct function calls instea…
SamRWest Feb 14, 2024
43af588
testing regression code
SamRWest Feb 14, 2024
57bbb4d
limit max_workers on windows to reduce peak ram
SamRWest Feb 14, 2024
13f35ee
more process count limiting
SamRWest Feb 14, 2024
1c8e11f
move max_workers to utils.
SamRWest Feb 14, 2024
552af5b
Added more detail to benchmarking section in readme
SamRWest Feb 14, 2024
52ae6b5
added debug flag for running benchmarks as function calls so breakpoi…
SamRWest Feb 14, 2024
4572d0e
If benchmark fails in CI, try again without the --dd flag, to see if …
SamRWest Feb 14, 2024
beb05ad
Split into separate CI tasks with/without GAMS license
SamRWest Feb 14, 2024
f24ca91
update ci syntax
SamRWest Feb 14, 2024
319fa1a
fixed secret check
SamRWest Feb 14, 2024
8daecc4
add working dir
SamRWest Feb 14, 2024
1b8b0c9
Pass CI if no-GAMS regression tests pass
SamRWest Feb 14, 2024
bde5e42
Merge branch 'main' into feature/benchmarks_on_windows
SamRWest Feb 14, 2024
dd6fc95
Addressed review comments
SamRWest Feb 14, 2024
e5c9b1f
Merge remote-tracking branch 'github-etsap/feature/benchmarks_on_wind…
SamRWest Feb 14, 2024
00fed2b
vectorised loops in transforms.generate_commodity_groups()
SamRWest Feb 14, 2024
fc5ee0b
add tqdm package for progress bars
SamRWest Feb 14, 2024
e4d5bda
added percentage changes to benchmark results
SamRWest Feb 14, 2024
a9cae65
Added unit tests, test data, pytest lib/config and CI step
SamRWest Feb 14, 2024
25c48ee
removed commented calls, fixed typos
SamRWest Feb 15, 2024
7f399e5
Merge branch 'main' into feature/benchmarks_on_windows
SamRWest Feb 14, 2024
178e32e
Merge remote-tracking branch 'github-etsap/feature/benchmarks_on_wind…
SamRWest Feb 14, 2024
cd9d830
vectorised loops in transforms.generate_commodity_groups()
SamRWest Feb 14, 2024
e805543
add tqdm package for progress bars
SamRWest Feb 14, 2024
6078f4f
added percentage changes to benchmark results
SamRWest Feb 14, 2024
20becfa
Added unit tests, test data, pytest lib/config and CI step
SamRWest Feb 14, 2024
cb82f83
removed commented calls, fixed typos
SamRWest Feb 15, 2024
6127c04
Merge remote-tracking branch 'csiro-github/feature/transforms_speedup…
SamRWest Feb 15, 2024
e29c8b0
renamed input param
SamRWest Feb 15, 2024
fdb19a8
Merge branch 'main-github-etsap' into feature/transforms_speedup
SamRWest Feb 15, 2024
83ec9c7
addressed final comments in #184
SamRWest Feb 15, 2024
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
@@ -34,6 +34,12 @@ jobs:
pre-commit install
pre-commit run --all-files

- name: Run unit tests
working-directory: xl2times
run: |
source .venv/bin/activate
pytest

# ---------- Prepare ETSAP Demo models

- uses: actions/checkout@v3
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,3 +19,5 @@ benchmarks/
.python-version
docs/_build/
docs/api/
.coverage
/out.txt
2 changes: 1 addition & 1 deletion README.md
@@ -111,7 +111,6 @@ No regressions. You're awesome!
```
If you have a large increase in runtime, a decrease in correct rows or fewer rows being produced, then you've broken something and will need to figure out how to fix it.


### Debugging Regressions

If your change is causing regressions on one of the benchmarks, a useful way to debug and find the difference is to run the tool in verbose mode and compare the intermediate tables. For example, if your branch has regressions on Demo 1:
@@ -137,6 +136,7 @@ python -m build
python -m twine upload dist/*
```


## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
35 changes: 24 additions & 11 deletions pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools]
packages = ["xl2times", "utils"]
packages = ["xl2times"]

[project]
name = "xl2times"
@@ -14,21 +14,28 @@ requires-python = ">=3.10"
license = { file = "LICENSE" }
keywords = []
classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
]
dependencies = [
"GitPython >= 3.1.31, < 3.2",
"more-itertools",
"openpyxl >= 3.0, < 3.1",
"pandas >= 2.1",
"pyarrow"
"GitPython >= 3.1.31, < 3.2",
"more-itertools",
"openpyxl >= 3.0, < 3.1",
"pandas >= 2.1",
"pyarrow",
"tqdm",
]

[project.optional-dependencies]
dev = ["black", "pre-commit", "tabulate"]
dev = [
"black",
"pre-commit",
"tabulate",
"pytest",
"pytest-cov"
]

[project.urls]
Documentation = "https://github.com/etsap-TIMES/xl2times#readme"
@@ -37,3 +44,9 @@ Source = "https://github.com/etsap-TIMES/xl2times"

[project.scripts]
xl2times = "xl2times.__main__:main"

[tool.pytest.ini_options]
# don't print runtime warnings
filterwarnings = ["ignore::DeprecationWarning", "ignore::UserWarning", "ignore::FutureWarning"]
# show output, print test coverage report
addopts = '-s --durations=0 --durations-min=5.0 --tb=native --cov-report term --cov-report html --cov=xl2times --cov=utils'
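These options are picked up automatically whenever pytest runs from the repository root. As a minimal sketch (the `tests/` path here is illustrative), the suite can also be invoked programmatically with the same configuration:

```python
# Sketch: pytest reads [tool.pytest.ini_options] from pyproject.toml on its
# own, so this plain invocation inherits the addopts (output, durations,
# coverage) configured above.
import pytest

if __name__ == "__main__":
    raise SystemExit(pytest.main(["tests/"]))
```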
Binary file added tests/data/austimes_pcg_test_data.parquet
Binary file added tests/data/comm_groups_austimes_test_data.parquet
67 changes: 67 additions & 0 deletions tests/test_transforms.py
@@ -0,0 +1,67 @@
from datetime import datetime

import pandas as pd

from xl2times import transforms
from xl2times.transforms import (
_process_comm_groups_vectorised,
_count_comm_group_vectorised,
)

pd.set_option(
"display.max_rows",
20,
"display.max_columns",
20,
"display.width",
300,
"display.max_colwidth",
75,
"display.precision",
3,
)


class TestTransforms:
def test_generate_commodity_groups(self):
"""
Tests that the _count_comm_group_vectorised function works as expected.
Full austimes run:
Vectorised version took 0.021999 seconds
looped version took 966.653371 seconds
43958x speedup
"""
# data extracted immediately before the original for loops
comm_groups = pd.read_parquet(
"tests/data/comm_groups_austimes_test_data.parquet"
).drop(columns=["commoditygroup"])

# filter data so test runs faster
comm_groups = comm_groups.query("region in ['ACT', 'NSW']")

comm_groups2 = comm_groups.copy()
_count_comm_group_vectorised(comm_groups2)
assert comm_groups2.drop(columns=["commoditygroup"]).equals(comm_groups)
assert comm_groups2.shape == (comm_groups.shape[0], comm_groups.shape[1] + 1)

def test_default_pcg_vectorised(self):
"""Tests the default primary commodity group identification logic runs correctly.
Full austimes run:
Looped version took 1107.66 seconds
Vectorised version took 62.85 seconds
"""

# data extracted immediately before the original for loops
comm_groups = pd.read_parquet("tests/data/austimes_pcg_test_data.parquet")

comm_groups = comm_groups[(comm_groups["region"].isin(["ACT", "NT"]))]
comm_groups2 = _process_comm_groups_vectorised(
comm_groups.copy(), transforms.csets_ordered_for_pcg
)
assert comm_groups2 is not None and not comm_groups2.empty
assert comm_groups2.shape == (comm_groups.shape[0], comm_groups.shape[1] + 1)
assert comm_groups2.drop(columns=["DefaultVedaPCG"]).equals(comm_groups)


if __name__ == "__main__":
TestTransforms().test_default_pcg_vectorised()
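The parquet fixtures above are snapshots of intermediate state, per the docstring comment "data extracted immediately before the original for loops". A hypothetical sketch of how such a fixture could be captured (the stand-in DataFrame and output path are illustrative, not the real extraction code):

```python
# Hypothetical fixture capture: dump the intermediate DataFrame right before
# the loop being vectorised, then commit the parquet file under tests/data/
# (pd.DataFrame.to_parquet uses the pyarrow dependency added by this PR).
import pandas as pd

comm_groups = pd.DataFrame({"region": ["ACT"], "process": ["P1"]})  # stand-in
comm_groups.to_parquet("tests/data/my_fixture.parquet")
```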
3 changes: 2 additions & 1 deletion utils/dd_to_csv.py
@@ -1,4 +1,5 @@
import argparse
import sys
from collections import defaultdict
import json
import os
@@ -229,4 +230,4 @@ def main(arg_list: None | list[str] = None):


if __name__ == "__main__":
main()
main(sys.argv[1:])
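A minimal sketch of the pattern this change completes (the parser arguments are illustrative, not the real dd_to_csv interface): passing an optional argument list through to argparse keeps the script usable both as a CLI and as a plain function call, which is what run_benchmarks.py relies on in debug mode.

```python
# Sketch of the optional-arg-list entry point pattern (illustrative args).
import argparse
import sys


def main(arg_list: None | list[str] = None) -> None:
    parser = argparse.ArgumentParser(description="Convert .dd files to CSV")
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    # parse_args(None) would fall back to sys.argv[1:] anyway; accepting the
    # list explicitly makes direct calls like main(["in", "out"]) possible.
    args = parser.parse_args(arg_list)
    print(f"would convert {args.input_dir} -> {args.output_dir}")


if __name__ == "__main__":
    main(sys.argv[1:])
```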
34 changes: 26 additions & 8 deletions utils/run_benchmarks.py
@@ -159,7 +159,7 @@ def run_benchmark(
sys.exit(5)
else:
# If debug option is set, run as a function call to allow stepping with a debugger.
from utils.dd_to_csv import main
from dd_to_csv import main

main([dd_folder, csv_folder])

@@ -259,7 +259,9 @@ def run_all_benchmarks(
# The rest of this script checks regressions against main
# so skip it if we're already on main
repo = git.Repo(".") # pyright: ignore
origin = repo.remotes.origin
origin = (
repo.remotes.origin if "origin" in repo.remotes else repo.remotes[0]
) # don't assume remote is called 'origin'
origin.fetch("main")
if "main" not in repo.heads:
repo.create_head("main", origin.refs.main).set_tracking_branch(origin.refs.main)
@@ -341,12 +343,28 @@ def run_all_benchmarks(
addi_regressions = df[df["Additional"] > df["M Additional"]]["Benchmark"]
time_regressions = df[df["Time (s)"] > 2 * df["M Time (s)"]]["Benchmark"]

runtime_change = df["Time (s)"].sum() - df["M Time (s)"].sum()
print(f"Change in runtime: {runtime_change:+.2f} s")
correct_change = df["Correct"].sum() - df["M Correct"].sum()
print(f"Change in correct rows: {correct_change:+d}")
additional_change = df["Additional"].sum() - df["M Additional"].sum()
print(f"Change in additional rows: {additional_change:+d}")
our_time = df["Time (s)"].sum()
main_time = df["M Time (s)"].sum()
runtime_change = our_time - main_time

print(f"Total runtime: {our_time:.2f}s (main: {main_time:.2f}s)")
print(
f"Change in runtime (negative == faster): {runtime_change:+.2f}s ({100*runtime_change/main_time:+.1f}%)"
)

our_correct = df["Correct"].sum()
main_correct = df["M Correct"].sum()
correct_change = our_correct - main_correct
print(
f"Change in correct rows (higher == better): {correct_change:+d} ({100*correct_change/main_correct:+.1f}%)"
)

our_additional_rows = df["Additional"].sum()
main_additional_rows = df["M Additional"].sum()
additional_change = our_additional_rows - main_additional_rows
print(
f"Change in additional rows: {additional_change:+d} ({100*additional_change/main_additional_rows:+.1f}%)"
)

if len(accu_regressions) + len(addi_regressions) + len(time_regressions) > 0:
print()
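To make the new summary lines concrete, here is a toy illustration (made-up numbers, not real benchmark output) of the percentage arithmetic: changes are relative to the main-branch totals, and a negative runtime delta means the branch is faster.

```python
# Toy illustration of the benchmark summary arithmetic (made-up numbers).
import pandas as pd

df = pd.DataFrame(
    {
        "Benchmark": ["DemoS_001", "DemoS_002"],
        "Time (s)": [10.0, 5.0],   # this branch
        "M Time (s)": [12.0, 6.0],  # main
    }
)

our_time = df["Time (s)"].sum()
main_time = df["M Time (s)"].sum()
runtime_change = our_time - main_time

print(f"Total runtime: {our_time:.2f}s (main: {main_time:.2f}s)")
print(
    f"Change in runtime (negative == faster): "
    f"{runtime_change:+.2f}s ({100 * runtime_change / main_time:+.1f}%)"
)
# Total runtime: 15.00s (main: 18.00s)
# Change in runtime (negative == faster): -3.00s (-16.7%)
```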
94 changes: 63 additions & 31 deletions xl2times/transforms.py
@@ -11,6 +11,8 @@
import time
from functools import reduce

from tqdm import tqdm

from .utils import max_workers
from . import datatypes
from . import utils
@@ -1055,19 +1057,9 @@ def generate_commodity_groups(

# Commodity groups by process, region and commodity
comm_groups = pd.merge(prc_top, comm_set, on=["region", "commodity"])
comm_groups["commoditygroup"] = 0
# Store the number of IN/OUT commodities of the same type per Region and Process in CommodityGroup
for region in comm_groups["region"].unique():
i_reg = comm_groups["region"] == region
for process in comm_groups[i_reg]["process"].unique():
i_reg_prc = i_reg & (comm_groups["process"] == process)
for cset in comm_groups[i_reg_prc]["csets"].unique():
i_reg_prc_cset = i_reg_prc & (comm_groups["csets"] == cset)
for io in ["IN", "OUT"]:
i_reg_prc_cset_io = i_reg_prc_cset & (comm_groups["io"] == io)
comm_groups.loc[i_reg_prc_cset_io, "commoditygroup"] = sum(
i_reg_prc_cset_io
)

# Add columns for the number of IN/OUT commodities of each type
_count_comm_group_vectorised(comm_groups)

def name_comm_group(df):
"""
@@ -1084,24 +1076,8 @@ def name_comm_group(df):
# Replace commodity group member count with the name
comm_groups["commoditygroup"] = comm_groups.apply(name_comm_group, axis=1)

# Determine default PCG according to Veda
comm_groups["DefaultVedaPCG"] = None
for region in comm_groups["region"].unique():
i_reg = comm_groups["region"] == region
for process in comm_groups[i_reg]["process"]:
i_reg_prc = i_reg & (comm_groups["process"] == process)
default_set = False
for io in ["OUT", "IN"]:
if default_set:
break
i_reg_prc_io = i_reg_prc & (comm_groups["io"] == io)
for cset in csets_ordered_for_pcg:
i_reg_prc_io_cset = i_reg_prc_io & (comm_groups["csets"] == cset)
df = comm_groups[i_reg_prc_io_cset]
if not df.empty:
comm_groups.loc[i_reg_prc_io_cset, "DefaultVedaPCG"] = True
default_set = True
break
# Determine default PCG according to Veda's logic
comm_groups = _process_comm_groups_vectorised(comm_groups, csets_ordered_for_pcg)

# Add standard Veda PCGS named contrary to name_comm_group
if reg_prc_veda_pcg.shape[0]:
@@ -1135,6 +1111,62 @@ def name_comm_group(df):
return tables


def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None:
"""
Store the number of IN/OUT commodities of the same type per Region and Process in CommodityGroup.
`comm_groups` is modified in-place.
Args:
comm_groups: 'Process' DataFrame, to which a "commoditygroup" column is added
"""
comm_groups["commoditygroup"] = 0

comm_groups["commoditygroup"] = (
comm_groups.groupby(["region", "process", "csets", "io"]).transform("count")
)["commoditygroup"]
# set commoditygroup to 0 for io rows that aren't IN or OUT
comm_groups.loc[~comm_groups["io"].isin(["IN", "OUT"]), "commoditygroup"] = 0


def _process_comm_groups_vectorised(
comm_groups: pd.DataFrame, csets_ordered_for_pcg: list[str]
) -> pd.DataFrame:
"""Sets the first commodity group in the list of csets_ordered_for_pcg as the default pcg for each region/process/io combination,
but setting the io="OUT" subset as default before "IN".

See:
Section 3.7.2.2, pg 80. of `TIMES Documentation PART IV` for details.
Args:
comm_groups: 'Process' DataFrame with columns ["region", "process", "io", "csets", "commoditygroup"]
csets_ordered_for_pcg: List of csets in the order they should be considered for default pcg
Returns:
Processed DataFrame with a new column "DefaultVedaPCG" set to True for the default pcg in each region/process combination.
"""

def _set_default_veda_pcg(group):
"""For a given [region, process] group, default group is set as the first cset in the `csets_ordered_for_pcg` list, which is an output, if
one exists, otherwise the first input."""
if not group["csets"].isin(csets_ordered_for_pcg).all():
return group

for io in ["OUT", "IN"]:
for cset in csets_ordered_for_pcg:
group.loc[
(group["io"] == io) & (group["csets"] == cset), "DefaultVedaPCG"
] = True
if group["DefaultVedaPCG"].any():
break
return group

comm_groups["DefaultVedaPCG"] = None
comm_groups_subset = comm_groups.groupby(
["region", "process"], sort=False, as_index=False
).apply(_set_default_veda_pcg)
comm_groups_subset = comm_groups_subset.reset_index(
level=0, drop=True
).sort_index() # back to the original index and row order
return comm_groups_subset


def complete_commodity_groups(
config: datatypes.Config,
tables: Dict[str, DataFrame],
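The heart of this PR is replacing the nested region/process/csets/io loops with pandas groupby operations. A standalone toy demo (made-up data, not the repo's fixtures) of the counting step in `_count_comm_group_vectorised`:

```python
# Toy demo of the groupby/transform("count") trick: each row receives the
# size of its (region, process, csets, io) group in a single vectorised pass.
import pandas as pd

comm_groups = pd.DataFrame(
    {
        "region": ["ACT", "ACT", "ACT", "NSW"],
        "process": ["P1", "P1", "P1", "P1"],
        "csets": ["NRG", "NRG", "MAT", "NRG"],
        "io": ["IN", "IN", "OUT", "FLO"],
    }
)

comm_groups["commoditygroup"] = 0
comm_groups["commoditygroup"] = comm_groups.groupby(
    ["region", "process", "csets", "io"]
).transform("count")["commoditygroup"]
# rows whose io is neither IN nor OUT don't form commodity groups
comm_groups.loc[~comm_groups["io"].isin(["IN", "OUT"]), "commoditygroup"] = 0
print(comm_groups)
#   region process csets   io  commoditygroup
# 0    ACT      P1   NRG   IN               2
# 1    ACT      P1   NRG   IN               2
# 2    ACT      P1   MAT  OUT               1
# 3    NSW      P1   NRG  FLO               0
```

And a simplified sketch of the default-PCG selection in `_process_comm_groups_vectorised` (toy data again; the priority list below is illustrative, not the real `csets_ordered_for_pcg`): groupby/apply visits each (region, process) group once, and the first cset in priority order that appears as an output, or failing that as an input, is flagged.

```python
# Simplified sketch of the per-group default-PCG selection.
import pandas as pd

csets_ordered_for_pcg = ["DEM", "MAT", "NRG"]  # illustrative priority order

df = pd.DataFrame(
    {
        "region": ["ACT", "ACT", "ACT"],
        "process": ["P1", "P1", "P1"],
        "io": ["IN", "OUT", "OUT"],
        "csets": ["DEM", "MAT", "NRG"],
    }
)


def set_default_pcg(group: pd.DataFrame) -> pd.DataFrame:
    # OUT commodities take precedence over IN; within each, follow cset priority
    for io in ["OUT", "IN"]:
        for cset in csets_ordered_for_pcg:
            mask = (group["io"] == io) & (group["csets"] == cset)
            if mask.any():
                group.loc[mask, "DefaultVedaPCG"] = True
                return group
    return group


df["DefaultVedaPCG"] = None
df = (
    df.groupby(["region", "process"], sort=False, as_index=False)
    .apply(set_default_pcg)
    .reset_index(level=0, drop=True)  # back to the original index and row order
    .sort_index()
)
print(df)  # only the (OUT, MAT) row gets DefaultVedaPCG == True
```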