diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7f42d5f..323aefc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -34,6 +34,12 @@ jobs:
          pre-commit install
          pre-commit run --all-files

+      - name: Run unit tests
+        working-directory: xl2times
+        run: |
+          source .venv/bin/activate
+          pytest
+
       # ---------- Prepare ETSAP Demo models

       - uses: actions/checkout@v3
diff --git a/.gitignore b/.gitignore
index 5b52706..2bb5d07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,5 @@ benchmarks/
 .python-version
 docs/_build/
 docs/api/
+.coverage
+/out.txt
diff --git a/README.md b/README.md
index df36800..00416c7 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,6 @@ No regressions. You're awesome!
 ```

 If you have a large increase in runtime, a decrease in correct rows or fewer rows being produced, then you've broken something and will need to figure out how to fix it.
-
 ### Debugging Regressions

 If your change is causing regressions on one of the benchmarks, a useful way to debug and find the difference is to run the tool in verbose mode and compare the intermediate tables. For example, if your branch has regressions on Demo 1:
@@ -137,6 +136,7 @@ python -m build
 python -m twine upload dist/*
 ```

+
 ## Contributing

 This project welcomes contributions and suggestions. Most contributions require you to agree to a
diff --git a/pyproject.toml b/pyproject.toml
index 484e835..ab7f0f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0.0", "wheel"]
 build-backend = "setuptools.build_meta"

 [tool.setuptools]
-packages = ["xl2times", "utils"]
+packages = ["xl2times"]

 [project]
 name = "xl2times"
@@ -14,21 +14,28 @@ requires-python = ">=3.10"
 license = { file = "LICENSE" }
 keywords = []
 classifiers = [
-  "Development Status :: 4 - Beta",
-  "License :: OSI Approved :: MIT License",
-  "Programming Language :: Python",
-  "Programming Language :: Python :: 3",
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
 ]
 dependencies = [
-  "GitPython >= 3.1.31, < 3.2",
-  "more-itertools",
-  "openpyxl >= 3.0, < 3.1",
-  "pandas >= 2.1",
-  "pyarrow"
+    "GitPython >= 3.1.31, < 3.2",
+    "more-itertools",
+    "openpyxl >= 3.0, < 3.1",
+    "pandas >= 2.1",
+    "pyarrow",
+    "tqdm",
 ]

 [project.optional-dependencies]
-dev = ["black", "pre-commit", "tabulate"]
+dev = [
+    "black",
+    "pre-commit",
+    "tabulate",
+    "pytest",
+    "pytest-cov"
+]

 [project.urls]
 Documentation = "https://github.com/etsap-TIMES/xl2times#readme"
@@ -37,3 +44,9 @@ Source = "https://github.com/etsap-TIMES/xl2times"

 [project.scripts]
 xl2times = "xl2times.__main__:main"
+
+[tool.pytest.ini_options]
+# don't print runtime warnings
+filterwarnings = ["ignore::DeprecationWarning", "ignore::UserWarning", "ignore::FutureWarning"]
+# show output, print test coverage report
+addopts = '-s --durations=0 --durations-min=5.0 --tb=native --cov-report term --cov-report html --cov=xl2times --cov=utils'
diff --git a/tests/data/austimes_pcg_test_data.parquet b/tests/data/austimes_pcg_test_data.parquet
new file mode 100644
index 0000000..c3346d9
Binary files /dev/null and b/tests/data/austimes_pcg_test_data.parquet differ
diff --git a/tests/data/comm_groups_austimes_test_data.parquet b/tests/data/comm_groups_austimes_test_data.parquet
new file mode 100644
index 0000000..84cfc51
Binary files /dev/null and b/tests/data/comm_groups_austimes_test_data.parquet differ
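The two `.parquet` files above are binary test fixtures, captured from a full AusTIMES run immediately before the loops this PR vectorises (see the test docstrings below). As a quick way to sanity-check a fixture locally, something like the following sketch works; the `required` column sets are inferred from the tests and function signatures below, not guaranteed by this PR:

```python
# Hypothetical fixture sanity check -- illustration only, not part of this PR.
# Assumes it is run from the repository root, where the tests resolve paths.
import pandas as pd


def check_fixture(path: str, required: set[str]) -> None:
    """Load a parquet fixture and verify the columns the tests rely on."""
    df = pd.read_parquet(path)  # needs pyarrow, already a project dependency
    missing = required - set(df.columns)
    assert not missing, f"{path} is missing columns: {missing}"
    print(f"{path}: {len(df)} rows, columns OK")


check_fixture(
    "tests/data/comm_groups_austimes_test_data.parquet",
    {"region", "process", "csets", "io", "commoditygroup"},
)
check_fixture(
    "tests/data/austimes_pcg_test_data.parquet",
    {"region", "process", "csets", "io"},
)
```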
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
new file mode 100644
index 0000000..af77b6b
--- /dev/null
+++ b/tests/test_transforms.py
@@ -0,0 +1,67 @@
+from datetime import datetime
+
+import pandas as pd
+
+from xl2times import transforms
+from xl2times.transforms import (
+    _process_comm_groups_vectorised,
+    _count_comm_group_vectorised,
+)
+
+pd.set_option(
+    "display.max_rows",
+    20,
+    "display.max_columns",
+    20,
+    "display.width",
+    300,
+    "display.max_colwidth",
+    75,
+    "display.precision",
+    3,
+)
+
+
+class TestTransforms:
+    def test_generate_commodity_groups(self):
+        """
+        Tests that the _count_comm_group_vectorised function works as expected.
+        Full austimes run:
+            Vectorised version took 0.021999 seconds
+            looped version took 966.653371 seconds
+            43958x speedup
+        """
+        # data extracted immediately before the original for loops
+        comm_groups = pd.read_parquet(
+            "tests/data/comm_groups_austimes_test_data.parquet"
+        ).drop(columns=["commoditygroup"])
+
+        # filter data so test runs faster
+        comm_groups = comm_groups.query("region in ['ACT', 'NSW']")
+
+        comm_groups2 = comm_groups.copy()
+        _count_comm_group_vectorised(comm_groups2)
+        assert comm_groups2.drop(columns=["commoditygroup"]).equals(comm_groups)
+        assert comm_groups2.shape == (comm_groups.shape[0], comm_groups.shape[1] + 1)
+
+    def test_default_pcg_vectorised(self):
+        """Tests the default primary commodity group identification logic runs correctly.
+        Full austimes run:
+            Looped version took 1107.66 seconds
+            Vectorised version took 62.85 seconds
+        """
+        # data extracted immediately before the original for loops
+        comm_groups = pd.read_parquet("tests/data/austimes_pcg_test_data.parquet")
+
+        comm_groups = comm_groups[(comm_groups["region"].isin(["ACT", "NT"]))]
+        comm_groups2 = _process_comm_groups_vectorised(
+            comm_groups.copy(), transforms.csets_ordered_for_pcg
+        )
+        assert comm_groups2 is not None and not comm_groups2.empty
+        assert comm_groups2.shape == (comm_groups.shape[0], comm_groups.shape[1] + 1)
+        assert comm_groups2.drop(columns=["DefaultVedaPCG"]).equals(comm_groups)
+
+
+if __name__ == "__main__":
+    TestTransforms().test_default_pcg_vectorised()
diff --git a/utils/dd_to_csv.py b/utils/dd_to_csv.py
index 5de10fd..a9e1132 100644
--- a/utils/dd_to_csv.py
+++ b/utils/dd_to_csv.py
@@ -1,4 +1,5 @@
 import argparse
+import sys
 from collections import defaultdict
 import json
 import os
@@ -229,4 +230,4 @@ def main(arg_list: None | list[str] = None):


 if __name__ == "__main__":
-    main()
+    main(sys.argv[1:])
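The `dd_to_csv.py` change pairs with the `main(arg_list)` signature: the script now passes `sys.argv[1:]` explicitly, while callers such as `run_benchmarks.py` (below) inject an argument list directly, with no subprocess needed. A minimal sketch of that pattern, with hypothetical argument names:

```python
# Sketch of an argv-injectable entry point, mirroring dd_to_csv.main.
# The argument names here are illustrative, not the actual dd_to_csv CLI.
import argparse


def demo_main(arg_list: None | list[str] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="toy DD-to-CSV front end")
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    # parse_args(None) falls back to sys.argv[1:], so `main(sys.argv[1:])`
    # in a script and an in-process call behave identically.
    return parser.parse_args(arg_list)


if __name__ == "__main__":
    # in-process call, as run_benchmarks.py does with main([dd_folder, csv_folder])
    print(demo_main(["dd_folder", "csv_folder"]))
```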
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index 5ae8876..74e0e53 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -159,7 +159,7 @@ def run_benchmark(
             sys.exit(5)
     else:
         # If debug option is set, run as a function call to allow stepping with a debugger.
-        from utils.dd_to_csv import main
+        from dd_to_csv import main

         main([dd_folder, csv_folder])

@@ -259,7 +259,9 @@ def run_all_benchmarks(
     # The rest of this script checks regressions against main
     # so skip it if we're already on main
     repo = git.Repo(".")  # pyright: ignore
-    origin = repo.remotes.origin
+    origin = (
+        repo.remotes.origin if "origin" in repo.remotes else repo.remotes[0]
+    )  # don't assume remote is called 'origin'
     origin.fetch("main")
     if "main" not in repo.heads:
         repo.create_head("main", origin.refs.main).set_tracking_branch(origin.refs.main)
@@ -341,12 +343,28 @@ def run_all_benchmarks(
     addi_regressions = df[df["Additional"] > df["M Additional"]]["Benchmark"]
     time_regressions = df[df["Time (s)"] > 2 * df["M Time (s)"]]["Benchmark"]

-    runtime_change = df["Time (s)"].sum() - df["M Time (s)"].sum()
-    print(f"Change in runtime: {runtime_change:+.2f} s")
-    correct_change = df["Correct"].sum() - df["M Correct"].sum()
-    print(f"Change in correct rows: {correct_change:+d}")
-    additional_change = df["Additional"].sum() - df["M Additional"].sum()
-    print(f"Change in additional rows: {additional_change:+d}")
+    our_time = df["Time (s)"].sum()
+    main_time = df["M Time (s)"].sum()
+    runtime_change = our_time - main_time
+
+    print(f"Total runtime: {our_time:.2f}s (main: {main_time:.2f}s)")
+    print(
+        f"Change in runtime (negative == faster): {runtime_change:+.2f}s ({100*runtime_change/main_time:+.1f}%)"
+    )
+
+    our_correct = df["Correct"].sum()
+    main_correct = df["M Correct"].sum()
+    correct_change = our_correct - main_correct
+    print(
+        f"Change in correct rows (higher == better): {correct_change:+d} ({100*correct_change/main_correct:+.1f}%)"
+    )
+
+    our_additional_rows = df["Additional"].sum()
+    main_additional_rows = df["M Additional"].sum()
+    additional_change = our_additional_rows - main_additional_rows
+    print(
+        f"Change in additional rows: {additional_change:+d} ({100*additional_change/main_additional_rows:+.1f}%)"
+    )

     if len(accu_regressions) + len(addi_regressions) + len(time_regressions) > 0:
         print()
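The remote-selection change means the benchmark script no longer crashes on clones whose default remote isn't named `origin` (common when contributors add their fork under another name). A standalone sketch of the same GitPython lookup; `repo.remotes` is an `IterableList`, which supports `in` checks by remote name, as the patch itself relies on:

```python
# Sketch of the remote fallback, assuming the working directory is a git
# repository with at least one remote configured.
import git

repo = git.Repo(".")
# prefer 'origin' when it exists, otherwise take the first configured remote
origin = repo.remotes.origin if "origin" in repo.remotes else repo.remotes[0]
print(f"using remote '{origin.name}' ({next(origin.urls)})")
origin.fetch("main")  # raises if the chosen remote has no 'main' branch
```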
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index 984701f..eb67b2d 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -11,6 +11,8 @@ import time

 from functools import reduce

+from tqdm import tqdm
+
 from .utils import max_workers
 from . import datatypes
 from . import utils
@@ -1055,19 +1057,9 @@ def generate_commodity_groups(

     # Commodity groups by process, region and commodity
     comm_groups = pd.merge(prc_top, comm_set, on=["region", "commodity"])
-    comm_groups["commoditygroup"] = 0
-    # Store the number of IN/OUT commodities of the same type per Region and Process in CommodityGroup
-    for region in comm_groups["region"].unique():
-        i_reg = comm_groups["region"] == region
-        for process in comm_groups[i_reg]["process"].unique():
-            i_reg_prc = i_reg & (comm_groups["process"] == process)
-            for cset in comm_groups[i_reg_prc]["csets"].unique():
-                i_reg_prc_cset = i_reg_prc & (comm_groups["csets"] == cset)
-                for io in ["IN", "OUT"]:
-                    i_reg_prc_cset_io = i_reg_prc_cset & (comm_groups["io"] == io)
-                    comm_groups.loc[i_reg_prc_cset_io, "commoditygroup"] = sum(
-                        i_reg_prc_cset_io
-                    )
+
+    # Add columns for the number of IN/OUT commodities of each type
+    _count_comm_group_vectorised(comm_groups)

     def name_comm_group(df):
         """
@@ -1084,24 +1076,8 @@ def name_comm_group(df):
     # Replace commodity group member count with the name
     comm_groups["commoditygroup"] = comm_groups.apply(name_comm_group, axis=1)

-    # Determine default PCG according to Veda
-    comm_groups["DefaultVedaPCG"] = None
-    for region in comm_groups["region"].unique():
-        i_reg = comm_groups["region"] == region
-        for process in comm_groups[i_reg]["process"]:
-            i_reg_prc = i_reg & (comm_groups["process"] == process)
-            default_set = False
-            for io in ["OUT", "IN"]:
-                if default_set:
-                    break
-                i_reg_prc_io = i_reg_prc & (comm_groups["io"] == io)
-                for cset in csets_ordered_for_pcg:
-                    i_reg_prc_io_cset = i_reg_prc_io & (comm_groups["csets"] == cset)
-                    df = comm_groups[i_reg_prc_io_cset]
-                    if not df.empty:
-                        comm_groups.loc[i_reg_prc_io_cset, "DefaultVedaPCG"] = True
-                        default_set = True
-                        break
+    # Determine default PCG according to Veda's logic
+    comm_groups = _process_comm_groups_vectorised(comm_groups, csets_ordered_for_pcg)

     # Add standard Veda PCGS named contrary to name_comm_group
     if reg_prc_veda_pcg.shape[0]:
@@ -1135,6 +1111,62 @@ def name_comm_group(df):

     return tables


+def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None:
+    """
+    Store the number of IN/OUT commodities of the same type per Region and Process in CommodityGroup.
+    `comm_groups` is modified in-place.
+    Args:
+        comm_groups: 'Process' DataFrame, to which a "commoditygroup" count column is added
+    """
+    comm_groups["commoditygroup"] = 0
+
+    comm_groups["commoditygroup"] = (
+        comm_groups.groupby(["region", "process", "csets", "io"]).transform("count")
+    )["commoditygroup"]
+    # set commoditygroup to 0 for io rows that aren't IN or OUT
+    comm_groups.loc[~comm_groups["io"].isin(["IN", "OUT"]), "commoditygroup"] = 0
+
+
+def _process_comm_groups_vectorised(
+    comm_groups: pd.DataFrame, csets_ordered_for_pcg: list[str]
+) -> pd.DataFrame:
+    """Sets the first commodity group in the `csets_ordered_for_pcg` list as the default pcg for each region/process/io combination,
+    considering the io="OUT" subset before "IN".
+
+    See:
+        Section 3.7.2.2, pg 80 of `TIMES Documentation PART IV` for details.
+    Args:
+        comm_groups: 'Process' DataFrame with columns ["region", "process", "io", "csets", "commoditygroup"]
+        csets_ordered_for_pcg: List of csets in the order they should be considered for the default pcg
+    Returns:
+        Processed DataFrame with a new column "DefaultVedaPCG", set to True for the default pcg in each region/process/io combination.
+ """ + + def _set_default_veda_pcg(group): + """For a given [region, process] group, default group is set as the first cset in the `csets_ordered_for_pcg` list, which is an output, if + one exists, otherwise the first input.""" + if not group["csets"].isin(csets_ordered_for_pcg).all(): + return group + + for io in ["OUT", "IN"]: + for cset in csets_ordered_for_pcg: + group.loc[ + (group["io"] == io) & (group["csets"] == cset), "DefaultVedaPCG" + ] = True + if group["DefaultVedaPCG"].any(): + break + return group + + comm_groups["DefaultVedaPCG"] = None + comm_groups_subset = comm_groups.groupby( + ["region", "process"], sort=False, as_index=False + ).apply(_set_default_veda_pcg) + comm_groups_subset = comm_groups_subset.reset_index( + level=0, drop=True + ).sort_index() # back to the original index and row order + return comm_groups_subset + + def complete_commodity_groups( config: datatypes.Config, tables: Dict[str, DataFrame],