From ef3018400399c9ae472dadc7a083f7cf33fa5282 Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 16 Feb 2024 11:50:41 +1100 Subject: [PATCH 01/21] Remove rows with duplicate query cols and cleanup handling of TFM variants (#173) --- .gitignore | 1 + README.md | 3 +- benchmarks.yml | 39 +++++++++++++++++++++++++ utils/run_benchmarks.py | 65 ++++++++++++++++++++++++++++++++++------- xl2times/transforms.py | 22 ++++++++++++++ 5 files changed, 119 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 2bb5d07..848dbcf 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ docs/_build/ docs/api/ .coverage /out.txt +*.log diff --git a/README.md b/README.md index 00416c7..ec24356 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,8 @@ Then to run the benchmarks: # Run a only a single benchmark by name (see benchmarks.yml for name list) python utils/run_benchmarks.py benchmarks.yml --verbose --run DemoS_001-all | tee out.txt -# Run all benchmarks (without GAMS run, just comparing CSV data) +# Run all benchmarks (without GAMS run, just comparing CSV data for regressions) +# Note: if you have multiple remotes, set etsap-TIMES/xl2times as the first one, as it is used for speed/correctness comparisons. python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt diff --git a/benchmarks.yml b/benchmarks.yml index 3f3ed6b..236a5f3 100644 --- a/benchmarks.yml +++ b/benchmarks.yml @@ -279,3 +279,42 @@ benchmarks: - "SuppXLS/Scen_B_TRA_EV_Parity.xlsx" - "SuppXLS/Scen_B_TRA_F_ModalShares.xlsx" dd_folder: Ireland + +# - name: AusTIMES +# input_folder: ../../../austimes-lfs +# regions: "ACT" +# inputs: +# - "VT_AUS_COM.xlsx" +# - "VT_AUS_ELC.xlsx" +# - "VT_AUS_IND.xlsm" +# - "VT_AUS_RES.xlsx" +# - "VT_AUS_TRA.xlsx" +# - "SysSettings.xlsx" +# - "SuppXLS/Scen_Par-austimes_CCA.xlsx" +# - "SuppXLS/Scen_nsv-austimes_csiro_1.xlsx" +# - "SuppXLS/Scen_nsv-austimes-cwc_1.xlsx" +# - "SuppXLS/Scen_Base-Electricity.xlsx" +# - "SuppXLS/Scen_Base-Industry.xlsx" +# - "SuppXLS/Scen_Base-Transport.xlsx" +# - "SuppXLS/Scen_CoalPlants_LE-control_disable.xlsx" +# - "SuppXLS/Scen_Hydrogen.xlsx" +# - "SuppXLS/Scen_revMinCoalfactors.xlsx" +# - "SuppXLS/Scen_revMinGPGfactors.xlsx" +# - "SuppXLS/Scen_Transport_Base_Liq-LevPlayFld_v16.xlsx" +# - "SuppXLS/Scen_Transport_SteadyProgress.xlsx" +# - "SuppXLS/Scen_TransportPolicies.xlsx" +# - "SuppXLS/Scen_xComSolarWeighting-TG17-base.xlsx" +# - "SuppXLS/Scen_xComSolarWeighting-TG17-xScale.xlsx" +# - "SuppXLS/Scen_xICEcostAdj.xlsx" +# - "SuppXLS/Scen_zScalingCorrection.xlsx" +# - "SuppXLS/Scen_zzdisable2options.xlsx" +# - "SubRES_Tmpl/SubRES_CoalGasDomExp.xlsx" +# - "SubRES_Tmpl/SubRES_CoalPlants-Ret-and_lifeExt.xlsx" +# - "SubRES_Tmpl/SubRES_ElecFossil.xlsx" +# - "SubRES_Tmpl/SubRES_ETI_Techs.xlsx" +# - "SubRES_Tmpl/SubRES_Frontier-Levers.xlsx" +# - "SubRES_Tmpl/SubRES_Hydrogen_production.xlsx" +# - "SubRES_Tmpl/SubRES_PumpedStorage.xlsx" +# - "SubRES_Tmpl/SubRES_WindSolarWave.xlsx" +# - "SuppXLS/Trades/ScenTrade_TradeParm.xlsx" +# dd_folder: austimes diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index 74e0e53..67b1aaf 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -1,22 +1,67 @@ import argparse +import logging import os +import shutil +import subprocess +import sys +import time from collections import namedtuple from concurrent.futures import ProcessPoolExecutor from functools import partial -import git +from logging.handlers import RotatingFileHandler +from logging import StreamHandler from os import path, symlink 
-import pandas as pd from re import match -import shutil -import subprocess -import sys -from tabulate import tabulate -import time from typing import Any, Tuple + +import git +import pandas as pd import yaml +from tabulate import tabulate from xl2times.utils import max_workers +# configure logger +# logging.basicConfig( +# level=logging.DEBUG, +# format="%(asctime)s - %(name)s - %(levelname)s - %(message)s - %(filename)s:%(lineno)d", +# handlers=[StreamHandler(), RotatingFileHandler("xl2times.log", maxBytes=1000000, backupCount=5)], +# force=True, +# datefmt="%Y-%m-%d %H:%M:%S", +# ) +# logger = logging.getLogger("xl2times") +# logger.info("Logger!") + +from loguru import logger + +# set global log level via env var. Set to INFO if not already set. +if os.getenv("LOGURU_LEVEL") is None: + os.environ["LOGURU_LEVEL"] = "INFO" + +log_conf = { + "handlers": [ + { + "sink": sys.stdout, + "diagnose": False, + "format": "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} : {message} ({name}:{" + 'thread.name}:pid-{process} "{' + 'file.path}:{line}")', + }, + { + "sink": "./xl2times.log", + "enqueue": True, + "mode": "a+", + "level": "DEBUG", + "colorize": False, + "serialize": False, + "diagnose": False, + "rotation": "20 MB", + "compression": "zip", + }, + ], +} +logger.configure(**log_conf) + def parse_result(lastline): m = match( @@ -349,21 +394,21 @@ def run_all_benchmarks( print(f"Total runtime: {our_time:.2f}s (main: {main_time:.2f}s)") print( - f"Change in runtime (negative == faster): {runtime_change:+.2f}s ({100*runtime_change/main_time:+.1f}%)" + f"Change in runtime (negative == faster): {runtime_change:+.2f}s ({100 * runtime_change / main_time:+.1f}%)" ) our_correct = df["Correct"].sum() main_correct = df["M Correct"].sum() correct_change = our_correct - main_correct print( - f"Change in correct rows (higher == better): {correct_change:+d} ({100*correct_change/main_correct:+.1f}%)" + f"Change in correct rows (higher == better): {correct_change:+d} ({100 * correct_change / main_correct:+.1f}%)" ) our_additional_rows = df["Additional"].sum() main_additional_rows = df["M Additional"].sum() additional_change = our_additional_rows - main_additional_rows print( - f"Change in additional rows: {additional_change:+d} ({100*additional_change/main_additional_rows:+.1f}%)" + f"Change in additional rows: {additional_change:+d} ({100 * additional_change / main_additional_rows:+.1f}%)" ) if len(accu_regressions) + len(addi_regressions) + len(time_regressions) > 0: diff --git a/xl2times/transforms.py b/xl2times/transforms.py index eb67b2d..9a4d50d 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1,3 +1,4 @@ +import collections from collections import defaultdict from pandas.core.frame import DataFrame from pathlib import Path @@ -13,6 +14,12 @@ from tqdm import tqdm +import logging +import logging.config + +logger = logging.getLogger(__name__) + + from .utils import max_workers from . import datatypes from . import utils @@ -373,6 +380,21 @@ def process_flexible_import_table( attribute = "attribute" if table.tag != datatypes.Tag.tfm_upd: + + # Check for duplicate DF columns + duplicated_cols = [ + item + for item, count in collections.Counter(data_columns).items() + if count > 1 + ] + if len(duplicated_cols) > 0: + logger.warning( + f"Duplicate data columns in table: {duplicated_cols}. Dropping first duplicated column. 
Table: \n{repr(table)}" + ) + # drop duplicate Df columns + df = df.loc[:, ~df.columns.duplicated(keep="last")] + data_columns = pd.unique(data_columns).tolist() + df, attribute_suffix = utils.explode(df, data_columns) # Append the data column name to the Attribute column values From b892397eeb024187a2aca50fd0f6f0da96fdf4c9 Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 16 Feb 2024 12:56:28 +1100 Subject: [PATCH 02/21] made output parsing regex more robust --- utils/run_benchmarks.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index 67b1aaf..16599a4 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -1,6 +1,7 @@ import argparse import logging import os +import re import shutil import subprocess import sys @@ -63,17 +64,19 @@ logger.configure(**log_conf) -def parse_result(lastline): - m = match( - r"(\d+\.\d)\% of ground truth rows present in output \((\d+)/(\d+)\)" - r", (\d+) additional rows", - lastline, +def parse_result(output: str) -> Tuple[float, int, int]: + # find pattern in multiline string + m = re.findall( + r"(\d+\.\d)\% of ground truth rows present in output \((\d+)/(\d+)\), (\d+) additional rows", + output, + flags=re.MULTILINE, ) - if not m: - print(f"ERROR: could not parse output of run:\n{lastline}") + if not m or len(m) == 0: + print(f"ERROR: could not parse output of run:\n{output}") sys.exit(2) # return (accuracy, num_correct_rows, num_additional_rows) - return (float(m.groups()[0]), int(m.groups()[1]), int(m.groups()[3])) + m = m[0] + return (float(m.groups()[0]), int(m.groups()[1]), int(m.groups()[2])) def run_gams_gdxdiff( @@ -260,7 +263,7 @@ def run_benchmark( with open(path.join(out_folder, "stdout"), "w") as f: f.write(res.stdout) - (accuracy, num_correct, num_additional) = parse_result(res.stdout.splitlines()[-1]) + (accuracy, num_correct, num_additional) = parse_result(res.stdout) if run_gams: dd_res = run_gams_gdxdiff(benchmark, times_folder, dd_folder, out_folder) From 0eb5bddd6da801c33372c42e08b7a36801c65833 Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 16 Feb 2024 12:57:45 +1100 Subject: [PATCH 03/21] fixed parse_result --- utils/run_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index 16599a4..cd38bc3 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -76,7 +76,7 @@ def parse_result(output: str) -> Tuple[float, int, int]: sys.exit(2) # return (accuracy, num_correct_rows, num_additional_rows) m = m[0] - return (float(m.groups()[0]), int(m.groups()[1]), int(m.groups()[2])) + return (float(m[0]), int(m[1]), int(m[2])) def run_gams_gdxdiff( From 158ce3397f6554df1ebfe527cea3bb4b1d26b52b Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 16 Feb 2024 13:37:12 +1100 Subject: [PATCH 04/21] Add loguru, poe and poe shortcuts --- pyproject.toml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ab7f0f3..a729d32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "pandas >= 2.1", "pyarrow", "tqdm", + "loguru" ] [project.optional-dependencies] @@ -34,7 +35,8 @@ dev = [ "pre-commit", "tabulate", "pytest", - "pytest-cov" + "pytest-cov", + "poethepoet" ] [project.urls] @@ -50,3 +52,12 @@ xl2times = "xl2times.__main__:main" filterwarnings = ["ignore::DeprecationWarning", "ignore::UserWarning", "ignore::FutureWarning"] # show output, print test coverage 
report addopts = '-s --durations=0 --durations-min=5.0 --tb=native --cov-report term --cov-report html --cov=xl2times --cov=utils' + +[tool.poe.tasks] +# Automation of common dev tasks etc. +# Run with: `poe `, e,g. `poe lint` or `poe benchmark Ireland`. +# See https://github.com/nat-n/poethepoet for details. +benchmark = { cmd = "python utils/run_benchmarks.py benchmarks.yml --verbose --run", help = "Run a single benchmark. Usage: poe benchmark " } +benchmark_all = { shell = "python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt", help = "Run the project", interpreter = "posix" } +lint = { cmd = "poetry run git add .pre-commit-config.yaml & pre-commit run", help = "Run pre-commit hooks" } +test = { cmd = "poetry run pytest", help = "Run unit tests with pytest" } From 783cb88a44dfcfc8b75c771a24ec7b6fa9b6b62d Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 16 Feb 2024 13:59:11 +1100 Subject: [PATCH 05/21] support merging tables (as VEDA appears to) where come columns are optional, e.g. ctslvl and ctype from ~FI_COMM --- xl2times/transforms.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 9a4d50d..5f9f502 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -267,17 +267,23 @@ def merge_tables( for key, value in groupby(sorted(tables, key=lambda t: t.tag), lambda t: t.tag): group = list(value) - if not all( - set(t.dataframe.columns) == set(group[0].dataframe.columns) for t in group - ): - cols = [(",".join(g.dataframe.columns), g) for g in group] - print( - f"WARNING: Cannot merge tables with tag {key} as their columns are not identical" - ) - for c, table in cols: - print(f" {c} from {table.range}, {table.sheetname}, {table.filename}") - else: - df = pd.concat([table.dataframe for table in group], ignore_index=True) + + if len(group) == 0: + continue + + df = pd.concat([table.dataframe for table in group], ignore_index=True) + result[key] = df + + # VEDA appears to support merging tables where come columns are optional, e.g. ctslvl and ctype from ~FI_COMM. + # So just print detailed warning if we find tables with fewer columns than the concat'ed table. + concat_cols = set(df.columns) + missing_cols = [concat_cols - set(t.dataframe.columns) for t in group] + + if any([len(m) for m in missing_cols]): + err = f"WARNING: Possible merge error for table: '{key}'! 
Merged table has more columns than individual table(s), see details below:" + for table in group: + err += f"\n\tColumns: {list(table.dataframe.columns)} from {table.range}, {table.sheetname}, {table.filename}" + logger.warning(err) match key: case datatypes.Tag.fi_comm: @@ -662,7 +668,6 @@ def generate_uc_properties( model.user_constraints = user_constraints.rename( columns={"uc_n": "name", "uc_desc": "description"} ) - return tables From df82670320a9da35dfc933eef33531435500a133 Mon Sep 17 00:00:00 2001 From: Sam West Date: Mon, 19 Feb 2024 10:54:13 +1100 Subject: [PATCH 06/21] Fixed indentation --- xl2times/transforms.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 5f9f502..3b4b4a4 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -285,13 +285,14 @@ def merge_tables( err += f"\n\tColumns: {list(table.dataframe.columns)} from {table.range}, {table.sheetname}, {table.filename}" logger.warning(err) - match key: - case datatypes.Tag.fi_comm: - model.commodities = df - case datatypes.Tag.fi_process: - model.processes = df - case _: - result[key] = df + match key: + case datatypes.Tag.fi_comm: + model.commodities = df + case datatypes.Tag.fi_process: + model.processes = df + case _: + result[key] = df + return result From 6cfe42ecb8f5c06c061fb806bd540936c9f4ec6c Mon Sep 17 00:00:00 2001 From: Sam West Date: Tue, 20 Feb 2024 08:51:47 +1100 Subject: [PATCH 07/21] WIP prototype of more efficient uc_wildcards transform --- benchmarks.yml | 76 ++++++++++---------- pyproject.toml | 4 +- tests/test_transforms.py | 152 ++++++++++++++++++++++++++++++++++++++- xl2times/transforms.py | 42 ++++++++--- xl2times/utils.py | 3 + 5 files changed, 227 insertions(+), 50 deletions(-) diff --git a/benchmarks.yml b/benchmarks.yml index 236a5f3..686472d 100644 --- a/benchmarks.yml +++ b/benchmarks.yml @@ -280,41 +280,41 @@ benchmarks: - "SuppXLS/Scen_B_TRA_F_ModalShares.xlsx" dd_folder: Ireland -# - name: AusTIMES -# input_folder: ../../../austimes-lfs -# regions: "ACT" -# inputs: -# - "VT_AUS_COM.xlsx" -# - "VT_AUS_ELC.xlsx" -# - "VT_AUS_IND.xlsm" -# - "VT_AUS_RES.xlsx" -# - "VT_AUS_TRA.xlsx" -# - "SysSettings.xlsx" -# - "SuppXLS/Scen_Par-austimes_CCA.xlsx" -# - "SuppXLS/Scen_nsv-austimes_csiro_1.xlsx" -# - "SuppXLS/Scen_nsv-austimes-cwc_1.xlsx" -# - "SuppXLS/Scen_Base-Electricity.xlsx" -# - "SuppXLS/Scen_Base-Industry.xlsx" -# - "SuppXLS/Scen_Base-Transport.xlsx" -# - "SuppXLS/Scen_CoalPlants_LE-control_disable.xlsx" -# - "SuppXLS/Scen_Hydrogen.xlsx" -# - "SuppXLS/Scen_revMinCoalfactors.xlsx" -# - "SuppXLS/Scen_revMinGPGfactors.xlsx" -# - "SuppXLS/Scen_Transport_Base_Liq-LevPlayFld_v16.xlsx" -# - "SuppXLS/Scen_Transport_SteadyProgress.xlsx" -# - "SuppXLS/Scen_TransportPolicies.xlsx" -# - "SuppXLS/Scen_xComSolarWeighting-TG17-base.xlsx" -# - "SuppXLS/Scen_xComSolarWeighting-TG17-xScale.xlsx" -# - "SuppXLS/Scen_xICEcostAdj.xlsx" -# - "SuppXLS/Scen_zScalingCorrection.xlsx" -# - "SuppXLS/Scen_zzdisable2options.xlsx" -# - "SubRES_Tmpl/SubRES_CoalGasDomExp.xlsx" -# - "SubRES_Tmpl/SubRES_CoalPlants-Ret-and_lifeExt.xlsx" -# - "SubRES_Tmpl/SubRES_ElecFossil.xlsx" -# - "SubRES_Tmpl/SubRES_ETI_Techs.xlsx" -# - "SubRES_Tmpl/SubRES_Frontier-Levers.xlsx" -# - "SubRES_Tmpl/SubRES_Hydrogen_production.xlsx" -# - "SubRES_Tmpl/SubRES_PumpedStorage.xlsx" -# - "SubRES_Tmpl/SubRES_WindSolarWave.xlsx" -# - "SuppXLS/Trades/ScenTrade_TradeParm.xlsx" -# dd_folder: austimes + - name: AusTIMES + input_folder: ../../../austimes-lfs + 
regions: "ACT" + inputs: + - "VT_AUS_COM.xlsx" + - "VT_AUS_ELC.xlsx" + - "VT_AUS_IND.xlsm" + - "VT_AUS_RES.xlsx" + - "VT_AUS_TRA.xlsx" + - "SysSettings.xlsx" + - "SuppXLS/Scen_Par-austimes_CCA.xlsx" + - "SuppXLS/Scen_nsv-austimes_csiro_1.xlsx" + - "SuppXLS/Scen_nsv-austimes-cwc_1.xlsx" + - "SuppXLS/Scen_Base-Electricity.xlsx" + - "SuppXLS/Scen_Base-Industry.xlsx" + - "SuppXLS/Scen_Base-Transport.xlsx" + - "SuppXLS/Scen_CoalPlants_LE-control_disable.xlsx" + - "SuppXLS/Scen_Hydrogen.xlsx" + - "SuppXLS/Scen_revMinCoalfactors.xlsx" + - "SuppXLS/Scen_revMinGPGfactors.xlsx" + - "SuppXLS/Scen_Transport_Base_Liq-LevPlayFld_v16.xlsx" + - "SuppXLS/Scen_Transport_SteadyProgress.xlsx" + - "SuppXLS/Scen_TransportPolicies.xlsx" + - "SuppXLS/Scen_xComSolarWeighting-TG17-base.xlsx" + - "SuppXLS/Scen_xComSolarWeighting-TG17-xScale.xlsx" + - "SuppXLS/Scen_xICEcostAdj.xlsx" + - "SuppXLS/Scen_zScalingCorrection.xlsx" + - "SuppXLS/Scen_zzdisable2options.xlsx" + - "SubRES_Tmpl/SubRES_CoalGasDomExp.xlsx" + - "SubRES_Tmpl/SubRES_CoalPlants-Ret-and_lifeExt.xlsx" + - "SubRES_Tmpl/SubRES_ElecFossil.xlsx" + - "SubRES_Tmpl/SubRES_ETI_Techs.xlsx" + - "SubRES_Tmpl/SubRES_Frontier-Levers.xlsx" + - "SubRES_Tmpl/SubRES_Hydrogen_production.xlsx" + - "SubRES_Tmpl/SubRES_PumpedStorage.xlsx" + - "SubRES_Tmpl/SubRES_WindSolarWave.xlsx" + - "SuppXLS/Trades/ScenTrade_TradeParm.xlsx" + dd_folder: austimes diff --git a/pyproject.toml b/pyproject.toml index a729d32..b3e612b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ addopts = '-s --durations=0 --durations-min=5.0 --tb=native --cov-report term -- # See https://github.com/nat-n/poethepoet for details. benchmark = { cmd = "python utils/run_benchmarks.py benchmarks.yml --verbose --run", help = "Run a single benchmark. Usage: poe benchmark " } benchmark_all = { shell = "python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt", help = "Run the project", interpreter = "posix" } -lint = { cmd = "poetry run git add .pre-commit-config.yaml & pre-commit run", help = "Run pre-commit hooks" } -test = { cmd = "poetry run pytest", help = "Run unit tests with pytest" } +lint = { shell = "git add .pre-commit-config.yaml & pre-commit run", help = "Run pre-commit hooks", interpreter = "posix" } +test = { cmd = "pytest", help = "Run unit tests with pytest" } diff --git a/tests/test_transforms.py b/tests/test_transforms.py index af77b6b..011a81e 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1,12 +1,21 @@ +import re from datetime import datetime +from re import Pattern import pandas as pd +from pandas.core.common import flatten -from xl2times import transforms +from xl2times import transforms, datatypes from xl2times.transforms import ( _process_comm_groups_vectorised, _count_comm_group_vectorised, + get_matching_commodities, + intersect, + filter_by_pattern, + expand_rows, + query_columns, ) +from xl2times.utils import create_regexp, create_negative_regexp pd.set_option( "display.max_rows", @@ -22,7 +31,145 @@ ) +def get_matching_processes(row, dictionary): + matching_processes = None + for col, key in [ + ("pset_pn", "processes_by_name"), + ("pset_pd", "processes_by_desc"), + ("pset_set", "processes_by_sets"), + ("pset_ci", "processes_by_comm_in"), + ("pset_co", "processes_by_comm_out"), + ]: + if row[col] is not None: + matching_processes = intersect( + matching_processes, + filter_by_pattern( + dictionary[key], row[col].upper() + ), # 20% of runtime here. Avoid regex if no wildcard chars in string? 
+ ) + if matching_processes is not None and any(matching_processes.duplicated()): + raise ValueError("duplicated") + return matching_processes + + class TestTransforms: + def test_uc_wildcards(self): + import pickle + + dfo = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") + with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "rb") as f: + dictionary = pickle.load(f) + + df = dfo.query("region in ['ACT']") + + # df = dfo.query("region in ['NSW']") + # row count per region + + def make_str(df): + if df is not None and len(df) != 0: + list_from_df = df.iloc[:, 0].unique() # 60% of runtime here + return ",".join(list_from_df) + else: + return None + + wildcard_map = { + "pset_pn": "processes_by_name", + "pset_pd": "processes_by_desc", + "pset_set": "processes_by_sets", + "pset_ci": "processes_by_comm_in", + "pset_co": "processes_by_comm_out", + } + + commodity_map = { + "cset_cn": "commodities_by_name", + "cset_cd": "commodities_by_desc", + "cset_set": "commodities_by_sets", + } + + t0 = datetime.now() + + # This apply() just gets matches the wildcards of process names in the tables against the list of all process names + # Then because there can be multiple matches per table row, `expand_rows` melts the result into long format. + # We can probably do this a lot faster by building a list of all wildcard matches first (avoiding duplicate lookups) as a dataframe + # and then doing an outer-join with the original dataframe. + + match_dfs = [] + for pname_short, pname_long in wildcard_map.items(): + wildcards = df[pname_short].dropna().unique() + processes = dictionary[pname_long] + matches = { + w: filter_by_pattern(processes, wildcards[0].upper())[ + "process" + ].to_list() + for w in wildcards + } + + proc_match_df = pd.DataFrame( + matches.items(), columns=["wildcard", "matches"] + ) + proc_match_df["pname_short"] = pname_short + match_dfs.append(proc_match_df) + wildcard_matches = pd.concat(match_dfs).explode("matches") + + # now cross-join wildcard_matches with df + df2 = df.copy() + df2["process"] = None + for pname_short in wildcard_map.keys(): + if pname_short in df2.columns and any( + pname_short == wildcard_matches["pname_short"] + ): + wild = wildcard_matches[wildcard_matches["pname_short"] == pname_short] + df2 = df2.merge( + wild, left_on=pname_short, right_on="wildcard", how="left" + ).drop(columns=["wildcard", "pname_short"]) + # update process column with matches for pname_short rows + df2["process"].update(df2["matches"]) + df2 = df2.drop(columns=["matches"]) + # df2 = df2.drop(columns=wildcard_map.keys()) + # df2 = df2.drop(columns=commodity_map.keys()) + + df["process"] = df.apply( + lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 + ) + + t1 = datetime.now() + print(f"get_matching_processes took {t1 - t0} seconds") + + # df["commodity"] = df.apply( + # lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 + # ) + # t2 = datetime.now() + # print(f"get_matching_commodities took {t2 - t1} seconds") + + cols_to_drop = [col for col in df.columns if col in query_columns] + + dfe = expand_rows( + datatypes.EmbeddedXlTable( + tag="", + uc_sets={}, + sheetname="", + range="", + filename="", + dataframe=df, # .drop(columns=cols_to_drop), + ) + ).dataframe + assert len(set(dfe.columns).symmetric_difference(set(df2.columns))) == 0 + + assert all( + dfe.query( + "`pset_ci`=='ELCHYD,ELCSOL,ELCWIN,Wind01,Solar01,Hydro01'" + ).process + == df2.query( + "`pset_ci`=='ELCHYD,ELCSOL,ELCWIN,Wind01,Solar01,Hydro01'" + 
).process + ) + # set column order the same + df2 = df2[dfe.columns] + + assert (dfe.reset_index(drop=True) == df2.reset_index(drop=True)).all().all() + + print(dfe) + def test_generate_commodity_groups(self): """ Tests that the _count_comm_group_vectorised function works as expected. @@ -64,4 +211,5 @@ def test_default_pcg_vectorised(self): if __name__ == "__main__": - TestTransforms().test_default_pcg_vectorised() + # TestTransforms().test_default_pcg_vectorised() + TestTransforms().test_uc_wildcards() diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 3b4b4a4..804674d 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1,4 +1,5 @@ import collections +import functools from collections import defaultdict from pandas.core.frame import DataFrame from pathlib import Path @@ -17,13 +18,14 @@ import logging import logging.config -logger = logging.getLogger(__name__) - from .utils import max_workers from . import datatypes from . import utils +logger = logging.getLogger(__name__) + + query_columns = { "pset_set", "pset_pn", @@ -547,7 +549,8 @@ def process_user_constraint_table( # TODO: apply table.uc_sets # Fill in UC_N blank cells with value from above - df["uc_n"] = df["uc_n"].ffill() + if "uc_n" in df.columns: + df["uc_n"] = df["uc_n"].ffill() data_columns = [ x for x in df.columns if x not in config.known_columns[datatypes.Tag.uc_t] @@ -2059,10 +2062,17 @@ def make_str(df): else: return None - if tag in tables: + if tag in tqdm(tables, desc=f"Processing uc_wildcards on tables"): start_time = time.time() df = tables[tag] dictionary = generate_topology_dictionary(tables, model) + # set year column dtype to str + # df["year"] = df["year"].astype(str) + df.to_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") + import pickle + + with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "wb") as f: + pickle.dump(dictionary, f) df["process"] = df.apply( lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 @@ -2153,7 +2163,11 @@ def eval_and_update( # TFM_UPD: expand wildcards in each row, query FI_T to find matching rows, # evaluate the update formula, and add new rows to FI_T # TODO perf: collect all updates and go through FI_T only once? - for _, row in updates.iterrows(): + for _, row in tqdm( + updates.iterrows(), + total=len(updates), + desc=f"Processing wildcard for {datatypes.Tag.tfm_upd}", + ): if row["value"] is None: # TODO is this really needed? continue match = match_wildcards(row) @@ -2176,7 +2190,11 @@ def eval_and_update( new_tables = [] # TFM_INS: expand each row by wildcards, then add to FI_T - for _, row in updates.iterrows(): + for _, row in tqdm( + updates.iterrows(), + total=len(updates), + desc=f"Processing wildcard for {datatypes.Tag.tfm_ins}", + ): match = match_wildcards(row) # TODO perf: add matched procs/comms into column and use explode? 
new_rows = pd.DataFrame([row.filter(table.columns)]) @@ -2196,7 +2214,11 @@ def eval_and_update( # TFM_INS-TXT: expand row by wildcards, query FI_PROC/COMM for matching rows, # evaluate the update formula, and inplace update the rows - for _, row in updates.iterrows(): + for _, row in tqdm( + updates.iterrows(), + total=len(updates), + desc=f"Processing wildcard for {datatypes.Tag.tfm_ins_txt}", + ): match = match_wildcards(row) if match is None: print(f"WARNING: TFM_INS-TXT row matched neither commodity nor process") @@ -2221,7 +2243,11 @@ def eval_and_update( table = tables[datatypes.Tag.fi_t] new_tables = [] - for _, row in updates.iterrows(): + for _, row in tqdm( + updates.iterrows(), + total=len(updates), + desc=f"Processing wildcard for {datatypes.Tag.tfm_mig}", + ): match = match_wildcards(row) processes, commodities = match if match is not None else (None, None) # TODO should we also query on limtype? diff --git a/xl2times/utils.py b/xl2times/utils.py index bec4cb9..c042139 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -1,3 +1,4 @@ +import functools import os import re from dataclasses import replace @@ -185,6 +186,7 @@ def remove_positive_patterns(pattern): return ",".join([word[1:] for word in pattern.split(",") if word[0] == "-"]) +@functools.cache def create_regexp(pattern): # exclude negative patterns if has_negative_patterns(pattern): @@ -198,6 +200,7 @@ def create_regexp(pattern): return re.compile(pattern) +@functools.cache def create_negative_regexp(pattern): pattern = remove_positive_patterns(pattern) if len(pattern) == 0: From ed82d3dd19d6780c03d2345586cf6ad923b3abfd Mon Sep 17 00:00:00 2001 From: Sam West Date: Tue, 20 Feb 2024 16:05:16 +1100 Subject: [PATCH 08/21] Working prototype, ~10-20x speedup --- pyproject.toml | 4 +- tests/test_transforms.py | 201 ++++++++++++++------------------------- xl2times/transforms.py | 110 +++++++++++++-------- xl2times/utils.py | 3 + 4 files changed, 151 insertions(+), 167 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b3e612b..7dd7580 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ xl2times = "xl2times.__main__:main" # don't print runtime warnings filterwarnings = ["ignore::DeprecationWarning", "ignore::UserWarning", "ignore::FutureWarning"] # show output, print test coverage report -addopts = '-s --durations=0 --durations-min=5.0 --tb=native --cov-report term --cov-report html --cov=xl2times --cov=utils' +addopts = '-s --durations=0 --durations-min=5.0 --tb=native' [tool.poe.tasks] # Automation of common dev tasks etc. @@ -60,4 +60,4 @@ addopts = '-s --durations=0 --durations-min=5.0 --tb=native --cov-report term -- benchmark = { cmd = "python utils/run_benchmarks.py benchmarks.yml --verbose --run", help = "Run a single benchmark. 
Usage: poe benchmark " } benchmark_all = { shell = "python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt", help = "Run the project", interpreter = "posix" } lint = { shell = "git add .pre-commit-config.yaml & pre-commit run", help = "Run pre-commit hooks", interpreter = "posix" } -test = { cmd = "pytest", help = "Run unit tests with pytest" } +test = { cmd = "pytest --cov-report term --cov-report html --cov=xl2times --cov=utils", help = "Run unit tests with pytest" } diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 011a81e..2cfb0f3 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1,21 +1,22 @@ -import re from datetime import datetime -from re import Pattern +from typing import Callable import pandas as pd -from pandas.core.common import flatten -from xl2times import transforms, datatypes +from xl2times import transforms, utils, datatypes from xl2times.transforms import ( _process_comm_groups_vectorised, _count_comm_group_vectorised, - get_matching_commodities, intersect, - filter_by_pattern, expand_rows, + get_matching_commodities, + filter_by_pattern, + get_matching_processes, query_columns, + _match_uc_wildcards, + process_map, + commodity_map, ) -from xl2times.utils import create_regexp, create_negative_regexp pd.set_option( "display.max_rows", @@ -31,29 +32,51 @@ ) -def get_matching_processes(row, dictionary): - matching_processes = None - for col, key in [ - ("pset_pn", "processes_by_name"), - ("pset_pd", "processes_by_desc"), - ("pset_set", "processes_by_sets"), - ("pset_ci", "processes_by_comm_in"), - ("pset_co", "processes_by_comm_out"), - ]: - if row[col] is not None: - matching_processes = intersect( - matching_processes, - filter_by_pattern( - dictionary[key], row[col].upper() - ), # 20% of runtime here. Avoid regex if no wildcard chars in string? - ) - if matching_processes is not None and any(matching_processes.duplicated()): - raise ValueError("duplicated") - return matching_processes +def _match_uc_wildcards_old( + df: pd.DataFrame, dictionary: dict[str, pd.DataFrame] +) -> pd.DataFrame: + """Old version of the process_uc_wildcards matching logic, for comparison with the new vectorised version. + TODO remove this function once validated. + """ + + def make_str(df): + if df is not None and len(df) != 0: + list_from_df = df.iloc[:, 0].unique() + return ",".join(list_from_df) + else: + return None + + df["process"] = df.apply( + lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 + ) + df["commodity"] = df.apply( + lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 + ) + + cols_to_drop = [col for col in df.columns if col in query_columns] + + df = expand_rows( + datatypes.EmbeddedXlTable( + tag="", + uc_sets={}, + sheetname="", + range="", + filename="", + dataframe=df.drop(columns=cols_to_drop), + ) + ).dataframe + return df class TestTransforms: def test_uc_wildcards(self): + """ + Tests logic that matches wildcards in the process_uc_wildcards transform . 
+ + Old method took 0:01:35.823996 seconds + New method took 0:00:04.622714 seconds, speedup: 20.7x + + """ import pickle dfo = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") @@ -62,113 +85,37 @@ def test_uc_wildcards(self): df = dfo.query("region in ['ACT']") - # df = dfo.query("region in ['NSW']") - # row count per region - - def make_str(df): - if df is not None and len(df) != 0: - list_from_df = df.iloc[:, 0].unique() # 60% of runtime here - return ",".join(list_from_df) - else: - return None - - wildcard_map = { - "pset_pn": "processes_by_name", - "pset_pd": "processes_by_desc", - "pset_set": "processes_by_sets", - "pset_ci": "processes_by_comm_in", - "pset_co": "processes_by_comm_out", - } - - commodity_map = { - "cset_cn": "commodities_by_name", - "cset_cd": "commodities_by_desc", - "cset_set": "commodities_by_sets", - } - t0 = datetime.now() - # This apply() just gets matches the wildcards of process names in the tables against the list of all process names - # Then because there can be multiple matches per table row, `expand_rows` melts the result into long format. - # We can probably do this a lot faster by building a list of all wildcard matches first (avoiding duplicate lookups) as a dataframe - # and then doing an outer-join with the original dataframe. - - match_dfs = [] - for pname_short, pname_long in wildcard_map.items(): - wildcards = df[pname_short].dropna().unique() - processes = dictionary[pname_long] - matches = { - w: filter_by_pattern(processes, wildcards[0].upper())[ - "process" - ].to_list() - for w in wildcards - } - - proc_match_df = pd.DataFrame( - matches.items(), columns=["wildcard", "matches"] - ) - proc_match_df["pname_short"] = pname_short - match_dfs.append(proc_match_df) - wildcard_matches = pd.concat(match_dfs).explode("matches") - - # now cross-join wildcard_matches with df - df2 = df.copy() - df2["process"] = None - for pname_short in wildcard_map.keys(): - if pname_short in df2.columns and any( - pname_short == wildcard_matches["pname_short"] - ): - wild = wildcard_matches[wildcard_matches["pname_short"] == pname_short] - df2 = df2.merge( - wild, left_on=pname_short, right_on="wildcard", how="left" - ).drop(columns=["wildcard", "pname_short"]) - # update process column with matches for pname_short rows - df2["process"].update(df2["matches"]) - df2 = df2.drop(columns=["matches"]) - # df2 = df2.drop(columns=wildcard_map.keys()) - # df2 = df2.drop(columns=commodity_map.keys()) - - df["process"] = df.apply( - lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 + df_new = _match_uc_wildcards( + df, process_map, dictionary, get_matching_processes, "process" ) - - t1 = datetime.now() - print(f"get_matching_processes took {t1 - t0} seconds") - - # df["commodity"] = df.apply( - # lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 - # ) - # t2 = datetime.now() - # print(f"get_matching_commodities took {t2 - t1} seconds") - - cols_to_drop = [col for col in df.columns if col in query_columns] - - dfe = expand_rows( - datatypes.EmbeddedXlTable( - tag="", - uc_sets={}, - sheetname="", - range="", - filename="", - dataframe=df, # .drop(columns=cols_to_drop), - ) - ).dataframe - assert len(set(dfe.columns).symmetric_difference(set(df2.columns))) == 0 - - assert all( - dfe.query( - "`pset_ci`=='ELCHYD,ELCSOL,ELCWIN,Wind01,Solar01,Hydro01'" - ).process - == df2.query( - "`pset_ci`=='ELCHYD,ELCSOL,ELCWIN,Wind01,Solar01,Hydro01'" - ).process + df_new = _match_uc_wildcards( + df_new, commodity_map, 
dictionary, get_matching_commodities, "commodity" ) - # set column order the same - df2 = df2[dfe.columns] + t1 = datetime.now() + df_old = _match_uc_wildcards_old(df, dictionary) + t2 = datetime.now() - assert (dfe.reset_index(drop=True) == df2.reset_index(drop=True)).all().all() + print(f"Old method took {t2 - t1} seconds") + print( + f"New method took {t1 - t0} seconds, speedup: {((t2 - t1) / (t1 - t0)):.1f}x" + ) - print(dfe) + # find first row where df_old and df_new are different + for i, (row_old, row_new) in enumerate( + zip(df_old.itertuples(), df_new.itertuples()) + ): + if row_old != row_new: + print(f"First difference at row {i}") + print(f"Old:\n{df_old.iloc[i - 10: i + 10]}") + print(f"New:\n{df_new.iloc[i - 10: i + 10]}") + break + + assert len(set(df_new.columns).symmetric_difference(set(df_old.columns))) == 0 + assert df_new.fillna(-1).equals( + df_old.fillna(-1) + ), "Dataframes should be equal (ignoring Nones and NaNs)" def test_generate_commodity_groups(self): """ diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 804674d..a45795d 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -5,7 +5,7 @@ from pathlib import Path import pandas as pd from dataclasses import replace -from typing import Dict, List +from typing import Dict, List, Callable from more_itertools import locate, one from itertools import groupby import re @@ -18,14 +18,12 @@ import logging import logging.config - from .utils import max_workers from . import datatypes from . import utils logger = logging.getLogger(__name__) - query_columns = { "pset_set", "pset_pn", @@ -51,6 +49,20 @@ "PRC_VINT": "vintage", } +process_map = { + "pset_pn": "processes_by_name", + "pset_pd": "processes_by_desc", + "pset_set": "processes_by_sets", + "pset_ci": "processes_by_comm_in", + "pset_co": "processes_by_comm_out", +} + +commodity_map = { + "cset_cn": "commodities_by_name", + "cset_cd": "commodities_by_desc", + "cset_set": "commodities_by_sets", +} + def remove_comment_rows( config: datatypes.Config, @@ -1943,7 +1955,7 @@ def process_transform_availability( return result -def filter_by_pattern(df, pattern): +def filter_by_pattern(df: pd.DataFrame, pattern: str) -> pd.DataFrame: # Duplicates can be created when a process has multiple commodities that match the pattern df = df.filter(regex=utils.create_regexp(pattern), axis="index").drop_duplicates() exclude = df.filter(regex=utils.create_negative_regexp(pattern), axis="index").index @@ -1966,9 +1978,11 @@ def get_matching_processes(row, dictionary): ("pset_co", "processes_by_comm_out"), ]: if row[col] is not None: - matching_processes = intersect( - matching_processes, filter_by_pattern(dictionary[key], row[col].upper()) - ) + proc_set = dictionary[key] + pattern = row[col].upper() + filtered = filter_by_pattern(proc_set, pattern) + matching_processes = intersect(matching_processes, filtered) + if matching_processes is not None and any(matching_processes.duplicated()): raise ValueError("duplicated") return matching_processes @@ -2055,45 +2069,18 @@ def process_uc_wildcards( ) -> Dict[str, DataFrame]: tag = datatypes.Tag.uc_t - def make_str(df): - if df is not None and len(df) != 0: - list_from_df = df.iloc[:, 0].unique() - return ",".join(list_from_df) - else: - return None - if tag in tqdm(tables, desc=f"Processing uc_wildcards on tables"): start_time = time.time() df = tables[tag] dictionary = generate_topology_dictionary(tables, model) - # set year column dtype to str - # df["year"] = df["year"].astype(str) - 
df.to_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") - import pickle - - with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "wb") as f: - pickle.dump(dictionary, f) - df["process"] = df.apply( - lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 + df = _match_uc_wildcards( + df, process_map, dictionary, get_matching_processes, "process" ) - df["commodity"] = df.apply( - lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 + df = _match_uc_wildcards( + df, commodity_map, dictionary, get_matching_commodities, "commodity" ) - cols_to_drop = [col for col in df.columns if col in query_columns] - - df = expand_rows( - datatypes.EmbeddedXlTable( - tag="", - uc_sets={}, - sheetname="", - range="", - filename="", - dataframe=df.drop(columns=cols_to_drop), - ) - ).dataframe - tables[tag] = df print( @@ -2103,6 +2090,53 @@ def make_str(df): return tables +def _match_uc_wildcards( + table: pd.DataFrame, + process_map: dict[str, str], + dictionary: dict[str, pd.DataFrame], + matcher: Callable, + result_col: str, +) -> pd.DataFrame: + """ + Match wildcards in the given table using the given process map and dictionary. + + Args: + table: Table to match wildcards in. + process_map: Mapping of column names to process sets. + dictionary: Dictionary of process sets to match against. + matcher: Matching function to use, e.g. get_matching_processes or get_matching_commodities. + result_col: Name of the column to store the matche results in. + + Returns: + The table with the wildcard columns removed and the results of the wildcard matches added as a column named `results_col` + """ + + proc_cols = list(process_map.keys()) + + # most of the speedup happens here - we drop duplicate sets of wildcard columns to save repeated (slow) regex matching + unique_filters = table[proc_cols].drop_duplicates() + + matches = unique_filters.apply( + lambda row: matcher(row, dictionary), axis=1 + ).to_list() + matches = [ + df.iloc[:, 0].to_list() if df is not None and len(df) != 0 else None + for df in matches + ] + filter_matches = unique_filters.reset_index(drop=True).merge( + pd.DataFrame(matches, columns=[result_col]), left_index=True, right_index=True + ) + + # Then we merge the matches back into the original table, re-duplicating the results where the wildcard sets are repeated. 
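+    # The left-join repeats each unique wildcard combination's match list once per matching row, and
+    # explode() then unpacks every list into one row per matched process/commodity, so the result is
+    # equivalent to the previous per-row apply + expand_rows approach while the slow regex matching
+    # above only runs once per unique combination of wildcard values.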
+ table = ( + table.merge(filter_matches, left_on=proc_cols, right_on=proc_cols, how="left") + .explode(result_col) + .reset_index(drop=True) + .drop(columns=proc_cols) + ) + return table + + def process_wildcards( config: datatypes.Config, tables: Dict[str, DataFrame], diff --git a/xl2times/utils.py b/xl2times/utils.py index c042139..a69f94b 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -174,14 +174,17 @@ def get_scalar(table_tag: str, tables: List[datatypes.EmbeddedXlTable]): return table.dataframe["value"].values[0] +@functools.cache def has_negative_patterns(pattern): return pattern[0] == "-" or ",-" in pattern +@functools.cache def remove_negative_patterns(pattern): return ",".join([word for word in pattern.split(",") if word[0] != "-"]) +@functools.cache def remove_positive_patterns(pattern): return ",".join([word[1:] for word in pattern.split(",") if word[0] == "-"]) From e12dfe65e7ef2b287d0f5c0ca5a66bc991b744ae Mon Sep 17 00:00:00 2001 From: Sam West Date: Wed, 21 Feb 2024 09:32:03 +1100 Subject: [PATCH 09/21] Switched to ireland for unit test data --- .gitignore | 1 + pyproject.toml | 3 + .../process_uc_wildcards_ireland_data.parquet | Bin 0 -> 34634 bytes .../process_uc_wildcards_ireland_dict.pkl | Bin 0 -> 179033 bytes tests/test_transforms.py | 73 ++- xl2times/transforms.py | 451 +++++------------- 6 files changed, 144 insertions(+), 384 deletions(-) create mode 100644 tests/data/process_uc_wildcards_ireland_data.parquet create mode 100644 tests/data/process_uc_wildcards_ireland_dict.pkl diff --git a/.gitignore b/.gitignore index 848dbcf..ceafcba 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ docs/api/ .coverage /out.txt *.log +/profile.* diff --git a/pyproject.toml b/pyproject.toml index 7dd7580..a81a5ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,9 @@ filterwarnings = ["ignore::DeprecationWarning", "ignore::UserWarning", "ignore:: # show output, print test coverage report addopts = '-s --durations=0 --durations-min=5.0 --tb=native' +[tool.black] +line-length = 150 # increase default line wrap length for legibility + [tool.poe.tasks] # Automation of common dev tasks etc. # Run with: `poe `, e,g. `poe lint` or `poe benchmark Ireland`. 
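For readers following the wildcard-matching rework in the preceding commits, the sketch below illustrates the dedupe/merge/explode pattern that the new `_match_uc_wildcards` helper applies: match each distinct combination of wildcard columns once, then join the match lists back onto the full table and explode them into one row per match. The `catalogue`, the `toy_matcher` prefix matching and the literal values here are invented for illustration only; the real helper matches columns such as `pset_pn` against the topology dictionary with compiled regexes via `get_matching_processes`/`get_matching_commodities`.

```python
import pandas as pd

# Toy stand-ins: in xl2times the matcher queries the topology dictionary built from the model.
catalogue = ["COAL01", "COAL02", "GAS01", "WIND01"]

def toy_matcher(pattern: str) -> list[str]:
    # crude prefix match instead of the real wildcard-to-regex lookup
    return [name for name in catalogue if name.startswith(pattern.rstrip("*"))]

# A table where the same wildcard value appears on many rows, as in large TIMES models.
table = pd.DataFrame({"pset_pn": ["COAL*", "COAL*", "GAS*"], "value": [1.0, 2.0, 3.0]})

# 1. Run the (slow) matcher once per *unique* wildcard value rather than once per row.
unique_filters = table[["pset_pn"]].drop_duplicates().reset_index(drop=True)
unique_filters["process"] = unique_filters["pset_pn"].map(toy_matcher)

# 2. Left-join the match lists back onto the full table and explode to one row per match.
result = (
    table.merge(unique_filters, on="pset_pn", how="left")
    .explode("process")
    .drop(columns=["pset_pn"])
    .reset_index(drop=True)
)
print(result)
#    value process
# 0    1.0  COAL01
# 1    1.0  COAL02
# 2    2.0  COAL01
# 3    2.0  COAL02
# 4    3.0   GAS01
```

Because the expensive pattern matching now runs once per distinct wildcard combination instead of once per table row, this is where the ~10-20x speed-up reported in the earlier commit message comes from.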
diff --git a/tests/data/process_uc_wildcards_ireland_data.parquet b/tests/data/process_uc_wildcards_ireland_data.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..5efddc3b4e3b9c53e1b85b3c84922b3b524d7471
GIT binary patch
literal 34634
[base85-encoded binary payload of the new parquet test fixture omitted; the patch text is truncated at this point]
zs+Bn+!_%Rl&-BUhZ5EmP;iF$`@5hoR z?|S`Zxx@|GY5t4lafa(g{i#01+!t^I#C)S(sO9h*ANbt>eN!w}nrMw}%=9p5&n#Fg zv_|j(=3!RjSTQ#|UWUSQ4Nz&~Ye*Z@ZVV^l?6eqnh8zw|D%2Y_IYrLrTJ5!X*QyUG zyy(@36h0X^X{_2V7V4eKcso~YHJa@lGlz*|g?1a|MvggT*^e0V>_-fl<|Z(HHzs;z zIFkr6n@I#2orno3ieZa{JAcBRELIx8AY`;j%PWaQ)V30XU0M2;kwy&%EbzF*(SRf^BGJwj*3Oua@yDF)=MxJL_^bX%Js8U`mB<${MxtN6Ym1)SXXg=(8^RP#LYCJ#D%rG6FyI zGhQC9;z_1i?z9@U^7xp{<1i_Uw=@ear~U*PgIBj&jgfMlNy`m%ESuHwsY1-NT5>Wu zPCxAYXRKQ1vps%ELQfrA`oX?Cef)5vQNqJ#DTlY7$}P-;+6kF6I=Ed_Ylxwo4Y-z= z6h9eX>=>2$!nW9?+%8yBz!0`;v(PNJFd8=bId!{2D~*X5K+{YCTKKK z9$ZHW<2Ae#r&ZM3%|@#ugy6LC;X+YAOT-hLV0;jXq*cOE#YP>^8+Iuut|I6lINl6B z2S>}Gw+D?LN9%P=L=Ioc%l3>f*BWg!T$e29uNXh#@#eN@#}{h(n!?Cc1N5)3^?MG)^QTMu0+wz;9yM5PawCN3B*T5#<| zVF*ip6X*}#w3B5OunT0!%`iTtWatMmH$EAe;pRjOYh%`!H90NTY)qiHmI`CKeWOR0 zTWcHD79(@zORDI<93WbydZE}FZ&wPu>24oIxQT5aMY!W_A4T|_q(&dP`;YMrMzMNl zjhx!sK-oDVEhd}A;$X6qkLaC7B5%=BcN~XF-6E~SYRdwj}?WGEC-(ITV zuS+uv$Z2>#{u;utzpgeuhItTt3$NwIv0RiJz==@znsUBVeD5iThwGVc$*A{IwHS!0c)Nb&_?H}0r!P@YQO;3%Bwy(OO?sRFkB@M=wR;7VOA5`1+P=l(Qn5{K|-`nK^j`YJ$^9n4!p;2PB ztzF?7Jr+H-ljfC4%|n?shmF26+=%XCcxM9ZVeYA>J3W6T@ZS8Dpy|(F2?nF=Nk>Ni z30hUBFj2=d#2B{I*|xTyG8Atlv$p!3xpmgjxL#)Ut7#%S5nfM8Q)lR5BMKfBCh)ca zUO(Z{n%(KuEEpz#Ts~ftYmSdKu_|H1;kpeTJIt6}!KMuzJlCXXa&t|JnwV=+G_Pai znw(x*#io~j83(UtU_N*_niF>$Iyr6(aILCd?o8l(((d%B{^?Sz<6uJszq^5j+P>Q^oQa1*sx}Y!r;{*y|w0F-eHV(a6INv z#^B_PM3SQz;a9=q$&O{LYzc7z?8Ksj>OCjd@#7*#Qp8nlJn}e37OUiPw+|YQnR?;iUoep+jFkVgOFvB@v zCK>)p+{{84on{uoI5eY4IGbrk84Nlz%3y*#qYRFAno$M|IvE9P7Fr#>ai#ruV!9uN z5FOcv6g=676kIv8S@7k|LU88HLh$CyLU8BILhxt0&vCfcjLlNWWH(DGl-Vq$P-e3f zLYd7{$YeH4A(PoGg-m9%6f$t*>9WP)H+EWh;g>Js;Z0P#JIU=qyadx2!Hi|Ol*WSW zD2Ds@j`L1&Zfs~x4!r9xZ!DpAVB zr-zmaOs}zdZl6Ww+Rw7I%%PuUshV{^%ThF}ewL+VPW^0N%HsVjOWExES(c_b^|LHx zv+HMBT4vGDvQ*5vpJgeUv7goC$dRR`*Hc#5ZG*ctemAgU-gNPI#$d=Ou*Nwm_>2F9VOZ#KD6sM`55uwnBo|y)OY9BiK9S^pf zE(`~~ZY$15eT@eF7}f7!&`&#>aV)r?50CdA3i@e3{fq?t6lcbPAW#w;2wX+Yu5V>1 znM2>o&@}75m7!-=eJex9och*$bjABthOXK7tqetT>RTDQX4khel+2=UWoVdn-^$Q4 zW8WH;Lq&#?{_nBDLwt0Pb)7W0W8(GK=&2{X#iq&!qd<5|^Gg-88wE0y%wZJB&@}5& zAVbfrMu7|+a~cKaqboiNWaygxD3GCOPNP7EuGx(O8A@g`3S?-Q^(c^`XU0(==v(0^ zFvq@?p=1tyD?`()`&NdYS@o?99dqhi^U)RWTN%1$-?uUp&8cr?=$c*M%1|OGsM0laOJCOKv1HUoFoeulA2I1(^$`>8RUa|Ye(EEZ#MehgjmN}=i@VbqRTxWR zt1v^qY1RD91N(AoCcQCmeg}mcL_SS^kcR&E)Tx zxc7HTO~yop8{L=KLG{GMZS|z+?bY^Y77xrgA-${E-TF0rFDkRYdZEB&r_7+yJ1k=q z@LB^qk4-i-FQ{!yY3^Piuwv-ms%dTa~9A>a-Wio|jB5$?vY5|UNSX;n1^*C;83}+am6eW?2 zbEk2AKo>S^y3QZ_|lnNbu|BL;53~>6bq+xU70FRGrjpHqP zj6XTtEW}H}M^?x5kpU3FA#US11A%8x=;bRMmJ&r)cub_Gm*IPDybp_GHp=*%|KLbO zfBl|J35Vjd+H!7LE;rQTcVkvWru4>PrWz^qPmMTqBw{f+lArx!&Kt$qcR^T8lt1G2 z{5a(tfFL`}vUw+GsYG_Mulw;HSEQEt@R%5J{00tNgX@oqWJJfO>&NkauyAW+n3Tga zNTdK~mekP>F$6}OTXPt|%h@ma@e_^~V*1zzI*$Cq;>N6snF|gU{ z$+mhn;;?%*;;?)+;;{Y1!1mllIGxP5$jH&6GR-k4IJ2pYvrI)@%p`)6%p?NqXAygFUmSUyal!o+-hZQ~JVM5P%;WRs8bbo@dt3N?xeO7wb=yE(u7~evyEsx;+a~=?6 zXNGQgm}5#9H^-E)He=kh$Ml+Sf`q$fBM!S}BMxJhq`MBMN!qWIge85;!i>IUVZ*9) z8?aK0QzlznpVf!DB|I~ybg*qs>0swE>2`8l$HG`-gR2716XsJfXtD^^JQUwd>a$>T$Jrb!cB!=aT`bIkayb>#*H;lSbc|VuiSoH7PEURK}ad9;?aOnU;qnOp7YQe8$X)zhbOO(+5PT4J8S@v&=o96e6} zO?aIL{N?oW1dln#v3dCLg$1wHqXm~hrA}ZTm-|a_26M6clHr`d z(O%6|SAr=*kfLXi$I5ygrmqA#15<*&cRKmrAjuQAB#`^O<@aXFHaQ`Y^*-auJi39#14IslX4? 
zFvD8muJj^?j~~_*)W+E-OlD6vwxx6_*5urkP8$k&1qV=p9o=$^5x@YeIXlKV?m)`< zF~SP&PtrCFOC3xKQ)0LLVd;Y`9SR?5+3>;LCfSFPMSKe;GlOP99YPT|MCgMVL7)IS zX)Q{sjgPisjgTh&YJttfoV$qGR$YIQnvaS+QN`C_ep@ynXA7pK{4$E$)D`;hBj|3| zS+FeA>2w<|y3=RsAzefdH}F||E%cHW4=8-q9^b)sAG~7_!|xO5Pvh|bg0-hUB{AdZ zQxbEGL-27m&mk~h#Onuf94Cn-2l8??RDbN;T(mmMV9pWgCBYU zgy`YMc!#5EQGY6^Q!Vc!RV>LROkx%U+=th)DT#9d+EJ$V@L_kL!eTA845L}+yyZ(* ztX^WukfUnd-i(&S37XyMz0b$*n(G2-`};_-&%p9L(s)Ppkz}uP2Sgsn$E<3N*Yl0x zd?DY$Z*yRUl-}2ej}7B!-U`--c=)G2;F8S;?$#ec4A*$7@o=j#W=iOMXMgSnrKCsH^@5pGL$}M`L-=1O)GBok^MCgL5tTSuPL2#{txQ;#q=*jZKu8)?<@eF(zwk zsv|{wX?=21pinR1gybPKZ1cM`m}+1UOUk7$4B@wZxc`IS>B8gEKr)rTX{8^Sg9pMM zX<~JSdUR(^VyT+#fu9z;p$v|P0Y=K(2P|9Ni3L9Z7vmPCX zF%qC=@e2xEj*dM$;`jIPYrOb3y)x?|vS<3~ld`hK6Dz)*D%a5ru+ei;miAC=P14Qo zlo)-Z<11PEv6mmY7qNPptKkfH?Bp%S&35jIasQorV%(DFo)}vNb5D#Z&P38-y+^ZR z5kKN!mYv;CbWDmf;c&}n7x4Utlc|!L**VP!skOAqSiA16kNoD7&$q>H5obnAO*hB=ig zGNd%O^ju-eRdFM%;5|kh4Kp$~HQaXRuLlTC7<}XRpP8L&Pxy^nRhx&d6w|g#4{~Rb zav82!;9*!8^G*(Kd;Vl0?C$7^E-G-aUA`D+Ny@Ko{QDr$N3D5InDBq0Ra=+46n8=Q z0~byf_lV$=e#hrnUB+UJukqsmC-VQ-GF{Jmr1{73umdf2lq#BJ>bTksXkj+s6Sa6BIrA(C*N>7nW z-IJ9*{Q&basWmvhBoNf+*FhG)6v`7e_y>=#aQCWA-5E(xO>NBIg3ow+Wo2HnV`j{L z2oEis3701D=73%g559TrTM-|Awuhe5cG80*?zW3P!`n&ENJ3BWK}4T^W{*s-wcuEn z*=b8w-8_^f>u(;)l9f0QWyzYHhq7dK24BCYR=`NBF^(e`(c5S0g2CZKv(c5T$~mh_ zR^*&jC984Hs*;sBXI056tj{>~rdA=x-}Fv7HZO(A%AB9RWR1>GU$R=~r!QHz^V65C z;PYgCaLu4F)NZu!+g5n8D|;p?l{1o^Ad~sACwemg*Y` znaRBi~|Bs4wZht9Pij^UH&g~H_dHcMfOZu~TvRdc- zc$uusIjc%m;lh(D*ZYHKr_>s?kLJm0o!^sXvTo<6FImA0PpZ0dxrK3WKdA;!4t|21 zd`Mk*;Kakr!ULz<77Ln9>V}bdlntKeQ{}`{?7{O)l-1!>n9vA36BZFe`GB^n^Hm z%l%N86$^H^^_PZRVu;@R)LsqB86L^*fAvHQ20yA7X>)IYYMHD zzCg>;bS*rb#J(}i*H-wkX?1Kv&ZMwcP4%|#gYPA_>KD&FXEr8a` zZ*o#va@2IOkd^N=^7y0WGY-GZkk9k=?qz*upg@ zX#B2ICFC=7vx_HW&rY{Pzgx(~Vfh*%zoGD*3_WI5dRpn_xQa>kw9}GDre@y(^8E|z zMw6mEYaqw1BB?w}7|Jo~pWkPK^o|P!=k%{2WwcKzT-Lzdolfywmaz8RiAmF36En9G zbPc@i)`yfs_Z%K5p`m?Rq<@=iH6I73#CKBp&5O^E9bp$nGM7BZ!E0aMyS zdV=)EFhg;YBY4Uw$WeZAaqWQBNe9F_r|r(kk^8J<(mqG+6@8`*N9=Qxsidhhjn*k0 z!#B3Xn3r-ee8wn!a?+~Q5jv$2yRMVSJw<2g$h<0qBMPiVPbPEeo?`-k)p%hyZzV9Q=>a@G4 zSYOdcuERAc*(FXn?=M;EJ>P#`>sMJ)vrM$gKGP+}6WdCA<%Tmia0EnRWauYl1wXr+ zD-RFja6)&ORI}A6jpG;^e2JnDpqb#{iSL;+<8kEKI`@5Z4?TEXZsS|!W6DaB6luZ}cLkW!aLcM??0^yS27~^E$FMnqx}wL;(>FZE@Rm>2mq_K_z9B=%kD! 
zvEm$+AvQGD#`@|R8L=59JuyRE9wKVWn^qWqx+lJ`Y1(tV;{`?rJhKI7#G4L>0~ztt zNV)bHmiBUs2XKNC&Imgyh{HE2BW1+t^OX>Lh(CCRA6ArydH4|zb~ze7y@@p!eUz3x z^-iZ6DzuR7NExjg@ZwcaX(EsCI-@gJ%x4=oBqb$|#kX1LEXP~05QkuNaEc_31*#*z zGcuCttfNF79J$1wrdsU97n_WiV9Bt?)1SBm-f9%^;Uz@bG6o~jxDBu?NY|{QBJhQm zEXCr`f_Re7_)uA&i-fD6@)TEe~RycdcWk%s->|s&~@IgV9Z(!nw9Uzvf+x;VfDF&IPa+@ z55d%JPj^jw0ITmILHj?Nen~yXRV!E}qi50i^w43|_UL`mzVz-VCDQw!?w}cm%tjqB zEFwCX>qCc&63)H{_DVf-d+wEb<|cipu-whB`W-p4)6Q&lcG{V3K3mv4+kTix>wDA8 z9WkDEHa$@`u+*+74@}PPi{@0;Gqm{MnsjTe`S=(aWBQPb61@25@eV}r5zjggp>&%uE zJ$taXPMlPTsQ}j5&A0H@kgSuPlY^KuXn969u@v@LZtWLwo^?>#L=Cw-^e~)KXm_@t zpB_Z#dFzaL{H~kq2011DI+Lhy>}gqAeeW19`ten}C+#5Xn16Wsw5@}FlL(Ve&N7L| z6${m8X35RuNzs&NaO4bc*HN)|U&N z@mVL1>t?HKGm!KR9QB7jD?4i-7rBnd_v<-PkV2n&?vF9|XZ@udVXh6)Lr8Q;InMs} zXV2mkaC4-2V$!AkBaS8)sRSEKh7FLz{K4Y>Kyq8oqkT+x+_;TCs5Cw{wod#?R=tIT zEfVY)7j+VBsXW%*xUIWkYNuUx*kS4`yYH~W4Yzh%-Hn8)Zz4GLEyCS|dkEhK+|-@= zuKDwQ{JDMW4I8EpxPHrZmv7vr5_4L441-MDRQSM9Ro=I%}1#cFr8NbRHcR|k#m za7$MmavM}H9^G+t$FaM1zO}24x#Kd(9IsALC#sXw$?6n!s#>g;sMFNx>I{`vOVu*9 zTn(r}wL+~_tJIn5EVWvlt=6b>)VbRPo~U8k;B4^UgwR<%vtpdP3mq#mqpR1Z-PRS#1)sfVjas7I=s)uYs-)nn8x>Q;4| zx?SC&9;+Uwrq$!s6Vwyclhl*dQ`DWRtDdT!rk<{zp`NLprJk*xqn@jtr=G7~pkAn6 zq+YCEqF$*CoCl_BP=Hj5C#b=2rCJz2xk(`BCIByO;|%X zhj1?8Ji_^e3kVky3WOm-kx(L(3B!aDLWNKzTtpZp)CgmQI-x;m5-ujR2yH@#Fiu!Y zm>^t2SVy>&a38{b36~M>N0=nspRk^=fp9tD3c^Ohm4vGZif}byif|2K6X9CIX2NxZ z>j@7aY$0qVY$M!2cp%|Hga;FDBs_%hP{P9qHxV9Acm&~*gqsPEB0QS#7{V=tTM4%j zZYSJ9cr4*@glWR#2~QwAk?K@LR&~2)`%%f$&Gdp9p^@{DtsW!rusgC;Wr( zPr|bCxxIbY% zVFTfE!WD##gewVG5ftHS!W7{e!Y0DCgw2HO2-g!HK-faqO4vrYf$%`Wg9r~M+(>u` z;h}_w5pE(pobU+3BMCPX9z}RG;W30;2)7b$Biv57gYa0w;|SA)#}l4Fcp~9RgeMc8 zLb#LAB|MezG{Vyf&mcUL@GQc!3C|%sm+(Bo^9e5?ypZrB!ixznA-t6EGQ!IVuOPgV z@G8Qq39li%mhd{l>j`flypixG!kY`mB*urFah!v2H<2nP}lA{XVJ%^Ta0y`@;ZnkV2=^siMz|kgl5l^*dcp?6<%BB;8wpnut|BPH)r2X+HH1xs zYYCeP*AcEKJb&;bnxE6J9}hCE-|gx3?^KzJkJO@ucS z-a>dQ;cbMs6W&31C*fU$cN5-2crW37g!dCZK=>fxLxc|#K0^2?;bVl46Fxz>i||Rp zrwE@We1`B@!siH|CwzhMMZ%W|UnYEo@KwUs2wx|BgYZqlw+MF=?jd}e@EyW;3Ev}p zpYQ|14+%da{Fv|)!cPf51L!40Oq4J66sU{_kJ6JJ|BZ z%qPOEHvB(XmSpR?w`*-d|AGe=Jh0$_1rPjpc|hJN{4KvGIv#TEp1-7{%`j{ ze7>|jUJ5)ocq#Dm>#n_a^6Jffycei;b?*gE?>XGb7e-n`BUssT*QHu?UBy?NmM^vO zf}_30Z?OAoeAW?Nmm~Q~Ig-aG&RzzeVfpLI;7E7sNBiLi)2-slAR-qP@OdA;ti)$T z-Kn22)-i)*e(ITarL1Ls<{5lxNya2-HpZg>K2g@PzcjLVyWZviSredg6R>(sz$P>* zb((?h)bD(bni=d){XS$?bf^9hGAp}Nf0WGNa>)G2GurYM(sCVecH$S=BRgF{+vx(< z5Fs`wi{k>iI66=oF)q*w5YQI*0DXtRzF0Oz1F06#L=wcHkQf_|Nwht&2)|fd!!MO< z|L?ZHu1_)zQ^UJ^2A`a2nLRwyu&rQE&*1ZEEwhNso&~<4-D(b?O7;o})X3fe0cYIBcAW_1Of=o#idfR1|-nBEZCM){K|-xG{HEvNh`lzasg&;tRvx9M z{9?-Y1S3zMP<{#JdxDXt$0&aq<$HpW*R_dOaysM%gJyC@$e^9%Lk5G;QpxB+Xqjj1 zAT+f+Vp!wQGvFCro5+0-OmB#68-Q&qDBlx=JaSw~`I$i^Lg7|1(ahiz4GtiuGnr^+ zFo|^IL87ymXlAg9*n*H~H51JY`fiAm2#Sq@xmDu2S&pvt$L_pif*%^i7NEVZ4BOi; zcz?^qwZC;!9=G?Y3t@*#RBViT1^!+RL4S{DBFqn_N+J=4-R@MGrAYy&xix`Q!xSPo zwoZ0L5PA|&2?$IA_O9kJ=gZg2WBD@r#zj;Uf)g1o3~7;3iu4ICZKu%GX=_Xyf)kmr zB4ZRu0VmhIWTjSBr(6h5q*xf%A`OaUffH`oe47*w!HG2L#yJ;Lq)%{Z$E^Am(`G>= zT-JuSDI9_mDL0Hrhaw?3k&dm)af*cCL~sU%&S)(~QozaKcsMOLLAem74{GLH^@)6` zFqX#$OUsw%3kBRCQ&`+7Q&`+HQ`nW7?Nsu`;z%ctenPz^%z_~cwjtP#$t}*B+~RDS z-XYu!tX`4Z5N0fCEc6~6*?We^ zDMhH%eGLiGO5NA7W{tUpk@4>I{;s$frcvBc*czwOygiy;H=r}5QLaQyLfW*P&^5Jk zn!{@4G>6s7X%4GZQ#DjmHB?h=sG=%kO`*nW@SO42SQ9xVIOLPy!UkjdUVXwe7&IeM zTg{=gCfghua`i15Vyj|G8aBjM#gx>sCb#xznhotySt{Io(F!_2M{t}hmE)k%m(+2_ zpDu6H);jbPIzgO8`b5L-tVav!8Lqi<|n*f2XQOo zCq&@=LQ9#Rmv-4&!9>0>~aFbF_e|m-D_ta02?!D4R15@bL*p z+=@r5t*2{KuVzYuP6^iAiEBS8!?mBp;o6rM*WN^2<4CLS)ElXh;4~I09g{yKaEe%W 
z>dhij&+pzM2tSX5v|p3KwVumi6;(VcoXV?D@hi`}XBFw%pJEOn>e%SaD0vfQ0|A0s`;$l>nP`x)s$Mq=gZ z1B~?GBd^zf_#h)a__839e)te$Jp{4v!-pB`A&7+^KEhZJGF$lJqm1+*BjJaSG17yK zgdaZ6NDnd+e)t3q z>p^AMW_Fdy4j z2a2PU=uaVm&KnZw<(e?*ixVyh=ktn8RL@_NR zifI{9OebUO2!)c2DW+vaF)ibY`5SAvrA09<z6GIq2{m;+4{hk9IJaU!YOPYV1!dFBb?%7BBw|us(a`WPO&o1DgPF=2|?i$E9jiE z!vSy#!PPyag;T6_%qfr&#iZlFh+z<5o zicRI5azs>{6ckReg3c*N(J2I5_mmFno=oTzGNPDp9GS50$%tZF8Bt8jh+;~{*FE)! zVpGpu{2it65lILB?opI)}s_Z9U~IVyb>F!kOR%o8#T7^C*}F)BCn@ugn+QzAbse zH4{(3bK3Eqa1Ee9(>x3Jg<`Jy4scn)Tlq>e6mm@vY9`|L7(&=%2w{&QoIO=*k70#9 zhDc=P3}g|C^o3QV%cKdB1m0?n$GinXSZ5@Jb%t=(wIXjJov_Y`Ib)2sAd&Ev;hg(W zCJ=5UFzy2(>@gC;9z!^LE{(i}biy7ZCW>wdXQ1&Gl7!v@A(9kF=}%#dAw&{Gh-?iJ z>Ie%2i7;D3ggS&B`?d2pri_L-tzdA}cZ_M2YcYnYLOo0sGE60yz1#VwO~%`V3}Qt< z2C)_(gIEPX?=BU~u)&TQ#47YU7ZI|x${*He!Yg4$fG1t)vK7e*r&6>eeK2re7kiZJo+=In!aly>a2rKH;{;eq-yWk;0!rUOdf!O8ge8{l)MI19<<&UTxjXeZLE~&Pm*aGd zwyGabdViyuC+s=#C&YOx|3sk%=TEYgB2yLx2ZYL^JlP65+NOVsphb3fS{mmpY9&^> zYw23ElMi#Nq%!odP5N|~6l?jy@z03SR{EKi9<$PxewL*xA^mJik917!S?TAH-m{ZO z+nhA*6MkJ+oYd`eFL-FYg~1PNW_S?jvzK4;pL_z{O_KPHn?dEa}2$sR}9k*quCC1Pelgi0NddRdI zE$Z3J$Y9ZpIz$NRMJP&(rMmn?LECNhlj&4T%T11MQ5EC3wMsHx#xF5L^FC(TAGcF1 z`@;$o9v;%W9zKlwFXoR?F4d8uNnvb5E}EeX80%yx!5~Pb&g*E|AYcb}7PyA06OoD% zVfT{+yb~qDYNd6%D^6uF9~Dp+hm3I>mvM&*xV2~}+E5*XG+gT9u;RJ59c_nO`65JJ z5QFashEWvxfd&3Ugs-vA-(J2lueS{j)4cPkjycoBq^3aT z87UBRgx2M1*yd8aZp$?&LuMS(9TnMtTGZ)QWirx?#hn zt2cYisj<*}WYS}!S8v*K-F4fL)GowhhYl^qSd2=x3%u8Ft7ARyZVXc~aLaahB+pai z9*TtEMDP%!MZQgu5Zr127PI&S21S~g&Q$!lJLGu=q`p z6c#fK8ezQS4D&|BXlq1_wMN8PYebB-M#NZaMC3zwY=t)>MyGjVMLx|FEAnZcSdmZj z#EQHzBCH}`9dX(i5oe7Nan=|SXN?hY7Dlk`82_-b>b5hr(}8Hki3m&ycV@(aVG5IE z7sN~vMC<2@LfC{ggKi(KKe{FyF?`N(siFp)!WcbR(&6Hp?2;%>HiJ^^Bg}MySl{2* z(MF@|yZbrXsCD%IDKvB+09u3d12WJ_kimhBBj`Aji|qI&1IZW}+3}5xWQ@#kQ+sXd zFy=uJ`MbEo8AlNLSJno@46K!m(e3KU4l|H(Jy$m$HY&+&T&(+elol<-w`^NoR>5FC z>RyqGLD9V=8+9+qM%_!Y(LfWi1HO+NFS=4VBVuec&`1^0KqJ{`ppk5v5!w?|r?D0| zJ_dQx9aE=gL}4H4jEpGkBIPrpu!poXBMOg#%NS+v3X$Q#SSabV%93NDlufHFITlLU zXe^9a-(te%erCr)FtM>Pl(CE1;mF5z*$f6Y4@X9MK@3bIjy8+gZiuTmdP1*p zJGz=eL-$3XHP})!12^)GGLB$J9>&RW&SW4NV~HH+jErQAj2qpW%!45EcX1apjv(@v z^D+$iGVp~}@xogF7)=w(}6}~G4tRMFR)(<&?mE&GO z+x~1y?@Hzd-x%x1y+DH>_X0M}2s7?o&N?vTo=Bm`y(==JFz#*4h{CvcWkwXny{j^! 
zFzzWvB`Qmfd&0UE?*0(93G{O@?gg@Gl_kc#K-Q0Yg7qy1BV0iHIbgv0aW7!~xEHW~ z+zVJg?vahgy&LEvGwz89^z#QY!i;-9!i;+lLd+CF^uTc3V}fwpBjb9R8TZJ9$Zmq$5aTdR zi?}F+I2I*}b0IA0n+0fY&mekgnobm~v4boBdXI*C3e*x3m=Alxm)|APL_z{%|5I#M zS~K4io~+};ggYhchib20dsAw|1A1tYCZvR`WUoUL2??&VIGeQu*I5Dr>ntI0W_Rkj zfw?-}S#dGEGaRNnTM)Y3QnMz`6cVl>LkIR_?4E|L1Vgw+>m{@fQ%y!fc*zj1RnjoO z;pJ*JBD|Nd;StuoBcz1LHrfH>+rp;!NTMZB_pguK=2+DI8v<6EsJFbrx<@)uDwAJW z_mBwd-VxFwE#d3lN%*>Vgs&k-pzhxlm>br;6Z3WN2w(Rgv^S(?P58QpL|FGuhp$-> zVck2WQbKqMbq@*YSggK=UWy3grD$``whlvxsy2jE8g8^Pjb%hWV8qs&l@Z%&mJwTc zf)RSeZW;w-5XLWRqvS1YS#do#3gAZaWmKF7L8A#68E!-P1Pma-*O-7HWP$`}FHvgL zs0qMHktznD>Taqc2pO-6GF}w}fhshNJ=;T=hYn6OF#-7=mF(574C<}D(@H4Hv%JL%^+CuK`n|ep1hWku^?1&p*^q$PDPj{ z?9s;WdX@^@Q8A={Z(ZZo;8@{suU7GP3I(q+p)zY zT!Y`&nv58?M~SlYo8es!GBxIC7({wk(x`Ax50l+3CX5;*y?cyCjgj7iG-}N0V2y!i zjghi7M#)7&fvqvpdy$3%D9!y^Q)7(TI~HSVj4}JfVoZ%OX5UzhsWHav7mG19#+dzM zF{Z{CbAX76Y78;X;Hbtz($*MB>j<`H)Q@0njaf$Q6Z4bp!S($ZJheFr^+L%b zC~0Y!!-gURZ6=lxyUweOGO_C{7`y8XX6jh0z7}m5p9%UnLEFkCowH_BT5rCirI9~g zh*LRiCw)R(-0~+1aa#h2vy@b9OKBOg!F?7~o??@dwuX^DRnS%=Xliunku?RE2tlT| z^l5@-0q~lHNN+8&jM!UW4UKu5l(e;o^in}vi$K#NmmXPDaJdk)7Lgthw6#d2w-#AO zZ0nzmq$_Pw(pC^OvnFGWA%m@dZt?Sn&!)~oA`N7@)vwvrR0V>&{hBzNVjiaV!BHJ+ ztCRToG84Uq@|MIeml^ULklXKc?&;m_5hL9>9MP?I^+$B?fhh~_`vZ71<|SG(FL%Qw zZ~4=^VGa(iJG~2f9B!?vN%J4S2!6Vy27X4mcvORRt}Y8IeRtIsuk+U zad(`Ok$olX+0)r03}X9mG%QM!#$+mu#1?nId$7cwWy}%Rj!B7h2tHTQ6>zreAEY?y#URIi%nv zz@aK8`jiqcbi{PsW9Q|C_1cG8l;qpWHYKOh0<`H``1O`(t|gN$xZ4M|hrD^eC69 zYLp`Bs99O)z1kyHkGy;Vd1#NW6rR(gFOr&j6lb2L=wbZ!(tIx&?@lM!t@ni~1!U%S z!NU!vnMLxVheFfQQd40a#VG7?(L=L0!!#0P;V*ntkBc6e_7mmHBlE8T!JB;kL_}H*}h3Vt+%y!?E^~yZ4~*`d$>= za?C&sQKK9eW-k2y+B@$6Ig9d-Q$k`$0)Z4D5L|jF2}vjlg1C3LN!DCr?=B(q7KBxb z0xBwYihB0LioN$U>HX#>?zq>y^_sq=m&dfVI@4Pef z&ODQO4+q7tOWabhdaq+ zCe8?uJ@Yb?j@gJvKK0SVPa#O;?dRT`-d8t^FC}N@0?edAa)(YtXR5aLvvzJ!vnpD$ zJ6TTc+mzGFe0KISAEc2Vx0g(b_oAB8&aEU@sG!#H3eA+P;T7s{tl<^_VaEfxg!3G?OqXo#dfcVzhb*r#9y)H z6%*A8*-m~c*jJ%^>`NdG$|`4l^fu8uMOWF08!l%C7M~Uo+V_lfLh5#~ARpUjkPu=H zKIVN$%t5dt-c}R6gxg?4p9w(_#CP5*qt;4f7cGLGu$D zvEv!Kc)pvN?b+pb%jj8}Jh?s`ZqxtsUx4H`J=x}7s{in#1l0>{>JXAvPUleuEX*oB z^&=&k6=YU4Ysai;mI3KKBn7*0$I5osy_M~vA9p27kwSc4WH+euReXp(J}=w-_%S~G zCn`?J{kZG&vvCLi0jA()8}3HDc&VB3jQSi1;AJR#z+=gj90<=3`)2<96q0wX3K9L9^P zCo-21+?$$B=bM`C=7bqr2nc3?Snz--&g{b!r#5}|@*idY1WU_ULMRwsk_X$2$ex1D zl{3jOJTceDWSAiPE>3kivAB?bRW{}p@z|eJF$bOC6NEL3&~#eT2t(#Wgh2?22Ay`& z%3jrukEs=un5`IT4#VXk_23iA+2oL>Sv9WlU&DNceH1mr(L!iv!pDB$Q#@J0&PymL z2=>`V**r0+U|SaX*jHpKWP|h&s8UI%g`me$YZ0V_`B?woJ&CRUHI!20^iO_J$X*K} zd%xM+3_e-q5VN`8oZ<3-_9yz`9CIdK&Kq$0%adsqF85O_ZBDtU(;r5Lx;zX+u(ahz zhIvCPI`d&H5QH!mu=g&VtWOwR1R-=XgJ2(6UH<<^$QD%ACzM7uD2%6q5R`4pkXxAH zhYLbb4nE;W2|n4>vpyl7Y*2_W8x*QU5Q1_L>_ep6$#%k91lb0k3Tc)*crvOjOuCuAcV6e@BMf^xQeLfs5P&l3n(4GOh42thepJ|QRw z;YAuMQV1Fwf#I{Vs-1&&ENoJl>O8wTgTKZWRv`#BtJ2I0_D<6v+vAguy{}XVK|!#0 zDxEbFLZU6M%4XI6Olevno}e5u7BZhrA{0vy!sS_^og_k9A@v|+Q*Xy7B-%nbo7Hty zJ|WSNM7Y5a^k@VXjt;a_6-8jFjuHGSo-nXqT4NSmI(-BBvf4s9uJ1uflSNiLEC&h^ z6(r`q^|}>6T9x8f3Te5o1~`Ojxo^6(DCCz>PMmGngmF;Brp7F+CIFW9FizA`Z5=0` z%g)ypq^Q1aq)EWfLHx?w3y*#AeB8i-gnPszS zHiAxwXN`Pvdq^l^bsG zSr^Cj&dMIsd*-?4v~f=F6%+kAz4fbgPHg=ea4onFTn}ylH-ekM&EOWmsk-%B0Vn91 zLv!n#lUu(H+zvPjw|)os2e=d51vuT-9B6CKu(jaqT62JHoil4K?nW}W2g%@Gr2D}A z-~sR;cnCZU9swJ1cEZMBBA5i8049UY!IXfhNCwl847NZr*b>QLI+DRwNCsOY8O%U3 zn2BUC3&~(MlEF4e1{}L;&R#X=ty;{%Z+ox<;7HXvr>NFBIMtk$YK}*(b0TV;Lr~2* zo#tpxa~fxT0e<^}gAUJG6aUJu>?-U!|V-V8>-CEzXK zt>A6o?cg2Yo#0aNF7R&f9`Ii9KJb3<0q{ZaA@E`F5%5v)G4OHl3GhkqDe!6V8Sq)~ zIq-S#1@J}iCGchN74TK?HSl$C8TbbHCioWkHuw(sF8Ch!KKKFnA@~vaG587iDfk)q 
zIrs(mCAb{?3j7-U2K*LW0e%Or1iuG=0DlC30#|{n!8I*$Eun?Dj?h9}PiP@-AhZxS z5?Y9x2rb0Tgcjl!LJRR{LJM&#p@sMhp@sM>p??Fnf!o2~!5!cq;7)KC_$T-m_&4|u z_%FB{+|v^G5?YA+2rb0@gcjlfLJRRAp@n#e&_X;+XdxaUv=HIA9CKz)Gd9MMZXq@y zv=EyTIst43CW1-e31Bkdw43@AV2-nyhO`CP5=;kMfvsC&2BC$RNoXNv5n71Zgcf2O zLJP4ip@rZ`n|dMU5L$?E%8fbdrWreMZ6W3oT8JG9eIh7g=Ly4;t3Caiax)7wrb^cIk`{XCAPcPkl2bG5X1Rxx%}OjM+vB>e zj#-#B-CnK%+i^>~`#Q>+@N}`8(Tc%v zm{nBfG=WABP4_feQKB z+qtaaTAazXIFoB}#%r~w&tBV8?R7uefOgJJO`M8qb2;_l&mOxF9SWJO5G`RdiyfS7 zKKnb_%yvFOI&Phh^EPW5oNQLZO_YjVu_3{%;g}|Ya?v(Flfmd-oVpdS4JXIa*UVtJ zA2~H+m})Dw{Zcp6fT^3wz|^hS8&YEeF%7r5Q!zB;F1l^*)0hkgWn+;T#01xdnAKDAQ}7-$>2{&23H{&T#a-MxE5Rou5XDO2yKlUQ7Obt zgto@bs1)KBLR;g{s1)K>LR;f6s1)L_gcjm>o?QsN4?z4(+$zXFZr6r~k+8P|WV9s3_1)RZP4q_Mu9Li7+ zSUoSwR^oJqdTWG}8seD^xl%Z_A)eomE6pIuLd+zzHD;kwh}nd;#x|%FBAotUj)2I; zc0^E!IfS;x_NWwM2SN)mm(bSO5tS!`64(js40Zv#2JD7pusf2$9!LiBkPP-jGT=N0 zb0~uaXE2z97%Vt<7OI z1_vP-9E@b}6eNQ~kXqwVR9Yh(R$xvn7zKwBUu!%KmDVVu(g8X_S4&g~ZH*a7uuLWqYV`cWwaCo7oa6^cYlAu*wX752IyCH^nz*%_Bz0ussnn?@N~u#TdP3AXDxe{1xzniSOrw@F zjatq$YB`grIcXq`8g~*k&LnD_Nz^!#sB!8HjbV!BwcvH&^#N}{dSk$wkRslUGhzhi zB>`_idTYSjklqg75%5l=O9S486!C7H?+JJ>lEM3s4Bn4q@Bt))4DKLd=z{P zd>niNd@|ruNS_Y)3{u2raYlR&=jQ{yfb_+HFCl#yd?nzkNM8&1I#R@CIKL6_O(cVF zAsKud$>2Lk2H!>c9{4`^0r(;K5%@9q3HT}anZS_&*04OIa;&=t56vbvdT2TpkNVHK z_!r=p;BxRQ@N4iJ@LO;N_#LMn%1Go{~1a1bmfIov< z!C$~%!Qa4b;CAqLaA&|>NCy8ziuf1Kh=1dZ_z%vA|Kg0e8)w8lI3w=G8F3%Zi2HFy zJb*LeL7Wi};f#0~XT&2oBR0ZHZ^XtpBR0Vqu_?|7jENhI;uaVYHks+gSg$z}mMl`E7xT zZ)4}%0=wSExVOdDI3s4@jF^ctViwMb**GJx+ii?@TWpIn0$bh2V7J8_oDtjOjMxFE z!CWK^aT~MS#>lp@sBP?L*D;n|-xcfzFo12$UDxL!Vb8jb_3HZGU>|@*YGZ%8zAq96 zrj0r2`eLO004AdA2Y>^CG5(CLXKUISE6(+GYqhyvRh(;Ku=$FK!C@v4N@y+1-m|4%E1V$U4ktiyJA{`UCJ%Nm~J9wvjljva+l`DiAJ^o5hj$44dw}$ z7RF_)FHa~NgUjC7v06?qY63qs;Vfg$M$<>z{C`qei5NS@J|y~tEJQDti)iQAqCZY=ds{0K^QNMH6y+>}s9xrVFHmX}#W&Z6nw#eMxMNj?NwgVt-| zWh_QE#x=HHW>I=$?%VS+tKl0jEb?kxGapvkBQzKYm)&Vq4^yQiwy*`RP-g+$o2vH4 zw->1*1X(c!S)JS;hE@9n0A{AfbgxL%XC|I<0I!nwmz0 z-jHV22&8>vNoz6Hu-aQ;C4Y4ZR<|Zp2g|B;6RIoizy2e1cN40eW%Y3MdLuoUWMokR{|2wq|Cig!^76?~;;eVHN(~986R+u+9;4A_rCh}C96PKb$ zT#6uZDI1APeUR};jjhb_GUaubRE_Z}l@!Kf|3rC z#bX3#amEh+6CqT$P|HHS3)L;u%~yV#9k~N+vsCg#8`4f@CHV*##ZcOB^~pCq^4R-KJn;R3t|52D@v|gZ=G>}mDAdm z17EdN&Z_mT{xuY_eiHZbqUoQlD*m&|xXYEqBz=I9RWFLomdS?fq3*A>eBAW$#&n1` zobLsG3W6!q@mw2Y*2#o~Ok1ZD5;)_81gUKR5c4iR80#)9)l`NPJ9Vx?>HyK-EmHcyW^Y?cgHy)?v8!P znEb|RVQ=^euyP`-iZD|Y8~#0|zfiO6t43N;{T)C4zqa|iVN!oga-AkX)i85t>m?<# z5U2f=WI^AzFZ$V%7q?u=i(4+W3GGL_29c&kHRcICu78U)D6}yzs4+=kOd=ig`ZJzu zdpvN`4H~m{b4<#5Od_S)@e*8LMUhtXq{UA9z~m$eN&3J|AxR&&DJ1CwH-)tR@zczO zCN{}SQ!evVnzEax(vDD*>AW65rDJ1FE zH-)%c_vticE=e{S-0RJ+OeA5W^byjku;C{_?a1%et6Y*L8fpg88Ii|zp{2i2v)oiQ zSZtBpCC*qiaK);DD^?9$vHOTibNHLXo+P1iE6>U)LBAc8f zA+cxN91=Uk%^|TL+#C|Szb2%qNYl)PmtB&Vrd;N!G-WqWr77QeDor<#r_yvML8a-I z8grkx9!=jRtu#rLu6TTPN6uQY52To>C3i9c`Kj`wGziR_ohJAYF>oqt+Y`2d=X0dU5uiYq>x zxZ=P9SM09giUSJG!j~KD9Iep*t9@3S=E{>J578OF<-&?vewgl zc4$f<3D(j$w`g@)T)!H{{OCtbKT$}Jy!7}xCTF8qR6#p}hyHxcs{oNx1Q?xRYA)8E_&j~e}&C?vYcs^Qc(-r`!_Od3^Z zZfWKEwoJ=CBfdZq-L(|cTaVd!bPJ~as^@f-G&#nf70}+_uZG5-#n9e=^o7w)YlgLW zljZ!**NctIhZEmK^lOn5UA=w1uSBm~C?>jyVOjE;AEv2uHoNsY@n>rCe;Mv}R+Bcb zHeK6QmE-Z`m-l#i+Hx5$hYhnF+FLE%Y9;l03)^gPaHz(bicKZ6q_5_rU^6Y+K}!vwS*uwl z%T_d5miSm3(2^N5yPpri?8HDYX09aI-5L_?1`G*ysElCD0!gr2{UnTL#?HK{{aN1O zobF)A%pPZL4z^ylzYJU<4VWXYjRCX1Bb;A<&!UAkw7+NZ0antrvajW?b&|96W-NQL zUrWOOFAHIlxOH2UR-#}esbDBdNv?PawFPY@r)PH}m@91IE7TRfktqCjjZ)bfWwtl) zTSy4jse;9+V5X|c^puo)hcq_kD!b)Wf~mCLa8tjk<^n3y;{ z9lu_Ccaq%Q8Ohw8<}K`IKPE6LKzME>xq7X!YB<8A@V}E;OSH&1Bj~n$d-3bfK9{2}v`$ 
z(2OoL^`j=KM;DrMF;}IzGigSd0&@P+vNEP9ybiM0VJo=TVZO}kAbTCQXL}vy%e)S< z*O6>=7}1*noDux^R6}4tqoDDr7n6AgH_&Vz0p{(iEgt$NV_3I)*T?5aoy6zmbkynh z(>83uSHX;bd9-{=?97wQFjs)|VO|k-yjwJgqNf>P} zI+wYI5Xm;3lH8Lf$u$*{+%sn+(^TjN%qfKI@C{pi8+Z8neXY7PbFN{-&dcnEOZ)Pb za@ctpc4s#3yd)m8^*8P}R+-to#A-Q*YU;$T)n?k_QGYUgmza#%lhHZ8yz}j5UkPT} zLeZ941@33{b#kE-*g)*I-MTHdoB?M8vs-ubHtgy}rKddI6x^_ZJ8bE8Rm*SS)|PIw zx7*AqY^rGj-YS|Mu>a^2vklx;5F_!MF>bX!RO)0e%x~2$XXbV?(*@>3uBh?;fpaOyhx98 zu=%S7AjtbixrO~-jSF0iAufy=g>o8>T@gRF0hA-%tQ48Y} zBa9DURLv-f@!^X(=wf{MqNYn@lJVh-9t({NI(ivPO0eU632R^oXu}@9Y%)QElq)A~ zw9&a2YD}^k)UT4Wel@rTTnnxf&L64YVE^5ue;1#9?uo?UK3D+u1q;CBW0ZYJAunbhe)4_7k4SGN?=mY)Wa8LsSU=R#}6<{Se0;~c@f}_CE z;23Z$I1U^SP5>u@lfcPfH8=%41Dpy@19k9B@GNjTI0Kvs&H~Q{&jDvQ;vAgkg6D$g zf#-wszze_&!HdB8;Kkr2;HBVY;N{>I;FaJ4a3Qz|Tnt_XUJYIYhQVvW>%i;58^9aE zo4}jF2)G2i1-uo!4ZIz^1H2Pl3f=|Y4c-IZ3*HCb4?X}s2tEWp3_b!r3O)ut4n6@s z2|fir4L$=t3qA)v5554t2)+cq488)s3cd!u4lV=V0N(`P0^bJT0pA7R1K$Tf06zpj z0zU>n0Y3#l13w490KWv6gI|GPgWrJPf-Au9z?ID5Gq?r(8Qco~0{#mA25tklgTI42z(2s9;4bh_@GtOh@E`DBa5uOI+zajl_k#z( zgWw_XFn9!Pw3Dzg*aU0}CVT@_r-S97 z8}xu)&VAj-9lN@Xx8P1A~q?n;b%icVOIQB2jT zYUQXrUGsC*PG!cIqmG`{nu*E)xw9)Lrr9zr>4v4GZ*aPkf#3?a;)8dt^j2z1^-J^( z)v78$#n$lKlsZ<84DX(!lqpFRx6e_u1xx6hoKTZH0_Wxhx^gFcmz?m%pP?w0L4ENl zFRhq6nU@nwLrtk1F<8Sr`x)kalSX_vZ|a)#l)qh)|t73tnoSaZcb zIag!~KPFJhQTJ8ACqw~@1EKi zyj7*U(mAN=S{+>FRko6S+93@#N=d}iwep2$SI@0i+uT%om+;)~>fu9`CDlq-slPH< z>+7xz^`zHN(MQX2&0jiO7H(^OODnyoRQiTO?q-Dm;;VM~3#Qm7%q2EMMPC}IC@p

hj!cp*r~Uwd*wcg6Pe#%t_{*0lm{z%!UqTS{2sDsVd?TqdvfDkkf*n zU3y7&l{zXbDmAr=@l~_ypxVA_w|oe9j*@=1uA$E4jWkyumTeS9$b?VOOHeQ8&HDBH z)JBGP>>ujrt`00C7sT6N)5^y*OS^V1tMusT#%`Og+MoG7Wj$3(svUZ-h1cZHwV{p< z^MY3#-733kXEtwp*%t!$uZmgj?v|2DKsVZ*RbJFl9v~b=o3_rDRCA?=smuCiui33f zpI(V@mARRU8j}8=O7CE)cR^)A=Yo*iJ$&1zgt_a!e&ttkyuf!QcGuiX??8WFZ7>N` zAq*`kcbd1nSDa(-Ee559I73HRst=$aiZ;k!!dA@jLy z^N>eBr@gyA$Csk-eE+VAUQTfjwQx(+z{MuACZeD0i49Z<=umsF|g zw{>}(*VMwQ$t+5`l-F>&hStTlQ=W=%Tb)kh+fS?F)bvd@jhB_Dm3ZjiPnL zhQtNh=F>W+Ywe%uK_HV0eV&HY0=|Y9pH6t+ZNSF^~@X9v~;hk z2CAYy`&`o*D86#K3r{X-{f2gT83 zI&~$~GL|*FI>Y?(&AwY}*-}PzoA6$2R-co;v6=&Sus@;ib~Ojj_C34ifLZSK*6hE1 zxu>H_J7|83=Fr)u8S2a4=(dW^bsC$mnQp6XkH$LH`Ml*BJ|rE=X8jqtDYvc&`aGv6 za9XGc);+WN7kA#x^}~9%tO!=6bwzN|Bo#oFVM(pJRBx?@)_hK?(%ILyJbeMA6{ytN zt#<`a{IXQ#i4E5qm9E;VLJhSk**JDe%Bvh;u6Wej1 zQd?1_i8CYpo=UA#uW8K^SYN%EyeI`T-04gLch@R}`I;bTzU<&qL&Ky=wnD{*Vs52q zV|2g&$Z)AJMyc@n7#ZHHP^Du?rzTM&{qNT*;f6QX$nc(pu_o^W^VU#E9z%;I>h`Lx+L;XIVpU7RNJ4YH>P)ATYnq{R-E@2H zwx+amX4w>tV&hFl?zU{2OVb*h-g5sk)o6|Ph>4F4$z;`#KAvb*dOC2?y~nf~114!A zxwksRUJN^K)})iEH>~_JQ^LA9e;6zcJLpwvY1L#H%8ZGIDvJj69f0`&+X+uX7HjC- z5VBny#bmBZ!ZOz}%A)Ku7x{q96{;G2rKjtwga55YhPRfNnS`HjGo=}=k>TkB<&{19 z>S0F(n)uL{nf|^3EwI$o&ye0k3g}#=LA~Y$G@GY_({otXlF(^(POsA4UD1@EYK(Df z%Bx?F1G&$Nroy|-1ZIBG>XVlp#_C;Yhj41P+PP4bgBiQNl}c$_Wv|Z+|4S<>OVzt) z!qin;_j$|CXDEk!sPm55n%2-fe2v$$nd$Talek;+!ke_R+-+ZdZU$N#Mbt(%wS~qn zX$!4RV{>Qxrq~qb*;uwqH5z6tMuwZhY}3~(%(^pu zA?MxnT)3Ln@)@vKwtfbz&a8CaUfbXN6!czcLu)#XdLGlhvYcTUw6~cWEYRYGGg#@- zw<^9&YrVt{4ceLDxuI#D=ewLrInGwki{Xupw4T;I$z)XBE2=g(&KYI1l4l(+T|ImC zIcK-AbTyg^sqexfvt7Y%V0W+wmdK43mr0QLn7!6L92><9J-2Y>^?lfaX~ zLEvEU6mSSQ6g(9i2A&4WpaXP*E>HnWz*4XbRKe51a?lNWKriS6{ors=0|Q_X41pD3 zB{%}C0!MkHC+?Pry&X&%n>YFTgLs<=|J~*Wfqcx8Mr!J8&iVJ@^CoBlr`z3S14Y z0oQ`-!1dq;a3i=0+zf63e+IXLzkt7jzk%Dp?cnd=4)70fC%6mz6Z{MO8~g|S7u*f* z0r!IY!2RF>@E~{yJPaNI;|o@k1*_vrR3Fur^0i%}n(QUtw9+esT{%=QRsN5jY5ZES zr8rZ{e3g7Q6sGsmzPx_!?$Q5J*%;ct$E@c1x$88QY#y>css6f>v#Df#vSr)8&$`Oz x`iv*NyRk@jtmC(N&i;;BpRu^R|DSPY%eH-=bz5R-?P_0Mm$_XBgWq!({2w?-_mThr literal 0 HcmV?d00001 diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 2cfb0f3..faf99ad 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -32,9 +32,7 @@ ) -def _match_uc_wildcards_old( - df: pd.DataFrame, dictionary: dict[str, pd.DataFrame] -) -> pd.DataFrame: +def _match_uc_wildcards_old(df: pd.DataFrame, dictionary: dict[str, pd.DataFrame]) -> pd.DataFrame: """Old version of the process_uc_wildcards matching logic, for comparison with the new vectorised version. TODO remove this function once validated. """ @@ -46,12 +44,8 @@ def make_str(df): else: return None - df["process"] = df.apply( - lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 - ) - df["commodity"] = df.apply( - lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 - ) + df["process"] = df.apply(lambda row: make_str(get_matching_processes(row, dictionary)), axis=1) + df["commodity"] = df.apply(lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1) cols_to_drop = [col for col in df.columns if col in query_columns] @@ -73,49 +67,48 @@ def test_uc_wildcards(self): """ Tests logic that matches wildcards in the process_uc_wildcards transform . 
- Old method took 0:01:35.823996 seconds - New method took 0:00:04.622714 seconds, speedup: 20.7x - + Results on Ireland model: + Old method took 0:00:08.42 seconds + New method took 0:00:00.18 seconds, speedup: 46.5x """ import pickle - dfo = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") - with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "rb") as f: + df_in = pd.read_parquet("tests/data/process_uc_wildcards_ireland_data.parquet") + with open("tests/data/process_uc_wildcards_ireland_dict.pkl", "rb") as f: dictionary = pickle.load(f) + df = df_in.copy() - df = dfo.query("region in ['ACT']") + df_in = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") + with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "rb") as f: + dictionary = pickle.load(f) + df = df_in.query() t0 = datetime.now() - df_new = _match_uc_wildcards( - df, process_map, dictionary, get_matching_processes, "process" - ) - df_new = _match_uc_wildcards( - df_new, commodity_map, dictionary, get_matching_commodities, "commodity" - ) + # optimised functions + df_new = _match_uc_wildcards(df, process_map, dictionary, get_matching_processes, "process") + df_new = _match_uc_wildcards(df_new, commodity_map, dictionary, get_matching_commodities, "commodity") + t1 = datetime.now() + + # Unoptimised function df_old = _match_uc_wildcards_old(df, dictionary) + t2 = datetime.now() print(f"Old method took {t2 - t1} seconds") - print( - f"New method took {t1 - t0} seconds, speedup: {((t2 - t1) / (t1 - t0)):.1f}x" - ) + print(f"New method took {t1 - t0} seconds, speedup: {((t2 - t1) / (t1 - t0)):.1f}x") - # find first row where df_old and df_new are different - for i, (row_old, row_new) in enumerate( - zip(df_old.itertuples(), df_new.itertuples()) - ): - if row_old != row_new: - print(f"First difference at row {i}") - print(f"Old:\n{df_old.iloc[i - 10: i + 10]}") - print(f"New:\n{df_new.iloc[i - 10: i + 10]}") - break + # unit tests + assert df_new is not None and not df_new.empty + assert df_new.shape[0] >= df_in.shape[0], "should have more rows after processing uc_wildcards" + assert df_new.shape[1] < df_in.shape[1], "should have fewer columns after processing uc_wildcards" + assert "process" in df_new.columns, "should have added process column" + assert "commodity" in df_new.columns, "should have added commodity column" + # consistency checks with old method assert len(set(df_new.columns).symmetric_difference(set(df_old.columns))) == 0 - assert df_new.fillna(-1).equals( - df_old.fillna(-1) - ), "Dataframes should be equal (ignoring Nones and NaNs)" + assert df_new.fillna(-1).equals(df_old.fillna(-1)), "Dataframes should be equal (ignoring Nones and NaNs)" def test_generate_commodity_groups(self): """ @@ -126,9 +119,7 @@ def test_generate_commodity_groups(self): 43958x speedup """ # data extracted immediately before the original for loops - comm_groups = pd.read_parquet( - "tests/data/comm_groups_austimes_test_data.parquet" - ).drop(columns=["commoditygroup"]) + comm_groups = pd.read_parquet("tests/data/comm_groups_austimes_test_data.parquet").drop(columns=["commoditygroup"]) # filter data so test runs faster comm_groups = comm_groups.query("region in ['ACT', 'NSW']") @@ -149,9 +140,7 @@ def test_default_pcg_vectorised(self): comm_groups = pd.read_parquet("tests/data/austimes_pcg_test_data.parquet") comm_groups = comm_groups[(comm_groups["region"].isin(["ACT", "NT"]))] - comm_groups2 = _process_comm_groups_vectorised( - comm_groups.copy(), 
transforms.csets_ordered_for_pcg - ) + comm_groups2 = _process_comm_groups_vectorised(comm_groups.copy(), transforms.csets_ordered_for_pcg) assert comm_groups2 is not None and not comm_groups2.empty assert comm_groups2.shape == (comm_groups.shape[0], comm_groups.shape[1] + 1) assert comm_groups2.drop(columns=["DefaultVedaPCG"]).equals(comm_groups) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index a45795d..c35d312 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1,5 +1,6 @@ import collections import functools +import pickle from collections import defaultdict from pandas.core.frame import DataFrame from pathlib import Path @@ -36,9 +37,7 @@ } csets_ordered_for_pcg = ["DEM", "MAT", "NRG", "ENV", "FIN"] -default_pcg_suffixes = [ - cset + io for cset in csets_ordered_for_pcg for io in ["I", "O"] -] +default_pcg_suffixes = [cset + io for cset in csets_ordered_for_pcg for io in ["I", "O"]] attr_prop = { "COM_LIM": "limtype", @@ -97,8 +96,7 @@ def remove_comment_rows( list( locate( df[colname], - lambda cell: isinstance(cell, str) - and (cell.startswith(tuple(chars_by_colname[colname]))), + lambda cell: isinstance(cell, str) and (cell.startswith(tuple(chars_by_colname[colname]))), ) ) ) @@ -121,11 +119,7 @@ def remove_comment_cols(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedX if table.dataframe.size == 0: return table - comment_cols = [ - colname - for colname in table.dataframe.columns - if isinstance(colname, str) and colname.startswith("*") - ] + comment_cols = [colname for colname in table.dataframe.columns if isinstance(colname, str) and colname.startswith("*")] df = table.dataframe.drop(comment_cols, axis=1) df.reset_index(drop=True, inplace=True) @@ -187,10 +181,7 @@ def discard(table): seen = set() dupes = [x for x in table.dataframe.columns if x in seen or seen.add(x)] if len(dupes) > 0: - print( - f"WARNING: Duplicate columns in {table.range}, {table.sheetname}," - f" {table.filename}: {','.join(dupes)}" - ) + print(f"WARNING: Duplicate columns in {table.range}, {table.sheetname}," f" {table.filename}: {','.join(dupes)}") result.append(table) return result @@ -235,15 +226,11 @@ def normalize_column_aliases( for table in tables: tag = table.tag.split(":")[0] if tag in config.column_aliases: - table.dataframe = table.dataframe.rename( - columns=config.column_aliases[tag], errors="ignore" - ) + table.dataframe = table.dataframe.rename(columns=config.column_aliases[tag], errors="ignore") else: print(f"WARNING: could not find {table.tag} in config.column_aliases") if len(set(table.dataframe.columns)) > len(table.dataframe.columns): - raise ValueError( - f"Table has duplicate column names (after normalization): {table}" - ) + raise ValueError(f"Table has duplicate column names (after normalization): {table}") return tables @@ -333,9 +320,7 @@ def process_flexible_import_tables( "limtype": {"LO", "UP", "FX"}, # TODO: check what the values for the below should be "timeslice": set(model.ts_tslvl["tslvl"]), - "commodity-out": set( - utils.merge_columns(tables, datatypes.Tag.fi_comm, "commodity") - ), + "commodity-out": set(utils.merge_columns(tables, datatypes.Tag.fi_comm, "commodity")), "region": model.internal_regions, "currency": utils.single_column(tables, datatypes.Tag.currencies, "currency"), "other_indexes": {"INPUT", "OUTPUT", "DEMO", "DEMI"}, @@ -354,9 +339,7 @@ def get_colname(value): # TODO decide whether VedaProcessSets should become a new Enum type or part of TimesModelData type veda_process_sets = utils.single_table(tables, 
"VedaProcessSets").dataframe - def process_flexible_import_table( - table: datatypes.EmbeddedXlTable, veda_process_sets: DataFrame - ) -> datatypes.EmbeddedXlTable: + def process_flexible_import_table(table: datatypes.EmbeddedXlTable, veda_process_sets: DataFrame) -> datatypes.EmbeddedXlTable: # Make sure it's a flexible import table, and return the table untouched if not if not table.tag.startswith(datatypes.Tag.fi_t) and table.tag not in { datatypes.Tag.tfm_upd, @@ -401,21 +384,6 @@ def process_flexible_import_table( attribute = "attribute" if table.tag != datatypes.Tag.tfm_upd: - - # Check for duplicate DF columns - duplicated_cols = [ - item - for item, count in collections.Counter(data_columns).items() - if count > 1 - ] - if len(duplicated_cols) > 0: - logger.warning( - f"Duplicate data columns in table: {duplicated_cols}. Dropping first duplicated column. Table: \n{repr(table)}" - ) - # drop duplicate Df columns - df = df.loc[:, ~df.columns.duplicated(keep="last")] - data_columns = pd.unique(data_columns).tolist() - df, attribute_suffix = utils.explode(df, data_columns) # Append the data column name to the Attribute column values @@ -456,19 +424,11 @@ def process_flexible_import_table( cost_mapping = {"MIN": "IMP", "EXP": "EXP", "IMP": "IMP"} i = (df[attribute] == "COST") & df["process"] for process in df[i]["process"].unique(): - veda_process_set = ( - veda_process_sets["sets"] - .loc[veda_process_sets["process"] == process] - .unique() - ) + veda_process_set = veda_process_sets["sets"].loc[veda_process_sets["process"] == process].unique() if veda_process_set.shape[0]: - df.loc[i & (df["process"] == process), other] = cost_mapping[ - veda_process_set[0] - ] + df.loc[i & (df["process"] == process), other] = cost_mapping[veda_process_set[0]] else: - print( - f"WARNING: COST won't be processed as IRE_PRICE for {process}, because it is not in IMP/EXP/MIN" - ) + print(f"WARNING: COST won't be processed as IRE_PRICE for {process}, because it is not in IMP/EXP/MIN") # Use CommName to store the active commodity for EXP / IMP i = df[attribute].isin({"COST", "IRE_PRICE"}) @@ -478,22 +438,16 @@ def process_flexible_import_table( df.loc[i_imp, "commodity"] = df.loc[i_imp, "commodity-out"] # Should have all index_columns and VALUE - if table.tag == datatypes.Tag.fi_t and len(df.columns) != ( - len(index_columns) + 1 - ): + if table.tag == datatypes.Tag.fi_t and len(df.columns) != (len(index_columns) + 1): raise ValueError(f"len(df.columns) = {len(df.columns)}") df["year2"] = df.apply( - lambda row: int(row["year"].split("-")[1]) - if "-" in str(row["year"]) - else "EOH", + lambda row: int(row["year"].split("-")[1]) if "-" in str(row["year"]) else "EOH", axis=1, ) df["year"] = df.apply( - lambda row: int(row["year"].split("-")[0]) - if "-" in str(row["year"]) - else (row["year"] if row["year"] != "" else "BOH"), + lambda row: int(row["year"].split("-")[0]) if "-" in str(row["year"]) else (row["year"] if row["year"] != "" else "BOH"), axis=1, ) @@ -564,9 +518,7 @@ def process_user_constraint_table( if "uc_n" in df.columns: df["uc_n"] = df["uc_n"].ffill() - data_columns = [ - x for x in df.columns if x not in config.known_columns[datatypes.Tag.uc_t] - ] + data_columns = [x for x in df.columns if x not in config.known_columns[datatypes.Tag.uc_t]] # Populate columns nrows = df.shape[0] @@ -637,9 +589,7 @@ def generate_uc_properties( df = uc_table.dataframe.loc[:, ["uc_n"]].drop_duplicates(keep="first") # Supplement UC names with descriptions, if they exist df = df.merge( - uc_table.dataframe.loc[:, 
["uc_n", "uc_desc"]] - .drop_duplicates(keep="first") - .dropna(), + uc_table.dataframe.loc[:, ["uc_n", "uc_desc"]].drop_duplicates(keep="first").dropna(), how="left", ) # Add info on how regions and timeslices should be treated by the UCs @@ -671,19 +621,13 @@ def generate_uc_properties( index = user_constraints["region"].str.contains(",").fillna(value=False) if any(index): user_constraints["region"][index] = user_constraints.apply( - lambda row: [ - region - for region in str(row["region"]).split(",") - if region in model.internal_regions - ], + lambda row: [region for region in str(row["region"]).split(",") if region in model.internal_regions], axis=1, ) # Explode regions user_constraints = user_constraints.explode("region", ignore_index=True) - model.user_constraints = user_constraints.rename( - columns={"uc_n": "name", "uc_desc": "description"} - ) + model.user_constraints = user_constraints.rename(columns={"uc_n": "name", "uc_desc": "description"}) return tables @@ -712,9 +656,7 @@ def fill_in_missing_values( if row["region"] in model.internal_regions: vt_regions[row["bookname"]].append(row["region"]) - ele_default_tslvl = ( - "DAYNITE" if "DAYNITE" in model.ts_tslvl["tslvl"].unique() else "ANNUAL" - ) + ele_default_tslvl = "DAYNITE" if "DAYNITE" in model.ts_tslvl["tslvl"].unique() else "ANNUAL" def fill_in_missing_values_table(table): df = table.dataframe.copy() @@ -727,33 +669,21 @@ def fill_in_missing_values_table(table): ismat = df["csets"] == "MAT" df.loc[isna & ismat, colname] = "FX" df.loc[isna & ~ismat, colname] = "LO" - elif ( - colname == "limtype" - and (table.tag == datatypes.Tag.fi_t or table.tag.startswith("~TFM")) - and len(df) > 0 - ): + elif colname == "limtype" and (table.tag == datatypes.Tag.fi_t or table.tag.startswith("~TFM")) and len(df) > 0: isna = df[colname].isna() for lim in config.veda_attr_defaults["limtype"].keys(): df.loc[ - isna - & df["attribute"] - .str.upper() - .isin(config.veda_attr_defaults["limtype"][lim]), + isna & df["attribute"].str.upper().isin(config.veda_attr_defaults["limtype"][lim]), colname, ] = lim elif colname == "timeslice" and len(df) > 0 and "attribute" in df.columns: isna = df[colname].isna() for timeslice in config.veda_attr_defaults["tslvl"].keys(): df.loc[ - isna - & df["attribute"] - .str.upper() - .isin(config.veda_attr_defaults["tslvl"][timeslice]), + isna & df["attribute"].str.upper().isin(config.veda_attr_defaults["tslvl"][timeslice]), colname, ] = timeslice - elif ( - colname == "tslvl" and table.tag == datatypes.Tag.fi_process - ): # or colname == "CTSLvl" or colname == "PeakTS": + elif colname == "tslvl" and table.tag == datatypes.Tag.fi_process: # or colname == "CTSLvl" or colname == "PeakTS": isna = df[colname].isna() isele = df["sets"] == "ELE" df.loc[isna & isele, colname] = ele_default_tslvl @@ -805,11 +735,7 @@ def split_by_commas(s): df = table.dataframe.copy() c = df.map(has_comma) - columns_with_commas = [ - colname - for colname in c.columns - if colname not in query_columns and c[colname].any() - ] + columns_with_commas = [colname for colname in c.columns if colname not in query_columns and c[colname].any()] if len(columns_with_commas) > 0: # Transform comma-separated strings into lists df[columns_with_commas] = df[columns_with_commas].map(split_by_commas) @@ -844,11 +770,7 @@ def remove_invalid_values( result = [] for table in tables: df = table.dataframe.copy() - is_valid_list = [ - df[colname].isin(values) - for colname, values in constraints.items() - if colname in df.columns - ] + is_valid_list = 
[df[colname].isin(values) for colname, values in constraints.items() if colname in df.columns] if is_valid_list: is_valid = reduce(lambda a, b: a & b, is_valid_list) df = df[is_valid] @@ -869,9 +791,7 @@ def process_units( "currency": tables[datatypes.Tag.currencies]["currency"].unique(), } - model.units = pd.concat( - [pd.DataFrame({"unit": v, "type": k}) for k, v in units_map.items()] - ) + model.units = pd.concat([pd.DataFrame({"unit": v, "type": k}) for k, v in units_map.items()]) return tables @@ -891,9 +811,7 @@ def process_time_periods( df = pd.DataFrame({"d": active_series}) # Start years = start year, then cumulative sum of period durations - df["b"] = (active_series.cumsum() + model.start_year).shift( - 1, fill_value=model.start_year - ) + df["b"] = (active_series.cumsum() + model.start_year).shift(1, fill_value=model.start_year) df["e"] = df.b + df.d - 1 df["m"] = df.b + ((df.d - 1) // 2) df["year"] = df.m @@ -914,9 +832,7 @@ def process_regions( """ model.all_regions.update((["IMPEXP", "MINRNW"])) - model.internal_regions.update( - utils.single_column(tables, datatypes.Tag.book_regions_map, "region") - ) + model.internal_regions.update(utils.single_column(tables, datatypes.Tag.book_regions_map, "region")) model.all_regions.update(model.internal_regions) # Apply regions filter @@ -1017,12 +933,7 @@ def apply_fixups_table(table: datatypes.EmbeddedXlTable): df["year"] = pd.to_numeric(df["year"], errors="coerce") # Populate CommName based on defaults - i = ( - df["attribute"] - .str.upper() - .isin(config.veda_attr_defaults["commodity"].keys()) - & df["commodity"].isna() - ) + i = df["attribute"].str.upper().isin(config.veda_attr_defaults["commodity"].keys()) & df["commodity"].isna() if len(df[i]) > 0: for attr in df[i]["attribute"].unique(): for com_in_out in config.veda_attr_defaults["commodity"][attr.upper()]: @@ -1052,8 +963,7 @@ def apply_fixups_table(table: datatypes.EmbeddedXlTable): df.loc[i, "commodity"] = df[i].apply( lambda row: ",".join( reg_com_flows.loc[ - (reg_com_flows["region"] == row["region"]) - & (reg_com_flows["process"] == row["process"]), + (reg_com_flows["region"] == row["region"]) & (reg_com_flows["process"] == row["process"]), "commodity", ].unique() ), @@ -1084,9 +994,7 @@ def generate_commodity_groups( reg_prc_pcg.drop_duplicates(keep="first", inplace=True) # DataFrame with Veda PCGs specified in the process declaration tables - reg_prc_veda_pcg = reg_prc_pcg.loc[ - reg_prc_pcg["primarycg"].isin(default_pcg_suffixes) - ] + reg_prc_veda_pcg = reg_prc_pcg.loc[reg_prc_pcg["primarycg"].isin(default_pcg_suffixes)] # Extract commodities and their sets by region columns = ["region", "csets", "commodity"] @@ -1163,16 +1071,12 @@ def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None: """ comm_groups["commoditygroup"] = 0 - comm_groups["commoditygroup"] = ( - comm_groups.groupby(["region", "process", "csets", "io"]).transform("count") - )["commoditygroup"] + comm_groups["commoditygroup"] = (comm_groups.groupby(["region", "process", "csets", "io"]).transform("count"))["commoditygroup"] # set comoditygroup to 0 for io rows that aren't IN or OUT comm_groups.loc[~comm_groups["io"].isin(["IN", "OUT"]), "commoditygroup"] = 0 -def _process_comm_groups_vectorised( - comm_groups: pd.DataFrame, csets_ordered_for_pcg: list[str] -) -> pd.DataFrame: +def _process_comm_groups_vectorised(comm_groups: pd.DataFrame, csets_ordered_for_pcg: list[str]) -> pd.DataFrame: """Sets the first commodity group in the list of csets_ordered_for_pcg as the default pcg for each 
region/process/io combination, but setting the io="OUT" subset as default before "IN". @@ -1193,20 +1097,14 @@ def _set_default_veda_pcg(group): for io in ["OUT", "IN"]: for cset in csets_ordered_for_pcg: - group.loc[ - (group["io"] == io) & (group["csets"] == cset), "DefaultVedaPCG" - ] = True + group.loc[(group["io"] == io) & (group["csets"] == cset), "DefaultVedaPCG"] = True if group["DefaultVedaPCG"].any(): break return group comm_groups["DefaultVedaPCG"] = None - comm_groups_subset = comm_groups.groupby( - ["region", "process"], sort=False, as_index=False - ).apply(_set_default_veda_pcg) - comm_groups_subset = comm_groups_subset.reset_index( - level=0, drop=True - ).sort_index() # back to the original index and row order + comm_groups_subset = comm_groups.groupby(["region", "process"], sort=False, as_index=False).apply(_set_default_veda_pcg) + comm_groups_subset = comm_groups_subset.reset_index(level=0, drop=True).sort_index() # back to the original index and row order return comm_groups_subset @@ -1219,14 +1117,10 @@ def complete_commodity_groups( Complete the list of commodity groups """ - commodities = generate_topology_dictionary(tables, model)[ - "commodities_by_name" - ].rename(columns={"commodity": "commoditygroup"}) + commodities = generate_topology_dictionary(tables, model)["commodities_by_name"].rename(columns={"commodity": "commoditygroup"}) cgs_in_top = model.topology["commoditygroup"].to_frame() commodity_groups = pd.concat([commodities, cgs_in_top]) - model.commodity_groups = commodity_groups.drop_duplicates( - keep="first" - ).reset_index() + model.commodity_groups = commodity_groups.drop_duplicates(keep="first").reset_index() return tables @@ -1248,9 +1142,7 @@ def generate_trade( for table in tables: if table.tag == datatypes.Tag.fi_process: df = table.dataframe - ire_prc = pd.concat( - [ire_prc, df.loc[df["sets"] == "IRE", ["region", "process"]]] - ) + ire_prc = pd.concat([ire_prc, df.loc[df["sets"] == "IRE", ["region", "process"]]]) ire_prc.drop_duplicates(keep="first", inplace=True) internal_regions = pd.DataFrame(model.internal_regions, columns=["region"]) @@ -1261,9 +1153,7 @@ def generate_trade( top_ire = pd.merge(top_ire, model.topology[["region", "csets", "commodity"]]) top_ire.drop(columns=["csets"], inplace=True) top_ire["io"] = "OUT" - top_ire = pd.concat( - [top_ire, model.topology[["region", "process", "commodity", "io"]]] - ) + top_ire = pd.concat([top_ire, model.topology[["region", "process", "commodity", "io"]]]) top_ire = pd.merge(top_ire, ire_prc) top_ire = pd.merge(top_ire, veda_process_sets) top_ire["region2"] = top_ire["sets"].replace(veda_set_ext_reg_mapping) @@ -1315,9 +1205,7 @@ def generate_trade( top_ire = pd.concat([top_ire, b_links[cols_list]]) filter_regions = model.internal_regions.union({"IMPEXP", "MINRNW"}) - i = top_ire["origin"].isin(filter_regions) & top_ire["destination"].isin( - filter_regions - ) + i = top_ire["origin"].isin(filter_regions) & top_ire["destination"].isin(filter_regions) model.trade = top_ire[i].reset_index() @@ -1394,9 +1282,7 @@ def remove_fill_tables( # TODO: For the moment, assume that these tables are up-to-date. We will need a tool to do this. 
result = [] for table in tables: - if table.tag != datatypes.Tag.tfm_fill and not table.tag.startswith( - datatypes.Tag.tfm_fill_r - ): + if table.tag != datatypes.Tag.tfm_fill and not table.tag.startswith(datatypes.Tag.tfm_fill_r): result.append(table) return result @@ -1413,9 +1299,7 @@ def process_commodity_emissions( else: df = table.dataframe.copy() index_columns = ["region", "year", "commodity"] - data_columns = [ - colname for colname in df.columns if colname not in index_columns - ] + data_columns = [colname for colname in df.columns if colname not in index_columns] df, names = utils.explode(df, data_columns) df.rename(columns={"value": "emcb"}, inplace=True) df["other_indexes"] = names @@ -1423,9 +1307,7 @@ def process_commodity_emissions( if "region" in df.columns: df = df.astype({"region": "string"}) - df["region"] = df["region"].map( - lambda s: s.split(",") if isinstance(s, str) else s - ) + df["region"] = df["region"].map(lambda s: s.split(",") if isinstance(s, str) else s) df = df.explode("region", ignore_index=True) df = df[df["region"].isin(model.internal_regions)] @@ -1471,11 +1353,7 @@ def process_years( # We ignore values < 1000 because those signify interpolation/extrapolation rules # (see Table 8 of Part IV of the Times Documentation) - datayears = ( - tables[datatypes.Tag.fi_t]["year"] - .apply(lambda x: x if (x is not str) and x >= 1000 else None) - .dropna() - ) + datayears = tables[datatypes.Tag.fi_t]["year"].apply(lambda x: x if (x is not str) and x >= 1000 else None).dropna() model.data_years = datayears.drop_duplicates().sort_values() # Pastyears is the set of all years before ~StartYear @@ -1509,9 +1387,7 @@ def process_processes( result.append(table) else: df = table.dataframe.copy() - processes_and_sets = pd.concat( - [processes_and_sets, df[["sets", "process"]].ffill()] - ) + processes_and_sets = pd.concat([processes_and_sets, df[["sets", "process"]].ffill()]) df.replace({"sets": veda_sets_to_times}, inplace=True) nrows = df.shape[0] # TODO: Use info from config instead. Introduce required columns in the meta file? 
@@ -1532,9 +1408,7 @@ def process_processes( sheetname="", range="", filename="", - dataframe=processes_and_sets.loc[ - processes_and_sets["sets"].isin(veda_sets_to_times.keys()) - ], + dataframe=processes_and_sets.loc[processes_and_sets["sets"].isin(veda_sets_to_times.keys())], ) result.append(veda_process_sets) @@ -1677,9 +1551,7 @@ def process_tradelinks( comm = df.columns[0] destinations = [c for c in df.columns if c != comm] df.rename(columns={comm: "origin"}, inplace=True) - df = pd.melt( - df, id_vars=["origin"], value_vars=destinations, var_name="destination" - ) + df = pd.melt(df, id_vars=["origin"], value_vars=destinations, var_name="destination") df = df[df["value"] == 1].drop(columns=["value"]) df["destination"] = df["destination"].str.upper() df.drop_duplicates(keep="first", inplace=True) @@ -1691,9 +1563,7 @@ def process_tradelinks( else: df["tradelink"] = 1 # Determine whether a trade link is bi- or unidirectional - td_type = ( - df.groupby(["regions"])["tradelink"].agg("count").reset_index() - ) + td_type = df.groupby(["regions"])["tradelink"].agg("count").reset_index() td_type.replace({"tradelink": {1: "u", 2: "b"}}, inplace=True) df.drop(columns=["tradelink"], inplace=True) df = df.merge(td_type, how="inner", on="regions") @@ -1707,9 +1577,7 @@ def process_tradelinks( ) # Drop tradelink (bidirectional) duplicates - df.drop_duplicates( - subset=["regions", "tradelink"], keep="last", inplace=True - ) + df.drop_duplicates(subset=["regions", "tradelink"], keep="last", inplace=True) df.drop(columns=["regions"], inplace=True) df["comm"] = comm.upper() df["comm1"] = df["comm"] @@ -1729,9 +1597,7 @@ def process_tradelinks( ), axis=1, ) - result.append( - replace(table, dataframe=df, tag=datatypes.Tag.tradelinks_dins) - ) + result.append(replace(table, dataframe=df, tag=datatypes.Tag.tradelinks_dins)) else: result.append(table) @@ -1746,15 +1612,7 @@ def process_transform_insert_variants( """Reduces variants of TFM_INS like TFM_INS-TS to TFM_INS.""" def has_no_wildcards(list): - return all( - list.apply( - lambda x: x is not None - and x[0] != "-" - and "*" not in x - and "," not in x - and "?" not in x - ) - ) + return all(list.apply(lambda x: x is not None and x[0] != "-" and "*" not in x and "," not in x and "?" not in x)) def is_year(col_name): """A column name is a year if it is an int >= 0""" @@ -1768,22 +1626,16 @@ def is_year(col_name): if "year" in df.columns: raise ValueError(f"TFM_INS-TS table already has Year column: {table}") # TODO: can we remove this hacky shortcut? Or should it be also applied to the AT variant? 
- if set(df.columns) & query_columns == {"cset_cn"} and has_no_wildcards( - df["cset_cn"] - ): + if set(df.columns) & query_columns == {"cset_cn"} and has_no_wildcards(df["cset_cn"]): df.rename(columns={"cset_cn": "commodity"}, inplace=True) result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t)) continue - elif set(df.columns) & query_columns == {"pset_pn"} and has_no_wildcards( - df["pset_pn"] - ): + elif set(df.columns) & query_columns == {"pset_pn"} and has_no_wildcards(df["pset_pn"]): df.rename(columns={"pset_pn": "process"}, inplace=True) result.append(replace(table, dataframe=df, tag=datatypes.Tag.fi_t)) continue - other_columns = [ - col_name for col_name in df.columns if not is_year(col_name) - ] + other_columns = [col_name for col_name in df.columns if not is_year(col_name)] df = pd.melt( df, id_vars=other_columns, @@ -1798,14 +1650,8 @@ def is_year(col_name): # ~TFM_INS-AT: Gather columns with attribute names into a single "Attribute" column df = table.dataframe if "attribute" in df.columns: - raise ValueError( - f"TFM_INS-AT table already has Attribute column: {table}" - ) - other_columns = [ - col_name - for col_name in df.columns - if col_name not in (config.all_attributes | config.attr_aliases) - ] + raise ValueError(f"TFM_INS-AT table already has Attribute column: {table}") + other_columns = [col_name for col_name in df.columns if col_name not in (config.all_attributes | config.attr_aliases)] df = pd.melt( df, id_vars=other_columns, @@ -1855,28 +1701,17 @@ def process_transform_tables( known_columns = config.known_columns[table.tag] | query_columns # Handle Regions: - if set(df.columns).isdisjoint( - {x.lower() for x in regions} | {"allregions"} - ): + if set(df.columns).isdisjoint({x.lower() for x in regions} | {"allregions"}): if "region" not in df.columns: # If there's no region information at all, this table is for all regions: df["region"] = ["allregions"] * len(df) # Else, we only have a "region" column so handle it below else: if "region" in df.columns: - raise ValueError( - "ERROR: table has a column called region as well as columns with" - f" region names:\n{table}\n{df.columns}" - ) + raise ValueError("ERROR: table has a column called region as well as columns with" f" region names:\n{table}\n{df.columns}") # We have columns whose names are regions, so gather them into a "region" column: - region_cols = [ - col_name - for col_name in df.columns - if col_name in set([x.lower() for x in regions]) | {"allregions"} - ] - other_columns = [ - col_name for col_name in df.columns if col_name not in region_cols - ] + region_cols = [col_name for col_name in df.columns if col_name in set([x.lower() for x in regions]) | {"allregions"}] + other_columns = [col_name for col_name in df.columns if col_name not in region_cols] df = pd.melt( df, id_vars=other_columns, @@ -1887,18 +1722,12 @@ def process_transform_tables( df = df.sort_index().reset_index(drop=True) # retain original row order # This expands "allregions" into one row for each region: - df["region"] = df["region"].map( - lambda x: regions if x == "allregions" else x - ) + df["region"] = df["region"].map(lambda x: regions if x == "allregions" else x) df = df.explode(["region"]) df["region"] = df["region"].str.upper() # Remove unknown columns and add missing known columns: - unknown_columns = [ - col_name - for col_name in df.columns - if col_name not in known_columns | {"region", "value"} - ] + unknown_columns = [col_name for col_name in df.columns if col_name not in known_columns | {"region", "value"}] 
df.drop(columns=unknown_columns, inplace=True) for standard_col in known_columns: if standard_col not in df.columns: @@ -1910,17 +1739,9 @@ def process_transform_tables( if len(dropped) > 0: # TODO handle - by_tag = [ - (key, list(group)) - for key, group in groupby( - sorted(dropped, key=lambda t: t.tag), lambda t: t.tag - ) - ] + by_tag = [(key, list(group)) for key, group in groupby(sorted(dropped, key=lambda t: t.tag), lambda t: t.tag)] for key, group in by_tag: - print( - f"WARNING: Dropped {len(group)} transform tables ({key})" - f" rather than processing them" - ) + print(f"WARNING: Dropped {len(group)} transform tables ({key})" f" rather than processing them") return result @@ -1940,17 +1761,9 @@ def process_transform_availability( if len(dropped) > 0: # TODO handle - by_tag = [ - (key, list(group)) - for key, group in groupby( - sorted(dropped, key=lambda t: t.tag), lambda t: t.tag - ) - ] + by_tag = [(key, list(group)) for key, group in groupby(sorted(dropped, key=lambda t: t.tag), lambda t: t.tag)] for key, group in by_tag: - print( - f"WARNING: Dropped {len(group)} transform availability tables ({key})" - f" rather than processing them" - ) + print(f"WARNING: Dropped {len(group)} transform availability tables ({key})" f" rather than processing them") return result @@ -2014,9 +1827,7 @@ def df_indexed_by_col(df, col): return df -def generate_topology_dictionary( - tables: Dict[str, DataFrame], model: datatypes.TimesModel -) -> Dict[str, DataFrame]: +def generate_topology_dictionary(tables: Dict[str, DataFrame], model: datatypes.TimesModel) -> Dict[str, DataFrame]: # We need to be able to fetch processes based on any combination of name, description, set, comm-in, or comm-out # So we construct tables whose indices are names, etc. and use pd.filter @@ -2072,26 +1883,21 @@ def process_uc_wildcards( if tag in tqdm(tables, desc=f"Processing uc_wildcards on tables"): start_time = time.time() df = tables[tag] + dictionary = generate_topology_dictionary(tables, model) - df = _match_uc_wildcards( - df, process_map, dictionary, get_matching_processes, "process" - ) - df = _match_uc_wildcards( - df, commodity_map, dictionary, get_matching_commodities, "commodity" - ) + df = _match_uc_wildcards(df, process_map, dictionary, get_matching_processes, "process") + df = _match_uc_wildcards(df, commodity_map, dictionary, get_matching_commodities, "commodity") tables[tag] = df - print( - f" process_uc_wildcards: {tag} took {time.time() - start_time:.2f} seconds for {len(df)} rows" - ) + print(f" process_uc_wildcards: {tag} took {time.time() - start_time:.2f} seconds for {len(df)} rows") return tables def _match_uc_wildcards( - table: pd.DataFrame, + df: pd.DataFrame, process_map: dict[str, str], dictionary: dict[str, pd.DataFrame], matcher: Callable, @@ -2101,7 +1907,7 @@ def _match_uc_wildcards( Match wildcards in the given table using the given process map and dictionary. Args: - table: Table to match wildcards in. + df: Table to match wildcards in. process_map: Mapping of column names to process sets. dictionary: Dictionary of process sets to match against. matcher: Matching function to use, e.g. get_matching_processes or get_matching_commodities. 
@@ -2110,31 +1916,29 @@ def _match_uc_wildcards( Returns: The table with the wildcard columns removed and the results of the wildcard matches added as a column named `results_col` """ - proc_cols = list(process_map.keys()) - # most of the speedup happens here - we drop duplicate sets of wildcard columns to save repeated (slow) regex matching - unique_filters = table[proc_cols].drop_duplicates() + # drop duplicate sets of wildcard columns to save repeated (slow) regex matching. This makes things much faster. + unique_filters = df[proc_cols].drop_duplicates().dropna(axis="rows", how="all") - matches = unique_filters.apply( - lambda row: matcher(row, dictionary), axis=1 - ).to_list() - matches = [ - df.iloc[:, 0].to_list() if df is not None and len(df) != 0 else None - for df in matches - ] - filter_matches = unique_filters.reset_index(drop=True).merge( - pd.DataFrame(matches, columns=[result_col]), left_index=True, right_index=True - ) + # match all the wildcards columns against the dictionary names + matches = unique_filters.apply(lambda row: matcher(row, dictionary), axis=1).to_list() + matches = [df.iloc[:, 0].to_list() if df is not None and len(df) != 0 else None for df in matches] + matches = pd.DataFrame({result_col: matches}) + + # then join with the wildcard cols to their list of matched names so we can join them back into the table df. + filter_matches = unique_filters.reset_index(drop=True).merge(matches, left_index=True, right_index=True) - # Then we merge the matches back into the original table, re-duplicating the results where the wildcard sets are repeated. - table = ( - table.merge(filter_matches, left_on=proc_cols, right_on=proc_cols, how="left") - .explode(result_col) - .reset_index(drop=True) - .drop(columns=proc_cols) + # Finally we merge the matches back into the original table. This join re-duplicates the duplicate filters dropped above for speed. + # And we explode any matches to multiple names to give a long-format table. + df = ( + df.merge(filter_matches, left_on=proc_cols, right_on=proc_cols, how="left").explode(result_col).reset_index(drop=True).drop(columns=proc_cols) ) - return table + + # replace NaNs in results_col with None for consistency with older logic + df[result_col] = df[result_col].where(df[result_col].notna(), None) + + return df def process_wildcards( @@ -2175,9 +1979,7 @@ def query( qs.append(f"region == '{region}'") return table.query(" and ".join(qs)).index - def eval_and_update( - table: DataFrame, rows_to_update: pd.Index, new_value: str - ) -> None: + def eval_and_update(table: DataFrame, rows_to_update: pd.Index, new_value: str) -> None: """Performs an inplace update of rows `rows_to_update` of `table` with `new_value`, which can be a update formula like `*2.3`.""" if isinstance(new_value, str) and new_value[0] in {"*", "+", "-", "/"}: @@ -2208,9 +2010,7 @@ def eval_and_update( if match is None: continue processes, commodities = match - rows_to_update = query( - table, processes, commodities, row["attribute"], row["region"] - ) + rows_to_update = query(table, processes, commodities, row["attribute"], row["region"]) new_rows = table.loc[rows_to_update].copy() eval_and_update(new_rows, rows_to_update, row["value"]) new_tables.append(new_rows) @@ -2285,9 +2085,7 @@ def eval_and_update( match = match_wildcards(row) processes, commodities = match if match is not None else (None, None) # TODO should we also query on limtype? 
- rows_to_update = query( - table, processes, commodities, row["attribute"], row["region"] - ) + rows_to_update = query(table, processes, commodities, row["attribute"], row["region"]) new_rows = table.loc[rows_to_update].copy() # Modify values in all '*2' columns for c, v in row.items(): @@ -2318,10 +2116,7 @@ def timeslices_table( user_ts_levels = ["SEASON", "WEEKLY", "DAYNITE"] # Ensure that all timeslice levels are uppercase - timeslices = { - col.upper(): list(values.unique()) - for col, values in table.dataframe.items() - } + timeslices = {col.upper(): list(values.unique()) for col, values in table.dataframe.items()} # Ensure that timeslices keys contain all user-specified levels for ts_level in user_ts_levels: @@ -2329,19 +2124,14 @@ def timeslices_table( timeslices[ts_level] = list() # Remove ANNUAL if it is the only entry in SEASON - if ( - len(timeslices["SEASON"]) == 1 - and timeslices["SEASON"][0].upper() == "ANNUAL" - ): + if len(timeslices["SEASON"]) == 1 and timeslices["SEASON"][0].upper() == "ANNUAL": timeslices["SEASON"] = list() # Create a dataframe containing regions and timeslices reg_ts = pd.DataFrame({"region": regions}) for ts_level in user_ts_levels: if timeslices[ts_level] != [None]: - reg_ts = pd.merge( - reg_ts, pd.DataFrame({ts_level: timeslices[ts_level]}), how="cross" - ) + reg_ts = pd.merge(reg_ts, pd.DataFrame({ts_level: timeslices[ts_level]}), how="cross") # Include expanded names of timeslices in the dataframe ncols = len(reg_ts.columns) @@ -2433,9 +2223,7 @@ def convert_to_string( model: datatypes.TimesModel, ) -> Dict[str, DataFrame]: for key, value in tables.items(): - tables[key] = value.map( - lambda x: str(int(x)) if isinstance(x, float) and x.is_integer() else str(x) - ) + tables[key] = value.map(lambda x: str(int(x)) if isinstance(x, float) and x.is_integer() else str(x)) return tables @@ -2471,9 +2259,7 @@ def rename_cgs( df = tables.get(datatypes.Tag.fi_t) if df is not None: i = df["other_indexes"].isin(default_pcg_suffixes) - df.loc[i, "other_indexes"] = ( - df["process"].astype(str) + "_" + df["other_indexes"].astype(str) - ) + df.loc[i, "other_indexes"] = df["process"].astype(str) + "_" + df["other_indexes"].astype(str) tables[datatypes.Tag.fi_t] = df return tables @@ -2500,25 +2286,16 @@ def complete_processes( trade_processes = pd.concat( [ - model.trade.loc[:, ["origin", "process", "in"]].rename( - columns={"origin": "region", "in": "commodity"} - ), - model.trade.loc[:, ["destination", "process", "out"]].rename( - columns={"destination": "region", "out": "commodity"} - ), + model.trade.loc[:, ["origin", "process", "in"]].rename(columns={"origin": "region", "in": "commodity"}), + model.trade.loc[:, ["destination", "process", "out"]].rename(columns={"destination": "region", "out": "commodity"}), ], ignore_index=True, sort=False, ) - undeclared_td = trade_processes.merge( - model.processes.loc[:, ["region", "process"]], how="left", indicator=True - ) + undeclared_td = trade_processes.merge(model.processes.loc[:, ["region", "process"]], how="left", indicator=True) undeclared_td = undeclared_td.loc[ - ( - undeclared_td["region"].isin(model.internal_regions) - & (undeclared_td["_merge"] == "left_only") - ), + (undeclared_td["region"].isin(model.internal_regions) & (undeclared_td["_merge"] == "left_only")), ["region", "process", "commodity"], ] @@ -2527,17 +2304,13 @@ def complete_processes( how="left", ) undeclared_td.drop(columns=["commodity"], inplace=True) - undeclared_td.rename( - columns={"csets": "primarycg", "ctslvl": "tslvl", "unit": 
"tact"}, inplace=True - ) + undeclared_td.rename(columns={"csets": "primarycg", "ctslvl": "tslvl", "unit": "tact"}, inplace=True) undeclared_td["sets"] = "IRE" undeclared_td.drop_duplicates(keep="last", inplace=True) # TODO: Handle possible duplicates for i in ["primarycg", "tslvl", "tact"]: - duplicates = undeclared_td.loc[:, ["region", "process", i]].duplicated( - keep=False - ) + duplicates = undeclared_td.loc[:, ["region", "process", i]].duplicated(keep=False) if any(duplicates): duplicates = undeclared_td.loc[duplicates, ["region", "process", i]] processes = duplicates["process"].unique() @@ -2571,18 +2344,12 @@ def apply_more_fixups( # TODO: TIMES already handles this. Drop? if len(df[i_reg_prc]["year"].unique()) == 1: year = df[i_reg_prc]["year"].unique()[0] - i_attr = ( - df["attribute"].isin({"NCAP_TLIFE", "LIFE"}) - & (df["region"] == region) - & (df["process"] == process) - ) + i_attr = df["attribute"].isin({"NCAP_TLIFE", "LIFE"}) & (df["region"] == region) & (df["process"] == process) if any(i_attr): lifetime = df[i_attr]["value"].unique()[-1] else: lifetime = 30 - extra_rows.append( - ["STOCK", region, process, "", year + lifetime, 0] - ) + extra_rows.append(["STOCK", region, process, "", year + lifetime, 0]) if len(extra_rows) > 0: df = pd.concat( [ From dc07fae13238833d910b2b7a48a28c6e6833065b Mon Sep 17 00:00:00 2001 From: Sam West Date: Wed, 21 Feb 2024 10:06:26 +1100 Subject: [PATCH 10/21] formatting --- utils/dd_to_csv.py | 36 ++++--------- utils/run_benchmarks.py | 29 +++------- xl2times/__main__.py | 116 +++++++++------------------------------- xl2times/datatypes.py | 65 +++++----------------- xl2times/excel.py | 24 ++------- xl2times/utils.py | 8 +-- 6 files changed, 63 insertions(+), 215 deletions(-) diff --git a/utils/dd_to_csv.py b/utils/dd_to_csv.py index a9e1132..fc1db35 100644 --- a/utils/dd_to_csv.py +++ b/utils/dd_to_csv.py @@ -47,9 +47,7 @@ def parse_parameter_values_from_file( while data[index].strip() == "": index += 1 - param_name = data[index].replace( - " ' '/", "" - ) # param_name is followed by this pattern + param_name = data[index].replace(" ' '/", "") # param_name is followed by this pattern index += 1 param_data = [] @@ -64,9 +62,7 @@ def parse_parameter_values_from_file( attributes = words[0].split(".") attributes = [a if " " in a else a.strip("'") for a in attributes] else: - raise ValueError( - f"Unexpected number of spaces in parameter value setting: {data[index]}" - ) + raise ValueError(f"Unexpected number of spaces in parameter value setting: {data[index]}") value = words[-1] param_data.append([*attributes, value]) @@ -106,9 +102,7 @@ def parse_parameter_values_from_file( text = words[1] set_data.add(tuple([*attributes, text])) else: - raise ValueError( - f"Unexpected number of spaces in set value setting: {data[index]}" - ) + raise ValueError(f"Unexpected number of spaces in set value setting: {data[index]}") index += 1 @@ -140,17 +134,11 @@ def save_data_with_headers( try: columns = headers_data[param_name] except KeyError: - raise ValueError( - f"Could not find mapping for {param_name} in mapping file." 
- ) + raise ValueError(f"Could not find mapping for {param_name} in mapping file.") for row in param_data: if len(row) != len(columns): - raise ValueError( - f"Mismatched number of columns for param {param_name} between data ({len(row)}) and mapping ({len(columns)})" - ) - df = pd.DataFrame( - data=np.asarray(param_data)[:, 0 : len(columns)], columns=columns - ) + raise ValueError(f"Mismatched number of columns for param {param_name} between data ({len(row)}) and mapping ({len(columns)})") + df = pd.DataFrame(data=np.asarray(param_data)[:, 0 : len(columns)], columns=columns) df.to_csv(os.path.join(save_dir, param_name + ".csv"), index=False) return @@ -171,9 +159,7 @@ def generate_headers_by_attr() -> Dict[str, List[str]]: return headers_by_attr -def convert_dd_to_tabular( - basedir: str, output_dir: str, headers_by_attr: Dict[str, List[str]] -) -> None: +def convert_dd_to_tabular(basedir: str, output_dir: str, headers_by_attr: Dict[str, List[str]]) -> None: dd_files = [p for p in Path(basedir).rglob("*.dd")] all_sets = defaultdict(list) @@ -219,12 +205,8 @@ def convert_dd_to_tabular( def main(arg_list: None | list[str] = None): args_parser = argparse.ArgumentParser() - args_parser.add_argument( - "input_dir", type=str, help="Input directory containing .dd files." - ) - args_parser.add_argument( - "output_dir", type=str, help="Output directory to save the .csv files in." - ) + args_parser.add_argument("input_dir", type=str, help="Input directory containing .dd files.") + args_parser.add_argument("output_dir", type=str, help="Output directory to save the .csv files in.") args = args_parser.parse_args(arg_list) convert_dd_to_tabular(args.input_dir, args.output_dir, generate_headers_by_attr()) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index cd38bc3..afeff18 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -90,9 +90,7 @@ def run_gams_gdxdiff( return "Error: dd_files not in benchmark" # Copy GAMS scaffolding - scaffolding_folder = path.join( - path.dirname(path.realpath(__file__)), "..", "xl2times", "gams_scaffold" - ) + scaffolding_folder = path.join(path.dirname(path.realpath(__file__)), "..", "xl2times", "gams_scaffold") shutil.copytree(scaffolding_folder, out_folder, dirs_exist_ok=True) # Create link to TIMES source if not path.exists(path.join(out_folder, "source")): @@ -307,9 +305,7 @@ def run_all_benchmarks( # The rest of this script checks regressions against main # so skip it if we're already on main repo = git.Repo(".") # pyright: ignore - origin = ( - repo.remotes.origin if "origin" in repo.remotes else repo.remotes[0] - ) # don't assume remote is called 'origin' + origin = repo.remotes.origin if "origin" in repo.remotes else repo.remotes[0] # don't assume remote is called 'origin' origin.fetch("main") if "main" not in repo.heads: repo.create_head("main", origin.refs.main).set_tracking_branch(origin.refs.main) @@ -332,9 +328,7 @@ def run_all_benchmarks( result = parse_result(f.readlines()[-1]) # Use a fake runtime and GAMS result results_main.append((benchmark["name"], 999, "--", *result)) - print( - f"Skipped running on main. Using results from {path.join(benchmarks_folder, 'out-main')}" - ) + print(f"Skipped running on main. 
Using results from {path.join(benchmarks_folder, 'out-main')}") else: if repo.is_dirty(): @@ -396,23 +390,17 @@ def run_all_benchmarks( runtime_change = our_time - main_time print(f"Total runtime: {our_time:.2f}s (main: {main_time:.2f}s)") - print( - f"Change in runtime (negative == faster): {runtime_change:+.2f}s ({100 * runtime_change / main_time:+.1f}%)" - ) + print(f"Change in runtime (negative == faster): {runtime_change:+.2f}s ({100 * runtime_change / main_time:+.1f}%)") our_correct = df["Correct"].sum() main_correct = df["M Correct"].sum() correct_change = our_correct - main_correct - print( - f"Change in correct rows (higher == better): {correct_change:+d} ({100 * correct_change / main_correct:+.1f}%)" - ) + print(f"Change in correct rows (higher == better): {correct_change:+d} ({100 * correct_change / main_correct:+.1f}%)") our_additional_rows = df["Additional"].sum() main_additional_rows = df["M Additional"].sum() additional_change = our_additional_rows - main_additional_rows - print( - f"Change in additional rows: {additional_change:+d} ({100 * additional_change / main_additional_rows:+.1f}%)" - ) + print(f"Change in additional rows: {additional_change:+d} ({100 * additional_change / main_additional_rows:+.1f}%)") if len(accu_regressions) + len(addi_regressions) + len(time_regressions) > 0: print() @@ -511,10 +499,7 @@ def run_all_benchmarks( verbose=args.verbose, debug=args.debug, ) - print( - f"Ran {args.run} in {runtime:.2f}s. {acc}% ({cor} correct, {add} additional).\n" - f"GAMS: {gms}" - ) + print(f"Ran {args.run} in {runtime:.2f}s. {acc}% ({cor} correct, {add} additional).\n" f"GAMS: {gms}") else: run_all_benchmarks( benchmarks_folder, diff --git a/xl2times/__main__.py b/xl2times/__main__.py index 5f1f3f7..48f4d17 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -41,10 +41,7 @@ def convert_xl_to_times( result = excel.extract_tables(f) raw_tables.extend(result) pickle.dump(raw_tables, open(pickle_file, "wb")) - print( - f"Extracted {len(raw_tables)} tables," - f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows" - ) + print(f"Extracted {len(raw_tables)} tables," f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows") if stop_after_read: # Convert absolute paths to relative paths to enable comparing raw_tables.txt across machines @@ -60,14 +57,10 @@ def convert_xl_to_times( transforms.normalize_tags_columns, transforms.remove_fill_tables, transforms.validate_input_tables, - lambda config, tables, model: [ - transforms.remove_comment_cols(t) for t in tables - ], + lambda config, tables, model: [transforms.remove_comment_cols(t) for t in tables], transforms.remove_tables_with_formulas, # slow transforms.normalize_column_aliases, - lambda config, tables, model: [ - transforms.remove_comment_rows(config, t, model) for t in tables - ], + lambda config, tables, model: [transforms.remove_comment_rows(config, t, model) for t in tables], transforms.process_regions, transforms.generate_dummy_processes, transforms.process_time_slices, @@ -105,9 +98,7 @@ def convert_xl_to_times( transforms.fix_topology, transforms.complete_dictionary, transforms.convert_to_string, - lambda config, tables, model: dump_tables( - tables, os.path.join(output_dir, "merged_tables.txt") - ), + lambda config, tables, model: dump_tables(tables, os.path.join(output_dir, "merged_tables.txt")), lambda config, tables, model: produce_times_tables(config, tables), ] @@ -118,14 +109,10 @@ def convert_xl_to_times( output = transform(config, input, model) end_time = time.time() sep = 
"\n\n" + "=" * 80 + "\n" if verbose else "" - print( - f"{sep}transform {transform.__code__.co_name} took {end_time - start_time:.2f} seconds" - ) + print(f"{sep}transform {transform.__code__.co_name} took {end_time - start_time:.2f} seconds") if verbose: if isinstance(output, list): - for table in sorted( - output, key=lambda t: (t.tag, t.filename, t.sheetname, t.range) - ): + for table in sorted(output, key=lambda t: (t.tag, t.filename, t.sheetname, t.range)): print(table) elif isinstance(output, dict): for tag, df in output.items(): @@ -134,10 +121,7 @@ def convert_xl_to_times( input = output assert isinstance(output, dict) - print( - f"Conversion complete, {len(output)} tables produced," - f" {sum(df.shape[0] for df in output.values())} rows" - ) + print(f"Conversion complete, {len(output)} tables produced," f" {sum(df.shape[0] for df in output.values())} rows") return output @@ -154,31 +138,20 @@ def write_csv_tables(tables: Dict[str, DataFrame], output_dir: str): def read_csv_tables(input_dir: str) -> Dict[str, DataFrame]: result = {} for filename in os.listdir(input_dir): - result[filename.split(".")[0]] = pd.read_csv( - os.path.join(input_dir, filename), dtype=str - ) + result[filename.split(".")[0]] = pd.read_csv(os.path.join(input_dir, filename), dtype=str) return result -def compare( - data: Dict[str, DataFrame], ground_truth: Dict[str, DataFrame], output_dir: str -) -> str: - print( - f"Ground truth contains {len(ground_truth)} tables," - f" {sum(df.shape[0] for _, df in ground_truth.items())} rows" - ) +def compare(data: Dict[str, DataFrame], ground_truth: Dict[str, DataFrame], output_dir: str) -> str: + print(f"Ground truth contains {len(ground_truth)} tables," f" {sum(df.shape[0] for _, df in ground_truth.items())} rows") missing = set(ground_truth.keys()) - set(data.keys()) - missing_str = ", ".join( - [f"{x} ({ground_truth[x].shape[0]})" for x in sorted(missing)] - ) + missing_str = ", ".join([f"{x} ({ground_truth[x].shape[0]})" for x in sorted(missing)]) if len(missing) > 0: print(f"WARNING: Missing {len(missing)} tables: {missing_str}") additional_tables = set(data.keys()) - set(ground_truth.keys()) - additional_str = ", ".join( - [f"{x} ({data[x].shape[0]})" for x in sorted(additional_tables)] - ) + additional_str = ", ".join([f"{x} ({data[x].shape[0]})" for x in sorted(additional_tables)]) if len(additional_tables) > 0: print(f"WARNING: {len(additional_tables)} additional tables: {additional_str}") # Additional rows starts as the sum of lengths of additional tables produced @@ -186,9 +159,7 @@ def compare( total_gt_rows = 0 total_correct_rows = 0 - for table_name, gt_table in sorted( - ground_truth.items(), reverse=True, key=lambda t: len(t[1]) - ): + for table_name, gt_table in sorted(ground_truth.items(), reverse=True, key=lambda t: len(t[1])): if table_name in data: data_table = data[table_name] @@ -196,10 +167,7 @@ def compare( transformed_gt_cols = [col.split(".")[0] for col in gt_table.columns] data_cols = list(data_table.columns) if transformed_gt_cols != data_cols: - print( - f"WARNING: Table {table_name} header incorrect, was" - f" {data_cols}, should be {transformed_gt_cols}" - ) + print(f"WARNING: Table {table_name} header incorrect, was" f" {data_cols}, should be {transformed_gt_cols}") # both are in string form so can be compared without any issues gt_rows = set(tuple(row) for row in gt_table.to_numpy().tolist()) @@ -235,31 +203,20 @@ def compare( return result -def produce_times_tables( - config: datatypes.Config, input: Dict[str, DataFrame] -) -> 
Dict[str, DataFrame]: - print( - f"produce_times_tables: {len(input)} tables incoming," - f" {sum(len(value) for (_, value) in input.items())} rows" - ) +def produce_times_tables(config: datatypes.Config, input: Dict[str, DataFrame]) -> Dict[str, DataFrame]: + print(f"produce_times_tables: {len(input)} tables incoming," f" {sum(len(value) for (_, value) in input.items())} rows") result = {} used_tables = set() for mapping in config.times_xl_maps: if not mapping.xl_name in input: - print( - f"WARNING: Cannot produce table {mapping.times_name} because" - f" {mapping.xl_name} does not exist" - ) + print(f"WARNING: Cannot produce table {mapping.times_name} because" f" {mapping.xl_name} does not exist") else: used_tables.add(mapping.xl_name) df = input[mapping.xl_name].copy() # Filter rows according to filter_rows mapping: for filter_col, filter_val in mapping.filter_rows.items(): if filter_col not in df.columns: - print( - f"WARNING: Cannot produce table {mapping.times_name} because" - f" {mapping.xl_name} does not contain column {filter_col}" - ) + print(f"WARNING: Cannot produce table {mapping.times_name} because" f" {mapping.xl_name} does not contain column {filter_col}") # TODO break this loop and continue outer loop? filter = set(x.lower() for x in {filter_val}) i = df[filter_col].str.lower().isin(filter) @@ -283,12 +240,7 @@ def produce_times_tables( df.drop_duplicates(inplace=True) df.reset_index(drop=True, inplace=True) # TODO this is a hack. Use pd.StringDtype() so that notna() is sufficient - i = ( - df[mapping.times_cols[-1]].notna() - & (df != "None").all(axis=1) - & (df != "nan").all(axis=1) - & (df != "").all(axis=1) - ) + i = df[mapping.times_cols[-1]].notna() & (df != "None").all(axis=1) & (df != "nan").all(axis=1) & (df != "").all(axis=1) df = df.loc[i, mapping.times_cols] # Drop tables that are empty after filtering and dropping Nones: if len(df) == 0: @@ -297,16 +249,12 @@ def produce_times_tables( unused_tables = set(input.keys()) - used_tables if len(unused_tables) > 0: - print( - f"WARNING: {len(unused_tables)} unused tables: {', '.join(sorted(unused_tables))}" - ) + print(f"WARNING: {len(unused_tables)} unused tables: {', '.join(sorted(unused_tables))}") return result -def write_dd_files( - tables: Dict[str, DataFrame], config: datatypes.Config, output_dir: str -): +def write_dd_files(tables: Dict[str, DataFrame], config: datatypes.Config, output_dir: str): os.makedirs(output_dir, exist_ok=True) for item in os.listdir(output_dir): if item.endswith(".dd"): @@ -315,9 +263,7 @@ def write_dd_files( def convert_set(df: DataFrame): has_description = "TEXT" in df.columns for row in df.itertuples(index=False): - row_str = "'.'".join( - (str(x) for k, x in row._asdict().items() if k != "TEXT") - ) + row_str = "'.'".join((str(x) for k, x in row._asdict().items() if k != "TEXT")) desc = f" '{row.TEXT}'" if has_description else "" yield f"'{row_str}'{desc}\n" @@ -329,9 +275,7 @@ def convert_parameter(tablename: str, df: DataFrame): df = df.drop_duplicates(subset=query_columns, keep="last") for row in df.itertuples(index=False): val = row.VALUE - row_str = "'.'".join( - (str(x) for k, x in row._asdict().items() if k != "VALUE") - ) + row_str = "'.'".join((str(x) for k, x in row._asdict().items() if k != "VALUE")) yield f"'{row_str}' {val}\n" if row_str else f"{val}\n" sets = {m.times_name for m in config.times_xl_maps if "VALUE" not in m.col_map} @@ -412,11 +356,7 @@ def run(args) -> str | None: sys.exit(-1) elif len(args.input) == 1: assert os.path.isdir(args.input[0]) - input_files 
= [ - str(path) - for path in Path(args.input[0]).rglob("*") - if path.suffix in [".xlsx", ".xlsm"] and not path.name.startswith("~") - ] + input_files = [str(path) for path in Path(args.input[0]).rglob("*") if path.suffix in [".xlsx", ".xlsm"] and not path.name.startswith("~")] print(f"Loading {len(input_files)} files from {args.input[0]}") else: input_files = args.input @@ -433,9 +373,7 @@ def run(args) -> str | None: ) sys.exit(0) - tables = convert_xl_to_times( - input_files, args.output_dir, config, model, args.use_pkl, verbose=args.verbose - ) + tables = convert_xl_to_times(input_files, args.output_dir, config, model, args.use_pkl, verbose=args.verbose) if args.dd: write_dd_files(tables, config, args.output_dir) @@ -471,9 +409,7 @@ def parse_args(arg_list: None | list[str]) -> argparse.Namespace: default="", help="Comma-separated list of regions to include in the model", ) - args_parser.add_argument( - "--output_dir", type=str, default="output", help="Output directory" - ) + args_parser.add_argument("--output_dir", type=str, default="output", help="Output directory") args_parser.add_argument( "--ground_truth_dir", type=str, diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py index 13f2d31..6e42ec7 100644 --- a/xl2times/datatypes.py +++ b/xl2times/datatypes.py @@ -101,9 +101,7 @@ def __eq__(self, o: object) -> bool: and self.dataframe.shape == o.dataframe.shape and ( len(self.dataframe) == 0 # Empty tables don't affect our output - or self.dataframe.sort_index(axis=1).equals( - o.dataframe.sort_index(axis=1) - ) + or self.dataframe.sort_index(axis=1).equals(o.dataframe.sort_index(axis=1)) ) ) @@ -209,9 +207,7 @@ def __init__( self.discard_if_empty, self.known_columns, ) = Config._read_veda_tags_info(veda_tags_file) - self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults( - veda_attr_defaults_file - ) + self.veda_attr_defaults, self.attr_aliases = Config._read_veda_attr_defaults(veda_attr_defaults_file) # Migration in progress: use parameter mappings from times_info_file for now name_to_map = {m.times_name: m for m in self.times_xl_maps} for m in param_mappings: @@ -234,16 +230,10 @@ def _process_times_info( unknown_cats = {item["gams-cat"] for item in table_info} - set(categories) if unknown_cats: print(f"WARNING: Unknown categories in times-info.json: {unknown_cats}") - dd_table_order = chain.from_iterable( - (sorted(cat_to_tables[c]) for c in categories) - ) + dd_table_order = chain.from_iterable((sorted(cat_to_tables[c]) for c in categories)) # Compute the set of all attributes, i.e. 
all entities with category = parameter - attributes = { - item["name"].lower() - for item in table_info - if item["gams-cat"] == "parameter" - } + attributes = {item["name"].lower() for item in table_info if item["gams-cat"] == "parameter"} # Compute the mapping for attributes / parameters: def create_mapping(entity): @@ -252,11 +242,7 @@ def create_mapping(entity): xl_cols = entity["mapping"] + ["value"] # TODO map in json col_map = dict(zip(times_cols, xl_cols)) # If tag starts with UC, then the data is in UCAttributes, else Attributes - xl_name = ( - "UCAttributes" - if entity["name"].lower().startswith("uc") - else "Attributes" - ) + xl_name = "UCAttributes" if entity["name"].lower().startswith("uc") else "Attributes" return TimesXlMap( times_name=entity["name"], times_cols=times_cols, @@ -267,10 +253,7 @@ def create_mapping(entity): ) param_mappings = [ - create_mapping(x) - for x in table_info - if x["gams-cat"] == "parameter" - and "type" not in x # TODO Generalise derived parameters? + create_mapping(x) for x in table_info if x["gams-cat"] == "parameter" and "type" not in x # TODO Generalise derived parameters? ] return dd_table_order, attributes, param_mappings @@ -304,9 +287,7 @@ def _read_mappings(filename: str) -> List[TimesXlMap]: if line == "": break (times, xl) = line.split(" = ") - (times_name, times_cols_str) = list( - filter(None, re.split("\[|\]", times)) - ) + (times_name, times_cols_str) = list(filter(None, re.split("\[|\]", times))) (xl_name, xl_cols_str) = list(filter(None, re.split("\(|\)", xl))) times_cols = times_cols_str.split(",") xl_cols = xl_cols_str.split(",") @@ -318,9 +299,7 @@ def _read_mappings(filename: str) -> List[TimesXlMap]: xl_cols = [s.lower() for s in xl_cols if ":" not in s] # TODO remove: Filter out mappings that are not yet finished - if xl_name != "~TODO" and not any( - c.startswith("TODO") for c in xl_cols - ): + if xl_name != "~TODO" and not any(c.startswith("TODO") for c in xl_cols): col_map = {} assert len(times_cols) <= len(xl_cols) for index, value in enumerate(times_cols): @@ -341,20 +320,13 @@ def _read_mappings(filename: str) -> List[TimesXlMap]: dropped.append(line) if len(dropped) > 0: - print( - f"WARNING: Dropping {len(dropped)} mappings that are not yet complete" - ) + print(f"WARNING: Dropping {len(dropped)} mappings that are not yet complete") return mappings @staticmethod def _read_veda_tags_info( veda_tags_file: str, - ) -> Tuple[ - Dict[Tag, Dict[str, str]], - Dict[Tag, Dict[str, list]], - Iterable[Tag], - Dict[Tag, Set[str]], - ]: + ) -> Tuple[Dict[Tag, Dict[str, str]], Dict[Tag, Dict[str, list]], Iterable[Tag], Dict[Tag, Set[str]],]: def to_tag(s: str) -> Tag: # The file stores the tag name in lowercase, and without the ~ return Tag("~" + s.upper()) @@ -367,9 +339,7 @@ def to_tag(s: str) -> Tag: tags = {to_tag(tag_info["tag_name"]) for tag_info in veda_tags_info} for tag in Tag: if tag not in tags: - print( - f"WARNING: datatypes.Tag has an unknown Tag {tag} not in {veda_tags_file}" - ) + print(f"WARNING: datatypes.Tag has an unknown Tag {tag} not in {veda_tags_file}") valid_column_names = {} row_comment_chars = {} @@ -386,10 +356,7 @@ def to_tag(s: str) -> Tag: # Process column aliases and comment chars: for valid_field in tag_info["valid_fields"]: valid_field_names = valid_field["aliases"] - if ( - "use_name" in valid_field - and valid_field["use_name"] != valid_field["name"] - ): + if "use_name" in valid_field and valid_field["use_name"] != valid_field["name"]: field_name = valid_field["use_name"] 
valid_field_names.append(valid_field["name"]) else: @@ -399,9 +366,7 @@ def to_tag(s: str) -> Tag: for valid_field_name in valid_field_names: valid_column_names[tag_name][valid_field_name] = field_name - row_comment_chars[tag_name][field_name] = valid_field[ - "row_ignore_symbol" - ] + row_comment_chars[tag_name][field_name] = valid_field["row_ignore_symbol"] # TODO: Account for differences in valid field names with base_tag if "base_tag" in tag_info: @@ -431,9 +396,7 @@ def _read_veda_attr_defaults( "tslvl": {"DAYNITE": [], "ANNUAL": []}, } - attr_aliases = { - attr for attr in defaults if "times-attribute" in defaults[attr] - } + attr_aliases = {attr for attr in defaults if "times-attribute" in defaults[attr]} for attr, attr_info in defaults.items(): # Populate aliases by attribute dictionary diff --git a/xl2times/excel.py b/xl2times/excel.py index 48c85ba..b584540 100644 --- a/xl2times/excel.py +++ b/xl2times/excel.py @@ -34,24 +34,16 @@ def extract_tables(filename: str) -> List[datatypes.EmbeddedXlTable]: for colname in df.columns: value = str(row[colname]) if value.startswith("~"): - match = re.match( - f"{datatypes.Tag.uc_sets.value}:(.*)", value, re.IGNORECASE - ) + match = re.match(f"{datatypes.Tag.uc_sets.value}:(.*)", value, re.IGNORECASE) if match: parts = match.group(1).split(":") if len(parts) == 2: uc_sets[parts[0].strip()] = parts[1].strip() else: - print( - f"WARNING: Malformed UC_SET in {sheet.title}, {filename}" - ) + print(f"WARNING: Malformed UC_SET in {sheet.title}, {filename}") else: col_index = df.columns.get_loc(colname) - sheet_tables.append( - extract_table( - row_index, col_index, uc_sets, df, sheet.title, filename - ) - ) + sheet_tables.append(extract_table(row_index, col_index, uc_sets, df, sheet.title, filename)) for sheet_table in sheet_tables: sheet_table.uc_sets = uc_sets @@ -123,9 +115,7 @@ def extract_table( end_col += 1 end_row = header_row - while end_row < df.shape[0] and not are_cells_all_empty( - df, end_row, start_col, end_col - ): + while end_row < df.shape[0] and not are_cells_all_empty(df, end_row, start_col, end_col): end_row += 1 # Excel cell numbering starts at 1, while pandas starts at 0 @@ -190,8 +180,4 @@ def cell_is_empty(value) -> bool: :param value: Cell value. :return: Boolean indicating if the cells are empty. """ - return ( - value is None - or (isinstance(value, numpy.floating) and numpy.isnan(value)) - or (isinstance(value, str) and len(value.strip()) == 0) - ) + return value is None or (isinstance(value, numpy.floating) and numpy.isnan(value)) or (isinstance(value, str) and len(value.strip()) == 0) diff --git a/xl2times/utils.py b/xl2times/utils.py index a69f94b..7654cf8 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -54,9 +54,7 @@ def explode(df, data_columns): column name for each value in each new row. """ data = df[data_columns].values.tolist() - other_columns = [ - colname for colname in df.columns.values if colname not in data_columns - ] + other_columns = [colname for colname in df.columns.values if colname not in data_columns] df = df[other_columns] value_column = "value" df = df.assign(value=data) @@ -110,9 +108,7 @@ def merge_columns(tables: List[datatypes.EmbeddedXlTable], tag: str, colname: st return numpy.concatenate(columns) -def apply_wildcards( - df: DataFrame, candidates: Iterable[str], wildcard_col: str, output_col: str -): +def apply_wildcards(df: DataFrame, candidates: Iterable[str], wildcard_col: str, output_col: str): """ Apply wildcards values to a list of candidates. 
Wildcards are values containing '*'. For example, a value containing '*SOLID*' would include all the values in 'candidates' containing 'SOLID' in the middle. From d199eeb35a1edd5163dbd9bda098420557c88fde Mon Sep 17 00:00:00 2001 From: Sam West Date: Wed, 21 Feb 2024 12:08:25 +1100 Subject: [PATCH 11/21] cleanup --- benchmarks.yml | 39 --------------------------------------- pyproject.toml | 2 +- tests/test_transforms.py | 8 ++++---- utils/run_benchmarks.py | 11 ----------- 4 files changed, 5 insertions(+), 55 deletions(-) diff --git a/benchmarks.yml b/benchmarks.yml index 686472d..3f3ed6b 100644 --- a/benchmarks.yml +++ b/benchmarks.yml @@ -279,42 +279,3 @@ benchmarks: - "SuppXLS/Scen_B_TRA_EV_Parity.xlsx" - "SuppXLS/Scen_B_TRA_F_ModalShares.xlsx" dd_folder: Ireland - - - name: AusTIMES - input_folder: ../../../austimes-lfs - regions: "ACT" - inputs: - - "VT_AUS_COM.xlsx" - - "VT_AUS_ELC.xlsx" - - "VT_AUS_IND.xlsm" - - "VT_AUS_RES.xlsx" - - "VT_AUS_TRA.xlsx" - - "SysSettings.xlsx" - - "SuppXLS/Scen_Par-austimes_CCA.xlsx" - - "SuppXLS/Scen_nsv-austimes_csiro_1.xlsx" - - "SuppXLS/Scen_nsv-austimes-cwc_1.xlsx" - - "SuppXLS/Scen_Base-Electricity.xlsx" - - "SuppXLS/Scen_Base-Industry.xlsx" - - "SuppXLS/Scen_Base-Transport.xlsx" - - "SuppXLS/Scen_CoalPlants_LE-control_disable.xlsx" - - "SuppXLS/Scen_Hydrogen.xlsx" - - "SuppXLS/Scen_revMinCoalfactors.xlsx" - - "SuppXLS/Scen_revMinGPGfactors.xlsx" - - "SuppXLS/Scen_Transport_Base_Liq-LevPlayFld_v16.xlsx" - - "SuppXLS/Scen_Transport_SteadyProgress.xlsx" - - "SuppXLS/Scen_TransportPolicies.xlsx" - - "SuppXLS/Scen_xComSolarWeighting-TG17-base.xlsx" - - "SuppXLS/Scen_xComSolarWeighting-TG17-xScale.xlsx" - - "SuppXLS/Scen_xICEcostAdj.xlsx" - - "SuppXLS/Scen_zScalingCorrection.xlsx" - - "SuppXLS/Scen_zzdisable2options.xlsx" - - "SubRES_Tmpl/SubRES_CoalGasDomExp.xlsx" - - "SubRES_Tmpl/SubRES_CoalPlants-Ret-and_lifeExt.xlsx" - - "SubRES_Tmpl/SubRES_ElecFossil.xlsx" - - "SubRES_Tmpl/SubRES_ETI_Techs.xlsx" - - "SubRES_Tmpl/SubRES_Frontier-Levers.xlsx" - - "SubRES_Tmpl/SubRES_Hydrogen_production.xlsx" - - "SubRES_Tmpl/SubRES_PumpedStorage.xlsx" - - "SubRES_Tmpl/SubRES_WindSolarWave.xlsx" - - "SuppXLS/Trades/ScenTrade_TradeParm.xlsx" - dd_folder: austimes diff --git a/pyproject.toml b/pyproject.toml index a81a5ca..8d52f1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ line-length = 150 # increase default line wrap length for legibility # Automation of common dev tasks etc. # Run with: `poe `, e,g. `poe lint` or `poe benchmark Ireland`. # See https://github.com/nat-n/poethepoet for details. -benchmark = { cmd = "python utils/run_benchmarks.py benchmarks.yml --verbose --run", help = "Run a single benchmark. Usage: poe benchmark " } +benchmark = { cmd = "python utils/run_benchmarks.py benchmarks.yml --run", help = "Run a single benchmark. 
Usage: poe benchmark " } benchmark_all = { shell = "python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt", help = "Run the project", interpreter = "posix" } lint = { shell = "git add .pre-commit-config.yaml & pre-commit run", help = "Run pre-commit hooks", interpreter = "posix" } test = { cmd = "pytest --cov-report term --cov-report html --cov=xl2times --cov=utils", help = "Run unit tests with pytest" } diff --git a/tests/test_transforms.py b/tests/test_transforms.py index faf99ad..a86f80b 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -78,10 +78,10 @@ def test_uc_wildcards(self): dictionary = pickle.load(f) df = df_in.copy() - df_in = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") - with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "rb") as f: - dictionary = pickle.load(f) - df = df_in.query() + # df_in = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") + # with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "rb") as f: + # dictionary = pickle.load(f) + # df = df_in.query("region in ['ACT', 'NSW']") t0 = datetime.now() diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index afeff18..e4c56c3 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -22,17 +22,6 @@ from xl2times.utils import max_workers -# configure logger -# logging.basicConfig( -# level=logging.DEBUG, -# format="%(asctime)s - %(name)s - %(levelname)s - %(message)s - %(filename)s:%(lineno)d", -# handlers=[StreamHandler(), RotatingFileHandler("xl2times.log", maxBytes=1000000, backupCount=5)], -# force=True, -# datefmt="%Y-%m-%d %H:%M:%S", -# ) -# logger = logging.getLogger("xl2times") -# logger.info("Logger!") - from loguru import logger # set global log level via env var. Set to INFO if not already set. From 5cdda5fd506edf89ad007f85bae329431e0e533d Mon Sep 17 00:00:00 2001 From: Sam West Date: Wed, 21 Feb 2024 15:27:09 +1100 Subject: [PATCH 12/21] extra --debug logic benchmarks always compare to HEAD of main branch --- utils/run_benchmarks.py | 50 +++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index e4c56c3..558ba5e 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -1,5 +1,4 @@ import argparse -import logging import os import re import shutil @@ -9,21 +8,19 @@ from collections import namedtuple from concurrent.futures import ProcessPoolExecutor from functools import partial -from logging.handlers import RotatingFileHandler -from logging import StreamHandler from os import path, symlink -from re import match from typing import Any, Tuple import git import pandas as pd import yaml +from loguru import logger from tabulate import tabulate +from dd_to_csv import main +from xl2times.__main__ import run, parse_args from xl2times.utils import max_workers -from loguru import logger - # set global log level via env var. Set to INFO if not already set. if os.getenv("LOGURU_LEVEL") is None: os.environ["LOGURU_LEVEL"] = "INFO" @@ -174,7 +171,7 @@ def run_benchmark( # First convert ground truth DD to csv if not skip_csv: shutil.rmtree(csv_folder, ignore_errors=True) - if os.name != "nt": + if debug: res = subprocess.run( [ "python", @@ -194,9 +191,12 @@ def run_benchmark( sys.exit(5) else: # If debug option is set, run as a function call to allow stepping with a debugger. 
- from dd_to_csv import main - - main([dd_folder, csv_folder]) + try: + main([dd_folder, csv_folder]) + except Exception: + logger.exception(f"dd_to_csv failed on {benchmark['name']}") + shutil.rmtree(csv_folder, ignore_errors=True) + sys.exit(5) elif not path.exists(csv_folder): print(f"ERROR: --skip_csv is true but {csv_folder} does not exist") @@ -217,22 +217,12 @@ def run_benchmark( else: args.append(xl_folder) start = time.time() - res = None - if not debug: - res = subprocess.run( - ["xl2times"] + args, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - else: - # If debug option is set, run as a function call to allow stepping with a debugger. - from xl2times.__main__ import run, parse_args - summary = run(parse_args(args)) + # Call the conversion function directly + summary = run(parse_args(args)) - # pack the results into a namedtuple pretending to be a return value from a subprocess call (as above). - res = namedtuple("stdout", ["stdout", "stderr", "returncode"])(summary, "", 0) + # pack the results into a namedtuple pretending to be a return value from a subprocess call (as above). + res = namedtuple("stdout", ["stdout", "stderr", "returncode"])(summary, "", 0) runtime = time.time() - start @@ -283,8 +273,13 @@ def run_all_benchmarks( debug=debug, ) - with ProcessPoolExecutor(max_workers=max_workers) as executor: - results = list(executor.map(run_a_benchmark, benchmarks)) + if debug: + # bypass process pool and call benchmarks directly if --debug is set. + results = [run_a_benchmark(b) for b in benchmarks] + else: + with ProcessPoolExecutor(max_workers=max_workers) as executor: + results = list(executor.map(run_a_benchmark, benchmarks)) + print("\n\n" + tabulate(results, headers, floatfmt=".1f") + "\n") if skip_regression: @@ -324,8 +319,9 @@ def run_all_benchmarks( print("Your working directory is not clean. Skipping regression tests.") sys.exit(8) - # Re-run benchmarks on main + # Re-run benchmarks on main - check it out and pull repo.heads.main.checkout() + origin.pull("main") # if main already exists, make sure it's up to date print("Running benchmarks on main", end="", flush=True) run_a_benchmark = partial( run_benchmark, From a4d43fadc4252b173a1e923716bd8ed53084876d Mon Sep 17 00:00:00 2001 From: Sam West Date: Wed, 21 Feb 2024 15:59:35 +1100 Subject: [PATCH 13/21] post merge fixes obscure _match_uc_wildcards bugfix --- tests/test_transforms.py | 52 ++++++++++++++++++++++++++++++---------- utils/run_benchmarks.py | 3 ++- xl2times/transforms.py | 32 ++++++++----------------- xl2times/utils.py | 2 +- 4 files changed, 52 insertions(+), 37 deletions(-) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index a86f80b..679cf30 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -2,6 +2,7 @@ from typing import Callable import pandas as pd +from loguru import logger from xl2times import transforms, utils, datatypes from xl2times.transforms import ( @@ -18,6 +19,8 @@ commodity_map, ) +logger = utils.get_logger() + pd.set_option( "display.max_rows", 20, @@ -32,7 +35,9 @@ ) -def _match_uc_wildcards_old(df: pd.DataFrame, dictionary: dict[str, pd.DataFrame]) -> pd.DataFrame: +def _match_uc_wildcards_old( + df: pd.DataFrame, dictionary: dict[str, pd.DataFrame] +) -> pd.DataFrame: """Old version of the process_uc_wildcards matching logic, for comparison with the new vectorised version. TODO remove this function once validated. 
""" @@ -44,12 +49,17 @@ def make_str(df): else: return None - df["process"] = df.apply(lambda row: make_str(get_matching_processes(row, dictionary)), axis=1) - df["commodity"] = df.apply(lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1) + df["process"] = df.apply( + lambda row: make_str(get_matching_processes(row, dictionary)), axis=1 + ) + df["commodity"] = df.apply( + lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 + ) cols_to_drop = [col for col in df.columns if col in query_columns] df = expand_rows( + query_columns, datatypes.EmbeddedXlTable( tag="", uc_sets={}, @@ -57,7 +67,7 @@ def make_str(df): range="", filename="", dataframe=df.drop(columns=cols_to_drop), - ) + ), ).dataframe return df @@ -86,8 +96,12 @@ def test_uc_wildcards(self): t0 = datetime.now() # optimised functions - df_new = _match_uc_wildcards(df, process_map, dictionary, get_matching_processes, "process") - df_new = _match_uc_wildcards(df_new, commodity_map, dictionary, get_matching_commodities, "commodity") + df_new = _match_uc_wildcards( + df, process_map, dictionary, get_matching_processes, "process" + ) + df_new = _match_uc_wildcards( + df_new, commodity_map, dictionary, get_matching_commodities, "commodity" + ) t1 = datetime.now() @@ -96,19 +110,27 @@ def test_uc_wildcards(self): t2 = datetime.now() - print(f"Old method took {t2 - t1} seconds") - print(f"New method took {t1 - t0} seconds, speedup: {((t2 - t1) / (t1 - t0)):.1f}x") + logger.info(f"Old method took {t2 - t1} seconds") + logger.info( + f"New method took {t1 - t0} seconds, speedup: {((t2 - t1) / (t1 - t0)):.1f}x" + ) # unit tests assert df_new is not None and not df_new.empty - assert df_new.shape[0] >= df_in.shape[0], "should have more rows after processing uc_wildcards" - assert df_new.shape[1] < df_in.shape[1], "should have fewer columns after processing uc_wildcards" + assert ( + df_new.shape[0] >= df_in.shape[0] + ), "should have more rows after processing uc_wildcards" + assert ( + df_new.shape[1] < df_in.shape[1] + ), "should have fewer columns after processing uc_wildcards" assert "process" in df_new.columns, "should have added process column" assert "commodity" in df_new.columns, "should have added commodity column" # consistency checks with old method assert len(set(df_new.columns).symmetric_difference(set(df_old.columns))) == 0 - assert df_new.fillna(-1).equals(df_old.fillna(-1)), "Dataframes should be equal (ignoring Nones and NaNs)" + assert df_new.fillna(-1).equals( + df_old.fillna(-1) + ), "Dataframes should be equal (ignoring Nones and NaNs)" def test_generate_commodity_groups(self): """ @@ -119,7 +141,9 @@ def test_generate_commodity_groups(self): 43958x speedup """ # data extracted immediately before the original for loops - comm_groups = pd.read_parquet("tests/data/comm_groups_austimes_test_data.parquet").drop(columns=["commoditygroup"]) + comm_groups = pd.read_parquet( + "tests/data/comm_groups_austimes_test_data.parquet" + ).drop(columns=["commoditygroup"]) # filter data so test runs faster comm_groups = comm_groups.query("region in ['ACT', 'NSW']") @@ -140,7 +164,9 @@ def test_default_pcg_vectorised(self): comm_groups = pd.read_parquet("tests/data/austimes_pcg_test_data.parquet") comm_groups = comm_groups[(comm_groups["region"].isin(["ACT", "NT"]))] - comm_groups2 = _process_comm_groups_vectorised(comm_groups.copy(), transforms.csets_ordered_for_pcg) + comm_groups2 = _process_comm_groups_vectorised( + comm_groups.copy(), transforms.csets_ordered_for_pcg + ) assert comm_groups2 is not 
None and not comm_groups2.empty assert comm_groups2.shape == (comm_groups.shape[0], comm_groups.shape[1] + 1) assert comm_groups2.drop(columns=["DefaultVedaPCG"]).equals(comm_groups) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index d66f65d..9597781 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -4,7 +4,6 @@ import subprocess import sys import time -from __main__ import parse_args, run from collections import namedtuple from concurrent.futures import ProcessPoolExecutor from functools import partial @@ -19,6 +18,7 @@ from utils.dd_to_csv import main from xl2times import utils +from xl2times.__main__ import parse_args, run from xl2times.utils import max_workers logger = utils.get_logger() @@ -158,6 +158,7 @@ def run_benchmark( stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + shell=True, ) if res.returncode != 0: # Remove partial outputs diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 5be227d..474221f 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1,14 +1,3 @@ -import collections -import functools -import pickle -from collections import defaultdict -from pandas.core.frame import DataFrame -from pathlib import Path -import pandas as pd -from dataclasses import replace -from typing import Dict, List, Callable -from more_itertools import locate, one -from itertools import groupby import re import time from collections import defaultdict @@ -17,23 +6,18 @@ from functools import reduce from itertools import groupby from pathlib import Path +from typing import Callable from typing import Dict, List, Set import pandas as pd from loguru import logger from more_itertools import locate, one from pandas.core.frame import DataFrame - -import logging -import logging.config - from tqdm import tqdm -from .utils import max_workers from . import datatypes from . 
import utils - -logger = logging.getLogger(__name__) +from .utils import max_workers query_columns = { "pset_set", @@ -2175,7 +2159,7 @@ def process_uc_wildcards( tables[tag] = df - print( + logger.info( f" process_uc_wildcards: {tag} took {time.time() - start_time:.2f} seconds for {len(df)} rows" ) @@ -2208,9 +2192,13 @@ def _match_uc_wildcards( unique_filters = df[proc_cols].drop_duplicates().dropna(axis="rows", how="all") # match all the wildcards columns against the dictionary names - matches = unique_filters.apply( - lambda row: matcher(row, dictionary), axis=1 - ).to_list() + matches = unique_filters.apply(lambda row: matcher(row, dictionary), axis=1) + # FIXME: work-around for matchers occasionally DataFrame or a Series + matches = ( + matches.iloc[:, 0].to_list() + if isinstance(matches, pd.DataFrame) + else matches.to_list() + ) matches = [ df.iloc[:, 0].to_list() if df is not None and len(df) != 0 else None for df in matches diff --git a/xl2times/utils.py b/xl2times/utils.py index 1d432b7..3534c20 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -268,7 +268,7 @@ def get_logger(log_name: str = default_log_name, log_dir: str = ".") -> loguru.L "level": "DEBUG", "colorize": False, "serialize": False, - "diagnose": False, + "diagnose": True, "rotation": "20 MB", "compression": "zip", }, From e388130d0084822fc6342d33e48d3df0a140824d Mon Sep 17 00:00:00 2001 From: Sam West Date: Wed, 21 Feb 2024 16:03:45 +1100 Subject: [PATCH 14/21] fix import --- utils/run_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index 9597781..a30b331 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -16,7 +16,7 @@ from loguru import logger from tabulate import tabulate -from utils.dd_to_csv import main +from dd_to_csv import main from xl2times import utils from xl2times.__main__ import parse_args, run from xl2times.utils import max_workers From 33fb41f43a55405f765641110b1b6edb57a84026 Mon Sep 17 00:00:00 2001 From: Sam West Date: Thu, 22 Feb 2024 08:59:59 +1100 Subject: [PATCH 15/21] Corrected debug logic --- utils/run_benchmarks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index a30b331..0eeb889 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -147,7 +147,8 @@ def run_benchmark( # First convert ground truth DD to csv if not skip_csv: shutil.rmtree(csv_folder, ignore_errors=True) - if debug: + if not debug: + # run as subprocess if not in --debug mode res = subprocess.run( [ "python", From fcd11cdf383aa84c313ac046b54ad085c433fe50 Mon Sep 17 00:00:00 2001 From: Sam West Date: Thu, 22 Feb 2024 09:18:52 +1100 Subject: [PATCH 16/21] remove shell=True in dd_to_csv run on non-windows OSes --- tests/test_transforms.py | 5 ----- utils/run_benchmarks.py | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 679cf30..2810f1b 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -88,11 +88,6 @@ def test_uc_wildcards(self): dictionary = pickle.load(f) df = df_in.copy() - # df_in = pd.read_parquet("tests/data/process_uc_wildcards_austimes_data.parquet") - # with open("tests/data/process_uc_wildcards_austimes_dict.pkl", "rb") as f: - # dictionary = pickle.load(f) - # df = df_in.query("region in ['ACT', 'NSW']") - t0 = datetime.now() # optimised functions diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index 
0eeb889..6548bf1 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -1,4 +1,5 @@ import argparse +import os import re import shutil import subprocess @@ -159,7 +160,7 @@ def run_benchmark( stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, - shell=True, + shell=True if os.name == "nt" else False, ) if res.returncode != 0: # Remove partial outputs From ff268dc5689435cba14f7289ec0335e546f5bc56 Mon Sep 17 00:00:00 2001 From: Sam West Date: Thu, 22 Feb 2024 13:22:35 +1100 Subject: [PATCH 17/21] addressed review comments from @olejandro organised imports, appeased linters --- tests/test_transforms.py | 6 +----- utils/run_benchmarks.py | 5 ++--- xl2times/__main__.py | 6 +++--- xl2times/excel.py | 1 - xl2times/transforms.py | 21 ++++----------------- 5 files changed, 10 insertions(+), 29 deletions(-) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 2810f1b..694c31f 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1,19 +1,14 @@ from datetime import datetime -from typing import Callable import pandas as pd -from loguru import logger from xl2times import transforms, utils, datatypes from xl2times.transforms import ( _process_comm_groups_vectorised, _count_comm_group_vectorised, - intersect, expand_rows, get_matching_commodities, - filter_by_pattern, get_matching_processes, - query_columns, _match_uc_wildcards, process_map, commodity_map, @@ -56,6 +51,7 @@ def make_str(df): lambda row: make_str(get_matching_commodities(row, dictionary)), axis=1 ) + query_columns = transforms.process_map.keys() | transforms.commodity_map.keys() cols_to_drop = [col for col in df.columns if col in query_columns] df = expand_rows( diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index 6548bf1..d6ae4e6 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -14,7 +14,6 @@ import git import pandas as pd import yaml -from loguru import logger from tabulate import tabulate from dd_to_csv import main @@ -452,11 +451,11 @@ def run_all_benchmarks( benchmarks_folder = spec["benchmarks_folder"] benchmark_names = [b["name"] for b in spec["benchmarks"]] if len(set(benchmark_names)) != len(benchmark_names): - logger.error(f"Found duplicate name in benchmarks YAML file") + logger.error("Found duplicate name in benchmarks YAML file") sys.exit(11) if args.dd and args.times_dir is None: - logger.error(f"--times_dir is required when using --dd") + logger.error("--times_dir is required when using --dd") sys.exit(12) if args.run is not None: diff --git a/xl2times/__main__.py b/xl2times/__main__.py index 97b6d1a..056d425 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -29,7 +29,7 @@ def convert_xl_to_times( pickle_file = "raw_tables.pkl" if use_pkl and os.path.isfile(pickle_file): raw_tables = pickle.load(open(pickle_file, "rb")) - logger.warning(f"Using pickled data not xlsx") + logger.warning("Using pickled data not xlsx") else: raw_tables = [] @@ -248,7 +248,7 @@ def produce_times_tables( result = {} used_tables = set() for mapping in config.times_xl_maps: - if not mapping.xl_name in input: + if mapping.xl_name not in input: logger.warning( f"Cannot produce table {mapping.times_name} because" f" {mapping.xl_name} does not exist" @@ -281,7 +281,7 @@ def produce_times_tables( # Excel columns can be duplicated into multiple Times columns for times_col, xl_col in mapping.col_map.items(): df[times_col] = df[xl_col] - cols_to_drop = [x for x in df.columns if not x in mapping.times_cols] + cols_to_drop = [x for x in df.columns 
if x not in mapping.times_cols] df.drop(columns=cols_to_drop, inplace=True) df.drop_duplicates(inplace=True) df.reset_index(drop=True, inplace=True) diff --git a/xl2times/excel.py b/xl2times/excel.py index b9c4597..102d88a 100644 --- a/xl2times/excel.py +++ b/xl2times/excel.py @@ -4,7 +4,6 @@ from typing import Dict, List import time from pandas.core.frame import DataFrame -import pandas as pd import numpy import re from . import datatypes diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 474221f..d0892ba 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -19,17 +19,6 @@ from . import utils from .utils import max_workers -query_columns = { - "pset_set", - "pset_pn", - "pset_pd", - "pset_ci", - "pset_co", - "cset_set", - "cset_cn", - "cset_cd", -} - csets_ordered_for_pcg = ["DEM", "MAT", "NRG", "ENV", "FIN"] default_pcg_suffixes = [ cset + io for cset in csets_ordered_for_pcg for io in ["I", "O"] @@ -584,8 +573,7 @@ def process_user_constraint_table( # TODO: apply table.uc_sets # Fill in UC_N blank cells with value from above - if "uc_n" in df.columns: - df["uc_n"] = df["uc_n"].ffill() + df["uc_n"] = df["uc_n"].ffill() data_columns = [ x for x in df.columns if x not in config.known_columns[datatypes.Tag.uc_t] @@ -2144,10 +2132,9 @@ def process_uc_wildcards( ) -> Dict[str, DataFrame]: tag = datatypes.Tag.uc_t - if tag in tqdm(tables, desc=f"Processing uc_wildcards on tables"): + if tag in tqdm(tables, desc="Processing uc_wildcards on tables"): start_time = time.time() df = tables[tag] - dictionary = generate_topology_dictionary(tables, model) df = _match_uc_wildcards( @@ -2241,7 +2228,7 @@ def match_wildcards( matching_commodities is None or len(matching_commodities) == 0 ): # TODO is this necessary? Try without? # TODO debug these - logger.warning(f"a row matched no processes or commodities") + logger.warning("a row matched no processes or commodities") return None return matching_processes, matching_commodities @@ -2343,7 +2330,7 @@ def eval_and_update( ): match = match_wildcards(row) if match is None: - logger.warning(f"TFM_INS-TXT row matched neither commodity nor process") + logger.warning("TFM_INS-TXT row matched neither commodity nor process") continue processes, commodities = match if commodities is not None: From a2cbeb204047bebefbe4c53bedca697c917f087d Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 23 Feb 2024 10:13:43 +1100 Subject: [PATCH 18/21] switched to lru_cache reused process/commodities maps updated --debug doc --- utils/run_benchmarks.py | 3 ++- xl2times/__main__.py | 2 +- xl2times/transforms.py | 22 +++++++--------------- xl2times/utils.py | 7 ++----- 4 files changed, 12 insertions(+), 22 deletions(-) diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index d6ae4e6..33cc790 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -443,7 +443,8 @@ def run_all_benchmarks( "--debug", action="store_true", default=False, - help="Run each benchmark as a function call to allow a debugger to stop at breakpoints in benchmark runs.", + help="Run each benchmark as a direct function call (disables subprocesses) to allow a debugger to stop at breakpoints " + "in benchmark runs.", ) args = args_parser.parse_args() diff --git a/xl2times/__main__.py b/xl2times/__main__.py index 056d425..c556d64 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -392,7 +392,7 @@ def dump_tables(tables: List, filename: str) -> List: return tables -def run(args) -> str | None: +def run(args: argparse.Namespace) -> str | None: """ 
From a2cbeb204047bebefbe4c53bedca697c917f087d Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 23 Feb 2024 10:13:43 +1100
Subject: [PATCH 18/21] switched to lru_cache

reused process/commodities maps
updated --debug doc

---
 utils/run_benchmarks.py |  3 ++-
 xl2times/__main__.py    |  2 +-
 xl2times/transforms.py  | 22 +++++++---------------
 xl2times/utils.py       |  7 ++-----
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index d6ae4e6..33cc790 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -443,7 +443,8 @@ def run_all_benchmarks(
         "--debug",
         action="store_true",
         default=False,
-        help="Run each benchmark as a function call to allow a debugger to stop at breakpoints in benchmark runs.",
+        help="Run each benchmark as a direct function call (disables subprocesses) to allow a debugger to stop at breakpoints "
+        "in benchmark runs.",
     )
     args = args_parser.parse_args()

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 056d425..c556d64 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -392,7 +392,7 @@ def dump_tables(tables: List, filename: str) -> List:
     return tables


-def run(args) -> str | None:
+def run(args: argparse.Namespace) -> str | None:
     """
     Runs the xl2times conversion.

     Args:
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index d0892ba..716661e 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2031,15 +2031,9 @@ def intersect(acc, df):
     return acc.merge(df)


-def get_matching_processes(row, dictionary):
+def get_matching_processes(row, dictionary) -> pd.Series:
     matching_processes = None
-    for col, key in [
-        ("pset_pn", "processes_by_name"),
-        ("pset_pd", "processes_by_desc"),
-        ("pset_set", "processes_by_sets"),
-        ("pset_ci", "processes_by_comm_in"),
-        ("pset_co", "processes_by_comm_out"),
-    ]:
+    for col, key in process_map.items():
         if row[col] is not None:
             proc_set = dictionary[key]
             pattern = row[col].upper()
@@ -2048,16 +2042,13 @@ def get_matching_processes(row, dictionary):
     if matching_processes is not None and any(matching_processes.duplicated()):
         raise ValueError("duplicated")
+
     return matching_processes


 def get_matching_commodities(row, dictionary):
     matching_commodities = None
-    for col, key in [
-        ("cset_cn", "commodities_by_name"),
-        ("cset_cd", "commodities_by_desc"),
-        ("cset_set", "commodities_by_sets"),
-    ]:
+    for col, key in commodity_map.items():
         if row[col] is not None:
             matching_commodities = intersect(
                 matching_commodities,
@@ -2168,7 +2159,7 @@ def _match_uc_wildcards(
         process_map: Mapping of column names to process sets.
         dictionary: Dictionary of process sets to match against.
         matcher: Matching function to use, e.g. get_matching_processes or get_matching_commodities.
-        result_col: Name of the column to store the matche results in.
+        result_col: Name of the column to store the matched results in.

     Returns:
         The table with the wildcard columns removed and the results of the wildcard matches added as a column named `results_col`
@@ -2180,7 +2171,8 @@ def _match_uc_wildcards(

     # match all the wildcards columns against the dictionary names
     matches = unique_filters.apply(lambda row: matcher(row, dictionary), axis=1)
-    # FIXME: work-around for matchers occasionally DataFrame or a Series
+
+    # we occasionally get a Dataframe back from the matchers. convert these to Series.
     matches = (
         matches.iloc[:, 0].to_list()
         if isinstance(matches, pd.DataFrame)
diff --git a/xl2times/utils.py b/xl2times/utils.py
index 3534c20..fcc10eb 100644
--- a/xl2times/utils.py
+++ b/xl2times/utils.py
@@ -181,22 +181,19 @@ def get_scalar(table_tag: str, tables: List[datatypes.EmbeddedXlTable]):
     return table.dataframe["value"].values[0]


-@functools.cache
 def has_negative_patterns(pattern):
     return pattern[0] == "-" or ",-" in pattern


-@functools.cache
 def remove_negative_patterns(pattern):
     return ",".join([word for word in pattern.split(",") if word[0] != "-"])


-@functools.cache
 def remove_positive_patterns(pattern):
     return ",".join([word[1:] for word in pattern.split(",") if word[0] == "-"])


-@functools.cache
+@functools.lru_cache(maxsize=int(1e6))
 def create_regexp(pattern):
     # exclude negative patterns
     if has_negative_patterns(pattern):
@@ -210,7 +207,7 @@ def create_regexp(pattern):
     return re.compile(pattern)


-@functools.cache
+@functools.lru_cache(maxsize=int(1e6))
 def create_negative_regexp(pattern):
     pattern = remove_positive_patterns(pattern)
     if len(pattern) == 0:
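Patch 18 swaps functools.cache (unbounded) for functools.lru_cache with an explicit maxsize on the regexp builders, so memory stays bounded when many distinct wildcard patterns are compiled. A minimal sketch of the pattern, using a simplified stand-in for create_regexp:

    import functools
    import re

    @functools.lru_cache(maxsize=int(1e6))
    def compile_pattern(pattern: str) -> re.Pattern:
        # Simplified stand-in: translate a comma-separated wildcard list to a regexp.
        parts = [p.replace("*", ".*").replace("?", ".") for p in pattern.split(",")]
        return re.compile("|".join(parts))

    print(compile_pattern("ELC*,COA?").match("ELCGAS") is not None)  # True
    print(compile_pattern.cache_info())  # hits/misses/currsize are tracked per pattern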
From f015daaba3a924cd804ad9a1a7b56c91346464d9 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 23 Feb 2024 12:40:15 +1100
Subject: [PATCH 19/21] logging tweaks

---
 xl2times/__main__.py | 7 +++++--
 xl2times/excel.py    | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index c556d64..73ca463 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -1,5 +1,7 @@
 import argparse
 from concurrent.futures import ProcessPoolExecutor
+from datetime import datetime
+
 from pandas.core.frame import DataFrame
 import pandas as pd
 import pickle
@@ -27,6 +29,7 @@ def convert_xl_to_times(
     stop_after_read: bool = False,
 ) -> Dict[str, DataFrame]:
     pickle_file = "raw_tables.pkl"
+    t0 = datetime.now()
     if use_pkl and os.path.isfile(pickle_file):
         raw_tables = pickle.load(open(pickle_file, "rb"))
         logger.warning("Using pickled data not xlsx")
@@ -40,12 +43,12 @@ def convert_xl_to_times(
                 raw_tables.extend(result)
         else:
             for f in input_files:
-                result = excel.extract_tables(f)
+                result = excel.extract_tables(str(Path(f).absolute()))
                 raw_tables.extend(result)
         pickle.dump(raw_tables, open(pickle_file, "wb"))
     logger.info(
         f"Extracted {len(raw_tables)} tables,"
-        f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows"
+        f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows in {datetime.now() - t0}"
     )

     if stop_after_read:
diff --git a/xl2times/excel.py b/xl2times/excel.py
index 102d88a..befde29 100644
--- a/xl2times/excel.py
+++ b/xl2times/excel.py
@@ -42,8 +42,8 @@ def extract_tables(filename: str) -> List[datatypes.EmbeddedXlTable]:
                     if len(parts) == 2:
                         uc_sets[parts[0].strip()] = parts[1].strip()
                     else:
-                        logger.info(
-                            f"WARNING: Malformed UC_SET in {sheet.title}, {filename}"
+                        logger.warning(
+                            f"Malformed UC_SET in {sheet.title}, {filename}"
                         )
                 else:
                     col_index = df.columns.get_loc(colname)
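Patch 19 times the Excel extraction with datetime.now() and reports the elapsed time through the logger. The same pattern in isolation, using the standard logging module and made-up counts (the project itself logs via loguru):

    import logging
    from datetime import datetime
    from time import sleep

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("xl2times.sketch")

    t0 = datetime.now()
    sleep(0.1)  # stand-in for the extraction work
    n_tables, n_rows = 12, 3456  # illustrative counts
    logger.info(f"Extracted {n_tables} tables, {n_rows} rows in {datetime.now() - t0}")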
From b00e68eaa53ec5dfc09e0a83768edcbfabf918ef Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 23 Feb 2024 12:41:43 +1100
Subject: [PATCH 20/21] Merged with main

# Conflicts:
#	xl2times/transforms.py

---
 xl2times/__main__.py              |   2 +-
 xl2times/config/times_mapping.txt |   2 +-
 xl2times/config/veda-tags.json    |   6 +-
 xl2times/datatypes.py             |   1 -
 xl2times/transforms.py            | 145 +++++++++++++++++++++---------
 5 files changed, 110 insertions(+), 46 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 73ca463..e502768 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -107,8 +107,8 @@ def convert_xl_to_times(
         transforms.process_uc_wildcards,
         transforms.process_wildcards,
         transforms.convert_aliases,
-        transforms.rename_cgs,
         transforms.fix_topology,
+        transforms.resolve_remaining_cgs,
         transforms.complete_dictionary,
         transforms.convert_to_string,
         lambda config, tables, model: dump_tables(
diff --git a/xl2times/config/times_mapping.txt b/xl2times/config/times_mapping.txt
index 8aba747..aaf5457 100644
--- a/xl2times/config/times_mapping.txt
+++ b/xl2times/config/times_mapping.txt
@@ -3,7 +3,7 @@ ALL_TS[ALL_TS] = TimeSlices(TS)
 B[DATAYEAR,VALUE] = TimePeriods(Year,B)
 COM[COM] = Commodities(Commodity)
 COM_DESC[REG,COM,TEXT] = Commodities(Region,Commodity,Description)
-COM_GMAP[REG,COM_GRP,COM] = CommodityGroupMap(Region,CommodityGroup,Commodity)
+COM_GMAP[REG,COM_GRP,COM] = CommodityGroups(Region,CommodityGroup,Commodity,Gmap:True)
 COM_GRP[COM_GRP] = CommodityGroups(CommodityGroup)
 COM_LIM[REG,COM,BD] = Commodities(Region,Commodity,LimType)
 COM_PEAK[REG,COM_GRP] = Attributes(Region,Commodity,Attribute:COM_PEAK,VALUE:1)
diff --git a/xl2times/config/veda-tags.json b/xl2times/config/veda-tags.json
index e84b265..09903ba 100644
--- a/xl2times/config/veda-tags.json
+++ b/xl2times/config/veda-tags.json
@@ -782,14 +782,14 @@
         {
           "name": "name",
           "aliases": [],
-          "use_name": "name",
+          "use_name": "commoditygroup",
           "row_ignore_symbol": [
             "\\I:",
             "*"
           ],
           "query_field": false,
-          "inherit_above": false,
-          "remove_first_row_if_absent": false,
+          "inherit_above": true,
+          "remove_first_row_if_absent": true,
           "remove_any_row_if_absent": false
         },
         {
diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index b0100c7..c815eff 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -150,7 +150,6 @@ class TimesModel:
     all_regions: Set[str] = field(default_factory=set)
     processes: DataFrame = field(default_factory=DataFrame)
     commodities: DataFrame = field(default_factory=DataFrame)
-    com_gmap: DataFrame = field(default_factory=DataFrame)
     commodity_groups: DataFrame = field(default_factory=DataFrame)
     topology: DataFrame = field(default_factory=DataFrame)
     trade: DataFrame = field(default_factory=DataFrame)
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index 716661e..9c74f33 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -1005,7 +1005,6 @@ def complete_dictionary(
         "Attributes": model.attributes,
         "Commodities": model.commodities,
         "CommodityGroups": model.commodity_groups,
-        "CommodityGroupMap": model.com_gmap,
         "Processes": model.processes,
         "Topology": model.topology,
         "Trade": model.trade,
@@ -1122,11 +1121,13 @@ def generate_commodity_groups(
     tables: List[datatypes.EmbeddedXlTable],
     model: datatypes.TimesModel,
 ) -> List[datatypes.EmbeddedXlTable]:
+    """
+    Generate commodity groups.
+    """
     process_tables = [t for t in tables if t.tag == datatypes.Tag.fi_process]
     commodity_tables = [t for t in tables if t.tag == datatypes.Tag.fi_comm]

     # Veda determines default PCG based on predetermined order and presence of OUT/IN commodity
-
     columns = ["region", "process", "primarycg"]
     reg_prc_pcg = pd.DataFrame(columns=columns)
     for process_table in process_tables:
@@ -1157,7 +1158,7 @@ def generate_commodity_groups(

     def name_comm_group(df):
         """
-        Return the name of a commodity group based on the member count
+        Return the name of a commodity group based on the member count.
         """

         if df["commoditygroup"] > 1:
@@ -1197,10 +1198,7 @@ def name_comm_group(df):

     # TODO: Include info from ~TFM_TOPINS e.g. include RSDAHT2 in addition to RSDAHT

-    i = comm_groups["commoditygroup"] != comm_groups["commodity"]
-
     model.topology = comm_groups
-    model.com_gmap = comm_groups.loc[i, ["region", "commoditygroup", "commodity"]]

     return tables

@@ -1208,7 +1206,7 @@ def name_comm_group(df):
 def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None:
     """
     Store the number of IN/OUT commodities of the same type per Region and Process in CommodityGroup.
-    `comm_groups` is modified in-place
+    `comm_groups` is modified in-place.
     Args:
         comm_groups: 'Process' DataFrame with additional columns "commoditygroup"
     """
@@ -1224,8 +1222,8 @@ def _count_comm_group_vectorised(comm_groups: pd.DataFrame) -> None:
 def _process_comm_groups_vectorised(
     comm_groups: pd.DataFrame, csets_ordered_for_pcg: list[str]
 ) -> pd.DataFrame:
-    """Sets the first commodity group in the list of csets_ordered_for_pcg as the default pcg for each region/process/io combination,
-    but setting the io="OUT" subset as default before "IN".
+    """Sets the first commodity group in the list of csets_ordered_for_pcg as the default
+    pcg for each region/process/io combination, but setting the io="OUT" subset as default before "IN".

     See: Section 3.7.2.2, pg 80. of `TIMES Documentation PART IV` for details.

@@ -1233,12 +1231,12 @@ def _process_comm_groups_vectorised(
         comm_groups: 'Process' DataFrame with columns ["region", "process", "io", "csets", "commoditygroup"]
         csets_ordered_for_pcg: List of csets in the order they should be considered for default pcg
     Returns:
-        Processed DataFrame with a new column "DefaultVedaPCG" set to True for the default pcg in each region/process/io combination.
+        Processed DataFrame with a new column "DefaultVedaPCG" set to True for the default pcg in each region/process/io combination.
     """

     def _set_default_veda_pcg(group):
-        """For a given [region, process] group, default group is set as the first cset in the `csets_ordered_for_pcg` list, which is an output, if
-        one exists, otherwise the first input."""
+        """For a given [region, process] group, default group is set as the first cset in the `csets_ordered_for_pcg`
+        list, which is an output, if one exists, otherwise the first input."""
         if not group["csets"].isin(csets_ordered_for_pcg).all():
             return group
""" - commodities = generate_topology_dictionary(tables, model)[ - "commodities_by_name" - ].rename(columns={"commodity": "commoditygroup"}) - cgs_in_top = model.topology["commoditygroup"].to_frame() - commodity_groups = pd.concat([commodities, cgs_in_top]) - model.commodity_groups = commodity_groups.drop_duplicates( - keep="first" - ).reset_index() + # Single member CGs i.e., CG and commodity are the same + single_cgs = model.commodities[["region", "commodity"]].drop_duplicates( + ignore_index=True + ) + single_cgs["commoditygroup"] = single_cgs["commodity"] + # Commodity groups from topology + top_cgs = model.topology[["region", "commodity", "commoditygroup"]].drop_duplicates( + ignore_index=True + ) + cgs = pd.concat([single_cgs, top_cgs], ignore_index=True) + cgs["gmap"] = cgs["commoditygroup"] != cgs["commodity"] + model.commodity_groups = cgs.dropna().drop_duplicates(ignore_index=True) return tables @@ -1413,17 +1415,9 @@ def expand_pcg_from_suffix(df): how="right", ) df = pd.concat([df, default_pcgs]) + # Keep last if a row appears more than once (disregard primarycg) df.drop_duplicates( - subset=[ - "sets", - "region", - "process", - "description", - "tact", - "tcap", - "tslvl", - "vintage", - ], + subset=[c for c in df.columns if c != "primarycg"], keep="last", inplace=True, ) @@ -1605,7 +1599,7 @@ def process_topology( model: datatypes.TimesModel, ) -> List[datatypes.EmbeddedXlTable]: """ - Create topology + Create topology. """ fit_tables = [t for t in tables if t.tag.startswith(datatypes.Tag.fi_t)] @@ -2209,13 +2203,21 @@ def process_wildcards( tables: Dict[str, DataFrame], model: datatypes.TimesModel, ) -> Dict[str, DataFrame]: + """ + Process wildcards specified in TFM tables. + """ + topology = generate_topology_dictionary(tables, model) def match_wildcards( row: pd.Series, ) -> tuple[DataFrame | None, DataFrame | None] | None: + """ + Return matching processes and commodities + """ matching_processes = get_matching_processes(row, topology) matching_commodities = get_matching_commodities(row, topology) + if (matching_processes is None or len(matching_processes) == 0) and ( matching_commodities is None or len(matching_commodities) == 0 ): # TODO is this necessary? Try without? 
@@ -2368,6 +2370,33 @@ def eval_and_update(
             new_tables.append(tables[datatypes.Tag.fi_t])
             tables[datatypes.Tag.fi_t] = pd.concat(new_tables, ignore_index=True)

+    if datatypes.Tag.tfm_comgrp in tables:
+        updates = tables[datatypes.Tag.tfm_comgrp]
+        table = model.commodity_groups
+        new_tables = []
+
+        # Expand each row by wildcards, then add to model.commodity_groups
+        for _, row in updates.iterrows():
+            match = match_wildcards(row)
+            # Convert series to dataframe; keep only relevant columns
+            new_rows = pd.DataFrame([row.filter(table.columns)])
+            # Match returns both processes and commodities, but only latter is relevant here
+            processes, commodities = match if match is not None else (None, None)
+            if commodities is None:
+                logger.warning(f"TFM_COMGRP row did not match any commodity")
+            else:
+                new_rows = commodities.merge(new_rows, how="cross")
+            new_tables.append(new_rows)
+
+        # Expand model.commodity_groups with user-defined commodity groups
+        if new_tables:
+            new_tables.append(model.commodity_groups)
+            commodity_groups = pd.concat(
+                new_tables, ignore_index=True
+            ).drop_duplicates()
+            commodity_groups.loc[commodity_groups["gmap"].isna(), ["gmap"]] = True
+            model.commodity_groups = commodity_groups.dropna()
+
     return tables

@@ -2530,18 +2559,54 @@ def convert_aliases(
     return tables


-def rename_cgs(
+def resolve_remaining_cgs(
     config: datatypes.Config,
     tables: Dict[str, DataFrame],
     model: datatypes.TimesModel,
 ) -> Dict[str, DataFrame]:
-    df = tables.get(datatypes.Tag.fi_t)
-    if df is not None:
-        i = df["other_indexes"].isin(default_pcg_suffixes)
-        df.loc[i, "other_indexes"] = (
-            df["process"].astype(str) + "_" + df["other_indexes"].astype(str)
-        )
-        tables[datatypes.Tag.fi_t] = df
+    """
+    Resolve commodity group names in model.attributes specified as commodity type.
+    Supplement model.commodity_groups with resolved commodity groups.
+    """
+
+    if not model.attributes.empty:
+        i = model.attributes["other_indexes"].isin(default_pcg_suffixes)
+        if any(i):
+            # Store processes with unresolved commodity groups
+            check_cgs = model.attributes.loc[
+                i, ["region", "process", "other_indexes"]
+            ].drop_duplicates(ignore_index=True)
+            # Resolve commodity group names in model.attributes
+            model.attributes.loc[i, "other_indexes"] = (
+                model.attributes["process"].astype(str)
+                + "_"
+                + model.attributes["other_indexes"].astype(str)
+            )
+            # TODO: Combine with above to avoid repetition
+            check_cgs["commoditygroup"] = (
+                check_cgs["process"].astype(str)
+                + "_"
+                + check_cgs["other_indexes"].astype(str)
+            )
+            check_cgs["csets"] = check_cgs["other_indexes"].str[:3]
+            check_cgs["io"] = check_cgs["other_indexes"].str[3:]
+            check_cgs["io"] = check_cgs["io"].replace({"I": "IN", "O": "OUT"})
+            check_cgs = check_cgs.drop(columns="other_indexes")
+            check_cgs = check_cgs.merge(
+                model.topology[
+                    ["region", "process", "commodity", "csets", "io"]
+                ].drop_duplicates(),
+                how="left",
+            )
+            check_cgs["gmap"] = True
+            check_cgs = pd.concat(
+                [
+                    model.commodity_groups,
+                    check_cgs[["region", "commodity", "commoditygroup", "gmap"]],
+                ],
+                ignore_index=True,
+            )
+            model.commodity_groups = check_cgs.drop_duplicates().dropna()

     return tables
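Patch 20 folds the old CommodityGroupMap table into model.commodity_groups, with a boolean gmap column that is True only where a group differs from its member commodity. A small sketch of how single-member and topology-derived groups combine, using made-up data and the column names from the diff:

    import pandas as pd

    commodities = pd.DataFrame({"region": ["REG1", "REG1"], "commodity": ["COA", "ELC"]})
    topology = pd.DataFrame(
        {"region": ["REG1"], "commodity": ["ELC"], "commoditygroup": ["PWR_NRG"]}
    )

    # Single-member groups: the group name is the commodity itself.
    single_cgs = commodities[["region", "commodity"]].drop_duplicates(ignore_index=True)
    single_cgs["commoditygroup"] = single_cgs["commodity"]

    # Groups from topology; gmap marks rows where group != member.
    top_cgs = topology[["region", "commodity", "commoditygroup"]].drop_duplicates(
        ignore_index=True
    )
    cgs = pd.concat([single_cgs, top_cgs], ignore_index=True)
    cgs["gmap"] = cgs["commoditygroup"] != cgs["commodity"]
    print(cgs)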
+ """ + + if not model.attributes.empty: + i = model.attributes["other_indexes"].isin(default_pcg_suffixes) + if any(i): + # Store processes with unresolved commodity groups + check_cgs = model.attributes.loc[ + i, ["region", "process", "other_indexes"] + ].drop_duplicates(ignore_index=True) + # Resolve commodity group names in model.attribues + model.attributes.loc[i, "other_indexes"] = ( + model.attributes["process"].astype(str) + + "_" + + model.attributes["other_indexes"].astype(str) + ) + # TODO: Combine with above to avoid repetition + check_cgs["commoditygroup"] = ( + check_cgs["process"].astype(str) + + "_" + + check_cgs["other_indexes"].astype(str) + ) + check_cgs["csets"] = check_cgs["other_indexes"].str[:3] + check_cgs["io"] = check_cgs["other_indexes"].str[3:] + check_cgs["io"] = check_cgs["io"].replace({"I": "IN", "O": "OUT"}) + check_cgs = check_cgs.drop(columns="other_indexes") + check_cgs = check_cgs.merge( + model.topology[ + ["region", "process", "commodity", "csets", "io"] + ].drop_duplicates(), + how="left", + ) + check_cgs["gmap"] = True + check_cgs = pd.concat( + [ + model.commodity_groups, + check_cgs[["region", "commodity", "commoditygroup", "gmap"]], + ], + ignore_index=True, + ) + model.commodity_groups = check_cgs.drop_duplicates().dropna() return tables From ab76d8dda6757fa4619fa251d1ee26d52b6bd228 Mon Sep 17 00:00:00 2001 From: Sam West Date: Fri, 23 Feb 2024 13:09:49 +1100 Subject: [PATCH 21/21] added extra check in matchers --- xl2times/transforms.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 9c74f33..2bac644 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -2025,11 +2025,11 @@ def intersect(acc, df): return acc.merge(df) -def get_matching_processes(row, dictionary) -> pd.Series: +def get_matching_processes(row: pd.Series, topology: Dict[str, DataFrame]) -> pd.Series: matching_processes = None for col, key in process_map.items(): - if row[col] is not None: - proc_set = dictionary[key] + if col in row.index and row[col] is not None: + proc_set = topology[key] pattern = row[col].upper() filtered = filter_by_pattern(proc_set, pattern) matching_processes = intersect(matching_processes, filtered) @@ -2040,13 +2040,13 @@ def get_matching_processes(row, dictionary) -> pd.Series: return matching_processes -def get_matching_commodities(row, dictionary): +def get_matching_commodities(row: pd.Series, topology: Dict[str, DataFrame]): matching_commodities = None for col, key in commodity_map.items(): - if row[col] is not None: + if col in row.index and row[col] is not None: matching_commodities = intersect( matching_commodities, - filter_by_pattern(dictionary[key], row[col].upper()), + filter_by_pattern(topology[key], row[col].upper()), ) return matching_commodities