From bf5ea29a4f43d4dac91226dfee2ac362cc8a7671 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Sat, 23 Mar 2024 11:53:52 +1100
Subject: [PATCH] dd_to_csv bugfixes, move cache to `~/.cache/xl2times/` (#230)

Just a minor bugfix and some housekeeping.

- Fixes a bug in dd_to_csv.py where DD variable names containing spaces
  weren't parsed correctly.
- Makes dd_to_csv a second console script in pyproject.toml (we'd like to
  use it from the austimes repo, and couldn't easily otherwise).
- Moves the pickle cache to ~/.cache/xl2times.
- Adds cache invalidation: cache files older than 365 days are deleted
  (couldn't think of a better criterion).

---------

Co-authored-by: Siddharth Krishna
---
 .github/workflows/ci.yml         | 10 +++--
 pyproject.toml                   |  1 +
 utils/run_benchmarks.py          |  6 ++-
 xl2times/__main__.py             | 70 +++++++++++++++++++++-----------
 xl2times/config/times-info.json  |  2 +-
 {utils => xl2times}/dd_to_csv.py | 21 +++++-----
 xl2times/transforms.py           |  2 +-
 7 files changed, 69 insertions(+), 43 deletions(-)
 rename {utils => xl2times}/dd_to_csv.py (93%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bd2b3b6..d9aed4b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,6 +21,7 @@ jobs:
       REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
       REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
       REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
+      CACHE_KEY: 0 # Use this for manual cache key bumps, e.g., when caching code changes

     steps:
       - uses: actions/checkout@v3
@@ -115,11 +116,12 @@ jobs:
         id: cache
         uses: actions/cache/restore@v4
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
+          path: ~/.cache/xl2times
           # Cache key is refs of the input xlsx repos, since that's what is cached
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
           # If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
           restore-keys: |
+            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-
            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
            ${{ runner.os }}-py-${{ env.PY_VERSION }}-
@@ -166,5 +168,5 @@ jobs:
         # Save the cache even if the regression tests fail
         if: always() && !steps.cache-restore.outputs.cache-hit
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          path: ~/.cache/xl2times
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
diff --git a/pyproject.toml b/pyproject.toml
index f7ca779..1f9b15c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times"

 [project.scripts]
 xl2times = "xl2times.__main__:main"
+dd_to_csv = "xl2times.dd_to_csv:main"

 [tool.pytest.ini_options]
 # don't print runtime warnings
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index bbf18e1..e28f5ad 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -14,11 +14,11 @@
 import git
 import pandas as pd
 import yaml
-from dd_to_csv import main
 from tabulate import tabulate

 from xl2times import utils
 from xl2times.__main__ import parse_args, run
+from xl2times.dd_to_csv import main
 from xl2times.utils import max_workers

 logger = utils.get_logger()
@@ -155,7 +155,7 @@ def run_benchmark(
         res = subprocess.run(
             [
                 "python",
-                "utils/dd_to_csv.py",
"utils/dd_to_csv.py", + "xl2times/dd_to_csv.py", dd_folder, csv_folder, ], @@ -164,6 +164,7 @@ def run_benchmark( text=True, shell=True if os.name == "nt" else False, check=False, + encoding="utf-8", ) if res.returncode != 0: # Remove partial outputs @@ -208,6 +209,7 @@ def run_benchmark( stderr=subprocess.STDOUT, text=True, check=False, + encoding="utf-8", ) else: # If debug option is set, run as a function call to allow stepping with a debugger. diff --git a/xl2times/__main__.py b/xl2times/__main__.py index c24675c..7141f96 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -5,13 +5,13 @@ import sys import time from concurrent.futures import ProcessPoolExecutor -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path +import numpy as np import pandas as pd from pandas.core.frame import DataFrame -from xl2times import __file__ as xl2times_file_path from xl2times.utils import max_workers from . import excel, transforms, utils @@ -20,33 +20,54 @@ logger = utils.get_logger() -cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/" -os.makedirs(cache_dir, exist_ok=True) +cache_dir = Path.home() / ".cache/xl2times/" +cache_dir.mkdir(exist_ok=True, parents=True) -def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]: +def invalidate_cache(max_age: timedelta = timedelta(days=365)): + """ + Delete any cache files older than max_age. + + Args: + max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted. + """ + for file in cache_dir.glob("*.pkl"): + if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age: + try: + file.unlink() + except Exception as e: + logger.warning(f"Failed to delete old cache file {file}. {e}") + + +def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]: """Extract EmbeddedXlTables from xlsx file (cached). Since excel.extract_tables is quite slow, we cache its results in `cache_dir`. Each file is named by the hash of the contents of an xlsx file, and contains a tuple (filename, modified timestamp, [EmbeddedXlTable]). + + Args: + filename: Path to the xlsx file to extract tables from. 
""" - with open(filename, "rb") as f: + filename = Path(filename).resolve() + with filename.open("rb") as f: digest = hashlib.file_digest(f, "sha256") # pyright: ignore hsh = digest.hexdigest() - if os.path.isfile(cache_dir + hsh): - fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb")) - # In the extremely unlikely event that we have a hash collision, also check that - # the filename is the same: - # TODO check modified timestamp also matches - if filename == fname1: - logger.info(f"Using cached data for {filename} from {cache_dir + hsh}") - return tables - # Write extracted data to cache: - tables = excel.extract_tables(filename) - pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb")) - logger.info(f"Saved cache for {filename} to {cache_dir + hsh}") - return excel.extract_tables(filename) + cached_file = (cache_dir / f"{Path(filename).stem}_{hsh}.pkl").resolve() + + if cached_file.exists(): + # just load and return the cached pickle + with cached_file.open("rb") as f: + tables = pickle.load(f) + logger.info(f"Using cached data for {filename} from {cached_file}") + else: + # extract data and write it to cache before returning it + tables = excel.extract_tables(str(filename)) + with cached_file.open("wb") as f: + pickle.dump(tables, f) + logger.info(f"Saved cache for {filename} to {cached_file}") + + return tables def convert_xl_to_times( @@ -59,6 +80,8 @@ def convert_xl_to_times( stop_after_read: bool = False, ) -> dict[str, DataFrame]: start_time = datetime.now() + + invalidate_cache() with ProcessPoolExecutor(max_workers) as executor: raw_tables = executor.map( excel.extract_tables if no_cache else _read_xlsx_cached, input_files @@ -180,10 +203,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str): def read_csv_tables(input_dir: str) -> dict[str, DataFrame]: result = {} - for filename in os.listdir(input_dir): - result[filename.split(".")[0]] = pd.read_csv( - os.path.join(input_dir, filename), dtype=str - ) + csv_files = list(Path(input_dir).glob("*.csv")) + for filename in csv_files: + result[filename.stem] = pd.read_csv(filename, dtype=str) return result @@ -253,7 +275,7 @@ def compare( index=False, ) result = ( - f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present" + f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present" f" in output ({total_correct_rows}/{total_gt_rows})" f", {total_additional_rows} additional rows" ) diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json index 8ce8ac4..d087ba7 100644 --- a/xl2times/config/times-info.json +++ b/xl2times/config/times-info.json @@ -4707,7 +4707,7 @@ }, { "name": "UC_DYNBND", - "gams-cat": "parameter", + "gams-cat": "md-set", "indexes": [ "UC_N", "LIM" diff --git a/utils/dd_to_csv.py b/xl2times/dd_to_csv.py similarity index 93% rename from utils/dd_to_csv.py rename to xl2times/dd_to_csv.py index 8b32c18..46f652b 100644 --- a/utils/dd_to_csv.py +++ b/xl2times/dd_to_csv.py @@ -13,7 +13,7 @@ def parse_parameter_values_from_file( path: Path, ) -> tuple[dict[str, list], dict[str, set]]: - """Parse *.dd to turn it into CSV format. + """Parse a `*.dd` file and extract the sets and parameters. There are parameters and sets, and each has a slightly different format. `*.dd` files have data of the following form: @@ -53,20 +53,19 @@ def parse_parameter_values_from_file( param_data = [] # Parse values until a line with / is encountered. 
             while not data[index].startswith("/") and data[index] != "":
-                words = data[index].split(" ")
+                line = data[index]
                 # Either "value" for a scalar, or "key value" for an array.
-                if len(words) == 1:
-                    attributes = []
-                elif len(words) == 2:
-                    attributes = words[0].split(".")
-                    attributes = [a if " " in a else a.strip("'") for a in attributes]
+                # So the value is always the last (or only) token on the line.
+                split_point = line.rfind(" ")
+
+                if split_point == -1:
+                    # Only one token: a bare value with no key.
+                    attributes, value = [], line
                 else:
-                    raise ValueError(
-                        f"Unexpected number of spaces in parameter value setting: {data[index]}"
-                    )
+                    attributes, value = line[:split_point], line[split_point + 1 :]
+                    attributes = attributes.split(".")
+                    attributes = [a if " " in a else a.strip("'") for a in attributes]

-                value = words[-1]
                 param_data.append([*attributes, value])
                 index += 1
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index e82adb0..8d7d3c6 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2511,7 +2511,7 @@ def apply_transform_tables(
             )

             if not any(rows_to_update):
-                logger.info(f"A {Tag.tfm_mig.value} row generated no records.")
+                logger.warning(f"A {Tag.tfm_mig.value} row generated no records.")
                 continue

             new_rows = table.loc[rows_to_update].copy()
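
For illustration, the parsing change in dd_to_csv.py can be exercised on its own: taking the value as everything after the last space keeps quoted set elements that themselves contain spaces intact, whereas the old `split(" ")` approach raised a ValueError for them. The following is a minimal standalone sketch with made-up example lines, not code taken from the patch:

    # Standalone sketch of the new last-space parsing (example lines are made up).
    def parse_dd_value_line(line: str) -> list[str]:
        split_point = line.rfind(" ")
        if split_point == -1:
            # A bare value with no key (scalar parameter).
            return [line]
        attributes, value = line[:split_point], line[split_point + 1 :]
        attributes = attributes.split(".")
        # As in the patch: strip quotes only from elements without embedded spaces.
        attributes = [a if " " in a else a.strip("'") for a in attributes]
        return [*attributes, value]

    assert parse_dd_value_line("'ELCGEN'.'2020' 0.5") == ["ELCGEN", "2020", "0.5"]
    assert parse_dd_value_line("'ELC GEN'.'2020' 0.5") == ["'ELC GEN'", "2020", "0.5"]
    assert parse_dd_value_line("42") == ["42"]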
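
With the new [project.scripts] entry, the converter is also available as a `dd_to_csv` command after installation, taking the same two positional arguments as the subprocess call in run_benchmarks.py (input DD folder, then output CSV folder). Below is a hypothetical invocation from Python; it assumes `main` accepts an argv-style list, which is how console-script entry points are typically written, and the folder names are placeholders:

    # Hypothetical usage; folder names are placeholders.
    from xl2times.dd_to_csv import main

    main(["input_dd_dir", "output_csv_dir"])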