From bf5ea29a4f43d4dac91226dfee2ac362cc8a7671 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Sat, 23 Mar 2024 11:53:52 +1100
Subject: [PATCH] dd_to_csv bugfixes, move cache to `~/.cache/xl2times/` (#230)

Just a minor bugfix and some housekeeping.

- Fixes a bug in dd_to_csv.py where DD variable names containing spaces
  weren't parsed correctly.
- Makes dd_to_csv a second console script in pyproject.toml (we'd like to
  use it from the austimes repo, and couldn't easily otherwise).
- Moves the pickle cache to ~/.cache/xl2times.
- Adds cache invalidation: cache files older than 365 days are deleted
  (couldn't think of a better criterion).

---------

Co-authored-by: Siddharth Krishna
---
 .github/workflows/ci.yml         | 10 +++--
 pyproject.toml                   |  1 +
 utils/run_benchmarks.py          |  6 ++-
 xl2times/__main__.py             | 70 +++++++++++++++++++++-----------
 xl2times/config/times-info.json  |  2 +-
 {utils => xl2times}/dd_to_csv.py | 21 +++++-----
 xl2times/transforms.py           |  2 +-
 7 files changed, 69 insertions(+), 43 deletions(-)
 rename {utils => xl2times}/dd_to_csv.py (93%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bd2b3b6..d9aed4b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,6 +21,7 @@ jobs:
       REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
       REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
       REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
+      CACHE_KEY: 0 # Use this for manual cache key bumps, e.g., when caching code changes

     steps:
       - uses: actions/checkout@v3
@@ -115,11 +116,12 @@ jobs:
         id: cache
         uses: actions/cache/restore@v4
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
+          path: ~/.cache/xl2times
           # Cache key is refs of the input xlsx repos, since that's what is cached
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
           # If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
           restore-keys: |
+            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-
            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
            ${{ runner.os }}-py-${{ env.PY_VERSION }}-
@@ -166,5 +168,5 @@ jobs:
         # Save the cache even if the regression tests fail
         if: always() && !steps.cache-restore.outputs.cache-hit
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          path: ~/.cache/xl2times
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
diff --git a/pyproject.toml b/pyproject.toml
index f7ca779..1f9b15c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times"

 [project.scripts]
 xl2times = "xl2times.__main__:main"
+dd_to_csv = "xl2times.dd_to_csv:main"

 [tool.pytest.ini_options]
 # don't print runtime warnings
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index bbf18e1..e28f5ad 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -14,11 +14,11 @@
 import git
 import pandas as pd
 import yaml
-from dd_to_csv import main
 from tabulate import tabulate

 from xl2times import utils
 from xl2times.__main__ import parse_args, run
+from xl2times.dd_to_csv import main
 from xl2times.utils import max_workers

 logger = utils.get_logger()
@@ -155,7 +155,7 @@ def run_benchmark(
         res = subprocess.run(
             [
                 "python",
-                "utils/dd_to_csv.py",
"utils/dd_to_csv.py", + "xl2times/dd_to_csv.py", dd_folder, csv_folder, ], @@ -164,6 +164,7 @@ def run_benchmark( text=True, shell=True if os.name == "nt" else False, check=False, + encoding="utf-8", ) if res.returncode != 0: # Remove partial outputs @@ -208,6 +209,7 @@ def run_benchmark( stderr=subprocess.STDOUT, text=True, check=False, + encoding="utf-8", ) else: # If debug option is set, run as a function call to allow stepping with a debugger. diff --git a/xl2times/__main__.py b/xl2times/__main__.py index c24675c..7141f96 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -5,13 +5,13 @@ import sys import time from concurrent.futures import ProcessPoolExecutor -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path +import numpy as np import pandas as pd from pandas.core.frame import DataFrame -from xl2times import __file__ as xl2times_file_path from xl2times.utils import max_workers from . import excel, transforms, utils @@ -20,33 +20,54 @@ logger = utils.get_logger() -cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/" -os.makedirs(cache_dir, exist_ok=True) +cache_dir = Path.home() / ".cache/xl2times/" +cache_dir.mkdir(exist_ok=True, parents=True) -def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]: +def invalidate_cache(max_age: timedelta = timedelta(days=365)): + """ + Delete any cache files older than max_age. + + Args: + max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted. + """ + for file in cache_dir.glob("*.pkl"): + if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age: + try: + file.unlink() + except Exception as e: + logger.warning(f"Failed to delete old cache file {file}. {e}") + + +def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]: """Extract EmbeddedXlTables from xlsx file (cached). Since excel.extract_tables is quite slow, we cache its results in `cache_dir`. Each file is named by the hash of the contents of an xlsx file, and contains a tuple (filename, modified timestamp, [EmbeddedXlTable]). + + Args: + filename: Path to the xlsx file to extract tables from. 
""" - with open(filename, "rb") as f: + filename = Path(filename).resolve() + with filename.open("rb") as f: digest = hashlib.file_digest(f, "sha256") # pyright: ignore hsh = digest.hexdigest() - if os.path.isfile(cache_dir + hsh): - fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb")) - # In the extremely unlikely event that we have a hash collision, also check that - # the filename is the same: - # TODO check modified timestamp also matches - if filename == fname1: - logger.info(f"Using cached data for {filename} from {cache_dir + hsh}") - return tables - # Write extracted data to cache: - tables = excel.extract_tables(filename) - pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb")) - logger.info(f"Saved cache for {filename} to {cache_dir + hsh}") - return excel.extract_tables(filename) + cached_file = (cache_dir / f"{Path(filename).stem}_{hsh}.pkl").resolve() + + if cached_file.exists(): + # just load and return the cached pickle + with cached_file.open("rb") as f: + tables = pickle.load(f) + logger.info(f"Using cached data for {filename} from {cached_file}") + else: + # extract data and write it to cache before returning it + tables = excel.extract_tables(str(filename)) + with cached_file.open("wb") as f: + pickle.dump(tables, f) + logger.info(f"Saved cache for {filename} to {cached_file}") + + return tables def convert_xl_to_times( @@ -59,6 +80,8 @@ def convert_xl_to_times( stop_after_read: bool = False, ) -> dict[str, DataFrame]: start_time = datetime.now() + + invalidate_cache() with ProcessPoolExecutor(max_workers) as executor: raw_tables = executor.map( excel.extract_tables if no_cache else _read_xlsx_cached, input_files @@ -180,10 +203,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str): def read_csv_tables(input_dir: str) -> dict[str, DataFrame]: result = {} - for filename in os.listdir(input_dir): - result[filename.split(".")[0]] = pd.read_csv( - os.path.join(input_dir, filename), dtype=str - ) + csv_files = list(Path(input_dir).glob("*.csv")) + for filename in csv_files: + result[filename.stem] = pd.read_csv(filename, dtype=str) return result @@ -253,7 +275,7 @@ def compare( index=False, ) result = ( - f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present" + f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present" f" in output ({total_correct_rows}/{total_gt_rows})" f", {total_additional_rows} additional rows" ) diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json index 8ce8ac4..d087ba7 100644 --- a/xl2times/config/times-info.json +++ b/xl2times/config/times-info.json @@ -4707,7 +4707,7 @@ }, { "name": "UC_DYNBND", - "gams-cat": "parameter", + "gams-cat": "md-set", "indexes": [ "UC_N", "LIM" diff --git a/utils/dd_to_csv.py b/xl2times/dd_to_csv.py similarity index 93% rename from utils/dd_to_csv.py rename to xl2times/dd_to_csv.py index 8b32c18..46f652b 100644 --- a/utils/dd_to_csv.py +++ b/xl2times/dd_to_csv.py @@ -13,7 +13,7 @@ def parse_parameter_values_from_file( path: Path, ) -> tuple[dict[str, list], dict[str, set]]: - """Parse *.dd to turn it into CSV format. + """Parse a `*.dd` file and extract the sets and parameters. There are parameters and sets, and each has a slightly different format. `*.dd` files have data of the following form: @@ -53,20 +53,19 @@ def parse_parameter_values_from_file( param_data = [] # Parse values until a line with / is encountered. 
             while not data[index].startswith("/") and data[index] != "":
-                words = data[index].split(" ")
+                line = data[index]
                 # Either "value" for a scalar, or "key value" for an array.
-                if len(words) == 1:
-                    attributes = []
-                elif len(words) == 2:
-                    attributes = words[0].split(".")
-                    attributes = [a if " " in a else a.strip("'") for a in attributes]
+                # So the value is always the last (or only) token on the line.
+                split_point = line.rfind(" ")
+
+                if split_point == -1:
+                    # Only one token: a bare value with no key.
+                    attributes, value = [], line
                 else:
-                    raise ValueError(
-                        f"Unexpected number of spaces in parameter value setting: {data[index]}"
-                    )
+                    attributes, value = line[:split_point], line[split_point + 1 :]
+                    attributes = attributes.split(".")
+                    attributes = [a if " " in a else a.strip("'") for a in attributes]

-                value = words[-1]
                 param_data.append([*attributes, value])
                 index += 1
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index e82adb0..8d7d3c6 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2511,7 +2511,7 @@ def apply_transform_tables(
             )

             if not any(rows_to_update):
-                logger.info(f"A {Tag.tfm_mig.value} row generated no records.")
+                logger.warning(f"A {Tag.tfm_mig.value} row generated no records.")
                 continue

             new_rows = table.loc[rows_to_update].copy()
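
For illustration, the parsing change in dd_to_csv.py can be exercised on its own: taking the value as everything after the last space keeps quoted set elements that themselves contain spaces intact, whereas the old `split(" ")` approach raised a ValueError for them. The following is a minimal standalone sketch with made-up example lines, not code taken from the patch:

    # Standalone sketch of the new last-space parsing (example lines are made up).
    def parse_dd_value_line(line: str) -> list[str]:
        split_point = line.rfind(" ")
        if split_point == -1:
            # A bare value with no key (scalar parameter).
            return [line]
        attributes, value = line[:split_point], line[split_point + 1 :]
        attributes = attributes.split(".")
        # As in the patch: strip quotes only from elements without embedded spaces.
        attributes = [a if " " in a else a.strip("'") for a in attributes]
        return [*attributes, value]

    assert parse_dd_value_line("'ELCGEN'.'2020' 0.5") == ["ELCGEN", "2020", "0.5"]
    assert parse_dd_value_line("'ELC GEN'.'2020' 0.5") == ["'ELC GEN'", "2020", "0.5"]
    assert parse_dd_value_line("42") == ["42"]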
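
With the new [project.scripts] entry, the converter is also available as a `dd_to_csv` command after installation, taking the same two positional arguments as the subprocess call in run_benchmarks.py (input DD folder, then output CSV folder). Below is a hypothetical invocation from Python; it assumes `main` accepts an argv-style list, which is how console-script entry points are typically written, and the folder names are placeholders:

    # Hypothetical usage; folder names are placeholders.
    from xl2times.dd_to_csv import main

    main(["input_dd_dir", "output_csv_dir"])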