From bab423c25986acc36cf6f0b1d47ef5b1414b4e3c Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 14 Mar 2024 13:34:42 +1100
Subject: [PATCH 01/11] fixed dd_to_csv parsing for lines with multiple spaces in the attribute names.

corrected the gams-cat for UC_DYNBND (see #218)
---
 utils/dd_to_csv.py | 19 +++++++++----------
 xl2times/config/times-info.json | 2 +-
 xl2times/datatypes.py | 8 +++++++-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/utils/dd_to_csv.py b/utils/dd_to_csv.py
index 7d26227..6437751 100644
--- a/utils/dd_to_csv.py
+++ b/utils/dd_to_csv.py
@@ -55,20 +55,19 @@ def parse_parameter_values_from_file(
             param_data = []
             # Parse values until a line with / is encountered.
             while not data[index].startswith("/") and data[index] != "":
-                words = data[index].split(" ")
+                line = data[index]
                 # Either "value" for a scalar, or "key value" for an array.
-                if len(words) == 1:
-                    attributes = []
-                elif len(words) == 2:
-                    attributes = words[0].split(".")
-                    attributes = [a if " " in a else a.strip("'") for a in attributes]
+                # So value is always the last word, or only token
+                split_point = line.rfind(" ")
+                if split_point == -1:
+                    # if only one word
+                    attributes, value = [], line
                 else:
-                    raise ValueError(
-                        f"Unexpected number of spaces in parameter value setting: {data[index]}"
-                    )
+                    attributes, value = line[:split_point], line[split_point + 1 :]
+                    attributes = attributes.split(".")
+                    attributes = [a if " " in a else a.strip("'") for a in attributes]
 
-                value = words[-1]
                 param_data.append([*attributes, value])
 
                 index += 1
 
diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json
index 4dae96a..07b12f6 100644
--- a/xl2times/config/times-info.json
+++ b/xl2times/config/times-info.json
@@ -4707,7 +4707,7 @@
   },
   {
     "name": "UC_DYNBND",
-    "gams-cat": "parameter",
+    "gams-cat": "md-set",
     "indexes": [
       "UC_N",
       "LIM"
diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index 7d3c433..76f475c 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -175,7 +175,13 @@ def data_years(self) -> set[int]:
         data_years = set()
         for attributes in [self.attributes, self.uc_attributes]:
             if not attributes.empty:
-                data_years.update(attributes["year"].astype(int).values)
+                int_years = attributes["year"].astype(
+                    int, errors="ignore"
+                )  # leave non-parseable vals alone
+                int_years = [
+                    y for y in int_years if isinstance(y, int)
+                ]  # remove non-parseable years
+                data_years.update(int_years)
         # Remove interpolation rules before return
         return {y for y in data_years if y >= 1000}

From fd010053b56c5cd6a2a1632a07bff9f18842f635 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 15:52:11 +1100
Subject: [PATCH 02/11] moved dd_to_csv to src/ and added as a script in pyproject.toml

moved cache dir to standard ~/.cache/xl2times dir to avoid creating inside .venv subdir
---
 pyproject.toml | 1 +
 utils/run_benchmarks.py | 4 +++-
 xl2times/__main__.py | 14 ++++++++------
 {utils => xl2times}/dd_to_csv.py | 0
 xl2times/transforms.py | 2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)
 rename {utils => xl2times}/dd_to_csv.py (100%)

diff --git a/pyproject.toml b/pyproject.toml
index 3fb1011..2a12e82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times"
 
 [project.scripts]
 xl2times = "xl2times.__main__:main"
+dd_to_csv = "xl2times.dd_to_csv:main"
 
 [tool.pytest.ini_options]
 # don't print runtime warnings
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index bbf18e1..a2a64c0 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -14,11 +14,11 @@
 import git
 import pandas as pd
 import yaml
-from dd_to_csv import main
 from tabulate import tabulate
 
 from xl2times import utils
 from xl2times.__main__ import parse_args, run
+from xl2times.dd_to_csv import main
 from xl2times.utils import max_workers
 
 logger = utils.get_logger()
@@ -164,6 +164,7 @@ def run_benchmark(
         text=True,
         shell=True if os.name == "nt" else False,
         check=False,
+        encoding="utf-8",
     )
     if res.returncode != 0:
         # Remove partial outputs
@@ -208,6 +209,7 @@ def run_benchmark(
             stderr=subprocess.STDOUT,
             text=True,
             check=False,
+            encoding="utf-8",
         )
     else:
         # If debug option is set, run as a function call to allow stepping with a debugger.
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index b64a190..0515d05 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -8,10 +8,10 @@
 from datetime import datetime
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
 
-from xl2times import __file__ as xl2times_file_path
 from xl2times.utils import max_workers
 
 from . import datatypes, excel, transforms, utils
@@ -19,7 +19,7 @@
 
 logger = utils.get_logger()
 
-cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
+cache_dir = str(Path.home() / ".cache/xl2times/")
 os.makedirs(cache_dir, exist_ok=True)
 
 
@@ -34,7 +34,8 @@ def _read_xlsx_cached(filename: str) -> list[datatypes.EmbeddedXlTable]:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
     if os.path.isfile(cache_dir + hsh):
-        fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
+        with open(cache_dir + hsh, "rb") as f:
+            fname1, _timestamp, tables = pickle.load(f)
         # In the extremely unlikely event that we have a hash collision, also check that
         # the filename is the same:
         # TODO check modified timestamp also matches
@@ -178,8 +179,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str):
 
 def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
-    for filename in os.listdir(input_dir):
-        result[filename.split(".")[0]] = pd.read_csv(
+    csv_files = list(Path(input_dir).glob("*.csv"))
+    for filename in csv_files:
+        result[str(filename).split(".")[0]] = pd.read_csv(
             os.path.join(input_dir, filename), dtype=str
         )
     return result
@@ -251,7 +253,7 @@ def compare(
         index=False,
     )
     result = (
-        f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present"
+        f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present"
         f" in output ({total_correct_rows}/{total_gt_rows})"
         f", {total_additional_rows} additional rows"
     )
diff --git a/utils/dd_to_csv.py b/xl2times/dd_to_csv.py
similarity index 100%
rename from utils/dd_to_csv.py
rename to xl2times/dd_to_csv.py
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index c62eef4..949b1f6 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2398,7 +2398,7 @@ def apply_transform_tables(
                 )
 
                 if not any(rows_to_update):
-                    logger.info(
+                    logger.warning(
                         f"A {datatypes.Tag.tfm_mig.value} row generated no records."
                     )
                     continue

From 86a1480735bc0714e1a851012f1b10fcd3029252 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:19:22 +1100
Subject: [PATCH 03/11] Added cache invalidation for old files

---
 xl2times/__main__.py | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 0515d05..d44d4e8 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -5,7 +5,7 @@
 import sys
 import time
 from concurrent.futures import ProcessPoolExecutor
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 import numpy as np
@@ -19,8 +19,23 @@
 
 logger = utils.get_logger()
 
-cache_dir = str(Path.home() / ".cache/xl2times/")
-os.makedirs(cache_dir, exist_ok=True)
+cache_dir = Path.home() / ".cache/xl2times/"
+cache_dir.mkdir(exist_ok=True, parents=True)
+
+
+def invalidate_cache(max_age: timedelta = timedelta(days=365)):
+    """
+    Delete any cache files older than max_age.
+
+    Args:
+        max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted.
+    """
+    for file in cache_dir.glob("*.pkl"):
+        if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
+            try:
+                file.unlink()
+            except Exception as e:
+                logger.warning(f"Failed to delete old cache file {file}. {e}")
 
 
 def _read_xlsx_cached(filename: str) -> list[datatypes.EmbeddedXlTable]:
@@ -29,23 +44,29 @@ def _read_xlsx_cached(filename: str) -> list[datatypes.EmbeddedXlTable]:
     """Extract EmbeddedXlTables from xlsx file (cached).
 
     Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
     Each file is named by the hash of the contents of an xlsx file, and contains
     a tuple (filename, modified timestamp, [EmbeddedXlTable]).
+
+    Args:
+        filename: Path to the xlsx file to extract tables from.
""" with open(filename, "rb") as f: digest = hashlib.file_digest(f, "sha256") # pyright: ignore hsh = digest.hexdigest() - if os.path.isfile(cache_dir + hsh): - with open(cache_dir + hsh, "rb") as f: + hash_file = cache_dir / f"{Path(filename).stem}_{hsh}.pkl" + if hash_file.is_file(): + with hash_file.open("rb") as f: fname1, _timestamp, tables = pickle.load(f) # In the extremely unlikely event that we have a hash collision, also check that # the filename is the same: # TODO check modified timestamp also matches if filename == fname1: - logger.info(f"Using cached data for {filename} from {cache_dir + hsh}") + logger.info(f"Using cached data for {filename} from {hash_file}") return tables # Write extracted data to cache: tables = excel.extract_tables(filename) - pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb")) - logger.info(f"Saved cache for {filename} to {cache_dir + hsh}") + with hash_file.open("wb") as f: + last_modified = hash_file.lstat().st_mtime + pickle.dump((filename, last_modified, tables), f) + logger.info(f"Saved cache for {filename} to {hash_file}") return excel.extract_tables(filename) @@ -59,6 +80,8 @@ def convert_xl_to_times( stop_after_read: bool = False, ) -> dict[str, DataFrame]: start_time = datetime.now() + + invalidate_cache() with ProcessPoolExecutor(max_workers) as executor: raw_tables = executor.map( excel.extract_tables if no_cache else _read_xlsx_cached, input_files From 3b8719b32df902df996c340548978b807e430950 Mon Sep 17 00:00:00 2001 From: Sam West Date: Thu, 21 Mar 2024 15:52:11 +1100 Subject: [PATCH 04/11] moved dd_to_csv to src/ and added as a script in pyproject.toml moved cache dir to standard ~/.cache/xl2times dir to avoid creating inside .venv subdir --- pyproject.toml | 1 + utils/run_benchmarks.py | 4 +++- xl2times/__main__.py | 14 ++++++++------ {utils => xl2times}/dd_to_csv.py | 0 xl2times/transforms.py | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) rename {utils => xl2times}/dd_to_csv.py (100%) diff --git a/pyproject.toml b/pyproject.toml index f7ca779..1f9b15c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times" [project.scripts] xl2times = "xl2times.__main__:main" +dd_to_csv = "xl2times.dd_to_csv:main" [tool.pytest.ini_options] # don't print runtime warnings diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index bbf18e1..a2a64c0 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -14,11 +14,11 @@ import git import pandas as pd import yaml -from dd_to_csv import main from tabulate import tabulate from xl2times import utils from xl2times.__main__ import parse_args, run +from xl2times.dd_to_csv import main from xl2times.utils import max_workers logger = utils.get_logger() @@ -164,6 +164,7 @@ def run_benchmark( text=True, shell=True if os.name == "nt" else False, check=False, + encoding="utf-8", ) if res.returncode != 0: # Remove partial outputs @@ -208,6 +209,7 @@ def run_benchmark( stderr=subprocess.STDOUT, text=True, check=False, + encoding="utf-8", ) else: # If debug option is set, run as a function call to allow stepping with a debugger. 
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index b8e5976..6fd68ad 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -8,10 +8,10 @@
 from datetime import datetime
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
 
-from xl2times import __file__ as xl2times_file_path
 from xl2times.utils import max_workers
 
 from . import excel, transforms, utils
@@ -20,7 +20,7 @@
 
 logger = utils.get_logger()
 
-cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
+cache_dir = str(Path.home() / ".cache/xl2times/")
 os.makedirs(cache_dir, exist_ok=True)
 
 
@@ -35,7 +35,8 @@ def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
     if os.path.isfile(cache_dir + hsh):
-        fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
+        with open(cache_dir + hsh, "rb") as f:
+            fname1, _timestamp, tables = pickle.load(f)
         # In the extremely unlikely event that we have a hash collision, also check that
         # the filename is the same:
         # TODO check modified timestamp also matches
@@ -180,8 +181,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str):
 
 def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
-    for filename in os.listdir(input_dir):
-        result[filename.split(".")[0]] = pd.read_csv(
+    csv_files = list(Path(input_dir).glob("*.csv"))
+    for filename in csv_files:
+        result[str(filename).split(".")[0]] = pd.read_csv(
             os.path.join(input_dir, filename), dtype=str
         )
     return result
@@ -253,7 +255,7 @@ def compare(
         index=False,
     )
     result = (
-        f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present"
+        f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present"
         f" in output ({total_correct_rows}/{total_gt_rows})"
         f", {total_additional_rows} additional rows"
     )
diff --git a/utils/dd_to_csv.py b/xl2times/dd_to_csv.py
similarity index 100%
rename from utils/dd_to_csv.py
rename to xl2times/dd_to_csv.py
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index e82adb0..8d7d3c6 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2511,7 +2511,7 @@ def apply_transform_tables(
                 )
 
                 if not any(rows_to_update):
-                    logger.info(f"A {Tag.tfm_mig.value} row generated no records.")
+                    logger.warning(f"A {Tag.tfm_mig.value} row generated no records.")
                     continue
 
                 new_rows = table.loc[rows_to_update].copy()

From 7bcffc83952b9ce63ee72c2c72914f420ce39394 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:19:22 +1100
Subject: [PATCH 05/11] Added cache invalidation for old files

---
 xl2times/__main__.py | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 6fd68ad..90b9414 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -5,7 +5,7 @@
 import sys
 import time
 from concurrent.futures import ProcessPoolExecutor
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 import numpy as np
@@ -20,8 +20,23 @@
 
 logger = utils.get_logger()
 
-cache_dir = str(Path.home() / ".cache/xl2times/")
-os.makedirs(cache_dir, exist_ok=True)
+cache_dir = Path.home() / ".cache/xl2times/"
+cache_dir.mkdir(exist_ok=True, parents=True)
+
+
+def invalidate_cache(max_age: timedelta = timedelta(days=365)):
+    """
+    Delete any cache files older than max_age.
+
+    Args:
+        max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted.
+    """
+    for file in cache_dir.glob("*.pkl"):
+        if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
+            try:
+                file.unlink()
+            except Exception as e:
+                logger.warning(f"Failed to delete old cache file {file}. {e}")
 
 
 def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
@@ -30,23 +45,29 @@ def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
     """Extract EmbeddedXlTables from xlsx file (cached).
 
     Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
     Each file is named by the hash of the contents of an xlsx file, and contains
     a tuple (filename, modified timestamp, [EmbeddedXlTable]).
+
+    Args:
+        filename: Path to the xlsx file to extract tables from.
     """
     with open(filename, "rb") as f:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
-    if os.path.isfile(cache_dir + hsh):
-        with open(cache_dir + hsh, "rb") as f:
+    hash_file = cache_dir / f"{Path(filename).stem}_{hsh}.pkl"
+    if hash_file.is_file():
+        with hash_file.open("rb") as f:
             fname1, _timestamp, tables = pickle.load(f)
         # In the extremely unlikely event that we have a hash collision, also check that
        # the filename is the same:
         # TODO check modified timestamp also matches
         if filename == fname1:
-            logger.info(f"Using cached data for {filename} from {cache_dir + hsh}")
+            logger.info(f"Using cached data for {filename} from {hash_file}")
             return tables
     # Write extracted data to cache:
     tables = excel.extract_tables(filename)
-    pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb"))
-    logger.info(f"Saved cache for {filename} to {cache_dir + hsh}")
+    with hash_file.open("wb") as f:
+        last_modified = hash_file.lstat().st_mtime
+        pickle.dump((filename, last_modified, tables), f)
+    logger.info(f"Saved cache for {filename} to {hash_file}")
     return excel.extract_tables(filename)
@@ -60,6 +81,8 @@ def convert_xl_to_times(
     stop_after_read: bool = False,
 ) -> dict[str, DataFrame]:
     start_time = datetime.now()
+
+    invalidate_cache()
     with ProcessPoolExecutor(max_workers) as executor:
         raw_tables = executor.map(
             excel.extract_tables if no_cache else _read_xlsx_cached, input_files

From 1615397f1531a4fb3e8e7d90be76de2acafda96c Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:29:38 +1100
Subject: [PATCH 06/11] reverted changes to data_years()

---
 xl2times/datatypes.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index b6b62f0..2d2a99e 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -187,13 +187,7 @@ def data_years(self) -> set[int]:
         data_years = set()
         for attributes in [self.attributes, self.uc_attributes]:
             if not attributes.empty:
-                int_years = attributes["year"].astype(
-                    int, errors="ignore"
-                )  # leave non-parseable vals alone
-                int_years = [
-                    y for y in int_years if isinstance(y, int)
-                ]  # remove non-parseable years
-                data_years.update(int_years)
+                data_years.update(attributes["year"].astype(int).values)
         # Remove interpolation rules before return
         return {y for y in data_years if y >= 1000}

From 0c767ff226609290dbdb05d66a7e94698243c1fb Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:59:32 +1100
Subject: [PATCH 07/11] fixed path in benchmarks file

cleaned up caching code as suggested
---
 utils/run_benchmarks.py | 2 +-
 xl2times/__main__.py | 37 ++++++++++++++++++------------------
 2 files changed, 19 insertions(+), 20 deletions(-)
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index a2a64c0..e28f5ad 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -155,7 +155,7 @@ def run_benchmark(
     res = subprocess.run(
         [
             "python",
-            "utils/dd_to_csv.py",
+            "xl2times/dd_to_csv.py",
             dd_folder,
             csv_folder,
         ],
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 90b9414..6c75534 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -39,7 +39,7 @@ def invalidate_cache(max_age: timedelta = timedelta(days=365)):
                 logger.warning(f"Failed to delete old cache file {file}. {e}")
 
 
-def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
+def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]:
     """Extract EmbeddedXlTables from xlsx file (cached).
 
     Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
@@ -49,26 +49,25 @@ def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
     Args:
         filename: Path to the xlsx file to extract tables from.
     """
-    with open(filename, "rb") as f:
+    filename = Path(filename).resolve()
+    with filename.open("rb") as f:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
-    hash_file = cache_dir / f"{Path(filename).stem}_{hsh}.pkl"
-    if hash_file.is_file():
-        with hash_file.open("rb") as f:
-            fname1, _timestamp, tables = pickle.load(f)
-        # In the extremely unlikely event that we have a hash collision, also check that
-        # the filename is the same:
-        # TODO check modified timestamp also matches
-        if filename == fname1:
-            logger.info(f"Using cached data for {filename} from {hash_file}")
-            return tables
-    # Write extracted data to cache:
-    tables = excel.extract_tables(filename)
-    with hash_file.open("wb") as f:
-        last_modified = hash_file.lstat().st_mtime
-        pickle.dump((filename, last_modified, tables), f)
-    logger.info(f"Saved cache for {filename} to {hash_file}")
-    return excel.extract_tables(filename)
+    cached_file = (cache_dir / f"{Path(filename).stem}_{hsh}.pkl").resolve()
+
+    if cached_file.exists():
+        # just load and return the cached pickle
+        with cached_file.open("rb") as f:
+            tables = pickle.load(f)
+        logger.info(f"Using cached data for {filename} from {cached_file}")
+    else:
+        # extracted data and write it to cache before returning it
+        tables = excel.extract_tables(str(filename))
+        with cached_file.open("wb") as f:
+            pickle.dump(tables, f)
+        logger.info(f"Saved cache for {filename} to {cached_file}")
+
+    return tables
 
 
 def convert_xl_to_times(

From e53cef1b1deb235ccb63a9b0ab145476a5f870ec Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 22 Mar 2024 11:14:54 +1100
Subject: [PATCH 08/11] bugfix for bad CSV paths

---
 xl2times/__main__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 6c75534..9f2debc 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -205,9 +205,7 @@ def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
     csv_files = list(Path(input_dir).glob("*.csv"))
     for filename in csv_files:
-        result[str(filename).split(".")[0]] = pd.read_csv(
-            os.path.join(input_dir, filename), dtype=str
-        )
+        result[str(filename).split(".")[0]] = pd.read_csv(filename, dtype=str)
     return result
 
 

From 5c5c8bf2c3b4dd5afda25a74434c8aabc5de259c Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 22 Mar 2024 11:36:17 +1100
Subject: [PATCH 09/11] Another csv bugfix

---
 xl2times/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 9f2debc..59606b0 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -205,7 +205,7 @@ def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
     csv_files = list(Path(input_dir).glob("*.csv"))
     for filename in csv_files:
-        result[str(filename).split(".")[0]] = pd.read_csv(filename, dtype=str)
+        result[filename.stem] = pd.read_csv(filename, dtype=str)
     return result
 
 

From 56f333f04856176d1321e4bb87463413e7b481c3 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Fri, 22 Mar 2024 09:26:55 +0200
Subject: [PATCH 10/11] Change cache dir in ci.yml, bump cache key

---
 .github/workflows/ci.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bd2b3b6..d9aed4b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,6 +21,7 @@ jobs:
      REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
      REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
      REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
+      CACHE_KEY: 0 # Use this for manual cache key bumps, e.g., when caching code changes
 
     steps:
       - uses: actions/checkout@v3
@@ -115,11 +116,12 @@ jobs:
         id: cache
         uses: actions/cache/restore@v4
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
+          path: ~/.cache/xl2times
           # Cache key is refs of the input xlsx repos, since that's what is cached
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
           # If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
           restore-keys: |
+            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-
             ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
             ${{ runner.os }}-py-${{ env.PY_VERSION }}-
@@ -166,5 +168,5 @@ jobs:
         # Save the cache even if the regression tests fail
         if: always() && !steps.cache-restore.outputs.cache-hit
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          path: ~/.cache/xl2times
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}

From bb2a536e9828fa2a48e1fcbdbb392c516f318ea7 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Fri, 22 Mar 2024 09:37:20 +0200
Subject: [PATCH 11/11] Fix docs

---
 xl2times/__main__.py | 2 +-
 xl2times/dd_to_csv.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 59606b0..e03b137 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -61,7 +61,7 @@ def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]:
             tables = pickle.load(f)
         logger.info(f"Using cached data for {filename} from {cached_file}")
     else:
-        # extracted data and write it to cache before returning it
+        # extract data and write it to cache before returning it
         tables = excel.extract_tables(str(filename))
         with cached_file.open("wb") as f:
             pickle.dump(tables, f)
diff --git a/xl2times/dd_to_csv.py b/xl2times/dd_to_csv.py
index 6f611ea..46f652b 100644
--- a/xl2times/dd_to_csv.py
+++ b/xl2times/dd_to_csv.py
@@ -13,7 +13,7 @@ def parse_parameter_values_from_file(
     path: Path,
 ) -> tuple[dict[str, list], dict[str, set]]:
-    """Parse *.dd to turn it into CSV format.
 
     There are parameters and sets, and each has a slightly different format.
     `*.dd` files have data of the following form: