Cache extracted EmbeddedXlTables based on xlsx file hashes #196

Merged Feb 26, 2024 · 14 commits (this review view shows changes from 7 commits)
42 changes: 39 additions & 3 deletions .github/workflows/ci.yml
@@ -3,9 +3,9 @@ name: CI
 on:
   # Triggers the workflow on push or pull request events but only for the main branch
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]

   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -14,11 +14,24 @@ jobs:
   CI:
     runs-on: ubuntu-latest

+    env:
+      PY_VERSION: "3.11"
+      REF_TIMES_model: "b488fb07f0899ee8b7e710c230b1a9414fa06f7d"
+      REF_demos-xlsx: "f956db07a253d4f5c60e108791ab7bb2b8136690"
+      REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
+      REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
+      REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
+
[Review comment on lines +17 to +24, from a Member]
Here we are fixing the versions to be used for tests on the CI, right?
Great!

     steps:
       - uses: actions/checkout@v3
         with:
           path: xl2times

+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PY_VERSION }}
+
       - name: Install tool and dependencies
         working-directory: xl2times
         run: |
@@ -46,16 +59,19 @@ jobs:
         with:
           repository: etsap-TIMES/TIMES_model
           path: TIMES_model
+          ref: ${{ env.REF_TIMES_model }}

       - uses: actions/checkout@v3
         with:
           repository: olejandro/demos-dd
           path: xl2times/benchmarks/dd
+          ref: ${{ env.REF_demos-dd }}

       - uses: actions/checkout@v3
         with:
           repository: olejandro/demos-xlsx
           path: xl2times/benchmarks/xlsx
+          ref: ${{ env.REF_demos-xlsx }}
           token: ${{ secrets.GH_PAT_DEMOS_XLSX }}

       # ---------- Prepare TIMES Ireland Model

@@ -66,11 +82,13 @@ jobs:
         with:
           repository: esma-cgep/tim
           path: xl2times/benchmarks/xlsx/Ireland
+          ref: ${{ env.REF_tim }}

       - uses: actions/checkout@v3
         with:
           repository: esma-cgep/tim-gams
           path: xl2times/benchmarks/dd/Ireland
+          ref: ${{ env.REF_tim-gams }}

       # ---------- Install GAMS

@@ -91,9 +109,20 @@ jobs:
echo "$GAMS_LICENSE" > $HOME/.local/share/GAMS/gamslice.txt
ls -l $HOME/.local/share/GAMS/


# ---------- Run tool, check for regressions

- name: Restore XLSX cache directory from cache
id: cache
uses: actions/cache/restore@v4
with:
path: xl2times/.cache
# Cache key is refs of the input xlsx repos, since that's what is cached
key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
# If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
restore-keys: |
${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
${{ runner.os }}-py-${{ env.PY_VERSION }}-

- name: Run tool on all benchmarks
env:
GAMS_LICENSE: ${{ secrets.GAMS_LICENSE }}
Expand Down Expand Up @@ -132,3 +161,10 @@ jobs:
         run: |
           sed -n '/Benchmark *Time.*Accuracy/h;//!H;$!d;x;//p' out.txt
           exit $(cat retcode.txt)
+
+      - uses: actions/cache/save@v4
+        # Save the cache even if the regression tests fail
+        if: always() && !steps.cache.outputs.cache-hit
+        with:
+          path: xl2times/.cache
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
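A note on the key scheme above: `actions/cache` looks for an exact match on `key` first, then falls back to each `restore-keys` entry in order, treating it as a prefix of previously saved keys. A minimal sketch of that resolution logic (hypothetical helper and shortened refs, for illustration only — not part of this PR):

```python
def resolve_cache_key(key: str, restore_keys: list[str], saved_keys: list[str]) -> str | None:
    """Sketch of actions/cache lookup: exact match first, then prefix fallbacks."""
    if key in saved_keys:
        return key  # exact hit: both the demos-xlsx and tim refs matched
    for prefix in restore_keys:
        matches = [k for k in saved_keys if k.startswith(prefix)]
        if matches:
            # actions/cache prefers the most recently created match;
            # we just take the first candidate here for simplicity.
            return matches[0]
    return None  # miss: the workflow re-extracts every xlsx file


# The tim ref changed but the demos-xlsx ref didn't, so the first
# restore-key still recovers the bulk of the cached extractions:
saved = ["Linux-py-3.11-f956db0-e820d80"]
print(resolve_cache_key(
    "Linux-py-3.11-f956db0-1234567",
    ["Linux-py-3.11-f956db0-", "Linux-py-3.11-"],
    saved,
))  # -> "Linux-py-3.11-f956db0-e820d80"
```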
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -8,7 +8,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.10"
+    python: "3.11"

 # Build documentation in the "docs/" directory with Sphinx
 sphinx:
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ name = "xl2times"
 version = "0.1.0"
 description = 'An open source tool to convert Excel input files for TIMES models to the DD format accepted by GAMS'
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.11"
 license = { file = "LICENSE" }
 keywords = []
 classifiers = [
2 changes: 1 addition & 1 deletion pyrightconfig.json
@@ -9,7 +9,7 @@
   ],
   "reportMissingImports": true,
   "reportMissingTypeStubs": false,
-  "pythonVersion": "3.10",
+  "pythonVersion": "3.11",
   "pythonPlatform": "All",
   "venv": ".venv",
   "venvPath": "."
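These three 3.10 → 3.11 bumps go hand in hand with the diff below: `hashlib.file_digest`, which `_read_xlsx_cached` calls, was only added in Python 3.11. For reference, a chunked-read equivalent for older interpreters might look like this (a sketch, not part of the PR):

```python
import hashlib


def sha256_hexdigest(path: str) -> str:
    """Roughly hashlib.file_digest(f, "sha256").hexdigest() for Python < 3.11."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks so large xlsx files aren't read into memory at once.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()
```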
73 changes: 48 additions & 25 deletions xl2times/__main__.py
@@ -1,7 +1,7 @@
 import argparse
 from concurrent.futures import ProcessPoolExecutor
 from datetime import datetime
-
+import hashlib
 from pandas.core.frame import DataFrame
 import pandas as pd
 import pickle
@@ -11,6 +11,7 @@
 import time
 from typing import Dict, List

+from xl2times import __file__ as xl2times_file_path
 from xl2times.utils import max_workers
 from . import datatypes, utils
 from . import excel
@@ -19,36 +20,54 @@
 logger = utils.get_logger()


+cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
+os.makedirs(cache_dir, exist_ok=True)
+
+
+def _read_xlsx_cached(filename: str) -> List[datatypes.EmbeddedXlTable]:
+    """Extract EmbeddedXlTables from xlsx file (cached).
+
+    Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
+    Each file is named by the hash of the contents of an xlsx file, and contains
+    a tuple (filename, modified timestamp, [EmbeddedXlTable]).
+    """
+    with open(filename, "rb") as f:
+        digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
+    hsh = digest.hexdigest()
+    if os.path.isfile(cache_dir + hsh):
+        fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
+        # In the extremely unlikely event that we have a hash collision, also check
+        # that the filename is the same:
+        # TODO check modified timestamp also matches
+        if filename == fname1:
+            logger.info(f"Using cached data for {filename}")
+            return tables
+    # Write extracted data to cache:
+    tables = excel.extract_tables(filename)
+    pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb"))
+    return tables


 def convert_xl_to_times(
     input_files: List[str],
     output_dir: str,
     config: datatypes.Config,
     model: datatypes.TimesModel,
-    use_pkl: bool,
+    no_cache: bool,
     verbose: bool = False,
     stop_after_read: bool = False,
 ) -> Dict[str, DataFrame]:
-    pickle_file = "raw_tables.pkl"
-    t0 = datetime.now()
-    if use_pkl and os.path.isfile(pickle_file):
-        raw_tables = pickle.load(open(pickle_file, "rb"))
-        logger.warning("Using pickled data not xlsx")
-    else:
-        raw_tables = []
-
-        use_pool = True
-        if use_pool:
-            with ProcessPoolExecutor(max_workers) as executor:
-                for result in executor.map(excel.extract_tables, input_files):
-                    raw_tables.extend(result)
-        else:
-            for f in input_files:
-                result = excel.extract_tables(str(Path(f).absolute()))
-                raw_tables.extend(result)
-        pickle.dump(raw_tables, open(pickle_file, "wb"))
+    start_time = datetime.now()
+    with ProcessPoolExecutor(max_workers) as executor:
+        raw_tables = executor.map(
+            excel.extract_tables if no_cache else _read_xlsx_cached, input_files
+        )
+        # raw_tables is a list of lists, so flatten it:
+        raw_tables = [t for ts in raw_tables for t in ts]
     logger.info(
-        f"Extracted {len(raw_tables)} tables,"
-        f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows in {datetime.now() - t0}"
+        f"Extracted (potentially cached) {len(raw_tables)} tables,"
+        f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows"
+        f" in {datetime.now() - start_time}"
     )

     if stop_after_read:
@@ -440,14 +459,14 @@ def run(args: argparse.Namespace) -> str | None:
             args.output_dir,
             config,
             model,
-            args.use_pkl,
+            args.no_cache,
             verbose=args.verbose,
             stop_after_read=True,
         )
         sys.exit(0)

     tables = convert_xl_to_times(
-        input_files, args.output_dir, config, model, args.use_pkl, verbose=args.verbose
+        input_files, args.output_dir, config, model, args.no_cache, verbose=args.verbose
     )

     if args.dd:
@@ -498,7 +517,11 @@ def parse_args(arg_list: None | list[str]) -> argparse.Namespace:
action="store_true",
help="Read xlsx/xlsm files and stop after outputting raw_tables.txt",
)
args_parser.add_argument("--use_pkl", action="store_true")
args_parser.add_argument(
"--no_cache",
action="store_true",
help="Ignore cache and re-extract tables from XLSX files",
)
args_parser.add_argument(
"-v",
"--verbose",
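Putting the new flag together, end to end: the first call below extracts tables and populates the `.cache/` directory next to the installed package, the second reuses the cached pickles, and `--no_cache` forces a fresh extraction. (A sketch — the input path is illustrative, and it assumes the CLI's usual positional input argument alongside the `--output_dir` flag.)

```python
from xl2times.__main__ import parse_args, run

# Hypothetical paths; any directory of TIMES xlsx/xlsm files works.
argv = ["benchmarks/xlsx/DemoS_001", "--output_dir", "out"]

run(parse_args(argv))  # first run: extracts tables and fills .cache/
run(parse_args(argv))  # second run: logs "Using cached data for ..." per file
run(parse_args(argv + ["--no_cache"]))  # ignores the cache, re-extracts everything
```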