diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 323aefc..1c858a0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,9 +3,9 @@ name: CI
 on:
   # Triggers the workflow on push or pull request events but only for the main branch
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -14,11 +14,24 @@ jobs:
   CI:
     runs-on: ubuntu-latest
 
+    env:
+      PY_VERSION: "3.11"
+      REF_TIMES_model: "b488fb07f0899ee8b7e710c230b1a9414fa06f7d"
+      REF_demos-xlsx: "f956db07a253d4f5c60e108791ab7bb2b8136690"
+      REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
+      REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
+      REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
+
     steps:
       - uses: actions/checkout@v3
         with:
           path: xl2times
 
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PY_VERSION }}
+
       - name: Install tool and dependencies
         working-directory: xl2times
         run: |
@@ -46,16 +59,19 @@ jobs:
         with:
           repository: etsap-TIMES/TIMES_model
           path: TIMES_model
+          ref: ${{ env.REF_TIMES_model }}
 
       - uses: actions/checkout@v3
         with:
           repository: olejandro/demos-dd
           path: xl2times/benchmarks/dd
+          ref: ${{ env.REF_demos-dd }}
 
       - uses: actions/checkout@v3
         with:
           repository: olejandro/demos-xlsx
           path: xl2times/benchmarks/xlsx
+          ref: ${{ env.REF_demos-xlsx }}
           token: ${{ secrets.GH_PAT_DEMOS_XLSX }}
 
       # ---------- Prepare TIMES Ireland Model
@@ -66,11 +82,13 @@ jobs:
         with:
           repository: esma-cgep/tim
           path: xl2times/benchmarks/xlsx/Ireland
+          ref: ${{ env.REF_tim }}
 
       - uses: actions/checkout@v3
         with:
           repository: esma-cgep/tim-gams
           path: xl2times/benchmarks/dd/Ireland
+          ref: ${{ env.REF_tim-gams }}
 
 
       # ---------- Install GAMS
@@ -91,9 +109,20 @@ jobs:
           echo "$GAMS_LICENSE" > $HOME/.local/share/GAMS/gamslice.txt
           ls -l $HOME/.local/share/GAMS/
 
 
-      # ---------- Run tool, check for regressions
+      - name: Restore XLSX cache directory from cache
+        id: cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ github.workspace }}/xl2times/xl2times/.cache
+          # Cache key is refs of the input xlsx repos, since that's what is cached
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          # If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
+          restore-keys: |
+            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
+            ${{ runner.os }}-py-${{ env.PY_VERSION }}-
+
 
       - name: Run tool on all benchmarks
         env:
           GAMS_LICENSE: ${{ secrets.GAMS_LICENSE }}
@@ -132,3 +161,10 @@ jobs:
         run: |
           sed -n '/Benchmark *Time.*Accuracy/h;//!H;$!d;x;//p' out.txt
           exit $(cat retcode.txt)
+
+      - uses: actions/cache/save@v4
+        # Save the cache even if the regression tests fail
+        if: always() && !steps.cache-restore.outputs.cache-hit
+        with:
+          path: ${{ github.workspace }}/xl2times/xl2times/.cache
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b7ca4ba..15c53be 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -8,7 +8,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.10"
+    python: "3.11"
 
 # Build documentation in the "docs/" directory with Sphinx
 sphinx:
diff --git a/pyproject.toml b/pyproject.toml
index 2995c0c..01429a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ name = "xl2times"
 version = "0.1.0"
 description = 'An open source tool to convert Excel input files for TIMES models to the DD format accepted by GAMS'
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.11"
 license = { file = "LICENSE" }
 keywords = []
 classifiers = [
diff --git a/pyrightconfig.json b/pyrightconfig.json
index d6bcb2c..9cc76bf 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -9,7 +9,7 @@
   ],
   "reportMissingImports": true,
   "reportMissingTypeStubs": false,
-  "pythonVersion": "3.10",
+  "pythonVersion": "3.11",
   "pythonPlatform": "All",
   "venv": ".venv",
   "venvPath": "."
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 511eaa2..ae2062b 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -1,7 +1,7 @@
 import argparse
 from concurrent.futures import ProcessPoolExecutor
 from datetime import datetime
-
+import hashlib
 from pandas.core.frame import DataFrame
 import pandas as pd
 import pickle
@@ -11,6 +11,7 @@
 import time
 from typing import Dict, List
 
+from xl2times import __file__ as xl2times_file_path
 from xl2times.utils import max_workers
 from . import datatypes, utils
 from . import excel
@@ -19,36 +20,55 @@
 logger = utils.get_logger()
 
 
+cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
+os.makedirs(cache_dir, exist_ok=True)
+
+
+def _read_xlsx_cached(filename: str) -> List[datatypes.EmbeddedXlTable]:
+    """Extract EmbeddedXlTables from xlsx file (cached).
+
+    Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
+    Each file is named by the hash of the contents of an xlsx file, and contains
+    a tuple (filename, modified timestamp, [EmbeddedXlTable]).
+    """
+    with open(filename, "rb") as f:
+        digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
+    hsh = digest.hexdigest()
+    if os.path.isfile(cache_dir + hsh):
+        fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
+        # In the extremely unlikely event that we have a hash collision, also check that
+        # the filename is the same:
+        # TODO check modified timestamp also matches
+        if filename == fname1:
+            logger.info(f"Using cached data for {filename} from {cache_dir + hsh}")
+            return tables
+    # Write extracted data to cache:
+    tables = excel.extract_tables(filename)
+    pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb"))
+    logger.info(f"Saved cache for {filename} to {cache_dir + hsh}")
+    return tables
+
+
 def convert_xl_to_times(
     input_files: List[str],
     output_dir: str,
     config: datatypes.Config,
     model: datatypes.TimesModel,
-    use_pkl: bool,
+    no_cache: bool,
     verbose: bool = False,
     stop_after_read: bool = False,
 ) -> Dict[str, DataFrame]:
-    pickle_file = "raw_tables.pkl"
-    t0 = datetime.now()
-    if use_pkl and os.path.isfile(pickle_file):
-        raw_tables = pickle.load(open(pickle_file, "rb"))
-        logger.warning("Using pickled data not xlsx")
-    else:
-        raw_tables = []
-
-        use_pool = True
-        if use_pool:
-            with ProcessPoolExecutor(max_workers) as executor:
-                for result in executor.map(excel.extract_tables, input_files):
-                    raw_tables.extend(result)
-        else:
-            for f in input_files:
-                result = excel.extract_tables(str(Path(f).absolute()))
-                raw_tables.extend(result)
-        pickle.dump(raw_tables, open(pickle_file, "wb"))
+    start_time = datetime.now()
+    with ProcessPoolExecutor(max_workers) as executor:
+        raw_tables = executor.map(
+            excel.extract_tables if no_cache else _read_xlsx_cached, input_files
+        )
+        # raw_tables is a list of lists, so flatten it:
+        raw_tables = [t for ts in raw_tables for t in ts]
     logger.info(
-        f"Extracted {len(raw_tables)} tables,"
-        f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows in {datetime.now() - t0}"
+        f"Extracted (potentially cached) {len(raw_tables)} tables,"
+        f" {sum(table.dataframe.shape[0] for table in raw_tables)} rows"
+        f" in {datetime.now() - start_time}"
     )
 
     if stop_after_read:
@@ -440,14 +460,14 @@ def run(args: argparse.Namespace) -> str | None:
             args.output_dir,
             config,
             model,
-            args.use_pkl,
+            args.no_cache,
             verbose=args.verbose,
             stop_after_read=True,
         )
         sys.exit(0)
 
     tables = convert_xl_to_times(
-        input_files, args.output_dir, config, model, args.use_pkl, verbose=args.verbose
+        input_files, args.output_dir, config, model, args.no_cache, verbose=args.verbose
     )
 
     if args.dd:
@@ -498,7 +518,11 @@ def parse_args(arg_list: None | list[str]) -> argparse.Namespace:
         action="store_true",
         help="Read xlsx/xlsm files and stop after outputting raw_tables.txt",
     )
-    args_parser.add_argument("--use_pkl", action="store_true")
+    args_parser.add_argument(
+        "--no_cache",
+        action="store_true",
+        help="Ignore cache and re-extract tables from XLSX files",
+    )
    args_parser.add_argument(
        "-v",
        "--verbose",