From fc9b4786b587e5bb712a92857aeeb3b3c5eec0c3 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Fri, 8 Nov 2024 09:46:39 -0600 Subject: [PATCH] Add TIMES-NZ to benchmarks and refactor CI (#239) Move setting up benchmark repos to new script. --------- Co-authored-by: Siddharth Krishna --- .github/workflows/ci.yml | 61 ++++++-------------------- README.md | 32 ++++++-------- benchmarks.yml | 92 ++++++++++++++++++++++++++++++++++++++++ setup-benchmarks.sh | 78 ++++++++++++++++++++++++++++++++++ xl2times/__main__.py | 4 +- 5 files changed, 198 insertions(+), 69 deletions(-) create mode 100755 setup-benchmarks.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 053d6f85..a59d07e6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,15 +20,10 @@ jobs: env: PY_VERSION: "3.11" - REF_TIMES_model: "b488fb07f0899ee8b7e710c230b1a9414fa06f7d" - REF_demos-xlsx: "34a2a5c044cc0bbea1357de50db2f5f02d575181" - REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86" - REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5" - REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc" CACHE_KEY: 1 # Use this for manual cache key bumps, e.g., when caching code changes steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: path: xl2times @@ -65,42 +60,17 @@ jobs: source .venv/bin/activate pytest - # ---------- Prepare ETSAP Demo models + # ---------- Setup benchmark repositories - - uses: actions/checkout@v3 - with: - repository: etsap-TIMES/TIMES_model - path: TIMES_model - ref: ${{ env.REF_TIMES_model }} - - - uses: actions/checkout@v3 - with: - repository: olejandro/demos-dd - path: xl2times/benchmarks/dd - ref: ${{ env.REF_demos-dd }} - - - uses: actions/checkout@v3 - with: - repository: olejandro/demos-xlsx - path: xl2times/benchmarks/xlsx - ref: ${{ env.REF_demos-xlsx }} - token: ${{ secrets.GH_PAT_DEMOS_XLSX }} - - # ---------- Prepare TIMES Ireland Model - - # We add this model as the directory `ireland` under 
`benchmarks/{xlsx,dd}/` - # so that the run_benchmarks.py script runs this model too - - uses: actions/checkout@v3 - with: - repository: esma-cgep/tim - path: xl2times/benchmarks/xlsx/Ireland - ref: ${{ env.REF_tim }} - - - uses: actions/checkout@v3 - with: - repository: esma-cgep/tim-gams - path: xl2times/benchmarks/dd/Ireland - ref: ${{ env.REF_tim-gams }} + - name: Setup benchmark repositories + working-directory: xl2times + run: | + ./setup-benchmarks.sh + # Convert Times-NZ's old XLS files to XLSX so that xl2times can read it: + sudo apt install libreoffice-calc default-jre libreoffice-java-common + find benchmarks/TIMES-NZ -name "*.xls" -type f -printf "soffice --convert-to xlsx --outdir '%h' '%p'\n" | bash + env: + GH_PAT_DEMOS_XLSX: ${{ secrets.GH_PAT_DEMOS_XLSX }} # ---------- Install GAMS @@ -128,13 +98,8 @@ jobs: uses: actions/cache/restore@v4 with: path: ~/.cache/xl2times - # Cache key is refs of the input xlsx repos, since that's what is cached - key: ${{ runner.os }}-${{ env.CACHE_KEY }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }} - # If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches - restore-keys: | - ${{ runner.os }}-${{ env.CACHE_KEY }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}- - ${{ runner.os }}-${{ env.CACHE_KEY }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}- - ${{ runner.os }}-${{ env.CACHE_KEY }}-py-${{ env.PY_VERSION }}- + # Cache key is manual key + python version + key: ${{ runner.os }}-${{ env.CACHE_KEY }}-py-${{ env.PY_VERSION }} - name: Run tool on all benchmarks env: diff --git a/README.md b/README.md index 535eccf0..7ddacf6e 100644 --- a/README.md +++ b/README.md @@ -73,38 +73,32 @@ git commit --no-verify ### Running Benchmarks -See our GitHub Actions CI `.github/workflows/ci.yml` and the utility script `utils/run_benchmarks.py` to see how to run the tool on the DemoS models. 
- -In short, use the commands below to clone the benchmarks data into your local `benchmarks` dir. -Note that this assumes you have access to all these repositories (some are private and -you'll have to request access) - if not, comment out the inaccessible benchmarks from `benchmakrs.yml` before running. +We use the TIMES DemoS models and some public TIMES models as benchmarks. +See our GitHub Actions CI `.github/workflows/ci.yml` and the utility script `utils/run_benchmarks.py` to see how to we benchmark the tool and check PRs automatically for regression. +If you are a developer, you can use the below instructions to set up and run the benchmarks locally: ```bash -mkdir benchmarks -# Get TIMES DemoS example models and reference DD files -# XLSX files are in private repo for licensing reasons, please request access or replace with your own files distributed with Veda. -git clone git@github.com:olejandro/demos-xlsx.git benchmarks/xlsx/ -git clone git@github.com:olejandro/demos-dd.git benchmarks/dd/ - -# Get Ireland model and reference DD files -git clone git@github.com:esma-cgep/tim.git benchmarks/xlsx/Ireland -git clone git@github.com:esma-cgep/tim-gams.git benchmarks/dd/Ireland +./setup-benchmarks.sh ``` +Note that this script assumes you have access to all the relevant repositories (some are private and you'll have to request access) - if not, comment out the inaccessible benchmarks from `benchmarks.yml` before running. + Then to run the benchmarks: ```bash # Run a only a single benchmark by name (see benchmarks.yml for name list) -python utils/run_benchmarks.py benchmarks.yml --verbose --run DemoS_001-all | tee out.txt +python utils/run_benchmarks.py benchmarks.yml --run DemoS_001-all -# Run all benchmarks (without GAMS run, just comparing CSV data for regressions) -# Note: if you have multiple remotes, set etsap-TIMES/xl2times as the first one, as it is used for speed/correctness comparisons. 
-python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt +# To see the full output logs, and save it in a file for convenience +python utils/run_benchmarks.py benchmarks.yml --run DemoS_001-all --verbose | tee out.txt +# Run all benchmarks (without GAMS run, just comparing CSV data for regressions) +# Note: if you have multiple remotes, set etsap-TIMES/xl2times as the `origin`, as it is used for speed/correctness comparisons. +python utils/run_benchmarks.py benchmarks.yml # Run benchmarks with regression tests vs main branch git branch feature/your_new_changes --checkout # ... make your code changes here ... git commit -a -m "your commit message" # code must be committed for comparison to `main` branch to run. -python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt +python utils/run_benchmarks.py benchmarks.yml ``` At this point, if you haven't broken anything you should see something like: ``` diff --git a/benchmarks.yml b/benchmarks.yml index dcbb9898..98e869b8 100644 --- a/benchmarks.yml +++ b/benchmarks.yml @@ -348,3 +348,95 @@ benchmarks: - "b_tra_emissions" - "b_tra_ev_parity" - "b_tra_f_modalshares" + - name: TIMES-NZ-KEA + input_folder: TIMES-NZ + inputs: + - "VT_NI_ELC_V4.xlsx" + - "VT_NI_IND_V2.xlsx" + - "VT_NI_OTH_V4.xlsx" + - "VT_NI_PRI_V4.xlsx" + - "VT_NI_TRA_V4.xlsx" + - "VT_SI_ELC_V4.xlsx" + - "VT_SI_IND_V2.xlsx" + - "VT_SI_OTH_V4.xlsx" + - "VT_SI_PRI_V4.xlsx" + - "VT_SI_TRA_V4.xlsx" + - "BY_Trans.xlsx" + - "SuppXLS/Trades/ScenTrade__Trade_Links.xlsx" + - "SubRES_TMPL/SubRES_NewTech_ELC_KEA.xlsx" + - "SubRES_TMPL/SubRES_NewTechs_Industry.xlsx" + - "SubRES_TMPL/SubRES_NewTechs_Industry_Trans.xlsx" + - "SubRES_TMPL/SubRES_NewTransport-KEA.xlsx" + - "SubRES_TMPL/SubRES_NewTech_AGR_KEA.xlsx" + - "SubRES_TMPL/SubRES_NewTech_AGR_KEA_Trans.xlsx" + - "SubRES_TMPL/SubRES_NewTech_RC.xlsx" + - "SuppXLS/Trades/ScenTrade_TRADE_PARMS.xlsx" + - "SysSettings.xlsx" + - "SuppXLS/Scen_Base_constraints.xlsx" + - 
"SuppXLS/Scen_RE_Potentials.xlsx" + - "SuppXLS/Scen_LoadCurve_COM-FR.xlsx" + - "SuppXLS/Scen_AF_Renewable.xlsx" + - "SuppXLS/Scen_WEM_WCM.xlsx" + - "SuppXLS/Scen_Cohesive.xlsx" + dd_folder: TIMES-NZ-KEA + dd_files: + - "base.dd" + - "newtech_elc_kea.dd" + - "newtechs_industry.dd" + - "newtransport-kea.dd" + - "newtech_agr_kea.dd" + - "newtech_rc.dd" + - "trade_parms.dd" + - "syssettings.dd" + - "base_constraints.dd" + - "re_potentials.dd" + - "loadcurve_com-fr.dd" + - "af_renewable.dd" + - "wem_wcm.dd" + - "cohesive.dd" + - name: TIMES-NZ-TUI + input_folder: TIMES-NZ + inputs: + - "VT_NI_ELC_V4.xlsx" + - "VT_NI_IND_V2.xlsx" + - "VT_NI_OTH_V4.xlsx" + - "VT_NI_PRI_V4.xlsx" + - "VT_NI_TRA_V4.xlsx" + - "VT_SI_ELC_V4.xlsx" + - "VT_SI_IND_V2.xlsx" + - "VT_SI_OTH_V4.xlsx" + - "VT_SI_PRI_V4.xlsx" + - "VT_SI_TRA_V4.xlsx" + - "BY_Trans.xlsx" + - "SuppXLS/Trades/ScenTrade__Trade_Links.xlsx" + - "SubRES_TMPL/SubRES_NewTech_ELC_TUI.xlsx" + - "SubRES_TMPL/SubRES_NewTechs_Industry.xlsx" + - "SubRES_TMPL/SubRES_NewTechs_Industry_Trans.xlsx" + - "SubRES_TMPL/SubRES_NewTransport-TUI.xlsx" + - "SubRES_TMPL/SubRES_NewTech_AGR_TUI.xlsx" + - "SubRES_TMPL/SubRES_NewTech_AGR_TUI_Trans.xlsx" + - "SubRES_TMPL/SubRES_NewTech_RC.xlsx" + - "SuppXLS/Trades/ScenTrade_TRADE_PARMS.xlsx" + - "SysSettings.xlsx" + - "SuppXLS/Scen_Base_constraints.xlsx" + - "SuppXLS/Scen_RE_Potentials.xlsx" + - "SuppXLS/Scen_LoadCurve_COM-FR.xlsx" + - "SuppXLS/Scen_AF_Renewable.xlsx" + - "SuppXLS/Scen_WEM_WCM.xlsx" + - "SuppXLS/Scen_Individualistic.xlsx" + dd_folder: TIMES-NZ-TUI + dd_files: + - "base.dd" + - "newtechs_industry.dd" + - "newtech_elc_tui.dd" + - "newtransport-tui.dd" + - "newtech_agr_tui.dd" + - "newtech_rc.dd" + - "trade_parms.dd" + - "syssettings.dd" + - "base_constraints.dd" + - "re_potentials.dd" + - "loadcurve_com-fr.dd" + - "af_renewable.dd" + - "wem_wcm.dd" + - "individualistic.dd" diff --git a/setup-benchmarks.sh b/setup-benchmarks.sh new file mode 100755 index 00000000..e684223c --- /dev/null 
+++ b/setup-benchmarks.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# A helper script to setup or update the repositories containing benchmark models under `benchmarks/` + +set -eo pipefail + +# Commit SHA for each repository: +REF_TIMES_model="b488fb07f0899ee8b7e710c230b1a9414fa06f7d" +REF_demos_xlsx="34a2a5c044cc0bbea1357de50db2f5f02d575181" +REF_demos_dd="2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86" +REF_tim="e820d8002adc6b1526a3bffcc439219b28d0eed5" +REF_tim_gams="703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc" +REF_TIMES_NZ="c83f2d0e51d692cba27a55032c8f8a2a48e4d425" + +# If no GitHub token is provided, try to clone using SSH +if [ -z "$GH_PAT_DEMOS_XLSX" ]; then + echo "Warning: no GitHub token provided, will try to clone private repos using SSH" + use_SSH=1 +fi + +# Move to the directory containing this script +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "$SCRIPT_DIR" + +mkdir -p benchmarks + +# Function to check out a repository at a specified commit +checkout_repo() { + local repo=$1 + local dest_dir=$2 + local commit=$3 + local private=$4 + + if [ -d "$dest_dir" ]; then + echo "Directory $dest_dir already exists. Checking if on correct commit." + pushd "$dest_dir" > /dev/null + git fetch --depth=1 origin "$commit" + else + echo "Directory $dest_dir does not exist. Cloning repository." 
+ if [ -n "$private" ]; then + if [ -n "$use_SSH" ]; then + repo_url="git@github.com:${repo}.git" + else + repo_url="https://$GH_PAT_DEMOS_XLSX@github.com/${repo}/" + fi + else + repo_url="https://github.com/${repo}/" + fi + git clone --filter=blob:none "$repo_url" "$dest_dir" + pushd "$dest_dir" > /dev/null + fi + git checkout "$commit" || exit 1 + popd > /dev/null + echo "$dest_dir: successfully checked out $repo at $commit" +} + +# Array of repositories to check out, in the form repo|dest_dir|commit|private +repositories=( + "etsap-TIMES/TIMES_model|TIMES_model|$REF_TIMES_model" + "olejandro/demos-dd|benchmarks/dd|$REF_demos_dd" + "olejandro/demos-xlsx|benchmarks/xlsx|$REF_demos_xlsx|true" + "esma-cgep/tim|benchmarks/xlsx/Ireland|$REF_tim" + "esma-cgep/tim-gams|benchmarks/dd/Ireland|$REF_tim_gams" + "olejandro/TIMES-NZ-Model-Files|benchmarks/TIMES-NZ|$REF_TIMES_NZ" +) + +# Setup / update the repositories +for repo_info in "${repositories[@]}"; do + IFS='|' read -r repo dest_dir commit private <<< "$repo_info" + checkout_repo "$repo" "$dest_dir" "$commit" "$private" +done + +# Create symlinks for TIMES-NZ since xlsx & dd files are in same repo (-f/-n replace an existing link in place, so re-running this script to update the repos neither fails under `set -e` nor nests a new link inside the link target) +ln -sfn "$SCRIPT_DIR/benchmarks/TIMES-NZ/TIMES-NZ" "$SCRIPT_DIR/benchmarks/xlsx/TIMES-NZ" +ln -sfn "$SCRIPT_DIR/benchmarks/TIMES-NZ/TIMES-NZ-GAMS/times_scenarios/kea-v2_1_3" "$SCRIPT_DIR/benchmarks/dd/TIMES-NZ-KEA" +ln -sfn "$SCRIPT_DIR/benchmarks/TIMES-NZ/TIMES-NZ-GAMS/times_scenarios/tui-v2_1_3" "$SCRIPT_DIR/benchmarks/dd/TIMES-NZ-TUI" + +echo "All benchmark repositories are set up and up to date :)" diff --git a/xl2times/__main__.py b/xl2times/__main__.py index 5f8a28ab..da3f3d56 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -43,8 +43,8 @@ def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]: """Extract EmbeddedXlTables from xlsx file (cached). Since excel.extract_tables is quite slow, we cache its results in `cache_dir`. 
- Each file is named by the hash of the contents of an xlsx file, and contains - a tuple (filename, modified timestamp, [EmbeddedXlTable]). + Each cache file is named {filename}_{hash}.pkl, and contains a pickled + `[EmbeddedXlTable]`. Args: filename: Path to the xlsx file to extract tables from.