From 9c36c37ec010962ff618f1a0c73a7dbf77ef70b2 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 15 Feb 2024 15:39:32 +1100
Subject: [PATCH] Feature/benchmarks on windows (#184)

This PR contains the changes necessary to run the benchmarks locally on a Windows machine. Main changes:

- Introduced the `shell=True` param to subprocess calls. Without it, the virtualenv was not accessible to the subprocesses, which would fail with errors such as `ModuleNotFoundError: No module named 'pandas'`.
- Python subprocesses on Windows are heavyweight and use far more RAM than on Linux, so I've added a variable in `utils` that limits the `max_workers` param of the `ProcessPoolExecutor` constructors to a small number (previously it defaulted to the number of CPUs). Without this limit, the pool-inside-a-pool structure of the benchmarks was spawning 32*32 python.exe processes and using all of my 64 GB of RAM within a few seconds. (A minimal standalone sketch of this cap is appended after the diff.)
- Added a `--debug` switch to `run_benchmarks` to run each benchmark as a function call, allowing the use of debuggers/breakpoints.
- CI now runs CSV-only regression tests if the GAMS license isn't in the repository secrets.
---
 .github/workflows/ci.yml |  30 ++++++++--
 .gitignore               |   4 +-
 README.md                |  40 +++++++++++++
 pyproject.toml           |   3 +-
 utils/__init__.py        |   0
 utils/dd_to_csv.py       |   8 ++-
 utils/run_benchmarks.py  | 108 ++++++++++++++++++++++------------
 xl2times/__main__.py     | 123 +++++++++++++++++++++++++--------------
 xl2times/transforms.py   |   4 +-
 xl2times/utils.py        |   5 ++
 10 files changed, 234 insertions(+), 91 deletions(-)
 create mode 100644 utils/__init__.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c7a3b09..7f42d5f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,9 +3,9 @@ name: CI
 on:
   # Triggers the workflow on push or pull request events but only for the main branch
   push:
-    branches: [main]
+    branches: [ main ]
   pull_request:
-    branches: [main]
+    branches: [ main ]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -69,6 +69,9 @@ jobs:
 
       # ---------- Install GAMS
       - name: Install GAMS
+        env:
+          GAMS_LICENSE: ${{ secrets.GAMS_LICENSE }}
+        if: ${{ env.GAMS_LICENSE != '' }}
        run: |
           curl https://d37drm4t2jghv5.cloudfront.net/distributions/44.1.0/linux/linux_x64_64_sfx.exe -o linux_x64_64_sfx.exe
           chmod +x linux_x64_64_sfx.exe
@@ -81,17 +84,18 @@
           mkdir -p $HOME/.local/share/GAMS
           echo "$GAMS_LICENSE" > $HOME/.local/share/GAMS/gamslice.txt
           ls -l $HOME/.local/share/GAMS/
-        env:
-          GAMS_LICENSE: ${{ secrets.GAMS_LICENSE }}
+
 
       # ---------- Run tool, check for regressions
       - name: Run tool on all benchmarks
+        env:
+          GAMS_LICENSE: ${{ secrets.GAMS_LICENSE }}
+        if: ${{ env.GAMS_LICENSE != '' }}
         working-directory: xl2times
         # Use tee to also save the output to out.txt so that the summary table can be
         # printed again in the next step.
         # Save the return code to retcode.txt so that the next step can fail the action
-        # if run_benchmarks.py failed.
         run: |
           source .venv/bin/activate
           export PATH=$PATH:$GITHUB_WORKSPACE/GAMS/gams44.1_linux_x64_64_sfx
           (python utils/run_benchmarks.py benchmarks.yml \
               --times_dir $GITHUB_WORKSPACE/TIMES_model \
               --dd --verbose \
               | tee out.txt; \
               echo ${PIPESTATUS[0]} > retcode.txt)
 
@@ -101,6 +105,22 @@
+      - name: Run CSV-only regression tests (no GAMS license)
+        env:
+          GAMS_LICENSE: ${{ secrets.GAMS_LICENSE }}
+        if: ${{ env.GAMS_LICENSE == '' }}
+        working-directory: xl2times
+        # Run without the --dd flag if the GAMS license secret doesn't exist.
+        # Useful for testing for (CSV) regressions in forks before creating PRs.
+        run: |
+          source .venv/bin/activate
+          export PATH=$PATH:$GITHUB_WORKSPACE/GAMS/gams44.1_linux_x64_64_sfx
+          (python utils/run_benchmarks.py benchmarks.yml \
+              --times_dir $GITHUB_WORKSPACE/TIMES_model \
+              --verbose \
+              | tee out.txt; \
+              echo ${PIPESTATUS[0]} > retcode.txt)
+
       - name: Print summary
         working-directory: xl2times
         run: |
diff --git a/.gitignore b/.gitignore
index a707f93..5b52706 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,9 @@ ground_truth/*
 *.pyproj.*
 speedscope.json
 *.pkl
-.venv/
+.venv*/
 benchmarks/
+.idea/
+.python-version
 docs/_build/
 docs/api/
diff --git a/README.md b/README.md
index ee2c1e7..df36800 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,46 @@ git commit --no-verify
 See our GitHub Actions CI `.github/workflows/ci.yml` and the utility script
 `utils/run_benchmarks.py` to see how to run the tool on the DemoS models.
 
+In short, use the commands below to clone the benchmarks data into your local `benchmarks` dir.
+Note that this assumes you have access to all of these repositories (some are private and
+you'll have to request access); if not, comment out the inaccessible benchmarks from `benchmarks.yml` before running.
+
+```bash
+mkdir benchmarks
+# Get VEDA example models and reference DD files
+# XLSX files are in a private repo for licensing reasons; please request access or replace them with your own licensed VEDA example files.
+git clone git@github.com:olejandro/demos-xlsx.git benchmarks/xlsx/
+git clone git@github.com:olejandro/demos-dd.git benchmarks/dd/
+
+# Get Ireland model and reference DD files
+git clone git@github.com:esma-cgep/tim.git benchmarks/xlsx/Ireland
+git clone git@github.com:esma-cgep/tim-gams.git benchmarks/dd/Ireland
+```
+Then to run the benchmarks:
+```bash
+# Run only a single benchmark by name (see benchmarks.yml for the list of names)
+python utils/run_benchmarks.py benchmarks.yml --verbose --run DemoS_001-all | tee out.txt
+
+# Run all benchmarks (without running GAMS, just comparing CSV data)
+python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt
+
+
+# Run benchmarks with regression tests vs the main branch
+git checkout -b feature/your_new_changes
+# ... make your code changes here ...
+git commit -a -m "your commit message" # code must be committed for the comparison to the `main` branch to run.
+python utils/run_benchmarks.py benchmarks.yml --verbose | tee out.txt
+```
+At this point, if you haven't broken anything, you should see something like:
+```
+Change in runtime: +2.97s
+Change in correct rows: +0
+Change in additional rows: +0
+No regressions. You're awesome!
+```
+If you see a large increase in runtime, a decrease in correct rows, or an increase in additional rows, then you've broken something and will need to figure out how to fix it.
+
+
 ### Debugging Regressions
 
 If your change is causing regressions on one of the benchmarks, a useful way to debug
 and find the difference is to run the tool in verbose mode and compare the intermediate tables.
For example, if your branch has regressions on Demo 1: diff --git a/pyproject.toml b/pyproject.toml index 3b678e1..484e835 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0.0", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools] -packages = ["xl2times"] +packages = ["xl2times", "utils"] [project] name = "xl2times" @@ -24,6 +24,7 @@ dependencies = [ "more-itertools", "openpyxl >= 3.0, < 3.1", "pandas >= 2.1", + "pyarrow" ] [project.optional-dependencies] diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/dd_to_csv.py b/utils/dd_to_csv.py index df444e7..5de10fd 100644 --- a/utils/dd_to_csv.py +++ b/utils/dd_to_csv.py @@ -216,7 +216,7 @@ def convert_dd_to_tabular( return -if __name__ == "__main__": +def main(arg_list: None | list[str] = None): args_parser = argparse.ArgumentParser() args_parser.add_argument( "input_dir", type=str, help="Input directory containing .dd files." @@ -224,5 +224,9 @@ def convert_dd_to_tabular( args_parser.add_argument( "output_dir", type=str, help="Output directory to save the .csv files in." ) - args = args_parser.parse_args() + args = args_parser.parse_args(arg_list) convert_dd_to_tabular(args.input_dir, args.output_dir, generate_headers_by_attr()) + + +if __name__ == "__main__": + main() diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index f2a9a91..5ae8876 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -1,4 +1,6 @@ import argparse +import os +from collections import namedtuple from concurrent.futures import ProcessPoolExecutor from functools import partial import git @@ -13,6 +15,8 @@ from typing import Any, Tuple import yaml +from xl2times.utils import max_workers + def parse_result(lastline): m = match( @@ -22,7 +26,7 @@ def parse_result(lastline): ) if not m: print(f"ERROR: could not parse output of run:\n{lastline}") - sys.exit(1) + sys.exit(2) # return (accuracy, num_correct_rows, num_additional_rows) return (float(m.groups()[0]), int(m.groups()[1]), int(m.groups()[3])) @@ -58,7 +62,7 @@ def run_gams_gdxdiff( print(res.stdout) print(res.stderr if res.stderr is not None else "") print(f"ERROR: GAMS failed on {benchmark['name']}") - sys.exit(1) + sys.exit(3) if "error" in res.stdout.lower(): print(res.stdout) print(f"ERROR: GAMS errored on {benchmark['name']}") @@ -89,7 +93,7 @@ def run_gams_gdxdiff( print(res.stdout) print(res.stderr if res.stderr is not None else "") print(f"ERROR: GAMS failed on {benchmark['name']} ground truth") - sys.exit(1) + sys.exit(4) if "error" in res.stdout.lower(): print(res.stdout) print(f"ERROR: GAMS errored on {benchmark['name']}") @@ -125,6 +129,7 @@ def run_benchmark( skip_csv: bool = False, out_folder: str = "out", verbose: bool = False, + debug: bool = False, ) -> Tuple[str, float, str, float, int, int]: xl_folder = path.join(benchmarks_folder, "xlsx", benchmark["input_folder"]) dd_folder = path.join(benchmarks_folder, "dd", benchmark["dd_folder"]) @@ -134,26 +139,33 @@ def run_benchmark( # First convert ground truth DD to csv if not skip_csv: shutil.rmtree(csv_folder, ignore_errors=True) - res = subprocess.run( - [ - "python", - "utils/dd_to_csv.py", - dd_folder, - csv_folder, - ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - if res.returncode != 0: - # Remove partial outputs - shutil.rmtree(csv_folder, ignore_errors=True) - print(res.stdout) - print(f"ERROR: dd_to_csv failed on {benchmark['name']}") - sys.exit(1) + if os.name != "nt": 
+ res = subprocess.run( + [ + "python", + "utils/dd_to_csv.py", + dd_folder, + csv_folder, + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + if res.returncode != 0: + # Remove partial outputs + shutil.rmtree(csv_folder, ignore_errors=True) + print(res.stdout) + print(f"ERROR: dd_to_csv failed on {benchmark['name']}") + sys.exit(5) + else: + # If debug option is set, run as a function call to allow stepping with a debugger. + from utils.dd_to_csv import main + + main([dd_folder, csv_folder]) + elif not path.exists(csv_folder): print(f"ERROR: --skip_csv is true but {csv_folder} does not exist") - sys.exit(1) + sys.exit(6) # Then run the tool args = [ @@ -170,12 +182,23 @@ def run_benchmark( else: args.append(xl_folder) start = time.time() - res = subprocess.run( - ["xl2times"] + args, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) + res = None + if not debug: + res = subprocess.run( + ["xl2times"] + args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + else: + # If debug option is set, run as a function call to allow stepping with a debugger. + from xl2times.__main__ import run, parse_args + + summary = run(parse_args(args)) + + # pack the results into a namedtuple pretending to be a return value from a subprocess call (as above). + res = namedtuple("stdout", ["stdout", "stderr", "returncode"])(summary, "", 0) + runtime = time.time() - start if verbose: @@ -188,7 +211,7 @@ def run_benchmark( if res.returncode != 0: print(res.stdout) print(f"ERROR: tool failed on {benchmark['name']}") - sys.exit(1) + sys.exit(7) with open(path.join(out_folder, "stdout"), "w") as f: f.write(res.stdout) @@ -211,6 +234,7 @@ def run_all_benchmarks( skip_main=False, skip_regression=False, verbose=False, + debug: bool = False, ): print("Running benchmarks", end="", flush=True) headers = ["Benchmark", "Time (s)", "GDX Diff", "Accuracy", "Correct", "Additional"] @@ -221,9 +245,10 @@ def run_all_benchmarks( skip_csv=skip_csv, run_gams=run_gams, verbose=verbose, + debug=debug, ) - with ProcessPoolExecutor() as executor: + with ProcessPoolExecutor(max_workers=max_workers) as executor: results = list(executor.map(run_a_benchmark, benchmarks)) print("\n\n" + tabulate(results, headers, floatfmt=".1f") + "\n") @@ -264,7 +289,7 @@ def run_all_benchmarks( else: if repo.is_dirty(): print("Your working directory is not clean. 
Skipping regression tests.") - sys.exit(1) + sys.exit(8) # Re-run benchmarks on main repo.heads.main.checkout() @@ -277,9 +302,10 @@ def run_all_benchmarks( run_gams=run_gams, out_folder="out-main", verbose=verbose, + debug=debug, ) - with ProcessPoolExecutor() as executor: + with ProcessPoolExecutor(max_workers) as executor: results_main = list(executor.map(run_a_benchmark, benchmarks)) # Print table with combined results to make comparison easier @@ -310,13 +336,13 @@ def run_all_benchmarks( ) if df.isna().values.any(): print(f"ERROR: number of benchmarks changed:\n{df}") - sys.exit(1) + sys.exit(9) accu_regressions = df[df["Correct"] < df["M Correct"]]["Benchmark"] addi_regressions = df[df["Additional"] > df["M Additional"]]["Benchmark"] time_regressions = df[df["Time (s)"] > 2 * df["M Time (s)"]]["Benchmark"] runtime_change = df["Time (s)"].sum() - df["M Time (s)"].sum() - print(f"Change in runtime: {runtime_change:+.2f}") + print(f"Change in runtime: {runtime_change:+.2f} s") correct_change = df["Correct"].sum() - df["M Correct"].sum() print(f"Change in correct rows: {correct_change:+d}") additional_change = df["Additional"].sum() - df["M Additional"].sum() @@ -330,7 +356,7 @@ def run_all_benchmarks( print(f"ERROR: additional rows regressed on: {', '.join(addi_regressions)}") if not time_regressions.empty: print(f"ERROR: runtime regressed on: {', '.join(time_regressions)}") - sys.exit(1) + sys.exit(10) # TODO also check if any new tables are missing? print("No regressions. You're awesome!") @@ -385,6 +411,12 @@ def run_all_benchmarks( default=False, help="Print output of run on each benchmark", ) + args_parser.add_argument( + "--debug", + action="store_true", + default=False, + help="Run each benchmark as a function call to allow a debugger to stop at breakpoints in benchmark runs.", + ) args = args_parser.parse_args() spec = yaml.safe_load(open(args.benchmarks_yaml)) @@ -392,17 +424,17 @@ def run_all_benchmarks( benchmark_names = [b["name"] for b in spec["benchmarks"]] if len(set(benchmark_names)) != len(benchmark_names): print("ERROR: Found duplicate name in benchmarks YAML file") - sys.exit(1) + sys.exit(11) if args.dd and args.times_dir is None: print("ERROR: --times_dir is required when using --dd") - sys.exit(1) + sys.exit(12) if args.run is not None: benchmark = next((b for b in spec["benchmarks"] if b["name"] == args.run), None) if benchmark is None: print(f"ERROR: could not find {args.run} in {args.benchmarks_yaml}") - sys.exit(1) + sys.exit(13) _, runtime, gms, acc, cor, add = run_benchmark( benchmark, @@ -411,6 +443,7 @@ def run_all_benchmarks( run_gams=args.dd, skip_csv=args.skip_csv, verbose=args.verbose, + debug=args.debug, ) print( f"Ran {args.run} in {runtime:.2f}s. {acc}% ({cor} correct, {add} additional).\n" @@ -426,4 +459,5 @@ def run_all_benchmarks( skip_main=args.skip_main, skip_regression=args.skip_regression, verbose=args.verbose, + debug=args.debug, ) diff --git a/xl2times/__main__.py b/xl2times/__main__.py index d5598f1..5f1f3f7 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -8,6 +8,8 @@ import sys import time from typing import Dict, List + +from xl2times.utils import max_workers from . import datatypes from . import excel from . 
import transforms @@ -31,7 +33,7 @@ def convert_xl_to_times( use_pool = True if use_pool: - with ProcessPoolExecutor() as executor: + with ProcessPoolExecutor(max_workers) as executor: for result in executor.map(excel.extract_tables, input_files): raw_tables.extend(result) else: @@ -117,7 +119,7 @@ def convert_xl_to_times( end_time = time.time() sep = "\n\n" + "=" * 80 + "\n" if verbose else "" print( - f"{sep}transform {transform.__code__.co_name} took {end_time-start_time:.2f} seconds" + f"{sep}transform {transform.__code__.co_name} took {end_time - start_time:.2f} seconds" ) if verbose: if isinstance(output, list): @@ -160,7 +162,7 @@ def read_csv_tables(input_dir: str) -> Dict[str, DataFrame]: def compare( data: Dict[str, DataFrame], ground_truth: Dict[str, DataFrame], output_dir: str -): +) -> str: print( f"Ground truth contains {len(ground_truth)} tables," f" {sum(df.shape[0] for _, df in ground_truth.items())} rows" @@ -223,13 +225,15 @@ def compare( os.path.join(output_dir, table_name + "_missing.csv"), index=False, ) - - print( + result = ( f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present" f" in output ({total_correct_rows}/{total_gt_rows})" f", {total_additional_rows} additional rows" ) + print(result) + return result + def produce_times_tables( config: datatypes.Config, input: Dict[str, DataFrame] @@ -385,42 +389,14 @@ def dump_tables(tables: List, filename: str) -> List: return tables -def main(): - args_parser = argparse.ArgumentParser() - args_parser.add_argument( - "input", - nargs="*", - help="Either an input directory, or a list of input xlsx/xlsm files to process", - ) - args_parser.add_argument( - "--regions", - type=str, - default="", - help="Comma-separated list of regions to include in the model", - ) - args_parser.add_argument( - "--output_dir", type=str, default="output", help="Output directory" - ) - args_parser.add_argument( - "--ground_truth_dir", - type=str, - help="Ground truth directory to compare with output", - ) - args_parser.add_argument("--dd", action="store_true", help="Output DD files") - args_parser.add_argument( - "--only_read", - action="store_true", - help="Read xlsx/xlsm files and stop after outputting raw_tables.txt", - ) - args_parser.add_argument("--use_pkl", action="store_true") - args_parser.add_argument( - "-v", - "--verbose", - action="store_true", - help="Verbose mode: print tables after every transform", - ) - args = args_parser.parse_args() - +def run(args) -> str | None: + """ + Runs the xl2times conversion. + Args: + args: pre-parsed command line arguments + Returns: + comparison with ground-truth string if `ground_truth_dir` is provided, else None. + """ config = datatypes.Config( "times_mapping.txt", "times-info.json", @@ -433,7 +409,7 @@ def main(): if not isinstance(args.input, list) or len(args.input) < 1: print(f"ERROR: expected at least 1 input. Got {args.input}") - sys.exit(1) + sys.exit(-1) elif len(args.input) == 1: assert os.path.isdir(args.input[0]) input_files = [ @@ -468,8 +444,67 @@ def main(): if args.ground_truth_dir: ground_truth = read_csv_tables(args.ground_truth_dir) - compare(tables, ground_truth, args.output_dir) + comparison = compare(tables, ground_truth, args.output_dir) + return comparison + else: + return None + + +def parse_args(arg_list: None | list[str]) -> argparse.Namespace: + """Parses command line arguments. + + Args: + arg_list: List of command line arguments. Uses sys.argv (default argparse behaviour) if `None`. + + Returns: + argparse.Namespace: Parsed arguments. 
+ """ + args_parser = argparse.ArgumentParser() + args_parser.add_argument( + "input", + nargs="*", + help="Either an input directory, or a list of input xlsx/xlsm files to process", + ) + args_parser.add_argument( + "--regions", + type=str, + default="", + help="Comma-separated list of regions to include in the model", + ) + args_parser.add_argument( + "--output_dir", type=str, default="output", help="Output directory" + ) + args_parser.add_argument( + "--ground_truth_dir", + type=str, + help="Ground truth directory to compare with output", + ) + args_parser.add_argument("--dd", action="store_true", help="Output DD files") + args_parser.add_argument( + "--only_read", + action="store_true", + help="Read xlsx/xlsm files and stop after outputting raw_tables.txt", + ) + args_parser.add_argument("--use_pkl", action="store_true") + args_parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Verbose mode: print tables after every transform", + ) + args = args_parser.parse_args(arg_list) + return args + + +def main(arg_list: None | list[str] = None) -> None: + """Main entry point for the xl2times package + Returns: + None. + """ + args = parse_args(arg_list) + run(args) if __name__ == "__main__": - main() + main(sys.argv[1:]) + sys.exit(0) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 339bd07..984701f 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -10,6 +10,8 @@ from concurrent.futures import ProcessPoolExecutor import time from functools import reduce + +from .utils import max_workers from . import datatypes from . import utils @@ -2488,5 +2490,5 @@ def expand_rows_parallel( tables: List[datatypes.EmbeddedXlTable], model: datatypes.TimesModel, ) -> List[datatypes.EmbeddedXlTable]: - with ProcessPoolExecutor() as executor: + with ProcessPoolExecutor(max_workers) as executor: return list(executor.map(expand_rows, tables)) diff --git a/xl2times/utils.py b/xl2times/utils.py index 033ec9b..bec4cb9 100644 --- a/xl2times/utils.py +++ b/xl2times/utils.py @@ -1,3 +1,4 @@ +import os import re from dataclasses import replace from math import log10, floor @@ -10,6 +11,10 @@ from . import datatypes +# prevent excessive number of processes in Windows and high cpu-count machines +# TODO make this a cli param or global setting? +max_workers: int = 4 if os.name == "nt" else min(16, os.cpu_count() or 16) + def apply_composite_tag(table: datatypes.EmbeddedXlTable) -> datatypes.EmbeddedXlTable: """