Merge branch 'main' into olex/clean-up
olejandro authored Mar 23, 2024
2 parents 893dc62 + bf5ea29 · commit bfe04eb
Showing 7 changed files with 69 additions and 43 deletions.
10 changes: 6 additions & 4 deletions .github/workflows/ci.yml
@@ -21,6 +21,7 @@ jobs:
REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
CACHE_KEY: 0 # Use this for manual cache key bumps, e.g., when caching code changes

steps:
- uses: actions/checkout@v3
@@ -115,11 +116,12 @@ jobs:
id: cache
uses: actions/cache/restore@v4
with:
path: ${{ github.workspace }}/xl2times/xl2times/.cache
path: ~/.cache/xl2times
# Cache key is refs of the input xlsx repos, since that's what is cached
key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
# If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
restore-keys: |
${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-
${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
${{ runner.os }}-py-${{ env.PY_VERSION }}-
@@ -166,5 +168,5 @@ jobs:
# Save the cache even if the regression tests fail
if: always() && !steps.cache-restore.outputs.cache-hit
with:
path: ${{ github.workspace }}/xl2times/xl2times/.cache
key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
path: ~/.cache/xl2times
key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
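Note on the cache key changes above: the exact key now ends in the manual `CACHE_KEY` bump, and the `restore-keys` list gives the job a graceful fallback when no entry matches exactly. A rough Python sketch of that lookup order, with purely illustrative key names (actions/cache tries the exact key first, then each restore-key as a prefix, most recent match winning):

```python
# Rough sketch of the actions/cache lookup order: try the exact key, then each
# restore-key as a prefix match against existing entries ("newest match" is
# simplified here). All key strings below are illustrative placeholders.
def resolve_cache(exact_key: str, restore_keys: list[str], existing: list[str]) -> str | None:
    if exact_key in existing:
        return exact_key
    for prefix in restore_keys:
        matches = [k for k in existing if k.startswith(prefix)]
        if matches:
            return matches[-1]  # stands in for "most recently created"
    return None


existing = ["Linux-py-3.11-<demos-ref>-<old-tim-ref>-0"]
hit = resolve_cache(
    "Linux-py-3.11-<demos-ref>-<new-tim-ref>-0",
    [
        "Linux-py-3.11-<demos-ref>-<new-tim-ref>-",
        "Linux-py-3.11-<demos-ref>-",
        "Linux-py-3.11-",
    ],
    existing,
)
print(hit)  # falls back to the entry matching on the demos ref alone
```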
1 change: 1 addition & 0 deletions pyproject.toml
@@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times"

[project.scripts]
xl2times = "xl2times.__main__:main"
dd_to_csv = "xl2times.dd_to_csv:main"

[tool.pytest.ini_options]
# don't print runtime warnings
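Note on the `[project.scripts]` addition above: installers will now generate a `dd_to_csv` command that imports `xl2times.dd_to_csv` and calls its `main`. A minimal sketch of how a `module:attr` console-script spec resolves (illustrative only, not the actual wrapper pip writes):

```python
# Minimal sketch of resolving a "module:attr" console-script spec; the wrapper
# that pip generates for `dd_to_csv` does effectively this.
from importlib import import_module


def load_entry_point(spec: str):
    module_name, _, attr = spec.partition(":")
    return getattr(import_module(module_name), attr)


if __name__ == "__main__":
    # Equivalent to running the installed `dd_to_csv` command; assumes xl2times
    # is installed in the current environment.
    load_entry_point("xl2times.dd_to_csv:main")()
```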
6 changes: 4 additions & 2 deletions utils/run_benchmarks.py
@@ -14,11 +14,11 @@
import git
import pandas as pd
import yaml
from dd_to_csv import main
from tabulate import tabulate

from xl2times import utils
from xl2times.__main__ import parse_args, run
from xl2times.dd_to_csv import main
from xl2times.utils import max_workers

logger = utils.get_logger()
@@ -155,7 +155,7 @@ def run_benchmark(
res = subprocess.run(
[
"python",
"utils/dd_to_csv.py",
"xl2times/dd_to_csv.py",
dd_folder,
csv_folder,
],
@@ -164,6 +164,7 @@
text=True,
shell=True if os.name == "nt" else False,
check=False,
encoding="utf-8",
)
if res.returncode != 0:
# Remove partial outputs
@@ -208,6 +209,7 @@ def run_benchmark(
stderr=subprocess.STDOUT,
text=True,
check=False,
encoding="utf-8",
)
else:
# If debug option is set, run as a function call to allow stepping with a debugger.
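Note on the `encoding="utf-8"` arguments added above: with `text=True` alone, `subprocess` decodes child output using the locale's preferred encoding (often cp1252 on Windows), which can break on non-ASCII output. A self-contained sketch of the pattern, with an illustrative child command:

```python
# With text=True alone, subprocess decodes output via the locale encoding;
# pinning encoding="utf-8" (and asking the child to emit UTF-8) keeps the round
# trip deterministic across platforms. The child command here is illustrative.
import os
import subprocess
import sys

env = {**os.environ, "PYTHONIOENCODING": "utf-8"}
res = subprocess.run(
    [sys.executable, "-c", "print('\u2713 benchmark output')"],
    capture_output=True,
    text=True,
    encoding="utf-8",
    env=env,
    check=False,
)
print(res.stdout, end="")
```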
70 changes: 46 additions & 24 deletions xl2times/__main__.py
@@ -5,13 +5,13 @@
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame

from xl2times import __file__ as xl2times_file_path
from xl2times.utils import max_workers

from . import excel, transforms, utils
@@ -20,33 +20,54 @@
logger = utils.get_logger()


cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
os.makedirs(cache_dir, exist_ok=True)
cache_dir = Path.home() / ".cache/xl2times/"
cache_dir.mkdir(exist_ok=True, parents=True)


def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
def invalidate_cache(max_age: timedelta = timedelta(days=365)):
"""
Delete any cache files older than max_age.
Args:
max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted.
"""
for file in cache_dir.glob("*.pkl"):
if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
try:
file.unlink()
except Exception as e:
logger.warning(f"Failed to delete old cache file {file}. {e}")


def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]:
"""Extract EmbeddedXlTables from xlsx file (cached).
Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
Each file is named by the hash of the contents of an xlsx file, and contains
a tuple (filename, modified timestamp, [EmbeddedXlTable]).
Args:
filename: Path to the xlsx file to extract tables from.
"""
with open(filename, "rb") as f:
filename = Path(filename).resolve()
with filename.open("rb") as f:
digest = hashlib.file_digest(f, "sha256") # pyright: ignore
hsh = digest.hexdigest()
if os.path.isfile(cache_dir + hsh):
fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
# In the extremely unlikely event that we have a hash collision, also check that
# the filename is the same:
# TODO check modified timestamp also matches
if filename == fname1:
logger.info(f"Using cached data for {filename} from {cache_dir + hsh}")
return tables
# Write extracted data to cache:
tables = excel.extract_tables(filename)
pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb"))
logger.info(f"Saved cache for {filename} to {cache_dir + hsh}")
return excel.extract_tables(filename)
cached_file = (cache_dir / f"{Path(filename).stem}_{hsh}.pkl").resolve()

if cached_file.exists():
# just load and return the cached pickle
with cached_file.open("rb") as f:
tables = pickle.load(f)
logger.info(f"Using cached data for {filename} from {cached_file}")
else:
# extract data and write it to cache before returning it
tables = excel.extract_tables(str(filename))
with cached_file.open("wb") as f:
pickle.dump(tables, f)
logger.info(f"Saved cache for {filename} to {cached_file}")

return tables


def convert_xl_to_times(
Expand All @@ -59,6 +80,8 @@ def convert_xl_to_times(
stop_after_read: bool = False,
) -> dict[str, DataFrame]:
start_time = datetime.now()

invalidate_cache()
with ProcessPoolExecutor(max_workers) as executor:
raw_tables = executor.map(
excel.extract_tables if no_cache else _read_xlsx_cached, input_files
@@ -180,10 +203,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str):

def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
result = {}
for filename in os.listdir(input_dir):
result[filename.split(".")[0]] = pd.read_csv(
os.path.join(input_dir, filename), dtype=str
)
csv_files = list(Path(input_dir).glob("*.csv"))
for filename in csv_files:
result[filename.stem] = pd.read_csv(filename, dtype=str)
return result


@@ -253,7 +275,7 @@ def compare(
index=False,
)
result = (
f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present"
f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present"
f" in output ({total_correct_rows}/{total_gt_rows})"
f", {total_additional_rows} additional rows"
)
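Note on the reworked caching above: each workbook's extracted tables are now pickled under `~/.cache/xl2times` in a file named by the workbook stem plus the SHA-256 of its contents, so any edit to the file misses the cache and forces re-extraction. A minimal sketch of just the naming scheme (`hashlib.file_digest` needs Python 3.11+; the fallback branch is an assumption for older interpreters):

```python
# Minimal sketch of the content-addressed cache filename used above:
# <workbook stem>_<sha256 of contents>.pkl under ~/.cache/xl2times.
# hashlib.file_digest requires Python 3.11+; the except branch is an assumed
# fallback for older interpreters.
import hashlib
from pathlib import Path

cache_dir = Path.home() / ".cache" / "xl2times"


def cache_file_for(xlsx_path: Path) -> Path:
    with xlsx_path.open("rb") as f:
        try:
            hsh = hashlib.file_digest(f, "sha256").hexdigest()
        except AttributeError:  # Python < 3.11
            f.seek(0)
            hsh = hashlib.sha256(f.read()).hexdigest()
    return cache_dir / f"{xlsx_path.stem}_{hsh}.pkl"


# e.g. cache_file_for(Path("model/base.xlsx"))
# -> ~/.cache/xl2times/base_<sha256 hex digest>.pkl
```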
2 changes: 1 addition & 1 deletion xl2times/config/times-info.json
@@ -4707,7 +4707,7 @@
},
{
"name": "UC_DYNBND",
"gams-cat": "parameter",
"gams-cat": "md-set",
"indexes": [
"UC_N",
"LIM"
21 changes: 10 additions & 11 deletions utils/dd_to_csv.py → xl2times/dd_to_csv.py
@@ -13,7 +13,7 @@
def parse_parameter_values_from_file(
path: Path,
) -> tuple[dict[str, list], dict[str, set]]:
"""Parse *.dd to turn it into CSV format.
"""Parse a `*.dd` file and extract the sets and parameters.
There are parameters and sets, and each has a slightly different format.
`*.dd` files have data of the following form:
@@ -53,20 +53,19 @@ def parse_parameter_values_from_file(
param_data = []
# Parse values until a line with / is encountered.
while not data[index].startswith("/") and data[index] != "":
words = data[index].split(" ")
line = data[index]

# Either "value" for a scalar, or "key value" for an array.
if len(words) == 1:
attributes = []
elif len(words) == 2:
attributes = words[0].split(".")
attributes = [a if " " in a else a.strip("'") for a in attributes]
# So value is always the last word, or only token
split_point = line.rfind(" ")
if split_point == -1:
# if only one word
attributes, value = [], line
else:
raise ValueError(
f"Unexpected number of spaces in parameter value setting: {data[index]}"
)
attributes, value = line[:split_point], line[split_point + 1 :]
attributes = attributes.split(".")
attributes = [a if " " in a else a.strip("'") for a in attributes]

value = words[-1]
param_data.append([*attributes, value])

index += 1
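Note on the rewritten value parsing above: the last whitespace-separated token is always the value, and everything before it is the dotted attribute key, so quoted labels that contain spaces no longer raise. A standalone sketch of that rule with made-up sample lines:

```python
# Standalone sketch of the "split on the last space" rule: the value is the
# final token; everything before it is the dotted attribute key, and quoted
# labels are only unquoted when they contain no spaces. Sample lines are made up.
def parse_value_line(line: str) -> list[str]:
    split_point = line.rfind(" ")
    if split_point == -1:  # a bare scalar value with no key
        return [line]
    attributes, value = line[:split_point], line[split_point + 1 :]
    parts = [a if " " in a else a.strip("'") for a in attributes.split(".")]
    return [*parts, value]


print(parse_value_line("3.14"))                     # ['3.14']
print(parse_value_line("'ELC'.'2020' 42.0"))        # ['ELC', '2020', '42.0']
print(parse_value_line("'North America'.'ELC' 1"))  # ["'North America'", 'ELC', '1']
```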
2 changes: 1 addition & 1 deletion xl2times/transforms.py
Original file line number Diff line number Diff line change
@@ -2514,7 +2514,7 @@ def apply_transform_tables(
)

if not any(rows_to_update):
logger.info(f"A {Tag.tfm_mig.value} row generated no records.")
logger.warning(f"A {Tag.tfm_mig.value} row generated no records.")
continue

new_rows = table.loc[rows_to_update].copy()
