dd_to_csv bugfixes, move cache to ~/.cache/xl2times/ #230

Merged · 16 commits · Mar 23, 2024
Changes from 11 commits
1 change: 1 addition & 0 deletions pyproject.toml
@@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times"

[project.scripts]
xl2times = "xl2times.__main__:main"
dd_to_csv = "xl2times.dd_to_csv:main"

[tool.pytest.ini_options]
# don't print runtime warnings
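Side note on the new entry point: with dd_to_csv = "xl2times.dd_to_csv:main" under [project.scripts], pip generates a dd_to_csv console command at install time, so the script no longer has to be run by file path. Roughly, the generated wrapper is equivalent to the sketch below — the entry-point machinery calls main() with no arguments, so main is expected to read its options from sys.argv:

import sys

from xl2times.dd_to_csv import main

if __name__ == "__main__":
    # pip's console-script shim does essentially this.
    sys.exit(main())
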
6 changes: 4 additions & 2 deletions utils/run_benchmarks.py
@@ -14,11 +14,11 @@
import git
import pandas as pd
import yaml
from dd_to_csv import main
from tabulate import tabulate

from xl2times import utils
from xl2times.__main__ import parse_args, run
from xl2times.dd_to_csv import main
from xl2times.utils import max_workers

logger = utils.get_logger()
@@ -155,7 +155,7 @@ def run_benchmark(
res = subprocess.run(
[
"python",
"utils/dd_to_csv.py",
"xl2times/dd_to_csv.py",
dd_folder,
csv_folder,
],
@@ -164,6 +164,7 @@
text=True,
shell=True if os.name == "nt" else False,
check=False,
encoding="utf-8",
)
if res.returncode != 0:
# Remove partial outputs
@@ -208,6 +209,7 @@ def run_benchmark(
stderr=subprocess.STDOUT,
text=True,
check=False,
encoding="utf-8",
)
else:
# If debug option is set, run as a function call to allow stepping with a debugger.
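A note on the encoding="utf-8" additions: with text=True but no explicit encoding, subprocess decodes the child's output using the locale's preferred encoding (often cp1252 on Windows), so UTF-8 output from the converted models can raise UnicodeDecodeError. A minimal sketch of the pattern, with an illustrative command:

import subprocess

# Force UTF-8 decoding of stdout/stderr instead of the locale default,
# which on Windows is frequently cp1252 and cannot decode all UTF-8 bytes.
res = subprocess.run(
    ["python", "-c", "print('\u00b0C')"],  # any command emitting non-ASCII
    capture_output=True,
    text=True,
    encoding="utf-8",
    check=False,
)
print(res.stdout)
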
68 changes: 46 additions & 22 deletions xl2times/__main__.py
@@ -5,13 +5,13 @@
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame

from xl2times import __file__ as xl2times_file_path
from xl2times.utils import max_workers

from . import excel, transforms, utils
@@ -20,33 +20,54 @@
logger = utils.get_logger()


cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
os.makedirs(cache_dir, exist_ok=True)
cache_dir = Path.home() / ".cache/xl2times/"
cache_dir.mkdir(exist_ok=True, parents=True)


def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
def invalidate_cache(max_age: timedelta = timedelta(days=365)):
"""
Delete any cache files older than max_age.

Args:
max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted.
"""
for file in cache_dir.glob("*.pkl"):
if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
try:
file.unlink()
except Exception as e:
logger.warning(f"Failed to delete old cache file {file}. {e}")


def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]:
"""Extract EmbeddedXlTables from xlsx file (cached).

Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
Each file is named by the hash of the contents of an xlsx file, and contains
a tuple (filename, modified timestamp, [EmbeddedXlTable]).

Args:
filename: Path to the xlsx file to extract tables from.
"""
with open(filename, "rb") as f:
filename = Path(filename).resolve()
with filename.open("rb") as f:
digest = hashlib.file_digest(f, "sha256") # pyright: ignore
hsh = digest.hexdigest()
if os.path.isfile(cache_dir + hsh):
fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
# In the extremely unlikely event that we have a hash collision, also check that
# the filename is the same:
# TODO check modified timestamp also matches
if filename == fname1:
logger.info(f"Using cached data for {filename} from {cache_dir + hsh}")
return tables
# Write extracted data to cache:
tables = excel.extract_tables(filename)
pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb"))
logger.info(f"Saved cache for {filename} to {cache_dir + hsh}")
return excel.extract_tables(filename)
cached_file = (cache_dir / f"{Path(filename).stem}_{hsh}.pkl").resolve()

if cached_file.exists():
# just load and return the cached pickle
with cached_file.open("rb") as f:
tables = pickle.load(f)
logger.info(f"Using cached data for {filename} from {cached_file}")
else:
# extract the data and write it to cache before returning it
tables = excel.extract_tables(str(filename))
with cached_file.open("wb") as f:
pickle.dump(tables, f)
logger.info(f"Saved cache for {filename} to {cached_file}")

return tables

Collaborator Author: How about this, @siddharth-krishna?

Collaborator: Looks good, thanks!


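To summarise the scheme above: each workbook is keyed by the SHA-256 of its bytes, the extracted tables are pickled to ~/.cache/xl2times/<stem>_<hash>.pkl, and invalidate_cache prunes entries older than a year (orphaned pickles whose source file changed are never looked up again, only aged out). A condensed, self-contained sketch of the same idea — cached_extract is a hypothetical name, and hashlib.file_digest requires Python 3.11+:

import hashlib
import pickle
from datetime import datetime, timedelta
from pathlib import Path

cache_dir = Path.home() / ".cache/xl2times/"
cache_dir.mkdir(parents=True, exist_ok=True)

def cached_extract(path: Path, extract_tables):
    # Key the cache on the file *contents*: editing a workbook changes its
    # hash, so a stale entry is simply never looked up again.
    with path.open("rb") as f:
        hsh = hashlib.file_digest(f, "sha256").hexdigest()
    cached_file = cache_dir / f"{path.stem}_{hsh}.pkl"
    if cached_file.exists():
        with cached_file.open("rb") as f:
            return pickle.load(f)
    tables = extract_tables(str(path))
    with cached_file.open("wb") as f:
        pickle.dump(tables, f)
    return tables

def invalidate_cache(max_age: timedelta = timedelta(days=365)):
    # Orphaned entries (source file edited or deleted) only ever age out.
    for file in cache_dir.glob("*.pkl"):
        if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
            file.unlink(missing_ok=True)
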

def convert_xl_to_times(
@@ -59,6 +80,8 @@ def convert_xl_to_times(
stop_after_read: bool = False,
) -> dict[str, DataFrame]:
start_time = datetime.now()

invalidate_cache()
with ProcessPoolExecutor(max_workers) as executor:
raw_tables = executor.map(
excel.extract_tables if no_cache else _read_xlsx_cached, input_files
@@ -180,8 +203,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str):

def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
result = {}
for filename in os.listdir(input_dir):
result[filename.split(".")[0]] = pd.read_csv(
csv_files = list(Path(input_dir).glob("*.csv"))
for filename in csv_files:
result[str(filename).split(".")[0]] = pd.read_csv(
os.path.join(input_dir, filename), dtype=str
)
return result
@@ -253,7 +277,7 @@ def compare(
index=False,
)
result = (
f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present"
f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present"
f" in output ({total_correct_rows}/{total_gt_rows})"
f", {total_additional_rows} additional rows"
)
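The compare change above boils down to guarding the ratio before formatting it, since an empty ground-truth table would otherwise raise ZeroDivisionError; note that f-string percent formatting renders nan as "nan%" rather than raising. A standalone sketch (summary_line is a hypothetical helper, not a function in the codebase):

import numpy as np

def summary_line(correct: int, ground_truth: int, additional: int) -> str:
    # Avoid ZeroDivisionError when there are no ground truth rows at all.
    ratio = correct / ground_truth if ground_truth != 0 else np.nan
    return (
        f"{ratio:.1%} of ground truth rows present"
        f" in output ({correct}/{ground_truth}), {additional} additional rows"
    )

print(summary_line(95, 100, 3))  # 95.0% of ground truth rows present in output (95/100), 3 additional rows
print(summary_line(0, 0, 0))     # nan% of ground truth rows present in output (0/0), 0 additional rows
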
2 changes: 1 addition & 1 deletion xl2times/config/times-info.json
@@ -4707,7 +4707,7 @@
},
{
"name": "UC_DYNBND",
"gams-cat": "parameter",
"gams-cat": "md-set",
"indexes": [
"UC_N",
"LIM"
19 changes: 9 additions & 10 deletions utils/dd_to_csv.py → xl2times/dd_to_csv.py
@@ -53,20 +53,19 @@ def parse_parameter_values_from_file(
param_data = []
# Parse values until a line with / is encountered.
while not data[index].startswith("/") and data[index] != "":
words = data[index].split(" ")
line = data[index]

# Either "value" for a scalar, or "key value" for an array.
if len(words) == 1:
attributes = []
elif len(words) == 2:
attributes = words[0].split(".")
attributes = [a if " " in a else a.strip("'") for a in attributes]
# So value is always the last word, or only token
split_point = line.rfind(" ")
if split_point == -1:
# if only one word
attributes, value = [], line
else:
raise ValueError(
f"Unexpected number of spaces in parameter value setting: {data[index]}"
)

Collaborator Author: This ValueError was getting thrown from veda-produced austimes DD files for a variable with a space in its name.

Member: @SamRWest, when you write "variable" do you mean an index of a GAMS parameter or a GAMS parameter?

Collaborator Author: Sorry, 'variable' is the wrong terminology. I mean the key part referred to in this comment:

# Either "value" for a scalar, or "key value" for an array.

Collaborator Author: For example, here's a line from one of our DD files that (I think) would have triggered the ValueError, because of the spaces in UC_non-decreasing EE penetration:

'UC_non-decreasing EE penetration'.RHS.'QLD'.2015.'EE2_Pri_Pub-b'.ANNUAL 1

Member: I see, thanks! It should be okay not to raise ValueError, as long as the key is in quotes. A scalar is numeric, so there should not be spaces in it.

Collaborator Author: Cool, that was my assumption. The new code just splits the line at the last space character, whereas the old code split it at every space and raised an error if it ended up with more than two tokens, which failed on my example string above.

Member: Btw, sometimes there is no value, e.g. from Demo 1:

SET COM_TMAP
/
'REG1'.'DEM'.'TPSCOA'
'REG1'.'NRG'.'COA'
/;

Let's say somebody does this:

SET COM_TMAP
/
'REG1'.'DEM'.'TPS COA'
'REG1'.'NRG'.'COA'
/;

What will happen?

Member: Ah, I can see we are handling this above when we check whether it is a parameter.

Collaborator Author: Yep, the new code will catch this with this check, because rfind returns -1 if no spaces are found in the string:

 if split_point == -1:
     # if only one word
     attributes, value = [], line

attributes, value = line[:split_point], line[split_point + 1 :]
attributes = attributes.split(".")
attributes = [a if " " in a else a.strip("'") for a in attributes]

Collaborator Author (on lines +59 to +67): The rfind() works more reliably, on the assumption that there's always a space before the value, so it'll work with 'key with spaces' value style strings.


value = words[-1]
param_data.append([*attributes, value])

index += 1
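To make the new parsing rule concrete: the value is always the final whitespace-separated token, so splitting once at the last space tolerates quoted keys that contain spaces — exactly the austimes case discussed above — where the old split-on-every-space logic raised ValueError. A minimal sketch of the approach (parse_param_line is a hypothetical wrapper around the lines shown in the diff):

def parse_param_line(line: str) -> tuple[list[str], str]:
    # The value is the last token; everything before the final space is the key.
    split_point = line.rfind(" ")
    if split_point == -1:
        # A bare scalar: no key at all.
        return [], line
    attributes, value = line[:split_point], line[split_point + 1 :]
    # Strip quotes only from dotted components without spaces, preserving
    # quoted names such as 'UC_non-decreasing EE penetration'.
    attributes = [a if " " in a else a.strip("'") for a in attributes.split(".")]
    return attributes, value

attrs, value = parse_param_line(
    "'UC_non-decreasing EE penetration'.RHS.'QLD'.2015.'EE2_Pri_Pub-b'.ANNUAL 1"
)
# attrs[0] keeps its quotes because it contains spaces; value == "1"
print(attrs, value)
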
2 changes: 1 addition & 1 deletion xl2times/transforms.py
@@ -2511,7 +2511,7 @@ def apply_transform_tables(
)

if not any(rows_to_update):
logger.info(f"A {Tag.tfm_mig.value} row generated no records.")
logger.warning(f"A {Tag.tfm_mig.value} row generated no records.")
continue

new_rows = table.loc[rows_to_update].copy()