From bab423c25986acc36cf6f0b1d47ef5b1414b4e3c Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 14 Mar 2024 13:34:42 +1100
Subject: [PATCH 01/11] fixed dd_to_csv parsing for lines with multiple spaces in the attribute names.

corrected the gams-cat for UC_DYNBND (see #218)
---
 utils/dd_to_csv.py | 19 +++++++++----------
 xl2times/config/times-info.json | 2 +-
 xl2times/datatypes.py | 8 +++++++-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/utils/dd_to_csv.py b/utils/dd_to_csv.py
index 7d26227..6437751 100644
--- a/utils/dd_to_csv.py
+++ b/utils/dd_to_csv.py
@@ -55,20 +55,19 @@ def parse_parameter_values_from_file(
             param_data = []
             # Parse values until a line with / is encountered.
             while not data[index].startswith("/") and data[index] != "":
-                words = data[index].split(" ")
+                line = data[index]
                 # Either "value" for a scalar, or "key value" for an array.
-                if len(words) == 1:
-                    attributes = []
-                elif len(words) == 2:
-                    attributes = words[0].split(".")
-                    attributes = [a if " " in a else a.strip("'") for a in attributes]
+                # So value is always the last word, or only token
+                split_point = line.rfind(" ")
+                if split_point == -1:
+                    # if only one word
+                    attributes, value = [], line
                 else:
-                    raise ValueError(
-                        f"Unexpected number of spaces in parameter value setting: {data[index]}"
-                    )
+                    attributes, value = line[:split_point], line[split_point + 1 :]
+                    attributes = attributes.split(".")
+                    attributes = [a if " " in a else a.strip("'") for a in attributes]
 
-                value = words[-1]
                 param_data.append([*attributes, value])
 
                 index += 1
 
diff --git a/xl2times/config/times-info.json b/xl2times/config/times-info.json
index 4dae96a..07b12f6 100644
--- a/xl2times/config/times-info.json
+++ b/xl2times/config/times-info.json
@@ -4707,7 +4707,7 @@
   },
   {
     "name": "UC_DYNBND",
-    "gams-cat": "parameter",
+    "gams-cat": "md-set",
     "indexes": [
       "UC_N",
       "LIM"
diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index 7d3c433..76f475c 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -175,7 +175,13 @@ def data_years(self) -> set[int]:
         data_years = set()
         for attributes in [self.attributes, self.uc_attributes]:
             if not attributes.empty:
-                data_years.update(attributes["year"].astype(int).values)
+                int_years = attributes["year"].astype(
+                    int, errors="ignore"
+                )  # leave non-parseable vals alone
+                int_years = [
+                    y for y in int_years if isinstance(y, int)
+                ]  # remove non-parseable years
+                data_years.update(int_years)
         # Remove interpolation rules before return
         return {y for y in data_years if y >= 1000}

From fd010053b56c5cd6a2a1632a07bff9f18842f635 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 15:52:11 +1100
Subject: [PATCH 02/11] moved dd_to_csv to src/ and added as a script in pyproject.toml

moved cache dir to standard ~/.cache/xl2times dir to avoid creating inside .venv subdir
---
 pyproject.toml | 1 +
 utils/run_benchmarks.py | 4 +++-
 xl2times/__main__.py | 14 ++++++++------
 {utils => xl2times}/dd_to_csv.py | 0
 xl2times/transforms.py | 2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)
 rename {utils => xl2times}/dd_to_csv.py (100%)

diff --git a/pyproject.toml b/pyproject.toml
index 3fb1011..2a12e82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times"
 
 [project.scripts]
 xl2times = "xl2times.__main__:main"
+dd_to_csv = "xl2times.dd_to_csv:main"
 
 [tool.pytest.ini_options]
 # don't print runtime warnings
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index bbf18e1..a2a64c0 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -14,11 +14,11 @@
 import git
 import pandas as pd
 import yaml
-from dd_to_csv import main
 from tabulate import tabulate
 
 from xl2times import utils
 from xl2times.__main__ import parse_args, run
+from xl2times.dd_to_csv import main
 from xl2times.utils import max_workers
 
 logger = utils.get_logger()
@@ -164,6 +164,7 @@ def run_benchmark(
         text=True,
         shell=True if os.name == "nt" else False,
         check=False,
+        encoding="utf-8",
     )
     if res.returncode != 0:
         # Remove partial outputs
@@ -208,6 +209,7 @@ def run_benchmark(
             stderr=subprocess.STDOUT,
             text=True,
             check=False,
+            encoding="utf-8",
         )
     else:
         # If debug option is set, run as a function call to allow stepping with a debugger.
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index b64a190..0515d05 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -8,10 +8,10 @@
 from datetime import datetime
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
 
-from xl2times import __file__ as xl2times_file_path
 from xl2times.utils import max_workers
 
 from . import datatypes, excel, transforms, utils
@@ -19,7 +19,7 @@
 
 logger = utils.get_logger()
 
-cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
+cache_dir = str(Path.home() / ".cache/xl2times/")
 os.makedirs(cache_dir, exist_ok=True)
 
 
@@ -34,7 +34,8 @@ def _read_xlsx_cached(filename: str) -> list[datatypes.EmbeddedXlTable]:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
     if os.path.isfile(cache_dir + hsh):
-        fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
+        with open(cache_dir + hsh, "rb") as f:
+            fname1, _timestamp, tables = pickle.load(f)
         # In the extremely unlikely event that we have a hash collision, also check that
         # the filename is the same:
         # TODO check modified timestamp also matches
@@ -178,8 +179,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str):
 
 def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
-    for filename in os.listdir(input_dir):
-        result[filename.split(".")[0]] = pd.read_csv(
+    csv_files = list(Path(input_dir).glob("*.csv"))
+    for filename in csv_files:
+        result[str(filename).split(".")[0]] = pd.read_csv(
             os.path.join(input_dir, filename), dtype=str
         )
     return result
@@ -251,7 +253,7 @@ def compare(
         index=False,
     )
     result = (
-        f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present"
+        f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present"
         f" in output ({total_correct_rows}/{total_gt_rows})"
         f", {total_additional_rows} additional rows"
     )
diff --git a/utils/dd_to_csv.py b/xl2times/dd_to_csv.py
similarity index 100%
rename from utils/dd_to_csv.py
rename to xl2times/dd_to_csv.py
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index c62eef4..949b1f6 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2398,7 +2398,7 @@ def apply_transform_tables(
                 )
 
                 if not any(rows_to_update):
-                    logger.info(
+                    logger.warning(
                         f"A {datatypes.Tag.tfm_mig.value} row generated no records."
                     )
                     continue

From 86a1480735bc0714e1a851012f1b10fcd3029252 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:19:22 +1100
Subject: [PATCH 03/11] Added cache invalidation for old files

---
 xl2times/__main__.py | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 0515d05..d44d4e8 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -5,7 +5,7 @@
 import sys
 import time
 from concurrent.futures import ProcessPoolExecutor
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 import numpy as np
@@ -19,8 +19,23 @@
 
 logger = utils.get_logger()
 
-cache_dir = str(Path.home() / ".cache/xl2times/")
-os.makedirs(cache_dir, exist_ok=True)
+cache_dir = Path.home() / ".cache/xl2times/"
+cache_dir.mkdir(exist_ok=True, parents=True)
+
+
+def invalidate_cache(max_age: timedelta = timedelta(days=365)):
+    """
+    Delete any cache files older than max_age.
+
+    Args:
+        max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted.
+    """
+    for file in cache_dir.glob("*.pkl"):
+        if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
+            try:
+                file.unlink()
+            except Exception as e:
+                logger.warning(f"Failed to delete old cache file {file}. {e}")
 
 
 def _read_xlsx_cached(filename: str) -> list[datatypes.EmbeddedXlTable]:
@@ -29,23 +44,29 @@ def _read_xlsx_cached(filename: str) -> list[datatypes.EmbeddedXlTable]:
     """Extract EmbeddedXlTables from xlsx file (cached).
 
     Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
     Each file is named by the hash of the contents of an xlsx file, and contains
     a tuple (filename, modified timestamp, [EmbeddedXlTable]).
+
+    Args:
+        filename: Path to the xlsx file to extract tables from.
""" with open(filename, "rb") as f: digest = hashlib.file_digest(f, "sha256") # pyright: ignore hsh = digest.hexdigest() - if os.path.isfile(cache_dir + hsh): - with open(cache_dir + hsh, "rb") as f: + hash_file = cache_dir / f"{Path(filename).stem}_{hsh}.pkl" + if hash_file.is_file(): + with hash_file.open("rb") as f: fname1, _timestamp, tables = pickle.load(f) # In the extremely unlikely event that we have a hash collision, also check that # the filename is the same: # TODO check modified timestamp also matches if filename == fname1: - logger.info(f"Using cached data for {filename} from {cache_dir + hsh}") + logger.info(f"Using cached data for {filename} from {hash_file}") return tables # Write extracted data to cache: tables = excel.extract_tables(filename) - pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb")) - logger.info(f"Saved cache for {filename} to {cache_dir + hsh}") + with hash_file.open("wb") as f: + last_modified = hash_file.lstat().st_mtime + pickle.dump((filename, last_modified, tables), f) + logger.info(f"Saved cache for {filename} to {hash_file}") return excel.extract_tables(filename) @@ -59,6 +80,8 @@ def convert_xl_to_times( stop_after_read: bool = False, ) -> dict[str, DataFrame]: start_time = datetime.now() + + invalidate_cache() with ProcessPoolExecutor(max_workers) as executor: raw_tables = executor.map( excel.extract_tables if no_cache else _read_xlsx_cached, input_files From 3b8719b32df902df996c340548978b807e430950 Mon Sep 17 00:00:00 2001 From: Sam West Date: Thu, 21 Mar 2024 15:52:11 +1100 Subject: [PATCH 04/11] moved dd_to_csv to src/ and added as a script in pyproject.toml moved cache dir to standard ~/.cache/xl2times dir to avoid creating inside .venv subdir --- pyproject.toml | 1 + utils/run_benchmarks.py | 4 +++- xl2times/__main__.py | 14 ++++++++------ {utils => xl2times}/dd_to_csv.py | 0 xl2times/transforms.py | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) rename {utils => xl2times}/dd_to_csv.py (100%) diff --git a/pyproject.toml b/pyproject.toml index f7ca779..1f9b15c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ Source = "https://github.com/etsap-TIMES/xl2times" [project.scripts] xl2times = "xl2times.__main__:main" +dd_to_csv = "xl2times.dd_to_csv:main" [tool.pytest.ini_options] # don't print runtime warnings diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py index bbf18e1..a2a64c0 100644 --- a/utils/run_benchmarks.py +++ b/utils/run_benchmarks.py @@ -14,11 +14,11 @@ import git import pandas as pd import yaml -from dd_to_csv import main from tabulate import tabulate from xl2times import utils from xl2times.__main__ import parse_args, run +from xl2times.dd_to_csv import main from xl2times.utils import max_workers logger = utils.get_logger() @@ -164,6 +164,7 @@ def run_benchmark( text=True, shell=True if os.name == "nt" else False, check=False, + encoding="utf-8", ) if res.returncode != 0: # Remove partial outputs @@ -208,6 +209,7 @@ def run_benchmark( stderr=subprocess.STDOUT, text=True, check=False, + encoding="utf-8", ) else: # If debug option is set, run as a function call to allow stepping with a debugger. 
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index b8e5976..6fd68ad 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -8,10 +8,10 @@
 from datetime import datetime
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
 
-from xl2times import __file__ as xl2times_file_path
 from xl2times.utils import max_workers
 
 from . import excel, transforms, utils
@@ -20,7 +20,7 @@
 
 logger = utils.get_logger()
 
-cache_dir = os.path.abspath(os.path.dirname(xl2times_file_path)) + "/.cache/"
+cache_dir = str(Path.home() / ".cache/xl2times/")
 os.makedirs(cache_dir, exist_ok=True)
 
 
@@ -35,7 +35,8 @@ def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
     if os.path.isfile(cache_dir + hsh):
-        fname1, _timestamp, tables = pickle.load(open(cache_dir + hsh, "rb"))
+        with open(cache_dir + hsh, "rb") as f:
+            fname1, _timestamp, tables = pickle.load(f)
         # In the extremely unlikely event that we have a hash collision, also check that
         # the filename is the same:
         # TODO check modified timestamp also matches
@@ -180,8 +181,9 @@ def write_csv_tables(tables: dict[str, DataFrame], output_dir: str):
 
 def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
-    for filename in os.listdir(input_dir):
-        result[filename.split(".")[0]] = pd.read_csv(
+    csv_files = list(Path(input_dir).glob("*.csv"))
+    for filename in csv_files:
+        result[str(filename).split(".")[0]] = pd.read_csv(
             os.path.join(input_dir, filename), dtype=str
         )
     return result
@@ -253,7 +255,7 @@ def compare(
         index=False,
     )
     result = (
-        f"{total_correct_rows / total_gt_rows :.1%} of ground truth rows present"
+        f"{(total_correct_rows / total_gt_rows) if total_gt_rows!=0 else np.nan :.1%} of ground truth rows present"
         f" in output ({total_correct_rows}/{total_gt_rows})"
         f", {total_additional_rows} additional rows"
     )
diff --git a/utils/dd_to_csv.py b/xl2times/dd_to_csv.py
similarity index 100%
rename from utils/dd_to_csv.py
rename to xl2times/dd_to_csv.py
diff --git a/xl2times/transforms.py b/xl2times/transforms.py
index e82adb0..8d7d3c6 100644
--- a/xl2times/transforms.py
+++ b/xl2times/transforms.py
@@ -2511,7 +2511,7 @@ def apply_transform_tables(
                 )
 
                 if not any(rows_to_update):
-                    logger.info(f"A {Tag.tfm_mig.value} row generated no records.")
+                    logger.warning(f"A {Tag.tfm_mig.value} row generated no records.")
                     continue
 
                 new_rows = table.loc[rows_to_update].copy()

From 7bcffc83952b9ce63ee72c2c72914f420ce39394 Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:19:22 +1100
Subject: [PATCH 05/11] Added cache invalidation for old files

---
 xl2times/__main__.py | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 6fd68ad..90b9414 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -5,7 +5,7 @@
 import sys
 import time
 from concurrent.futures import ProcessPoolExecutor
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 import numpy as np
@@ -20,8 +20,23 @@
 
 logger = utils.get_logger()
 
-cache_dir = str(Path.home() / ".cache/xl2times/")
-os.makedirs(cache_dir, exist_ok=True)
+cache_dir = Path.home() / ".cache/xl2times/"
+cache_dir.mkdir(exist_ok=True, parents=True)
+
+
+def invalidate_cache(max_age: timedelta = timedelta(days=365)):
+    """
+    Delete any cache files older than max_age.
+
+    Args:
+        max_age: Maximum age of a cache file to be considered valid. Any cache files older than this are deleted.
+    """
+    for file in cache_dir.glob("*.pkl"):
+        if datetime.now() - datetime.fromtimestamp(file.lstat().st_mtime) > max_age:
+            try:
+                file.unlink()
+            except Exception as e:
+                logger.warning(f"Failed to delete old cache file {file}. {e}")
 
 
 def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
@@ -30,23 +45,29 @@ def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
     """Extract EmbeddedXlTables from xlsx file (cached).
 
     Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
     Each file is named by the hash of the contents of an xlsx file, and contains
     a tuple (filename, modified timestamp, [EmbeddedXlTable]).
+
+    Args:
+        filename: Path to the xlsx file to extract tables from.
     """
     with open(filename, "rb") as f:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
-    if os.path.isfile(cache_dir + hsh):
-        with open(cache_dir + hsh, "rb") as f:
+    hash_file = cache_dir / f"{Path(filename).stem}_{hsh}.pkl"
+    if hash_file.is_file():
+        with hash_file.open("rb") as f:
             fname1, _timestamp, tables = pickle.load(f)
         # In the extremely unlikely event that we have a hash collision, also check that
        # the filename is the same:
         # TODO check modified timestamp also matches
         if filename == fname1:
-            logger.info(f"Using cached data for {filename} from {cache_dir + hsh}")
+            logger.info(f"Using cached data for {filename} from {hash_file}")
             return tables
     # Write extracted data to cache:
     tables = excel.extract_tables(filename)
-    pickle.dump((filename, "TODO ModifiedTime", tables), open(cache_dir + hsh, "wb"))
-    logger.info(f"Saved cache for {filename} to {cache_dir + hsh}")
+    with hash_file.open("wb") as f:
+        last_modified = hash_file.lstat().st_mtime
+        pickle.dump((filename, last_modified, tables), f)
+    logger.info(f"Saved cache for {filename} to {hash_file}")
     return excel.extract_tables(filename)
@@ -60,6 +81,8 @@ def convert_xl_to_times(
     stop_after_read: bool = False,
 ) -> dict[str, DataFrame]:
     start_time = datetime.now()
+
+    invalidate_cache()
     with ProcessPoolExecutor(max_workers) as executor:
         raw_tables = executor.map(
             excel.extract_tables if no_cache else _read_xlsx_cached, input_files

From 1615397f1531a4fb3e8e7d90be76de2acafda96c Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:29:38 +1100
Subject: [PATCH 06/11] reverted changes to data_years()

---
 xl2times/datatypes.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/xl2times/datatypes.py b/xl2times/datatypes.py
index b6b62f0..2d2a99e 100644
--- a/xl2times/datatypes.py
+++ b/xl2times/datatypes.py
@@ -187,13 +187,7 @@ def data_years(self) -> set[int]:
         data_years = set()
         for attributes in [self.attributes, self.uc_attributes]:
             if not attributes.empty:
-                int_years = attributes["year"].astype(
-                    int, errors="ignore"
-                )  # leave non-parseable vals alone
-                int_years = [
-                    y for y in int_years if isinstance(y, int)
-                ]  # remove non-parseable years
-                data_years.update(int_years)
+                data_years.update(attributes["year"].astype(int).values)
         # Remove interpolation rules before return
         return {y for y in data_years if y >= 1000}

From 0c767ff226609290dbdb05d66a7e94698243c1fb Mon Sep 17 00:00:00 2001
From: Sam West
Date: Thu, 21 Mar 2024 16:59:32 +1100
Subject: [PATCH 07/11] fixed path in benchmarks file

cleaned up caching code as suggested
---
 utils/run_benchmarks.py | 2 +-
 xl2times/__main__.py | 37 ++++++++++++++++++------------------
 2 files changed, 19 insertions(+), 20 deletions(-)
diff --git a/utils/run_benchmarks.py b/utils/run_benchmarks.py
index a2a64c0..e28f5ad 100644
--- a/utils/run_benchmarks.py
+++ b/utils/run_benchmarks.py
@@ -155,7 +155,7 @@ def run_benchmark(
     res = subprocess.run(
         [
             "python",
-            "utils/dd_to_csv.py",
+            "xl2times/dd_to_csv.py",
             dd_folder,
             csv_folder,
         ],
diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 90b9414..6c75534 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -39,7 +39,7 @@ def invalidate_cache(max_age: timedelta = timedelta(days=365)):
                 logger.warning(f"Failed to delete old cache file {file}. {e}")
 
 
-def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
+def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]:
     """Extract EmbeddedXlTables from xlsx file (cached).
 
     Since excel.extract_tables is quite slow, we cache its results in `cache_dir`.
@@ -49,26 +49,25 @@ def _read_xlsx_cached(filename: str) -> list[EmbeddedXlTable]:
     Args:
         filename: Path to the xlsx file to extract tables from.
     """
-    with open(filename, "rb") as f:
+    filename = Path(filename).resolve()
+    with filename.open("rb") as f:
         digest = hashlib.file_digest(f, "sha256")  # pyright: ignore
     hsh = digest.hexdigest()
-    hash_file = cache_dir / f"{Path(filename).stem}_{hsh}.pkl"
-    if hash_file.is_file():
-        with hash_file.open("rb") as f:
-            fname1, _timestamp, tables = pickle.load(f)
-        # In the extremely unlikely event that we have a hash collision, also check that
-        # the filename is the same:
-        # TODO check modified timestamp also matches
-        if filename == fname1:
-            logger.info(f"Using cached data for {filename} from {hash_file}")
-            return tables
-    # Write extracted data to cache:
-    tables = excel.extract_tables(filename)
-    with hash_file.open("wb") as f:
-        last_modified = hash_file.lstat().st_mtime
-        pickle.dump((filename, last_modified, tables), f)
-    logger.info(f"Saved cache for {filename} to {hash_file}")
-    return excel.extract_tables(filename)
+    cached_file = (cache_dir / f"{Path(filename).stem}_{hsh}.pkl").resolve()
+
+    if cached_file.exists():
+        # just load and return the cached pickle
+        with cached_file.open("rb") as f:
+            tables = pickle.load(f)
+        logger.info(f"Using cached data for {filename} from {cached_file}")
+    else:
+        # extracted data and write it to cache before returning it
+        tables = excel.extract_tables(str(filename))
+        with cached_file.open("wb") as f:
+            pickle.dump(tables, f)
+        logger.info(f"Saved cache for {filename} to {cached_file}")
+
+    return tables
 
 
 def convert_xl_to_times(

From e53cef1b1deb235ccb63a9b0ab145476a5f870ec Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 22 Mar 2024 11:14:54 +1100
Subject: [PATCH 08/11] bugfix for bad CSV paths

---
 xl2times/__main__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 6c75534..9f2debc 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -205,9 +205,7 @@ def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
     csv_files = list(Path(input_dir).glob("*.csv"))
     for filename in csv_files:
-        result[str(filename).split(".")[0]] = pd.read_csv(
-            os.path.join(input_dir, filename), dtype=str
-        )
+        result[str(filename).split(".")[0]] = pd.read_csv(filename, dtype=str)
     return result
 
 

From 5c5c8bf2c3b4dd5afda25a74434c8aabc5de259c Mon Sep 17 00:00:00 2001
From: Sam West
Date: Fri, 22 Mar 2024 11:36:17 +1100
Subject: [PATCH 09/11] Another csv bugfix

---
 xl2times/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 9f2debc..59606b0 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -205,7 +205,7 @@ def read_csv_tables(input_dir: str) -> dict[str, DataFrame]:
     result = {}
     csv_files = list(Path(input_dir).glob("*.csv"))
     for filename in csv_files:
-        result[str(filename).split(".")[0]] = pd.read_csv(filename, dtype=str)
+        result[filename.stem] = pd.read_csv(filename, dtype=str)
     return result
 
 

From 56f333f04856176d1321e4bb87463413e7b481c3 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Fri, 22 Mar 2024 09:26:55 +0200
Subject: [PATCH 10/11] Change cache dir in ci.yml, bump cache key

---
 .github/workflows/ci.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bd2b3b6..d9aed4b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,6 +21,7 @@ jobs:
      REF_demos-dd: "2848a8a8e2fdcf0cdf7f83eefbdd563b0bb74e86"
      REF_tim: "e820d8002adc6b1526a3bffcc439219b28d0eed5"
      REF_tim-gams: "703f6a4e1d0bedd95c3ebdae534496f3a7e1b7cc"
+      CACHE_KEY: 0 # Use this for manual cache key bumps, e.g., when caching code changes
 
     steps:
       - uses: actions/checkout@v3
@@ -115,11 +116,12 @@ jobs:
         id: cache
         uses: actions/cache/restore@v4
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
+          path: ~/.cache/xl2times
           # Cache key is refs of the input xlsx repos, since that's what is cached
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}
           # If we can't find the exact key for the TIM repo, still use the cache if the demos repo ref matches
           restore-keys: |
+            ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-
             ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-
             ${{ runner.os }}-py-${{ env.PY_VERSION }}-
@@ -166,5 +168,5 @@ jobs:
         # Save the cache even if the regression tests fail
         if: always() && !steps.cache-restore.outputs.cache-hit
         with:
-          path: ${{ github.workspace }}/xl2times/xl2times/.cache
-          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}
+          path: ~/.cache/xl2times
+          key: ${{ runner.os }}-py-${{ env.PY_VERSION }}-${{ env.REF_demos-xlsx }}-${{ env.REF_tim }}-${{ env.CACHE_KEY }}

From bb2a536e9828fa2a48e1fcbdbb392c516f318ea7 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Fri, 22 Mar 2024 09:37:20 +0200
Subject: [PATCH 11/11] Fix docs

---
 xl2times/__main__.py | 2 +-
 xl2times/dd_to_csv.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xl2times/__main__.py b/xl2times/__main__.py
index 59606b0..e03b137 100644
--- a/xl2times/__main__.py
+++ b/xl2times/__main__.py
@@ -61,7 +61,7 @@ def _read_xlsx_cached(filename: str | Path) -> list[EmbeddedXlTable]:
             tables = pickle.load(f)
         logger.info(f"Using cached data for {filename} from {cached_file}")
     else:
-        # extracted data and write it to cache before returning it
+        # extract data and write it to cache before returning it
         tables = excel.extract_tables(str(filename))
         with cached_file.open("wb") as f:
             pickle.dump(tables, f)
diff --git a/xl2times/dd_to_csv.py b/xl2times/dd_to_csv.py
index 6f611ea..46f652b 100644
--- a/xl2times/dd_to_csv.py
+++ b/xl2times/dd_to_csv.py
@@ -13,7 +13,7 @@ def parse_parameter_values_from_file(
     path: Path,
 ) -> tuple[dict[str, list], dict[str, set]]:
-    """Parse *.dd to turn it into CSV format.
 
     There are parameters and sets, and each has a slightly different format.
     `*.dd` files have data of the following form: