Skip to content

Commit

Permalink
fix: filter for most recent files (#163)
Browse files Browse the repository at this point in the history
* fix filter for most recent files

* added private fct for sort key

* changed sorting key including major and minor versions, adapted tests

---------

Co-authored-by: konstantin <[email protected]>
  • Loading branch information
DeltaDaniel and hf-kklein authored Oct 8, 2024
1 parent 2ccafff commit 51f4f6d
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 21 deletions.
2 changes: 1 addition & 1 deletion src/migmose/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def main(
format_version,
output_dir_for_format,
)
document_version = _extract_document_version(file)
document_version, *_ = _extract_document_version(file)
reduced_nested_nachrichtenstruktur.output_tree(m_format, output_dir_for_format, document_version)


Expand Down
73 changes: 59 additions & 14 deletions src/migmose/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,21 @@ def get_latest_file(file_list: list[Path]) -> Path:
try:
# Define the keywords to filter relevant files
keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]

files_containing_keywords = [
path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)
]
# Find the most recent file based on keywords and date suffixes
latest_file = max(
(path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)),
key=lambda path: (
int(path.stem.split("_")[-1]), # "gültig von" date
int(path.stem.split("_")[-2]), # "gültig bis" date
),
)
if any(files_containing_keywords):
# Find the most recent file based on keywords and date suffixes
latest_file = max(
(path for path in files_containing_keywords),
key=_get_sort_key,
)
else: # different versions but no kosildierte Lesefassung or außerordentliche Veröffentlichung at all
latest_file = max(
(path for path in file_list),
key=_get_sort_key,
)

except ValueError as e:
logger.error("Error processing file list: {}", e)
Expand Down Expand Up @@ -154,19 +160,58 @@ def parse_raw_nachrichtenstrukturzeile(input_path: Path) -> list[str]:


# Matches the document-version part of a MIG file name, e.g.
# "ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx".
# Named groups:
#   version - the full version string, e.g. "1.1a"
#   major   - digits before the dot, e.g. "1"
#   minor   - digits after the dot, e.g. "1"
#   suffix  - optional single letter after the minor version, e.g. "a" (empty string if absent)
# The version has to be followed directly by "_", "KonsolidierteLesefassung" or
# "-AußerordentlicheVeröffentlichung"; a version at the very end of the string does not match.
# A version that does not start with a digit right after "Lesefassung" (e.g. "S1.1") does not match.
# NOTE: the "?" after "Lesefassung" only makes the final "g" optional.
# NOTE: because of re.IGNORECASE the suffix class [a-z] also accepts upper-case letters.
_pattern = re.compile(
    r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(?P<version>(?P<major>\d+)\.(?P<minor>\d+)(?P<suffix>[a-z]?))"
    r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)",
    re.IGNORECASE,
)


def _extract_document_version(path: Path | str) -> tuple[str, int | None, int | None, str]:
    """
    Extract the document version (major.minor plus optional suffix letter) from a file path.

    Args:
        path (Path | str): The path to the file (or any string containing the file name).
            Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx
            -> version: "1.1a", major: 1, minor: 1, suffix: "a"
    Returns:
        tuple: The document version (str), the major version (int), the minor version (int)
            and the suffix (str, empty if there is none).
            If the name does not match the expected pattern, an error is logged and
            ("", None, None, "") is returned.
    """
    # str() is a no-op for strings, so Path and str inputs can share one code path.
    document_str = str(path)
    matches = _pattern.search(document_str)
    if matches is None:
        # NOTE(review): `fg="red"` is handed to the logger as a plain keyword argument;
        # confirm that the logging library in use actually does something with it.
        logger.error(f"❌ Unexpected document name in {path}.", fg="red")
        return "", None, None, ""
    # On a match the "major" and "minor" groups consist of at least one digit each, so int() cannot
    # fail, "version" is never empty and "suffix" is either one letter or the empty string.
    return (
        matches.group("version"),
        int(matches.group("major")),
        int(matches.group("minor")),
        matches.group("suffix"),
    )


def _get_sort_key(path: Path) -> tuple[int, int, int, int, str]:
    """
    Build the key used to pick the most recent file out of several candidates.

    The file stem is expected to look like "<document name>_<gültig bis>_<gültig von>".

    Args:
        path (Path): The path object to extract the sort key from.
            Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx
            -> (20231001, 99991231, 1, 1, "a")
    Returns:
        tuple: The "gültig von" date (int), the "gültig bis" date (int), the major version (int),
            the minor version (int) and the suffix (str).
            If no version can be extracted from the name, major and minor are -1 so that such
            files sort before any versioned file with the same dates.
    Raises:
        ValueError: If the stem has fewer than three "_"-separated parts or the last two parts
            are not integers.
    """
    parts = path.stem.split("_")
    if len(parts) < 3:
        # Raise ValueError (instead of an IndexError further down) for malformed names.
        raise ValueError(f"Expected a file name like '<name>_<gültig bis>_<gültig von>' but got '{path.name}'")
    gueltig_von_date = int(parts[-1])
    gueltig_bis_date = int(parts[-2])
    # Pass the whole stem rather than only parts[-3]: splitting at "_" removes the underscore that
    # terminates the version in the file name, and without it the version pattern cannot match.
    _, major, minor, suffix = _extract_document_version(path.stem)
    # None cannot be ordered against int; using -1 keeps all keys mutually comparable.
    major_key = major if major is not None else -1
    minor_key = minor if minor is not None else -1
    return gueltig_von_date, gueltig_bis_date, major_key, minor_key, suffix
35 changes: 30 additions & 5 deletions unittests/__snapshots__/test_parsing.ambr
Original file line number Diff line number Diff line change
@@ -1,16 +1,41 @@
# serializer version: 1
# name: TestParsing.test_extract_document_version[IFTSTA]
''
tuple(
'',
None,
None,
'',
)
# ---
# name: TestParsing.test_extract_document_version[REMADV]
'2.9b'
tuple(
'2.9b',
2,
9,
'b',
)
# ---
# name: TestParsing.test_extract_document_version[REQOTE]
'1.3'
tuple(
'1.3',
1,
3,
'',
)
# ---
# name: TestParsing.test_extract_document_version[UTILMDG]
'G1.0a'
tuple(
'',
None,
None,
'',
)
# ---
# name: TestParsing.test_extract_document_version[UTILMDS]
'S1.1'
tuple(
'',
None,
None,
'',
)
# ---
2 changes: 1 addition & 1 deletion unittests/test_reduced_nested_nachrichtenstruktur.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_output_tree(self, message_format: EdifactFormat, tmp_path, snapshot):
reduced_nested_nachrichtenstruktur = ReducedNestedNachrichtenstruktur.create_reduced_nested_nachrichtenstruktur(
nested_nachrichtenstruktur
)
document_version = _extract_document_version(file_path)
document_version, *_ = _extract_document_version(file_path)
reduced_nested_nachrichtenstruktur.output_tree(message_format, tmp_path, document_version)
with open(tmp_path / f"{message_format}{document_version}.tree", "r", encoding="utf-8") as actual_file:
assert actual_file.read() == snapshot
Expand Down

0 comments on commit 51f4f6d

Please sign in to comment.