From eb32adfea7ef193039f7447f7ca90d37134cca82 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 7 Aug 2024 11:59:46 +0000 Subject: [PATCH] Sort WARC directories passed to zimit by modification time --- CHANGELOG.md | 4 ++++ src/zimit/zimit.py | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b529179..481ec33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354) - Do not log number of WARC files found (#357) +### Fixed + +- Sort WARC directories found by modification time (#366) + ## [2.0.6] - 2024-08-02 ### Changed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 46ed485..a925ba2 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -586,14 +586,18 @@ def cleanup(): ] else: - warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/")) + warc_dirs = sorted( + temp_root_dir.rglob("collections/crawl-*/archive/"), + key=lambda path: path.lstat().st_mtime, + ) if len(warc_dirs) == 0: raise RuntimeError( "Failed to find directory where WARC files have been created" ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, only last one will be used" + "Found many WARC files directories, only most recently modified one" + " will be used" ) for directory in warc_dirs: logger.info(f"- {directory}")