Process WARC files directly and do not pass browsertrix version to warc2zim #356

Merged · 3 commits · Aug 7, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -11,12 +11,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
- Add daily automated end-to-end tests of a page with Youtube player (#330)
- Add `--warcs` option to directly process WARC files (#301)

### Changed

- Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
- Fix README imprecisions + add back warc2zim availability in docker image (#314)
- Enhance integration test to assert final content of the ZIM (#287)
- Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
- Do not log number of WARC files found (#357)

## [2.0.6] - 2024-08-02

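As context for the `--warcs` entry above, a minimal sketch of how the option might be invoked; `run()` is the real entry point shown in the diff below, but the paths, URL, name, and output values here are purely hypothetical:

```python
# Hypothetical invocation: hand pre-built WARCs to zimit instead of crawling.
# --warcs is a single comma-separated value mixing local paths and HTTP(S) URLs.
from zimit.zimit import run

run(
    [
        "--warcs", "/data/site.warc.gz,https://example.com/batch.tar.gz",
        "--name", "my-zim",    # unrecognized args are passed through to warc2zim
        "--output", "/output",
    ]
)
```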
146 changes: 105 additions & 41 deletions src/zimit/zimit.py
@@ -12,6 +12,7 @@
import signal
import subprocess
import sys
import tarfile
import tempfile
import urllib.parse
from argparse import ArgumentParser
@@ -363,26 +364,20 @@ def run(raw_args):
"individual JS files URL/path separated by a comma",
)

parser.add_argument(
"--warcs",
help="Directly convert WARC archives to ZIM, by-passing the crawling phase. "
"This argument must contain the path or HTTP(S) URL to either warc.gz files or"
"to a tar.gz containing the warc.gz files. Single value with individual "
"path/URLs separated by comma",
)

zimit_args, warc2zim_args = parser.parse_known_args(raw_args)

logger.info("Checking browsertrix-crawler version")
crawl_version_cmd = ["crawl", "--version"]
try:
crawl = subprocess.run(
crawl_version_cmd, check=True, capture_output=True, text=True
)
except Exception:
logger.error("Failed to get Browsertrix crawler version")
raise
crawler_version = crawl.stdout.strip()
logger.info(f"Browsertrix crawler: version {crawler_version}")

# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
# versions are associated with the ZIM
# pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are
# associated with the ZIM; make it a CSV for easier parsing
warc2zim_args.append("--scraper-suffix")
warc2zim_args.append(
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
)
warc2zim_args.append(f"zimit {__version__}")

# pass url and output to warc2zim also
if zimit_args.output:
@@ -513,36 +508,105 @@ def cleanup():
f"Output to tempdir: {temp_root_dir} - "
f"{'will keep' if zimit_args.keep else 'will delete'}"
)
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(cmd_args, check=False)
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
logger.info("crawl interupted by a limit")
elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)

if zimit_args.collection:
warc_directory = temp_root_dir.joinpath(
f"collections/{zimit_args.collection}/archive/"
)
else:
warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
if len(warc_dirs) == 0:
raise RuntimeError(
"Failed to find directory where WARC files have been created"
# if WARC files are passed, do not run browsertrix crawler; fetch the files if
# they are provided as an HTTP(S) URL and extract the archive if it is a tar.gz
warc_files: list[Path] = []
if zimit_args.warcs:
for warc_location in [
warc_location.strip() for warc_location in zimit_args.warcs.split(",")
]:
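# Path.suffixes collects every extension, so "batch.tar.gz" maps to ".tar.gz"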
suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes)
if suffix not in {".tar.gz", ".warc", ".warc.gz"}:
raise Exception(f"Unsupported file at {warc_location}")

filename = tempfile.NamedTemporaryFile(
dir=temp_root_dir,
prefix="warc_",
suffix=suffix,
delete_on_close=False,
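# delete_on_close=False (Python 3.12+) keeps the file on disk after close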
)
elif len(warc_dirs) > 1:
logger.info("Found many WARC files directories, only last one will be used")
for directory in warc_dirs:
logger.info(f"- {directory}")
warc_directory = warc_dirs[-1]

if not re.match(r"^https?\://", warc_location):
# warc_location is not a URL, so it is a local path; handle the file in place
if not Path(warc_location).exists():
raise Exception(f"Impossible to find file at {warc_location}")

# if it is a plain warc or warc.gz, simply add it to the list
if suffix in {".warc", ".warc.gz"}:
warc_files.append(Path(warc_location))
continue

# otherwise extract tar.gz but do not delete it afterwards
extract_path = temp_root_dir / f"{filename.name}_files"
logger.info(
f"Extracting WARC(s) from {warc_location} to {extract_path}"
)
with tarfile.open(warc_location, "r:gz") as fh:
# Extract all the contents to the specified directory
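# the "data" filter (stdlib since Python 3.12) rejects unsafe members
# such as absolute paths or parent-directory traversal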
fh.extractall(path=extract_path, filter="data")
warc_files.append(Path(extract_path))
continue

# warc_location is a URL, let's download it to a temp name to avoid name
# collisions
warc_file = Path(filename.name)
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
warc_file.write_bytes(resp.content)
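# note: resp.content buffers the whole download in memory before writing;
# fine for modest WARCs, very large ones would call for streaming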

# if it is a plain warc or warc.gz, simply add it to the list
if suffix in {".warc", ".warc.gz"}:
warc_files.append(warc_file)
continue

# otherwise extract tar.gz and delete it afterwards
extract_path = temp_root_dir / f"{filename.name}_files"
logger.info(f"Extracting WARC(s) from {warc_file} to {extract_path}")
with tarfile.open(warc_file, "r:gz") as fh:
# Extract all the contents to the specified directory
fh.extractall(path=extract_path, filter="data")
logger.info(f"Deleting archive at {warc_file}")
warc_file.unlink()
warc_files.append(Path(extract_path))

else:

logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(cmd_args, check=False)
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
logger.info("crawl interupted by a limit")
elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)

if zimit_args.collection:
warc_files = [
temp_root_dir.joinpath(f"collections/{zimit_args.collection}/archive/")
]

else:
warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
if len(warc_dirs) == 0:
raise RuntimeError(
"Failed to find directory where WARC files have been created"
)
elif len(warc_dirs) > 1:
logger.info(
"Found many WARC files directories, only last one will be used"
)
for directory in warc_dirs:
logger.info(f"- {directory}")
warc_files = [warc_dirs[-1]]

logger.info("")
logger.info("----------")
logger.info(f"Processing WARC files in {warc_directory}")
warc2zim_args.append(str(warc_directory))
logger.info(
f"Processing WARC files in/at "
f'{" ".join(str(warc_file) for warc_file in warc_files)}'
)
warc2zim_args.extend(str(warc_file) for warc_file in warc_files)

num_files = sum(1 for _ in warc_directory.iterdir())
logger.info(f"{num_files} WARC files found")
logger.info(f"Calling warc2zim with these args: {warc2zim_args}")

return warc2zim(warc2zim_args)
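For readers who want to poke at the validation logic in isolation, a self-contained sketch of the suffix detection used above, assuming only the standard library:

```python
import urllib.parse
from pathlib import Path


def detect_suffix(warc_location: str) -> str:
    # join all extensions so multi-part suffixes like ".tar.gz" are kept whole
    suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes)
    if suffix not in {".tar.gz", ".warc", ".warc.gz"}:
        raise Exception(f"Unsupported file at {warc_location}")
    return suffix


assert detect_suffix("/data/site.warc.gz") == ".warc.gz"
assert detect_suffix("https://example.com/batch.tar.gz?sig=abc") == ".tar.gz"
# caveat: dotted basenames such as "site.v2.warc.gz" yield ".v2.warc.gz"
# and are therefore rejected by the check above
```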
2 changes: 1 addition & 1 deletion tests-integration/integration.py
@@ -28,7 +28,7 @@ def test_zim_scraper():
scraper = zim_fh.get_text_metadata("Scraper")
assert "zimit " in scraper
assert "warc2zim " in scraper
assert "Browsertrix crawler " in scraper
assert "Browsertrix-Crawler " in scraper


def test_files_list():
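To make the assertion change concrete, a hedged illustration of the Scraper metadata; the real string is composed by warc2zim, so the versions below are invented:

```python
# Invented value: warc2zim combines its own version, the "zimit x.y.z"
# scraper suffix passed by zimit, and the crawler version it detects itself
# (spelled "Browsertrix-Crawler", hence the assertion change above).
scraper = "warc2zim 2.1.0, zimit 2.0.7, Browsertrix-Crawler 1.3.0"
assert "zimit " in scraper
assert "warc2zim " in scraper
assert "Browsertrix-Crawler " in scraper
```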