From 861751a7edb84f3f41fcb637f7e2cd2e58d25e9e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 23 Jul 2024 09:10:16 +0000 Subject: [PATCH 1/3] Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim --- CHANGELOG.md | 1 + src/zimit/zimit.py | 20 +++----------------- tests-integration/integration.py | 2 +- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb68082..3bb1dbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288) - Fix README imprecisions + add back warc2zim availability in docker image (#314) - Enhance integration test to assert final content of the ZIM (#287) +- Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354) ## [2.0.6] - 2024-08-02 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index c464519..48629dd 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -365,24 +365,10 @@ def run(raw_args): zimit_args, warc2zim_args = parser.parse_known_args(raw_args) - logger.info("Checking browsertrix-crawler version") - crawl_version_cmd = ["crawl", "--version"] - try: - crawl = subprocess.run( - crawl_version_cmd, check=True, capture_output=True, text=True - ) - except Exception: - logger.error("Failed to get Browsertrix crawler version") - raise - crawler_version = crawl.stdout.strip() - logger.info(f"Browsertrix crawler: version {crawler_version}") - - # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler - # versions are associated with the ZIM + # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are + # associated with the ZIM ; make it a CSV for easier parsing warc2zim_args.append("--scraper-suffix") - warc2zim_args.append( - f" + zimit {__version__} + Browsertrix crawler {crawler_version}" - ) + warc2zim_args.append(f"zimit {__version__}") # pass url and output to warc2zim also if zimit_args.output: diff --git a/tests-integration/integration.py b/tests-integration/integration.py index c314167..16ab337 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -28,7 +28,7 @@ def test_zim_scraper(): scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper - assert "Browsertrix crawler " in scraper + assert "Browsertrix-Crawler " in scraper def test_files_list(): From 459a30a226a8ed56056ab4889d53557e3dbb2cb9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 23 Jul 2024 09:27:15 +0000 Subject: [PATCH 2/3] Do not log number of WARC files found --- CHANGELOG.md | 1 + src/zimit/zimit.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bb1dbc..682d7aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix README imprecisions + add back warc2zim availability in docker image (#314) - Enhance integration test to assert final content of the ZIM (#287) - Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354) +- Do not log number of WARC files found (#357) ## [2.0.6] - 2024-08-02 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 48629dd..f84fbda 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -527,8 +527,6 @@ def cleanup(): logger.info(f"Processing WARC files in 
{warc_directory}") warc2zim_args.append(str(warc_directory)) - num_files = sum(1 for _ in warc_directory.iterdir()) - logger.info(f"{num_files} WARC files found") logger.info(f"Calling warc2zim with these args: {warc2zim_args}") return warc2zim(warc2zim_args) From 8cd1db6eef2754f42230c7c3ed7d2f7c29f9b9a5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 23 Jul 2024 09:27:23 +0000 Subject: [PATCH 3/3] Add option to directly process WARC files --- CHANGELOG.md | 1 + src/zimit/zimit.py | 124 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 103 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 682d7aa..b529179 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313) - Add daily automated end-to-end tests of a page with Youtube player (#330) +- Add `--warcs` option to directly process WARC files (#301) ### Changed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index f84fbda..46ed485 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -12,6 +12,7 @@ import signal import subprocess import sys +import tarfile import tempfile import urllib.parse from argparse import ArgumentParser @@ -363,6 +364,14 @@ def run(raw_args): "individual JS files URL/path separated by a comma", ) + parser.add_argument( + "--warcs", + help="Directly convert WARC archives to ZIM, by-passing the crawling phase. " + "This argument must contain the path or HTTP(S) URL to either warc.gz files or" + "to a tar.gz containing the warc.gz files. Single value with individual " + "path/URLs separated by comma", + ) + zimit_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are @@ -499,33 +508,104 @@ def cleanup(): f"Output to tempdir: {temp_root_dir} - " f"{'will keep' if zimit_args.keep else 'will delete'}" ) - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") - crawl = subprocess.run(cmd_args, check=False) - if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: - logger.info("crawl interupted by a limit") - elif crawl.returncode != 0: - raise subprocess.CalledProcessError(crawl.returncode, cmd_args) - if zimit_args.collection: - warc_directory = temp_root_dir.joinpath( - f"collections/{zimit_args.collection}/archive/" - ) - else: - warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/")) - if len(warc_dirs) == 0: - raise RuntimeError( - "Failed to find directory where WARC files have been created" + # if warc files are passed, do not run browsertrix crawler but fetch the files if + # they are provided as an HTTP URL + extract the archive if it is a tar.gz + warc_files: list[Path] = [] + if zimit_args.warcs: + for warc_location in [ + warc_location.strip() for warc_location in zimit_args.warcs.split(",") + ]: + suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes) + if suffix not in {".tar.gz", ".warc", ".warc.gz"}: + raise Exception(f"Unsupported file at {warc_location}") + + filename = tempfile.NamedTemporaryFile( + dir=temp_root_dir, + prefix="warc_", + suffix=suffix, + delete_on_close=False, ) - elif len(warc_dirs) > 1: - logger.info("Found many WARC files directories, only last one will be used") - for directory in warc_dirs: - logger.info(f"- {directory}") - warc_directory = warc_dirs[-1] + + if not re.match(r"^https?\://", warc_location): + # 
warc_location is not a URL, so it is a path, simply add it to the list + if not Path(warc_location).exists(): + raise Exception(f"Impossible to find file at {warc_location}") + + # if it is a plain warc or warc.gz, simply add it to the list + if suffix in {".warc", ".warc.gz"}: + warc_files.append(Path(warc_location)) + continue + + # otherwise extract tar.gz but do not delete it afterwards + extract_path = temp_root_dir / f"{filename.name}_files" + logger.info( + f"Extracting WARC(s) from {warc_location} to {extract_path}" + ) + with tarfile.open(warc_location, "r:gz") as fh: + # Extract all the contents to the specified directory + fh.extractall(path=extract_path, filter="data") + warc_files.append(Path(extract_path)) + continue + + # warc_location is a URL, let's download it to a temp name to avoid name + # collisions + warc_file = Path(filename.name) + logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}") + resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT) + resp.raise_for_status() + warc_file.write_bytes(resp.content) + + # if it is a plain warc or warc.gz, simply add it to the list + if suffix in {".warc", ".warc.gz"}: + warc_files.append(warc_file) + continue + + # otherwise extract tar.gz and delete it afterwards + extract_path = temp_root_dir / f"{filename.name}_files" + logger.info(f"Extracting WARC(s) from {warc_file} to {extract_path}") + with tarfile.open(warc_file, "r:gz") as fh: + # Extract all the contents to the specified directory + fh.extractall(path=extract_path, filter="data") + logger.info(f"Deleting archive at {warc_file}") + warc_file.unlink() + warc_files.append(Path(extract_path)) + + else: + + logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") + crawl = subprocess.run(cmd_args, check=False) + if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: + logger.info("crawl interupted by a limit") + elif crawl.returncode != 0: + raise subprocess.CalledProcessError(crawl.returncode, cmd_args) + + if zimit_args.collection: + warc_files = [ + temp_root_dir.joinpath(f"collections/{zimit_args.collection}/archive/") + ] + + else: + warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/")) + if len(warc_dirs) == 0: + raise RuntimeError( + "Failed to find directory where WARC files have been created" + ) + elif len(warc_dirs) > 1: + logger.info( + "Found many WARC files directories, only last one will be used" + ) + for directory in warc_dirs: + logger.info(f"- {directory}") + warc_files = [warc_dirs[-1]] logger.info("") logger.info("----------") - logger.info(f"Processing WARC files in {warc_directory}") - warc2zim_args.append(str(warc_directory)) + logger.info( + f"Processing WARC files in/at " + f'{" ".join(str(warc_file) for warc_file in warc_files)}' + ) + warc2zim_args.extend(str(warc_file) for warc_file in warc_files) logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
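
Taken together, the third patch means that when --warcs is given, the browsertrix-crawler step is skipped entirely and each comma-separated location is normalized into something warc2zim can consume: local .warc/.warc.gz files are used as-is, HTTP(S) URLs are downloaded first, and .tar.gz archives are extracted into a sub-directory. Below is a minimal standalone sketch of that resolution logic, not the scraper's actual function: the helper name and the REQUESTS_TIMEOUT value are illustrative, delete=False stands in for the patch's delete_on_close, and the filter="data" argument needs a Python version with tarfile extraction filters.

import re
import tarfile
import tempfile
import urllib.parse
from pathlib import Path

import requests

REQUESTS_TIMEOUT = 10  # illustrative value; the scraper defines its own constant


def resolve_warc_locations(warcs: str, work_dir: Path) -> list[Path]:
    """Normalize the comma-separated --warcs value into local paths for warc2zim.

    Local .warc/.warc.gz files are kept as-is, HTTP(S) URLs are downloaded into
    work_dir, and .tar.gz archives are extracted into a sub-directory whose path
    is returned in place of the archive.
    """
    resolved: list[Path] = []
    for location in (loc.strip() for loc in warcs.split(",")):
        suffix = "".join(Path(urllib.parse.urlparse(location).path).suffixes)
        if suffix not in {".tar.gz", ".warc", ".warc.gz"}:
            raise ValueError(f"Unsupported file at {location}")

        if re.match(r"^https?://", location):
            # download the URL to a unique temporary name to avoid collisions
            local = Path(
                tempfile.NamedTemporaryFile(
                    dir=work_dir, prefix="warc_", suffix=suffix, delete=False
                ).name
            )
            resp = requests.get(location, timeout=REQUESTS_TIMEOUT)
            resp.raise_for_status()
            local.write_bytes(resp.content)
        else:
            local = Path(location)
            if not local.exists():
                raise ValueError(f"Impossible to find file at {location}")

        if suffix in {".warc", ".warc.gz"}:
            resolved.append(local)
            continue

        # .tar.gz: extract the bundled warc.gz files and hand over the directory
        extract_path = work_dir / f"{local.name}_files"
        with tarfile.open(local, "r:gz") as fh:
            fh.extractall(path=extract_path, filter="data")
        resolved.append(extract_path)
    return resolved

Supporting tar.gz keeps a whole crawl's set of warc.gz files addressable as a single location, and since the resolved paths simply extend warc2zim_args, warc2zim receives them exactly as it would receive a crawl's collections/.../archive/ directory in the normal crawling path.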
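
Patch 1 narrows what zimit itself injects: the --scraper-suffix passed to warc2zim now only carries "zimit <version>", and the updated integration test expects the crawler to appear in the ZIM's Scraper metadata as "Browsertrix-Crawler" (recorded by warc2zim rather than by zimit). A short check in the spirit of that test, assuming the reader used by the test suite is zimscraperlib's Archive; the import and function name here are illustrative.

from pathlib import Path

from zimscraperlib.zim import Archive  # assumed reader, as in the integration test


def check_scraper_metadata(zim_path: Path) -> None:
    """Assert that all three component versions end up in the Scraper metadata."""
    scraper = Archive(zim_path).get_text_metadata("Scraper")
    for component in ("zimit ", "warc2zim ", "Browsertrix-Crawler "):
        assert component in scraper, f"{component!r} missing from Scraper: {scraper!r}"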