From a2880820bde54acd884e09594a857235277f4ec8 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Mon, 27 May 2024 13:55:20 +0000 Subject: [PATCH] Added initial zimwriterfs clone --- CHANGELOG.md | 4 + pyproject.toml | 1 + src/zimwriterfs.py | 338 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 343 insertions(+) create mode 100644 src/zimwriterfs.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0864fe08..fa53a7ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- `zimwriterfs` script that mimics zim-tools's zimwriterfs + ### Changed - Simplify type annotations by replacing Union and Optional with pipe character ("|") for improved readability and clarity diff --git a/pyproject.toml b/pyproject.toml index dafdcade..61a61191 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dev = [ [project.scripts] fix_ogvjs_dist = "zimscraperlib.fix_ogvjs_dist:run" +zimwriterfs = "zimwriterfs:entrypoint" [tool.hatch.version] path = "src/zimscraperlib/__about__.py" diff --git a/src/zimwriterfs.py b/src/zimwriterfs.py new file mode 100644 index 00000000..d82e6279 --- /dev/null +++ b/src/zimwriterfs.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 + +""" zimwriterfs alternative implementation + + requires libmagic1 and zimscraperlib (python) + + WARN: the following zimwriterfs features are not implemented: + `--inflateHtml` + `--skip-libmagic-check` +""" + +from __future__ import annotations + +import argparse +import datetime +import logging +from pathlib import Path + +from zimscraperlib.__about__ import __version__ +from zimscraperlib.constants import ( + MAXIMUM_DESCRIPTION_METADATA_LENGTH, + MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, + RECOMMENDED_MAX_TITLE_LENGTH, +) +from zimscraperlib.logging import getLogger +from zimscraperlib.zim.creator import Creator +from zimscraperlib.zim.filesystem import add_redirects_to_zim, add_to_zim 
# Default value for the ZIM `Scraper` metadata and for `--version` output.
SCRAPER = f"zimwriterfs(py) {__version__}"
# Module-level logger; `main()` rebinds it at DEBUG level when `verbose` is set.
logger = getLogger("zimwriterfs", level=logging.INFO)


def main(
    *,
    src_dir: str,
    dst_path: str,
    name: str,
    language: str,
    title: str,
    description: str,
    creator: str,
    publisher: str,
    illus_fname: str,
    tags: str,
    welcome: str,
    verbose: bool,
    threads: int,
    without_ft_index: bool,
    redirects_path: str | None = None,
    long_description: str | None = None,
    flavour: str | None = None,
    source: str | None = None,
    scraper: str | None = None,
    license_m: str | None = None,
    relation: str | None = None,
    cluster_size: int | None = None,
) -> None:
    """Pack the content of `src_dir` into a ZIM file written at `dst_path`.

    Args:
        src_dir: directory holding the static website to pack.
        dst_path: path of the ZIM file to create.
        name: value for the ZIM `Name` metadata.
        language: value for the ZIM `Language` metadata; also passed to the
            indexing configuration.
        title: value for the ZIM `Title` metadata.
        description: value for the ZIM `Description` metadata.
        creator: value for the ZIM `Creator` metadata.
        publisher: value for the ZIM `Publisher` metadata.
        illus_fname: path, relative to `src_dir`, of the illustration file
            whose bytes are stored as the `Illustration_48x48_at_1` metadata.
        tags: semicolon-separated tags; an empty string means no `Tags`.
        welcome: main page path, passed to the creator as `main_path`.
        verbose: enable DEBUG logging and the creator's verbose mode.
        threads: number of workers passed to the creator.
        without_ft_index: when true, indexing is configured as disabled.
        redirects_path: optional path to a redirects file to add to the ZIM.
        long_description: optional `LongDescription` metadata.
        flavour: optional `Flavour` metadata.
        source: optional `Source` metadata.
        scraper: optional `Scraper` metadata.
        license_m: optional `License` metadata.
        relation: optional `Relation` metadata.
        cluster_size: optional cluster size; only configured when truthy.

    Raises:
        OSError: if `src_dir` is not an existing directory or if the
            illustration is not an existing file inside it.
        Exception: anything raised while adding content is re-raised after the
            creator has been flagged as not finishable.
    """
    # `logger` is rebound below; without this declaration the assignment would
    # make it a function-local name and every non-verbose run would crash with
    # UnboundLocalError on the first logging call.
    global logger  # noqa: PLW0603
    if verbose:
        logger = getLogger("zimwriterfs", level=logging.DEBUG)

    logger.info(f"Dumping {src_dir} into {dst_path}")

    build_dir = Path(src_dir).expanduser().resolve()
    # is_dir() is already False for a missing path
    if not build_dir.is_dir():
        raise OSError(f"Incorrect build_dir: {build_dir}")

    illustration_fpath = build_dir / illus_fname
    if not illustration_fpath.is_file():
        raise OSError(f"Incorrect illustration: {illus_fname} ({illustration_fpath})")
    illustration_data = illustration_fpath.read_bytes()

    # expanded/resolved the same way as the other user-supplied paths
    redirects_fpath = (
        Path(redirects_path).expanduser().resolve() if redirects_path else None
    )

    zim_file = (
        Creator(
            filename=Path(dst_path).expanduser().resolve(),
            main_path=welcome,
            ignore_duplicates=False,
            disable_metadata_checks=False,
        )
        .config_indexing(not without_ft_index, language)
        .config_verbose(verbose)
        .config_nbworkers(threads)
    )

    if cluster_size:
        zim_file.config_clustersize(cluster_size)

    zim_file.config_metadata(
        Name=name,
        Language=language,
        Title=title,
        Description=description,
        LongDescription=long_description,
        Creator=creator,
        Publisher=publisher,
        Date=datetime.date.today(),  # noqa: DTZ011
        Illustration_48x48_at_1=illustration_data,
        # already a semicolon-separated string; empty means "no tags"
        Tags=tags or None,
        Scraper=scraper,
        Flavour=flavour,
        Source=source,
        License=license_m,
        Relation=relation,
    )

    zim_file.start()
    try:
        logger.debug(f"Preparing zimfile at {zim_file.filename}")

        # recursively add content from build_dir
        logger.debug(f"Recursively adding files from {build_dir}")
        add_to_zim(build_dir, zim_file, build_dir)

        if redirects_fpath:
            logger.debug("Creating redirects")
            add_redirects_to_zim(zim_file, redirects_file=redirects_fpath)

    # prevents .finish() which would create an incomplete .zim file
    # this would leave a .zim.tmp folder behind.
    # UPSTREAM: wait until a proper cancel() is provided
    except Exception:
        zim_file.can_finish = False  # pragma: no cover
        raise
    finally:
        zim_file.finish()


def entrypoint() -> None:
    """Parse command-line arguments and run `main()` with them.

    Every parsed option is forwarded to `main()` as a keyword argument, so each
    argparse `dest` must match a `main()` parameter name.

    Raises:
        SystemExit: with code 1 (chained to the original exception) when
            `main()` raises; argparse itself also exits on invalid arguments
            and on `--version`.
    """
    parser = argparse.ArgumentParser(
        prog="zimwriterfs",
        description="Create a ZIM file off a directory containing a static website",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Notes:\n"
            "  - openZIM Metadata: https://wiki.openzim.org/wiki/Metadata\n"
            "  - Set `ZIMSCRAPERLIB_DEBUG=1` environ to see all files included in ZIM"
        ),
    )

    parser.add_argument("src_dir")
    parser.add_argument("dst_path")

    parser.add_argument(
        "-V",
        "--version",
        help="Display version and exit",
        action="version",
        version=SCRAPER,
    )

    parser.add_argument(
        "-n",
        "--name",
        help="Human identifier for the content (see spec)",
        required=True,
    )

    parser.add_argument(
        "-t",
        "--title",
        help=f"Title of the content ({RECOMMENDED_MAX_TITLE_LENGTH} chars max)",
        required=True,
    )

    parser.add_argument(
        "-l",
        "--language",
        help="ISO-639-3 Language code of the content",
        required=True,
    )

    parser.add_argument(
        "-d",
        "--description",
        help="Description of the content "
        f"({MAXIMUM_DESCRIPTION_METADATA_LENGTH} chars max)",
        required=True,
    )

    parser.add_argument(
        "-w",
        "--welcome",
        help="Relative path to home/main page (ex: `home.html`)",
        required=True,
    )

    parser.add_argument(
        "-I",
        "--illustration",
        dest="illus_fname",
        help="Relative path to the ZIM's illustration. Must be a 48x48px PNG",
        required=True,
    )

    parser.add_argument(
        "-c",
        "--creator",
        help="Creator of the Content",
        required=True,
    )

    parser.add_argument(
        "-p",
        "--publisher",
        help="Publisher of the Content",
        required=True,
    )

    parser.add_argument(
        "-r",
        "--redirects",
        dest="redirects_path",
        help="Path to a TSV file containing a list of redirects (url title target_url)",
        required=False,
    )

    parser.add_argument(
        "-a",
        "--tags",
        help="ZIM Tags, semicolon separated",
        default="",
        required=False,
    )

    parser.add_argument(
        "-e",
        "--source",
        help="Content source URL",
        required=False,
    )

    parser.add_argument(
        "-o",
        "--flavour",
        help="custom (version independent) content flavour",
        required=False,
    )

    parser.add_argument(
        "-s",
        "--scraper",
        # previously carried the --flavour help text by mistake
        help="Name & version of the tool used to produce the content",
        default=SCRAPER,
        required=False,
    )

    parser.add_argument(
        "--license",
        help="License of the content",
        dest="license_m",
        required=False,
    )

    parser.add_argument(
        "--relation",
        help="URI of external related ressources ",
        required=False,
    )

    parser.add_argument(
        "-L",
        "--longDescription",
        dest="long_description",
        help="Longer description of the content "
        f"({MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH} chars max)",
        required=False,
    )

    parser.add_argument(
        "-v",
        "--verbose",
        help="Whether to display processing details",
        action="store_true",
        default=False,
        required=False,
    )

    parser.add_argument(
        "-m",
        "--clusterSize",
        dest="cluster_size",
        type=int,
        help="Number of bytes per ZIM cluster (libzim defaults to 2MiB)",
        default=None,
        required=False,
    )

    parser.add_argument(
        "-J",
        "--threads",
        type=int,
        default=4,
        help="Number of threads/workers for libzim to use",
        required=False,
    )

    parser.add_argument(
        "-j",
        "--withoutFTIndex",
        dest="without_ft_index",
        action="store_true",
        default=False,
        help="Don't create and add a fulltext index of the content to the ZIM",
        required=False,
    )

    # NOTE: the two zimwriterfs options below are not implemented; their
    # definitions are kept for reference only.
    # parser.add_argument(
    #     "-x",
    #     "--inflateHtml",
    #     dest="inflate_html",
    #     action="store_true",
    #     default=False,
    #     help="[NOT IMPLEMENTED] try to inflate HTML files before packing",
    #     required=False,
    # )

    # parser.add_argument(
    #     "--skip-libmagic-check",
    #     dest="skip_libmagic_check",
    #     action="store_true",
    #     default=False,
    #     help="Accept to run even if magic file cannot be loaded "
    #     + "(mimetypes in the zim file may be wrong)",
    #     required=False,
    # )

    args = parser.parse_args()

    try:
        # vars() is the public way to view a Namespace as a dict
        main(**vars(args))
    except Exception as exc:
        logger.exception(exc)
        logger.error(f"FAILED. An error occured: {exc}")
        raise SystemExit(1) from exc


if __name__ == "__main__":
    entrypoint()