From 074631c0531d25447bc68c0426f497f284d4b4ad Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 5 Mar 2024 15:40:56 +0000 Subject: [PATCH 1/3] Fix black formatting --- src/gutenberg2zim/entrypoint.py | 6 +++--- src/gutenberg2zim/urls.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py index ebda3e8..cd13f82 100755 --- a/src/gutenberg2zim/entrypoint.py +++ b/src/gutenberg2zim/entrypoint.py @@ -190,9 +190,9 @@ def f(x): only_books=books, force=force, s3_storage=s3_storage, - optimizer_version=optimizer_version - if not use_any_optimized_version - else None, + optimizer_version=( + optimizer_version if not use_any_optimized_version else None + ), ) if one_lang_one_zim_folder: if languages == []: diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py index deada12..60fa7ab 100644 --- a/src/gutenberg2zim/urls.py +++ b/src/gutenberg2zim/urls.py @@ -8,7 +8,6 @@ class UrlBuilder: - """ Url builder for the files of a Gutenberg book. Example: From 8f3a0d54d60da45c6d3e1664235c00fb41ea6c34 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 5 Mar 2024 15:40:13 +0000 Subject: [PATCH 2/3] Do not insert all RSYNC paths in database --- ChangeLog | 4 ++++ src/gutenberg2zim/entrypoint.py | 2 +- src/gutenberg2zim/urls.py | 28 ++++++++++++++++++++++++++-- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index aefe0d6..5c75a0e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -8,6 +8,10 @@ as of 2.0.0. 
## [Unreleased] +### Changed + +- Insert as few rsync URLs as possible in DB when a book selection is made (#220) + ## [2.1.1] - 2024-01-17 ### Added diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py index cd13f82..4ee145f 100755 --- a/src/gutenberg2zim/entrypoint.py +++ b/src/gutenberg2zim/entrypoint.py @@ -178,7 +178,7 @@ def f(x): logger.info(f"PARSING rdf-files in {rdf_path}") parse_and_fill(rdf_path=rdf_path, only_books=books) logger.info("Add possible url to db") - setup_urls(force=force) + setup_urls(force=force, books=books) if do_download: logger.info("DOWNLOADING ebooks from mirror using filters") diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py index 60fa7ab..94817e8 100644 --- a/src/gutenberg2zim/urls.py +++ b/src/gutenberg2zim/urls.py @@ -226,7 +226,7 @@ def build_html(files): return list(set(urls)) -def setup_urls(force): +def setup_urls(force, books): file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}") if file_with_url.exists() and not force: @@ -260,10 +260,34 @@ def setup_urls(force): qry.execute() logger.info("\tAppending urls in DB from rsync result") - # strip rsync file to only contain relative path + count_dir = count_old = count_added = count_processed = 0 with open(file_with_url, errors="replace") as src: + # show progress in debug mode, we expect about 5.4M lines as of early 2024 + if count_processed and count_processed % 100000 == 0: + logger.debug(f"\t{count_processed} rsync results processed") for line in src.readlines(): + count_processed += 1 + # ignore all directory entries + if line.startswith("d"): + count_dir += 1 + continue + # ignore all entries in an /old/ subfolder + if "/old/" in line: + count_old += 1 + continue + # take into account the book selection which might have been passed ; + # this does not completely filter out useless urls for book IDs 1 to 9 + # but still makes the scraper way faster for all other selections + if books: + if not 
any(f"/{book}/" in line for book in books): + continue + # strip rsync file to only contain relative path Url.create(url=line[start_rel_path_idx:].strip()) # type: ignore + count_added += 1 + logger.info( + f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, " + f"{count_old} old stuff ignored, {count_processed} lines processed)" + ) if __name__ == "__main__": From 51f47f47e1899c12cb6065fb530e9769c4b94b69 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 8 Mar 2024 09:12:30 +0000 Subject: [PATCH 3/3] Rename ChangeLog to CHANGELOG.md and fix markdown issues --- ChangeLog => CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) rename ChangeLog => CHANGELOG.md (99%) diff --git a/ChangeLog b/CHANGELOG.md similarity index 99% rename from ChangeLog rename to CHANGELOG.md index 5c75a0e..6fc5119 100644 --- a/ChangeLog +++ b/CHANGELOG.md @@ -15,12 +15,15 @@ as of 2.0.0. ## [2.1.1] - 2024-01-17 ### Added + - `Publisher` ZIM metadata can now be customized at CLI (#210) ### Changed + - `Publisher` ZIM metadata default value is changed to `openZIM` intead of `Kiwix` (#210) ### Fixed + - Do not fail if temporary directory already exists (#207) - Typo in `Scraper` ZIM metadata (#212) - Adapt to hatchling v1.19.0 which mandates packages setting (#211) @@ -39,11 +42,13 @@ as of 2.0.0. - Removed inline Javascript in HTML files (#145) ### Fixed + - Support single quotes in author names (#162) - Migrated to another Gutenberg server (#187) - Removed useless file languages_06_2018 (#180) ### Removed + - Removed Datatables JS code from repository, fetch online now (#116) - Dropped Python 2 support (#191)