Merge pull request #221 from openzim/enhance_urls_creation

Do not insert all RSYNC paths in database
openzim · Mar 14, 2024 · 4ac977c · 4ac977c
2 parents 59ebd30 + 51f47f4
commit 4ac977c
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 7 deletions.
diff --git a/ChangeLog → CHANGELOG.md b/ChangeLog → CHANGELOG.md
@@ -8,15 +8,22 @@ as of 2.0.0.
 
 ## [Unreleased]
 
+### Changed
+
+- Insert as few rsync URLs as possible in DB when a book selection is made (#220)
+
 ## [2.1.1] - 2024-01-17
 
 ### Added
+
 - `Publisher` ZIM metadata can now be customized at CLI (#210)
 
 ### Changed
+
 - `Publisher` ZIM metadata default value is changed to `openZIM` instead of `Kiwix` (#210)
 
 ### Fixed
+
 - Do not fail if temporary directory already exists (#207)
 - Typo in `Scraper` ZIM metadata (#212)
 - Adapt to hatchling v1.19.0 which mandates packages setting (#211)
@@ -35,11 +42,13 @@ as of 2.0.0.
 - Removed inline Javascript in HTML files (#145)
 
 ### Fixed
+
 - Support single quotes in author names (#162)
 - Migrated to another Gutenberg server (#187)
 - Removed useless file languages_06_2018 (#180)
 
 ### Removed
+
 - Removed Datatables JS code from repository, fetch online now (#116)
 - Dropped Python 2 support (#191)
 

diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py
@@ -178,7 +178,7 @@ def f(x):
         logger.info(f"PARSING rdf-files in {rdf_path}")
         parse_and_fill(rdf_path=rdf_path, only_books=books)
         logger.info("Add possible url to db")
-        setup_urls(force=force)
+        setup_urls(force=force, books=books)
 
     if do_download:
         logger.info("DOWNLOADING ebooks from mirror using filters")
@@ -190,9 +190,9 @@ def f(x):
             only_books=books,
             force=force,
             s3_storage=s3_storage,
-            optimizer_version=optimizer_version
-            if not use_any_optimized_version
-            else None,
+            optimizer_version=(
+                optimizer_version if not use_any_optimized_version else None
+            ),
         )
     if one_lang_one_zim_folder:
         if languages == []:

diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py
@@ -8,7 +8,6 @@
 
 
 class UrlBuilder:
-
     """
     Url builder for the files of a Gutenberg book.
     Example:
@@ -227,7 +226,7 @@ def build_html(files):
     return list(set(urls))
 
 
-def setup_urls(force):
+def setup_urls(force, books):
     file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")
 
     if file_with_url.exists() and not force:
@@ -261,10 +260,34 @@ def setup_urls(force):
     qry.execute()
 
     logger.info("\tAppending urls in DB from rsync result")
-    # strip rsync file to only contain relative path
+    count_dir = count_old = count_added = count_processed = 0
     with open(file_with_url, errors="replace") as src:
+        # show progress in debug mode, we expect about 5.4M lines as of early 2024
+        if count_processed and count_processed % 100000 == 0:
+            logger.debug(f"\t{count_processed} rsync results processed")
         for line in src.readlines():
+            count_processed += 1
+            # ignore all directory entries
+            if line.startswith("d"):
+                count_dir += 1
+                continue
+            # ignore all entries in an /old/ subfolder
+            if "/old/" in line:
+                count_old += 1
+                continue
+            # take into account the book selection which might have been passed ;
+            # this does not completely filter out useless urls for book IDs 1 to 9
+            # but still makes the scraper way faster for all other selections
+            if books:
+                if not any(f"/{book}/" in line for book in books):
+                    continue
+            # strip rsync file to only contain relative path
             Url.create(url=line[start_rel_path_idx:].strip())  # type: ignore
+            count_added += 1
+    logger.info(
+        f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "
+        f"{count_old} old stuff ignored, {count_processed} lines processed)"
+    )
 
 
 if __name__ == "__main__":