Skip to content

Commit

Permalink
Merge pull request #221 from openzim/enhance_urls_creation
Browse files Browse the repository at this point in the history
Do not insert all RSYNC paths in database
  • Loading branch information
benoit74 authored Mar 14, 2024
2 parents 59ebd30 + 51f47f4 commit 4ac977c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 7 deletions.
9 changes: 9 additions & 0 deletions ChangeLog → CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,22 @@ as of 2.0.0.

## [Unreleased]

### Changed

- Insert as few rsync URLs as possible in DB when a book selection is made (#220)

## [2.1.1] - 2024-01-17

### Added

- `Publisher` ZIM metadata can now be customized at CLI (#210)

### Changed

- `Publisher` ZIM metadata default value is changed to `openZIM` instead of `Kiwix` (#210)

### Fixed

- Do not fail if temporary directory already exists (#207)
- Typo in `Scraper` ZIM metadata (#212)
- Adapt to hatchling v1.19.0 which mandates packages setting (#211)
Expand All @@ -35,11 +42,13 @@ as of 2.0.0.
- Removed inline Javascript in HTML files (#145)

### Fixed

- Support single quotes in author names (#162)
- Migrated to another Gutenberg server (#187)
- Removed useless file languages_06_2018 (#180)

### Removed

- Removed Datatables JS code from repository, fetch online now (#116)
- Dropped Python 2 support (#191)

Expand Down
8 changes: 4 additions & 4 deletions src/gutenberg2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def f(x):
logger.info(f"PARSING rdf-files in {rdf_path}")
parse_and_fill(rdf_path=rdf_path, only_books=books)
logger.info("Add possible url to db")
setup_urls(force=force)
setup_urls(force=force, books=books)

if do_download:
logger.info("DOWNLOADING ebooks from mirror using filters")
Expand All @@ -190,9 +190,9 @@ def f(x):
only_books=books,
force=force,
s3_storage=s3_storage,
optimizer_version=optimizer_version
if not use_any_optimized_version
else None,
optimizer_version=(
optimizer_version if not use_any_optimized_version else None
),
)
if one_lang_one_zim_folder:
if languages == []:
Expand Down
29 changes: 26 additions & 3 deletions src/gutenberg2zim/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@


class UrlBuilder:

"""
Url builder for the files of a Gutenberg book.
Example:
Expand Down Expand Up @@ -227,7 +226,7 @@ def build_html(files):
return list(set(urls))


def setup_urls(force):
def setup_urls(force, books):
file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")

if file_with_url.exists() and not force:
Expand Down Expand Up @@ -261,10 +260,34 @@ def setup_urls(force):
qry.execute()

logger.info("\tAppending urls in DB from rsync result")
# strip rsync file to only contain relative path
count_dir = count_old = count_added = count_processed = 0
with open(file_with_url, errors="replace") as src:
# show progress in debug mode, we expect about 5.4M lines as of early 2024
if count_processed and count_processed % 100000 == 0:
logger.debug(f"\t{count_processed} rsync results processed")
for line in src.readlines():
count_processed += 1
# ignore all directory entries
if line.startswith("d"):
count_dir += 1
continue
# ignore all entries in an /old/ subfolder
if "/old/" in line:
count_old += 1
continue
# take into account the book selection which might have been passed;
# this does not completely filter out useless URLs for book IDs 1 to 9
# but still makes the scraper way faster for all other selections
if books:
if not any(f"/{book}/" in line for book in books):
continue
# strip rsync file to only contain relative path
Url.create(url=line[start_rel_path_idx:].strip()) # type: ignore
count_added += 1
logger.info(
f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "
f"{count_old} old stuff ignored, {count_processed} lines processed)"
)


if __name__ == "__main__":
Expand Down

0 comments on commit 4ac977c

Please sign in to comment.