Skip to content

Commit

Permalink
Merge pull request #359 from openzim/metadata
Browse files Browse the repository at this point in the history
Source more ZIM metadata from WARC files
  • Loading branch information
benoit74 authored Aug 2, 2024
2 parents 6235567 + 9cc6c68 commit 814b3ed
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 15 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Generate fuzzy rules tests in Python and Javascript (#284)
- Refactor HTML rewriter class to make it more open to change and expressive (#305)
- Detect charset in document header only for HTML documents (#331)
- Use `software` property from `warcinfo` record to set ZIM `Scraper` metadata (#357)
- Store `ContentDate` as metadata, based on `WARC-Date` (#358)

### Fixed

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"tinycss2==1.3.0",
"beautifulsoup4==4.12.3", # used to parse base href
"lxml==5.2.2", # used by beautifulsoup4 for parsing html
"python-dateutil==2.9.0.post0",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
59 changes: 57 additions & 2 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
# from zimscraperlib import getLogger
from bs4 import BeautifulSoup
from cdxj_indexer import buffering_record_iter, iter_file_or_dir
from dateutil import parser
from jinja2 import Environment, PackageLoader
from warcio import ArchiveIterator
from zimscraperlib.constants import (
Expand Down Expand Up @@ -224,6 +225,11 @@ def __init__(self, args):

self.scraper_suffix = args.scraper_suffix

# metadata about WARC files
self.warc_software = ""
self.warc_start = None
self.warc_end = None

self.continue_on_error = bool(args.continue_on_error)
self.disable_metadata_checks = bool(args.disable_metadata_checks)
self.ignore_content_header_charsets = bool(args.ignore_content_header_charsets)
Expand Down Expand Up @@ -334,9 +340,30 @@ def run(self):
Illustration_48x48_at_1=self.illustration,
Tags=self.tags,
Source=self.source,
Scraper=f"warc2zim {get_version()}{self.scraper_suffix or ''}",
Scraper=",".join(
filter(
lambda x: x,  # remove None and empty values
[
f"warc2zim {get_version()}",
self.warc_software,
self.scraper_suffix,
],
)
),
).start()

if self.warc_start and self.warc_end:
if self.warc_start == self.warc_end:
self.creator.add_metadata(
"X-ContentDate", self.warc_start.strftime("%Y-%m-%d")
)
else:
self.creator.add_metadata(
"X-ContentDate",
f"{self.warc_start.strftime('%Y-%m-%d')},"
f"{self.warc_end.strftime('%Y-%m-%d')}",
)

for filename in importlib.resources.files("warc2zim.statics").iterdir():
with importlib.resources.as_file(filename) as file:
self.creator.add_item(
Expand Down Expand Up @@ -398,15 +425,43 @@ def run(self):

self.creator.finish()

def extract_warcinfo(self, record):
    """Extract the ``software`` value from a warcinfo record.

    The value is stored, stripped of surrounding whitespace, in
    ``self.warc_software``. Nothing is done when ``self.warc_software`` is
    already set, or when the record content type is not
    ``application/warc-fields`` (a warning is logged in that case).

    Lines which are not a ``name: value`` pair (blank lines, continuation
    lines, ...) are skipped instead of raising ``ValueError``.
    """
    if self.warc_software:
        logger.debug("warc_software already set, ignoring warcinfo record")
        return

    mime_type = get_record_mime_type(record)
    if mime_type != "application/warc-fields":
        logger.warning(f"Unsupported warcinfo record found: {mime_type}")
        return

    for warcfield in get_record_content(record).decode("UTF-8").splitlines():
        # partition never raises: separator is empty when the line has no colon
        name, separator, value = warcfield.partition(":")
        if not separator or name.strip().lower() != "software":
            continue
        # only the first `software` field of the record is considered
        self.warc_software = value.strip()
        return

def gather_information_from_warc(self):
main_page_found = False
for record in iter_warc_records(self.warc_files):

if record.rec_type == "warcinfo":
self.extract_warcinfo(record)

# only response and revisit records can be considered as main_path and as
# existing ZIM path
if record.rec_type not in ("response", "revisit"):
continue

# update warc_start/warc_end based on WARC-Date header
if record.rec_headers["WARC-Date"]:
record_date = parser.isoparse(record.rec_headers["WARC-Date"]).date()
if self.warc_start is None or self.warc_start > record_date:
self.warc_start = record_date
if self.warc_end is None or self.warc_end < record_date:
self.warc_end = record_date

url = get_record_url(record)

# ignore non HTTP(S) URLs (intent:// for instance, see #332)
Expand Down Expand Up @@ -847,5 +902,5 @@ def iter_warc_records(warc_files):
for filename in warc_files:
with open(filename, "rb") as fh:
for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
if record and record.rec_type in ("resource", "response", "revisit"):
if record:
yield record
27 changes: 14 additions & 13 deletions tests/test_warc_to_zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,12 @@
# `test_all_warcs_root_dir` test
TEST_DATA_SPECIAL_DIR = pathlib.Path(__file__).parent / "data-special"

SCRAPER_SUFFIX = " + zimit x.y.z-devw"
SCRAPER_SUFFIX = "zimit x.y.z-devw"

# ============================================================================
CMDLINES = [
["example-response.warc"],
["example-response.warc", "--progress-file", "progress.json"],
["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX],
["example-revisit.warc.gz"],
[
"example-revisit.warc.gz",
Expand Down Expand Up @@ -121,7 +120,7 @@ def assert_item_does_not_exist(self, zimfile, path):
payload = None
assert payload is None

def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
def verify_warc_and_zim(self, warcfile, zimfile):
assert pathlib.Path(warcfile).is_file()
assert pathlib.Path(zimfile).is_file()

Expand All @@ -133,13 +132,8 @@ def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):

zim_fh = Archive(zimfile)

if verify_scraper_suffix:
assert (
f"warc2zim {__version__}{SCRAPER_SUFFIX}"
== zim_fh.get_text_metadata("Scraper")
)
else:
assert f"warc2zim {__version__}" == zim_fh.get_text_metadata("Scraper")
assert zim_fh.get_text_metadata("Scraper").startswith(f"warc2zim {__version__}")
assert zim_fh.get_text_metadata("X-ContentDate")

for record in iter_warc_records([warcfile]):
url = get_record_url(record)
Expand Down Expand Up @@ -347,6 +341,8 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
"test zim",
"--title",
"Some Title",
"--scraper-suffix",
SCRAPER_SUFFIX,
]
)

Expand Down Expand Up @@ -380,6 +376,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
"Scraper",
"Tags",
"Title",
"X-ContentDate",
]

assert zim_fh.has_fulltext_index
Expand All @@ -400,6 +397,12 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
}
assert self.get_metadata(zim_output, "Title") == b"Some Title"

assert (
zim_fh.get_text_metadata("Scraper") == f"warc2zim {__version__},"
"webrecorder.io 2.0 (warcprox 1.4-20151022181819-1a48f12),zimit x.y.z-devw"
)
assert zim_fh.get_text_metadata("X-ContentDate") == "2016-02-25"

def test_warc_to_zim_main(self, cmdline, tmp_path):
# input filename
filename = cmdline[0]
Expand All @@ -423,9 +426,7 @@ def test_warc_to_zim_main(self, cmdline, tmp_path):
and progress["written"] <= progress["total"]
)

self.verify_warc_and_zim(
warcfile, tmp_path / zimfile, "--scraper-suffix" in cmdline
)
self.verify_warc_and_zim(warcfile, tmp_path / zimfile)

def test_same_domain_only(self, tmp_path):
zim_output = "same-domain.zim"
Expand Down

0 comments on commit 814b3ed

Please sign in to comment.