Merge pull request #359 from openzim/metadata

Source more ZIM metadata from WARC files
openzim · Aug 2, 2024 · 814b3ed · 814b3ed
2 parents 6235567 + 9cc6c68
commit 814b3ed
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Generate fuzzy rules tests in Python and Javascript (#284)
 - Refactor HTML rewriter class to make it more open to change and expressive (#305)
 - Detect charset in document header only for HTML documents (#331)
+- Use `software` property from `warcinfo` record to set ZIM `Scraper` metadata (#357)
+- Store `X-ContentDate` as metadata, based on `WARC-Date` (#358)
 
 ### Fixed
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
   "tinycss2==1.3.0",
   "beautifulsoup4==4.12.3", # used to parse base href
   "lxml==5.2.2", # used by beautifulsoup4 for parsing html
+  "python-dateutil==2.9.0.post0",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -33,6 +33,7 @@
 # from zimscraperlib import getLogger
 from bs4 import BeautifulSoup
 from cdxj_indexer import buffering_record_iter, iter_file_or_dir
+from dateutil import parser
 from jinja2 import Environment, PackageLoader
 from warcio import ArchiveIterator
 from zimscraperlib.constants import (
@@ -224,6 +225,11 @@ def __init__(self, args):
 
         self.scraper_suffix = args.scraper_suffix
 
+        # metadata about WARC files
+        self.warc_software = ""
+        self.warc_start = None
+        self.warc_end = None
+
         self.continue_on_error = bool(args.continue_on_error)
         self.disable_metadata_checks = bool(args.disable_metadata_checks)
         self.ignore_content_header_charsets = bool(args.ignore_content_header_charsets)
@@ -334,9 +340,30 @@ def run(self):
             Illustration_48x48_at_1=self.illustration,
             Tags=self.tags,
             Source=self.source,
-            Scraper=f"warc2zim {get_version()}{self.scraper_suffix or ''}",
+            Scraper=",".join(
+                filter(
+                    lambda x: x,  # remove None values
+                    [
+                        f"warc2zim {get_version()}",
+                        self.warc_software,
+                        self.scraper_suffix,
+                    ],
+                )
+            ),
         ).start()
 
+        if self.warc_start and self.warc_end:
+            if self.warc_start == self.warc_end:
+                self.creator.add_metadata(
+                    "X-ContentDate", self.warc_start.strftime("%Y-%m-%d")
+                )
+            else:
+                self.creator.add_metadata(
+                    "X-ContentDate",
+                    f"{self.warc_start.strftime('%Y-%m-%d')},"
+                    f"{self.warc_end.strftime('%Y-%m-%d')}",
+                )
+
         for filename in importlib.resources.files("warc2zim.statics").iterdir():
             with importlib.resources.as_file(filename) as file:
                 self.creator.add_item(
@@ -398,15 +425,43 @@ def run(self):
 
         self.creator.finish()
 
+    def extract_warcinfo(self, record):
+        """Extract the software value from a warcinfo record"""
+        if self.warc_software:
+            logger.debug("warc_software already set, ignoring warcinfo record")
+            return
+        if get_record_mime_type(record) != "application/warc-fields":
+            logger.warning(
+                f"Unsupported warcinfo record found: {get_record_mime_type(record)}"
+            )
+            return
+        for warcfield in get_record_content(record).decode("UTF-8").splitlines():
+            name, value = warcfield.split(":", 1)
+            if name.strip().lower() != "software":
+                continue
+            self.warc_software = str(value).strip()
+            return
+
     def gather_information_from_warc(self):
         main_page_found = False
         for record in iter_warc_records(self.warc_files):
 
+            if record.rec_type == "warcinfo":
+                self.extract_warcinfo(record)
+
             # only response and revisit records can be considered as main_path and
             # as existing ZIM path
             if record.rec_type not in ("response", "revisit"):
                 continue
 
+            # update warc_start/warc_end based on WARC-Date header
+            if record.rec_headers["WARC-Date"]:
+                record_date = parser.isoparse(record.rec_headers["WARC-Date"]).date()
+                if self.warc_start is None or self.warc_start > record_date:
+                    self.warc_start = record_date
+                if self.warc_end is None or self.warc_end < record_date:
+                    self.warc_end = record_date
+
             url = get_record_url(record)
 
             # ignore non HTTP(S) URLs (intent:// for instance, see #332)
@@ -847,5 +902,5 @@ def iter_warc_records(warc_files):
     for filename in warc_files:
         with open(filename, "rb") as fh:
             for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
-                if record and record.rec_type in ("resource", "response", "revisit"):
+                if record:
                     yield record
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
@@ -24,13 +24,12 @@
 # `test_all_warcs_root_dir` test
 TEST_DATA_SPECIAL_DIR = pathlib.Path(__file__).parent / "data-special"
 
-SCRAPER_SUFFIX = " + zimit x.y.z-devw"
+SCRAPER_SUFFIX = "zimit x.y.z-devw"
 
 # ============================================================================
 CMDLINES = [
     ["example-response.warc"],
     ["example-response.warc", "--progress-file", "progress.json"],
-    ["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX],
     ["example-revisit.warc.gz"],
     [
         "example-revisit.warc.gz",
@@ -121,7 +120,7 @@ def assert_item_does_not_exist(self, zimfile, path):
             payload = None
         assert payload is None
 
-    def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
+    def verify_warc_and_zim(self, warcfile, zimfile):
         assert pathlib.Path(warcfile).is_file()
         assert pathlib.Path(zimfile).is_file()
 
@@ -133,13 +132,8 @@ def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
 
         zim_fh = Archive(zimfile)
 
-        if verify_scraper_suffix:
-            assert (
-                f"warc2zim {__version__}{SCRAPER_SUFFIX}"
-                == zim_fh.get_text_metadata("Scraper")
-            )
-        else:
-            assert f"warc2zim {__version__}" == zim_fh.get_text_metadata("Scraper")
+        assert zim_fh.get_text_metadata("Scraper").startswith(f"warc2zim {__version__}")
+        assert zim_fh.get_text_metadata("X-ContentDate")
 
         for record in iter_warc_records([warcfile]):
             url = get_record_url(record)
@@ -347,6 +341,8 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
                 "test zim",
                 "--title",
                 "Some Title",
+                "--scraper-suffix",
+                SCRAPER_SUFFIX,
             ]
         )
 
@@ -380,6 +376,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
             "Scraper",
             "Tags",
             "Title",
+            "X-ContentDate",
         ]
 
         assert zim_fh.has_fulltext_index
@@ -400,6 +397,12 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
         }
         assert self.get_metadata(zim_output, "Title") == b"Some Title"
 
+        assert (
+            zim_fh.get_text_metadata("Scraper") == f"warc2zim {__version__},"
+            "webrecorder.io 2.0 (warcprox 1.4-20151022181819-1a48f12),zimit x.y.z-devw"
+        )
+        assert zim_fh.get_text_metadata("X-ContentDate") == "2016-02-25"
+
     def test_warc_to_zim_main(self, cmdline, tmp_path):
         # input filename
         filename = cmdline[0]
@@ -423,9 +426,7 @@ def test_warc_to_zim_main(self, cmdline, tmp_path):
                     and progress["written"] <= progress["total"]
                 )
 
-        self.verify_warc_and_zim(
-            warcfile, tmp_path / zimfile, "--scraper-suffix" in cmdline
-        )
+        self.verify_warc_and_zim(warcfile, tmp_path / zimfile)
 
     def test_same_domain_only(self, tmp_path):
         zim_output = "same-domain.zim"