Merge pull request #347 from openzim/charset_lookup_error

Further enhance the situation regarding unknown encoding
openzim · Aug 2, 2024 · 6235567 · 6235567
2 parents 3327b57 + ffcdfe3
commit 6235567
Show file tree

Hide file tree

Showing 8 changed files with 114 additions and 28 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,11 +11,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - New fuzzy-rule for cheatography.com (#342), der-postillon.com (#330)
 - Properly rewrite redirect target url when present in <meta> HTML tag (#237)
+- New `--encoding-aliases` argument to pass encoding/charset aliases (#331)
 
 ### Changed
 
 - Generate fuzzy rules tests in Python and Javascript (#284)
 - Refactor HTML rewriter class to make it more open to change and expressive (#305)
+- Detect charset in document header only for HTML documents (#331)
 
 ### Fixed
 

diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py
@@ -95,7 +95,9 @@ def content_str(self) -> str:
             self.encoding,
             self.charsets_to_try,
             self.content_header_bytes_length,
-            ignore_content_header_charsets=self.ignore_content_header_charsets,
+            ignore_content_header_charsets=(
+                self.rewrite_mode != "html" or self.ignore_content_header_charsets
+            ),
             ignore_http_header_charsets=self.ignore_http_header_charsets,
         )
 

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -18,7 +18,6 @@
 import json
 import logging
 import mimetypes
-import os
 import pathlib
 import re
 import sys
@@ -65,6 +64,7 @@
     get_status_code,
     get_version,
     parse_title,
+    set_encoding_aliases,
     status_code_is_processable_redirect,
 )
 
@@ -136,6 +136,7 @@ def __init__(self, args):
 
         self.output = Path(args.output)
         self.zim_file = args.zim_file
+        set_encoding_aliases(args.encoding_aliases)
 
         if not self.zim_file:
             self.zim_file = "{name}_{period}.zim".format(
@@ -145,7 +146,7 @@ def __init__(self, args):
         self.full_filename = self.output / self.zim_file
 
         # ensure output file exists
-        if not os.path.isdir(self.output):
+        if not self.output.is_dir():
             logger.error(
                 f"Output directory {self.output} does not exist. Exiting with error "
                 "code 1"

diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
@@ -141,11 +141,28 @@ def _create_arguments_parser() -> ArgumentParser:
         default=False,
     )
 
+    parser.add_argument(
+        "--encoding-aliases",
+        help="List of encoding/charset aliases to decode WARC content. Aliases are used"
+        " when the encoding specified in upstream server exists in Python under a"
+        " different name. This parameter has the format alias_encoding=python_encoding."
+        " This parameter is single string, multiple values are separated by a comma, "
+        " like in alias1=encoding1,alias2=encoding2.",
+        type=lambda argument_value: {
+            alias_encoding.strip(): python_encoding.strip()
+            for alias_encoding, python_encoding in (
+                encoding.split("=") for encoding in argument_value.split(",")
+            )
+        },
+        default={},
+    )
+
     return parser
 
 
 def main(raw_args=None):
     parser = _create_arguments_parser()
+
     args = parser.parse_args(args=raw_args)
     converter = Converter(args)
     return converter.run()

diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
@@ -16,6 +16,14 @@
     re.ASCII,
 )
 
+ENCODING_ALIASES = {}
+
+
+def set_encoding_aliases(aliases: dict[str, str]):
+    """Set the encoding aliases to use to decode"""
+    ENCODING_ALIASES.clear()
+    ENCODING_ALIASES.update(aliases)
+
 
 def get_version():
     return __version__
@@ -172,16 +180,20 @@ def to_string(
             )
             if m := ENCODING_RE.search(content_start):
                 head_encoding = m.group("encoding")
-                return input_.decode(head_encoding, errors="replace")
+                return input_.decode(
+                    ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace"
+                )
 
     # Search for encofing in HTTP `Content-Type` header
     if not ignore_http_header_charsets and http_encoding:
-        return input_.decode(http_encoding, errors="replace")
+        return input_.decode(
+            ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace"
+        )
 
     # Try all charsets_to_try passed
     for charset_to_try in charsets_to_try:
         try:
-            return input_.decode(charset_to_try)
+            return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try))
         except (ValueError, LookupError):
             pass
 

diff --git a/tests/data-special/qsl.net-encoding-alias.warc.gz b/tests/data-special/qsl.net-encoding-alias.warc.gz
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from warc2zim.utils import to_string
+from warc2zim.utils import set_encoding_aliases, to_string
 
 
 @dataclass
@@ -334,3 +334,30 @@ def test_decode_charset_far_away():
         )
         == content
     )
+
+
+def test_decode_charset_too_far_away_with_alias():
+    content = '<html><meta charset="foo"><body>content</body></html>'
+    set_encoding_aliases({"foo": "latin1"})
+    to_string(
+        content.encode("latin1"),
+        None,
+        [],
+        1024,
+        ignore_http_header_charsets=False,
+        ignore_content_header_charsets=False,
+    )
+
+
+def test_decode_charset_too_far_away_without_proper_alias():
+    content = '<html><meta charset="foo"><body>content</body></html>'
+    set_encoding_aliases({"bar": "latin1"})
+    with pytest.raises(LookupError, match="unknown encoding: foo"):
+        to_string(
+            content.encode("latin1"),
+            None,
+            [],
+            1024,
+            ignore_http_header_charsets=False,
+            ignore_content_header_charsets=False,
+        )
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
@@ -3,7 +3,6 @@
 
 import io
 import json
-import os
 import pathlib
 import re
 import time
@@ -20,7 +19,10 @@
 from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
 from warc2zim.utils import get_record_url
 
-TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
+TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
+# special data dir for WARC files which are not supposed to be ran in the
+# `test_all_warcs_root_dir` test
+TEST_DATA_SPECIAL_DIR = pathlib.Path(__file__).parent / "data-special"
 
 SCRAPER_SUFFIX = " + zimit x.y.z-devw"
 
@@ -120,8 +122,8 @@ def assert_item_does_not_exist(self, zimfile, path):
         assert payload is None
 
     def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
-        assert os.path.isfile(warcfile)
-        assert os.path.isfile(zimfile)
+        assert pathlib.Path(warcfile).is_file()
+        assert pathlib.Path(zimfile).is_file()
 
         # [TOFIX]
         head_insert = b""
@@ -332,7 +334,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
         main(
             [
                 "-v",
-                os.path.join(TEST_DATA_DIR, "example-response.warc"),
+                str(TEST_DATA_DIR / "example-response.warc"),
                 "--name",
                 "example-response",
                 "--output",
@@ -350,7 +352,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
 
         zim_output = tmp_path / zim_output
 
-        assert os.path.isfile(zim_output)
+        assert pathlib.Path(zim_output).is_file()
 
         all_articles = {
             article.path: article.title for article in self.list_articles(zim_output)
@@ -403,8 +405,8 @@ def test_warc_to_zim_main(self, cmdline, tmp_path):
         filename = cmdline[0]
 
         # set intput filename (first arg) to absolute path from test dir
-        warcfile = os.path.join(TEST_DATA_DIR, filename)
-        cmdline[0] = warcfile
+        warcfile = TEST_DATA_DIR / filename
+        cmdline[0] = str(warcfile)
 
         cmdline.extend(["--output", str(tmp_path), "--name", filename])
 
@@ -429,7 +431,7 @@ def test_same_domain_only(self, tmp_path):
         zim_output = "same-domain.zim"
         main(
             [
-                os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"),
+                str(TEST_DATA_DIR / "example-revisit.warc.gz"),
                 "--favicon",
                 "http://example.com/favicon.ico",
                 "--include-domains",
@@ -457,7 +459,7 @@ def test_skip_self_redirect(self, tmp_path):
         zim_output = "self-redir.zim"
         main(
             [
-                os.path.join(TEST_DATA_DIR, "self-redirect.warc"),
+                str(TEST_DATA_DIR / "self-redirect.warc"),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -473,7 +475,7 @@ def test_include_domains_favicon_and_language(self, tmp_path):
         zim_output = "spt.zim"
         main(
             [
-                os.path.join(TEST_DATA_DIR, "single-page-test.warc"),
+                str(TEST_DATA_DIR / "single-page-test.warc"),
                 "-i",
                 "reseau-canope.fr",
                 "--output",
@@ -521,7 +523,7 @@ def test_website_with_redirect(self, tmp_path):
         zim_output = "kiwix.zim"
         main(
             [
-                os.path.join(TEST_DATA_DIR, "kiwix-with-redirects.warc.gz"),
+                str(TEST_DATA_DIR / "kiwix-with-redirects.warc.gz"),
                 "-u",
                 "http://www.kiwix.org",
                 "--output",
@@ -557,7 +559,7 @@ def test_all_warcs_root_dir(self, tmp_path):
         zim_output = "test-all.zim"
         main(
             [
-                os.path.join(TEST_DATA_DIR),
+                str(TEST_DATA_DIR),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -590,7 +592,7 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck):
         zim_output = fuzzycheck["filename"] + ".zim"
         main(
             [
-                os.path.join(TEST_DATA_DIR, fuzzycheck["filename"]),
+                str(TEST_DATA_DIR / fuzzycheck["filename"]),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -612,7 +614,7 @@ def test_error_bad_main_page(self, tmp_path):
             main(
                 [
                     "-v",
-                    os.path.join(TEST_DATA_DIR, "example-response.warc"),
+                    str(TEST_DATA_DIR / "example-response.warc"),
                     "-u",
                     "https://no-such-url.example.com",
                     "--output",
@@ -632,7 +634,7 @@ def test_error_main_page_unprocessable(self, tmp_path):
             main(
                 [
                     "-v",
-                    os.path.join(TEST_DATA_DIR, "main-entry-403.warc.gz"),
+                    str(TEST_DATA_DIR / "main-entry-403.warc.gz"),
                     "-u",
                     "https://wikizilla.org/wiki/Doug",
                     "--output",
@@ -676,7 +678,7 @@ def test_custom_css(self, tmp_path):
 
         main(
             [
-                os.path.join(TEST_DATA_DIR, "example-response.warc"),
+                str(TEST_DATA_DIR / "example-response.warc"),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -704,7 +706,7 @@ def test_custom_css_remote(self, tmp_path):
 
         main(
             [
-                os.path.join(TEST_DATA_DIR, "example-response.warc"),
+                str(TEST_DATA_DIR / "example-response.warc"),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -729,7 +731,7 @@ def test_http_return_codes(self, tmp_path):
 
         main(
             [
-                os.path.join(TEST_DATA_DIR, "http-return-codes.warc.gz"),
+                str(TEST_DATA_DIR / "http-return-codes.warc.gz"),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -793,7 +795,7 @@ def test_bad_redirections(self, tmp_path):
 
         main(
             [
-                os.path.join(TEST_DATA_DIR, "bad-redirections.warc.gz"),
+                str(TEST_DATA_DIR / "bad-redirections.warc.gz"),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -828,7 +830,7 @@ def test_content_resource_types(self, tmp_path):
 
         main(
             [
-                os.path.join(TEST_DATA_DIR, "content-resource-types.warc.gz"),
+                str(TEST_DATA_DIR / "content-resource-types.warc.gz"),
                 "--output",
                 str(tmp_path),
                 "--zim-file",
@@ -850,3 +852,26 @@ def test_content_resource_types(self, tmp_path):
         ]:
             res = self.get_article(zim_output, js_file)
             assert b"wombat" in res  # simple check that rewriting has been done
+
+    def test_content_encoding_aliases(self, tmp_path):
+        zim_output = "tests_en_qsl.net-encoding-alias.zim"
+
+        main(
+            [
+                # cannot be processed like other TEST_DATA_DIR warcs since it needs
+                # special encoding aliases to be used in --encoding-aliases
+                str(TEST_DATA_SPECIAL_DIR / "qsl.net-encoding-alias.warc.gz"),
+                "--output",
+                str(tmp_path),
+                "--zim-file",
+                zim_output,
+                "--encoding-aliases",
+                "foo=bar,iso-8559-1=iso-8859-1,fii=bor",
+                "--name",
+                "tests_en_qsl.net-encoding-alias",
+            ]
+        )
+        zim_output = tmp_path / zim_output
+
+        res = self.get_article(zim_output, "www.qsl.net/vk2jem/swlogs.htm")
+        assert b"<!-- WB Insert -->" in res  # simple check that rewriting has been done