Skip to content

Commit

Permalink
Merge pull request #347 from openzim/charset_lookup_error
Browse files Browse the repository at this point in the history
Further enhance the situation regarding unknown encoding
  • Loading branch information
benoit74 authored Aug 2, 2024
2 parents 3327b57 + ffcdfe3 commit 6235567
Show file tree
Hide file tree
Showing 8 changed files with 114 additions and 28 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- New fuzzy-rule for cheatography.com (#342), der-postillon.com (#330)
- Properly rewrite redirect target url when present in <meta> HTML tag (#237)
- New `--encoding-aliases` argument to pass encoding/charset aliases (#331)

### Changed

- Generate fuzzy rules tests in Python and Javascript (#284)
- Refactor HTML rewriter class to make it more open to change and expressive (#305)
- Detect charset in document header only for HTML documents (#331)

### Fixed

Expand Down
4 changes: 3 additions & 1 deletion src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ def content_str(self) -> str:
self.encoding,
self.charsets_to_try,
self.content_header_bytes_length,
ignore_content_header_charsets=self.ignore_content_header_charsets,
ignore_content_header_charsets=(
self.rewrite_mode != "html" or self.ignore_content_header_charsets
),
ignore_http_header_charsets=self.ignore_http_header_charsets,
)

Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import json
import logging
import mimetypes
import os
import pathlib
import re
import sys
Expand Down Expand Up @@ -65,6 +64,7 @@
get_status_code,
get_version,
parse_title,
set_encoding_aliases,
status_code_is_processable_redirect,
)

Expand Down Expand Up @@ -136,6 +136,7 @@ def __init__(self, args):

self.output = Path(args.output)
self.zim_file = args.zim_file
set_encoding_aliases(args.encoding_aliases)

if not self.zim_file:
self.zim_file = "{name}_{period}.zim".format(
Expand All @@ -145,7 +146,7 @@ def __init__(self, args):
self.full_filename = self.output / self.zim_file

# ensure output file exists
if not os.path.isdir(self.output):
if not self.output.is_dir():
logger.error(
f"Output directory {self.output} does not exist. Exiting with error "
"code 1"
Expand Down
17 changes: 17 additions & 0 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,28 @@ def _create_arguments_parser() -> ArgumentParser:
default=False,
)

parser.add_argument(
"--encoding-aliases",
help="List of encoding/charset aliases to decode WARC content. Aliases are used"
" when the encoding specified in upstream server exists in Python under a"
" different name. This parameter has the format alias_encoding=python_encoding."
" This parameter is single string, multiple values are separated by a comma, "
" like in alias1=encoding1,alias2=encoding2.",
type=lambda argument_value: {
alias_encoding.strip(): python_encoding.strip()
for alias_encoding, python_encoding in (
encoding.split("=") for encoding in argument_value.split(",")
)
},
default={},
)

return parser


def main(raw_args=None):
parser = _create_arguments_parser()

args = parser.parse_args(args=raw_args)
converter = Converter(args)
return converter.run()
Expand Down
18 changes: 15 additions & 3 deletions src/warc2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
re.ASCII,
)

ENCODING_ALIASES = {}


def set_encoding_aliases(aliases: dict[str, str]):
"""Set the encoding aliases to use to decode"""
ENCODING_ALIASES.clear()
ENCODING_ALIASES.update(aliases)


def get_version():
return __version__
Expand Down Expand Up @@ -172,16 +180,20 @@ def to_string(
)
if m := ENCODING_RE.search(content_start):
head_encoding = m.group("encoding")
return input_.decode(head_encoding, errors="replace")
return input_.decode(
ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace"
)

# Search for encofing in HTTP `Content-Type` header
if not ignore_http_header_charsets and http_encoding:
return input_.decode(http_encoding, errors="replace")
return input_.decode(
ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace"
)

# Try all charsets_to_try passed
for charset_to_try in charsets_to_try:
try:
return input_.decode(charset_to_try)
return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try))
except (ValueError, LookupError):
pass

Expand Down
Binary file not shown.
29 changes: 28 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pytest

from warc2zim.utils import to_string
from warc2zim.utils import set_encoding_aliases, to_string


@dataclass
Expand Down Expand Up @@ -334,3 +334,30 @@ def test_decode_charset_far_away():
)
== content
)


def test_decode_charset_too_far_away_with_alias():
content = '<html><meta charset="foo"><body>content</body></html>'
set_encoding_aliases({"foo": "latin1"})
to_string(
content.encode("latin1"),
None,
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)


def test_decode_charset_too_far_away_without_proper_alias():
content = '<html><meta charset="foo"><body>content</body></html>'
set_encoding_aliases({"bar": "latin1"})
with pytest.raises(LookupError, match="unknown encoding: foo"):
to_string(
content.encode("latin1"),
None,
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
67 changes: 46 additions & 21 deletions tests/test_warc_to_zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import io
import json
import os
import pathlib
import re
import time
Expand All @@ -20,7 +19,10 @@
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
from warc2zim.utils import get_record_url

TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
# special data dir for WARC files which are not supposed to be ran in the
# `test_all_warcs_root_dir` test
TEST_DATA_SPECIAL_DIR = pathlib.Path(__file__).parent / "data-special"

SCRAPER_SUFFIX = " + zimit x.y.z-devw"

Expand Down Expand Up @@ -120,8 +122,8 @@ def assert_item_does_not_exist(self, zimfile, path):
assert payload is None

def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
assert os.path.isfile(warcfile)
assert os.path.isfile(zimfile)
assert pathlib.Path(warcfile).is_file()
assert pathlib.Path(zimfile).is_file()

# [TOFIX]
head_insert = b""
Expand Down Expand Up @@ -332,7 +334,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
main(
[
"-v",
os.path.join(TEST_DATA_DIR, "example-response.warc"),
str(TEST_DATA_DIR / "example-response.warc"),
"--name",
"example-response",
"--output",
Expand All @@ -350,7 +352,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):

zim_output = tmp_path / zim_output

assert os.path.isfile(zim_output)
assert pathlib.Path(zim_output).is_file()

all_articles = {
article.path: article.title for article in self.list_articles(zim_output)
Expand Down Expand Up @@ -403,8 +405,8 @@ def test_warc_to_zim_main(self, cmdline, tmp_path):
filename = cmdline[0]

# set intput filename (first arg) to absolute path from test dir
warcfile = os.path.join(TEST_DATA_DIR, filename)
cmdline[0] = warcfile
warcfile = TEST_DATA_DIR / filename
cmdline[0] = str(warcfile)

cmdline.extend(["--output", str(tmp_path), "--name", filename])

Expand All @@ -429,7 +431,7 @@ def test_same_domain_only(self, tmp_path):
zim_output = "same-domain.zim"
main(
[
os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"),
str(TEST_DATA_DIR / "example-revisit.warc.gz"),
"--favicon",
"http://example.com/favicon.ico",
"--include-domains",
Expand Down Expand Up @@ -457,7 +459,7 @@ def test_skip_self_redirect(self, tmp_path):
zim_output = "self-redir.zim"
main(
[
os.path.join(TEST_DATA_DIR, "self-redirect.warc"),
str(TEST_DATA_DIR / "self-redirect.warc"),
"--output",
str(tmp_path),
"--zim-file",
Expand All @@ -473,7 +475,7 @@ def test_include_domains_favicon_and_language(self, tmp_path):
zim_output = "spt.zim"
main(
[
os.path.join(TEST_DATA_DIR, "single-page-test.warc"),
str(TEST_DATA_DIR / "single-page-test.warc"),
"-i",
"reseau-canope.fr",
"--output",
Expand Down Expand Up @@ -521,7 +523,7 @@ def test_website_with_redirect(self, tmp_path):
zim_output = "kiwix.zim"
main(
[
os.path.join(TEST_DATA_DIR, "kiwix-with-redirects.warc.gz"),
str(TEST_DATA_DIR / "kiwix-with-redirects.warc.gz"),
"-u",
"http://www.kiwix.org",
"--output",
Expand Down Expand Up @@ -557,7 +559,7 @@ def test_all_warcs_root_dir(self, tmp_path):
zim_output = "test-all.zim"
main(
[
os.path.join(TEST_DATA_DIR),
str(TEST_DATA_DIR),
"--output",
str(tmp_path),
"--zim-file",
Expand Down Expand Up @@ -590,7 +592,7 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck):
zim_output = fuzzycheck["filename"] + ".zim"
main(
[
os.path.join(TEST_DATA_DIR, fuzzycheck["filename"]),
str(TEST_DATA_DIR / fuzzycheck["filename"]),
"--output",
str(tmp_path),
"--zim-file",
Expand All @@ -612,7 +614,7 @@ def test_error_bad_main_page(self, tmp_path):
main(
[
"-v",
os.path.join(TEST_DATA_DIR, "example-response.warc"),
str(TEST_DATA_DIR / "example-response.warc"),
"-u",
"https://no-such-url.example.com",
"--output",
Expand All @@ -632,7 +634,7 @@ def test_error_main_page_unprocessable(self, tmp_path):
main(
[
"-v",
os.path.join(TEST_DATA_DIR, "main-entry-403.warc.gz"),
str(TEST_DATA_DIR / "main-entry-403.warc.gz"),
"-u",
"https://wikizilla.org/wiki/Doug",
"--output",
Expand Down Expand Up @@ -676,7 +678,7 @@ def test_custom_css(self, tmp_path):

main(
[
os.path.join(TEST_DATA_DIR, "example-response.warc"),
str(TEST_DATA_DIR / "example-response.warc"),
"--output",
str(tmp_path),
"--zim-file",
Expand Down Expand Up @@ -704,7 +706,7 @@ def test_custom_css_remote(self, tmp_path):

main(
[
os.path.join(TEST_DATA_DIR, "example-response.warc"),
str(TEST_DATA_DIR / "example-response.warc"),
"--output",
str(tmp_path),
"--zim-file",
Expand All @@ -729,7 +731,7 @@ def test_http_return_codes(self, tmp_path):

main(
[
os.path.join(TEST_DATA_DIR, "http-return-codes.warc.gz"),
str(TEST_DATA_DIR / "http-return-codes.warc.gz"),
"--output",
str(tmp_path),
"--zim-file",
Expand Down Expand Up @@ -793,7 +795,7 @@ def test_bad_redirections(self, tmp_path):

main(
[
os.path.join(TEST_DATA_DIR, "bad-redirections.warc.gz"),
str(TEST_DATA_DIR / "bad-redirections.warc.gz"),
"--output",
str(tmp_path),
"--zim-file",
Expand Down Expand Up @@ -828,7 +830,7 @@ def test_content_resource_types(self, tmp_path):

main(
[
os.path.join(TEST_DATA_DIR, "content-resource-types.warc.gz"),
str(TEST_DATA_DIR / "content-resource-types.warc.gz"),
"--output",
str(tmp_path),
"--zim-file",
Expand All @@ -850,3 +852,26 @@ def test_content_resource_types(self, tmp_path):
]:
res = self.get_article(zim_output, js_file)
assert b"wombat" in res # simple check that rewriting has been done

def test_content_encoding_aliases(self, tmp_path):
zim_output = "tests_en_qsl.net-encoding-alias.zim"

main(
[
# cannot be processed like other TEST_DATA_DIR warcs since it needs
# special encoding aliases to be used in --encoding-aliases
str(TEST_DATA_SPECIAL_DIR / "qsl.net-encoding-alias.warc.gz"),
"--output",
str(tmp_path),
"--zim-file",
zim_output,
"--encoding-aliases",
"foo=bar,iso-8559-1=iso-8859-1,fii=bor",
"--name",
"tests_en_qsl.net-encoding-alias",
]
)
zim_output = tmp_path / zim_output

res = self.get_article(zim_output, "www.qsl.net/vk2jem/swlogs.htm")
assert b"<!-- WB Insert -->" in res # simple check that rewriting has been done

0 comments on commit 6235567

Please sign in to comment.