Skip to content

Commit

Permalink
Merge pull request #372 from openzim/choose_icon2
Browse files Browse the repository at this point in the history
Revisit retrieve_illustration logic to prefer best favicons
  • Loading branch information
benoit74 authored Aug 7, 2024
2 parents 7eb59aa + 93c866d commit 6cad55b
Show file tree
Hide file tree
Showing 8 changed files with 2,119 additions and 92 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Use `software` property from `warcinfo` record to set ZIM `Scraper` metadata (#357)
- Store `ContentDate` as metadata, based on `WARC-Date` (#358)
- Remove domain specific rules (#328)
- Revisit retrieve_illustration logic to prefer best favicons (#352 and #369)

### Fixed

Expand Down
261 changes: 173 additions & 88 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@
import sys
import tempfile
import time
from collections.abc import Generator
from http import HTTPStatus
from pathlib import Path
from urllib.parse import urljoin, urlsplit, urlunsplit
from uuid import uuid4

import PIL.Image
import requests

# from zimscraperlib import getLogger
Expand All @@ -36,6 +38,7 @@
from dateutil import parser
from jinja2 import Environment, PackageLoader
from warcio import ArchiveIterator
from warcio.recordloader import ArcWarcRecord
from zimscraperlib.constants import (
DEFAULT_DEV_ZIM_METADATA,
RECOMMENDED_MAX_TITLE_LENGTH,
Expand All @@ -55,6 +58,7 @@
)

from warc2zim.constants import logger
from warc2zim.icon_finder import Icon, get_sorted_icons, icons_in_html
from warc2zim.items import StaticArticle, StaticFile, WARCPayloadItem
from warc2zim.language import parse_language
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
Expand Down Expand Up @@ -93,6 +97,8 @@

PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII)

ZIM_ILLUSTRATION_SIZE = 48


class UnprocessableWarcError(Exception):
"""Exception raised when it is not possible to process WARC file(s) received"""
Expand All @@ -119,8 +125,9 @@ def __init__(self, args):

self.name = args.name
self.title = args.title
self.favicon_url = args.favicon
self.favicon_path = None
self.favicon_urls: list[HttpUrl] = (
[HttpUrl(args.favicon)] if args.favicon else []
)
self.language = args.lang
self.description = args.description
self.long_description = args.long_description
Expand All @@ -133,7 +140,6 @@ def __init__(self, args):
}
self.source: str | None = str(args.source) if args.source else None or main_url
self.scraper = "warc2zim " + get_version()
self.illustration = b""
self.main_path = normalize(HttpUrl(main_url)) if main_url else None

self.output = Path(args.output)
Expand Down Expand Up @@ -304,7 +310,6 @@ def run(self):
if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH:
self.title = f"{self.title[0:29]}…"
self.retrieve_illustration()
self.convert_illustration()

# autoescape=False to allow injecting html entities from translated text
self.env = Environment(
Expand Down Expand Up @@ -507,6 +512,18 @@ def gather_information_from_warc(self):
else:
self.expected_zim_items.add(zim_path)

if (
hasattr(self, "favicon_paths")
and zim_path in self.favicon_paths.keys()
and record.rec_type == "response"
and record.http_headers
and record.http_headers.get_statuscode() == "200"
):
# store records corresponding to favicons we are looking for
self.favicon_contents[self.favicon_paths[zim_path]] = (
get_record_content(record)
)

if main_page_found:
continue

Expand Down Expand Up @@ -575,7 +592,10 @@ def gather_information_from_warc(self):

logger.debug(f"Title: {self.title}")
logger.debug(f"Language: {self.language}")
logger.debug(f"Favicon: {self.favicon_url or self.favicon_path}")
logger.debug(
f"Favicons to consider: "
f"{ ' or '.join(url.value for url in self.favicon_urls)}"
)
main_page_found = True

if len(self.expected_zim_items) == 0:
Expand Down Expand Up @@ -663,36 +683,36 @@ def gather_information_from_warc(self):
)

def find_icon_and_language(self, record, content):
soup = BeautifulSoup(content, "html.parser")

if not self.favicon_url:
# find icon
icon = soup.find("link", rel="shortcut icon")
if not icon:
icon = soup.find("link", rel="icon")

if (
icon
and icon.attrs.get( # pyright: ignore[reportGeneralTypeIssues, reportAttributeAccessIssue]
"href"
)
):
icon_url = icon.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue]
"href"
]
else:
icon_url = "/favicon.ico"

# transform icon URL into WARC path
self.favicon_path = normalize(
HttpUrl(
urljoin(
get_record_url(record),
icon_url,
)
)
if len(self.favicon_urls) == 0:
# search for favicons in HTML only if user did provided one to use

# add icons found in document head, sorted by fit for warc2zim usage
icons_urls = icons_in_html(content)
record_url = get_record_url(record)
if len(icons_urls) == 0:
icons_urls = ["/favicon.ico"]
# transform into absolute URL
icons_urls = [urljoin(record_url, icon_url) for icon_url in icons_urls]
# add HttpUrl to favicon_urls if absolute URL is indeed HTTP(S)
self.favicon_urls.extend(
HttpUrl(icon_url)
for icon_url in icons_urls
if re.match(r"^https?\://", icon_url)
)

# compute paths of favicons so that we can process them on-the-fly while
# iterating the records
self.favicon_paths = {
normalize(icon_url): icon_url for icon_url in self.favicon_urls
}
self.favicon_contents: dict[HttpUrl, bytes | None] = {
icon_url: None for icon_url in self.favicon_urls
} # None for now since we've not yet found the records

# Find most probable language if not passed via CLI
soup = BeautifulSoup(content, "html.parser")

if not self.language:
# HTML5 Standard
lang_elem = soup.find("html", attrs={"lang": True})
Expand Down Expand Up @@ -731,62 +751,127 @@ def retrieve_illustration(self):
Uses fallback in case of errors/missing"""

if self.favicon_url or self.favicon_path:
# look into WARC records
for record in iter_warc_records(self.warc_files):
if record.rec_type != "response":
continue
url = get_record_url(record)
# ignore non HTTP(S) URLs (intent:// for instance, see #332)
if not (url.startswith("http://") or url.startswith("https://")):
continue
path = normalize(HttpUrl(url))
if path == self.favicon_path or url == self.favicon_url:
logger.debug("Found WARC record for favicon")
if (
record.http_headers
and record.http_headers.get_statuscode() != "200"
): # pragma: no cover
logger.warning("WARC record for favicon is unusable")
break
self.illustration = get_record_content(record)
break
def _get_icon_from_content(favicon_url: str, content: bytes) -> Icon:
"""Returns Icon object from url + content bytes"""
if format_for(io.BytesIO(content), from_suffix=False) == "SVG":
return Icon(
url=favicon_url,
width=ZIM_ILLUSTRATION_SIZE,
height=ZIM_ILLUSTRATION_SIZE,
icon=content,
format="SVG",
)
with PIL.Image.open(io.BytesIO(content)) as img:
return Icon(
url=favicon_url,
width=img.width,
height=img.height,
icon=content,
format=img.format,
)

# download favicon_url (might be custom URL, not present in WARC records)
if not self.illustration and self.favicon_url:
try:
dst = io.BytesIO()
if not stream_file(self.favicon_url, byte_stream=dst)[0]:
raise OSError(
"No bytes received downloading favicon"
) # pragma: no cover
self.illustration = dst.getvalue()
except Exception as exc:
logger.warning("Unable to download favicon", exc_info=exc)

if not self.illustration:
logger.warning("Illustration not found, using default")
self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"]

# Illustration is now set, no need to keep url/path anymore
del self.favicon_url
del self.favicon_path

def convert_illustration(self):
"""convert self.illustration into a 48x48px PNG with fallback"""
src = io.BytesIO(self.illustration)
dst = io.BytesIO()
try:
if format_for(src, from_suffix=False) == "SVG":
convert_svg2png(src, dst, width=48, height=48)
else:
convert_image(src, dst, fmt="PNG")
resize_image(dst, width=48, height=48, method="cover")
except Exception as exc: # pragma: no cover
logger.warning(f"Failed to convert or resize favicon: {exc}")
self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"]
else:
self.illustration = dst.getvalue()
def _retrieve_icon_from_existing_content(
favicon_url: str, content: bytes | None
) -> Icon | None:
"""Retrieve icon from an existing content (from WARC record)
Fallbacks nicely to return None if retrieval fails
"""
if content is None:
return None
try:
return _get_icon_from_content(favicon_url, content)
except Exception as exc: # pragma: no cover
logger.warning(
f"Unable to get favicon size from record {favicon_url}",
exc_info=exc,
)
return None

def _download_icon_for_url(favicon_url: str) -> Icon | None:
"""Download icon from an URL
Fallbacks nicely to return None if retrieval fails
"""
try:
logger.debug(f"Downloading favicon at {favicon_url}")
dst = io.BytesIO()
if not stream_file(favicon_url, byte_stream=dst)[0]:
raise OSError(
"No bytes received downloading favicon"
) # pragma: no cover
logger.info(f"Downloaded favicon at {favicon_url}")
return _get_icon_from_content(favicon_url, dst.getvalue())
except Exception as exc:
logger.warning(
f"Unable to download/use favicon at {favicon_url}",
exc_info=exc,
)
return None

# Build list of working favicons
favicons: list[Icon] = []
for favicon_url in self.favicon_urls:
if (
icon := _retrieve_icon_from_existing_content(
favicon_url.value, self.favicon_contents[favicon_url]
)
) is not None:
# Favicon found in WARC
logger.debug(
f"Found favicon in WARC at {icon.url} which is "
f"{icon.width}x{icon.height}"
)
favicons.append(icon)
elif (icon := _download_icon_for_url(favicon_url.value)) is not None:
# Favicon downloaded
logger.debug(
f"Found favicon online at {icon.url} which is "
f"{icon.width}x{icon.height}"
)
favicons.append(icon)

# delete temp objects not needed anymore
if hasattr(self, "favicon_paths"):
del self.favicon_paths
if hasattr(self, "favicon_contents"):
del self.favicon_contents
del self.favicon_urls

# Try to find best working favicon
for favicon in get_sorted_icons(favicons):
try:
illustration = io.BytesIO()
if favicon.format == "SVG":
convert_svg2png(
io.BytesIO(favicon.icon),
illustration,
ZIM_ILLUSTRATION_SIZE,
ZIM_ILLUSTRATION_SIZE,
)
elif favicon.format != "PNG":
convert_image(io.BytesIO(favicon.icon), illustration, fmt="PNG")
else:
illustration = io.BytesIO(favicon.icon) # pragma: no cover
if {favicon.width, favicon.height} != {
ZIM_ILLUSTRATION_SIZE,
ZIM_ILLUSTRATION_SIZE,
}:
resize_image(
illustration,
width=ZIM_ILLUSTRATION_SIZE,
height=ZIM_ILLUSTRATION_SIZE,
method="cover",
)
self.illustration = illustration.getvalue()
return
except Exception as exc: # pragma: no cover
logger.warning(f"Failed to convert/resize image at {favicon.url}")
logger.debug("Error:", exc_info=exc)

# Or fallback to default ZIM illustration
logger.warning("No suitable illustration found, using default")
self.illustration = DEFAULT_DEV_ZIM_METADATA["Illustration_48x48_at_1"]

def is_self_redirect(self, record, url):
if record.rec_type != "response":
Expand Down Expand Up @@ -897,7 +982,7 @@ def add_items_for_warc_record(self, record):
)


def iter_warc_records(warc_files):
def iter_warc_records(warc_files) -> Generator[ArcWarcRecord]:
"""iter warc records, including appending request data to matching response"""
for filename in warc_files:
with open(filename, "rb") as fh:
Expand Down
Loading

0 comments on commit 6cad55b

Please sign in to comment.