Skip to content

Commit

Permalink
Merge pull request #208 from openzim/mindtouch_changes
Browse files Browse the repository at this point in the history
More changes for mindtouch scraper
  • Loading branch information
benoit74 authored Oct 25, 2024
2 parents 1c2c37e + 4ac7665 commit 0cfd96b
Show file tree
Hide file tree
Showing 8 changed files with 514 additions and 213 deletions.
8 changes: 5 additions & 3 deletions src/zimscraperlib/rewriting/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __simple_transform(
[
"url(",
m_object["quote"],
url_rewriter(m_object["url"], base_href),
url_rewriter(m_object["url"], base_href).rewriten_url,
m_object["quote"],
")",
]
Expand Down Expand Up @@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
new_url = self.url_rewriter(
url_node.value, # pyright: ignore
self.base_href,
)
).rewriten_url
url_node.value = str(new_url) # pyright: ignore
url_node.representation = ( # pyright: ignore
f'"{serialize_url(str(new_url))}"'
Expand All @@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
elif isinstance(node, ast.Declaration):
self._process_list(node.value) # pyright: ignore
elif isinstance(node, ast.URLToken):
new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore
new_url = self.url_rewriter(
node.value, self.base_href
).rewriten_url # pyright: ignore
node.value = new_url
node.representation = f"url({serialize_url(new_url)})"

Expand Down
24 changes: 15 additions & 9 deletions src/zimscraperlib/rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,9 @@ class HtmlRewriter(HTMLParser):
def __init__(
self,
url_rewriter: ArticleUrlRewriter,
pre_head_insert: str,
pre_head_insert: str | None,
post_head_insert: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
super().__init__(convert_charrefs=False)
self.url_rewriter = url_rewriter
Expand Down Expand Up @@ -430,7 +430,7 @@ def do_attribute_rewrite(
css_rewriter: CssRewriter,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
) -> AttrNameAndValue:
"""Utility function to process all attribute rewriting rules
Expand Down Expand Up @@ -587,7 +587,7 @@ def rewrite_href_src_attributes(
attrs: AttrsList,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
"""Rewrite href and src attributes
Expand All @@ -596,11 +596,16 @@ def rewrite_href_src_attributes(
"""
if attr_name not in ("href", "src") or not attr_value:
return
if get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module":
if (
notify_js_module
and get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
):
notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
return (
attr_name,
url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
url_rewriter(
attr_value, base_href=base_href, rewrite_all_url=tag != "a"
).rewriten_url,
)


Expand All @@ -615,10 +620,10 @@ def rewrite_srcset_attribute(
if attr_name != "srcset" or not attr_value:
return
value_list = attr_value.split(",")
new_value_list = []
new_value_list: list[str] = []
for value in value_list:
url, *other = value.strip().split(" ", maxsplit=1)
new_url = url_rewriter(url, base_href=base_href)
new_url = url_rewriter(url, base_href=base_href).rewriten_url
new_value = " ".join([new_url, *other])
new_value_list.append(new_value)
return (attr_name, ", ".join(new_value_list))
Expand Down Expand Up @@ -708,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
return
return (
attr_name,
f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
f"{match['interval']};"
f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
)
13 changes: 7 additions & 6 deletions src/zimscraperlib/rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def __init__(
self,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
super().__init__(None)
self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES)
Expand Down Expand Up @@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
This takes into account that the result must be a relative URL, i.e. it
cannot be 'vendor.module.js' but must be './vendor.module.js'.
"""
url = self.url_rewriter(url, base_href=self.base_href)
url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
if not (
url.startswith("/") or url.startswith("./") or url.startswith("../")
):
Expand All @@ -298,11 +298,12 @@ def func(
m_object: re.Match[str], _opts: dict[str, Any] | None = None
) -> str:
def sub_funct(match: re.Match[str]) -> str:
self.notify_js_module(
self.url_rewriter.get_item_path(
match.group(2), base_href=self.base_href
if self.notify_js_module:
self.notify_js_module(
self.url_rewriter.get_item_path(
match.group(2), base_href=self.base_href
)
)
)
return (
f"{match.group(1)}{get_rewriten_import_url(match.group(2))}"
f"{match.group(3)}"
Expand Down
57 changes: 38 additions & 19 deletions src/zimscraperlib/rewriting/url_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __str__(self) -> str:
return f"HttpUrl({self.value})"

def __repr__(self) -> str:
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
return f"HttpUrl({self.value})" # pragma: no cover

@property
def value(self) -> str:
Expand Down Expand Up @@ -124,7 +124,7 @@ def __str__(self) -> str:
return f"ZimPath({self.value})"

def __repr__(self) -> str:
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
return f"ZimPath({self.value})" # pragma: no cover

@property
def value(self) -> str:
Expand All @@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
raise ValueError(f"Unexpected password in value: {value} {parts.password}")


class RewriteResult(NamedTuple):
    """Outcome of rewriting one URL found in an article.

    Attributes:
        absolute_url: the URL resolved against the article URL (and base
            href), i.e. the online location the original URL points to.
        rewriten_url: the value to substitute in the document — either a
            document-relative URI into the ZIM or the absolute URL when the
            target is not rewritten. Name kept misspelled ("rewriten") for
            backward compatibility with existing callers; prefer the
            ``rewritten_url`` alias in new code.
        zim_path: the ZIM path the URL maps to, or None when the URL was
            left untouched (fragment-only links, non-http(s) schemes,
            rewrite errors).
    """

    absolute_url: str
    rewriten_url: str
    # string annotation: resolved lazily, avoids needing ZimPath at class
    # creation time (annotation is introspection-only for NamedTuple fields)
    zim_path: "ZimPath | None"

    @property
    def rewritten_url(self) -> str:
        """Correctly-spelled read-only alias for ``rewriten_url``."""
        return self.rewriten_url


class ArticleUrlRewriter:
"""
Rewrite urls in article.
Expand Down Expand Up @@ -176,16 +182,11 @@ def __init__(
missing_zim_paths: list of ZIM paths which are known to already be missing
from the existing_zim_paths; useful only in combination with this variable;
new missing entries will be added as URLs are normalized in this function
Results:
items_to_download: populated with the list of rewritten URLs, so that one
might use it to download items after rewriting the document
"""
self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
self.article_url = article_url
self.existing_zim_paths = existing_zim_paths
self.missing_zim_paths = missing_zim_paths
self.items_to_download: dict[ZimPath, HttpUrl] = {}

def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
"""Utility to transform an item URL into a ZimPath"""
Expand All @@ -201,7 +202,7 @@ def __call__(
base_href: str | None,
*,
rewrite_all_url: bool = True,
) -> str:
) -> RewriteResult:
"""Rewrite a url contained in a article.
The url is "fully" rewritten to point to a normalized entry path
Expand All @@ -210,17 +211,25 @@ def __call__(
try:
item_url = item_url.strip()

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)

# Make case of standalone fragments more straightforward
if item_url.startswith("#"):
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_scheme = urlsplit(item_url).scheme
if item_scheme and item_scheme not in ("http", "https"):
return item_url

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_fragment = urlsplit(item_absolute_url).fragment

Expand All @@ -229,9 +238,11 @@ def __call__(
if rewrite_all_url or (
self.existing_zim_paths and item_path in self.existing_zim_paths
):
if item_path not in self.items_to_download:
self.items_to_download[item_path] = HttpUrl(item_absolute_url)
return self.get_document_uri(item_path, item_fragment)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=self.get_document_uri(item_path, item_fragment),
zim_path=item_path,
)
else:
if (
self.missing_zim_paths is not None
Expand All @@ -242,7 +253,11 @@ def __call__(
# with duplicate messages
self.missing_zim_paths.add(item_path)
# The url doesn't point to a known entry
return item_absolute_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_absolute_url,
zim_path=item_path,
)

except Exception as exc: # pragma: no cover
item_scheme = (
Expand Down Expand Up @@ -275,7 +290,11 @@ def __call__(
f"rewrite_all_url: {rewrite_all_url}",
exc_info=exc,
)
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
"""Given an ZIM item path and its fragment, get the URI to use in document
Expand Down
19 changes: 7 additions & 12 deletions tests/rewriting/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,11 @@
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
RewriteResult,
ZimPath,
)


@pytest.fixture(scope="module")
def no_js_notify():
"""Fixture to not care about notification of detection of a JS file"""

def no_js_notify_handler(_: str):
pass

yield no_js_notify_handler


class SimpleUrlRewriter(ArticleUrlRewriter):
"""Basic URL rewriter mocking most calls"""

Expand All @@ -34,8 +25,12 @@ def __call__(
base_href: str | None, # noqa: ARG002
*,
rewrite_all_url: bool = True, # noqa: ARG002
) -> str:
return item_url + self.suffix
) -> RewriteResult:
return RewriteResult(
absolute_url=item_url + self.suffix,
rewriten_url=item_url + self.suffix,
zim_path=None,
)

def get_item_path(
self, item_url: str, base_href: str | None # noqa: ARG002
Expand Down
Loading

0 comments on commit 0cfd96b

Please sign in to comment.