diff --git a/src/zimscraperlib/rewriting/css.py b/src/zimscraperlib/rewriting/css.py index d992ad9b..c04cbcf0 100644 --- a/src/zimscraperlib/rewriting/css.py +++ b/src/zimscraperlib/rewriting/css.py @@ -51,7 +51,7 @@ def __simple_transform( [ "url(", m_object["quote"], - url_rewriter(m_object["url"], base_href), + url_rewriter(m_object["url"], base_href).rewriten_url, m_object["quote"], ")", ] @@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node): new_url = self.url_rewriter( url_node.value, # pyright: ignore self.base_href, - ) + ).rewriten_url url_node.value = str(new_url) # pyright: ignore url_node.representation = ( # pyright: ignore f'"{serialize_url(str(new_url))}"' @@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node): elif isinstance(node, ast.Declaration): self._process_list(node.value) # pyright: ignore elif isinstance(node, ast.URLToken): - new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore + new_url = self.url_rewriter( + node.value, self.base_href + ).rewriten_url # pyright: ignore node.value = new_url node.representation = f"url({serialize_url(new_url)})" diff --git a/src/zimscraperlib/rewriting/html.py b/src/zimscraperlib/rewriting/html.py index 3696aabb..cd78aab4 100644 --- a/src/zimscraperlib/rewriting/html.py +++ b/src/zimscraperlib/rewriting/html.py @@ -132,9 +132,9 @@ class HtmlRewriter(HTMLParser): def __init__( self, url_rewriter: ArticleUrlRewriter, - pre_head_insert: str, + pre_head_insert: str | None, post_head_insert: str | None, - notify_js_module: Callable[[ZimPath], None], + notify_js_module: Callable[[ZimPath], None] | None, ): super().__init__(convert_charrefs=False) self.url_rewriter = url_rewriter @@ -430,7 +430,7 @@ def do_attribute_rewrite( css_rewriter: CssRewriter, url_rewriter: ArticleUrlRewriter, base_href: str | None, - notify_js_module: Callable[[ZimPath], None], + notify_js_module: Callable[[ZimPath], None] | None, ) -> AttrNameAndValue: """Utility function to process all attribute rewriting rules @@ -587,7 +587,7 @@ def rewrite_href_src_attributes( attrs: AttrsList, url_rewriter: ArticleUrlRewriter, base_href: str | None, - notify_js_module: Callable[[ZimPath], None], + notify_js_module: Callable[[ZimPath], None] | None, ): """Rewrite href and src attributes @@ -596,11 +596,16 @@ def rewrite_href_src_attributes( """ if attr_name not in ("href", "src") or not attr_value: return - if get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module": + if ( + notify_js_module + and get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module" + ): notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href)) return ( attr_name, - url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"), + url_rewriter( + attr_value, base_href=base_href, rewrite_all_url=tag != "a" + ).rewriten_url, ) @@ -615,10 +620,10 @@ def rewrite_srcset_attribute( if attr_name != "srcset" or not attr_value: return value_list = attr_value.split(",") - new_value_list = [] + new_value_list: list[str] = [] for value in value_list: url, *other = value.strip().split(" ", maxsplit=1) - new_url = url_rewriter(url, base_href=base_href) + new_url = url_rewriter(url, base_href=base_href).rewriten_url new_value = " ".join([new_url, *other]) new_value_list.append(new_value) return (attr_name, ", ".join(new_value_list)) @@ -708,5 +713,6 @@ def rewrite_meta_http_equiv_redirect( return return ( attr_name, - f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}", + f"{match['interval']};" + f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}", ) diff --git a/src/zimscraperlib/rewriting/js.py b/src/zimscraperlib/rewriting/js.py index 05d68b5c..237243fa 100644 --- a/src/zimscraperlib/rewriting/js.py +++ b/src/zimscraperlib/rewriting/js.py @@ -206,7 +206,7 @@ def __init__( self, url_rewriter: ArticleUrlRewriter, base_href: str | None, - notify_js_module: Callable[[ZimPath], None], + notify_js_module: Callable[[ZimPath], None] | None, ): super().__init__(None) self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES) @@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str: This takes into account that the result must be a relative URL, i.e. it cannot be 'vendor.module.js' but must be './vendor.module.js'. """ - url = self.url_rewriter(url, base_href=self.base_href) + url = self.url_rewriter(url, base_href=self.base_href).rewriten_url if not ( url.startswith("/") or url.startswith("./") or url.startswith("../") ): @@ -298,11 +298,12 @@ def func( m_object: re.Match[str], _opts: dict[str, Any] | None = None ) -> str: def sub_funct(match: re.Match[str]) -> str: - self.notify_js_module( - self.url_rewriter.get_item_path( - match.group(2), base_href=self.base_href + if self.notify_js_module: + self.notify_js_module( + self.url_rewriter.get_item_path( + match.group(2), base_href=self.base_href + ) ) - ) return ( f"{match.group(1)}{get_rewriten_import_url(match.group(2))}" f"{match.group(3)}" diff --git a/src/zimscraperlib/rewriting/url_rewriting.py b/src/zimscraperlib/rewriting/url_rewriting.py index fbf0147d..d661acd1 100644 --- a/src/zimscraperlib/rewriting/url_rewriting.py +++ b/src/zimscraperlib/rewriting/url_rewriting.py @@ -82,7 +82,7 @@ def __str__(self) -> str: return f"HttpUrl({self.value})" def __repr__(self) -> str: - return f"{self.__str__} - {super().__repr__()}" # pragma: no cover + return f"HttpUrl({self.value})" # pragma: no cover @property def value(self) -> str: @@ -124,7 +124,7 @@ def __str__(self) -> str: return f"ZimPath({self.value})" def __repr__(self) -> str: - return f"{self.__str__} - {super().__repr__()}" # pragma: no cover + return f"ZimPath({self.value})" # pragma: no cover @property def value(self) -> str: @@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None: raise ValueError(f"Unexpected password in value: {value} {parts.password}") +class RewriteResult(NamedTuple): + absolute_url: str + rewriten_url: str + zim_path: ZimPath | None + + class ArticleUrlRewriter: """ Rewrite urls in article. @@ -176,16 +182,11 @@ def __init__( missing_zim_paths: list of ZIM paths which are known to already be missing from the existing_zim_paths ; usefull only in complement with this variable ; new missing entries will be added as URLs are normalized in this function - - Results: - items_to_download: populated with the list of rewritten URLs, so that one - might use it to download items after rewriting the document """ self.article_path = article_path or ArticleUrlRewriter.normalize(article_url) self.article_url = article_url self.existing_zim_paths = existing_zim_paths self.missing_zim_paths = missing_zim_paths - self.items_to_download: dict[ZimPath, HttpUrl] = {} def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath: """Utility to transform an item URL into a ZimPath""" @@ -201,7 +202,7 @@ def __call__( base_href: str | None, *, rewrite_all_url: bool = True, - ) -> str: + ) -> RewriteResult: """Rewrite a url contained in a article. The url is "fully" rewrited to point to a normalized entry path @@ -210,17 +211,25 @@ def __call__( try: item_url = item_url.strip() + item_absolute_url = urljoin( + urljoin(self.article_url.value, base_href), item_url + ) + # Make case of standalone fragments more straightforward if item_url.startswith("#"): - return item_url + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_url, + zim_path=None, + ) item_scheme = urlsplit(item_url).scheme if item_scheme and item_scheme not in ("http", "https"): - return item_url - - item_absolute_url = urljoin( - urljoin(self.article_url.value, base_href), item_url - ) + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_url, + zim_path=None, + ) item_fragment = urlsplit(item_absolute_url).fragment @@ -229,9 +238,11 @@ def __call__( if rewrite_all_url or ( self.existing_zim_paths and item_path in self.existing_zim_paths ): - if item_path not in self.items_to_download: - self.items_to_download[item_path] = HttpUrl(item_absolute_url) - return self.get_document_uri(item_path, item_fragment) + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=self.get_document_uri(item_path, item_fragment), + zim_path=item_path, + ) else: if ( self.missing_zim_paths is not None @@ -242,7 +253,11 @@ def __call__( # with duplicate messages self.missing_zim_paths.add(item_path) # The url doesn't point to a known entry - return item_absolute_url + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_absolute_url, + zim_path=item_path, + ) except Exception as exc: # pragma: no cover item_scheme = ( @@ -275,7 +290,11 @@ def __call__( f"rewrite_all_url: {rewrite_all_url}", exc_info=exc, ) - return item_url + return RewriteResult( + absolute_url=item_absolute_url, + rewriten_url=item_url, + zim_path=None, + ) def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str: """Given an ZIM item path and its fragment, get the URI to use in document diff --git a/tests/rewriting/conftest.py b/tests/rewriting/conftest.py index 390dd471..40ccbc25 100644 --- a/tests/rewriting/conftest.py +++ b/tests/rewriting/conftest.py @@ -7,20 +7,11 @@ from zimscraperlib.rewriting.url_rewriting import ( ArticleUrlRewriter, HttpUrl, + RewriteResult, ZimPath, ) -@pytest.fixture(scope="module") -def no_js_notify(): - """Fixture to not care about notification of detection of a JS file""" - - def no_js_notify_handler(_: str): - pass - - yield no_js_notify_handler - - class SimpleUrlRewriter(ArticleUrlRewriter): """Basic URL rewriter mocking most calls""" @@ -34,8 +25,12 @@ def __call__( base_href: str | None, # noqa: ARG002 *, rewrite_all_url: bool = True, # noqa: ARG002 - ) -> str: - return item_url + self.suffix + ) -> RewriteResult: + return RewriteResult( + absolute_url=item_url + self.suffix, + rewriten_url=item_url + self.suffix, + zim_path=None, + ) def get_item_path( self, item_url: str, base_href: str | None # noqa: ARG002 diff --git a/tests/rewriting/test_html_rewriting.py b/tests/rewriting/test_html_rewriting.py index bd59b497..d8117fa3 100644 --- a/tests/rewriting/test_html_rewriting.py +++ b/tests/rewriting/test_html_rewriting.py @@ -74,17 +74,15 @@ def no_rewrite_content(request: pytest.FixtureRequest): yield request.param -def test_no_rewrite( - no_rewrite_content: ContentForTests, no_js_notify: Callable[[ZimPath], None] -): +def test_no_rewrite(no_rewrite_content: ContentForTests): assert ( HtmlRewriter( ArticleUrlRewriter( article_url=HttpUrl(f"http://{no_rewrite_content.article_url}"), ), - "", - "", - no_js_notify, + None, + None, + None, ) .rewrite(no_rewrite_content.input_str) .content @@ -116,17 +114,15 @@ def escaped_content(request: pytest.FixtureRequest): yield request.param -def test_escaped_content( - escaped_content: ContentForTests, no_js_notify: Callable[[ZimPath], None] -): +def test_escaped_content(escaped_content: ContentForTests): transformed = ( HtmlRewriter( ArticleUrlRewriter( article_url=HttpUrl(f"http://{escaped_content.article_url}") ), - "", - "", - no_js_notify, + None, + None, + None, ) .rewrite(escaped_content.input_str) .content @@ -239,17 +235,15 @@ def js_rewrites(request: pytest.FixtureRequest): yield request.param -def test_js_rewrites( - js_rewrites: ContentForTests, no_js_notify: Callable[[ZimPath], None] -): +def test_js_rewrites(js_rewrites: ContentForTests): transformed = ( HtmlRewriter( ArticleUrlRewriter( article_url=HttpUrl(f"http://{js_rewrites.article_url}") ), - "", - "", - no_js_notify, + None, + None, + None, ) .rewrite(js_rewrites.input_str) .content @@ -334,16 +328,16 @@ def rewrite_url(request: pytest.FixtureRequest): yield request.param -def test_rewrite(rewrite_url: ContentForTests, no_js_notify: Callable[[ZimPath], None]): +def test_rewrite(rewrite_url: ContentForTests): assert ( HtmlRewriter( ArticleUrlRewriter( article_url=HttpUrl(f"http://{rewrite_url.article_url}"), existing_zim_paths={ZimPath("exemple.com/a/long/path")}, ), - "", - "", - no_js_notify, + None, + None, + None, ) .rewrite(rewrite_url.input_str) .content @@ -351,7 +345,7 @@ def test_rewrite(rewrite_url: ContentForTests, no_js_notify: Callable[[ZimPath], ) -def test_extract_title(no_js_notify: Callable[[ZimPath], None]): +def test_extract_title(): content = """