Skip to content

Commit

Permalink
Merge pull request #208 from openzim/mindtouch_changes
Browse files Browse the repository at this point in the history
More changes for mindtouch scraper
  • Loading branch information
benoit74 authored Oct 25, 2024
2 parents 1c2c37e + 4ac7665 commit 0cfd96b
Show file tree
Hide file tree
Showing 8 changed files with 514 additions and 213 deletions.
8 changes: 5 additions & 3 deletions src/zimscraperlib/rewriting/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __simple_transform(
[
"url(",
m_object["quote"],
url_rewriter(m_object["url"], base_href),
url_rewriter(m_object["url"], base_href).rewriten_url,
m_object["quote"],
")",
]
Expand Down Expand Up @@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
new_url = self.url_rewriter(
url_node.value, # pyright: ignore
self.base_href,
)
).rewriten_url
url_node.value = str(new_url) # pyright: ignore
url_node.representation = ( # pyright: ignore
f'"{serialize_url(str(new_url))}"'
Expand All @@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
elif isinstance(node, ast.Declaration):
self._process_list(node.value) # pyright: ignore
elif isinstance(node, ast.URLToken):
new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore
new_url = self.url_rewriter(
node.value, self.base_href
).rewriten_url # pyright: ignore
node.value = new_url
node.representation = f"url({serialize_url(new_url)})"

Expand Down
24 changes: 15 additions & 9 deletions src/zimscraperlib/rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,9 @@ class HtmlRewriter(HTMLParser):
def __init__(
self,
url_rewriter: ArticleUrlRewriter,
pre_head_insert: str,
pre_head_insert: str | None,
post_head_insert: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
super().__init__(convert_charrefs=False)
self.url_rewriter = url_rewriter
Expand Down Expand Up @@ -430,7 +430,7 @@ def do_attribute_rewrite(
css_rewriter: CssRewriter,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
) -> AttrNameAndValue:
"""Utility function to process all attribute rewriting rules
Expand Down Expand Up @@ -587,7 +587,7 @@ def rewrite_href_src_attributes(
attrs: AttrsList,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
"""Rewrite href and src attributes
Expand All @@ -596,11 +596,16 @@ def rewrite_href_src_attributes(
"""
if attr_name not in ("href", "src") or not attr_value:
return
if get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module":
if (
notify_js_module
and get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
):
notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
return (
attr_name,
url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
url_rewriter(
attr_value, base_href=base_href, rewrite_all_url=tag != "a"
).rewriten_url,
)


Expand All @@ -615,10 +620,10 @@ def rewrite_srcset_attribute(
if attr_name != "srcset" or not attr_value:
return
value_list = attr_value.split(",")
new_value_list = []
new_value_list: list[str] = []
for value in value_list:
url, *other = value.strip().split(" ", maxsplit=1)
new_url = url_rewriter(url, base_href=base_href)
new_url = url_rewriter(url, base_href=base_href).rewriten_url
new_value = " ".join([new_url, *other])
new_value_list.append(new_value)
return (attr_name, ", ".join(new_value_list))
Expand Down Expand Up @@ -708,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
return
return (
attr_name,
f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
f"{match['interval']};"
f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
)
13 changes: 7 additions & 6 deletions src/zimscraperlib/rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def __init__(
self,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
notify_js_module: Callable[[ZimPath], None],
notify_js_module: Callable[[ZimPath], None] | None,
):
super().__init__(None)
self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES)
Expand Down Expand Up @@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
This takes into account that the result must be a relative URL, i.e. it
cannot be 'vendor.module.js' but must be './vendor.module.js'.
"""
url = self.url_rewriter(url, base_href=self.base_href)
url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
if not (
url.startswith("/") or url.startswith("./") or url.startswith("../")
):
Expand All @@ -298,11 +298,12 @@ def func(
m_object: re.Match[str], _opts: dict[str, Any] | None = None
) -> str:
def sub_funct(match: re.Match[str]) -> str:
self.notify_js_module(
self.url_rewriter.get_item_path(
match.group(2), base_href=self.base_href
if self.notify_js_module:
self.notify_js_module(
self.url_rewriter.get_item_path(
match.group(2), base_href=self.base_href
)
)
)
return (
f"{match.group(1)}{get_rewriten_import_url(match.group(2))}"
f"{match.group(3)}"
Expand Down
57 changes: 38 additions & 19 deletions src/zimscraperlib/rewriting/url_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __str__(self) -> str:
return f"HttpUrl({self.value})"

def __repr__(self) -> str:
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
return f"HttpUrl({self.value})" # pragma: no cover

@property
def value(self) -> str:
Expand Down Expand Up @@ -124,7 +124,7 @@ def __str__(self) -> str:
return f"ZimPath({self.value})"

def __repr__(self) -> str:
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
return f"ZimPath({self.value})" # pragma: no cover

@property
def value(self) -> str:
Expand All @@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
raise ValueError(f"Unexpected password in value: {value} {parts.password}")


class RewriteResult(NamedTuple):
    """Outcome of rewriting one URL found in an article.

    Attributes:
        absolute_url: the URL resolved against the article URL (and base
            href), i.e. the online location the original URL points to.
        rewriten_url: the value to substitute in the document — either a
            document-relative URI into the ZIM or the absolute URL when the
            target is not rewritten. Name kept misspelled ("rewriten") for
            backward compatibility with existing callers; prefer the
            ``rewritten_url`` alias in new code.
        zim_path: the ZIM path the URL maps to, or None when the URL was
            left untouched (fragment-only links, non-http(s) schemes,
            rewrite errors).
    """

    absolute_url: str
    rewriten_url: str
    # string annotation: resolved lazily, avoids needing ZimPath at class
    # creation time (annotation is introspection-only for NamedTuple fields)
    zim_path: "ZimPath | None"

    @property
    def rewritten_url(self) -> str:
        """Correctly-spelled read-only alias for ``rewriten_url``."""
        return self.rewriten_url


class ArticleUrlRewriter:
"""
Rewrite urls in article.
Expand Down Expand Up @@ -176,16 +182,11 @@ def __init__(
missing_zim_paths: list of ZIM paths which are known to already be missing
from the existing_zim_paths; useful only in combination with this variable;
new missing entries will be added as URLs are normalized in this function
Results:
items_to_download: populated with the list of rewritten URLs, so that one
might use it to download items after rewriting the document
"""
self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
self.article_url = article_url
self.existing_zim_paths = existing_zim_paths
self.missing_zim_paths = missing_zim_paths
self.items_to_download: dict[ZimPath, HttpUrl] = {}

def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
"""Utility to transform an item URL into a ZimPath"""
Expand All @@ -201,7 +202,7 @@ def __call__(
base_href: str | None,
*,
rewrite_all_url: bool = True,
) -> str:
) -> RewriteResult:
"""Rewrite a url contained in a article.
The url is "fully" rewritten to point to a normalized entry path
Expand All @@ -210,17 +211,25 @@ def __call__(
try:
item_url = item_url.strip()

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)

# Make case of standalone fragments more straightforward
if item_url.startswith("#"):
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_scheme = urlsplit(item_url).scheme
if item_scheme and item_scheme not in ("http", "https"):
return item_url

item_absolute_url = urljoin(
urljoin(self.article_url.value, base_href), item_url
)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

item_fragment = urlsplit(item_absolute_url).fragment

Expand All @@ -229,9 +238,11 @@ def __call__(
if rewrite_all_url or (
self.existing_zim_paths and item_path in self.existing_zim_paths
):
if item_path not in self.items_to_download:
self.items_to_download[item_path] = HttpUrl(item_absolute_url)
return self.get_document_uri(item_path, item_fragment)
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=self.get_document_uri(item_path, item_fragment),
zim_path=item_path,
)
else:
if (
self.missing_zim_paths is not None
Expand All @@ -242,7 +253,11 @@ def __call__(
# with duplicate messages
self.missing_zim_paths.add(item_path)
# The url doesn't point to a known entry
return item_absolute_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_absolute_url,
zim_path=item_path,
)

except Exception as exc: # pragma: no cover
item_scheme = (
Expand Down Expand Up @@ -275,7 +290,11 @@ def __call__(
f"rewrite_all_url: {rewrite_all_url}",
exc_info=exc,
)
return item_url
return RewriteResult(
absolute_url=item_absolute_url,
rewriten_url=item_url,
zim_path=None,
)

def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
"""Given an ZIM item path and its fragment, get the URI to use in document
Expand Down
19 changes: 7 additions & 12 deletions tests/rewriting/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,11 @@
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
RewriteResult,
ZimPath,
)


@pytest.fixture(scope="module")
def no_js_notify():
"""Fixture to not care about notification of detection of a JS file"""

def no_js_notify_handler(_: str):
pass

yield no_js_notify_handler


class SimpleUrlRewriter(ArticleUrlRewriter):
"""Basic URL rewriter mocking most calls"""

Expand All @@ -34,8 +25,12 @@ def __call__(
base_href: str | None, # noqa: ARG002
*,
rewrite_all_url: bool = True, # noqa: ARG002
) -> str:
return item_url + self.suffix
) -> RewriteResult:
return RewriteResult(
absolute_url=item_url + self.suffix,
rewriten_url=item_url + self.suffix,
zim_path=None,
)

def get_item_path(
self, item_url: str, base_href: str | None # noqa: ARG002
Expand Down
Loading

0 comments on commit 0cfd96b

Please sign in to comment.