Move text_from_HTML out of metaphor.common (#877)

MetaphorData · Jun 11, 2024 · d878d0a
1 parent a46967c
commit d878d0a
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 122 deletions.
diff --git a/metaphor/common/utils.py b/metaphor/common/utils.py
@@ -3,8 +3,6 @@
 from hashlib import md5
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
-from bs4 import BeautifulSoup
-from bs4.element import Comment
 from dateutil.parser import isoparse
 from pydantic import validate_email
 
@@ -174,29 +172,3 @@ def is_email(email: str) -> bool:
     except (ValueError, AssertionError):
         return False
     return True
-
-
-def text_from_HTML(html_content: str) -> str:
-    """
-    Extracts and returns visible text from given HTML content as a single string.
-    """
-
-    def filter_visible(el):
-        if el.parent.name in [
-            "style",
-            "script",
-            "head",
-            "title",
-            "meta",
-            "[document]",
-        ]:
-            return False
-        elif isinstance(el, Comment):
-            return False
-        else:
-            return True
-
-    # Use bs4 to find visible text elements
-    soup = BeautifulSoup(html_content, "lxml")
-    visible_text = filter(filter_visible, soup.findAll(string=True))
-    return "\n".join(t.strip() for t in visible_text)
diff --git a/metaphor/sharepoint/extractor.py b/metaphor/sharepoint/extractor.py
@@ -10,9 +10,9 @@
 from metaphor.common.embeddings import embed_documents, map_metadata, sanitize_text
 from metaphor.common.event_util import ENTITY_TYPES
 from metaphor.common.logger import get_logger
-from metaphor.common.utils import text_from_HTML
 from metaphor.models.crawler_run_metadata import Platform
 from metaphor.sharepoint.config import SharepointRunConfig
+from metaphor.static_web.utils import text_from_HTML
 
 logger = get_logger()
 

diff --git a/metaphor/static_web/extractor.py b/metaphor/static_web/extractor.py
@@ -4,7 +4,6 @@
 
 import requests
 from bs4 import BeautifulSoup
-from bs4.element import Comment
 from llama_index.core import Document
 from requests.exceptions import HTTPError, RequestException
 
@@ -15,6 +14,7 @@
 from metaphor.common.utils import md5_digest
 from metaphor.models.crawler_run_metadata import Platform
 from metaphor.static_web.config import StaticWebRunConfig
+from metaphor.static_web.utils import text_from_HTML, title_from_HTML
 
 logger = get_logger()
 
@@ -125,8 +125,8 @@ def _check_page_make_document(self, page: str) -> Tuple[bool, str]:
         if page_content == "ERROR IN PAGE RETRIEVAL":
             return (False, "")
         else:
-            page_text = self._get_text_from_HTML(page_content)
-            page_title = self._get_title_from_HTML(page_content)
+            page_text = text_from_HTML(page_content)
+            page_title = title_from_HTML(page_content)
 
             page_doc = self._make_document(page, page_title, page_text)
             self.docs.append(page_doc)
@@ -173,47 +173,6 @@ def _get_subpages_from_HTML(self, html_content: str, input_URL: str) -> List[str
 
         return subpages
 
-    def _get_text_from_HTML(self, html_content: str) -> str:
-        """
-        Extracts and returns visible text from given HTML content as a single string.
-        Designed to handle output from get_page_HTML.
-        """
-
-        def filter_visible(el):
-            if el.parent.name in [
-                "style",
-                "script",
-                "head",
-                "title",
-                "meta",
-                "[document]",
-            ]:
-                return False
-            elif isinstance(el, Comment):
-                return False
-            else:
-                return True
-
-        # Use bs4 to find visible text elements
-        soup = BeautifulSoup(html_content, "lxml")
-        visible_text = filter(filter_visible, soup.findAll(string=True))
-        return "\n".join(t.strip() for t in visible_text)
-
-    def _get_title_from_HTML(self, html_content: str) -> str:
-        """
-        Extracts the title of a webpage given HTML content as a single string.
-        Designed to handle output from get_page_HTML.
-        """
-
-        soup = BeautifulSoup(html_content, "lxml")
-        title_tag = soup.find("title")
-
-        if title_tag:
-            return title_tag.text
-
-        else:
-            return ""
-
     def _make_document(
         self, page_URL: str, page_title: str, page_text: str
     ) -> Document:

diff --git a/metaphor/static_web/utils.py b/metaphor/static_web/utils.py
@@ -0,0 +1,44 @@
+from bs4 import BeautifulSoup
+from bs4.element import Comment
+
+
+def text_from_HTML(html_content: str) -> str:
+    """
+    Extracts and returns visible text from given HTML content as a single string.
+    """
+
+    def filter_visible(el):
+        if el.parent.name in [
+            "style",
+            "script",
+            "head",
+            "title",
+            "meta",
+            "[document]",
+        ]:
+            return False
+        elif isinstance(el, Comment):
+            return False
+        else:
+            return True
+
+    # Use bs4 to find visible text elements
+    soup = BeautifulSoup(html_content, "lxml")
+    visible_text = filter(filter_visible, soup.findAll(string=True))
+    return "\n".join(t.strip() for t in visible_text)
+
+
+def title_from_HTML(html_content: str) -> str:
+    """
+    Extracts the title of a webpage given HTML content as a single string.
+    Designed to handle output from get_page_HTML.
+    """
+
+    soup = BeautifulSoup(html_content, "lxml")
+    title_tag = soup.find("title")
+
+    if title_tag:
+        return title_tag.text
+
+    else:
+        return ""
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "metaphor-connectors"
-version = "0.14.14"
+version = "0.14.15"
 license = "Apache-2.0"
 description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app."
 authors = ["Metaphor <dev@metaphor.io>"]

diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py
@@ -17,10 +17,8 @@
     safe_parse_ISO8601,
     safe_str,
     start_of_day,
-    text_from_HTML,
     unique_list,
 )
-from tests.test_utils import load_text
 
 
 @freeze_time("2020-01-10")
@@ -175,18 +173,3 @@ def test_is_email():
     assert is_email("[email protected]")
     assert not is_email("foo")
     assert not is_email("[email protected]")
-
-
-# Test for extracting visible text from HTML, with filtering
-def test_text_from_HTML(test_root_dir: str):
-    html_content = load_text(f"{test_root_dir}/common/samples/titles_text.html")
-    text = text_from_HTML(html_content)
-    assert "Visible paragraph 1." in text
-    assert "Visible paragraph 2." in text
-    assert "Test Title" not in text
-    assert "Some style" not in text
-    assert "Some script" not in text
-    assert "Some meta" not in text
-    assert "Commented text" not in text
-    assert "Script text" not in text
-    assert "Style text" not in text
diff --git a/tests/static_web/test_extractor.py b/tests/static_web/test_extractor.py
@@ -59,37 +59,6 @@ def test_get_subpages_from_HTML(static_web_extractor):
     assert "https://example.com/test" in result
 
 
-# Test for extracting visible text from HTML, with filtering
-def test_get_text_from_HTML_with_filtering(static_web_extractor, test_root_dir: str):
-    html_content = load_text(
-        f"{test_root_dir}/static_web/sample_pages/titles_text.html"
-    )
-    text = static_web_extractor._get_text_from_HTML(html_content)
-    assert "Visible paragraph 1." in text
-    assert "Visible paragraph 2." in text
-    assert "Test Title" not in text
-    assert "Some style" not in text
-    assert "Some script" not in text
-    assert "Some meta" not in text
-    assert "Commented text" not in text
-    assert "Script text" not in text
-    assert "Style text" not in text
-
-
-# Test for extracting title from HTML
-def test_get_title_from_HTML_success(static_web_extractor):
-    html_content = "<html><head><title>Test Title</title></head></html>"
-    title = static_web_extractor._get_title_from_HTML(html_content)
-    assert title == "Test Title"
-
-
-# Test for extracting empty title
-def test_get_title_from_HTML_failure(static_web_extractor):
-    html_content = "<html><head></head><body><h1>Hello World!</h1></body></html>"
-    title = static_web_extractor._get_title_from_HTML(html_content)
-    assert title == ""
-
-
 # Test for making a document
 def test_make_document(static_web_extractor):
     doc = static_web_extractor._make_document(

diff --git a/tests/static_web/test_utils.py b/tests/static_web/test_utils.py
@@ -0,0 +1,48 @@
+from metaphor.static_web.utils import text_from_HTML, title_from_HTML
+from tests.test_utils import load_text
+
+
+# Test for extracting visible text from HTML, with filtering
+def test_text_from_HTML(test_root_dir: str):
+    html_content = load_text(f"{test_root_dir}/common/samples/titles_text.html")
+    text = text_from_HTML(html_content)
+    assert "Visible paragraph 1." in text
+    assert "Visible paragraph 2." in text
+    assert "Test Title" not in text
+    assert "Some style" not in text
+    assert "Some script" not in text
+    assert "Some meta" not in text
+    assert "Commented text" not in text
+    assert "Script text" not in text
+    assert "Style text" not in text
+
+
+# Test for extracting visible text from HTML, with filtering
+def test_get_text_from_HTML_with_filtering(test_root_dir: str):
+    html_content = load_text(
+        f"{test_root_dir}/static_web/sample_pages/titles_text.html"
+    )
+    text = text_from_HTML(html_content)
+    assert "Visible paragraph 1." in text
+    assert "Visible paragraph 2." in text
+    assert "Test Title" not in text
+    assert "Some style" not in text
+    assert "Some script" not in text
+    assert "Some meta" not in text
+    assert "Commented text" not in text
+    assert "Script text" not in text
+    assert "Style text" not in text
+
+
+# Test for extracting title from HTML
+def test_get_title_from_HTML_success(test_root_dir: str):
+    html_content = "<html><head><title>Test Title</title></head></html>"
+    title = title_from_HTML(html_content)
+    assert title == "Test Title"
+
+
+# Test for extracting empty title
+def test_get_title_from_HTML_failure(test_root_dir: str):
+    html_content = "<html><head></head><body><h1>Hello World!</h1></body></html>"
+    title = title_from_HTML(html_content)
+    assert title == ""