Skip to content

Commit

Permalink
Move text_from_HTML out of metaphor.common (#877)
Browse files Browse the repository at this point in the history
  • Loading branch information
mars-lan authored Jun 11, 2024
1 parent a46967c commit d878d0a
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 122 deletions.
28 changes: 0 additions & 28 deletions metaphor/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from hashlib import md5
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

from bs4 import BeautifulSoup
from bs4.element import Comment
from dateutil.parser import isoparse
from pydantic import validate_email

Expand Down Expand Up @@ -174,29 +172,3 @@ def is_email(email: str) -> bool:
except (ValueError, AssertionError):
return False
return True


def text_from_HTML(html_content: str) -> str:
    """
    Extracts and returns visible text from given HTML content as a single string.

    The content is parsed with BeautifulSoup using the "lxml" parser. Every
    text node is collected, then nodes whose direct parent is one of the
    non-rendered tags listed below, and HTML comments, are discarded. Each
    remaining node is stripped of surrounding whitespace and the pieces are
    joined with newlines.
    """

    # Direct parents whose text should not be treated as visible content.
    hidden_parents = ("style", "script", "head", "title", "meta", "[document]")

    def filter_visible(el):
        if el.parent.name in hidden_parents:
            return False
        # HTML comments are excluded as well.
        return not isinstance(el, Comment)

    # Use bs4 to find visible text elements. find_all is the supported name
    # for the deprecated findAll alias and returns the same result.
    soup = BeautifulSoup(html_content, "lxml")
    visible_text = filter(filter_visible, soup.find_all(string=True))
    return "\n".join(t.strip() for t in visible_text)
2 changes: 1 addition & 1 deletion metaphor/sharepoint/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from metaphor.common.embeddings import embed_documents, map_metadata, sanitize_text
from metaphor.common.event_util import ENTITY_TYPES
from metaphor.common.logger import get_logger
from metaphor.common.utils import text_from_HTML
from metaphor.models.crawler_run_metadata import Platform
from metaphor.sharepoint.config import SharepointRunConfig
from metaphor.static_web.utils import text_from_HTML

logger = get_logger()

Expand Down
47 changes: 3 additions & 44 deletions metaphor/static_web/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from llama_index.core import Document
from requests.exceptions import HTTPError, RequestException

Expand All @@ -15,6 +14,7 @@
from metaphor.common.utils import md5_digest
from metaphor.models.crawler_run_metadata import Platform
from metaphor.static_web.config import StaticWebRunConfig
from metaphor.static_web.utils import text_from_HTML, title_from_HTML

logger = get_logger()

Expand Down Expand Up @@ -125,8 +125,8 @@ def _check_page_make_document(self, page: str) -> Tuple[bool, str]:
if page_content == "ERROR IN PAGE RETRIEVAL":
return (False, "")
else:
page_text = self._get_text_from_HTML(page_content)
page_title = self._get_title_from_HTML(page_content)
page_text = text_from_HTML(page_content)
page_title = title_from_HTML(page_content)

page_doc = self._make_document(page, page_title, page_text)
self.docs.append(page_doc)
Expand Down Expand Up @@ -173,47 +173,6 @@ def _get_subpages_from_HTML(self, html_content: str, input_URL: str) -> List[str

return subpages

def _get_text_from_HTML(self, html_content: str) -> str:
    """
    Extracts and returns visible text from given HTML content as a single string.

    The content is parsed with BeautifulSoup using the "lxml" parser. Every
    text node is collected, then nodes whose direct parent is one of the
    non-rendered tags listed below, and HTML comments, are discarded. Each
    remaining node is stripped of surrounding whitespace and the pieces are
    joined with newlines. No instance state is read.
    """

    # Direct parents whose text should not be treated as visible content.
    hidden_parents = ("style", "script", "head", "title", "meta", "[document]")

    def filter_visible(el):
        if el.parent.name in hidden_parents:
            return False
        # HTML comments are excluded as well.
        return not isinstance(el, Comment)

    # Use bs4 to find visible text elements. find_all is the supported name
    # for the deprecated findAll alias and returns the same result.
    soup = BeautifulSoup(html_content, "lxml")
    visible_text = filter(filter_visible, soup.find_all(string=True))
    return "\n".join(t.strip() for t in visible_text)

def _get_title_from_HTML(self, html_content: str) -> str:
    """
    Returns the text of the first <title> tag found in the given HTML string.

    The content is parsed with BeautifulSoup using the "lxml" parser. An empty
    string is returned when no <title> tag is found. No instance state is read.
    """

    parsed = BeautifulSoup(html_content, "lxml")
    found = parsed.find("title")
    return found.text if found else ""

def _make_document(
self, page_URL: str, page_title: str, page_text: str
) -> Document:
Expand Down
44 changes: 44 additions & 0 deletions metaphor/static_web/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from bs4 import BeautifulSoup
from bs4.element import Comment


def text_from_HTML(html_content: str) -> str:
    """
    Extracts and returns visible text from given HTML content as a single string.

    The content is parsed with BeautifulSoup using the "lxml" parser. Every
    text node is collected, then nodes whose direct parent is one of the
    non-rendered tags listed below, and HTML comments, are discarded. Each
    remaining node is stripped of surrounding whitespace and the pieces are
    joined with newlines.
    """

    # Direct parents whose text should not be treated as visible content.
    hidden_parents = ("style", "script", "head", "title", "meta", "[document]")

    def filter_visible(el):
        if el.parent.name in hidden_parents:
            return False
        # HTML comments are excluded as well.
        return not isinstance(el, Comment)

    # Use bs4 to find visible text elements. find_all is the supported name
    # for the deprecated findAll alias and returns the same result.
    soup = BeautifulSoup(html_content, "lxml")
    visible_text = filter(filter_visible, soup.find_all(string=True))
    return "\n".join(t.strip() for t in visible_text)


def title_from_HTML(html_content: str) -> str:
    """
    Returns the text of the first <title> tag found in the given HTML string.

    The content is parsed with BeautifulSoup using the "lxml" parser. An empty
    string is returned when no <title> tag is found.
    """

    parsed = BeautifulSoup(html_content, "lxml")
    found = parsed.find("title")
    return found.text if found else ""
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "metaphor-connectors"
version = "0.14.14"
version = "0.14.15"
license = "Apache-2.0"
description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app."
authors = ["Metaphor <[email protected]>"]
Expand Down
17 changes: 0 additions & 17 deletions tests/common/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
safe_parse_ISO8601,
safe_str,
start_of_day,
text_from_HTML,
unique_list,
)
from tests.test_utils import load_text


@freeze_time("2020-01-10")
Expand Down Expand Up @@ -175,18 +173,3 @@ def test_is_email():
assert is_email("[email protected]")
assert not is_email("foo")
assert not is_email("[email protected]")


# Test for extracting visible text from HTML, with filtering
def test_text_from_HTML(test_root_dir: str):
    """
    Checks that the sample page's visible paragraphs are extracted and that
    the listed non-visible strings are absent from the result.
    """
    sample_path = f"{test_root_dir}/common/samples/titles_text.html"
    extracted = text_from_HTML(load_text(sample_path))

    for kept in ("Visible paragraph 1.", "Visible paragraph 2."):
        assert kept in extracted

    for dropped in (
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ):
        assert dropped not in extracted
31 changes: 0 additions & 31 deletions tests/static_web/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,37 +59,6 @@ def test_get_subpages_from_HTML(static_web_extractor):
assert "https://example.com/test" in result


# Test for extracting visible text from HTML, with filtering
def test_get_text_from_HTML_with_filtering(static_web_extractor, test_root_dir: str):
    """
    Checks that the extractor keeps the sample page's visible paragraphs and
    that the listed non-visible strings are absent from the result.
    """
    sample_path = f"{test_root_dir}/static_web/sample_pages/titles_text.html"
    extracted = static_web_extractor._get_text_from_HTML(load_text(sample_path))

    for kept in ("Visible paragraph 1.", "Visible paragraph 2."):
        assert kept in extracted

    for dropped in (
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ):
        assert dropped not in extracted


# Test for extracting title from HTML
def test_get_title_from_HTML_success(static_web_extractor):
    """A document containing a <title> tag yields that tag's text."""
    markup = "<html><head><title>Test Title</title></head></html>"
    assert static_web_extractor._get_title_from_HTML(markup) == "Test Title"


# Test for extracting empty title
def test_get_title_from_HTML_failure(static_web_extractor):
    """A document without a <title> tag yields an empty string."""
    markup = "<html><head></head><body><h1>Hello World!</h1></body></html>"
    assert static_web_extractor._get_title_from_HTML(markup) == ""


# Test for making a document
def test_make_document(static_web_extractor):
doc = static_web_extractor._make_document(
Expand Down
48 changes: 48 additions & 0 deletions tests/static_web/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from metaphor.static_web.utils import text_from_HTML, title_from_HTML
from tests.test_utils import load_text


# Test for extracting visible text from HTML, with filtering
def test_text_from_HTML(test_root_dir: str):
    """
    Checks that the sample page's visible paragraphs are extracted and that
    the listed non-visible strings are absent from the result.
    """
    sample_path = f"{test_root_dir}/common/samples/titles_text.html"
    extracted = text_from_HTML(load_text(sample_path))

    for kept in ("Visible paragraph 1.", "Visible paragraph 2."):
        assert kept in extracted

    for dropped in (
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ):
        assert dropped not in extracted


# Test for extracting visible text from HTML, with filtering
def test_get_text_from_HTML_with_filtering(test_root_dir: str):
    """
    Checks that the static_web sample page's visible paragraphs are extracted
    and that the listed non-visible strings are absent from the result.
    """
    sample_path = f"{test_root_dir}/static_web/sample_pages/titles_text.html"
    extracted = text_from_HTML(load_text(sample_path))

    for kept in ("Visible paragraph 1.", "Visible paragraph 2."):
        assert kept in extracted

    for dropped in (
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ):
        assert dropped not in extracted


# Test for extracting title from HTML
def test_get_title_from_HTML_success(test_root_dir: str):
    """A document containing a <title> tag yields that tag's text."""
    # NOTE(review): test_root_dir is not used by this test; kept so the
    # signature stays unchanged.
    markup = "<html><head><title>Test Title</title></head></html>"
    assert title_from_HTML(markup) == "Test Title"


# Test for extracting empty title
def test_get_title_from_HTML_failure(test_root_dir: str):
    """A document without a <title> tag yields an empty string."""
    # NOTE(review): test_root_dir is not used by this test; kept so the
    # signature stays unchanged.
    markup = "<html><head></head><body><h1>Hello World!</h1></body></html>"
    assert title_from_HTML(markup) == ""

0 comments on commit d878d0a

Please sign in to comment.