-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move text_from_HTML out of metaphor.common (#877)
- Loading branch information
Showing 8 changed files with 97 additions and 122 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from bs4 import BeautifulSoup | ||
from bs4.element import Comment | ||
|
||
|
||
def text_from_HTML(html_content: str) -> str:
    """
    Extracts and returns visible text from given HTML content as a single string.

    The markup is parsed with BeautifulSoup's "lxml" parser. Each remaining
    text node is stripped of surrounding whitespace and the pieces are joined
    with newlines, so whitespace-only nodes contribute empty lines.

    A text node is skipped when it is an HTML comment, or when its direct
    parent is one of: style, script, head, title, meta, or the document root.
    """

    # Direct-parent names whose text nodes are excluded from the result.
    hidden_parents = frozenset(
        ("style", "script", "head", "title", "meta", "[document]")
    )

    def filter_visible(el) -> bool:
        if el.parent.name in hidden_parents:
            return False
        return not isinstance(el, Comment)

    # Use bs4 to find visible text elements. `find_all` is the supported
    # method name; `findAll` is only kept by bs4 as a deprecated alias.
    soup = BeautifulSoup(html_content, "lxml")
    visible_text = filter(filter_visible, soup.find_all(string=True))
    return "\n".join(t.strip() for t in visible_text)
|
||
|
||
def title_from_HTML(html_content: str) -> str:
    """
    Returns the text of the first <title> element found in the HTML content.
    Designed to handle output from get_page_HTML.

    An empty string is returned when the parsed document has no <title>.
    """

    parsed = BeautifulSoup(html_content, "lxml")
    title_element = parsed.find("title")
    return title_element.text if title_element else ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "metaphor-connectors" | ||
version = "0.14.14" | ||
version = "0.14.15" | ||
license = "Apache-2.0" | ||
description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." | ||
authors = ["Metaphor <[email protected]>"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,10 +17,8 @@ | |
safe_parse_ISO8601, | ||
safe_str, | ||
start_of_day, | ||
text_from_HTML, | ||
unique_list, | ||
) | ||
from tests.test_utils import load_text | ||
|
||
|
||
@freeze_time("2020-01-10") | ||
|
@@ -175,18 +173,3 @@ def test_is_email(): | |
assert is_email("[email protected]") | ||
assert not is_email("foo") | ||
assert not is_email("[email protected]") | ||
|
||
|
||
# Test for extracting visible text from HTML, with filtering
def test_text_from_HTML(test_root_dir: str):
    """Check which sample strings survive text extraction and which do not."""
    sample_path = f"{test_root_dir}/common/samples/titles_text.html"
    extracted = text_from_HTML(load_text(sample_path))

    # Strings that must be present in the extracted text.
    for shown in ("Visible paragraph 1.", "Visible paragraph 2."):
        assert shown in extracted

    # Strings that must have been filtered out.
    for hidden in (
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ):
        assert hidden not in extracted
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from metaphor.static_web.utils import text_from_HTML, title_from_HTML | ||
from tests.test_utils import load_text | ||
|
||
|
||
# Test for extracting visible text from HTML, with filtering
def test_text_from_HTML(test_root_dir: str):
    """Check which sample strings survive text extraction and which do not."""
    sample_path = f"{test_root_dir}/common/samples/titles_text.html"
    extracted = text_from_HTML(load_text(sample_path))

    # Strings that must be present in the extracted text.
    for shown in ("Visible paragraph 1.", "Visible paragraph 2."):
        assert shown in extracted

    # Strings that must have been filtered out.
    for hidden in (
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ):
        assert hidden not in extracted
|
||
|
||
# Test for extracting visible text from HTML, with filtering
def test_get_text_from_HTML_with_filtering(test_root_dir: str):
    """Run text extraction on the static_web sample page and check filtering."""
    sample_path = f"{test_root_dir}/static_web/sample_pages/titles_text.html"
    extracted = text_from_HTML(load_text(sample_path))

    expected_present = ["Visible paragraph 1.", "Visible paragraph 2."]
    expected_absent = [
        "Test Title",
        "Some style",
        "Some script",
        "Some meta",
        "Commented text",
        "Script text",
        "Style text",
    ]

    for snippet in expected_present:
        assert snippet in extracted
    for snippet in expected_absent:
        assert snippet not in extracted
|
||
|
||
# Test for extracting title from HTML
def test_get_title_from_HTML_success(test_root_dir: str):
    """A document with a <title> element yields that element's text."""
    page = "<html><head><title>Test Title</title></head></html>"
    assert title_from_HTML(page) == "Test Title"
|
||
|
||
# Test for extracting empty title
def test_get_title_from_HTML_failure(test_root_dir: str):
    """A document without a <title> element yields an empty string."""
    page = "<html><head></head><body><h1>Hello World!</h1></body></html>"
    assert title_from_HTML(page) == ""