generated from mozilla-ai/Blueprint-template
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add URL input support to streamlit demo (#80)
* feat: add URL input support to streamlit demo - Add URL text input and clean button - Implement URL content fetching and cleaning - Add integration tests - Update documentation - Maintain same cleaning quality as file upload Closes #32 * Add integration tests for URL input functionality * test: improve URL input integration tests and update docs Tests: - Add content quality test - Add size limit test - Remove invalid content test - Follow patterns from test_data_load_and_clean.py Docs: - Update README to include URL input option - Clarify document preprocessing steps * fix: restore file upload functionality and fix test formatting --------- Co-authored-by: fivestarspicy <[email protected]>
- Loading branch information
1 parent
8ebde29
commit 09faf73
Showing
4 changed files
with
125 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -166,3 +166,4 @@ cython_debug/ | |
|
||
# VS files | ||
.vscode | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from document_to_podcast.preprocessing import DATA_CLEANERS | ||
import pytest | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from requests.exceptions import RequestException | ||
|
||
|
||
def test_url_content_cleaning():
    """Fetch a live blog post and check that the ``.html`` cleaner processes it.

    Integration test that requires network access. The page is downloaded,
    reduced to plain text with BeautifulSoup, and passed through the cleaner
    registered for ``.html`` in ``DATA_CLEANERS``.
    """
    url = "https://blog.mozilla.ai/introducing-blueprints-customizable-ai-workflows-for-developers/"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail fast on 4xx/5xx so an error page is never mistaken for the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    raw_text = soup.get_text()
    clean_text = DATA_CLEANERS[".html"](raw_text)

    # Verify cleaning maintains same quality as file upload
    # NOTE: raw_text is already tag-free (get_text()), so this checks that the
    # cleaner shrinks the extracted text, not that it strips markup.
    assert len(clean_text) < len(raw_text)
    assert "Mozilla" in clean_text  # Should preserve key content
|
||
|
||
def test_url_error_handling():
    """Check that requesting an unresolvable host raises ``RequestException``."""
    with pytest.raises(RequestException):
        # The ".invalid" TLD is reserved and never resolves, so the failure is
        # guaranteed; an unregistered ".com" name could be registered later.
        # The timeout bounds the wait if name resolution stalls.
        response = requests.get(
            "https://nonexistent-url-that-should-fail.invalid", timeout=10
        )
        response.raise_for_status()
|
||
|
||
def test_url_content_quality():
    """Check that cleaned text from a live page keeps content and drops noise.

    Integration test that requires network access. The page is downloaded,
    reduced to plain text with BeautifulSoup, and passed through the cleaner
    registered for ``.html`` in ``DATA_CLEANERS``.
    """
    url = "https://blog.mozilla.org/en/mozilla/introducing-mozilla-ai-investing-in-trustworthy-ai/"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail fast on 4xx/5xx so an error page is never mistaken for the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    raw_text = soup.get_text()
    clean_text = DATA_CLEANERS[".html"](raw_text)

    # Test content quality
    lowered = clean_text.lower()
    assert "mozilla" in lowered  # Key terms preserved
    assert "ai" in lowered  # Case-insensitive content check
    # Match the tag prefix case-insensitively so "<html lang=...>" is caught too.
    assert "<html" not in lowered  # HTML tags removed
    assert "utm_source" not in clean_text  # Marketing parameters removed
|
||
|
||
def test_url_content_size_limits():
    """Check that cleaning a substantial live page yields a reasonable size.

    Integration test that requires network access. The page is downloaded,
    reduced to plain text with BeautifulSoup, and passed through the cleaner
    registered for ``.html`` in ``DATA_CLEANERS``.
    """
    url = "https://www.mozilla.org/en-US/about/manifesto/"  # Substantial page
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail fast on 4xx/5xx so an error page is never measured as the content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    raw_text = soup.get_text()
    clean_text = DATA_CLEANERS[".html"](raw_text)

    # Size checks
    assert len(clean_text) > 100  # Not too small
    assert len(clean_text) < len(raw_text)  # Smaller than raw
    assert len(clean_text.split()) > 50  # Has substantial content