From 09faf73964c4d043613d9b043b8de850f75d946a Mon Sep 17 00:00:00 2001
From: fivestarspicy <25396141+fivestarspicy@users.noreply.github.com>
Date: Mon, 6 Jan 2025 01:59:32 -0800
Subject: [PATCH] feat: add URL input support to streamlit demo (#80)

* feat: add URL input support to streamlit demo

- Add URL text input and clean button
- Implement URL content fetching and cleaning
- Add integration tests
- Update documentation
- Maintain same cleaning quality as file upload

Closes #32

* Add integration tests for URL input functionality

* test: improve URL input integration tests and update docs

Tests:
- Add content quality test
- Add size limit test
- Remove invalid content test
- Follow patterns from test_data_load_and_clean.py

Docs:
- Update README to include URL input option
- Clarify document preprocessing steps

* fix: restore file upload functionality and fix test formatting

---------

Co-authored-by: fivestarspicy
---
 .gitignore                          |  1 +
 README.md                           | 12 ++---
 demo/app.py                         | 70 ++++++++++++++++++++++++++---
 tests/integration/test_url_input.py | 54 ++++++++++++++++++++++
 4 files changed, 125 insertions(+), 12 deletions(-)
 create mode 100644 tests/integration/test_url_input.py

diff --git a/.gitignore b/.gitignore
index ff3b4eb..f9e9dd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,3 +166,4 @@ cython_debug/
 
 # VS files
 .vscode
+.DS_Store
diff --git a/README.md b/README.md
index 0ea7348..67ef712 100644
--- a/README.md
+++ b/README.md
@@ -63,13 +63,15 @@ Once the Codespaces environment launches, inside the terminal, start the Streaml
 
 
 
-1. **Document Upload**
-   Start by uploading a document in a supported format (e.g., PDF, .txt, or .docx).
+1. **Document Input**
+   Start by either:
+   - Uploading a document in a supported format (e.g., PDF, .txt, or .docx)
+   - Entering a website URL to fetch content directly
 
 2. **Document Pre-Processing**
-   The uploaded document is processed to extract and clean the text. This involves:
-   - Extracting readable text from the document.
-   - Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured.
+   The input is processed to extract and clean the text. This involves:
+   - Extracting readable text from the document or webpage
+   - Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured
 
 3. **Script Generation**
    The cleaned text is passed to a language model to generate a podcast transcript in the form of a conversation between two speakers.
diff --git a/demo/app.py b/demo/app.py
index f3f904c..250b925 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -1,9 +1,14 @@
+"""Streamlit app for converting documents to podcasts."""
+
 import re
 from pathlib import Path
 
 import numpy as np
 import soundfile as sf
 import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
 
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -44,13 +49,12 @@ def gen_button_clicked():
 
 st.title("Document To Podcast")
 
-st.header("Uploading Data")
+st.header("Upload a File")
 
 uploaded_file = st.file_uploader(
     "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
 )
 
-
 if uploaded_file is not None:
     st.divider()
     st.header("Loading and Cleaning Data")
@@ -78,6 +82,59 @@ def gen_button_clicked():
             f"Number of characters after cleaning: {len(clean_text)}",
             f"{clean_text[:500]} . . .",
         )
+    st.session_state["clean_text"] = clean_text
+
+st.divider()
+
+st.header("Or Enter a Website URL")
+url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
+process_url = st.button("Clean URL Content")
+
+
+def process_url_content(url: str) -> tuple[str, str]:
+    """Fetch and clean content from a URL.
+
+    Args:
+        url: The URL to fetch content from
+
+    Returns:
+        tuple containing raw and cleaned text
+    """
+    response = requests.get(url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    return raw_text, DATA_CLEANERS[".html"](raw_text)
+
+
+if url and process_url:
+    try:
+        with st.spinner("Fetching and cleaning content..."):
+            raw_text, clean_text = process_url_content(url)
+            st.session_state["clean_text"] = clean_text
+
+            # Display results
+            col1, col2 = st.columns(2)
+            with col1:
+                st.subheader("Raw Text")
+                st.text_area(
+                    "Number of characters before cleaning: " f"{len(raw_text)}",
+                    f"{raw_text[:500]}...",
+                )
+            with col2:
+                st.subheader("Cleaned Text")
+                st.text_area(
+                    "Number of characters after cleaning: " f"{len(clean_text)}",
+                    f"{clean_text[:500]}...",
+                )
+    except RequestException as e:
+        st.error(f"Error fetching URL: {str(e)}")
+    except Exception as e:
+        st.error(f"Error processing content: {str(e)}")
+
+# Second part - Podcast generation
+if "clean_text" in st.session_state:
+    clean_text = st.session_state["clean_text"]
 
     st.divider()
     st.header("Downloading and Loading models")
@@ -86,6 +143,10 @@ def gen_button_clicked():
     )
     st.divider()
 
+    # Load models
+    text_model = load_text_to_text_model()
+    speech_model = load_text_to_speech_model()
+
     st.markdown(
         "For this demo, we are using the following models: \n"
         "- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
@@ -96,9 +157,6 @@ def gen_button_clicked():
         " for more information on how to use different models."
     )
 
-    text_model = load_text_to_text_model()
-    speech_model = load_text_to_speech_model()
-
     # ~4 characters per token is considered a reasonable default.
     max_characters = text_model.n_ctx() * 4
     if len(clean_text) > max_characters:
@@ -154,7 +212,6 @@ def gen_button_clicked():
                         voice_profile,
                     )
                 st.audio(speech, sample_rate=speech_model.audio_codec.sr)
-
                 st.session_state.audio.append(speech)
 
                 text = ""
@@ -172,5 +229,4 @@ def gen_button_clicked():
         with open("script.txt", "w") as f:
             st.session_state.script += "}"
             f.write(st.session_state.script)
-
         st.markdown("Script saved to disk!")
diff --git a/tests/integration/test_url_input.py b/tests/integration/test_url_input.py
new file mode 100644
index 0000000..491cad1
--- /dev/null
+++ b/tests/integration/test_url_input.py
@@ -0,0 +1,54 @@
+from document_to_podcast.preprocessing import DATA_CLEANERS
+import pytest
+import requests
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+
+
+def test_url_content_cleaning():
+    """Test basic URL content fetching and cleaning."""
+    url = "https://blog.mozilla.ai/introducing-blueprints-customizable-ai-workflows-for-developers/"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    clean_text = DATA_CLEANERS[".html"](raw_text)
+
+    # Verify cleaning maintains same quality as file upload
+    assert len(clean_text) < len(raw_text)  # Should remove HTML
+    assert "Mozilla" in clean_text  # Should preserve key content
+
+
+def test_url_error_handling():
+    """Test handling of network errors."""
+    with pytest.raises(RequestException):
+        response = requests.get("https://nonexistent-url-that-should-fail.com")
+        response.raise_for_status()
+
+
+def test_url_content_quality():
+    """Test that cleaned URL content maintains expected quality."""
+    url = "https://blog.mozilla.org/en/mozilla/introducing-mozilla-ai-investing-in-trustworthy-ai/"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    clean_text = DATA_CLEANERS[".html"](raw_text)
+
+    # Test content quality
+    assert "mozilla" in clean_text.lower()  # Key terms preserved
+    assert "ai" in clean_text.lower()  # Case-insensitive content check
+    assert "<html>" not in clean_text  # HTML tags removed
+    assert "utm_source" not in clean_text  # Marketing parameters removed
+
+
+def test_url_content_size_limits():
+    """Test handling of different content sizes."""
+    url = "https://www.mozilla.org/en-US/about/manifesto/"  # Substantial page
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    clean_text = DATA_CLEANERS[".html"](raw_text)
+
+    # Size checks
+    assert len(clean_text) > 100  # Not too small
+    assert len(clean_text) < len(raw_text)  # Smaller than raw
+    assert len(clean_text.split()) > 50  # Has substantial content
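
For trying the new URL path outside of Streamlit, the following is a minimal standalone sketch of the same fetch-and-clean flow that the patch's process_url_content() helper in demo/app.py implements. It is not part of the patch: it assumes the document_to_podcast package is installed so DATA_CLEANERS can be imported, and the fetch_and_clean name, the example URL, and the 10-second timeout are illustrative additions.

    import requests
    from bs4 import BeautifulSoup

    from document_to_podcast.preprocessing import DATA_CLEANERS


    def fetch_and_clean(url: str) -> tuple[str, str]:
        # Fetch the page and fail fast on HTTP errors
        # (the timeout is an illustrative addition, not in the patch)
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Strip markup with BeautifulSoup, then reuse the demo's HTML cleaner
        raw_text = BeautifulSoup(response.text, "html.parser").get_text()
        return raw_text, DATA_CLEANERS[".html"](raw_text)


    if __name__ == "__main__":
        raw, clean = fetch_and_clean("https://example.com")
        print(f"raw: {len(raw)} chars, clean: {len(clean)} chars")
        print(clean[:200])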