From 09faf73964c4d043613d9b043b8de850f75d946a Mon Sep 17 00:00:00 2001
From: fivestarspicy <25396141+fivestarspicy@users.noreply.github.com>
Date: Mon, 6 Jan 2025 01:59:32 -0800
Subject: [PATCH] feat: add URL input support to streamlit demo (#80)

* feat: add URL input support to streamlit demo

- Add URL text input and clean button
- Implement URL content fetching and cleaning
- Add integration tests
- Update documentation
- Maintain same cleaning quality as file upload

Closes #32

* Add integration tests for URL input functionality

* test: improve URL input integration tests and update docs

Tests:
- Add content quality test
- Add size limit test
- Remove invalid content test
- Follow patterns from test_data_load_and_clean.py

Docs:
- Update README to include URL input option
- Clarify document preprocessing steps

* fix: restore file upload functionality and fix test formatting

---------

Co-authored-by: fivestarspicy
---
 .gitignore                          |  1 +
 README.md                           | 12 ++---
 demo/app.py                         | 70 ++++++++++++++++++++++++++---
 tests/integration/test_url_input.py | 54 ++++++++++++++++++++++
 4 files changed, 125 insertions(+), 12 deletions(-)
 create mode 100644 tests/integration/test_url_input.py

diff --git a/.gitignore b/.gitignore
index ff3b4eb..f9e9dd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,3 +166,4 @@ cython_debug/
 
 # VS files
 .vscode
+.DS_Store
diff --git a/README.md b/README.md
index 0ea7348..67ef712 100644
--- a/README.md
+++ b/README.md
@@ -63,13 +63,15 @@ Once the Codespaces environment launches, inside the terminal, start the Streaml
 
 
 
-1. **Document Upload**
-   Start by uploading a document in a supported format (e.g., PDF, .txt, or .docx).
+1. **Document Input**
+   Start by either:
+   - Uploading a document in a supported format (e.g., PDF, .txt, or .docx)
+   - Entering a website URL to fetch content directly
 
 2. **Document Pre-Processing**
-   The uploaded document is processed to extract and clean the text. This involves:
-   - Extracting readable text from the document.
-   - Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured.
+   The input is processed to extract and clean the text. This involves:
+   - Extracting readable text from the document or webpage
+   - Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured
 
 3. **Script Generation**
    The cleaned text is passed to a language model to generate a podcast transcript in the form of a conversation between two speakers.
diff --git a/demo/app.py b/demo/app.py
index f3f904c..250b925 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -1,9 +1,14 @@
+"""Streamlit app for converting documents to podcasts."""
+
 import re
 from pathlib import Path
 
 import numpy as np
 import soundfile as sf
 import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
 
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -44,13 +49,12 @@ def gen_button_clicked():
 
 st.title("Document To Podcast")
 
-st.header("Uploading Data")
+st.header("Upload a File")
 
 uploaded_file = st.file_uploader(
     "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
 )
 
-
 if uploaded_file is not None:
     st.divider()
     st.header("Loading and Cleaning Data")
@@ -78,6 +82,59 @@ def gen_button_clicked():
             f"Number of characters after cleaning: {len(clean_text)}",
             f"{clean_text[:500]} . . .",
         )
+    st.session_state["clean_text"] = clean_text
+
+st.divider()
+
+st.header("Or Enter a Website URL")
+url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
+process_url = st.button("Clean URL Content")
+
+
+def process_url_content(url: str) -> tuple[str, str]:
+    """Fetch and clean content from a URL.
+
+    Args:
+        url: The URL to fetch content from
+
+    Returns:
+        tuple containing raw and cleaned text
+    """
+    response = requests.get(url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    return raw_text, DATA_CLEANERS[".html"](raw_text)
+
+
+if url and process_url:
+    try:
+        with st.spinner("Fetching and cleaning content..."):
+            raw_text, clean_text = process_url_content(url)
+            st.session_state["clean_text"] = clean_text
+
+            # Display results
+            col1, col2 = st.columns(2)
+            with col1:
+                st.subheader("Raw Text")
+                st.text_area(
+                    "Number of characters before cleaning: " f"{len(raw_text)}",
+                    f"{raw_text[:500]}...",
+                )
+            with col2:
+                st.subheader("Cleaned Text")
+                st.text_area(
+                    "Number of characters after cleaning: " f"{len(clean_text)}",
+                    f"{clean_text[:500]}...",
+                )
+    except RequestException as e:
+        st.error(f"Error fetching URL: {str(e)}")
+    except Exception as e:
+        st.error(f"Error processing content: {str(e)}")
+
+# Second part - Podcast generation
+if "clean_text" in st.session_state:
+    clean_text = st.session_state["clean_text"]
 
     st.divider()
     st.header("Downloading and Loading models")
@@ -86,6 +143,10 @@ def gen_button_clicked():
     )
     st.divider()
 
+    # Load models
+    text_model = load_text_to_text_model()
+    speech_model = load_text_to_speech_model()
+
     st.markdown(
         "For this demo, we are using the following models: \n"
         "- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
@@ -96,9 +157,6 @@ def gen_button_clicked():
         " for more information on how to use different models."
     )
 
-    text_model = load_text_to_text_model()
-    speech_model = load_text_to_speech_model()
-
     # ~4 characters per token is considered a reasonable default.
     max_characters = text_model.n_ctx() * 4
     if len(clean_text) > max_characters:
@@ -154,7 +212,6 @@ def gen_button_clicked():
                         voice_profile,
                     )
                 st.audio(speech, sample_rate=speech_model.audio_codec.sr)
-
                 st.session_state.audio.append(speech)
 
                 text = ""
@@ -172,5 +229,4 @@ def gen_button_clicked():
         with open("script.txt", "w") as f:
             st.session_state.script += "}"
             f.write(st.session_state.script)
-
         st.markdown("Script saved to disk!")
diff --git a/tests/integration/test_url_input.py b/tests/integration/test_url_input.py
new file mode 100644
index 0000000..491cad1
--- /dev/null
+++ b/tests/integration/test_url_input.py
@@ -0,0 +1,54 @@
+from document_to_podcast.preprocessing import DATA_CLEANERS
+import pytest
+import requests
+from bs4 import BeautifulSoup
+from requests.exceptions import RequestException
+
+
+def test_url_content_cleaning():
+    """Test basic URL content fetching and cleaning."""
+    url = "https://blog.mozilla.ai/introducing-blueprints-customizable-ai-workflows-for-developers/"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    clean_text = DATA_CLEANERS[".html"](raw_text)
+
+    # Verify cleaning maintains same quality as file upload
+    assert len(clean_text) < len(raw_text)  # Should remove HTML
+    assert "Mozilla" in clean_text  # Should preserve key content
+
+
+def test_url_error_handling():
+    """Test handling of network errors."""
+    with pytest.raises(RequestException):
+        response = requests.get("https://nonexistent-url-that-should-fail.com")
+        response.raise_for_status()
+
+
+def test_url_content_quality():
+    """Test that cleaned URL content maintains expected quality."""
+    url = "https://blog.mozilla.org/en/mozilla/introducing-mozilla-ai-investing-in-trustworthy-ai/"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    clean_text = DATA_CLEANERS[".html"](raw_text)
+
+    # Test content quality
+    assert "mozilla" in clean_text.lower()  # Key terms preserved
+    assert "ai" in clean_text.lower()  # Case-insensitive content check
+    assert "<html>" not in clean_text  # HTML tags removed
+    assert "utm_source" not in clean_text  # Marketing parameters removed
+
+
+def test_url_content_size_limits():
+    """Test handling of different content sizes."""
+    url = "https://www.mozilla.org/en-US/about/manifesto/"  # Substantial page
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    raw_text = soup.get_text()
+    clean_text = DATA_CLEANERS[".html"](raw_text)
+
+    # Size checks
+    assert len(clean_text) > 100  # Not too small
+    assert len(clean_text) < len(raw_text)  # Smaller than raw
+    assert len(clean_text.split()) > 50  # Has substantial content
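
For trying the new URL path outside of Streamlit, the following is a minimal standalone sketch of the same fetch-and-clean flow that the patch's process_url_content() helper in demo/app.py implements. It is not part of the patch: it assumes the document_to_podcast package is installed so DATA_CLEANERS can be imported, and the fetch_and_clean name, the example URL, and the 10-second timeout are illustrative additions.

    import requests
    from bs4 import BeautifulSoup

    from document_to_podcast.preprocessing import DATA_CLEANERS


    def fetch_and_clean(url: str) -> tuple[str, str]:
        # Fetch the page and fail fast on HTTP errors
        # (the timeout is an illustrative addition, not in the patch)
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Strip markup with BeautifulSoup, then reuse the demo's HTML cleaner
        raw_text = BeautifulSoup(response.text, "html.parser").get_text()
        return raw_text, DATA_CLEANERS[".html"](raw_text)


    if __name__ == "__main__":
        raw, clean = fetch_and_clean("https://example.com")
        print(f"raw: {len(raw)} chars, clean: {len(clean)} chars")
        print(clean[:200])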