feat: add URL input support to streamlit demo (#80)
* feat: add URL input support to streamlit demo

- Add URL text input and clean button
- Implement URL content fetching and cleaning
- Add integration tests
- Update documentation
- Maintain same cleaning quality as file upload

Closes #32

* Add integration tests for URL input functionality

* test: improve URL input integration tests and update docs

Tests:
- Add content quality test
- Add size limit test
- Remove invalid content test
- Follow patterns from test_data_load_and_clean.py

Docs:
- Update README to include URL input option
- Clarify document preprocessing steps

* fix: restore file upload functionality and fix test formatting

---------

Co-authored-by: fivestarspicy <[email protected]>
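
As a quick illustration of the fetch-and-clean flow described in the commit message, a minimal standalone version might look like the sketch below. The helper name `fetch_and_clean` and the `timeout` argument are illustrative additions; the actual implementation, including error handling and the Streamlit UI, is in `demo/app.py` in the diff below.

```python
# Minimal sketch (not the committed code): fetch a page and clean it with the
# same HTML cleaner that file uploads use.
import requests
from bs4 import BeautifulSoup

from document_to_podcast.preprocessing import DATA_CLEANERS


def fetch_and_clean(url: str) -> str:
    response = requests.get(url, timeout=30)  # timeout added here for illustration
    response.raise_for_status()
    raw_text = BeautifulSoup(response.text, "html.parser").get_text()
    return DATA_CLEANERS[".html"](raw_text)
```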
fivestarspicy authored Jan 6, 2025
1 parent 8ebde29 commit 09faf73
Showing 4 changed files with 125 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -166,3 +166,4 @@ cython_debug/

# VS files
.vscode
.DS_Store
12 changes: 7 additions & 5 deletions README.md
@@ -63,13 +63,15 @@ Once the Codespaces environment launches, inside the terminal, start the Streaml
<img src="./images/document-to-podcast-diagram.png" width="1200" />


1. **Document Upload**
Start by uploading a document in a supported format (e.g., PDF, .txt, or .docx).
1. **Document Input**
Start by either:
- Uploading a document in a supported format (e.g., PDF, .txt, or .docx)
- Entering a website URL to fetch content directly

2. **Document Pre-Processing**
The uploaded document is processed to extract and clean the text. This involves:
- Extracting readable text from the document.
- Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured.
The input is processed to extract and clean the text. This involves:
- Extracting readable text from the document or webpage
- Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured

3. **Script Generation**
The cleaned text is passed to a language model to generate a podcast transcript in the form of a conversation between two speakers.
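
The noise removal described in step 2 is handled by the project's `DATA_CLEANERS` (used throughout the diff below). Purely as a sketch of that kind of cleanup, with illustrative regexes rather than the project's actual rules, a minimal cleaner might look like:

```python
import re


def remove_noise(text: str) -> str:
    """Illustrative cleanup: strip URLs, email addresses, and stray special characters."""
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)         # URLs
    text = re.sub(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b", " ", text)  # email addresses
    text = re.sub(r"[^\w\s.,;:!?'\"-]", " ", text)              # other special characters
    return re.sub(r"\s+", " ", text).strip()                    # collapse whitespace
```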
70 changes: 63 additions & 7 deletions demo/app.py
@@ -1,9 +1,14 @@
"""Streamlit app for converting documents to podcasts."""

import re
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
@@ -44,13 +49,12 @@ def gen_button_clicked():

st.title("Document To Podcast")

st.header("Uploading Data")
st.header("Upload a File")

uploaded_file = st.file_uploader(
"Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)


if uploaded_file is not None:
st.divider()
st.header("Loading and Cleaning Data")
@@ -78,6 +82,59 @@ def gen_button_clicked():
f"Number of characters after cleaning: {len(clean_text)}",
f"{clean_text[:500]} . . .",
)
st.session_state["clean_text"] = clean_text

st.divider()

st.header("Or Enter a Website URL")
url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
process_url = st.button("Clean URL Content")


def process_url_content(url: str) -> tuple[str, str]:
"""Fetch and clean content from a URL.
Args:
url: The URL to fetch content from
Returns:
tuple containing raw and cleaned text
"""
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
return raw_text, DATA_CLEANERS[".html"](raw_text)


if url and process_url:
try:
with st.spinner("Fetching and cleaning content..."):
raw_text, clean_text = process_url_content(url)
st.session_state["clean_text"] = clean_text

# Display results
col1, col2 = st.columns(2)
with col1:
st.subheader("Raw Text")
st.text_area(
"Number of characters before cleaning: " f"{len(raw_text)}",
f"{raw_text[:500]}...",
)
with col2:
st.subheader("Cleaned Text")
st.text_area(
"Number of characters after cleaning: " f"{len(clean_text)}",
f"{clean_text[:500]}...",
)
except RequestException as e:
st.error(f"Error fetching URL: {str(e)}")
except Exception as e:
st.error(f"Error processing content: {str(e)}")

# Second part - Podcast generation
if "clean_text" in st.session_state:
clean_text = st.session_state["clean_text"]

st.divider()
st.header("Downloading and Loading models")
@@ -86,6 +143,10 @@ def gen_button_clicked():
)
st.divider()

# Load models
text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

st.markdown(
"For this demo, we are using the following models: \n"
"- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
@@ -96,9 +157,6 @@ def gen_button_clicked():
" for more information on how to use different models."
)

text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
@@ -154,7 +212,6 @@ def gen_button_clicked():
voice_profile,
)
st.audio(speech, sample_rate=speech_model.audio_codec.sr)

st.session_state.audio.append(speech)
text = ""

@@ -172,5 +229,4 @@ def gen_button_clicked():
with open("script.txt", "w") as f:
st.session_state.script += "}"
f.write(st.session_state.script)

st.markdown("Script saved to disk!")
54 changes: 54 additions & 0 deletions tests/integration/test_url_input.py
@@ -0,0 +1,54 @@
from document_to_podcast.preprocessing import DATA_CLEANERS
import pytest
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


def test_url_content_cleaning():
"""Test basic URL content fetching and cleaning."""
url = "https://blog.mozilla.ai/introducing-blueprints-customizable-ai-workflows-for-developers/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
clean_text = DATA_CLEANERS[".html"](raw_text)

# Verify cleaning maintains same quality as file upload
assert len(clean_text) < len(raw_text) # Should remove HTML
assert "Mozilla" in clean_text # Should preserve key content


def test_url_error_handling():
"""Test handling of network errors."""
with pytest.raises(RequestException):
response = requests.get("https://nonexistent-url-that-should-fail.com")
response.raise_for_status()


def test_url_content_quality():
"""Test that cleaned URL content maintains expected quality."""
url = "https://blog.mozilla.org/en/mozilla/introducing-mozilla-ai-investing-in-trustworthy-ai/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
clean_text = DATA_CLEANERS[".html"](raw_text)

# Test content quality
assert "mozilla" in clean_text.lower() # Key terms preserved
assert "ai" in clean_text.lower() # Case-insensitive content check
assert "<html>" not in clean_text # HTML tags removed
assert "utm_source" not in clean_text # Marketing parameters removed


def test_url_content_size_limits():
"""Test handling of different content sizes."""
url = "https://www.mozilla.org/en-US/about/manifesto/" # Substantial page
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
clean_text = DATA_CLEANERS[".html"](raw_text)

# Size checks
assert len(clean_text) > 100 # Not too small
assert len(clean_text) < len(raw_text) # Smaller than raw
assert len(clean_text.split()) > 50 # Has substantial content
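
The tests above fetch live Mozilla pages, so they assume network access. One way to run just this file, sketched here assuming a standard pytest setup:

```python
# Run only the new URL-input integration tests (requires network access).
import pytest

raise SystemExit(pytest.main(["tests/integration/test_url_input.py", "-v"]))
```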
