feat: add URL input support to streamlit demo #80

Merged · 4 commits · Jan 6, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -166,3 +166,4 @@ cython_debug/

# VS files
.vscode
.DS_Store
12 changes: 7 additions & 5 deletions README.md
@@ -63,13 +63,15 @@ Once the Codespaces environment launches, inside the terminal, start the Streaml
<img src="./images/document-to-podcast-diagram.png" width="1200" />


1. **Document Upload**
Start by uploading a document in a supported format (e.g., PDF, .txt, or .docx).
1. **Document Input**
Start by either:
- Uploading a document in a supported format (e.g., PDF, .txt, or .docx)
- Entering a website URL to fetch content directly

2. **Document Pre-Processing**
The uploaded document is processed to extract and clean the text. This involves:
- Extracting readable text from the document.
- Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured.
The input is processed to extract and clean the text. This involves:
- Extracting readable text from the document or webpage
- Removing noise such as URLs, email addresses, and special characters to ensure the text is clean and structured

3. **Script Generation**
The cleaned text is passed to a language model to generate a podcast transcript in the form of a conversation between two speakers.
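As a rough illustration of the two input paths and the cleaning step described above, the sketch below strings together the pieces imported in `demo/app.py`. The exact `DATA_LOADERS` call signature for file input is an assumption (that part of the demo is collapsed in this diff); the URL path mirrors the helper added further down.

```python
# Minimal sketch of the flow described above; not the demo's exact code.
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS


def clean_from_file(path: str) -> str:
    """Load a local document and clean it (assumed loader signature)."""
    suffix = Path(path).suffix  # e.g. ".pdf", ".txt", ".docx"
    raw_text = DATA_LOADERS[suffix](path)
    return DATA_CLEANERS[suffix](raw_text)


def clean_from_url(url: str) -> str:
    """Fetch a web page and reuse the HTML cleaner, as the demo does."""
    response = requests.get(url)
    response.raise_for_status()
    raw_text = BeautifulSoup(response.text, "html.parser").get_text()
    return DATA_CLEANERS[".html"](raw_text)
```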
70 changes: 63 additions & 7 deletions demo/app.py
@@ -1,9 +1,14 @@
"""Streamlit app for converting documents to podcasts."""

import re
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
@@ -44,13 +49,12 @@ def gen_button_clicked():

st.title("Document To Podcast")

st.header("Uploading Data")
st.header("Upload a File")

uploaded_file = st.file_uploader(
"Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)


if uploaded_file is not None:
st.divider()
st.header("Loading and Cleaning Data")
@@ -78,6 +82,59 @@ def gen_button_clicked():
f"Number of characters after cleaning: {len(clean_text)}",
f"{clean_text[:500]} . . .",
)
st.session_state["clean_text"] = clean_text

st.divider()

st.header("Or Enter a Website URL")
url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
process_url = st.button("Clean URL Content")


def process_url_content(url: str) -> tuple[str, str]:
"""Fetch and clean content from a URL.

Args:
url: The URL to fetch content from

Returns:
tuple containing raw and cleaned text
"""
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
return raw_text, DATA_CLEANERS[".html"](raw_text)


if url and process_url:
try:
with st.spinner("Fetching and cleaning content..."):
raw_text, clean_text = process_url_content(url)
st.session_state["clean_text"] = clean_text

# Display results
col1, col2 = st.columns(2)
with col1:
st.subheader("Raw Text")
st.text_area(
"Number of characters before cleaning: " f"{len(raw_text)}",
f"{raw_text[:500]}...",
)
with col2:
st.subheader("Cleaned Text")
st.text_area(
"Number of characters after cleaning: " f"{len(clean_text)}",
f"{clean_text[:500]}...",
)
except RequestException as e:
st.error(f"Error fetching URL: {str(e)}")
except Exception as e:
st.error(f"Error processing content: {str(e)}")

# Second part - Podcast generation
if "clean_text" in st.session_state:
clean_text = st.session_state["clean_text"]

st.divider()
st.header("Downloading and Loading models")
@@ -86,6 +143,10 @@ def gen_button_clicked():
)
st.divider()

# Load models
text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

st.markdown(
"For this demo, we are using the following models: \n"
"- [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
@@ -96,9 +157,6 @@ def gen_button_clicked():
" for more information on how to use different models."
)

text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
@@ -154,7 +212,6 @@ def gen_button_clicked():
voice_profile,
)
st.audio(speech, sample_rate=speech_model.audio_codec.sr)

st.session_state.audio.append(speech)
text = ""

@@ -172,5 +229,4 @@ def gen_button_clicked():
with open("script.txt", "w") as f:
st.session_state.script += "}"
f.write(st.session_state.script)

st.markdown("Script saved to disk!")
54 changes: 54 additions & 0 deletions tests/integration/test_url_input.py
@@ -0,0 +1,54 @@
from document_to_podcast.preprocessing import DATA_CLEANERS
import pytest
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


def test_url_content_cleaning():
"""Test basic URL content fetching and cleaning."""
url = "https://blog.mozilla.ai/introducing-blueprints-customizable-ai-workflows-for-developers/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
clean_text = DATA_CLEANERS[".html"](raw_text)

# Verify cleaning maintains same quality as file upload
assert len(clean_text) < len(raw_text) # Should remove HTML
assert "Mozilla" in clean_text # Should preserve key content


def test_url_error_handling():
"""Test handling of network errors."""
with pytest.raises(RequestException):
response = requests.get("https://nonexistent-url-that-should-fail.com")
response.raise_for_status()


def test_url_content_quality():
"""Test that cleaned URL content maintains expected quality."""
url = "https://blog.mozilla.org/en/mozilla/introducing-mozilla-ai-investing-in-trustworthy-ai/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
clean_text = DATA_CLEANERS[".html"](raw_text)

# Test content quality
assert "mozilla" in clean_text.lower() # Key terms preserved
assert "ai" in clean_text.lower() # Case-insensitive content check
assert "<html>" not in clean_text # HTML tags removed
assert "utm_source" not in clean_text # Marketing parameters removed


def test_url_content_size_limits():
"""Test handling of different content sizes."""
url = "https://www.mozilla.org/en-US/about/manifesto/" # Substantial page
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
raw_text = soup.get_text()
clean_text = DATA_CLEANERS[".html"](raw_text)

# Size checks
assert len(clean_text) > 100 # Not too small
assert len(clean_text) < len(raw_text) # Smaller than raw
assert len(clean_text.split()) > 50 # Has substantial content
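The tests above fetch live mozilla.org / mozilla.ai pages, so they depend on network access and on those pages keeping their current content. If a deterministic variant were ever wanted, the HTTP call could be stubbed out; this is only a sketch with a made-up HTML snippet, not part of the PR:

```python
from unittest.mock import MagicMock, patch

import requests
from bs4 import BeautifulSoup

from document_to_podcast.preprocessing import DATA_CLEANERS


def test_url_content_cleaning_offline():
    """Same fetch-and-clean assertions, without touching the network."""
    fake_html = "<html><body><p>Mozilla builds trustworthy AI.</p></body></html>"
    fake_response = MagicMock(text=fake_html)
    fake_response.raise_for_status.return_value = None

    with patch("requests.get", return_value=fake_response):
        response = requests.get("https://example.invalid/post")  # stubbed call
        raw_text = BeautifulSoup(response.text, "html.parser").get_text()
        clean_text = DATA_CLEANERS[".html"](raw_text)

    assert "Mozilla" in clean_text
    assert "<html>" not in clean_text
```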