Make output dirs customizable and enable parallel generation #120 #114 #121

Merged 1 commit on Oct 27, 2024
Makefile (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@ lint:
mypy podcastfy/*.py

test:
python3 -m pytest tests
poetry run pytest -n auto

doc-gen:
sphinx-apidoc -f -o ./docs/source ./podcastfy
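The new test target runs the suite in parallel with pytest-xdist (`-n auto` starts roughly one worker per available CPU), so tests that write output files must not collide on paths. Below is a minimal sketch of keeping per-worker outputs separate, assuming pytest-xdist is installed as a dev dependency; the fixture name and directory layout are illustrative and not part of this PR.

```python
# conftest.py sketch (illustrative, not part of this PR): give each
# pytest-xdist worker its own output directory so tests launched by
# `pytest -n auto` never write to the same files.
import os

import pytest


@pytest.fixture
def worker_output_dir(tmp_path):
    # pytest-xdist exports the worker id (e.g. "gw0") in this env var;
    # it is unset when tests run without -n, so fall back to "main".
    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "main")
    out_dir = tmp_path / f"output_{worker_id}"
    out_dir.mkdir()
    return out_dir
```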
podcastfy/client.py (40 changes: 17 additions & 23 deletions)
@@ -43,20 +43,6 @@ def process_content(
):
"""
Process URLs, a transcript file, image paths, or raw text to generate a podcast or transcript.

Args:
urls (Optional[List[str]]): A list of URLs to process.
transcript_file (Optional[str]): Path to a transcript file.
tts_model (str): The TTS model to use ('openai', 'elevenlabs' or 'edge'). Defaults to 'edge'.
generate_audio (bool): Whether to generate audio or just a transcript. Defaults to True.
config (Config): Configuration object to use. If None, default config will be loaded.
conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration.
image_paths (Optional[List[str]]): List of image file paths to process.
is_local (bool): Whether to use a local LLM. Defaults to False.
text (Optional[str]): Raw text input to be processed.

Returns:
Optional[str]: Path to the final podcast audio file, or None if only generating a transcript.
"""
try:
if config is None:
@@ -69,32 +55,36 @@ def process_content(
if conversation_config:
conv_config.configure(conversation_config)

# Get output directories from conversation config
tts_config = conv_config.get('text_to_speech', {})
output_directories = tts_config.get('output_directories', {})

if transcript_file:
logger.info(f"Using transcript file: {transcript_file}")
with open(transcript_file, "r") as file:
qa_content = file.read()
else:
content_generator = ContentGenerator(
api_key=config.GEMINI_API_KEY, conversation_config=conv_config.to_dict()
api_key=config.GEMINI_API_KEY,
conversation_config=conv_config.to_dict()
)

combined_content = ""

if urls:
logger.info(f"Processing {len(urls)} links")
content_extractor = ContentExtractor()
# Extract content from links
contents = [content_extractor.extract_content(link) for link in urls]
# Combine all extracted content
combined_content += "\n\n".join(contents)

if text:
combined_content += f"\n\n{text}"

# Generate Q&A content
# Generate Q&A content using output directory from conversation config
random_filename = f"transcript_{uuid.uuid4().hex}.txt"
transcript_filepath = os.path.join(
config.get("output_directories")["transcripts"], random_filename
output_directories.get("transcripts", "data/transcripts"),
random_filename
)
qa_content = content_generator.generate_qa_content(
combined_content,
@@ -105,15 +95,19 @@

if generate_audio:
api_key = None
# edge does not require an API key
if tts_model != "edge":
api_key = getattr(config, f"{tts_model.upper()}_API_KEY")

text_to_speech = TextToSpeech(model=tts_model, api_key=api_key, conversation_config=conv_config.to_dict())
# Convert text to speech using the specified model
text_to_speech = TextToSpeech(
model=tts_model,
api_key=api_key,
conversation_config=conv_config.to_dict()
)

random_filename = f"podcast_{uuid.uuid4().hex}.mp3"
audio_file = os.path.join(
config.get("output_directories")["audio"], random_filename
output_directories.get("audio", "data/audio"),
random_filename
)
text_to_speech.convert_to_speech(qa_content, audio_file)
logger.info(f"Podcast generated successfully using {tts_model} TTS model")
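For reference, the new lookup reads the output directories from the `text_to_speech` section of the conversation config and falls back to `data/transcripts` and `data/audio` when they are not set. A standalone sketch of that behavior follows; the config dict is hand-written here to stand in for `conv_config.to_dict()`.

```python
# Minimal sketch of the new output-directory resolution in client.py;
# the dict below stands in for conv_config.to_dict() and is illustrative.
import os
import uuid

conv_config = {
    "text_to_speech": {
        "output_directories": {
            "transcripts": "./data/transcripts",
            "audio": "./data/audio",
        }
    }
}

tts_config = conv_config.get("text_to_speech", {})
output_directories = tts_config.get("output_directories", {})

# Random filenames keep parallel runs from clobbering each other's files.
transcript_filepath = os.path.join(
    output_directories.get("transcripts", "data/transcripts"),
    f"transcript_{uuid.uuid4().hex}.txt",
)
audio_file = os.path.join(
    output_directories.get("audio", "data/audio"),
    f"podcast_{uuid.uuid4().hex}.mp3",
)
print(transcript_filepath)
print(audio_file)
```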
podcastfy/config.yaml (6 changes: 1 addition & 5 deletions)
@@ -1,7 +1,3 @@
output_directories:
transcripts: "./data/transcripts"
audio: "./data/audio"

content_generator:
gemini_model: "gemini-1.5-pro-latest"
max_output_tokens: 8192
@@ -46,4 +42,4 @@ website_extractor:
- 'aside'
- 'noscript'
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
timeout: 10 # Request timeout in seconds
timeout: 10 # Request timeout in seconds
podcastfy/conversation_config.yaml (5 changes: 4 additions & 1 deletion)
@@ -22,6 +22,9 @@ user_instructions: ""

text_to_speech:
default_tts_model: "edge"
output_directories:
transcripts: "./data/transcripts"
audio: "./data/audio"
elevenlabs:
default_voices:
question: "Chris"
@@ -38,4 +41,4 @@
answer: "en-US-EricNeural"
audio_format: "mp3"
temp_audio_dir: "data/audio/tmp/"
ending_message: "Bye Bye!"
ending_message: "Bye Bye!"
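With the directories now living under `text_to_speech` in the conversation config, callers can override them per run by passing a `conversation_config` dict. A hedged sketch is shown below, assuming `process_content` is importable from `podcastfy.client`, that `configure()` merges the passed dict over the YAML defaults, and that API keys (e.g. `GEMINI_API_KEY`) are already set up via the default config; the URL and paths are placeholders.

```python
# Sketch of overriding the customizable output directories from calling
# code; URL and directory names are placeholders, not part of this PR.
from podcastfy.client import process_content

custom_config = {
    "text_to_speech": {
        "output_directories": {
            "transcripts": "/tmp/my_podcasts/transcripts",
            "audio": "/tmp/my_podcasts/audio",
        }
    }
}

audio_path = process_content(
    urls=["https://example.com/article"],
    tts_model="edge",          # edge requires no API key
    generate_audio=True,
    conversation_config=custom_config,
)
print(audio_path)
```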