From bd46ff61970e36b79bf8437d43604328aad70a82 Mon Sep 17 00:00:00 2001 From: souzatharsis Date: Sat, 26 Oct 2024 22:53:09 -0300 Subject: [PATCH] Make output dirs customizable and enable parallel generation #120 #114 --- Makefile | 2 +- podcastfy/client.py | 40 +++--- podcastfy/config.yaml | 6 +- podcastfy/conversation_config.yaml | 5 +- podcastfy/text_to_speech.py | 218 +++++++++++++---------------- pyproject.toml | 1 + tests/test_client.py | 25 ++-- tests/test_generate_podcast.py | 74 ++++++---- 8 files changed, 185 insertions(+), 186 deletions(-) diff --git a/Makefile b/Makefile index 520e10d..80e6385 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ lint: mypy podcastfy/*.py test: - python3 -m pytest tests + poetry run pytest -n auto doc-gen: sphinx-apidoc -f -o ./docs/source ./podcastfy diff --git a/podcastfy/client.py b/podcastfy/client.py index 686d547..f827bae 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -43,20 +43,6 @@ def process_content( ): """ Process URLs, a transcript file, image paths, or raw text to generate a podcast or transcript. - - Args: - urls (Optional[List[str]]): A list of URLs to process. - transcript_file (Optional[str]): Path to a transcript file. - tts_model (str): The TTS model to use ('openai', 'elevenlabs' or 'edge'). Defaults to 'edge'. - generate_audio (bool): Whether to generate audio or just a transcript. Defaults to True. - config (Config): Configuration object to use. If None, default config will be loaded. - conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. - image_paths (Optional[List[str]]): List of image file paths to process. - is_local (bool): Whether to use a local LLM. Defaults to False. - text (Optional[str]): Raw text input to be processed. - - Returns: - Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. """ try: if config is None: @@ -69,13 +55,18 @@ def process_content( if conversation_config: conv_config.configure(conversation_config) + # Get output directories from conversation config + tts_config = conv_config.get('text_to_speech', {}) + output_directories = tts_config.get('output_directories', {}) + if transcript_file: logger.info(f"Using transcript file: {transcript_file}") with open(transcript_file, "r") as file: qa_content = file.read() else: content_generator = ContentGenerator( - api_key=config.GEMINI_API_KEY, conversation_config=conv_config.to_dict() + api_key=config.GEMINI_API_KEY, + conversation_config=conv_config.to_dict() ) combined_content = "" @@ -83,18 +74,17 @@ def process_content( if urls: logger.info(f"Processing {len(urls)} links") content_extractor = ContentExtractor() - # Extract content from links contents = [content_extractor.extract_content(link) for link in urls] - # Combine all extracted content combined_content += "\n\n".join(contents) if text: combined_content += f"\n\n{text}" - # Generate Q&A content + # Generate Q&A content using output directory from conversation config random_filename = f"transcript_{uuid.uuid4().hex}.txt" transcript_filepath = os.path.join( - config.get("output_directories")["transcripts"], random_filename + output_directories.get("transcripts", "data/transcripts"), + random_filename ) qa_content = content_generator.generate_qa_content( combined_content, @@ -105,15 +95,19 @@ def process_content( if generate_audio: api_key = None - # edge does not require an API key if tts_model != "edge": api_key = getattr(config, f"{tts_model.upper()}_API_KEY") - text_to_speech = TextToSpeech(model=tts_model, api_key=api_key, conversation_config=conv_config.to_dict()) - # Convert text to speech using the specified model + text_to_speech = TextToSpeech( + model=tts_model, + api_key=api_key, + conversation_config=conv_config.to_dict() + ) + random_filename = f"podcast_{uuid.uuid4().hex}.mp3" audio_file = os.path.join( - config.get("output_directories")["audio"], random_filename + output_directories.get("audio", "data/audio"), + random_filename ) text_to_speech.convert_to_speech(qa_content, audio_file) logger.info(f"Podcast generated successfully using {tts_model} TTS model") diff --git a/podcastfy/config.yaml b/podcastfy/config.yaml index 19ed0e2..dc20cb7 100644 --- a/podcastfy/config.yaml +++ b/podcastfy/config.yaml @@ -1,7 +1,3 @@ -output_directories: - transcripts: "./data/transcripts" - audio: "./data/audio" - content_generator: gemini_model: "gemini-1.5-pro-latest" max_output_tokens: 8192 @@ -46,4 +42,4 @@ website_extractor: - 'aside' - 'noscript' user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - timeout: 10 # Request timeout in seconds \ No newline at end of file + timeout: 10 # Request timeout in seconds diff --git a/podcastfy/conversation_config.yaml b/podcastfy/conversation_config.yaml index 8aad06f..b525f64 100644 --- a/podcastfy/conversation_config.yaml +++ b/podcastfy/conversation_config.yaml @@ -22,6 +22,9 @@ user_instructions: "" text_to_speech: default_tts_model: "edge" + output_directories: + transcripts: "./data/transcripts" + audio: "./data/audio" elevenlabs: default_voices: question: "Chris" @@ -38,4 +41,4 @@ text_to_speech: answer: "en-US-EricNeural" audio_format: "mp3" temp_audio_dir: "data/audio/tmp/" - ending_message: "Bye Bye!" \ No newline at end of file + ending_message: "Bye Bye!" diff --git a/podcastfy/text_to_speech.py b/podcastfy/text_to_speech.py index 7f34d66..bd91e3b 100644 --- a/podcastfy/text_to_speech.py +++ b/podcastfy/text_to_speech.py @@ -16,6 +16,7 @@ import os import re import openai +import tempfile from typing import List, Tuple, Optional, Union, Dict, Any logger = logging.getLogger(__name__) @@ -34,7 +35,10 @@ def __init__(self, model: str = 'edge', api_key: Optional[str] = None, conversat self.model = model.lower() self.config = load_config() self.conversation_config = load_conversation_config(conversation_config) - self.tts_config = self.conversation_config.get('text_to_speech') + self.tts_config = self.conversation_config.get('text_to_speech', {}) + + # Get output directories from conversation config + self.output_directories = self.tts_config.get('output_directories', {}) if self.model == 'elevenlabs': self.api_key = api_key or self.config.ELEVENLABS_API_KEY @@ -51,8 +55,17 @@ def __init__(self, model: str = 'edge', api_key: Optional[str] = None, conversat self.temp_audio_dir = self.tts_config.get('temp_audio_dir') self.ending_message = self.tts_config.get('ending_message') - # Create temp_audio_dir if it doesn't exist - if not os.path.exists(self.temp_audio_dir): + # Create output directories if they don't exist + transcripts_dir = self.output_directories.get('transcripts') + audio_dir = self.output_directories.get('audio') + + if transcripts_dir and not os.path.exists(transcripts_dir): + os.makedirs(transcripts_dir) + if audio_dir and not os.path.exists(audio_dir): + os.makedirs(audio_dir) + + # Create temp_audio_dir if it doesn't exist + if self.temp_audio_dir and not os.path.exists(self.temp_audio_dir): os.makedirs(self.temp_audio_dir) def __merge_audio_files(self, input_dir: str, output_file: str) -> None: @@ -78,6 +91,8 @@ def natural_sort_key(filename: str) -> List[Union[int, str]]: file_path = os.path.join(input_dir, file) combined += AudioSegment.from_file(file_path, format=self.audio_format) + # Ensure the output directory exists + os.makedirs(os.path.dirname(output_file), exist_ok=True) combined.export(output_file, format=self.audio_format) logger.info(f"Merged audio saved to {output_file}") except Exception as e: @@ -106,136 +121,102 @@ def convert_to_speech(self, text: str, output_file: str) -> None: self.__convert_to_speech_edge(cleaned_text, output_file) def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None: + """Convert text to speech using ElevenLabs.""" try: - qa_pairs = self.split_qa(text) - audio_files = [] - counter = 0 - for question, answer in qa_pairs: - question_audio = self.client.generate( - text=question, - voice=self.tts_config.get("elevenlabs").get("default_voices").get("question"), - model=self.tts_config.get("elevenlabs").get("model") - ) - answer_audio = self.client.generate( - text=answer, - voice=self.tts_config.get("elevenlabs").get("default_voices").get("answer"), - model=self.tts_config.get("elevenlabs").get("model") - ) - - # Save question and answer audio chunks - for audio in [question_audio, answer_audio]: - counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - with open(file_name, "wb") as out: - for chunk in audio: - if chunk: - out.write(chunk) - audio_files.append(file_name) - - # Merge all audio files and save the result - self.__merge_audio_files(self.temp_audio_dir, output_file) - - # Clean up individual audio files - for file in audio_files: - os.remove(file) - - logger.info(f"Audio saved to {output_file}") - - except Exception as e: - logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}") - raise - - def __convert_to_speech_openai(self, text: str, output_file: str) -> None: - try: - qa_pairs = self.split_qa(text) - print(qa_pairs) - audio_files = [] - counter = 0 - for question, answer in qa_pairs: - for speaker, content in [ - (self.tts_config.get("openai").get("default_voices").get("question"), question), - (self.tts_config.get("openai").get("default_voices").get("answer"), answer) - ]: - counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - response = openai.audio.speech.create( - model=self.tts_config.get("openai").get("model"), - voice=speaker, - input=content + with tempfile.TemporaryDirectory(dir=self.temp_audio_dir) as temp_dir: + qa_pairs = self.split_qa(text) + audio_files = [] + counter = 0 + + for question, answer in qa_pairs: + question_audio = self.client.generate( + text=question, + voice=self.tts_config.get("elevenlabs", {}).get("default_voices", {}).get("question"), + model=self.tts_config.get("elevenlabs", {}).get("model") + ) + answer_audio = self.client.generate( + text=answer, + voice=self.tts_config.get("elevenlabs", {}).get("default_voices", {}).get("answer"), + model=self.tts_config.get("elevenlabs", {}).get("model") ) - with open(file_name, "wb") as file: - file.write(response.content) - - audio_files.append(file_name) - # Merge all audio files and save the result - self.__merge_audio_files(self.temp_audio_dir, output_file) + for audio in [question_audio, answer_audio]: + counter += 1 + temp_file = os.path.join(temp_dir, f"{counter}.{self.audio_format}") + with open(temp_file, "wb") as out: + for chunk in audio: + if chunk: + out.write(chunk) + audio_files.append(temp_file) - # Clean up individual audio files - for file in audio_files: - os.remove(file) - - logger.info(f"Audio saved to {output_file}") + self.__merge_audio_files(temp_dir, output_file) + logger.info(f"Audio saved to {output_file}") except Exception as e: - logger.error(f"Error converting text to speech with OpenAI: {str(e)}") + logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}") raise - - def get_or_create_eventloop(): - try: - return asyncio.get_event_loop() - except RuntimeError as ex: - if "There is no current event loop in thread" in str(ex): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return asyncio.get_event_loop() - - import nest_asyncio # type: ignore - get_or_create_eventloop() - nest_asyncio.apply() - - def __convert_to_speech_edge(self, text: str, output_file: str) -> None: - """ - Convert text to speech using Edge TTS. - Args: - text (str): The input text to convert to speech. - output_file (str): The path to save the output audio file. - """ + def __convert_to_speech_openai(self, text: str, output_file: str) -> None: + """Convert text to speech using OpenAI.""" try: - qa_pairs = self.split_qa(text) - audio_files = [] - counter = 0 - - async def edge_tts_conversion(text_chunk: str, output_path: str, voice: str): - tts = edge_tts.Communicate(text_chunk, voice) - await tts.save(output_path) - return + with tempfile.TemporaryDirectory(dir=self.temp_audio_dir) as temp_dir: + qa_pairs = self.split_qa(text) + audio_files = [] + counter = 0 - async def process_qa_pairs(qa_pairs): - nonlocal counter - tasks = [] for question, answer in qa_pairs: for speaker, content in [ - (self.tts_config.get("edge").get("default_voices").get("question"), question), - (self.tts_config.get("edge").get("default_voices").get("answer"), answer) + (self.tts_config.get("openai", {}).get("default_voices", {}).get("question"), question), + (self.tts_config.get("openai", {}).get("default_voices", {}).get("answer"), answer) ]: counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - tasks.append(asyncio.ensure_future(edge_tts_conversion(content, file_name, speaker))) - audio_files.append(file_name) + temp_file = os.path.join(temp_dir, f"{counter}.{self.audio_format}") + response = openai.audio.speech.create( + model=self.tts_config.get("openai", {}).get("model"), + voice=speaker, + input=content + ) + with open(temp_file, "wb") as f: + f.write(response.content) + audio_files.append(temp_file) + + self.__merge_audio_files(temp_dir, output_file) + logger.info(f"Audio saved to {output_file}") - await asyncio.gather(*tasks) - - asyncio.run(process_qa_pairs(qa_pairs)) - - # Merge all audio files - self.__merge_audio_files(self.temp_audio_dir, output_file) + except Exception as e: + logger.error(f"Error converting text to speech with OpenAI: {str(e)}") + raise - # Clean up individual audio files - for file in audio_files: - os.remove(file) - logger.info(f"Audio saved to {output_file}") + def __convert_to_speech_edge(self, text: str, output_file: str) -> None: + """Convert text to speech using Edge TTS.""" + try: + with tempfile.TemporaryDirectory(dir=self.temp_audio_dir) as temp_dir: + qa_pairs = self.split_qa(text) + audio_files = [] + counter = 0 + + async def edge_tts_conversion(text_chunk: str, output_path: str, voice: str): + tts = edge_tts.Communicate(text_chunk, voice) + await tts.save(output_path) + + async def process_qa_pairs(qa_pairs): + nonlocal counter + tasks = [] + for question, answer in qa_pairs: + for speaker, content in [ + (self.tts_config.get("edge", {}).get("default_voices", {}).get("question"), question), + (self.tts_config.get("edge", {}).get("default_voices", {}).get("answer"), answer) + ]: + counter += 1 + temp_file = os.path.join(temp_dir, f"{counter}.{self.audio_format}") + tasks.append(asyncio.ensure_future(edge_tts_conversion(content, temp_file, speaker))) + audio_files.append(temp_file) + + await asyncio.gather(*tasks) + + asyncio.run(process_qa_pairs(qa_pairs)) + self.__merge_audio_files(temp_dir, output_file) + logger.info(f"Audio saved to {output_file}") except Exception as e: logger.error(f"Error converting text to speech with Edge: {str(e)}") @@ -350,4 +331,5 @@ def main(seed: int = 42) -> None: raise if __name__ == "__main__": - main(seed=42) \ No newline at end of file + main(seed=42) + diff --git a/pyproject.toml b/pyproject.toml index 47e0b80..8918463 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ types-pyyaml = "^6.0.12.20240917" nest-asyncio = "^1.6.0" ffmpeg = "^1.4" pytest = "^8.3.3" +pytest-xdist = "^3.6.1" [tool.poetry.group.dev.dependencies] diff --git a/tests/test_client.py b/tests/test_client.py index b7c22fd..75638e4 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -7,7 +7,6 @@ import re from typer.testing import CliRunner from podcastfy.client import app -from podcastfy.utils.config import load_config runner = CliRunner() @@ -66,14 +65,24 @@ def mock_files(tmp_path): @pytest.fixture def sample_config(): - config = load_config() - config.configure( - output_directories={ - 'audio': 'tests/data/audio', - 'transcripts': 'tests/data/transcripts' + """ + Fixture to provide a sample conversation configuration for testing. + + Returns: + dict: A dictionary containing sample conversation configuration parameters. + """ + conversation_config = { + "word_count": 300, + "text_to_speech": { + "output_directories": { + "transcripts": "tests/data/transcripts", + "audio": "tests/data/audio" + }, + "temp_audio_dir": "tests/data/audio/tmp", + "ending_message": "Bye Bye!" } - ) - return config + } + return conversation_config def test_generate_podcast_from_urls(sample_config): result = runner.invoke(app, ["--url", MOCK_URLS[0], "--url", MOCK_URLS[1], "--tts-model", "edge"]) diff --git a/tests/test_generate_podcast.py b/tests/test_generate_podcast.py index da95972..c0940eb 100644 --- a/tests/test_generate_podcast.py +++ b/tests/test_generate_podcast.py @@ -8,12 +8,11 @@ @pytest.fixture def sample_config(): config = load_config() - config.configure( - output_directories={ - 'audio': 'tests/data/audio', - 'transcripts': 'tests/data/transcripts' - } - ) + return config + +@pytest.fixture +def default_conversation_config(): + config = load_conversation_config() return config @pytest.fixture @@ -34,11 +33,29 @@ def sample_conversation_config(): "podcast_tagline": "Learning Through Conversation", "output_language": "English", "engagement_techniques": ["examples", "questions"], - "creativity": 0 + "creativity": 0, + "text_to_speech": { + "output_directories": { + "transcripts": "tests/data/transcriptsTEST", + "audio": "tests/data/audioTEST" + }, + "temp_audio_dir": "tests/data/audio/tmpTEST/", + "ending_message": "Bye Bye!" + } } return conversation_config -def test_generate_podcast_from_urls(sample_config): +@pytest.fixture(autouse=True) +def setup_test_directories(sample_conversation_config): + """Create test directories if they don't exist.""" + output_dirs = sample_conversation_config.get("text_to_speech", {}).get("output_directories", {}) + for directory in output_dirs.values(): + os.makedirs(directory, exist_ok=True) + temp_dir = sample_conversation_config.get("text_to_speech", {}).get("temp_audio_dir") + if temp_dir: + os.makedirs(temp_dir, exist_ok=True) + +def test_generate_podcast_from_urls(default_conversation_config): """Test generating a podcast from a list of URLs.""" urls = [ "https://en.wikipedia.org/wiki/Podcast", @@ -47,35 +64,34 @@ def test_generate_podcast_from_urls(sample_config): audio_file = generate_podcast( urls=urls, - tts_model="edge", - config=sample_config + tts_model="edge" ) assert audio_file is not None assert os.path.exists(audio_file) assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') - -def test_generate_transcript_only(sample_config): + assert os.path.dirname(audio_file) == default_conversation_config.get('text_to_speech', {}).get('output_directories', {}).get('audio') +def test_generate_transcript_only(default_conversation_config): """Test generating only a transcript without audio.""" urls = ["https://www.souzatharsis.com/"] result = generate_podcast( urls=urls, transcript_only=True, - tts_model="edge", - config=sample_config + tts_model="edge" ) assert result is not None assert os.path.exists(result) assert result.endswith('.txt') - assert os.path.dirname(result) == sample_config.get('output_directories', {}).get('transcripts') - -def test_generate_podcast_from_transcript_file(sample_config): + assert os.path.dirname(result) == default_conversation_config.get('text_to_speech', {}).get('output_directories', {}).get('transcripts') +def test_generate_podcast_from_transcript_file(sample_conversation_config): """Test generating a podcast from an existing transcript file.""" # First, generate a transcript - transcript_file = os.path.join(sample_config.get('output_directories', {}).get('transcripts'), 'test_transcript.txt') + transcript_file = os.path.join( + sample_conversation_config.get("text_to_speech", {}).get("output_directories", {}).get("transcripts"), + 'test_transcript.txt' + ) with open(transcript_file, 'w') as f: f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") @@ -83,13 +99,13 @@ def test_generate_podcast_from_transcript_file(sample_config): audio_file = generate_podcast( transcript_file=transcript_file, tts_model="edge", - config=sample_config + conversation_config=sample_conversation_config ) assert audio_file is not None assert os.path.exists(audio_file) assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + assert os.path.dirname(audio_file) == sample_conversation_config.get("text_to_speech", {}).get("output_directories", {}).get("audio") def test_generate_podcast_with_custom_config(sample_config, sample_conversation_config): """Test generating a podcast with a custom conversation config.""" @@ -104,7 +120,7 @@ def test_generate_podcast_with_custom_config(sample_config, sample_conversation_ assert audio_file is not None assert os.path.exists(audio_file) assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + assert os.path.dirname(audio_file) == sample_conversation_config["text_to_speech"]["output_directories"]["audio"] def test_generate_from_local_pdf(sample_config): """Test generating a podcast from a local PDF file.""" @@ -116,14 +132,13 @@ def test_generate_from_local_pdf(sample_config): assert audio_file is not None assert os.path.exists(audio_file) assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') def test_generate_podcast_no_urls_or_transcript(): """Test that an error is raised when no URLs or transcript file is provided.""" with pytest.raises(ValueError): generate_podcast() -def test_generate_podcast_from_images(sample_config): +def test_generate_podcast_from_images(sample_config, default_conversation_config): """Test generating a podcast from two input images.""" image_paths = [ "tests/data/images/Senecio.jpeg", @@ -139,14 +154,13 @@ def test_generate_podcast_from_images(sample_config): assert audio_file is not None assert os.path.exists(audio_file) assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') # Check if a transcript was generated - transcript_dir = sample_config.get('output_directories', {}).get('transcripts') + transcript_dir = default_conversation_config.get('text_to_speech', {}).get('output_directories', {}).get('transcripts') transcript_files = [f for f in os.listdir(transcript_dir) if f.startswith('transcript_') and f.endswith('.txt')] assert len(transcript_files) > 0 -def test_generate_podcast_from_raw_text(sample_config): +def test_generate_podcast_from_raw_text(sample_config, default_conversation_config): """Test generating a podcast from raw input text.""" raw_text = "The wonderful world of LLMs." @@ -159,9 +173,9 @@ def test_generate_podcast_from_raw_text(sample_config): assert audio_file is not None assert os.path.exists(audio_file) assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + assert os.path.dirname(audio_file) == default_conversation_config.get('text_to_speech', {}).get('output_directories', {}).get('audio') -def test_generate_transcript_with_user_instructions(sample_config): +def test_generate_transcript_with_user_instructions(sample_config, default_conversation_config): """Test generating a transcript with specific user instructions in the conversation config.""" url = "https://en.wikipedia.org/wiki/Artificial_intelligence" @@ -191,7 +205,7 @@ def test_generate_transcript_with_user_instructions(sample_config): assert result is not None assert os.path.exists(result) assert result.endswith('.txt') - assert os.path.dirname(result) == sample_config.get('output_directories', {}).get('transcripts') + assert os.path.dirname(result) == default_conversation_config.get('text_to_speech', {}).get('output_directories', {}).get('transcripts') # Read the generated transcript with open(result, 'r') as f: