-
Notifications
You must be signed in to change notification settings - Fork 8.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(podcast_generator): add new podcast generation tools
- Introduced podcast generator with text-to-speech functionality using OpenAI's API.
- Implemented credential validation for TTS services and API keys.
- Added support for generating podcast audio with alternating host voices.
- Included user-friendly setup with internationalized YAML configuration.
- Added SVG icon to enhance visual identification.
- Loading branch information
Showing
5 changed files
with
287 additions
and
0 deletions.
There are no files selected for viewing
24 changes: 24 additions & 0 deletions
24
api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
33 changes: 33 additions & 0 deletions
33
api/core/tools/provider/builtin/podcast_generator/podcast_generator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from typing import Any | ||
|
||
import openai | ||
|
||
from core.tools.errors import ToolProviderCredentialValidationError | ||
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController | ||
|
||
|
||
class PodcastGeneratorProvider(BuiltinToolProviderController):
    """Tool provider for podcast generation backed by a text-to-speech service.

    The credential check accepts only the ``openai`` TTS service; any other
    value is rejected as unsupported.
    """

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """Check that the configured TTS service and API key are usable.

        Args:
            credentials: Provider credentials; the keys ``tts_service`` and
                ``api_key`` are read.

        Raises:
            ToolProviderCredentialValidationError: If either value is missing
                or empty, if the service is not ``openai``, or if the key is
                rejected by the service-specific check.
        """
        tts_service = credentials.get("tts_service")
        api_key = credentials.get("api_key")

        if not tts_service:
            raise ToolProviderCredentialValidationError("TTS service is not specified")

        if not api_key:
            raise ToolProviderCredentialValidationError("API key is missing")

        if tts_service != "openai":
            raise ToolProviderCredentialValidationError(f"Unsupported TTS service: {tts_service}")

        self._validate_openai_credentials(api_key)

    def _validate_openai_credentials(self, api_key: str) -> None:
        """Validate an OpenAI API key by issuing a model-listing request.

        Args:
            api_key: The OpenAI API key to check.

        Raises:
            ToolProviderCredentialValidationError: If the request fails. The
                original exception is attached as ``__cause__`` so the
                underlying failure stays visible in tracebacks and logs.
        """
        client = openai.OpenAI(api_key=api_key)
        try:
            # A simple API call is enough to find out whether the key is accepted.
            client.models.list()
        except openai.AuthenticationError as e:
            raise ToolProviderCredentialValidationError("Invalid OpenAI API key") from e
        except Exception as e:
            # Any other failure is reported as a validation error as well,
            # with the underlying message included for diagnosis.
            raise ToolProviderCredentialValidationError(f"Error validating OpenAI API key: {e}") from e
34 changes: 34 additions & 0 deletions
34
api/core/tools/provider/builtin/podcast_generator/podcast_generator.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
identity: | ||
author: Dify | ||
name: podcast_generator | ||
label: | ||
en_US: Podcast Generator | ||
zh_Hans: 播客生成器 | ||
description: | ||
en_US: Generate podcast audio using Text-to-Speech services | ||
zh_Hans: 使用文字转语音服务生成播客音频 | ||
icon: icon.svg | ||
credentials_for_provider: | ||
tts_service: | ||
type: select | ||
required: true | ||
label: | ||
en_US: TTS Service | ||
zh_Hans: TTS 服务 | ||
placeholder: | ||
en_US: Select a TTS service | ||
zh_Hans: 选择一个 TTS 服务 | ||
options: | ||
- label: | ||
en_US: OpenAI TTS | ||
zh_Hans: OpenAI TTS | ||
value: openai | ||
api_key: | ||
type: secret-input | ||
required: true | ||
label: | ||
en_US: API Key | ||
zh_Hans: API 密钥 | ||
placeholder: | ||
en_US: Enter your TTS service API key | ||
zh_Hans: 输入您的 TTS 服务 API 密钥 |
101 changes: 101 additions & 0 deletions
101
api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import concurrent.futures | ||
import random | ||
import struct | ||
from typing import Any, Literal, Optional, Union | ||
|
||
import openai | ||
|
||
from core.tools.entities.tool_entities import ToolInvokeMessage | ||
from core.tools.errors import ToolParameterValidationError, ToolProviderCredentialValidationError | ||
from core.tools.tool.builtin_tool import BuiltinTool | ||
|
||
|
||
class PodcastAudioGeneratorTool(BuiltinTool):
    """Render a two-host podcast script to a single audio blob via OpenAI TTS.

    Each non-blank script line becomes one speech segment; even-indexed lines
    use the first host's voice and odd-indexed lines the second host's.
    Segments are generated concurrently and concatenated in script order with
    a randomly sized silence block between consecutive segments.
    """

    @staticmethod
    def _generate_silence(duration):
        """Return a pseudo-MP3 byte string standing in for ``duration`` seconds of silence.

        The result is a fixed 4-byte header followed by two zero bytes per
        sample at 44100 samples per second. A non-positive ``duration`` yields
        the header only.

        NOTE(review): this is zeroed 16-bit sample data behind a bare header,
        not an encoded MP3 stream, so players may not render it as the
        intended pause. For production use, consider a proper audio library
        or a pre-generated silence MP3.
        """
        sample_rate = 44100
        num_samples = max(int(duration * sample_rate), 0)
        # Two zero bytes per 16-bit sample. bytes(n) produces the same output as
        # packing num_samples zeros with struct, without building a format
        # string and an argument list of that length first.
        silence_data = bytes(2 * num_samples)

        # A very basic MP3 header (this is not a complete MP3 file).
        mp3_header = b"\xff\xfb\x90\x04"
        return mp3_header + silence_data

    @staticmethod
    def _generate_audio_segment(
        client: openai.OpenAI,
        line: str,
        voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
        index: int,
    ) -> tuple[int, Union[bytes, str], Optional[bytes]]:
        """Synthesize one script line and pair it with a trailing silence block.

        Args:
            client: OpenAI client used for the speech request.
            line: Script line to speak; surrounding whitespace is stripped.
            voice: Voice identifier passed to the speech request.
            index: Position of the line in the script, echoed back so the
                caller can restore ordering after concurrent execution.

        Returns:
            ``(index, audio_bytes, silence_bytes)`` on success, or
            ``(index, error_message, None)`` on failure. A ``str`` in the
            second slot is how the caller recognises a failed segment.
        """
        try:
            response = client.audio.speech.create(model="tts-1", voice=voice, input=line.strip())
            audio = response.content
            # Vary the pause length (2 to 5 seconds) between segments.
            silence_duration = random.uniform(2, 5)
            silence = PodcastAudioGeneratorTool._generate_silence(silence_duration)
            return index, audio, silence
        except Exception as e:
            # Deliberately broad: any failure is reported to the caller as an
            # error string instead of propagating out of the worker thread.
            return index, f"Error generating audio: {e}", None

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Generate the podcast audio for a script.

        Args:
            user_id: Identifier of the invoking user (not used here).
            tool_parameters: Reads ``script`` (newline-separated lines),
                ``host1_voice`` and ``host2_voice``.

        Returns:
            On success, a text message followed by a blob message holding the
            combined audio with MIME type ``audio/mpeg``. If any segment
            fails, a single text message carrying that segment's error.

        Raises:
            ToolParameterValidationError: If a host voice is missing or the
                script contains no non-blank line.
            ToolProviderCredentialValidationError: If the runtime, its
                credentials, or the API key is missing.
        """
        # Extract parameters; tolerate an explicit None for the script.
        script = tool_parameters.get("script") or ""
        host1_voice = tool_parameters.get("host1_voice")
        host2_voice = tool_parameters.get("host2_voice")

        # Split the script into its non-blank lines.
        script_lines = [line for line in script.split("\n") if line.strip()]

        # Ensure voices are provided.
        if not host1_voice or not host2_voice:
            raise ToolParameterValidationError("Host voices are required")

        # Get the OpenAI API key from the credentials.
        if not self.runtime or not self.runtime.credentials:
            raise ToolProviderCredentialValidationError("Tool runtime or credentials are missing")
        api_key = self.runtime.credentials.get("api_key")
        if not api_key:
            raise ToolProviderCredentialValidationError("OpenAI API key is missing")

        # Without any spoken line there is nothing to synthesize; fail loudly
        # instead of reporting success with an empty audio blob.
        if not script_lines:
            raise ToolParameterValidationError("Script must contain at least one non-empty line")

        client = openai.OpenAI(api_key=api_key)

        max_workers = 5
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Alternate voices line by line: even index -> host 1, odd -> host 2.
            futures = [
                executor.submit(
                    self._generate_audio_segment,
                    client,
                    line,
                    host1_voice if i % 2 == 0 else host2_voice,
                    i,
                )
                for i, line in enumerate(script_lines)
            ]

            # Results arrive in completion order; slot them by script index.
            audio_segments: list[Any] = [None] * len(script_lines)
            for future in concurrent.futures.as_completed(futures):
                index, audio, silence = future.result()
                if isinstance(audio, str):  # Error occurred
                    # Drop segments that have not started yet so no further
                    # TTS requests are made for output that will be discarded.
                    for pending in futures:
                        pending.cancel()
                    return self.create_text_message(audio)
                audio_segments[index] = (audio, silence)

        # Combine audio segments in script order, with silence between
        # consecutive segments but not after the last one.
        last_index = len(audio_segments) - 1
        parts: list[bytes] = []
        for i, (audio, silence) in enumerate(audio_segments):
            if audio:
                parts.append(audio)
            if i < last_index and silence:
                parts.append(silence)
        combined_audio = b"".join(parts)

        # Create a blob message with the combined audio.
        return [
            self.create_text_message("Audio generated successfully"),
            self.create_blob_message(
                blob=combined_audio,
                meta={"mime_type": "audio/mpeg"},
                save_as=self.VariableKey.AUDIO,
            ),
        ]
95 changes: 95 additions & 0 deletions
95
api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
identity: | ||
name: podcast_audio_generator | ||
author: Dify | ||
label: | ||
en_US: Podcast Audio Generator | ||
zh_Hans: 播客音频生成器 | ||
description: | ||
human: | ||
en_US: Generate a podcast audio file from a script with two alternating voices using OpenAI's TTS service. | ||
zh_Hans: 使用 OpenAI 的 TTS 服务,从包含两个交替声音的脚本生成播客音频文件。 | ||
llm: This tool converts a prepared podcast script into an audio file using OpenAI's Text-to-Speech service, with two specified voices for alternating hosts. | ||
parameters: | ||
- name: script | ||
type: string | ||
required: true | ||
label: | ||
en_US: Podcast Script | ||
zh_Hans: 播客脚本 | ||
human_description: | ||
en_US: A string containing alternating lines for two hosts, separated by newline characters. | ||
zh_Hans: 包含两位主持人交替台词的字符串,每行用换行符分隔。 | ||
llm_description: A string representing the script, with alternating lines for two hosts separated by newline characters. | ||
form: llm | ||
- name: host1_voice | ||
type: select | ||
required: true | ||
label: | ||
en_US: Host 1 Voice | ||
zh_Hans: 主持人1 音色 | ||
human_description: | ||
en_US: The voice for the first host. | ||
zh_Hans: 第一位主持人的音色。 | ||
llm_description: The voice identifier for the first host's voice. | ||
options: | ||
- label: | ||
en_US: Alloy | ||
zh_Hans: Alloy | ||
value: alloy | ||
- label: | ||
en_US: Echo | ||
zh_Hans: Echo | ||
value: echo | ||
- label: | ||
en_US: Fable | ||
zh_Hans: Fable | ||
value: fable | ||
- label: | ||
en_US: Onyx | ||
zh_Hans: Onyx | ||
value: onyx | ||
- label: | ||
en_US: Nova | ||
zh_Hans: Nova | ||
value: nova | ||
- label: | ||
en_US: Shimmer | ||
zh_Hans: Shimmer | ||
value: shimmer | ||
form: form | ||
- name: host2_voice | ||
type: select | ||
required: true | ||
label: | ||
en_US: Host 2 Voice | ||
zh_Hans: 主持人2 音色 | ||
human_description: | ||
en_US: The voice for the second host. | ||
zh_Hans: 第二位主持人的音色。 | ||
llm_description: The voice identifier for the second host's voice. | ||
options: | ||
- label: | ||
en_US: Alloy | ||
zh_Hans: Alloy | ||
value: alloy | ||
- label: | ||
en_US: Echo | ||
zh_Hans: Echo | ||
value: echo | ||
- label: | ||
en_US: Fable | ||
zh_Hans: Fable | ||
value: fable | ||
- label: | ||
en_US: Onyx | ||
zh_Hans: Onyx | ||
value: onyx | ||
- label: | ||
en_US: Nova | ||
zh_Hans: Nova | ||
value: nova | ||
- label: | ||
en_US: Shimmer | ||
zh_Hans: Shimmer | ||
value: shimmer | ||
form: form |