Skip to content

Commit

Permalink
feat(podcast_generator): add new podcast generation tools
Browse files Browse the repository at this point in the history
- Introduced podcast generator with text-to-speech functionality using OpenAI's API.
- Implemented credential validation for TTS services and API keys.
- Added support for generating podcast audio with alternating host voices.
- Included user-friendly setup with internationalized YAML configuration.
- Added SVG icon to enhance visual identification.
  • Loading branch information
laipz8200 committed Oct 14, 2024
1 parent ea18dd1 commit 885192d
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 0 deletions.
24 changes: 24 additions & 0 deletions api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import Any

import openai

from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController


class PodcastGeneratorProvider(BuiltinToolProviderController):
def _validate_credentials(self, credentials: dict[str, Any]) -> None:
tts_service = credentials.get("tts_service")
api_key = credentials.get("api_key")

if not tts_service:
raise ToolProviderCredentialValidationError("TTS service is not specified")

if not api_key:
raise ToolProviderCredentialValidationError("API key is missing")

if tts_service == "openai":
self._validate_openai_credentials(api_key)
else:
raise ToolProviderCredentialValidationError(f"Unsupported TTS service: {tts_service}")

def _validate_openai_credentials(self, api_key: str) -> None:
client = openai.OpenAI(api_key=api_key)
try:
# We're using a simple API call to validate the credentials
client.models.list()
except openai.AuthenticationError:
raise ToolProviderCredentialValidationError("Invalid OpenAI API key")
except Exception as e:
raise ToolProviderCredentialValidationError(f"Error validating OpenAI API key: {str(e)}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
identity:
author: Dify
name: podcast_generator
label:
en_US: Podcast Generator
zh_Hans: 播客生成器
description:
en_US: Generate podcast audio using Text-to-Speech services
zh_Hans: 使用文字转语音服务生成播客音频
icon: icon.svg
credentials_for_provider:
tts_service:
type: select
required: true
label:
en_US: TTS Service
zh_Hans: TTS 服务
placeholder:
en_US: Select a TTS service
zh_Hans: 选择一个 TTS 服务
options:
- label:
en_US: OpenAI TTS
zh_Hans: OpenAI TTS
value: openai
api_key:
type: secret-input
required: true
label:
en_US: API Key
zh_Hans: API 密钥
placeholder:
en_US: Enter your TTS service API key
zh_Hans: 输入您的 TTS 服务 API 密钥
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import concurrent.futures
import random
import struct
from typing import Any, Literal, Optional, Union

import openai

from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.errors import ToolParameterValidationError, ToolProviderCredentialValidationError
from core.tools.tool.builtin_tool import BuiltinTool


class PodcastAudioGeneratorTool(BuiltinTool):
@staticmethod
def _generate_silence(duration):
# Generate silent MP3 data
# This is a simplified version and may not work perfectly with all MP3 players
# For production use, consider using a proper audio library or pre-generated silence MP3
sample_rate = 44100
num_samples = int(duration * sample_rate)
silence_data = struct.pack("<" + "h" * num_samples, *([0] * num_samples))

# Add a simple MP3 header (this is not a complete MP3 file, but might work for basic needs)
mp3_header = b"\xff\xfb\x90\x04" # A very basic MP3 header
return mp3_header + silence_data

@staticmethod
def _generate_audio_segment(
client: openai.OpenAI,
line: str,
voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
index: int,
) -> tuple[int, Union[bytes, str], Optional[bytes]]:
try:
response = client.audio.speech.create(model="tts-1", voice=voice, input=line.strip())
audio = response.content
silence_duration = random.uniform(2, 5)
silence = PodcastAudioGeneratorTool._generate_silence(silence_duration)
return index, audio, silence
except Exception as e:
return index, f"Error generating audio: {str(e)}", None

def _invoke(
self, user_id: str, tool_parameters: dict[str, Any]
) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
# Extract parameters
script = tool_parameters.get("script", "")
host1_voice = tool_parameters.get("host1_voice")
host2_voice = tool_parameters.get("host2_voice")

# Split the script into lines
script_lines = [line for line in script.split("\n") if line.strip()]

# Ensure voices are provided
if not host1_voice or not host2_voice:
raise ToolParameterValidationError("Host voices are required")

# Get OpenAI API key from credentials
if not self.runtime or not self.runtime.credentials:
raise ToolProviderCredentialValidationError("Tool runtime or credentials are missing")
api_key = self.runtime.credentials.get("api_key")
if not api_key:
raise ToolProviderCredentialValidationError("OpenAI API key is missing")

# Initialize OpenAI client
client = openai.OpenAI(api_key=api_key)

# Create a thread pool
max_workers = 5
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for i, line in enumerate(script_lines):
voice = host1_voice if i % 2 == 0 else host2_voice
future = executor.submit(self._generate_audio_segment, client, line, voice, i)
futures.append(future)

# Collect results
audio_segments: list[Any] = [None] * len(script_lines)
for future in concurrent.futures.as_completed(futures):
index, audio, silence = future.result()
if isinstance(audio, str): # Error occurred
return self.create_text_message(audio)
audio_segments[index] = (audio, silence)

# Combine audio segments in the correct order
combined_audio = b""
for i, (audio, silence) in enumerate(audio_segments):
if audio:
combined_audio += audio
if i < len(audio_segments) - 1 and silence:
combined_audio += silence

# Create a blob message with the combined audio
return [
self.create_text_message("Audio generated successfully"),
self.create_blob_message(
blob=combined_audio,
meta={"mime_type": "audio/mpeg"},
save_as=self.VariableKey.AUDIO,
),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
identity:
name: podcast_audio_generator
author: Dify
label:
en_US: Podcast Audio Generator
zh_Hans: 播客音频生成器
description:
human:
en_US: Generate a podcast audio file from a script with two alternating voices using OpenAI's TTS service.
zh_Hans: 使用 OpenAI 的 TTS 服务,从包含两个交替声音的脚本生成播客音频文件。
llm: This tool converts a prepared podcast script into an audio file using OpenAI's Text-to-Speech service, with two specified voices for alternating hosts.
parameters:
- name: script
type: string
required: true
label:
en_US: Podcast Script
zh_Hans: 播客脚本
human_description:
en_US: A string containing alternating lines for two hosts, separated by newline characters.
zh_Hans: 包含两位主持人交替台词的字符串,每行用换行符分隔。
llm_description: A string representing the script, with alternating lines for two hosts separated by newline characters.
form: llm
- name: host1_voice
type: select
required: true
label:
en_US: Host 1 Voice
zh_Hans: 主持人1 音色
human_description:
en_US: The voice for the first host.
zh_Hans: 第一位主持人的音色。
llm_description: The voice identifier for the first host's voice.
options:
- label:
en_US: Alloy
zh_Hans: Alloy
value: alloy
- label:
en_US: Echo
zh_Hans: Echo
value: echo
- label:
en_US: Fable
zh_Hans: Fable
value: fable
- label:
en_US: Onyx
zh_Hans: Onyx
value: onyx
- label:
en_US: Nova
zh_Hans: Nova
value: nova
- label:
en_US: Shimmer
zh_Hans: Shimmer
value: shimmer
form: form
- name: host2_voice
type: select
required: true
label:
en_US: Host 2 Voice
zh_Hans: 主持人2 音色
human_description:
en_US: The voice for the second host.
zh_Hans: 第二位主持人的音色。
llm_description: The voice identifier for the second host's voice.
options:
- label:
en_US: Alloy
zh_Hans: Alloy
value: alloy
- label:
en_US: Echo
zh_Hans: Echo
value: echo
- label:
en_US: Fable
zh_Hans: Fable
value: fable
- label:
en_US: Onyx
zh_Hans: Onyx
value: onyx
- label:
en_US: Nova
zh_Hans: Nova
value: nova
- label:
en_US: Shimmer
zh_Hans: Shimmer
value: shimmer
form: form

0 comments on commit 885192d

Please sign in to comment.