Skip to content

Commit

Permalink
Adding tts container files and code
Browse files Browse the repository at this point in the history
  • Loading branch information
martinhen0 committed Oct 23, 2024
1 parent 3c101d3 commit d33b956
Show file tree
Hide file tree
Showing 6 changed files with 357 additions and 0 deletions.
25 changes: 25 additions & 0 deletions src/speech/tts/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the current directory contents into the container at /app
COPY . /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
espeak-ng \
libespeak-ng1 \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Make port 5000 available to the world outside this container
EXPOSE 5000

# Run ttssend.py when the container launches
CMD ["python", "tts_server.py"]

16 changes: 16 additions & 0 deletions src/speech/tts/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
version: '3.8'

services:
ttssend:
build: .
ports:
- "5000:5000"
environment:
- RECEIVER_IP=${RECEIVER_IP}
- RECEIVER_PORT=${RECEIVER_PORT}
- TTS_ENGINE=narakeet
- NARAKEET_API_KEY=${NARAKEET_API_KEY}
- CACHING_MAX_SIZE=100
volumes:
- ./cache:/cache

29 changes: 29 additions & 0 deletions src/speech/tts/narakeet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import requests
import time
import os
from loguru import logger


def narakeet(text,filename,api_key,voice='harry',speed=1):
url = f'https://api.narakeet.com/text-to-speech/mp3?voice={voice}&voice-speed={speed}'
options = {
'headers': {
'Accept': 'application/octet-stream',
'Content-Type': 'text/plain',
'x-api-key': api_key,
},
'data': text.encode('utf8')
}

start_time = time.time()
response = requests.post(url, **options)
if response.status_code != 200:
raise ValueError(f"Failed to generate TTS: {response.status_code} {response.text}")
end_time = time.time()
logger.info(f"TTS generated in {end_time - start_time} seconds")

with open(filename, 'wb') as f:
f.write(response.content)



6 changes: 6 additions & 0 deletions src/speech/tts/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Flask==2.1.0
pydub==0.25.1
requests==2.26.0
loguru==0.6.0
Werkzeug==2.0.3

115 changes: 115 additions & 0 deletions src/speech/tts/soundplayer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import socket
import struct
import pyaudio
import queue
import threading
import time

# Network parameters
RECEIVER_IP = '0.0.0.0'
RECEIVER_PORT = 42069
CHUNK_SIZE = 1024

# Audio parameters
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a stream using PulseAudio
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
output=True,
frames_per_buffer=CHUNK_SIZE,
output_device_index=6) # Use the PulseAudio device

# Create a TCP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind((RECEIVER_IP, RECEIVER_PORT))
sock.listen(1)


def receive_all(sock, n):
data = b''
while len(data) < n:
try:
packet = sock.recv(n - len(data))
if not packet:
return None
data += packet
except socket.timeout:
print("Receiving data timed out.")
return None
except socket.error as e:
print(f"Socket error: {e}")
return None
return data

def socket_thread(sound_queue):
while True:
sock.settimeout(30)
print("Waiting for connection...")
try:
connection, client_address = sock.accept()
connection.settimeout(5)
print(f"Connected to {client_address}")

while True:
try:
# Receive the length of the incoming data
length_data = receive_all(connection, 4)
if length_data is None:
print("Connection closed by client")
break

length = struct.unpack('!I', length_data)[0]
audio_data = receive_all(connection, length)

if audio_data is None:
print("Connection closed by client")
break

sound_queue.put(audio_data)

except socket.timeout:
print("Socket timeout while receiving data")
break
except socket.error as e:
print(f"Socket error: {e}")
break

except socket.timeout:
print("Timeout while waiting for connection")
except socket.error as e:
print(f"Socket error while accepting connection: {e}")

print("Disconnected. Waiting for new connection...")

sound_queue = queue.Queue()
threading.Thread(target=socket_thread, args=(sound_queue,)).start()


try:
while True:
audio_data = sound_queue.get()
if audio_data is None:
print("No audio data")
time.sleep(0.1)
continue

print(f"Received {len(audio_data)} bytes of audio data")
stream.write(audio_data)

except KeyboardInterrupt:
print("Streaming stopped.")
sock.close()

finally:
print("Stopping stream")
stream.stop_stream()
stream.close()
p.terminate()
sock.close()
166 changes: 166 additions & 0 deletions src/speech/tts/tts_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import os
import socket
import struct
import hashlib
from loguru import logger
from pydub import AudioSegment
from pydub.utils import make_chunks
import subprocess
import queue
import threading
from flask import jsonify, request, Flask
import time
from narakeet import narakeet

# Network parameters
RECEIVER_IP = os.environ.get('RECEIVER_IP')
RECEIVER_PORT = os.environ.get('RECEIVER_PORT')
TTS_ENGINE = os.environ.get('TTS_ENGINE')
NARAKEET_API_KEY = os.environ.get('NARAKEET_API_KEY')
CACHING_MAX_SIZE = os.environ.get('CACHING_MAX_SIZE')
# Validate environment variables
if not RECEIVER_IP:
raise ValueError("RECEIVER_IP environment variable is not set")

if not RECEIVER_PORT:
raise ValueError("RECEIVER_PORT environment variable is not set")
try:
RECEIVER_PORT = int(RECEIVER_PORT)
except ValueError:
raise ValueError("RECEIVER_PORT must be a valid integer")

if not TTS_ENGINE:
raise ValueError("TTS_ENGINE environment variable is not set")
if TTS_ENGINE not in ['narakeet', 'espeak']:
raise ValueError("TTS_ENGINE must be either 'narakeet' or 'espeak'")

if TTS_ENGINE == 'narakeet' and not NARAKEET_API_KEY:
raise ValueError("NARAKEET_API_KEY environment variable is not set, but TTS_ENGINE is set to 'narakeet'")

caching_directory = "/cache"

if CACHING_MAX_SIZE:
try:
CACHING_MAX_SIZE = int(CACHING_MAX_SIZE)
if CACHING_MAX_SIZE <= 0:
raise ValueError("CACHING_MAX_SIZE must be a positive integer")
except ValueError:
raise ValueError("CACHING_MAX_SIZE must be a valid positive integer")

logger.info(f"Caching enabled. Max cache size: {CACHING_MAX_SIZE} MB")
else:
logger.info("Caching disabled")



def generate_audio(text: str) -> AudioSegment:
if CACHING_MAX_SIZE:
sha256_hash = hashlib.sha256(text.encode()).hexdigest()
cache_file = os.path.join(caching_directory, f"{sha256_hash}.mp3")

if os.path.exists(cache_file):
logger.info(f"Cache hit for text: {text[:30]}...")
audio = AudioSegment.from_mp3(cache_file)
# Update the access time of the file
os.utime(cache_file, None)
return audio

logger.info(f"Cache miss for text: {text[:30]}...")

# Generate new audio
if TTS_ENGINE == 'espeak':
subprocess.run(['espeak-ng', '-v', 'en', '-s', '150', '-w', 'temp.mp3', text], check=True)
audio = AudioSegment.from_mp3('temp.mp3')
elif TTS_ENGINE == 'narakeet':
narakeet(text, 'temp.mp3', api_key=NARAKEET_API_KEY)
audio = AudioSegment.from_mp3('temp.mp3')
elif TTS_ENGINE == 'openai':
raise NotImplementedError("OpenAI TTS is not implemented yet")
# TODO: Implement OpenAI TTS
audio = AudioSegment.from_mp3('temp.mp3')


if CACHING_MAX_SIZE:
# Add the new file to the cache
audio.export(cache_file, format="mp3")
logger.info(f"Added new file to cache: {cache_file}")

# Check cache size and remove oldest files if necessary
cache_files = [os.path.join(caching_directory, f) for f in os.listdir(caching_directory) if f.endswith('.mp3')]
cache_files.sort(key=lambda x: os.path.getatime(x))

total_size = sum(os.path.getsize(f) for f in cache_files)
while total_size > CACHING_MAX_SIZE * 1024 * 1024: # Convert MB to bytes
oldest_file = cache_files.pop(0)
file_size = os.path.getsize(oldest_file)
os.remove(oldest_file)
total_size -= file_size
logger.info(f"Removed oldest file from cache: {oldest_file} (size: {file_size / 1024 / 1024:.2f} MB)")

logger.info(f"Current cache size: {total_size / 1024 / 1024:.2f} MB")

return audio


def generate_tts_thread(input_queue, output_queue):
while True:
text = input_queue.get()
print("Generating TTS")

audio = generate_audio(text)


print("TTS generated")

audio = audio.set_channels(2).set_frame_rate(44100).set_sample_width(2)

output_queue.put(audio)
print("TTS put in queue")


def audio_sender_thread(audio_queue:queue.Queue):
while True:
if not audio_queue.empty():
audio = audio_queue.get()
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect((RECEIVER_IP, RECEIVER_PORT))

print("Sending audio")
for chunk in make_chunks(audio, 1024):
print("Sending chunk")
raw_data = chunk.raw_data
sock.sendall(struct.pack('!I', len(raw_data)))
sock.sendall(raw_data)

sock.close()
else:
time.sleep(0.1)

audio_queue = queue.Queue()
tts_queue = queue.Queue()

threading.Thread(target=generate_tts_thread, args=(tts_queue, audio_queue)).start()
threading.Thread(target=audio_sender_thread, args=(audio_queue,)).start()


app = Flask(__name__)

@app.route('/tts', methods=['POST'])
def text_to_speech():
data = request.json
if 'text' not in data:
return jsonify({"error": "No text provided"}), 400

print("Got input: " + data['text'])
for sentence in data['text'].split('.'):
tts_queue.put(sentence)
return jsonify({"message": "Text received and processing started"}), 202

if __name__ == '__main__':
logger.info("Starting TTS server")
logger.info(f"RECEIVER_IP: {RECEIVER_IP}")
logger.info(f"RECEIVER_PORT: {RECEIVER_PORT}")
logger.info(f"TTS_ENGINE: {TTS_ENGINE}")
if CACHING_MAX_SIZE:
logger.info(f"CACHING_MAX_SIZE: {CACHING_MAX_SIZE}")
app.run(debug=True, port=5000, host='0.0.0.0')

0 comments on commit d33b956

Please sign in to comment.