generated from CogitoNTNU/README-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3c101d3
commit d33b956
Showing
6 changed files
with
357 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Use an official Python runtime as a parent image | ||
FROM python:3.9-slim | ||
|
||
# Set the working directory in the container | ||
WORKDIR /app | ||
|
||
# Copy the current directory contents into the container at /app | ||
COPY . /app | ||
|
||
# Install system dependencies | ||
RUN apt-get update && apt-get install -y \ | ||
espeak-ng \ | ||
libespeak-ng1 \ | ||
ffmpeg \ | ||
&& rm -rf /var/lib/apt/lists/* | ||
|
||
# Install any needed packages specified in requirements.txt | ||
RUN pip install --no-cache-dir -r requirements.txt | ||
|
||
# Make port 5000 available to the world outside this container | ||
EXPOSE 5000 | ||
|
||
# Run ttssend.py when the container launches | ||
CMD ["python", "tts_server.py"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
version: '3.8' | ||
|
||
services: | ||
ttssend: | ||
build: . | ||
ports: | ||
- "5000:5000" | ||
environment: | ||
- RECEIVER_IP=${RECEIVER_IP} | ||
- RECEIVER_PORT=${RECEIVER_PORT} | ||
- TTS_ENGINE=narakeet | ||
- NARAKEET_API_KEY=${NARAKEET_API_KEY} | ||
- CACHING_MAX_SIZE=100 | ||
volumes: | ||
- ./cache:/cache | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import requests | ||
import time | ||
import os | ||
from loguru import logger | ||
|
||
|
||
def narakeet(text,filename,api_key,voice='harry',speed=1): | ||
url = f'https://api.narakeet.com/text-to-speech/mp3?voice={voice}&voice-speed={speed}' | ||
options = { | ||
'headers': { | ||
'Accept': 'application/octet-stream', | ||
'Content-Type': 'text/plain', | ||
'x-api-key': api_key, | ||
}, | ||
'data': text.encode('utf8') | ||
} | ||
|
||
start_time = time.time() | ||
response = requests.post(url, **options) | ||
if response.status_code != 200: | ||
raise ValueError(f"Failed to generate TTS: {response.status_code} {response.text}") | ||
end_time = time.time() | ||
logger.info(f"TTS generated in {end_time - start_time} seconds") | ||
|
||
with open(filename, 'wb') as f: | ||
f.write(response.content) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Flask==2.1.0 | ||
pydub==0.25.1 | ||
requests==2.26.0 | ||
loguru==0.6.0 | ||
Werkzeug==2.0.3 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import socket | ||
import struct | ||
import pyaudio | ||
import queue | ||
import threading | ||
import time | ||
|
||
# Network parameters | ||
RECEIVER_IP = '0.0.0.0' | ||
RECEIVER_PORT = 42069 | ||
CHUNK_SIZE = 1024 | ||
|
||
# Audio parameters | ||
FORMAT = pyaudio.paInt16 | ||
CHANNELS = 2 | ||
RATE = 44100 | ||
|
||
# Initialize PyAudio | ||
p = pyaudio.PyAudio() | ||
|
||
# Open a stream using PulseAudio | ||
stream = p.open(format=FORMAT, | ||
channels=CHANNELS, | ||
rate=RATE, | ||
output=True, | ||
frames_per_buffer=CHUNK_SIZE, | ||
output_device_index=6) # Use the PulseAudio device | ||
|
||
# Create a TCP socket | ||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | ||
sock.bind((RECEIVER_IP, RECEIVER_PORT)) | ||
sock.listen(1) | ||
|
||
|
||
def receive_all(sock, n): | ||
data = b'' | ||
while len(data) < n: | ||
try: | ||
packet = sock.recv(n - len(data)) | ||
if not packet: | ||
return None | ||
data += packet | ||
except socket.timeout: | ||
print("Receiving data timed out.") | ||
return None | ||
except socket.error as e: | ||
print(f"Socket error: {e}") | ||
return None | ||
return data | ||
|
||
def socket_thread(sound_queue): | ||
while True: | ||
sock.settimeout(30) | ||
print("Waiting for connection...") | ||
try: | ||
connection, client_address = sock.accept() | ||
connection.settimeout(5) | ||
print(f"Connected to {client_address}") | ||
|
||
while True: | ||
try: | ||
# Receive the length of the incoming data | ||
length_data = receive_all(connection, 4) | ||
if length_data is None: | ||
print("Connection closed by client") | ||
break | ||
|
||
length = struct.unpack('!I', length_data)[0] | ||
audio_data = receive_all(connection, length) | ||
|
||
if audio_data is None: | ||
print("Connection closed by client") | ||
break | ||
|
||
sound_queue.put(audio_data) | ||
|
||
except socket.timeout: | ||
print("Socket timeout while receiving data") | ||
break | ||
except socket.error as e: | ||
print(f"Socket error: {e}") | ||
break | ||
|
||
except socket.timeout: | ||
print("Timeout while waiting for connection") | ||
except socket.error as e: | ||
print(f"Socket error while accepting connection: {e}") | ||
|
||
print("Disconnected. Waiting for new connection...") | ||
|
||
sound_queue = queue.Queue() | ||
threading.Thread(target=socket_thread, args=(sound_queue,)).start() | ||
|
||
|
||
try: | ||
while True: | ||
audio_data = sound_queue.get() | ||
if audio_data is None: | ||
print("No audio data") | ||
time.sleep(0.1) | ||
continue | ||
|
||
print(f"Received {len(audio_data)} bytes of audio data") | ||
stream.write(audio_data) | ||
|
||
except KeyboardInterrupt: | ||
print("Streaming stopped.") | ||
sock.close() | ||
|
||
finally: | ||
print("Stopping stream") | ||
stream.stop_stream() | ||
stream.close() | ||
p.terminate() | ||
sock.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
import os | ||
import socket | ||
import struct | ||
import hashlib | ||
from loguru import logger | ||
from pydub import AudioSegment | ||
from pydub.utils import make_chunks | ||
import subprocess | ||
import queue | ||
import threading | ||
from flask import jsonify, request, Flask | ||
import time | ||
from narakeet import narakeet | ||
|
||
# Network parameters | ||
RECEIVER_IP = os.environ.get('RECEIVER_IP') | ||
RECEIVER_PORT = os.environ.get('RECEIVER_PORT') | ||
TTS_ENGINE = os.environ.get('TTS_ENGINE') | ||
NARAKEET_API_KEY = os.environ.get('NARAKEET_API_KEY') | ||
CACHING_MAX_SIZE = os.environ.get('CACHING_MAX_SIZE') | ||
# Validate environment variables | ||
if not RECEIVER_IP: | ||
raise ValueError("RECEIVER_IP environment variable is not set") | ||
|
||
if not RECEIVER_PORT: | ||
raise ValueError("RECEIVER_PORT environment variable is not set") | ||
try: | ||
RECEIVER_PORT = int(RECEIVER_PORT) | ||
except ValueError: | ||
raise ValueError("RECEIVER_PORT must be a valid integer") | ||
|
||
if not TTS_ENGINE: | ||
raise ValueError("TTS_ENGINE environment variable is not set") | ||
if TTS_ENGINE not in ['narakeet', 'espeak']: | ||
raise ValueError("TTS_ENGINE must be either 'narakeet' or 'espeak'") | ||
|
||
if TTS_ENGINE == 'narakeet' and not NARAKEET_API_KEY: | ||
raise ValueError("NARAKEET_API_KEY environment variable is not set, but TTS_ENGINE is set to 'narakeet'") | ||
|
||
caching_directory = "/cache" | ||
|
||
if CACHING_MAX_SIZE: | ||
try: | ||
CACHING_MAX_SIZE = int(CACHING_MAX_SIZE) | ||
if CACHING_MAX_SIZE <= 0: | ||
raise ValueError("CACHING_MAX_SIZE must be a positive integer") | ||
except ValueError: | ||
raise ValueError("CACHING_MAX_SIZE must be a valid positive integer") | ||
|
||
logger.info(f"Caching enabled. Max cache size: {CACHING_MAX_SIZE} MB") | ||
else: | ||
logger.info("Caching disabled") | ||
|
||
|
||
|
||
def generate_audio(text: str) -> AudioSegment: | ||
if CACHING_MAX_SIZE: | ||
sha256_hash = hashlib.sha256(text.encode()).hexdigest() | ||
cache_file = os.path.join(caching_directory, f"{sha256_hash}.mp3") | ||
|
||
if os.path.exists(cache_file): | ||
logger.info(f"Cache hit for text: {text[:30]}...") | ||
audio = AudioSegment.from_mp3(cache_file) | ||
# Update the access time of the file | ||
os.utime(cache_file, None) | ||
return audio | ||
|
||
logger.info(f"Cache miss for text: {text[:30]}...") | ||
|
||
# Generate new audio | ||
if TTS_ENGINE == 'espeak': | ||
subprocess.run(['espeak-ng', '-v', 'en', '-s', '150', '-w', 'temp.mp3', text], check=True) | ||
audio = AudioSegment.from_mp3('temp.mp3') | ||
elif TTS_ENGINE == 'narakeet': | ||
narakeet(text, 'temp.mp3', api_key=NARAKEET_API_KEY) | ||
audio = AudioSegment.from_mp3('temp.mp3') | ||
elif TTS_ENGINE == 'openai': | ||
raise NotImplementedError("OpenAI TTS is not implemented yet") | ||
# TODO: Implement OpenAI TTS | ||
audio = AudioSegment.from_mp3('temp.mp3') | ||
|
||
|
||
if CACHING_MAX_SIZE: | ||
# Add the new file to the cache | ||
audio.export(cache_file, format="mp3") | ||
logger.info(f"Added new file to cache: {cache_file}") | ||
|
||
# Check cache size and remove oldest files if necessary | ||
cache_files = [os.path.join(caching_directory, f) for f in os.listdir(caching_directory) if f.endswith('.mp3')] | ||
cache_files.sort(key=lambda x: os.path.getatime(x)) | ||
|
||
total_size = sum(os.path.getsize(f) for f in cache_files) | ||
while total_size > CACHING_MAX_SIZE * 1024 * 1024: # Convert MB to bytes | ||
oldest_file = cache_files.pop(0) | ||
file_size = os.path.getsize(oldest_file) | ||
os.remove(oldest_file) | ||
total_size -= file_size | ||
logger.info(f"Removed oldest file from cache: {oldest_file} (size: {file_size / 1024 / 1024:.2f} MB)") | ||
|
||
logger.info(f"Current cache size: {total_size / 1024 / 1024:.2f} MB") | ||
|
||
return audio | ||
|
||
|
||
def generate_tts_thread(input_queue, output_queue): | ||
while True: | ||
text = input_queue.get() | ||
print("Generating TTS") | ||
|
||
audio = generate_audio(text) | ||
|
||
|
||
print("TTS generated") | ||
|
||
audio = audio.set_channels(2).set_frame_rate(44100).set_sample_width(2) | ||
|
||
output_queue.put(audio) | ||
print("TTS put in queue") | ||
|
||
|
||
def audio_sender_thread(audio_queue:queue.Queue): | ||
while True: | ||
if not audio_queue.empty(): | ||
audio = audio_queue.get() | ||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | ||
sock.connect((RECEIVER_IP, RECEIVER_PORT)) | ||
|
||
print("Sending audio") | ||
for chunk in make_chunks(audio, 1024): | ||
print("Sending chunk") | ||
raw_data = chunk.raw_data | ||
sock.sendall(struct.pack('!I', len(raw_data))) | ||
sock.sendall(raw_data) | ||
|
||
sock.close() | ||
else: | ||
time.sleep(0.1) | ||
|
||
audio_queue = queue.Queue() | ||
tts_queue = queue.Queue() | ||
|
||
threading.Thread(target=generate_tts_thread, args=(tts_queue, audio_queue)).start() | ||
threading.Thread(target=audio_sender_thread, args=(audio_queue,)).start() | ||
|
||
|
||
app = Flask(__name__) | ||
|
||
@app.route('/tts', methods=['POST']) | ||
def text_to_speech(): | ||
data = request.json | ||
if 'text' not in data: | ||
return jsonify({"error": "No text provided"}), 400 | ||
|
||
print("Got input: " + data['text']) | ||
for sentence in data['text'].split('.'): | ||
tts_queue.put(sentence) | ||
return jsonify({"message": "Text received and processing started"}), 202 | ||
|
||
if __name__ == '__main__': | ||
logger.info("Starting TTS server") | ||
logger.info(f"RECEIVER_IP: {RECEIVER_IP}") | ||
logger.info(f"RECEIVER_PORT: {RECEIVER_PORT}") | ||
logger.info(f"TTS_ENGINE: {TTS_ENGINE}") | ||
if CACHING_MAX_SIZE: | ||
logger.info(f"CACHING_MAX_SIZE: {CACHING_MAX_SIZE}") | ||
app.run(debug=True, port=5000, host='0.0.0.0') |