diff --git a/.gitignore b/.gitignore index adf61a7..e0890d0 100644 --- a/.gitignore +++ b/.gitignore @@ -147,5 +147,6 @@ dmypy.json #wav files *.wav +*.webm #calender json /core/tools/calendarjson diff --git a/core/Dockerfile b/core/Dockerfile index a006035..2bc5266 100644 --- a/core/Dockerfile +++ b/core/Dockerfile @@ -13,9 +13,6 @@ RUN pip install --no-cache-dir -r requirements.txt # Copy the current directory contents into the container at /app COPY . . -# Make port 8000 available to the world outside this container -EXPOSE 3001 - # Run app.py when the container launches CMD ["python","-u", "main.py"] #CMD ["gunicorn", "--worker-class", "eventlet", "-w", "1", "-b", "0.0.0.0:8000", "app:app"] \ No newline at end of file diff --git a/core/main.py b/core/main.py index aec49c6..fcf54f1 100644 --- a/core/main.py +++ b/core/main.py @@ -10,9 +10,11 @@ import asyncio from modules.user_data_setup import check_folders from modules.chat import read_chat +import requests import logging log = logging.getLogger('werkzeug') -log.setLevel(logging.ERROR) #INFO, DEBUG, WARNING, ERROR, or CRITICAL - config as needed during development. +log.setLevel(logging.ERROR) +from time import sleep from collections import defaultdict # @@ -50,7 +52,7 @@ def hello_world(): # Route to get metadata like name, id, descriptions of all user chats @app.route("/chats/metadata") def get_chats(): - return "lmao" + return "lmao" # Why does this return lmao? @app.route('/vectorize_chat', methods=['POST']) def summarize_store(): @@ -133,6 +135,56 @@ async def run_and_store(): print(f'Something very bad happened: {e}') return jsonify({"status": "error"}) +# Custom event. Fired when the user click the button with the cute little microphone icon. +@app.route('/start_recording', methods=['POST']) +def start_recording_route(): + data = request.json + conversation_id = data.get('conversation_id') + + print("Starting recording...") + + # Send POST request to the recorder to start recording + headers = {'Content-Type': 'application/json'} + response = requests.post(f'http://speech-to-text:3001/start_recording/{conversation_id}', headers=headers, json=data) + + if response.status_code != 200: + return jsonify({"status": "error", "text": "Failed to start recording"}), 500 + + return jsonify({"status": "recording_started"}), 200 + + +@socketio.on('start_recording') +def start_recording_socket(data): + # This function handles the socket event to start recording + conversation_id = data.get('conversation_id') + + print("Starting recording via socket...") + + # Send POST request to the recorder to start recording + headers = {'Content-Type': 'application/json'} + response = requests.post(f'http://speech-to-text:3001/start_recording/{conversation_id}', headers=headers, json=data) + + if response.status_code != 200: + socketio.emit('recording_failed', {"status": "error", "text": "Failed to start recording"}) + return + + socketio.emit('recording_started', {"status": "recording_started"}) + +@app.route('/recording_completed', methods=['POST']) +def recording_completed(): + data = request.json + text = data.get('text', '') + socketio.emit("recording", text) + + conversation_id = data.get('conversation_id', '') + print(f"Recording completed for conversation ID {conversation_id} with text:", text) + + # Process the recorded text as needed (e.g., send to Jarvis or other services) + asyncio.run(jarvis.run(text, socketio)) # Assuming jarvis.run is asynchronous + + return jsonify({"status": "success"}), 200 + + @socketio.on('get_chat_history') def 
get_chat_history(): session_id = request.sid @@ -143,4 +195,5 @@ def get_chat_history(): if __name__ == '__main__': socketio.run(app, debug=True, host='0.0.0.0', port=PORT, allow_unsafe_werkzeug=True) -# hello \ No newline at end of file +# hello +# TODO say hello back to whoever wrote this \ No newline at end of file diff --git a/core/static/chat.js b/core/static/chat.js index edd9acd..366e68d 100644 --- a/core/static/chat.js +++ b/core/static/chat.js @@ -26,6 +26,16 @@ sendMessage = () => { } } +addRecordedMessage = (message) => { + let chat_history = document.getElementById("chat_history") + if (message != "") { + addUserMessage(marked.parse(message)) + chat_history.scrollTop = chat_history.scrollHeight; + } + +} + + addStreamedChunk = (messagePart) => { if(state.activeAIMessage){ state.activeAIMessage.innerHTML += messagePart; // Append to innertext of the message diff --git a/core/static/index.css b/core/static/index.css index 0e72545..94ab354 100644 --- a/core/static/index.css +++ b/core/static/index.css @@ -100,6 +100,14 @@ body { font-size: 24px; margin-left: 12px; } +#voice_button_recording { + width: 10%; + height: 9vh; + background-color: #673636; + border-radius: 10px; + font-size: 24px; + +} .chat_input_container{ display: flex; diff --git a/core/static/index.html b/core/static/index.html index 701e97d..5f650cc 100644 --- a/core/static/index.html +++ b/core/static/index.html @@ -8,6 +8,7 @@ + diff --git a/core/static/index.js b/core/static/index.js index 6d2e8df..61b3e7a 100644 --- a/core/static/index.js +++ b/core/static/index.js @@ -5,25 +5,23 @@ Main js file for loading the dynamic UI elements. */ // Runs on inital startup, after window (html) has finished loading -init = () => { - document.getElementById("send_button").addEventListener("click", sendMessage); - document.getElementById("clear_log").addEventListener("click", clear_log); - - document.querySelector(".chatHistory").innerHTML += chatHistoryList(); - - // To hide settings page when clicking somewhere else after it's opened. - document.addEventListener("click", function (event) { - const settings = document.getElementById("settingsPage"); - const settingsButton = document.getElementById("settingsButton"); - if ( - !settings.contains(event.target) && - !settingsButton.contains(event.target) && - settings.style.display == "block" - ) { - settingsPage(); - } - }); -}; + init = () => { + document.getElementById('send_button').addEventListener('click', sendMessage) + document.getElementById('clear_log').addEventListener('click', clear_log) + + document.getElementById('voice_button').addEventListener('click', startRecording) + + document.querySelector(".chatHistory").innerHTML += chatHistoryList() + + // To hide settings page when clicking somewhere else after it's opened. 
+ document.addEventListener('click', function(event){ + const settings = document.getElementById("settingsPage"); + const settingsButton = document.getElementById("settingsButton"); + if(!settings.contains(event.target) && !settingsButton.contains(event.target) && settings.style.display=="block") { + settingsPage() + } + }); +} window.onload = init; // global state of the UI @@ -107,6 +105,17 @@ async function addToolResponseToProcessContainer(toolResponse) { let processesContainer = document.querySelector(".processesContainer"); processesContainer.scrollTop = processesContainer.scrollHeight; } +async function addStreamedRecording(uuid, messagePart) { + let element = document.getElementById(uuid); + + if (element == null) { + await addRecordedMessage(messagePart, uuid); + element = document.getElementById(uuid); + } else { + // Append the chunk to the message with this uuid + element.innerHTML += messagePart; + } +} addUserMessage = (message) => { let html = /*html*/ ` diff --git a/core/static/recording.js b/core/static/recording.js new file mode 100644 index 0000000..9298f9b --- /dev/null +++ b/core/static/recording.js @@ -0,0 +1,7 @@ +startRecording = () => { + document.getElementById('voice_button').style.backgroundColor = "#673636"; // Change button color to indicate recording + document.getElementById('voice_button').disabled = true; // Disable the button while recording + const payload = {conversation_id: state.activeConversationId} + let res = socket.emit('start_recording', payload) + console.log("Recording started"); +} \ No newline at end of file diff --git a/core/static/socketEvents.js b/core/static/socketEvents.js index 1a30438..39573aa 100644 --- a/core/static/socketEvents.js +++ b/core/static/socketEvents.js @@ -30,6 +30,19 @@ socket.on("chunk", async (chunk) => { await addStreamedMessage(uuid, chunk); }); +socket.on("recording", async (recording)=>{ + if(!state.activeAIMessage){ + console.log("RECEIVED MESSAGE") + document.getElementById('voice_button').style.backgroundColor = ""; // Reset button color now that recording is done + document.getElementById('voice_button').disabled = false; // Re-enable the button after recording + uuid = generateUUID(); + await addStreamedRecording(uuid, ""); + ai_message = document.getElementById(uuid) + state.activeAIMessage = ai_message + } + await addStreamedRecording(uuid, recording ); +}) + socket.on("tokens", async (tokens) => { state.totalTokensUsed += tokens; console.log("Total tokens so far:", state.totalTokensUsed); diff --git a/docker-compose.yml b/docker-compose.yml index 8587781..455a5de 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,5 @@ +version: '2.1' + services: llm-service: build: ./core @@ -21,7 +23,26 @@ services: stop_signal: SIGINT ports: - "3000:3000" + deploy: + resources: + limits: + cpus: '0.5' + memory: 2048M # Memory limit for the container + speech-to-text: + build: ./speechToText + restart: unless-stopped + environment: + FLASK_ENV: ${FLASK_ENV} # Auto-restarts Flask when code changes are detected + OPENAI_API_KEY: ${OPENAI_API_KEY} + PORT_STT: ${PORT_STT} + volumes: + - ./speechToText:/app # Mount the application code to detect live changes + networks: + - backend + stop_signal: SIGINT + ports: + - "3001:3001" networks: backend: diff --git a/docs/images/enter_server_url.png b/docs/images/enter_server_url.png deleted file mode 100644 index 7b82d56..0000000 Binary files a/docs/images/enter_server_url.png and /dev/null differ diff --git a/docs/images/select_kernel.png b/docs/images/select_kernel.png deleted file mode 100644
index f83e9ba..0000000 Binary files a/docs/images/select_kernel.png and /dev/null differ diff --git a/speech/Dockerfile b/speech/Dockerfile deleted file mode 100644 index e69de29..0000000 diff --git a/speech/requirements.txt b/speech/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/speechToText/Dockerfile b/speechToText/Dockerfile new file mode 100644 index 0000000..a5e501e --- /dev/null +++ b/speechToText/Dockerfile @@ -0,0 +1,32 @@ +# Use an official Python runtime as a parent image +FROM python:3.10-bookworm + +# Set the working directory in the container +WORKDIR /app +RUN apt-get update && apt-get install -y \ + portaudio19-dev \ + libasound2-dev \ + libpulse-dev \ + sox \ + libsox-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y ffmpeg + + + + + + +# Copy only requrirements to keep cache. +COPY requirements.txt requirements.txt + +# Install any needed packages specified in requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the current directory contents into the container at /app +COPY . . + +# Run app.py when the container launches +CMD ["python","-u", "main.py"] +#CMD ["gunicorn", "--worker-class", "eventlet", "-w", "1", "-b", "0.0.0.0:8000", "app:app"] \ No newline at end of file diff --git a/src/speech/README.md b/speechToText/README.md similarity index 100% rename from src/speech/README.md rename to speechToText/README.md diff --git a/src/__init__.py b/speechToText/__init__.py similarity index 100% rename from src/__init__.py rename to speechToText/__init__.py diff --git a/src/speech/audioProcessor.py b/speechToText/audioProcessor.py similarity index 68% rename from src/speech/audioProcessor.py rename to speechToText/audioProcessor.py index 74c44db..057a621 100644 --- a/src/speech/audioProcessor.py +++ b/speechToText/audioProcessor.py @@ -1,99 +1,114 @@ -import numpy as np -import wave -from scipy.io import wavfile -import noisereduce as nr -from silero_vad import load_silero_vad, get_speech_timestamps, read_audio -from scipy.signal import resample - - - - - -class AudioProcessor: - def __init__(self, audio_path=None): - if audio_path is not None: - self.load_audio(audio_path) - else: - self.audio_path = None - self.sr = None - self.audio = None - - def load_audio(self, audio_path): - self.audio_path = audio_path - self.audio = read_audio(audio_path) - - # so far, very shit - def reduce_noise(self): - reduced_noise = nr.reduce_noise(y=self.audio, sr=16000) - scaled_audio = np.int16(reduced_noise * 32767) - self.audio = scaled_audio - - - def save_audio(self, output_path): - wavfile.write(output_path, 16000, self.audio) - - def remove_silence(self, output_path=None): - model = load_silero_vad() - wav = read_audio(self.audio_path) - speech_timestamps = get_speech_timestamps(wav, model) - speech_segments = [wav[segment['start']:segment['end']] for segment in speech_timestamps] - combined_speech = np.concatenate(speech_segments) - - if output_path is not None: - wavfile.write(output_path, 16000, combined_speech) - else: - self.audio = combined_speech - - - def boost_audio(self, factor=1.5): - boosted_audio = self.audio * factor - self.audio= np.clip(boosted_audio, -32768, 32767).astype(np.int16) - - def verify_audio_long_enough(self, length=16000): - if len(self.audio) < length: - return False - return True - - def process(self): - self.remove_silence() - #self.reduce_noise() - #self.boost_audio() - - return self.verify_audio_long_enough() - -# for debugging -if __name__ == "__main__": - - 
import sys - boosting = False - remove_silence = False - if len(sys.argv) >= 4 and sys.argv[1] == "boost": - boosting = True - processor = AudioProcessor(audio_path=sys.argv[2]) - - - boosted_audio = processor.boost_audio(float(sys.argv[3])) - processor.save_audio("boosted.wav") - - elif len(sys.argv) >= 4 and sys.argv[1] == "silence": - remove_silence = True - processor = AudioProcessor(audio_path=sys.argv[2]) - processor.remove_silence(sys.argv[3]) - - if not boosting and not remove_silence: - - if len(sys.argv) < 3: - print("Usage: python AudioProcessor.py ") - sys.exit(1) - - input_path = sys.argv[1] - output_path = sys.argv[2] - - # Initialize processor - processor = AudioProcessor(audio_path=input_path) - audio_enough = processor.process() - if audio_enough == False: - print("Audio too short") - else: - processor.save_audio(output_path) - +import numpy as np +import wave +from scipy.io import wavfile +import noisereduce as nr +from silero_vad import load_silero_vad, get_speech_timestamps, read_audio +import subprocess + +class AudioProcessor: + def __init__(self, audio_path=None): + if audio_path is not None: + self.load_audio(audio_path) + else: + self.audio_path = None + self.sr = None + self.audio = None + + def load_audio(self, audio_path): + self.audio_path = audio_path + self.audio = read_audio(audio_path) + + # so far, very shit + def reduce_noise(self): + reduced_noise = nr.reduce_noise(y=self.audio, sr=16000) + scaled_audio = np.int16(reduced_noise * 32767) + self.audio = scaled_audio + + def save_audio(self, output_path): + wavfile.write(output_path, 16000, self.audio) + + def remove_silence(self, output_path=None): + model = load_silero_vad() + wav = read_audio(self.audio_path) + speech_timestamps = get_speech_timestamps(wav, model) + if not speech_timestamps: + print("No speech detected in the audio.") + else: + speech_segments = [wav[segment['start']:segment['end']] for segment in speech_timestamps] + combined_speech = np.concatenate(speech_segments) + + if output_path is not None: + wavfile.write(output_path, 16000, combined_speech) + else: + self.audio = combined_speech + + def boost_audio(self, factor=1.5): + boosted_audio = self.audio * factor + self.audio = np.clip(boosted_audio, -32768, 32767).astype(np.int16) + + def verify_audio_long_enough(self, length=16000): + if len(self.audio) < length: + return False + return True + + def process(self): + self.remove_silence() + #self.reduce_noise() + #self.boost_audio() + + return self.verify_audio_long_enough() + + +def convert_webm_to_wav(input_file, output_file_path): + """ + Convert a .webm file to .wav using ffmpeg.
+ """ + try: + # Run ffmpeg command to convert the .webm file to .wav + result = subprocess.run([ + 'ffmpeg', '-i', input_file, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '2', output_file_path + ], capture_output=True, text=True) + + if result.returncode != 0: + print("FFmpeg error:", result.stderr) + print(f"Successfully converted {input_file} to {output_file_path}") + except subprocess.CalledProcessError as e: + print(f"Error during ffmpeg conversion: {e}") + raise e +# for debugging +if __name__ == "__main__": + + import sys + boosting = False + remove_silence = False + if len(sys.argv) >= 4 and sys.argv[1] == "boost": + boosting = True + processor = AudioProcessor(audio_path=sys.argv[2]) + + + boosted_audio = processor.boost_audio(float(sys.argv[3])) + processor.save_audio("boosted.wav") + + elif len(sys.argv) >= 4 and sys.argv[1] == "silence": + remove_silence = True + processor = AudioProcessor(audio_path=sys.argv[2]) + processor.remove_silence(sys.argv[3]) + + if not boosting and not remove_silence: + + if len(sys.argv) < 3: + print("Usage: python AudioProcessor.py ") + sys.exit(1) + + input_path = sys.argv[1] + output_path = sys.argv[2] + + # Initialize processor + processor = AudioProcessor(audio_path=input_path) + audio_enough = processor.process() + if audio_enough == False: + print("Audio too short") + else: + processor.save_audio(output_path) + diff --git a/src/speech/audioRecorder.py b/speechToText/audioRecorder.py similarity index 81% rename from src/speech/audioRecorder.py rename to speechToText/audioRecorder.py index c7be29c..d68f24f 100644 --- a/src/speech/audioRecorder.py +++ b/speechToText/audioRecorder.py @@ -1,75 +1,91 @@ -import pyaudio -import time -import numpy as np -import sys - - -class AudioRecorder: - def __init__(self, chunk_size=1024, rate=16000, channels=1, silence_threshold=500, max_silence_duration=5): - self.chunk_size = chunk_size - self.rate = rate - self.channels = channels - self.silence_threshold = silence_threshold - self.max_silence_duration = max_silence_duration - - self.audio_chunks = [] - - def start_recording(self): - self.audio = pyaudio.PyAudio() - self.stream = self.audio.open(format=pyaudio.paInt16, channels=self.channels, rate=self.rate, input=True, frames_per_buffer=self.chunk_size) - self.audio_chunks = [] - print("Recording started") - - - def stop_recording(self): - self.stream.stop_stream() - self.stream.close() - print("Recording stopped") - - def silent(self, data): - audio_data = np.frombuffer(data, dtype=np.int16) - if np.isnan(audio_data).any() or np.mean(audio_data**2) <= 0: - return True - rms = np.sqrt(np.mean(audio_data ** 2)) - # sys.stdout.write(f"\rRMS: {rms}") - # sys.stdout.flush() - return rms < self.silence_threshold - - def record(self, MERGE_SIZE=60, min_frames_with_sound=20): - self.start_recording() - - self.audio_chunks = [] - silence_start = None - frames_with_sound = 0 - MERGE_SIZE_UPPER = MERGE_SIZE * 3 - - try: - while True: - data = self.stream.read(self.chunk_size) - self.audio_chunks.append(data) - - if self.silent(data): - - if silence_start is None: - silence_start = time.time() - - if len(self.audio_chunks) >= MERGE_SIZE and time.time() - silence_start > 0.2 or len(self.audio_chunks) >= MERGE_SIZE_UPPER: - if frames_with_sound > min_frames_with_sound: - yield b''.join(self.audio_chunks) - frames_with_sound = 0 - self.audio_chunks = [] - - - elif time.time() - silence_start > self.max_silence_duration: - print(f"Silence detected for more than {self.max_silence_duration} seconds, stopping 
recording.") - break - else: - silence_start = None - frames_with_sound += 1 - - except KeyboardInterrupt: - print("Recording stopped by user.") - finally: - self.stop_recording() - - return self.audio_chunks +import pyaudio +import time +import numpy as np + +''' + chunk_size: + rate: + channels: Stereo or mono? + silence_threshold: + max_silence_duration: +''' + + +class AudioRecorder: + def __init__(self, chunk_size=1024, rate=16000, channels=1, silence_threshold=500, max_silence_duration=5): + self.chunk_size = chunk_size + self.rate = rate + self.channels = channels + self.silence_threshold = silence_threshold + self.max_silence_duration = max_silence_duration + + self.audio_chunks = [] + + def start_recording(self): + self.audio = pyaudio.PyAudio() + print("Available audio devices:") + for i in range(self.audio.get_device_count()): + device = self.audio.get_device_info_by_index(i) + print(f"Device {i}: {device['name']}") + self.stream = self.audio.open( + format=pyaudio.paInt16, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.chunk_size + ) + self.audio_chunks = [] + print("Recording started") + + def stop_recording(self): + self.stream.stop_stream() + self.stream.close() + print("Recording stopped") + + def silent(self, data): + audio_data = np.frombuffer(data, dtype=np.int16) + if np.isnan(audio_data).any() or np.mean(audio_data**2) <= 0: + return True + rms = np.sqrt(np.mean(audio_data ** 2)) + # sys.stdout.write(f"\rRMS: {rms}") + # sys.stdout.flush() + return rms < self.silence_threshold + + def record(self, MERGE_SIZE=60, min_frames_with_sound=20): + self.start_recording() + + self.audio_chunks = [] + silence_start = None + frames_with_sound = 0 + MERGE_SIZE_UPPER = MERGE_SIZE * 3 + + try: + while True: + data = self.stream.read(self.chunk_size) + self.audio_chunks.append(data) + + if self.silent(data): + + if silence_start is None: + silence_start = time.time() + + if len(self.audio_chunks) >= MERGE_SIZE and time.time() - silence_start > 0.2 or len(self.audio_chunks) >= MERGE_SIZE_UPPER: + if frames_with_sound > min_frames_with_sound: + yield b''.join(self.audio_chunks) + frames_with_sound = 0 + self.audio_chunks = [] + + + elif time.time() - silence_start > self.max_silence_duration: + print(f"Silence detected for more than {self.max_silence_duration} seconds, stopping recording.") + break + else: + silence_start = None + frames_with_sound += 1 + + except KeyboardInterrupt: + print("Recording stopped by user.") + finally: + self.stop_recording() + + return self.audio_chunks diff --git a/speechToText/config.py b/speechToText/config.py new file mode 100644 index 0000000..c8bb7bf --- /dev/null +++ b/speechToText/config.py @@ -0,0 +1,22 @@ +import os +from dotenv import load_dotenv +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +PORT_STT = os.getenv("PORT_STT") +PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY") + +#add langsmith api to env as LANGSMITH_API_KEY = "your_api_key" on EU server +LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY", "no_key") + +os.environ["LANGCHAIN_TRACING_V2"] = "true" +os.environ["LANGCHAIN_ENDPOINT"] = "https://eu.api.smith.langchain.com" +try: + os.environ["LANGCHAIN_API_KEY"] = LANGSMITH_API_KEY +except: + print("No langsmith key found") + +if __name__ == "__main__": + print(f"[INFO] OPENAI_API_KEY: {OPENAI_API_KEY}") + if(LANGSMITH_API_KEY): + print(f"[INFO] LANGSMITH_API_KEY: {LANGSMITH_API_KEY}") \ No newline at end of file diff --git a/speechToText/main.py b/speechToText/main.py new file 
mode 100644 index 0000000..e03b16e --- /dev/null +++ b/speechToText/main.py @@ -0,0 +1,94 @@ +from flask import Flask, request, url_for, jsonify +from flask_socketio import SocketIO, send, emit +from flask_cors import CORS +from config import PORT_STT +import asyncio +import speech_to_text +import requests +import os +from audioProcessor import convert_webm_to_wav +import subprocess + + + +app = Flask(__name__, static_url_path='/static') +app.config['SECRET_KEY'] = 'secret_key_xdddd' # TODO: Make a better key +CORS(app, resources={r"/*": {"origins": "*"}}) # TODO: Make the CORS actually not accept everything +socketio = SocketIO(app, cors_allowed_origins="*") + + +# Routes +@app.route("/") +def hello_world(): + return app.send_static_file('index.html') + + +@app.route('/start_recording/', methods=['POST']) +def start_recording(conversation_id): + print(f"Recording triggered for conversation ID: {conversation_id}") + + try: + # Emit 'start_recording' event to clients + socketio.emit('start_recording', {'conversation_id': conversation_id}) + + return jsonify({"status": "success", "message": "Recording started on client"}), 200 + + except Exception as e: + print(f'Error starting recording: {e}') + return jsonify({"status": "error", "message": "Failed to start recording"}), 500 + + +@app.route('/upload_audio', methods=['POST']) + +def upload_audio(): + audio_file = request.files['audio'] + print(f"Received audio file: {audio_file.filename}") + file_path = os.path.join("uploads", f"{audio_file.filename}") + wav_file_path = file_path.rsplit('.', 1)[0] + '.wav' + + # Save the audio file if it's in the expected format + if audio_file.filename.endswith('.webm'): + # Save the original .webm file + audio_file.save(file_path) + try: + # Convert the .webm file to .wav + if os.path.exists(wav_file_path): + os.remove(wav_file_path) + convert_webm_to_wav(file_path, wav_file_path) + file_path = wav_file_path # Now the path will point to the converted .wav file + except Exception as e: + print(f"Error converting webm to wav: {e}") + return jsonify({"status": "error", "message": str(e)}), 500 + else: + # For other file types, save directly + audio_file.save(file_path) + + # Process the converted .wav file (speech-to-text, etc.) 
+ try: + text_result = speech_to_text.speech_to_text(filepath=wav_file_path) + except Exception as e: + print(f"Error during speech-to-text processing: {e}") + return jsonify({"status": "error", "message": "Failed to process audio"}), 500 + + # Send a POST request to notify main service about recording completion + conversation_id = request.form.get("conversation_id") + try: + response = requests.post(f'http://llm-service:3000/recording_completed', json={ + "text": text_result, + "conversation_id": conversation_id + }) + response.raise_for_status() # Ensure the request was successful + except requests.exceptions.RequestException as e: + print(f"Error notifying LLM service: {e}") + return jsonify({"status": "error", "message": "Failed to notify LLM service"}), 500 + + # Return the result to the client + return jsonify({"status": "success", "message": text_result}), 200 + + + +# Routes end +if __name__ == '__main__': + if not os.path.exists('uploads'): + os.makedirs('uploads') + socketio.run(app, debug=True, host='0.0.0.0', port=PORT_STT, allow_unsafe_werkzeug=True) \ No newline at end of file diff --git a/speechToText/requirements.txt b/speechToText/requirements.txt new file mode 100644 index 0000000..e84dcd1 Binary files /dev/null and b/speechToText/requirements.txt differ diff --git a/src/speech/speech_to_text.py b/speechToText/speech_to_text.py similarity index 61% rename from src/speech/speech_to_text.py rename to speechToText/speech_to_text.py index 84179cf..2fabf78 100644 --- a/src/speech/speech_to_text.py +++ b/speechToText/speech_to_text.py @@ -1,104 +1,116 @@ -from openai import OpenAI -import os -from dotenv import load_dotenv -import numpy as np -import wave -from audioRecorder import AudioRecorder -from audioProcessor import AudioProcessor -import time -from threading import Thread - -load_dotenv() -client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - - -def create_tmp_wav_file(chunk, rate=16000, channels=1, path="tmp.wav"): - with wave.open(path, 'wb') as wav_file: - wav_file.setnchannels(channels) # Mono audio - wav_file.setsampwidth(2) # 2 bytes per sample, assuming 16-bit audio - wav_file.setframerate(rate) # Assuming 16kHz sample rate - wav_file.writeframes(chunk) - -def remove_tmp_wav_file(index=None): - if index is not None: - if os.path.exists(f"tmp{index}.wav"): - os.remove(f"tmp{index}.wav") - else: - if os.path.exists("tmp.wav"): - os.remove("tmp.wav") - -def speech_to_text(audio_file): - prompt="Transcribe the following Norwegian speech to text, the sentances may be cut off, do not make up words or fill in the sentances" - - transcription = client.audio.transcriptions.create( - model="whisper-1", - file=audio_file, - language="no", - prompt=prompt, - ) - transcription.text.replace(prompt, "") - return transcription - - -def path_to_audio_file(path): - audio_file = open(path, "rb") - return audio_file - -def chunks_to_text(chunks): - text = [] - for chunk in chunks: - audio = np.frombuffer(chunk, np.int16) - text.append(speech_to_text(audio)) - return text - -def chunks_to_full_audio(chunks): - return b"".join(chunks) - - -def handle_chunk(chunk, index): - create_tmp_wav_file(chunk, path=f"tmp{index}.wav") - processor = AudioProcessor(f"tmp{index}.wav") - processor.process() - processor.save_audio(f"tmp{index}.wav") - audio_file = path_to_audio_file(f"tmp{index}.wav") - - text.append(speech_to_text(audio_file=audio_file)) - audio_file.close() - print(text[-1].text) - remove_tmp_wav_file(index) - - -if __name__ == "__main__": - import sys - if len(sys.argv) ==1: - - 
CHUNK_SIZE = 1024 # Number of frames in a buffer - RATE = 16000 # 16 000 Hz is a common rate for speech processing - CHANNELS = 1 # Mono audio - SILENCE_THRESHOLD = 25 # Used to detect silence for stopping recording - MAX_SILENCE_DURATION = 5 # Seconds of silence to stop recording - - recorder = AudioRecorder(chunk_size=CHUNK_SIZE, rate=RATE, channels=CHANNELS, silence_threshold=SILENCE_THRESHOLD, max_silence_duration=MAX_SILENCE_DURATION) - - text = [] - index = 0 - for chunk in recorder.record(30): - t = Thread(target=handle_chunk, args=(chunk,index)) - index += 1 - t.start() - - time.sleep(2) - - remove_tmp_wav_file() - - else: - audio_file = path_to_audio_file(sys.argv[1]) - transcription = speech_to_text(audio_file) - print(transcription.text) - audio_file.close() - - - # audio_file = path_to_audio_file("nb-whisper-main/nb-whisper-main/audio/erna.mp3") - # transcription = speech_to_text(audio_file) - # print(transcription.text) - # audio_file.close() +from openai import OpenAI +import os +from dotenv import load_dotenv +import numpy as np +import wave +from audioRecorder import AudioRecorder +from audioProcessor import AudioProcessor +import time +from threading import Thread + +load_dotenv() +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def create_tmp_wav_file(chunk, rate=16000, channels=1, path="tmp.wav"): + with wave.open(path, 'wb') as wav_file: + wav_file.setnchannels(channels) # Mono audio + wav_file.setsampwidth(2) # 2 bytes per sample, assuming 16-bit audio + wav_file.setframerate(rate) # Assuming 16kHz sample rate + wav_file.writeframes(chunk) + +def remove_tmp_wav_file(index=None): + if index is not None: + if os.path.exists(f"tmp{index}.wav"): + os.remove(f"tmp{index}.wav") + else: + if os.path.exists("tmp.wav"): + os.remove("tmp.wav") + +def speech_to_text(audio_file=None, filepath=None): + if audio_file is None: + if filepath is None: + raise ValueError("Either audio_file or filepath must be provided") + audio_file = path_to_audio_file(filepath) + #audio_file=handle_audio(audio_file, path=filepath) + + prompt = "Transcribe the following Norwegian speech to text, the sentences may be cut off, do not make up words or fill in the sentences" + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + language="no", + ) + text = transcription.text.replace(prompt, "") + return text + +def path_to_audio_file(path): + audio_file = open(path, "rb") + return audio_file + +def chunks_to_text(chunks): + text = [] + for chunk in chunks: + audio = np.frombuffer(chunk, np.int16) + text.append(speech_to_text(audio)) + return text + +def chunks_to_full_audio(chunks): + return b"".join(chunks) + + +def handle_chunk(chunk, index, text): + create_tmp_wav_file(chunk, path=f"tmp{index}.wav") + processor = AudioProcessor(f"tmp{index}.wav") + processor.process() + processor.save_audio(f"tmp{index}.wav") + audio_file = path_to_audio_file(f"tmp{index}.wav") + + text.append(speech_to_text(audio_file=audio_file)) + audio_file.close() + print(text[-1]) + remove_tmp_wav_file(index) + +def handle_audio(audio_file, path="tmp.wav"): + processor = AudioProcessor(audio_file) + processor.process() + processor.save_audio(path) + audio_file = path_to_audio_file(path) + return audio_file + + +async def startRecording(): + CHUNK_SIZE = 1024 # Number of frames in a buffer + RATE = 16000 # 16 000 Hz is a common rate for speech processing + CHANNELS = 1 # Mono audio + SILENCE_THRESHOLD = 25 # Used to detect silence for stopping recording +
MAX_SILENCE_DURATION = 5 # Seconds of silence to stop recording + return "can you hear me?" + recorder = AudioRecorder(chunk_size=CHUNK_SIZE, rate=RATE, channels=CHANNELS, silence_threshold=SILENCE_THRESHOLD, max_silence_duration=MAX_SILENCE_DURATION) + + text = [] + threads = [] + index = 0 + for chunk in recorder.record(30): + t = Thread(target=handle_chunk, args=(chunk,index,text)) + threads.append(t) + index += 1 + t.start() + + for t in threads: + t.join() + return " ".join([t.text for t in text]) + +if __name__ == "__main__": + import sys + if len(sys.argv) ==1: + startRecording() + else: + audio_file = path_to_audio_file(sys.argv[1]) + transcription = speech_to_text(audio_file) + print(transcription.text) + audio_file.close() + + + # audio_file = path_to_audio_file("nb-whisper-main/nb-whisper-main/audio/erna.mp3") + # transcription = speech_to_text(audio_file) + # print(transcription.text) + # audio_file.close() \ No newline at end of file diff --git a/speechToText/static/index.html b/speechToText/static/index.html new file mode 100644 index 0000000..ebc6f5a --- /dev/null +++ b/speechToText/static/index.html @@ -0,0 +1,71 @@ + + + + + + Audio recorder + + + + + +
+    <div id="status">Waiting for server to start recording...</div>
+
+    <h1>I Audio Recorder Your Mother</h1>
+
+    <p>This is certainly one of the main pages of the audio recorders of all time</p>
+
+    <p>I don't know why you are here either</p>
+ + + \ No newline at end of file diff --git a/speechToText/static/record.js b/speechToText/static/record.js new file mode 100644 index 0000000..7fe16a9 --- /dev/null +++ b/speechToText/static/record.js @@ -0,0 +1,107 @@ +const silenceThreshold = 10; // RMS threshold to detect silence +const maxSilenceDuration = 3; // seconds +const maxRecordingDuration = 10000; // Maximum recording time in milliseconds (10 seconds) +let silenceStartTime = null; +let mediaRecorder, audioChunks = []; +let isRecording = false; +let recordingTimeout; + +// Connect to the WebSocket server +const socket = io.connect(window.location.origin); + +// Listen for "start_recording" event +socket.on('start_recording', () => { + document.getElementById('status').innerText = "Recording started by server..."; + // reset variables + audioChunks = []; + silenceStartTime = null; + mediaRecorder = []; + clearTimeout(recordingTimeout); + startRecording(); +}); + +// Start recording audio +async function startRecording() { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + + // Create MediaRecorder instance + mediaRecorder = new MediaRecorder(stream); + mediaRecorder.start(); + + isRecording = true; + document.getElementById('status').innerText = "Recording in progress..."; + + // Collect audio data chunks + mediaRecorder.ondataavailable = (event) => { + audioChunks.push(event.data); + }; + + // When the recording stops, we process and upload the audio + mediaRecorder.onstop = async () => { + // Combine audio chunks into a single Blob + const combinedBlob = new Blob(audioChunks, { type: 'audio/webm' }); + + // Send the WebM file to the server (no conversion needed) + const formData = new FormData(); + formData.append('audio', combinedBlob, 'recording.webm'); + + try { + const response = await fetch('/upload_audio', { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + throw new Error('Network response was not ok'); + } + + const data = await response.json(); + console.log(data); + document.getElementById('status').innerText = `Audio uploaded. Server response: ${data.message}`; + } catch (error) { + console.error('Error uploading audio:', error); + document.getElementById('status').innerText = 'Error uploading audio'; + } + }; + + // Start the timeout to stop recording after maxRecordingDuration + recordingTimeout = setTimeout(() => { + if (isRecording) { + document.getElementById('status').innerText = "Maximum recording duration reached. Stopping recording..."; + mediaRecorder.stop(); // Stop recording after 10 seconds + } + }, maxRecordingDuration); + + // Detect silence during the recording + detectSilence(); +} + +// Detect silence based on time and silenceThreshold +function detectSilence() { + const silenceCheckInterval = setInterval(() => { + if (mediaRecorder.state === "inactive" || !isRecording) { + clearInterval(silenceCheckInterval); + return; + } + + // Here, we're checking if the audio chunks' duration is long enough to consider it "silence". 
+ const currentTime = performance.now(); + + if (audioChunks.length === 0) { + // If no data has been recorded yet, we consider it silent + if (!silenceStartTime) { + silenceStartTime = currentTime; // Start the silence timer + } + } else { + // Sound is detected, reset the silence timer + silenceStartTime = null; + } + + // Check if silence duration exceeds the maximum silence duration + if (silenceStartTime && (currentTime - silenceStartTime) / 1000 >= maxSilenceDuration) { + document.getElementById('status').innerText = "Silence detected. Stopping recording..."; + mediaRecorder.stop(); // Stop recording when silence is detected + clearInterval(silenceCheckInterval); // Stop checking for silence + } + }, 1000); +} diff --git a/src/speech/audioAnalyse.py b/speechToText/tools/audioAnalyse.py similarity index 96% rename from src/speech/audioAnalyse.py rename to speechToText/tools/audioAnalyse.py index 04650fa..f53be63 100644 --- a/src/speech/audioAnalyse.py +++ b/speechToText/tools/audioAnalyse.py @@ -1,41 +1,41 @@ -import numpy as np -import matplotlib.pyplot as plt -from scipy.io import wavfile -import seaborn as sns -import librosa -import librosa.display - -def analyze_audio(audio_path): - # Load the WAV file - sr, audio_data = wavfile.read(audio_path) - - # If stereo, take only one channel (convert to mono) - if len(audio_data.shape) == 2: - audio_data = audio_data.mean(axis=1) - - # Normalize audio data to range between -1 and 1 - audio_data = audio_data / np.max(np.abs(audio_data)) - - - - # Plot the spectrogram - audio_data_librosa, _ = librosa.load(audio_path, sr=sr) - D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_data_librosa)), ref=np.max) - librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log') - plt.colorbar(format='%+2.0f dB') - plt.title('Spectrogram') - - plt.tight_layout() - plt.show() - -if __name__ == "__main__": - import sys - if len(sys.argv) < 2: - print("Usage: python audio_analysis.py ") - sys.exit(1) - - audio_path = sys.argv[1] - analyze_audio(audio_path) - if len(sys.argv) > 2: - audio_path2 = sys.argv[2] - analyze_audio(audio_path2) +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import seaborn as sns +import librosa +import librosa.display + +def analyze_audio(audio_path): + # Load the WAV file + sr, audio_data = wavfile.read(audio_path) + + # If stereo, take only one channel (convert to mono) + if len(audio_data.shape) == 2: + audio_data = audio_data.mean(axis=1) + + # Normalize audio data to range between -1 and 1 + audio_data = audio_data / np.max(np.abs(audio_data)) + + + + # Plot the spectrogram + audio_data_librosa, _ = librosa.load(audio_path, sr=sr) + D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_data_librosa)), ref=np.max) + librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log') + plt.colorbar(format='%+2.0f dB') + plt.title('Spectrogram') + + plt.tight_layout() + plt.show() + +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + print("Usage: python audio_analysis.py ") + sys.exit(1) + + audio_path = sys.argv[1] + analyze_audio(audio_path) + if len(sys.argv) > 2: + audio_path2 = sys.argv[2] + analyze_audio(audio_path2) diff --git a/src/speech/__init__.py b/src/speech/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/speech/requirements.txt b/src/speech/requirements.txt deleted file mode 100644 index 42810b2..0000000 Binary files a/src/speech/requirements.txt and /dev/null differ diff --git a/src/speech/tts/Dockerfile b/textToSpeech/Dockerfile 
similarity index 100% rename from src/speech/tts/Dockerfile rename to textToSpeech/Dockerfile diff --git a/src/speech/tts/docker-compose.yml b/textToSpeech/docker-compose.yml similarity index 100% rename from src/speech/tts/docker-compose.yml rename to textToSpeech/docker-compose.yml diff --git a/src/speech/tts/narakeet.py b/textToSpeech/narakeet.py similarity index 100% rename from src/speech/tts/narakeet.py rename to textToSpeech/narakeet.py diff --git a/src/speech/tts/requirements.txt b/textToSpeech/requirements.txt similarity index 100% rename from src/speech/tts/requirements.txt rename to textToSpeech/requirements.txt diff --git a/src/speech/tts/soundplayer.py b/textToSpeech/soundplayer.py similarity index 100% rename from src/speech/tts/soundplayer.py rename to textToSpeech/soundplayer.py diff --git a/src/speech/tts/tts_server.py b/textToSpeech/tts_server.py similarity index 100% rename from src/speech/tts/tts_server.py rename to textToSpeech/tts_server.py
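A minimal end-to-end smoke test for the endpoints this patch adds (a sketch, not part of the diff): it assumes the compose stack is running with ports 3000 (llm-service) and 3001 (speech-to-text) published on localhost, and that sample.webm is any short local recording; the script name, file name, and conversation id are placeholders.

# smoke_test_stt.py -- illustrative only; paths and payloads mirror the routes
# added in core/main.py and speechToText/main.py above.
import requests

CORE = "http://localhost:3000"           # llm-service (docker-compose port 3000)
STT = "http://localhost:3001"            # speech-to-text service (docker-compose port 3001)
CONVERSATION_ID = "test-conversation"    # placeholder conversation id

# 1. Ask the core service to start a recording; it forwards the request to the
#    speech-to-text service, which emits 'start_recording' to connected browsers.
r = requests.post(f"{CORE}/start_recording", json={"conversation_id": CONVERSATION_ID})
print("start_recording:", r.status_code, r.json())

# 2. Simulate the browser upload: post a pre-recorded .webm clip to /upload_audio.
#    The service converts it to .wav, transcribes it with Whisper, and notifies the
#    core service via /recording_completed, which streams the text to the UI.
with open("sample.webm", "rb") as f:
    r = requests.post(
        f"{STT}/upload_audio",
        files={"audio": ("recording.webm", f, "audio/webm")},
        data={"conversation_id": CONVERSATION_ID},
    )
print("upload_audio:", r.status_code, r.json())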