', methods=['POST'])
+def start_recording(conversation_id):
+ print(f"Recording triggered for conversation ID: {conversation_id}")
+
+ try:
+ # Emit 'start_recording' event to clients
+ socketio.emit('start_recording', {'conversation_id': conversation_id})
+
+ return jsonify({"status": "success", "message": "Recording started on client"}), 200
+
+ except Exception as e:
+ print(f'Error starting recording: {e}')
+ return jsonify({"status": "error", "message": "Failed to start recording"}), 500
+
+
+@app.route('/upload_audio', methods=['POST'])
+def upload_audio():
+    audio_file = request.files['audio']
+    print(f"Received audio file: {audio_file.filename}")
+    file_path = os.path.join("uploads", audio_file.filename)
+
+    # Save the audio file; browser recordings arrive as .webm and are converted to .wav
+    if audio_file.filename.endswith('.webm'):
+        # Save the original .webm file
+        audio_file.save(file_path)
+        wav_file_path = file_path.rsplit('.', 1)[0] + '.wav'
+        try:
+            # Convert the .webm file to .wav, replacing any stale copy
+            if os.path.exists(wav_file_path):
+                os.remove(wav_file_path)
+            convert_webm_to_wav(file_path, wav_file_path)
+            file_path = wav_file_path  # Point at the converted .wav file from here on
+        except Exception as e:
+            print(f"Error converting webm to wav: {e}")
+            return jsonify({"status": "error", "message": str(e)}), 500
+    else:
+        # For other file types, save directly
+        audio_file.save(file_path)
+
+    # Process the audio file (speech-to-text)
+    try:
+        text_result = speech_to_text.speech_to_text(filepath=file_path)
+    except Exception as e:
+        print(f"Error during speech-to-text processing: {e}")
+        return jsonify({"status": "error", "message": "Failed to process audio"}), 500
+
+    # Notify the main service that the recording has been transcribed
+    conversation_id = request.form.get("conversation_id")
+    try:
+        response = requests.post('http://llm-service:3000/recording_completed', json={
+            "text": text_result,
+            "conversation_id": conversation_id
+        })
+        response.raise_for_status()  # Ensure the request was successful
+    except requests.exceptions.RequestException as e:
+        print(f"Error notifying LLM service: {e}")
+        return jsonify({"status": "error", "message": "Failed to notify LLM service"}), 500
+
+    # Return the result to the client
+    return jsonify({"status": "success", "message": text_result}), 200
+
+
+
+# Routes end
+if __name__ == '__main__':
+    if not os.path.exists('uploads'):
+        os.makedirs('uploads')
+    socketio.run(app, debug=True, host='0.0.0.0', port=PORT_STT, allow_unsafe_werkzeug=True)
\ No newline at end of file
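Note: the upload route above calls convert_webm_to_wav(), which is defined outside this excerpt. A minimal sketch of such a helper, assuming ffmpeg is available on the PATH (the actual implementation may use pydub or another library instead):

    import subprocess

    def convert_webm_to_wav(webm_path, wav_path):
        # Decode the browser's .webm recording to 16 kHz mono PCM for transcription
        subprocess.run(
            ["ffmpeg", "-y", "-i", webm_path, "-ar", "16000", "-ac", "1", wav_path],
            check=True,
        )

The same route then POSTs the transcription to http://llm-service:3000/recording_completed with a JSON body containing "text" and "conversation_id". A hypothetical receiver on the llm-service side, shown only to illustrate the payload shape:

    @app.route('/recording_completed', methods=['POST'])
    def recording_completed():
        data = request.get_json()
        text = data["text"]
        conversation_id = data["conversation_id"]
        # Feed the transcribed text into the conversation identified by conversation_id
        return jsonify({"status": "ok"}), 200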
diff --git a/speechToText/requirements.txt b/speechToText/requirements.txt
new file mode 100644
index 0000000..e84dcd1
Binary files /dev/null and b/speechToText/requirements.txt differ
diff --git a/src/speech/speech_to_text.py b/speechToText/speech_to_text.py
similarity index 61%
rename from src/speech/speech_to_text.py
rename to speechToText/speech_to_text.py
index 84179cf..2fabf78 100644
--- a/src/speech/speech_to_text.py
+++ b/speechToText/speech_to_text.py
@@ -1,104 +1,116 @@
-from openai import OpenAI
-import os
-from dotenv import load_dotenv
-import numpy as np
-import wave
-from audioRecorder import AudioRecorder
-from audioProcessor import AudioProcessor
-import time
-from threading import Thread
-
-load_dotenv()
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-
-
-def create_tmp_wav_file(chunk, rate=16000, channels=1, path="tmp.wav"):
- with wave.open(path, 'wb') as wav_file:
- wav_file.setnchannels(channels) # Mono audio
- wav_file.setsampwidth(2) # 2 bytes per sample, assuming 16-bit audio
- wav_file.setframerate(rate) # Assuming 16kHz sample rate
- wav_file.writeframes(chunk)
-
-def remove_tmp_wav_file(index=None):
- if index is not None:
- if os.path.exists(f"tmp{index}.wav"):
- os.remove(f"tmp{index}.wav")
- else:
- if os.path.exists("tmp.wav"):
- os.remove("tmp.wav")
-
-def speech_to_text(audio_file):
- prompt="Transcribe the following Norwegian speech to text, the sentances may be cut off, do not make up words or fill in the sentances"
-
- transcription = client.audio.transcriptions.create(
- model="whisper-1",
- file=audio_file,
- language="no",
- prompt=prompt,
- )
- transcription.text.replace(prompt, "")
- return transcription
-
-
-def path_to_audio_file(path):
- audio_file = open(path, "rb")
- return audio_file
-
-def chunks_to_text(chunks):
- text = []
- for chunk in chunks:
- audio = np.frombuffer(chunk, np.int16)
- text.append(speech_to_text(audio))
- return text
-
-def chunks_to_full_audio(chunks):
- return b"".join(chunks)
-
-
-def handle_chunk(chunk, index):
- create_tmp_wav_file(chunk, path=f"tmp{index}.wav")
- processor = AudioProcessor(f"tmp{index}.wav")
- processor.process()
- processor.save_audio(f"tmp{index}.wav")
- audio_file = path_to_audio_file(f"tmp{index}.wav")
-
- text.append(speech_to_text(audio_file=audio_file))
- audio_file.close()
- print(text[-1].text)
- remove_tmp_wav_file(index)
-
-
-if __name__ == "__main__":
- import sys
- if len(sys.argv) ==1:
-
- CHUNK_SIZE = 1024 # Number of frames in a buffer
- RATE = 16000 # 16 000 Hz is a common rate for speech processing
- CHANNELS = 1 # Mono audio
- SILENCE_THRESHOLD = 25 # Used to detect silence for stopping recording
- MAX_SILENCE_DURATION = 5 # Seconds of silence to stop recording
-
- recorder = AudioRecorder(chunk_size=CHUNK_SIZE, rate=RATE, channels=CHANNELS, silence_threshold=SILENCE_THRESHOLD, max_silence_duration=MAX_SILENCE_DURATION)
-
- text = []
- index = 0
- for chunk in recorder.record(30):
- t = Thread(target=handle_chunk, args=(chunk,index))
- index += 1
- t.start()
-
- time.sleep(2)
-
- remove_tmp_wav_file()
-
- else:
- audio_file = path_to_audio_file(sys.argv[1])
- transcription = speech_to_text(audio_file)
- print(transcription.text)
- audio_file.close()
-
-
- # audio_file = path_to_audio_file("nb-whisper-main/nb-whisper-main/audio/erna.mp3")
- # transcription = speech_to_text(audio_file)
- # print(transcription.text)
- # audio_file.close()
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+import numpy as np
+import wave
+from audioRecorder import AudioRecorder
+from audioProcessor import AudioProcessor
+import time
+from threading import Thread
+
+load_dotenv()
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+def create_tmp_wav_file(chunk, rate=16000, channels=1, path="tmp.wav"):
+    with wave.open(path, 'wb') as wav_file:
+        wav_file.setnchannels(channels)  # Mono audio
+        wav_file.setsampwidth(2)         # 2 bytes per sample, assuming 16-bit audio
+        wav_file.setframerate(rate)      # Assuming 16kHz sample rate
+        wav_file.writeframes(chunk)
+
+def remove_tmp_wav_file(index=None):
+    if index is not None:
+        if os.path.exists(f"tmp{index}.wav"):
+            os.remove(f"tmp{index}.wav")
+    else:
+        if os.path.exists("tmp.wav"):
+            os.remove("tmp.wav")
+
+def speech_to_text(audio_file=None, filepath=None):
+    if audio_file is None:
+        if filepath is None:
+            raise ValueError("Either audio_file or filepath must be provided")
+        audio_file = path_to_audio_file(filepath)
+        #audio_file = handle_audio(audio_file, path=filepath)
+
+    prompt = "Transcribe the following Norwegian speech to text. The sentences may be cut off; do not make up words or fill in the sentences."
+    transcription = client.audio.transcriptions.create(
+        model="whisper-1",
+        file=audio_file,
+        language="no",
+        prompt=prompt,
+    )
+    return transcription.text.replace(prompt, "")
+
+def path_to_audio_file(path):
+    audio_file = open(path, "rb")
+    return audio_file
+
+def chunks_to_text(chunks):
+    text = []
+    for index, chunk in enumerate(chunks):
+        # Write the raw chunk to a temporary wav file so it can be sent to the transcription API
+        create_tmp_wav_file(chunk, path=f"tmp{index}.wav")
+        with open(f"tmp{index}.wav", "rb") as audio_file:
+            text.append(speech_to_text(audio_file=audio_file))
+        remove_tmp_wav_file(index)
+    return text
+
+def chunks_to_full_audio(chunks):
+    return b"".join(chunks)
+
+
+def handle_chunk(chunk, index, text):
+    create_tmp_wav_file(chunk, path=f"tmp{index}.wav")
+    processor = AudioProcessor(f"tmp{index}.wav")
+    processor.process()
+    processor.save_audio(f"tmp{index}.wav")
+    audio_file = path_to_audio_file(f"tmp{index}.wav")
+
+    text.append(speech_to_text(audio_file=audio_file))
+    audio_file.close()
+    print(text[-1])  # speech_to_text now returns the transcription text directly
+    remove_tmp_wav_file(index)
+
+def handle_audio(audio_file, path="tmp.wav"):
+    processor = AudioProcessor(audio_file)
+    processor.process()
+    processor.save_audio(path)
+    audio_file = path_to_audio_file(path)
+    return audio_file
+
+
+def startRecording():
+    CHUNK_SIZE = 1024           # Number of frames in a buffer
+    RATE = 16000                # 16 000 Hz is a common rate for speech processing
+    CHANNELS = 1                # Mono audio
+    SILENCE_THRESHOLD = 25      # Used to detect silence for stopping recording
+    MAX_SILENCE_DURATION = 5    # Seconds of silence to stop recording
+
+    recorder = AudioRecorder(chunk_size=CHUNK_SIZE, rate=RATE, channels=CHANNELS, silence_threshold=SILENCE_THRESHOLD, max_silence_duration=MAX_SILENCE_DURATION)
+
+    text = []
+    threads = []
+    index = 0
+    for chunk in recorder.record(30):
+        t = Thread(target=handle_chunk, args=(chunk, index, text))
+        threads.append(t)
+        index += 1
+        t.start()
+
+    for t in threads:
+        t.join()
+    return " ".join(text)
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) == 1:
+        print(startRecording())
+    else:
+        audio_file = path_to_audio_file(sys.argv[1])
+        transcription = speech_to_text(audio_file)
+        print(transcription)
+        audio_file.close()
+
+
+ # audio_file = path_to_audio_file("nb-whisper-main/nb-whisper-main/audio/erna.mp3")
+ # transcription = speech_to_text(audio_file)
+ # print(transcription.text)
+ # audio_file.close()
\ No newline at end of file
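Note: the rewritten speech_to_text() accepts either an open file object or a filepath, which is how the Flask route above calls it. A quick usage sketch (paths are illustrative):

    # As called from the upload route
    text = speech_to_text(filepath="uploads/recording.wav")

    # Or with an already-open file handle
    with open("uploads/recording.wav", "rb") as f:
        text = speech_to_text(audio_file=f)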
diff --git a/speechToText/static/index.html b/speechToText/static/index.html
new file mode 100644
index 0000000..ebc6f5a
--- /dev/null
+++ b/speechToText/static/index.html
@@ -0,0 +1,71 @@
+
+
+
+
+
+ Audio recorder
+
+
+
+
+
+ Waiting for server to start recording...
+
+ I Audio Recorder Your Mother
+ This is certainly one of the main pages of the audio recorders of all time
+ I don't know why you are here either
+
+
+
\ No newline at end of file
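Note: record.js below looks up an element with id "status" and expects the Socket.IO client and record.js itself to be loaded by this page. A minimal page consistent with the text above could look like the following sketch; the real file's markup and script URLs may differ:

    <!DOCTYPE html>
    <html>
    <head>
      <meta charset="utf-8">
      <title>Audio recorder</title>
    </head>
    <body>
      <h1>Audio Recorder</h1>
      <p id="status">Waiting for server to start recording...</p>
      <!-- Socket.IO client; the real page may pin a different version or serve a local copy -->
      <script src="https://cdn.socket.io/4.7.2/socket.io.min.js"></script>
      <script src="/static/record.js"></script>
    </body>
    </html>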
diff --git a/speechToText/static/record.js b/speechToText/static/record.js
new file mode 100644
index 0000000..7fe16a9
--- /dev/null
+++ b/speechToText/static/record.js
@@ -0,0 +1,107 @@
+const silenceThreshold = 10; // RMS threshold to detect silence
+const maxSilenceDuration = 3; // seconds
+const maxRecordingDuration = 10000; // Maximum recording time in milliseconds (10 seconds)
+let silenceStartTime = null;
+let mediaRecorder, audioChunks = [];
+let isRecording = false;
+let recordingTimeout;
+let conversationId = null; // Set from the server's start_recording event
+
+// Connect to the WebSocket server
+const socket = io.connect(window.location.origin);
+
+// Listen for "start_recording" event
+socket.on('start_recording', (data) => {
+ conversationId = data.conversation_id;
+ document.getElementById('status').innerText = "Recording started by server...";
+ // reset variables
+ audioChunks = [];
+ silenceStartTime = null;
+ mediaRecorder = null;
+ clearTimeout(recordingTimeout);
+ startRecording();
+});
+
+// Start recording audio
+async function startRecording() {
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+
+ // Create MediaRecorder instance
+ mediaRecorder = new MediaRecorder(stream);
+ mediaRecorder.start();
+
+ isRecording = true;
+ document.getElementById('status').innerText = "Recording in progress...";
+
+ // Collect audio data chunks
+ mediaRecorder.ondataavailable = (event) => {
+ audioChunks.push(event.data);
+ };
+
+ // When the recording stops, we process and upload the audio
+ mediaRecorder.onstop = async () => {
+ isRecording = false;
+ clearTimeout(recordingTimeout); // Don't let the max-duration timer fire after a silence stop
+
+ // Combine audio chunks into a single Blob
+ const combinedBlob = new Blob(audioChunks, { type: 'audio/webm' });
+
+ // Send the WebM file to the server (no conversion needed)
+ const formData = new FormData();
+ formData.append('audio', combinedBlob, 'recording.webm');
+ formData.append('conversation_id', conversationId); // Matches request.form.get("conversation_id") on the server
+
+ try {
+ const response = await fetch('/upload_audio', {
+ method: 'POST',
+ body: formData,
+ });
+
+ if (!response.ok) {
+ throw new Error('Network response was not ok');
+ }
+
+ const data = await response.json();
+ console.log(data);
+ document.getElementById('status').innerText = `Audio uploaded. Server response: ${data.message}`;
+ } catch (error) {
+ console.error('Error uploading audio:', error);
+ document.getElementById('status').innerText = 'Error uploading audio';
+ }
+ };
+
+ // Start the timeout to stop recording after maxRecordingDuration
+ recordingTimeout = setTimeout(() => {
+ if (isRecording) {
+ document.getElementById('status').innerText = "Maximum recording duration reached. Stopping recording...";
+ mediaRecorder.stop(); // Stop recording after 10 seconds
+ }
+ }, maxRecordingDuration);
+
+ // Detect silence during the recording
+ detectSilence();
+}
+
+// Heuristic silence check: treats a lack of new audio data as silence
+// (silenceThreshold is not used by this check)
+function detectSilence() {
+ const silenceCheckInterval = setInterval(() => {
+ if (mediaRecorder.state === "inactive" || !isRecording) {
+ clearInterval(silenceCheckInterval);
+ return;
+ }
+
+ // Treat the absence of recorded data so far as silence
+ const currentTime = performance.now();
+
+ if (audioChunks.length === 0) {
+ // If no data has been recorded yet, we consider it silent
+ if (!silenceStartTime) {
+ silenceStartTime = currentTime; // Start the silence timer
+ }
+ } else {
+ // Sound is detected, reset the silence timer
+ silenceStartTime = null;
+ }
+
+ // Check if silence duration exceeds the maximum silence duration
+ if (silenceStartTime && (currentTime - silenceStartTime) / 1000 >= maxSilenceDuration) {
+ document.getElementById('status').innerText = "Silence detected. Stopping recording...";
+ mediaRecorder.stop(); // Stop recording when silence is detected
+ clearInterval(silenceCheckInterval); // Stop checking for silence
+ }
+ }, 1000);
+}
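Note: detectSilence() above only treats the absence of recorded data as silence; the unused silenceThreshold constant suggests a level-based check was intended. One way to do that is with an AnalyserNode from the Web Audio API, sketched below. monitorSilence is an illustrative name; wiring it in would mean calling monitorSilence(stream, () => mediaRecorder.stop()) from startRecording instead of detectSilence():

    // Sketch: RMS-based silence detection on the getUserMedia stream.
    // Calls onSilence once the level stays below silenceThreshold for
    // maxSilenceDuration seconds (both defined at the top of record.js).
    function monitorSilence(stream, onSilence) {
      const audioContext = new AudioContext();
      const source = audioContext.createMediaStreamSource(stream);
      const analyser = audioContext.createAnalyser();
      analyser.fftSize = 2048;
      source.connect(analyser);

      const samples = new Uint8Array(analyser.fftSize);
      let silentSince = null;

      const timer = setInterval(() => {
        analyser.getByteTimeDomainData(samples);
        // Samples are unsigned bytes centred on 128; measure deviation from that midpoint
        let sumSquares = 0;
        for (const s of samples) {
          const d = s - 128;
          sumSquares += d * d;
        }
        const rms = Math.sqrt(sumSquares / samples.length);

        if (rms < silenceThreshold) {
          silentSince = silentSince ?? performance.now();
          if ((performance.now() - silentSince) / 1000 >= maxSilenceDuration) {
            clearInterval(timer);
            onSilence();
          }
        } else {
          silentSince = null;
        }
      }, 250);
    }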
diff --git a/src/speech/audioAnalyse.py b/speechToText/tools/audioAnalyse.py
similarity index 96%
rename from src/speech/audioAnalyse.py
rename to speechToText/tools/audioAnalyse.py
index 04650fa..f53be63 100644
--- a/src/speech/audioAnalyse.py
+++ b/speechToText/tools/audioAnalyse.py
@@ -1,41 +1,41 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from scipy.io import wavfile
-import seaborn as sns
-import librosa
-import librosa.display
-
-def analyze_audio(audio_path):
- # Load the WAV file
- sr, audio_data = wavfile.read(audio_path)
-
- # If stereo, take only one channel (convert to mono)
- if len(audio_data.shape) == 2:
- audio_data = audio_data.mean(axis=1)
-
- # Normalize audio data to range between -1 and 1
- audio_data = audio_data / np.max(np.abs(audio_data))
-
-
-
- # Plot the spectrogram
- audio_data_librosa, _ = librosa.load(audio_path, sr=sr)
- D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_data_librosa)), ref=np.max)
- librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
- plt.colorbar(format='%+2.0f dB')
- plt.title('Spectrogram')
-
- plt.tight_layout()
- plt.show()
-
-if __name__ == "__main__":
- import sys
- if len(sys.argv) < 2:
- print("Usage: python audio_analysis.py ")
- sys.exit(1)
-
- audio_path = sys.argv[1]
- analyze_audio(audio_path)
- if len(sys.argv) > 2:
- audio_path2 = sys.argv[2]
- analyze_audio(audio_path2)
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.io import wavfile
+import seaborn as sns
+import librosa
+import librosa.display
+
+def analyze_audio(audio_path):
+    # Load the WAV file
+    sr, audio_data = wavfile.read(audio_path)
+
+    # If stereo, take only one channel (convert to mono)
+    if len(audio_data.shape) == 2:
+        audio_data = audio_data.mean(axis=1)
+
+    # Normalize audio data to range between -1 and 1
+    audio_data = audio_data / np.max(np.abs(audio_data))
+
+    # Plot the spectrogram
+    audio_data_librosa, _ = librosa.load(audio_path, sr=sr)
+    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_data_librosa)), ref=np.max)
+    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
+    plt.colorbar(format='%+2.0f dB')
+    plt.title('Spectrogram')
+
+    plt.tight_layout()
+    plt.show()
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print("Usage: python audio_analysis.py <audio_path>")
+        sys.exit(1)
+
+    audio_path = sys.argv[1]
+    analyze_audio(audio_path)
+    if len(sys.argv) > 2:
+        audio_path2 = sys.argv[2]
+        analyze_audio(audio_path2)
diff --git a/src/speech/__init__.py b/src/speech/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/speech/requirements.txt b/src/speech/requirements.txt
deleted file mode 100644
index 42810b2..0000000
Binary files a/src/speech/requirements.txt and /dev/null differ
diff --git a/src/speech/tts/Dockerfile b/textToSpeech/Dockerfile
similarity index 100%
rename from src/speech/tts/Dockerfile
rename to textToSpeech/Dockerfile
diff --git a/src/speech/tts/docker-compose.yml b/textToSpeech/docker-compose.yml
similarity index 100%
rename from src/speech/tts/docker-compose.yml
rename to textToSpeech/docker-compose.yml
diff --git a/src/speech/tts/narakeet.py b/textToSpeech/narakeet.py
similarity index 100%
rename from src/speech/tts/narakeet.py
rename to textToSpeech/narakeet.py
diff --git a/src/speech/tts/requirements.txt b/textToSpeech/requirements.txt
similarity index 100%
rename from src/speech/tts/requirements.txt
rename to textToSpeech/requirements.txt
diff --git a/src/speech/tts/soundplayer.py b/textToSpeech/soundplayer.py
similarity index 100%
rename from src/speech/tts/soundplayer.py
rename to textToSpeech/soundplayer.py
diff --git a/src/speech/tts/tts_server.py b/textToSpeech/tts_server.py
similarity index 100%
rename from src/speech/tts/tts_server.py
rename to textToSpeech/tts_server.py