From 924c6ecea3cd08361ce001b72a5aa2c78471870c Mon Sep 17 00:00:00 2001
From: DHRUMIL PATEL <123137675+dhrumilp12@users.noreply.github.com>
Date: Thu, 13 Jun 2024 13:27:25 -0400
Subject: [PATCH] Record audio as WebM and convert to WAV with FFmpeg before speech recognition

---
 client/src/Components/chatComponent.jsx | 27 ++++++++---
 server/services/speech_service.py       | 60 +++++++++++++++++++++----
 2 files changed, 72 insertions(+), 15 deletions(-)

diff --git a/client/src/Components/chatComponent.jsx b/client/src/Components/chatComponent.jsx
index fdfd8707..cd58dd12 100644
--- a/client/src/Components/chatComponent.jsx
+++ b/client/src/Components/chatComponent.jsx
@@ -154,10 +154,15 @@ const ChatComponent = () => {
 
         // Function to handle recording start
         const startRecording = () => {
+            setAudioChunks([]);
             navigator.mediaDevices.getUserMedia({ audio: true })
                 .then(stream => {
-                    const recorder = new MediaRecorder(stream);
-                    recorder.ondataavailable = (e) => setAudioChunks(current => [...current, e.data]);
+                    const options = { mimeType: 'audio/webm' };
+                    const recorder = new MediaRecorder(stream, options);
+                    recorder.ondataavailable = (e) => {
+                        console.log('Data available:', e.data.size); // Log size to check if data is present
+                        setAudioChunks(current => [...current, e.data]);
+                    };
                     recorder.onstop = sendAudioToServer;
                     recorder.start();
                     setMediaRecorder(recorder);
@@ -167,13 +172,21 @@ const ChatComponent = () => {
 
         // Function to handle recording stop
         const stopRecording = () => {
-            mediaRecorder.stop();
-            setIsRecording(false);
-            setMediaRecorder(null);
+            if (mediaRecorder) {
+                mediaRecorder.stop();
+                setIsRecording(false);
+                setMediaRecorder(null);
+            }
         };
 
         const sendAudioToServer = useCallback(() => {
-            const audioBlob = new Blob(audioChunks, { 'type': 'audio/wav' });
+            console.log('Audio chunks size:', audioChunks.reduce((sum, chunk) => sum + chunk.size, 0)); // Log total size of chunks
+            const audioBlob = new Blob(audioChunks, { 'type': 'audio/webm' });
+            if (audioBlob.size === 0) {
+                console.error('Audio Blob is empty');
+                return;
+            }
+            console.log(`Sending audio blob of size: ${audioBlob.size} bytes`);
             const formData = new FormData();
             formData.append('audio', audioBlob);
             setIsLoading(true);
@@ -197,7 +210,7 @@ const ChatComponent = () => {
             .finally(() => {
                 setIsLoading(false);
             });
-        }, []);
+        }, [audioChunks]);
         
 
 
diff --git a/server/services/speech_service.py b/server/services/speech_service.py
index 0cf823cc..c4cc9c77 100644
--- a/server/services/speech_service.py
+++ b/server/services/speech_service.py
@@ -1,22 +1,60 @@
 import azure.cognitiveservices.speech as speechsdk
 import io
+import subprocess
+import os
+
+def check_ffmpeg():
+    try:
+        result = subprocess.run(["ffmpeg", "-version"], capture_output=True, text=True)
+        print("FFmpeg version:")
+        print(result.stdout)
+    except Exception as e:
+        print("Failed to run FFmpeg:", str(e))
+
+check_ffmpeg()
+
+def convert_audio_to_wav(input_audio_path, output_audio_path):
+    try:
+        command = ['ffmpeg', '-i', input_audio_path, '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000', output_audio_path]
+        result = subprocess.run(command, check=True, text=True, capture_output=True)
+        print(f"FFmpeg output: {result.stdout}")
+    except subprocess.CalledProcessError as e:
+        print(f"FFmpeg error: {e.stderr}")
+        raise Exception("Failed to convert audio") from e
+
 
 def speech_to_text(audio_file):
+    
+    # Temporary file paths for the uploaded audio and its WAV conversion
+    temp_input_path = 'temp_input.webm'
+    temp_output_path = 'temp_output.wav'
+
     try:
+        with open(temp_input_path, 'wb') as f:
+            f.write(audio_file.read())
+
+        # Convert the upload to 16 kHz mono 16-bit PCM WAV
+        convert_audio_to_wav(temp_input_path, temp_output_path)
+
+        # Load the converted WAV bytes for recognition
+        with open(temp_output_path, 'rb') as f:
+            audio_data = f.read()
         # Convert the audio file received into a stream
-        audio_stream = io.BytesIO()
-        audio_file.save(audio_stream)
+        audio_stream = io.BytesIO(audio_data)
+        
+        print(f"Size of audio file: {audio_stream.getbuffer().nbytes} bytes")  # Debugging the size of the file
         audio_stream.seek(0)
-
+        print(f"Size of audio file: {audio_stream.getbuffer().nbytes} bytes")
         # Set up the speech config with your subscription details
         speech_key = "c833c8ef4bb0441b98971cc2d850f462"
         service_region = "eastus"
         speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
-        speech_config.set_property(speechsdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "15000")  # Timeout in milliseconds
+        speech_config.set_property(speechsdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "5000")  # Timeout in milliseconds
         # Create a push stream that can be used with the speech recognizer
         push_stream = speechsdk.audio.PushAudioInputStream()
         audio_config = speechsdk.audio.AudioConfig(stream=push_stream)
-        
+        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
+           
 
         # Read the buffer and push into the push stream
         data = audio_stream.read(1024)
@@ -26,8 +64,6 @@ def speech_to_text(audio_file):
             data = audio_stream.read(1024)
         push_stream.close()
 
-        # Create a recognizer with the given settings
-        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
 
         print("Speak into your microphone.")
         result = speech_recognizer.recognize_once()
@@ -43,8 +79,9 @@ def speech_to_text(audio_file):
             cancellation_details = result.cancellation_details
             print("Speech Recognition canceled: {}".format(cancellation_details.reason))
             if cancellation_details.reason == speechsdk.CancellationReason.Error:
-                print("Error details: {}".for_sessiondetails.error_details)
+                print("Error details: {}".format(cancellation_details.error_details))
             return "Speech Recognition canceled"
+
         else:
             print("Speech Recognition canceled: {}".format(result.cancellation_details.reason))
             if result.cancellation_details.reason == speechsdk.CancellationReason.Error:
@@ -55,3 +92,10 @@ def speech_to_text(audio_file):
     except Exception as e:
         print(f"Error during speech recognition: {str(e)}")
         return None
+    finally:
+        # Clean up temporary files
+        if os.path.exists(temp_input_path):
+            os.remove(temp_input_path)
+        if os.path.exists(temp_output_path):
+            os.remove(temp_output_path)
+        print("Temporary files removed")