Merge pull request #17 from CogitoNTNU/speech_integration

Speech integration
CogitoNTNU · Nov 11, 2024 · 37ef343 · 37ef343
2 parents 63a2ea6 + 9187515
commit 37ef343
Show file tree

Hide file tree

Showing 34 changed files with 833 additions and 344 deletions.
diff --git a/.gitignore b/.gitignore
@@ -147,5 +147,6 @@ dmypy.json
 #wav files
 *.wav
 
+*.webm
 #calendar json
 /core/tools/calendarjson
diff --git a/core/Dockerfile b/core/Dockerfile
@@ -13,9 +13,6 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the current directory contents into the container at /app
 COPY . .
 
-# Make port 8000 available to the world outside this container
-EXPOSE 3001
-
 # Run main.py when the container launches
 CMD ["python","-u", "main.py"]
 #CMD ["gunicorn", "--worker-class", "eventlet", "-w", "1", "-b", "0.0.0.0:8000", "app:app"]
diff --git a/core/main.py b/core/main.py
@@ -10,9 +10,11 @@
 import asyncio  
 from modules.user_data_setup import check_folders
 from modules.chat import read_chat
+import requests
 import logging
 log = logging.getLogger('werkzeug')
-log.setLevel(logging.ERROR) #INFO, DEBUG, WARNING, ERROR, or CRITICAL - config as needed during development.
+log.setLevel(logging.ERROR)
+from time import sleep
 from collections import defaultdict
 
 #
@@ -50,7 +52,7 @@ def hello_world():
 # Route to get metadata like name, id, descriptions of all user chats
 @app.route("/chats/metadata")
 def get_chats():
-    return "lmao"
+    return "lmao" # Why does this return lmao?
 
 @app.route('/vectorize_chat', methods=['POST'])
 def summarize_store():
@@ -133,6 +135,56 @@ async def run_and_store():
         print(f'Something very bad happened: {e}')
         return jsonify({"status": "error"})
 
+# HTTP route counterpart of the 'start_recording' socket event, which fires when the user clicks the button with the cute little microphone icon.
+@app.route('/start_recording', methods=['POST'])
+def start_recording_route():
+    data = request.json
+    conversation_id = data.get('conversation_id')
+
+    print("Starting recording...")
+
+    # Send POST request to the recorder to start recording
+    headers = {'Content-Type': 'application/json'}
+    response = requests.post(f'http://speech-to-text:3001/start_recording/{conversation_id}', headers=headers, json=data)
+
+    if response.status_code != 200:
+        return jsonify({"status": "error", "text": "Failed to start recording"}), 500
+
+    return jsonify({"status": "recording_started"}), 200
+
+
+@socketio.on('start_recording')
+def start_recording_socket(data):
+    # This function handles the socket event to start recording
+    conversation_id = data.get('conversation_id')
+
+    print("Starting recording via socket...")
+
+    # Send POST request to the recorder to start recording
+    headers = {'Content-Type': 'application/json'}
+    response = requests.post(f'http://speech-to-text:3001/start_recording/{conversation_id}', headers=headers, json=data)
+
+    if response.status_code != 200:
+        socketio.emit('recording_failed', {"status": "error", "text": "Failed to start recording"})
+        return
+
+    socketio.emit('recording_started', {"status": "recording_started"})
+
+@app.route('/recording_completed', methods=['POST'])
+def recording_completed():
+    data = request.json
+    text = data.get('text', '')
+    socketio.emit("recording", text)
+
+    conversation_id = data.get('conversation_id', '')
+    print(f"Recording completed for conversation ID {conversation_id} with text:", text)
+
+    # Process the recorded text as needed (e.g., send to Jarvis or other services)
+    asyncio.run(jarvis.run(text, socketio))  # Assuming jarvis.run is asynchronous
+
+    return jsonify({"status": "success"}), 200
+
+
 @socketio.on('get_chat_history')
 def get_chat_history():
     session_id = request.sid
@@ -143,4 +195,5 @@ def get_chat_history():
 if __name__ == '__main__':
     socketio.run(app, debug=True, host='0.0.0.0', port=PORT, allow_unsafe_werkzeug=True)
 
-# hello
+# hello
+# TODO say hello back to whoever wrote this
diff --git a/core/static/chat.js b/core/static/chat.js
@@ -26,6 +26,16 @@ sendMessage = () => {
     }
 }
 
+addRecordedMessage = (message) => {
+    let chat_history = document.getElementById("chat_history")
+    if (message != "") {
+    addUserMessage(marked.parse(message))
+    chat_history.scrollTop = chat_history.scrollHeight;
+    }
+
+}
+
+
 addStreamedChunk = (messagePart) => {
     if(state.activeAIMessage){
         state.activeAIMessage.innerHTML += messagePart; // Append to innertext of the message

diff --git a/core/static/index.css b/core/static/index.css
@@ -100,6 +100,14 @@ body {
     font-size: 24px;
     margin-left: 12px;
 }
+/* Voice button variant using the same #673636 background that recording.js
+   applies inline when a recording starts.
+   NOTE(review): no element with this id is visible in this change - confirm
+   the rule is actually used. */
+#voice_button_recording {
+    width: 10%;
+    height: 9vh;
+    background-color: #673636;
+    border-radius: 10px;
+    font-size: 24px;
+
+}
 
 .chat_input_container{
     display: flex;

diff --git a/core/static/index.html b/core/static/index.html
@@ -8,6 +8,7 @@
     <script src="/static/ui_elements/settings.js"></script>
     <script src="/static/ui_elements/chatHistoryList.js"></script>
     <script src="/static/chat.js"></script>
+    <!-- Defines startRecording(), which index.js binds to the voice button. -->
+    <script src="/static/recording.js"></script>
     <script src="/static/socketEvents.js"></script>
     <script  src="/static/index.js" defer></script>
     <script src="/static/eventlisteners.js"></script>

diff --git a/core/static/index.js b/core/static/index.js
@@ -5,25 +5,23 @@ Main js file for loading the dynamic UI elements.
 */
 
 // Runs on initial startup, after window (html) has finished loading
-init = () => {
-  document.getElementById("send_button").addEventListener("click", sendMessage);
-  document.getElementById("clear_log").addEventListener("click", clear_log);
-
-  document.querySelector(".chatHistory").innerHTML += chatHistoryList();
-
-  // To hide settings page when clicking somewhere else after it's opened.
-  document.addEventListener("click", function (event) {
-    const settings = document.getElementById("settingsPage");
-    const settingsButton = document.getElementById("settingsButton");
-    if (
-      !settings.contains(event.target) &&
-      !settingsButton.contains(event.target) &&
-      settings.style.display == "block"
-    ) {
-      settingsPage();
-    }
-  });
-};
+ init = () => {
+    document.getElementById('send_button').addEventListener('click', sendMessage)
+    document.getElementById('clear_log').addEventListener('click', clear_log)
+
+    document.getElementById('voice_button').addEventListener('click', startRecording)
+
+    document.querySelector(".chatHistory").innerHTML += chatHistoryList()
+
+    // To hide settings page when clicking somewhere else after it's opened.
+    document.addEventListener('click', function(event){
+        const settings = document.getElementById("settingsPage");
+        const settingsButton = document.getElementById("settingsButton");
+        if(!settings.contains(event.target) && !settingsButton.contains(event.target) && settings.style.display=="block") {
+            settingsPage()
+        }
+     });
+}
 window.onload = init;
 
 // global state of the UI
@@ -107,6 +105,17 @@ async function addToolResponseToProcessContainer(toolResponse) {
   let processesContainer = document.querySelector(".processesContainer");
   processesContainer.scrollTop = processesContainer.scrollHeight;
 }
+async function addStreamedRecording(uuid, messagePart) {
+    let element = document.getElementById(uuid);
+
+    if (element == null) {
+        await addRecordedMessage(messagePart, uuid);
+        element = document.getElementById(uuid);
+    } else {
+        // Concat ChatPart on message with uuid
+        element.innerHTML += messagePart;
+    }
+}
 
 addUserMessage = (message) => {
   let html = /*html*/ `

diff --git a/core/static/recording.js b/core/static/recording.js
@@ -0,0 +1,7 @@
+startRecording = () => {
+    document.getElementById('voice_button').style.backgroundColor = "#673636"; // Change button color to indicate recording
+    document.getElementById('voice_button').enabled = false; // Disable button while recording
+    const payload = {conversation_id: state.activeConversationId}
+    let res = socket.emit('start_recording', payload)
+    console.log("Recording started");
+}
diff --git a/core/static/socketEvents.js b/core/static/socketEvents.js
@@ -30,6 +30,19 @@ socket.on("chunk", async (chunk) => {
   await addStreamedMessage(uuid, chunk);
 });
 
+// Handles the "recording" event, which the server emits with the transcribed
+// text once a recording has completed.
+// NOTE(review): `uuid` and `ai_message` are assigned here without a local
+// declaration, so they are shared with whatever else uses those names.
+socket.on("recording", async (recording)=>{
+    if(!state.activeAIMessage){
+        console.log("RECIVED MESSAGE")
+        document.getElementById('voice_button').style.backgroundColor = ""; // Reset the button color now that the recording is done
+        document.getElementById('voice_button').enabled = true; // NOTE(review): `enabled` is not a standard DOM property (the standard one is `disabled`), so this looks like a no-op
+        uuid = generateUUID();
+        await addStreamedRecording(uuid, "");
+        ai_message = document.getElementById(uuid)
+        state.activeAIMessage = ai_message
+    }
+    await addStreamedRecording(uuid, recording );
+})
+
 socket.on("tokens", async (tokens) => {
   state.totalTokensUsed += tokens;
   console.log("Total tokens so far:", state.totalTokensUsed);

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,3 +1,5 @@
+version: '2.1'
+
 services:
   llm-service:
     build: ./core
@@ -21,7 +23,26 @@ services:
     stop_signal: SIGINT
     ports:
       - "3000:3000"
+    deploy:
+      resources:
+        limits:
+          cpus: '0.5'
+          memory: 2048M # Memory limit for the compose
 
+  # Speech-to-text recorder service, built from ./speechToText. The core
+  # service calls it at http://speech-to-text:3001.
+  speech-to-text:
+    build: ./speechToText
+    restart: unless-stopped
+    environment:
+      FLASK_ENV: ${FLASK_ENV} # Autorestarts flask when code changes are detected
+      OPENAI_API_KEY: ${OPENAI_API_KEY}
+      PORT_STT: ${PORT_STT}
+    volumes:
+      - ./speechToText:/app  # Mount the application code to detect live changes
+    networks:
+      - backend
+    stop_signal: SIGINT
+    ports:
+      # NOTE(review): the port is fixed at 3001 here (and in core/main.py) even
+      # though PORT_STT is configurable - confirm they stay in sync.
+      - "3001:3001"
 
 networks:
   backend:

diff --git a/docs/images/enter_server_url.png b/docs/images/enter_server_url.png
diff --git a/docs/images/select_kernel.png b/docs/images/select_kernel.png
diff --git a/speech/Dockerfile b/speech/Dockerfile
diff --git a/speech/requirements.txt b/speech/requirements.txt
diff --git a/speechToText/Dockerfile b/speechToText/Dockerfile
@@ -0,0 +1,32 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-bookworm
+
+# Set the working directory in the container
+WORKDIR /app
+# Install the system packages (PortAudio, ALSA, PulseAudio, SoX and ffmpeg) in
+# a single layer and remove the apt package lists afterwards, so no stale
+# lists end up in the image and `apt-get update` runs only once.
+RUN apt-get update && apt-get install -y \
+    portaudio19-dev \
+    libasound2-dev \
+    libpulse-dev \
+    sox \
+    libsox-dev \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+
+
+
+
+
+# Copy only requirements.txt first so the dependency layer stays cached.
+COPY requirements.txt requirements.txt
+
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the current directory contents into the container at /app
+COPY . .
+
+# Run main.py when the container launches
+CMD ["python","-u", "main.py"]
+#CMD ["gunicorn", "--worker-class", "eventlet", "-w", "1", "-b", "0.0.0.0:8000", "app:app"]
diff --git a/src/speech/README.md → speechToText/README.md b/src/speech/README.md → speechToText/README.md
diff --git a/src/__init__.py → speechToText/__init__.py b/src/__init__.py → speechToText/__init__.py