From 1ca99b6eb01fa7319de36d18dc0db0a7ebff68c7 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 22 Nov 2023 01:38:51 -0800
Subject: [PATCH 01/18] Add speech to text model configuration to Database

---
 src/khoj/database/adapters/__init__.py        |  5 +++
 src/khoj/database/admin.py                    |  2 ++
 .../0020_speechtotextmodeloptions_and_more.py | 35 +++++++++++++++++++
 src/khoj/database/models/__init__.py          | 11 +++++-
 src/khoj/utils/initialization.py              | 10 ++++++
 5 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py

diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py
index ea0c0a858..471f3605e 100644
--- a/src/khoj/database/adapters/__init__.py
+++ b/src/khoj/database/adapters/__init__.py
@@ -28,6 +28,7 @@
     Conversation,
     ChatModelOptions,
     SearchModelConfig,
+    SpeechToTextModelOptions,
     Subscription,
     UserConversationConfig,
     OpenAIProcessorConversationConfig,
@@ -339,6 +340,10 @@ async def get_openai_chat():
     async def get_openai_chat_config():
         return await OpenAIProcessorConversationConfig.objects.filter().afirst()
 
+    @staticmethod
+    async def get_speech_to_text_config():  # First stored speech to text model config; afirst() gives None if none
+        return await SpeechToTextModelOptions.objects.filter().afirst()
+
 
 class EntryAdapters:
     word_filer = WordFilter()
diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py
index 69f15b2f2..4383056f9 100644
--- a/src/khoj/database/admin.py
+++ b/src/khoj/database/admin.py
@@ -9,12 +9,14 @@
     OpenAIProcessorConversationConfig,
     OfflineChatProcessorConversationConfig,
     SearchModelConfig,
+    SpeechToTextModelOptions,
     Subscription,
 )
 
 admin.site.register(KhojUser, UserAdmin)
 
 admin.site.register(ChatModelOptions)
+admin.site.register(SpeechToTextModelOptions)
 admin.site.register(OpenAIProcessorConversationConfig)
 admin.site.register(OfflineChatProcessorConversationConfig)
 admin.site.register(SearchModelConfig)
diff --git a/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py b/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py
new file mode 100644
index 000000000..1185d298a
--- /dev/null
+++ b/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py
@@ -0,0 +1,35 @@
+# Generated by Django 4.2.7 on 2023-11-22 08:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # Adds SpeechToTextModelOptions; alters ChatModelOptions.chat_model
+    dependencies = [
+        ("database", "0019_alter_googleuser_family_name_and_more"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="SpeechToTextModelOptions",
+            fields=[
+                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                ("model_name", models.CharField(default="whisper-1", max_length=200)),
+                (
+                    "model_type",
+                    models.CharField(
+                        choices=[("openai", "Openai"), ("offline", "Offline")], default="openai", max_length=200
+                    ),
+                ),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+        migrations.AlterField(  # chat_model drops null/blank and now defaults to "gpt-4"
+            model_name="chatmodeloptions",
+            name="chat_model",
+            field=models.CharField(default="gpt-4", max_length=200),
+        ),
+    ]
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 92da3e6e8..8098a731f 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -120,6 +120,15 @@ class OfflineChatProcessorConversationConfig(BaseModel):
     enabled = models.BooleanField(default=False)
 
 
+class SpeechToTextModelOptions(BaseModel):  # Stored choice of speech to text model, read by the /speak API
+    class ModelType(models.TextChoices):  # Allowed values for model_type
+        OPENAI = "openai"
+        OFFLINE = "offline"
+
+    model_name = models.CharField(max_length=200, default="whisper-1")  # Passed as `model` to the OpenAI audio API
+    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI)
+
+
 class ChatModelOptions(BaseModel):
     class ModelType(models.TextChoices):
         OPENAI = "openai"
@@ -127,7 +136,7 @@ class ModelType(models.TextChoices):
 
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default=None, null=True, blank=True)
+    chat_model = models.CharField(max_length=200, default="gpt-4")
     model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI)
 
 
diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py
index ffc4d47eb..ee0454c49 100644
--- a/src/khoj/utils/initialization.py
+++ b/src/khoj/utils/initialization.py
@@ -6,6 +6,7 @@
     OfflineChatProcessorConversationConfig,
     OpenAIProcessorConversationConfig,
     ChatModelOptions,
+    SpeechToTextModelOptions,
 )
 
 from khoj.utils.constants import default_offline_chat_model, default_online_chat_model
@@ -94,6 +95,15 @@ def _create_chat_configuration():
                 chat_model=openai_chat_model, model_type=ChatModelOptions.ModelType.OPENAI, max_prompt_size=max_tokens
             )
 
+            default_speech2text_model = "whisper-1"
+            openai_speech2text_model = input(
+                f"Enter the OpenAI speech to text model you want to use (default: {default_speech2text_model}): "
+            )
+            openai_speech2text_model = openai_speech2text_model or default_speech2text_model
+            SpeechToTextModelOptions.objects.create(
+                model_name=openai_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OPENAI
+            )
+
         logger.info("🗣️  Chat model configuration complete")
 
     admin_user = KhojUser.objects.filter(is_staff=True).first()

From cc77bc4076624cee7084d9c0715ea400d42ebc1c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Tue, 21 Nov 2023 20:37:34 -0800
Subject: [PATCH 02/18] Create speech to text API endpoint. Use OpenAI whisper
 for ASR

- Wrap audio transcription in try/finally so the temporary audio file is
deleted after processing
- Use configured speech to text model, else respond with an error status
---
 src/khoj/routers/api.py | 53 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index f2e5c9662..0d0d4bb10 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -1,13 +1,16 @@
 # Standard Packages
 import concurrent.futures
 import math
+import os
 import time
 import logging
 import json
 from typing import Annotated, List, Optional, Union, Any
+import uuid
 
 # External Packages
-from fastapi import APIRouter, Depends, HTTPException, Header, Request
+from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
+import openai
 from starlette.authentication import requires
 from asgiref.sync import sync_to_async
 
@@ -553,6 +556,54 @@ async def chat_options(
     return Response(content=json.dumps(cmd_options), media_type="application/json", status_code=200)
 
 
+@api.post("/speak")
+@requires(["authenticated"])
+async def transcribe_audio(request: Request, common: CommonQueryParams, file: UploadFile = File(...)):
+    """Transcribe uploaded audio with the configured OpenAI speech to text model.
+
+    Returns 200 with {"text": ...}; 422 if no speech to text model or OpenAI config exists; else 500.
+    """
+    user: KhojUser = request.user.object
+    audio_filename = f"{user.uuid}-{str(uuid.uuid4())}.webm"
+    user_message: Optional[str] = None
+    status_code: Optional[int] = None  # Stays None for unsupported model types, so the response falls back to 500
+
+    speech_to_text_config = await ConversationAdapters.get_speech_to_text_config()
+    openai_chat_config = await ConversationAdapters.get_openai_chat_config()
+    if not openai_chat_config or not speech_to_text_config:
+        # If the user has not configured a speech to text model, return an unprocessable entity error
+        status_code = 422
+    elif speech_to_text_config.model_type == speech_to_text_config.ModelType.OPENAI:  # the model's own enum
+        try:
+            # Store the audio from the request in a temporary file
+            with open(audio_filename, "wb") as audio_file_writer:
+                audio_file_writer.write(await file.read())
+            # Send the audio data to the Whisper API. The translate endpoint returns English text
+            with open(audio_filename, "rb") as audio_file:
+                response = await sync_to_async(openai.Audio.translate)(
+                    model=speech_to_text_config.model_name, file=audio_file, api_key=openai_chat_config.api_key
+                )
+            user_message = response["text"]
+        finally:
+            # Delete the temporary audio file, if it got created, even when transcription raises
+            if os.path.exists(audio_filename):
+                os.remove(audio_filename)
+
+    if user_message is None:
+        return Response(status_code=status_code or 500)
+
+    update_telemetry_state(
+        request=request,
+        telemetry_type="api",
+        api="speech_to_text",
+        **common.__dict__,
+    )
+
+    # Return the spoken text
+    content = json.dumps({"text": user_message})
+    return Response(content=content, media_type="application/json", status_code=200)
+
+
 @api.get("/chat", response_class=Response)
 @requires(["authenticated"])
 async def chat(

From 2951fc92d7431e31734951aba7955cb7378f8d28 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Tue, 21 Nov 2023 23:38:36 -0800
Subject: [PATCH 03/18] Speak to Khoj from the Web client

- Use icons to style speech to text recording state
---
 .../web/assets/icons/microphone-solid.svg     |  1 +
 .../interface/web/assets/icons/stop-solid.svg | 37 ++++++++
 src/khoj/interface/web/chat.html              | 87 +++++++++++++++++--
 3 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 src/khoj/interface/web/assets/icons/microphone-solid.svg
 create mode 100644 src/khoj/interface/web/assets/icons/stop-solid.svg

diff --git a/src/khoj/interface/web/assets/icons/microphone-solid.svg b/src/khoj/interface/web/assets/icons/microphone-solid.svg
new file mode 100644
index 000000000..3fc4b91d2
--- /dev/null
+++ b/src/khoj/interface/web/assets/icons/microphone-solid.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Pro 6.4.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license (Commercial License) Copyright 2023 Fonticons, Inc. --><path d="M192 0C139 0 96 43 96 96V256c0 53 43 96 96 96s96-43 96-96V96c0-53-43-96-96-96zM64 216c0-13.3-10.7-24-24-24s-24 10.7-24 24v40c0 89.1 66.2 162.7 152 174.4V464H120c-13.3 0-24 10.7-24 24s10.7 24 24 24h72 72c13.3 0 24-10.7 24-24s-10.7-24-24-24H216V430.4c85.8-11.7 152-85.3 152-174.4V216c0-13.3-10.7-24-24-24s-24 10.7-24 24v40c0 70.7-57.3 128-128 128s-128-57.3-128-128V216z"/></svg>
diff --git a/src/khoj/interface/web/assets/icons/stop-solid.svg b/src/khoj/interface/web/assets/icons/stop-solid.svg
new file mode 100644
index 000000000..a9aaba284
--- /dev/null
+++ b/src/khoj/interface/web/assets/icons/stop-solid.svg
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg
+   viewBox="0 0 384 512"
+   version="1.1"
+   id="svg1"
+   sodipodi:docname="stop-solid.svg"
+   inkscape:version="1.3 (0e150ed, 2023-07-21)"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <defs
+     id="defs1" />
+  <sodipodi:namedview
+     id="namedview1"
+     pagecolor="#ffffff"
+     bordercolor="#000000"
+     borderopacity="0.25"
+     inkscape:showpageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:zoom="0.4609375"
+     inkscape:cx="192"
+     inkscape:cy="256"
+     inkscape:window-width="1312"
+     inkscape:window-height="449"
+     inkscape:window-x="0"
+     inkscape:window-y="88"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="svg1" />
+  <!--! Font Awesome Pro 6.4.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license (Commercial License) Copyright 2023 Fonticons, Inc. -->
+  <path
+     d="M0 128C0 92.7 28.7 64 64 64H320c35.3 0 64 28.7 64 64V384c0 35.3-28.7 64-64 64H64c-35.3 0-64-28.7-64-64V128z"
+     id="path1"
+     style="fill:#aa0000" />
+</svg>
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 1c661a926..d346294f1 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -369,6 +369,56 @@
                 chat();
             }
         }
+
+        let mediaRecorder;  // Active recorder, kept across clicks so the speak button can toggle it
+        function speechToText() {
+            const speakButton = document.getElementById('speak-button');
+            const speakButtonImg = document.getElementById('speak-button-img');
+            const chatInput = document.getElementById('chat-input');
+            // Upload the recorded audio and append the returned transcription to the chat input
+            const sendToServer = (audioBlob) => {
+                const formData = new FormData();
+                formData.append('file', audioBlob);
+
+                fetch('/api/speak?client=web', { method: 'POST', body: formData })
+                    .then(response => response.ok ? response.json() : Promise.reject(response))
+                    .then(data => { chatInput.value += data.text; })
+                    .catch(err => err.status == 422 ? console.error("Configure speech-to-text model on server.") : console.error("Failed to transcribe audio"));
+            };
+
+            const handleRecording = (stream) => {
+                const audioChunks = [];
+                const recordingConfig = { mimeType: 'audio/webm' };
+                mediaRecorder = new MediaRecorder(stream, recordingConfig);
+                // Collect audio data as the recorder emits it
+                mediaRecorder.addEventListener("dataavailable", function(event) {
+                    if (event.data.size > 0) audioChunks.push(event.data);
+                });
+                // On stop, release the microphone (otherwise the stream stays live) and upload the audio
+                mediaRecorder.addEventListener("stop", function() {
+                    stream.getTracks().forEach(track => track.stop());
+                    sendToServer(new Blob(audioChunks, { type: 'audio/webm' }));
+                });
+
+                mediaRecorder.start();
+                speakButtonImg.src = '/static/assets/icons/stop-solid.svg';
+                speakButtonImg.alt = 'Stop Speaking';
+            };
+
+            // Toggle recording
+            if (!mediaRecorder || mediaRecorder.state === 'inactive') {
+                navigator.mediaDevices
+                .getUserMedia({ audio: true })
+                .then(handleRecording)
+                .catch((e) => {
+                    console.error(e);
+                });
+            } else if (mediaRecorder.state === 'recording') {
+                mediaRecorder.stop();
+                speakButtonImg.src = '/static/assets/icons/microphone-solid.svg';
+                speakButtonImg.alt = 'Speak';
+            }
+        }
     </script>
     <body>
         <div id="khoj-empty-container" class="khoj-empty-container">
@@ -384,7 +434,12 @@
         <!-- Chat Footer -->
         <div id="chat-footer">
             <div id="chat-tooltip" style="display: none;"></div>
-            <textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
+            <div id="input-row">
+                <textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
+                <button id="speak-button" onclick="speechToText()">
+                    <img id="speak-button-img" src="/static/assets/icons/microphone-solid.svg" alt="Speak"></img>
+                </button>
+            </div>
         </div>
     </body>
     <script>
@@ -580,15 +635,17 @@
 
         #chat-footer {
             padding: 0;
+            margin: 8px;
             display: grid;
             grid-template-columns: minmax(70px, 100%);
             grid-column-gap: 10px;
             grid-row-gap: 10px;
         }
-        #chat-footer > * {
-            padding: 15px;
-            border-radius: 5px;
-            border: 1px solid var(--main-text-color);
+        #input-row {
+            display: grid;
+            grid-template-columns: auto 32px;
+            grid-column-gap: 10px;
+            grid-row-gap: 10px;
             background: #f9fafc
         }
         .option:hover {
@@ -609,6 +666,26 @@
         #chat-input:focus {
             outline: none !important;
         }
+        #speak-button {
+            background: var(--background-color);
+            border: none;
+            border-radius: 5px;
+            padding: 5px;
+            font-size: 14px;
+            font-weight: 300;
+            line-height: 1.5em;
+            cursor: pointer;
+            transition: background 0.3s ease-in-out;
+        }
+        #speak-button:hover {
+            background: var(--primary-hover);
+        }
+        #speak-button:active {
+            background: var(--primary-active);
+        }
+        #speak-button-img {
+            width: 24px;
+        }
 
         .option-enabled {
             box-shadow: 0 0 12px rgb(119, 156, 46);

From 63675b32992c3f518daf827c13d0ac161e89dce4 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 22 Nov 2023 02:19:22 -0800
Subject: [PATCH 04/18] Speak to Khoj from the Desktop client

- Use icons to style speech to text recording state
---
 .../desktop/assets/icons/microphone-solid.svg |  1 +
 .../desktop/assets/icons/stop-solid.svg       | 37 ++++++++
 src/interface/desktop/chat.html               | 93 ++++++++++++++++++-
 3 files changed, 126 insertions(+), 5 deletions(-)
 create mode 100644 src/interface/desktop/assets/icons/microphone-solid.svg
 create mode 100644 src/interface/desktop/assets/icons/stop-solid.svg

diff --git a/src/interface/desktop/assets/icons/microphone-solid.svg b/src/interface/desktop/assets/icons/microphone-solid.svg
new file mode 100644
index 000000000..3fc4b91d2
--- /dev/null
+++ b/src/interface/desktop/assets/icons/microphone-solid.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Pro 6.4.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license (Commercial License) Copyright 2023 Fonticons, Inc. --><path d="M192 0C139 0 96 43 96 96V256c0 53 43 96 96 96s96-43 96-96V96c0-53-43-96-96-96zM64 216c0-13.3-10.7-24-24-24s-24 10.7-24 24v40c0 89.1 66.2 162.7 152 174.4V464H120c-13.3 0-24 10.7-24 24s10.7 24 24 24h72 72c13.3 0 24-10.7 24-24s-10.7-24-24-24H216V430.4c85.8-11.7 152-85.3 152-174.4V216c0-13.3-10.7-24-24-24s-24 10.7-24 24v40c0 70.7-57.3 128-128 128s-128-57.3-128-128V216z"/></svg>
diff --git a/src/interface/desktop/assets/icons/stop-solid.svg b/src/interface/desktop/assets/icons/stop-solid.svg
new file mode 100644
index 000000000..a9aaba284
--- /dev/null
+++ b/src/interface/desktop/assets/icons/stop-solid.svg
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg
+   viewBox="0 0 384 512"
+   version="1.1"
+   id="svg1"
+   sodipodi:docname="stop-solid.svg"
+   inkscape:version="1.3 (0e150ed, 2023-07-21)"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <defs
+     id="defs1" />
+  <sodipodi:namedview
+     id="namedview1"
+     pagecolor="#ffffff"
+     bordercolor="#000000"
+     borderopacity="0.25"
+     inkscape:showpageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:zoom="0.4609375"
+     inkscape:cx="192"
+     inkscape:cy="256"
+     inkscape:window-width="1312"
+     inkscape:window-height="449"
+     inkscape:window-x="0"
+     inkscape:window-y="88"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="svg1" />
+  <!--! Font Awesome Pro 6.4.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license (Commercial License) Copyright 2023 Fonticons, Inc. -->
+  <path
+     d="M0 128C0 92.7 28.7 64 64 64H320c35.3 0 64 28.7 64 64V384c0 35.3-28.7 64-64 64H64c-35.3 0-64-28.7-64-64V128z"
+     id="path1"
+     style="fill:#aa0000" />
+</svg>
diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html
index 4997ef99f..6c6d1ca14 100644
--- a/src/interface/desktop/chat.html
+++ b/src/interface/desktop/chat.html
@@ -377,6 +377,62 @@
                 chat();
             }
         }
+
+        let mediaRecorder;  // Active recorder, kept across clicks so the speak button can toggle it
+        async function speechToText() {
+            const speakButton = document.getElementById('speak-button');
+            const speakButtonImg = document.getElementById('speak-button-img');
+            const chatInput = document.getElementById('chat-input');
+
+            const hostURL = await window.hostURLAPI.getURL();
+            let url = `${hostURL}/api/speak?client=desktop`;
+            const khojToken = await window.tokenAPI.getToken();
+            const headers = { 'Authorization': `Bearer ${khojToken}` };
+            // Upload the recorded audio and append the returned transcription to the chat input
+            const sendToServer = (audioBlob) => {
+                const formData = new FormData();
+                formData.append('file', audioBlob);
+
+                fetch(url, { method: 'POST', body: formData, headers})
+                    .then(response => response.ok ? response.json() : Promise.reject(response))
+                    .then(data => { chatInput.value += data.text; })
+                    .catch(err => err.status == 422 ? console.error("Configure speech-to-text model on server.") : console.error("Failed to transcribe audio"));
+            };
+
+            const handleRecording = (stream) => {
+                const audioChunks = [];
+                const recordingConfig = { mimeType: 'audio/webm' };
+                mediaRecorder = new MediaRecorder(stream, recordingConfig);
+                // Collect audio data as the recorder emits it
+                mediaRecorder.addEventListener("dataavailable", function(event) {
+                    if (event.data.size > 0) audioChunks.push(event.data);
+                });
+                // On stop, release the microphone (otherwise the stream stays live) and upload the audio
+                mediaRecorder.addEventListener("stop", function() {
+                    stream.getTracks().forEach(track => track.stop());
+                    sendToServer(new Blob(audioChunks, { type: 'audio/webm' }));
+                });
+
+                mediaRecorder.start();
+                speakButtonImg.src = './assets/icons/stop-solid.svg';
+                speakButtonImg.alt = 'Stop Speaking';
+            };
+
+            // Toggle recording
+            if (!mediaRecorder || mediaRecorder.state === 'inactive') {
+                navigator.mediaDevices
+                .getUserMedia({ audio: true })
+                .then(handleRecording)
+                .catch((e) => {
+                    console.error(e);
+                });
+            } else if (mediaRecorder.state === 'recording') {
+                mediaRecorder.stop();
+                speakButtonImg.src = './assets/icons/microphone-solid.svg';
+                speakButtonImg.alt = 'Speak';
+            }
+        }
+
     </script>
     <body>
         <div id="khoj-empty-container" class="khoj-empty-container">
@@ -400,7 +456,12 @@
         <!-- Chat Footer -->
         <div id="chat-footer">
             <div id="chat-tooltip" style="display: none;"></div>
-            <textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
+            <div id="input-row">
+                <textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
+                <button id="speak-button" onclick="speechToText()">
+                    <img id="speak-button-img" src="./assets/icons/microphone-solid.svg" alt="Speak"></img>
+                </button>
+            </div>
         </div>
     </body>
 
@@ -514,15 +575,17 @@
 
         #chat-footer {
             padding: 0;
+            margin: 8px;
             display: grid;
             grid-template-columns: minmax(70px, 100%);
             grid-column-gap: 10px;
             grid-row-gap: 10px;
         }
-        #chat-footer > * {
-            padding: 15px;
-            border-radius: 5px;
-            border: 1px solid #475569;
+        #input-row {
+            display: grid;
+            grid-template-columns: auto 32px;
+            grid-column-gap: 10px;
+            grid-row-gap: 10px;
             background: #f9fafc
         }
         .option:hover {
@@ -543,6 +606,26 @@
         #chat-input:focus {
             outline: none !important;
         }
+        #speak-button {
+            background: var(--background-color);
+            border: none;
+            border-radius: 5px;
+            padding: 5px;
+            font-size: 14px;
+            font-weight: 300;
+            line-height: 1.5em;
+            cursor: pointer;
+            transition: background 0.3s ease-in-out;
+        }
+        #speak-button:hover {
+            background: var(--primary-hover);
+        }
+        #speak-button:active {
+            background: var(--primary-active);
+        }
+        #speak-button-img {
+            width: 24px;
+        }
 
         .option-enabled {
             box-shadow: 0 0 12px rgb(119, 156, 46);

From 3e252036c3df48dc0c5d097b09f0ea4b00642f5f Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 00:27:29 -0800
Subject: [PATCH 05/18] Remove whitespace: pre-line from chat html, since
 markdown rendering

---
 src/interface/desktop/chat.html  | 2 --
 src/khoj/interface/web/chat.html | 2 --
 2 files changed, 4 deletions(-)

diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html
index 41e185d11..82ab0f16d 100644
--- a/src/interface/desktop/chat.html
+++ b/src/interface/desktop/chat.html
@@ -679,7 +679,6 @@
         .chat-message.you {
             margin-right: auto;
             text-align: right;
-            white-space: pre-line;
         }
         /* basic style chat message text */
         .chat-message-text {
@@ -696,7 +695,6 @@
             color: var(--primary-inverse);
             background: var(--primary);
             margin-left: auto;
-            white-space: pre-line;
         }
         /* Spinner symbol when the chat message is loading */
         .spinner {
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 7ca51fcce..641c983ec 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -802,7 +802,6 @@
         .chat-message.you {
             margin-right: auto;
             text-align: right;
-            white-space: pre-line;
         }
         /* basic style chat message text */
         .chat-message-text {
@@ -819,7 +818,6 @@
             color: var(--primary-inverse);
             background: var(--primary);
             margin-left: auto;
-            white-space: pre-line;
         }
         /* Spinner symbol when the chat message is loading */
         .spinner {

From 5a6547677ca681db7b6b72009ad96c55976e0d05 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 00:38:18 -0800
Subject: [PATCH 06/18] Add type of operation variable in latest migration

---
 src/khoj/database/migrations/0021_merge_20231126_0650.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/khoj/database/migrations/0021_merge_20231126_0650.py b/src/khoj/database/migrations/0021_merge_20231126_0650.py
index 579c00726..78fd2e523 100644
--- a/src/khoj/database/migrations/0021_merge_20231126_0650.py
+++ b/src/khoj/database/migrations/0021_merge_20231126_0650.py
@@ -9,4 +9,4 @@ class Migration(migrations.Migration):
         ("database", "0020_speechtotextmodeloptions_and_more"),
     ]
 
-    operations = []
+    operations: list = []

From fc040825b27c6fd9218dc2906295d0ba23b855f9 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 01:07:20 -0800
Subject: [PATCH 07/18] Default to Offline chat with Mistral as minimal setup,
 no API key reqd.

---
 src/khoj/database/models/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 7b28521bb..77478ef53 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -136,8 +136,8 @@ class ModelType(models.TextChoices):
 
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default="gpt-4")
-    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI)
+    chat_model = models.CharField(max_length=200, default="mistral-7b-instruct-v0.1.Q4_0.gguf")
+    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
 
 
 class UserConversationConfig(BaseModel):

From 28090216f634eb961f6673df06967d1037be5d94 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 01:08:38 -0800
Subject: [PATCH 08/18] Show transcription error status in chatInput
 placeholder on web, desktop

- Extract flashing status message in chat input placeholder into
  reusable function
- Use emoji prefixes for status messages
- Improve alt text of transcribe button to indicate what the button does
---
 src/interface/desktop/chat.html  | 35 +++++++++++++++++++++-----------
 src/khoj/interface/web/chat.html | 35 +++++++++++++++++++++-----------
 2 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html
index 82ab0f16d..958cb5d92 100644
--- a/src/interface/desktop/chat.html
+++ b/src/interface/desktop/chat.html
@@ -516,6 +516,18 @@
             }
         }
 
+        function flashStatusInChatInput(message) {
+            // Capture the real placeholder only when no flash is pending, else back-to-back flashes leave a status stuck
+            let chatInput = document.getElementById("chat-input");
+            if (!flashStatusInChatInput.resetTimer) flashStatusInChatInput.originalPlaceholder = chatInput.placeholder;
+            chatInput.placeholder = message;
+            clearTimeout(flashStatusInChatInput.resetTimer);  // restart the 2 second countdown for the latest message
+            flashStatusInChatInput.resetTimer = setTimeout(() => {
+                flashStatusInChatInput.resetTimer = null;
+                chatInput.placeholder = flashStatusInChatInput.originalPlaceholder;
+            }, 2000);
+        }
+
         async function clearConversationHistory() {
             let chatInput = document.getElementById("chat-input");
             let originalPlaceholder = chatInput.placeholder;
@@ -530,16 +542,11 @@
                 .then(data => {
                     chatBody.innerHTML = "";
                     loadChat();
-                    chatInput.placeholder = "Cleared conversation history";
+                    flashStatusInChatInput("🗑 Cleared conversation history");
                 })
                 .catch(err => {
-                    chatInput.placeholder = "Failed to clear conversation history";
+                    flashStatusInChatInput("⛔️ Failed to clear conversation history");
                 })
-                .finally(() => {
-                    setTimeout(() => {
-                        chatInput.placeholder = originalPlaceholder;
-                    }, 2000);
-                });
         }
 
         let mediaRecorder;
@@ -560,7 +567,11 @@
                 fetch(url, { method: 'POST', body: formData, headers})
                     .then(response => response.ok ? response.json() : Promise.reject(response))
                     .then(data => { chatInput.value += data.text; })
-                    .catch(err => err.status == 422 ? console.error("Configure speech-to-text model on server.") : console.error("Failed to transcribe audio"));
+                    .catch(err => {
+                        err.status == 422
+                        ? flashStatusInChatInput("⛔️ Configure speech-to-text model on server.")
+                        : flashStatusInChatInput("⛔️ Failed to transcribe audio")
+                    });
             };
 
             const handleRecording = (stream) => {
@@ -579,7 +590,7 @@
 
                 mediaRecorder.start();
                 speakButtonImg.src = './assets/icons/stop-solid.svg';
-                speakButtonImg.alt = 'Stop Speaking';
+                speakButtonImg.alt = 'Stop Transcription';
             };
 
             // Toggle recording
@@ -588,12 +599,12 @@
                 .getUserMedia({ audio: true })
                 .then(handleRecording)
                 .catch((e) => {
-                    console.error(e);
+                    flashStatusInChatInput("⛔️ Failed to access microphone");
                 });
             } else if (mediaRecorder.state === 'recording') {
                 mediaRecorder.stop();
                 speakButtonImg.src = './assets/icons/microphone-solid.svg';
-                speakButtonImg.alt = 'Speak';
+                speakButtonImg.alt = 'Transcribe';
             }
         }
 
@@ -626,7 +637,7 @@
             <div id="input-row">
                 <textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
                 <button id="speak-button" class="input-row-button" onclick="speechToText()">
-                    <img id="speak-button-img" src="input-row-button-img" src="./assets/icons/microphone-solid.svg" alt="Speak"></img>
+                    <img id="speak-button-img" class="input-row-button-img" src="./assets/icons/microphone-solid.svg" alt="Transcribe"></img>
                 </button>
                 <button id="clear-chat" class="input-row-button" onclick="clearConversationHistory()">
                     <img class="input-row-button-img" src="./assets/icons/trash-solid.svg" alt="Clear Chat History"></img>
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 641c983ec..920cacc45 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -543,6 +543,18 @@
             }
         }
 
+        function flashStatusInChatInput(message) {
+            // Get chat input element and original placeholder
+            let chatInput = document.getElementById("chat-input");
+            let originalPlaceholder = chatInput.placeholder;
+            // Set placeholder to message
+            chatInput.placeholder = message;
+            // Reset placeholder after 2 seconds
+            setTimeout(() => {
+                chatInput.placeholder = originalPlaceholder;
+            }, 2000);
+        }
+
         function clearConversationHistory() {
             let chatInput = document.getElementById("chat-input");
             let originalPlaceholder = chatInput.placeholder;
@@ -553,15 +565,10 @@
                 .then(data => {
                     chatBody.innerHTML = "";
                     loadChat();
-                    chatInput.placeholder = "Cleared conversation history";
+                    flashStatusInChatInput("🗑 Cleared conversation history");
                 })
                 .catch(err => {
-                    chatInput.placeholder = "Failed to clear conversation history";
-                })
-                .finally(() => {
-                    setTimeout(() => {
-                        chatInput.placeholder = originalPlaceholder;
-                    }, 2000);
+                    flashStatusInChatInput("⛔️ Failed to clear conversation history");
                 });
         }
 
@@ -578,7 +585,11 @@
                 fetch('/api/speak?client=web', { method: 'POST', body: formData })
                     .then(response => response.ok ? response.json() : Promise.reject(response))
                     .then(data => { chatInput.value += data.text; })
-                    .catch(err => err.status == 422 ? console.error("Configure speech-to-text model on server.") : console.error("Failed to transcribe audio"));
+                    .catch(err => {
+                        err.status == 422
+                        ? flashStatusInChatInput("⛔️ Configure speech-to-text model on server.")
+                        : flashStatusInChatInput("⛔️ Failed to transcribe audio")
+                    });
             };
 
             const handleRecording = (stream) => {
@@ -597,7 +608,7 @@
 
                 mediaRecorder.start();
                 speakButtonImg.src = '/static/assets/icons/stop-solid.svg';
-                speakButtonImg.alt = 'Stop Speaking';
+                speakButtonImg.alt = 'Stop Transcription';
             };
 
             // Toggle recording
@@ -606,12 +617,12 @@
                 .getUserMedia({ audio: true })
                 .then(handleRecording)
                 .catch((e) => {
-                    console.error(e);
+                    flashStatusInChatInput("⛔️ Failed to access microphone");
                 });
             } else if (mediaRecorder.state === 'recording') {
                 mediaRecorder.stop();
                 speakButtonImg.src = '/static/assets/icons/microphone-solid.svg';
-                speakButtonImg.alt = 'Speak';
+                speakButtonImg.alt = 'Transcribe';
             }
         }
     </script>
@@ -635,7 +646,7 @@
             <div id="input-row">
                 <textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
                 <button id="speak-button" class="input-row-button" onclick="speechToText()">
-                    <img id="speak-button-img" class="input-row-button-img" src="/static/assets/icons/microphone-solid.svg" alt="Speak"></img>
+                    <img id="speak-button-img" class="input-row-button-img" src="/static/assets/icons/microphone-solid.svg" alt="Transcribe"></img>
                 </button>
                 <button class="input-row-button" onclick="clearConversationHistory()">
                     <img class="input-row-button-img" src="/static/assets/icons/trash-solid.svg" alt="Clear Chat History"></img>

From 897170ab15dc2d136004f52bcb89a9fa096452ea Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 01:55:59 -0800
Subject: [PATCH 09/18] Use single db migration script for transcribe model,
 related updates

---
 .../database/migrations/0021_merge_20231126_0650.py | 12 ------------
 ...py => 0021_speechtotextmodeloptions_and_more.py} | 13 ++++++++++---
 2 files changed, 10 insertions(+), 15 deletions(-)
 delete mode 100644 src/khoj/database/migrations/0021_merge_20231126_0650.py
 rename src/khoj/database/migrations/{0020_speechtotextmodeloptions_and_more.py => 0021_speechtotextmodeloptions_and_more.py} (68%)

diff --git a/src/khoj/database/migrations/0021_merge_20231126_0650.py b/src/khoj/database/migrations/0021_merge_20231126_0650.py
deleted file mode 100644
index 78fd2e523..000000000
--- a/src/khoj/database/migrations/0021_merge_20231126_0650.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Generated by Django 4.2.7 on 2023-11-26 06:50
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("database", "0020_reflectivequestion"),
-        ("database", "0020_speechtotextmodeloptions_and_more"),
-    ]
-
-    operations: list = []
diff --git a/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py b/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
similarity index 68%
rename from src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py
rename to src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
index 1185d298a..c3e3c41dd 100644
--- a/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py
+++ b/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
@@ -1,11 +1,11 @@
-# Generated by Django 4.2.7 on 2023-11-22 08:51
+# Generated by Django 4.2.7 on 2023-11-26 09:37
 
 from django.db import migrations, models
 
 
 class Migration(migrations.Migration):
     dependencies = [
-        ("database", "0019_alter_googleuser_family_name_and_more"),
+        ("database", "0020_reflectivequestion"),
     ]
 
     operations = [
@@ -30,6 +30,13 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name="chatmodeloptions",
             name="chat_model",
-            field=models.CharField(default="gpt-4", max_length=200),
+            field=models.CharField(default="mistral-7b-instruct-v0.1.Q4_0.gguf", max_length=200),
+        ),
+        migrations.AlterField(
+            model_name="chatmodeloptions",
+            name="model_type",
+            field=models.CharField(
+                choices=[("openai", "Openai"), ("offline", "Offline")], default="offline", max_length=200
+            ),
         ),
     ]

From 499adf86a0be55373d72399854e3c3ce7feb0d73 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 02:32:15 -0800
Subject: [PATCH 10/18] Move transcription using OpenAI API into independent
 package

---
 src/khoj/processor/conversation/openai/whisper.py | 15 +++++++++++++++
 src/khoj/routers/api.py                           | 11 ++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)
 create mode 100644 src/khoj/processor/conversation/openai/whisper.py

diff --git a/src/khoj/processor/conversation/openai/whisper.py b/src/khoj/processor/conversation/openai/whisper.py
new file mode 100644
index 000000000..6690c2360
--- /dev/null
+++ b/src/khoj/processor/conversation/openai/whisper.py
@@ -0,0 +1,15 @@
+# Standard Packages
+from io import BufferedReader
+
+# External Packages
+from asgiref.sync import sync_to_async
+import openai
+
+
+async def transcribe_audio(audio_file: BufferedReader, model, api_key) -> str | None:
+    """
+    Transcribe audio file using Whisper model via OpenAI's API
+    """
+    # Send the audio data to the Whisper API
+    response = await sync_to_async(openai.Audio.translate)(model=model, file=audio_file, api_key=api_key)
+    return response["text"]
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 7ac4f5ec6..95b257ce0 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -10,10 +10,9 @@
 
 # External Packages
 from asgiref.sync import sync_to_async
-from fastapi import APIRouter, Depends, File, Header, HTTPException, Request, UploadFile
+from fastapi import APIRouter, Depends, File, HTTPException, Request, UploadFile
 from fastapi.requests import Request
 from fastapi.responses import Response, StreamingResponse
-import openai
 from starlette.authentication import requires
 
 # Internal Packages
@@ -33,6 +32,7 @@
 )
 from khoj.processor.conversation.gpt4all.chat_model import extract_questions_offline
 from khoj.processor.conversation.openai.gpt import extract_questions
+from khoj.processor.conversation.openai.whisper import transcribe_audio
 from khoj.processor.conversation.prompts import help_message, no_entries_found
 from khoj.processor.tools.online_search import search_with_google
 from khoj.routers.helpers import (
@@ -589,7 +589,7 @@ async def chat_options(
 
 @api.post("/speak")
 @requires(["authenticated"])
-async def transcribe_audio(request: Request, common: CommonQueryParams, file: UploadFile = File(...)):
+async def transcribe(request: Request, common: CommonQueryParams, file: UploadFile = File(...)):
     user: KhojUser = request.user.object
     audio_filename = f"{user.uuid}-{str(uuid.uuid4())}.webm"
     user_message: str = None
@@ -611,10 +611,7 @@ async def transcribe_audio(request: Request, common: CommonQueryParams, file: Up
         elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI:
             api_key = openai_chat_config.api_key
             speech2text_model = speech_to_text_config.model_name
-            response = await sync_to_async(openai.Audio.translate)(
-                model=speech2text_model, file=audio_file, api_key=api_key
-            )
-            user_message = response["text"]
+            user_message = await transcribe_audio(model=speech2text_model, audio_file=audio_file, api_key=api_key)
     finally:
         # Close and Delete the temporary audio file
         audio_file.close()

From a0a7ab7ec82e5975812f2367e3bd7ebdf762f63d Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 02:34:23 -0800
Subject: [PATCH 11/18] Rename conversation.gpt4all package to
 conversation.offline

---
 .../processor/conversation/{gpt4all => offline}/__init__.py   | 0
 .../processor/conversation/{gpt4all => offline}/chat_model.py | 0
 src/khoj/processor/conversation/{gpt4all => offline}/utils.py | 0
 src/khoj/routers/api.py                                       | 2 +-
 src/khoj/routers/helpers.py                                   | 2 +-
 src/khoj/utils/config.py                                      | 2 +-
 tests/test_gpt4all_chat_actors.py                             | 4 ++--
 7 files changed, 5 insertions(+), 5 deletions(-)
 rename src/khoj/processor/conversation/{gpt4all => offline}/__init__.py (100%)
 rename src/khoj/processor/conversation/{gpt4all => offline}/chat_model.py (100%)
 rename src/khoj/processor/conversation/{gpt4all => offline}/utils.py (100%)

diff --git a/src/khoj/processor/conversation/gpt4all/__init__.py b/src/khoj/processor/conversation/offline/__init__.py
similarity index 100%
rename from src/khoj/processor/conversation/gpt4all/__init__.py
rename to src/khoj/processor/conversation/offline/__init__.py
diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py
similarity index 100%
rename from src/khoj/processor/conversation/gpt4all/chat_model.py
rename to src/khoj/processor/conversation/offline/chat_model.py
diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/offline/utils.py
similarity index 100%
rename from src/khoj/processor/conversation/gpt4all/utils.py
rename to src/khoj/processor/conversation/offline/utils.py
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 95b257ce0..67b959a76 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -30,7 +30,7 @@
     LocalPlaintextConfig,
     NotionConfig,
 )
-from khoj.processor.conversation.gpt4all.chat_model import extract_questions_offline
+from khoj.processor.conversation.offline.chat_model import extract_questions_offline
 from khoj.processor.conversation.openai.gpt import extract_questions
 from khoj.processor.conversation.openai.whisper import transcribe_audio
 from khoj.processor.conversation.prompts import help_message, no_entries_found
diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index c6fcb4364..e1ab05b51 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -15,7 +15,7 @@
 from khoj.database.adapters import ConversationAdapters
 from khoj.database.models import KhojUser, Subscription
 from khoj.processor.conversation import prompts
-from khoj.processor.conversation.gpt4all.chat_model import converse_offline, send_message_to_model_offline
+from khoj.processor.conversation.offline.chat_model import converse_offline, send_message_to_model_offline
 from khoj.processor.conversation.openai.gpt import converse, send_message_to_model
 from khoj.processor.conversation.utils import ThreadedGenerator, message_to_log
 
diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index 7795d695d..aba6366a8 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -11,7 +11,7 @@
 import torch
 
 # Internal Packages
-from khoj.processor.conversation.gpt4all.utils import download_model
+from khoj.processor.conversation.offline.utils import download_model
 
 
 logger = logging.getLogger(__name__)
diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py
index 782b54f20..7b59e1e3f 100644
--- a/tests/test_gpt4all_chat_actors.py
+++ b/tests/test_gpt4all_chat_actors.py
@@ -19,8 +19,8 @@
     print("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
 
 # Internal Packages
-from khoj.processor.conversation.gpt4all.chat_model import converse_offline, extract_questions_offline, filter_questions
-from khoj.processor.conversation.gpt4all.utils import download_model
+from khoj.processor.conversation.offline.chat_model import converse_offline, extract_questions_offline, filter_questions
+from khoj.processor.conversation.offline.utils import download_model
 
 from khoj.processor.conversation.utils import message_to_log
 

From 4636390f7fc4f289ad83372adc5902848b2a1ff2 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 03:37:45 -0800
Subject: [PATCH 12/18] Transcribe speech to text offline with Whisper

- Allow server admin to configure offline speech to text model during
  initialization
- Use offline speech to text model to transcribe audio from clients
- Set offline Whisper as the default speech to text model as it needs no API key setup
---
 pyproject.toml                                |  1 +
 .../0021_speechtotextmodeloptions_and_more.py |  6 ++---
 src/khoj/database/models/__init__.py          |  4 +--
 .../processor/conversation/offline/whisper.py | 17 +++++++++++++
 src/khoj/routers/api.py                       | 10 +++++---
 src/khoj/utils/initialization.py              | 25 ++++++++++++++++---
 src/khoj/utils/state.py                       |  1 +
 7 files changed, 52 insertions(+), 12 deletions(-)
 create mode 100644 src/khoj/processor/conversation/offline/whisper.py

diff --git a/pyproject.toml b/pyproject.toml
index 63a50fac7..42adf2099 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,6 +75,7 @@ dependencies = [
     "tzdata == 2023.3",
     "rapidocr-onnxruntime == 1.3.8",
     "stripe == 7.3.0",
+    "openai-whisper >= 20231117",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py b/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
index c3e3c41dd..373377915 100644
--- a/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
+++ b/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
@@ -1,4 +1,4 @@
-# Generated by Django 4.2.7 on 2023-11-26 09:37
+# Generated by Django 4.2.7 on 2023-11-26 13:54
 
 from django.db import migrations, models
 
@@ -15,11 +15,11 @@ class Migration(migrations.Migration):
                 ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                 ("created_at", models.DateTimeField(auto_now_add=True)),
                 ("updated_at", models.DateTimeField(auto_now=True)),
-                ("model_name", models.CharField(default="whisper-1", max_length=200)),
+                ("model_name", models.CharField(default="base", max_length=200)),
                 (
                     "model_type",
                     models.CharField(
-                        choices=[("openai", "Openai"), ("offline", "Offline")], default="openai", max_length=200
+                        choices=[("openai", "Openai"), ("offline", "Offline")], default="offline", max_length=200
                     ),
                 ),
             ],
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 77478ef53..82348fbe6 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -125,8 +125,8 @@ class ModelType(models.TextChoices):
         OPENAI = "openai"
         OFFLINE = "offline"
 
-    model_name = models.CharField(max_length=200, default="whisper-1")
-    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI)
+    model_name = models.CharField(max_length=200, default="base")
+    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
 
 
 class ChatModelOptions(BaseModel):
diff --git a/src/khoj/processor/conversation/offline/whisper.py b/src/khoj/processor/conversation/offline/whisper.py
new file mode 100644
index 000000000..d22486a9a
--- /dev/null
+++ b/src/khoj/processor/conversation/offline/whisper.py
@@ -0,0 +1,17 @@
+# External Packages
+from asgiref.sync import sync_to_async
+import whisper
+
+# Internal Packages
+from khoj.utils import state
+
+
+async def transcribe_audio_offline(audio_filename: str, model: str) -> str | None:
+    """
+    Transcribe audio file offline using Whisper
+    """
+    # Load and cache the Whisper model on first use, then transcribe the audio file locally
+    if not state.whisper_model:
+        state.whisper_model = whisper.load_model(model)
+    response = await sync_to_async(state.whisper_model.transcribe)(audio_filename)
+    return response["text"]
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 67b959a76..9f1b118ef 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -31,6 +31,7 @@
     NotionConfig,
 )
 from khoj.processor.conversation.offline.chat_model import extract_questions_offline
+from khoj.processor.conversation.offline.whisper import transcribe_audio_offline
 from khoj.processor.conversation.openai.gpt import extract_questions
 from khoj.processor.conversation.openai.whisper import transcribe_audio
 from khoj.processor.conversation.prompts import help_message, no_entries_found
@@ -605,13 +606,16 @@ async def transcribe(request: Request, common: CommonQueryParams, file: UploadFi
         # Send the audio data to the Whisper API
         speech_to_text_config = await ConversationAdapters.get_speech_to_text_config()
         openai_chat_config = await ConversationAdapters.get_openai_chat_config()
-        if not openai_chat_config or not speech_to_text_config:
+        if not speech_to_text_config:
             # If the user has not configured a speech to text model, return an unprocessable entity error
             status_code = 422
-        elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI:
+        elif openai_chat_config and speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI:
             api_key = openai_chat_config.api_key
             speech2text_model = speech_to_text_config.model_name
-            user_message = await transcribe_audio(model=speech2text_model, audio_file=audio_file, api_key=api_key)
+            user_message = await transcribe_audio(audio_file, model=speech2text_model, api_key=api_key)
+        elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OFFLINE:
+            speech2text_model = speech_to_text_config.model_name
+            user_message = await transcribe_audio_offline(audio_filename, model=speech2text_model)
     finally:
         # Close and Delete the temporary audio file
         audio_file.close()
diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py
index ee0454c49..313b18fcd 100644
--- a/src/khoj/utils/initialization.py
+++ b/src/khoj/utils/initialization.py
@@ -74,10 +74,9 @@ def _create_chat_configuration():
         except ModuleNotFoundError as e:
             logger.warning("Offline models are not supported on this device.")
 
-        use_openai_model = input("Use OpenAI chat model? (y/n): ")
-
+        use_openai_model = input("Use OpenAI models? (y/n): ")
         if use_openai_model == "y":
-            logger.info("🗣️ Setting up OpenAI chat model")
+            logger.info("🗣️ Setting up your OpenAI configuration")
             api_key = input("Enter your OpenAI API key: ")
             OpenAIProcessorConversationConfig.objects.create(api_key=api_key)
 
@@ -104,7 +103,25 @@ def _create_chat_configuration():
                 model_name=openai_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OPENAI
             )
 
-        logger.info("🗣️  Chat model configuration complete")
+        if use_offline_model == "y" or use_openai_model == "y":
+            logger.info("🗣️  Chat model configuration complete")
+
+        use_offline_speech2text_model = input("Use offline speech to text model? (y/n): ")
+        if use_offline_speech2text_model == "y":
+            logger.info("🗣️ Setting up offline speech to text model")
+            # Delete any existing speech to text model options. There can only be one.
+            SpeechToTextModelOptions.objects.all().delete()
+
+            default_offline_speech2text_model = "base"
+            offline_speech2text_model = input(
+                f"Enter the Whisper model to use Offline (default: {default_offline_speech2text_model}): "
+            )
+            offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model
+            SpeechToTextModelOptions.objects.create(
+                model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE
+            )
+
+            logger.info(f"🗣️  Offline speech to text model configured to {offline_speech2text_model}")
 
     admin_user = KhojUser.objects.filter(is_staff=True).first()
     if admin_user is None:
diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py
index 91f5f0cee..ce4d5804c 100644
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@@ -21,6 +21,7 @@
 cross_encoder_model: CrossEncoderModel = None
 content_index = ContentIndex()
 gpt4all_processor_config: GPT4AllProcessorModel = None
+whisper_model = None
 config_file: Path = None
 verbose: int = 0
 host: str = None

From cc9eae5d181cf687c900d284f23b4fb9e60af1f0 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 03:46:01 -0800
Subject: [PATCH 13/18] Update default chat model to Mistral in
 GPT4AllProcessor config

---
 src/khoj/utils/config.py | 2 +-
 src/khoj/utils/state.py  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index aba6366a8..abda12b6f 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -80,7 +80,7 @@ class GPT4AllProcessorConfig:
 class GPT4AllProcessorModel:
     def __init__(
         self,
-        chat_model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
+        chat_model: str = "mistral-7b-instruct-v0.1.Q4_0.gguf",
     ):
         self.chat_model = chat_model
         self.loaded_model = None
diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py
index ce4d5804c..b54cf4b39 100644
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@@ -7,6 +7,7 @@
 # External Packages
 from pathlib import Path
 from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
+from whisper import Whisper
 
 # Internal Packages
 from khoj.utils import config as utils_config
@@ -21,7 +22,7 @@
 cross_encoder_model: CrossEncoderModel = None
 content_index = ContentIndex()
 gpt4all_processor_config: GPT4AllProcessorModel = None
-whisper_model = None
+whisper_model: Whisper = None
 config_file: Path = None
 verbose: int = 0
 host: str = None

From 877532a167935b793ca7f992502b6707eebf8c56 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 05:47:49 -0800
Subject: [PATCH 14/18] Speak to Khoj from the Obsidian client

- Add transcription button with mic icon
- Collect audio recording on pressing mic
- Process and send audio recording to server for transcription
- Extract the functionality to flash status in chat input for reuse
---
 src/interface/obsidian/src/chat_modal.ts | 107 +++++++++++++++++++++--
 src/interface/obsidian/styles.css        |   2 +-
 2 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/src/interface/obsidian/src/chat_modal.ts b/src/interface/obsidian/src/chat_modal.ts
index fc6d5a488..7f9dfae6b 100644
--- a/src/interface/obsidian/src/chat_modal.ts
+++ b/src/interface/obsidian/src/chat_modal.ts
@@ -1,4 +1,4 @@
-import { App, Modal, request, setIcon } from 'obsidian';
+import { App, Modal, RequestUrlParam, request, requestUrl, setIcon } from 'obsidian';
 import { KhojSetting } from 'src/settings';
 import fetch from "node-fetch";
 
@@ -51,6 +51,16 @@ export class KhojChatModal extends Modal {
             })
         chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });
 
+        let transcribe = inputRow.createEl("button", {
+            text: "Transcribe",
+            attr: {
+                id: "khoj-transcribe",
+                class: "khoj-transcribe khoj-input-row-button",
+            },
+        })
+        transcribe.addEventListener('click', async (_) => { await this.speechToText() });
+        setIcon(transcribe, "mic");
+
         let clearChat = inputRow.createEl("button", {
             text: "Clear History",
             attr: {
@@ -205,9 +215,19 @@ export class KhojChatModal extends Modal {
         }
     }
 
-    async clearConversationHistory() {
+    flashStatusInChatInput(message: string) {
+        // Get chat input element and original placeholder
         let chatInput = <HTMLInputElement>this.contentEl.getElementsByClassName("khoj-chat-input")[0];
         let originalPlaceholder = chatInput.placeholder;
+        // Set placeholder to message
+        chatInput.placeholder = message;
+        // Reset placeholder after 2 seconds
+        setTimeout(() => {
+            chatInput.placeholder = originalPlaceholder;
+        }, 2000);
+    }
+
+    async clearConversationHistory() {
         let chatBody = this.contentEl.getElementsByClassName("khoj-chat-body")[0];
 
         let response = await request({
@@ -224,15 +244,84 @@ export class KhojChatModal extends Modal {
                 // If conversation history is cleared successfully, clear chat logs from modal
                 chatBody.innerHTML = "";
                 await this.getChatHistory();
-                chatInput.placeholder = result.message;
+                this.flashStatusInChatInput(result.message);
             }
         } catch (err) {
-            chatInput.placeholder = "Failed to clear conversation history";
-        } finally {
-            // Reset to original placeholder text after some time
-            setTimeout(() => {
-                chatInput.placeholder = originalPlaceholder;
-            }, 2000);
+            this.flashStatusInChatInput("Failed to clear conversation history");
+        }
+    }
+
+    mediaRecorder: MediaRecorder | undefined;
+    async speechToText() {
+        const transcribeButton = <HTMLButtonElement>this.contentEl.getElementsByClassName("khoj-transcribe")[0];
+        const chatInput = <HTMLInputElement>this.contentEl.getElementsByClassName("khoj-chat-input")[0];
+
+        const generateRequestBody = async (audioBlob: Blob, boundary_string: string) => {
+            const boundary = `------${boundary_string}`; // "--" delimiter prefix + the "----<id>" boundary announced in the request's Content-Type
+            const chunks: BlobPart[] = [];
+
+            chunks.push(new TextEncoder().encode(`${boundary}\r\n`));
+            chunks.push(new TextEncoder().encode(`Content-Disposition: form-data; name="file"; filename="blob"\r\nContent-Type: application/octet-stream\r\n\r\n`));
+            chunks.push(await audioBlob.arrayBuffer());
+            chunks.push(new TextEncoder().encode('\r\n'));
+
+            // Close the multipart body with the terminating boundary delimiter
+            chunks.push(new TextEncoder().encode(`${boundary}--\r\n`));
+            return await new Blob(chunks).arrayBuffer();
+        };
+
+        const sendToServer = async (audioBlob: Blob) => {
+            const boundary_string = `Boundary${Math.random().toString(36).slice(2)}`;
+            const requestBody = await generateRequestBody(audioBlob, boundary_string);
+
+            const response = await requestUrl({
+                url: `${this.setting.khojUrl}/api/speak?client=obsidian`,
+                method: 'POST',
+                headers: { "Authorization": `Bearer ${this.setting.khojApiKey}` },
+                contentType: `multipart/form-data; boundary=----${boundary_string}`,
+                body: requestBody,
+                throw: false, // Return 4xx/5xx responses instead of throwing, so the status checks below can run
+            });
+
+            // Append transcription to the chat input on success; otherwise raise a user-facing error for the caller to display
+            if (response.status === 200) {
+                chatInput.value += response.json.text;
+            } else if (response.status === 422) {
+                throw new Error("⛔️ Failed to transcribe audio");
+            } else {
+                throw new Error("⛔️ Configure speech-to-text model on server.");
+            }
+        };
+
+        const handleRecording = (stream: MediaStream) => {
+            const audioChunks: Blob[] = [];
+            this.mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
+
+            this.mediaRecorder.addEventListener("dataavailable", (event) => {
+                if (event.data.size > 0) audioChunks.push(event.data);
+            });
+            // On stop: release the microphone, then transcribe and show any failure in the chat input
+            this.mediaRecorder.addEventListener("stop", () => {
+                stream.getTracks().forEach((track) => track.stop());
+                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
+                sendToServer(audioBlob).catch((err) => this.flashStatusInChatInput(err.message));
+            });
+
+            this.mediaRecorder.start();
+            setIcon(transcribeButton, "mic-off");
+        };
+
+        // Toggle recording
+        if (!this.mediaRecorder || this.mediaRecorder.state === 'inactive') {
+            navigator.mediaDevices
+                .getUserMedia({ audio: true })
+                .then(handleRecording)
+                .catch((e) => {
+                    this.flashStatusInChatInput("⛔️ Failed to access microphone");
+                });
+        } else if (this.mediaRecorder.state === 'recording') {
+            this.mediaRecorder.stop();
+            setIcon(transcribeButton, "mic");
         }
     }
 }
diff --git a/src/interface/obsidian/styles.css b/src/interface/obsidian/styles.css
index 95a304f1b..ff2dee8a8 100644
--- a/src/interface/obsidian/styles.css
+++ b/src/interface/obsidian/styles.css
@@ -112,7 +112,7 @@ If your plugin does not need CSS, delete this file.
 }
 .khoj-input-row {
     display: grid;
-    grid-template-columns: auto 32px;
+    grid-template-columns: auto 32px 32px;
     grid-column-gap: 10px;
     grid-row-gap: 10px;
     background: var(--background-primary);

From 56a1a61c77ce4ec5c109eed3807fe6bbe0762c14 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 05:49:52 -0800
Subject: [PATCH 15/18] Remove unused button element retrieval code from web,
 desktop

---
 src/interface/desktop/chat.html  | 1 -
 src/khoj/interface/web/chat.html | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html
index 958cb5d92..00192d6f3 100644
--- a/src/interface/desktop/chat.html
+++ b/src/interface/desktop/chat.html
@@ -551,7 +551,6 @@
 
         let mediaRecorder;
         async function speechToText() {
-            const speakButton = document.getElementById('speak-button');
             const speakButtonImg = document.getElementById('speak-button-img');
             const chatInput = document.getElementById('chat-input');
 
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 920cacc45..573915c98 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -574,7 +574,6 @@
 
         let mediaRecorder;
         function speechToText() {
-            const speakButton = document.getElementById('speak-button');
             const speakButtonImg = document.getElementById('speak-button-img');
             const chatInput = document.getElementById('chat-input');
 

From 06f99ceb3c4fbff298ce3b6c180e52632c0397ba Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 05:58:07 -0800
Subject: [PATCH 16/18] Rename /api/speak API endpoint to /api/transcribe

---
 src/interface/desktop/chat.html          | 2 +-
 src/interface/obsidian/src/chat_modal.ts | 2 +-
 src/khoj/interface/web/chat.html         | 2 +-
 src/khoj/routers/api.py                  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html
index 00192d6f3..ae17c6372 100644
--- a/src/interface/desktop/chat.html
+++ b/src/interface/desktop/chat.html
@@ -555,7 +555,7 @@
             const chatInput = document.getElementById('chat-input');
 
             const hostURL = await window.hostURLAPI.getURL();
-            let url = `${hostURL}/api/speak?client=desktop`;
+            let url = `${hostURL}/api/transcribe?client=desktop`;
             const khojToken = await window.tokenAPI.getToken();
             const headers = { 'Authorization': `Bearer ${khojToken}` };
 
diff --git a/src/interface/obsidian/src/chat_modal.ts b/src/interface/obsidian/src/chat_modal.ts
index 7f9dfae6b..16c5614fa 100644
--- a/src/interface/obsidian/src/chat_modal.ts
+++ b/src/interface/obsidian/src/chat_modal.ts
@@ -275,7 +275,7 @@ export class KhojChatModal extends Modal {
             const requestBody = await generateRequestBody(audioBlob, boundary_string);
 
             const response = await requestUrl({
-                url: `${this.setting.khojUrl}/api/speak?client=obsidian`,
+                url: `${this.setting.khojUrl}/api/transcribe?client=obsidian`,
                 method: 'POST',
                 headers: { "Authorization": `Bearer ${this.setting.khojApiKey}` },
                 contentType: `multipart/form-data; boundary=----${boundary_string}`,
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 573915c98..256193a78 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -581,7 +581,7 @@
                 const formData = new FormData();
                 formData.append('file', audioBlob);
 
-                fetch('/api/speak?client=web', { method: 'POST', body: formData })
+                fetch('/api/transcribe?client=web', { method: 'POST', body: formData })
                     .then(response => response.ok ? response.json() : Promise.reject(response))
                     .then(data => { chatInput.value += data.text; })
                     .catch(err => {
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 9f1b118ef..bc55ce481 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -588,7 +588,7 @@ async def chat_options(
     return Response(content=json.dumps(cmd_options), media_type="application/json", status_code=200)
 
 
-@api.post("/speak")
+@api.post("/transcribe")
 @requires(["authenticated"])
 async def transcribe(request: Request, common: CommonQueryParams, file: UploadFile = File(...)):
     user: KhojUser = request.user.object

From a79604b6010cf9febdd5d6ccfbd20707a100df7b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 06:26:34 -0800
Subject: [PATCH 17/18] Fix return types of offline, online transcribe methods
 for python 3.9

---
 src/khoj/processor/conversation/offline/whisper.py | 2 +-
 src/khoj/processor/conversation/openai/whisper.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/khoj/processor/conversation/offline/whisper.py b/src/khoj/processor/conversation/offline/whisper.py
index d22486a9a..56d2aaf5c 100644
--- a/src/khoj/processor/conversation/offline/whisper.py
+++ b/src/khoj/processor/conversation/offline/whisper.py
@@ -6,7 +6,7 @@
 from khoj.utils import state
 
 
-async def transcribe_audio_offline(audio_filename: str, model: str) -> str | None:
+async def transcribe_audio_offline(audio_filename: str, model: str) -> str:
     """
     Transcribe audio file offline using Whisper
     """
diff --git a/src/khoj/processor/conversation/openai/whisper.py b/src/khoj/processor/conversation/openai/whisper.py
index 6690c2360..72834d921 100644
--- a/src/khoj/processor/conversation/openai/whisper.py
+++ b/src/khoj/processor/conversation/openai/whisper.py
@@ -6,7 +6,7 @@
 import openai
 
 
-async def transcribe_audio(audio_file: BufferedReader, model, api_key) -> str | None:
+async def transcribe_audio(audio_file: BufferedReader, model, api_key) -> str:
     """
     Transcribe audio file using Whisper model via OpenAI's API
     """

From b249bbb5b50059120ee43dcd12ce10eb911edd7c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 26 Nov 2023 14:19:46 -0800
Subject: [PATCH 18/18] Limit max audio file size allowed for transcription on
 API endpoint

---
 src/khoj/routers/api.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index bc55ce481..3fd2285d8 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -595,6 +595,11 @@ async def transcribe(request: Request, common: CommonQueryParams, file: UploadFi
     audio_filename = f"{user.uuid}-{str(uuid.uuid4())}.webm"
     user_message: str = None
 
+    # If the file is too large, return an unprocessable entity error
+    if file.size > 10 * 1024 * 1024:
+        logger.warning(f"Audio file too large to transcribe. Audio file size: {file.size}. Exceeds 10Mb limit.")
+        return Response(content="Audio size larger than 10Mb limit", status_code=422)
+
     # Transcribe the audio from the request
     try:
         # Store the audio from the request in a temporary file
@@ -627,7 +632,7 @@ async def transcribe(request: Request, common: CommonQueryParams, file: UploadFi
     update_telemetry_state(
         request=request,
         telemetry_type="api",
-        api="speech_to_text",
+        api="transcribe",
         **common.__dict__,
     )