From 1ca99b6eb01fa7319de36d18dc0db0a7ebff68c7 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 22 Nov 2023 01:38:51 -0800 Subject: [PATCH 01/18] Add speech to text model configuration to Database --- src/khoj/database/adapters/__init__.py | 5 +++ src/khoj/database/admin.py | 2 ++ .../0020_speechtotextmodeloptions_and_more.py | 35 +++++++++++++++++++ src/khoj/database/models/__init__.py | 11 +++++- src/khoj/utils/initialization.py | 10 ++++++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index ea0c0a858..471f3605e 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -28,6 +28,7 @@ Conversation, ChatModelOptions, SearchModelConfig, + SpeechToTextModelOptions, Subscription, UserConversationConfig, OpenAIProcessorConversationConfig, @@ -339,6 +340,10 @@ async def get_openai_chat(): async def get_openai_chat_config(): return await OpenAIProcessorConversationConfig.objects.filter().afirst() + @staticmethod + async def get_speech_to_text_config(): + return await SpeechToTextModelOptions.objects.filter().afirst() + class EntryAdapters: word_filer = WordFilter() diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index 69f15b2f2..4383056f9 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -9,12 +9,14 @@ OpenAIProcessorConversationConfig, OfflineChatProcessorConversationConfig, SearchModelConfig, + SpeechToTextModelOptions, Subscription, ) admin.site.register(KhojUser, UserAdmin) admin.site.register(ChatModelOptions) +admin.site.register(SpeechToTextModelOptions) admin.site.register(OpenAIProcessorConversationConfig) admin.site.register(OfflineChatProcessorConversationConfig) admin.site.register(SearchModelConfig) diff --git a/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py b/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py new file mode 100644 index 000000000..1185d298a --- /dev/null +++ b/src/khoj/database/migrations/0020_speechtotextmodeloptions_and_more.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.7 on 2023-11-22 08:51 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0019_alter_googleuser_family_name_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="SpeechToTextModelOptions", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("model_name", models.CharField(default="whisper-1", max_length=200)), + ( + "model_type", + models.CharField( + choices=[("openai", "Openai"), ("offline", "Offline")], default="openai", max_length=200 + ), + ), + ], + options={ + "abstract": False, + }, + ), + migrations.AlterField( + model_name="chatmodeloptions", + name="chat_model", + field=models.CharField(default="gpt-4", max_length=200), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 92da3e6e8..8098a731f 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -120,6 +120,15 @@ class OfflineChatProcessorConversationConfig(BaseModel): enabled = models.BooleanField(default=False) +class SpeechToTextModelOptions(BaseModel): + class ModelType(models.TextChoices): + OPENAI = "openai" + OFFLINE = "offline" + + model_name = models.CharField(max_length=200, default="whisper-1") + model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI) + + class ChatModelOptions(BaseModel): class ModelType(models.TextChoices): OPENAI = "openai" @@ -127,7 +136,7 @@ class ModelType(models.TextChoices): max_prompt_size = models.IntegerField(default=None, null=True, blank=True) tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True) - chat_model = models.CharField(max_length=200, default=None, null=True, blank=True) + chat_model = models.CharField(max_length=200, default="gpt-4") model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI) diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py index ffc4d47eb..ee0454c49 100644 --- a/src/khoj/utils/initialization.py +++ b/src/khoj/utils/initialization.py @@ -6,6 +6,7 @@ OfflineChatProcessorConversationConfig, OpenAIProcessorConversationConfig, ChatModelOptions, + SpeechToTextModelOptions, ) from khoj.utils.constants import default_offline_chat_model, default_online_chat_model @@ -94,6 +95,15 @@ def _create_chat_configuration(): chat_model=openai_chat_model, model_type=ChatModelOptions.ModelType.OPENAI, max_prompt_size=max_tokens ) + default_speech2text_model = "whisper-1" + openai_speech2text_model = input( + f"Enter the OpenAI speech to text model you want to use (default: {default_speech2text_model}): " + ) + openai_speech2text_model = openai_speech2text_model or default_speech2text_model + SpeechToTextModelOptions.objects.create( + model_name=openai_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OPENAI + ) + logger.info("🗣️ Chat model configuration complete") admin_user = KhojUser.objects.filter(is_staff=True).first() From cc77bc4076624cee7084d9c0715ea400d42ebc1c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 21 Nov 2023 20:37:34 -0800 Subject: [PATCH 02/18] Create speech to text API endpoint. Use OpenAI whisper for ASR - Wrap audio transcription in try/catch and delete audio file after processing - Use configured speech to text model, else handle error --- src/khoj/routers/api.py | 53 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index f2e5c9662..0d0d4bb10 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -1,13 +1,16 @@ # Standard Packages import concurrent.futures import math +import os import time import logging import json from typing import Annotated, List, Optional, Union, Any +import uuid # External Packages -from fastapi import APIRouter, Depends, HTTPException, Header, Request +from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File +import openai from starlette.authentication import requires from asgiref.sync import sync_to_async @@ -553,6 +556,54 @@ async def chat_options( return Response(content=json.dumps(cmd_options), media_type="application/json", status_code=200) +@api.post("/speak") +@requires(["authenticated"]) +async def transcribe_audio(request: Request, common: CommonQueryParams, file: UploadFile = File(...)): + user: KhojUser = request.user.object + audio_filename = f"{user.uuid}-{str(uuid.uuid4())}.webm" + user_message: str = None + + # Transcribe the audio from the request + try: + # Store the audio from the request in a temporary file + audio_data = await file.read() + with open(audio_filename, "wb") as audio_file_writer: + audio_file_writer.write(audio_data) + audio_file = open(audio_filename, "rb") + + # Send the audio data to the Whisper API + speech_to_text_config = await ConversationAdapters.get_speech_to_text_config() + openai_chat_config = await ConversationAdapters.get_openai_chat_config() + if not openai_chat_config or not speech_to_text_config: + # If the user has not configured a speech to text model, return an unprocessable entity error + status_code = 422 + elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI: + api_key = openai_chat_config.api_key + speech2text_model = speech_to_text_config.model_name + response = await sync_to_async(openai.Audio.translate)( + model=speech2text_model, file=audio_file, api_key=api_key + ) + user_message = response["text"] + finally: + # Close and Delete the temporary audio file + audio_file.close() + os.remove(audio_filename) + + if user_message is None: + return Response(status_code=status_code or 500) + + update_telemetry_state( + request=request, + telemetry_type="api", + api="speech_to_text", + **common.__dict__, + ) + + # Return the spoken text + content = json.dumps({"text": user_message}) + return Response(content=content, media_type="application/json", status_code=200) + + @api.get("/chat", response_class=Response) @requires(["authenticated"]) async def chat( From 2951fc92d7431e31734951aba7955cb7378f8d28 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 21 Nov 2023 23:38:36 -0800 Subject: [PATCH 03/18] Speak to Khoj from the Web client - Use icons to style speech to text recording state --- .../web/assets/icons/microphone-solid.svg | 1 + .../interface/web/assets/icons/stop-solid.svg | 37 ++++++++ src/khoj/interface/web/chat.html | 87 +++++++++++++++++-- 3 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 src/khoj/interface/web/assets/icons/microphone-solid.svg create mode 100644 src/khoj/interface/web/assets/icons/stop-solid.svg diff --git a/src/khoj/interface/web/assets/icons/microphone-solid.svg b/src/khoj/interface/web/assets/icons/microphone-solid.svg new file mode 100644 index 000000000..3fc4b91d2 --- /dev/null +++ b/src/khoj/interface/web/assets/icons/microphone-solid.svg @@ -0,0 +1 @@ + diff --git a/src/khoj/interface/web/assets/icons/stop-solid.svg b/src/khoj/interface/web/assets/icons/stop-solid.svg new file mode 100644 index 000000000..a9aaba284 --- /dev/null +++ b/src/khoj/interface/web/assets/icons/stop-solid.svg @@ -0,0 +1,37 @@ + + + + + + + diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index 1c661a926..d346294f1 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -369,6 +369,56 @@ chat(); } } + + let mediaRecorder; + function speechToText() { + const speakButton = document.getElementById('speak-button'); + const speakButtonImg = document.getElementById('speak-button-img'); + const chatInput = document.getElementById('chat-input'); + + const sendToServer = (audioBlob) => { + const formData = new FormData(); + formData.append('file', audioBlob); + + fetch('/api/speak?client=web', { method: 'POST', body: formData }) + .then(response => response.ok ? response.json() : Promise.reject(response)) + .then(data => { chatInput.value += data.text; }) + .catch(err => err.status == 422 ? console.error("Configure speech-to-text model on server.") : console.error("Failed to transcribe audio")); + }; + + const handleRecording = (stream) => { + const audioChunks = []; + const recordingConfig = { mimeType: 'audio/webm' }; + mediaRecorder = new MediaRecorder(stream, recordingConfig); + + mediaRecorder.addEventListener("dataavailable", function(event) { + if (event.data.size > 0) audioChunks.push(event.data); + }); + + mediaRecorder.addEventListener("stop", function() { + const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); + sendToServer(audioBlob); + }); + + mediaRecorder.start(); + speakButtonImg.src = '/static/assets/icons/stop-solid.svg'; + speakButtonImg.alt = 'Stop Speaking'; + }; + + // Toggle recording + if (!mediaRecorder || mediaRecorder.state === 'inactive') { + navigator.mediaDevices + .getUserMedia({ audio: true }) + .then(handleRecording) + .catch((e) => { + console.error(e); + }); + } else if (mediaRecorder.state === 'recording') { + mediaRecorder.stop(); + speakButtonImg.src = '/static/assets/icons/microphone-solid.svg'; + speakButtonImg.alt = 'Speak'; + } + }
@@ -384,7 +434,12 @@
@@ -400,7 +456,12 @@ @@ -514,15 +575,17 @@ #chat-footer { padding: 0; + margin: 8px; display: grid; grid-template-columns: minmax(70px, 100%); grid-column-gap: 10px; grid-row-gap: 10px; } - #chat-footer > * { - padding: 15px; - border-radius: 5px; - border: 1px solid #475569; + #input-row { + display: grid; + grid-template-columns: auto 32px; + grid-column-gap: 10px; + grid-row-gap: 10px; background: #f9fafc } .option:hover { @@ -543,6 +606,26 @@ #chat-input:focus { outline: none !important; } + #speak-button { + background: var(--background-color); + border: none; + border-radius: 5px; + padding: 5px; + font-size: 14px; + font-weight: 300; + line-height: 1.5em; + cursor: pointer; + transition: background 0.3s ease-in-out; + } + #speak-button:hover { + background: var(--primary-hover); + } + #speak-button:active { + background: var(--primary-active); + } + #speak-button-img { + width: 24px; + } .option-enabled { box-shadow: 0 0 12px rgb(119, 156, 46); From 3e252036c3df48dc0c5d097b09f0ea4b00642f5f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 26 Nov 2023 00:27:29 -0800 Subject: [PATCH 05/18] Remove whitespace: pre-line from chat html, since markdown rendering --- src/interface/desktop/chat.html | 2 -- src/khoj/interface/web/chat.html | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html index 41e185d11..82ab0f16d 100644 --- a/src/interface/desktop/chat.html +++ b/src/interface/desktop/chat.html @@ -679,7 +679,6 @@ .chat-message.you { margin-right: auto; text-align: right; - white-space: pre-line; } /* basic style chat message text */ .chat-message-text { @@ -696,7 +695,6 @@ color: var(--primary-inverse); background: var(--primary); margin-left: auto; - white-space: pre-line; } /* Spinner symbol when the chat message is loading */ .spinner { diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index 7ca51fcce..641c983ec 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -802,7 +802,6 @@ .chat-message.you { margin-right: auto; text-align: right; - white-space: pre-line; } /* basic style chat message text */ .chat-message-text { @@ -819,7 +818,6 @@ color: var(--primary-inverse); background: var(--primary); margin-left: auto; - white-space: pre-line; } /* Spinner symbol when the chat message is loading */ .spinner { From 5a6547677ca681db7b6b72009ad96c55976e0d05 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 26 Nov 2023 00:38:18 -0800 Subject: [PATCH 06/18] Add type of operation variable in latest migration --- src/khoj/database/migrations/0021_merge_20231126_0650.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/database/migrations/0021_merge_20231126_0650.py b/src/khoj/database/migrations/0021_merge_20231126_0650.py index 579c00726..78fd2e523 100644 --- a/src/khoj/database/migrations/0021_merge_20231126_0650.py +++ b/src/khoj/database/migrations/0021_merge_20231126_0650.py @@ -9,4 +9,4 @@ class Migration(migrations.Migration): ("database", "0020_speechtotextmodeloptions_and_more"), ] - operations = [] + operations: list = [] From fc040825b27c6fd9218dc2906295d0ba23b855f9 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 26 Nov 2023 01:07:20 -0800 Subject: [PATCH 07/18] Default to Offline chat with Mistral as minimal setup, no API key reqd. --- src/khoj/database/models/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 7b28521bb..77478ef53 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -136,8 +136,8 @@ class ModelType(models.TextChoices): max_prompt_size = models.IntegerField(default=None, null=True, blank=True) tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True) - chat_model = models.CharField(max_length=200, default="gpt-4") - model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI) + chat_model = models.CharField(max_length=200, default="mistral-7b-instruct-v0.1.Q4_0.gguf") + model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE) class UserConversationConfig(BaseModel): From 28090216f634eb961f6673df06967d1037be5d94 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 26 Nov 2023 01:08:38 -0800 Subject: [PATCH 08/18] Show transcription error status in chatInput placeholder on web, desktop - Extract flashing status message in chat input placeholder into reusable function - Use emoji prefixes for status messages - Improve alt text of transcribe button to indicate what the button does --- src/interface/desktop/chat.html | 35 +++++++++++++++++++++----------- src/khoj/interface/web/chat.html | 35 +++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/src/interface/desktop/chat.html b/src/interface/desktop/chat.html index 82ab0f16d..958cb5d92 100644 --- a/src/interface/desktop/chat.html +++ b/src/interface/desktop/chat.html @@ -516,6 +516,18 @@ } } + function flashStatusInChatInput(message) { + // Get chat input element and original placeholder + let chatInput = document.getElementById("chat-input"); + let originalPlaceholder = chatInput.placeholder; + // Set placeholder to message + chatInput.placeholder = message; + // Reset placeholder after 2 seconds + setTimeout(() => { + chatInput.placeholder = originalPlaceholder; + }, 2000); + } + async function clearConversationHistory() { let chatInput = document.getElementById("chat-input"); let originalPlaceholder = chatInput.placeholder; @@ -530,16 +542,11 @@ .then(data => { chatBody.innerHTML = ""; loadChat(); - chatInput.placeholder = "Cleared conversation history"; + flashStatusInChatInput("🗑 Cleared conversation history"); }) .catch(err => { - chatInput.placeholder = "Failed to clear conversation history"; + flashStatusInChatInput("⛔️ Failed to clear conversation history"); }) - .finally(() => { - setTimeout(() => { - chatInput.placeholder = originalPlaceholder; - }, 2000); - }); } let mediaRecorder; @@ -560,7 +567,11 @@ fetch(url, { method: 'POST', body: formData, headers}) .then(response => response.ok ? response.json() : Promise.reject(response)) .then(data => { chatInput.value += data.text; }) - .catch(err => err.status == 422 ? console.error("Configure speech-to-text model on server.") : console.error("Failed to transcribe audio")); + .catch(err => { + err.status == 422 + ? flashStatusInChatInput("⛔️ Configure speech-to-text model on server.") + : flashStatusInChatInput("⛔️ Failed to transcribe audio") + }); }; const handleRecording = (stream) => { @@ -579,7 +590,7 @@ mediaRecorder.start(); speakButtonImg.src = './assets/icons/stop-solid.svg'; - speakButtonImg.alt = 'Stop Speaking'; + speakButtonImg.alt = 'Stop Transcription'; }; // Toggle recording @@ -588,12 +599,12 @@ .getUserMedia({ audio: true }) .then(handleRecording) .catch((e) => { - console.error(e); + flashStatusInChatInput("⛔️ Failed to access microphone"); }); } else if (mediaRecorder.state === 'recording') { mediaRecorder.stop(); speakButtonImg.src = './assets/icons/microphone-solid.svg'; - speakButtonImg.alt = 'Speak'; + speakButtonImg.alt = 'Transcribe'; } } @@ -626,7 +637,7 @@