Groq whisper (#305)
* Support for Groq's Whisper API for audio transcription

* Update README.md to include support for Groq's Whisper API for audio transcription

* Update README.md to include support for Groq's Whisper API for audio transcription

* Update readme
Luisotee authored Sep 16, 2024
1 parent 0f6097f commit 5933977
Showing 7 changed files with 102 additions and 12 deletions.
13 changes: 7 additions & 6 deletions .env.example
@@ -31,6 +31,7 @@ ANTHROPIC_API_KEY="" # sk-...

# ------------------------------
# Obligatory if you're using Groq's models and want to use tool calling:
# Also required if you're using Groq's Whisper API for audio transcription:
# ------------------------------
# You can get this at https://console.groq.com/keys
GROQ_API_KEY="" # gsk-...
@@ -49,11 +50,11 @@ OPENROUTER_API_KEY="" # sk-90...
# Determines whether the bot should detect and convert your voice messages into text
# Accepted values are "true" or "false"
TRANSCRIPTION_ENABLED="false"

# There are 2 ways to transcribe audio: using OpenAI Whisper API, which costs US$0.06 per 10 minutes of audio (check for current prices in their website), or using Whisper locally.
# Local transcription is slower and provides worse results, but it is free.
# If you choose to use the local method, you need to do some things. Refer to the readme.md file for more information.
TRANSCRIPTION_METHOD="local" # options are 'local' or 'whisper-api'
# There are 3 ways to transcribe audio:
# - Groq's Whisper API, which is currently free and performs better than both the local method and OpenAI's Whisper API, since it uses the whisper-large-v3 model.
# - OpenAI's Whisper API, which costs US$0.06 per 10 minutes of audio (check their website for current prices) and uses whisper-1.
# - Whisper running locally, which is free but slower; it requires extra setup, so refer to the README.md file for more information.
TRANSCRIPTION_METHOD="whisper-groq" # options are 'local', 'whisper-api', or 'whisper-groq'

# ONLY NECESSARY IF TRANSCRIPTION_METHOD IS SET TO 'local'
# Name of the model to use for local transcription. Refer to the readme.md file for more information.
@@ -63,7 +64,7 @@ TRANSCRIPTION_MODEL="ggml-model-whisper-base.bin"
# If you only plan to send audio in one language, it is recommended to specify the language.
# List of languages are: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
# Leave it as "auto" if you will use multiple languages.
TRANSCRIPTION_LANGUAGE="auto" # Example: "pt" (portuguese), "en" (english), "es" (spanish).
TRANSCRIPTION_LANGUAGE="auto" # Examples: "pt" (Portuguese), "en" (English), "es" (Spanish); "auto" for automatic detection.

# ------------------------------
# LangChain Features:
21 changes: 18 additions & 3 deletions README.md
@@ -128,19 +128,34 @@ The AI's are designed to respond to natural language queries from users. You can

### Voice Messages

When dealing with voice messages, you have two options for transcription: utilizing the Whisper API or the local method. Each option has its own considerations, including cost and performance.
When dealing with voice messages, you have three options for transcription: Groq's Whisper API, which is free (recommended), OpenAI's Whisper API, or the local method. Each option has its own considerations, including cost and performance; an example configuration is shown after the options below.

<details>
<summary><strong>Groq API:</strong></summary>

- **Cost:** Currently free.
- **Setup:**
1. Obtain a Groq API key from [Groq Console](https://console.groq.com/keys).
2. In the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-groq"`.
</details>

<details>
<summary><strong>Whisper API:</strong></summary>

- **Cost:** Utilizing the Whisper API incurs a cost of US$0.06 per 10 minutes of audio.
- **Setup:** 1. Obtain an OpenAI API key and place it in the `.env` file under the `OPENAI_API_KEY` variable. 2. Set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-api"`. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance.
- **Setup:**
1. Obtain an OpenAI API key and place it in the `.env` file under the `OPENAI_API_KEY` variable.
2. In the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-api"`.

</details>

<details>
<summary><strong>Local Mode:</strong></summary>

- **Cost:** The local method is free but may be slower and less precise.
- **Setup:** 1. Download a model of your choice from [here](https://huggingface.co/ggerganov/whisper.cpp/tree/main). Download any `.bin` file and place it in the `./whisper/models` folder. 2. Modify the `.env` file by changing `TRANSCRIPTION_ENABLED` to `"true"`, `TRANSCRIPTION_METHOD` to `"local"`, and `"TRANSCRIPTION_MODEL"` with the name of the model you downloaded. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance.
- **Setup:**
1. Download a model of your choice from [here](https://huggingface.co/ggerganov/whisper.cpp/tree/main). Download any `.bin` file and place it in the `./whisper/models` folder.
2. Modify the `.env` file by changing `TRANSCRIPTION_ENABLED` to `"true"`, `TRANSCRIPTION_METHOD` to `"local"`, and `"TRANSCRIPTION_MODEL"` with the name of the model you downloaded. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance.

</details>
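
For reference, a minimal `.env` for the recommended Groq option might look like the following sketch (the key is a placeholder; get yours from [Groq Console](https://console.groq.com/keys), and see `.env.example` for the full set of variables):

```env
# Placeholder values for illustration only; substitute your own key.
GROQ_API_KEY="gsk-..."
TRANSCRIPTION_ENABLED="true"
TRANSCRIPTION_METHOD="whisper-groq"
TRANSCRIPTION_LANGUAGE="auto"
```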

### Group Chat
1 change: 1 addition & 0 deletions package.json
@@ -48,6 +48,7 @@
"dotenv-expand": "^10.0.0",
"fluent-ffmpeg": "^2.1.2",
"googleapis": "^126.0.1",
"groq-sdk": "^0.7.0",
"keyv": "^4.5.4",
"langchain": "0.2.9",
"node-schedule": "^2.1.1",
18 changes: 18 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions src/handlers/audio-message/index.ts
@@ -11,6 +11,7 @@ import {
import { convertOggToWav } from "./audio-helper";
import { handleAudioMessageWithWhisperApi } from "./whisper-api";
import { handleAudioMessageWithWhisperLocal } from "./whisper-local";
import { handleAudioMessageWithGroqApi } from "./whisper-groq";

export async function handleAudioMessage(
media: MessageMedia,
@@ -44,6 +45,18 @@ export async function handleAudioMessage(
      console.error(error);
      throw new Error("Error transcribing audio");
    }
  } else if (TRANSCRIPTION_METHOD === "whisper-groq") {
    try {
      transcribedAudio = await handleAudioMessageWithGroqApi(wavPath);
    } catch (error) {
      console.error(error);
      throw new Error("Error transcribing audio");
    }
  } else {
    throw new Error(
      "Invalid transcription method, TRANSCRIPTION_METHOD: " +
        TRANSCRIPTION_METHOD
    );
  }

  if (REPLY_TRANSCRIPTION === "true") {
34 changes: 34 additions & 0 deletions src/handlers/audio-message/whisper-groq.ts
@@ -0,0 +1,34 @@
import fs from "fs";
import { openai } from "../../clients/openai";
import { TRANSCRIPTION_LANGUAGE } from "../../constants";
import Groq from "groq-sdk";

export async function handleAudioMessageWithGroqApi(wavPath: string) {
  console.log("Transcribing audio with Groq API");
  try {
    const groq = new Groq();

    const transcriptionOptions: any = {
      file: fs.createReadStream(wavPath),
      model: "whisper-large-v3",
      //prompt: "Specify context or spelling", // Optional
      response_format: "json", // Optional
      temperature: 0.0, // Optional
    };

    if (TRANSCRIPTION_LANGUAGE !== "auto") {
      transcriptionOptions.language = TRANSCRIPTION_LANGUAGE;
    }

    const transcription =
      await groq.audio.transcriptions.create(transcriptionOptions);
    return transcription.text;
  } finally {
    // Regardless of success or failure, attempt to delete the WAV file
    try {
      fs.unlinkSync(wavPath); // Delete the WAV file
    } catch (error) {
      console.error("Error deleting WAV file:", error);
    }
  }
}
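
For context, `new Groq()` with no arguments reads the API key from the `GROQ_API_KEY` environment variable, which is why the key check is added to `checkEnv` below. A minimal standalone sketch of the same call with the key passed explicitly (illustrative only, not part of this commit):

```ts
import fs from "fs";
import Groq from "groq-sdk";

// Equivalent to `new Groq()` when GROQ_API_KEY is set in the environment.
const groq = new Groq({ apiKey: process.env.GROQ_API_KEY });

// Transcribe a WAV file with the same model and options used in whisper-groq.ts.
async function transcribe(wavPath: string): Promise<string> {
  const transcription = await groq.audio.transcriptions.create({
    file: fs.createReadStream(wavPath),
    model: "whisper-large-v3",
    response_format: "json",
  });
  return transcription.text;
}
```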
14 changes: 11 additions & 3 deletions src/helpers/utils.ts
@@ -15,7 +15,9 @@ export function checkEnv() {
  if (process.env.TRANSCRIPTION_ENABLED === "true") {
    if (
      !process.env.TRANSCRIPTION_METHOD ||
      !["whisper-api", "local"].includes(process.env.TRANSCRIPTION_METHOD)
      !["whisper-api", "local", "whisper-groq"].includes(
        process.env.TRANSCRIPTION_METHOD
      )
    ) {
      throw new Error(
        `Invalid TRANSCRIPTION_METHOD="${process.env.TRANSCRIPTION_METHOD}" provided. Please check the TRANSCRIPTION_METHOD variable in your .env file.`
@@ -34,13 +36,19 @@ export function checkEnv() {
if (process.env.TRANSCRIPTION_METHOD === "local") {
if (!process.env.TRANSCRIPTION_MODEL) {
throw new Error(
`Invalid TRANSCRIPTION_MODEL="${process.env.TRANSCRIPTION_MODEL}" provided. Please check the TRANSCRIPTION_MODEL variable in your .env file.`
`Invalid TRANSCRIPTION_MODEL="${process.env.TRANSCRIPTION_MODEL}" provided. Please check the TRANSCRIPTION_MODEL variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".`
);
}
} else if (process.env.TRANSCRIPTION_METHOD === "whisper-api") {
if (!process.env.OPENAI_API_KEY || process.env.OPENAI_API_KEY === "") {
throw new Error(
`Invalid OPENAI_API_KEY="${process.env.OPENAI_API_KEY}" provided. Please check the OPENAI_API_KEY variable in your .env file.`
`Invalid OPENAI_API_KEY="${process.env.OPENAI_API_KEY}" provided. Please check the OPENAI_API_KEY variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".`
);
}
} else if (process.env.TRANSCRIPTION_METHOD === "whisper-groq") {
if (!process.env.GROQ_API_KEY || process.env.GROQ_API_KEY === "") {
throw new Error(
`Invalid GROQ_API_KEY="${process.env.GROQ_API_KEY}" provided. Please check the GROQ_API_KEY variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".`
);
}
}
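
To illustrate the new validation branch: with transcription enabled, `whisper-groq` selected, and no `GROQ_API_KEY` set, `checkEnv()` should now throw. A hypothetical sketch (the import path is assumed for illustration, and any other variables `checkEnv` requires are presumed to be set):

```ts
import { checkEnv } from "./src/helpers/utils"; // path assumed for illustration

process.env.TRANSCRIPTION_ENABLED = "true";
process.env.TRANSCRIPTION_METHOD = "whisper-groq";
delete process.env.GROQ_API_KEY;

try {
  checkEnv();
} catch (err) {
  // Expected: 'Invalid GROQ_API_KEY="undefined" provided. ...'
  console.error((err as Error).message);
}
```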
