From 5933977639d712f342dfcc62cd8e09f36cc90b59 Mon Sep 17 00:00:00 2001 From: Luis Otavio Martins Date: Mon, 16 Sep 2024 04:27:26 -0300 Subject: [PATCH] Groq whisper (#305) * Support for Groq's Whisper API for audio transcription * Update README.md to include support for groq's Whisper API for audio transcription * Update README.md to include support for groq's Whisper API for audio transcription * update readme --- .env.example | 13 +++++---- README.md | 21 +++++++++++-- package.json | 1 + pnpm-lock.yaml | 18 ++++++++++++ src/handlers/audio-message/index.ts | 13 +++++++++ src/handlers/audio-message/whisper-groq.ts | 34 ++++++++++++++++++++++ src/helpers/utils.ts | 14 +++++++-- 7 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 src/handlers/audio-message/whisper-groq.ts diff --git a/.env.example b/.env.example index 34d3cc6..402f7b8 100644 --- a/.env.example +++ b/.env.example @@ -31,6 +31,7 @@ ANTHROPIC_API_KEY="" # sk-... # ------------------------------ # Obligatory if you're using Groq's models and want to use tool calling: +# Also, if you're using Groq's Whisper API for audio transcription: # ------------------------------ # You can get this at https://console.groq.com/keys GROQ_API_KEY="" # gsk-... @@ -49,11 +50,11 @@ OPENROUTER_API_KEY="" # sk-90... # Determines whether the bot should detect and convert your voice messages into text # Accepted values are "true" or "false" TRANSCRIPTION_ENABLED="false" - -# There are 2 ways to transcribe audio: using OpenAI Whisper API, which costs US$0.06 per 10 minutes of audio (check for current prices in their website), or using Whisper locally. -# Local transcription is slower and provides worse results, but it is free. -# If you choose to use the local method, you need to do some things. Refer to the readme.md file for more information. 
-TRANSCRIPTION_METHOD="local" # options are 'local' or 'whisper-api' +# There are 3 ways to transcribe audio: +# Using Groq's Whisper API, which is currently free and has better performance than both local and OpenAI's Whisper API, since it uses the whisper-large-v3 model. +# Using OpenAI's Whisper API, which costs US$0.06 per 10 minutes of audio (check their website for current prices) and uses the Whisper-v1 model. +# Or using Whisper locally. If you choose to use the local method, you need to do some things. Refer to the readme.md file for more information. +TRANSCRIPTION_METHOD="whisper-groq" # options are 'local', 'whisper-api' and 'whisper-groq' # ONLY NECESSARY IF TRANSCRIPTION_METHOD IS SET TO 'local' # Name of the model to use for local transcription. Refer to the readme.md file for more information. @@ -63,7 +64,7 @@ TRANSCRIPTION_MODEL="ggml-model-whisper-base.bin" # If you only plan to send audio in one language, it is recommended to specify the language. # List of languages are: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py # Leave it as "auto" if you will use multiple languages. -TRANSCRIPTION_LANGUAGE="auto" # Example: "pt" (portuguese), "en" (english), "es" (spanish). +TRANSCRIPTION_LANGUAGE="auto" # Example: "pt" (portuguese), "en" (english), "es" (spanish), "auto" for automatic detection. # ------------------------------ # LangChain Features: diff --git a/README.md b/README.md index 481b2c3..4b82f36 100644 --- a/README.md +++ b/README.md @@ -128,19 +128,34 @@ The AI's are designed to respond to natural language queries from users. You can ### Voice Messages -When dealing with voice messages, you have two options for transcription: utilizing the Whisper API or the local method. Each option has its own considerations, including cost and performance. +When dealing with voice messages, you have 3 options for transcription: using Groq's Whisper API for free (recommended), utilizing OpenAI's Whisper API, or the local method. 
Each option has its own considerations, including cost and performance. + +
+Groq API: + +- **Setup:** + 1. Obtain a Groq API key from [Groq Console](https://console.groq.com/keys) and place it in the `.env` file under the `GROQ_API_KEY` variable. + 2. In the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-groq"`. +
Whisper API: - **Cost:** Utilizing the Whisper API incurs a cost of US$0.06 per 10 minutes of audio. -- **Setup:** 1. Obtain an OpenAI API key and place it in the `.env` file under the `OPENAI_API_KEY` variable. 2. Set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-api"`. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance. +- **Setup:** + 1. Obtain an OpenAI API key and place it in the `.env` file under the `OPENAI_API_KEY` variable. + 2. In the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-api"`. +
+
Local Mode: - **Cost:** The local method is free but may be slower and less precise. -- **Setup:** 1. Download a model of your choice from [here](https://huggingface.co/ggerganov/whisper.cpp/tree/main). Download any `.bin` file and place it in the `./whisper/models` folder. 2. Modify the `.env` file by changing `TRANSCRIPTION_ENABLED` to `"true"`, `TRANSCRIPTION_METHOD` to `"local"`, and `"TRANSCRIPTION_MODEL"` with the name of the model you downloaded. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance. +- **Setup:** + 1. Download a model of your choice from [here](https://huggingface.co/ggerganov/whisper.cpp/tree/main). Download any `.bin` file and place it in the `./whisper/models` folder. + 2. Modify the `.env` file by changing `TRANSCRIPTION_ENABLED` to `"true"`, `TRANSCRIPTION_METHOD` to `"local"`, and `"TRANSCRIPTION_MODEL"` with the name of the model you downloaded. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance. +
### Group Chat diff --git a/package.json b/package.json index 516c37a..5a79f92 100644 --- a/package.json +++ b/package.json @@ -48,6 +48,7 @@ "dotenv-expand": "^10.0.0", "fluent-ffmpeg": "^2.1.2", "googleapis": "^126.0.1", + "groq-sdk": "^0.7.0", "keyv": "^4.5.4", "langchain": "0.2.9", "node-schedule": "^2.1.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ac155ab..52971a3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -65,6 +65,9 @@ importers: googleapis: specifier: ^126.0.1 version: 126.0.1(encoding@0.1.13) + groq-sdk: + specifier: ^0.7.0 + version: 0.7.0(encoding@0.1.13) keyv: specifier: ^4.5.4 version: 4.5.4 @@ -1419,6 +1422,9 @@ packages: groq-sdk@0.5.0: resolution: {integrity: sha512-RVmhW7qZ+XZoy5fIuSdx/LGQJONpL8MHgZEW7dFwTdgkzStub2XQx6OKv28CHogijdwH41J+Npj/z2jBPu3vmw==} + groq-sdk@0.7.0: + resolution: {integrity: sha512-OgPqrRtti5MjEVclR8sgBHrhSkTLdFCmi47yrEF29uJZaiCkX3s7bXpnMhq8Lwoe1f4AwgC0qGOeHXpeSgu5lg==} + gtoken@7.1.0: resolution: {integrity: sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==} engines: {node: '>=14.0.0'} @@ -4232,6 +4238,18 @@ snapshots: transitivePeerDependencies: - encoding + groq-sdk@0.7.0(encoding@0.1.13): + dependencies: + '@types/node': 18.19.31 + '@types/node-fetch': 2.6.11 + abort-controller: 3.0.0 + agentkeepalive: 4.5.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0(encoding@0.1.13) + transitivePeerDependencies: + - encoding + gtoken@7.1.0(encoding@0.1.13): dependencies: gaxios: 6.5.0(encoding@0.1.13) diff --git a/src/handlers/audio-message/index.ts b/src/handlers/audio-message/index.ts index a625106..a83a9ba 100644 --- a/src/handlers/audio-message/index.ts +++ b/src/handlers/audio-message/index.ts @@ -11,6 +11,7 @@ import { import { convertOggToWav } from "./audio-helper"; import { handleAudioMessageWithWhisperApi } from "./whisper-api"; import { handleAudioMessageWithWhisperLocal } from "./whisper-local"; +import { handleAudioMessageWithGroqApi } 
from "./whisper-groq"; export async function handleAudioMessage( media: MessageMedia, @@ -44,6 +45,18 @@ export async function handleAudioMessage( console.error(error); throw new Error("Error transcribing audio"); } + } else if (TRANSCRIPTION_METHOD === "whisper-groq") { + try { + transcribedAudio = await handleAudioMessageWithGroqApi(wavPath); + } catch (error) { + console.error(error); + throw new Error("Error transcribing audio"); + } + } else { + throw new Error( + "Invalid transcription method, TRANSCRIPTION_METHOD: " + + TRANSCRIPTION_METHOD + ); } if (REPLY_TRANSCRIPTION === "true") { diff --git a/src/handlers/audio-message/whisper-groq.ts b/src/handlers/audio-message/whisper-groq.ts new file mode 100644 index 0000000..f6b4d4a --- /dev/null +++ b/src/handlers/audio-message/whisper-groq.ts @@ -0,0 +1,34 @@ +import fs from "fs"; +import { openai } from "../../clients/openai"; +import { TRANSCRIPTION_LANGUAGE } from "../../constants"; +import Groq from "groq-sdk"; + +export async function handleAudioMessageWithGroqApi(wavPath: string) { + console.log("Transcribing audio with Groq API"); + try { + const groq = new Groq() + + const transcriptionOptions: any = { + file: fs.createReadStream(wavPath), + model: "whisper-large-v3", + //prompt: "Specify context or spelling", // Optional + response_format: "json", // Optional + temperature: 0.0, // Optional + }; + + if (TRANSCRIPTION_LANGUAGE !== "auto") { + transcriptionOptions.language = TRANSCRIPTION_LANGUAGE; + } + + const transcription = + await groq.audio.transcriptions.create(transcriptionOptions); + return transcription.text; + } finally { + // Regardless of success or failure, attempt to delete the WAV file + try { + fs.unlinkSync(wavPath); // Delete the WAV file + } catch (error) { + console.error("Error deleting WAV file:", error); + } + } +} diff --git a/src/helpers/utils.ts b/src/helpers/utils.ts index 49dd31f..096057a 100644 --- a/src/helpers/utils.ts +++ b/src/helpers/utils.ts @@ -15,7 +15,9 @@ export 
function checkEnv() { if (process.env.TRANSCRIPTION_ENABLED === "true") { if ( !process.env.TRANSCRIPTION_METHOD || - !["whisper-api", "local"].includes(process.env.TRANSCRIPTION_METHOD) + !["whisper-api", "local", "whisper-groq"].includes( + process.env.TRANSCRIPTION_METHOD + ) ) { throw new Error( `Invalid TRANSCRIPTION_METHOD="${process.env.TRANSCRIPTION_METHOD}" provided. Please check the TRANSCRIPTION_METHOD variable in your .env file.` @@ -34,13 +36,19 @@ export function checkEnv() { if (process.env.TRANSCRIPTION_METHOD === "local") { if (!process.env.TRANSCRIPTION_MODEL) { throw new Error( - `Invalid TRANSCRIPTION_MODEL="${process.env.TRANSCRIPTION_MODEL}" provided. Please check the TRANSCRIPTION_MODEL variable in your .env file.` + `Invalid TRANSCRIPTION_MODEL="${process.env.TRANSCRIPTION_MODEL}" provided. Please check the TRANSCRIPTION_MODEL variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".` ); } } else if (process.env.TRANSCRIPTION_METHOD === "whisper-api") { if (!process.env.OPENAI_API_KEY || process.env.OPENAI_API_KEY === "") { throw new Error( - `Invalid OPENAI_API_KEY="${process.env.OPENAI_API_KEY}" provided. Please check the OPENAI_API_KEY variable in your .env file.` + `Invalid OPENAI_API_KEY="${process.env.OPENAI_API_KEY}" provided. Please check the OPENAI_API_KEY variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".` + ); + } + } else if (process.env.TRANSCRIPTION_METHOD === "whisper-groq") { + if (!process.env.GROQ_API_KEY || process.env.GROQ_API_KEY === "") { + throw new Error( + `Invalid GROQ_API_KEY="${process.env.GROQ_API_KEY}" provided. Please check the GROQ_API_KEY variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".` ); } }