diff --git a/.env.example b/.env.example
index 34d3cc6..402f7b8 100644
--- a/.env.example
+++ b/.env.example
@@ -31,6 +31,7 @@ ANTHROPIC_API_KEY="" # sk-...
# ------------------------------
# Obligatory if you're using Groq's models and want to use tool calling:
+# Also required if you're using Groq's Whisper API for audio transcription:
# ------------------------------
# You can get this at https://console.groq.com/keys
GROQ_API_KEY="" # gsk-...
@@ -49,11 +50,11 @@ OPENROUTER_API_KEY="" # sk-90...
# Determines whether the bot should detect and convert your voice messages into text
# Accepted values are "true" or "false"
TRANSCRIPTION_ENABLED="false"
-
-# There are 2 ways to transcribe audio: using OpenAI Whisper API, which costs US$0.06 per 10 minutes of audio (check for current prices in their website), or using Whisper locally.
-# Local transcription is slower and provides worse results, but it is free.
-# If you choose to use the local method, you need to do some things. Refer to the readme.md file for more information.
-TRANSCRIPTION_METHOD="local" # options are 'local' or 'whisper-api'
+# There are 3 ways to transcribe audio:
+# Using Groq's Whisper API, which is currently free and offers better performance than both the local method and OpenAI's Whisper API, since it uses the whisper-large-v3 model.
+# Using OpenAI's Whisper API, which costs US$0.06 per 10 minutes of audio (check their website for current prices) and uses the whisper-1 model,
+# Or using Whisper locally. If you choose to use the local method, you need to do some things. Refer to the readme.md file for more information.
+TRANSCRIPTION_METHOD="whisper-groq" # options are 'local', 'whisper-api' and 'whisper-groq'
# ONLY NECESSARY IF TRANSCRIPTION_METHOD IS SET TO 'local'
# Name of the model to use for local transcription. Refer to the readme.md file for more information.
@@ -63,7 +64,7 @@ TRANSCRIPTION_MODEL="ggml-model-whisper-base.bin"
# If you only plan to send audio in one language, it is recommended to specify the language.
# List of languages are: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
# Leave it as "auto" if you will use multiple languages.
-TRANSCRIPTION_LANGUAGE="auto" # Example: "pt" (portuguese), "en" (english), "es" (spanish).
+TRANSCRIPTION_LANGUAGE="auto" # Example: "pt" (portuguese), "en" (english), "es" (spanish), "auto" for automatic detection.
# ------------------------------
# LangChain Features:
diff --git a/README.md b/README.md
index 481b2c3..4b82f36 100644
--- a/README.md
+++ b/README.md
@@ -128,19 +128,34 @@ The AI's are designed to respond to natural language queries from users. You can
### Voice Messages
-When dealing with voice messages, you have two options for transcription: utilizing the Whisper API or the local method. Each option has its own considerations, including cost and performance.
+When dealing with voice messages, you have 3 options for transcription: using Groq's Whisper API for free (recommended), using OpenAI's Whisper API, or using the local method. Each option has its own considerations, including cost and performance.
+
+
+Groq API:
+
+- **Setup:**
+ 1. Obtain a Groq API key from [Groq Console](https://console.groq.com/keys).
+ 2. In the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-groq"`.
+
Whisper API:
- **Cost:** Utilizing the Whisper API incurs a cost of US$0.06 per 10 minutes of audio.
-- **Setup:** 1. Obtain an OpenAI API key and place it in the `.env` file under the `OPENAI_API_KEY` variable. 2. Set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-api"`. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance.
+- **Setup:**
+ 1. Obtain an OpenAI API key and place it in the `.env` file under the `OPENAI_API_KEY` variable.
+ 2. In the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"whisper-api"`.
+
+
Local Mode:
- **Cost:** The local method is free but may be slower and less precise.
-- **Setup:** 1. Download a model of your choice from [here](https://huggingface.co/ggerganov/whisper.cpp/tree/main). Download any `.bin` file and place it in the `./whisper/models` folder. 2. Modify the `.env` file by changing `TRANSCRIPTION_ENABLED` to `"true"`, `TRANSCRIPTION_METHOD` to `"local"`, and `"TRANSCRIPTION_MODEL"` with the name of the model you downloaded. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance.
+- **Setup:**
+ 1. Download a model of your choice from [here](https://huggingface.co/ggerganov/whisper.cpp/tree/main). Download any `.bin` file and place it in the `./whisper/models` folder.
+ 2. Modify the `.env` file by changing `TRANSCRIPTION_ENABLED` to `"true"`, `TRANSCRIPTION_METHOD` to `"local"`, and `"TRANSCRIPTION_MODEL"` with the name of the model you downloaded. While setting a language in `TRANSCRIPTION_LANGUAGE` is not mandatory, it is recommended for better performance.
+
### Group Chat
diff --git a/package.json b/package.json
index 516c37a..5a79f92 100644
--- a/package.json
+++ b/package.json
@@ -48,6 +48,7 @@
"dotenv-expand": "^10.0.0",
"fluent-ffmpeg": "^2.1.2",
"googleapis": "^126.0.1",
+ "groq-sdk": "^0.7.0",
"keyv": "^4.5.4",
"langchain": "0.2.9",
"node-schedule": "^2.1.1",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ac155ab..52971a3 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -65,6 +65,9 @@ importers:
googleapis:
specifier: ^126.0.1
version: 126.0.1(encoding@0.1.13)
+ groq-sdk:
+ specifier: ^0.7.0
+ version: 0.7.0(encoding@0.1.13)
keyv:
specifier: ^4.5.4
version: 4.5.4
@@ -1419,6 +1422,9 @@ packages:
groq-sdk@0.5.0:
resolution: {integrity: sha512-RVmhW7qZ+XZoy5fIuSdx/LGQJONpL8MHgZEW7dFwTdgkzStub2XQx6OKv28CHogijdwH41J+Npj/z2jBPu3vmw==}
+ groq-sdk@0.7.0:
+ resolution: {integrity: sha512-OgPqrRtti5MjEVclR8sgBHrhSkTLdFCmi47yrEF29uJZaiCkX3s7bXpnMhq8Lwoe1f4AwgC0qGOeHXpeSgu5lg==}
+
gtoken@7.1.0:
resolution: {integrity: sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==}
engines: {node: '>=14.0.0'}
@@ -4232,6 +4238,18 @@ snapshots:
transitivePeerDependencies:
- encoding
+ groq-sdk@0.7.0(encoding@0.1.13):
+ dependencies:
+ '@types/node': 18.19.31
+ '@types/node-fetch': 2.6.11
+ abort-controller: 3.0.0
+ agentkeepalive: 4.5.0
+ form-data-encoder: 1.7.2
+ formdata-node: 4.4.1
+ node-fetch: 2.7.0(encoding@0.1.13)
+ transitivePeerDependencies:
+ - encoding
+
gtoken@7.1.0(encoding@0.1.13):
dependencies:
gaxios: 6.5.0(encoding@0.1.13)
diff --git a/src/handlers/audio-message/index.ts b/src/handlers/audio-message/index.ts
index a625106..a83a9ba 100644
--- a/src/handlers/audio-message/index.ts
+++ b/src/handlers/audio-message/index.ts
@@ -11,6 +11,7 @@ import {
import { convertOggToWav } from "./audio-helper";
import { handleAudioMessageWithWhisperApi } from "./whisper-api";
import { handleAudioMessageWithWhisperLocal } from "./whisper-local";
+import { handleAudioMessageWithGroqApi } from "./whisper-groq";
export async function handleAudioMessage(
media: MessageMedia,
@@ -44,6 +45,18 @@ export async function handleAudioMessage(
console.error(error);
throw new Error("Error transcribing audio");
}
+ } else if (TRANSCRIPTION_METHOD === "whisper-groq") {
+ try {
+ transcribedAudio = await handleAudioMessageWithGroqApi(wavPath);
+ } catch (error) {
+ console.error(error);
+ throw new Error("Error transcribing audio");
+ }
+ } else {
+ throw new Error(
+ "Invalid transcription method, TRANSCRIPTION_METHOD: " +
+ TRANSCRIPTION_METHOD
+ );
}
if (REPLY_TRANSCRIPTION === "true") {
diff --git a/src/handlers/audio-message/whisper-groq.ts b/src/handlers/audio-message/whisper-groq.ts
new file mode 100644
index 0000000..f6b4d4a
--- /dev/null
+++ b/src/handlers/audio-message/whisper-groq.ts
@@ -0,0 +1,37 @@
+import fs from "fs";
+import { TRANSCRIPTION_LANGUAGE } from "../../constants";
+import Groq from "groq-sdk";
+
+// Transcribes the WAV file at `wavPath` using Groq's Whisper API
+// (whisper-large-v3 model) and returns the transcribed text.
+// The WAV file is always deleted afterwards, whether transcription
+// succeeds or fails; any transcription error propagates to the caller.
+export async function handleAudioMessageWithGroqApi(wavPath: string) {
+  console.log("Transcribing audio with Groq API");
+  try {
+    const groq = new Groq(); // reads GROQ_API_KEY from the environment
+
+    const transcriptionOptions: any = {
+      file: fs.createReadStream(wavPath),
+      model: "whisper-large-v3",
+      //prompt: "Specify context or spelling", // Optional
+      response_format: "json", // Optional
+      temperature: 0.0, // Optional
+    };
+
+    // Only pin a language when one is configured; "auto" lets Whisper detect it.
+    if (TRANSCRIPTION_LANGUAGE !== "auto") {
+      transcriptionOptions.language = TRANSCRIPTION_LANGUAGE;
+    }
+
+    const transcription =
+      await groq.audio.transcriptions.create(transcriptionOptions);
+    return transcription.text;
+  } finally {
+    // Regardless of success or failure, attempt to delete the WAV file
+    try {
+      fs.unlinkSync(wavPath); // Delete the WAV file
+    } catch (error) {
+      console.error("Error deleting WAV file:", error);
+    }
+  }
+}
diff --git a/src/helpers/utils.ts b/src/helpers/utils.ts
index 49dd31f..096057a 100644
--- a/src/helpers/utils.ts
+++ b/src/helpers/utils.ts
@@ -15,7 +15,9 @@ export function checkEnv() {
if (process.env.TRANSCRIPTION_ENABLED === "true") {
if (
!process.env.TRANSCRIPTION_METHOD ||
- !["whisper-api", "local"].includes(process.env.TRANSCRIPTION_METHOD)
+ !["whisper-api", "local", "whisper-groq"].includes(
+ process.env.TRANSCRIPTION_METHOD
+ )
) {
throw new Error(
`Invalid TRANSCRIPTION_METHOD="${process.env.TRANSCRIPTION_METHOD}" provided. Please check the TRANSCRIPTION_METHOD variable in your .env file.`
@@ -34,13 +36,19 @@ export function checkEnv() {
if (process.env.TRANSCRIPTION_METHOD === "local") {
if (!process.env.TRANSCRIPTION_MODEL) {
throw new Error(
- `Invalid TRANSCRIPTION_MODEL="${process.env.TRANSCRIPTION_MODEL}" provided. Please check the TRANSCRIPTION_MODEL variable in your .env file.`
+ `Invalid TRANSCRIPTION_MODEL="${process.env.TRANSCRIPTION_MODEL}" provided. Please check the TRANSCRIPTION_MODEL variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".`
);
}
} else if (process.env.TRANSCRIPTION_METHOD === "whisper-api") {
if (!process.env.OPENAI_API_KEY || process.env.OPENAI_API_KEY === "") {
throw new Error(
- `Invalid OPENAI_API_KEY="${process.env.OPENAI_API_KEY}" provided. Please check the OPENAI_API_KEY variable in your .env file.`
+ `Invalid OPENAI_API_KEY="${process.env.OPENAI_API_KEY}" provided. Please check the OPENAI_API_KEY variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".`
+ );
+ }
+ } else if (process.env.TRANSCRIPTION_METHOD === "whisper-groq") {
+ if (!process.env.GROQ_API_KEY || process.env.GROQ_API_KEY === "") {
+ throw new Error(
+ `Invalid GROQ_API_KEY="${process.env.GROQ_API_KEY}" provided. Please check the GROQ_API_KEY variable in your .env file. Or disable audio transcription by setting TRANSCRIPTION_ENABLED to "false".`
);
}
}