From cdc7e44a0c807efa14dee81b6d3914d6faebbe83 Mon Sep 17 00:00:00 2001 From: Jhen <developer@jhen.me> Date: Thu, 21 Sep 2023 13:05:14 +0800 Subject: [PATCH 1/6] feat(ios): initial work for simple VAD --- cpp/rn-whisper.cpp | 52 ++++++++++++++++++++++++++++++++++++++++ cpp/rn-whisper.h | 3 ++- example/ios/Podfile.lock | 12 +++++----- example/src/App.tsx | 2 ++ ios/RNWhisperContext.mm | 18 ++++++++++++++ src/index.ts | 6 +++++ 6 files changed, 86 insertions(+), 7 deletions(-) diff --git a/cpp/rn-whisper.cpp b/cpp/rn-whisper.cpp index fcd3133..bbc304f 100644 --- a/cpp/rn-whisper.cpp +++ b/cpp/rn-whisper.cpp @@ -38,4 +38,56 @@ void rn_whisper_abort_all_transcribe() { } } +void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) { + const float rc = 1.0f / (2.0f * M_PI * cutoff); + const float dt = 1.0f / sample_rate; + const float alpha = dt / (rc + dt); + + float y = data[0]; + + for (size_t i = 1; i < data.size(); i++) { + y = alpha * (y + data[i] - data[i - 1]); + data[i] = y; + } +} + +bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { + const int n_samples = pcmf32.size(); + const int n_samples_last = (sample_rate * last_ms) / 1000; + + if (n_samples_last >= n_samples) { + // not enough samples - assume no speech + printf("not enough samples - assume no speech\n"); + return false; + } + + if (freq_thold > 0.0f) { + high_pass_filter(pcmf32, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + float energy_last = 0.0f; + + for (int i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + + if (i >= n_samples - n_samples_last) { + energy_last += fabsf(pcmf32[i]); + } + } + + energy_all /= n_samples; + energy_last /= n_samples_last; + + if (verbose) { + fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); + } + + if (energy_last > 
vad_thold*energy_all) { + return false; + } + + return true; +} + } \ No newline at end of file diff --git a/cpp/rn-whisper.h b/cpp/rn-whisper.h index 4fd2c1b..4f65158 100644 --- a/cpp/rn-whisper.h +++ b/cpp/rn-whisper.h @@ -10,7 +10,8 @@ void rn_whisper_remove_abort_map(int job_id); void rn_whisper_abort_transcribe(int job_id); bool rn_whisper_transcribe_is_aborted(int job_id); void rn_whisper_abort_all_transcribe(); +bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index a94a1fd..2abfa4c 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -750,16 +750,16 @@ PODS: - React-perflogger (= 0.71.11) - RNFS (2.20.0): - React-Core - - RNZipArchive (6.0.9): + - RNZipArchive (6.1.0): - React-Core - - RNZipArchive/Core (= 6.0.9) + - RNZipArchive/Core (= 6.1.0) - SSZipArchive (~> 2.2) - - RNZipArchive/Core (6.0.9): + - RNZipArchive/Core (6.1.0): - React-Core - SSZipArchive (~> 2.2) - SocketRocket (0.6.0) - SSZipArchive (2.4.3) - - whisper-rn (0.3.5): + - whisper-rn (0.3.6): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -994,10 +994,10 @@ SPEC CHECKSUMS: React-runtimeexecutor: 4817d63dbc9d658f8dc0ec56bd9b83ce531129f0 ReactCommon: 08723d2ed328c5cbcb0de168f231bc7bae7f8aa1 RNFS: 4ac0f0ea233904cb798630b3c077808c06931688 - RNZipArchive: 68a0c6db4b1c103f846f1559622050df254a3ade + RNZipArchive: ef9451b849c45a29509bf44e65b788829ab07801 SocketRocket: fccef3f9c5cedea1353a9ef6ada904fde10d6608 SSZipArchive: fe6a26b2a54d5a0890f2567b5cc6de5caa600aef - whisper-rn: 6f293154b175fee138a994fa00d0f414fb1f44e9 + whisper-rn: e80c0482f6a632faafd601f98f10da0255c1e1ec Yoga: f7decafdc5e8c125e6fa0da38a687e35238420fa YogaKit: f782866e155069a2cca2517aafea43200b01fd5a diff --git a/example/src/App.tsx b/example/src/App.tsx index d75a013..287c289 100644 --- 
a/example/src/App.tsx +++ b/example/src/App.tsx @@ -266,6 +266,8 @@ export default function App() { realtimeAudioSec: 60, // Slice audio into 25 (or < 30) sec chunks for better performance realtimeAudioSliceSec: 25, + // + useVad: true, }) setStopTranscribe({ stop }) subscribe((evt) => { diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index 57baa8f..0f67933 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -1,4 +1,5 @@ #import "RNWhisperContext.h" +#include <vector> #define NUM_BYTES_PER_BUFFER 16 * 1024 @@ -142,11 +143,28 @@ void AudioInputCallback(void * inUserData, for (int i = 0; i < n; i++) { audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i]; } + + bool isSpeech = true; + if (state->options[@"useVad"]) { + if (nSamples + n > WHISPER_SAMPLE_RATE * 2) { + int start = nSamples + n - WHISPER_SAMPLE_RATE * 2; + std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2); + for (int i = 0; i < WHISPER_SAMPLE_RATE * 2; i++) { + audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f; + } + isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, 0.6f, 100.0f, false); + NSLog(@"[RNWhisper] VAD result: %d", isSpeech); + } else { + isSpeech = false; + } + } nSamples += n; state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples]; AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL); + if (!isSpeech) return; + if (!state->isTranscribing) { state->isTranscribing = true; dispatch_async([state->mSelf getDispatchQueue], ^{ diff --git a/src/index.ts b/src/index.ts index 2c291d9..d185cc8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -58,6 +58,12 @@ export type TranscribeRealtimeOptions = TranscribeOptions & { * (Default: Equal to `realtimeMaxAudioSec`) */ realtimeAudioSliceSec?: number + /** + * Start transcribe on recording when the audio volume is greater than the threshold by using VAD (Voice Activity Detection). 
+ * The first VAD will be triggered after 2 seconds of recording. + * (Default: false) + */ + useVad?: boolean } export type TranscribeRealtimeEvent = { From 47260a3839f54f28aa971dad8f454a8a2121956b Mon Sep 17 00:00:00 2001 From: Jhen <developer@jhen.me> Date: Fri, 22 Sep 2023 09:38:29 +0800 Subject: [PATCH 2/6] feat(ios): skip vad if isTranscribing --- ios/RNWhisperContext.mm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index 0f67933..18866d4 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -145,7 +145,7 @@ void AudioInputCallback(void * inUserData, } bool isSpeech = true; - if (state->options[@"useVad"]) { + if (!state->isTranscribing && state->options[@"useVad"]) { if (nSamples + n > WHISPER_SAMPLE_RATE * 2) { int start = nSamples + n - WHISPER_SAMPLE_RATE * 2; std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2); From e38f7d62d39c0b3fdca5226f162b4eb8b3eb9752 Mon Sep 17 00:00:00 2001 From: Jhen <developer@jhen.me> Date: Fri, 22 Sep 2023 12:52:28 +0800 Subject: [PATCH 3/6] feat(ios): add vadMs / vadThold / vadFreqThold options --- cpp/rn-whisper.cpp | 1 - ios/RNWhisperContext.mm | 14 +++++++++----- src/index.ts | 12 ++++++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/rn-whisper.cpp b/cpp/rn-whisper.cpp index bbc304f..b5648bc 100644 --- a/cpp/rn-whisper.cpp +++ b/cpp/rn-whisper.cpp @@ -57,7 +57,6 @@ bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int las if (n_samples_last >= n_samples) { // not enough samples - assume no speech - printf("not enough samples - assume no speech\n"); return false; } diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index 18866d4..dd120ff 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -146,13 +146,17 @@ void AudioInputCallback(void * inUserData, bool isSpeech = true; if (!state->isTranscribing && state->options[@"useVad"]) { - if (nSamples + n > 
WHISPER_SAMPLE_RATE * 2) { - int start = nSamples + n - WHISPER_SAMPLE_RATE * 2; - std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2); - for (int i = 0; i < WHISPER_SAMPLE_RATE * 2; i++) { + int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2; + int sampleSize = vadSec * WHISPER_SAMPLE_RATE; + if (nSamples + n > sampleSize) { + int start = nSamples + n - sampleSize; + std::vector<float> audioBufferF32Vec(sampleSize); + for (int i = 0; i < sampleSize; i++) { audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f; } - isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, 0.6f, 100.0f, false); + float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f; + float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f; + isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false); NSLog(@"[RNWhisper] VAD result: %d", isSpeech); } else { isSpeech = false; diff --git a/src/index.ts b/src/index.ts index d185cc8..4c2787f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -64,6 +64,18 @@ export type TranscribeRealtimeOptions = TranscribeOptions & { * (Default: false) */ useVad?: boolean + /** + * The length of the collected audio is used for VAD. (ms) (Default: 2000) + */ + vadMs?: number + /** + * VAD threshold. (Default: 0.6) + */ + vadThold?: number + /** + * Frequency to apply High-pass filter in VAD. 
(Default: 100.0) + */ + vadFreqThold?: number } export type TranscribeRealtimeEvent = { From ef922c7e4434c1fcc593846498c0096f8bb9d8d6 Mon Sep 17 00:00:00 2001 From: Jhen <developer@jhen.me> Date: Fri, 22 Sep 2023 13:10:33 +0800 Subject: [PATCH 4/6] feat(android): implement vad on realtime transcription --- .../java/com/rnwhisper/WhisperContext.java | 21 ++++++++++++++++++ android/src/main/jni.cpp | 22 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java index 3b73e72..bea9eb4 100644 --- a/android/src/main/java/com/rnwhisper/WhisperContext.java +++ b/android/src/main/java/com/rnwhisper/WhisperContext.java @@ -166,9 +166,29 @@ public void run() { for (int i = 0; i < n; i++) { shortBuffer[nSamples + i] = buffer[i]; } + + boolean isSpeech = true; + if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) { + int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2; + int sampleSize = vadSec * SAMPLE_RATE; + if (nSamples + n > sampleSize) { + float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f; + float vadFreqThold = options.hasKey("vadFreqThold") ? 
(float) options.getDouble("vadFreqThold") : 0.6f; + float[] audioData = new float[sampleSize]; + for (int i = 0; i < sampleSize; i++) { + audioData[i] = shortBuffer[nSamples + i] / 32768.0f; + } + isSpeech = vadSample(audioData, sampleSize, vadThold, vadFreqThold); + } else { + isSpeech = false; + } + } + nSamples += n; sliceNSamples.set(sliceIndex, nSamples); + if (!isSpeech) continue; + if (!isTranscribing && nSamples > SAMPLE_RATE / 2) { isTranscribing = true; fullHandler = new Thread(new Runnable() { @@ -513,6 +533,7 @@ private static String cpuInfo() { protected static native long initContext(String modelPath); protected static native long initContextWithAsset(AssetManager assetManager, String modelPath); protected static native long initContextWithInputStream(PushbackInputStream inputStream); + protected static native boolean vadSample(float[] audio_data, int audio_data_len, float vad_thold, float vad_freq_thold); protected static native int fullTranscribe( int job_id, long context, diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index 3b97417..20e6f02 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -6,6 +6,7 @@ #include <sys/sysinfo.h> #include <string> #include <thread> +#include <vector> #include "whisper.h" #include "rn-whisper.h" #include "ggml.h" @@ -184,6 +185,27 @@ Java_com_rnwhisper_WhisperContext_initContextWithInputStream( return reinterpret_cast<jlong>(context); } +JNIEXPORT jboolean JNICALL +Java_com_rnwhisper_WhisperContext_vadSimple( + JNIEnv *env, + jobject thiz, + jfloatArray audio_data, + jint audio_data_len, + jfloat vad_thold, + jfloat vad_freq_thold +) { + UNUSED(thiz); + + std::vector<float> samples(audio_data_len); + jfloat *audio_data_arr = env->GetFloatArrayElements(audio_data, nullptr); + for (int i = 0; i < audio_data_len; i++) { + samples[i] = audio_data_arr[i]; + } + bool is_speech = rn_whisper_vad_simple(samples, WHISPER_SAMPLE_RATE, 1000, vad_thold, vad_freq_thold, false); + 
env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT); + return is_speech; +} + struct progress_callback_context { JNIEnv *env; jobject progress_callback_instance; From 655d9b3f87ef26329f0f65be4ef713553b0fe4b3 Mon Sep 17 00:00:00 2001 From: Jhen <developer@jhen.me> Date: Sat, 23 Sep 2023 13:37:04 +0800 Subject: [PATCH 5/6] feat: use vad to check last transcription --- .../java/com/rnwhisper/WhisperContext.java | 46 +++++++++++------- example/src/App.tsx | 2 +- ios/RNWhisperContext.mm | 47 ++++++++++++------- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java index 84029f1..3f8552e 100644 --- a/android/src/main/java/com/rnwhisper/WhisperContext.java +++ b/android/src/main/java/com/rnwhisper/WhisperContext.java @@ -162,6 +162,27 @@ private void saveWavFile(byte[] rawData, String audioOutputFile) throws IOExcept } } + private boolean vad(ReadableMap options, short[] shortBuffer, int nSamples, int n) { + boolean isSpeech = true; + if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) { + int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2; + int sampleSize = vadSec * SAMPLE_RATE; + if (nSamples + n > sampleSize) { + int start = nSamples + n - sampleSize; + float[] audioData = new float[sampleSize]; + for (int i = 0; i < sampleSize; i++) { + audioData[i] = shortBuffer[i + start] / 32768.0f; + } + float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f; + float vadFreqThold = options.hasKey("vadFreqThold") ? 
(float) options.getDouble("vadFreqThold") : 100.0f; + isSpeech = vadSimple(audioData, sampleSize, vadThold, vadFreqThold); + } else { + isSpeech = false; + } + } + return isSpeech; + } + public int startRealtimeTranscribe(int jobId, ReadableMap options) { if (isCapturing || isTranscribing) { return -100; @@ -223,6 +244,12 @@ public void run() { ) { emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap()); } else if (!isTranscribing) { + short[] shortBuffer = shortBufferSlices.get(sliceIndex); + boolean isSpeech = vad(options, shortBuffer, nSamples, 0); + if (!isSpeech) { + emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap()); + break; + } isTranscribing = true; fullTranscribeSamples(options, true); } @@ -245,22 +272,7 @@ public void run() { shortBuffer[nSamples + i] = buffer[i]; } - boolean isSpeech = true; - if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) { - int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2; - int sampleSize = vadSec * SAMPLE_RATE; - if (nSamples + n > sampleSize) { - float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f; - float vadFreqThold = options.hasKey("vadFreqThold") ? 
(float) options.getDouble("vadFreqThold") : 0.6f; - float[] audioData = new float[sampleSize]; - for (int i = 0; i < sampleSize; i++) { - audioData[i] = shortBuffer[nSamples + i] / 32768.0f; - } - isSpeech = vadSample(audioData, sampleSize, vadThold, vadFreqThold); - } else { - isSpeech = false; - } - } + boolean isSpeech = vad(options, shortBuffer, nSamples, n); nSamples += n; sliceNSamples.set(sliceIndex, nSamples); @@ -613,7 +625,7 @@ private static String cpuInfo() { protected static native long initContext(String modelPath); protected static native long initContextWithAsset(AssetManager assetManager, String modelPath); protected static native long initContextWithInputStream(PushbackInputStream inputStream); - protected static native boolean vadSample(float[] audio_data, int audio_data_len, float vad_thold, float vad_freq_thold); + protected static native boolean vadSimple(float[] audio_data, int audio_data_len, float vad_thold, float vad_freq_thold); protected static native int fullTranscribe( int job_id, long context, diff --git a/example/src/App.tsx b/example/src/App.tsx index 287c289..e374f39 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -266,7 +266,7 @@ export default function App() { realtimeAudioSec: 60, // Slice audio into 25 (or < 30) sec chunks for better performance realtimeAudioSliceSec: 25, - // + // Voice Activity Detection - Start transcribing when speech is detected useVad: true, }) setStopTranscribe({ stop }) diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index dd120ff..58e4911 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -78,6 +78,29 @@ - (void)freeBufferIfNeeded { } } +bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n) +{ + bool isSpeech = true; + if (!state->isTranscribing && [state->options[@"useVad"] boolValue]) { + int vadSec = state->options[@"vadMs"] != nil ? 
[state->options[@"vadMs"] intValue] / 1000 : 2; + int sampleSize = vadSec * WHISPER_SAMPLE_RATE; + if (nSamples + n > sampleSize) { + int start = nSamples + n - sampleSize; + std::vector<float> audioBufferF32Vec(sampleSize); + for (int i = 0; i < sampleSize; i++) { + audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f; + } + float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f; + float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f; + isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false); + NSLog(@"[RNWhisper] VAD result: %d", isSpeech); + } else { + isSpeech = false; + } + } + return isSpeech; +} + void AudioInputCallback(void * inUserData, AudioQueueRef inAQ, AudioQueueBufferRef inBuffer, @@ -118,6 +141,11 @@ void AudioInputCallback(void * inUserData, !state->isTranscribing && nSamples != state->nSamplesTranscribing ) { + int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue]; + if (!vad(state, audioBufferI16, nSamples, 0)) { + state->transcribeHandler(state->jobId, @"end", @{}); + return; + } state->isTranscribing = true; dispatch_async([state->mSelf getDispatchQueue], ^{ [state->mSelf fullTranscribeSamples:state]; @@ -144,24 +172,7 @@ void AudioInputCallback(void * inUserData, audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i]; } - bool isSpeech = true; - if (!state->isTranscribing && state->options[@"useVad"]) { - int vadSec = state->options[@"vadMs"] != nil ? 
[state->options[@"vadMs"] intValue] / 1000 : 2; - int sampleSize = vadSec * WHISPER_SAMPLE_RATE; - if (nSamples + n > sampleSize) { - int start = nSamples + n - sampleSize; - std::vector<float> audioBufferF32Vec(sampleSize); - for (int i = 0; i < sampleSize; i++) { - audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f; - } - float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f; - float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f; - isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false); - NSLog(@"[RNWhisper] VAD result: %d", isSpeech); - } else { - isSpeech = false; - } - } + bool isSpeech = vad(state, audioBufferI16, nSamples, n); nSamples += n; state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples]; From 7a9ff8060c3c071677e8cef7d557590eb2f7af85 Mon Sep 17 00:00:00 2001 From: Jhen <developer@jhen.me> Date: Sat, 23 Sep 2023 14:28:25 +0800 Subject: [PATCH 6/6] feat(example): do not use vad by default --- example/src/App.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/src/App.tsx b/example/src/App.tsx index e374f39..0317b23 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -267,7 +267,7 @@ export default function App() { // Slice audio into 25 (or < 30) sec chunks for better performance realtimeAudioSliceSec: 25, // Voice Activity Detection - Start transcribing when speech is detected - useVad: true, + // useVad: true, }) setStopTranscribe({ stop }) subscribe((evt) => {