From cdc7e44a0c807efa14dee81b6d3914d6faebbe83 Mon Sep 17 00:00:00 2001
From: Jhen <developer@jhen.me>
Date: Thu, 21 Sep 2023 13:05:14 +0800
Subject: [PATCH 1/6] feat(ios): initial work for simple VAD

---
 cpp/rn-whisper.cpp       | 52 ++++++++++++++++++++++++++++++++++++++++
 cpp/rn-whisper.h         |  3 ++-
 example/ios/Podfile.lock | 12 +++++-----
 example/src/App.tsx      |  2 ++
 ios/RNWhisperContext.mm  | 18 ++++++++++++++
 src/index.ts             |  6 +++++
 6 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/cpp/rn-whisper.cpp b/cpp/rn-whisper.cpp
index fcd3133..bbc304f 100644
--- a/cpp/rn-whisper.cpp
+++ b/cpp/rn-whisper.cpp
@@ -38,4 +38,56 @@ void rn_whisper_abort_all_transcribe() {
   }
 }
 
+// In-place first-order RC high-pass: y[i] = a * (y[i-1] + x[i] - x[i-1]) with a = rc / (rc + dt).
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+    if (data.empty() || cutoff <= 0.0f || sample_rate <= 0.0f) return; // nothing to filter; also avoids data[0] on an empty vector
+    const float rc = 1.0f / (2.0f * M_PI * cutoff);
+    const float alpha = rc / (rc + 1.0f / sample_rate); // dt = 1 / sample_rate; dt / (rc + dt) would be the low-pass coefficient
+    float x_prev = data[0], y = data[0]; // x_prev keeps the raw previous sample, since data[i - 1] is overwritten with filtered output
+    for (size_t i = 1; i < data.size(); i++) {
+        const float x = data[i];
+        data[i] = y = alpha * (y + x - x_prev);
+        x_prev = x;
+    }
+}
+// Energy-based VAD: compares the mean |sample| of the last `last_ms` ms with that of the whole buffer.
+bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+  const int n_samples      = pcmf32.size();
+  const int n_samples_last = (sample_rate * last_ms) / 1000; // number of samples in the trailing window
+
+  if (n_samples_last >= n_samples) {
+    // not enough samples - assume no speech
+    printf("not enough samples - assume no speech\n");
+    return false;
+  }
+
+  if (freq_thold > 0.0f) {
+    high_pass_filter(pcmf32, freq_thold, sample_rate);
+  }
+  // NOTE: the filter above runs in place, so the caller's pcmf32 is modified.
+  float energy_all  = 0.0f;
+  float energy_last = 0.0f;
+  // Accumulate absolute amplitudes over the whole buffer and over its trailing window.
+  for (int i = 0; i < n_samples; i++) {
+    energy_all += fabsf(pcmf32[i]);
+
+    if (i >= n_samples - n_samples_last) {
+      energy_last += fabsf(pcmf32[i]);
+    }
+  }
+  // Turn the sums into per-sample averages.
+  energy_all  /= n_samples;
+  energy_last /= n_samples_last;
+
+  if (verbose) {
+    fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+  }
+  // Returns false while the trailing window is louder than vad_thold * overall average, true otherwise.
+  if (energy_last > vad_thold*energy_all) {
+    return false;
+  }
+
+  return true;
+}
+
 }
\ No newline at end of file
diff --git a/cpp/rn-whisper.h b/cpp/rn-whisper.h
index 4fd2c1b..4f65158 100644
--- a/cpp/rn-whisper.h
+++ b/cpp/rn-whisper.h
@@ -10,7 +10,8 @@ void rn_whisper_remove_abort_map(int job_id);
 void rn_whisper_abort_transcribe(int job_id);
 bool rn_whisper_transcribe_is_aborted(int job_id);
 void rn_whisper_abort_all_transcribe();
+bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose);
 
 #ifdef __cplusplus
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index a94a1fd..2abfa4c 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -750,16 +750,16 @@ PODS:
     - React-perflogger (= 0.71.11)
   - RNFS (2.20.0):
     - React-Core
-  - RNZipArchive (6.0.9):
+  - RNZipArchive (6.1.0):
     - React-Core
-    - RNZipArchive/Core (= 6.0.9)
+    - RNZipArchive/Core (= 6.1.0)
     - SSZipArchive (~> 2.2)
-  - RNZipArchive/Core (6.0.9):
+  - RNZipArchive/Core (6.1.0):
     - React-Core
     - SSZipArchive (~> 2.2)
   - SocketRocket (0.6.0)
   - SSZipArchive (2.4.3)
-  - whisper-rn (0.3.5):
+  - whisper-rn (0.3.6):
     - RCT-Folly
     - RCTRequired
     - RCTTypeSafety
@@ -994,10 +994,10 @@ SPEC CHECKSUMS:
   React-runtimeexecutor: 4817d63dbc9d658f8dc0ec56bd9b83ce531129f0
   ReactCommon: 08723d2ed328c5cbcb0de168f231bc7bae7f8aa1
   RNFS: 4ac0f0ea233904cb798630b3c077808c06931688
-  RNZipArchive: 68a0c6db4b1c103f846f1559622050df254a3ade
+  RNZipArchive: ef9451b849c45a29509bf44e65b788829ab07801
   SocketRocket: fccef3f9c5cedea1353a9ef6ada904fde10d6608
   SSZipArchive: fe6a26b2a54d5a0890f2567b5cc6de5caa600aef
-  whisper-rn: 6f293154b175fee138a994fa00d0f414fb1f44e9
+  whisper-rn: e80c0482f6a632faafd601f98f10da0255c1e1ec
   Yoga: f7decafdc5e8c125e6fa0da38a687e35238420fa
   YogaKit: f782866e155069a2cca2517aafea43200b01fd5a
 
diff --git a/example/src/App.tsx b/example/src/App.tsx
index d75a013..287c289 100644
--- a/example/src/App.tsx
+++ b/example/src/App.tsx
@@ -266,6 +266,8 @@ export default function App() {
                     realtimeAudioSec: 60,
                     // Slice audio into 25 (or < 30) sec chunks for better performance
                     realtimeAudioSliceSec: 25,
+                    // 
+                    useVad: true,
                   })
                 setStopTranscribe({ stop })
                 subscribe((evt) => {
diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm
index 57baa8f..0f67933 100644
--- a/ios/RNWhisperContext.mm
+++ b/ios/RNWhisperContext.mm
@@ -1,4 +1,5 @@
 #import "RNWhisperContext.h"
+#include <vector>
 
 #define NUM_BYTES_PER_BUFFER 16 * 1024
 
@@ -142,11 +143,28 @@ void AudioInputCallback(void * inUserData,
     for (int i = 0; i < n; i++) {
         audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
     }
+
+    bool isSpeech = true;
+    if (state->options[@"useVad"]) {
+        if (nSamples + n > WHISPER_SAMPLE_RATE * 2) {
+            int start = nSamples + n - WHISPER_SAMPLE_RATE * 2;
+            std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2);
+            for (int i = 0; i < WHISPER_SAMPLE_RATE * 2; i++) {
+                audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
+            }
+            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, 0.6f, 100.0f, false);
+            NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
+        } else {
+            isSpeech = false;
+        }
+    }
     nSamples += n;
     state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
 
     AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
 
+    if (!isSpeech) return;
+
     if (!state->isTranscribing) {
         state->isTranscribing = true;
         dispatch_async([state->mSelf getDispatchQueue], ^{
diff --git a/src/index.ts b/src/index.ts
index 2c291d9..d185cc8 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -58,6 +58,12 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
    * (Default: Equal to `realtimeMaxAudioSec`)
    */
   realtimeAudioSliceSec?: number
+  /**
+   * Start transcribing during recording once the audio volume exceeds the threshold, using VAD (Voice Activity Detection).
+   * The first VAD check is triggered after 2 seconds of recording.
+   * (Default: false)
+   */
+  useVad?: boolean
 }
 
 export type TranscribeRealtimeEvent = {

From 47260a3839f54f28aa971dad8f454a8a2121956b Mon Sep 17 00:00:00 2001
From: Jhen <developer@jhen.me>
Date: Fri, 22 Sep 2023 09:38:29 +0800
Subject: [PATCH 2/6] feat(ios): skip vad if isTranscribing

---
 ios/RNWhisperContext.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm
index 0f67933..18866d4 100644
--- a/ios/RNWhisperContext.mm
+++ b/ios/RNWhisperContext.mm
@@ -145,7 +145,7 @@ void AudioInputCallback(void * inUserData,
     }
 
     bool isSpeech = true;
-    if (state->options[@"useVad"]) {
+    if (!state->isTranscribing && state->options[@"useVad"]) {
         if (nSamples + n > WHISPER_SAMPLE_RATE * 2) {
             int start = nSamples + n - WHISPER_SAMPLE_RATE * 2;
             std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2);

From e38f7d62d39c0b3fdca5226f162b4eb8b3eb9752 Mon Sep 17 00:00:00 2001
From: Jhen <developer@jhen.me>
Date: Fri, 22 Sep 2023 12:52:28 +0800
Subject: [PATCH 3/6] feat(ios): add vadMs / vadThold / vadFreqThold options

---
 cpp/rn-whisper.cpp      |  1 -
 ios/RNWhisperContext.mm | 14 +++++++++-----
 src/index.ts            | 12 ++++++++++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/cpp/rn-whisper.cpp b/cpp/rn-whisper.cpp
index bbc304f..b5648bc 100644
--- a/cpp/rn-whisper.cpp
+++ b/cpp/rn-whisper.cpp
@@ -57,7 +57,6 @@ bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int las
 
   if (n_samples_last >= n_samples) {
     // not enough samples - assume no speech
-    printf("not enough samples - assume no speech\n");
     return false;
   }
 
diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm
index 18866d4..dd120ff 100644
--- a/ios/RNWhisperContext.mm
+++ b/ios/RNWhisperContext.mm
@@ -146,13 +146,17 @@ void AudioInputCallback(void * inUserData,
 
     bool isSpeech = true;
     if (!state->isTranscribing && state->options[@"useVad"]) {
-        if (nSamples + n > WHISPER_SAMPLE_RATE * 2) {
-            int start = nSamples + n - WHISPER_SAMPLE_RATE * 2;
-            std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2);
-            for (int i = 0; i < WHISPER_SAMPLE_RATE * 2; i++) {
+        int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
+        int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
+        if (nSamples + n > sampleSize) {
+            int start = nSamples + n - sampleSize;
+            std::vector<float> audioBufferF32Vec(sampleSize);
+            for (int i = 0; i < sampleSize; i++) {
                 audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
             }
-            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, 0.6f, 100.0f, false);
+            float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
+            float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
+            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
             NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
         } else {
             isSpeech = false;
diff --git a/src/index.ts b/src/index.ts
index d185cc8..4c2787f 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -64,6 +64,18 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
    * (Default: false)
    */
   useVad?: boolean
+  /**
+   * The length of the collected audio used for VAD, in milliseconds. (Default: 2000)
+   */
+  vadMs?: number
+  /**
+   * VAD threshold. (Default: 0.6)
+   */
+  vadThold?: number
+  /**
+   * Cutoff frequency (Hz) of the high-pass filter applied before VAD. (Default: 100.0)
+   */
+  vadFreqThold?: number
 }
 
 export type TranscribeRealtimeEvent = {

From ef922c7e4434c1fcc593846498c0096f8bb9d8d6 Mon Sep 17 00:00:00 2001
From: Jhen <developer@jhen.me>
Date: Fri, 22 Sep 2023 13:10:33 +0800
Subject: [PATCH 4/6] feat(android): implement vad on realtime transcription

---
 .../java/com/rnwhisper/WhisperContext.java    | 21 ++++++++++++++++++
 android/src/main/jni.cpp                      | 22 +++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java
index 3b73e72..bea9eb4 100644
--- a/android/src/main/java/com/rnwhisper/WhisperContext.java
+++ b/android/src/main/java/com/rnwhisper/WhisperContext.java
@@ -166,9 +166,29 @@ public void run() {
               for (int i = 0; i < n; i++) {
                 shortBuffer[nSamples + i] = buffer[i];
               }
+
+              boolean isSpeech = true;
+              if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) {
+                int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2;
+                int sampleSize = vadSec * SAMPLE_RATE;
+                if (nSamples + n > sampleSize) {
+                  float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f;
+                  float vadFreqThold = options.hasKey("vadFreqThold") ? (float) options.getDouble("vadFreqThold") : 0.6f;
+                  float[] audioData = new float[sampleSize];
+                  for (int i = 0; i < sampleSize; i++) {
+                    audioData[i] = shortBuffer[nSamples + i] / 32768.0f;
+                  }
+                  isSpeech = vadSample(audioData, sampleSize, vadThold, vadFreqThold);
+                } else {
+                  isSpeech = false;
+                }
+              }
+
               nSamples += n;
               sliceNSamples.set(sliceIndex, nSamples);
 
+              if (!isSpeech) continue;
+
               if (!isTranscribing && nSamples > SAMPLE_RATE / 2) {
                 isTranscribing = true;
                 fullHandler = new Thread(new Runnable() {
@@ -513,6 +533,7 @@ private static String cpuInfo() {
   protected static native long initContext(String modelPath);
   protected static native long initContextWithAsset(AssetManager assetManager, String modelPath);
   protected static native long initContextWithInputStream(PushbackInputStream inputStream);
+  protected static native boolean vadSample(float[] audio_data, int audio_data_len, float vad_thold, float vad_freq_thold);
   protected static native int fullTranscribe(
     int job_id,
     long context,
diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp
index 3b97417..20e6f02 100644
--- a/android/src/main/jni.cpp
+++ b/android/src/main/jni.cpp
@@ -6,6 +6,7 @@
 #include <sys/sysinfo.h>
 #include <string>
 #include <thread>
+#include <vector>
 #include "whisper.h"
 #include "rn-whisper.h"
 #include "ggml.h"
@@ -184,6 +185,27 @@ Java_com_rnwhisper_WhisperContext_initContextWithInputStream(
     return reinterpret_cast<jlong>(context);
 }
 
+JNIEXPORT jboolean JNICALL
+Java_com_rnwhisper_WhisperContext_vadSimple(
+    JNIEnv *env,
+    jobject thiz,
+    jfloatArray audio_data,
+    jint audio_data_len,
+    jfloat vad_thold,
+    jfloat vad_freq_thold
+) {
+    UNUSED(thiz);
+    // JNI bridge for WhisperContext.vadSimple: runs rn_whisper_vad_simple over a copy of the Java float[] samples.
+    jfloat *audio_data_arr = env->GetFloatArrayElements(audio_data, nullptr);
+    if (audio_data_arr == nullptr) return JNI_FALSE; // the JVM could not provide the elements
+    const jsize array_len = env->GetArrayLength(audio_data);
+    const jsize n_samples = audio_data_len < array_len ? audio_data_len : array_len; // never read past the Java array
+    std::vector<float> samples(audio_data_arr, audio_data_arr + (n_samples > 0 ? n_samples : 0));
+    env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT); // read-only access: nothing to copy back
+    bool is_speech = rn_whisper_vad_simple(samples, WHISPER_SAMPLE_RATE, 1000, vad_thold, vad_freq_thold, false);
+    return is_speech ? JNI_TRUE : JNI_FALSE;
+}
+
 struct progress_callback_context {
     JNIEnv *env;
     jobject progress_callback_instance;

From 655d9b3f87ef26329f0f65be4ef713553b0fe4b3 Mon Sep 17 00:00:00 2001
From: Jhen <developer@jhen.me>
Date: Sat, 23 Sep 2023 13:37:04 +0800
Subject: [PATCH 5/6] feat: use vad to check last transcription

---
 .../java/com/rnwhisper/WhisperContext.java    | 46 +++++++++++-------
 example/src/App.tsx                           |  2 +-
 ios/RNWhisperContext.mm                       | 47 ++++++++++++-------
 3 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java
index 84029f1..3f8552e 100644
--- a/android/src/main/java/com/rnwhisper/WhisperContext.java
+++ b/android/src/main/java/com/rnwhisper/WhisperContext.java
@@ -162,6 +162,27 @@ private void saveWavFile(byte[] rawData, String audioOutputFile) throws IOExcept
     }
   }
 
+  /**
+   * Runs the native VAD over the newest vadMs (default 2000) of captured audio.
+   * Returns true when VAD is disabled or a transcription is already running.
+   */
+  private boolean vad(ReadableMap options, short[] shortBuffer, int nSamples, int n) {
+    if (isTranscribing || !options.hasKey("useVad") || !options.getBoolean("useVad")) return true;
+    int vadMs = options.hasKey("vadMs") ? options.getInt("vadMs") : 2000;
+    int sampleSize = (int) ((long) vadMs * SAMPLE_RATE / 1000); // ms precision; whole seconds would truncate e.g. 2500 ms to 2 s
+    // An empty window, or not enough captured audio to fill it yet: report no speech.
+    if (sampleSize <= 0 || nSamples + n <= sampleSize) return false;
+    int start = nSamples + n - sampleSize;
+    float[] audioData = new float[sampleSize];
+    for (int i = 0; i < sampleSize; i++) {
+      audioData[i] = shortBuffer[i + start] / 32768.0f; // 16-bit PCM -> [-1, 1)
+    }
+    float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f;
+    // The default is the documented 100 Hz cutoff, not the 0.6 used for vadThold.
+    float vadFreqThold = options.hasKey("vadFreqThold") ? (float) options.getDouble("vadFreqThold") : 100.0f;
+    return vadSimple(audioData, sampleSize, vadThold, vadFreqThold);
+  }
+
   public int startRealtimeTranscribe(int jobId, ReadableMap options) {
     if (isCapturing || isTranscribing) {
       return -100;
@@ -223,6 +244,12 @@ public void run() {
                 ) {
                   emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
                 } else if (!isTranscribing) {
+                  short[] shortBuffer = shortBufferSlices.get(sliceIndex);
+                  boolean isSpeech = vad(options, shortBuffer, nSamples, 0);
+                  if (!isSpeech) {
+                    emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+                    break;
+                  }
                   isTranscribing = true;
                   fullTranscribeSamples(options, true);
                 }
@@ -245,22 +272,7 @@ public void run() {
                 shortBuffer[nSamples + i] = buffer[i];
               }
 
-              boolean isSpeech = true;
-              if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) {
-                int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2;
-                int sampleSize = vadSec * SAMPLE_RATE;
-                if (nSamples + n > sampleSize) {
-                  float vadThold = options.hasKey("vadThold") ? (float) options.getDouble("vadThold") : 0.6f;
-                  float vadFreqThold = options.hasKey("vadFreqThold") ? (float) options.getDouble("vadFreqThold") : 0.6f;
-                  float[] audioData = new float[sampleSize];
-                  for (int i = 0; i < sampleSize; i++) {
-                    audioData[i] = shortBuffer[nSamples + i] / 32768.0f;
-                  }
-                  isSpeech = vadSample(audioData, sampleSize, vadThold, vadFreqThold);
-                } else {
-                  isSpeech = false;
-                }
-              }
+              boolean isSpeech = vad(options, shortBuffer, nSamples, n);
 
               nSamples += n;
               sliceNSamples.set(sliceIndex, nSamples);
@@ -613,7 +625,7 @@ private static String cpuInfo() {
   protected static native long initContext(String modelPath);
   protected static native long initContextWithAsset(AssetManager assetManager, String modelPath);
   protected static native long initContextWithInputStream(PushbackInputStream inputStream);
-  protected static native boolean vadSample(float[] audio_data, int audio_data_len, float vad_thold, float vad_freq_thold);
+  protected static native boolean vadSimple(float[] audio_data, int audio_data_len, float vad_thold, float vad_freq_thold);
   protected static native int fullTranscribe(
     int job_id,
     long context,
diff --git a/example/src/App.tsx b/example/src/App.tsx
index 287c289..e374f39 100644
--- a/example/src/App.tsx
+++ b/example/src/App.tsx
@@ -266,7 +266,7 @@ export default function App() {
                     realtimeAudioSec: 60,
                     // Slice audio into 25 (or < 30) sec chunks for better performance
                     realtimeAudioSliceSec: 25,
-                    // 
+                    // Voice Activity Detection - Start transcribing when speech is detected
                     useVad: true,
                   })
                 setStopTranscribe({ stop })
diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm
index dd120ff..58e4911 100644
--- a/ios/RNWhisperContext.mm
+++ b/ios/RNWhisperContext.mm
@@ -78,6 +78,29 @@ - (void)freeBufferIfNeeded {
     }
 }
 
+// Runs the simple VAD over the newest vadMs (default 2000) of the slice buffer.
+// nSamples is the count already stored in audioBufferI16 and n the count just appended.
+// Returns true when VAD is disabled or a transcription is already running.
+bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+{
+    // boolValue matters: a bare non-nil check would also enable VAD for an explicit useVad = false (@NO).
+    if (state->isTranscribing || ![state->options[@"useVad"] boolValue]) return true;
+    int vadMs = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] : 2000;
+    int sampleSize = (int)((int64_t)vadMs * WHISPER_SAMPLE_RATE / 1000); // ms precision; whole seconds would truncate e.g. 2500 ms to 2 s
+    // An empty window, or not enough captured audio to fill it yet: report no speech.
+    if (sampleSize <= 0 || nSamples + n <= sampleSize) return false;
+    int start = nSamples + n - sampleSize;
+    std::vector<float> audioBufferF32Vec(sampleSize);
+    for (int i = 0; i < sampleSize; i++) {
+        audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f; // 16-bit PCM -> [-1, 1)
+    }
+    float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
+    float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
+    bool isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
+    NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
+    return isSpeech;
+}
+
 void AudioInputCallback(void * inUserData,
     AudioQueueRef inAQ,
     AudioQueueBufferRef inBuffer,
@@ -118,6 +141,11 @@ void AudioInputCallback(void * inUserData,
             !state->isTranscribing &&
             nSamples != state->nSamplesTranscribing
         ) {
+            int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
+            if (!vad(state, audioBufferI16, nSamples, 0)) {
+                state->transcribeHandler(state->jobId, @"end", @{});
+                return;
+            }
             state->isTranscribing = true;
             dispatch_async([state->mSelf getDispatchQueue], ^{
                 [state->mSelf fullTranscribeSamples:state];
@@ -144,24 +172,7 @@ void AudioInputCallback(void * inUserData,
         audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
     }
 
-    bool isSpeech = true;
-    if (!state->isTranscribing && state->options[@"useVad"]) {
-        int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
-        int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
-        if (nSamples + n > sampleSize) {
-            int start = nSamples + n - sampleSize;
-            std::vector<float> audioBufferF32Vec(sampleSize);
-            for (int i = 0; i < sampleSize; i++) {
-                audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
-            }
-            float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
-            float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
-            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
-            NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
-        } else {
-            isSpeech = false;
-        }
-    }
+    bool isSpeech = vad(state, audioBufferI16, nSamples, n);
     nSamples += n;
     state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
 

From 7a9ff8060c3c071677e8cef7d557590eb2f7af85 Mon Sep 17 00:00:00 2001
From: Jhen <developer@jhen.me>
Date: Sat, 23 Sep 2023 14:28:25 +0800
Subject: [PATCH 6/6] feat(example): do not use vad by default

---
 example/src/App.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/src/App.tsx b/example/src/App.tsx
index e374f39..0317b23 100644
--- a/example/src/App.tsx
+++ b/example/src/App.tsx
@@ -267,7 +267,7 @@ export default function App() {
                     // Slice audio into 25 (or < 30) sec chunks for better performance
                     realtimeAudioSliceSec: 25,
                     // Voice Activity Detection - Start transcribing when speech is detected
-                    useVad: true,
+                    // useVad: true,
                   })
                 setStopTranscribe({ stop })
                 subscribe((evt) => {