diff --git a/cpp/rn-whisper.cpp b/cpp/rn-whisper.cpp
index fcd3133..bbc304f 100644
--- a/cpp/rn-whisper.cpp
+++ b/cpp/rn-whisper.cpp
@@ -38,4 +38,56 @@ void rn_whisper_abort_all_transcribe() {
   }
 }
 
+void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
+  const float rc = 1.0f / (2.0f * M_PI * cutoff);
+  const float dt = 1.0f / sample_rate;
+  const float alpha = dt / (rc + dt);
+
+  float y = data[0];
+
+  for (size_t i = 1; i < data.size(); i++) {
+    y = alpha * (y + data[i] - data[i - 1]);
+    data[i] = y;
+  }
+}
+
+bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
+  const int n_samples = pcmf32.size();
+  const int n_samples_last = (sample_rate * last_ms) / 1000;
+
+  if (n_samples_last >= n_samples) {
+    // not enough samples - assume no speech
+    printf("not enough samples - assume no speech\n");
+    return false;
+  }
+
+  if (freq_thold > 0.0f) {
+    high_pass_filter(pcmf32, freq_thold, sample_rate);
+  }
+
+  float energy_all = 0.0f;
+  float energy_last = 0.0f;
+
+  for (int i = 0; i < n_samples; i++) {
+    energy_all += fabsf(pcmf32[i]);
+
+    if (i >= n_samples - n_samples_last) {
+      energy_last += fabsf(pcmf32[i]);
+    }
+  }
+
+  energy_all /= n_samples;
+  energy_last /= n_samples_last;
+
+  if (verbose) {
+    fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+  }
+
+  if (energy_last > vad_thold*energy_all) {
+    return false;
+  }
+
+  return true;
+}
+
 }
\ No newline at end of file
diff --git a/cpp/rn-whisper.h b/cpp/rn-whisper.h
index 4fd2c1b..4f65158 100644
--- a/cpp/rn-whisper.h
+++ b/cpp/rn-whisper.h
@@ -10,7 +10,8 @@ void rn_whisper_remove_abort_map(int job_id);
 void rn_whisper_abort_transcribe(int job_id);
 bool rn_whisper_transcribe_is_aborted(int job_id);
 void rn_whisper_abort_all_transcribe();
+bool rn_whisper_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose);
 
 #ifdef __cplusplus
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index a94a1fd..2abfa4c 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -750,16 +750,16 @@ PODS:
     - React-perflogger (= 0.71.11)
   - RNFS (2.20.0):
     - React-Core
-  - RNZipArchive (6.0.9):
+  - RNZipArchive (6.1.0):
     - React-Core
-    - RNZipArchive/Core (= 6.0.9)
+    - RNZipArchive/Core (= 6.1.0)
     - SSZipArchive (~> 2.2)
-  - RNZipArchive/Core (6.0.9):
+  - RNZipArchive/Core (6.1.0):
     - React-Core
     - SSZipArchive (~> 2.2)
   - SocketRocket (0.6.0)
   - SSZipArchive (2.4.3)
-  - whisper-rn (0.3.5):
+  - whisper-rn (0.3.6):
     - RCT-Folly
     - RCTRequired
     - RCTTypeSafety
@@ -994,10 +994,10 @@ SPEC CHECKSUMS:
   React-runtimeexecutor: 4817d63dbc9d658f8dc0ec56bd9b83ce531129f0
   ReactCommon: 08723d2ed328c5cbcb0de168f231bc7bae7f8aa1
   RNFS: 4ac0f0ea233904cb798630b3c077808c06931688
-  RNZipArchive: 68a0c6db4b1c103f846f1559622050df254a3ade
+  RNZipArchive: ef9451b849c45a29509bf44e65b788829ab07801
   SocketRocket: fccef3f9c5cedea1353a9ef6ada904fde10d6608
   SSZipArchive: fe6a26b2a54d5a0890f2567b5cc6de5caa600aef
-  whisper-rn: 6f293154b175fee138a994fa00d0f414fb1f44e9
+  whisper-rn: e80c0482f6a632faafd601f98f10da0255c1e1ec
   Yoga: f7decafdc5e8c125e6fa0da38a687e35238420fa
   YogaKit: f782866e155069a2cca2517aafea43200b01fd5a
diff --git a/example/src/App.tsx b/example/src/App.tsx
index d75a013..287c289 100644
--- a/example/src/App.tsx
+++ b/example/src/App.tsx
@@ -266,6 +266,8 @@ export default function App() {
             realtimeAudioSec: 60,
             // Slice audio into 25 (or < 30) sec chunks for better performance
             realtimeAudioSliceSec: 25,
+            // Voice Activity Detection - only start transcribing when speech is detected
+            useVad: true,
           })
           setStopTranscribe({ stop })
           subscribe((evt) => {
diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm
index 57baa8f..0f67933 100644
--- a/ios/RNWhisperContext.mm
+++ b/ios/RNWhisperContext.mm
@@ -1,4 +1,5 @@
 #import "RNWhisperContext.h"
+#include <vector>
 
 #define NUM_BYTES_PER_BUFFER 16 * 1024
 
@@ -142,11 +143,28 @@ void AudioInputCallback(void * inUserData,
     for (int i = 0; i < n; i++) {
         audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
     }
+
+    bool isSpeech = true;
+    if (state->options[@"useVad"]) {
+        if (nSamples + n > WHISPER_SAMPLE_RATE * 2) {
+            int start = nSamples + n - WHISPER_SAMPLE_RATE * 2;
+            std::vector<float> audioBufferF32Vec(WHISPER_SAMPLE_RATE * 2);
+            for (int i = 0; i < WHISPER_SAMPLE_RATE * 2; i++) {
+                audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
+            }
+            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, 0.6f, 100.0f, false);
+            NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
+        } else {
+            isSpeech = false;
+        }
+    }
     nSamples += n;
     state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
 
     AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
 
+    if (!isSpeech) return;
+
     if (!state->isTranscribing) {
         state->isTranscribing = true;
         dispatch_async([state->mSelf getDispatchQueue], ^{
diff --git a/src/index.ts b/src/index.ts
index 2c291d9..d185cc8 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -58,6 +58,12 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
    * (Default: Equal to `realtimeMaxAudioSec`)
    */
   realtimeAudioSliceSec?: number
+  /**
+   * Start transcribing while recording only when voice activity is detected (the audio energy exceeds the VAD threshold).
+   * The first VAD check runs after 2 seconds of recording.
+   * (Default: false)
+   */
+  useVad?: boolean
 }
 
 export type TranscribeRealtimeEvent = {
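
Usage note: below is a minimal sketch of the new `useVad` option from the JavaScript side, assuming the realtime API shape already used by the example app (`initWhisper`, `transcribeRealtime` returning `{ stop, subscribe }`); the model path and function name are placeholders, not part of this change.

```ts
import { initWhisper } from 'whisper.rn'

async function startRealtimeWithVad() {
  // Placeholder model path - replace with a real ggml model file on device.
  const whisperContext = await initWhisper({ filePath: 'ggml-tiny.en.bin' })

  const { stop, subscribe } = await whisperContext.transcribeRealtime({
    realtimeAudioSec: 60,
    realtimeAudioSliceSec: 25,
    // Skip transcription while no speech is detected; the first VAD check
    // runs once about 2 seconds of audio have been buffered.
    useVad: true,
  })

  subscribe((evt) => {
    const { isCapturing, data } = evt
    console.log(isCapturing ? 'capturing' : 'stopped', data?.result)
  })

  // Call stop() later to end the realtime session.
  return stop
}
```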