From d01682d9682b7d79c21d89a7bc14f8230a49cdb2 Mon Sep 17 00:00:00 2001 From: yujinqiu Date: Mon, 16 Oct 2023 14:40:47 +0800 Subject: [PATCH] Add vad clear api for better performance (#366) * Add vad clear api for better performance * rename to make naming consistent and remove macro * Fix linker error * Fix Vad.kt --- .../com/k2fsa/sherpa/onnx/MainActivity.kt | 2 +- .../main/java/com/k2fsa/sherpa/onnx/Vad.kt | 3 ++ sherpa-onnx/c-api/c-api.cc | 9 +++-- sherpa-onnx/c-api/c-api.h | 4 +++ sherpa-onnx/csrc/voice-activity-detector.cc | 4 +++ sherpa-onnx/csrc/voice-activity-detector.h | 1 + sherpa-onnx/jni/jni.cc | 10 ++++++ swift-api-examples/SherpaOnnx.swift | 6 +++- swift-api-examples/generate-subtitles.swift | 35 +++++++++---------- 9 files changed, 52 insertions(+), 22 deletions(-) diff --git a/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index eb847b020..4d5ce7e74 100644 --- a/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -161,9 +161,9 @@ class MainActivity : AppCompatActivity() { val samples = FloatArray(ret) { buffer[it] / 32768.0f } vad.acceptWaveform(samples) - while(!vad.empty()) {vad.pop();} val isSpeechDetected = vad.isSpeechDetected() + vad.clear() runOnUiThread { onVad(isSpeechDetected) diff --git a/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt b/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt index fc21593ab..081ae3e8a 100644 --- a/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt +++ b/android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt @@ -46,6 +46,8 @@ class Vad( // [start: Int, samples: FloatArray] fun front() = front(ptr) + fun clear() = clear(ptr) + fun isSpeechDetected(): Boolean = isSpeechDetected(ptr) fun reset() = reset(ptr) @@ -64,6 +66,7 @@ class Vad( private external fun acceptWaveform(ptr: Long, samples: FloatArray) private external fun empty(ptr: Long): Boolean private external fun pop(ptr: Long) + private external fun clear(ptr: Long) private external fun front(ptr: Long): Array private external fun isSpeechDetected(ptr: Long): Boolean private external fun reset(ptr: Long) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 3f6b35466..ca97da25f 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -493,12 +493,17 @@ int32_t SherpaOnnxVoiceActivityDetectorDetected( return p->impl->IsSpeechDetected(); } -SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop( +void SherpaOnnxVoiceActivityDetectorPop( SherpaOnnxVoiceActivityDetector *p) { p->impl->Pop(); } -SHERPA_ONNX_API const SherpaOnnxSpeechSegment * +void SherpaOnnxVoiceActivityDetectorClear( + SherpaOnnxVoiceActivityDetector *p) { + p->impl->Clear(); +} + +const SherpaOnnxSpeechSegment * SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) { const sherpa_onnx::SpeechSegment &segment = p->impl->Front(); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index b79b83072..b4b5780a9 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -580,6 +580,10 @@ SherpaOnnxVoiceActivityDetectorDetected(SherpaOnnxVoiceActivityDetector *p); SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop( SherpaOnnxVoiceActivityDetector *p); +// Clear current speech segments. +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorClear( + SherpaOnnxVoiceActivityDetector *p); + // Return the first speech segment. // The user has to use SherpaOnnxDestroySpeechSegment() to free the returned // pointer to avoid memory leak. diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc index 05660cd9d..86c0f7e47 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/csrc/voice-activity-detector.cc @@ -76,6 +76,8 @@ class VoiceActivityDetector::Impl { void Pop() { segments_.pop(); } + void Clear() { std::queue().swap(segments_); } + const SpeechSegment &Front() const { return segments_.front(); } void Reset() { @@ -121,6 +123,8 @@ bool VoiceActivityDetector::Empty() const { return impl_->Empty(); } void VoiceActivityDetector::Pop() { impl_->Pop(); } +void VoiceActivityDetector::Clear() { impl_->Clear(); } + const SpeechSegment &VoiceActivityDetector::Front() const { return impl_->Front(); } diff --git a/sherpa-onnx/csrc/voice-activity-detector.h b/sherpa-onnx/csrc/voice-activity-detector.h index 61552139b..603bfbe78 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.h +++ b/sherpa-onnx/csrc/voice-activity-detector.h @@ -36,6 +36,7 @@ class VoiceActivityDetector { void AcceptWaveform(const float *samples, int32_t n); bool Empty() const; void Pop(); + void Clear(); const SpeechSegment &Front() const; bool IsSpeechDetected() const; diff --git a/sherpa-onnx/jni/jni.cc b/sherpa-onnx/jni/jni.cc index f4f0f6d67..92e6e7203 100644 --- a/sherpa-onnx/jni/jni.cc +++ b/sherpa-onnx/jni/jni.cc @@ -124,6 +124,8 @@ class SherpaOnnxVad { void Pop() { vad_.Pop(); } + void Clear() { vad_.Clear();} + const SpeechSegment &Front() const { return vad_.Front(); } bool IsSpeechDetected() const { return vad_.IsSpeechDetected(); } @@ -556,6 +558,14 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_pop(JNIEnv *env, model->Pop(); } +SHERPA_ONNX_EXTERN_C +JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_clear(JNIEnv *env, + jobject /*obj*/, + jlong ptr) { + auto model = reinterpret_cast(ptr); + model->Clear(); +} + // see // https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables static jobject NewInteger(JNIEnv *env, int32_t value) { diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 53637011c..e786d7cc7 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -551,7 +551,7 @@ class SherpaOnnxVoiceActivityDetectorWrapper { return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1 } - func isDetected() -> Bool { + func isSpeechDetected() -> Bool { return SherpaOnnxVoiceActivityDetectorDetected(vad) == 1 } @@ -559,6 +559,10 @@ class SherpaOnnxVoiceActivityDetectorWrapper { SherpaOnnxVoiceActivityDetectorPop(vad) } + func clear() { + SherpaOnnxVoiceActivityDetectorClear(vad) + } + func front() -> SherpaOnnxSpeechSegmentWrapper { let p: UnsafePointer? = SherpaOnnxVoiceActivityDetectorFront(vad) return SherpaOnnxSpeechSegmentWrapper(p: p) diff --git a/swift-api-examples/generate-subtitles.swift b/swift-api-examples/generate-subtitles.swift index d06829455..b04b6faef 100644 --- a/swift-api-examples/generate-subtitles.swift +++ b/swift-api-examples/generate-subtitles.swift @@ -174,32 +174,31 @@ func run() { var segments: [SpeechSegment] = [] - while array.count > windowSize { - // todo(fangjun): avoid extra copies here - vad.acceptWaveform(samples: [Float](array[0..