From c2cc9dec5866e00a1464554f382668f2887de70c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 9 Jul 2024 16:15:56 +0800 Subject: [PATCH] Add Flush to VAD so that the last segment can be detected. (#1099) --- .github/workflows/dot-net.yaml | 14 ++++++---- CHANGELOG.md | 5 ++++ CMakeLists.txt | 4 +-- .../bin/vad-with-paraformer.dart | 22 +++++++++++++++ .../non-streaming-asr/pubspec.yaml | 2 +- dart-api-examples/streaming-asr/pubspec.yaml | 2 +- dart-api-examples/tts/pubspec.yaml | 2 +- dart-api-examples/vad/bin/vad.dart | 6 ++++ dart-api-examples/vad/pubspec.yaml | 2 +- .../Program.cs | 20 +++++++++++++ flutter-examples/streaming_asr/pubspec.yaml | 4 +-- flutter-examples/tts/pubspec.yaml | 2 +- .../lib/src/sherpa_onnx_bindings.dart | 13 +++++++++ flutter/sherpa_onnx/lib/src/vad.dart | 4 +++ flutter/sherpa_onnx/pubspec.yaml | 12 ++++---- .../ios/sherpa_onnx_ios.podspec | 2 +- .../macos/sherpa_onnx_macos.podspec | 2 +- .../VadNonStreamingParaformer.java | 19 +++++++++++++ java-api-examples/VadRemoveSilence.java | 10 +++++++ nodejs-addon-examples/package.json | 2 +- ...ad-remove-non-speech-segments-from-file.py | 6 ++++ scripts/dart/sherpa-onnx-pubspec.yaml | 2 +- scripts/dotnet/VoiceActivityDetector.cs | 7 +++++ scripts/go/sherpa_onnx.go | 4 +++ scripts/node-addon-api/lib/vad.js | 8 ++++-- scripts/node-addon-api/src/vad.cc | 28 +++++++++++++++++++ sherpa-onnx/c-api/c-api.cc | 4 +++ sherpa-onnx/c-api/c-api.h | 3 ++ sherpa-onnx/csrc/voice-activity-detector.cc | 27 +++++++++++++++++- sherpa-onnx/csrc/voice-activity-detector.h | 6 +++- .../src/com/k2fsa/sherpa/onnx/Vad.java | 6 ++++ sherpa-onnx/jni/voice-activity-detector.cc | 8 ++++++ sherpa-onnx/kotlin-api/Vad.kt | 3 ++ .../python/csrc/voice-activity-detector.cc | 1 + swift-api-examples/SherpaOnnx.swift | 4 +++ 35 files changed, 237 insertions(+), 29 deletions(-) diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml index 917e85b95..973f2adea 100644 --- a/.github/workflows/dot-net.yaml +++ 
b/.github/workflows/dot-net.yaml @@ -52,11 +52,6 @@ jobs: cmake --build . --target install --config Release rm -rf install/pkgconfig - - uses: actions/upload-artifact@v4 - with: - name: windows-${{ matrix.arch }} - path: ./build/install/lib/ - - name: Create tar file shell: bash run: | @@ -72,6 +67,11 @@ jobs: ls -lh *.tar.bz2 mv *.tar.bz2 ../ + - uses: actions/upload-artifact@v4 + with: + name: windows-${{ matrix.arch }} + path: ./*.tar.bz2 + # https://huggingface.co/docs/hub/spaces-github-actions - name: Publish to huggingface if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch') @@ -88,7 +88,9 @@ jobs: rm -rf huggingface export GIT_CLONE_PROTECTION_ACTIVE=false - GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface cd huggingface mkdir -p windows-for-dotnet diff --git a/CHANGELOG.md b/CHANGELOG.md index d3d96570f..7975feb76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 1.10.12 + +* Add Flush to VAD so that the last speech segment can be detected. See also + https://github.com/k2-fsa/sherpa-onnx/discussions/1077#discussioncomment-9979740 + ## 1.10.11 * Support the iOS platform for iOS. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index bb5cc8c41..8e7ebc518 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ project(sherpa-onnx) # Remember to update # ./nodejs-addon-examples # ./dart-api-examples/ -# ./sherpa-onnx/flutter/CHANGELOG.md -set(SHERPA_ONNX_VERSION "1.10.11") +# ./CHANGELOG.md +set(SHERPA_ONNX_VERSION "1.10.12") # Disable warning about # diff --git a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart index 5e7cfb485..12a28196f 100644 --- a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart +++ b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart @@ -93,6 +93,28 @@ void main(List arguments) async { } } + vad.flush(); + while (!vad.isEmpty()) { + final stream = recognizer.createStream(); + final segment = vad.front(); + stream.acceptWaveform( + samples: segment.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + + final startTime = segment.start * 1.0 / waveData.sampleRate; + final duration = segment.samples.length * 1.0 / waveData.sampleRate; + final stopTime = startTime + duration; + if (result.text != '') { + print( + '${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}'); + } + + stream.free(); + vad.pop(); + } + vad.free(); recognizer.free(); } diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index 5b5fb695b..25117d377 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.10.11 + sherpa_onnx: ^1.10.12 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index 34e78b094..ecaf94df9 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.11 + sherpa_onnx: ^1.10.12 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index 46941fd15..250870674 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.11 + sherpa_onnx: ^1.10.12 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/bin/vad.dart b/dart-api-examples/vad/bin/vad.dart index 01618cc19..5baccd2f6 100644 --- a/dart-api-examples/vad/bin/vad.dart +++ b/dart-api-examples/vad/bin/vad.dart @@ -65,6 +65,12 @@ void main(List arguments) async { } } + vad.flush(); + while (!vad.isEmpty()) { + allSamples.add(vad.front().samples); + vad.pop(); + } + vad.free(); final s = Float32List.fromList(allSamples.expand((x) => x).toList()); diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index 397b530a3..ade27e869 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.11 + sherpa_onnx: ^1.10.12 path: ^1.9.0 args: ^2.5.0 diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs index 13c19f0b7..3a76b4b87 100644 --- a/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs +++ b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs @@ -57,6 +57,26 @@ static void Main(string[] args) } } } + + vad.Flush(); + + while 
(!vad.IsEmpty()) { + SpeechSegment segment = vad.Front(); + float startTime = segment.Start / (float)sampleRate; + float duration = segment.Samples.Length / (float)sampleRate; + + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(sampleRate, segment.Samples); + recognizer.Decode(stream); + String text = stream.Result.Text; + + if (!String.IsNullOrEmpty(text)) { + Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime), + String.Format("{0:0.00}", startTime+duration), text); + } + + vad.Pop(); + } } } diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml index e978396ca..6e20307cf 100644 --- a/flutter-examples/streaming_asr/pubspec.yaml +++ b/flutter-examples/streaming_asr/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' -version: 1.10.11 +version: 1.10.12 topics: - speech-recognition @@ -30,7 +30,7 @@ dependencies: record: ^5.1.0 url_launcher: ^6.2.6 - sherpa_onnx: ^1.10.11 + sherpa_onnx: ^1.10.12 # sherpa_onnx: # path: ../../flutter/sherpa_onnx diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml index e37e08ed1..491d06a32 100644 --- a/flutter-examples/tts/pubspec.yaml +++ b/flutter-examples/tts/pubspec.yaml @@ -17,7 +17,7 @@ dependencies: cupertino_icons: ^1.0.6 path_provider: ^2.1.3 path: ^1.9.0 - sherpa_onnx: ^1.10.11 + sherpa_onnx: ^1.10.12 url_launcher: ^6.2.6 audioplayers: ^5.0.0 diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index ac7a241ee..c00e337ac 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -491,6 +491,12 @@ typedef SherpaOnnxVoiceActivityDetectorResetNative = Void Function( typedef SherpaOnnxVoiceActivityDetectorReset = void Function( Pointer); +typedef SherpaOnnxVoiceActivityDetectorFlushNative = Void Function( + Pointer); + +typedef 
SherpaOnnxVoiceActivityDetectorFlush = void Function( + Pointer); + typedef SherpaOnnxVoiceActivityDetectorFrontNative = Pointer Function( Pointer); @@ -779,6 +785,8 @@ class SherpaOnnxBindings { static SherpaOnnxVoiceActivityDetectorReset? voiceActivityDetectorReset; + static SherpaOnnxVoiceActivityDetectorFlush? voiceActivityDetectorFlush; + static SherpaOnnxCreateCircularBuffer? createCircularBuffer; static SherpaOnnxDestroyCircularBuffer? destroyCircularBuffer; @@ -1036,6 +1044,11 @@ class SherpaOnnxBindings { 'SherpaOnnxVoiceActivityDetectorReset') .asFunction(); + voiceActivityDetectorFlush ??= dynamicLibrary + .lookup>( + 'SherpaOnnxVoiceActivityDetectorFlush') + .asFunction(); + createCircularBuffer ??= dynamicLibrary .lookup>( 'SherpaOnnxCreateCircularBuffer') diff --git a/flutter/sherpa_onnx/lib/src/vad.dart b/flutter/sherpa_onnx/lib/src/vad.dart index 6c36cd8f0..bcab3fd30 100644 --- a/flutter/sherpa_onnx/lib/src/vad.dart +++ b/flutter/sherpa_onnx/lib/src/vad.dart @@ -207,6 +207,10 @@ class VoiceActivityDetector { SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr); } + void flush() { + SherpaOnnxBindings.voiceActivityDetectorFlush?.call(ptr); + } + Pointer ptr; final VadModelConfig config; } diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index 8b0923333..08d3f6861 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec -version: 1.10.11 +version: 1.10.12 homepage: https://github.com/k2-fsa/sherpa-onnx @@ -30,19 +30,19 @@ dependencies: flutter: sdk: flutter - sherpa_onnx_android: ^1.10.11 + sherpa_onnx_android: ^1.10.12 # path: ../sherpa_onnx_android - sherpa_onnx_macos: ^1.10.11 + sherpa_onnx_macos: ^1.10.12 # path: ../sherpa_onnx_macos - sherpa_onnx_linux: ^1.10.11 + sherpa_onnx_linux: ^1.10.12 # path: ../sherpa_onnx_linux # - sherpa_onnx_windows: 
^1.10.11 + sherpa_onnx_windows: ^1.10.12 # path: ../sherpa_onnx_windows - sherpa_onnx_ios: ^1.10.11 + sherpa_onnx_ios: ^1.10.12 # sherpa_onnx_ios: # path: ../sherpa_onnx_ios diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec index 2103adb82..eca697e84 100644 --- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec +++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec @@ -7,7 +7,7 @@ # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c Pod::Spec.new do |s| s.name = 'sherpa_onnx_ios' - s.version = '1.10.11' + s.version = '1.10.12' s.summary = 'A new Flutter FFI plugin project.' s.description = <<-DESC A new Flutter FFI plugin project. diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec index 918eb44f1..e56fb9e87 100644 --- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec +++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec @@ -4,7 +4,7 @@ # Pod::Spec.new do |s| s.name = 'sherpa_onnx_macos' - s.version = '1.10.11' + s.version = '1.10.12' s.summary = 'sherpa-onnx Flutter FFI plugin project.' s.description = <<-DESC sherpa-onnx Flutter FFI plugin project. 
diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java index 61c2b53d9..eb5e146be 100644 --- a/java-api-examples/VadNonStreamingParaformer.java +++ b/java-api-examples/VadNonStreamingParaformer.java @@ -98,6 +98,25 @@ public static void main(String[] args) { } } + vad.flush(); + while (!vad.empty()) { + SpeechSegment segment = vad.front(); + float startTime = segment.getStart() / 16000.0f; + float duration = segment.getSamples().length / 16000.0f; + + OfflineStream stream = recognizer.createStream(); + stream.acceptWaveform(segment.getSamples(), 16000); + recognizer.decode(stream); + String text = recognizer.getResult(stream).getText(); + stream.release(); + + if (!text.isEmpty()) { + System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text); + } + + vad.pop(); + } + vad.release(); recognizer.release(); } diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java index 4ee40d0d2..3af1caa7f 100644 --- a/java-api-examples/VadRemoveSilence.java +++ b/java-api-examples/VadRemoveSilence.java @@ -59,6 +59,16 @@ public static void main(String[] args) { } } + vad.flush(); + while (!vad.empty()) { + + // if you want to get the starting time of this segment, you can use + /* float startTime = vad.front().getStart() / 16000.0f; */ + + segments.add(vad.front().getSamples()); + vad.pop(); + } + // get total number of samples int n = 0; for (float[] s : segments) { diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index 4f1282d7a..8944d62cf 100644 --- a/nodejs-addon-examples/package.json +++ b/nodejs-addon-examples/package.json @@ -1,5 +1,5 @@ { "dependencies": { - "sherpa-onnx-node": "^1.10.6" + "sherpa-onnx-node": "^1.10.12" } } diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py index 34d0f955b..21819647f 100755 --- 
a/python-api-examples/vad-remove-non-speech-segments-from-file.py +++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py @@ -105,6 +105,12 @@ def main(): speech_samples.extend(vad.front.samples) vad.pop() + vad.flush() + + while not vad.empty(): + speech_samples.extend(vad.front.samples) + vad.pop() + speech_samples = np.array(speech_samples, dtype=np.float32) sf.write(args.output, speech_samples, samplerate=sample_rate) diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml index 6dc338ad5..0d6ea2115 100644 --- a/scripts/dart/sherpa-onnx-pubspec.yaml +++ b/scripts/dart/sherpa-onnx-pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec -version: 1.10.6 +version: 1.10.12 homepage: https://github.com/k2-fsa/sherpa-onnx diff --git a/scripts/dotnet/VoiceActivityDetector.cs b/scripts/dotnet/VoiceActivityDetector.cs index 532859f66..bd85116c2 100644 --- a/scripts/dotnet/VoiceActivityDetector.cs +++ b/scripts/dotnet/VoiceActivityDetector.cs @@ -53,6 +53,11 @@ public void Reset() SherpaOnnxVoiceActivityDetectorReset(_handle.Handle); } + public void Flush() + { + SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle); + } + public void Dispose() { Cleanup(); @@ -106,5 +111,7 @@ private void Cleanup() [DllImport(Dll.Filename)] private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle); + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle); } } diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 898c0c21c..cb66aeb69 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() { C.SherpaOnnxVoiceActivityDetectorReset(vad.impl) } +func (vad *VoiceActivityDetector) Flush() { + C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl) +} + // Spoken language identification type 
SpokenLanguageIdentificationWhisperConfig struct { diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js index 977255206..3c8681976 100644 --- a/scripts/node-addon-api/lib/vad.js +++ b/scripts/node-addon-api/lib/vad.js @@ -29,7 +29,7 @@ class CircularBuffer { } reset() { - return addon.circularBufferReset(this.handle); + addon.circularBufferReset(this.handle); } } @@ -79,7 +79,11 @@ config = { } reset() { - return addon.VoiceActivityDetectorResetWrapper(this.handle); + addon.voiceActivityDetectorReset(this.handle); + } + + flush() { + addon.voiceActivityDetectorFlush(this.handle); } } diff --git a/scripts/node-addon-api/src/vad.cc b/scripts/node-addon-api/src/vad.cc index edebe291f..de92337db 100644 --- a/scripts/node-addon-api/src/vad.cc +++ b/scripts/node-addon-api/src/vad.cc @@ -590,6 +590,31 @@ static void VoiceActivityDetectorResetWrapper(const Napi::CallbackInfo &info) { SherpaOnnxVoiceActivityDetectorReset(vad); } +static void VoiceActivityDetectorFlushWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. 
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, "Argument 0 should be a VAD pointer.") + .ThrowAsJavaScriptException(); + + return; + } + + SherpaOnnxVoiceActivityDetector *vad = + info[0].As>().Data(); + + SherpaOnnxVoiceActivityDetectorFlush(vad); +} + void InitVad(Napi::Env env, Napi::Object exports) { exports.Set(Napi::String::New(env, "createCircularBuffer"), Napi::Function::New(env, CreateCircularBufferWrapper)); @@ -636,4 +661,7 @@ void InitVad(Napi::Env env, Napi::Object exports) { exports.Set(Napi::String::New(env, "voiceActivityDetectorReset"), Napi::Function::New(env, VoiceActivityDetectorResetWrapper)); + + exports.Set(Napi::String::New(env, "voiceActivityDetectorFlush"), + Napi::Function::New(env, VoiceActivityDetectorFlushWrapper)); } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index feec532c2..e23305fb7 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) { p->impl->Reset(); } +void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) { + p->impl->Flush(); +} + #if SHERPA_ONNX_ENABLE_TTS == 1 struct SherpaOnnxOfflineTts { std::unique_ptr impl; diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 6c49491cd..2bfba98c7 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment( SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset( SherpaOnnxVoiceActivityDetector *p); +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush( + SherpaOnnxVoiceActivityDetector *p); + // ============================================================ // For offline Text-to-Speech (i.e., non-streaming TTS) // ============================================================ diff --git 
a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc index 0f80f9cb5..73b77b558 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/csrc/voice-activity-detector.cc @@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl { start_ = -1; } + void Flush() { + if (start_ == -1 || buffer_.Size() == 0) { + return; + } + + int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples(); + if (end <= start_) { + return; + } + + std::vector s = buffer_.Get(start_, end - start_); + + SpeechSegment segment; + + segment.start = start_; + segment.samples = std::move(s); + + segments_.push(std::move(segment)); + + buffer_.Pop(end - buffer_.Head()); + start_ = -1; + } + bool IsSpeechDetected() const { return start_ != -1; } const VadModelConfig &GetConfig() const { return config_; } @@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const { return impl_->Front(); } -void VoiceActivityDetector::Reset() { impl_->Reset(); } +void VoiceActivityDetector::Reset() const { impl_->Reset(); } + +void VoiceActivityDetector::Flush() const { impl_->Flush(); } bool VoiceActivityDetector::IsSpeechDetected() const { return impl_->IsSpeechDetected(); diff --git a/sherpa-onnx/csrc/voice-activity-detector.h b/sherpa-onnx/csrc/voice-activity-detector.h index c7a3cb999..9eb53c554 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.h +++ b/sherpa-onnx/csrc/voice-activity-detector.h @@ -41,7 +41,11 @@ class VoiceActivityDetector { bool IsSpeechDetected() const; - void Reset(); + void Reset() const; + + // At the end of the utterance, you can invoke this method so that + // the last speech segment can be detected. 
+ void Flush() const; const VadModelConfig &GetConfig() const; diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java index c0115e8bd..b50c2566b 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java @@ -46,6 +46,10 @@ public void reset() { reset(this.ptr); } + public void flush() { + flush(this.ptr); + } + public SpeechSegment front() { Object[] arr = front(this.ptr); int start = (int) arr[0]; @@ -75,4 +79,6 @@ public boolean isSpeechDetected() { private native boolean isSpeechDetected(long ptr); private native void reset(long ptr); + + private native void flush(long ptr); } diff --git a/sherpa-onnx/jni/voice-activity-detector.cc b/sherpa-onnx/jni/voice-activity-detector.cc index 19861acf4..1f59ae62b 100644 --- a/sherpa-onnx/jni/voice-activity-detector.cc +++ b/sherpa-onnx/jni/voice-activity-detector.cc @@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/, auto model = reinterpret_cast(ptr); model->Reset(); } + +SHERPA_ONNX_EXTERN_C +JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/, + jobject /*obj*/, + jlong ptr) { + auto model = reinterpret_cast(ptr); + model->Flush(); +} diff --git a/sherpa-onnx/kotlin-api/Vad.kt b/sherpa-onnx/kotlin-api/Vad.kt index 7791166c9..57198130f 100644 --- a/sherpa-onnx/kotlin-api/Vad.kt +++ b/sherpa-onnx/kotlin-api/Vad.kt @@ -52,6 +52,8 @@ class Vad( fun reset() = reset(ptr) + fun flush() = flush(ptr) + private external fun delete(ptr: Long) private external fun newFromAsset( @@ -70,6 +72,7 @@ class Vad( private external fun front(ptr: Long): Array private external fun isSpeechDetected(ptr: Long): Boolean private external fun reset(ptr: Long) + private external fun flush(ptr: Long) companion object { init { diff --git a/sherpa-onnx/python/csrc/voice-activity-detector.cc 
b/sherpa-onnx/python/csrc/voice-activity-detector.cc index 698297bcf..eb07fa48c 100644 --- a/sherpa-onnx/python/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/python/csrc/voice-activity-detector.cc @@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) { .def("is_speech_detected", &PyClass::IsSpeechDetected, py::call_guard()) .def("reset", &PyClass::Reset, py::call_guard()) + .def("flush", &PyClass::Flush, py::call_guard()) .def_property_readonly("front", &PyClass::Front); } diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index f69405d0c..8365af7db 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -633,6 +633,10 @@ class SherpaOnnxVoiceActivityDetectorWrapper { func reset() { SherpaOnnxVoiceActivityDetectorReset(vad) } + + func flush() { + SherpaOnnxVoiceActivityDetectorFlush(vad) + } } // offline tts