From 4282a914e2c9cb5c6cf430c3f174197fdd7880eb Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 18 Sep 2023 11:42:24 +0800 Subject: [PATCH] Add C API for VAD --- sherpa-onnx/c-api/c-api.cc | 127 +++++++++++++++++++++ sherpa-onnx/c-api/c-api.h | 120 +++++++++++++++++++ sherpa-onnx/csrc/silero-vad-model-config.h | 4 +- 3 files changed, 249 insertions(+), 2 deletions(-) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 520c47542..87d4f3274 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -9,9 +9,11 @@ #include #include +#include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/display.h" #include "sherpa-onnx/csrc/offline-recognizer.h" #include "sherpa-onnx/csrc/online-recognizer.h" +#include "sherpa-onnx/csrc/voice-activity-detector.h" struct SherpaOnnxOnlineRecognizer { std::unique_ptr impl; @@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult( delete[] r->timestamps; delete r; } + +// ============================================================ +// For VAD +// ============================================================ +// +struct SherpaOnnxCircularBuffer { + std::unique_ptr impl; +}; + +SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) { + SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer; + buffer->impl = std::make_unique(capacity); + return buffer; +} + +void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) { + delete buffer; +} + +void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer, + const float *p, int32_t n) { + buffer->impl->Push(p, n); +} + +const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer, + int32_t start_index, int32_t n) { + std::vector v = buffer->impl->Get(start_index, n); + + float *p = new float[n]; + std::copy(v.begin(), v.end(), p); + return p; +} + +void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; } + +void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer 
*buffer, int32_t n) { + buffer->impl->Pop(n); +} + +int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) { + return buffer->impl->Size(); +} + +void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) { + buffer->impl->Reset(); +} + +struct SherpaOnnxVoiceActivityDetector { + std::unique_ptr impl; +}; + +SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( + const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) { + sherpa_onnx::VadModelConfig vad_config; + + vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, ""); + vad_config.silero_vad.threshold = + SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5); + + vad_config.silero_vad.min_silence_duration = + SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5); + + vad_config.silero_vad.min_speech_duration = + SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25); + + vad_config.silero_vad.window_size = + SHERPA_ONNX_OR(config->silero_vad.window_size, 512); + + vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); + vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); + vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); + vad_config.debug = SHERPA_ONNX_OR(config->debug, false); + + if (vad_config.debug) { + fprintf(stderr, "%s\n", vad_config.ToString().c_str()); + } + + SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector; + p->impl = std::make_unique( + vad_config, buffer_size_in_seconds); + + return p; +} + +void SherpaOnnxDestroyVoiceActivityDetector( + SherpaOnnxVoiceActivityDetector *p) { + delete p; +} + +void SherpaOnnxVoiceActivityDetectorAcceptWaveform( + SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) { + p->impl->AcceptWaveform(samples, n); +} + +int32_t SherpaOnnxVoiceActivityDetectorEmpty( + SherpaOnnxVoiceActivityDetector *p) { + return p->impl->Empty(); +} + +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop( + 
SherpaOnnxVoiceActivityDetector *p) { + p->impl->Pop(); +} + +SHERPA_ONNX_API SherpaOnnxSpeechSegment *SherpaOnnxVoiceActivityDetectorFront( + SherpaOnnxVoiceActivityDetector *p) { + const sherpa_onnx::SpeechSegment &segment = p->impl->Front(); + + SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment; + ans->start = segment.start; + ans->samples = new float[segment.samples.size()]; + std::copy(segment.samples.begin(), segment.samples.end(), ans->samples); + ans->n = segment.samples.size(); + + return ans; +} + +void SherpaOnnxDestroySpeechSegment(SherpaOnnxSpeechSegment *p) { + delete[] p->samples; + delete p; +} + +void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) { + p->impl->Reset(); +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 71aa56426..99d51c785 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -438,6 +438,126 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( SHERPA_ONNX_API void DestroyOfflineRecognizerResult( const SherpaOnnxOfflineRecognizerResult *r); +// ============================================================ +// For VAD +// ============================================================ + +SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { + // Path to the silero VAD model + const char *model; + + // threshold to classify a segment as speech + // + // If the predicted probability of a segment is larger than this + // value, then it is classified as speech. 
+ float threshold; + + // in seconds + float min_silence_duration; + + // in seconds + float min_speech_duration; + + int window_size; +} SherpaOnnxSileroVadModelConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { + SherpaOnnxSileroVadModelConfig silero_vad; + + int32_t sample_rate; + int32_t num_threads; + const char *provider; + int32_t debug; +} SherpaOnnxVadModelConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer + SherpaOnnxCircularBuffer; + +// Return an instance of circular buffer. The user has to use +// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid +// memory leak. +SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer( + int32_t capacity); + +// Free the pointer returned by SherpaOnnxCreateCircularBuffer() +SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer( + SherpaOnnxCircularBuffer *buffer); + +SHERPA_ONNX_API void SherpaOnnxCircularBufferPush( + SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n); + +// Return n samples starting at the given index. +// +// Return a pointer to an array containing n samples starting at start_index. +// The user has to use SherpaOnnxCircularBufferFree() to free the returned +// pointer to avoid memory leak. +SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet( + SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n); + +// Free the pointer returned by SherpaOnnxCircularBufferGet(). +SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p); + +// Remove n elements from the buffer +SHERPA_ONNX_API void SherpaOnnxCircularBufferPop( + SherpaOnnxCircularBuffer *buffer, int32_t n); + +// Return number of elements in the buffer. 
+SHERPA_ONNX_API int32_t +SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer); + +// Clear all elements in the buffer +SHERPA_ONNX_API void SherpaOnnxCircularBufferReset( + SherpaOnnxCircularBuffer *buffer); + +SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment { + // The start index in samples of this segment + int32_t start; + + // pointer to the array containing the samples + float *samples; + + // number of samples in this segment + int32_t n; +} SherpaOnnxSpeechSegment; + +typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector; + +// Return an instance of VoiceActivityDetector. +// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free +// the returned pointer to avoid memory leak. +SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector * +SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config, + float buffer_size_in_seconds); + +SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector( + SherpaOnnxVoiceActivityDetector *p); + +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform( + SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n); + +// Return 1 if there are no speech segments available. +// Return 0 if there are speech segments. +SHERPA_ONNX_API int32_t +SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p); + +// Remove the first speech segment. +// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1. +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop( + SherpaOnnxVoiceActivityDetector *p); + +// Return the first speech segment. +// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned +// pointer to avoid memory leak. +SHERPA_ONNX_API SherpaOnnxSpeechSegment *SherpaOnnxVoiceActivityDetectorFront( + SherpaOnnxVoiceActivityDetector *p); + +// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront(). 
+SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(SherpaOnnxSpeechSegment *p); + +// Re-initialize the voice activity detector. +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset( + SherpaOnnxVoiceActivityDetector *p); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/sherpa-onnx/csrc/silero-vad-model-config.h b/sherpa-onnx/csrc/silero-vad-model-config.h index fc9309632..b9679dd23 100644 --- a/sherpa-onnx/csrc/silero-vad-model-config.h +++ b/sherpa-onnx/csrc/silero-vad-model-config.h @@ -15,7 +15,7 @@ struct SileroVadModelConfig { // threshold to classify a segment as speech // - // The predicted probability of a segment is larger than this + // If the predicted probability of a segment is larger than this // value, then it is classified as speech. float threshold = 0.5; @@ -25,7 +25,7 @@ struct SileroVadModelConfig { // 512, 1024, 1536 samples for 16000 Hz // 256, 512, 768 samples for 800 Hz - int window_size = 512; // in samples + int32_t window_size = 512; // in samples SileroVadModelConfig() = default;