From 4282a914e2c9cb5c6cf430c3f174197fdd7880eb Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Mon, 18 Sep 2023 11:42:24 +0800
Subject: [PATCH] Add C API for VAD

---
 sherpa-onnx/c-api/c-api.cc                 | 127 +++++++++++++++++++++
 sherpa-onnx/c-api/c-api.h                  | 120 +++++++++++++++++++
 sherpa-onnx/csrc/silero-vad-model-config.h |   4 +-
 3 files changed, 249 insertions(+), 2 deletions(-)
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index 520c47542..87d4f3274 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -9,9 +9,11 @@
 #include <utility>
 #include <vector>
 
+#include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
+#include "sherpa-onnx/csrc/voice-activity-detector.h"
 
 struct SherpaOnnxOnlineRecognizer {
   std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
@@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
   delete[] r->timestamps;
   delete r;
 }
+
+// ============================================================
+// For VAD
+// ============================================================
+//
+struct SherpaOnnxCircularBuffer {
+  std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
+};
+
+SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
+  SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer;
+  buffer->impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);
+  return buffer;
+}
+
+void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
+  delete buffer;
+}
+
+void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
+                                  const float *p, int32_t n) {
+  buffer->impl->Push(p, n);
+}
+
+const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
+                                         int32_t start_index, int32_t n) {
+  std::vector<float> v = buffer->impl->Get(start_index, n);
+  // Zero-fill and clamp in case Get() does not return exactly n samples.
+  float *p = new float[n]();
+  std::copy_n(v.begin(), std::min(v.size(), static_cast<size_t>(n)), p);
+  return p;
+}
+
+void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; }
+
+void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
+  buffer->impl->Pop(n);
+}
+
+int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
+  return buffer->impl->Size();
+}
+
+void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
+  buffer->impl->Reset();
+}
+
+struct SherpaOnnxVoiceActivityDetector {
+  std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
+};
+
+SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
+    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
+  sherpa_onnx::VadModelConfig vad_config;
+
+  vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
+  vad_config.silero_vad.threshold =
+      SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);
+
+  vad_config.silero_vad.min_silence_duration =
+      SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);
+
+  vad_config.silero_vad.min_speech_duration =
+      SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);
+
+  vad_config.silero_vad.window_size =
+      SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
+
+  vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
+  vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
+  vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
+  vad_config.debug = SHERPA_ONNX_OR(config->debug, false);
+
+  if (vad_config.debug) {
+    fprintf(stderr, "%s\n", vad_config.ToString().c_str());
+  }
+
+  SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector;
+  p->impl = std::make_unique<sherpa_onnx::VoiceActivityDetector>(
+      vad_config, buffer_size_in_seconds);
+
+  return p;
+}
+
+void SherpaOnnxDestroyVoiceActivityDetector(
+    SherpaOnnxVoiceActivityDetector *p) {
+  delete p;
+}
+
+void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
+    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
+  p->impl->AcceptWaveform(samples, n);
+}
+
+int32_t SherpaOnnxVoiceActivityDetectorEmpty(
+    SherpaOnnxVoiceActivityDetector *p) {
+  return p->impl->Empty();
+}
+
+// Forwards to sherpa_onnx::VoiceActivityDetector::Pop().
+void SherpaOnnxVoiceActivityDetectorPop(SherpaOnnxVoiceActivityDetector *p) {
+  p->impl->Pop();
+}
+
+SherpaOnnxSpeechSegment *SherpaOnnxVoiceActivityDetectorFront(
+    SherpaOnnxVoiceActivityDetector *p) {
+  const sherpa_onnx::SpeechSegment &segment = p->impl->Front();
+  // Deep copy; the caller releases it with SherpaOnnxDestroySpeechSegment().
+  SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment;
+  ans->start = segment.start;
+  ans->samples = new float[segment.samples.size()];
+  std::copy(segment.samples.begin(), segment.samples.end(), ans->samples);
+  ans->n = segment.samples.size();
+
+  return ans;
+}
+
+void SherpaOnnxDestroySpeechSegment(SherpaOnnxSpeechSegment *p) {
+  if (p) delete[] p->samples;  // NULL is a no-op, as with plain delete
+  delete p;
+}
+
+void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
+  p->impl->Reset();
+}
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 71aa56426..99d51c785 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -438,6 +438,126 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
 SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
     const SherpaOnnxOfflineRecognizerResult *r);
 
+// ============================================================
+// For VAD
+// ============================================================
+
+SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
+  // Path to the silero VAD model
+  const char *model;
+
+  // threshold to classify a segment as speech
+  //
+  // If the predicted probability of a segment is larger than this
+  // value, then it is classified as speech.
+  float threshold;
+
+  // in seconds
+  float min_silence_duration;
+
+  // in seconds
+  float min_speech_duration;
+
+  int32_t window_size;  // in samples
+} SherpaOnnxSileroVadModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
+  SherpaOnnxSileroVadModelConfig silero_vad;
+
+  int32_t sample_rate;
+  int32_t num_threads;
+  const char *provider;
+  int32_t debug;
+} SherpaOnnxVadModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
+    SherpaOnnxCircularBuffer;
+
+// Return an instance of circular buffer. The user has to use
+// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
+// memory leak.
+SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
+    int32_t capacity);
+
+// Free the pointer returned by SherpaOnnxCreateCircularBuffer()
+SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
+    SherpaOnnxCircularBuffer *buffer);
+
+SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
+    SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
+
+// Return n samples starting at the given index.
+//
+// Return a pointer to an array containing n samples starting at start_index.
+// The user has to use SherpaOnnxCircularBufferFree() to free the returned
+// pointer to avoid memory leak.
+SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
+    SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
+
+// Free the pointer returned by SherpaOnnxCircularBufferGet().
+SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);
+
+// Remove n elements from the buffer
+SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
+    SherpaOnnxCircularBuffer *buffer, int32_t n);
+
+// Return number of elements in the buffer.
+SHERPA_ONNX_API int32_t
+SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
+
+// Clear all elements in the buffer
+SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
+    SherpaOnnxCircularBuffer *buffer);
+
+SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
+  // The start index in samples of this segment
+  int32_t start;
+
+  // pointer to the array containing the samples
+  float *samples;
+
+  // number of samples in this segment
+  int32_t n;
+} SherpaOnnxSpeechSegment;
+
+typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
+
+// Return an instance of VoiceActivityDetector.
+// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
+// the returned pointer to avoid memory leak.
+SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
+SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
+                                      float buffer_size_in_seconds);
+
+SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
+    SherpaOnnxVoiceActivityDetector *p);
+
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
+    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
+
+// Return 1 if there are no speech segments available.
+// Return 0 if there are speech segments.
+SHERPA_ONNX_API int32_t
+SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
+
+// Remove the first speech segment.
+// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
+    SherpaOnnxVoiceActivityDetector *p);
+
+// Return the first speech segment.
+// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
+// pointer to avoid memory leak.
+SHERPA_ONNX_API SherpaOnnxSpeechSegment *SherpaOnnxVoiceActivityDetectorFront(
+    SherpaOnnxVoiceActivityDetector *p);
+
+// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
+SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(SherpaOnnxSpeechSegment *p);
+
+// Re-initialize the voice activity detector.
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
+    SherpaOnnxVoiceActivityDetector *p);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/sherpa-onnx/csrc/silero-vad-model-config.h b/sherpa-onnx/csrc/silero-vad-model-config.h
index fc9309632..b9679dd23 100644
--- a/sherpa-onnx/csrc/silero-vad-model-config.h
+++ b/sherpa-onnx/csrc/silero-vad-model-config.h
@@ -15,7 +15,7 @@ struct SileroVadModelConfig {
 
   // threshold to classify a segment as speech
   //
-  // The predicted probability of a segment is larger than this
+  // If the predicted probability of a segment is larger than this
   // value, then it is classified as speech.
   float threshold = 0.5;
 
@@ -25,7 +25,7 @@ struct SileroVadModelConfig {
 
   // 512, 1024, 1536 samples for 16000 Hz
   // 256, 512, 768 samples for 800 Hz
-  int window_size = 512;  // in samples
+  int32_t window_size = 512;  // in samples
 
   SileroVadModelConfig() = default;