From 71536f62ca37ff59bd6dde9ed77583c3469b4600 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 14 Sep 2024 10:49:39 +0800 Subject: [PATCH] Support specifying max speech duration for VAD. --- python-api-examples/generate-subtitles.py | 9 ++++- sherpa-onnx/csrc/silero-vad-model-config.cc | 34 +++++++++++++++++++ sherpa-onnx/csrc/silero-vad-model-config.h | 5 +++ sherpa-onnx/csrc/voice-activity-detector.cc | 21 +++++++++--- .../python/csrc/silero-vad-model-config.cc | 8 +++-- 5 files changed, 70 insertions(+), 7 deletions(-) diff --git a/python-api-examples/generate-subtitles.py b/python-api-examples/generate-subtitles.py index b39499e93..85871016a 100755 --- a/python-api-examples/generate-subtitles.py +++ b/python-api-examples/generate-subtitles.py @@ -406,7 +406,14 @@ def main(): config = sherpa_onnx.VadModelConfig() config.silero_vad.model = args.silero_vad_model - config.silero_vad.min_silence_duration = 0.25 + config.silero_vad.threshold = 0.5 + config.silero_vad.min_silence_duration = 0.25 # seconds + config.silero_vad.min_speech_duration = 0.25 # seconds + + # If the current segment is larger than this value, then it increases + # the threshold to 0.9 internally. After detecting this segment, + # it resets the threshold to its original value. + config.silero_vad.max_speech_duration = 5 # seconds config.sample_rate = args.sample_rate window_size = config.silero_vad.window_size diff --git a/sherpa-onnx/csrc/silero-vad-model-config.cc b/sherpa-onnx/csrc/silero-vad-model-config.cc index 7fa844f27..9237963ba 100644 --- a/sherpa-onnx/csrc/silero-vad-model-config.cc +++ b/sherpa-onnx/csrc/silero-vad-model-config.cc @@ -28,6 +28,12 @@ void SileroVadModelConfig::Register(ParseOptions *po) { "In seconds. In the end of each silence chunk wait for " "--silero-vad-min-speech-duration seconds before separating it"); + po->Register( + "silero-vad-max-speech-duration", &max_speech_duration, + "In seconds. If a speech segment is longer than this value, then we " + "increase the threshold to 0.9. After finishing detecting the segment, " + "the threshold value is reset to its original value."); + po->Register( "silero-vad-window-size", &window_size, "In samples. Audio chunks of --silero-vad-window-size samples are fed " @@ -63,6 +69,33 @@ bool SileroVadModelConfig::Validate() const { return false; } + if (min_silence_duration <= 0) { + SHERPA_ONNX_LOGE( + "Please use a larger value for --silero-vad-min-silence-duration. " + "Given: " + "%f", + min_silence_duration); + return false; + } + + if (min_speech_duration <= 0) { + SHERPA_ONNX_LOGE( + "Please use a larger value for --silero-vad-min-speech-duration. " + "Given: " + "%f", + min_speech_duration); + return false; + } + + if (max_speech_duration <= 0) { + SHERPA_ONNX_LOGE( + "Please use a larger value for --silero-vad-max-speech-duration. " + "Given: " + "%f", + max_speech_duration); + return false; + } + return true; } @@ -74,6 +107,7 @@ std::string SileroVadModelConfig::ToString() const { os << "threshold=" << threshold << ", "; os << "min_silence_duration=" << min_silence_duration << ", "; os << "min_speech_duration=" << min_speech_duration << ", "; + os << "max_speech_duration=" << max_speech_duration << ", "; os << "window_size=" << window_size << ")"; return os.str(); diff --git a/sherpa-onnx/csrc/silero-vad-model-config.h b/sherpa-onnx/csrc/silero-vad-model-config.h index b9679dd23..5ae06f5ff 100644 --- a/sherpa-onnx/csrc/silero-vad-model-config.h +++ b/sherpa-onnx/csrc/silero-vad-model-config.h @@ -27,6 +27,11 @@ struct SileroVadModelConfig { // 256, 512, 768 samples for 800 Hz int32_t window_size = 512; // in samples + // If a speech segment is longer than this value, then we increase + // the threshold to 0.9. After finishing detecting the segment, + // the threshold value is reset to its original value. + float max_speech_duration = 20; // in seconds + SileroVadModelConfig() = default; void Register(ParseOptions *po); diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc index 29acd70ae..c20d3476d 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/csrc/voice-activity-detector.cc @@ -18,14 +18,18 @@ class VoiceActivityDetector::Impl { explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60) : model_(VadModel::Create(config)), config_(config), - buffer_(buffer_size_in_seconds * config.sample_rate) {} + buffer_(buffer_size_in_seconds * config.sample_rate) { + Init(); + } #if __ANDROID_API__ >= 9 Impl(AAssetManager *mgr, const VadModelConfig &config, float buffer_size_in_seconds = 60) : model_(VadModel::Create(mgr, config)), config_(config), - buffer_(buffer_size_in_seconds * config.sample_rate) {} + buffer_(buffer_size_in_seconds * config.sample_rate) { + Init(); + } #endif void AcceptWaveform(const float *samples, int32_t n) { @@ -145,6 +149,15 @@ class VoiceActivityDetector::Impl { const VadModelConfig &GetConfig() const { return config_; } + private: + void Init() { + // TODO(fangjun): Currently, we support only one vad model. + // If a new vad model is added, we need to change the place + // where max_speech_duration is placed. + max_utterance_length_ = + config_.sample_rate * config_.silero_vad.max_speech_duration; + } + private: std::queue segments_; @@ -153,9 +166,9 @@ class VoiceActivityDetector::Impl { CircularBuffer buffer_; std::vector last_; - int max_utterance_length_ = 16000 * 20; // in samples + int max_utterance_length_ = -1; // in samples float new_min_silence_duration_s_ = 0.1; - float new_threshold_ = 1.10; + float new_threshold_ = 0.90; int32_t start_ = -1; }; diff --git a/sherpa-onnx/python/csrc/silero-vad-model-config.cc b/sherpa-onnx/python/csrc/silero-vad-model-config.cc index 3065a02d3..7d063963a 100644 --- a/sherpa-onnx/python/csrc/silero-vad-model-config.cc +++ b/sherpa-onnx/python/csrc/silero-vad-model-config.cc @@ -17,7 +17,8 @@ void PybindSileroVadModelConfig(py::module *m) { .def(py::init<>()) .def(py::init([](const std::string &model, float threshold, float min_silence_duration, float min_speech_duration, - int32_t window_size) -> std::unique_ptr { + int32_t window_size, + float max_speech_duration) -> std::unique_ptr { auto ans = std::make_unique(); ans->model = model; @@ -25,17 +26,20 @@ void PybindSileroVadModelConfig(py::module *m) { ans->min_silence_duration = min_silence_duration; ans->min_speech_duration = min_speech_duration; ans->window_size = window_size; + ans->max_speech_duration = max_speech_duration; return ans; }), py::arg("model"), py::arg("threshold") = 0.5, py::arg("min_silence_duration") = 0.5, - py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512) + py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512, + py::arg("max_speech_duration") = 20) .def_readwrite("model", &PyClass::model) .def_readwrite("threshold", &PyClass::threshold) .def_readwrite("min_silence_duration", &PyClass::min_silence_duration) .def_readwrite("min_speech_duration", &PyClass::min_speech_duration) .def_readwrite("window_size", &PyClass::window_size) + .def_readwrite("max_speech_duration", &PyClass::max_speech_duration) .def("__str__", &PyClass::ToString) .def("validate", &PyClass::Validate); }