Add Silero VAD #313

Merged · 15 commits · Sep 17, 2023
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)

set(SHERPA_ONNX_VERSION "1.7.14")
set(SHERPA_ONNX_VERSION "1.7.15")

# Disable warning about
#
1 change: 1 addition & 0 deletions cmake/cmake_extension.py
@@ -136,6 +136,7 @@ def build_extension(self, ext: setuptools.extension.Extension):
binaries += ["sherpa-onnx-online-websocket-server"]
binaries += ["sherpa-onnx-offline-websocket-server"]
binaries += ["sherpa-onnx-online-websocket-client"]
binaries += ["sherpa-onnx-vad-microphone"]

if is_windows():
binaries += ["kaldi-native-fbank-core.dll"]
9 changes: 9 additions & 0 deletions python-api-examples/README.md
@@ -0,0 +1,9 @@
# File descriptions

- [./http_server.py](./http_server.py) It defines which files to serve.
  Files are saved in [./web](./web).
- [non_streaming_server.py](./non_streaming_server.py) WebSocket server for
non-streaming models.
- [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
  [silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
  segments and concatenate all speech segments into a single segment; see the
  file-based sketch after this list.
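A minimal file-based sketch of the same idea, assuming the `sherpa_onnx.VadModelConfig` and `sherpa_onnx.VoiceActivityDetector` API used by the microphone script added in this PR, and a 16 kHz mono input; the file names `input.wav` and `speech-only.wav` are placeholders:

```python
#!/usr/bin/env python3
# Hypothetical file-based variant: it mirrors vad-remove-non-speech-segments.py
# but reads samples from an existing wave file instead of a microphone.
import numpy as np
import sherpa_onnx
import soundfile as sf

samples, sample_rate = sf.read("input.wav", dtype="float32")  # 16 kHz mono assumed

config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = "silero_vad.onnx"
config.sample_rate = sample_rate

vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

# Feed the model in fixed-size windows, as the microphone example does.
window_size = config.silero_vad.window_size
for i in range(len(samples) // window_size):
    vad.accept_waveform(samples[i * window_size : (i + 1) * window_size])

# Collect the detected speech segments and concatenate them.
# Note: speech still in flight at the end of the file may not be returned
# until the detector has seen enough trailing silence.
speech = []
while not vad.empty():
    speech.extend(vad.front.samples)
    vad.pop()

sf.write("speech-only.wav", np.array(speech, dtype=np.float32), samplerate=sample_rate)
```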
126 changes: 126 additions & 0 deletions python-api-examples/vad-remove-non-speech-segments.py
@@ -0,0 +1,126 @@
#!/usr/bin/env python3

"""
This file shows how to remove non-speech segments,
merge all speech segments into a single large segment,
and save it to a file.

Usage

python3 ./vad-remove-non-speech-segments.py \
--silero-vad-model silero_vad.onnx

Please visit
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
to download silero_vad.onnx

For instance,

wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
"""

import argparse
import sys
import time
from pathlib import Path

import numpy as np
import sherpa_onnx
import soundfile as sf

try:
import sounddevice as sd
except ImportError:
print("Please install sounddevice first. You can use")
print()
print(" pip install sounddevice")
print()
print("to install it")
sys.exit(-1)


def assert_file_exists(filename: str):
assert Path(filename).is_file(), (
f"{filename} does not exist!\n"
"Please refer to "
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
)


def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

parser.add_argument(
"--silero-vad-model",
type=str,
required=True,
help="Path to silero_vad.onnx",
)

return parser.parse_args()


def main():
devices = sd.query_devices()
if len(devices) == 0:
print("No microphone devices found")
sys.exit(0)

print(devices)
default_input_device_idx = sd.default.device[0]
print(f'Use default device: {devices[default_input_device_idx]["name"]}')

args = get_args()
assert_file_exists(args.silero_vad_model)

sample_rate = 16000
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms

config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = args.silero_vad_model
config.sample_rate = sample_rate

window_size = config.silero_vad.window_size

buffer = []
vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

all_samples = []

print("Started! Please speak")

try:
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
samples = samples.reshape(-1)
buffer = np.concatenate([buffer, samples])

all_samples = np.concatenate([all_samples, samples])

while len(buffer) > window_size:
vad.accept_waveform(buffer[:window_size])
buffer = buffer[window_size:]
except KeyboardInterrupt:
print("\nCaught Ctrl + C. Saving & Exiting")

speech_samples = []
while not vad.empty():
speech_samples.extend(vad.front.samples)
vad.pop()

speech_samples = np.array(speech_samples, dtype=np.float32)

filename_for_speech = time.strftime("%Y%m%d-%H%M%S-speech.wav")
sf.write(filename_for_speech, speech_samples, samplerate=sample_rate)

filename_for_all = time.strftime("%Y%m%d-%H%M%S-all.wav")
sf.write(filename_for_all, all_samples, samplerate=sample_rate)

print(f"Saved to {filename_for_speech} and {filename_for_all}")


if __name__ == "__main__":
main()
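If finer control over segmentation is needed, the Silero VAD config presumably exposes tunable fields beyond `model`. The field names below (`threshold`, `min_silence_duration`, `min_speech_duration`) are assumptions about `SileroVadModelConfig`, not confirmed by this PR, and should be checked against the Python bindings; a rough sketch:

```python
# Hypothetical tuning sketch -- the silero_vad field names below are assumptions
# and should be verified against sherpa_onnx.VadModelConfig / SileroVadModelConfig.
import sherpa_onnx

config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = "silero_vad.onnx"
config.sample_rate = 16000

config.silero_vad.threshold = 0.5              # assumed: speech-probability cut-off
config.silero_vad.min_silence_duration = 0.25  # assumed: seconds of silence that close a segment
config.silero_vad.min_speech_duration = 0.25   # assumed: drop segments shorter than this (seconds)

vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)
```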
5 changes: 3 additions & 2 deletions setup.py
@@ -56,6 +56,7 @@ def get_binaries_to_install():
binaries += ["sherpa-onnx-online-websocket-server"]
binaries += ["sherpa-onnx-offline-websocket-server"]
binaries += ["sherpa-onnx-online-websocket-client"]
binaries += ["sherpa-onnx-vad-microphone"]
if is_windows():
binaries += ["kaldi-native-fbank-core.dll"]
binaries += ["sherpa-onnx-c-api.dll"]
@@ -95,8 +96,8 @@ def get_binaries_to_install():
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
entry_points={
'console_scripts': [
'sherpa-onnx-cli=sherpa_onnx.cli:cli',
"console_scripts": [
"sherpa-onnx-cli=sherpa_onnx.cli:cli",
],
},
license="Apache licensed, as found in the LICENSE file",
41 changes: 29 additions & 12 deletions sherpa-onnx/csrc/CMakeLists.txt
@@ -13,6 +13,7 @@ endif()
set(sources
base64-decode.cc
cat.cc
circular-buffer.cc
context-graph.cc
endpoint.cc
features.cc
@@ -66,13 +67,18 @@ set(sources
provider.cc
resample.cc
session.cc
silero-vad-model-config.cc
silero-vad-model.cc
slice.cc
stack.cc
symbol-table.cc
text-utils.cc
transpose.cc
unbind.cc
utils.cc
vad-model-config.cc
vad-model.cc
voice-activity-detector.cc
wave-reader.cc
)

@@ -172,32 +178,42 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
microphone.cc
)

add_executable(sherpa-onnx-vad-microphone
sherpa-onnx-vad-microphone.cc
microphone.cc
)

if(BUILD_SHARED_LIBS)
set(PA_LIB portaudio)
else()
set(PA_LIB portaudio_static)
endif()

target_link_libraries(sherpa-onnx-microphone ${PA_LIB} sherpa-onnx-core)
target_link_libraries(sherpa-onnx-microphone-offline ${PA_LIB} sherpa-onnx-core)
set(exes
sherpa-onnx-microphone
sherpa-onnx-microphone-offline
sherpa-onnx-vad-microphone
)
foreach(exe IN LISTS exes)
target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
endforeach()

if(NOT WIN32)
target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")

target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
foreach(exe IN LISTS exes)
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
endforeach()

if(SHERPA_ONNX_ENABLE_PYTHON)
target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")

foreach(exe IN LISTS exes)
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
endforeach()
endif()
endif()

install(
TARGETS
sherpa-onnx-microphone
sherpa-onnx-microphone-offline
TARGETS ${exes}
DESTINATION
bin
)
@@ -269,6 +285,7 @@ endif()
if(SHERPA_ONNX_ENABLE_TESTS)
set(sherpa_onnx_test_srcs
cat-test.cc
circular-buffer-test.cc
context-graph-test.cc
packed-sequence-test.cc
pad-sequence-test.cc
29 changes: 29 additions & 0 deletions sherpa-onnx/csrc/README.md
@@ -0,0 +1,29 @@
# File descriptions

- [./sherpa-onnx-alsa.cc](./sherpa-onnx-alsa.cc) For Linux only, especially for
embedded Linux, e.g., Raspberry Pi; it uses a streaming model for real-time
speech recognition with a microphone.

- [./sherpa-onnx-microphone.cc](./sherpa-onnx-microphone.cc)
For Linux/Windows/macOS; it uses a streaming model for real-time speech
recognition with a microphone.

- [./sherpa-onnx-microphone-offline.cc](./sherpa-onnx-microphone-offline.cc)
For Linux/Windows/macOS; it uses a non-streaming model for speech
recognition with a microphone.

- [./sherpa-onnx.cc](./sherpa-onnx.cc)
It uses a streaming model to decode wave files.

- [./sherpa-onnx-offline.cc](./sherpa-onnx-offline.cc)
It uses a non-streaming model to decode wave files.

- [./online-websocket-server.cc](./online-websocket-server.cc)
WebSocket server for streaming models.

- [./offline-websocket-server.cc](./offline-websocket-server.cc)
WebSocket server for non-streaming models.

- [./sherpa-onnx-vad-microphone.cc](./sherpa-onnx-vad-microphone.cc)
  It uses silero VAD to detect speech from a microphone.
