diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index 2e34977f4..049240d77 100644 --- a/.github/workflows/c-api.yaml +++ b/.github/workflows/c-api.yaml @@ -81,31 +81,72 @@ jobs: otool -L ./install/lib/libsherpa-onnx-c-api.dylib fi - - name: Test Moonshine + - name: Test vad + Whisper tiny.en shell: bash run: | - gcc -o moonshine-c-api ./c-api-examples/moonshine-c-api.c \ + gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \ -I ./build/install/include \ -L ./build/install/lib/ \ -l sherpa-onnx-c-api \ -l onnxruntime - ls -lh moonshine-c-api + # Now download models + # + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav - if [[ ${{ matrix.os }} == ubuntu-latest ]]; then - ldd ./moonshine-c-api - echo "----" - readelf -d ./moonshine-c-api - fi + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./vad-whisper-c-api + + rm -rf sherpa-onnx-* + rm -rf *.onnx + rm *.wav + + - name: Test vad + Moonshine + shell: bash + run: | + gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime # Now download models # + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm 
sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./vad-moonshine-c-api + + rm -rf sherpa-onnx-* + rm -rf *.onnx + rm *.wav + + - name: Test Moonshine + shell: bash + run: | + gcc -o moonshine-c-api ./c-api-examples/moonshine-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 - ls -lh sherpa-onnx-moonshine-tiny-en-int8 - echo "---" - ls -lh sherpa-onnx-moonshine-tiny-en-int8/test_wavs export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 58a867726..c7db2bc27 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -56,6 +56,12 @@ target_link_libraries(telespeech-c-api sherpa-onnx-c-api) add_executable(vad-sense-voice-c-api vad-sense-voice-c-api.c) target_link_libraries(vad-sense-voice-c-api sherpa-onnx-c-api) +add_executable(vad-whisper-c-api vad-whisper-c-api.c) +target_link_libraries(vad-whisper-c-api sherpa-onnx-c-api) + +add_executable(vad-moonshine-c-api vad-moonshine-c-api.c) +target_link_libraries(vad-moonshine-c-api sherpa-onnx-c-api) + add_executable(streaming-zipformer-buffered-tokens-hotwords-c-api streaming-zipformer-buffered-tokens-hotwords-c-api.c) target_link_libraries(streaming-zipformer-buffered-tokens-hotwords-c-api sherpa-onnx-c-api) diff --git a/c-api-examples/vad-moonshine-c-api.c b/c-api-examples/vad-moonshine-c-api.c new file mode 100644 index 000000000..e4a4a3e34 --- /dev/null +++ b/c-api-examples/vad-moonshine-c-api.c @@ -0,0 +1,171 @@ +// 
c-api-examples/vad-moonshine-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation + +// +// This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API. +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +// tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +// rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +// +// clang-format on + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + const char *wav_filename = "./Obama.wav"; + const char *vad_filename = "./silero_vad.onnx"; + + const char *preprocessor = + "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx"; + const char *encoder = "./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx"; + const char *uncached_decoder = + "./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx"; + const char *cached_decoder = + "./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx"; + const char *tokens = "./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt"; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + return -1; + } + + if (wave->sample_rate != 16000) { + fprintf(stderr, "Expect the sample rate to be 16000. 
Given: %d\n", + wave->sample_rate); + SherpaOnnxFreeWave(wave); + return -1; + } + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + offline_model_config.debug = 0; + offline_model_config.num_threads = 1; + offline_model_config.provider = "cpu"; + offline_model_config.tokens = tokens; + offline_model_config.moonshine.preprocessor = preprocessor; + offline_model_config.moonshine.encoder = encoder; + offline_model_config.moonshine.uncached_decoder = uncached_decoder; + offline_model_config.moonshine.cached_decoder = cached_decoder; + + // Recognizer config + SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + + const SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + SherpaOnnxVadModelConfig vadConfig; + memset(&vadConfig, 0, sizeof(vadConfig)); + vadConfig.silero_vad.model = vad_filename; + vadConfig.silero_vad.threshold = 0.5; + vadConfig.silero_vad.min_silence_duration = 0.5; + vadConfig.silero_vad.min_speech_duration = 0.5; + vadConfig.silero_vad.max_speech_duration = 10; + vadConfig.silero_vad.window_size = 512; + vadConfig.sample_rate = 16000; + vadConfig.num_threads = 1; + vadConfig.debug = 1; + + SherpaOnnxVoiceActivityDetector *vad = + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); + + if (vad == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + SherpaOnnxDestroyOfflineRecognizer(recognizer); + return -1; + } + + int32_t window_size = vadConfig.silero_vad.window_size; + int32_t i = 0; + + while (i + window_size < wave->num_samples) { + 
SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, + window_size); + i += window_size; + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, + segment->samples, segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + } + + SherpaOnnxVoiceActivityDetectorFlush(vad); + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, segment->samples, + segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + + SherpaOnnxDestroyOfflineRecognizer(recognizer); + SherpaOnnxDestroyVoiceActivityDetector(vad); + 
SherpaOnnxFreeWave(wave); + + return 0; +} diff --git a/c-api-examples/vad-sense-voice-c-api.c b/c-api-examples/vad-sense-voice-c-api.c index 3049c9572..ee9504d1a 100644 --- a/c-api-examples/vad-sense-voice-c-api.c +++ b/c-api-examples/vad-sense-voice-c-api.c @@ -81,6 +81,7 @@ int32_t main() { vadConfig.silero_vad.threshold = 0.5; vadConfig.silero_vad.min_silence_duration = 0.5; vadConfig.silero_vad.min_speech_duration = 0.5; + vadConfig.silero_vad.max_speech_duration = 5; vadConfig.silero_vad.window_size = 512; vadConfig.sample_rate = 16000; vadConfig.num_threads = 1; diff --git a/c-api-examples/vad-whisper-c-api.c b/c-api-examples/vad-whisper-c-api.c new file mode 100644 index 000000000..83cf9b258 --- /dev/null +++ b/c-api-examples/vad-whisper-c-api.c @@ -0,0 +1,169 @@ +// c-api-examples/vad-whisper-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation + +// +// This file demonstrates how to use VAD + Whisper tiny.en with +// sherpa-onnx's C API. +// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +// tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 +// rm sherpa-onnx-whisper-tiny.en.tar.bz2 +// +// clang-format on + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + const char *wav_filename = "./Obama.wav"; + const char *vad_filename = "./silero_vad.onnx"; + + const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"; + const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"; + const char *tokens = "sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + return -1; + } + + if 
(wave->sample_rate != 16000) { + fprintf(stderr, "Expect the sample rate to be 16000. Given: %d\n", + wave->sample_rate); + SherpaOnnxFreeWave(wave); + return -1; + } + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + offline_model_config.debug = 0; + offline_model_config.num_threads = 1; + offline_model_config.provider = "cpu"; + offline_model_config.tokens = tokens; + offline_model_config.whisper.encoder = encoder; + offline_model_config.whisper.decoder = decoder; + offline_model_config.whisper.language = "en"; + offline_model_config.whisper.tail_paddings = 0; + offline_model_config.whisper.task = "transcribe"; + + // Recognizer config + SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + + const SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + SherpaOnnxVadModelConfig vadConfig; + memset(&vadConfig, 0, sizeof(vadConfig)); + vadConfig.silero_vad.model = vad_filename; + vadConfig.silero_vad.threshold = 0.5; + vadConfig.silero_vad.min_silence_duration = 0.5; + vadConfig.silero_vad.min_speech_duration = 0.5; + vadConfig.silero_vad.max_speech_duration = 10; + vadConfig.silero_vad.window_size = 512; + vadConfig.sample_rate = 16000; + vadConfig.num_threads = 1; + vadConfig.debug = 1; + + SherpaOnnxVoiceActivityDetector *vad = + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); + + if (vad == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + SherpaOnnxDestroyOfflineRecognizer(recognizer); + return -1; + } + + int32_t window_size = 
vadConfig.silero_vad.window_size; + int32_t i = 0; + + while (i + window_size < wave->num_samples) { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, + window_size); + i += window_size; + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, + segment->samples, segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + } + + SherpaOnnxVoiceActivityDetectorFlush(vad); + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, segment->samples, + segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + + 
SherpaOnnxDestroyOfflineRecognizer(recognizer); + SherpaOnnxDestroyVoiceActivityDetector(vad); + SherpaOnnxFreeWave(wave); + + return 0; +}