diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index c41a0de65..03dec04aa 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,18 @@ git status ls -lh ls -lh node_modules +echo '-----speaker diarization----------' +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +node ./test-offline-speaker-diarization.js +rm -rfv *.wav *.onnx sherpa-onnx-pyannote-* + echo '-----vad+whisper----------' curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml index a9b2db589..8b7472b84 100644 --- a/.github/workflows/test-build-wheel.yaml +++ b/.github/workflows/test-build-wheel.yaml @@ -139,7 +139,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH which sherpa-onnx sherpa-onnx --help diff --git a/.github/workflows/test-pip-install.yaml b/.github/workflows/test-pip-install.yaml index 0f73e3643..b59b66b53 100644 --- a/.github/workflows/test-pip-install.yaml +++ b/.github/workflows/test-pip-install.yaml @@ -104,7 +104,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH sherpa-onnx --help sherpa-onnx-keyword-spotter --help diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 73a85de77..496a0062b 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa for text-to-speech and speech-to-text. +# Speaker diarization + +In the following, we demonstrate how to run speaker diarization. + +```bash +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +node ./test-offline-speaker-diarization.js +``` + # Text-to-speech In the following, we demonstrate how to run text-to-speech. diff --git a/nodejs-examples/test-offline-speaker-diarization.js b/nodejs-examples/test-offline-speaker-diarization.js new file mode 100644 index 000000000..de0f4a45b --- /dev/null +++ b/nodejs-examples/test-offline-speaker-diarization.js @@ -0,0 +1,64 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx'); + +// clang-format off +/* Please use the following commands to download files + used in this script + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + */ +// clang-format on + +const config = { + segmentation: { + pyannote: { + model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx', + debug: 1, + }, + }, + embedding: { + model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx', + debug: 1, + }, + clustering: { + // since we know that the test wave file + // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters + // here. if you don't have such information, please set numClusters to -1 + numClusters: 4, + + // If numClusters is not -1, then threshold is ignored. + // + // A larger threshold leads to fewer clusters, i.e., fewer speakers + // A smaller threshold leads to more clusters, i.e., more speakers + // You need to tune it by yourself. + threshold: 0.5, + }, + + // If a segment is shorter than minDurationOn, we discard it + minDurationOn: 0.2, // in seconds + + // If the gap between two segments is less than minDurationOff, then we + // merge these two segments into a single one + minDurationOff: 0.5, // in seconds +}; + +const waveFilename = './0-four-speakers-zh.wav'; + +const sd = sherpa_onnx.createOfflineSpeakerDiarization(config); +console.log('Started') + +const wave = sherpa_onnx.readWave(waveFilename); +if (sd.sampleRate != wave.sampleRate) { + throw new Error( + `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`); +} + +const segments = sd.process(wave.samples); +console.log(segments); diff --git a/scripts/nodejs/index.js b/scripts/nodejs/index.js index 3f0789edb..b1b77841c 100644 --- a/scripts/nodejs/index.js +++ b/scripts/nodejs/index.js @@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js'); const sherpa_onnx_kws = require('./sherpa-onnx-kws.js'); const sherpa_onnx_wave = require('./sherpa-onnx-wave.js'); const sherpa_onnx_vad = require('./sherpa-onnx-vad.js'); +const sherpa_onnx_speaker_diarization = + require('./sherpa-onnx-speaker-diarization.js'); function createOnlineRecognizer(config) { return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config); @@ -32,6 +34,11 @@ function createVad(config) { return sherpa_onnx_vad.createVad(wasmModule, config); } +function createOfflineSpeakerDiarization(config) { + return sherpa_onnx_speaker_diarization.createOfflineSpeakerDiarization( + wasmModule, config); +} + function readWave(filename) { return sherpa_onnx_wave.readWave(filename, wasmModule); } @@ -51,4 +58,5 @@ module.exports = { writeWave, createCircularBuffer, createVad, + createOfflineSpeakerDiarization, }; diff --git a/wasm/nodejs/CMakeLists.txt b/wasm/nodejs/CMakeLists.txt index 4efc879a1..dc8d8c854 100644 --- a/wasm/nodejs/CMakeLists.txt +++ b/wasm/nodejs/CMakeLists.txt @@ -70,6 +70,17 @@ set(exported_functions SherpaOnnxDestroySpeechSegment SherpaOnnxVoiceActivityDetectorReset SherpaOnnxVoiceActivityDetectorFlush + # Speaker diarization + SherpaOnnxCreateOfflineSpeakerDiarization + SherpaOnnxDestroyOfflineSpeakerDiarization + SherpaOnnxOfflineSpeakerDiarizationDestroyResult + SherpaOnnxOfflineSpeakerDiarizationDestroySegment + SherpaOnnxOfflineSpeakerDiarizationGetSampleRate + SherpaOnnxOfflineSpeakerDiarizationProcess + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime + SherpaOnnxOfflineSpeakerDiarizationSetConfig # SherpaOnnxFileExists SherpaOnnxReadWave @@ -109,6 +120,7 @@ install( ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js + ${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js "$/sherpa-onnx-wasm-nodejs.js" "$/sherpa-onnx-wasm-nodejs.wasm" diff --git a/wasm/speaker-diarization/assets/README.md b/wasm/speaker-diarization/assets/README.md index 5c06139e2..f09a5899d 100644 --- a/wasm/speaker-diarization/assets/README.md +++ b/wasm/speaker-diarization/assets/README.md @@ -12,7 +12,6 @@ Remember to rename the downloaded files. The following is an example. - ```bash cd wasm/speaker-diarization/assets/ @@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx rm -rf sherpa-onnx-pyannote-segmentation-3-0 - curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx - - ``` diff --git a/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js index ccfc8373c..741013480 100644 --- a/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js +++ b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js @@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) { Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug || 1, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; @@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) { Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug || 1, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + modelLen, 'i8*'); @@ -270,11 +270,15 @@ class OfflineSpeakerDiarization { } function createOfflineSpeakerDiarization(Module, myConfig) { - const config = { + let config = { segmentation: { pyannote: {model: './segmentation.onnx'}, + debug: 1, + }, + embedding: { + model: './embedding.onnx', + debug: 1, }, - embedding: {model: './embedding.onnx'}, clustering: {numClusters: -1, threshold: 0.5}, minDurationOn: 0.3, minDurationOff: 0.5,