Skip to content

Commit

Permalink
JavaScript API (using WebAssembly) for speaker diarization.
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Oct 11, 2024
1 parent 7872d1e commit 64f479d
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 8 deletions.
12 changes: 12 additions & 0 deletions .github/scripts/test-nodejs-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@ git status
ls -lh
ls -lh node_modules

echo '-----speaker diarization----------'
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

node ./test-offline-speaker-diarization.js
rm -rfv *.wav *.onnx sherpa-onnx-pyannote-*

echo '-----vad+whisper----------'

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
Expand Down
16 changes: 16 additions & 0 deletions nodejs-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa
for text-to-speech and speech-to-text.


# Speaker diarization

In the following, we demonstrate how to run speaker diarization.

```bash
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

node ./test-offline-speaker-diarization.js
```

# Text-to-speech

In the following, we demonstrate how to run text-to-speech.
Expand Down
64 changes: 64 additions & 0 deletions nodejs-examples/test-offline-speaker-diarization.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx');

// clang-format off
/* Please use the following commands to download files
used in this script
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
*/
// clang-format on

// Configuration passed to sherpa_onnx.createOfflineSpeakerDiarization().
const config = {
  segmentation: {
    pyannote: {
      model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
    },
    // The library reads `debug` from the segmentation object itself, not from
    // the nested `pyannote` object, so the flag has to live at this level to
    // take effect.
    debug: 1,
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
    debug: 1,
  },
  clustering: {
    // Since we know that the test wave file ./0-four-speakers-zh.wav contains
    // 4 speakers, we use 4 for numClusters here. If you don't have such
    // information, please set numClusters to -1.
    numClusters: 4,

    // If numClusters is not -1, then threshold is ignored.
    //
    // A larger threshold leads to fewer clusters, i.e., fewer speakers.
    // A smaller threshold leads to more clusters, i.e., more speakers.
    // You need to tune it yourself.
    threshold: 0.5,
  },

  // If a segment is shorter than minDurationOn, we discard it.
  minDurationOn: 0.2,  // in seconds

  // If the gap between two segments is less than minDurationOff, then we
  // merge these two segments into a single one.
  minDurationOff: 0.5,  // in seconds
};

// Input recording to diarize; see the download commands at the top of this
// file.
const waveFilename = './0-four-speakers-zh.wav';

// Build the diarization object from the configuration defined above.
const sd = sherpa_onnx.createOfflineSpeakerDiarization(config);
console.log('Started');

// Read the wave file and make sure its sample rate matches the one reported
// by the diarization object before processing it.
const wave = sherpa_onnx.readWave(waveFilename);
if (sd.sampleRate !== wave.sampleRate) {
  throw new Error(
      `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
}

// Run diarization on the samples and print the resulting segments.
const segments = sd.process(wave.samples);
console.log(segments);
8 changes: 8 additions & 0 deletions scripts/nodejs/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
const sherpa_onnx_speaker_diarization =
require('./sherpa-onnx-speaker-diarization.js');

function createOnlineRecognizer(config) {
return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
Expand All @@ -32,6 +34,11 @@ function createVad(config) {
return sherpa_onnx_vad.createVad(wasmModule, config);
}

/**
 * Create an offline speaker diarization object.
 *
 * Forwards to the speaker-diarization module, supplying the wasm module used
 * by this package so that callers only need to pass a config.
 *
 * @param {Object} config Diarization configuration, passed through unchanged.
 * @returns {Object} Whatever the underlying
 *     createOfflineSpeakerDiarization() returns.
 */
function createOfflineSpeakerDiarization(config) {
  const diarization = sherpa_onnx_speaker_diarization;
  return diarization.createOfflineSpeakerDiarization(wasmModule, config);
}

/**
 * Read a wave file.
 *
 * Forwards to the wave helper module, supplying the wasm module used by this
 * package.
 *
 * @param {string} filename Path of the wave file to read.
 * @returns {Object} Whatever the underlying readWave() returns.
 */
function readWave(filename) {
  const wave = sherpa_onnx_wave.readWave(filename, wasmModule);
  return wave;
}
Expand All @@ -51,4 +58,5 @@ module.exports = {
writeWave,
createCircularBuffer,
createVad,
createOfflineSpeakerDiarization,
};
12 changes: 12 additions & 0 deletions wasm/nodejs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ set(exported_functions
SherpaOnnxDestroySpeechSegment
SherpaOnnxVoiceActivityDetectorReset
SherpaOnnxVoiceActivityDetectorFlush
# Speaker diarization
SherpaOnnxCreateOfflineSpeakerDiarization
SherpaOnnxDestroyOfflineSpeakerDiarization
SherpaOnnxOfflineSpeakerDiarizationDestroyResult
SherpaOnnxOfflineSpeakerDiarizationDestroySegment
SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
SherpaOnnxOfflineSpeakerDiarizationProcess
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback
SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime
SherpaOnnxOfflineSpeakerDiarizationSetConfig
#
SherpaOnnxFileExists
SherpaOnnxReadWave
Expand Down Expand Up @@ -109,6 +120,7 @@ install(
${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
Expand Down
4 changes: 0 additions & 4 deletions wasm/speaker-diarization/assets/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ Remember to rename the downloaded files.

The following is an example.


```bash
cd wasm/speaker-diarization/assets/

Expand All @@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
rm -rf sherpa-onnx-pyannote-segmentation-3-0


curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx


```
12 changes: 8 additions & 4 deletions wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) {
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
offset += 4;

Module.setValue(ptr + offset, config.debug || 1, 'i32');
Module.setValue(ptr + offset, config.debug || 0, 'i32');
offset += 4;

const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
Expand Down Expand Up @@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) {
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
offset += 4;

Module.setValue(ptr + offset, config.debug || 1, 'i32');
Module.setValue(ptr + offset, config.debug || 0, 'i32');
offset += 4;

Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
Expand Down Expand Up @@ -270,11 +270,15 @@ class OfflineSpeakerDiarization {
}

function createOfflineSpeakerDiarization(Module, myConfig) {
const config = {
let config = {
segmentation: {
pyannote: {model: './segmentation.onnx'},
debug: 1,
},
embedding: {
model: './embedding.onnx',
debug: 1,
},
embedding: {model: './embedding.onnx'},
clustering: {numClusters: -1, threshold: 0.5},
minDurationOn: 0.3,
minDurationOff: 0.5,
Expand Down

0 comments on commit 64f479d

Please sign in to comment.