From a7dc6c2c165de16c68daaf78490d159f51c54d44 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 12 Aug 2024 23:33:35 +0800 Subject: [PATCH] Pascal API for non-streaming ASR (#1247) --- .github/workflows/pascal.yaml | 49 ++ README.md | 16 +- pascal-api-examples/README.md | 1 + .../non-streaming-asr/.gitignore | 9 + .../non-streaming-asr/README.md | 15 + .../non-streaming-asr/nemo_ctc.pas | 74 +++ .../non-streaming-asr/nemo_transducer.pas | 77 +++ .../non-streaming-asr/paraformer.pas | 74 +++ .../non-streaming-asr/paraformer_itn.pas | 75 +++ .../non-streaming-asr/run-nemo-ctc.sh | 41 ++ .../non-streaming-asr/run-nemo-transducer.sh | 42 ++ .../non-streaming-asr/run-paraformer-itn.sh | 50 ++ .../non-streaming-asr/run-paraformer.sh | 42 ++ .../non-streaming-asr/run-sense-voice.sh | 41 ++ .../non-streaming-asr/run-telespeech-ctc.sh | 42 ++ .../non-streaming-asr/run-whisper.sh | 42 ++ .../run-zipformer-transducer.sh | 42 ++ .../non-streaming-asr/sense_voice.pas | 76 +++ .../non-streaming-asr/telespeech_ctc.pas | 74 +++ .../non-streaming-asr/whisper.pas | 75 +++ .../zipformer_transducer.pas | 76 +++ pascal-api-examples/streaming-asr/.gitignore | 2 + pascal-api-examples/streaming-asr/README.md | 1 + .../streaming-asr/nemo_transducer.pas | 89 ++++ .../streaming-asr/run-nemo-transducer.sh | 41 ++ sherpa-onnx/pascal-api/sherpa_onnx.pas | 458 +++++++++++++++++- 26 files changed, 1616 insertions(+), 8 deletions(-) create mode 100644 pascal-api-examples/non-streaming-asr/.gitignore create mode 100644 pascal-api-examples/non-streaming-asr/README.md create mode 100644 pascal-api-examples/non-streaming-asr/nemo_ctc.pas create mode 100644 pascal-api-examples/non-streaming-asr/nemo_transducer.pas create mode 100644 pascal-api-examples/non-streaming-asr/paraformer.pas create mode 100644 pascal-api-examples/non-streaming-asr/paraformer_itn.pas create mode 100755 pascal-api-examples/non-streaming-asr/run-nemo-ctc.sh create mode 100755 
pascal-api-examples/non-streaming-asr/run-nemo-transducer.sh create mode 100755 pascal-api-examples/non-streaming-asr/run-paraformer-itn.sh create mode 100755 pascal-api-examples/non-streaming-asr/run-paraformer.sh create mode 100755 pascal-api-examples/non-streaming-asr/run-sense-voice.sh create mode 100755 pascal-api-examples/non-streaming-asr/run-telespeech-ctc.sh create mode 100755 pascal-api-examples/non-streaming-asr/run-whisper.sh create mode 100755 pascal-api-examples/non-streaming-asr/run-zipformer-transducer.sh create mode 100644 pascal-api-examples/non-streaming-asr/sense_voice.pas create mode 100644 pascal-api-examples/non-streaming-asr/telespeech_ctc.pas create mode 100644 pascal-api-examples/non-streaming-asr/whisper.pas create mode 100644 pascal-api-examples/non-streaming-asr/zipformer_transducer.pas create mode 100644 pascal-api-examples/streaming-asr/nemo_transducer.pas create mode 100755 pascal-api-examples/streaming-asr/run-nemo-transducer.sh diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index 9ff7b2295..98abc4c01 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -115,9 +115,11 @@ jobs: if [[ ${{ matrix.os }} == 'windows-latest' ]]; then cp -v install/lib/*.dll ../pascal-api-examples/read-wav cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr + cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr + cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr fi - name: Run Pascal test (Read wav test) @@ -133,6 +135,48 @@ jobs: ls -lh popd + - name: Run Pascal test (Non Streaming ASR) + shell: bash + run: | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH + + cd ./pascal-api-examples + + pushd non-streaming-asr + ./run-zipformer-transducer.sh + rm -rf sherpa-onnx-* + 
echo "---" + + ./run-whisper.sh + rm -rf sherpa-onnx-* + echo "---" + + ./run-nemo-transducer.sh + rm -rf sherpa-onnx-* + echo "---" + + ./run-nemo-ctc.sh + rm -rf sherpa-onnx-* + echo "---" + + ./run-sense-voice.sh + rm -rf sherpa-onnx-* + echo "---" + + ./run-telespeech-ctc.sh + rm -rf sherpa-onnx-* + echo "---" + + ./run-paraformer.sh + + ./run-paraformer-itn.sh + + rm -rf sherpa-onnx-* + echo "---" + + ls -lh + popd + - name: Run Pascal test (Streaming ASR) shell: bash run: | @@ -141,10 +185,15 @@ jobs: cd ./pascal-api-examples pushd streaming-asr + ./run-zipformer-transducer.sh rm -rf sherpa-onnx-* echo "---" + ./run-nemo-transducer.sh + rm -rf sherpa-onnx-* + echo "---" + if [[ ${{ matrix.os }} != 'windows-latest' ]]; then ./run-paraformer.sh rm -rf sherpa-onnx-* diff --git a/README.md b/README.md index 6c281d3e7..7951e7302 100644 --- a/README.md +++ b/README.md @@ -25,13 +25,17 @@ ### Supported programming languages -| 1. C++ | 2. C | 3. Python | 4. C# | 5. Java | 6. JavaScript | -|--------|-------|-----------|-------|---------|---------------| -| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | +| 1. C++ | 2. C | 3. Python | 4. C# | 5. Java | +|--------|-------|-----------|-------|---------| +| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | -| 7. Kotlin | 8. Swift | 9. Go | 10. Dart | 11. Rust | 12. Pascal | -|-----------|----------|-------|----------|----------|------------| -| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | +| 6. JavaScript | 7. Kotlin | 8. Swift | 9. Go | 10. Dart | +|---------------|-----------|----------|-------|----------| +| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | + +| 11. Rust | 12. Pascal | +|----------|------------| +| ✔️ | ✔️ | For Rust support, please see https://github.com/thewh1teagle/sherpa-rs diff --git a/pascal-api-examples/README.md b/pascal-api-examples/README.md index 4e4310d15..125437788 100644 --- a/pascal-api-examples/README.md +++ b/pascal-api-examples/README.md @@ -7,3 +7,4 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx). 
|---------|------------| |[read-wav](./read-wav)|It shows how to read a wave file.| |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| +|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| diff --git a/pascal-api-examples/non-streaming-asr/.gitignore b/pascal-api-examples/non-streaming-asr/.gitignore new file mode 100644 index 000000000..fbcf1c968 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/.gitignore @@ -0,0 +1,9 @@ +!run-*.sh +zipformer_transducer +whisper +nemo_transducer +nemo_ctc +paraformer +paraformer_itn +sense_voice +telespeech_ctc diff --git a/pascal-api-examples/non-streaming-asr/README.md b/pascal-api-examples/non-streaming-asr/README.md new file mode 100644 index 000000000..f8d35c3a9 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/README.md @@ -0,0 +1,15 @@ +# Introduction + +This folder contains examples of using sherpa-onnx's Object Pascal +APIs with non-streaming models for speech recognition.
+ +|File|Description| +|----|-----------| +|[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition| +|[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition| +|[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers| +|[run-paraformer.sh](./run-paraformer.sh)|Use a non-streaming Paraformer model for speech recognition| +|[run-sense-voice.sh](./run-sense-voice.sh)|Use a non-streaming SenseVoice model for speech recognition| +|[run-telespeech-ctc.sh](./run-telespeech-ctc.sh)|Use a non-streaming TeleSpeech CTC model for speech recognition| +|[run-whisper.sh](./run-whisper.sh)|Use a Whisper model for speech recognition| +|[run-zipformer-transducer.sh](./run-zipformer-transducer.sh)|Use a non-streaming Zipformer transducer model for speech recognition| diff --git a/pascal-api-examples/non-streaming-asr/nemo_ctc.pas b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas new file mode 100644 index 000000000..0b4622c35 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas @@ -0,0 +1,74 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming NeMo CTC model +to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program nemo_ctc; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/es-spanish.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. 
+ } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/nemo_transducer.pas b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas new file mode 100644 index 000000000..cbd8c1fdf --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas @@ -0,0 +1,77 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming NeMo transducer +to decode files. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program nemo_transducer; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx'; + Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx'; + Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx'; + Config.ModelConfig.ModelType := 'nemo_transducer'; + Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + 
Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/paraformer.pas b/pascal-api-examples/non-streaming-asr/paraformer.pas new file mode 100644 index 000000000..3ad76dc27 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/paraformer.pas @@ -0,0 +1,74 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Paraformer model +to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program paraformer; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/3-sichuan.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. 
diff --git a/pascal-api-examples/non-streaming-asr/paraformer_itn.pas b/pascal-api-examples/non-streaming-asr/paraformer_itn.pas new file mode 100644 index 000000000..172af597b --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/paraformer_itn.pas @@ -0,0 +1,75 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Paraformer model +to decode files with inverse text normalization for numbers. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program paraformer_itn; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + Config.RuleFsts := './itn_zh_number.fst'; + + WaveFilename := './itn-zh-number.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed 
%.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/run-nemo-ctc.sh b/pascal-api-examples/non-streaming-asr/run-nemo-ctc.sh new file mode 100755 index 000000000..04fa08259 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-nemo-ctc.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + tar xvf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + rm sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./nemo_ctc.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./nemo_ctc diff --git a/pascal-api-examples/non-streaming-asr/run-nemo-transducer.sh b/pascal-api-examples/non-streaming-asr/run-nemo-transducer.sh new file mode 100755 index 000000000..a53277ec8 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-nemo-transducer.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + + tar xvf sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + rm sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./nemo_transducer.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./nemo_transducer diff --git a/pascal-api-examples/non-streaming-asr/run-paraformer-itn.sh b/pascal-api-examples/non-streaming-asr/run-paraformer-itn.sh new file mode 100755 index 000000000..0212d072b --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-paraformer-itn.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./paraformer_itn.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./paraformer_itn diff --git a/pascal-api-examples/non-streaming-asr/run-paraformer.sh b/pascal-api-examples/non-streaming-asr/run-paraformer.sh new file mode 100755 index 000000000..cd1ad6d09 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-paraformer.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./paraformer.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./paraformer diff --git a/pascal-api-examples/non-streaming-asr/run-sense-voice.sh b/pascal-api-examples/non-streaming-asr/run-sense-voice.sh new file mode 100755 index 000000000..4323fa59c --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-sense-voice.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./sense_voice.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./sense_voice diff --git a/pascal-api-examples/non-streaming-asr/run-telespeech-ctc.sh b/pascal-api-examples/non-streaming-asr/run-telespeech-ctc.sh new file mode 100755 index 000000000..e35892393 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-telespeech-ctc.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./telespeech_ctc.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./telespeech_ctc diff --git a/pascal-api-examples/non-streaming-asr/run-whisper.sh b/pascal-api-examples/non-streaming-asr/run-whisper.sh new file mode 100755 index 000000000..de11620a4 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-whisper.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./whisper.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./whisper diff --git a/pascal-api-examples/non-streaming-asr/run-zipformer-transducer.sh b/pascal-api-examples/non-streaming-asr/run-zipformer-transducer.sh new file mode 100755 index 000000000..5d8c9687e --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-zipformer-transducer.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + + tar xvf sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + rm sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./zipformer_transducer.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./zipformer_transducer diff --git a/pascal-api-examples/non-streaming-asr/sense_voice.pas b/pascal-api-examples/non-streaming-asr/sense_voice.pas new file mode 100644 index 000000000..5963ba1e8 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/sense_voice.pas @@ -0,0 +1,76 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming SenseVoice model +to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program sense_voice; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx'; + Config.ModelConfig.SenseVoice.Language := 'auto'; + Config.ModelConfig.SenseVoice.UseItn := False; + Config.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. 
+ However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas b/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas new file mode 100644 index 000000000..8424775f0 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas @@ -0,0 +1,74 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming TeleSpeech CTC model +to decode files. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program telespeech_ctc; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + 
WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/whisper.pas b/pascal-api-examples/non-streaming-asr/whisper.pas new file mode 100644 index 000000000..f32c8e232 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/whisper.pas @@ -0,0 +1,75 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Whisper model +to decode files. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program whisper; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; + Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := 
TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas b/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas new file mode 100644 index 000000000..343a5c57e --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas @@ -0,0 +1,76 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Zipformer transducer +to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program zipformer_transducer; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx'; + Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx'; + Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, 
RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/streaming-asr/.gitignore b/pascal-api-examples/streaming-asr/.gitignore index 9bfca7ffb..806df8e09 100644 --- a/pascal-api-examples/streaming-asr/.gitignore +++ b/pascal-api-examples/streaming-asr/.gitignore @@ -1,4 +1,6 @@ +!run-*.sh zipformer_transducer paraformer zipformer_ctc zipformer_ctc_hlg +nemo_transducer diff --git a/pascal-api-examples/streaming-asr/README.md b/pascal-api-examples/streaming-asr/README.md index cbd752ead..f2a2315df 100644 --- a/pascal-api-examples/streaming-asr/README.md +++ b/pascal-api-examples/streaming-asr/README.md @@ -9,3 +9,4 @@ APIs with streaming models for speech recognition. |[run-zipformer-ctc-hlg.sh](./run-zipformer-ctc-hlg.sh)|Use a streaming Zipformer CTC model for speech recognition| |[run-zipformer-ctc.sh](./run-zipformer-ctc.sh)|Use a streaming Zipformer CTC model with HLG for speech recognition| |[run-zipformer-transducer.sh](./run-zipformer-transducer.sh)|Use a Zipformer transducer model for speech recognition| +|[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a NeMo transducer model for speech recognition| diff --git a/pascal-api-examples/streaming-asr/nemo_transducer.pas b/pascal-api-examples/streaming-asr/nemo_transducer.pas new file mode 100644 index 000000000..65dbdb88f --- /dev/null +++ b/pascal-api-examples/streaming-asr/nemo_transducer.pas @@ -0,0 +1,89 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a streaming NeMo transducer +to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program nemo_transducer; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Config: TSherpaOnnxOnlineRecognizerConfig; + Recognizer: TSherpaOnnxOnlineRecognizer; + Stream: TSherpaOnnxOnlineStream; + RecognitionResult: TSherpaOnnxOnlineRecognizerResult; + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + TailPaddings: array of Single; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Initialize(Config); + + {Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + to download model files used in this file.} + Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/encoder.onnx'; + Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/decoder.onnx'; + Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/joiner.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/test_wavs/0.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOnlineRecognizer.Create(Config); + + Start := Now; + + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + + SetLength(TailPaddings, Round(Wave.SampleRate * 0.5)); {0.5 seconds of padding} + Stream.AcceptWaveform(TailPaddings, Wave.SampleRate); + + Stream.InputFinished(); + + while Recognizer.IsReady(Stream) do + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, 
Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. diff --git a/pascal-api-examples/streaming-asr/run-nemo-transducer.sh b/pascal-api-examples/streaming-asr/run-nemo-transducer.sh new file mode 100755 index 000000000..aba9fb89f --- /dev/null +++ b/pascal-api-examples/streaming-asr/run-nemo-transducer.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms.tar.bz2 + tar xvf sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms.tar.bz2 + rm sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./nemo_transducer.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./nemo_transducer diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index a878b45e0..8e1f03b72 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -110,6 +110,109 @@ TSherpaOnnxOnlineRecognizer = class function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult; end; + TSherpaOnnxOfflineTransducerModelConfig = record + Encoder: AnsiString; + Decoder: AnsiString; + Joiner: AnsiString; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineParaformerModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineNemoEncDecCtcModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineWhisperModelConfig = record + Encoder: AnsiString; + Decoder: AnsiString; + Language: AnsiString; + Task: AnsiString; + TailPaddings: Integer; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineTdnnModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineLMConfig = record + Model: AnsiString; + Scale: Single; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineSenseVoiceModelConfig = record + Model: AnsiString; + Language: AnsiString; + UseItn: Boolean; 
+ function ToString: AnsiString; + end; + + TSherpaOnnxOfflineModelConfig = record + Transducer: TSherpaOnnxOfflineTransducerModelConfig; + Paraformer: TSherpaOnnxOfflineParaformerModelConfig; + NeMoCtc: TSherpaOnnxOfflineNemoEncDecCtcModelConfig; + Whisper: TSherpaOnnxOfflineWhisperModelConfig; + Tdnn: TSherpaOnnxOfflineTdnnModelConfig; + Tokens: AnsiString; + NumThreads: Integer; + Debug: Boolean; + Provider: AnsiString; + ModelType: AnsiString; + ModelingUnit: AnsiString; + BpeVocab: AnsiString; + TeleSpeechCtc: AnsiString; + SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineRecognizerConfig = record + FeatConfig: TSherpaOnnxFeatureConfig; + ModelConfig: TSherpaOnnxOfflineModelConfig; + LMConfig: TSherpaOnnxOfflineLMConfig; + DecodingMethod: AnsiString; + MaxActivePaths: Integer; + HotwordsFile: AnsiString; + HotwordsScore: Single; + RuleFsts: AnsiString; + RuleFars: AnsiString; + BlankPenalty: Single; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineRecognizerResult = record + Text: AnsiString; + Tokens: array of AnsiString; + Timestamps: array of Single; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineStream = class + private + Handle: Pointer; + public + constructor Create(P: Pointer); + destructor Destroy; override; + procedure AcceptWaveform(Samples: array of Single; SampleRate: Integer); + end; + + TSherpaOnnxOfflineRecognizer = class + private + Handle: Pointer; + public + constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig); + destructor Destroy; override; + function CreateStream: TSherpaOnnxOfflineStream; + procedure Decode(Stream: TSherpaOnnxOfflineStream); + function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult; + end; + { It supports reading a single channel wave with 16-bit encoded samples. Samples are normalized to the range [-1, 1]. 
} @@ -204,6 +307,68 @@ SherpaOnnxOnlineRecognizerConfig = record PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig; + SherpaOnnxOfflineTransducerModelConfig = record + Encoder: PAnsiChar; + Decoder: PAnsiChar; + Joiner: PAnsiChar; + end; + SherpaOnnxOfflineParaformerModelConfig = record + Model: PAnsiChar; + end; + SherpaOnnxOfflineNemoEncDecCtcModelConfig = record + Model: PAnsiChar; + end; + SherpaOnnxOfflineWhisperModelConfig = record + Encoder: PAnsiChar; + Decoder: PAnsiChar; + Language: PAnsiChar; + Task: PAnsiChar; + TailPaddings: cint32; + end; + SherpaOnnxOfflineTdnnModelConfig = record + Model: PAnsiChar; + end; + SherpaOnnxOfflineLMConfig = record + Model: PAnsiChar; + Scale: Single; + end; + SherpaOnnxOfflineSenseVoiceModelConfig = record + Model: PAnsiChar; + Language: PAnsiChar; + UseItn: cint32; + end; + SherpaOnnxOfflineModelConfig = record + Transducer: SherpaOnnxOfflineTransducerModelConfig; + Paraformer: SherpaOnnxOfflineParaformerModelConfig; + NeMoCtc: SherpaOnnxOfflineNemoEncDecCtcModelConfig; + Whisper: SherpaOnnxOfflineWhisperModelConfig; + Tdnn: SherpaOnnxOfflineTdnnModelConfig; + Tokens: PAnsiChar; + NumThreads: cint32; + Debug: cint32; + Provider: PAnsiChar; + ModelType: PAnsiChar; + ModelingUnit: PAnsiChar; + BpeVocab: PAnsiChar; + TeleSpeechCtc: PAnsiChar; + SenseVoice: SherpaOnnxOfflineSenseVoiceModelConfig; + end; + + SherpaOnnxOfflineRecognizerConfig = record + FeatConfig: SherpaOnnxFeatureConfig; + ModelConfig: SherpaOnnxOfflineModelConfig; + LMConfig: SherpaOnnxOfflineLMConfig; + DecodingMethod: PAnsiChar; + MaxActivePaths: cint32; + HotwordsFile: PAnsiChar; + HotwordsScore: Single; + RuleFsts: PAnsiChar; + RuleFars: PAnsiChar; + BlankPenalty: Single; + end; + + PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig; + function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl; external SherpaOnnxLibName; @@ -244,6 +409,31 @@ function 
SherpaOnnxGetOnlineStreamResultAsJson(Recognizer: Pointer; Stream: Poin procedure SherpaOnnxDestroyOnlineStreamResultJson(PJson: PAnsiChar); cdecl; external SherpaOnnxLibName; +function SherpaOnnxCreateOfflineRecognizer(Config: PSherpaOnnxOfflineRecognizerConfig): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineRecognizer(Recognizer: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxCreateOfflineStream(Recognizer: Pointer): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineStream(Stream: Pointer); cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxAcceptWaveformOffline(Stream: Pointer; + SampleRate: cint32; Samples: pcfloat; N: cint32); cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDecodeOfflineStream(Recognizer: Pointer; Stream: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxGetOfflineStreamResultAsJson(Stream: Pointer): PAnsiChar; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl; + external SherpaOnnxLibName; + function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl; external SherpaOnnxLibName name 'SherpaOnnxReadWave'; @@ -322,7 +512,7 @@ function TSherpaOnnxOnlineCtcFstDecoderConfig.ToString: AnsiString; function TSherpaOnnxOnlineRecognizerConfig.ToString: AnsiString; begin - Result := Format('TSherpaOnnxOnlineRecognizerConfig(FeatConfg := %s, ' + + Result := Format('TSherpaOnnxOnlineRecognizerConfig(FeatConfig := %s, ' + 'ModelConfig := %s, ' + 'DecodingMethod := %s, ' + 'MaxActivePaths := %d, ' + @@ -375,7 +565,7 @@ function TSherpaOnnxOnlineRecognizerResult.ToString: AnsiString; Result := Format('TSherpaOnnxOnlineRecognizerResult(Text := %s, ' + 'Tokens := %s, ' + - 'Timestamps := %s, ' + + 'Timestamps := %s' + ')', [Self.Text, TokensStr, TimestampStr]); end; @@ -531,4 +721,268 @@ procedure TSherpaOnnxOnlineStream.InputFinished; 
SherpaOnnxOnlineStreamInputFinished(Self.Handle); end; +function TSherpaOnnxOfflineTransducerModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineTransducerModelConfig(' + + 'Encoder := %s, ' + + 'Decoder := %s, ' + + 'Joiner := %s' + + ')', + [Self.Encoder, Self.Decoder, Self.Joiner]); +end; + +function TSherpaOnnxOfflineParaformerModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineParaformerModelConfig(Model := %s)', + [Self.Model]); +end; + +function TSherpaOnnxOfflineNemoEncDecCtcModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineNemoEncDecCtcModelConfig(Model := %s)', + [Self.Model]); +end; + +function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' + + 'Encoder := %s, ' + + 'Decoder := %s, ' + + 'Language := %s, ' + + 'Task := %s, ' + + 'TailPaddings := %d' + + ')', + [Self.Encoder, Self.Decoder, Self.Language, Self.Task, Self.TailPaddings]); +end; + +function TSherpaOnnxOfflineTdnnModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineTdnnModelConfig(Model := %s)', + [Self.Model]); +end; + +function TSherpaOnnxOfflineLMConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineLMConfig(' + + 'Model := %s, ' + + 'Scale := %.1f' + + ')', + [Self.Model, Self.Scale]); +end; + +function TSherpaOnnxOfflineSenseVoiceModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSenseVoiceModelConfig(' + + 'Model := %s, ' + + 'Language := %s, ' + + 'UseItn := %s' + + ')', + [Self.Model, Self.Language, Self.UseItn.ToString]); +end; + +function TSherpaOnnxOfflineModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineModelConfig(' + + 'Transducer := %s, ' + + 'Paraformer := %s, ' + + 'NeMoCtc := %s, ' + + 'Whisper := %s, ' + + 'Tdnn := %s, ' + + 'Tokens := %s, ' + + 'NumThreads := %d, ' + + 'Debug := %s, ' + + 'Provider := %s, ' + + 
'ModelType := %s, ' + + 'ModelingUnit := %s, ' + + 'BpeVocab := %s, ' + + 'TeleSpeechCtc := %s, ' + + 'SenseVoice := %s' + + ')', + [Self.Transducer.ToString, Self.Paraformer.ToString, + Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, + Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, + Self.ModelType, Self.ModelingUnit, Self.BpeVocab, + Self.TeleSpeechCtc, Self.SenseVoice.ToString + ]); +end; + +function TSherpaOnnxOfflineRecognizerConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineRecognizerConfig(' + + 'FeatConfig := %s, ' + + 'ModelConfig := %s, ' + + 'LMConfig := %s, ' + + 'DecodingMethod := %s, ' + + 'MaxActivePaths := %d, ' + + 'HotwordsFile := %s, ' + + 'HotwordsScore := %.1f, ' + + 'RuleFsts := %s, ' + + 'RuleFars := %s, ' + + 'BlankPenalty := %.1f' + { one decimal place, same as HotwordsScore; the former '%1.f' was width 1 with an empty precision } + ')', + [Self.FeatConfig.ToString, Self.ModelConfig.ToString, + Self.LMConfig.ToString, Self.DecodingMethod, Self.MaxActivePaths, + Self.HotwordsFile, Self.HotwordsScore, Self.RuleFsts, Self.RuleFars, + Self.BlankPenalty + ]); +end; + +constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecognizerConfig); +var + C: SherpaOnnxOfflineRecognizerConfig; +begin + Initialize(C); + + C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate; + C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim; + + C.ModelConfig.Transducer.Encoder := PAnsiChar(Config.ModelConfig.Transducer.Encoder); + C.ModelConfig.Transducer.Decoder := PAnsiChar(Config.ModelConfig.Transducer.Decoder); + C.ModelConfig.Transducer.Joiner := PAnsiChar(Config.ModelConfig.Transducer.Joiner); + + C.ModelConfig.Paraformer.Model := PAnsiChar(Config.ModelConfig.Paraformer.Model); + C.ModelConfig.NeMoCtc.Model := PAnsiChar(Config.ModelConfig.NeMoCtc.Model); + + C.ModelConfig.Whisper.Encoder := PAnsiChar(Config.ModelConfig.Whisper.Encoder); + C.ModelConfig.Whisper.Decoder := PAnsiChar(Config.ModelConfig.Whisper.Decoder); + C.ModelConfig.Whisper.Language :=
PAnsiChar(Config.ModelConfig.Whisper.Language); + C.ModelConfig.Whisper.Task := PAnsiChar(Config.ModelConfig.Whisper.Task); + C.ModelConfig.Whisper.TailPaddings := Config.ModelConfig.Whisper.TailPaddings; + + C.ModelConfig.Tdnn.Model := PAnsiChar(Config.ModelConfig.Tdnn.Model); + + + C.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens); + C.ModelConfig.NumThreads := Config.ModelConfig.NumThreads; + C.ModelConfig.Debug := Ord(Config.ModelConfig.Debug); + C.ModelConfig.Provider := PAnsiChar(Config.ModelConfig.Provider); + C.ModelConfig.ModelType := PAnsiChar(Config.ModelConfig.ModelType); + C.ModelConfig.ModelingUnit := PAnsiChar(Config.ModelConfig.ModelingUnit); + C.ModelConfig.BpeVocab := PAnsiChar(Config.ModelConfig.BpeVocab); + C.ModelConfig.TeleSpeechCtc := PAnsiChar(Config.ModelConfig.TeleSpeechCtc); + + C.ModelConfig.SenseVoice.Model := PAnsiChar(Config.ModelConfig.SenseVoice.Model); + C.ModelConfig.SenseVoice.Language := PAnsiChar(Config.ModelConfig.SenseVoice.Language); + C.ModelConfig.SenseVoice.UseItn := Ord(Config.ModelConfig.SenseVoice.UseItn); + + C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); + C.LMConfig.Scale := Config.LMConfig.Scale; + + C.DecodingMethod := PAnsiChar(Config.DecodingMethod); + C.MaxActivePaths := Config.MaxActivePaths; + C.HotwordsFile := PAnsiChar(Config.HotwordsFile); + C.HotwordsScore := Config.HotwordsScore; + C.RuleFsts := PAnsiChar(Config.RuleFsts); + C.RuleFars := PAnsiChar(Config.RuleFars); + C.BlankPenalty := Config.BlankPenalty; + + Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C); +end; + +destructor TSherpaOnnxOfflineRecognizer.Destroy; +begin + SherpaOnnxDestroyOfflineRecognizer(Self.Handle); + Self.Handle := nil; +end; + +function TSherpaOnnxOfflineRecognizer.CreateStream: TSherpaOnnxOfflineStream; +var + Stream: Pointer; +begin + Stream := SherpaOnnxCreateOfflineStream(Self.Handle); + Result := TSherpaOnnxOfflineStream.Create(Stream); +end; + +procedure TSherpaOnnxOfflineRecognizer.Decode(Stream: 
TSherpaOnnxOfflineStream); +begin + SherpaOnnxDecodeOfflineStream(Self.Handle, Stream.Handle); +end; + +function TSherpaOnnxOfflineRecognizer.GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult; +var + pJson: PAnsiChar; + JsonData: TJSONData; + JsonObject : TJSONObject; + JsonEnum: TJSONEnum; + I: Integer; +begin + pJson := SherpaOnnxGetOfflineStreamResultAsJson(Stream.Handle); + + JsonData := GetJSON(AnsiString(pJson), False); + + JsonObject := JsonData as TJSONObject; + + Result.Text := JsonObject.Strings['text']; + + SetLength(Result.Tokens, JsonObject.Arrays['tokens'].Count); + + I := 0; + for JsonEnum in JsonObject.Arrays['tokens'] do + begin + Result.Tokens[I] := JsonEnum.Value.AsString; + Inc(I); + end; + + SetLength(Result.Timestamps, JsonObject.Arrays['timestamps'].Count); + I := 0; + for JsonEnum in JsonObject.Arrays['timestamps'] do + begin + Result.Timestamps[I] := JsonEnum.Value.AsFloat; + Inc(I); + end; + + SherpaOnnxDestroyOfflineStreamResultJson(pJson); JsonData.Free; { the parsed tree from GetJSON is owned by the caller; Text/Tokens/Timestamps were copied out above, so release it here instead of leaking it on every call } +end; + +constructor TSherpaOnnxOfflineStream.Create(P: Pointer); +begin + Self.Handle := P; +end; + +destructor TSherpaOnnxOfflineStream.Destroy; +begin + SherpaOnnxDestroyOfflineStream(Self.Handle); + Self.Handle := nil; +end; + +procedure TSherpaOnnxOfflineStream.AcceptWaveform(Samples: array of Single; SampleRate: Integer); +begin + SherpaOnnxAcceptWaveformOffline(Self.Handle, SampleRate, pcfloat(Samples), + Length(Samples)); +end; + +function TSherpaOnnxOfflineRecognizerResult.ToString: AnsiString; +var + TokensStr: AnsiString; + S: AnsiString; + TimestampStr: AnsiString; + T: Single; + Sep: AnsiString; +begin + TokensStr := '['; + Sep := ''; + for S in Self.Tokens do + begin + TokensStr := TokensStr + Sep + S; + Sep := ', '; + end; + TokensStr := TokensStr + ']'; + + TimestampStr := '['; + Sep := ''; + for T in Self.Timestamps do + begin + TimestampStr := TimestampStr + Sep + Format('%.2f', [T]); + Sep := ', '; + end; + TimestampStr := TimestampStr + ']'; + + Result
:= Format('TSherpaOnnxOfflineRecognizerResult(Text := %s, ' + + 'Tokens := %s, ' + + 'Timestamps := %s' + + ')', + [Self.Text, TokensStr, TimestampStr]); +end; + end. +