Pascal API for non-streaming ASR (#1247)

k2-fsa · Aug 12, 2024 · a7dc6c2 · a7dc6c2
1 parent 5791b69
commit a7dc6c2
Show file tree

Hide file tree

Showing 26 changed files with 1,616 additions and 8 deletions.
diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml
@@ -115,9 +115,11 @@ jobs:
           if [[ ${{ matrix.os }} == 'windows-latest' ]]; then
             cp -v install/lib/*.dll ../pascal-api-examples/read-wav
             cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
+            cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
 
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
+            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
           fi
 
       - name:  Run Pascal test (Read wav test)
@@ -133,6 +135,48 @@ jobs:
           ls -lh
           popd
 
+      - name:  Run Pascal test (Non Streaming ASR)
+        shell: bash
+        run: |
+          export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
+
+          cd ./pascal-api-examples
+
+          pushd non-streaming-asr
+          ./run-zipformer-transducer.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ./run-whisper.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ./run-nemo-transducer.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ./run-nemo-ctc.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ./run-sense-voice.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ./run-telespeech-ctc.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ./run-paraformer.sh
+
+          ./run-paraformer-itn.sh
+
+          rm -rf sherpa-onnx-*
+          echo "---"
+
+          ls -lh
+          popd
+
       - name:  Run Pascal test (Streaming ASR)
         shell: bash
         run: |
@@ -141,10 +185,15 @@ jobs:
           cd ./pascal-api-examples
 
           pushd streaming-asr
+
           ./run-zipformer-transducer.sh
           rm -rf sherpa-onnx-*
           echo "---"
 
+          ./run-nemo-transducer.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
           if [[ ${{ matrix.os }} != 'windows-latest' ]]; then
             ./run-paraformer.sh
             rm -rf sherpa-onnx-*

diff --git a/README.md b/README.md
@@ -25,13 +25,17 @@
 
 ### Supported programming languages
 
-| 1. C++ | 2. C  | 3. Python | 4. C# | 5. Java | 6. JavaScript |
-|--------|-------|-----------|-------|---------|---------------|
-|   ✔️    | ✔️     | ✔️         | ✔️     |  ✔️      |      ✔️        |
+| 1. C++ | 2. C  | 3. Python | 4. C# | 5. Java |
+|--------|-------|-----------|-------|---------|
+|   ✔️    | ✔️     | ✔️         | ✔️     |  ✔️      |
 
-| 7. Kotlin | 8. Swift | 9. Go | 10. Dart | 11. Rust | 12. Pascal |
-|-----------|----------|-------|----------|----------|------------|
-| ✔️         |  ✔️       | ✔️     |  ✔️       |  ✔️       |    ✔️       |
+| 6. JavaScript | 7. Kotlin | 8. Swift | 9. Go | 10. Dart |
+|---------------|-----------|----------|-------|----------|
+|      ✔️        | ✔️         |  ✔️       | ✔️     |  ✔️       |
+
+| 11. Rust | 12. Pascal |
+|----------|------------|
+|  ✔️       |    ✔️       |
 
 For Rust support, please see https://github.com/thewh1teagle/sherpa-rs
 

diff --git a/pascal-api-examples/README.md b/pascal-api-examples/README.md
@@ -7,3 +7,4 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
 |---------|------------|
 |[read-wav](./read-wav)|It shows how to read a wave file.|
 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
+|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
diff --git a/pascal-api-examples/non-streaming-asr/.gitignore b/pascal-api-examples/non-streaming-asr/.gitignore
@@ -0,0 +1,9 @@
+!run-*.sh
+zipformer_transducer
+whisper
+nemo_transducer
+nemo_ctc
+paraformer
+paraformer_itn
+sense_voice
+telespeech_ctc
diff --git a/pascal-api-examples/non-streaming-asr/README.md b/pascal-api-examples/non-streaming-asr/README.md
@@ -0,0 +1,15 @@
+# Introduction
+
+This folder contains examples of using sherpa-onnx's Object Pascal
+APIs with non-streaming models for speech recognition.
+
+|File|Description|
+|----|-----------|
+|[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition|
+|[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition|
+|[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers|
+|[run-paraformer.sh](./run-paraformer.sh)|Use a non-streaming Paraformer model for speech recognition|
+|[run-sense-voice.sh](./run-sense-voice.sh)|Use a non-streaming SenseVoice model for speech recognition|
+|[run-telespeech-ctc.sh](./run-telespeech-ctc.sh)|Use a non-streaming TeleSpeech CTC model for speech recognition|
+|[run-whisper.sh](./run-whisper.sh)|Use a Whisper model for speech recognition|
+|[run-zipformer-transducer.sh](./run-zipformer-transducer.sh)|Use a non-streaming Zipformer transducer model for speech recognition|
diff --git a/pascal-api-examples/non-streaming-asr/nemo_ctc.pas b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas
@@ -0,0 +1,74 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming NeMo CTC model
+to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program nemo_ctc;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+var
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  { Model file, tokens file and runtime options for the recognizer. }
+  Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
+  Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  WaveFilename := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/es-spanish.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+
+  { The duration and real-time factor computed below divide by
+    Wave.SampleRate and by the resulting duration. Stop early when the wave
+    has no samples or an invalid sample rate instead of failing later with a
+    division by zero. }
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
+  begin
+    WriteLn(Format('Failed to read a usable wave from %s', [WaveFilename]));
+    Halt(1);
+  end;
+
+  { Free resources to avoid memory leaks.
+
+    Note: You don't strictly need this for such a simple script, but in your
+    own large/complex project you have to release the stream and the
+    recognizer. The nested try..finally blocks release them even when
+    decoding raises an exception. }
+  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
+  try
+    Stream := Recognizer.CreateStream();
+    try
+      { Only feeding the samples, decoding and fetching the result are timed;
+        creating the recognizer is excluded. }
+      Start := Now;
+
+      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+      Recognizer.Decode(Stream);
+
+      RecognitionResult := Recognizer.GetResult(Stream);
+
+      Stop := Now;
+
+      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+      Duration := Length(Wave.Samples) / Wave.SampleRate;
+      RealTimeFactor := Elapsed / Duration;
+
+      WriteLn(RecognitionResult.ToString);
+      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+      WriteLn(Format('Wave duration %.3f s', [Duration]));
+      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+    finally
+      FreeAndNil(Stream);
+    end;
+  finally
+    FreeAndNil(Recognizer);
+  end;
+end.
diff --git a/pascal-api-examples/non-streaming-asr/nemo_transducer.pas b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas
@@ -0,0 +1,77 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming NeMo transducer
+to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program nemo_transducer;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+var
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  { Encoder/decoder/joiner model files, tokens file and runtime options for
+    the recognizer. }
+  Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
+  Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
+  Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
+  Config.ModelConfig.ModelType := 'nemo_transducer';
+  Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  WaveFilename := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+
+  { The duration and real-time factor computed below divide by
+    Wave.SampleRate and by the resulting duration. Stop early when the wave
+    has no samples or an invalid sample rate instead of failing later with a
+    division by zero. }
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
+  begin
+    WriteLn(Format('Failed to read a usable wave from %s', [WaveFilename]));
+    Halt(1);
+  end;
+
+  { Free resources to avoid memory leaks.
+
+    Note: You don't strictly need this for such a simple script, but in your
+    own large/complex project you have to release the stream and the
+    recognizer. The nested try..finally blocks release them even when
+    decoding raises an exception. }
+  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
+  try
+    Stream := Recognizer.CreateStream();
+    try
+      { Only feeding the samples, decoding and fetching the result are timed;
+        creating the recognizer is excluded. }
+      Start := Now;
+
+      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+      Recognizer.Decode(Stream);
+
+      RecognitionResult := Recognizer.GetResult(Stream);
+
+      Stop := Now;
+
+      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+      Duration := Length(Wave.Samples) / Wave.SampleRate;
+      RealTimeFactor := Elapsed / Duration;
+
+      WriteLn(RecognitionResult.ToString);
+      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+      WriteLn(Format('Wave duration %.3f s', [Duration]));
+      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+    finally
+      FreeAndNil(Stream);
+    end;
+  finally
+    FreeAndNil(Recognizer);
+  end;
+end.
diff --git a/pascal-api-examples/non-streaming-asr/paraformer.pas b/pascal-api-examples/non-streaming-asr/paraformer.pas
@@ -0,0 +1,74 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming Paraformer model
+to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program paraformer;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+var
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  { Model file, tokens file and runtime options for the recognizer. }
+  Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
+  Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  WaveFilename := './sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/3-sichuan.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+
+  { The duration and real-time factor computed below divide by
+    Wave.SampleRate and by the resulting duration. Stop early when the wave
+    has no samples or an invalid sample rate instead of failing later with a
+    division by zero. }
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
+  begin
+    WriteLn(Format('Failed to read a usable wave from %s', [WaveFilename]));
+    Halt(1);
+  end;
+
+  { Free resources to avoid memory leaks.
+
+    Note: You don't strictly need this for such a simple script, but in your
+    own large/complex project you have to release the stream and the
+    recognizer. The nested try..finally blocks release them even when
+    decoding raises an exception. }
+  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
+  try
+    Stream := Recognizer.CreateStream();
+    try
+      { Only feeding the samples, decoding and fetching the result are timed;
+        creating the recognizer is excluded. }
+      Start := Now;
+
+      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+      Recognizer.Decode(Stream);
+
+      RecognitionResult := Recognizer.GetResult(Stream);
+
+      Stop := Now;
+
+      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+      Duration := Length(Wave.Samples) / Wave.SampleRate;
+      RealTimeFactor := Elapsed / Duration;
+
+      WriteLn(RecognitionResult.ToString);
+      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+      WriteLn(Format('Wave duration %.3f s', [Duration]));
+      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+    finally
+      FreeAndNil(Stream);
+    end;
+  finally
+    FreeAndNil(Recognizer);
+  end;
+end.