Skip to content

Commit

Permalink
Pascal API for non-streaming ASR (#1247)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Aug 12, 2024
1 parent 5791b69 commit a7dc6c2
Show file tree
Hide file tree
Showing 26 changed files with 1,616 additions and 8 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,11 @@ jobs:
if [[ ${{ matrix.os }} == 'windows-latest' ]]; then
cp -v install/lib/*.dll ../pascal-api-examples/read-wav
cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
fi
- name: Run Pascal test (Read wav test)
Expand All @@ -133,6 +135,48 @@ jobs:
ls -lh
popd
- name: Run Pascal test (Non Streaming ASR)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd non-streaming-asr
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-whisper.sh
rm -rf sherpa-onnx-*
echo "---"
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-nemo-ctc.sh
rm -rf sherpa-onnx-*
echo "---"
./run-sense-voice.sh
rm -rf sherpa-onnx-*
echo "---"
./run-telespeech-ctc.sh
rm -rf sherpa-onnx-*
echo "---"
./run-paraformer.sh
./run-paraformer-itn.sh
rm -rf sherpa-onnx-*
echo "---"
ls -lh
popd
- name: Run Pascal test (Streaming ASR)
shell: bash
run: |
Expand All @@ -141,10 +185,15 @@ jobs:
cd ./pascal-api-examples
pushd streaming-asr
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
if [[ ${{ matrix.os }} != 'windows-latest' ]]; then
./run-paraformer.sh
rm -rf sherpa-onnx-*
Expand Down
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,17 @@

### Supported programming languages

| 1. C++ | 2. C | 3. Python | 4. C# | 5. Java | 6. JavaScript |
|--------|-------|-----------|-------|---------|---------------|
| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
| 1. C++ | 2. C | 3. Python | 4. C# | 5. Java |
|--------|-------|-----------|-------|---------|
| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |

| 7. Kotlin | 8. Swift | 9. Go | 10. Dart | 11. Rust | 12. Pascal |
|-----------|----------|-------|----------|----------|------------|
| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
| 6. JavaScript | 7. Kotlin | 8. Swift | 9. Go | 10. Dart |
|---------------|-----------|----------|-------|----------|
| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |

| 11. Rust | 12. Pascal |
|----------|------------|
| ✔️ | ✔️ |

For Rust support, please see https://github.com/thewh1teagle/sherpa-rs

Expand Down
1 change: 1 addition & 0 deletions pascal-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|---------|------------|
|[read-wav](./read-wav)|It shows how to read a wave file.|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
9 changes: 9 additions & 0 deletions pascal-api-examples/non-streaming-asr/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
!run-*.sh
zipformer_transducer
whisper
nemo_transducer
nemo_ctc
paraformer
paraformer_itn
sense_voice
telespeech_ctc
15 changes: 15 additions & 0 deletions pascal-api-examples/non-streaming-asr/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Introduction

This folder contains examples of using sherpa-onnx's Object Pascal
APIs with non-streaming models for speech recognition.

|File|Description|
|----|-----------|
|[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition|
|[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition|
|[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers|
|[run-paraformer.sh](./run-paraformer.sh)|Use a non-streaming Paraformer model for speech recognition|
|[run-sense-voice.sh](./run-sense-voice.sh)|Use a non-streaming SenseVoice model for speech recognition|
|[run-telespeech-ctc.sh](./run-telespeech-ctc.sh)|Use a non-streaming TeleSpeech CTC model for speech recognition|
|[run-whisper.sh](./run-whisper.sh)|Use a Whisper model for speech recognition|
|[run-zipformer-transducer.sh](./run-zipformer-transducer.sh)|Use a non-streaming Zipformer transducer model for speech recognition|
74 changes: 74 additions & 0 deletions pascal-api-examples/non-streaming-asr/nemo_ctc.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{ Copyright (c) 2024 Xiaomi Corporation }

{
This file shows how to use a non-streaming NeMo CTC model
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program nemo_ctc;

{ Decodes one test wave file with a non-streaming NeMo CTC model and prints
  the recognition result together with the elapsed time, the wave duration
  and the resulting real-time factor. }

{$mode objfpc}

uses
  sherpa_onnx,
  DateUtils,
  SysUtils;

var
  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;

  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  Start: TDateTime;
  Stop: TDateTime;

  Elapsed: Single;
  Duration: Single;
  RealTimeFactor: Single;
begin
  Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
  Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  WaveFilename := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/es-spanish.wav';

  Wave := SherpaOnnxReadWave(WaveFilename);

  { The duration and RTF computations below divide by the sample rate and by
    the duration, so an empty wave or a non-positive sample rate must be
    rejected here instead of faulting later with a division by zero. }
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
  begin
    WriteLn(StdErr, Format('Failed to read %s, or it contains no samples', [WaveFilename]));
    Halt(1);
  end;

  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  try
    Stream := Recognizer.CreateStream();
    try
      { Only decoding is timed; model loading above is excluded. }
      Start := Now;

      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);

      RecognitionResult := Recognizer.GetResult(Stream);

      Stop := Now;

      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;

      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      { Free resources to avoid memory leak, even if decoding raised.
        The stream is released before the recognizer that created it. }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
end.
77 changes: 77 additions & 0 deletions pascal-api-examples/non-streaming-asr/nemo_transducer.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{ Copyright (c) 2024 Xiaomi Corporation }

{
This file shows how to use a non-streaming NeMo transducer
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program nemo_transducer;

{ Decodes one test wave file with a non-streaming NeMo transducer model and
  prints the recognition result together with the elapsed time, the wave
  duration and the resulting real-time factor. }

{$mode objfpc}

uses
  sherpa_onnx,
  DateUtils,
  SysUtils;

var
  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;

  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  Start: TDateTime;
  Stop: TDateTime;

  Elapsed: Single;
  Duration: Single;
  RealTimeFactor: Single;
begin
  Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
  Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
  Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
  Config.ModelConfig.ModelType := 'nemo_transducer';
  Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  WaveFilename := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav';

  Wave := SherpaOnnxReadWave(WaveFilename);

  { The duration and RTF computations below divide by the sample rate and by
    the duration, so an empty wave or a non-positive sample rate must be
    rejected here instead of faulting later with a division by zero. }
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
  begin
    WriteLn(StdErr, Format('Failed to read %s, or it contains no samples', [WaveFilename]));
    Halt(1);
  end;

  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  try
    Stream := Recognizer.CreateStream();
    try
      { Only decoding is timed; model loading above is excluded. }
      Start := Now;

      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);

      RecognitionResult := Recognizer.GetResult(Stream);

      Stop := Now;

      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;

      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      { Free resources to avoid memory leak, even if decoding raised.
        The stream is released before the recognizer that created it. }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
end.
74 changes: 74 additions & 0 deletions pascal-api-examples/non-streaming-asr/paraformer.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{ Copyright (c) 2024 Xiaomi Corporation }

{
This file shows how to use a non-streaming Paraformer model
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}

program paraformer;

{ Decodes one test wave file with a non-streaming Paraformer model and prints
  the recognition result together with the elapsed time, the wave duration
  and the resulting real-time factor. }

{$mode objfpc}

uses
  sherpa_onnx,
  DateUtils,
  SysUtils;

var
  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;

  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  Start: TDateTime;
  Stop: TDateTime;

  Elapsed: Single;
  Duration: Single;
  RealTimeFactor: Single;
begin
  Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
  Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  WaveFilename := './sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/3-sichuan.wav';

  Wave := SherpaOnnxReadWave(WaveFilename);

  { The duration and RTF computations below divide by the sample rate and by
    the duration, so an empty wave or a non-positive sample rate must be
    rejected here instead of faulting later with a division by zero. }
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
  begin
    WriteLn(StdErr, Format('Failed to read %s, or it contains no samples', [WaveFilename]));
    Halt(1);
  end;

  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  try
    Stream := Recognizer.CreateStream();
    try
      { Only decoding is timed; model loading above is excluded. }
      Start := Now;

      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);

      RecognitionResult := Recognizer.GetResult(Stream);

      Stop := Now;

      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;

      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      { Free resources to avoid memory leak, even if decoding raised.
        The stream is released before the recognizer that created it. }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
end.
Loading

0 comments on commit a7dc6c2

Please sign in to comment.