Commit

Add TTS API and examples for Dart (#1010)
csukuangfj authored Jun 15, 2024
1 parent e307767 commit e52d32b
Showing 20 changed files with 874 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .github/scripts/test-dart.sh
@@ -4,6 +4,22 @@ set -ex

cd dart-api-examples

pushd tts

echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*

echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*

echo '----------zh tts----------'
./run-zh.sh
rm -rf sherpa-onnx-*

popd # tts

pushd streaming-asr

echo '----------streaming zipformer ctc HLG----------'
1 change: 1 addition & 0 deletions .github/workflows/test-dart.yaml
@@ -92,5 +92,6 @@ jobs:
cp scripts/dart/vad-pubspec.yaml dart-api-examples/vad/pubspec.yaml
cp scripts/dart/non-streaming-asr-pubspec.yaml dart-api-examples/non-streaming-asr/pubspec.yaml
cp scripts/dart/streaming-asr-pubspec.yaml dart-api-examples/streaming-asr/pubspec.yaml
cp scripts/dart/tts-pubspec.yaml dart-api-examples/tts/pubspec.yaml
.github/scripts/test-dart.sh
1 change: 1 addition & 0 deletions dart-api-examples/streaming-asr/README.md
@@ -1,6 +1,7 @@
# Introduction

This folder contains examples for streaming ASR with the Dart API.

| File | Description|
|------|------------|
|[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)|
3 changes: 3 additions & 0 deletions dart-api-examples/tts/.gitignore
@@ -0,0 +1,3 @@
# https://dart.dev/guides/libraries/private-files
# Created by `dart pub`
.dart_tool/
3 changes: 3 additions & 0 deletions dart-api-examples/tts/CHANGELOG.md
@@ -0,0 +1,3 @@
## 1.0.0

- Initial version.
10 changes: 10 additions & 0 deletions dart-api-examples/tts/README.md
@@ -0,0 +1,10 @@
# Introduction

This folder contains examples for text-to-speech (TTS) with the Dart API. All three examples share the same basic flow; a condensed sketch follows the table below.

| File | Description|
|------|------------|
|[./bin/piper.dart](./bin/piper.dart)| Use a Piper TTS model for text-to-speech. See [./run-piper.sh](./run-piper.sh)|
|[./bin/coqui.dart](./bin/coqui.dart)| Use a Coqui TTS model for text-to-speech. See [./run-coqui.sh](./run-coqui.sh)|
|[./bin/zh.dart](./bin/zh.dart)| Use a Chinese VITS TTS model for text-to-speech. See [./run-zh.sh](./run-zh.sh)|
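
The condensed sketch below is not one of the committed files; it only restates the calls that appear in `piper.dart`, `coqui.dart`, and `zh.dart` further down, with placeholder paths and text, and it omits per-example options such as `dataDir`, `lexicon`, and rule FSTs.

```dart
// Condensed sketch of the flow shared by piper.dart, coqui.dart, and zh.dart.
// Paths and text are placeholders; each real example also awaits
// initSherpaOnnx() from bin/init.dart before touching the bindings.
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

void main() {
  final vits = sherpa_onnx.OfflineTtsVitsModelConfig(
    model: 'path/to/model.onnx',
    tokens: 'path/to/tokens.txt',
    lengthScale: 1.0, // the examples pass 1 / speed
  );

  final config = sherpa_onnx.OfflineTtsConfig(
    model: sherpa_onnx.OfflineTtsModelConfig(vits: vits, numThreads: 1),
    maxNumSenetences: 1, // spelling follows the binding used by these examples
  );

  final tts = sherpa_onnx.OfflineTts(config);
  final audio = tts.generate(text: 'hello', sid: 0, speed: 1.0);
  tts.free();

  sherpa_onnx.writeWave(
    filename: 'out.wav',
    samples: audio.samples,
    sampleRate: audio.sampleRate,
  );
}
```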

30 changes: 30 additions & 0 deletions dart-api-examples/tts/analysis_options.yaml
@@ -0,0 +1,30 @@
# This file configures the static analysis results for your project (errors,
# warnings, and lints).
#
# This enables the 'recommended' set of lints from `package:lints`.
# This set helps identify many issues that may lead to problems when running
# or consuming Dart code, and enforces writing Dart using a single, idiomatic
# style and format.
#
# If you want a smaller set of lints you can change this to specify
# 'package:lints/core.yaml'. These are just the most critical lints
# (the recommended set includes the core lints).
# The core lints are also what is used by pub.dev for scoring packages.

include: package:lints/recommended.yaml

# Uncomment the following section to specify additional rules.

# linter:
# rules:
# - camel_case_types

# analyzer:
# exclude:
# - path/to/excluded/files/**

# For more information about the core and recommended set of lints, see
# https://dart.dev/go/core-lints

# For additional information about configuring this file, see
# https://dart.dev/guides/language/analysis-options
69 changes: 69 additions & 0 deletions dart-api-examples/tts/bin/coqui.dart
@@ -0,0 +1,69 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

void main(List<String> arguments) async {
await initSherpaOnnx();

final parser = ArgParser()
..addOption('model', help: 'Path to the ONNX model')
..addOption('tokens', help: 'Path to tokens.txt')
..addOption('text', help: 'Text to generate TTS for')
..addOption('output-wav', help: 'Filename to save the generated audio')
..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
..addOption(
'sid',
help: 'Speaker ID to select. Used only for multi-speaker TTS',
defaultsTo: '0',
);
final res = parser.parse(arguments);
if (res['model'] == null ||
res['tokens'] == null ||
res['output-wav'] == null ||
res['text'] == null) {
print(parser.usage);
exit(1);
}
final model = res['model'] as String;
final tokens = res['tokens'] as String;
final text = res['text'] as String;
final outputWav = res['output-wav'] as String;
var speed = double.tryParse(res['speed'] as String) ?? 1.0;
final sid = int.tryParse(res['sid'] as String) ?? 0;

if (speed == 0) {
speed = 1.0;
}

final vits = sherpa_onnx.OfflineTtsVitsModelConfig(
model: model,
tokens: tokens,
lengthScale: 1 / speed,
);

final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
vits: vits,
numThreads: 1,
debug: true,
);
final config = sherpa_onnx.OfflineTtsConfig(
model: modelConfig,
maxNumSenetences: 1,
);

final tts = sherpa_onnx.OfflineTts(config);
final audio = tts.generate(text: text, sid: sid, speed: speed);
tts.free();

sherpa_onnx.writeWave(
filename: outputWav,
samples: audio.samples,
sampleRate: audio.sampleRate,
);
print('Saved to ${outputWav}');
}
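
A small variation on the example above, shown only as a sketch (`synthesizeToWav` is not part of the package): wrapping the same calls in try/finally guarantees the native handle is freed even if generation or writing the wave file throws.

```dart
// Sketch only: the same generate-and-save flow as coqui.dart, with try/finally
// so OfflineTts.free() runs even when an exception is thrown.
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

void synthesizeToWav(
  sherpa_onnx.OfflineTtsConfig config,
  String text,
  int sid,
  double speed,
  String outputWav,
) {
  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);
    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    tts.free();
  }
}
```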
1 change: 1 addition & 0 deletions dart-api-examples/tts/bin/init.dart
80 changes: 80 additions & 0 deletions dart-api-examples/tts/bin/piper.dart
@@ -0,0 +1,80 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

void main(List<String> arguments) async {
await initSherpaOnnx();

final parser = ArgParser()
..addOption('model', help: 'Path to the ONNX model')
..addOption('tokens', help: 'Path to tokens.txt')
..addOption('data-dir', help: 'Path to espeak-ng-data directory')
..addOption('text', help: 'Text to generate TTS for')
..addOption('output-wav', help: 'Filename to save the generated audio')
..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
..addOption(
'sid',
help: 'Speaker ID to select. Used only for multi-speaker TTS',
defaultsTo: '0',
);
final res = parser.parse(arguments);
if (res['model'] == null ||
res['tokens'] == null ||
res['data-dir'] == null ||
res['output-wav'] == null ||
res['text'] == null) {
print(parser.usage);
exit(1);
}
final model = res['model'] as String;
final tokens = res['tokens'] as String;
final dataDir = res['data-dir'] as String;
final text = res['text'] as String;
final outputWav = res['output-wav'] as String;
var speed = double.tryParse(res['speed'] as String) ?? 1.0;
final sid = int.tryParse(res['sid'] as String) ?? 0;

if (speed == 0) {
speed = 1.0;
}

final vits = sherpa_onnx.OfflineTtsVitsModelConfig(
model: model,
tokens: tokens,
dataDir: dataDir,
lengthScale: 1 / speed,
);

final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
vits: vits,
numThreads: 1,
debug: true,
);
final config = sherpa_onnx.OfflineTtsConfig(
model: modelConfig,
maxNumSenetences: 1,
);

final tts = sherpa_onnx.OfflineTts(config);
final audio = tts.generateWithCallback(
text: text,
sid: sid,
speed: speed,
callback: (Float32List samples) {
print('${samples.length} samples received');
// You can play samples in a separate thread/isolate
});
tts.free();

sherpa_onnx.writeWave(
filename: outputWav,
samples: audio.samples,
sampleRate: audio.sampleRate,
);
print('Saved to ${outputWav}');
}
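
The callback above only prints the chunk size, and the inline comment points at playing the samples in a separate thread/isolate. Because `generateWithCallback` blocks the calling isolate, one way to keep the main isolate responsive is to run generation in a worker isolate and ship each chunk back over a `SendPort`. The sketch below is not part of this commit; the playback hook is a placeholder, and it assumes the worker isolate must repeat the same `initSherpaOnnx()` setup as `main()`.

```dart
// Sketch: run blocking TTS generation in a worker isolate and stream each
// chunk of samples back to the main isolate for playback or buffering.
import 'dart:isolate';
import 'dart:typed_data';

import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

void _ttsWorker(List<Object> args) {
  final sendPort = args[0] as SendPort;
  final text = args[1] as String;
  final model = args[2] as String;
  final tokens = args[3] as String;
  final dataDir = args[4] as String;

  // Assumption: the worker also needs to repeat the initSherpaOnnx()
  // setup from bin/init.dart before using the bindings.

  final config = sherpa_onnx.OfflineTtsConfig(
    model: sherpa_onnx.OfflineTtsModelConfig(
      vits: sherpa_onnx.OfflineTtsVitsModelConfig(
        model: model,
        tokens: tokens,
        dataDir: dataDir,
      ),
      numThreads: 1,
    ),
    maxNumSenetences: 1, // spelling follows the binding used by piper.dart
  );

  final tts = sherpa_onnx.OfflineTts(config);
  tts.generateWithCallback(
    text: text,
    sid: 0,
    speed: 1.0,
    callback: (Float32List samples) {
      sendPort.send(samples); // typed data can be sent between isolates
    },
  );
  tts.free();
  sendPort.send(null); // signal completion
}

Future<void> streamTts(
  String text,
  String model,
  String tokens,
  String dataDir,
) async {
  final receivePort = ReceivePort();
  await Isolate.spawn(
    _ttsWorker,
    [receivePort.sendPort, text, model, tokens, dataDir],
  );

  await for (final message in receivePort) {
    if (message == null) break; // worker finished
    final samples = message as Float32List;
    // Placeholder: feed `samples` to an audio sink or player here.
    print('received a chunk with ${samples.length} samples');
  }
  receivePort.close();
}
```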
86 changes: 86 additions & 0 deletions dart-api-examples/tts/bin/zh.dart
@@ -0,0 +1,86 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

void main(List<String> arguments) async {
await initSherpaOnnx();

final parser = ArgParser()
..addOption('model', help: 'Path to the ONNX model')
..addOption('tokens', help: 'Path to tokens.txt')
..addOption('lexicon', help: 'Path to lexicon.txt')
..addOption(
'dict-dir',
help: 'Path to jieba dict directory',
defaultsTo: '',
)
..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
..addOption('text', help: 'Text to generate TTS for')
..addOption('output-wav', help: 'Filename to save the generated audio')
..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
..addOption(
'sid',
help: 'Speaker ID to select. Used only for multi-speaker TTS',
defaultsTo: '0',
);
final res = parser.parse(arguments);
if (res['model'] == null ||
res['lexicon'] == null ||
res['tokens'] == null ||
res['output-wav'] == null ||
res['text'] == null) {
print(parser.usage);
exit(1);
}
final model = res['model'] as String;
final lexicon = res['lexicon'] as String;
final tokens = res['tokens'] as String;
final dictDir = res['dict-dir'] as String;
final ruleFsts = res['rule-fsts'] as String;
final ruleFars = res['rule-fars'] as String;
final text = res['text'] as String;
final outputWav = res['output-wav'] as String;
var speed = double.tryParse(res['speed'] as String) ?? 1.0;
final sid = int.tryParse(res['sid'] as String) ?? 0;

if (speed == 0) {
speed = 1.0;
}

final vits = sherpa_onnx.OfflineTtsVitsModelConfig(
model: model,
lexicon: lexicon,
tokens: tokens,
dictDir: dictDir,
lengthScale: 1 / speed,
);

final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
vits: vits,
numThreads: 1,
debug: true,
);
final config = sherpa_onnx.OfflineTtsConfig(
model: modelConfig,
maxNumSenetences: 1,
ruleFsts: ruleFsts,
ruleFars: ruleFars,
);

final tts = sherpa_onnx.OfflineTts(config);
final audio = tts.generate(text: text, sid: sid, speed: speed);
tts.free();

sherpa_onnx.writeWave(
filename: outputWav,
samples: audio.samples,
sampleRate: audio.sampleRate,
);
print('Saved to ${outputWav}');
}
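
Since the generated audio object exposes `samples` and `sampleRate`, it is easy to report how much audio was produced and a rough real-time factor. The helper below is only a sketch (`reportStats` is not part of the examples); it reuses the same `generate` call shown above plus `Stopwatch` from `dart:core`.

```dart
// Sketch: time generation and report audio duration plus a rough
// real-time factor (elapsed wall-clock time / seconds of audio produced).
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

void reportStats(
  sherpa_onnx.OfflineTts tts,
  String text,
  int sid,
  double speed,
) {
  final stopwatch = Stopwatch()..start();
  final audio = tts.generate(text: text, sid: sid, speed: speed);
  stopwatch.stop();

  final audioSeconds = audio.samples.length / audio.sampleRate;
  final elapsedSeconds = stopwatch.elapsedMilliseconds / 1000.0;
  print('Generated ${audioSeconds.toStringAsFixed(2)} s of audio '
      'in ${elapsedSeconds.toStringAsFixed(2)} s '
      '(RTF: ${(elapsedSeconds / audioSeconds).toStringAsFixed(3)})');
}
```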