# Patch summary (text before the first "diff --git" header is skipped by patch tools).
#
# 1. android/SherpaOnnxTts/.../Tts.kt
#    - OfflineTtsConfig: default of maxNumSentences changes from 2 to 1.
#
# 2. scripts/apk/generate-tts-apk-script.py
#    - Adds get_coqui_models(): builds three TtsModel entries
#      (vits-coqui-en-ljspeech, vits-coqui-en-ljspeech-neon, vits-coqui-en-vctk;
#      vits-coqui-en-jenny stays commented out) and then sets, on each entry,
#      data_dir = model_dir + "/" + "espeak-ng-data", model_name = "model.onnx",
#      lang = "en".
#    - get_piper_models(): only a blank line is inserted before the `for m in models:` loop.
#    - get_all_models() is renamed to get_vits_models(), and the coqui entries
#      (which previously set no data_dir) are removed from its list.
#    - main(): all_model_list is now get_vits_models() + get_piper_models()
#      + get_coqui_models(), instead of get_piper_models() alone.
#
# 3. sherpa-onnx/csrc/piper-phonemize-lexicon.cc
#    - CoquiPhonemesToIds(): looks up comma_id via token2id.at(','), logs it with
#      SHERPA_ONNX_LOGE, and pushes comma_id onto `ans` just before the optional
#      eos_id, "so that we can have a longer pause" (per the added comment).
#
# NOTE(review): the hunks below appear to have lost their line breaks and
#   indentation (the whole patch sits on two physical lines) -- presumably an
#   extraction artifact; confirm against the original patch before applying.
# NOTE(review): `std::vector CoquiPhonemesToIds(` and `std::vector ans;` carry no
#   template argument here -- presumably stripped during extraction; TODO confirm.
# NOTE(review): SHERPA_ONNX_LOGE("comma id: %d", comma_id) looks like leftover
#   debug output that would run on every call -- consider dropping it before merge.
# NOTE(review): token2id's type is not visible in this patch; if it is a standard
#   map, at(',') throws when the model's token table has no ',' -- verify that
#   every Coqui model defines a comma token, or guard the lookup.
# NOTE(review): comma_id is appended unconditionally; whether a blank_id should
#   follow it when add_blank is set cannot be determined from this hunk -- confirm.
diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt index 132e8b06a..d08e803a5 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt @@ -23,7 +23,7 @@ data class OfflineTtsModelConfig( data class OfflineTtsConfig( var model: OfflineTtsModelConfig, var ruleFsts: String = "", - var maxNumSentences: Int = 2, + var maxNumSentences: Int = 1, ) class GeneratedAudio( diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 958f55475..fd24be887 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -33,6 +33,23 @@ class TtsModel: data_dir: Optional[str] = None +def get_coqui_models() -> List[TtsModel]: + # English (coqui-ai/TTS) + models = [ + TtsModel(model_dir="vits-coqui-en-ljspeech"), + TtsModel(model_dir="vits-coqui-en-ljspeech-neon"), + TtsModel(model_dir="vits-coqui-en-vctk"), + # TtsModel(model_dir="vits-coqui-en-jenny"), + ] + + for m in models: + m.data_dir = m.model_dir + "/" + "espeak-ng-data" + m.model_name = "model.onnx" + m.lang = "en" + + return models + + def get_piper_models() -> List[TtsModel]: models = [ TtsModel(model_dir="vits-piper-ar_JO-kareem-low"), @@ -137,6 +154,7 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-vi_VN-vivos-x_low"), TtsModel(model_dir="vits-piper-zh_CN-huayan-medium"), ] + for m in models: m.data_dir = m.model_dir + "/" + "espeak-ng-data" m.model_name = m.model_dir[len("vits-piper-") :] + ".onnx" @@ -145,7 +163,7 @@ def get_piper_models() -> List[TtsModel]: return models -def get_all_models() -> List[TtsModel]: +def get_vits_models() -> List[TtsModel]: return [ # Chinese TtsModel( @@ -202,12 +220,6 @@ def get_all_models() -> List[TtsModel]: lang="zh", rule_fsts="vits-zh-hf-theresa/rule.fst", ), - # English 
(coqui-ai/TTS) - # fmt: off - TtsModel(model_dir="vits-coqui-en-ljspeech", model_name="model.onnx", lang="en"), - TtsModel(model_dir="vits-coqui-en-ljspeech-neon", model_name="model.onnx", lang="en"), - TtsModel(model_dir="vits-coqui-en-vctk", model_name="model.onnx", lang="en"), - # TtsModel(model_dir="vits-coqui-en-jenny", model_name="model.onnx", lang="en"), # English (US) TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"), TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"), @@ -225,8 +237,11 @@ def main(): s = f.read() template = environment.from_string(s) d = dict() - # all_model_list = get_all_models() - all_model_list = get_piper_models() + + all_model_list = get_vits_models() + all_model_list += get_piper_models() + all_model_list += get_coqui_models() + num_models = len(all_model_list) num_per_runner = num_models // total diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index c27d92a11..20512e6cb 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -122,6 +122,8 @@ static std::vector CoquiPhonemesToIds( int32_t eos_id = meta_data.eos_id; int32_t blank_id = meta_data.blank_id; int32_t add_blank = meta_data.add_blank; + int32_t comma_id = token2id.at(','); + SHERPA_ONNX_LOGE("comma id: %d", comma_id); std::vector ans; if (add_blank) { @@ -158,6 +160,9 @@ static std::vector CoquiPhonemesToIds( } } + // add a comma at the end of a sentence so that we can have a longer pause. + ans.push_back(comma_id); + if (use_eos_bos) { ans.push_back(eos_id); }