diff --git a/dev/guides/schemas/index.html b/dev/guides/schemas/index.html
index 209bb085..a337bfb0 100644
--- a/dev/guides/schemas/index.html
+++ b/dev/guides/schemas/index.html
@@ -1006,59 +1006,59 @@
{
"json.schemas": [
{
- "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-schema-0.1.json",
+ "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-0.1.json",
"fileMatch": [
"everyvoice-aligner.json"
]
},
{
- "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-schema-0.1.json",
+ "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-0.1.json",
"fileMatch": [
"everyvoice-shared-data.json"
]
},
{
- "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-schema-0.1.json",
+ "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-0.1.json",
"fileMatch": [
"everyvoice-shared-text.json"
]
},
{
- "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json",
+ "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-0.1.json",
"fileMatch": [
"everyvoice-spec-to-wav.json"
]
},
{
- "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json",
+ "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-0.1.json",
"fileMatch": [
"everyvoice-text-to-spec.json"
]
},
{
- "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json",
+ "url": "file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-0.1.json",
"fileMatch": [
"everyvoice-text-to-wav.json"
]
}
],
"yaml.schemas": {
- "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-schema-0.1.json": [
+ "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-0.1.json": [
"everyvoice-aligner.yaml"
],
- "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-schema-0.1.json": [
+ "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-0.1.json": [
"everyvoice-shared-data.yaml"
],
- "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-schema-0.1.json": [
+ "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-0.1.json": [
"everyvoice-shared-text.yaml"
],
- "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json": [
+ "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-0.1.json": [
"everyvoice-spec-to-wav.yaml"
],
- "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json": [
+ "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-0.1.json": [
"everyvoice-text-to-spec.yaml"
],
- "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json": [
+ "file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-0.1.json": [
"everyvoice-text-to-wav.yaml"
]
}
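The hunk above drops the "-schema" suffix from each schema filename (e.g. everyvoice-aligner-schema-0.1.json becomes everyvoice-aligner-0.1.json). If an existing Coc settings file still points at the old filenames, a one-line rewrite along these lines brings it up to date. This is only a hedged sketch, assuming the settings live at ~/.vim/coc-settings.json (Neovim and custom setups use a different path); a .bak backup copy is written alongside the file.

# Sketch (path is an assumption): strip the "-schema" suffix from EveryVoice
# schema URLs in an existing Coc settings file, keeping a .bak backup.
sed -i.bak 's/\(everyvoice-[a-z-]*\)-schema-0\.1\.json/\1-0.1.json/g' ~/.vim/coc-settings.json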
diff --git a/dev/search/search_index.json b/dev/search/search_index.json
index 2f8ae903..9f891b6c 100644
--- a/dev/search/search_index.json
+++ b/dev/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Getting Started","text":"Welcome to the EveryVoice TTS Toolkit Documentation page! Please read the background section below to hear a bit about how this project got started, or head over to the guides section to find a guide to help you get started.
"},{"location":"#background","title":"Background","text":"There are approximately 70 Indigenous languages spoken in Canada from 10 distinct language families. As a consequence of the residential school system and other policies of cultural suppression, the majority of these languages now have fewer than 500 fluent speakers remaining, most of them elderly.
Despite this, Indigenous people have resisted colonial policies and continued speaking their languages, with interest by students and parents in Indigenous language education continuing to grow. Teachers are often overwhelmed by the number of students, and the trend towards online education means many students who have not previously had access to language classes now do. Supporting these growing cohorts of students comes with unique challenges in languages with few fluent first-language speakers. Teachers are particularly concerned with providing their students with opportunities to hear the language outside of class.
While there is no replacement for a speaker of an Indigenous language, there are possible applications for speech synthesis (text-to-speech) to supplement existing text-based tools like verb conjugators, dictionaries and phrasebooks.
The National Research Council has partnered with the Onkwawenna Kentyohkwa Kanyen\u2019k\u00e9ha immersion school, W\u0331S\u00c1NE\u0106 School Board, University nuhelot\u2019\u012fne thaiyots\u2019\u012f nistameyim\u00e2kanak Blue Quills, and the University of Edinburgh to research and develop state-of-the-art speech synthesis (text-to-speech) systems and techniques for Indigenous languages in Canada, with a focus on how to integrate text-to-speech technology into the classroom.
The project is titled Speech Generation for Indigenous Language Education (SGILE) and the EveryVoice TTS toolkit is one of the products of this collaboration. Detailed information about this project can be found in our recent submission to Computer Speech & Language (Under Review).
"},{"location":"#what-is-everyvoice-and-this-documentation-about","title":"What is EveryVoice and this documentation about?","text":"This project is the location of active research and development for the Speech Generation for Indigenous Language Education project. In addition to being a model for this project, it is meant to outline repeatable recipes for other communities and languages to develop their own text-to-speech systems. This documentation describes guides for how to do this.
Note
We are trying to develop a tool that makes the developer experience as smooth as possible. But, building these models and creating your datasets can be complicated. We recommend you are comfortable with Python and using the command line before starting on this project.
"},{"location":"#similar-projects-exist-why-create-another-one","title":"Similar projects exist, why create another one?","text":"It is true that similar excellent projects exist, such as ESPnet, \ud83d\udc38TTS, Comprehensive-Transformer-TTS, and IMS-Toucan among others. Our reasons for creating our own are multi-fold (some of the following features are implemented in the aforementioned projects, but not every one of these features is supported in any of them):
- The EveryVoice TTS toolkit comes with a configuration wizard which helps configure the toolkit to new datasets in new languages.
- We support a heterogeneous source of data, meaning you (hopefully) have to do less work to wrangle data together. The configuration wizard supports multi-dataset configuration.
- We support out-of-the-box integration with g2p which allows the g2p rules for 30+ Indigenous languages to be used in the project.
- We will not try to implement many different models. Instead, we will curate a model architecture that we believe to be best for training models on under-resourced languages. In this way we are more similar to IMS-Toucan than to ESPnet.
- We use a custom, statically-typed configuration architecture between models written in Pydantic that allows for configuration validation and serialization/de-serialization to json and yaml. It also allows us to ensure the same configuration for text and audio processing is used between models.
- We implement our models in PyTorch Lightning
For a detailed comparison of selected features of EveryVoice and other toolkits please see Appendix B & C in our recent paper submission.
Note
These features do not necessarily mean that this is the right project for you. The other projects mentioned are of very high quality and might be a better fit for your project, particularly if you are lucky enough to have lots of data, or a language that is already supported.
"},{"location":"install/","title":"Installation","text":"In order to use EveryVoice on GPUs, you must install PyTorch and Cuda, Python 3.10 or more recent, a number of other dependencies, and EveryVoice itself. The following sections describe three ways to accomplish this:
"},{"location":"install/#scripted-installation-recommended","title":"Scripted installation -- recommended","text":"The EveryVoice installation process can be somewhat involved, so we have automated it as much as we could.
- Install miniconda or conda.
- Clone the EveryVoice repo and its submodules:
git clone https://github.com/roedoejet/EveryVoice.git\ncd EveryVoice\ngit submodule update --init\n
- Run our automated environment creation script
./make-everyvoice-env --name EveryVoice\nconda activate EveryVoice\n
Add the option --cuda CUDA_VERSION
if you need to override the default CUDA version, or --cpu
to use Torch compiled for CPU use only.
"},{"location":"install/#using-pip","title":"Using Pip","text":"We hope direct installation from PyPI will work:
-
Follow the PyTorch installation instructions relevant to your hardware, but specify version 2.1.0: torch==2.1.0
, and torchaudio==2.1.0
.
-
Install EveryVoice:
pip install everyvoice==0.1.0a
"},{"location":"install/#manual-installation","title":"Manual installation","text":"If you prefer to do the complete installation process manually, or if the automated process does not work for you, follow these steps:
"},{"location":"install/#install-conda","title":"Install Conda","text":"Install miniconda or conda.
"},{"location":"install/#create-the-environment","title":"Create the environment","text":"Use conda to create a new environment based on Python 3.10:
conda create --name EveryVoice python=3.10\nconda activate EveryVoice\n
"},{"location":"install/#pytorch-dependencies","title":"Pytorch dependencies","text":"Install our pytorch requirements from requirements.torch.txt
, replacing cu118
below (for CUDA 11.8) by your actual CUDA version tag (118 or higher), or by cpu
for a CPU-only installation:
CUDA_TAG=cu118 pip install -r requirements.torch.txt --find-links https://download.pytorch.org/whl/torch_stable.html\n
Alternatively, you can follow the PyTorch installation instructions relevant to your hardware. Make sure you specify the version declared in requirements.torch.txt
, which is 2.1.0 at the moment.
"},{"location":"install/#other-potentially-tricky-dependencies","title":"Other potentially tricky dependencies","text":"These requirements sometimes require being run separately:
pip install cython\nconda install sox -c conda-forge\n
"},{"location":"install/#handling-running-out-of-temp-disk-space","title":"Handling running out of temp disk space","text":"Installation will require a fair bit of space on ~/.cache
and your $TMPDIR
(/tmp
by default, if $TMPDIR
is not set). If you get the error OSError: [Errno 28] No space left on device
during installation, you may need to do one or both of these operations: - export TMPDIR=/path/to/a/large/tmp/space
(or maybe export TMPDIR=.
) - mkdir /path/to/a/large/filesystem/.cache; ln -s /path/to/a/large/filesystem/.cache ~/.cache
"},{"location":"install/#install-everyvoice-itself","title":"Install EveryVoice itself","text":"Install EveryVoice locally from your cloned sandbox:
pip install -e .\n
"},{"location":"install/#dev-dependencies","title":"Dev dependencies","text":"Before you can run the test suites, you'll also need to install the dev dependencies:
pip install -r requirements.dev.txt\n
"},{"location":"install/#git-hooks","title":"Git hooks","text":"If you plan to contribute to the project, please install our Git hooks:
pre-commit install\ngitlint install-hook\ngit submodule foreach 'pre-commit install'\ngit submodule foreach 'gitlint install-hook'\n
"},{"location":"guides/","title":"Guides","text":"Here are a selection of guides to help you through the process of training and using your own text-to-speech models.
-
Should you build a TTS Model? What are the possible negative outcomes?
This guide is a primer for the ethical questions related to building a TTS system. Please read this section first before doing anything else!
-
Background to TTS
This guide points to resources for learning more about the technical side of how text-to-speech works.
-
How to create a TTS system for your language
This guide provides in-depth information about how to build a TTS system for your language using the EveryVoice TTS Toolkit.
-
How to fine-tune
This guide provides information on how to fine-tune your models. It is an advanced, but recommended, step for building a TTS system.
"},{"location":"guides/background/","title":"Background to Text-to-Speech","text":"Consider what is required in order for speech-based communication to work. A speaker decides to utter a word, contracts their diaphragm to pull air into their lungs, and upon exhaling, returns the air through their vocal tract. They then contort their vocal tract in highly specific ways to reach a series of articulatory targets that they have associated with a particular meaning. The flow of air past these orchestrated contortions causes pressure fluctuations at varying frequencies that, upon impinging on the listener's ear drums, are processed and understood to represent the same meaning the speaker intended - magic!
The idea of creating machines to simulate speech has origins as early as the 18th century when Hungarian inventor Wolfgang von Kempelen created his 'speaking machine' to woo crowds. Speech synthesis has since made tremendous gains, and is now employed to solve a variety of real world problems. While von Kempelen's machine attempted to replicate the anatomy required for speech, modern techniques use computers to work with discrete representations of sound and the last decade of improvements to speech synthesis have grown in tandem with the progress of the field of neural network-based machine learning.
We intend to update this section with a variety of resources to help provide background on Text-to-Speech (TTS). In the meantime, please visit this excellent TTS primer from the NVIDIA NeMo TTS toolkit. If you are interested in more in-depth learning about TTS and speech processing, we recommend the Speech Processing and Speech Synthesis courses on Speech Zone.
"},{"location":"guides/custom/","title":"Customize to your language","text":""},{"location":"guides/custom/#step-1-make-sure-you-have-permission","title":"Step 1: Make sure you have Permission!","text":"So, you want to build a text-to-speech system for a new language or dataset - cool! But, just because you can build a text-to-speech system, doesn't mean you should. There are a lot of important ethical questions around text-to-speech. For example, it's not ethical to just use audio you find somewhere online if it doesn't have explicit permission to use it for the purposes of text-to-speech. The first step is always to make sure you have permission to use the data in question and that whoever contributed their voice to the data you want to use is aware and supportive of your goal.
Creating a text-to-speech model without permission is unethical, but even when you do have permission, you should take great care in how you distribute the model you have created. Increasingly, text-to-speech technology is used in fraud and unauthorized impersonation. The technology has also been used to disenfranchise voice actors and other professionals. When you create an EveryVoice model, you are responsible for ensuring the model is only used and distributed according to the permissions you have. To help with this accountability, you will be required by EveryVoice to attest that you have permission to use your data and to provide a full name and contact information that will also be distributed with the model.
In addition, we invite you to check out our short guide that contains prompts about ethical questions before starting on any of the next steps.
"},{"location":"guides/custom/#step-2-gather-your-data","title":"Step 2: Gather Your Data","text":"The first thing to do is to get all the data you have (in this case audio with text transcripts) together in one place. Your audio should be in a lossless 'wav' format. Ideally it would be 16bit, mono (one channel) audio sampled somewhere between 22.05kHz and 48kHz. If that doesn't mean anything to you, don't worry, we can ensure the right format in later steps. It's best if your audio clips are somewhere between half a second and 10 seconds long. Any longer and it could be difficult to train. If your audio is longer than this, we suggest processing it into smaller chunks first.
Your text should be consistently written and should be in a pipe-separated values spreadsheet, similar to this file. It should have a column that contains text and a column that contains the basename
of your associated audio file. So if you have a recording of somebody saying \"hello how are you?\" and the corresponding audio is called mydata0001.wav
then you should have a psv file that looks like this:
basename|text\nmydata0001|hello how are you?\nmydata0002|some other sentence.\n...\n
We also support comma and tab separated files, but recommend using pipes (|).
You can also use the \"festival\" format which is like this (example from Sinhala TTS):
( sin_2241_0329430812 \" \u0d9a\u0ddd\u0d9a\u0da7\u0dad\u0dca \u0db8\u0d82 \u0dc0\u0dd9\u0db1\u0daf\u0dcf \u0dad\u0dbb\u0db8\u0dca \u0d9a\u0dcf\u0dbd\u0dd9 \u0d9c\u0db1\u0dca\u0db1\u0dd0\u0dad\u0dd2\u0dc0 \u0d87\u0db3 \u0d9c\u0dad\u0dca\u0dad\u0dcf \" )\n( sin_2241_0598895166 \" \u0d87\u0db1\u0dca\u0da2\u0dbd\u0dd3\u0db1\u0dcf \u0da2\u0ddc\u0dbd\u0dd3 \u0d9a\u0dd2\u0dba\u0db1\u0dca\u0db1\u0dda \u0db4\u0dc3\u0dd4\u0d9c\u0dd2\u0dba \u0daf\u0dd2\u0db1\u0dc0\u0dbd \u0db6\u0ddc\u0dc4\u0ddd \u0dc3\u0dd9\u0dba\u0dd2\u0db1\u0dca \u0d9a\u0dad\u0dcf \u0db6\u0dc4\u0da7 \u0dbd\u0d9a\u0dca\u0dc0\u0dd6 \u0da0\u0dbb\u0dd2\u0dad\u0dba\u0d9a\u0dca \" )\n( sin_2241_0701577369 \" \u0d86\u0dbb\u0dca\u0dae\u0dd2\u0d9a \u0da0\u0dd2\u0db1\u0dca\u0dad\u0db1\u0dba \u0dc4\u0dcf \u0dc3\u0dcf\u0db8\u0dcf\u0da2\u0dd3\u0dba \u0daf\u0dd2\u0dba\u0dd4\u0dab\u0dd4\u0dc0 \u0d87\u0dad\u0dd2 \u0d9a\u0dc5 \u0dc4\u0dd0\u0d9a\u0dd2\u0dc0\u0db1\u0dd4\u0dba\u0dda \u0db4\u0dd4\u0daf\u0dca\u0d9c\u0dbd \u0d86\u0dbb\u0dca\u0dae\u0dd2\u0d9a \u0daf\u0dd2\u0dba\u0dd4\u0dab\u0dd4\u0dc0 \u0dc3\u0dbd\u0dc3\u0dcf \u0daf\u0dd3\u0db8\u0dd9\u0db1\u0dca\u0dba \" )\n( sin_2241_0715400935 \" \u0d89\u0db1\u0dca \u0d85\u0daf\u0dc4\u0dc3\u0dca \u0dc0\u0db1\u0dca\u0db1\u0dda \u0dc0\u0dd2\u0da0\u0dcf\u0dbb\u0dcf\u0dad\u0dca\u0db8\u0d9a \u0dc0\u0dd2\u0db1\u0dd2\u0dc0\u0dd2\u0daf \u0daf\u0dd0\u0d9a\u0dd3\u0db8\u0dd9\u0db1\u0dca \u0dad\u0ddc\u0dbb \u0db6\u0dd0\u0dbd\u0dca\u0db8\u0dba\u0dd2 \" )\n( sin_2241_0817100025 \" \u0d85\u0db4 \u0dba\u0dd4\u0daf\u0dca\u0db0\u0dba\u0dda \u0db4\u0dc5\u0db8\u0dd4 \u0db4\u0dd2\u0dba\u0dc0\u0dbb\u0dda\u0daf\u0dd3\u0db8 \u0db4\u0dbb\u0dcf\u0daf \u0dc0\u0dd3 \u0d85\u0dc0\u0dc3\u0dcf\u0db1\u0dba \" )\n
In this format, there are corresponding wav files labelled sin_2241_0329430812.wav, etc.
"},{"location":"guides/custom/#step-3-install-everyvoice","title":"Step 3: Install EveryVoice","text":"Head over to the installation documentation and install EveryVoice
"},{"location":"guides/custom/#step-4-run-the-configuration-wizard","title":"Step 4: Run the Configuration Wizard \ud83e\uddd9","text":"Once you have your data, the best thing to do is to run the Configuration Wizard \ud83e\uddd9 which will help you configure a new project. To do that run:
everyvoice new-project\n
After running the wizard, cd into your newly created directory. Let's call it <your_everyvoice_project>
for now.
cd your_everyvoice_project\n
Important
After you run the Configuration Wizard \ud83e\uddd9, please inspect your text configuration config/everyvoice-shared-text.yaml
to make sure everything looks right. That is, if some unexpected symbols show up, please inspect your data (if you remove symbols from the configuration here, they will be ignored during training). Sometimes characters that are treated as punctuation by default will need to be removed from the punctuation list if they are treated as non-punctuation in your language.
"},{"location":"guides/custom/#step-5-run-the-preprocessor","title":"Step 5: Run the Preprocessor","text":"Your models need to do a number of preprocessing steps in order to prepare for training. To preprocess everything you need, run the following:
everyvoice preprocess config/everyvoice-text-to-spec.yaml\n
"},{"location":"guides/custom/#step-6-select-a-vocoder","title":"Step 6: Select a Vocoder","text":"So you don't need to train your own vocoder, EveryVoice has a variety of publicly released vocoders available here. Follow the instructions there for downloading the checkpoints.
EveryVoice is also compatible out-of-the-box with the UNIVERSAL_V1 HiFiGAN checkpoint from the official HiFiGAN implementation, which is very good quality. You can find the EveryVoice-compatible version of this checkpoint here.
Using a pre-trained vocoder is recommended, and the above checkpoint should work well even for new languages after finetuning.
"},{"location":"guides/custom/#train-your-own-vocoder","title":"Train your own Vocoder","text":"You might want to train your own vocoder, but this takes a long time (up to 2 weeks on a single GPU), uses a lot of electricity, and unless you know what you are doing, you are unlikely to improve upon the publicly available models discussed above, even for a new language. So we do not recommend it. You are almost always better off just using the pre-trained vocoder and then finetuning on the predictions from your feature prediction network. If you really do want to train your own vocoder though, you can run the following command:
everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml\n
By default, we run our training with PyTorch Lightning's \"auto\" strategy. But, if you are on a machine where you know the hardware, you can specify it like:
everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml -d 1 -a gpu\n
Which would use the GPU accelerator (-a gpu
) and specify 1 device/chip (-d 1
).
"},{"location":"guides/custom/#step-7-train-your-feature-prediction-network","title":"Step 7: Train your Feature Prediction Network","text":"To generate audio when you train your feature prediction network, you need to add your vocoder checkpoint to the config/everyvoice-text-to-spec.yaml
At the bottom of that file you'll find a key called vocoder_path
. Add the absolute path to your trained vocoder (here it would be /path/to/test/logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt
where /path/to
would be the actual path to it on your computer).
Once you've replaced the vocoder_path
key, you can train your feature prediction network:
everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml\n
Tip
While your model is training, you can use TensorBoard to view the logs which will show information about the progress of training and display spectrogram images. If you have provided a vocoder_path
key, then you will also be able to hear audio in the logs. To use TensorBoard, make sure that your conda environment is activated and run tensorboard --logdir path/to/logs_and_checkpoints
. Then your logs will be viewable at http://localhost:6006.
"},{"location":"guides/custom/#step-8-optional-finetune-your-vocoder","title":"Step 8 (optional): Finetune your Vocoder","text":"When you have finished training your Feature Prediction Network, we recommend finetuning your vocoder. This step is optional, but it will help get rid of metallic artefacts that are often present if you don't finetune your vocoder. Note, it will likely not help with any mispronounciations. If you notice these types of errors, it is likely due to issues with the training data (e.g. too much variation in pronunciation or recording quality in the dataset, or discrepencies between the recording and transcription.)
"},{"location":"guides/custom/#step-9-synthesize-speech-in-your-language","title":"Step 9: Synthesize Speech in Your Language!","text":""},{"location":"guides/custom/#command-line","title":"Command Line","text":"You can synthesize by pointing the CLI to your trained feature prediction network and passing in the text. You can export the wav or spectrogram (pt) files.
everyvoice synthesize from-text logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt -t \"\u0db8\u0dd9\u0daf\u0dcf \u0dc3\u0dd0\u0dbb\u0dda \u0dc3\u0dcf\u0d9a\u0da0\u0dca\u0da1\u0dcf\u0dc0\u0d9a\u0dca \u0dc0\u0dd2\u0daf\u0dd2\u0dba\u0da7 \u0db1\u0dd9\u0dc0\u0dd9\u0dba\u0dd2 \u0db1\u0dda\u0daf \u0db4\u0dbd \u0d9a\u0dbb\u0dbd \u0dad\u0dd2\u0dba\u0dd9\u0db1\u0dca\u0db1\u0dd9\" -a gpu -d 1 --output-type wav\n
"},{"location":"guides/custom/#demo-app","title":"Demo App","text":"You can also synthesize audio by starting up the EveryVoice Demo using your Feature Prediction and Vocoder checkpoints:
everyvoice demo logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt\n
And an interactive demo will be available at http://localhost:7260
"},{"location":"guides/ethics/","title":"Should you build a TTS Model? What are the possible negative outcomes?","text":"So, you're here because you want to build a TTS system - maybe for a language that doesn't have an existing one, but just because you can build a text-to-speech system, doesn't mean you should. The most important step of a new text-to-speech project is to consider what possible ethical problems could arise from the technology and which goals you are hoping to achieve with it. This section will walk you through some important questions to consider; you might also find that they apply broadly to other technology projects.
"},{"location":"guides/ethics/#check-before-you-tech","title":"Check Before you Tech!","text":"Technology is flashy, and it seems like you can hardly turn a corner without someone talking about AI. However, as the excitment about the possibilities of this technology grow, so too have the cautionary warnings12. Amidst all the flurry of activity and hype - the fundamental question of why are we building technology X, Y, or Z should hopefully come up. What goals are we hoping to achieve, and what new problems might we be introducing with a new technology?
The following sections provide a couple of questions based on the excellent \"Check Before you Tech\" guide for choosing language technology in a language revitalization context. While the original guide is geared towards technology users, we target our questions to technology developers and researchers. We urge you to consider these questions before beginning your TTS project.
Note
This list is not intended to be a comprehensive list of all the ethical questions to consider, but rather a starting point for discussing and considering the impacts of the technology you are potentially creating.
"},{"location":"guides/ethics/#where-is-the-data-coming-from-do-you-have-explicit-permission-from-the-creator-of-the-data-and-the-speaker","title":"Where is the data coming from? Do you have explicit permission from the creator of the data, and the speaker?","text":"It is not ethical to build a TTS system with data that you do not have permission to use. You should not scrape or re-purpose data that you find online to build TTS systems unless the data comes with explicit permissions to do so.
For TTS, you are building a model of someone's likeness, so you should make sure that you have obtained permission from the data creator as well as the speaker whose likeness will be modeled. When permission is asked for, you should be clear with the person(s) about what the technology could be used for.
If you do not have enough time/resources to ask this question and obtain permission from all the relevant stakeholders, you should not build TTS models with the data.
"},{"location":"guides/ethics/#what-is-your-goal-how-will-tts-help-you-meet-that-goal","title":"What is your goal? How will TTS help you meet that goal?","text":"As mentioned above, it's important to think about what you are actually trying to achieve with TTS. Not only will this help you determine whether EveryVoice TTS is the right toolkit for your application, but it will also help you determine whether you need to spend all the time and resources necessary to build a TTS system in the first place.
We invite you to consider whether your goal is serving you or the people whose language you're working with. And if the answer is the latter, how do you know that, and how are you ensuring that it continues to be true? When discussing the project with relevant stakeholders, you should also mention any other goals you have in building this technology (e.g. publishing papers).
"},{"location":"guides/ethics/#where-is-the-model-going-to-be-stored-who-has-control-and-access-to-the-model-and-who-has-ownership","title":"Where is the model going to be stored? Who has control and access to the model and who has ownership?","text":"If the speaker or permissions-holders for the data or models change their mind about participation, how easy is it for them to stop the model? Do they have access to a 'kill switch'? Do they need to contact someone and make a request? Are there assurances about how long these requests will take to be processed?
Are there clear, agreed-upon guidelines for who has access to the model and data? Who maintains the control and access to these resources? In Canada, we encourage users to engage with the First Nations Principles of OCAP\u00ae when planning a project.
"},{"location":"guides/ethics/#what-are-the-possible-risks-associated-with-this-technology-and-how-will-i-mitigate-them","title":"What are the possible risks associated with this technology and how will I mitigate them?","text":"Have you considered and discussed possible risks with the relevant stakeholders? Spend some time imagining ways that the tool could be misused, either by accidental or malicious actors. What if the model makes pronunciation mistakes? Will that embarrass the speaker? What if the model is made to say inappropriate things? What plans do you have to mitigate these risks?
Since this technology is relatively new, it can sometimes be hard to consider the ways that a technology can be misused; however, we already see examples where TTS models are being used to generate fake news3. Can you think of ways that similar so-called 'deep fakes' or impersonations could be used to cause harm?
-
Emily M. Bender, Timnit Gebru, Angelina McMillan-Major, and Shmargaret Shmitchell. 2021. On the Dangers of Stochastic Parrots: Can Language Models Be Too Big? \ud83e\udd9c. In Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency (FAccT '21). Association for Computing Machinery, New York, NY, USA, 610\u2013623. https://doi.org/10.1145/3442188.3445922 \u21a9
-
Marie-Odile Junker. 2024. Data-mining and Extraction: the gold rush of AI on Indigenous Languages. In Proceedings of the Seventh Workshop on the Use of Computational Methods in the Study of Endangered Languages, pages 52\u201357, St. Julians, Malta. Association for Computational Linguistics. https://aclanthology.org/2024.computel-1.8/ \u21a9
-
https://nypost.com/2024/06/14/us-news/michigan-gop-candidate-anthony-hudson-stands-by-ai-generated-mlk-jr-endorsement-video/ \u21a9
"},{"location":"guides/finetune/","title":"How to fine-tune the existing checkpoints","text":""},{"location":"guides/finetune/#vocoder-matching","title":"Vocoder matching","text":"Vocoder (i.e. your spec-to-wav model) matching is an important part of the TTS pipeline. Because your spec-to-wav model is trained with the ground-truth Mel spectrograms from your audio, there is a mismatch between the Mel spectrograms created by your text-to-spec model and the ones that the pre-trained vocoders have seen during training. For that reason, it can be helpful to fine-tune your spec-to-wav model with the generated Mel spectrograms from your text-to-spec model.
Note
Vocoder matching will only help with the metallic artefacts that sometimes occur when synthesizing speech. If your model is not intelligible or has other types of errors like mispronunciations, vocoder matching will not solve them. In these cases, the problem is likely with your text-to-spec model, and probably due to either noisy data (noisy recordings, mistranscriptions, etc.), too little data, or data that is too varied (many different speakers). Please refer to TODO: troubleshooting for more information.
To finetune your spec-to-wav model with Mel spectrograms from your text-to-spec model (also called 'vocoder matching'), you need to have a pre-trained text-to-spec and spec-to-wav model ready. You also need to have access to some parallel text/audio data (the same or similar data that you used to train your text-to-spec model).
Then you:
-
Generate a folder full of Mel spectrograms from your text-to-spec model (repeat this process for both your training and validation filelists):
Training filelist:
everyvoice synthesize from-text <path-to-your-text-to-spec.ckpt> -O spec --filelist <path-to-your-training-filelist.psv> --teacher-forcing-directory <path-to-your-preprocessed-directory> --output-dir <path-to-your-preprocessed-directory>\n
Validation filelist:
everyvoice synthesize from-text <path-to-your-text-to-spec.ckpt> -O spec --filelist <path-to-your-validation-filelist.psv> --teacher-forcing-directory <path-to-your-preprocessed-directory> --output-dir <path-to-your-preprocessed-directory>\n
Note
For vocoder matching to work, the size of the generated Mel spectrogram has to be the same as the ground truth Mel spectrogram calculated from the audio, so you have to use 'teacher-forcing' to force the text-to-spec model to output spectrograms of a specific size. To do this, we add the --teacher-forcing-directory option and point it to the project's preprocessed
directory with the processed files from our filelist. This will write a synthesized_spec
folder within your preprocessed
directory, which you can use instead of the ground truth spec
data by setting finetune
to True as described in the next step.
-
Set the finetune_ckpt value to point to the vocoder checkpoint that you want to fine-tune.
-
Lower the learning rate (we suggest starting at 0.00001)
-
Train the vocoder again with finetuning set to True (train for at least 25000 steps):
everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml -c training.finetune=True\n
"},{"location":"guides/schemas/","title":"How to Setup Code Completion for Schemas in vim","text":"When manually editing EveryVoice's configuration files, it is convenient to have the file checked/validated and to have documentation about each field. The following setup will work for json and yaml configuration files.
"},{"location":"guides/schemas/#install-nodejs","title":"Install nodejs
","text":"You will need to have a functional npm
which is part of nodejs
. The schemas will be verified using a node process.
"},{"location":"guides/schemas/#install-vim-plug","title":"Install vim-plug","text":"vim-plug: Minimalist Vim Plugin Manager This will take care of install vim's extensions for us.
curl \\\n --create-dirs \\\n -fLo ~/.vim/autoload/plug.vim \\\n https://raw.githubusercontent.com/junegunn/vim-plug/master/plug.vim\n
"},{"location":"guides/schemas/#augment-your-vimrc","title":"Augment your .vimrc
","text":"We want to install Conquer of Completion aka coc We will add a plugins for coc-json and coc-yaml which will be used to handle json and yaml files. Let's also add some key bindings to access coc.nvim
's functionalities. Refer to Example Vim configuration for a more complete example.
call plug#begin()\nPlug 'neoclide/coc.nvim', { 'do': 'npm ci' }\n\nlet g:coc_disable_startup_warning = 1\n\n\" Use tab for trigger completion with characters ahead and navigate\n\" NOTE: There's always complete item selected by default, you may want to enable\n\" no select by `\"suggest.noselect\": true` in your configuration file\n\" NOTE: Use command ':verbose imap <tab>' to make sure tab is not mapped by\n\" other plugin before putting this into your config\ninoremap <silent><expr> <TAB>\n\\ coc#pum#visible() ? coc#pum#next(1) :\n\\ CheckBackspace() ? \"\\<Tab>\" :\n\\ coc#refresh()\ninoremap <expr><S-TAB> coc#pum#visible() ? coc#pum#prev(1) : \"\\<C-h>\"\n\n\" Make <CR> to accept selected completion item or notify coc.nvim to format\n\" <C-g>u breaks current undo, please make your own choice\ninoremap <silent><expr> <CR> coc#pum#visible() ? coc#pum#confirm()\n\\: \"\\<C-g>u\\<CR>\\<c-r>=coc#on_enter()\\<CR>\"\n\nfunction! CheckBackspace() abort\n let col = col('.') - 1\n return !col || getline('.')[col - 1] =~# '\\s'\nendfunction\n\n\" Use <c-space> to trigger completion\nif has('nvim')\n inoremap <silent><expr> <c-space> coc#refresh()\nelse\n inoremap <silent><expr> <c-@> coc#refresh()\nendif\n\n\" Use `[g` and `]g` to navigate diagnostics\n\" Use `:CocDiagnostics` to get all diagnostics of current buffer in location list\nnmap <silent> [g <Plug>(coc-diagnostic-prev)\nnmap <silent> ]g <Plug>(coc-diagnostic-next)\n\n\" GoTo code navigation\nnmap <silent> gd <Plug>(coc-definition)\nnmap <silent> gy <Plug>(coc-type-definition)\nnmap <silent> gi <Plug>(coc-implementation)\nnmap <silent> gr <Plug>(coc-references)\n\n\" Use K to show documentation in preview window\nnnoremap <silent> K :call ShowDocumentation()<CR>\n\nfunction! ShowDocumentation()\n if CocAction('hasProvider', 'hover')\n call CocActionAsync('doHover')\n else\n call feedkeys('K', 'in')\n endif\nendfunction\n\n\" Highlight the symbol and its references when holding the cursor\nautocmd CursorHold * silent call CocActionAsync('highlight')\n\n\" Symbol renaming\nnmap <leader>rn <Plug>(coc-rename)\n\n\" Formatting selected code\nxmap <leader>F <Plug>(coc-format-selected)\nnmap <leader>F <Plug>(coc-format-selected)\n\naugroup mygroup\nautocmd!\n\" Setup formatexpr specified filetype(s)\nautocmd FileType typescript,json setl formatexpr=CocAction('formatSelected')\n\" Update signature help on jump placeholder\nautocmd User CocJumpPlaceholder call CocActionAsync('showSignatureHelp')\naugroup end\n\n\" Applying code actions to the selected code block\n\" Example: `<leader>aap` for current paragraph\nxmap <leader>a <Plug>(coc-codeaction-selected)\nnmap <leader>a <Plug>(coc-codeaction-selected)\n\n\" Remap keys for applying code actions at the cursor position\nnmap <leader>ac <Plug>(coc-codeaction-cursor)\n\" Remap keys for apply code actions affect whole buffer\nnmap <leader>as <Plug>(coc-codeaction-source)\n\" Apply the most preferred quickfix action to fix diagnostic on the current line\nnmap <leader>qf <Plug>(coc-fix-current)\n\n\" Remap keys for applying refactor code actions\nnmap <silent> <leader>re <Plug>(coc-codeaction-refactor)\nxmap <silent> <leader>r <Plug>(coc-codeaction-refactor-selected)\nnmap <silent> <leader>r <Plug>(coc-codeaction-refactor-selected)\n\n\" Run the Code Lens action on the current line\nnmap <leader>cl <Plug>(coc-codelens-action)\n\n\" Map function and class text objects\n\" NOTE: Requires 'textDocument.documentSymbol' support from the language server\nxmap if <Plug>(coc-funcobj-i)\nomap if 
<Plug>(coc-funcobj-i)\nxmap af <Plug>(coc-funcobj-a)\nomap af <Plug>(coc-funcobj-a)\nxmap ic <Plug>(coc-classobj-i)\nomap ic <Plug>(coc-classobj-i)\nxmap ac <Plug>(coc-classobj-a)\nomap ac <Plug>(coc-classobj-a)\n\n\" Remap <C-f> and <C-b> to scroll float windows/popups\nif has('nvim-0.4.0') || has('patch-8.2.0750')\nnnoremap <silent><nowait><expr> <C-f> coc#float#has_scroll() ? coc#float#scroll(1) : \"\\<C-f>\"\nnnoremap <silent><nowait><expr> <C-b> coc#float#has_scroll() ? coc#float#scroll(0) : \"\\<C-b>\"\ninoremap <silent><nowait><expr> <C-f> coc#float#has_scroll() ? \"\\<c-r>=coc#float#scroll(1)\\<cr>\" : \"\\<Right>\"\ninoremap <silent><nowait><expr> <C-b> coc#float#has_scroll() ? \"\\<c-r>=coc#float#scroll(0)\\<cr>\" : \"\\<Left>\"\nvnoremap <silent><nowait><expr> <C-f> coc#float#has_scroll() ? coc#float#scroll(1) : \"\\<C-f>\"\nvnoremap <silent><nowait><expr> <C-b> coc#float#has_scroll() ? coc#float#scroll(0) : \"\\<C-b>\"\nendif\n\n\" Use CTRL-S for selections ranges\n\" Requires 'textDocument/selectionRange' support of language server\nnmap <silent> <C-s> <Plug>(coc-range-select)\nxmap <silent> <C-s> <Plug>(coc-range-select)\n\n\" Add `:Format` command to format current buffer\ncommand! -nargs=0 Format :call CocActionAsync('format')\n\n\" Add `:Fold` command to fold current buffer\ncommand! -nargs=? Fold :call CocAction('fold', <f-args>)\n\n\" Add `:OR` command for organize imports of the current buffer\ncommand! -nargs=0 OR :call CocActionAsync('runCommand', 'editor.action.organizeImport')\n\n\" Add (Neo)Vim's native statusline support\n\" NOTE: Please see `:h coc-status` for integrations with external plugins that\n\" provide custom statusline: lightline.vim, vim-airline\nset statusline^=%{coc#status()}%{get(b:,'coc_current_function','')}\n\n\" TODO Space is our Leader, this might interfer with the following:\n\" Mappings for CoCList\n\" Show all diagnostics\nnnoremap <silent><nowait> <space>a :<C-u>CocList diagnostics<cr>\n\" Manage extensions\nnnoremap <silent><nowait> <space>e :<C-u>CocList extensions<cr>\n\" Show commands\nnnoremap <silent><nowait> <space>c :<C-u>CocList commands<cr>\n\" Find symbol of current document\nnnoremap <silent><nowait> <space>o :<C-u>CocList outline<cr>\n\" Search workspace symbols\nnnoremap <silent><nowait> <space>s :<C-u>CocList -I symbols<cr>\n\" Do default action for next item\nnnoremap <silent><nowait> <space>j :<C-u>CocNext<CR>\n\" Do default action for previous item\nnnoremap <silent><nowait> <space>k :<C-u>CocPrev<CR>\n\" Resume latest coc list\nnnoremap <silent><nowait> <space>p :<C-u>CocListResume<CR>\ncall plug#end()\n
"},{"location":"guides/schemas/#install-the-new-plugins","title":"Install the New Plugins","text":"Plugins don't automatically install themself thus you have to run the following command to install them. Start vim
then do
vim +PlugInstall \"+:CocInstall coc-json\" \"+:CocInstall coc-yaml\" +:qall\n
"},{"location":"guides/schemas/#compile-cocnvim","title":"Compile coc.nvim","text":"Once your plugins are installed, you will need to compile coc.
cd ~/.vim/plugged/coc.nvim\nnpm ci\n
"},{"location":"guides/schemas/#create-coc-settingsjson","title":"Create Coc-settings.json","text":"Start vim
and run the command :CocConfig
to edit where your everyvoice schemas are located. The following example assumes that you have clone EveryVoice into ~/git/EveryVoice
. Make the proper modifications to match where you have cloned EveryVoice. Also note that you have to change /home/username
with your own username in the yaml section.
{\n \"json.schemas\": [\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-schema-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-aligner.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-schema-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-shared-data.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-schema-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-shared-text.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-spec-to-wav.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-text-to-spec.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-text-to-wav.json\"\n ]\n }\n ],\n \"yaml.schemas\": {\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-schema-0.1.json\": [\n \"everyvoice-aligner.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-schema-0.1.json\": [\n \"everyvoice-shared-data.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-schema-0.1.json\": [\n \"everyvoice-shared-text.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-schema-0.1.json\": [\n \"everyvoice-spec-to-wav.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-schema-0.1.json\": [\n \"everyvoice-text-to-spec.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-schema-0.1.json\": [\n \"everyvoice-text-to-wav.yaml\"\n ]\n }\n}\n
"},{"location":"guides/schemas/#usage","title":"Usage","text":"Once everything is installed, start editing a new or existing EveryVoice configuration.
vim everyvoice-shared-data.json\n
Then use CTRL+<space>
to trigger completion.
"},{"location":"reference/","title":"Reference","text":"Here is where you will find information about the various models implemented in EveryVoice. This section will include some fairly technical details. If you just want to build a model using default settings and configurations, please visit the documentation on the guides
"},{"location":"reference/configuration/","title":"Configuration","text":"Each model has a statically typed configuration model. Each configuration has default settings that will be instantiated when the model is instantiated. To create a default preprocessing configuration for example you would:
from everyvoice.config.preprocessing_config import PreprocessingConfig\n\npreprocessing_config = PreprocessingConfig()\n
Static typing means that misconfiguration errors should occur as soon as the configuration is instantiated instead of producing downstream runtime errors. It also means that intellisense is available in your code editor when working with a configuration class.
"},{"location":"reference/configuration/#sharing-configurations","title":"Sharing Configurations","text":"The Text and Preprocessing configurations should only be defined once per dataset and shared between your models to ensure each model makes the same assumptions about your data. To achieve that, each model configuration can also be defined as a path to a configuration file. So, a configuration for an aligner that uses separately defined text and audio preprocessing configurations might look like this:
model:\n lstm_dim: 512\n conv_dim: 512\n ...\ntraining:\n batch_size: 32\n ...\npreprocessing: \"./config/default/everyvoice-shared-data.yaml\"\ntext: \"./config/default/everyvoice-shared-text.yaml\"\n
"},{"location":"reference/configuration/#serialization","title":"Serialization","text":"By default configuration objects are serialized as dictionaries, which works as expected with integers, floats, lists, booleans, dicts etc. But there are some cases where you need to specify a Callable in your configuration. For example the {ref}TextConfig
has a cleaners
field that takes a list of Callables to apply in order to raw text. By default, these functions turn raw text to lowercase, collapse whitespace, and normalize using Unicode NFC normalization. In Python, we could instantiate this by passing the callables directly like so:
from everyvoice.config.text_config import TextConfig\nfrom everyvoice.utils import collapse_whitespace, lower, nfc_normalize\n\ntext_config = TextConfig(cleaners=[lower, collapse_whitespace, nfc_normalize])\n
But, for yaml or json configuration, we need to serialize these functions. To do so, EveryVoice will turn each callable into module dot-notation. That is, your configuration will look like this in yaml:
cleaners:\n - everyvoice.utils.lower\n - everyvoice.utils.collapse_whitespace\n - everyvoice.utils.nfc_normalize\n
This will then be de-serialized upon instantiation of your configuration.
"},{"location":"reference/configuration/#text-configuration","title":"Text Configuration","text":"The TextConfig is where you define the symbol set for your data and any cleaners used to clean your raw text into the text needed for your data. You can share the TextConfig with any models that need it and only need one text configuration per dataset (and possibly only per language).
"},{"location":"reference/configuration/#textconfig","title":"TextConfig","text":""},{"location":"reference/configuration/#everyvoice.config.text_config.TextConfig","title":"everyvoice.config.text_config.TextConfig
","text":" Bases: ConfigModel
Source code in everyvoice/config/text_config.py
class TextConfig(ConfigModel):\n symbols: Symbols = Field(default_factory=Symbols)\n to_replace: Dict[str, str] = {} # Happens before cleaners\n cleaners: list[PossiblySerializedCallable] = [\n collapse_whitespace,\n ]\n\n @model_validator(mode=\"after\")\n def clean_symbols(self) -> \"TextConfig\":\n \"\"\"We should apply all cleaners to the symbols\n\n Returns:\n TextConfig: a text config with cleaned symbols\n \"\"\"\n for k, v in self.symbols:\n if k not in [\"punctuation\", \"silence\"]:\n setattr(\n self.symbols,\n k,\n [\n normalize_text_helper(x, self.to_replace, self.cleaners)\n for x in v\n ],\n )\n return self\n
cleaners: list[PossiblySerializedCallable] = [collapse_whitespace]
class-attribute
instance-attribute
\u00b6 symbols: Symbols = Field(default_factory=Symbols)
class-attribute
instance-attribute
\u00b6 to_replace: Dict[str, str] = {}
class-attribute
instance-attribute
\u00b6"},{"location":"reference/configuration/#symbols","title":"Symbols","text":"Your symbol set is created by taking the union of all values defined. For example:
symbols:\n dataset_0_characters: ['a', 'b', 'c']\n dataset_1_characters: ['b', 'c', 'd']\n
Will create a symbol set equal to {'a', 'b', 'c', 'd'}
(i.e. the union of both key/values). This allows you to train models with data from different languages, for example.
Important
You should always manually inspect your configuration here to make sure it makes sense with respect to your data. Is there a symbol that shouldn't be there? Is there a symbol that's defined as 'punctuation' but is used as non-punctuation in your language? Please inspect these and update the configuration accordingly.
"},{"location":"reference/configuration/#everyvoice.config.text_config.Symbols","title":"everyvoice.config.text_config.Symbols
","text":" Bases: BaseModel
Source code in everyvoice/config/text_config.py
class Symbols(BaseModel):\n silence: list[str] = Field(\n [\"<SIL>\"], description=\"The symbol(s) used to indicate silence.\"\n )\n punctuation: Punctuation = Field(\n default_factory=Punctuation,\n description=\"EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training.\",\n )\n model_config = ConfigDict(extra=\"allow\")\n\n @property\n def all_except_punctuation(self) -> set[str]:\n \"\"\"Returns the set containing all characters.\"\"\"\n return set(w for _, v in self if not isinstance(v, Punctuation) for w in v)\n\n @model_validator(mode=\"after\")\n def member_must_be_list_of_strings(self) -> \"Symbols\":\n \"\"\"Except for `punctuation` & `pad`, all user defined member variables\n have to be a list of strings.\n \"\"\"\n for k, v in self:\n if isinstance(v, Punctuation):\n continue\n if k == \"pad\":\n continue\n if not isinstance(v, list) or not all(isinstance(e, str) for e in v):\n raise ValueError(f\"{k} must be a list\")\n\n return self\n
all_except_punctuation: set[str]
property
\u00b6 Returns the set containing all characters.
member_must_be_list_of_strings()
\u00b6 Except for punctuation
& pad
, all user defined member variables have to be a list of strings.
Source code in everyvoice/config/text_config.py
@model_validator(mode=\"after\")\ndef member_must_be_list_of_strings(self) -> \"Symbols\":\n \"\"\"Except for `punctuation` & `pad`, all user defined member variables\n have to be a list of strings.\n \"\"\"\n for k, v in self:\n if isinstance(v, Punctuation):\n continue\n if k == \"pad\":\n continue\n if not isinstance(v, list) or not all(isinstance(e, str) for e in v):\n raise ValueError(f\"{k} must be a list\")\n\n return self\n
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Getting Started","text":"Welcome to the EveryVoice TTS Toolkit Documentation page! Please read the background section below to hear a bit about how this project got started, or head over to the guides section to find a guide to help you get started.
"},{"location":"#background","title":"Background","text":"There are approximately 70 Indigenous languages spoken in Canada from 10 distinct language families. As a consequence of the residential school system and other policies of cultural suppression, the majority of these languages now have fewer than 500 fluent speakers remaining, most of them elderly.
Despite this, Indigenous people have resisted colonial policies and continued speaking their languages, with interest by students and parents in Indigenous language education continuing to grow. Teachers are often overwhelmed by the number of students, and the trend towards online education means many students who have not previously had access to language classes now do. Supporting these growing cohorts of students comes with unique challenges in languages with few fluent first-language speakers. Teachers are particularly concerned with providing their students with opportunities to hear the language outside of class.
While there is no replacement for a speaker of an Indigenous language, there are possible applications for speech synthesis (text-to-speech) to supplement existing text-based tools like verb conjugators, dictionaries and phrasebooks.
The National Research Council has partnered with the Onkwawenna Kentyohkwa Kanyen\u2019k\u00e9ha immersion school, W\u0331S\u00c1NE\u0106 School Board, University nuhelot\u2019\u012fne thaiyots\u2019\u012f nistameyim\u00e2kanak Blue Quills, and the University of Edinburgh to research and develop state-of-the-art speech synthesis (text-to-speech) systems and techniques for Indigenous languages in Canada, with a focus on how to integrate text-to-speech technology into the classroom.
The project is titled Speech Generation for Indigenous Language Education (SGILE) and the EveryVoice TTS toolkit is one of the products of this collaboration. Detailed information about this project can be found in our recent submission to Computer Speech & Language (Under Review).
"},{"location":"#what-is-everyvoice-and-this-documentation-about","title":"What is EveryVoice and this documentation about?","text":"This project is the location of active research and development for the Speech Generation for Indigenous Language Education project. In addition to being a model for this project, it is meant to outline repeatable recipes for other communities and languages to develop their own text-to-speech systems. This documentation describes guides for how to do this.
Note
We are trying to develop a tool that makes the developer experience as smooth as possible. But, building these models and creating your datasets can be complicated. We recommend you are comfortable with Python and using the command line before starting on this project.
"},{"location":"#similar-projects-exist-why-create-another-one","title":"Similar projects exist, why create another one?","text":"It is true that similar excellent projects exist, such as ESPnet, \ud83d\udc38TTS, Comprehensive-Transformer-TTS, and IMS-Toucan among others. Our reasons for creating our own are multi-fold (some of the following features are implemented in the aforementioned projects, but not every one of these features is supported in any of them):
- The EveryVoice TTS toolkit comes with a configuration wizard which helps configure the toolkit to new datasets in new languages.
- We support a heterogeneous source of data, meaning you (hopefully) have to do less work to wrangle data together. The configuration wizard supports multi-dataset configuration.
- We support out-of-the-box integration with g2p which allows the g2p rules for 30+ Indigenous languages to be used in the project.
- We will not try to implement many different models. Instead, we will curate a model architecture that we believe to be best for training models on under-resourced languages. In this way we are more similar to IMS-Toucan than to ESPnet.
- We use a custom, statically-typed configuration architecture between models written in Pydantic that allows for configuration validation and serialization/de-serialization to json and yaml. It also allows us to ensure the same configuration for text and audio processing is used between models.
- We implement our models in PyTorch Lightning
For a detailed comparison of selected features of EveryVoice and other toolkits, please see Appendices B & C in our recent paper submission.
Note
These features do not necessarily mean that this is the right project for you. The other projects mentioned are of very high quality and might be a better fit for your project, particularly if you are lucky enough to have lots of data, or a language that is already supported.
"},{"location":"install/","title":"Installation","text":"In order to use EveryVoice on GPUs, you must install PyTorch and Cuda, Python 3.10 or more recent, a number of other dependencies, and EveryVoice itself. The following sections describe three ways to accomplish this:
"},{"location":"install/#scripted-installation-recommended","title":"Scripted installation -- recommended","text":"The EveryVoice installation process can be somewhat involved, so we have automated it as much as we could.
- Install miniconda or conda.
- Clone the EveryVoice repo and its submodules:
git clone https://github.com/roedoejet/EveryVoice.git\ncd EveryVoice\ngit submodule update --init\n
- Run our automated environment creation script:
./make-everyvoice-env --name EveryVoice\nconda activate EveryVoice\n
Add the option --cuda CUDA_VERSION
if you need to override the default CUDA version, or --cpu
to use Torch compiled for CPU use only.
"},{"location":"install/#using-pip","title":"Using Pip","text":"We hope direct installation from PyPI will work:
-
Follow the PyTorch installation instructions relevant to your hardware, but specify version 2.1.0: torch==2.1.0
, and torchaudio==2.1.0
.
-
Install EveryVoice:
pip install everyvoice==0.1.0a
"},{"location":"install/#manual-installation","title":"Manual installation","text":"If you prefer to do the complete installation process manually, or if the automated process does not work for you, follow these steps:
"},{"location":"install/#install-conda","title":"Install Conda","text":"Install miniconda or conda.
"},{"location":"install/#create-the-environment","title":"Create the environment","text":"Use conda to create a new environment based on Python 3.10:
conda create --name EveryVoice python=3.10\nconda activate EveryVoice\n
"},{"location":"install/#pytorch-dependencies","title":"Pytorch dependencies","text":"Install our pytorch requirements from requirements.torch.txt
, replacing cu118
below (for CUDA 11.8) with your actual CUDA version tag (118 or higher), or with cpu
for a CPU-only installation:
CUDA_TAG=cu118 pip install -r requirements.torch.txt --find-links https://download.pytorch.org/whl/torch_stable.html\n
Alternatively, you can follow the PyTorch installation instructions relevant to your hardware. Make sure you specify the version declared in requirements.torch.txt
, which is 2.1.0 at the moment.
"},{"location":"install/#other-potentially-tricky-dependencies","title":"Other potentially tricky dependencies","text":"These requirements sometimes require being run separately:
pip install cython\nconda install sox -c conda-forge\n
"},{"location":"install/#handling-running-out-of-temp-disk-space","title":"Handling running out of temp disk space","text":"Installation will require a fair bit of space on ~/.cache
and your $TMPDIR
(/tmp
by default, if $TMPDIR
is not set). If you get the error OSError: [Errno 28] No space left on device
during installation, you may need to do one or both of these operations: - export TMPDIR=/path/to/a/large/tmp/space
(or maybe export TMPDIR=.
) - mkdir /path/to/a/large/filesystem/.cache; ln -s /path/to/a/large/filesystem/.cache ~/.cache
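For example, in a shell (these paths are just placeholders from the operations above; replace them with locations that actually have enough free space):
export TMPDIR=/path/to/a/large/tmp/space\nmkdir /path/to/a/large/filesystem/.cache\nln -s /path/to/a/large/filesystem/.cache ~/.cache\n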
"},{"location":"install/#install-everyvoice-itself","title":"Install EveryVoice itself","text":"Install EveryVoice locally from your cloned sandbox:
pip install -e .\n
"},{"location":"install/#dev-dependencies","title":"Dev dependencies","text":"Before you can run the test suites, you'll also need to install the dev dependencies:
pip install -r requirements.dev.txt\n
"},{"location":"install/#git-hooks","title":"Git hooks","text":"If you plan to contribute to the project, please install our Git hooks:
pre-commit install\ngitlint install-hook\ngit submodule foreach 'pre-commit install'\ngit submodule foreach 'gitlint install-hook'\n
"},{"location":"guides/","title":"Guides","text":"Here are a selection of guides to help you through the process of training and using your own text-to-speech models.
-
Should you build a TTS Model? What are the possible negative outcomes?
This guide is a primer for the ethical questions related to building a TTS system. Please read this section first before doing anything else!
-
Background to TTS
This guide points to resources for learning more about the technical side of how text-to-speech works.
-
How to create a TTS system for your language
This guide provides in-depth information about how to build a TTS system for your language using the EveryVoice TTS Toolkit.
-
How to fine-tune
This guide provides information on how to fine-tune your models. It is an advanced, but recommended, step for building a TTS system.
"},{"location":"guides/background/","title":"Background to Text-to-Speech","text":"Consider what is required in order for speech-based communication to work. A speaker decides to utter a word, contracts their diaphragm to pull air into their lungs, and upon exhaling, returns the air through their vocal tract. They then contort their vocal tract in highly specific ways to reach a series of articulatory targets that they have associated with a particular meaning. The flow of air past these orchestrated contortions causes pressure fluctuations at varying frequencies that, upon impinging on the listener's ear drums, are processed and understood to represent the same meaning the speaker intended - magic!
The idea of creating machines to simulate speech has origins as early as the 18th century when Hungarian inventor Wolfgang von Kempelen created his 'speaking machine' to woo crowds. Speech synthesis has since made tremendous gains, and is now employed to solve a variety of real-world problems. While von Kempelen's machine attempted to replicate the anatomy required for speech, modern techniques use computers to work with discrete representations of sound, and improvements to speech synthesis over the last decade have grown in tandem with progress in the field of neural network-based machine learning.
We intend to update this section with a variety of resources to help provide background on Text-to-Speech (TTS). In the meantime, please visit this excellent TTS primer from the NVIDIA NeMo TTS toolkit. If you are interested in more in-depth learning about TTS and speech processing, we recommend the Speech Processing and Speech Synthesis courses on Speech Zone.
"},{"location":"guides/custom/","title":"Customize to your language","text":""},{"location":"guides/custom/#step-1-make-sure-you-have-permission","title":"Step 1: Make sure you have Permission!","text":"So, you want to build a text-to-speech system for a new language or dataset - cool! But, just because you can build a text-to-speech system, doesn't mean you should. There are a lot of important ethical questions around text-to-speech. For example, it's not ethical to just use audio you find somewhere online if it doesn't have explicit permission to use it for the purposes of text-to-speech. The first step is always to make sure you have permission to use the data in question and that whoever contributed their voice to the data you want to use is aware and supportive of your goal.
Creating a text-to-speech model without permission is unethical, but even when you do have permission, you should take great care in how you distribute the model you have created. Increasingly, text-to-speech technology is used in fraud and unauthorized impersonation. The technology has also been used to disenfranchise voice actors and other professionals. When you create an EveryVoice model, you are responsible for ensuring the model is only used and distributed according to the permissions you have. To help with this accountability, you will be required by EveryVoice to attest that you have permission to use your data and to provide a full name and contact information that will also be distributed with the model.
In addition, we invite you to check out our short guide that contains prompts about ethical questions before starting on any of the next steps.
"},{"location":"guides/custom/#step-2-gather-your-data","title":"Step 2: Gather Your Data","text":"The first thing to do is to get all the data you have (in this case audio with text transcripts) together in one place. Your audio should be in a lossless 'wav' format. Ideally it would be 16bit, mono (one channel) audio sampled somewhere between 22.05kHz and 48kHz. If that doesn't mean anything to you, don't worry, we can ensure the right format in later steps. It's best if your audio clips are somewhere between half a second and 10 seconds long. Any longer and it could be difficult to train. If your audio is longer than this, we suggest processing it into smaller chunks first.
Your text should be consistently written and should be in a pipe-separated values spreadsheet, similar to this file. It should have a column that contains text and a column that contains the basename
of your associated audio file. So if you have a recording of somebody saying \"hello how are you?\" and the corresponding audio is called mydata0001.wav
then you should have a psv file that looks like this:
basename|text\nmydata0001|hello how are you?\nmydata0002|some other sentence.\n...\n
We also support comma and tab separated files, but recommend using pipes (|).
You can also use the \"festival\" format which is like this (example from Sinhala TTS):
( sin_2241_0329430812 \" \u0d9a\u0ddd\u0d9a\u0da7\u0dad\u0dca \u0db8\u0d82 \u0dc0\u0dd9\u0db1\u0daf\u0dcf \u0dad\u0dbb\u0db8\u0dca \u0d9a\u0dcf\u0dbd\u0dd9 \u0d9c\u0db1\u0dca\u0db1\u0dd0\u0dad\u0dd2\u0dc0 \u0d87\u0db3 \u0d9c\u0dad\u0dca\u0dad\u0dcf \" )\n( sin_2241_0598895166 \" \u0d87\u0db1\u0dca\u0da2\u0dbd\u0dd3\u0db1\u0dcf \u0da2\u0ddc\u0dbd\u0dd3 \u0d9a\u0dd2\u0dba\u0db1\u0dca\u0db1\u0dda \u0db4\u0dc3\u0dd4\u0d9c\u0dd2\u0dba \u0daf\u0dd2\u0db1\u0dc0\u0dbd \u0db6\u0ddc\u0dc4\u0ddd \u0dc3\u0dd9\u0dba\u0dd2\u0db1\u0dca \u0d9a\u0dad\u0dcf \u0db6\u0dc4\u0da7 \u0dbd\u0d9a\u0dca\u0dc0\u0dd6 \u0da0\u0dbb\u0dd2\u0dad\u0dba\u0d9a\u0dca \" )\n( sin_2241_0701577369 \" \u0d86\u0dbb\u0dca\u0dae\u0dd2\u0d9a \u0da0\u0dd2\u0db1\u0dca\u0dad\u0db1\u0dba \u0dc4\u0dcf \u0dc3\u0dcf\u0db8\u0dcf\u0da2\u0dd3\u0dba \u0daf\u0dd2\u0dba\u0dd4\u0dab\u0dd4\u0dc0 \u0d87\u0dad\u0dd2 \u0d9a\u0dc5 \u0dc4\u0dd0\u0d9a\u0dd2\u0dc0\u0db1\u0dd4\u0dba\u0dda \u0db4\u0dd4\u0daf\u0dca\u0d9c\u0dbd \u0d86\u0dbb\u0dca\u0dae\u0dd2\u0d9a \u0daf\u0dd2\u0dba\u0dd4\u0dab\u0dd4\u0dc0 \u0dc3\u0dbd\u0dc3\u0dcf \u0daf\u0dd3\u0db8\u0dd9\u0db1\u0dca\u0dba \" )\n( sin_2241_0715400935 \" \u0d89\u0db1\u0dca \u0d85\u0daf\u0dc4\u0dc3\u0dca \u0dc0\u0db1\u0dca\u0db1\u0dda \u0dc0\u0dd2\u0da0\u0dcf\u0dbb\u0dcf\u0dad\u0dca\u0db8\u0d9a \u0dc0\u0dd2\u0db1\u0dd2\u0dc0\u0dd2\u0daf \u0daf\u0dd0\u0d9a\u0dd3\u0db8\u0dd9\u0db1\u0dca \u0dad\u0ddc\u0dbb \u0db6\u0dd0\u0dbd\u0dca\u0db8\u0dba\u0dd2 \" )\n( sin_2241_0817100025 \" \u0d85\u0db4 \u0dba\u0dd4\u0daf\u0dca\u0db0\u0dba\u0dda \u0db4\u0dc5\u0db8\u0dd4 \u0db4\u0dd2\u0dba\u0dc0\u0dbb\u0dda\u0daf\u0dd3\u0db8 \u0db4\u0dbb\u0dcf\u0daf \u0dc0\u0dd3 \u0d85\u0dc0\u0dc3\u0dcf\u0db1\u0dba \" )\n
In this format, there are corresponding wav files labelled sin_2241_0329430812.wav, etc.
"},{"location":"guides/custom/#step-3-install-everyvoice","title":"Step 3: Install EveryVoice","text":"Head over to the installation documentation and install EveryVoice
"},{"location":"guides/custom/#step-4-run-the-configuration-wizard","title":"Step 4: Run the Configuration Wizard \ud83e\uddd9","text":"Once you have your data, the best thing to do is to run the Configuration Wizard \ud83e\uddd9 which will help you configure a new project. To do that run:
everyvoice new-project\n
After running the wizard, cd into your newly created directory. Let's call it <your_everyvoice_project>
for now.
cd your_everyvoice_project\n
Important
After you run the Configuration Wizard \ud83e\uddd9, please inspect your text configuration config/everyvoice-shared-text.yaml
to make sure everything looks right. That is, if some unexpected symbols show up, please inspect your data (if you remove symbols from the configuration here, they will be ignored during training). Sometimes characters that are treated as punctuation by default will need to be removed from the punctuation list if they are treated as non-punctuation in your language.
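For illustration only, here is a hypothetical excerpt of the symbols section such a file might contain for a small dataset; your key names and symbol lists will differ depending on your data and EveryVoice version, and the punctuation subsection is where you would remove any character that should not be treated as punctuation in your language:
symbols:\n  silence: ['<SIL>']\n  dataset_0_characters: ['a', 'b', 'c']\n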
"},{"location":"guides/custom/#step-5-run-the-preprocessor","title":"Step 5: Run the Preprocessor","text":"Your models need to do a number of preprocessing steps in order to prepare for training. To preprocess everything you need, run the following:
everyvoice preprocess config/everyvoice-text-to-spec.yaml\n
"},{"location":"guides/custom/#step-6-select-a-vocoder","title":"Step 6: Select a Vocoder","text":"So you don't need to train your own vocoder, EveryVoice has a variety of publicly released vocoders available here. Follow the instructions there for downloading the checkpoints.
EveryVoice is also compatible out-of-the-box with the UNIVERSAL_V1 HiFiGAN checkpoint from the official HiFiGAN implementation, which is very good quality. You can find the EveryVoice-compatible version of this checkpoint here.
Using a pre-trained vocoder is recommended, and the above checkpoint should work well even for new languages after finetuning.
"},{"location":"guides/custom/#train-your-own-vocoder","title":"Train your own Vocoder","text":"You might want to train your own vocoder, but this takes a long time (up to 2 weeks on a single GPU), uses a lot of electricity, and unless you know what you are doing, you are unlikely to improve upon the publicly available models discussed above, even for a new language. So we do not recommend it. You are almost always better off just using the pre-trained vocoder and then finetuning on the predictions from your feature prediction network. If you really do want to train your own vocoder though, you can run the following command:
everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml\n
By default, we run our training with PyTorch Lightning's \"auto\" strategy. But if you know the hardware of the machine you are on, you can specify it explicitly, for example:
everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml -d 1 -a gpu\n
Which would use the GPU accelerator (-a gpu
) and specify 1 device/chip (-d 1
).
"},{"location":"guides/custom/#step-7-train-your-feature-prediction-network","title":"Step 7: Train your Feature Prediction Network","text":"To generate audio when you train your feature prediction network, you need to add your vocoder checkpoint to the config/everyvoice-text-to-spec.yaml
At the bottom of that file you'll find a key called vocoder_path
. Add the absolute path to your trained vocoder (here it would be /path/to/test/logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt
where /path/to
would be the actual path to it on your computer.)
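For example, the bottom of config/everyvoice-text-to-spec.yaml would then contain a line like the following, with the placeholder path replaced by the real location of your vocoder checkpoint:
vocoder_path: /path/to/test/logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt\n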
Once you've replaced the vocoder_path
key, you can train your feature prediction network:
everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml\n
Tip
While your model is training, you can use TensorBoard to view the logs, which will show information about the progress of training and display spectrogram images. If you have provided a vocoder_path
key, then you will also be able to hear audio in the logs. To use TensorBoard, make sure that your conda environment is activated and run tensorboard --logdir path/to/logs_and_checkpoints
. Then your logs will be viewable at http://localhost:6006.
"},{"location":"guides/custom/#step-8-optional-finetune-your-vocoder","title":"Step 8 (optional): Finetune your Vocoder","text":"When you have finished training your Feature Prediction Network, we recommend finetuning your vocoder. This step is optional, but it will help get rid of metallic artefacts that are often present if you don't finetune your vocoder. Note, it will likely not help with any mispronounciations. If you notice these types of errors, it is likely due to issues with the training data (e.g. too much variation in pronunciation or recording quality in the dataset, or discrepencies between the recording and transcription.)
"},{"location":"guides/custom/#step-9-synthesize-speech-in-your-language","title":"Step 9: Synthesize Speech in Your Language!","text":""},{"location":"guides/custom/#command-line","title":"Command Line","text":"You can synthesize by pointing the CLI to your trained feature prediction network and passing in the text. You can export the wav or spectrogram (pt) files.
everyvoice synthesize from-text logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt -t \"\u0db8\u0dd9\u0daf\u0dcf \u0dc3\u0dd0\u0dbb\u0dda \u0dc3\u0dcf\u0d9a\u0da0\u0dca\u0da1\u0dcf\u0dc0\u0d9a\u0dca \u0dc0\u0dd2\u0daf\u0dd2\u0dba\u0da7 \u0db1\u0dd9\u0dc0\u0dd9\u0dba\u0dd2 \u0db1\u0dda\u0daf \u0db4\u0dbd \u0d9a\u0dbb\u0dbd \u0dad\u0dd2\u0dba\u0dd9\u0db1\u0dca\u0db1\u0dd9\" -a gpu -d 1 --output-type wav\n
"},{"location":"guides/custom/#demo-app","title":"Demo App","text":"You can also synthesize audio by starting up the EveryVoice Demo using your Feature Prediction and Vocoder checkpoints:
everyvoice demo logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt\n
And an interactive demo will be available at http://localhost:7260
"},{"location":"guides/ethics/","title":"Should you build a TTS Model? What are the possible negative outcomes?","text":"So, you're here because you want to build a TTS system - maybe for a language that doesn't have an existing one, but just because you can build a text-to-speech system, doesn't mean you should. The most important step of a new text-to-speech project is to consider what possible ethical problems could arise from the technology and which goals you are hoping to achieve with it. This section will walk you through some important questions to consider; you might also find that they apply broadly to other technology projects.
"},{"location":"guides/ethics/#check-before-you-tech","title":"Check Before you Tech!","text":"Technology is flashy, and it seems like you can hardly turn a corner without someone talking about AI. However, as the excitment about the possibilities of this technology grow, so too have the cautionary warnings12. Amidst all the flurry of activity and hype - the fundamental question of why are we building technology X, Y, or Z should hopefully come up. What goals are we hoping to achieve, and what new problems might we be introducing with a new technology?
The following sections provide a couple of questions based on the excellent \"Check Before you Tech\" guide for choosing language technology in a language revitalization context. While the original guide is geared towards technology users, we target our questions to technology developers and researchers. We urge you to consider these questions before beginning your TTS project.
Note
This list is not intended to be a comprehensive list of all the ethical questions to consider, but rather a starting point for discussing and considering the impacts of the technology you are potentially creating.
"},{"location":"guides/ethics/#where-is-the-data-coming-from-do-you-have-explicit-permission-from-the-creator-of-the-data-and-the-speaker","title":"Where is the data coming from? Do you have explicit permission from the creator of the data, and the speaker?","text":"It is not ethical to build a TTS system with data that you do not have permission to use. You should not scrape or re-purpose data that you find online to build TTS systems unless the data comes with explicit permissions to do so.
For TTS, you are building a model of someone's likeness, so you should make sure that you have obtained permission from the data creator as well as the speaker whose likeness will be modeled. When permission is asked for, you should be clear with the person(s) about what the technology could be used for.
If you do not have enough time/resources to ask this question and obtain permission from all the relevant stakeholders, you should not build TTS models with the data.
"},{"location":"guides/ethics/#what-is-your-goal-how-will-tts-help-you-meet-that-goal","title":"What is your goal? How will TTS help you meet that goal?","text":"As mentioned above, it's important to think about what you are actually trying to achieve with TTS. Not only will this help you determine whether EveryVoice TTS is the right toolkit for your application, but it will also help you determine whether you need to spend all the time and resources necessary to build a TTS system in the first place.
We invite you to consider whether your goal is serving you, or whether it is serving the people whose language you're working with. And if the answer is the latter, how do you know that, and how are you ensuring that it continues to be true? When discussing the project with relevant stakeholders, you should also mention any other goals you have in building this technology (e.g. publishing papers).
"},{"location":"guides/ethics/#where-is-the-model-going-to-be-stored-who-has-control-and-access-to-the-model-and-who-has-ownership","title":"Where is the model going to be stored? Who has control and access to the model and who has ownership?","text":"If the speaker or permissions-holders for the data or models change their mind about participation, how easy is it for them to stop the model? Do they have access to a 'kill switch'? Do they need to contact someone and make a request? Are there assurances about how long these requests will take to be processed?
Are there clear, agreed-upon guidelines for who has access to the model and data? Who maintains the control and access to these resources? In Canada, we encourage users to engage with the First Nations Principles of OCAP\u00ae when planning a project.
"},{"location":"guides/ethics/#what-are-the-possible-risks-associated-with-this-technology-and-how-will-i-mitigate-them","title":"What are the possible risks associated with this technology and how will I mitigate them?","text":"Have you considered and discussed possible risks with the relevant stakeholders? Spend some time imagining ways that the tool could be misused, either by accidental or malicious actors. What if the model makes pronunciation mistakes? Will that embarrass the speaker? What if the model is made to say inappropriate things? What plans do you have to mitigate these risks?
Since this technology is relatively new, it can sometimes be hard to consider the ways that a technology can be misused; however, we already see examples where TTS models are being used to generate fake news3. Can you think of ways that similar so-called 'deep fakes' or impersonations could be used to cause harm?
-
Emily M. Bender, Timnit Gebru, Angelina McMillan-Major, and Shmargaret Shmitchell. 2021. On the Dangers of Stochastic Parrots: Can Language Models Be Too Big? \ud83e\udd9c. In Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency (FAccT '21). Association for Computing Machinery, New York, NY, USA, 610\u2013623. https://doi.org/10.1145/3442188.3445922 \u21a9
-
Marie-Odile Junker. 2024. Data-mining and Extraction: the gold rush of AI on Indigenous Languages. In Proceedings of the Seventh Workshop on the Use of Computational Methods in the Study of Endangered Languages, pages 52\u201357, St. Julians, Malta. Association for Computational Linguistics. https://aclanthology.org/2024.computel-1.8/ \u21a9
-
https://nypost.com/2024/06/14/us-news/michigan-gop-candidate-anthony-hudson-stands-by-ai-generated-mlk-jr-endorsement-video/ \u21a9
"},{"location":"guides/finetune/","title":"How to fine-tune the existing checkpoints","text":""},{"location":"guides/finetune/#vocoder-matching","title":"Vocoder matching","text":"Vocoder (i.e. your spec-to-wav model) matching is an important part of the TTS pipeline. Because your spec-to-wav model is trained with the ground-truth Mel spectrograms from your audio, there is a mismatch between the Mel spectrograms created by your text-to-spec model and the ones that the pre-trained vocoders have seen during training. For that reason, it can be helpful to fine-tune your spec-to-wav model with the generated Mel spectrograms from your text-to-spec model.
Note
Vocoder matching will only help with the metallic artefacts that sometimes occur when synthesizing speech. If your model is not intelligible or has other types of errors like mispronunciations, vocoder matching will not solve them. In these cases, the problem is likely with your text-to-spec model, and probably due to either noisy data (noisy recordings, mistranscriptions etc), too little data, or data that is too varied (many different speakers). Please refer to TODO: troubleshooting for more information.
To finetune your spec-to-wav model with Mel spectrograms from your text-to-spec model (also called 'vocoder matching'), you need to have a pre-trained text-to-spec and spec-to-wav model ready. You also need to have access to some parallel text/audio data (the same or similar data that you used to train your text-to-spec model).
Then you:
-
Generate a folder full of Mel spectrograms from your text-to-spec model (repeat this process for both your training and validation filelists):
Training filelist:
everyvoice synthesize from-text <path-to-your-text-to-spec.ckpt> -O spec --filelist <path-to-your-training-filelist.psv> --teacher-forcing-directory <path-to-your-preprocessed-directory> --output-dir <path-to-your-preprocessed-directory>\n
Validation filelist:
everyvoice synthesize from-text <path-to-your-text-to-spec.ckpt> -O spec --filelist <path-to-your-validation-filelist.psv> --teacher-forcing-directory <path-to-your-preprocessed-directory> --output-dir <path-to-your-preprocessed-directory>\n
Note
For vocoder matching to work, the size of the generated Mel spectrogram has to be the same as the ground-truth Mel spectrogram calculated from the audio, so you have to use 'teacher-forcing' to force the text-to-spec model to output spectrograms of a specific size. To do this, we add the --teacher-forcing-directory option and point it to the project's preprocessed
directory with the processed files from our filelist. This will write a synthesized_spec
folder within your preprocessed
directory, which you can use instead of the ground-truth spec
data by setting finetune
to True as described in the next step.
-
Set the finetune_ckpt value to point to the vocoder checkpoint that you want to fine-tune.
-
Lower the learning rate (we suggest starting at 0.00001)
-
Train the vocoder again with finetuning set to True (train for at least 25000 steps):
everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml -c training.finetune=True\n
"},{"location":"guides/schemas/","title":"How to Setup Code Completion for Schemas in vim","text":"When manually editing EveryVoice's configuration files, it is convenient to have the file checked/validated and to have documentation about each field. The following setup will work for json and yaml configuration files.
"},{"location":"guides/schemas/#install-nodejs","title":"Install nodejs
","text":"You will need to have a functional npm
which is part of nodejs
. The schemas will be verified using a node process.
"},{"location":"guides/schemas/#install-vim-plug","title":"Install vim-plug","text":"vim-plug: Minimalist Vim Plugin Manager This will take care of install vim's extensions for us.
curl \\\n --create-dirs \\\n -fLo ~/.vim/autoload/plug.vim \\\n https://raw.githubusercontent.com/junegunn/vim-plug/master/plug.vim\n
"},{"location":"guides/schemas/#augment-your-vimrc","title":"Augment your .vimrc
","text":"We want to install Conquer of Completion aka coc We will add a plugins for coc-json and coc-yaml which will be used to handle json and yaml files. Let's also add some key bindings to access coc.nvim
's functionalities. Refer to Example Vim configuration for a more complete example.
call plug#begin()\nPlug 'neoclide/coc.nvim', { 'do': 'npm ci' }\n\nlet g:coc_disable_startup_warning = 1\n\n\" Use tab for trigger completion with characters ahead and navigate\n\" NOTE: There's always complete item selected by default, you may want to enable\n\" no select by `\"suggest.noselect\": true` in your configuration file\n\" NOTE: Use command ':verbose imap <tab>' to make sure tab is not mapped by\n\" other plugin before putting this into your config\ninoremap <silent><expr> <TAB>\n\\ coc#pum#visible() ? coc#pum#next(1) :\n\\ CheckBackspace() ? \"\\<Tab>\" :\n\\ coc#refresh()\ninoremap <expr><S-TAB> coc#pum#visible() ? coc#pum#prev(1) : \"\\<C-h>\"\n\n\" Make <CR> to accept selected completion item or notify coc.nvim to format\n\" <C-g>u breaks current undo, please make your own choice\ninoremap <silent><expr> <CR> coc#pum#visible() ? coc#pum#confirm()\n\\: \"\\<C-g>u\\<CR>\\<c-r>=coc#on_enter()\\<CR>\"\n\nfunction! CheckBackspace() abort\n let col = col('.') - 1\n return !col || getline('.')[col - 1] =~# '\\s'\nendfunction\n\n\" Use <c-space> to trigger completion\nif has('nvim')\n inoremap <silent><expr> <c-space> coc#refresh()\nelse\n inoremap <silent><expr> <c-@> coc#refresh()\nendif\n\n\" Use `[g` and `]g` to navigate diagnostics\n\" Use `:CocDiagnostics` to get all diagnostics of current buffer in location list\nnmap <silent> [g <Plug>(coc-diagnostic-prev)\nnmap <silent> ]g <Plug>(coc-diagnostic-next)\n\n\" GoTo code navigation\nnmap <silent> gd <Plug>(coc-definition)\nnmap <silent> gy <Plug>(coc-type-definition)\nnmap <silent> gi <Plug>(coc-implementation)\nnmap <silent> gr <Plug>(coc-references)\n\n\" Use K to show documentation in preview window\nnnoremap <silent> K :call ShowDocumentation()<CR>\n\nfunction! ShowDocumentation()\n if CocAction('hasProvider', 'hover')\n call CocActionAsync('doHover')\n else\n call feedkeys('K', 'in')\n endif\nendfunction\n\n\" Highlight the symbol and its references when holding the cursor\nautocmd CursorHold * silent call CocActionAsync('highlight')\n\n\" Symbol renaming\nnmap <leader>rn <Plug>(coc-rename)\n\n\" Formatting selected code\nxmap <leader>F <Plug>(coc-format-selected)\nnmap <leader>F <Plug>(coc-format-selected)\n\naugroup mygroup\nautocmd!\n\" Setup formatexpr specified filetype(s)\nautocmd FileType typescript,json setl formatexpr=CocAction('formatSelected')\n\" Update signature help on jump placeholder\nautocmd User CocJumpPlaceholder call CocActionAsync('showSignatureHelp')\naugroup end\n\n\" Applying code actions to the selected code block\n\" Example: `<leader>aap` for current paragraph\nxmap <leader>a <Plug>(coc-codeaction-selected)\nnmap <leader>a <Plug>(coc-codeaction-selected)\n\n\" Remap keys for applying code actions at the cursor position\nnmap <leader>ac <Plug>(coc-codeaction-cursor)\n\" Remap keys for apply code actions affect whole buffer\nnmap <leader>as <Plug>(coc-codeaction-source)\n\" Apply the most preferred quickfix action to fix diagnostic on the current line\nnmap <leader>qf <Plug>(coc-fix-current)\n\n\" Remap keys for applying refactor code actions\nnmap <silent> <leader>re <Plug>(coc-codeaction-refactor)\nxmap <silent> <leader>r <Plug>(coc-codeaction-refactor-selected)\nnmap <silent> <leader>r <Plug>(coc-codeaction-refactor-selected)\n\n\" Run the Code Lens action on the current line\nnmap <leader>cl <Plug>(coc-codelens-action)\n\n\" Map function and class text objects\n\" NOTE: Requires 'textDocument.documentSymbol' support from the language server\nxmap if <Plug>(coc-funcobj-i)\nomap if 
<Plug>(coc-funcobj-i)\nxmap af <Plug>(coc-funcobj-a)\nomap af <Plug>(coc-funcobj-a)\nxmap ic <Plug>(coc-classobj-i)\nomap ic <Plug>(coc-classobj-i)\nxmap ac <Plug>(coc-classobj-a)\nomap ac <Plug>(coc-classobj-a)\n\n\" Remap <C-f> and <C-b> to scroll float windows/popups\nif has('nvim-0.4.0') || has('patch-8.2.0750')\nnnoremap <silent><nowait><expr> <C-f> coc#float#has_scroll() ? coc#float#scroll(1) : \"\\<C-f>\"\nnnoremap <silent><nowait><expr> <C-b> coc#float#has_scroll() ? coc#float#scroll(0) : \"\\<C-b>\"\ninoremap <silent><nowait><expr> <C-f> coc#float#has_scroll() ? \"\\<c-r>=coc#float#scroll(1)\\<cr>\" : \"\\<Right>\"\ninoremap <silent><nowait><expr> <C-b> coc#float#has_scroll() ? \"\\<c-r>=coc#float#scroll(0)\\<cr>\" : \"\\<Left>\"\nvnoremap <silent><nowait><expr> <C-f> coc#float#has_scroll() ? coc#float#scroll(1) : \"\\<C-f>\"\nvnoremap <silent><nowait><expr> <C-b> coc#float#has_scroll() ? coc#float#scroll(0) : \"\\<C-b>\"\nendif\n\n\" Use CTRL-S for selections ranges\n\" Requires 'textDocument/selectionRange' support of language server\nnmap <silent> <C-s> <Plug>(coc-range-select)\nxmap <silent> <C-s> <Plug>(coc-range-select)\n\n\" Add `:Format` command to format current buffer\ncommand! -nargs=0 Format :call CocActionAsync('format')\n\n\" Add `:Fold` command to fold current buffer\ncommand! -nargs=? Fold :call CocAction('fold', <f-args>)\n\n\" Add `:OR` command for organize imports of the current buffer\ncommand! -nargs=0 OR :call CocActionAsync('runCommand', 'editor.action.organizeImport')\n\n\" Add (Neo)Vim's native statusline support\n\" NOTE: Please see `:h coc-status` for integrations with external plugins that\n\" provide custom statusline: lightline.vim, vim-airline\nset statusline^=%{coc#status()}%{get(b:,'coc_current_function','')}\n\n\" TODO Space is our Leader, this might interfer with the following:\n\" Mappings for CoCList\n\" Show all diagnostics\nnnoremap <silent><nowait> <space>a :<C-u>CocList diagnostics<cr>\n\" Manage extensions\nnnoremap <silent><nowait> <space>e :<C-u>CocList extensions<cr>\n\" Show commands\nnnoremap <silent><nowait> <space>c :<C-u>CocList commands<cr>\n\" Find symbol of current document\nnnoremap <silent><nowait> <space>o :<C-u>CocList outline<cr>\n\" Search workspace symbols\nnnoremap <silent><nowait> <space>s :<C-u>CocList -I symbols<cr>\n\" Do default action for next item\nnnoremap <silent><nowait> <space>j :<C-u>CocNext<CR>\n\" Do default action for previous item\nnnoremap <silent><nowait> <space>k :<C-u>CocPrev<CR>\n\" Resume latest coc list\nnnoremap <silent><nowait> <space>p :<C-u>CocListResume<CR>\ncall plug#end()\n
"},{"location":"guides/schemas/#install-the-new-plugins","title":"Install the New Plugins","text":"Plugins don't automatically install themself thus you have to run the following command to install them. Start vim
then do
vim +PlugInstall \"+:CocInstall coc-json\" \"+:CocInstall coc-yaml\" +:qall\n
"},{"location":"guides/schemas/#compile-cocnvim","title":"Compile coc.nvim","text":"Once your plugins are installed, you will need to compile coc.
cd ~/.vim/plugged/coc.nvim\nnpm ci\n
"},{"location":"guides/schemas/#create-coc-settingsjson","title":"Create Coc-settings.json","text":"Start vim
and run the command :CocConfig
to edit where your everyvoice schemas are located. The following example assumes that you have clone EveryVoice into ~/git/EveryVoice
. Make the proper modifications to match where you have cloned EveryVoice. Also note that you have to change /home/username
with your own username in the yaml section.
{\n \"json.schemas\": [\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-aligner.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-shared-data.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-shared-text.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-spec-to-wav.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-text-to-spec.json\"\n ]\n },\n {\n \"url\": \"file://${userHome}/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-0.1.json\",\n \"fileMatch\": [\n \"everyvoice-text-to-wav.json\"\n ]\n }\n ],\n \"yaml.schemas\": {\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-aligner-0.1.json\": [\n \"everyvoice-aligner.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-data-0.1.json\": [\n \"everyvoice-shared-data.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-shared-text-0.1.json\": [\n \"everyvoice-shared-text.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-spec-to-wav-0.1.json\": [\n \"everyvoice-spec-to-wav.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-spec-0.1.json\": [\n \"everyvoice-text-to-spec.yaml\"\n ],\n \"file://home/username/git/EveryVoice/everyvoice/.schema/everyvoice-text-to-wav-0.1.json\": [\n \"everyvoice-text-to-wav.yaml\"\n ]\n }\n}\n
"},{"location":"guides/schemas/#usage","title":"Usage","text":"Once everything is installed, start editing a new or existing EveryVoice configuration.
vim everyvoice-shared-data.json\n
Then use CTRL+<space>
to trigger completion.
"},{"location":"reference/","title":"Reference","text":"Here is where you will find information about the various models implemented in EveryVoice. This section will include some fairly technical details. If you just want to build a model using default settings and configurations, please visit the documentation on the guides
"},{"location":"reference/configuration/","title":"Configuration","text":"Each model has a statically typed configuration model. Each configuration has default settings that will be instantiated when the model is instantiated. To create a default preprocessing configuration for example you would:
from everyvoice.config.preprocessing_config import PreprocessingConfig\n\npreprocessing_config = PreprocessingConfig()\n
Static typing means that misconfiguration errors should occur as soon as the configuration is instantiated instead of producing downstream runtime errors. It also means that intellisense is available in your code editor when working with a configuration class.
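As a minimal sketch of what this buys you (using the TextConfig described later in this reference, whose to_replace field expects a mapping of strings to strings), passing a value of the wrong type raises a pydantic ValidationError at construction time rather than failing later during training:
from pydantic import ValidationError\n\nfrom everyvoice.config.text_config import TextConfig\n\ntry:\n    # to_replace must be a Dict[str, str], so an integer is rejected immediately\n    TextConfig(to_replace=42)\nexcept ValidationError as err:\n    print(err)\n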
"},{"location":"reference/configuration/#sharing-configurations","title":"Sharing Configurations","text":"The Text and Preprocessing configurations should only be defined once per dataset and shared between your models to ensure each model makes the same assumptions about your data. To achieve that, each model configuration can also be defined as a path to a configuration file. So, a configuration for an aligner that uses separately defined text and audio preprocessing configurations might look like this:
model:\n lstm_dim: 512\n conv_dim: 512\n ...\ntraining:\n batch_size: 32\n ...\npreprocessing: \"./config/default/everyvoice-shared-data.yaml\"\ntext: \"./config/default/everyvoice-shared-text.yaml\"\n
"},{"location":"reference/configuration/#serialization","title":"Serialization","text":"By default configuration objects are serialized as dictionaries, which works as expected with integers, floats, lists, booleans, dicts etc. But there are some cases where you need to specify a Callable in your configuration. For example the {ref}TextConfig
has a cleaners
field that takes a list of Callables to apply in order to raw text. By default, these functions turn raw text to lowercase, collapse whitespace, and normalize using Unicode NFC normalization. In Python, we could instantiate this by passing the callables directly like so:
from everyvoice.config.text_config import TextConfig\nfrom everyvoice.utils import collapse_whitespace, lower, nfc_normalize\n\ntext_config = TextConfig(cleaners=[lower, collapse_whitespace, nfc_normalize])\n
But, for yaml or json configuration, we need to serialize these functions. To do so, EveryVoice will turn each callable into module dot-notation. That is, your configuration will look like this in yaml:
cleaners:\n - everyvoice.utils.lower\n - everyvoice.utils.collapse_whitespace\n - everyvoice.utils.nfc_normalize\n
This will then be de-serialized upon instantiation of your configuration.
"},{"location":"reference/configuration/#text-configuration","title":"Text Configuration","text":"The TextConfig is where you define the symbol set for your data and any cleaners used to clean your raw text into the text needed for your data. You can share the TextConfig with any models that need it and only need one text configuration per dataset (and possibly only per language).
"},{"location":"reference/configuration/#textconfig","title":"TextConfig","text":""},{"location":"reference/configuration/#everyvoice.config.text_config.TextConfig","title":"everyvoice.config.text_config.TextConfig
","text":" Bases: ConfigModel
Source code in everyvoice/config/text_config.py
class TextConfig(ConfigModel):\n symbols: Symbols = Field(default_factory=Symbols)\n to_replace: Dict[str, str] = {} # Happens before cleaners\n cleaners: list[PossiblySerializedCallable] = [\n collapse_whitespace,\n ]\n\n @model_validator(mode=\"after\")\n def clean_symbols(self) -> \"TextConfig\":\n \"\"\"We should apply all cleaners to the symbols\n\n Returns:\n TextConfig: a text config with cleaned symbols\n \"\"\"\n for k, v in self.symbols:\n if k not in [\"punctuation\", \"silence\"]:\n setattr(\n self.symbols,\n k,\n [\n normalize_text_helper(x, self.to_replace, self.cleaners)\n for x in v\n ],\n )\n return self\n
cleaners: list[PossiblySerializedCallable] = [collapse_whitespace]
class-attribute
instance-attribute
\u00b6 symbols: Symbols = Field(default_factory=Symbols)
class-attribute
instance-attribute
\u00b6 to_replace: Dict[str, str] = {}
class-attribute
instance-attribute
\u00b6"},{"location":"reference/configuration/#symbols","title":"Symbols","text":"Your symbol set is created by taking the union of all values defined. For example:
symbols:\n dataset_0_characters: ['a', 'b', 'c']\n dataset_1_characters: ['b', 'c', 'd']\n
Will create a symbol set equal to {'a', 'b', 'c', 'd'}
(i.e. the union of both key/values). This allows you to train models with data from different languages, for example.
Important
You should always manually inspect your configuration here to make sure it makes sense with respect to your data. Is there a symbol that shouldn't be there? Is there a symbol that's defined as 'punctuation' but is used as non-punctuation in your language? Please inspect these and update the configuration accordingly.
"},{"location":"reference/configuration/#everyvoice.config.text_config.Symbols","title":"everyvoice.config.text_config.Symbols
","text":" Bases: BaseModel
Source code in everyvoice/config/text_config.py
class Symbols(BaseModel):\n silence: list[str] = Field(\n [\"<SIL>\"], description=\"The symbol(s) used to indicate silence.\"\n )\n punctuation: Punctuation = Field(\n default_factory=Punctuation,\n description=\"EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training.\",\n )\n model_config = ConfigDict(extra=\"allow\")\n\n @property\n def all_except_punctuation(self) -> set[str]:\n \"\"\"Returns the set containing all characters.\"\"\"\n return set(w for _, v in self if not isinstance(v, Punctuation) for w in v)\n\n @model_validator(mode=\"after\")\n def member_must_be_list_of_strings(self) -> \"Symbols\":\n \"\"\"Except for `punctuation` & `pad`, all user defined member variables\n have to be a list of strings.\n \"\"\"\n for k, v in self:\n if isinstance(v, Punctuation):\n continue\n if k == \"pad\":\n continue\n if not isinstance(v, list) or not all(isinstance(e, str) for e in v):\n raise ValueError(f\"{k} must be a list\")\n\n return self\n
all_except_punctuation: set[str]
property
\u00b6 Returns the set containing all characters.
member_must_be_list_of_strings()
\u00b6 Except for punctuation
& pad
, all user defined member variables have to be a list of strings.
Source code in everyvoice/config/text_config.py
@model_validator(mode=\"after\")\ndef member_must_be_list_of_strings(self) -> \"Symbols\":\n \"\"\"Except for `punctuation` & `pad`, all user defined member variables\n have to be a list of strings.\n \"\"\"\n for k, v in self:\n if isinstance(v, Punctuation):\n continue\n if k == \"pad\":\n continue\n if not isinstance(v, list) or not all(isinstance(e, str) for e in v):\n raise ValueError(f\"{k} must be a list\")\n\n return self\n
"}]}
\ No newline at end of file
diff --git a/latest b/latest
index 7505acfd..90012116 120000
--- a/latest
+++ b/latest
@@ -1 +1 @@
-v0.1.0a2
\ No newline at end of file
+dev
\ No newline at end of file
diff --git a/versions.json b/versions.json
index f8e44aad..95d7746e 100644
--- a/versions.json
+++ b/versions.json
@@ -2,13 +2,14 @@
{
"version": "dev",
"title": "dev",
- "aliases": []
+ "aliases": [
+ "latest"
+ ]
},
{
"version": "v0.1.0a2",
"title": "v0.1.0a2",
"aliases": [
- "latest",
"stable"
]
}