From 0661d0ab97e763b281294039b2cfe4c322af0604 Mon Sep 17 00:00:00 2001
From: Martin Rys
Date: Tue, 13 Aug 2024 15:38:17 +0200
Subject: [PATCH 1/8] Bump up Dockerfile and compose to newer syntax format, Torch 2.4.0, CUDA 12.1

Also rename nvidia dockerfile to sort with other dockerfiles
---
 Dockerfile                                  |  33 +-
 Dockerfile.nvidia                           |  30 ++
 docker-compose.yml                          |   3 +-
 nvidia.Dockerfile                           |  22 --
 system/requirements/requirements_docker.txt | 373 +++++++++++---------
 5 files changed, 252 insertions(+), 209 deletions(-)
 create mode 100644 Dockerfile.nvidia
 delete mode 100644 nvidia.Dockerfile

diff --git a/Dockerfile b/Dockerfile
index 2eaeea4e..379282bb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,20 +1,29 @@
-FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
-ENV HOST 0.0.0.0
-RUN apt-get update && apt-get upgrade -y \
-    && apt-get install -y git build-essential portaudio19-dev \
-    python3 python3-pip gcc wget \
-    ocl-icd-opencl-dev opencl-headers clinfo \
-    libclblast-dev libopenblas-dev libaio-dev \
-    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
-    && ln -s /usr/bin/python3 /usr/bin/python
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
+ENV HOST=0.0.0.0
+
+RUN <<EOR
+apt-get update && apt-get upgrade -y
+apt-get install -y git build-essential portaudio19-dev \
+    python3 python3-pip gcc wget \
+    ocl-icd-opencl-dev opencl-headers clinfo \
+    libclblast-dev libopenblas-dev libaio-dev
+
+mkdir -p /etc/OpenCL/vendors
+echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+apt-get clean && rm -rf /var/lib/apt/lists/*
+EOR

 WORKDIR /app
 ENV CUDA_DOCKER_ARCH=all
 COPY system/requirements/requirements_docker.txt system/requirements/requirements_docker.txt
-RUN pip install --no-cache-dir --no-deps -r system/requirements/requirements_docker.txt && \
-    pip install --no-cache-dir deepspeed
+
 COPY . .
+RUN < /etc/OpenCL/vendors/nvidia.icd
+apt-get clean && rm -rf /var/lib/apt/lists/*
+EOR
+
+WORKDIR /app
+ENV CUDA_DOCKER_ARCH=all
+ENV PIP_CACHE_DIR=/app/pip_cache
+RUN mkdir /app/pip_cache
+
+COPY . .
+RUN < /etc/OpenCL/vendors/nvidia.icd \
-    && ln -s /usr/bin/python3 /usr/bin/python
-
-RUN mkdir /app/pip_cache
-ENV PIP_CACHE_DIR=/app/pip_cache
-
-WORKDIR /app
-COPY . .
-ENV CUDA_DOCKER_ARCH=all -RUN pip install --no-cache-dir --no-deps -r system/requirements/requirements_standalone.txt --cache-dir=/app/pip_cache && \ - pip install --no-cache-dir deepspeed --cache-dir=/app/pip_cache - -EXPOSE 7851 -RUN chmod +x launch.sh -ENTRYPOINT ["sh", "-c", "./launch.sh"] \ No newline at end of file diff --git a/system/requirements/requirements_docker.txt b/system/requirements/requirements_docker.txt index 48b29b56..eacdb94e 100644 --- a/system/requirements/requirements_docker.txt +++ b/system/requirements/requirements_docker.txt @@ -1,173 +1,200 @@ -absl-py==2.1.0 -aiofiles==23.2.1 -aiohttp==3.9.3 -aiosignal==1.3.1 -altair==5.2.0 -annotated-types==0.6.0 -anyascii==0.3.2 -anyio==4.3.0 -attrs==23.2.0 -audioread==3.0.1 -av==11.0.0 -Babel==2.14.0 -bangla==0.0.2 -blinker==1.7.0 -blis==0.7.11 -bnnumerizer==0.0.2 -bnunicodenormalizer==0.1.6 -catalogue==2.0.10 -certifi==2024.2.2 -cffi==1.16.0 -charset-normalizer==3.3.2 -click==8.1.7 -cloudpathlib==0.16.0 -colorama==0.4.6 -coloredlogs==15.0.1 -confection==0.1.4 -contourpy==1.2.0 -coqpit==0.0.17 -ctranslate2==4.1.0 -cutlet==0.4.0 -cycler==0.12.1 -cymem==2.0.8 -Cython==3.0.9 -dateparser==1.1.8 -decorator==5.1.1 -docopt==0.6.2 -einops==0.7.0 -encodec==0.1.1 -exceptiongroup -fastapi==0.110.0 -faster-whisper==1.0.1 -ffmpy==0.3.2 -filelock==3.13.3 -Flask==3.0.2 -flatbuffers==24.3.25 -fonttools==4.50.0 -frozenlist==1.4.1 -fsspec==2024.3.1 -fugashi==1.3.1 -fuzzywuzzy>=0.18.0 -g2pkk==0.1.2 -gradio==3.50.2 -gradio_client==0.6.1 -grpcio==1.62.1 -gruut==2.2.3 -gruut-ipa==0.13.0 -gruut-lang-de==2.0.0 -gruut-lang-en==2.0.0 -gruut-lang-es==2.0.0 -gruut-lang-fr==2.0.2 -h11==0.14.0 -hangul-romanize==0.1.0 -httpcore==1.0.4 -httpx==0.27.0 -huggingface-hub==0.22.1 -humanfriendly==10.0 -idna==3.6 -importlib_metadata==7.1.0 -importlib_resources==6.4.0 -inflect==7.0.0 -itsdangerous==2.1.2 -jaconv==0.3.4 -jamo==0.4.1 -jieba==0.42.1 -Jinja2==3.1.3 -joblib==1.3.2 -jsonlines==1.2.0 -jsonschema==4.21.1 -jsonschema-specifications==2023.12.1 -kiwisolver==1.4.5 -langcodes==3.3.0 -lazy_loader==0.3 -librosa==0.10.1 -llvmlite==0.42.0 -Markdown==3.6 -MarkupSafe==2.1.5 -matplotlib==3.8.3 -mojimoji==0.0.13 -mpmath==1.3.0 -msgpack==1.0.8 -multidict==6.0.5 -murmurhash==1.0.10 -networkx==2.8.8 -nltk==3.8.1 -num2words==0.5.13 -numba==0.59.1 -numpy==1.26.4 -nvidia-cublas-cu11>=11.11.3.6 -nvidia-cudnn-cu11>=9.0.0.312 -onnxruntime==1.17.1 -orjson==3.9.15 -packaging==24.0 -pandas==1.5.3 -pillow==10.2.0 -platformdirs==4.2.0 -pooch==1.8.1 -preshed==3.0.9 -protobuf==5.26.0 -psutil==5.9.8 -pycparser==2.21 -pydantic==2.6.4 -pydantic_core==2.16.3 -pydub==0.25.1 -pynndescent==0.5.11 -pyparsing==3.1.2 -pypinyin==0.51.0 -pyreadline3==3.4.1 -pysbd==0.3.4 -python-crfsuite==0.9.10 -python-dateutil==2.9.0.post0 -python-Levenshtein>=0.25.0 -python-multipart==0.0.9 -pytz==2024.1 -PyYAML==6.0.1 -referencing==0.34.0 -regex==2023.12.25 -requests==2.31.0 -rpds-py==0.18.0 -safetensors==0.4.2 -scikit-learn==1.4.1.post1 -scipy==1.12.0 -semantic-version==2.10.0 -six==1.16.0 -smart-open==6.4.0 -sniffio==1.3.1 -sounddevice==0.4.6 -soundfile==0.12.1 -soxr==0.3.7 -spacy==3.7.4 -spacy-legacy==3.0.12 -spacy-loggers==1.0.5 -srsly==2.4.8 -starlette==0.36.3 -SudachiDict-core==20240109 -SudachiPy==0.6.8 -sympy==1.12 -tensorboard==2.16.2 -tensorboard-data-server==0.7.2 -thinc==8.2.3 -threadpoolctl==3.4.0 -tokenizers==0.15.2 -toolz==0.12.1 -torch>=2.2.0 -torchaudio>=2.2.0 -tqdm==4.66.2 -trainer==0.0.36 -transformers==4.39.1 -TTS==0.22.0 -typer==0.9.4 -typing_extensions==4.10.0 -tzdata==2024.1 
-tzlocal==5.2 -umap-learn==0.5.5 -Unidecode==1.3.8 -unidic-lite==1.0.8 -urllib3==2.2.1 -uvicorn==0.29.0 -wasabi==1.1.2 -weasel==0.3.4 -websockets==11.0.3 -Werkzeug==3.0.1 -yarl==1.9.4 -zipp==3.18.1 +absl-py==2.1.0 +aiofiles==23.2.1 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 +aiosignal==1.3.1 +altair==5.4.0 +annotated-types==0.7.0 +anyascii==0.3.2 +anyio==4.4.0 +attrs==24.2.0 +audioread==3.0.1 +av==12.3.0 +babel==2.16.0 +bangla==0.0.2 +blinker==1.8.2 +blis==0.7.11 +bnnumerizer==0.0.2 +bnunicodenormalizer==0.1.7 +catalogue==2.0.10 +certifi==2024.7.4 +cffi==1.17.0 +charset-normalizer==3.3.2 +click==8.1.7 +cloudpathlib==0.18.1 +colorama==0.4.6 +coloredlogs==15.0.1 +confection==0.1.5 +contourpy==1.2.1 +coqpit==0.0.17 +ctranslate2==4.3.1 +cutlet==0.4.0 +cycler==0.12.1 +cymem==2.0.8 +Cython==3.0.11 +dateparser==1.1.8 +decorator==5.1.1 +docopt==0.6.2 +einops==0.8.0 +encodec==0.1.1 +exceptiongroup==1.2.2 +fastapi==0.112.0 +faster-whisper==1.0.3 +ffmpy==0.4.0 +filelock==3.15.4 +Flask==3.0.3 +flatbuffers==24.3.25 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +fugashi==1.3.2 +fuzzywuzzy==0.18.0 +g2pkk==0.1.2 +gradio==4.41.0 +gradio_client==1.3.0 +grpcio==1.65.4 +gruut==2.2.3 +gruut-ipa==0.13.0 +gruut_lang_de==2.0.1 +gruut_lang_en==2.0.1 +gruut_lang_es==2.0.1 +gruut_lang_fr==2.0.2 +h11==0.14.0 +hangul-romanize==0.1.0 +httpcore==1.0.5 +httpx==0.27.0 +huggingface-hub==0.24.5 +humanfriendly==10.0 +idna==3.7 +importlib_metadata==8.2.0 +importlib_resources==6.4.0 +inflect==7.3.1 +itsdangerous==2.2.0 +jaconv==0.4.0 +jamo==0.4.1 +jieba==0.42.1 +Jinja2==3.1.4 +joblib==1.4.2 +jsonlines==1.2.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +kiwisolver==1.4.5 +langcodes==3.4.0 +language_data==1.2.0 +lazy_loader==0.4 +Levenshtein==0.25.1 +librosa==0.10.2.post1 +llvmlite==0.43.0 +marisa-trie==1.2.0 +Markdown==3.6 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.1.post1 +mdurl==0.1.2 +mojimoji==0.0.13 +more-itertools==10.4.0 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +murmurhash==1.0.10 +narwhals==1.3.0 +networkx==2.8.8 +nltk==3.8.2 +num2words==0.5.13 +numba==0.60.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.20 +nvidia-nvtx-cu12==12.1.105 +onnxruntime==1.18.0 +orjson==3.10.7 +packaging==24.1 +pandas==1.5.3 +pillow==10.4.0 +platformdirs==4.2.2 +pooch==1.8.2 +preshed==3.0.9 +protobuf==5.27.3 +psutil==6.0.0 +pycparser==2.22 +pydantic==2.8.2 +pydantic_core==2.20.1 +pydub==0.25.1 +Pygments==2.18.0 +pynndescent==0.5.13 +pyparsing==3.1.2 +pypinyin==0.52.0 +pyreadline3==3.4.1 +pysbd==0.3.4 +python-crfsuite==0.9.10 +python-dateutil==2.9.0.post0 +python-Levenshtein==0.25.1 +python-multipart==0.0.9 +pytz==2024.1 +PyYAML==6.0.2 +rapidfuzz==3.9.6 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +ruff==0.5.7 +safetensors==0.4.4 +scikit-learn==1.5.1 +scipy==1.14.0 +semantic-version==2.10.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.4 +sniffio==1.3.1 +sounddevice==0.5.0 +soundfile==0.12.1 +soxr==0.4.0 +spacy==3.7.5 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 +starlette==0.37.2 +SudachiDict-core==20240716 +SudachiPy==0.6.8 +sympy==1.13.2 +tensorboard==2.16.2 +tensorboard-data-server==0.7.2 +thinc==8.2.4 +threadpoolctl==3.5.0 
+tokenizers==0.19.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch>=2.4.0
+torchaudio>=2.4.0
+tqdm==4.66.5
+trainer==0.0.36
+transformers==4.44.0
+triton==3.0.0
+TTS==0.22.0
+typeguard==4.3.0
+typer==0.12.3
+typing_extensions==4.12.2
+tzdata==2024.1
+tzlocal==5.2
+umap-learn==0.5.6
+Unidecode==1.3.8
+unidic-lite==1.0.8
+urllib3==2.2.2
+uvicorn==0.30.5
+wasabi==1.1.3
+weasel==0.4.1
+websockets==12.0
+Werkzeug==3.0.3
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.20.0

From c60bd699e60e67320663664736c8b5b1806f7c40 Mon Sep 17 00:00:00 2001
From: Martin Rys
Date: Tue, 13 Aug 2024 18:39:19 +0200
Subject: [PATCH 2/8] Switch to Coqui-TTS fork, Cublas 12 to fix TTS Analyze, use cu124 Torch builds

Also tosses out a lot of extra whitespace, you can hide those in Github diff view if needed.
---
 Dockerfile                                  |   5 +
 README.md                                   | 170 ++++++++++----------
 diagnostics.py                              |  22 +-
 finetune.py                                 | 117 +++++++-------
 modeldownload.py                            |  16 +-
 system/requirements/requirements_docker.txt |  55 ++++---
 system/tts_diff/tts_diff.py                 |  42 ++---
 7 files changed, 217 insertions(+), 210 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 379282bb..d3794a95 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,6 +9,11 @@ apt-get install -y git build-essential portaudio19-dev \
     ocl-icd-opencl-dev opencl-headers clinfo \
     libclblast-dev libopenblas-dev libaio-dev

+# Need this 440MB dep on 22.04 otherwise TTS Analyze is very sad if we don't have 11.8 CUDA and lack the dep:
+# Could not load library libcudnn_ops_infer.so.8. Error: libcudnn_ops_infer.so.8: cannot open shared object file: No such file or directory
+# https://github.com/SYSTRAN/faster-whisper/issues/516#issuecomment-2201088567
+apt-get install -y libcudnn8

 mkdir -p /etc/OpenCL/vendors
 echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
 apt-get clean && rm -rf /var/lib/apt/lists/*

diff --git a/README.md b/README.md
index 27c90c1a..d74e621e 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ AllTalk is an updated version of the Coqui_tts extension for Text Generation web
- 🔴 [Future to-do list & Upcoming updates](https://github.com/erew123/alltalk_tts?#-future-to-do-list)

---
-### 🛠️ **About this project & me** 
+### 🛠️ **About this project & me**
AllTalk is a labour of love that has been developed, supported and sustained in my personal free time. As a solo enthusiast (not a business or team) my resources are inherently limited. This project has been one of my passions, but I must balance it with other commitments. To manage AllTalk sustainably, I prioritize support requests based on their overall impact and the number of users affected. I encourage you to utilize the comprehensive documentation and engage with the AllTalk community discussion area. These resources often provide immediate answers and foster a supportive user network.

@@ -52,7 +52,7 @@ Should your inquiry extend beyond the documentation, especially if it concerns a
It's important to note that **I am not** the developer of any TTS models utilized by AllTalk, nor do I claim to be an expert on them, including understanding all their nuances, issues, and quirks. For specific TTS model concerns, I’ve provided links to the original developers in the Help section for direct assistance.

-Thank you for your continued support and understanding. 
+Thank you for your continued support and understanding.
--- @@ -87,7 +87,7 @@ To set up AllTalk within Text-generation-webui, follow either method: - In the text-generation-webui folder, start the environment with the appropriate command: - Windows: `cmd_windows.bat` - Linux: `./cmd_linux.sh`

- + > If you're unfamiliar with Python environments and wish to learn more, consider reviewing **Understanding Python Environments Simplified** in the Help section. 3. **Run AllTalk Setup Script**: @@ -167,7 +167,7 @@ If you're using a Mac or prefer a manual installation for any other reason, plea - For Linux: `./cmd_linux.sh` - For macOS: `cmd_macos.sh` - For WSL: `cmd_wsl.bat` - + - Loading the Text Generation Web UI's Python environment **is crucial**. If unsure about what a loaded Python environment should look like, refer to this [image](https://github.com/erew123/alltalk_tts/issues/25#issuecomment-1869344442) and [video guide](https://www.youtube.com/watch?v=9BPKuwaav5w). > If you're unfamiliar with Python environments and wish to learn more, consider reviewing **Understanding Python Environments Simplified** in the Help section. @@ -186,7 +186,7 @@ If you're using a Mac or prefer a manual installation for any other reason, plea - For Linux/Mac: `pip install -r system/requirements/requirements_textgen.txt` 7. **Optional DeepSpeed Installation**: -- If you're using an Nvidia graphics card on Linux or Windows and wish to install **DeepSpeed**, follow the instructions [here](https://github.com/erew123/alltalk_tts?#-deepspeed-installation-options). +- If you're using an Nvidia graphics card on Linux or Windows and wish to install **DeepSpeed**, follow the instructions [here](https://github.com/erew123/alltalk_tts?#-deepspeed-installation-options). - **Recommendation**: Start Text Generation Web UI and ensure AllTalk functions correctly before installing DeepSpeed. 8. **Start Text Generation Web UI**: @@ -302,7 +302,7 @@ Some extra voices for AllTalk are downloadable [here](https://drive.google.com/f #### 🟩 Changing AllTalks IP address & Accessing AllTalk over your Network
Click to expand
- + 
AllTalk is coded to start on 127.0.0.1, meaning that it will ONLY be accessible to the local computer it is running on. If you want to make AllTalk available to other systems on your network, you will need to change its IP address to match your network card/computer's current IP address. There are 2x ways to change the IP address: 1) Start AllTalk and, within its web interface, edit the IP address on the "AllTalk Startup Settings". 2) You can edit the `confignew.json` file in a text editor and change `"ip_address": "127.0.0.1",` to the IP address of your choosing. @@ -314,7 +314,7 @@ So, for example, if your computer's network card was on IP address 192.168.0.20, 
#### 🟩 Text-generation-webui & Stable-Diffusion Plugin - Load Order & stripped text
Click to expand
- + 
The Stable Diffusion plugin for Text-generation-webui **strips out** some of the text, which is passed to Stable Diffusion for image/scene generation. Because this text is stripped, it's important to consider the load order of the plugins to get the desired result you want. Let's assume the AI has just generated the following message `*He walks into the room with a smile on his face and says* Hello how are you?`. The load order determines what text reaches AllTalk for generation e.g. **SD Plugin loaded before AllTalk** - Only `Hello how are you?` is sent to AllTalk, with the `*He walks into the room with a smile on his face and says*` being sent over to SD for image generation. Narration of the scene is not possible.

@@ -326,7 +326,7 @@ The load order can be changed within Text-generation-webui's `settings.yaml` fil #### 🟩 A note on Character Cards & Greeting Messages
Click to expand
- + Messages intended for the Narrator should be enclosed in asterisks `*` and those for the character inside quotation marks `"`. However, AI systems often deviate from these rules, resulting in text that is neither in quotes nor asterisks. Sometimes, text may appear with only a single asterisk, and AI models may vary their formatting mid-conversation. For example, they might use asterisks initially and then switch to unmarked text. A properly formatted line should look like this: `"`Hey! I'm so excited to finally meet you. I've heard so many great things about you and I'm eager to pick your brain about computers.`"` `*`She walked across the room and picked up her cup of coffee`*` @@ -339,7 +339,7 @@ With improvements to the splitter/processor, I'm confident it's functioning well #### 🟩 I want to know more about the XTTS AI model used
Click to expand
- + 
Currently the XTTS model is the main model used by AllTalk for TTS generation. If you want to know more details about the XTTS model, its capabilities or its technical features you can look at resources such as: - https://docs.coqui.ai/en/latest/models/xtts.html - https://github.com/coqui-ai/TTS @@ -354,7 +354,7 @@ Currently the XTTS model is the main model used by AllTalk for TTS generation. I Maintaining the latest version of your setup ensures access to new features and improvements. Below are the steps to update your installation, whether you're using Text-Generation-webui or running as a Standalone Application. **NOTE** Future updates will be handled by using the `atsetup` utility.

-**NOTE** If you have an install **prior to 28th March 2024** that you are updating, perform the `git pull` instructions below, then run the `atsetup` utility and select option 1 in either the Standalone ot Text-generation-webui menu (as matches your system). 
+**NOTE** If you have an install **prior to 28th March 2024** that you are updating, perform the `git pull` instructions below, then run the `atsetup` utility and select option 1 in either the Standalone or Text-generation-webui menu (whichever matches your system).
UPDATING - Text-Generation-webui @@ -420,7 +420,7 @@ For Standalone Application users, here's how to update your setup: 2. **Pull the Latest Updates**: - Retrieve the latest changes from the repository with: - `git pull` - + 3. **Install Updated Requirements**: - Depending on your machine's OS, install the required dependencies using pip: - **For Windows Machines**: @@ -444,7 +444,7 @@ The process involves renaming your existing `alltalk_tts` directory, setting up 2. **Follow the Quick Setup instructions**: - You will now follow the **Quick Setup** instructions, performing the `git clone https://github.com/erew123/alltalk_tts` to pull down a new copy of AllTalk and install the requirements. - + > If you're not familiar with Python environments, see **Understanding Python Environments Simplified** in the Help section for more info. 3. **Migrate Your Data**: @@ -464,7 +464,7 @@ The process involves renaming your existing `alltalk_tts` directory, setting up **DeepSpeed requires an Nvidia Graphics card** #### 🔵 Linux Installation -DeepSpeed requires access to the **Nvidia CUDA Development Toolkit** to compile on a Linux system. It's important to note that this toolkit is distinct and unrealted to your graphics card driver or the CUDA version the Python environment uses. 
+DeepSpeed requires access to the **Nvidia CUDA Development Toolkit** to compile on a Linux system. It's important to note that this toolkit is distinct and unrelated to your graphics card driver or the CUDA version the Python environment uses.
Linux DeepSpeed - Text-generation-webui @@ -473,23 +473,23 @@ DeepSpeed requires access to the **Nvidia CUDA Development Toolkit** to compile 1. **Nvidia CUDA Development Toolkit Installation**: - The toolkit is crucial for DeepSpeed to compile/build for your version of Linux and requires around 3GB of disk space. - - Install using your package manager **(Recommended)** e.g. **CUDA Toolkit 11.8** or download directly from [Nvidia CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (choose 11.8 or 12.1 for Linux). + - Install using your package manager **(Recommended)** e.g. **CUDA Toolkit 12.4** or download directly from [Nvidia CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (choose 12.4). 2. **Open a Terminal Console**: - After Nvidia CUDA Development Toolkit installation, access your terminal console. 3. **Install libaio-dev**: - Use your Linux distribution's package manager. - + - - `sudo apt install libaio-dev` for Debian-based systems - `sudo yum install libaio-devel` for RPM-based systems. 4. **Navigate to Text generation webUI Folder**: - Change directory to your Text generation webUI folder with `cd text-generation-webui`. - + 5. **Activate Text generation webUI Custom Conda Environment**: - Run `./cmd_linux.sh` to start the environment.

- + 
> If you're unfamiliar with Python environments and wish to learn more, consider reviewing **Understanding Python Environments Simplified** in the Help section. 6. **Set `CUDA_HOME` Environment Variable**: @@ -498,12 +498,12 @@ DeepSpeed requires access to the **Nvidia CUDA Development Toolkit** to compile 7. **Temporarily Configuring `CUDA_HOME`**: - When the Text generation webUI Python environment is active **(step 5)**, set `CUDA_HOME`. - + - - `export CUDA_HOME=/usr/local/cuda` - `export PATH=${CUDA_HOME}/bin:${PATH}` - `export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH` - - You can confirm the path is set correctly and working by running the command `nvcc --version` should confirm `Cuda compilation tools, release 11.8.`. + - You can confirm the path is set correctly and working by running `nvcc --version`, which should report `Cuda compilation tools, release 12.4.`. - Incorrect path settings may lead to errors. If you encounter path issues or receive errors like `[Errno 2] No such file or directory` when you run the next step, confirm the path correctness or adjust as necessary. 8. **DeepSpeed Installation**: @@ -520,14 +520,14 @@ DeepSpeed requires access to the **Nvidia CUDA Development Toolkit** to compile 1. **Nvidia CUDA Development Toolkit Installation**: - The toolkit is crucial for DeepSpeed to compile/build for your version of Linux and requires around 3GB of disk space. - - Install using your package manager **(Recommended)** e.g. **CUDA Toolkit 11.8** or download directly from [Nvidia CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (choose 11.8 or 12.1 for Linux). + - Install using your package manager **(Recommended)** e.g. **CUDA Toolkit 12.4** or download directly from [Nvidia CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (choose 12.4). 2. **Open a Terminal Console**: - After Nvidia CUDA Development Toolkit installation, access your terminal console. - + 3. **Install libaio-dev**: - Use your Linux distribution's package manager. - + - - `sudo apt install libaio-dev` for Debian-based systems - `sudo yum install libaio-devel` for RPM-based systems. 5. **Activate AllTalk Custom Conda Environment**: - Run `./start_environment.sh` to start the AllTalk Python environment. - This command will start the custom Python environment that was installed with `./atsetup.sh`.

- + 
> If you're unfamiliar with Python environments and wish to learn more, consider reviewing **Understanding Python Environments Simplified** in the Help section. 6. **Set `CUDA_HOME` Environment Variable**: @@ -549,12 +549,12 @@ DeepSpeed requires access to the **Nvidia CUDA Development Toolkit** to compile 8. **Configuring `CUDA_HOME`**: - When your Python environment is active **(step 5)**, set `CUDA_HOME`. - + - - `export CUDA_HOME=/usr/local/cuda` - `export PATH=${CUDA_HOME}/bin:${PATH}` - `export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH` - - You can confirm the path is set correctly and working by running the command `nvcc --version` should confirm `Cuda compilation tools, release 11.8.`. + - You can confirm the path is set correctly and working by running `nvcc --version`, which should report `Cuda compilation tools, release 12.4.`. - Incorrect path settings may lead to errors. If you encounter path issues or receive errors like `[Errno 2] No such file or directory` when you run the next step, confirm the path correctness or adjust as necessary. 9. **DeepSpeed Installation**: @@ -587,7 +587,7 @@ You have 2x options for how to setup DeepSpeed on Windows. Pre-compiled wheel fi - If needed, pre-compiled DeepSpeed wheel files that I have built are available on the [Releases Page](https://github.com/erew123/alltalk_tts/releases). You can manually install or uninstall these wheels using the following commands: - Installation: `pip install {deep-speed-wheel-file-name-here}` - Uninstallation: `pip uninstall deepspeed` - + 3. **Using `atsetup.bat` for Simplified Management**: - For those running the Standalone Installation or a standard build of Text-generation-webui, the `atsetup.bat` utility offers the simplest and most efficient way to manage DeepSpeed installations on Windows. @@ -624,7 +624,7 @@ I'm thrilled to see the enthusiasm and engagement with AllTalk! Your feedback an **Bug Reports:** If you've encountered what you believe is a bug, please first check the [Updates & Bug Fixes List](https://github.com/erew123/alltalk_tts/issues/25) to see if it's a known issue or one that's already been resolved. If not, I encourage you to report it by raising a bug report in the [Issues section](https://github.com/erew123/alltalk_tts/issues), providing as much detail as possible to help identify and fix the issue. -**Feature Requests:** The current Feature request list can be [found here](https://github.com/erew123/alltalk_tts/discussions/74). I love hearing your ideas for new features! While I can't promise to implement every suggestion, I do consider all feedback carefully. Please share your thoughts in the [Discussions area](https://github.com/erew123/alltalk_tts/discussions) or via a Feature Request in the [Issues section](https://github.com/erew123/alltalk_tts/issues). 
+**Feature Requests:** The current Feature request list can be [found here](https://github.com/erew123/alltalk_tts/discussions/74). I love hearing your ideas for new features! While I can't promise to implement every suggestion, I do consider all feedback carefully. Please share your thoughts in the [Discussions area](https://github.com/erew123/alltalk_tts/discussions) or via a Feature Request in the [Issues section](https://github.com/erew123/alltalk_tts/issues). --- @@ -659,7 +659,7 @@ If you are on a Windows machine or a Linux machine, you should be able to use th - Start the Python environment: - Windows: `start_environment.bat` - Linux: `./start_environment.sh`

- + > If you're unfamiliar with Python environments and wish to learn more, consider reviewing **Understanding Python Environments Simplified** in the Help section. 2) Run the diagnostics and select the requirements file name you installed AllTalk with:
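A typical invocation, assuming you are inside the `alltalk_tts` folder with the environment loaded (the script name matches the `diagnostics.py` shipped in the repository root):

```bash
python diagnostics.py
```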
@@ -674,7 +674,7 @@ If you are on a Windows machine or a Linux machine, you should be able to use th
🟨 Understanding Python Environments Simplified
- + 
Think of Python environments like different rooms in your house, each designed for a specific purpose. Just as you wouldn't cook in the bathroom or sleep in the kitchen, different Python applications need their own "spaces" or environments because they have unique requirements. Sometimes, these requirements can clash with those of other applications (imagine trying to cook a meal in a bathroom!). To avoid this, you can create separate Python environments. #### Why Separate Environments? @@ -698,7 +698,7 @@ This command tells pip to read the list of required packages and versions from t Remember, just as it's important to use the right tools for tasks in different rooms of your house, it's crucial to manage your Python environments and dependencies properly to ensure your applications run as intended. #### How do I know if I am in a Python environment? -When a Python environment starts up, it changes the command prompt to show the Python environment that it currently running within that terminal/console. 
+When a Python environment starts up, it changes the command prompt to show the Python environment that is currently running within that terminal/console. 
![image](https://github.com/erew123/screenshots/blob/main/pythonenvironment.jpg)
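For readers who want to see what a separate "room" looks like in practice, here is a minimal sketch using Python's built-in `venv` module (the environment and requirements file names here are illustrative only; AllTalk's own environment is built for you by `atsetup`):

```bash
# Create an isolated environment in the folder "myproject_env"
python -m venv myproject_env

# Step inside it (on Windows: myproject_env\Scripts\activate)
source myproject_env/bin/activate

# Anything installed now stays inside this environment only
pip install -r requirements.txt
```

Once activated, your command prompt changes to show the environment name, which is exactly the indicator described above.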
@@ -708,22 +708,22 @@ When a Python environment starts up, it changes the command prompt to show the P `ERROR: Microsoft Visual C++ 14.0 or greater is required` or `ERROR: Could not build wheels for TTS.` or `ModuleNotFoundError: No module named 'TTS` - Python requires that you install C++ development tools on Windows. This is detailed on the [Python site here](https://wiki.python.org/moin/WindowsCompilers). You would need to install `MSVCv142 - VS 2019 C++ x64/x86 build tools` and `Windows 10/11 SDK` from the C++ Build tools section. - - You can get hold of the **Community** edition [here](https://visualstudio.microsoft.com/downloads/) the during installation, selecting `C++ Build tools` and then `MSVCv142 - VS 2019 C++ x64/x86 build tools` and `Windows 10/11 SDK`. + Python requires that you install C++ development tools on Windows. This is detailed on the [Python site here](https://wiki.python.org/moin/WindowsCompilers). You would need to install `MSVCv142 - VS 2019 C++ x64/x86 build tools` and `Windows 10/11 SDK` from the C++ Build tools section. + + You can get hold of the **Community** edition [here](https://visualstudio.microsoft.com/downloads/) the during installation, selecting `C++ Build tools` and then `MSVCv142 - VS 2019 C++ x64/x86 build tools` and `Windows 10/11 SDK`. ![image](https://github.com/erew123/screenshots/raw/main/pythonrequirementswindows.jpg) - +
🟨 Standalone Install - start_{youros}.xx opens and closes instantly and AllTalk doesn't start
-This is more than likely caused by having a space ` ` in your folder path e.g. `c:\program files\alltalk_tts`. In this circumstance you would be best moving the folder to a path without a space e.g. `c:\myfiles\alltalk_tts`. You would have to delete the `alltalk_environment` folder and `start_alltalk.bat` or `start_alltalk.sh` and then re-run `atsetup` to re-create the environment and startup files. +This is more than likely caused by having a space ` ` in your folder path e.g. `c:\program files\alltalk_tts`. In this circumstance you would be best moving the folder to a path without a space e.g. `c:\myfiles\alltalk_tts`. You would have to delete the `alltalk_environment` folder and `start_alltalk.bat` or `start_alltalk.sh` and then re-run `atsetup` to re-create the environment and startup files.
🟨 I think AllTalk's requirements file has installed something another extension doesn't like
- 
-Ive paid very close attention to **not** impact what Text-generation-webui is requesting on a factory install. This is one of the requirements of submitting an extension to Text-generation-webui. If you want to look at a comparison of a factory fresh text-generation-webui installed packages (with cuda 12.1, though AllTalk's requirements were set on cuda 11.8) you can find that comparison [here](https://github.com/erew123/alltalk_tts/issues/23). This comparison shows that AllTalk is requesting the same package version numbers as Text-generation-webui or even lower version numbers (meaning AllTalk will not update them to a later version). What other extensions do, I cant really account for that. 
+I've paid very close attention to **not** impacting what Text-generation-webui is requesting on a factory install. This is one of the requirements of submitting an extension to Text-generation-webui. If you want to look at a comparison of a factory fresh text-generation-webui's installed packages (with cuda 12.1, though AllTalk's requirements were set on cuda 12.4) you can find that comparison [here](https://github.com/erew123/alltalk_tts/issues/23). This comparison shows that AllTalk is requesting the same package version numbers as Text-generation-webui or even lower version numbers (meaning AllTalk will not update them to a later version). What other extensions do, I can't really account for that. 

I will note that the TTS engine downgrades the Pandas data validator to 1.5.3, though it's unlikely to cause any issues. You can upgrade it back to the text-generation-webui default (December 2023) with `pip install pandas==2.1.4` when inside the Python environment. I have noticed no ill effects from it being a lower or higher version, as far as AllTalk goes. This is also the same behaviour as the Coqui_tts extension that comes with Text-generation-webui. 

I'm not sure if the Pydantic version has been recently updated by the Text-genera
🟨 I am having problems getting AllTalk to start after changing settings or making a custom setup/model setup.
- + I would suggest following [Problems Updating](https://github.com/erew123/alltalk_tts#-problems-updating) and if you still have issues after that, you can raise an issue [here](https://github.com/erew123/alltalk_tts/issues)
@@ -749,7 +749,7 @@ You may also need to allow access through your firewall or Antivirus package to
🟨 I am running a Headless system and need to change the IP Address manually as I cannot reach the config page
- + 
To do this you can edit the `confignew.json` file within the `alltalk_tts` folder. You would look for `"ip_address": "127.0.0.1",` and change the `127.0.0.1` to your chosen IP address, then save the file and start AllTalk.

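A quick way to confirm your edit hasn't broken the file is to run it through Python's built-in JSON parser (a sketch; any Python 3 on your path will do):

```bash
# Succeeds silently on valid JSON, or prints a parse error pointing at the broken line
python -m json.tool confignew.json > /dev/null && echo "confignew.json is valid JSON"
```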
When doing this, be careful not to impact the formatting of the JSON file. Worst case, you can re-download a fresh copy of `confignew.json` from this website and that will put you back to a factory setting. @@ -758,7 +758,7 @@ When doing this, be careful not to impact the formatting of the JSON file. Worst ### Configuration and Usage Issues
🟨 I activated DeepSpeed in the settings page, but I didn't install DeepSpeed yet and now I have issues starting up
- + 
You can either follow the [Problems Updating](https://github.com/erew123/alltalk_tts#-problems-updating) guide and fresh install your config, or you can edit the `confignew.json` file within the `alltalk_tts` folder. You would look for `"deepspeed_activate": true,` and change the word true to false, giving `"deepspeed_activate": false,`, then save the file and try starting again. If you want to use DeepSpeed, you need an Nvidia Graphics card and to install DeepSpeed on your system. Instructions are [here](https://github.com/erew123/alltalk_tts#-deepspeed-installation-options)
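As an alternative to hand-editing, the flag can be flipped from the command line; this is a hedged sketch using only the Python standard library (note it rewrites the file with 4-space indentation, which may differ slightly from the original layout):

```bash
python -c "import json; c = json.load(open('confignew.json')); c['deepspeed_activate'] = False; json.dump(c, open('confignew.json', 'w'), indent=4)"
```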
🟨 I am having problems updating/some other issue where it won't start up/I'm sure this is a bug
- + 
Please see [Problems Updating](https://github.com/erew123/alltalk_tts#-problems-updating). If that doesn't help you can raise a ticket [here](https://github.com/erew123/alltalk_tts/issues). It would be handy to have any log files from the console where your error is being shown. I can only loosely support custom-built Python environments and give general pointers. Please create a `diagnostics.log` report file to submit with a support request. 

Also, is your text-generation-webui up to date? [instructions here](https://github.com/oobabooga/text-generation-webui?tab=readme-ov-file#how-to-install)
🟨 I see some red "asyncio" messages
- + 
As far as I am aware, these are to do with the Chrome browser and the Gradio interface used by text-generation-webui in some way. I raised an issue about this on the text-generation-webui [here](https://github.com/oobabooga/text-generation-webui/issues/4788) where you can see that AllTalk is not loaded and the messages persist. Either way, this is more a warning than an actual issue, so it shouldn't affect any functionality of either AllTalk or text-generation-webui; they are more just an annoyance.
@@ -784,7 +784,7 @@ As far as I am aware, these are to do with the chrome browser the gradio text-ge 🟨 Understanding the AllTalk start-up screen
The AllTalk start-up screen provides various bits of information about the detected Python environment and errors. - + ![image](https://github.com/erew123/screenshots/raw/main/alltalkstartup.jpg) **Config file check**
@@ -792,7 +792,7 @@ The AllTalk start-up screen provides various bits of information about the detec **AllTalk startup Mode**
- informational. This will state if AllTalk has detected it is running as part of Text-generation-webui or as a Standalone Application. - + **WAV file deletion**
- If you have set deletion of old generated WAV files, this will state the time frame after which they are purged. @@ -801,10 +801,10 @@ The AllTalk start-up screen provides various bits of information about the detec **Model is available**
- AllTalk is checking if your model files exist. This is not a validity check of the actual model files; they can still be corrupted. If files are missing, AllTalk will attempt to download them from Huggingface, however, if Huggingface has an outage/issue or your internet connection has issues, it's possible corrupted or incomplete files will be downloaded. Please read `RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory` if you need to confirm your model files are ok. - 
**Current Python Version**
- Informational. Literally tells you the version of Python running in your Python environment. - + **Current PyTorch Version**
- Informational. Tells you the version of **PyTorch** running in your Python environment, however if you have an Nvidia card, you should be running a CUDA based version of Pytorch. This is indicated with a `+cXXX` after the PyTorch version e.g. `2.2.2+cu121` would be PyTorch version 2.2.2 with CUDA 12.1 extensions. If you don’t have the PyTorch CUDA extensions installed, but you do have an Nvidia card, you may need to re-install PyTorch. @@ -827,12 +827,12 @@ The AllTalk start-up screen provides various bits of information about the detec
🟨 AllTalk is only loading into CPU, but I have an Nvidia GPU so it should be loading into CUDA
- + 
This is caused by Pytorch (Torch) not having the CUDA extensions installed (You can check by running the diagnostics). Typically this happens (on Standalone installations) because when the setup routine goes to install Pytorch with CUDA, it looks in the PIP cache and if a previous application has downloaded a version of Pytorch that **doesn't** have CUDA extensions, the PIP installer doesn't recognise this fact and just uses the cached version for installation. To resolve this: 

1) On the `atsetup` utility, on the `Standalone menu` select to `Purge the PIP cache`. This will remove cached packages from the PIP cache, meaning it will have to download fresh copies. 

2) As we need to force the upgrade to the Python environment, the easiest way to do this will be to use `atsetup` to `Delete AllTalk's custom Python environment`. This means it will have to rebuild the Python environment. **Note**, you may have to run this step twice, as it has to exit the current Python environment, then you have to re-load `atsetup` and select `Delete AllTalk's custom Python environment` again. 
-3) You can now use `atsetup` to `Install AllTalk as a Standalone Application` which will download fresh copies of everything and re-install the Python environment. 
+3) You can now use `atsetup` to `Install AllTalk as a Standalone Application` which will download fresh copies of everything and re-install the Python environment. 
4) Once this is done you can check if CUDA is now working with the diagnostics or starting AllTalk and checking the model loads into CUDA.
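A quick hedged way to confirm the fix worked, from inside AllTalk's Python environment:

```bash
# Prints the Torch version and "True" if the CUDA extensions can see your GPU
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```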
@@ -840,8 +840,8 @@ This is caused by Pytorch (Torch) not having the CUDA extensions installed (You
🟨 RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory
- 
-This error message is caused by the model being corrupted or damaged in some way. This error can occur if Huggingface, where the model is downloaded from, have an error (when the model is downloaded) or potentailly internet issues occuring while the model is downloaded on first start-up. 
+ 
+This error message is caused by the model being corrupted or damaged in some way. This error can occur if Huggingface, where the model is downloaded from, has an error (when the model is downloaded) or potentially internet issues occurring while the model is downloaded on first start-up. 

```
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...
```

You can delete one or more suspect files and a factory fresh copy of that file o
🟨 RuntimeError: Found no NVIDIA driver on your system.
- + This error message is caused by DeepSpeed being enabled when you do not have a Nvidia GPU. To resolve this, edit `confignew.json` and change `"deepspeed_activate": true,` to `"deepspeed_activate": false,` then restart AllTalk. ``` @@ -877,11 +877,11 @@ ERROR: Application startup failed. Exiting.
🟨 raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed.
- + This error message is caused by having DeepSpeed enabled, but you have a version of DeepSpeed installed that was compiled for a different version of Python, PyTorch or CUDA (or any mix of those). You will need to start your Python environment and run `pip uninstall deepspeed` to remove DeepSpeed from your Python environment and then install the correct version of DeepSpeed. ``` -raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed +raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed RuntimeError: PyTorch version mismatch! DeepSpeed ops were compiled and installed with a different version than what is being used at runtime. Please re-install DeepSpeed or switch torch versions. Install torch version=2.1, Runtime torch version=2.2 ``` @@ -903,8 +903,8 @@ Other possibilities for this issue are: 1) You are starting AllTalk in both your `CMD FLAG.txt` and `settings.yaml` file. The `CMD FLAG.txt` you would have manually edited and the `settings.yaml` is the one you change and save in the `session` tab of text-generation-webui and you can `Save UI defaults to settings.yaml`. Please only have one of those two starting up AllTalk. 2) You are not starting text-generation-webui with its normal Python environment. Please start it with start_{your OS version} as detailed [here](https://github.com/oobabooga/text-generation-webui#how-to-install) (`start_windows.bat`,`./start_linux.sh`, `start_macos.sh` or `start_wsl.bat`) OR (`cmd_windows.bat`, `./cmd_linux.sh`, `cmd_macos.sh` or `cmd_wsl.bat` and then `python server.py`). - -3) You have installed the wrong version of DeepSpeed on your system, for the wrong version of Python/Text-generation-webui. You can go to your text-generation-webui folder in a terminal/command prompt and run the correct cmd version for your OS e.g. (`cmd_windows.bat`, `./cmd_linux.sh`, `cmd_macos.sh` or `cmd_wsl.bat`) and then you can type `pip uninstall deepspeed` then try loading it again. If that works, please see here for the correct instructions for installing DeepSpeed [here](https://github.com/erew123/alltalk_tts#-deepspeed-installation-options). + +3) You have installed the wrong version of DeepSpeed on your system, for the wrong version of Python/Text-generation-webui. You can go to your text-generation-webui folder in a terminal/command prompt and run the correct cmd version for your OS e.g. (`cmd_windows.bat`, `./cmd_linux.sh`, `cmd_macos.sh` or `cmd_wsl.bat`) and then you can type `pip uninstall deepspeed` then try loading it again. If that works, please see here for the correct instructions for installing DeepSpeed [here](https://github.com/erew123/alltalk_tts#-deepspeed-installation-options). 4) You have an old version of text-generation-webui (pre Dec 2023) I have not tested on older versions of text-generation-webui, so cannot confirm viability on older versions. For instructions on updating the text-generation-webui, please look [here](https://github.com/oobabooga/text-generation-webui#how-to-install) (`update_linux.sh`, `update_windows.bat`, `update_macos.sh`, or `update_wsl.bat`). @@ -914,7 +914,7 @@ Other possibilities for this issue are: 7) You have quite old graphics drivers and may need to update them. -8) Something within text-generation-webui is not playing nicely for some reason. You can go to your text-generation-webui folder in a terminal/command prompt and run the correct cmd version for your OS e.g. 
(`cmd_windows.bat`, `./cmd_linux.sh`, `cmd_macos.sh` or `cmd_wsl.bat`) and then you can type `python extensions\alltalk_tts\script.py` and see if AllTalk starts up correctly. If it does then something else is interfering. +8) Something within text-generation-webui is not playing nicely for some reason. You can go to your text-generation-webui folder in a terminal/command prompt and run the correct cmd version for your OS e.g. (`cmd_windows.bat`, `./cmd_linux.sh`, `cmd_macos.sh` or `cmd_wsl.bat`) and then you can type `python extensions\alltalk_tts\script.py` and see if AllTalk starts up correctly. If it does then something else is interfering. 9) Something else is already loaded into your VRAM or there is a crashed python process. Either check your task manager for erroneous Python processes or restart your machine and try again. @@ -925,12 +925,12 @@ Other possibilities for this issue are:
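If you suspect a version mismatch (point 3 above), it can help to capture exactly which versions are in play before re-installing; a hedged sketch from inside the loaded Python environment (on Windows, swap `grep -E` for `findstr`):

```bash
# Shows the installed Torch and DeepSpeed versions so you can pick a matching DeepSpeed wheel
pip show torch deepspeed | grep -E "^(Name|Version)"
```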
🟨 I have multiple GPUs and I have problems running Finetuning
- + 
Finetuning pulls in various other scripts and some of those scripts can have issues with multiple Nvidia GPUs being present. Until the people that created those other scripts fix up their code, there is a workaround to temporarily tell your system to use only one of your Nvidia GPUs. To do this: 

- **Windows** - You will start the script with `set CUDA_VISIBLE_DEVICES=0 && python finetune.py`
After you have completed training, you can reset back with `set CUDA_VISIBLE_DEVICES=`
- + - **Linux** - You will start the script with `CUDA_VISIBLE_DEVICES=0 python finetune.py`
After you have completed training, you can reset back with `unset CUDA_VISIBLE_DEVICES`
Depending on which of your Nvidia GPUs is the more powerful one, you can change the `0` to `1` or whichever GPU you want to use.
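If you are unsure which index belongs to which card, `nvidia-smi` can list them; the indices it reports are the same ones `CUDA_VISIBLE_DEVICES` uses:

```bash
# Prints one line per GPU, e.g. "GPU 0: NVIDIA GeForce RTX 4090 (UUID: ...)"
nvidia-smi -L
```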
🟨 Firefox - Streaming Audio doesn't work on Firefox
- + 
This is a long-standing issue with Mozilla & Firefox and one I am unable to resolve as Mozilla have not resolved the issue with Firefox. The solution is to use another web browser if you want to use Streaming audio. For details of my prior investigation please look at this [ticket](https://github.com/erew123/alltalk_tts/issues/143)
🟨 Hindi Support - Not working or issues
- + 
Hindi support does not officially exist according to Coqui. I've added limited Hindi support at this time, however, it only works with the API TTS method and I'm sure there will be issues. [ticket](https://github.com/erew123/alltalk_tts/issues/178)
@@ -972,13 +972,13 @@ To manually change the IP address:
🟨 XTTS - Does the XTTS AI Model Support Emotion Control or Singing?
- + No, the XTTS AI model does not currently support direct control over emotions or singing capabilities. While XTTS infuses generated speech with a degree of emotional intonation based on the context of the text, users cannot explicitly control this aspect. It's worth noting that regenerating the same line of TTS may yield slightly different emotional inflections, but there is no way to directly control it with XTTS.
🟨 XTTS - Skips, repeats or pronunciation Issues
- 
-Firstly, it's important to clarify that the development and maintenance of the XTTS AI models and core scripts are handled by [Coqui](https://docs.coqui.ai/en/latest/index.html), with additional scripts and libraries from entities like [huggingface](https://huggingface.co/docs/transformers/en/index) among many other Python scripts and libraries used by AllTalk. 
+
+Firstly, it's important to clarify that the development and maintenance of the XTTS AI models and core scripts are handled by [Coqui](https://docs.coqui.ai/en/latest/index.html), with additional scripts and libraries from entities like [huggingface](https://huggingface.co/docs/transformers/en/index) among many other Python scripts and libraries used by AllTalk. 

AllTalk is designed to be a straightforward interface that simplifies setup and interaction with AI TTS models like XTTS. Currently, AllTalk supports the XTTS model, with plans to include more models in the future. Please understand that the deep inner workings of XTTS, including reasons why it may skip, repeat, or mispronounce, along with 3rd party scripts and libraries utilized, are ultimately outside my control. 

@@ -1014,17 +1014,17 @@ If you have a voice that the model doesnt quite reproduce correctly, or indeed y 
Everything has been done to make this as simple as possible. At its simplest, you can literally just download a large chunk of audio from an interview, and tell the finetuning to strip through it, find spoken parts and build your dataset. You can literally click 4 buttons, then copy a few files and you are done. At its more complicated end you will clean up the audio a little beforehand, but it's still only 4x buttons and copying a few files. 

#### ⚫ The audio you will use 
-I would suggest that if its in an interview format, you cut out the interviewer speaking in audacity or your chosen audio editing package. You dont have to worry about being perfect with your cuts, the finetuning Step 1 will go and find spoken audio and cut it out for you. Is there is music over the spoken parts, for best quality you would cut out those parts, though its not 100% necessary. As always, try to avoid bad quality audio with noises in it (humming sounds, hiss etc). You can try something like [Audioenhancer](https://audioenhancer.ai/) to try clean up noisier audio. There is no need to down-sample any of the audio, all of that is handled for you. Just give the finetuning some good quality audio to work with. 
+I would suggest that if it's in an interview format, you cut out the interviewer speaking in Audacity or your chosen audio editing package. You don't have to worry about being perfect with your cuts; the finetuning Step 1 will go and find spoken audio and cut it out for you. If there is music over the spoken parts, for best quality you would cut out those parts, though it's not 100% necessary. As always, try to avoid bad quality audio with noises in it (humming sounds, hiss etc). You can try something like [Audioenhancer](https://audioenhancer.ai/) to try to clean up noisier audio. There is no need to down-sample any of the audio, all of that is handled for you. Just give the finetuning some good quality audio to work with. 

#### ⚫ Can I Finetune a model more than once on more than one voice 
-Yes you can. You would do these as multiple finetuning's, but its absolutely possible and fine to do. Finetuning the XTTS model does not restrict it to only being able to reproduce that 1x voice you trained it on. 
Finetuning is generally nuding the model in a direction to learn the ability to sound a bit more like a voice its not heard before. 
+Yes you can. You would do these as multiple finetunings, but it's absolutely possible and fine to do. Finetuning the XTTS model does not restrict it to only being able to reproduce that 1x voice you trained it on. Finetuning is generally nudging the model in a direction to learn the ability to sound a bit more like a voice it's not heard before. 

#### ⚫ A note about anonymous training Telemetry information & disabling it 
Portions of Coqui's TTS trainer scripts gather anonymous training information which you can disable. Their statement on this is listed [here](https://github.com/coqui-ai/Trainer?tab=readme-ov-file#anonymized-telemetry). If you start AllTalk Finetuning with `start_finetuning.bat` or `./start_finetuning.sh` telemetry will be disabled. If you want to disable it manually, please expand the below:
Manually disable telemetry
- + 
Before starting finetuning, run the following in your terminal/command prompt: 

- On Windows by typing `set TRAINER_TELEMETRY=0` 
- On Linux by typing `export TRAINER_TELEMETRY=0` 

Do this before you start `finetune.py`. You will then be able to finetune offline and no anonymous training data will be sent.
-#### ⚫ Prerequisites for Fine-tuning with Nvidia CUDA Development Toolkit 11.8 
+#### ⚫ Prerequisites for Fine-tuning with Nvidia CUDA Development Toolkit 12.4 

All the requirements for Finetuning will be installed by using the atsetup utility and installing your correct requirements (Standalone or for Text-generation-webui). The legacy manual instructions are stored below, however, these shouldn't be required.
- Legacy manual instructions for installing Nvidia CUDA Development Toolkit 11.8
-- To perform fine-tuning, a specific portion of the **Nvidia CUDA Development Toolkit v11.8** must be installed. This is crucial for step 1 of fine-tuning. The objective is to minimize the installation footprint by installing only the essential components. -- The **Nvidia CUDA Development Toolkit v11.8** operates independently from your graphics card drivers and the CUDA version utilized by your Python environment. + Legacy manual instructions for installing Nvidia CUDA Development Toolkit 12.4
+- To perform fine-tuning, a specific portion of the **Nvidia CUDA Development Toolkit v12.4** must be installed. This is crucial for step 1 of fine-tuning. The objective is to minimize the installation footprint by installing only the essential components. 
+- The **Nvidia CUDA Development Toolkit v12.4** operates independently from your graphics card drivers and the CUDA version utilized by your Python environment. 
- This installation process aims to keep the download and install size as minimal as possible, however, a full install of the toolkit requires 3GB of disk space. 
- When running Finetuning it will require up to 20GB of temporary disk space, so please ensure you have this space available and preferably use an SSD or NVMe drive. 

1. **Download the Toolkit**: 
   - Obtain the **network install** version of the Nvidia CUDA Development Toolkit 12.4 from [Nvidia's Archive](https://developer.nvidia.com/cuda-12-4-0-download-archive). 

2. **Run the Installer**: 
   - Choose **Custom (Advanced)** installation. 
   - Deselect all options initially. 
   - Select the following components: 
     - `CUDA` > `Development` > `Compiler` > `nvcc` 
     - `CUDA` > `Development` > `Libraries` > `CUBLAS` (**both** development and runtime) 

3. **Configure Environment Search Path**: 
   - It's essential that `nvcc` and CUDA 12.4 library files are discoverable in your environment's search path. Adjustments can be reverted post-fine-tuning if desired. 

   **For Windows**: 
   - Edit the `Path` environment variable to include `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin`. 
   - Add `CUDA_HOME` and set its path to `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4`. 

   **For Linux**: 
   - The path may vary by Linux distribution. Here's a generic setup: 
     - `export CUDA_HOME=/usr/local/cuda` 
     - `export PATH=${CUDA_HOME}/bin:${PATH}` 
     - `export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:$LD_LIBRARY_PATH` 
   - Consider adding these to your `~/.bashrc` for permanence, or apply temporarily for the current session by running the above commands each time you start your Python environment. 

   **Note**: If using Text-generation-webui, it's best to set these temporarily. 

4. **Verify Installation**: 
   - Open a **new** terminal/command prompt to refresh the search paths. 
   - In a terminal or command prompt, execute `nvcc --version`. 
   - Success is indicated by a response of `Cuda compilation tools, release 12.4.` Specifically, ensure it is version 12.4. 

5. **Troubleshooting**: 
   - If the correct version isn't reported, recheck your environment path settings for accuracy and potential conflicts with other CUDA versions.
#### Additional Note on Torch and Torchaudio: -- Ensure Torch and Torchaudio are CUDA-enabled (any version), which is separate from the CUDA Toolkit installation. CUDA 11.8 corresponds to `cu118` and CUDA 12.1 to `cu121` in AllTalk diagnostics. +- Ensure Torch and Torchaudio are CUDA-enabled (any version), which is separate from the CUDA Toolkit installation. CUDA 12.4 corresponds to `cu124` in AllTalk diagnostics. - Failure to install CUDA for Torch and Torchaudio will result in Step 2 of fine-tuning failing. These requirements are distinct from the CUDA Toolkit installation, so avoid conflating the two.
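To confirm the point above from inside your Python environment, a quick check can be run (a sketch; the `+cu124` suffix is the marker the diagnostics described here look for):

```python
# Check that the installed Torch and Torchaudio wheels are CUDA builds.
import torch
import torchaudio

print("Torch:", torch.__version__)          # a CUDA 12.4 wheel ends in "+cu124"
print("Torchaudio:", torchaudio.__version__)
print("CUDA available:", torch.cuda.is_available())
```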
@@ -1104,7 +1104,7 @@ Depending on your setup (Text-generation-webui or Standalone AllTalk), the steps - Start the Python environment: - Windows: `start_finetune.bat` - Linux: `./start_finetune.sh` - + - **For Text-generation-webui Users**: - Navigate to the Text-generation-webui directory: - `cd text-generation-webui` @@ -1134,7 +1134,7 @@ These steps guide you through the initial preparations, starting the Python envi #### ⚫ How many Epochs etc is the right amount? -In finetuning the suggested/recommended amount of epochs, batch size, evaluation percent etc is already set. However, there is no absolutely correct answer to what the settings should be, it all depends on what you are doing. +In finetuning the suggested/recommended amount of epochs, batch size, evaluation percent etc. is already set. However, there is no absolutely correct answer to what the settings should be; it all depends on what you are doing. - If you just want to train a normal human voice that is in an existing language, for most people’s needs, the base settings would work fine. You may choose to increase the epochs up to maybe 20, or run a second round of training if needed. - If you were training an entirely new language, you would need a huge amount of training data and it requires around 1000 epochs (based on things I can find around the internet of people who tried this). @@ -1169,7 +1169,7 @@ If you've compacted and moved your model, its highly unlikely you would want to #### ⚫ I have deeper questions about training the XTTS model, where can I find more information? -If you have deeper questions about the XTTS model, its capabilites, the training process etc, anything thats not covered within the above text or the interface of `finetune.py`, please use the following links to research Coqui's documentation on the XTTS model. +If you have deeper questions about the XTTS model, its capabilities, the training process etc., anything that's not covered within the above text or the interface of `finetune.py`, please use the following links to research Coqui's documentation on the XTTS model. - https://docs.coqui.ai/en/latest/models/xtts.html - https://github.com/coqui-ai/TTS @@ -1178,7 +1178,7 @@ If you have deeper questions about the XTTS model, its capabilites, the training --- ### ⬜ AllTalk TTS Generator -AllTalk TTS Generator is the solution for converting large volumes of text into speech using the voice of your choice. Whether you're creating audio content or just want to hear text read aloud, the TTS Generator is equipped to handle it all efficiently. Please see here for a quick [demo](https://www.youtube.com/watch?v=hunvXn0mLzc)

The link to open the TTS generator can be found on the built-in Settings and Documentation page.

**DeepSpeed** is **highly** recommended to speed up generation. **Low VRAM** would be best turned off and your LLM model unloaded from your GPU VRAM (unload your model). **No Playback** will reduce memory overhead on very large generations (15,000 words or more). Splitting **Export to Wav** into smaller groups will also reduce memory overhead at the point of exporting your wav files (so good for low memory systems). +AllTalk TTS Generator is the solution for converting large volumes of text into speech using the voice of your choice. Whether you're creating audio content or just want to hear text read aloud, the TTS Generator is equipped to handle it all efficiently. Please see here for a quick [demo](https://www.youtube.com/watch?v=hunvXn0mLzc)

The link to open the TTS generator can be found on the built-in Settings and Documentation page.

**DeepSpeed** is **highly** recommended to speed up generation. **Low VRAM** would be best turned off and your LLM model unloaded from your GPU VRAM (unload your model). **No Playback** will reduce memory overhead on very large generations (15,000 words or more). Splitting **Export to Wav** into smaller groups will also reduce memory overhead at the point of exporting your wav files (so good for low memory systems). #### ⬜ Estimated Throughput This will vary by system for a multitude of reasons, however, while generating a 58,000 word document to TTS, with DeepSpeed enabled, LowVram disabled, splitting size 2 and on an Nvidia RTX 4070, throughput was around 1,000 words per minute. Meaning, this took 1 hour to generate the TTS. Exporting to combined wavs took about 2-3 minutes total. @@ -1187,8 +1187,8 @@ This will vary by system for a multitude of reasons, however, while generating a - **Text Input:** Enter the text you wish to convert into speech in the 'Text Input' box. - **Generate TTS:** Hit this to start the text-to-speech conversion. - **Pause/Resume:** Used to pause and resume the playback of the initial generation of wavs or the stream. -- **Stop Playback:** This will stop the current audio playing back. It does not stop the text from being generated however. -Once you have sent text off to be generated, either as a stream or wav file generation, the TTS server will remain busy until this process has competed. As such, think carefully as to how much you want to send to the server. +- **Stop Playback:** This will stop the current audio playing back. It does not stop the text from being generated, however. +Once you have sent text off to be generated, either as a stream or wav file generation, the TTS server will remain busy until this process has completed. As such, think carefully as to how much you want to send to the server. If you are generating wav files and populating the queue, you can generate one lot of text to speech, then input your next lot of text and it will continue adding to the list. #### ⬜ Customization and Preferences - **Character Voice:** Choose the voice that will read your text. @@ -1259,7 +1259,7 @@ The Text-to-Speech (TTS) Generation API allows you to generate speech from text #### 🟠 Ready Endpoint
Check if the Text-to-Speech (TTS) service is ready to accept requests. -- URL: `http://127.0.0.1:7851/api/ready`
- Method: `GET`
+- URL: `http://127.0.0.1:7851/api/ready`
- Method: `GET`
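In addition to the curl example that follows, the endpoint is easy to poll from a script. A small Python sketch, assuming the default address shown above and treating any HTTP 200 response as ready:

```python
# Poll the /api/ready endpoint until the TTS service responds (sketch;
# assumes the default host/port above; any HTTP 200 is treated as ready).
import time
import requests

URL = "http://127.0.0.1:7851/api/ready"

for _ in range(30):
    try:
        response = requests.get(URL, timeout=2)
        if response.ok:
            print("TTS service responded:", response.text)
            break
    except requests.RequestException:
        pass  # server not up yet
    time.sleep(1)
else:
    print("Timed out waiting for the TTS service.")
```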
`curl -X GET "http://127.0.0.1:7851/api/ready"` @@ -1378,7 +1378,7 @@ Example: 🟠 **narrator_enabled**: Enable or disable the narrator function. If true, minimum text filtering is set to standard. Anything between double quotes is considered the character's speech, and anything between asterisks is considered the narrator's speech. `-d "narrator_enabled=true"`
-`-d "narrator_enabled=false"` +`-d "narrator_enabled=false"` 🟠 **narrator_voice_gen**: The WAV file name for the narrator's voice. diff --git a/diagnostics.py b/diagnostics.py index 8bb45d1c..54b67f2d 100644 --- a/diagnostics.py +++ b/diagnostics.py @@ -37,7 +37,7 @@ def get_requirements_file(): this_dir = Path(__file__).parent # Assuming 'this_dir' is defined as the script's directory requirements_dir = this_dir / 'system' / 'requirements' requirements_files = list(requirements_dir.glob('requirements*.txt')) # Using pathlib for globbing - + if not requirements_files: print("\033[91mNo requirements files found.\033[0m") return None @@ -66,7 +66,7 @@ def get_gpu_info(): return result.stdout except FileNotFoundError: return "NVIDIA GPU information not available" - + def get_cpu_info(): cpu_info = { 'physical_cores': psutil.cpu_count(logical=False), @@ -136,7 +136,7 @@ def find_files_in_path_with_wildcard(pattern): def log_system_info(): # System information os_version = platform.system() + " " + platform.version() - + # Get CUDA_HOME environment variable cuda_home = os.environ.get('CUDA_HOME', 'N/A') @@ -186,7 +186,7 @@ def log_system_info(): path_env = os.environ.get('PATH', 'N/A') # Check for cublas - file_name = 'cublas64_11.*' if platform.system() == "Windows" else 'libcublas.so.11*' + file_name = 'cublas64_12.*' if platform.system() == "Windows" else 'libcublas.so.12*' found_paths = find_files_in_path_with_wildcard(file_name) # Compare with requirements file @@ -228,9 +228,9 @@ def log_system_info(): logging.info(f" CUDA Working: {cuda_test_result}") logging.info(f" CUDA_HOME: {cuda_home}") if found_paths: - logging.info(f" Cublas64_11 Path: {', '.join(found_paths)}") + logging.info(f" Cublas64_12 Path: {', '.join(found_paths)}") else: - logging.info(f" Cublas64_11 Path: Not found in any search path directories.") + logging.info(f" Cublas64_12 Path: Not found in any search path directories.") logging.info("\nPYTHON & PYTORCH:") logging.info(f" Torch Version: {torch_version}") logging.info(f" Python Version: {platform.python_version()}") @@ -268,9 +268,9 @@ def log_system_info(): print(f"\033[94mCUDA Working:\033[0m \033[92m{cuda_test_result}\033[0m") print(f"\033[94mCUDA_HOME:\033[0m \033[92m{cuda_home}\033[0m") if found_paths: - print(f"\033[94mCublas64_11 Path:\033[0m \033[92m{', '.join(found_paths)}\033[0m") + print(f"\033[94mCublas64_12 Path:\033[0m \033[92m{', '.join(found_paths)}\033[0m") else: - print(f"\033[94mCublas64_11 Path:\033[0m \033[91mNot found in any search path directories.\033[0m") + print(f"\033[94mCublas64_12 Path:\033[0m \033[91mNot found in any search path directories.\033[0m") print(f"\033[94m\nTorch Version:\033[0m \033[92m{torch_version}\033[0m") print(f"\033[94mPython Version:\033[0m \033[92m{platform.python_version()}\033[0m") print(f"\033[94mPython Executable:\033[0m \033[92m{python_executable}\033[0m") @@ -302,7 +302,7 @@ def log_system_info(): # Print colored output print(f" {package_name.ljust(max_package_length)} Required: {color_required}{operator} {required_version.ljust(12)}\033[0m Installed: {color_installed}{installed_version}\033[0m") - print("\nOn Nvidia Graphics cards machines, if your \033[92mInstalled\033[0m version of \033[92mTorch\033[0m and \033[92mTorchaudio\033[0m does") + print("\nOn Nvidia Graphics cards machines, if your \033[92mInstalled\033[0m version of \033[92mTorch\033[0m and \033[92mTorchaudio\033[0m does") print("not have \033[92m+cu118\033[0m (Cuda 11.8) or \033[92m+cu121\033[0m (Cuda 12.1) listed after them, you do not have 
CUDA") print("installed for Torch or Torchaudio in this Python environment. This will cause you problems") print("with \033[94mAllTalk\033[0m and \033[94mFinetuning.\033[0m You may have to 'pip install' a new version of torch and") @@ -310,9 +310,9 @@ def log_system_info(): print("your Python environment.\033[0m") print("\033[94m\nRequirements file specifier meanings:\033[0m") explanation = textwrap.dedent(""" - == Exact version != Any version except < Less than + == Exact version != Any version except < Less than <= Less than or equal to > Greater than >= Greater than or equal to - ~ Compatible release ; Environment marker AND Logical AND + ~ Compatible release ; Environment marker AND Logical AND OR Logical OR """) print(explanation.strip()) diff --git a/finetune.py b/finetune.py index 9fb05ba8..f36143f1 100644 --- a/finetune.py +++ b/finetune.py @@ -21,14 +21,14 @@ import json from pathlib import Path from tqdm import tqdm -from faster_whisper import WhisperModel +from faster_whisper import WhisperModel # Use a local Tokenizer to resolve Japanese support # from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners from system.ft_tokenizer.tokenizer import multilingual_cleaners import importlib.metadata as metadata from packaging import version -# STARTUP VARIABLES +# STARTUP VARIABLES this_dir = Path(__file__).parent.resolve() audio_folder = this_dir / "finetune" / "put-voice-samples-in-here" out_path = this_dir / "finetune" / "tmp-trn" @@ -111,7 +111,7 @@ def test_cuda(): else: cuda_status = "CUDA is not available." pfc_status = "fail" # Update global status - return cuda_status, cuda_icon, cuda_home + return cuda_status, cuda_icon, cuda_home def find_files_in_path_with_wildcard(pattern): # Get the site-packages directory of the current Python environment @@ -136,18 +136,18 @@ def find_files_in_path_with_wildcard(pattern): def generate_cuda_markdown(): global pfc_status cuda_status, cuda_icon, cuda_home = test_cuda() - file_name = 'cublas64_11.*' if platform.system() == "Windows" else 'libcublas.so.11*' + file_name = 'cublas64_12.*' if platform.system() == "Windows" else 'libcublas.so.12*' found_paths = find_files_in_path_with_wildcard(file_name) if found_paths: found_paths_str = ' '.join(found_paths) found_path_icon = 'βœ…' else: - found_paths_str = "cublas64_11 is not accessible." + found_paths_str = "cublas64_12 is not accessible." found_path_icon = '❌' pfc_status = "fail" # Update global status # Check if 'cu118' or 'cu121' is in the PyTorch version string pytorch_version = torch.__version__ - if 'cu118' in pytorch_version or 'cu121' in pytorch_version: + if 'cu121' in pytorch_version or 'cu124' in pytorch_version: pytorch_cuda_version_status = '' pytorch_icon = 'βœ…' else: @@ -156,12 +156,12 @@ def generate_cuda_markdown(): pfc_status = "fail" # Update global status cuda_markdown = f""" ### 🟨 CUDA Information
-      {found_path_icon} **Cublas64_11 found:** {found_paths_str} +      {found_path_icon} **Cublas64_12 found:** {found_paths_str}      {pytorch_icon} **CUDA_HOME path:** {cuda_home} """ pytorch_markdown = f""" - ### 🟦 Python & Pytorch Information -      {pytorch_icon} **PyTorch Version:** {pytorch_cuda_version_status} {torch.__version__} + ### 🟦 Python & Pytorch Information +      {pytorch_icon} **PyTorch Version:** {pytorch_cuda_version_status} {torch.__version__}      {cuda_icon} **CUDA is working:** {cuda_status} """ return cuda_markdown, pytorch_markdown @@ -182,7 +182,7 @@ def get_system_ram_markdown(): if torch.cuda.is_available(): gpu_device_id = torch.cuda.current_device() - gpu_device_name = torch.cuda.get_device_name(gpu_device_id) + gpu_device_name = torch.cuda.get_device_name(gpu_device_id) # Get the total and available memory in bytes, then convert to GB gpu_total_mem_gb = torch.cuda.get_device_properties(gpu_device_id).total_memory / (1024 ** 3) # gpu_available_mem_gb = (torch.cuda.get_device_properties(gpu_device_id).total_memory - torch.cuda.memory_allocated(gpu_device_id)) / (1024 ** 3) @@ -238,26 +238,26 @@ def generate_base_model_markdown(base_model_detected): """ return base_model_markdown -def check_tts_version(required_version="0.22.0"): +def check_tts_version(required_version="0.24.1"): global pfc_status try: # Get the installed version of TTS - installed_version = metadata.version("tts") + installed_version = metadata.version("coqui-tts") # Check if the installed version meets the required version if version.parse(installed_version) >= version.parse(required_version): - tts_status = f"TTS version {installed_version} is installed and meets the requirement." + tts_status = f"Coqui-TTS version {installed_version} is installed and meets the requirement." tts_status_icon = "βœ…" else: - tts_status = f"❌ Fail - TTS version {installed_version} is installed but does not meet the required version {required_version}." + tts_status = f"❌ Fail - Coqui-TTS version {installed_version} is installed but does not meet the required version {required_version}." tts_status_icon = "❌" pfc_status = "fail" # Update global status except metadata.PackageNotFoundError: - # If TTS is not installed - tts_status = "TTS is not installed." + # If Coqui-TTS is not installed + tts_status = "Coqui-TTS is not installed." pfc_status = "fail" # Update global status tts_markdown = f""" - ### πŸŸ₯ TTS Information
-      {tts_status_icon} **TTS Version:** {tts_status} + ### πŸŸ₯ Coqui-TTS Information
+      {tts_status_icon} **Coqui-TTS Version:** {tts_status} """ return tts_markdown @@ -527,7 +527,7 @@ def format_audio_list(target_language, whisper_model, out_path, eval_split_numbe def basemodel_or_finetunedmodel_choice(value): - global basemodel_or_finetunedmodel + global basemodel_or_finetunedmodel if value == "Base Model": basemodel_or_finetunedmodel = True elif value == "Existing finetuned model": @@ -716,7 +716,7 @@ def clear_gpu_cache(): # clear the GPU cache if torch.cuda.is_available(): torch.cuda.empty_cache() - + def find_a_speaker_file(folder_path): search_path = folder_path / "*" / "speakers_xtts.pth" files = glob.glob(str(search_path), recursive=True) @@ -812,14 +812,14 @@ def compact_model(xtts_checkpoint_copy): print("THIS DIR:", this_dir) best_model_path_str = str(xtts_checkpoint_copy) # Convert to string print("best_model_path_str", best_model_path_str) - + # Check if the best model file exists if not best_model_path_str: print("[FINETUNE] No trained model was found.") return "No trained model was found." - + print(f"[FINETUNE] Best model path: {best_model_path_str}") - + # Attempt to load the model try: checkpoint = torch.load(best_model_path_str, map_location=torch.device("cpu")) @@ -827,10 +827,10 @@ def compact_model(xtts_checkpoint_copy): except Exception as e: print("[FINETUNE] Error loading checkpoint:", e) raise - + # Define the target directory target_dir = this_dir / "models" / "trainedmodel" - + # Create the target directory if it doesn't exist target_dir.mkdir(parents=True, exist_ok=True) @@ -839,29 +839,29 @@ def compact_model(xtts_checkpoint_copy): for key in list(checkpoint["model"].keys()): if "dvae" in key: del checkpoint["model"][key] - + # Save the modified checkpoint in the target directory torch.save(checkpoint, str(target_dir / "model.pth")) # Convert to string # Specify the files you want to copy files_to_copy = ["vocab.json", "config.json", "speakers_xtts.pth", "mel_stats.pth", "dvae.pth"] - + for file_name in files_to_copy: src_path = this_dir / base_path / base_model_path / file_name dest_path = target_dir / file_name shutil.copy(str(src_path), str(dest_path)) # Convert to string - + source_wavs_dir = this_dir / "finetune" / "tmp-trn" / "wavs" target_wavs_dir = target_dir / "wavs" target_wavs_dir.mkdir(parents=True, exist_ok=True) - + # Iterate through files in the source directory for file_path in source_wavs_dir.iterdir(): # Check if it's a file and larger than 1000 KB if file_path.is_file() and file_path.stat().st_size > 1000 * 1024: # Copy the file to the target directory shutil.copy(str(file_path), str(target_wavs_dir / file_path.name)) # Convert to string - + print("[FINETUNE] Model copied to '/models/trainedmodel/'") return "Model copied to '/models/trainedmodel/'" @@ -920,7 +920,7 @@ def compact_lastfinetuned_model(xtts_checkpoint_copy): if file_path.is_file() and file_path.stat().st_size > 1000 * 1024: # Copy the file to the target directory shutil.copy(str(file_path), str(target_wavs_dir / file_path.name)) - + print("[FINETUNE] Model copied to '/models/lastfinetuned/'") return "Model copied to '/models/lastfinetuned/'" @@ -979,7 +979,7 @@ def compact_custom_model(xtts_checkpoint_copy, folder_path): if file_path.is_file() and file_path.stat().st_size > 1000 * 1024: # Copy the file to the target directory shutil.copy(str(file_path), str(target_wavs_dir / file_path.name)) - + print("[FINETUNE] Model copied to '/models/",folder_path,"/") return f"Model copied to '/models/{folder_path}/'" @@ -1033,7 +1033,7 @@ def 
delete_voice_sample_contents(): ####################### #### OTHER Generic #### ####################### -# define a logger to redirect +# define a logger to redirect class Logger: def __init__(self, filename="finetune.log"): self.log_file = filename @@ -1134,7 +1134,7 @@ def refresh(): description="""XTTS fine-tuning demo\n\n""" """ Example runs: - python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port + python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port """, formatter_class=argparse.RawTextHelpFormatter, ) @@ -1174,7 +1174,7 @@ def refresh(): with gr.Tab("πŸš€ Pre-Flight Checklist"): gr.Markdown( f""" - {pfc_markdown} + {pfc_markdown} {disk_space_results} {system_ram_results} {cuda_results} @@ -1209,42 +1209,42 @@ def refresh(): ) with gr.Tab("🟨 CUDA & Cublas Help"): gr.Markdown( - f""" + f""" {cuda_results}

- β—½ It DOESNT matter what version of CUDA you have installed within Python either, CUDA 11.8, CUDA 12.1 etc. The NVIDIA CUDA Development Toolkit is a completly different and seperate thing from Python/PyTorch.
- β—½ Finetuning simply wants to access a tool within the CUDA Development Toolkit 11.8 called Cublas64_11.
+ β—½ It DOESN'T matter what version of CUDA you have installed within Python either, CUDA 12.1, CUDA 12.4 etc. The NVIDIA CUDA Development Toolkit is a completely different and separate thing from Python/PyTorch.<br>
+ β—½ Finetuning simply wants to access a tool within the CUDA Development Toolkit 12.4 called Cublas64_12.
β—½ If you don't have the toolkit installed, the idea is just to install the smallest bit possible and this will not affect or impact other things on your system.<br>

-       β—½ You will need to download the Nvidia Cuda Toolkit 11.8 network install from link here
+       β—½ You will need to download the Nvidia Cuda Toolkit 12.4 network install from link here
      β—½ 1) Run the installer and select Custom Advanced. Uncheck everything at the top, then expand CUDA, Development > Compiler > and select nvcc, then expand Libraries and select CUBLAS.<br>
      β—½ 2) Back at the top of CUDA, expand Runtime > Libraries and select CUBLAS. Click Next, accept the default path (taking a note of its location) and let the install run.
      β—½ 3) You should be able to drop to your terminal or command prompt and type nvcc --version and have it report Cuda compilation tools, release 11.8. If it does you are good to go. If it doesn't > Step 4.
      β—½ 4) Linux users, you can temporarily add these paths on your current terminal window with (you may need to confirm these are correct for your flavour of Linux):

-                  export LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} (Add it to your ~/.bashrc if you want this to be permanent)
-                  export LD_LIBRARY_PATH=/usr/local/cuda-11.8/bin

+                  export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} (Add it to your ~/.bashrc if you want this to be permanent)
+                  export LD_LIBRARY_PATH=/usr/local/cuda-12.4/bin

                 Windows users need to add the following to the PATH environment variable. Open the Start menu and search for "Environment Variables" or "Edit the system environment variables".<br>
                 Find and select the "Path" variable, then click on the "Edit...". Click on the "New" button and add:

-                 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin.

+                 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin.

      β—½ 5) Once you have these set correctly, you should be able to open a new command prompt/terminal and nvcc --version at the command prompt/terminal, resulting in Cuda compilation tools, release 11.8.
-       β—½ 6) If the nvcc command doesn't work OR it reports a version different from 11.8, finetuning wont work, so you will to double check your environment variables and get them working correctly.
+       β—½ 6) If the nvcc command doesn't work OR it reports a version different from 12.4, finetuning won't work, so you will need to double check your environment variables and get them working correctly.<br> """ ) with gr.Tab("🟦 Python & PyTorch Help"): gr.Markdown( - f""" + f""" {pytorch_results}
""" ) with gr.Tab("🟦 Python & PyTorch Help"): gr.Markdown( - f""" + f""" {pytorch_results}

β—½ On the PyTorch version the:
          - first few digits are the version of PyTorch e.g. 2.1.0 is PyTorch 2.1.0
          - last few digits refer to the CUDA version e.g. cu118 is Cuda 11.8. cu121 is Cuda 12.1.
β—½ Ensure you have started your Python environment before running finetuning, otherwise you will have failures on the above checks.<br>
β—½ If PyTorch does not show a CUDA version, then PyTorch will need reinstalling with CUDA. I would suggest running pip cache purge before installing PyTorch again.
- β—½ It DOESNT matter what version of PyTorch and CUDA you have installed within Python, CUDA 11.8, CUDA 12.1 etc. The NVIDIA CUDA Development Toolkit is a completly different and seperate thing.
- β—½ Finetuning simply wants to access a tool within the CUDA Development Toolkit called Cublas64_11.
+ β—½ It DOESN'T matter what version of PyTorch and CUDA you have installed within Python, CUDA 12.1, CUDA 12.4 etc. The NVIDIA CUDA Development Toolkit is a completely different and separate thing.<br>
+ β—½ Finetuning simply wants to access a tool within the CUDA Development Toolkit called Cublas64_12.
β—½ If you don't have the toolkit installed, the idea is just to install the smallest bit possible and this will not affect or impact other things on your system.<br>
""" ) with gr.Tab("⬛ XTTS Base Model Help"): gr.Markdown( - f""" + f""" {base_model_results}

β—½ If your base model is not being detected, please ensure that finetune.py is being run from the AllTalk main folder.<br>
β—½ Ensure you have started AllTalk normally at least once. You can start it again and it will download any missing files.
@@ -1254,13 +1254,13 @@ def refresh(): ) with gr.Tab("πŸŸ₯ TTS Version Help"): gr.Markdown( - f""" + f""" {tts_version_status}

β—½ If your TTS version is showing as the incorrect version, please reinstall the Finetuning requirements at the command prompt/terminal.
β—½ pip install -r requirements_finetune.txt
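The check behind this tab is the `check_tts_version` logic shown earlier in this diff; a standalone sketch of the same comparison:

```python
# Standalone sketch of the check_tts_version comparison shown earlier:
# read the installed coqui-tts version and compare it to the requirement.
import importlib.metadata as metadata
from packaging import version

required = "0.24.1"
try:
    installed = metadata.version("coqui-tts")
    if version.parse(installed) >= version.parse(required):
        print(f"Coqui-TTS {installed} meets the {required} requirement.")
    else:
        print(f"Coqui-TTS {installed} is older than required {required}.")
except metadata.PackageNotFoundError:
    print("Coqui-TTS is not installed.")
```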
""" ) - + with gr.Tab("ℹ️ General Finetuning info"): gr.Markdown( f""" @@ -1299,7 +1299,7 @@ def refresh(): β—½ You can find information about the Whisper model [here](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages) and you can find data about manually building training data [here](https://docs.coqui.ai/en/latest/formatting_your_dataset.html), as well as details below about the file structure this step performs.
### 🟦 What you need to do β—½ Please read Coqui's guide on what makes a good dataset [here](https://docs.coqui.ai/en/latest/what_makes_a_good_dataset.html#what-makes-a-good-dataset)
- β—½ Place your audio files in {str(audio_folder)} + β—½ Place your audio files in {str(audio_folder)} β—½ Your audio samples can be in the format mp3, wav, or flac.
β—½ You will need a minimum of 2 minutes of audio in either one or multiple audio files. 5 to 10 minutes of audio would probably be better, allowing for more varied sample data to be generated.
β—½ Very small sample files cause errors, so I would recommend that the samples are at least 30 seconds long.<br>
@@ -1319,7 +1319,7 @@ def refresh(): β—½ First time, it needs to download the Whisper model which is 3GB. After that a few minutes on an average 3-4 year old system.
""" ) - + out_path = gr.Textbox( label="Output path (where data and checkpoints will be saved):", value=out_path, @@ -1383,7 +1383,7 @@ def refresh(): demo.load(read_logs, None, logs, every=1) prompt_compute_btn = gr.Button(value="Step 1 - Create dataset") - + def preprocess_dataset(language, whisper_model, out_path, eval_split_number, speaker_name_input, progress=gr.Progress(track_tqdm=True)): clear_gpu_cache() test_for_audio_files = [file for file in os.listdir(audio_folder) if any(file.lower().endswith(ext) for ext in ['.wav', '.mp3', '.flac'])] @@ -1408,7 +1408,7 @@ def preprocess_dataset(language, whisper_model, out_path, eval_split_number, spe print("[FINETUNE] Dataset Generated. Move to Step 2") return "Dataset Generated. Move to Step 2", train_meta, eval_meta - + ####################### #### GRADIO STEP 2 #### ####################### @@ -1418,7 +1418,7 @@ def preprocess_dataset(language, whisper_model, out_path, eval_split_number, spe f""" ### πŸ’» Training
### πŸŸ₯ Important Note - Language support. - β—½ If this step is failing/erroring you may wish to check your training data was created correctly (Detailed in Step 1), confirming that wav files have been generated and your `metadata_train.csv` and `metadata_eval.csv` files have been populated.
+ β—½ If this step is failing/erroring you may wish to check your training data was created correctly (Detailed in Step 1), confirming that wav files have been generated and your `metadata_train.csv` and `metadata_eval.csv` files have been populated.
### 🟦 What you need to do β—½ The Train CSV and Eval CSV should already be populated. If not, just go back to Step 1 and click "Create Dataset" again.
β—½ The default settings below are the suggested settings for most purposes, however you may choose to alter them depending on your specific use case.
@@ -1612,11 +1612,11 @@ def train_model(language, train_csv, eval_csv, learning_rates, num_epochs, batch """ ) - + ####################### #### GRADIO STEP 3 #### ####################### - + with gr.Tab("βœ… Step 3 - Testing"): gr.Markdown( f""" @@ -1832,7 +1832,7 @@ def train_model(language, train_csv, eval_csv, learning_rates, num_epochs, batch ], outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio], ) - + load_btn.click( fn=load_model, inputs=[ @@ -1851,7 +1851,7 @@ def train_model(language, train_csv, eval_csv, learning_rates, num_epochs, batch speaker_reference_audio, ], outputs=[progress_gen, tts_output_audio, reference_audio], - ) + ) compact_btn.click( fn=compact_model, inputs=[xtts_checkpoint_copy], @@ -1885,4 +1885,3 @@ def train_model(language, train_csv, eval_csv, learning_rates, num_epochs, batch server_port=7052, server_name="127.0.0.1", ) - diff --git a/modeldownload.py b/modeldownload.py index f9baab48..e07ecaac 100644 --- a/modeldownload.py +++ b/modeldownload.py @@ -47,12 +47,12 @@ def load_config(file_path): tts_version_required = None for req in requirements: - if req.startswith("TTS=="): + if req.startswith("coqui-tts=="): tts_version_required = req.strip().split("==")[1] break if tts_version_required is None: - raise ValueError(f"[{params['branding']}Startup] \033[91mWarning\033[0m Could not find TTS version specifier in requirements file") + raise ValueError(f"[{params['branding']}Startup] \033[91mWarning\033[0m Could not find Coqui-TTS version specifier in requirements file") def create_directory_if_not_exists(directory): if not directory.exists(): @@ -75,16 +75,16 @@ def download_file(url, destination): def check_tts_version(): try: - tts_version = metadata.version("tts") + tts_version = metadata.version("coqui-tts") print(f"[{params['branding']}Startup] \033[92mCurrent TTS Version : \033[93m"+tts_version+"\033[0m") if version.parse(tts_version) < version.parse(tts_version_required): - print(f"[{params['branding']}Startup] \033[91mWarning\033[0m TTS version is too old. Please upgrade to version \033[93m"+tts_version_required+"\033[0m or later.\033[0m") - print(f"[{params['branding']}Startup] \033[91mWarning\033[0m At your terminal/command prompt \033[94mpip install --upgrade tts\033[0m") + print(f"[{params['branding']}Startup] \033[91mWarning\033[0m Coqui-TTS version is too old. 
Please upgrade to version \033[93m"+tts_version_required+"\033[0m or later.\033[0m") + print(f"[{params['branding']}Startup] \033[91mWarning\033[0m At your terminal/command prompt \033[94mpip install --upgrade coqui-tts\033[0m") else: - print(f"[{params['branding']}Startup] \033[92mCurrent TTS Version is :\033[93m Up to date\033[0m") + print(f"[{params['branding']}Startup] \033[92mCurrent Coqui-TTS Version is :\033[93m Up to date\033[0m") except metadata.PackageNotFoundError: - print(f"[{params['branding']}Startup] \033[91mWarning\033[0m TTS is not installed") + print(f"[{params['branding']}Startup] \033[91mWarning\033[0m Coqui-TTS is not installed") def check_torch_version(): import torch @@ -190,4 +190,4 @@ def update_sha_file(file_path, new_sha): update_sha_file(sha_file_path, latest_commit_sha) elif latest_commit_sha == last_known_commit_sha: #print(f"Your alltalk_tts software is up to date.") - print(f"[{params['branding']}Startup] \033[92m{params['branding']}Github updated :\033[93m {formatted_date}\033[0m") \ No newline at end of file + print(f"[{params['branding']}Startup] \033[92m{params['branding']}Github updated :\033[93m {formatted_date}\033[0m") diff --git a/system/requirements/requirements_docker.txt b/system/requirements/requirements_docker.txt index eacdb94e..71c0a5fd 100644 --- a/system/requirements/requirements_docker.txt +++ b/system/requirements/requirements_docker.txt @@ -1,3 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cu124 absl-py==2.1.0 aiofiles==23.2.1 aiohappyeyeballs==2.3.5 @@ -27,6 +28,8 @@ coloredlogs==15.0.1 confection==0.1.5 contourpy==1.2.1 coqpit==0.0.17 +coqui-tts==0.24.1 +coqui-tts-trainer==0.1.4 ctranslate2==4.3.1 cutlet==0.4.0 cycler==0.12.1 @@ -55,10 +58,10 @@ gradio_client==1.3.0 grpcio==1.65.4 gruut==2.2.3 gruut-ipa==0.13.0 -gruut_lang_de==2.0.1 -gruut_lang_en==2.0.1 -gruut_lang_es==2.0.1 -gruut_lang_fr==2.0.2 +gruut-lang-de==2.0.1 +gruut-lang-en==2.0.1 +gruut-lang-es==2.0.1 +gruut-lang-fr==2.0.2 h11==0.14.0 hangul-romanize==0.1.0 httpcore==1.0.5 @@ -89,7 +92,7 @@ marisa-trie==1.2.0 Markdown==3.6 markdown-it-py==3.0.0 MarkupSafe==2.1.5 -matplotlib==3.9.1.post1 +matplotlib==3.9.2 mdurl==0.1.2 mojimoji==0.0.13 more-itertools==10.4.0 @@ -97,33 +100,34 @@ mpmath==1.3.0 msgpack==1.0.8 multidict==6.0.5 murmurhash==1.0.10 -narwhals==1.3.0 +narwhals==1.4.0 networkx==2.8.8 nltk==3.8.2 num2words==0.5.13 numba==0.60.0 numpy==1.26.4 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-ml-py==12.555.43 nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.6.20 -nvidia-nvtx-cu12==12.1.105 -onnxruntime==1.18.0 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +onnxruntime==1.18.1 orjson==3.10.7 packaging==24.1 -pandas==1.5.3 +pandas==2.2.2 pillow==10.4.0 platformdirs==4.2.2 pooch==1.8.2 preshed==3.0.9 -protobuf==5.27.3 +protobuf==4.25.4 psutil==6.0.0 pycparser==2.22 pydantic==2.8.2 @@ -167,20 +171,19 @@ starlette==0.37.2 SudachiDict-core==20240716 SudachiPy==0.6.8 sympy==1.13.2 -tensorboard==2.16.2 +tensorboard==2.17.0 
tensorboard-data-server==0.7.2 -thinc==8.2.4 +thinc==8.2.5 threadpoolctl==3.5.0 tokenizers==0.19.1 tomlkit==0.12.0 toolz==0.12.1 -torch>=2.4.0 -torchaudio>=2.4.0 +torch==2.4.0+cu124 +torchaudio==2.4.0+cu124 tqdm==4.66.5 trainer==0.0.36 -transformers==4.44.0 +transformers==4.40.2 triton==3.0.0 -TTS==0.22.0 typeguard==4.3.0 typer==0.12.3 typing_extensions==4.12.2 @@ -190,7 +193,7 @@ umap-learn==0.5.6 Unidecode==1.3.8 unidic-lite==1.0.8 urllib3==2.2.2 -uvicorn==0.30.5 +uvicorn==0.30.6 wasabi==1.1.3 weasel==0.4.1 websockets==12.0 diff --git a/system/tts_diff/tts_diff.py b/system/tts_diff/tts_diff.py index 81a795e2..7548de05 100644 --- a/system/tts_diff/tts_diff.py +++ b/system/tts_diff/tts_diff.py @@ -19,7 +19,7 @@ def find_files_in_path_with_wildcard(pattern): sub_directory = os.path.join(sub_directory, "lib") else: sub_directory = os.path.join(sub_directory, "bin") - + # Iterate over each site-packages directory (there can be more than one) for directory in site_packages_path: # Construct the search directory path @@ -32,7 +32,7 @@ def find_files_in_path_with_wildcard(pattern): def detect_cublas(): # Use different file patterns based on the operating system - file_name_pattern = 'cublas64_11.*' if platform.system() == "Windows" else 'libcublas.so.11*' + file_name_pattern = 'cublas64_12.*' if platform.system() == "Windows" else 'libcublas.so.12*' found_paths = find_files_in_path_with_wildcard(file_name_pattern) if found_paths: print("[AllTalk TTSDiff] \033[94mCublas:\033[0m \033[92mDetected\033[0m", found_paths) @@ -40,7 +40,7 @@ def detect_cublas(): print("[AllTalk TTSDiff] \033[94mCublas:\033[0m \033[91mERROR Not Detected\033[0m") print("[AllTalk TTSDiff] \033[94mPlease install Cublas from the Nvidia CUDA Toolkit \033[0mhttps://developer.nvidia.com/cuda-downloads") exit(1) - + parser = argparse.ArgumentParser(description="Compare TTS output with the original text using detailed comparison.") parser.add_argument("--threshold", type=int, default=98, help="Similarity threshold for considering a match (default: 98)") @@ -78,8 +78,8 @@ def disclaimer_text(): print(f" - When you have your ID list, go back into the TTS Generator, correct any lines and regenerate them. 
If you") print(f" want to re-test everthing again after re-generating, you will need to export your list again and re-run this") print(f" script again, against your newly exported JSON list.") - print(f" - This requires access to \033[93mcublas64_11\033[0m, the same as Finetuning.") - print(f" https://github.com/erew123/alltalk_tts/tree/main?#-important-requirements-cuda-118\n") + print(f" - This requires access to \033[93mcublas64_12\033[0m, the same as Finetuning.") + print(f" https://github.com/erew123/alltalk_tts/tree/main?#-important-requirements-cuda-124\n") return try: @@ -103,12 +103,12 @@ def disclaimer_text(): except Exception as e: print(f"[AllTalk TTSDiff] ERROR STARTING SCRIPT:") print("[AllTalk TTSDiff] Failed to load the spaCy language model.") - + # Attempt to download the spaCy model automatically try: print("[AllTalk TTSDiff] Attempting to automatically download the spaCy language model...") subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_md"], check=True) - + # Try to load the model again after downloading nlp = spacy.load("en_core_web_md") print("[AllTalk TTSDiff] Model downloaded and loaded successfully.") @@ -127,29 +127,29 @@ def texts_are_similar(text1, text2, threshold=0.8): # Process the texts through the NLP model for longer texts doc1 = nlp(text1) doc2 = nlp(text2) - + # Compute semantic similarity similarity = doc1.similarity(doc2) - + return similarity >= threshold def normalize_text(text): # Normalize or remove CRLF and other non-standard whitespaces text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ") - + # Convert to lowercase text = text.lower() - + # Standardize and then remove quotation marks text = text.replace("β€œ", '"').replace("”", '"').replace("β€˜", "'").replace("’", "'") text = text.translate(str.maketrans('', '', '"\'')) - + # Remove all other punctuation except hyphens to preserve compound words text = text.translate(str.maketrans('', '', string.punctuation.replace("-", ""))) - + # Collapse any sequence of whitespace (including spaces) into a single space text = re.sub(r'\s+', ' ', text) - + return text.strip() spoken_punctuation_mapping = { @@ -193,21 +193,21 @@ def transcribe_and_compare(file_url, original_text, model, item_id, flagged_ids) except Exception as e: print(f"[AllTalk TTSDiff] Error transcribing file {audio_file_path}: {e}") return # Skip this file if transcription fails - + segments, info = model.transcribe(str(audio_file_path), beam_size=5) transcribed_text = " ".join([segment.text for segment in segments]) - + # Normalize texts for comparison original_text_normalized = normalize_text(original_text) transcribed_text_normalized = normalize_text(transcribed_text) - + # Enhanced comparison using detailed fuzzy matching is_detailed_match = detailed_comparison(original_text_normalized, transcribed_text_normalized, args.threshold) has_spoken_punctuation = contains_spoken_punctuation(transcribed_text_normalized) and not contains_spoken_punctuation(original_text_normalized) - + # Adjust is_match based on detailed comparison and spoken punctuation check is_match = is_detailed_match and not has_spoken_punctuation - + # Only log and flag IDs for review if there's a mismatch or detected issues if not is_match: #print(f"\033[93mMismatch Detected - ID:\033[0m {item_id}") @@ -229,7 +229,7 @@ def main(): model_size = "large-v2" device = "cuda" if torch.cuda.is_available() else "cpu" model = WhisperModel(model_size, device=device, compute_type="float32") - + flagged_ids = [] # Initialize the list to 
track IDs needing review try: @@ -249,7 +249,7 @@ def main(): for item in tts_list: print(f"[AllTalk TTSDiff] Processing ID: {item['id']}") transcribe_and_compare(item['fileUrl'], item['text'], model, item['id'], flagged_ids) - + print(f"[AllTalk TTSDiff]") # Print summary information at the end if flagged_ids: From 7d09b3f6445ab1008d911cc6898f99b3494c0426 Mon Sep 17 00:00:00 2001 From: Martin Rys Date: Tue, 13 Aug 2024 18:54:45 +0200 Subject: [PATCH 3/8] Update deps in requirements_standalone.txt as they're used in modeldownload.py no matter the distribution --- modeldownload.py | 2 +- .../requirements/requirements_standalone.txt | 231 ++++++++++-------- 2 files changed, 132 insertions(+), 101 deletions(-) diff --git a/modeldownload.py b/modeldownload.py index e07ecaac..32efe1d6 100644 --- a/modeldownload.py +++ b/modeldownload.py @@ -41,7 +41,7 @@ def load_config(file_path): print(f"[{params['branding']}Startup] \033[91mWarning\033[0m modeldownload.json is missing so please re-download it and save it in the coquii_tts main folder.") print(f"[{params['branding']}Startup] \033[91mWarning\033[0m API Local and XTTSv2 Local will error unless this is corrected.") -# Read the version specifier from requirements_nvidia.txt +# Read the version specifier from requirements_standalone.txt with open(this_dir / "system" / "requirements" / "requirements_standalone.txt", "r") as req_file: requirements = req_file.readlines() diff --git a/system/requirements/requirements_standalone.txt b/system/requirements/requirements_standalone.txt index 59cec9c2..57386236 100644 --- a/system/requirements/requirements_standalone.txt +++ b/system/requirements/requirements_standalone.txt @@ -1,172 +1,203 @@ +--extra-index-url https://download.pytorch.org/whl/cu124 absl-py==2.1.0 aiofiles==23.2.1 -aiohttp==3.9.3 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.3 aiosignal==1.3.1 -altair==5.2.0 -annotated-types==0.6.0 +altair==5.4.0 +annotated-types==0.7.0 anyascii==0.3.2 -anyio==4.3.0 -attrs==23.2.0 +anyio==4.4.0 +attrs==24.2.0 audioread==3.0.1 -av==11.0.0 -Babel==2.14.0 +av==12.3.0 +babel==2.16.0 bangla==0.0.2 -blinker==1.7.0 +blinker==1.8.2 blis==0.7.11 bnnumerizer==0.0.2 -bnunicodenormalizer==0.1.6 +bnunicodenormalizer==0.1.7 catalogue==2.0.10 -certifi==2024.2.2 -cffi==1.16.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 -cloudpathlib==0.16.0 +cloudpathlib==0.18.1 colorama==0.4.6 coloredlogs==15.0.1 -confection==0.1.4 -contourpy==1.2.0 +confection==0.1.5 +contourpy==1.2.1 coqpit==0.0.17 -ctranslate2==4.1.0 +coqui-tts==0.24.1 +coqui-tts-trainer==0.1.4 +ctranslate2==4.3.1 cutlet==0.4.0 cycler==0.12.1 cymem==2.0.8 -Cython==3.0.9 +Cython==3.0.11 dateparser==1.1.8 decorator==5.1.1 docopt==0.6.2 -einops==0.7.0 +einops==0.8.0 encodec==0.1.1 -fastapi==0.110.0 -faster-whisper==1.0.1 -ffmpy==0.3.2 -filelock==3.13.3 -Flask==3.0.2 +exceptiongroup==1.2.2 +fastapi==0.112.0 +faster-whisper==1.0.3 +ffmpy==0.4.0 +filelock==3.15.4 +Flask==3.0.3 flatbuffers==24.3.25 -fonttools==4.50.0 +fonttools==4.53.1 frozenlist==1.4.1 -fsspec==2024.3.1 -fugashi==1.3.1 -fuzzywuzzy>=0.18.0 +fsspec==2024.6.1 +fugashi==1.3.2 +fuzzywuzzy==0.18.0 g2pkk==0.1.2 -gradio==3.50.2 -gradio_client==0.6.1 -grpcio==1.62.1 +gradio==4.41.0 +gradio_client==1.3.0 +grpcio==1.65.4 gruut==2.2.3 gruut-ipa==0.13.0 -gruut-lang-de==2.0.0 -gruut-lang-en==2.0.0 -gruut-lang-es==2.0.0 +gruut-lang-de==2.0.1 +gruut-lang-en==2.0.1 +gruut-lang-es==2.0.1 gruut-lang-fr==2.0.2 h11==0.14.0 hangul-romanize==0.1.0 -httpcore==1.0.4 +httpcore==1.0.5 httpx==0.27.0 
-huggingface-hub==0.22.1 +huggingface-hub==0.24.5 humanfriendly==10.0 -idna==3.6 -importlib_metadata==7.1.0 +idna==3.7 +importlib_metadata==8.2.0 importlib_resources==6.4.0 -inflect==7.0.0 -itsdangerous==2.1.2 -jaconv==0.3.4 +inflect==7.3.1 +itsdangerous==2.2.0 +jaconv==0.4.0 jamo==0.4.1 jieba==0.42.1 -Jinja2==3.1.3 -joblib==1.3.2 +Jinja2==3.1.4 +joblib==1.4.2 jsonlines==1.2.0 -jsonschema==4.21.1 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 kiwisolver==1.4.5 -langcodes==3.3.0 -lazy_loader==0.3 -librosa==0.10.1 -llvmlite==0.42.0 +langcodes==3.4.0 +language_data==1.2.0 +lazy_loader==0.4 +Levenshtein==0.25.1 +librosa==0.10.2.post1 +llvmlite==0.43.0 +marisa-trie==1.2.0 Markdown==3.6 +markdown-it-py==3.0.0 MarkupSafe==2.1.5 -matplotlib==3.8.3 +matplotlib==3.9.2 +mdurl==0.1.2 mojimoji==0.0.13 +more-itertools==10.4.0 mpmath==1.3.0 msgpack==1.0.8 multidict==6.0.5 murmurhash==1.0.10 +narwhals==1.4.0 networkx==2.8.8 -nltk==3.8.1 +nltk==3.8.2 num2words==0.5.13 -numba==0.59.1 +numba==0.60.0 numpy==1.26.4 -nvidia-cublas-cu11>=11.11.3.6 -nvidia-cudnn-cu11>=9.0.0.312 -onnxruntime==1.17.1 -orjson==3.9.15 -packaging==24.0 -pandas==1.5.3 -pillow==10.2.0 -platformdirs==4.2.0 -pooch==1.8.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-ml-py==12.555.43 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +onnxruntime==1.18.1 +orjson==3.10.7 +packaging==24.1 +pandas==2.2.2 +pillow==10.4.0 +platformdirs==4.2.2 +pooch==1.8.2 preshed==3.0.9 -protobuf==5.26.0 -psutil==5.9.8 -pycparser==2.21 -pydantic==2.6.4 -pydantic_core==2.16.3 +protobuf==4.25.4 +psutil==6.0.0 +pycparser==2.22 +pydantic==2.8.2 +pydantic_core==2.20.1 pydub==0.25.1 -pynndescent==0.5.11 +Pygments==2.18.0 +pynndescent==0.5.13 pyparsing==3.1.2 -pypinyin==0.51.0 +pypinyin==0.52.0 pyreadline3==3.4.1 pysbd==0.3.4 python-crfsuite==0.9.10 python-dateutil==2.9.0.post0 -python-Levenshtein>=0.25.0 +python-Levenshtein==0.25.1 python-multipart==0.0.9 pytz==2024.1 -PyYAML==6.0.1 -referencing==0.34.0 -regex==2023.12.25 -requests==2.31.0 -rpds-py==0.18.0 -safetensors==0.4.2 -scikit-learn==1.4.1.post1 -scipy==1.12.0 +PyYAML==6.0.2 +rapidfuzz==3.9.6 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rpds-py==0.20.0 +ruff==0.5.7 +safetensors==0.4.4 +scikit-learn==1.5.1 +scipy==1.14.0 semantic-version==2.10.0 +shellingham==1.5.4 six==1.16.0 -smart-open==6.4.0 +smart-open==7.0.4 sniffio==1.3.1 -sounddevice==0.4.6 +sounddevice==0.5.0 soundfile==0.12.1 -soxr==0.3.7 -spacy==3.7.4 +soxr==0.4.0 +spacy==3.7.5 spacy-legacy==3.0.12 spacy-loggers==1.0.5 srsly==2.4.8 -starlette==0.36.3 -SudachiDict-core==20240109 +starlette==0.37.2 +SudachiDict-core==20240716 SudachiPy==0.6.8 -sympy==1.12 -tensorboard==2.16.2 +sympy==1.13.2 +tensorboard==2.17.0 tensorboard-data-server==0.7.2 -thinc==8.2.3 -threadpoolctl==3.4.0 -tokenizers==0.15.2 +thinc==8.2.5 +threadpoolctl==3.5.0 +tokenizers==0.19.1 +tomlkit==0.12.0 toolz==0.12.1 -torch>=2.2.0 -torchaudio>=2.2.0 -tqdm==4.66.2 +torch==2.4.0+cu124 +torchaudio==2.4.0+cu124 +tqdm==4.66.5 trainer==0.0.36 -transformers==4.39.1 -TTS==0.22.0 -typer==0.9.4 -typing_extensions==4.10.0 +transformers==4.40.2 +triton==3.0.0 +typeguard==4.3.0 +typer==0.12.3 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 -umap-learn==0.5.5 
+umap-learn==0.5.6 Unidecode==1.3.8 unidic-lite==1.0.8 -urllib3==2.2.1 -uvicorn==0.29.0 -wasabi==1.1.2 -weasel==0.3.4 -websockets==11.0.3 -Werkzeug==3.0.1 +urllib3==2.2.2 +uvicorn==0.30.6 +wasabi==1.1.3 +weasel==0.4.1 +websockets==12.0 +Werkzeug==3.0.3 +wrapt==1.16.0 yarl==1.9.4 -zipp==3.18.1 +zipp==3.20.0 From 25da3413cf698dbfa7e813ef7a5fb1edfb52899c Mon Sep 17 00:00:00 2001 From: Martin Rys Date: Tue, 13 Aug 2024 19:04:56 +0200 Subject: [PATCH 4/8] Tiny script and compose fixes --- docker-compose-cuda.yml | 3 +-- launch.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docker-compose-cuda.yml b/docker-compose-cuda.yml index 7532ca8d..18de75d7 100644 --- a/docker-compose-cuda.yml +++ b/docker-compose-cuda.yml @@ -1,4 +1,3 @@ -version: '3.8' services: alltalk-tts: image: erew123/alltalk_tts:cuda @@ -17,4 +16,4 @@ services: devices: - driver: nvidia count: all - capabilities: [ gpu ] + capabilities: [gpu] diff --git a/launch.sh b/launch.sh index 97708c3d..a592accd 100644 --- a/launch.sh +++ b/launch.sh @@ -2,4 +2,4 @@ python modeldownload.py uvicorn tts_server:app --host 0.0.0.0 --port 7851 --workers 1 --proxy-headers & sleep 5 -python script.py \ No newline at end of file +python script.py From e96df1b3ebff1f238cf4ab48484553e006ff7cc0 Mon Sep 17 00:00:00 2001 From: Martin Rys Date: Tue, 13 Aug 2024 19:33:30 +0200 Subject: [PATCH 5/8] Hack out 11.8 mentions --- finetune.py | 8 ++++---- system/admin.html | 38 +++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/finetune.py b/finetune.py index f36143f1..6e16c68b 100644 --- a/finetune.py +++ b/finetune.py @@ -1217,14 +1217,14 @@ def refresh():       β—½ You will need to download the Nvidia Cuda Toolkit 12.4 network install from link here
      β—½ 1) Run the installer and select Custom Advanced Uncheck everything at the top then expand CUDA, Development > Compiler > and select nvcc then expand Libraries and select CUBLAS.
      β—½ 2) Back at the top of CUDA, expand Runtime > Libraries and select CUBLAS. Click Next, accept the default path (taking a note of its location) and let the install run.
-       β—½ 3) You should be able to drop to your terminal or command prompt and type nvcc --version and have it report Cuda compilation tools, release 11.8. If it does you are good to go. If it doesn't > Step 4.
+       β—½ 3) You should be able to drop to your terminal or command prompt and type nvcc --version and have it report Cuda compilation tools, release 12.4. If it does you are good to go. If it doesn't > Step 4.
      β—½ 4) Linux users, you can temporarily add these paths on your current terminal window with (you may need to confirm these are correct for your flavour of Linux):

                 export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} (Add it to your ~/.bashrc if you want this to be permanent)
                 export LD_LIBRARY_PATH=/usr/local/cuda-12.4/bin

                 Windows users need the add the following to the PATH environment variable. Start menu and search for "Environment Variables" or "Edit the system environment variables.".
                 Find and select the "Path" variable, then click on the "Edit...". Click on the "New" button and add:

                C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin.

-       β—½ 5) Once you have these set correctly, you should be able to open a new command prompt/terminal and nvcc --version at the command prompt/terminal, resulting in Cuda compilation tools, release 11.8.
+       β—½ 5) Once you have these set correctly, you should be able to open a new command prompt/terminal and nvcc --version at the command prompt/terminal, resulting in Cuda compilation tools, release 12.4.
      β—½ 6) If the nvcc command doesn't work OR it reports a version different from 12.4, finetuning wont work, so you will to double check your environment variables and get them working correctly.
""" ) @@ -1234,7 +1234,7 @@ def refresh(): {pytorch_results}

β—½ On the PyTorch version the:
          - first few digits are the version of PyTorch e.g. 2.1.0 is PyTorch 2.1.0
-           - last few digits refer to the CUDA version e.g. cu118 is Cuda 11.8. cu121 is Cuda 12.1.
+           - last few digits refer to the CUDA version e.g. cu124 is Cuda 12.4.
β—½ Ensure you have started your Python environment before running finetuning, otherwise you will have failures on the above checks.<br>
β—½ If PyTorch does not show a CUDA version, then PyTorch will need reinstalling with CUDA. I would suggest running pip cache purge before installing PyTorch again.
β—½ It DOESN'T matter what version of PyTorch and CUDA you have installed within Python, CUDA 12.1, CUDA 12.4 etc. The NVIDIA CUDA Development Toolkit is a completely different and separate thing.<br>
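The pre-flight test behind these bullets (see the `generate_cuda_markdown` change earlier in this patch series) reduces to a simple version-string check; a sketch:

```python
# Sketch of the pre-flight PyTorch/CUDA check from the finetune.py diff:
# a CUDA-enabled wheel carries a "cu121" or "cu124" marker in its version.
import torch

pytorch_version = torch.__version__
if "cu121" in pytorch_version or "cu124" in pytorch_version:
    print(f"PyTorch {pytorch_version} is a CUDA build.")
else:
    print(f"PyTorch {pytorch_version} has no CUDA marker - reinstall with CUDA.")
```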
@@ -1747,7 +1747,7 @@ def train_model(language, train_csv, eval_csv, learning_rates, num_epochs, batch β—½ If you are not going to train anything again, you can delete the whisper model from inside of your huggingface cache (3GB approx)
        β—½ **Linux:** ~/.cache/huggingface/hub/(folder-here)
        β—½ **Windows:** C:\users(your-username)\.cache\huggingface\hub\(folder-here).
- β—½ You can also uninstall the Nvidia CUDA 11.8 Toolkit if you wish and remove your environment variable entries.
+ β—½ You can also uninstall the Nvidia CUDA 12.4 Toolkit if you wish and remove your environment variable entries.
""" ) final_progress_data = gr.Label( diff --git a/system/admin.html b/system/admin.html index c4c03b03..81c975d7 100644 --- a/system/admin.html +++ b/system/admin.html @@ -1019,9 +1019,9 @@

DeepSpeed Installation
  • The toolkit is crucial for DeepSpeed to compile/build for your version of Linux and requires around 3GB of disk space.
  • Install using your package manager (Recommended) e.g. CUDA Toolkit - 11.8 or download directly from or download directly from Nvidia CUDA Toolkit Archive - (choose 11.8 or 12.1 for Linux).
  • + (choose 12.4.1).
  • @@ -1089,7 +1089,7 @@

    DeepSpeed Installation
  • You can confirm the path is set correctly and working by running the command nvcc --version should confirm - Cuda compilation tools, release 11.8.. + Cuda compilation tools, release 12.4..

  • @@ -1128,9 +1128,9 @@

    DeepSpeed Installation fo
  • The toolkit is crucial for DeepSpeed to compile/build for your version of Linux and requires around 3GB of disk space.
  • Install using your package manager (Recommended) e.g. CUDA Toolkit - 11.8 or download directly from or download directly from Nvidia CUDA Toolkit Archive - (choose 11.8 or 12.1 for Linux).
  • + (choose 12.4).

  • @@ -1208,7 +1208,7 @@

    DeepSpeed Installation fo
  • You can confirm the path is set correctly and working by running the command nvcc --version should confirm - Cuda compilation tools, release 11.8.. + Cuda compilation tools, release 12.4..

  • @@ -1523,7 +1523,7 @@

    How do I know if I am in

    I've paid very close attention to not impact what Text-generation-webui is requesting on a factory install. This is one of the requirements of submitting an extension to Text-generation-webui. If you want to look at a comparison of a factory fresh text-generation-webui installed packages (with cuda 12.1, - though AllTalk's requirements were set on cuda 11.8) you can find that comparison here. This comparison shows that AllTalk is requesting the same package version numbers as Text-generation-webui or even lower version numbers (meaning AllTalk will not update them to a later version). What other extensions do, I can't really account for that. @@ -1890,16 +1890,16 @@

    ⚫ Prerequisites for Fine-tuning - with Nvidia CUDA Development Toolkit 11.8

    + with Nvidia CUDA Development Toolkit 12.4
  • All the requirements for Finetuning will be installed by using the atsetup utility and installing your correct requirements (Standalone or for Text-generation-webui). The legacy manual instructions are stored below, however these shouldn't be required.

    - Legacy manual instructions for installing Nvidia CUDA Development Toolkit 11.8
    - - To perform fine-tuning, a specific portion of the **Nvidia CUDA Development Toolkit v11.8** must be installed. + Legacy manual instructions for installing Nvidia CUDA Development Toolkit 12.4
    + - To perform fine-tuning, a specific portion of the **Nvidia CUDA Development Toolkit v12.4** must be installed. This is crucial for step 1 of fine-tuning. The objective is to minimize the installation footprint by installing only the essential components.
    - - The **Nvidia CUDA Development Toolkit v11.8** operates independently from your graphics card drivers and the + - The **Nvidia CUDA Development Toolkit v12.4** operates independently from your graphics card drivers and the CUDA version utilized by your Python environment.
    - This installation process aims to keep the download and install size as minimal as possible, however a full install of the toolkit requires 3GB of disk space.
    @@ -1910,8 +1910,8 @@

    Nvidia's +
  • Obtain the network install version of the Nvidia CUDA Development Toolkit 12.4 + from Nvidia's Archive.
  • @@ -1935,16 +1935,16 @@

  • Open a new terminal/command prompt to refresh the search paths.
  • In a terminal or command prompt, execute nvcc --version.
-  • Success is indicated by a response of Cuda compilation tools, release 11.8. Specifically, ensure it is version 11.8.
+  • Success is indicated by a response of Cuda compilation tools, release 12.4. Specifically, ensure it is version 12.4.
@@ -1993,7 +1993,7 @@

    Additional Note on Torch and Torchaudio:

-    • Ensure Torch and Torchaudio are CUDA-enabled (any version), which is separate from the CUDA Toolkit installation. CUDA 11.8 corresponds to cu118 and CUDA 12.1 to cu121 in AllTalk diagnostics.
+    • Ensure Torch and Torchaudio are CUDA-enabled (any version), which is separate from the CUDA Toolkit installation. CUDA 12.4 corresponds to cu124 in AllTalk diagnostics.
    • Failure to install CUDA for Torch and Torchaudio will result in Step 2 of fine-tuning failing. These requirements are distinct from the CUDA Toolkit installation, so avoid conflating the two.
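The point above is quick to verify from Python. A minimal sketch; the "+cu124" local version tag is the marker the diagnostics patch below looks for:

    import torch
    import torchaudio

    # CUDA-enabled wheels carry a "+cuXXX" local version tag, e.g. "2.4.0+cu124".
    print("torch:", torch.__version__)
    print("torchaudio:", torchaudio.__version__)
    print("CUDA available:", torch.cuda.is_available())
    if "+cu" not in torch.__version__:
        print("CPU-only torch build detected; step 2 of fine-tuning would fail.")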
From de051c5148f7ca94780b7ee8a72549ce7f8bd0cc Mon Sep 17 00:00:00 2001
From: Martin Rys
Date: Tue, 13 Aug 2024 19:33:44 +0200
Subject: [PATCH 6/8] Also hack out 11.8 mentions in diagnostics.py

---
 diagnostics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/diagnostics.py b/diagnostics.py
index 54b67f2d..49dbe98f 100644
--- a/diagnostics.py
+++ b/diagnostics.py
@@ -303,7 +303,7 @@ def log_system_info():
         print(f"    {package_name.ljust(max_package_length)} Required: {color_required}{operator} {required_version.ljust(12)}\033[0m Installed: {color_installed}{installed_version}\033[0m")
 
     print("\nOn Nvidia Graphics cards machines, if your \033[92mInstalled\033[0m version of \033[92mTorch\033[0m and \033[92mTorchaudio\033[0m does")
-    print("not have \033[92m+cu118\033[0m (Cuda 11.8) or \033[92m+cu121\033[0m (Cuda 12.1) listed after them, you do not have CUDA")
+    print("not have \033[92m+cu124\033[0m (Cuda 12.4) listed after them, you do not have CUDA")
     print("installed for Torch or Torchaudio in this Python environment. This will cause you problems")
     print("with \033[94mAllTalk\033[0m and \033[94mFinetuning.\033[0m You may have to 'pip install' a new version of torch and")
     print("torchaudio, using '\033[94m--upgrade --force-reinstall\033[0m' with the correct version of PyTorch for\033[0m")

From e1f70915a79d1aae2f0b5f1b6dd33589593c2ca5 Mon Sep 17 00:00:00 2001
From: Martin Rys
Date: Tue, 13 Aug 2024 19:52:19 +0200
Subject: [PATCH 7/8] admin.html: Fix typo

---
 system/admin.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/system/admin.html b/system/admin.html
index 81c975d7..f70598ba 100644
--- a/system/admin.html
+++ b/system/admin.html
@@ -631,7 +631,7 @@

      🏰 TTS Generation Methods in SillyTavern

      a time and wait for Ready before changing the next setting. To set these options long term you can apply the settings at the top of this page.

      🏰 AllTalk Narrator

-      Only available on the Standard Audo Generation method.
+      Only available on the Standard Audio Generation method.

      • Narrator Voice Selection:
From 75bd71ba0a9782552164bca28d30f64002be579f Mon Sep 17 00:00:00 2001
From: Martin Rys
Date: Tue, 13 Aug 2024 20:10:33 +0200
Subject: [PATCH 8/8] Sort the voices in WebUI

---
 script.py     |  6 +++---
 tts_server.py | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/script.py b/script.py
index c7e579b3..43dd2058 100644
--- a/script.py
+++ b/script.py
@@ -39,7 +39,7 @@ def load_config(file_path):
     with open(file_path, "r") as config_file:
         config = json.load(config_file)
     return config
-    
+
 config_file_path = this_dir / "confignew.json"
 # Load the params dictionary from the confignew.json file
 params = load_config(config_file_path)
@@ -112,7 +112,7 @@ def update_config(config_file_path, update_config_path, downgrade_config_path):
 #########################################
 #### Continue on with Startup Checks ####
 #########################################
-    
+
 # Required for sentence splitting
 try:
     from TTS.api import TTS
@@ -122,7 +122,7 @@ def update_config(config_file_path, update_config_path, downgrade_config_path):
     print(f"[{params['branding']}]\033[91mWarning\033[0m Could not find the TTS module. Make sure to install the requirements for the {params['branding']} extension.")
     print(f"[{params['branding']}]\033[91mWarning\033[0m Please use the ATSetup utility or check the Github installation instructions.")
     # Re-raise the ModuleNotFoundError to stop the program and print the traceback
-    raise 
+    raise
 
 # Suppress logging
 logging.disable(logging.ERROR)
diff --git a/tts_server.py b/tts_server.py
index 0cb65d91..41865385 100644
--- a/tts_server.py
+++ b/tts_server.py
@@ -499,7 +499,7 @@ async def generate_audio_internal(text, voice, language, temperature, repetition
     if params["low_vram"] and device == "cpu":
         await switch_device()
     generate_start_time = time.time()  # Record the start time of generating TTS
-    
+
     # XTTSv2 LOCAL & Xttsv2 FT Method
     if params["tts_method_xtts_local"] or tts_method_xtts_ft:
         print(f"[{params['branding']}TTSGen] {text}")
@@ -560,12 +560,12 @@ async def generate_audio_internal(text, voice, language, temperature, repetition
                     chunk = np.clip(chunk, -1, 1)
                     chunk = (chunk * 32767).astype(np.int16)
                     yield chunk.tobytes()
-                    print(f"[{params['branding']}Debug] Stream audio generation: Yielded audio chunk {i}.") if debug_generate_audio else None 
+                    print(f"[{params['branding']}Debug] Stream audio generation: Yielded audio chunk {i}.") if debug_generate_audio else None
         else:
             # Non-streaming-specific operation
             torchaudio.save(output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000)
-    
+
     # API LOCAL Methods
     elif params["tts_method_api_local"]:
         # Streaming only allowed for XTTSv2 local
@@ -645,7 +645,7 @@ def list_files(directory):
         for f in os.listdir(directory)
         if os.path.isfile(os.path.join(directory, f)) and f.endswith(".wav")
     ]
-    return files
+    return sorted(files)
 
 #############################
 #### JSON CONFIG UPDATER ####
@@ -774,7 +774,7 @@ async def get_audio(filename: str):
     audio_path = Path("outputs") / filename
     if not audio_path.is_file():
         raise HTTPException(status_code=404, detail="File not found")
-    
+
     response = FileResponse(
         path=audio_path,
         media_type='audio/wav',
@@ -808,7 +808,7 @@ async def preview_voice(request: Request, voice: str = Form(...)):
     # Clean the voice filename for inclusion in the text
     clean_voice_filename = re.sub(r'\.wav$', '', voice.replace(' ', '_'))
     clean_voice_filename = re.sub(r'[^a-zA-Z0-9]', ' ', clean_voice_filename)
-    
+
     # Generate the audio
     text = f"Hello, this is a preview of voice {clean_voice_filename}."
@@ -1134,10 +1134,10 @@ async def tts_generate(
     else:
         cleaned_string = text_input
     response = await generate_audio(cleaned_string, character_voice_gen, language, temperature, repetition_penalty, output_file_path, streaming)
-    if sounddevice_installed == False or streaming == True:
+    if not sounddevice_installed or streaming:
         autoplay = False
     if autoplay:
-        play_audio(output_file_path, autoplay_volume) 
+        play_audio(output_file_path, autoplay_volume)
     if streaming:
         return StreamingResponse(response, media_type="audio/wav")
     return JSONResponse(content={"status": "generate-success", "output_file_path": str(output_file_path), "output_file_url": str(output_file_url), "output_cache_url": str(output_cache_url)}, status_code=200)
@@ -1238,11 +1238,11 @@ async def srt_generation():
     ttslist_path = this_dir / "outputs" / "ttsList.json"
     wavfile_path = this_dir / "outputs"
     subprocess.run(["python", "tts_srt.py", f"--ttslistpath={ttslist_path}", f"--wavfilespath={wavfile_path}"], cwd=this_dir / "system" / "tts_srt", env=env)
-    
+
     srt_file_path = this_dir / "outputs" / "subtitles.srt"
     if not srt_file_path.exists():
         raise HTTPException(status_code=404, detail="Subtitle file not found.")
-    
+
     return FileResponse(path=srt_file_path, filename="subtitles.srt", media_type='application/octet-stream')

###################################################
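One note on the `return sorted(files)` change above: Python's default string ordering is case-sensitive, so uppercase voice names sort ahead of lowercase ones. If mixed-case voice files are common, a case-insensitive key may be preferable (a suggestion, not part of the patch):

    files = ["Zoe.wav", "anna.wav", "Ben.wav"]
    print(sorted(files))                 # ['Ben.wav', 'Zoe.wav', 'anna.wav']
    print(sorted(files, key=str.lower))  # ['anna.wav', 'Ben.wav', 'Zoe.wav']

The `if not sounddevice_installed or streaming:` rewrite in the same patch is behavior-preserving for boolean values and is the idiomatic form of the `== False` / `== True` comparisons it replaces.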