diff --git a/F5TTS.py b/F5TTS.py
index 29facc1..434c7c6 100644
--- a/F5TTS.py
+++ b/F5TTS.py
@@ -12,6 +12,7 @@ import sys
 import numpy as np
 import re
+import io
 from comfy.utils import ProgressBar
 from cached_path import cached_path
 sys.path.append(Install.f5TTSPath)
@@ -24,60 +25,32 @@ sys.path.pop()
 
 
-class F5TTSAudio:
-
-    def __init__(self):
-        self.use_cli = False
-        self.voice_reg = re.compile(r"\{(\w+)\}")
+class F5TTSCreate:
+    voice_reg = re.compile(r"\{(\w+)\}")
 
-    @staticmethod
-    def get_txt_file_path(file):
-        p = Path(file)
-        return os.path.join(os.path.dirname(file), p.stem + ".txt")
+    def is_voice_name(self, word):
+        return self.voice_reg.match(word.strip())
 
-    @classmethod
-    def INPUT_TYPES(s):
-        input_dir = folder_paths.get_input_directory()
-        files = folder_paths.filter_files_content_types(
-            os.listdir(input_dir), ["audio", "video"]
-        )
-        filesWithTxt = []
-        for file in files:
-            txtFile = F5TTSAudio.get_txt_file_path(file)
-            if os.path.isfile(os.path.join(input_dir, txtFile)):
-                filesWithTxt.append(file)
-        return {
-            "required": {
-                "sample": (sorted(filesWithTxt), {"audio_upload": True}),
-                "speech": ("STRING", {
-                    "multiline": True,
-                    "default": "Hello World"
-                }),
-            }
-        }
+    def get_voice_names(self, chunks):
+        voice_names = {}
+        for text in chunks:
+            match = self.is_voice_name(text)
+            if match:
+                voice_names[match[1]] = True
+        return voice_names
 
-    CATEGORY = "audio"
+    def split_text(self, speech):
+        reg1 = r"(?=\{\w+\})"
+        return re.split(reg1, speech)
 
-    RETURN_TYPES = ("AUDIO", )
-    FUNCTION = "create"
+    @staticmethod
+    def load_voice(ref_audio, ref_text):
+        main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
 
-    def create_with_cli(self, audio_path, audio_text, speech, output_dir):
-        subprocess.run(
-            [
-                "python", "inference-cli.py", "--model", "F5-TTS",
-                "--ref_audio", audio_path, "--ref_text", audio_text,
-                "--gen_text", speech,
-                "--output_dir", output_dir
-            ],
-            cwd=Install.f5TTSPath
+        main_voice["ref_audio"], main_voice["ref_text"] = preprocess_ref_audio_text(  # noqa E501
+            ref_audio, ref_text
         )
-        output_audio = os.path.join(output_dir, "out.wav")
-        with wave.open(output_audio, "rb") as wave_file:
-            frame_rate = wave_file.getframerate()
-
-        waveform, sample_rate = torchaudio.load(output_audio)
-        audio = {"waveform": waveform.unsqueeze(0), "sample_rate": frame_rate}
-        return audio
+        return main_voice
 
     def load_model(self):
         model_cls = DiT
@@ -95,29 +68,6 @@ def load_model(self):
         ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file)
         return ema_model
 
-    def load_voice(self, ref_audio, ref_text):
-        main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
-
-        main_voice["ref_audio"], main_voice["ref_text"] = preprocess_ref_audio_text(  # noqa E501
-            ref_audio, ref_text
-        )
-        return main_voice
-
-    def is_voice_name(self, word):
-        return self.voice_reg.match(word.strip())
-
-    def get_voice_names(self, chunks):
-        voice_names = {}
-        for text in chunks:
-            match = self.is_voice_name(text)
-            if match:
-                voice_names[match[1]] = True
-        return voice_names
-
-    def split_text(self, speech):
-        reg1 = r"(?=\{\w+\})"
-        return re.split(reg1, speech)
-
     def generate_audio(self, voices, model_obj, chunks):
         frame_rate = 44100
         generated_audio_segments = []
@@ -133,7 +83,7 @@ def generate_audio(self, voices, model_obj, chunks):
             if voice not in voices:
                 print(f"Voice {voice} not found, using main.")
                 voice = "main"
-            text = self.voice_reg.sub("", text)
+            text = F5TTSCreate.voice_reg.sub("", text)
             gen_text = text.strip()
             ref_audio = voices[voice]["ref_audio"]
             ref_text = voices[voice]["ref_text"]
@@ -160,6 +110,137 @@ def generate_audio(self, voices, model_obj, chunks):
         os.unlink(wave_file.name)
         return audio
 
+    def create(self, voices, chunks):
+        model_obj = self.load_model()
+        return self.generate_audio(voices, model_obj, chunks)
+
+
+class F5TTSAudioInputs:
+    def __init__(self):
+        self.wave_file = None
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "sample_audio": ("AUDIO",),
+                "sample_text": ("STRING", {"default": "Text of sample_audio"}),
+                "speech": ("STRING", {
+                    "multiline": True,
+                    "default": "This is what I want to say"
+                }),
+            },
+        }
+
+    CATEGORY = "audio"
+
+    RETURN_TYPES = ("AUDIO", )
+    FUNCTION = "create"
+
+    def load_voice_from_input(self, sample_audio, sample_text):
+        self.wave_file = tempfile.NamedTemporaryFile(
+            suffix=".wav", delete=False
+        )
+        for (batch_number, waveform) in enumerate(
+                sample_audio["waveform"].cpu()):
+            buff = io.BytesIO()
+            torchaudio.save(
+                buff, waveform, sample_audio["sample_rate"], format="WAV"
+            )
+            with open(self.wave_file.name, 'wb') as f:
+                f.write(buff.getbuffer())
+            break
+        r = F5TTSCreate.load_voice(self.wave_file.name, sample_text)
+        return r
+
+    def remove_wave_file(self):
+        if self.wave_file is not None:
+            try:
+                os.unlink(self.wave_file.name)
+                self.wave_file = None
+            except Exception as e:
+                print("F5TTS: cannot remove " + self.wave_file.name)
+                print(e)
+
+    def create(self, sample_audio, sample_text, speech):
+        try:
+            main_voice = self.load_voice_from_input(sample_audio, sample_text)
+
+            f5ttsCreate = F5TTSCreate()
+
+            voices = {}
+            chunks = f5ttsCreate.split_text(speech)
+            voices['main'] = main_voice
+
+            audio = f5ttsCreate.create(voices, chunks)
+        finally:
+            self.remove_wave_file()
+        return (audio, )
+
+    @classmethod
+    def IS_CHANGED(s, sample_audio, sample_text, speech):
+        m = hashlib.sha256()
+        m.update(sample_text.encode())
+        m.update(sample_audio["waveform"].cpu().numpy().tobytes())
+        m.update(speech.encode())
+        return m.digest().hex()
+
+
+class F5TTSAudio:
+    def __init__(self):
+        self.use_cli = False
+
+    @staticmethod
+    def get_txt_file_path(file):
+        p = Path(file)
+        return os.path.join(os.path.dirname(file), p.stem + ".txt")
+
+    @classmethod
+    def INPUT_TYPES(s):
+        input_dir = folder_paths.get_input_directory()
+        files = folder_paths.filter_files_content_types(
+            os.listdir(input_dir), ["audio", "video"]
+        )
+        filesWithTxt = []
+        for file in files:
+            txtFile = F5TTSAudio.get_txt_file_path(file)
+            if os.path.isfile(os.path.join(input_dir, txtFile)):
+                filesWithTxt.append(file)
+        filesWithTxt = sorted(filesWithTxt)
+
+        return {
+            "required": {
+                "sample": (filesWithTxt, {"audio_upload": True}),
+                "speech": ("STRING", {
+                    "multiline": True,
+                    "default": "This is what I want to say"
+                }),
+            }
+        }
+
+    CATEGORY = "audio"
+
+    RETURN_TYPES = ("AUDIO", )
+    FUNCTION = "create"
+
+    def create_with_cli(self, audio_path, audio_text, speech, output_dir):
+        subprocess.run(
+            [
+                "python", "inference-cli.py", "--model", "F5-TTS",
+                "--ref_audio", audio_path, "--ref_text", audio_text,
+                "--gen_text", speech,
+                "--output_dir", output_dir
+            ],
+            cwd=Install.f5TTSPath
+        )
+        output_audio = os.path.join(output_dir, "out.wav")
+        with wave.open(output_audio, "rb") as wave_file:
+            frame_rate = wave_file.getframerate()
+
+        waveform, sample_rate = torchaudio.load(output_audio)
+        audio = {"waveform": waveform.unsqueeze(0), "sample_rate": frame_rate}
+        return audio
+
     def load_voice_from_file(self, sample):
         input_dir = folder_paths.get_input_directory()
         txt_file = os.path.join(
@@ -170,7 +251,7 @@ def load_voice_from_file(self, sample):
         with open(txt_file, 'r') as file:
             audio_text = file.read()
         audio_path = folder_paths.get_annotated_filepath(sample)
-        return self.load_voice(audio_path, audio_text)
+        return F5TTSCreate.load_voice(audio_path, audio_text)
 
     def load_voices_from_files(self, sample, voice_names):
         voices = {}
@@ -194,6 +275,7 @@ def create(self, sample, speech):
 
         # Install.check_install()
         main_voice = self.load_voice_from_file(sample)
+        f5ttsCreate = F5TTSCreate()
         if self.use_cli:
             # working...
             output_dir = tempfile.mkdtemp()
@@ -204,21 +286,23 @@ def create(self, sample, speech):
             )
             shutil.rmtree(output_dir)
         else:
-            model_obj = self.load_model()
-            chunks = self.split_text(speech)
-            voice_names = self.get_voice_names(chunks)
+            chunks = f5ttsCreate.split_text(speech)
+            voice_names = f5ttsCreate.get_voice_names(chunks)
             voices = self.load_voices_from_files(sample, voice_names)
             voices['main'] = main_voice
-            audio = self.generate_audio(voices, model_obj, chunks)
+            audio = f5ttsCreate.create(voices, chunks)
         return (audio, )
 
     @classmethod
     def IS_CHANGED(s, sample, speech):
         m = hashlib.sha256()
        audio_path = folder_paths.get_annotated_filepath(sample)
+        audio_txt_path = F5TTSAudio.get_txt_file_path(audio_path)
         last_modified_timestamp = os.path.getmtime(audio_path)
+        txt_last_modified_timestamp = os.path.getmtime(audio_txt_path)
         m.update(audio_path)
         m.update(str(last_modified_timestamp))
+        m.update(str(txt_last_modified_timestamp))
         m.update(speech)
         return m.digest().hex()
diff --git a/README.md b/README.md
index 51ce286..bc66a4b 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,9 @@ Using F5-TTS https://github.com/SWivid/F5-TTS
 * Press refresh to see it in the node
 
 You can use the examples here...
-* [examples voices](examples/)
-* [simple workflow](examples/simple_ComfyUI_F5TTS_workflow.json)
+* [Example voices](examples/)
+* [Simple workflow](examples/simple_ComfyUI_F5TTS_workflow.json)
+* [Workflow with input audio only, using OpenAI's Whisper to get the text](examples/F5TTS_whisper_workflow.json)
 
 ### Multi voices...
diff --git a/__init__.py b/__init__.py
index 1bdbaed..590dde2 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,9 +1,11 @@
-from .F5TTS import F5TTSAudio
+from .F5TTS import F5TTSAudio, F5TTSAudioInputs
 
 NODE_CLASS_MAPPINGS = {
-    "F5TTSAudio": F5TTSAudio
+    "F5TTSAudio": F5TTSAudio,
+    "F5TTSAudioInputs": F5TTSAudioInputs
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "F5TTSAudio": "F5-TTS Audio"
+    "F5TTSAudio": "F5-TTS Audio",
+    "F5TTSAudioInputs": "F5-TTS Audio from inputs"
 }
diff --git a/examples/F5TTS_whisper_workflow.json b/examples/F5TTS_whisper_workflow.json
new file mode 100644
index 0000000..7f7773d
--- /dev/null
+++ b/examples/F5TTS_whisper_workflow.json
@@ -0,0 +1,256 @@
+{
+  "last_node_id": 14,
+  "last_link_id": 7,
+  "nodes": [
+    {
+      "id": 12,
+      "type": "F5TTSAudioInputs",
+      "pos": {
+        "0": 1025,
+        "1": 155
+      },
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "sample_audio",
+          "type": "AUDIO",
+          "link": 3
+        },
+        {
+          "name": "sample_text",
+          "type": "STRING",
+          "link": 5,
+          "widget": {
+            "name": "sample_text"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            4
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "F5TTSAudioInputs"
+      },
+      "widgets_values": [
+        "Text of sample_audio",
+        "This is what I want to say"
+      ]
+    },
+    {
+      "id": 4,
+      "type": "LoadAudio",
+      "pos": {
+        "0": 241,
+        "1": 146
+      },
+      "size": {
+        "0": 315,
+        "1": 124
+      },
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            3,
+            6
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadAudio"
+      },
+      "widgets_values": [
+        "F5TTS_test_en_1_ref_short.wav",
+        null,
+        ""
+      ]
+    },
+    {
+      "id": 13,
+      "type": "Apply Whisper",
+      "pos": {
+        "0": 618,
+        "1": 249
+      },
+      "size": {
+        "0": 315,
+        "1": 98
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 6
+        }
+      ],
+      "outputs": [
+        {
+          "name": "text",
+          "type": "STRING",
+          "links": [
+            5,
+            7
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "segments_alignment",
+          "type": "whisper_alignment",
+          "links": null
+        },
+        {
+          "name": "words_alignment",
+          "type": "whisper_alignment",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "Apply Whisper"
+      },
+      "widgets_values": [
+        "base"
+      ]
+    },
+    {
+      "id": 14,
+      "type": "DisplayAny",
+      "pos": {
+        "0": 651,
+        "1": 451
+      },
+      "size": {
+        "0": 315,
+        "1": 100
+      },
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "input",
+          "type": "*",
+          "link": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "STRING",
+          "type": "STRING",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DisplayAny"
+      },
+      "widgets_values": [
+        "raw value",
+        "Some call me nature. Others call me Mother Nature."
+      ]
+    },
+    {
+      "id": 2,
+      "type": "PreviewAudio",
+      "pos": {
+        "0": 1043,
+        "1": 450
+      },
+      "size": {
+        "0": 315,
+        "1": 76
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 4
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "Node name for S&R": "PreviewAudio"
+      },
+      "widgets_values": [
+        null
+      ]
+    }
+  ],
+  "links": [
+    [
+      3,
+      4,
+      0,
+      12,
+      0,
+      "AUDIO"
+    ],
+    [
+      4,
+      12,
+      0,
+      2,
+      0,
+      "AUDIO"
+    ],
+    [
+      5,
+      13,
+      0,
+      12,
+      1,
+      "STRING"
+    ],
+    [
+      6,
+      4,
+      0,
+      13,
+      0,
+      "AUDIO"
+    ],
+    [
+      7,
+      13,
+      0,
+      14,
+      0,
+      "*"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 1,
+      "offset": [
+        -46,
+        154
+      ]
+    }
+  },
+  "version": 0.4
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index b1a9f02..51637be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui-f5-tts"
 description = "Text to speech with F5-TTS"
-version = "1.0.2"
+version = "1.0.3"
 license = {text = "MIT License"}
 
 [project.urls]