diff --git a/F5TTS.py b/F5TTS.py
index 29facc1..434c7c6 100644
--- a/F5TTS.py
+++ b/F5TTS.py
@@ -12,6 +12,7 @@ import sys
 import numpy as np
 import re
+import io
 from comfy.utils import ProgressBar
 from cached_path import cached_path
 sys.path.append(Install.f5TTSPath)
@@ -24,60 +25,32 @@ sys.path.pop()
 
 
-class F5TTSAudio:
-
-    def __init__(self):
-        self.use_cli = False
-        self.voice_reg = re.compile(r"\{(\w+)\}")
+class F5TTSCreate:
+    voice_reg = re.compile(r"\{(\w+)\}")
 
-    @staticmethod
-    def get_txt_file_path(file):
-        p = Path(file)
-        return os.path.join(os.path.dirname(file), p.stem + ".txt")
+    def is_voice_name(self, word):
+        return self.voice_reg.match(word.strip())
 
-    @classmethod
-    def INPUT_TYPES(s):
-        input_dir = folder_paths.get_input_directory()
-        files = folder_paths.filter_files_content_types(
-            os.listdir(input_dir), ["audio", "video"]
-        )
-        filesWithTxt = []
-        for file in files:
-            txtFile = F5TTSAudio.get_txt_file_path(file)
-            if os.path.isfile(os.path.join(input_dir, txtFile)):
-                filesWithTxt.append(file)
-        return {
-            "required": {
-                "sample": (sorted(filesWithTxt), {"audio_upload": True}),
-                "speech": ("STRING", {
-                    "multiline": True,
-                    "default": "Hello World"
-                }),
-            }
-        }
+    def get_voice_names(self, chunks):
+        voice_names = {}
+        for text in chunks:
+            match = self.is_voice_name(text)
+            if match:
+                voice_names[match[1]] = True
+        return voice_names
 
-    CATEGORY = "audio"
+    def split_text(self, speech):
+        reg1 = r"(?=\{\w+\})"
+        return re.split(reg1, speech)
 
-    RETURN_TYPES = ("AUDIO", )
-    FUNCTION = "create"
+    @staticmethod
+    def load_voice(ref_audio, ref_text):
+        main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
 
-    def create_with_cli(self, audio_path, audio_text, speech, output_dir):
-        subprocess.run(
-            [
-                "python", "inference-cli.py", "--model", "F5-TTS",
-                "--ref_audio", audio_path, "--ref_text", audio_text,
-                "--gen_text", speech,
-                "--output_dir", output_dir
-            ],
-            cwd=Install.f5TTSPath
+        main_voice["ref_audio"], main_voice["ref_text"] = preprocess_ref_audio_text(  # noqa E501
+            ref_audio, ref_text
         )
-        output_audio = os.path.join(output_dir, "out.wav")
-        with wave.open(output_audio, "rb") as wave_file:
-            frame_rate = wave_file.getframerate()
-
-        waveform, sample_rate = torchaudio.load(output_audio)
-        audio = {"waveform": waveform.unsqueeze(0), "sample_rate": frame_rate}
-        return audio
+        return main_voice
 
     def load_model(self):
         model_cls = DiT
@@ -95,29 +68,6 @@ def load_model(self):
         ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file)
         return ema_model
 
-    def load_voice(self, ref_audio, ref_text):
-        main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
-
-        main_voice["ref_audio"], main_voice["ref_text"] = preprocess_ref_audio_text(  # noqa E501
-            ref_audio, ref_text
-        )
-        return main_voice
-
-    def is_voice_name(self, word):
-        return self.voice_reg.match(word.strip())
-
-    def get_voice_names(self, chunks):
-        voice_names = {}
-        for text in chunks:
-            match = self.is_voice_name(text)
-            if match:
-                voice_names[match[1]] = True
-        return voice_names
-
-    def split_text(self, speech):
-        reg1 = r"(?=\{\w+\})"
-        return re.split(reg1, speech)
-
     def generate_audio(self, voices, model_obj, chunks):
         frame_rate = 44100
         generated_audio_segments = []
@@ -133,7 +83,7 @@ def generate_audio(self, voices, model_obj, chunks):
             if voice not in voices:
                 print(f"Voice {voice} not found, using main.")
                 voice = "main"
-            text = self.voice_reg.sub("", text)
+            text = F5TTSCreate.voice_reg.sub("", text)
             gen_text = text.strip()
             ref_audio = voices[voice]["ref_audio"]
             ref_text = voices[voice]["ref_text"]
@@ -160,6 +110,137 @@ def generate_audio(self, voices, model_obj, chunks):
         os.unlink(wave_file.name)
         return audio
 
+    def create(self, voices, chunks):
+        model_obj = self.load_model()
+        return self.generate_audio(voices, model_obj, chunks)
+
+
+class F5TTSAudioInputs:
+    def __init__(self):
+        self.wave_file = None
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "sample_audio": ("AUDIO",),
+                "sample_text": ("STRING", {"default": "Text of sample_audio"}),
+                "speech": ("STRING", {
+                    "multiline": True,
+                    "default": "This is what I want to say"
+                }),
+            },
+        }
+
+    CATEGORY = "audio"
+
+    RETURN_TYPES = ("AUDIO", )
+    FUNCTION = "create"
+
+    def load_voice_from_input(self, sample_audio, sample_text):
+        self.wave_file = tempfile.NamedTemporaryFile(
+            suffix=".wav", delete=False
+        )
+        for (batch_number, waveform) in enumerate(
+                sample_audio["waveform"].cpu()):
+            buff = io.BytesIO()
+            torchaudio.save(
+                buff, waveform, sample_audio["sample_rate"], format="WAV"
+            )
+            with open(self.wave_file.name, 'wb') as f:
+                f.write(buff.getbuffer())
+            break
+        r = F5TTSCreate.load_voice(self.wave_file.name, sample_text)
+        return r
+
+    def remove_wave_file(self):
+        if self.wave_file is not None:
+            try:
+                os.unlink(self.wave_file.name)
+                self.wave_file = None
+            except Exception as e:
+                print("F5TTS: cannot remove " + self.wave_file.name)
+                print(e)
+
+    def create(self, sample_audio, sample_text, speech):
+        try:
+            main_voice = self.load_voice_from_input(sample_audio, sample_text)
+
+            f5ttsCreate = F5TTSCreate()
+
+            voices = {}
+            chunks = f5ttsCreate.split_text(speech)
+            voices['main'] = main_voice
+
+            audio = f5ttsCreate.create(voices, chunks)
+        finally:
+            self.remove_wave_file()
+        return (audio, )
+
+    @classmethod
+    def IS_CHANGED(s, sample_audio, sample_text, speech):
+        m = hashlib.sha256()
+        m.update(sample_text.encode())
+        m.update(sample_audio["waveform"].cpu().numpy().tobytes())
+        m.update(speech.encode())
+        return m.digest().hex()
+
+
+class F5TTSAudio:
+    def __init__(self):
+        self.use_cli = False
+
+    @staticmethod
+    def get_txt_file_path(file):
+        p = Path(file)
+        return os.path.join(os.path.dirname(file), p.stem + ".txt")
+
+    @classmethod
+    def INPUT_TYPES(s):
+        input_dir = folder_paths.get_input_directory()
+        files = folder_paths.filter_files_content_types(
+            os.listdir(input_dir), ["audio", "video"]
+        )
+        filesWithTxt = []
+        for file in files:
+            txtFile = F5TTSAudio.get_txt_file_path(file)
+            if os.path.isfile(os.path.join(input_dir, txtFile)):
+                filesWithTxt.append(file)
+        filesWithTxt = sorted(filesWithTxt)
+
+        return {
+            "required": {
+                "sample": (filesWithTxt, {"audio_upload": True}),
+                "speech": ("STRING", {
+                    "multiline": True,
+                    "default": "This is what I want to say"
+                }),
+            }
+        }
+
+    CATEGORY = "audio"
+
+    RETURN_TYPES = ("AUDIO", )
+    FUNCTION = "create"
+
+    def create_with_cli(self, audio_path, audio_text, speech, output_dir):
+        subprocess.run(
+            [
+                "python", "inference-cli.py", "--model", "F5-TTS",
+                "--ref_audio", audio_path, "--ref_text", audio_text,
+                "--gen_text", speech,
+                "--output_dir", output_dir
+            ],
+            cwd=Install.f5TTSPath
+        )
+        output_audio = os.path.join(output_dir, "out.wav")
+        with wave.open(output_audio, "rb") as wave_file:
+            frame_rate = wave_file.getframerate()
+
+        waveform, sample_rate = torchaudio.load(output_audio)
+        audio = {"waveform": waveform.unsqueeze(0), "sample_rate": frame_rate}
+        return audio
+
     def load_voice_from_file(self, sample):
         input_dir = folder_paths.get_input_directory()
         txt_file = os.path.join(
@@ -170,7 +251,7 @@ def load_voice_from_file(self, sample):
         with open(txt_file, 'r') as file:
             audio_text = file.read()
         audio_path = folder_paths.get_annotated_filepath(sample)
-        return self.load_voice(audio_path, audio_text)
+        return F5TTSCreate.load_voice(audio_path, audio_text)
 
     def load_voices_from_files(self, sample, voice_names):
         voices = {}
@@ -194,6 +275,7 @@ def create(self, sample, speech):
 
         # Install.check_install()
         main_voice = self.load_voice_from_file(sample)
+        f5ttsCreate = F5TTSCreate()
         if self.use_cli:
             # working...
             output_dir = tempfile.mkdtemp()
@@ -204,21 +286,23 @@ def create(self, sample, speech):
             )
             shutil.rmtree(output_dir)
         else:
-            model_obj = self.load_model()
-            chunks = self.split_text(speech)
-            voice_names = self.get_voice_names(chunks)
+            chunks = f5ttsCreate.split_text(speech)
+            voice_names = f5ttsCreate.get_voice_names(chunks)
             voices = self.load_voices_from_files(sample, voice_names)
             voices['main'] = main_voice
-            audio = self.generate_audio(voices, model_obj, chunks)
+            audio = f5ttsCreate.create(voices, chunks)
         return (audio, )
 
     @classmethod
     def IS_CHANGED(s, sample, speech):
         m = hashlib.sha256()
        audio_path = folder_paths.get_annotated_filepath(sample)
+        audio_txt_path = F5TTSAudio.get_txt_file_path(audio_path)
         last_modified_timestamp = os.path.getmtime(audio_path)
+        txt_last_modified_timestamp = os.path.getmtime(audio_txt_path)
         m.update(audio_path)
         m.update(str(last_modified_timestamp))
+        m.update(str(txt_last_modified_timestamp))
         m.update(speech)
         return m.digest().hex()
diff --git a/README.md b/README.md
index 51ce286..bc66a4b 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,9 @@ Using F5-TTS https://github.com/SWivid/F5-TTS
 * Press refresh to see it in the node
 
 You can use the examples here...
-* [examples voices](examples/)
-* [simple workflow](examples/simple_ComfyUI_F5TTS_workflow.json)
+* [Example voices](examples/)
+* [Simple workflow](examples/simple_ComfyUI_F5TTS_workflow.json)
+* [Workflow with input audio only, using OpenAI's Whisper to get the text](examples/F5TTS_whisper_workflow.json)
 
 ### Multi voices...
diff --git a/__init__.py b/__init__.py
index 1bdbaed..590dde2 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,9 +1,11 @@
-from .F5TTS import F5TTSAudio
+from .F5TTS import F5TTSAudio, F5TTSAudioInputs
 
 NODE_CLASS_MAPPINGS = {
-    "F5TTSAudio": F5TTSAudio
+    "F5TTSAudio": F5TTSAudio,
+    "F5TTSAudioInputs": F5TTSAudioInputs
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "F5TTSAudio": "F5-TTS Audio"
+    "F5TTSAudio": "F5-TTS Audio",
+    "F5TTSAudioInputs": "F5-TTS Audio from inputs"
 }
diff --git a/examples/F5TTS_whisper_workflow.json b/examples/F5TTS_whisper_workflow.json
new file mode 100644
index 0000000..7f7773d
--- /dev/null
+++ b/examples/F5TTS_whisper_workflow.json
@@ -0,0 +1,256 @@
+{
+  "last_node_id": 14,
+  "last_link_id": 7,
+  "nodes": [
+    {
+      "id": 12,
+      "type": "F5TTSAudioInputs",
+      "pos": {
+        "0": 1025,
+        "1": 155
+      },
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "sample_audio",
+          "type": "AUDIO",
+          "link": 3
+        },
+        {
+          "name": "sample_text",
+          "type": "STRING",
+          "link": 5,
+          "widget": {
+            "name": "sample_text"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            4
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "F5TTSAudioInputs"
+      },
+      "widgets_values": [
+        "Text of sample_audio",
+        "This is what I want to say"
+      ]
+    },
+    {
+      "id": 4,
+      "type": "LoadAudio",
+      "pos": {
+        "0": 241,
+        "1": 146
+      },
+      "size": {
+        "0": 315,
+        "1": 124
+      },
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            3,
+            6
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadAudio"
+      },
+      "widgets_values": [
+        "F5TTS_test_en_1_ref_short.wav",
+        null,
+        ""
+      ]
+    },
+    {
+      "id": 13,
+      "type": "Apply Whisper",
+      "pos": {
+        "0": 618,
+        "1": 249
+      },
+      "size": {
+        "0": 315,
+        "1": 98
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 6
+        }
+      ],
+      "outputs": [
+        {
+          "name": "text",
+          "type": "STRING",
+          "links": [
+            5,
+            7
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "segments_alignment",
+          "type": "whisper_alignment",
+          "links": null
+        },
+        {
+          "name": "words_alignment",
+          "type": "whisper_alignment",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "Apply Whisper"
+      },
+      "widgets_values": [
+        "base"
+      ]
+    },
+    {
+      "id": 14,
+      "type": "DisplayAny",
+      "pos": {
+        "0": 651,
+        "1": 451
+      },
+      "size": {
+        "0": 315,
+        "1": 100
+      },
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "input",
+          "type": "*",
+          "link": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "STRING",
+          "type": "STRING",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DisplayAny"
+      },
+      "widgets_values": [
+        "raw value",
+        "Some call me nature. Others call me Mother Nature."
+      ]
+    },
+    {
+      "id": 2,
+      "type": "PreviewAudio",
+      "pos": {
+        "0": 1043,
+        "1": 450
+      },
+      "size": {
+        "0": 315,
+        "1": 76
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 4
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "Node name for S&R": "PreviewAudio"
+      },
+      "widgets_values": [
+        null
+      ]
+    }
+  ],
+  "links": [
+    [
+      3,
+      4,
+      0,
+      12,
+      0,
+      "AUDIO"
+    ],
+    [
+      4,
+      12,
+      0,
+      2,
+      0,
+      "AUDIO"
+    ],
+    [
+      5,
+      13,
+      0,
+      12,
+      1,
+      "STRING"
+    ],
+    [
+      6,
+      4,
+      0,
+      13,
+      0,
+      "AUDIO"
+    ],
+    [
+      7,
+      13,
+      0,
+      14,
+      0,
+      "*"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 1,
+      "offset": [
+        -46,
+        154
+      ]
+    }
+  },
+  "version": 0.4
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index b1a9f02..51637be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui-f5-tts"
 description = "Text to speech with F5-TTS"
-version = "1.0.2"
+version = "1.0.3"
 license = {text = "MIT License"}
 
 [project.urls]