From 0c6e113aff4876220bf5a81cf2bcfb1e5c9a5a9e Mon Sep 17 00:00:00 2001 From: Azalea Gui <22280294+hykilpikonna@users.noreply.github.com> Date: Thu, 28 Nov 2024 22:19:44 -0500 Subject: [PATCH] [F] Fix #251: Expected a value of type 'str' --- finetune.py | 14 +++++++------- system/tts_engines/f5tts/model_engine.py | 2 +- system/tts_engines/xtts/model_engine.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/finetune.py b/finetune.py index 692143c..5077a94 100644 --- a/finetune.py +++ b/finetune.py @@ -1148,7 +1148,7 @@ def format_audio_list( continue # Load and process audio - wav, sr = torchaudio.load(audio_path) + wav, sr = torchaudio.load(str(audio_path)) if wav.size(0) != 1: wav = torch.mean(wav, dim=0, keepdim=True) wav = wav.squeeze() @@ -1209,7 +1209,7 @@ def format_audio_list( chunk_path = os.path.join( temp_folder, f"{audio_file_name_without_ext}_chunk_{chunk_idx}.wav") - torchaudio.save(chunk_path, chunk.unsqueeze(0), sr) + torchaudio.save(str(chunk_path), chunk.unsqueeze(0), sr) # Transcribe with appropriate precision if fal_precision == "mixed" and device == "cuda": @@ -1745,7 +1745,7 @@ def save_audio_segment( os.makedirs( os.path.dirname(sas_split_absolute_path), exist_ok=True) - torchaudio.save(sas_split_absolute_path, sas_split_audio, sas_sr) + torchaudio.save(str(sas_split_absolute_path), sas_split_audio, sas_sr) sas_metadata["audio_file"].append( f"wavs/{sas_split_relative_path}") @@ -1755,7 +1755,7 @@ def save_audio_segment( # Only save if segment is at least 1 second if sas_audio_segment.size(-1) >= sas_sr: - torchaudio.save(sas_absolute_path, sas_audio_segment, sas_sr) + torchaudio.save(str(sas_absolute_path), sas_audio_segment, sas_sr) sas_metadata["audio_file"].append(f"wavs/{sas_audio_file_name}") sas_metadata["text"].append(sas_sentence) sas_metadata["speaker_name"].append(sas_speaker_name) @@ -2221,7 +2221,7 @@ def save_audio_and_correction( f"Saving edited audio: {sr}Hz, length: {len(audio)}", "DATA_PROCESS") audio_tensor = torch.tensor(audio).unsqueeze(0) - torchaudio.save(audio_path, audio_tensor, sr) + torchaudio.save(str(audio_path), audio_tensor, sr) save_status_msg.append("Audio saved successfully") debug_print( f"Saved edited audio to {audio_path}", @@ -2959,7 +2959,7 @@ def run_tts(lang, tts_text, speaker_audio_file): with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: out["wav"] = torch.tensor(out["wav"]).unsqueeze(0) out_path = fp.name - torchaudio.save(out_path, out["wav"], 24000) + torchaudio.save(str(out_path), out["wav"], 24000) return "Speech generated !", out_path, speaker_audio_file @@ -3171,7 +3171,7 @@ def compact_custom_model( if file_path.is_file() and file_path.suffix.lower() == ".wav": try: # Load audio file and get duration - waveform, sample_rate = torchaudio.load(file_path) + waveform, sample_rate = torchaudio.load(str(file_path)) duration = waveform.size( 1) / sample_rate # Duration in seconds diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py index c2110da..edb57b5 100644 --- a/system/tts_engines/f5tts/model_engine.py +++ b/system/tts_engines/f5tts/model_engine.py @@ -756,7 +756,7 @@ async def infer_process( ): """Process text and prepare for batch inference""" # Split the input text into batches - audio, sr = torchaudio.load(ref_audio) + audio, sr = torchaudio.load(str(ref_audio)) max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr)) gen_text_batches = self.chunk_text(gen_text, max_chars=max_chars) diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index 0673e92..c450488 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -1113,7 +1113,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena else: self.print_message("Starting non-streaming generation", message_type="debug_tts") output = self.model.inference(**common_args) - torchaudio.save(output_file, torch.tensor(output["wav"]).unsqueeze(0), 24000) + torchaudio.save(str(output_file), torch.tensor(output["wav"]).unsqueeze(0), 24000) self.print_message(f"Saved audio to: {output_file}", message_type="debug_tts") elif self.current_model_loaded.startswith("apitts"):