diff --git a/app.py b/app.py index d6ca8697a..4f311bded 100644 --- a/app.py +++ b/app.py @@ -15,6 +15,9 @@ now_dir = os.getcwd() sys.path.append(now_dir) +# Zluda hijack +import rvc.lib.zluda + # Import Tabs from tabs.inference.inference import inference_tab from tabs.train.train import train_tab diff --git a/rvc/configs/config.py b/rvc/configs/config.py index 7329259ec..e6490936d 100644 --- a/rvc/configs/config.py +++ b/rvc/configs/config.py @@ -131,13 +131,6 @@ def device_config(self) -> tuple: def set_cuda_config(self): i_device = int(self.device.split(":")[-1]) self.gpu_name = torch.cuda.get_device_name(i_device) - # Zluda - if self.gpu_name.endswith("[ZLUDA]"): - print("Zluda compatibility enabled, experimental feature.") - torch.backends.cudnn.enabled = False - torch.backends.cuda.enable_flash_sdp(False) - torch.backends.cuda.enable_math_sdp(True) - torch.backends.cuda.enable_mem_efficient_sdp(False) low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"] if ( any(gpu in self.gpu_name for gpu in low_end_gpus) diff --git a/rvc/lib/algorithm/commons.py b/rvc/lib/algorithm/commons.py index c76328c91..444e095b8 100644 --- a/rvc/lib/algorithm/commons.py +++ b/rvc/lib/algorithm/commons.py @@ -156,25 +156,6 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): acts = t_act * s_act return acts - -# Zluda, same as previous, but without jit.script -def fused_add_tanh_sigmoid_multiply_no_jit(input_a, input_b, n_channels): - """ - Fused add tanh sigmoid multiply operation. - - Args: - input_a: The first input tensor. - input_b: The second input tensor. - n_channels: The number of channels. - """ - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: """ Convert the pad shape to a list of integers. diff --git a/rvc/lib/algorithm/modules.py b/rvc/lib/algorithm/modules.py index 7a9bc5414..ffd589fa3 100644 --- a/rvc/lib/algorithm/modules.py +++ b/rvc/lib/algorithm/modules.py @@ -1,9 +1,5 @@ import torch -from rvc.lib.algorithm.commons import ( - fused_add_tanh_sigmoid_multiply_no_jit, - fused_add_tanh_sigmoid_multiply, -) - +from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply class WaveNet(torch.nn.Module): """WaveNet residual blocks as used in WaveGlow @@ -88,11 +84,6 @@ def forward(self, x, x_mask, g=None, **kwargs): if g is not None: g = self.cond_layer(g) - # Zluda - is_zluda = x.device.type == "cuda" and torch.cuda.get_device_name().endswith( - "[ZLUDA]" - ) - for i in range(self.n_layers): x_in = self.in_layers[i](x) if g is not None: @@ -101,14 +92,7 @@ def forward(self, x, x_mask, g=None, **kwargs): else: g_l = torch.zeros_like(x_in) - # Preventing HIP crash by not using jit-decorated function - if is_zluda: - acts = fused_add_tanh_sigmoid_multiply_no_jit( - x_in, g_l, n_channels_tensor - ) - else: - acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) - + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) acts = self.drop(acts) res_skip_acts = self.res_skip_layers[i](acts) diff --git a/rvc/lib/predictors/F0Extractor.py b/rvc/lib/predictors/F0Extractor.py index d5177c487..bc3b61f3d 100644 --- a/rvc/lib/predictors/F0Extractor.py +++ b/rvc/lib/predictors/F0Extractor.py @@ -40,16 +40,8 @@ def wav16k(self) -> np.ndarray: def extract_f0(self) -> np.ndarray: f0 = None method = self.method - # Fall back to CPU for ZLUDA as these methods use CUcFFT - device = ( - "cpu" - if "cuda" in config.device - and torch.cuda.get_device_name().endswith("[ZLUDA]") - else config.device - ) - if method == "crepe": - wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(device) + wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device) f0 = torchcrepe.predict( wav16k_torch, sample_rate=16000, @@ -57,7 +49,7 @@ def extract_f0(self) -> np.ndarray: batch_size=512, fmin=self.f0_min, fmax=self.f0_max, - device=device, + device=config.device, ) f0 = f0[0].cpu().numpy() elif method == "fcpe": @@ -65,9 +57,13 @@ def extract_f0(self) -> np.ndarray: audio_length = len(audio) f0_target_length = (audio_length // self.hop_length) + 1 audio = ( - torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(-1).to(device) + torch.from_numpy(audio) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .to(config.device) ) - model = torchfcpe.spawn_bundled_infer_model(device=device) + model = torchfcpe.spawn_bundled_infer_model(device=config.device) f0 = model.infer( audio, @@ -81,11 +77,10 @@ def extract_f0(self) -> np.ndarray: ) f0 = f0.squeeze().cpu().numpy() elif method == "rmvpe": - is_half = False if device == "cpu" else config.is_half model_rmvpe = RMVPE0Predictor( os.path.join("rvc", "models", "predictors", "rmvpe.pt"), - is_half=is_half, - device=device, + is_half=config.is_half, + device=config.device, # hop_length=80 ) f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03) diff --git a/rvc/lib/predictors/FCPE.py b/rvc/lib/predictors/FCPE.py index 786bdb7a3..12f6c346a 100644 --- a/rvc/lib/predictors/FCPE.py +++ b/rvc/lib/predictors/FCPE.py @@ -139,12 +139,6 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) y = y.squeeze(1) - # Zluda, fall-back to CPU for FFTs since HIP SDK has no cuFFT alternative - source_device = y.device - if y.device.type == "cuda" and torch.cuda.get_device_name().endswith("[ZLUDA]"): - y = y.to("cpu") - hann_window[keyshift_key] = hann_window[keyshift_key].to("cpu") - spec = torch.stft( y, n_fft_new, @@ -156,7 +150,7 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): normalized=False, onesided=True, return_complex=True, - ).to(source_device) + ) spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) # Handle keyshift and mel conversion diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py index 7e9f6ddd4..970c5e58f 100644 --- a/rvc/lib/predictors/RMVPE.py +++ b/rvc/lib/predictors/RMVPE.py @@ -408,15 +408,6 @@ def forward(self, audio, keyshift=0, speed=1, center=True): self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( audio.device ) - - # Zluda, fall-back to CPU for FFTs since HIP SDK has no cuFFT alternative - source_device = audio.device - if audio.device.type == "cuda" and torch.cuda.get_device_name().endswith( - "[ZLUDA]" - ): - audio = audio.to("cpu") - self.hann_window[keyshift_key] = self.hann_window[keyshift_key].to("cpu") - fft = torch.stft( audio, n_fft=n_fft_new, @@ -425,7 +416,7 @@ def forward(self, audio, keyshift=0, speed=1, center=True): window=self.hann_window[keyshift_key], center=center, return_complex=True, - ).to(source_device) + ) magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) if keyshift != 0: diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py new file mode 100644 index 000000000..c5a55fb96 --- /dev/null +++ b/rvc/lib/zluda.py @@ -0,0 +1,41 @@ +import torch +if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): + _torch_stft = torch.stft + def z_stft( + audio: torch.Tensor, + n_fft: int, + hop_length: int = None, + win_length: int = None, + window: torch.Tensor = None, + center: bool = True, + pad_mode: str = "reflect", + normalized: bool = False, + onesided: bool = None, + return_complex: bool = None, + ): + sd = audio.device + return _torch_stft( + audio.to("cpu"), + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window.to("cpu"), + center=center, + pad_mode=pad_mode, + normalized=normalized, + onesided=onesided, + return_complex=return_complex, + ).to(sd) + + def z_jit(f, *_, **__): + f.graph = torch._C.Graph() + return f + + # hijacks + torch.stft = z_stft + torch.jit.script = z_jit + # disabling unsupported cudnn + torch.backends.cudnn.enabled = False + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + torch.backends.cuda.enable_mem_efficient_sdp(False) \ No newline at end of file diff --git a/rvc/train/extract/extract.py b/rvc/train/extract/extract.py index bae0f83f6..96703e141 100644 --- a/rvc/train/extract/extract.py +++ b/rvc/train/extract/extract.py @@ -12,16 +12,12 @@ import shutil from distutils.util import strtobool -# Zluda -if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): - torch.backends.cudnn.enabled = False - torch.backends.cuda.enable_flash_sdp(False) - torch.backends.cuda.enable_math_sdp(True) - torch.backends.cuda.enable_mem_efficient_sdp(False) - now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) +# Zluda hijack +import rvc.lib.zluda + from rvc.lib.utils import load_audio, load_embedding from rvc.train.extract.preparing_files import generate_config, generate_filelist from rvc.lib.predictors.RMVPE import RMVPE0Predictor diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index 1f5e8f38b..ecad64896 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -76,12 +76,6 @@ def spectrogram_torch(y, n_fft, hop_size, win_size, center=False): ) y = y.squeeze(1) - # Zluda, fall-back to CPU for FFTs since HIP SDK has no cuFFT alternative - source_device = y.device - if y.device.type == "cuda" and torch.cuda.get_device_name().endswith("[ZLUDA]"): - y = y.to("cpu") - hann_window[wnsize_dtype_device] = hann_window[wnsize_dtype_device].to("cpu") - spec = torch.stft( y, n_fft, @@ -93,7 +87,7 @@ def spectrogram_torch(y, n_fft, hop_size, win_size, center=False): normalized=False, onesided=True, return_complex=True, - ).to(source_device) + ) spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) diff --git a/rvc/train/train.py b/rvc/train/train.py index a76b8feb2..bf1b01bb0 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -24,6 +24,9 @@ now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) +# Zluda hijack +import rvc.lib.zluda + from utils import ( HParams, plot_spectrogram_to_numpy, @@ -375,12 +378,6 @@ def run( if torch.cuda.is_available(): torch.cuda.set_device(rank) - if torch.cuda.get_device_name().endswith("[ZLUDA]"): - print("Disabling CUDNN for traning with Zluda") - torch.backends.cudnn.enabled = False - torch.backends.cuda.enable_flash_sdp(False) - torch.backends.cuda.enable_math_sdp(True) - torch.backends.cuda.enable_mem_efficient_sdp(False) # Create datasets and dataloaders train_dataset = TextAudioLoaderMultiNSFsid(config.data) @@ -1027,7 +1024,6 @@ def save_to_json( with open(file_path, "w") as f: json.dump(data, f) - if __name__ == "__main__": torch.multiprocessing.set_start_method("spawn") main()