Merge branch 'main' of https://github.com/IAHispano/Applio

IAHispano · Sep 21, 2024 · 869b07b · 869b07b
2 parents 18564c6 + 8b68a28
commit 869b07b
Show file tree

Hide file tree

Showing 7 changed files with 401 additions and 764 deletions.
diff --git a/assets/ICON.ico b/assets/ICON.ico
diff --git a/core.py b/core.py
@@ -517,7 +517,6 @@ def run_train_script(
     index_algorithm: str = "Auto",
     cache_data_in_gpu: bool = False,
     custom_pretrained: bool = False,
-    use_cpu: bool = False,
     g_pretrained_path: str = None,
     d_pretrained_path: str = None,
 ):
@@ -561,7 +560,6 @@ def run_train_script(
                 overtraining_detector,
                 overtraining_threshold,
                 sync_graph,
-                use_cpu,
             ],
         ),
     ]
@@ -1473,13 +1471,6 @@ def parse_arguments():
         default="Auto",
         required=False,
     )
-    train_parser.add_argument(
-        "--use_cpu",
-        type=lambda x: bool(strtobool(x)),
-        choices=[True, False],
-        help="Force the use of CPU for training.",
-        default=False,
-    )
 
     # Parser for 'index' mode
     index_parser = subparsers.add_parser(
@@ -1784,7 +1775,6 @@ def main():
                 sync_graph=args.sync_graph,
                 index_algorithm=args.index_algorithm,
                 cache_data_in_gpu=args.cache_data_in_gpu,
-                use_cpu=args.use_cpu,
                 g_pretrained_path=args.g_pretrained_path,
                 d_pretrained_path=args.d_pretrained_path,
             )

diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py
@@ -259,7 +259,9 @@ def get_f0_hybrid(
         for method in methods:
             f0 = None
             if method == "crepe":
-                f0 = self.get_f0_crepe(x, f0_min, f0_max, p_len, int(hop_length))
+                f0 = self.get_f0_crepe_computation(
+                    x, f0_min, f0_max, p_len, int(hop_length)
+                )
             elif method == "rmvpe":
                 self.model_rmvpe = RMVPE0Predictor(
                     os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
@@ -412,80 +414,63 @@ def voice_conversion(
             version: Model version ("v1" or "v2").
             protect: Protection level for preserving the original pitch.
         """
-        feats = torch.from_numpy(audio0)
-        if self.is_half:
-            feats = feats.half()
-        else:
-            feats = feats.float()
-        if feats.dim() == 2:
-            feats = feats.mean(-1)
-        assert feats.dim() == 1, feats.dim()
-        feats = feats.view(1, -1)
-        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-
         with torch.no_grad():
-            feats = model(feats.to(self.device))["last_hidden_state"]
+            pitch_guidance = pitch != None and pitchf != None
+            # prepare source audio
             feats = (
-                model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
+                torch.from_numpy(audio0).half()
+                if self.is_half
+                else torch.from_numpy(audio0).float()
             )
-        if protect < 0.5 and pitch != None and pitchf != None:
-            feats0 = feats.clone()
-        if (
-            isinstance(index, type(None)) == False
-            and isinstance(big_npy, type(None)) == False
-            and index_rate != 0
-        ):
-            npy = feats[0].cpu().numpy()
-            if self.is_half:
-                npy = npy.astype("float32")
-
-            score, ix = index.search(npy, k=8)
-            weight = np.square(1 / score)
-            weight /= weight.sum(axis=1, keepdims=True)
-            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-
-            if self.is_half:
-                npy = npy.astype("float16")
+            feats = feats.mean(-1) if feats.dim() == 2 else feats
+            assert feats.dim() == 1, feats.dim()
+            feats = feats.view(1, -1).to(self.device)
+            # extract features
+            feats = model(feats)["last_hidden_state"]
             feats = (
-                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
-                + (1 - index_rate) * feats
+                model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
             )
-
-        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        if protect < 0.5 and pitch != None and pitchf != None:
-            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+            # make a copy for pitch guidance and protection
+            feats0 = feats.clone() if pitch_guidance else None
+            if (
+                index
+            ):  # set by parent function, only true if index is available, loaded, and index rate > 0
+                feats = self._retrieve_speaker_embeddings(
+                    feats, index, big_npy, index_rate
+                )
+            # feature upsampling
+            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
                 0, 2, 1
             )
-        p_len = audio0.shape[0] // self.window
-        if feats.shape[1] < p_len:
-            p_len = feats.shape[1]
-            if pitch != None and pitchf != None:
-                pitch = pitch[:, :p_len]
-                pitchf = pitchf[:, :p_len]
-
-        if protect < 0.5 and pitch != None and pitchf != None:
-            pitchff = pitchf.clone()
-            pitchff[pitchf > 0] = 1
-            pitchff[pitchf < 1] = protect
-            pitchff = pitchff.unsqueeze(-1)
-            feats = feats * pitchff + feats0 * (1 - pitchff)
-            feats = feats.to(feats0.dtype)
-        p_len = torch.tensor([p_len], device=self.device).long()
-        with torch.no_grad():
-            if pitch != None and pitchf != None:
-                audio1 = (
-                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
-                    .data.cpu()
-                    .float()
-                    .numpy()
+            # adjust the length if the audio is short
+            p_len = min(audio0.shape[0] // self.window, feats.shape[1])
+            if pitch_guidance:
+                feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                    0, 2, 1
                 )
+                pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
+                # Pitch protection blending
+                if protect < 0.5:
+                    pitchff = pitchf.clone()
+                    pitchff[pitchf > 0] = 1
+                    pitchff[pitchf < 1] = protect
+                    feats = feats * pitchff.unsqueeze(-1) + feats0 * (
+                        1 - pitchff.unsqueeze(-1)
+                    )
+                    feats = feats.to(feats0.dtype)
             else:
-                audio1 = (
-                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
-                )
-        del feats, p_len, padding_mask
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+                pitch, pitchf = None, None
+            p_len = torch.tensor([p_len], device=self.device).long()
+            audio1 = (
+                (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                .data.cpu()
+                .float()
+                .numpy()
+            )
+            # clean up
+            del feats, feats0, p_len
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
         return audio1
 
     def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
@@ -689,7 +674,9 @@ def pipeline(
         audio_max = np.abs(audio_opt).max() / 0.99
         if audio_max > 1:
             audio_opt /= audio_max
-        del pitch, pitchf, sid
+        if pitch_guidance:
+            del pitch, pitchf
+        del sid
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         return audio_opt
diff --git a/rvc/lib/algorithm/generators.py b/rvc/lib/algorithm/generators.py
@@ -40,9 +40,11 @@ def __init__(
         )
         resblock = ResBlock1 if resblock == "1" else ResBlock2
 
-        self.ups_and_resblocks = torch.nn.ModuleList()
+        self.ups = torch.nn.ModuleList()
+        self.resblocks = torch.nn.ModuleList()
+
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups_and_resblocks.append(
+            self.ups.append(
                 weight_norm(
                     torch.nn.ConvTranspose1d(
                         upsample_initial_channel // (2**i),
@@ -57,35 +59,35 @@ def __init__(
             for j, (k, d) in enumerate(
                 zip(resblock_kernel_sizes, resblock_dilation_sizes)
             ):
-                self.ups_and_resblocks.append(resblock(ch, k, d))
+                self.resblocks.append(resblock(ch, k, d))
 
         self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-        self.ups_and_resblocks.apply(init_weights)
+        self.ups.apply(init_weights)
 
         if gin_channels != 0:
             self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
 
-        def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
-            x = self.conv_pre(x)
-            if g is not None:
-                x = x + self.cond(g)
+    def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
 
-            resblock_idx = 0
-            for _ in range(self.num_upsamples):
-                x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
-                x = self.ups_and_resblocks[resblock_idx](x)
-                resblock_idx += 1
-                xs = 0
-                for _ in range(self.num_kernels):
-                    xs += self.ups_and_resblocks[resblock_idx](x)
-                    resblock_idx += 1
-                x = xs / self.num_kernels
+        for i in range(self.num_upsamples):
+            x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs == None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
 
-            x = torch.nn.functional.leaky_relu(x)
-            x = self.conv_post(x)
-            x = torch.tanh(x)
+        x = torch.nn.functional.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
 
-            return x
+        return x
 
     def __prepare_scriptable__(self):
         """Prepares the module for scripting."""
@@ -100,8 +102,10 @@ def __prepare_scriptable__(self):
 
     def remove_weight_norm(self):
         """Removes weight normalization from the upsampling and residual blocks."""
-        for l in self.ups_and_resblocks:
+        for l in self.ups:
             remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
 
 
 class SineGen(torch.nn.Module):