From 837b945c69d3c057e5de452ef5af7d252ca8293b Mon Sep 17 00:00:00 2001
From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com>
Date: Mon, 6 Jan 2025 23:33:55 -0500
Subject: [PATCH] removed fp16 / mixed precision, minor fixes

---
 core.py                           |   2 +-
 rvc/configs/config.py             |  17 +---
 rvc/infer/infer.py                |  12 +--
 rvc/infer/pipeline.py             |  10 +--
 rvc/lib/predictors/F0Extractor.py |   1 -
 rvc/lib/predictors/RMVPE.py       |  16 +---
 rvc/train/extract/extract.py      |   7 +-
 rvc/train/train.py                | 128 +++++++++++------------------
 8 files changed, 58 insertions(+), 135 deletions(-)

diff --git a/core.py b/core.py
index ee2e6bee..cf74948f 100644
--- a/core.py
+++ b/core.py
@@ -519,7 +519,7 @@ def run_train_script(
 
     if custom_pretrained == False:
         pg, pd = pretrained_selector(
-            str(vocoder), True, int(sample_rate)
+            str(vocoder), int(sample_rate)
         )
     else:
         if g_pretrained_path is None or d_pretrained_path is None:
diff --git a/rvc/configs/config.py b/rvc/configs/config.py
index 17132475..7f568eed 100644
--- a/rvc/configs/config.py
+++ b/rvc/configs/config.py
@@ -25,7 +25,6 @@ def get_instance(*args, **kwargs):
 class Config:
     def __init__(self):
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        self.is_half = self.device != "cpu"
         self.gpu_name = (
             torch.cuda.get_device_name(int(self.device.split(":")[-1]))
             if self.device.startswith("cuda")
@@ -82,13 +81,9 @@ def device_config(self):
             self.set_cuda_config()
         else:
             self.device = "cpu"
-            self.is_half = False
-            self.set_precision("fp32")
 
         # Configuration for 6GB GPU memory
-        x_pad, x_query, x_center, x_max = (
-            (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
-        )
+        x_pad, x_query, x_center, x_max = (1, 6, 38, 41)
         if self.gpu_mem is not None and self.gpu_mem <= 4:
             # Configuration for 5GB GPU memory
             x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
@@ -98,19 +93,10 @@ def device_config(self):
     def set_cuda_config(self):
         i_device = int(self.device.split(":")[-1])
         self.gpu_name = torch.cuda.get_device_name(i_device)
-        low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
-        if (
-            any(gpu in self.gpu_name for gpu in low_end_gpus)
-            and "V100" not in self.gpu_name.upper()
-        ):
-            self.is_half = False
-            self.set_precision("fp32")
-
         self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
             1024**3
         )
 
-
 def max_vram_gpu(gpu):
     if torch.cuda.is_available():
         gpu_properties = torch.cuda.get_device_properties(gpu)
@@ -119,7 +105,6 @@ def max_vram_gpu(gpu):
     else:
         return "8"
 
-
 def get_gpu_info():
     ngpu = torch.cuda.device_count()
     gpu_infos = []
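Note: the config.py hunks above collapse the two precision-dependent tuning paths into one. A minimal standalone sketch of the resulting selection logic; the helper name `pick_query_window` is hypothetical, the constants come straight from the hunk:

```python
import torch

def pick_query_window(device: str = "cuda:0"):
    # With is_half gone, the (x_pad, x_query, x_center, x_max) window
    # constants depend only on available VRAM, not on precision.
    x_pad, x_query, x_center, x_max = (1, 6, 38, 41)  # default, ~6GB cards
    if device.startswith("cuda") and torch.cuda.is_available():
        gpu_mem = torch.cuda.get_device_properties(
            int(device.split(":")[-1])
        ).total_memory // (1024**3)
        if gpu_mem <= 4:
            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)  # ~5GB cards
    return x_pad, x_query, x_center, x_max
```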
""" self.hubert_model = load_embedding(embedder_model, embedder_model_custom) - self.hubert_model.to(self.config.device) - self.hubert_model = ( - self.hubert_model.half() - if self.config.is_half - else self.hubert_model.float() - ) + self.hubert_model = self.hubert_model.to(self.config.device).float() self.hubert_model.eval() @staticmethod @@ -482,13 +477,12 @@ def setup_network(self): *self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, - is_half=False, vocoder=self.vocoder, ) del self.net_g.enc_q self.net_g.load_state_dict(self.cpt["weight"], strict=False) - self.net_g.eval().to(self.config.device) - self.net_g = self.net_g.float() + self.net_g = self.net_g.to(self.config.device).float() + self.net_g.eval() def setup_vc_instance(self): """ diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py index 6352a9a4..3ad393e8 100644 --- a/rvc/infer/pipeline.py +++ b/rvc/infer/pipeline.py @@ -133,7 +133,6 @@ def __init__(self, tgt_sr, config): self.x_query = config.x_query self.x_center = config.x_center self.x_max = config.x_max - self.is_half = config.is_half self.sample_rate = 16000 self.window = 160 self.t_pad = self.sample_rate * self.x_pad @@ -208,7 +207,6 @@ def __init__(self, tgt_sr, config): self.note_dict = self.autotune.note_dict self.model_rmvpe = RMVPE0Predictor( os.path.join("rvc", "models", "predictors", "rmvpe.pt"), - is_half=self.is_half, device=self.device, ) @@ -440,11 +438,7 @@ def voice_conversion( with torch.no_grad(): pitch_guidance = pitch != None and pitchf != None # prepare source audio - feats = ( - torch.from_numpy(audio0).half() - if self.is_half - else torch.from_numpy(audio0).float() - ) + feats = torch.from_numpy(audio0).float() feats = feats.mean(-1) if feats.dim() == 2 else feats assert feats.dim() == 1, feats.dim() feats = feats.view(1, -1).to(self.device) @@ -498,12 +492,10 @@ def voice_conversion( def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate): npy = feats[0].cpu().numpy() - npy = npy.astype("float32") if self.is_half else npy score, ix = index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - npy = npy.astype("float16") if self.is_half else npy feats = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats diff --git a/rvc/lib/predictors/F0Extractor.py b/rvc/lib/predictors/F0Extractor.py index b5dbd602..71065c1f 100644 --- a/rvc/lib/predictors/F0Extractor.py +++ b/rvc/lib/predictors/F0Extractor.py @@ -79,7 +79,6 @@ def extract_f0(self): elif method == "rmvpe": model_rmvpe = RMVPE0Predictor( os.path.join("rvc", "models", "predictors", "rmvpe.pt"), - is_half=config.is_half, device=config.device, # hop_length=80 ) diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py index 065b357f..eeeed208 100644 --- a/rvc/lib/predictors/RMVPE.py +++ b/rvc/lib/predictors/RMVPE.py @@ -344,7 +344,6 @@ class MelSpectrogram(torch.nn.Module): Extracts Mel-spectrogram features from audio. Args: - is_half (bool): Whether to use half-precision floating-point numbers. n_mel_channels (int): Number of Mel-frequency bands. sample_rate (int): Sampling rate of the audio. win_length (int): Length of the window function in samples. 
diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py
index 065b357f..eeeed208 100644
--- a/rvc/lib/predictors/RMVPE.py
+++ b/rvc/lib/predictors/RMVPE.py
@@ -344,7 +344,6 @@ class MelSpectrogram(torch.nn.Module):
     Extracts Mel-spectrogram features from audio.
 
     Args:
-        is_half (bool): Whether to use half-precision floating-point numbers.
         n_mel_channels (int): Number of Mel-frequency bands.
         sample_rate (int): Sampling rate of the audio.
         win_length (int): Length of the window function in samples.
@@ -357,7 +356,6 @@ class MelSpectrogram(torch.nn.Module):
 
     def __init__(
         self,
-        is_half,
         n_mel_channels,
         sample_rate,
         win_length,
@@ -386,7 +384,6 @@ def __init__(
         self.sample_rate = sample_rate
         self.n_mel_channels = n_mel_channels
         self.clamp = clamp
-        self.is_half = is_half
 
     def forward(self, audio, keyshift=0, speed=1, center=True):
         factor = 2 ** (keyshift / 12)
@@ -416,8 +413,6 @@ def forward(self, audio, keyshift=0, speed=1, center=True):
         magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
         magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
         mel_output = torch.matmul(self.mel_basis, magnitude)
-        if self.is_half:
-            mel_output = mel_output.half()
         log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
         return log_mel_spec
 
@@ -428,24 +423,20 @@ class RMVPE0Predictor:
 
     Args:
         model_path (str): Path to the RMVPE0 model file.
-        is_half (bool): Whether to use half-precision floating-point numbers.
         device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
     """
 
-    def __init__(self, model_path, is_half, device=None):
+    def __init__(self, model_path, device=None):
         self.resample_kernel = {}
         model = E2E(4, 1, (2, 2))
         ckpt = torch.load(model_path, map_location="cpu")
         model.load_state_dict(ckpt)
         model.eval()
-        if is_half:
-            model = model.half()
         self.model = model
         self.resample_kernel = {}
-        self.is_half = is_half
         self.device = device
         self.mel_extractor = MelSpectrogram(
-            is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
+            N_MELS, 16000, 1024, 160, None, 30, 8000
         ).to(device)
         self.model = self.model.to(device)
         cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
@@ -491,8 +481,6 @@ def infer_from_audio(self, audio, thred=0.03):
         mel = self.mel_extractor(audio, center=True)
         hidden = self.mel2hidden(mel)
         hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
-            hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         return f0
diff --git a/rvc/train/extract/extract.py b/rvc/train/extract/extract.py
index c24857f5..b388264f 100644
--- a/rvc/train/extract/extract.py
+++ b/rvc/train/extract/extract.py
@@ -105,7 +105,6 @@ def process_files(self, files, f0_method, hop_length, device, threads):
         if f0_method == "rmvpe":
             self.model_rmvpe = RMVPE0Predictor(
                 os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
-                is_half=False,
                 device=device,
             )
 
@@ -146,15 +145,15 @@ def run_pitch_extraction(files, devices, f0_method, hop_length, threads):
 def process_file_embedding(
     files, embedder_model, embedder_model_custom, device_num, device, n_threads
 ):
-    dtype = torch.float16 if (config.is_half and "cuda" in device) else torch.float32
-    model = load_embedding(embedder_model, embedder_model_custom).to(dtype).to(device)
+    model = load_embedding(embedder_model, embedder_model_custom).to(device).float()
+    model.eval()
     n_threads = max(1, n_threads)
 
     def worker(file_info):
         wav_file_path, _, _, out_file_path = file_info
         if os.path.exists(out_file_path):
             return
-        feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(dtype).to(device)
+        feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(device).float()
         feats = feats.view(1, -1)
         with torch.no_grad():
             result = model(feats)["last_hidden_state"]
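Note: every `RMVPE0Predictor` call site now passes just the checkpoint path and the device. A usage sketch with the new signature; the import path is inferred from the file location, and `audio` here is a silent stand-in for a 16 kHz mono float32 array:

```python
import os

import numpy as np
import torch
from rvc.lib.predictors.RMVPE import RMVPE0Predictor

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_rmvpe = RMVPE0Predictor(
    os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
    device=device,
)
# Model and mel extractor stay in float32, so the old
# hidden.astype("float32") fix-up before decoding is gone.
audio = np.zeros(16000, dtype=np.float32)  # stand-in for one second of input
f0 = model_rmvpe.infer_from_audio(audio, thred=0.03)
```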
diff --git a/rvc/train/train.py b/rvc/train/train.py
index 6b78f4d4..5eb7acd7 100644
--- a/rvc/train/train.py
+++ b/rvc/train/train.py
@@ -406,7 +406,6 @@ def run(
         config.train.segment_size // config.data.hop_length,
         **config.model,
         use_f0=True,
-        is_half=config.train.fp16_run and device.type == "cuda",
         sr=sample_rate,
         vocoder=vocoder,
         checkpointing=checkpointing,
@@ -489,7 +488,7 @@ def run(
             torch.load(pretrainD, map_location="cpu")["model"]
         )
 
-    # Initialize schedulers and scaler
+    # Initialize schedulers
     scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
         optim_g, gamma=config.train.lr_decay, last_epoch=epoch_str - 2
     )
@@ -497,8 +496,6 @@ def run(
         optim_d, gamma=config.train.lr_decay, last_epoch=epoch_str - 2
     )
 
-    scaler = GradScaler(enabled=config.train.fp16_run and device.type == "cuda")
-
     cache = []
     # get the first sample as reference for tensorboard evaluation
     # custom reference temporarily disabled
@@ -554,7 +551,6 @@ def run(
             config,
             [net_g, net_d],
             [optim_g, optim_d],
-            scaler,
             [train_loader, None],
             [writer_eval],
             cache,
@@ -576,7 +572,6 @@ def train_and_evaluate(
     hps,
     nets,
     optims,
-    scaler,
     loaders,
     writers,
     cache,
@@ -596,7 +591,6 @@
         hps (Namespace): Hyperparameters.
         nets (list): List of models [net_g, net_d].
         optims (list): List of optimizers [optim_g, optim_d].
-        scaler (GradScaler): Gradient scaler for mixed precision training.
         loaders (list): List of dataloaders [train_loader, eval_loader].
         writers (list): List of TensorBoard writers [writer_eval].
         cache (list): List to cache data in GPU memory.
@@ -658,76 +652,54 @@
             ) = info
 
             # Forward pass
-            use_amp = config.train.fp16_run and device.type == "cuda"
-            with autocast(enabled=use_amp):
-                model_output = net_g(
-                    phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
-                )
-                y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = (
-                    model_output
-                )
+            model_output = net_g(
+                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+            )
+            y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = (
+                model_output
+            )
             # slice of the original waveform to match a generate slice
-                if randomized:
-                    wave = commons.slice_segments(
-                        wave,
-                        ids_slice * config.data.hop_length,
-                        config.train.segment_size,
-                        dim=3,
-                    )
-                y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
-            with autocast(enabled=False):
-                # if vocoder == "HiFi-GAN":
-                #     loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
-                # else:
-                #     loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g)
-                loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
+            if randomized:
+                wave = commons.slice_segments(
+                    wave,
+                    ids_slice * config.data.hop_length,
+                    config.train.segment_size,
+                    dim=3,
+                )
+            y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+            loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
 
             # Discriminator backward and update
             epoch_disc_sum += loss_disc.item()
             optim_d.zero_grad()
-            scaler.scale(loss_disc).backward()
-            scaler.unscale_(optim_d)
+            loss_disc.backward()
             grad_norm_d = torch.nn.utils.clip_grad_norm_(
                 net_d.parameters(), max_norm=1000.0
             )
-            scaler.step(optim_d)
-            scaler.update()
-            # if not math.isfinite(grad_norm_d):
-            #     print("\nWarning: grad_norm_d is NaN or Inf")
+            optim_d.step()
 
             # Generator backward and update
-            with autocast(enabled=use_amp):
-                _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
-                with autocast(enabled=False):
-                    loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0
-                    loss_env = envelope_loss(wave, y_hat)
-                    loss_kl = (
-                        kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
-                    )
-                    loss_fm = feature_loss(fmap_r, fmap_g)
-                    # if vocoder == "HiFi-GAN":
-                    #     loss_gen, _ = generator_loss(y_d_hat_g)
-                    # else:
-                    #     loss_gen, _ = generator_loss_scaled(y_d_hat_g)
-                    loss_gen, _ = generator_loss(y_d_hat_g)
-                    loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env
-
-                    if loss_gen_all < lowest_value["value"]:
-                        lowest_value = {
-                            "step": global_step,
-                            "value": loss_gen_all,
-                            "epoch": epoch,
-                        }
+            _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+
+            loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0
+            loss_env = envelope_loss(wave, y_hat)
+            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
+            loss_fm = feature_loss(fmap_r, fmap_g)
+            loss_gen, _ = generator_loss(y_d_hat_g)
+            loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env
+
+            if loss_gen_all < lowest_value["value"]:
+                lowest_value = {
+                    "step": global_step,
+                    "value": loss_gen_all,
+                    "epoch": epoch,
+                }
 
             epoch_gen_sum += loss_gen_all.item()
             optim_g.zero_grad()
-            scaler.scale(loss_gen_all).backward()
-            scaler.unscale_(optim_g)
+            loss_gen_all.backward()
             grad_norm_g = torch.nn.utils.clip_grad_norm_(
                 net_g.parameters(), max_norm=1000.0
            )
-            scaler.step(optim_g)
-            scaler.update()
-            # if not math.isfinite(grad_norm_g):
-            #     print("\n Warning: grad_norm_g is NaN or Inf")
+            optim_g.step()
 
             global_step += 1
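Note: the hunk above is the core of the patch; both optimizer updates lose their `GradScaler` plumbing. A side-by-side sketch of what remains versus what was deleted (`model`, `optim`, and `loss` are placeholders):

```python
import torch

def fp32_step(loss, model, optim):
    # Remaining pattern: plain backward, clip, step, exactly as in the
    # discriminator and generator updates above.
    optim.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1000.0)
    optim.step()
    return grad_norm

# Deleted torch.cuda.amp pattern, for reference:
#   scaler = GradScaler(enabled=use_amp)
#   with autocast(enabled=use_amp):
#       loss = ...                    # forward pass partly in fp16
#   optim.zero_grad()
#   scaler.scale(loss).backward()     # scale up to avoid fp16 underflow
#   scaler.unscale_(optim)            # unscale before clipping real gradients
#   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1000.0)
#   scaler.step(optim)                # skips the step if grads overflowed
#   scaler.update()
```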
@@ -768,7 +740,8 @@
                 )
             pbar.update(1)
-
+            # end of batch train
+        # end of tqdm
 
     with torch.no_grad():
         torch.cuda.empty_cache()
@@ -798,19 +771,16 @@
             else:
                 y_mel = mel
             # used for tensorboard chart - slice/mel_gen
-            with autocast(enabled=False):
-                y_hat_mel = mel_spectrogram_torch(
-                    y_hat.float().squeeze(1),
-                    config.data.filter_length,
-                    config.data.n_mel_channels,
-                    config.data.sample_rate,
-                    config.data.hop_length,
-                    config.data.win_length,
-                    config.data.mel_fmin,
-                    config.data.mel_fmax,
-                )
-                if use_amp:
-                    y_hat_mel = y_hat_mel.half()
+            y_hat_mel = mel_spectrogram_torch(
+                y_hat.float().squeeze(1),
+                config.data.filter_length,
+                config.data.n_mel_channels,
+                config.data.sample_rate,
+                config.data.hop_length,
+                config.data.win_length,
+                config.data.mel_fmin,
+                config.data.mel_fmax,
+            )
 
         lr = optim_g.param_groups[0]["lr"]
@@ -827,10 +797,6 @@
             "loss_avg_epoch/disc": np.mean(avg_losses["disc_loss_queue"]),
             "loss_avg_epoch/gen": np.mean(avg_losses["gen_loss_queue"]),
         }
-        # commented out
-        # scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)})
-        # scalar_dict.update({f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)})
-        # scalar_dict.update({f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)})
 
         image_dict = {
             "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
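Note: a hypothetical post-patch sanity check, not part of the diff: walk a loaded model and assert nothing is still half precision.

```python
import torch

def assert_fp32(model: torch.nn.Module):
    # Every floating-point parameter and buffer should now be float32.
    for name, t in list(model.named_parameters()) + list(model.named_buffers()):
        if t.is_floating_point() and t.dtype != torch.float32:
            raise AssertionError(f"{name} is {t.dtype}, expected torch.float32")
```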