From 837b945c69d3c057e5de452ef5af7d252ca8293b Mon Sep 17 00:00:00 2001
From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com>
Date: Mon, 6 Jan 2025 23:33:55 -0500
Subject: [PATCH] removed fp16 / mixed precision, minor fixes

---
 core.py                           |   2 +-
 rvc/configs/config.py             |  17 +---
 rvc/infer/infer.py                |  12 +--
 rvc/infer/pipeline.py             |  10 +--
 rvc/lib/predictors/F0Extractor.py |   1 -
 rvc/lib/predictors/RMVPE.py       |  16 +---
 rvc/train/extract/extract.py      |   7 +-
 rvc/train/train.py                | 128 +++++++++++------------------
 8 files changed, 58 insertions(+), 135 deletions(-)

diff --git a/core.py b/core.py
index ee2e6bee..cf74948f 100644
--- a/core.py
+++ b/core.py
@@ -519,7 +519,7 @@ def run_train_script(
 
     if custom_pretrained == False:
         pg, pd = pretrained_selector(
-            str(vocoder), True, int(sample_rate)
+            str(vocoder), int(sample_rate)
         )
     else:
         if g_pretrained_path is None or d_pretrained_path is None:
diff --git a/rvc/configs/config.py b/rvc/configs/config.py
index 17132475..7f568eed 100644
--- a/rvc/configs/config.py
+++ b/rvc/configs/config.py
@@ -25,7 +25,6 @@ def get_instance(*args, **kwargs):
 class Config:
     def __init__(self):
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        self.is_half = self.device != "cpu"
         self.gpu_name = (
             torch.cuda.get_device_name(int(self.device.split(":")[-1]))
             if self.device.startswith("cuda")
@@ -82,13 +81,9 @@ def device_config(self):
             self.set_cuda_config()
         else:
             self.device = "cpu"
-            self.is_half = False
-            self.set_precision("fp32")
 
         # Configuration for 6GB GPU memory
-        x_pad, x_query, x_center, x_max = (
-            (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
-        )
+        x_pad, x_query, x_center, x_max = (1, 6, 38, 41)
         if self.gpu_mem is not None and self.gpu_mem <= 4:
             # Configuration for 5GB GPU memory
             x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
@@ -98,19 +93,10 @@ def device_config(self):
     def set_cuda_config(self):
         i_device = int(self.device.split(":")[-1])
         self.gpu_name = torch.cuda.get_device_name(i_device)
-        low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
-        if (
-            any(gpu in self.gpu_name for gpu in low_end_gpus)
-            and "V100" not in self.gpu_name.upper()
-        ):
-            self.is_half = False
-            self.set_precision("fp32")
-
         self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
             1024**3
         )
 
-
 def max_vram_gpu(gpu):
     if torch.cuda.is_available():
         gpu_properties = torch.cuda.get_device_properties(gpu)
@@ -119,7 +105,6 @@ def max_vram_gpu(gpu):
     else:
         return "8"
 
-
 def get_gpu_info():
     ngpu = torch.cuda.device_count()
     gpu_infos = []
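Note: the config.py hunks above collapse the two precision-dependent tuning paths into one. A minimal standalone sketch of the resulting selection logic; the helper name `pick_query_window` is hypothetical, the constants come straight from the hunk:

```python
import torch

def pick_query_window(device: str = "cuda:0"):
    # With is_half gone, the (x_pad, x_query, x_center, x_max) window
    # constants depend only on available VRAM, not on precision.
    x_pad, x_query, x_center, x_max = (1, 6, 38, 41)  # default, ~6GB cards
    if device.startswith("cuda") and torch.cuda.is_available():
        gpu_mem = torch.cuda.get_device_properties(
            int(device.split(":")[-1])
        ).total_memory // (1024**3)
        if gpu_mem <= 4:
            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)  # ~5GB cards
    return x_pad, x_query, x_center, x_max
```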
""" self.hubert_model = load_embedding(embedder_model, embedder_model_custom) - self.hubert_model.to(self.config.device) - self.hubert_model = ( - self.hubert_model.half() - if self.config.is_half - else self.hubert_model.float() - ) + self.hubert_model = self.hubert_model.to(self.config.device).float() self.hubert_model.eval() @staticmethod @@ -482,13 +477,12 @@ def setup_network(self): *self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, - is_half=False, vocoder=self.vocoder, ) del self.net_g.enc_q self.net_g.load_state_dict(self.cpt["weight"], strict=False) - self.net_g.eval().to(self.config.device) - self.net_g = self.net_g.float() + self.net_g = self.net_g.to(self.config.device).float() + self.net_g.eval() def setup_vc_instance(self): """ diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py index 6352a9a4..3ad393e8 100644 --- a/rvc/infer/pipeline.py +++ b/rvc/infer/pipeline.py @@ -133,7 +133,6 @@ def __init__(self, tgt_sr, config): self.x_query = config.x_query self.x_center = config.x_center self.x_max = config.x_max - self.is_half = config.is_half self.sample_rate = 16000 self.window = 160 self.t_pad = self.sample_rate * self.x_pad @@ -208,7 +207,6 @@ def __init__(self, tgt_sr, config): self.note_dict = self.autotune.note_dict self.model_rmvpe = RMVPE0Predictor( os.path.join("rvc", "models", "predictors", "rmvpe.pt"), - is_half=self.is_half, device=self.device, ) @@ -440,11 +438,7 @@ def voice_conversion( with torch.no_grad(): pitch_guidance = pitch != None and pitchf != None # prepare source audio - feats = ( - torch.from_numpy(audio0).half() - if self.is_half - else torch.from_numpy(audio0).float() - ) + feats = torch.from_numpy(audio0).float() feats = feats.mean(-1) if feats.dim() == 2 else feats assert feats.dim() == 1, feats.dim() feats = feats.view(1, -1).to(self.device) @@ -498,12 +492,10 @@ def voice_conversion( def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate): npy = feats[0].cpu().numpy() - npy = npy.astype("float32") if self.is_half else npy score, ix = index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - npy = npy.astype("float16") if self.is_half else npy feats = ( torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats diff --git a/rvc/lib/predictors/F0Extractor.py b/rvc/lib/predictors/F0Extractor.py index b5dbd602..71065c1f 100644 --- a/rvc/lib/predictors/F0Extractor.py +++ b/rvc/lib/predictors/F0Extractor.py @@ -79,7 +79,6 @@ def extract_f0(self): elif method == "rmvpe": model_rmvpe = RMVPE0Predictor( os.path.join("rvc", "models", "predictors", "rmvpe.pt"), - is_half=config.is_half, device=config.device, # hop_length=80 ) diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py index 065b357f..eeeed208 100644 --- a/rvc/lib/predictors/RMVPE.py +++ b/rvc/lib/predictors/RMVPE.py @@ -344,7 +344,6 @@ class MelSpectrogram(torch.nn.Module): Extracts Mel-spectrogram features from audio. Args: - is_half (bool): Whether to use half-precision floating-point numbers. n_mel_channels (int): Number of Mel-frequency bands. sample_rate (int): Sampling rate of the audio. win_length (int): Length of the window function in samples. 
diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py
index 065b357f..eeeed208 100644
--- a/rvc/lib/predictors/RMVPE.py
+++ b/rvc/lib/predictors/RMVPE.py
@@ -344,7 +344,6 @@ class MelSpectrogram(torch.nn.Module):
     Extracts Mel-spectrogram features from audio.
 
     Args:
-        is_half (bool): Whether to use half-precision floating-point numbers.
         n_mel_channels (int): Number of Mel-frequency bands.
         sample_rate (int): Sampling rate of the audio.
         win_length (int): Length of the window function in samples.
@@ -357,7 +356,6 @@ class MelSpectrogram(torch.nn.Module):
 
     def __init__(
         self,
-        is_half,
         n_mel_channels,
         sample_rate,
         win_length,
@@ -386,7 +384,6 @@ def __init__(
         self.sample_rate = sample_rate
         self.n_mel_channels = n_mel_channels
         self.clamp = clamp
-        self.is_half = is_half
 
     def forward(self, audio, keyshift=0, speed=1, center=True):
         factor = 2 ** (keyshift / 12)
@@ -416,8 +413,6 @@ def forward(self, audio, keyshift=0, speed=1, center=True):
         magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
         magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
         mel_output = torch.matmul(self.mel_basis, magnitude)
-        if self.is_half:
-            mel_output = mel_output.half()
         log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
         return log_mel_spec
 
@@ -428,24 +423,20 @@ class RMVPE0Predictor:
 
     Args:
         model_path (str): Path to the RMVPE0 model file.
-        is_half (bool): Whether to use half-precision floating-point numbers.
         device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
     """
 
-    def __init__(self, model_path, is_half, device=None):
+    def __init__(self, model_path, device=None):
         self.resample_kernel = {}
         model = E2E(4, 1, (2, 2))
         ckpt = torch.load(model_path, map_location="cpu")
         model.load_state_dict(ckpt)
         model.eval()
-        if is_half:
-            model = model.half()
         self.model = model
         self.resample_kernel = {}
-        self.is_half = is_half
         self.device = device
         self.mel_extractor = MelSpectrogram(
-            is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
+            N_MELS, 16000, 1024, 160, None, 30, 8000
         ).to(device)
         self.model = self.model.to(device)
         cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
@@ -491,8 +481,6 @@ def infer_from_audio(self, audio, thred=0.03):
         mel = self.mel_extractor(audio, center=True)
         hidden = self.mel2hidden(mel)
         hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
-            hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         return f0
diff --git a/rvc/train/extract/extract.py b/rvc/train/extract/extract.py
index c24857f5..b388264f 100644
--- a/rvc/train/extract/extract.py
+++ b/rvc/train/extract/extract.py
@@ -105,7 +105,6 @@ def process_files(self, files, f0_method, hop_length, device, threads):
         if f0_method == "rmvpe":
             self.model_rmvpe = RMVPE0Predictor(
                 os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
-                is_half=False,
                 device=device,
             )
 
@@ -146,15 +145,15 @@ def run_pitch_extraction(files, devices, f0_method, hop_length, threads):
 def process_file_embedding(
     files, embedder_model, embedder_model_custom, device_num, device, n_threads
 ):
-    dtype = torch.float16 if (config.is_half and "cuda" in device) else torch.float32
-    model = load_embedding(embedder_model, embedder_model_custom).to(dtype).to(device)
+    model = load_embedding(embedder_model, embedder_model_custom).to(device).float()
+    model.eval()
     n_threads = max(1, n_threads)
 
     def worker(file_info):
         wav_file_path, _, _, out_file_path = file_info
         if os.path.exists(out_file_path):
             return
-        feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(dtype).to(device)
+        feats = torch.from_numpy(load_audio(wav_file_path, 16000)).to(device).float()
         feats = feats.view(1, -1)
         with torch.no_grad():
             result = model(feats)["last_hidden_state"]
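Note: every `RMVPE0Predictor` call site now passes just the checkpoint path and the device. A usage sketch with the new signature; the import path is inferred from the file location, and `audio` here is a silent stand-in for a 16 kHz mono float32 array:

```python
import os

import numpy as np
import torch
from rvc.lib.predictors.RMVPE import RMVPE0Predictor

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_rmvpe = RMVPE0Predictor(
    os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
    device=device,
)
# Model and mel extractor stay in float32, so the old
# hidden.astype("float32") fix-up before decoding is gone.
audio = np.zeros(16000, dtype=np.float32)  # stand-in for one second of input
f0 = model_rmvpe.infer_from_audio(audio, thred=0.03)
```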
diff --git a/rvc/train/train.py b/rvc/train/train.py
index 6b78f4d4..5eb7acd7 100644
--- a/rvc/train/train.py
+++ b/rvc/train/train.py
@@ -406,7 +406,6 @@ def run(
         config.train.segment_size // config.data.hop_length,
         **config.model,
         use_f0=True,
-        is_half=config.train.fp16_run and device.type == "cuda",
         sr=sample_rate,
         vocoder=vocoder,
         checkpointing=checkpointing,
@@ -489,7 +488,7 @@ def run(
             torch.load(pretrainD, map_location="cpu")["model"]
         )
 
-    # Initialize schedulers and scaler
+    # Initialize schedulers
     scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
         optim_g, gamma=config.train.lr_decay, last_epoch=epoch_str - 2
     )
@@ -497,8 +496,6 @@ def run(
         optim_d, gamma=config.train.lr_decay, last_epoch=epoch_str - 2
     )
 
-    scaler = GradScaler(enabled=config.train.fp16_run and device.type == "cuda")
-
     cache = []
     # get the first sample as reference for tensorboard evaluation
     # custom reference temporarily disabled
@@ -554,7 +551,6 @@ def run(
             config,
             [net_g, net_d],
             [optim_g, optim_d],
-            scaler,
             [train_loader, None],
             [writer_eval],
             cache,
@@ -576,7 +572,6 @@ def train_and_evaluate(
     hps,
     nets,
     optims,
-    scaler,
     loaders,
     writers,
     cache,
@@ -596,7 +591,6 @@
         hps (Namespace): Hyperparameters.
         nets (list): List of models [net_g, net_d].
         optims (list): List of optimizers [optim_g, optim_d].
-        scaler (GradScaler): Gradient scaler for mixed precision training.
         loaders (list): List of dataloaders [train_loader, eval_loader].
         writers (list): List of TensorBoard writers [writer_eval].
         cache (list): List to cache data in GPU memory.
@@ -658,76 +652,54 @@
             ) = info
 
             # Forward pass
-            use_amp = config.train.fp16_run and device.type == "cuda"
-            with autocast(enabled=use_amp):
-                model_output = net_g(
-                    phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
-                )
-                y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = (
-                    model_output
-                )
+            model_output = net_g(
+                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+            )
+            y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = (
+                model_output
+            )
             # slice of the original waveform to match a generate slice
-                if randomized:
-                    wave = commons.slice_segments(
-                        wave,
-                        ids_slice * config.data.hop_length,
-                        config.train.segment_size,
-                        dim=3,
-                    )
-                y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
-            with autocast(enabled=False):
-                # if vocoder == "HiFi-GAN":
-                #     loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
-                # else:
-                #     loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g)
-                loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
+            if randomized:
+                wave = commons.slice_segments(
+                    wave,
+                    ids_slice * config.data.hop_length,
+                    config.train.segment_size,
+                    dim=3,
+                )
+            y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+            loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g)
 
             # Discriminator backward and update
             epoch_disc_sum += loss_disc.item()
             optim_d.zero_grad()
-            scaler.scale(loss_disc).backward()
-            scaler.unscale_(optim_d)
+            loss_disc.backward()
             grad_norm_d = torch.nn.utils.clip_grad_norm_(
                 net_d.parameters(), max_norm=1000.0
             )
-            scaler.step(optim_d)
-            scaler.update()
-            # if not math.isfinite(grad_norm_d):
-            #     print("\nWarning: grad_norm_d is NaN or Inf")
+            optim_d.step()
 
             # Generator backward and update
-            with autocast(enabled=use_amp):
-                _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
-                with autocast(enabled=False):
-                    loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0
-                    loss_env = envelope_loss(wave, y_hat)
-                    loss_kl = (
-                        kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
-                    )
-                    loss_fm = feature_loss(fmap_r, fmap_g)
-                    # if vocoder == "HiFi-GAN":
-                    #     loss_gen, _ = generator_loss(y_d_hat_g)
-                    # else:
-                    #     loss_gen, _ = generator_loss_scaled(y_d_hat_g)
-                    loss_gen, _ = generator_loss(y_d_hat_g)
-                    loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env
-
-                    if loss_gen_all < lowest_value["value"]:
-                        lowest_value = {
-                            "step": global_step,
-                            "value": loss_gen_all,
-                            "epoch": epoch,
-                        }
+            _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+
+            loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0
+            loss_env = envelope_loss(wave, y_hat)
+            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
+            loss_fm = feature_loss(fmap_r, fmap_g)
+            loss_gen, _ = generator_loss(y_d_hat_g)
+            loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env
+
+            if loss_gen_all < lowest_value["value"]:
+                lowest_value = {
+                    "step": global_step,
+                    "value": loss_gen_all,
+                    "epoch": epoch,
+                }
 
             epoch_gen_sum += loss_gen_all.item()
             optim_g.zero_grad()
-            scaler.scale(loss_gen_all).backward()
-            scaler.unscale_(optim_g)
+            loss_gen_all.backward()
             grad_norm_g = torch.nn.utils.clip_grad_norm_(
                 net_g.parameters(), max_norm=1000.0
            )
-            scaler.step(optim_g)
-            scaler.update()
-            # if not math.isfinite(grad_norm_g):
-            #     print("\n Warning: grad_norm_g is NaN or Inf")
+            optim_g.step()
 
             global_step += 1
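Note: the hunk above is the core of the patch; both optimizer updates lose their `GradScaler` plumbing. A side-by-side sketch of what remains versus what was deleted (`model`, `optim`, and `loss` are placeholders):

```python
import torch

def fp32_step(loss, model, optim):
    # Remaining pattern: plain backward, clip, step, exactly as in the
    # discriminator and generator updates above.
    optim.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1000.0)
    optim.step()
    return grad_norm

# Deleted torch.cuda.amp pattern, for reference:
#   scaler = GradScaler(enabled=use_amp)
#   with autocast(enabled=use_amp):
#       loss = ...                    # forward pass partly in fp16
#   optim.zero_grad()
#   scaler.scale(loss).backward()     # scale up to avoid fp16 underflow
#   scaler.unscale_(optim)            # unscale before clipping real gradients
#   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1000.0)
#   scaler.step(optim)                # skips the step if grads overflowed
#   scaler.update()
```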
@@ -768,7 +740,8 @@
                 )
             pbar.update(1)
-
+            # end of batch train
+        # end of tqdm
 
     with torch.no_grad():
         torch.cuda.empty_cache()
@@ -798,19 +771,16 @@
             else:
                 y_mel = mel
             # used for tensorboard chart - slice/mel_gen
-            with autocast(enabled=False):
-                y_hat_mel = mel_spectrogram_torch(
-                    y_hat.float().squeeze(1),
-                    config.data.filter_length,
-                    config.data.n_mel_channels,
-                    config.data.sample_rate,
-                    config.data.hop_length,
-                    config.data.win_length,
-                    config.data.mel_fmin,
-                    config.data.mel_fmax,
-                )
-                if use_amp:
-                    y_hat_mel = y_hat_mel.half()
+            y_hat_mel = mel_spectrogram_torch(
+                y_hat.float().squeeze(1),
+                config.data.filter_length,
+                config.data.n_mel_channels,
+                config.data.sample_rate,
+                config.data.hop_length,
+                config.data.win_length,
+                config.data.mel_fmin,
+                config.data.mel_fmax,
+            )
 
         lr = optim_g.param_groups[0]["lr"]
@@ -827,10 +797,6 @@
             "loss_avg_epoch/disc": np.mean(avg_losses["disc_loss_queue"]),
             "loss_avg_epoch/gen": np.mean(avg_losses["gen_loss_queue"]),
         }
-        # commented out
-        # scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)})
-        # scalar_dict.update({f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)})
-        # scalar_dict.update({f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)})
 
         image_dict = {
             "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
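Note: a hypothetical post-patch sanity check, not part of the diff: walk a loaded model and assert nothing is still half precision.

```python
import torch

def assert_fp32(model: torch.nn.Module):
    # Every floating-point parameter and buffer should now be float32.
    for name, t in list(model.named_parameters()) + list(model.named_buffers()):
        if t.is_floating_point() and t.dtype != torch.float32:
            raise AssertionError(f"{name} is {t.dtype}, expected torch.float32")
```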