From a4ff5532cca664bfc258bbf2bacbdfa43dbb3add Mon Sep 17 00:00:00 2001
From: Alexey Shmelev
Date: Mon, 21 Oct 2024 22:52:40 -0400
Subject: [PATCH] Replaced 'Sync Graph' with 'New Model' setting on UI.
 Removed unnecessary config.json manipulations. Moved tensorboard logging to
 the end of the epoch run. Disabled excessive logging of metrics for
 individual layers.

---
 assets/Applio_NoUI.ipynb         |   4 +-
 assets/i18n/languages/en_US.json |   4 +-
 core.py                          |  10 +-
 rvc/train/train.py               | 176 +++++++++----------------------
 tabs/train/train.py              |   8 +-
 5 files changed, 65 insertions(+), 137 deletions(-)

diff --git a/assets/Applio_NoUI.ipynb b/assets/Applio_NoUI.ipynb
index 955a92315..1105dc132 100644
--- a/assets/Applio_NoUI.ipynb
+++ b/assets/Applio_NoUI.ipynb
@@ -606,7 +606,7 @@
     "pitch_guidance = True # @param{type:\"boolean\"}\n",
     "auto_backups = True # @param{type:\"boolean\"}\n",
     "pretrained = True # @param{type:\"boolean\"}\n",
-    "sync_graph = False # @param{type:\"boolean\"}\n",
+    "cleanup = False # @param{type:\"boolean\"}\n",
     "cache_data_in_gpu = False # @param{type:\"boolean\"}\n",
     "tensorboard = True # @param{type:\"boolean\"}\n",
    "# @markdown ### ➡️ Choose how many epochs your model will be stored\n",
@@ -638,7 +638,7 @@
     "    if tensorboard == True:\n",
     "        %load_ext tensorboard\n",
     "        %tensorboard --logdir /content/Applio/logs/\n",
-    "    !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pitch_guidance \"{pitch_guidance}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --sync_graph \"{sync_graph}\" --cache_data_in_gpu \"{cache_data_in_gpu}\"\n",
+    "    !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pitch_guidance \"{pitch_guidance}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\"\n",
     "\n",
     "\n",
     "server_thread = threading.Thread(target=start_train)\n",
diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json
index 972065794..42cd184bc 100644
--- a/assets/i18n/languages/en_US.json
+++ b/assets/i18n/languages/en_US.json
@@ -85,8 +85,8 @@
   "Overtraining Detector Settings": "Overtraining Detector Settings",
   "Overtraining Threshold": "Overtraining Threshold",
   "Set the maximum number of epochs you want your model to stop training if no improvement is detected.": "Set the maximum number of epochs you want your model to stop training if no improvement is detected.",
-  "Sync Graph": "Sync Graph",
-  "Synchronize the graph of the tensorbaord. Only enable this setting if you are training a new model.": "Synchronize the graph of the tensorbaord. Only enable this setting if you are training a new model.",
+  "New Model": "New Model",
+  "Enable this setting only if you are training a new model from scratch or restarting the training. Deletes all previously generated weights and tensorboard logs.": "Enable this setting only if you are training a new model from scratch or restarting the training. Deletes all previously generated weights and tensorboard logs.",
   "Start Training": "Start Training",
   "Stop Training": "Stop Training",
   "Generate Index": "Generate Index",
diff --git a/core.py b/core.py
index ca6e2c7ba..7b9de5a58 100644
--- a/core.py
+++ b/core.py
@@ -529,7 +529,7 @@ def run_train_script(
     overtraining_detector: bool,
     overtraining_threshold: int,
     pretrained: bool,
-    sync_graph: bool,
+    cleanup: bool,
     index_algorithm: str = "Auto",
     cache_data_in_gpu: bool = False,
     custom_pretrained: bool = False,
@@ -575,7 +575,7 @@
                 cache_data_in_gpu,
                 overtraining_detector,
                 overtraining_threshold,
-                sync_graph,
+                cleanup,
             ],
         ),
     ]
@@ -2129,10 +2129,10 @@ def parse_arguments():
         default=50,
     )
     train_parser.add_argument(
-        "--sync_graph",
+        "--cleanup",
         type=lambda x: bool(strtobool(x)),
         choices=[True, False],
-        help="Enable graph synchronization for distributed training.",
+        help="Clean up files from the previous training attempt.",
         default=False,
     )
     train_parser.add_argument(
@@ -2529,7 +2529,7 @@ def main():
         overtraining_threshold=args.overtraining_threshold,
         pretrained=args.pretrained,
         custom_pretrained=args.custom_pretrained,
-        sync_graph=args.sync_graph,
+        cleanup=args.cleanup,
         index_algorithm=args.index_algorithm,
         cache_data_in_gpu=args.cache_data_in_gpu,
         g_pretrained_path=args.g_pretrained_path,
diff --git a/rvc/train/train.py b/rvc/train/train.py
index 8adabbc34..ca4c8da6b 100644
--- a/rvc/train/train.py
+++ b/rvc/train/train.py
@@ -74,7 +74,7 @@
 cache_data_in_gpu = strtobool(sys.argv[13])
 overtraining_detector = strtobool(sys.argv[14])
 overtraining_threshold = int(sys.argv[15])
-sync_graph = strtobool(sys.argv[16])
+cleanup = strtobool(sys.argv[16])
 
 current_dir = os.getcwd()
 experiment_dir = os.path.join(current_dir, "logs", model_name)
@@ -198,8 +198,8 @@ def start():
             pretrainG,
             pretrainD,
             pitch_guidance,
-            custom_total_epoch,
-            custom_save_every_weights,
+            total_epoch,
+            save_every_weights,
             config,
             device,
         ),
@@ -246,56 +246,9 @@
             smoothed_loss_gen_history,
         ) = load_from_json(training_file_path)
 
-if sync_graph:
-    print(
-        "Sync graph is now activated! With sync graph enabled, the model undergoes a single epoch of training. Once the graphs are synchronized, training proceeds for the previously specified number of epochs."
-    )
-    custom_total_epoch = 1
-    custom_save_every_weights = True
-
-    start()
-
-    # Synchronize graphs by modifying config files
-    model_config_file = os.path.join(experiment_dir, "config.json")
-    rvc_config_file = os.path.join(
-        now_dir, "rvc", "configs", version, str(sample_rate) + ".json"
-    )
-    if not os.path.exists(rvc_config_file):
-        rvc_config_file = os.path.join(
-            now_dir, "rvc", "configs", "v1", str(sample_rate) + ".json"
-        )
-
-    pattern = rf"{os.path.basename(model_name)}_(\d+)e_(\d+)s\.pth"
-
-    for filename in os.listdir(experiment_dir):
-        match = re.match(pattern, filename)
-        if match:
-            steps = int(match.group(2))
-
-    def edit_config(config_file):
-        """
-        Edits the config file to synchronize graphs.
-
-        Args:
-            config_file (str): Path to the config file.
-        """
-        with open(config_file, "r", encoding="utf8") as json_file:
-            config_data = json.load(json_file)
-
-        config_data["train"]["log_interval"] = steps
-
-        with open(config_file, "w", encoding="utf8") as json_file:
-            json.dump(
-                config_data,
-                json_file,
-                indent=2,
-                separators=(",", ": "),
-                ensure_ascii=False,
-            )
-
-    edit_config(model_config_file)
-    edit_config(rvc_config_file)
-
+if cleanup:
+    print("Removing files from the prior training attempt...")
+
     # Clean up unnecessary files
     for root, dirs, files in os.walk(
         os.path.join(now_dir, "logs", model_name), topdown=False
@@ -319,17 +272,10 @@ def edit_config(config_file):
                 os.remove(item_path)
             os.rmdir(folder_path)
 
-    print("Successfully synchronized graphs!")
-    custom_total_epoch = total_epoch
-    custom_save_every_weights = save_every_weights
-    continue_overtrain_detector(training_file_path)
-    start()
-else:
-    custom_total_epoch = total_epoch
-    custom_save_every_weights = save_every_weights
-    continue_overtrain_detector(training_file_path)
-    start()
-
+    print("Cleanup done!")
+
+continue_overtrain_detector(training_file_path)
+start()
 
 def run(
     rank,
@@ -729,9 +675,7 @@ def train_and_evaluate(
             y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
         with autocast(enabled=False):
             loss_mel = F.l1_loss(y_mel, y_hat_mel) * config.train.c_mel
-            loss_kl = (
-                kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
-            )
+            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
             loss_fm = feature_loss(fmap_r, fmap_g)
             loss_gen, losses_gen = generator_loss(y_d_hat_g)
             loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
@@ -753,66 +697,50 @@ def train_and_evaluate(
             scaler.step(optim_g)
             scaler.update()
 
-            # Logging and checkpointing
-            if rank == 0:
-                if global_step % config.train.log_interval == 0:
-                    lr = optim_g.param_groups[0]["lr"]
-                    if loss_mel > 75:
-                        loss_mel = 75
-                    if loss_kl > 9:
-                        loss_kl = 9
-                    scalar_dict = {
-                        "loss/g/total": loss_gen_all,
-                        "loss/d/total": loss_disc,
-                        "learning_rate": lr,
-                        "grad_norm_d": grad_norm_d,
-                        "grad_norm_g": grad_norm_g,
-                    }
-                    scalar_dict.update(
-                        {
-                            "loss/g/fm": loss_fm,
-                            "loss/g/mel": loss_mel,
-                            "loss/g/kl": loss_kl,
-                        }
-                    )
-                    scalar_dict.update(
-                        {f"loss/g/{i}": v for i, v in enumerate(losses_gen)}
-                    )
-                    scalar_dict.update(
-                        {f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)}
-                    )
-                    scalar_dict.update(
-                        {f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)}
-                    )
-                    image_dict = {
-                        "slice/mel_org": plot_spectrogram_to_numpy(
-                            y_mel[0].data.cpu().numpy()
-                        ),
-                        "slice/mel_gen": plot_spectrogram_to_numpy(
-                            y_hat_mel[0].data.cpu().numpy()
-                        ),
-                        "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
-                    }
-
-                    with torch.no_grad():
-                        if hasattr(net_g, "module"):
-                            o, *_ = net_g.module.infer(*reference)
-                        else:
-                            o, *_ = net_g.infer(*reference)
-                    audio_dict = {f"gen/audio_{global_step:07d}": o[0, :, :]}
-
-                    summarize(
-                        writer=writer,
-                        global_step=global_step,
-                        images=image_dict,
-                        scalars=scalar_dict,
-                        audios=audio_dict,
-                        audio_sample_rate=config.data.sample_rate,
-                    )
-
             global_step += 1
             pbar.update(1)
 
+    # Logging and checkpointing
+    if rank == 0:
+        lr = optim_g.param_groups[0]["lr"]
+        if loss_mel > 75:
+            loss_mel = 75
+        if loss_kl > 9:
+            loss_kl = 9
+        scalar_dict = {
+            "loss/g/total": loss_gen_all,
+            "loss/d/total": loss_disc,
+            "learning_rate": lr,
+            "grad_norm_d": grad_norm_d,
+            "grad_norm_g": grad_norm_g,
+            "loss/g/fm": loss_fm,
+            "loss/g/mel": loss_mel,
+            "loss/g/kl": loss_kl,
+        }
+        # Per-layer metric logging is disabled: it emitted one scalar per
+        # generator/discriminator layer and bloated the tensorboard logs.
+        # scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)})
+        # scalar_dict.update({f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)})
+        # scalar_dict.update({f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)})
+
+        image_dict = {
+            "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
+            "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
+            "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
+        }
+
+        with torch.no_grad():
+            o, *_ = net_g.infer(*reference)
+        audio_dict = {f"gen/audio_{global_step:07d}": o[0, :, :]}
+
+        summarize(
+            writer=writer,
+            global_step=global_step,
+            images=image_dict,
+            scalars=scalar_dict,
+            audios=audio_dict,
+            audio_sample_rate=config.data.sample_rate,
+        )
+
     # Save checkpoint
     model_add = []
     model_del = []
diff --git a/tabs/train/train.py b/tabs/train/train.py
index 72bcc5e60..bc8ef8b4f 100644
--- a/tabs/train/train.py
+++ b/tabs/train/train.py
@@ -622,10 +622,10 @@ def train_tab():
                 interactive=True,
             )
         with gr.Column():
-            sync_graph = gr.Checkbox(
-                label=i18n("Sync Graph"),
+            cleanup = gr.Checkbox(
+                label=i18n("New Model"),
                 info=i18n(
-                    "Synchronize the graph of the tensorbaord. Only enable this setting if you are training a new model."
+                    "Enable this setting only if you are training a new model from scratch or restarting the training. Deletes all previously generated weights and tensorboard logs."
                 ),
                 value=False,
                 interactive=True,
@@ -768,7 +768,7 @@
             overtraining_detector,
             overtraining_threshold,
             pretrained,
-            sync_graph,
+            cleanup,
             index_algorithm,
             cache_dataset_in_gpu,
             custom_pretrained,
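-- 
Usage note: with this change, a fresh training run that should discard the
previously generated weights and tensorboard logs passes the renamed flag on
the command line, mirroring the notebook cell above. A minimal sketch (the
model name and values are illustrative, and flags omitted here keep their
argparse defaults where one exists):

    python core.py train --model_name "my-voice" --rvc_version "v2" \
        --sample_rate "40000" --total_epoch "500" --batch_size "8" \
        --gpu "0" --cleanup "True" --cache_data_in_gpu "False"

With --cleanup "False" (the default), prior logs and weights are left in
place and training resumes as before, minus the old graph-sync epoch.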