diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 221ff4cff0..b9e998e739 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,6 +1,8 @@ import logging import os +import sys from dataclasses import dataclass, field +from typing import Optional from trainer import Trainer, TrainerArgs @@ -16,7 +18,7 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: Optional[list[str]] = None): """Run `tts` model training directly by a `config.json` file.""" setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) @@ -25,7 +27,7 @@ def main(): parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -75,6 +77,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 972a47b2af..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -from tests import get_device_id, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - - -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - - config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index b110f9bd61..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,41 +0,0 @@ -from tests import get_device_id, run_cli -from TTS.vocoder.configs import HifiganConfig - - -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - - config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 019c511da7..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -from tests import get_device_id, run_cli -from TTS.vocoder.configs import MelganConfig - - -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - - config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 4f9de80dfc..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -from tests import get_device_id, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - - -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - - config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index 1df44a11de..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os - -from tests import get_device_id, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - - -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) diff --git a/tests/vocoder_tests/test_training.py b/tests/vocoder_tests/test_training.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/vocoder_tests/test_training.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 02a1fc1228..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,50 +0,0 @@ -import os - -import pytest - -from tests import get_device_id, run_cli -from TTS.vocoder.configs import WavegradConfig - -GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" - - -# TODO: Reactivate after improving CI run times -@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 85e91efa36..0000000000 --- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,43 +0,0 @@ -from tests import get_device_id, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - - -def test_train(tmp_path): - config_path = tmp_path / "test_vocoder_config.json" - output_path = tmp_path / "train_outputs" - - config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train)