diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py
index 97cbf47893..d2d0ef580d 100644
--- a/TTS/encoder/configs/base_encoder_config.py
+++ b/TTS/encoder/configs/base_encoder_config.py
@@ -55,6 +55,6 @@ class BaseEncoderConfig(BaseTrainingConfig):
     def check_values(self):
         super().check_values()
         c = asdict(self)
-        assert (
-            c["model_params"]["input_dim"] == self.audio.num_mels
-        ), " [!] model input dimendion must be equal to melspectrogram dimension."
+        assert c["model_params"]["input_dim"] == self.audio.num_mels, (
+            " [!] model input dimendion must be equal to melspectrogram dimension."
+        )
diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py
index fe57874a99..8d50ffd5f5 100644
--- a/TTS/encoder/utils/prepare_voxceleb.py
+++ b/TTS/encoder/utils/prepare_voxceleb.py
@@ -16,7 +16,7 @@
 # Only support eager mode and TF>=2.0.0
 # pylint: disable=no-member, invalid-name, relative-beyond-top-level
 # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
-""" voxceleb 1 & 2 """
+"""voxceleb 1 & 2"""
 
 import csv
 import hashlib
diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py
index be7a81fa89..bd1736c880 100644
--- a/TTS/tts/configs/neuralhmm_tts_config.py
+++ b/TTS/tts/configs/neuralhmm_tts_config.py
@@ -161,9 +161,9 @@ def check_values(self):
             AssertionError: transition probability is not between 0 and 1
         """
         assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
-        assert (
-            len(self.outputnet_size) >= 1
-        ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
-        assert (
-            0 < self.flat_start_params["transition_p"] < 1
-        ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
+        assert len(self.outputnet_size) >= 1, (
+            f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
+        )
+        assert 0 < self.flat_start_params["transition_p"] < 1, (
+            f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
+        )
diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py
index 8a113f1f33..93a6a9e377 100644
--- a/TTS/tts/configs/overflow_config.py
+++ b/TTS/tts/configs/overflow_config.py
@@ -192,9 +192,9 @@ def check_values(self):
             AssertionError: transition probability is not between 0 and 1
         """
         assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
-        assert (
-            len(self.outputnet_size) >= 1
-        ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
-        assert (
-            0 < self.flat_start_params["transition_p"] < 1
-        ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
+        assert len(self.outputnet_size) >= 1, (
+            f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
+        )
+        assert 0 < self.flat_start_params["transition_p"] < 1, (
+            f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
+        )
Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 7badbfac59..e4b419d1fa 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -223,12 +223,12 @@ class TacotronConfig(BaseTTSConfig): def check_values(self): if self.gradual_training: - assert ( - self.gradual_training[0][1] == self.r - ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + assert self.gradual_training[0][1] == self.r, ( + f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + ) if self.model == "tacotron" and self.audio is not None: - assert self.out_channels == ( - self.audio.fft_size // 2 + 1 - ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + assert self.out_channels == (self.audio.fft_size // 2 + 1), ( + f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + ) if self.model == "tacotron2" and self.audio is not None: assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 2f5357c642..d83abce00a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -37,9 +37,9 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): else: eval_split_size = int(len(items) * eval_split_size) - assert ( - eval_split_size > 0 - ), f" [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" + assert eval_split_size > 0, ( + f" [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" + ) np.random.seed(0) np.random.shuffle(items) if is_multi_speaker: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 6cf65c9b5e..3a4605275a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -424,7 +424,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic """ file_ext = "flac" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -451,7 +451,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index ade84794eb..87be97d5d1 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,7 +7,6 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py - import torch from einops import pack, unpack from torch 
diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py
index 1d141dc537..457a20ea28 100644
--- a/TTS/tts/layers/bark/inference_funcs.py
+++ b/TTS/tts/layers/bark/inference_funcs.py
@@ -58,9 +58,7 @@ def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64
 
 def load_voice(
     model, voice: str, extra_voice_dirs: list[str] = []
-) -> tuple[
-    npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None
-]:  # pylint: disable=dangerous-default-value
+) -> tuple[npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None]:  # pylint: disable=dangerous-default-value
     if voice == "random":
         return None, None, None
 
diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py
index 54a9cecec0..4850d0a88b 100644
--- a/TTS/tts/layers/bark/model.py
+++ b/TTS/tts/layers/bark/model.py
@@ -175,9 +175,9 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use
             assert idx.shape[1] >= 256 + 256 + 1
             t = idx.shape[1] - 256
         else:
-            assert (
-                t <= self.config.block_size
-            ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+            assert t <= self.config.block_size, (
+                f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+            )
 
         # forward the GPT model itself
         if merge_context:
diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py
index 29126b41ab..20f54d2152 100644
--- a/TTS/tts/layers/bark/model_fine.py
+++ b/TTS/tts/layers/bark/model_fine.py
@@ -101,9 +101,9 @@ def __init__(self, config):
     def forward(self, pred_idx, idx):
         device = idx.device
         b, t, codes = idx.size()
-        assert (
-            t <= self.config.block_size
-        ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        assert t <= self.config.block_size, (
+            f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        )
         assert pred_idx > 0, "cannot predict 0th codebook"
         assert codes == self.n_codes_total, (b, t, codes)
         pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)  # shape (1, t)
diff --git a/TTS/tts/layers/feed_forward/encoder.py b/TTS/tts/layers/feed_forward/encoder.py
index caf939ffc7..2d08f03c2d 100644
--- a/TTS/tts/layers/feed_forward/encoder.py
+++ b/TTS/tts/layers/feed_forward/encoder.py
@@ -143,9 +143,9 @@ def __init__(
         elif encoder_type.lower() == "residual_conv_bn":
             self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params)
         elif encoder_type.lower() == "fftransformer":
-            assert (
-                in_hidden_channels == out_channels
-            ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'"
+            assert in_hidden_channels == out_channels, (
+                "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'"
+            )
             # pylint: disable=unexpected-keyword-arg
             self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params)
         else:
diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py
index 695e37a6e0..7765e224aa 100644
--- a/TTS/tts/layers/generic/pos_encoding.py
+++ b/TTS/tts/layers/generic/pos_encoding.py
@@ -18,7 +18,7 @@ class PositionalEncoding(nn.Module):
     def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False):
         super().__init__()
         if channels % 2 != 0:
-            raise ValueError("Cannot use sin/cos positional encoding with " f"odd channels (got channels={channels:d})")
+            raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={channels:d})")
         self.use_scale = use_scale
         if use_scale:
             self.scale = torch.nn.Parameter(torch.ones(1))
diff --git a/TTS/tts/layers/generic/transformer.py b/TTS/tts/layers/generic/transformer.py
index 9b7ecee2ba..2fe9bcc408 100644
--- a/TTS/tts/layers/generic/transformer.py
+++ b/TTS/tts/layers/generic/transformer.py
@@ -70,9 +70,7 @@ def forward(self, x, mask=None, g=None):  # pylint: disable=unused-argument
 
 
 class FFTDurationPredictor:
-    def __init__(
-        self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None
-    ):  # pylint: disable=unused-argument
+    def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None):  # pylint: disable=unused-argument
         self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p)
         self.proj = nn.Linear(in_channels, 1)
 
diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py
index 1bbf676393..00fa559c77 100644
--- a/TTS/tts/layers/tortoise/arch_utils.py
+++ b/TTS/tts/layers/tortoise/arch_utils.py
@@ -101,9 +101,9 @@ def __init__(
         if num_head_channels == -1:
             self.num_heads = num_heads
         else:
-            assert (
-                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            assert channels % num_head_channels == 0, (
+                f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            )
             self.num_heads = channels // num_head_channels
         self.norm = normalization(channels)
         self.qkv = nn.Conv1d(channels, channels * 3, 1)
diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py
index 6d6bb8cdb7..6bbe6c389c 100644
--- a/TTS/tts/layers/tortoise/audio_utils.py
+++ b/TTS/tts/layers/tortoise/audio_utils.py
@@ -125,14 +125,14 @@ def load_voices(voices: list[str], extra_voice_dirs: list[str] = []):
             return None, None
         clip, latent = load_voice(voice, extra_voice_dirs)
         if latent is None:
-            assert (
-                len(latents) == 0
-            ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            assert len(latents) == 0, (
+                "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            )
             clips.extend(clip)
         elif clip is None:
-            assert (
-                len(clips) == 0
-            ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            assert len(clips) == 0, (
+                "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            )
             latents.append(latent)
     if len(latents) == 0:
         return clips, None
diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py
index cbfe076825..eaeb2a03c1 100644
--- a/TTS/tts/layers/tortoise/autoregressive.py
+++ b/TTS/tts/layers/tortoise/autoregressive.py
@@ -608,9 +608,9 @@ def inference_speech(
         if input_tokens is None:
             inputs = fake_inputs
         else:
-            assert (
-                num_return_sequences % input_tokens.shape[0] == 0
-            ), "The number of return sequences must be divisible by the number of input sequences"
+            assert num_return_sequences % input_tokens.shape[0] == 0, (
+                "The number of return sequences must be divisible by the number of input sequences"
+            )
             fake_inputs = fake_inputs.repeat(num_return_sequences, 1)
             input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1)
             inputs = torch.cat([fake_inputs, input_tokens], dim=1)
diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py
index d34b61f486..c8892d456a 100644
--- a/TTS/tts/layers/tortoise/dpm_solver.py
+++ b/TTS/tts/layers/tortoise/dpm_solver.py
@@ -563,41 +563,21 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type
         if order == 3:
             K = steps // 3 + 1
             if steps % 3 == 0:
-                orders = [
-                    3,
-                ] * (
-                    K - 2
-                ) + [2, 1]
+                orders = [3] * (K - 2) + [2, 1]
             elif steps % 3 == 1:
-                orders = [
-                    3,
-                ] * (
-                    K - 1
-                ) + [1]
+                orders = [3] * (K - 1) + [1]
             else:
-                orders = [
-                    3,
-                ] * (
-                    K - 1
-                ) + [2]
+                orders = [3] * (K - 1) + [2]
         elif order == 2:
             if steps % 2 == 0:
                 K = steps // 2
-                orders = [
-                    2,
-                ] * K
+                orders = [2] * K
             else:
                 K = steps // 2 + 1
-                orders = [
-                    2,
-                ] * (
-                    K - 1
-                ) + [1]
+                orders = [2] * (K - 1) + [1]
         elif order == 1:
             K = 1
-            orders = [
-                1,
-            ] * steps
+            orders = [1] * steps
         else:
             raise ValueError("'order' must be '1' or '2' or '3'.")
         if skip_type == "logSNR":
@@ -605,15 +585,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type
             timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
         else:
             timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
-                torch.cumsum(
-                    torch.tensor(
-                        [
-                            0,
-                        ]
-                        + orders
-                    ),
-                    0,
-                ).to(device)
+                torch.cumsum(torch.tensor([0] + orders), 0).to(device)
             ]
         return timesteps_outer, orders
 
@@ -1217,9 +1189,9 @@ def inverse(
         """
         t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start
         t_T = self.noise_schedule.T if t_end is None else t_end
-        assert (
-            t_0 > 0 and t_T > 0
-        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+        assert t_0 > 0 and t_T > 0, (
+            "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+        )
         return self.sample(
             x,
             steps=steps,
@@ -1362,9 +1334,9 @@ def sample(
         """
         t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
         t_T = self.noise_schedule.T if t_start is None else t_start
-        assert (
-            t_0 > 0 and t_T > 0
-        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+        assert t_0 > 0 and t_T > 0, (
+            "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+        )
         if return_intermediate:
             assert method in [
                 "multistep",
diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py
index c1854bd196..531f294220 100644
--- a/TTS/tts/layers/tortoise/transformer.py
+++ b/TTS/tts/layers/tortoise/transformer.py
@@ -43,9 +43,9 @@ def route_args(router, args, depth):
 class SequentialSequence(nn.Module):
     def __init__(self, layers, args_route={}, layer_dropout=0.0):
         super().__init__()
-        assert all(
-            len(route) == len(layers) for route in args_route.values()
-        ), "each argument route map must have the same depth as the number of sequential layers"
+        assert all(len(route) == len(layers) for route in args_route.values()), (
+            "each argument route map must have the same depth as the number of sequential layers"
+        )
         self.layers = layers
         self.args_route = args_route
         self.layer_dropout = layer_dropout
diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py
index 0892fee19d..b2e74cf118 100644
--- a/TTS/tts/layers/tortoise/xtransformers.py
+++ b/TTS/tts/layers/tortoise/xtransformers.py
@@ -560,9 +560,9 @@ def __init__(
 
         self.rel_pos_bias = rel_pos_bias
         if rel_pos_bias:
-            assert (
-                rel_pos_num_buckets <= rel_pos_max_distance
-            ), "number of relative position buckets must be less than the relative position max distance"
+            assert rel_pos_num_buckets <= rel_pos_max_distance, (
+                "number of relative position buckets must be less than the relative position max distance"
+            )
             self.rel_pos = RelativePositionBias(
                 scale=dim_head**0.5,
                 causal=causal,
@@ -680,9 +680,9 @@ def forward(
         del input_mask
 
         if exists(attn_mask):
-            assert (
-                2 <= attn_mask.ndim <= 4
-            ), "attention mask must have greater than 2 dimensions but less than or equal to 4"
+            assert 2 <= attn_mask.ndim <= 4, (
+                "attention mask must have greater than 2 dimensions but less than or equal to 4"
+            )
             if attn_mask.ndim == 2:
                 attn_mask = rearrange(attn_mask, "i j -> () () i j")
             elif attn_mask.ndim == 3:
@@ -790,9 +790,9 @@ def __init__(
             rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32)
         self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None
 
-        assert not (
-            alibi_pos_bias and rel_pos_bias
-        ), "you can only choose Alibi positional bias or T5 relative positional bias, not both"
+        assert not (alibi_pos_bias and rel_pos_bias), (
+            "you can only choose Alibi positional bias or T5 relative positional bias, not both"
+        )
 
         if alibi_pos_bias:
             alibi_num_heads = default(alibi_num_heads, heads)
@@ -922,9 +922,9 @@ def forward(
         past_key_values=None,
         expected_seq_len=None,
     ):
-        assert not (
-            self.cross_attend ^ (exists(context) or exists(full_context))
-        ), "context must be passed in if cross_attend is set to True"
+        assert not (self.cross_attend ^ (exists(context) or exists(full_context))), (
+            "context must be passed in if cross_attend is set to True"
+        )
         assert context is None or full_context is None, "only one of full_context or context can be provided"
 
         hiddens = []
@@ -940,9 +940,9 @@ def forward(
         rotary_pos_emb = None
         if exists(self.rotary_pos_emb):
             if not self.training and self.causal:
-                assert (
-                    expected_seq_len is not None
-                ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`"
+                assert expected_seq_len is not None, (
+                    "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`"
+                )
             elif expected_seq_len is None:
                 expected_seq_len = 0
             seq_len = x.shape[1]
diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py
index 20eff26ecc..4e0f53616d 100644
--- a/TTS/tts/layers/xtts/gpt.py
+++ b/TTS/tts/layers/xtts/gpt.py
@@ -347,12 +347,12 @@ def forward(
             audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1]))
 
         # 💖 Lovely assertions
-        assert (
-            max_mel_len <= audio_codes.shape[-1]
-        ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})"
-        assert (
-            max_text_len <= text_inputs.shape[-1]
-        ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})"
+        assert max_mel_len <= audio_codes.shape[-1], (
+            f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})"
+        )
+        assert max_text_len <= text_inputs.shape[-1], (
+            f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})"
+        )
 
         # Append stop token to text inputs
         text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token)
@@ -454,9 +454,9 @@ def forward(
             mel_targets[idx, l + 1 :] = -1
 
         # check if stoptoken is in every row of mel_targets
-        assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[
-            0
-        ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row."
+        assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[0], (
+            f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row."
+        )
 
         # ignore the loss for the segment used for conditioning
         # coin flip for the segment to be ignored
diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py
index 303a990c27..e09a5233ac 100644
--- a/TTS/tts/layers/xtts/stream_generator.py
+++ b/TTS/tts/layers/xtts/stream_generator.py
@@ -953,7 +953,6 @@ def init_stream_support():
 
 
 def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList:
-
     warpers = LogitsProcessorList()
 
     if generation_config.temperature is not None and generation_config.temperature != 1.0:
diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
index e00ce2b4de..6e99d41eb9 100644
--- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py
+++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -248,7 +248,11 @@ def test_run(self, assets) -> tuple[dict, dict]:  # pylint: disable=W0613
         return {"audios": test_audios}
 
     def test_log(
-        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> None:
         logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate)
 
diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py
index 12c3d18252..c2e29c7100 100644
--- a/TTS/tts/models/align_tts.py
+++ b/TTS/tts/models/align_tts.py
@@ -232,9 +232,7 @@ def _forward_mdn(self, o_en, y, y_lengths, x_mask):
         dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask)
         return dr_mas, mu, log_sigma, logp
 
-    def forward(
-        self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None
-    ):  # pylint: disable=unused-argument
+    def forward(self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None):  # pylint: disable=unused-argument
         """
         Shapes:
             - x: :math:`[B, T_max]`
@@ -351,9 +349,7 @@ def _create_logs(self, batch, outputs, ap):  # pylint: disable=no-self-use
         train_audio = ap.inv_melspectrogram(pred_spec.T)
         return figures, {"audio": train_audio}
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ) -> None:  # pylint: disable=no-self-use
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:  # pylint: disable=no-self-use
         figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.train_figures(steps, figures)
         logger.train_audios(steps, audios, self.ap.sample_rate)
@@ -366,9 +362,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s
         logger.eval_figures(steps, figures)
         logger.eval_audios(steps, audios, self.ap.sample_rate)
 
-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py
index 83478926a6..df0b73e3b4 100644
--- a/TTS/tts/models/bark.py
+++ b/TTS/tts/models/bark.py
@@ -194,9 +194,7 @@ def _set_voice_dirs(self, voice_dirs):
         return _voice_dirs
 
     # TODO: remove config from synthesize
-    def synthesize(
-        self, text, config, speaker_id="random", voice_dirs=None, **kwargs
-    ):  # pylint: disable=unused-argument
+    def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs):  # pylint: disable=unused-argument
         """Synthesize speech with the given input text.
 
         Args:
diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py
index 8821036b5f..05f4ae168d 100644
--- a/TTS/tts/models/base_tacotron.py
+++ b/TTS/tts/models/base_tacotron.py
@@ -93,9 +93,7 @@ def forward(self):
     def inference(self):
         pass
 
-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         """Load model checkpoint and set up internals.
 
         Args:
@@ -176,7 +174,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]:
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
-        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> None:
         logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
         logger.test_figures(steps, outputs["figures"])
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 0976e4cdab..f5bc49e147 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -210,9 +210,9 @@ def format_batch(self, batch: dict) -> dict:
                 extra_frames = dur.sum() - mel_lengths[idx]
                 largest_idxs = torch.argsort(-dur)[:extra_frames]
                 dur[largest_idxs] -= 1
-                assert (
-                    dur.sum() == mel_lengths[idx]
-                ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
+                assert dur.sum() == mel_lengths[idx], (
+                    f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
+                )
                 durations[idx, : text_lengths[idx]] = dur
 
         # set stop targets wrt reduction factor
diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py
index 4a3defe665..eeb921503d 100644
--- a/TTS/tts/models/delightful_tts.py
+++ b/TTS/tts/models/delightful_tts.py
@@ -835,9 +835,7 @@ def _log(self, batch, outputs, name_prefix="train"):
             audios[f"{name_prefix}/vocoder_audio"] = sample_voice
         return figures, audios
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ):  # pylint: disable=no-self-use, unused-argument
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int):  # pylint: disable=no-self-use, unused-argument
         """Create visualizations and waveform examples.
 
         For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
@@ -1050,7 +1048,11 @@ def test_run(self, assets) -> tuple[dict, dict]:
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
-        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> None:
         logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate)
         logger.test_figures(steps, outputs["figures"])
@@ -1262,9 +1264,7 @@ def on_epoch_end(self, trainer):  # pylint: disable=unused-argument
         self.energy_scaler.eval()
 
     @staticmethod
-    def init_from_config(
-        config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None
-    ):  # pylint: disable=unused-argument
+    def init_from_config(config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None):  # pylint: disable=unused-argument
         """Initiate model from config
 
         Args:
diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py
index 5b68475406..497ac3f63a 100644
--- a/TTS/tts/models/forward_tts.py
+++ b/TTS/tts/models/forward_tts.py
@@ -770,9 +770,7 @@ def _create_logs(self, batch, outputs, ap):
         train_audio = ap.inv_melspectrogram(pred_spec.T)
         return figures, {"audio": train_audio}
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ) -> None:  # pylint: disable=no-self-use
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:  # pylint: disable=no-self-use
         figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.train_figures(steps, figures)
         logger.train_audios(steps, audios, self.ap.sample_rate)
@@ -785,9 +783,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s
         logger.eval_figures(steps, figures)
         logger.eval_audios(steps, audios, self.ap.sample_rate)
 
-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py
index 3289dcdd04..6310751b26 100644
--- a/TTS/tts/models/glow_tts.py
+++ b/TTS/tts/models/glow_tts.py
@@ -124,9 +124,9 @@ def init_multispeaker(self, config: Coqpit):
             config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
         )
         if self.speaker_manager is not None:
-            assert (
-                config.d_vector_dim == self.speaker_manager.embedding_dim
-            ), " [!] d-vector dimension mismatch b/w config and speaker manager."
+            assert config.d_vector_dim == self.speaker_manager.embedding_dim, (
+                " [!] d-vector dimension mismatch b/w config and speaker manager."
+            )
         # init speaker embedding layer
         if config.use_speaker_embedding and not config.use_d_vector_file:
             logger.info("Init speaker_embedding layer.")
@@ -192,9 +192,7 @@ def _speaker_embedding(self, aux_input: dict) -> torch.tensor | None:
             g = F.normalize(g).unsqueeze(-1)  # [b, h, 1]
         return g
 
-    def forward(
-        self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
-    ):  # pylint: disable=dangerous-default-value
+    def forward(self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}):  # pylint: disable=dangerous-default-value
         """
         Args:
             x (torch.Tensor):
@@ -318,9 +316,7 @@ def inference_with_MAS(
         return outputs
 
     @torch.inference_mode()
-    def decoder_inference(
-        self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
-    ):  # pylint: disable=dangerous-default-value
+    def decoder_inference(self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}):  # pylint: disable=dangerous-default-value
         """
         Shapes:
             - y: :math:`[B, T, C]`
@@ -341,9 +337,7 @@ def decoder_inference(
         return outputs
 
     @torch.inference_mode()
-    def inference(
-        self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
-    ):  # pylint: disable=dangerous-default-value
+    def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}):  # pylint: disable=dangerous-default-value
         x_lengths = aux_input["x_lengths"]
         g = self._speaker_embedding(aux_input)
         # embedding pass
@@ -456,9 +450,7 @@ def _create_logs(self, batch, outputs, ap):
         train_audio = ap.inv_melspectrogram(pred_spec.T)
         return figures, {"audio": train_audio}
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ) -> None:  # pylint: disable=no-self-use
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:  # pylint: disable=no-self-use
         figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.train_figures(steps, figures)
         logger.train_audios(steps, audios, self.ap.sample_rate)
@@ -521,9 +513,7 @@ def preprocess(self, y, y_lengths, y_max_length, attn=None):
     def store_inverse(self):
         self.decoder.store_inverse()
 
-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
         if eval:
diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py
index a7c0ea7f14..2cbf425884 100644
--- a/TTS/tts/models/neuralhmm_tts.py
+++ b/TTS/tts/models/neuralhmm_tts.py
@@ -345,17 +345,13 @@ def _create_logs(self, batch, outputs, ap):  # pylint: disable=no-self-use, unus
         audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy())
         return figures, {"audios": audio}
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ):  # pylint: disable=unused-argument
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int):  # pylint: disable=unused-argument
"""Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -369,7 +365,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 85e1523307..aad2e1f553 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -362,17 +362,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -386,7 +382,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 879a2b94b5..59173691f7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -376,9 +376,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_spectrogram(pred_linear_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index c8c0c875ad..e924d82d42 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -399,9 
         audio = ap.inv_melspectrogram(pred_spec.T)
         return figures, {"audio": audio}
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ) -> None:  # pylint: disable=no-self-use
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:  # pylint: disable=no-self-use
         """Log training progress."""
         figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.train_figures(steps, figures)
diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py
index 738e9dd9b3..b44a5fbfc6 100644
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@@ -685,9 +685,9 @@ def inference(
 
         text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
         text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
-        assert (
-            text_tokens.shape[-1] < 400
-        ), "Too much text provided. Break the text up into separate segments and re-try inference."
+        assert text_tokens.shape[-1] < 400, (
+            "Too much text provided. Break the text up into separate segments and re-try inference."
+        )
 
         if voice_samples is not None:
             (
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 819ac7aea0..c92d6f46f7 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1188,9 +1188,7 @@ def _log(self, ap, batch, outputs, name_prefix="train"):  # pylint: disable=unus
         )
         return figures, audios
 
-    def train_log(
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
-    ):  # pylint: disable=no-self-use
+    def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int):  # pylint: disable=no-self-use
         """Create visualizations and waveform examples.
 
         For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
@@ -1297,7 +1295,11 @@ def test_run(self, assets) -> tuple[dict, dict]:
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
-        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> None:
         logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
         logger.test_figures(steps, outputs["figures"])
@@ -1366,9 +1368,9 @@ def format_batch_on_device(self, batch):
         )
 
         if self.args.encoder_sample_rate:
-            assert batch["spec"].shape[2] == int(
-                batch["mel"].shape[2] / self.interpolate_factor
-            ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
+            assert batch["spec"].shape[2] == int(batch["mel"].shape[2] / self.interpolate_factor), (
+                f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
+            )
         else:
             assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
 
@@ -1538,9 +1540,7 @@ def get_criterion(self):
 
         return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
 
-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, strict=True, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         """Load the model checkpoint and setup for training or inference"""
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         # compat band-aid for the pre-trained models to not use the encoder baked into the model
@@ -1567,9 +1567,7 @@ def load_checkpoint(
         self.eval()
         assert not self.training
 
-    def load_fairseq_checkpoint(
-        self, config, checkpoint_dir, eval=False, strict=True
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_fairseq_checkpoint(self, config, checkpoint_dir, eval=False, strict=True):  # pylint: disable=unused-argument, redefined-builtin
         """Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms
 
         Performs some changes for compatibility.
@@ -1625,15 +1623,15 @@ def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = No
         upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
 
         if not config.model_args.encoder_sample_rate:
-            assert (
-                upsample_rate == config.audio.hop_length
-            ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
+            assert upsample_rate == config.audio.hop_length, (
+                f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
+            )
         else:
             encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate
             effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor
-            assert (
-                upsample_rate == effective_hop_length
-            ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}"
+            assert upsample_rate == effective_hop_length, (
+                f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}"
+            )
 
         ap = AudioProcessor.init_from_config(config)
         tokenizer, new_config = TTSTokenizer.init_from_config(config)
diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index c1ed3a305b..28eb17d648 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -383,9 +383,9 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa
                 as latents used at inference.
 
         """
-        assert (
-            "zh-cn" if language == "zh" else language in self.config.languages
-        ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}"
+        assert "zh-cn" if language == "zh" else language in self.config.languages, (
+            f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}"
+        )
         # Use generally found best tuning knobs for generation.
         settings = {
             "temperature": config.temperature,
@@ -523,9 +523,9 @@ def inference(
             sent = sent.strip().lower()
             text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device)
 
-            assert (
-                text_tokens.shape[-1] < self.args.gpt_max_text_tokens
-            ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
+            assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, (
+                " ❗ XTTS can only generate text with a maximum of 400 tokens."
+            )
 
             with torch.no_grad():
                 gpt_codes = self.gpt.generate(
@@ -631,9 +631,9 @@ def inference_stream(
             sent = sent.strip().lower()
             text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device)
 
-            assert (
-                text_tokens.shape[-1] < self.args.gpt_max_text_tokens
-            ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
+            assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, (
+                " ❗ XTTS can only generate text with a maximum of 400 tokens."
+            )
 
             fake_inputs = self.gpt.compute_embeddings(
                 gpt_cond_latent.to(self.device),
diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py
index cf02e5282b..a3648eff4b 100644
--- a/TTS/tts/utils/helpers.py
+++ b/TTS/tts/utils/helpers.py
@@ -105,9 +105,9 @@ def rand_segments(
         _x_lenghts[len_diff < 0] = segment_size
         len_diff = _x_lenghts - segment_size
     else:
-        assert all(
-            len_diff > 0
-        ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}"
+        assert all(len_diff > 0), (
+            f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}"
+        )
     segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long()
     ret = segment(x, segment_indices, segment_size, pad_short=pad_short)
     return ret, segment_indices
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 026039ab29..6fab27de5a 100644
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -185,9 +185,9 @@ def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None,
         elif not c.use_d_vector_file:  # restor speaker manager with speaker ID file.
             speaker_ids_from_data = speaker_manager.name_to_id
             speaker_manager.load_ids_from_file(speakers_file)
-            assert all(
-                speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
-            ), " [!] You cannot introduce new speakers to a pre-trained model."
+            assert all(speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data), (
+                " [!] You cannot introduce new speakers to a pre-trained model."
+            )
     elif c.use_d_vector_file and c.d_vector_file:
         # new speaker manager with external speaker embeddings.
         speaker_manager.load_embeddings_from_file(c.d_vector_file)
diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py
index 24bab63ca1..660370a832 100644
--- a/TTS/tts/utils/ssim.py
+++ b/TTS/tts/utils/ssim.py
@@ -49,16 +49,16 @@ def _validate_input(
     if size_range is None:
         assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}"
     else:
-        assert (
-            t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]]
-        ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}"
+        assert t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]], (
+            f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}"
+        )
 
     if dim_range[0] == dim_range[1]:
         assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}"
     elif dim_range[0] < dim_range[1]:
-        assert (
-            dim_range[0] <= t.dim() <= dim_range[1]
-        ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}"
+        assert dim_range[0] <= t.dim() <= dim_range[1], (
+            f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}"
+        )
 
     if data_range[0] < data_range[1]:
         assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}"
@@ -285,8 +285,7 @@ def _ssim_per_channel(
     """
     if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2):
         raise ValueError(
-            f"Kernel size can't be greater than actual input size. Input size: {x.size()}. "
-            f"Kernel size: {kernel.size()}"
+            f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}"
         )
 
     c1 = k1**2
@@ -337,8 +336,7 @@ def _ssim_per_channel_complex(
     n_channels = x.size(1)
     if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2):
         raise ValueError(
-            f"Kernel size can't be greater than actual input size. Input size: {x.size()}. "
-            f"Kernel size: {kernel.size()}"
+            f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}"
        )
 
     c1 = k1**2
diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py
index cddcb00fd5..1537240380 100644
--- a/TTS/tts/utils/text/bangla/phonemizer.py
+++ b/TTS/tts/utils/text/bangla/phonemizer.py
@@ -45,7 +45,7 @@ def tag_text(text: str):
     # create start and end
     text = "start" + text + "end"
     # tag text
-    parts = re.split("[\u0600-\u06FF]+", text)
+    parts = re.split("[\u0600-\u06ff]+", text)
     # remove non chars
     parts = [p for p in parts if p.strip()]
     # unique parts
diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py
index da30692f5e..f8beaef036 100644
--- a/TTS/tts/utils/text/characters.py
+++ b/TTS/tts/utils/text/characters.py
@@ -289,9 +289,9 @@ def _create_vocab(self):
         self.vocab = _vocab + list(self._punctuations)
         if self.is_unique:
             duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
-            assert (
-                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
-            ), f" [!] There are duplicate characters in the character set. {duplicates}"
+            assert len(self.vocab) == len(self._char_to_id) == len(self._id_to_char), (
+                f" [!] There are duplicate characters in the character set. {duplicates}"
+            )
 
     def char_to_id(self, char: str) -> int:
         try:
diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py
index c5f2f452d5..be2a4b3084 100644
--- a/TTS/tts/utils/text/english/number_norm.py
+++ b/TTS/tts/utils/text/english/number_norm.py
@@ -1,4 +1,4 @@
-""" from https://github.com/keithito/tacotron """
+"""from https://github.com/keithito/tacotron"""
 
 import re
 
diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py
index 0feef3bdfb..1b1e0ca0fb 100644
--- a/TTS/tts/utils/text/korean/korean.py
+++ b/TTS/tts/utils/text/korean/korean.py
@@ -1,4 +1,4 @@
-īģŋ# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
+# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
 import re
 
 from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py
index 4bd03851c7..6cc6ec0b37 100644
--- a/TTS/tts/utils/text/phonemizers/base.py
+++ b/TTS/tts/utils/text/phonemizers/base.py
@@ -52,7 +52,7 @@ def _init_language(self, language):
 
         """
         if not self.is_supported_language(language):
-            raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
+            raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend')
         return language
 
     @property
diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py
index 9a8841106c..55b8575aa4 100644
--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@@ -222,9 +222,9 @@ def __init__(
         self.hop_length = hop_length
         self.win_length = win_length
         assert min_level_db != 0.0, " [!] min_level_db is 0"
-        assert (
-            self.win_length <= self.fft_size
-        ), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}"
+        assert self.win_length <= self.fft_size, (
+            f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}"
+        )
         members = vars(self)
         logger.info("Setting up Audio Processor...")
         for key, value in members.items():
@@ -283,7 +283,9 @@ def normalize(self, S: np.ndarray) -> np.ndarray:
                 S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
                 if self.clip_norm:
                     S_norm = np.clip(
-                        S_norm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
+                        S_norm,
+                        -self.max_norm,  # pylint: disable=invalid-unary-operand-type
+                        self.max_norm,
                     )
                 return S_norm
             S_norm = self.max_norm * S_norm
@@ -318,7 +320,9 @@ def denormalize(self, S: np.ndarray) -> np.ndarray:
             if self.symmetric_norm:
                 if self.clip_norm:
                     S_denorm = np.clip(
-                        S_denorm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
+                        S_denorm,
+                        -self.max_norm,  # pylint: disable=invalid-unary-operand-type
+                        self.max_norm,
                     )
                 S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
                 return S_denorm + self.ref_level_db
@@ -351,9 +355,9 @@ def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np.
             if key in skip_parameters:
                 continue
             if key not in ["sample_rate", "trim_db"]:
-                assert (
-                    stats_config[key] == self.__dict__[key]
-                ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
+                assert stats_config[key] == self.__dict__[key], (
+                    f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
+                )
         return mel_mean, mel_std, linear_mean, linear_std, stats_config
 
     # pylint: disable=attribute-defined-outside-init
diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py
index 4e8f3825b9..d24733977a 100644
--- a/TTS/utils/samplers.py
+++ b/TTS/utils/samplers.py
@@ -49,9 +49,9 @@ def __init__(
         label_key="class_name",
     ):
         super().__init__(dataset_items)
-        assert (
-            batch_size % (num_classes_in_batch * num_gpus) == 0
-        ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."
+        assert batch_size % (num_classes_in_batch * num_gpus) == 0, (
+            "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."
+        )
 
         label_indices = {}
         for idx, item in enumerate(dataset_items):
diff --git a/TTS/vc/layers/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py
index cddacd69ab..cf31a866de 100644
--- a/TTS/vc/layers/freevc/wavlm/modules.py
+++ b/TTS/vc/layers/freevc/wavlm/modules.py
@@ -330,7 +330,7 @@ def __init__(
         self.encoder_decoder_attention = encoder_decoder_attention
 
         assert not self.self_attention or self.qkv_same_dim, (
-            "Self-attention requires query, key and " "value to be of the same size"
+            "Self-attention requires query, key and value to be of the same size"
         )
 
         k_bias = True
diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py
index c5b8c19c32..26f385c267 100644
--- a/TTS/vc/layers/freevc/wavlm/wavlm.py
+++ b/TTS/vc/layers/freevc/wavlm/wavlm.py
@@ -67,8 +67,7 @@ def compute_mask_indices(
 
     all_num_mask = int(
         # add a random number for probabilistic rounding
-        mask_prob * all_sz / float(mask_length)
-        + np.random.rand()
+        mask_prob * all_sz / float(mask_length) + np.random.rand()
     )
 
     all_num_mask = max(min_masks, all_num_mask)
@@ -79,8 +78,7 @@ def compute_mask_indices(
             sz = all_sz - padding_mask[i].long().sum().item()
             num_mask = int(
                 # add a random number for probabilistic rounding
-                mask_prob * sz / float(mask_length)
-                + np.random.rand()
+                mask_prob * sz / float(mask_length) + np.random.rand()
             )
             num_mask = max(min_masks, num_mask)
         else:
@@ -154,9 +152,7 @@ def arrange(s, e, length, keep_length):
 
 class WavLMConfig:
     def __init__(self, cfg=None):
-        self.extractor_mode: str = (
-            "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
-        )
+        self.extractor_mode: str = "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
         self.encoder_layers: int = 12  # num encoder layers in the transformer
 
         self.encoder_embed_dim: int = 768  # encoder embedding dimension
@@ -165,9 +161,7 @@ def __init__(self, cfg=None):
         self.activation_fn: str = "gelu"  # activation function to use
 
         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
-        self.conv_feature_layers: str = (
-            "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
-        )
+        self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
         self.conv_bias: bool = False  # include bias in conv encoder
         self.feature_grad_mult: float = 1.0  # multiply feature extractor var grads by this
 
diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py
index c0fe766b7c..9f107edbe0 100644
--- a/TTS/vc/models/base_vc.py
+++ b/TTS/vc/models/base_vc.py
@@ -199,9 +199,9 @@ def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]:
                 extra_frames = dur.sum() - mel_lengths[idx]
                 largest_idxs = torch.argsort(-dur)[:extra_frames]
                 dur[largest_idxs] -= 1
-                assert (
-                    dur.sum() == mel_lengths[idx]
-                ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
+                assert dur.sum() == mel_lengths[idx], (
+                    f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
+                )
                 durations[idx, : text_lengths[idx]] = dur
 
         # set stop targets wrt reduction factor
diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py
index c0882c701f..076545f8a2 100644
--- a/TTS/vocoder/datasets/gan_dataset.py
+++ b/TTS/vocoder/datasets/gan_dataset.py
@@ -128,9 +128,9 @@ def load_item(self, idx):
                 # correct the audio length wrt padding applied in stft
                 audio = np.pad(audio, (0, self.hop_len), mode="edge")
                 audio = audio[: mel.shape[-1] * self.hop_len]
-                assert (
-                    mel.shape[-1] * self.hop_len == audio.shape[-1]
-                ), f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}"
+                assert mel.shape[-1] * self.hop_len == audio.shape[-1], (
+                    f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}"
+                )
 
         audio = torch.from_numpy(audio).float().unsqueeze(0)
         mel = torch.from_numpy(mel).float().squeeze(0)
diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py
index 3ae9015451..435330bebe 100644
--- a/TTS/vocoder/datasets/wavegrad_dataset.py
+++ b/TTS/vocoder/datasets/wavegrad_dataset.py
@@ -102,9 +102,9 @@ def load_item(self, idx):
                 audio = np.pad(
                     audio, (0, self.seq_len + self.pad_short - len(audio)), mode="constant", constant_values=0.0
                 )
-            assert (
-                audio.shape[-1] >= self.seq_len + self.pad_short
-            ), f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}"
+            assert audio.shape[-1] >= self.seq_len + self.pad_short, (
+                f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}"
+            )
 
             # correct the audio length wrt hop length
             p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1]
diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py
index 0fad81864e..81a1f30884 100644
--- a/TTS/vocoder/layers/losses.py
+++ b/TTS/vocoder/layers/losses.py
@@ -224,9 +224,9 @@ class GeneratorLoss(nn.Module):
 
     def __init__(self, C):
         super().__init__()
-        assert not (
-            C.use_mse_gan_loss and C.use_hinge_gan_loss
-        ), " [!] Cannot use HingeGANLoss and MSEGANLoss together."
+        assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), (
+            " [!] Cannot use HingeGANLoss and MSEGANLoss together."
+        )
 
         self.use_stft_loss = C.use_stft_loss if "use_stft_loss" in C else False
         self.use_subband_stft_loss = C.use_subband_stft_loss if "use_subband_stft_loss" in C else False
@@ -311,9 +311,9 @@ class DiscriminatorLoss(nn.Module):
 
     def __init__(self, C):
         super().__init__()
-        assert not (
-            C.use_mse_gan_loss and C.use_hinge_gan_loss
-        ), " [!] Cannot use HingeGANLoss and MSEGANLoss together."
+        assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), (
+            " [!] Cannot use HingeGANLoss and MSEGANLoss together."
+ ) self.use_mse_gan_loss = C.use_mse_gan_loss self.use_hinge_gan_loss = C.use_hinge_gan_loss diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 8913a1132e..ab1a56e7fc 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -175,9 +175,9 @@ def location_variable_convolution(x, kernel, bias, dilation, hop_size): batch, _, in_length = x.shape batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == ( - kernel_length * hop_size - ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + assert in_length == (kernel_length * hop_size), ( + f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + ) padding = dilation * int((kernel_size - 1) / 2) x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 42dfef32b7..ba3852e795 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -204,7 +204,12 @@ def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tup return figures, audios def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) @@ -218,7 +223,12 @@ def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tu return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> tuple[dict, np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index e8f175ed17..4398300f8e 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -306,9 +306,7 @@ def remove_weight_norm(self): remove_parametrizations(self.conv_pre, "weight") remove_parametrizations(self.conv_post, "weight") - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 03c971afa4..53ed700755 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -84,9 +84,7 @@ def remove_weight_norm(self): except ValueError: layer.remove_weight_norm() - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) 
         self.load_state_dict(state["model"])
         if eval:
diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py
index f4ef3a0734..71b38d4c0d 100644
--- a/TTS/vocoder/models/parallel_wavegan_generator.py
+++ b/TTS/vocoder/models/parallel_wavegan_generator.py
@@ -108,9 +108,9 @@ def forward(self, c):
         # perform upsampling
         if c is not None and self.upsample_net is not None:
             c = self.upsample_net(c)
-            assert (
-                c.shape[-1] == x.shape[-1]
-            ), f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}"
+            assert c.shape[-1] == x.shape[-1], (
+                f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}"
+            )

         # encode to hidden representation
         x = self.first_conv(x)
@@ -155,9 +155,7 @@ def _apply_weight_norm(m):
     def receptive_field_size(self):
         return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size)

-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py
index 16c66e235b..b1a4a26562 100644
--- a/TTS/vocoder/models/wavegrad.py
+++ b/TTS/vocoder/models/wavegrad.py
@@ -217,9 +217,7 @@ def apply_weight_norm(self):
         self.out_conv = weight_norm(self.out_conv)
         self.y_conv = weight_norm(self.y_conv)

-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
@@ -257,7 +255,12 @@ def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]:
         return {"model_output": noise_hat}, {"loss": loss}

     def train_log(  # pylint: disable=no-self-use
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        batch: dict,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> tuple[dict, np.ndarray]:
         pass

@@ -266,7 +269,12 @@ def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]:
         return self.train_step(batch, criterion)

     def eval_log(  # pylint: disable=no-self-use
-        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        batch: dict,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> None:
         pass

diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py
index 2fe55f91bc..5a93f125ba 100644
--- a/TTS/vocoder/models/wavernn.py
+++ b/TTS/vocoder/models/wavernn.py
@@ -225,9 +225,9 @@ class of models has however remained an elusive problem. With a focus on text-to
         self.aux_dims = self.args.res_out_dims // 4

         if self.args.use_upsample_net:
-            assert (
-                np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length
-            ), " [!] upsample scales needs to be equal to hop_length"
+            assert np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length, (
+                " [!] upsample scales needs to be equal to hop_length"
+            )
             self.upsample = UpsampleNetwork(
                 self.args.feat_dims,
                 self.args.upsample_factors,
@@ -527,9 +527,7 @@ def xfade_and_unfold(y, target, overlap):

         return unfolded

-    def load_checkpoint(
-        self, config, checkpoint_path, eval=False, cache=False
-    ):  # pylint: disable=unused-argument, redefined-builtin
+    def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False):  # pylint: disable=unused-argument, redefined-builtin
         state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
@@ -556,7 +554,10 @@ def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]:

     @torch.no_grad()
     def test(
-        self, assets: dict, test_loader: "DataLoader", output: dict  # pylint: disable=unused-argument
+        self,
+        assets: dict,
+        test_loader: "DataLoader",
+        output: dict,  # pylint: disable=unused-argument
     ) -> tuple[dict, dict]:
         ap = self.ap
         figures = {}
@@ -578,7 +579,11 @@ def test(
         return figures, audios

     def test_log(
-        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+        self,
+        outputs: dict,
+        logger: "Logger",
+        assets: dict,
+        steps: int,  # pylint: disable=unused-argument
     ) -> tuple[dict, np.ndarray]:
         figures, audios = outputs
         logger.eval_figures(steps, figures)
diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py
index f9067530e6..370a541b97 100644
--- a/tests/text_tests/test_phonemizer.py
+++ b/tests/text_tests/test_phonemizer.py
@@ -240,12 +240,8 @@ def test_is_available(self):
 class TestBN_Phonemizer(unittest.TestCase):
     def setUp(self):
         self.phonemizer = BN_Phonemizer()
-        self._TEST_CASES = (
-            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
-        )
-        self._EXPECTED = (
-            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
-        )
+        self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
+        self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"

     def test_phonemize(self):
         self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED)
diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py
index 25c169eddd..f5d342bb00 100644
--- a/tests/text_tests/test_text_cleaners.py
+++ b/tests/text_tests/test_text_cleaners.py
@@ -45,11 +45,11 @@ def test_normalize_unicode() -> None:
         ("na\u0303", "nã"),
         ("o\u0302u", "ôu"),
         ("n\u0303", "ñ"),
-        ("\u4E2D\u56FD", "中国"),
+        ("\u4e2d\u56fd", "中国"),
         ("niño", "niño"),
         ("a\u0308", "ä"),
         ("\u3053\u3093\u306b\u3061\u306f", "こんにちは"),
-        ("\u03B1\u03B2", "αβ"),
+        ("\u03b1\u03b2", "αβ"),
     ]
     for arg, expect in test_cases:
         assert normalize_unicode(arg) == expect
diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py
index 9a8027736e..72069bf943 100644
--- a/tests/tts_tests/test_tacotron2_model.py
+++ b/tests/tts_tests/test_tacotron2_model.py
@@ -72,9 +72,9 @@ def test_train_step(self):  # pylint: disable=no-self-use
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             # ignore pre-higway layer since it works conditional
             # if count not in [145, 59]:
-            assert (
-                param != param_ref
-            ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            assert (param != param_ref).any(), (
+                f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            )
             count += 1

@@ -131,9 +131,9 @@ def test_train_step():
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
         # if count not in [145, 59]:
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

@@ -198,9 +198,9 @@ def test_train_step(self):
             if name == "gst_layer.encoder.recurrence.weight_hh_l0":
                 # print(param.grad)
                 continue
-            assert (
-                param != param_ref
-            ).any(), f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            assert (param != param_ref).any(), (
+                f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            )
             count += 1

         # with file gst style
@@ -254,9 +254,9 @@ def test_train_step(self):
             if name == "gst_layer.encoder.recurrence.weight_hh_l0":
                 # print(param.grad)
                 continue
-            assert (
-                param != param_ref
-            ).any(), f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            assert (param != param_ref).any(), (
+                f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            )
             count += 1

@@ -321,9 +321,9 @@ def test_train_step():
     count = 0
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

@@ -384,7 +384,7 @@ def test_train_step():
         name, param = name_param
         if name == "gst_layer.encoder.recurrence.weight_hh_l0":
             continue
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1
diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py
index 3976b9ae8d..5f9af86e7e 100644
--- a/tests/tts_tests/test_tacotron_model.py
+++ b/tests/tts_tests/test_tacotron_model.py
@@ -71,9 +71,9 @@ def test_train_step():
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
         # if count not in [145, 59]:
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

@@ -127,9 +127,9 @@ def test_train_step():
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
         # if count not in [145, 59]:
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

@@ -186,9 +186,9 @@ def test_train_step():
     count = 0
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

     # with file gst style
@@ -238,9 +238,9 @@ def test_train_step():
     count = 0
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

@@ -305,9 +305,9 @@ def test_train_step():
     count = 0
     for param, param_ref in zip(model.parameters(), model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1

@@ -366,7 +366,7 @@ def test_train_step():
         name, param = name_param
         if name == "gst_layer.encoder.recurrence.weight_hh_l0":
             continue
-        assert (
-            param != param_ref
-        ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        assert (param != param_ref).any(), (
+            f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+        )
         count += 1
diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py
index f0b347b895..790439ecb2 100644
--- a/tests/tts_tests/test_vits.py
+++ b/tests/tts_tests/test_vits.py
@@ -373,9 +373,9 @@ def _check_parameter_changes(model, model_ref):
             name = item1[0]
             param = item1[1]
             param_ref = item2[1]
-            assert (
-                param != param_ref
-            ).any(), f"param {name} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            assert (param != param_ref).any(), (
+                f"param {name} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            )
             count = count + 1

     def _create_batch(self, config, batch_size):
diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py
index 967e9ecb9e..c92063576f 100644
--- a/tests/tts_tests2/test_glow_tts.py
+++ b/tests/tts_tests2/test_glow_tts.py
@@ -42,9 +42,9 @@ def _create_inputs(batch_size=8):
     def _check_parameter_changes(model, model_ref):
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
-            assert (
-                param != param_ref
-            ).any(), f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            assert (param != param_ref).any(), (
+                f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}"
+            )
             count += 1

     def test_init_multispeaker(self):
@@ -241,10 +241,10 @@ def _test_inference_with_MAS(self, batch_size):
         # inference encoder and decoder with MAS
         y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths)
         y2 = model.decoder_inference(mel_spec, mel_lengths)
-        assert (
-            y2["model_outputs"].shape == y["model_outputs"].shape
-        ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
-            y["model_outputs"].shape, y2["model_outputs"].shape
+        assert y2["model_outputs"].shape == y["model_outputs"].shape, (
+            "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
+                y["model_outputs"].shape, y2["model_outputs"].shape
+            )
         )

     def test_inference_with_MAS(self):
diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py
index dd45d6941f..784e32a68d 100644
--- a/tests/vc_tests/test_freevc.py
+++ b/tests/vc_tests/test_freevc.py
@@ -80,9 +80,9 @@ def _test_inference(self, batch_size):
         wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long)

         output_wav = model.inference(wavlm_vec, None, mel, wavlm_vec_lengths)
-        assert (
-            output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1]
-        ), f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}"
+        assert output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1], (
+            f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}"
+        )

     def test_inference(self):
         self._test_inference(1)
@@ -95,9 +95,9 @@ def test_voice_conversion(self):

         source_wav, target_wav = self._create_inputs_inference()
         output_wav = model.voice_conversion(source_wav, target_wav)
-        assert (
-            output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length
-        ), f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}"
+        assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, (
+            f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}"
+        )

     def test_train_step(self): ...
diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py
index c9f7ae3931..703873ea47 100644
--- a/tests/vc_tests/test_openvoice.py
+++ b/tests/vc_tests/test_openvoice.py
@@ -16,7 +16,6 @@

 class TestOpenVoice(unittest.TestCase):
-
     @staticmethod
     def _create_inputs_inference():
         source_wav = torch.rand(16100)
@@ -37,6 +36,6 @@ def test_voice_conversion(self):

         source_wav, target_wav = self._create_inputs_inference()
         output_wav = model.voice_conversion(source_wav, target_wav)
-        assert (
-            output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length
-        ), f"{output_wav.shape} != {source_wav.shape}"
+        assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, (
+            f"{output_wav.shape} != {source_wav.shape}"
+        )