Merge pull request #102 from TensorSpeech/refactor-fastspeech2

🚀 Refactor FastSpeech, separate encoder/decoder params.
TensorSpeech · Jul 9, 2020 · f65547c
2 parents 21a8f36 + be94cef
commit f65547c
Show file tree

Hide file tree

Showing 10 changed files with 801 additions and 1,013 deletions.
diff --git a/examples/fastspeech/conf/fastspeech.v1.yaml b/examples/fastspeech/conf/fastspeech.v1.yaml
@@ -15,16 +15,24 @@ format: "npy"
 ###########################################################
 fastspeech_params:
     n_speakers: 1
-    hidden_size: 384
-    num_hidden_layers: 4
-    num_attention_heads: 2
-    intermediate_size: 1024
-    intermediate_kernel_size: 3
+    encoder_hidden_size: 384
+    encoder_num_hidden_layers: 4
+    encoder_num_attention_heads: 2
+    encoder_attention_head_size: 192  # encoder_hidden_size // encoder_num_attention_heads
+    encoder_intermediate_size: 1024
+    encoder_intermediate_kernel_size: 3
+    encoder_hidden_act: "mish"
+    decoder_hidden_size: 384
+    decoder_num_hidden_layers: 4
+    decoder_num_attention_heads: 2
+    decoder_attention_head_size: 192  # decoder_hidden_size // decoder_num_attention_heads
+    decoder_intermediate_size: 1024
+    decoder_intermediate_kernel_size: 3
+    decoder_hidden_act: "mish"
     num_duration_conv_layers: 2
     duration_predictor_filters: 256
     duration_predictor_kernel_sizes: 3
     num_mels: 80
-    hidden_act: "mish"
     hidden_dropout_prob: 0.1
     attention_probs_dropout_prob: 0.1
     duration_predictor_dropout_probs: 0.1

diff --git a/examples/fastspeech/conf/fastspeech.v3.yaml b/examples/fastspeech/conf/fastspeech.v3.yaml
@@ -15,16 +15,24 @@ format: "npy"
 ###########################################################
 fastspeech_params:
     n_speakers: 1
-    hidden_size: 384
-    num_hidden_layers: 4
-    num_attention_heads: 2
-    intermediate_size: 1024
-    intermediate_kernel_size: 3
+    encoder_hidden_size: 384
+    encoder_num_hidden_layers: 4
+    encoder_num_attention_heads: 2
+    encoder_attention_head_size: 192  # encoder_hidden_size // encoder_num_attention_heads
+    encoder_intermediate_size: 1024
+    encoder_intermediate_kernel_size: 3
+    encoder_hidden_act: "mish"
+    decoder_hidden_size: 384
+    decoder_num_hidden_layers: 4
+    decoder_num_attention_heads: 2
+    decoder_attention_head_size: 192  # decoder_hidden_size // decoder_num_attention_heads
+    decoder_intermediate_size: 1024
+    decoder_intermediate_kernel_size: 3
+    decoder_hidden_act: "mish"
     num_duration_conv_layers: 2
     duration_predictor_filters: 256
     duration_predictor_kernel_sizes: 3
     num_mels: 80
-    hidden_act: "mish"
     hidden_dropout_prob: 0.2
     attention_probs_dropout_prob: 0.1
     duration_predictor_dropout_probs: 0.2

diff --git a/examples/fastspeech2/conf/fastspeech2.v1.yaml b/examples/fastspeech2/conf/fastspeech2.v1.yaml
@@ -15,31 +15,31 @@ format: "npy"
 ###########################################################
 fastspeech_params:
     n_speakers: 1
-    hidden_size: 384
-    num_hidden_layers: 4
-    num_attention_heads: 2
-    attention_head_size: 192  # hidden_size // num_attention_heads
-    intermediate_size: 1024
-    intermediate_kernel_size: 3
-    num_duration_conv_layers: 2
-    duration_predictor_filters: 256
-    duration_predictor_kernel_sizes: 3
+    encoder_hidden_size: 384
+    encoder_num_hidden_layers: 4
+    encoder_num_attention_heads: 2
+    encoder_attention_head_size: 192  # encoder_hidden_size // encoder_num_attention_heads
+    encoder_intermediate_size: 1024
+    encoder_intermediate_kernel_size: 3
+    encoder_hidden_act: "mish"
+    decoder_hidden_size: 384
+    decoder_num_hidden_layers: 4
+    decoder_num_attention_heads: 2
+    decoder_attention_head_size: 192  # decoder_hidden_size // decoder_num_attention_heads
+    decoder_intermediate_size: 1024
+    decoder_intermediate_kernel_size: 3
+    decoder_hidden_act: "mish"
+    variant_prediction_num_conv_layers: 2
+    variant_predictor_filter: 256
+    variant_predictor_kernel_size: 3
+    variant_predictor_dropout_rate: 0.5
     num_mels: 80
-    hidden_act: "mish"
     hidden_dropout_prob: 0.2
     attention_probs_dropout_prob: 0.1
-    duration_predictor_dropout_probs: 0.5
     max_position_embeddings: 2048
     initializer_range: 0.02
     output_attentions: False
     output_hidden_states: False
-    f0_energy_predictor_filters: 256
-    f0_energy_predictor_kernel_sizes: 3
-    f0_energy_predictor_dropout_probs: 0.5
-    f0_kernel_size: 9
-    energy_kernel_size: 9
-    f0_dropout_rate: 0.5
-    energy_dropout_rate: 0.5
 
 ###########################################################
 #                  DATA LOADER SETTING                    #