-
Notifications
You must be signed in to change notification settings - Fork 66
/
Copy pathhparams.py
executable file
·70 lines (59 loc) · 2.5 KB
/
hparams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Copyright 2021 Sony Group Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from utils.hparams import HParams
# Default hyper-parameter set, exposed as the module-level `hparams` object.
# Every value is passed to `HParams` as a keyword argument; argument order
# is kept stable on purpose.
hparams = HParams(
    # ---- data locations ---------------------------------------------------
    save_data_dir="precomputed-vctk/",  # where the precomputed features live
    # Despite the `_dir` suffix, the default value points at a text file.
    speaker_dir="data/list_of_speakers.txt",

    # ---- weighting hyper-parameters ---------------------------------------
    lambda_rec=10.0,  # weight of the reconstruction term
    lambda_con=10.0,  # weight of the content preservation term
    lambda_kld=0.02,  # weight of the KL divergence term

    # ---- optimization -----------------------------------------------------
    batch_size=8,                 # batch size
    epoch=500,                    # number of epochs
    print_frequency=50,           # iterations between printouts
    epochs_per_checkpoint=50,     # epochs between checkpoints
    output_path="./log/output/",  # directory where results are saved
    seed=123456,                  # random seed
    g_lr=1e-4,                    # generator learning rate
    d_lr=1e-4,                    # discriminator learning rate
    # NOTE(review): beta1/beta2/weight_decay look like optimizer settings
    # (presumably Adam-style betas) -- confirm against the training script.
    beta1=0.5,
    beta2=0.9,
    weight_decay=0,
    sr=22050,                 # sampling rate
    segment_length=32768,     # sample length
    n_speaker_embedding=128,  # speaker embedding dimension

    # ---- discriminator network --------------------------------------------
    # NOTE(review): ndf, num_D, downsamp_factor and n_D_updates are
    # undocumented here; their meaning must be checked in the model code.
    ndf=16,
    n_layers_D=4,  # number of discriminator layers
    num_D=3,
    downsamp_factor=4,
    n_D_updates=2,

    # ---- generator network ------------------------------------------------
    # NOTE(review): these four values are undocumented here; verify their
    # exact role in the generator definition before changing them.
    ngf=32,
    n_residual_layers=4,
    ratios=[8, 8, 2, 2],
    bottleneck_dim=4,

    # ---- speaker network --------------------------------------------------
    n_spk_layers=5,  # number of speaker encoder layers

    # ---- multi-scale spectral loss ----------------------------------------
    window_sizes=[2048, 1024, 512],

    # ---- data augmentation ------------------------------------------------
    scale_low=0.25,  # random scaling: lower bound
    scale_high=1.0,  # random scaling: upper bound
    split_low=30,    # random shuffle: lower bound
    # NOTE(review): the key is spelled `split_hight` (sic). It is kept
    # verbatim because consumers presumably look it up under this name.
    split_hight=45,  # random shuffle: upper bound
    max_jitter_steps=30,  # random jitter
)