# Forked from mrq/DL-Art-School
# Original listing: 98 lines, 2.2 KiB, YAML
#### general settings
# Run-wide options. The `|` / `||` separator lines left over from the web
# export have been removed; every key and value is unchanged.
name: train_tacotron2_lj
use_tb_logger: true
gpu_ids: [0]
start_step: -1
fp16: false
checkpointing_enabled: true
wandb: false

# Training data source.
# NOTE(review): nesting was reconstructed; the exported source had lost all
# indentation. Confirm against the consumer's schema.
datasets:
  train:
    name: lj
    n_workers: 1
    batch_size: 72
    mode: nv_tacotron
    # Machine-specific absolute Windows path; adjust for your environment.
    # Single-quoted so the backslashes are always taken literally.
    path: 'E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_train_filelist.txt'

# Network definitions. `mel_gen` is the name referenced by the `steps` and
# `eval` sections of this file.
# NOTE(review): nesting was reconstructed; the exported source had lost all
# indentation. Confirm that the hyper-parameters belong under `args`.
networks:
  mel_gen:
    type: generator
    which_model_G: nv_tacotron2
    args:
      encoder_kernel_size: 5
      encoder_n_convolutions: 3
      encoder_embedding_dim: 512
      decoder_rnn_dim: 1024
      prenet_dim: 256
      max_decoder_steps: 1000
      attention_rnn_dim: 1024
      attention_dim: 128
      attention_location_n_filters: 32
      attention_location_kernel_size: 31
      postnet_embedding_dim: 512
      postnet_kernel_size: 5
      postnet_n_convolutions: 5

#### path
# Checkpoint loading options.
# NOTE(review): the two commented-out entries point at diffusion/imageset
# checkpoints, which do not match this Tacotron2 experiment; they look like
# leftovers from another config. Replace the paths before enabling them.
path:
  # pretrain_model_generator: ../experiments/diffusion_unet_128_imageset_22000.pt
  strict_load: true
  # resume_state: ../experiments/train_imgset_unet_diffusion/training_state/54000.state

# Training step definition: which network is optimised, with what optimizer,
# and how its inputs/outputs and losses are wired together by key name.
# NOTE(review): nesting was reconstructed; the exported source had lost all
# indentation. In particular, `clip_grad_eps` is placed at step level (as a
# sibling of `optimizer_params`) by inference; confirm against the consumer.
steps:
  generator:
    training: mel_gen

    optimizer: adamw
    optimizer_params:
      # Explicit !!float tags force a float; some YAML 1.1 loaders read
      # dot-less exponent forms such as 1e-6 as strings.
      lr: !!float 1.2e-3
      weight_decay: !!float 1e-6
      beta1: 0.9
      beta2: 0.9999
    clip_grad_eps: 1.0

    injectors:
      mel:
        type: generator
        generator: mel_gen
        in: [padded_text, input_lengths, padded_mel, output_lengths]
        out: [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]
    losses:
      tacotron_loss:
        type: nv_tacotron2_loss
        weight: 1
        mel_target_key: padded_mel
        mel_output_key: mel_outputs
        mel_output_postnet_key: mel_outputs_postnet
        gate_target_key: padded_gate
        gate_output_key: gate_outputs

# Training schedule and learning-rate scheme.
# NOTE(review): nesting was reconstructed; the exported source had lost all
# indentation. Confirm against the consumer's schema.
train:
  niter: 500000
  warmup_iter: -1
  mega_batch_factor: 3
  ema_rate: 0.999
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [50000, 100000, 150000]
  lr_gamma: 0.5

# Evaluation setup for the `mel_gen` network.
# NOTE(review): nesting was reconstructed; the exported source had lost all
# indentation. `mode` and `path` are assumed to belong to `dataset`; confirm
# against the consumer's schema.
eval:
  evaluators:
    val:
      type: mel
      for: mel_gen
      batch_size: 16
      dataset:
        mode: nv_tacotron
        # Machine-specific absolute Windows path; adjust for your environment.
        # Single-quoted so the backslashes are always taken literally.
        path: 'E:\4k6k\datasets\audio\LJSpeech-1.1\ljs_audio_text_val_filelist.txt'

# Logging, checkpointing and visual-debug output settings.
# NOTE(review): nesting was reconstructed; the exported source had lost all
# indentation. Confirm against the consumer's schema.
logger:
  print_freq: 30
  save_checkpoint_freq: 500
  visuals: [mel_outputs, padded_mel]
  is_mel_spectrogram: true
  visual_debug_rate: 100