89 lines
2.8 KiB
Python
89 lines
2.8 KiB
Python
#import tensorflow as tf
|
|
from models.tacotron2.text import symbols
|
|
|
|
|
|
def create_hparams(hparams_string=None, verbose=False):
|
|
"""Create model hyperparameters. Parse nondefault from given string."""
|
|
|
|
hparams = dict(
|
|
################################
|
|
# Experiment Parameters #
|
|
################################
|
|
epochs=500,
|
|
iters_per_checkpoint=1000,
|
|
seed=1234,
|
|
dynamic_loss_scaling=True,
|
|
fp16_run=False,
|
|
distributed_run=False,
|
|
dist_backend="nccl",
|
|
dist_url="tcp://localhost:54321",
|
|
cudnn_enabled=True,
|
|
cudnn_benchmark=False,
|
|
ignore_layers=['embedding.weight'],
|
|
|
|
################################
|
|
# Data Parameters #
|
|
################################
|
|
load_mel_from_disk=False,
|
|
training_files='filelists/ljs_audio_text_train_filelist.txt',
|
|
validation_files='filelists/ljs_audio_text_val_filelist.txt',
|
|
text_cleaners=['english_cleaners'],
|
|
|
|
################################
|
|
# Audio Parameters #
|
|
################################
|
|
max_wav_value=32768.0,
|
|
input_sample_rate=22050, # When different from sampling_rate, dataset automatically interpolates to sampling_rate
|
|
sampling_rate=22050,
|
|
filter_length=1024,
|
|
hop_length=256, # This means a MEL is 1/256th the equivalent audio.
|
|
win_length=1024,
|
|
n_mel_channels=80,
|
|
mel_fmin=0.0,
|
|
mel_fmax=8000.0,
|
|
|
|
################################
|
|
# Model Parameters #
|
|
################################
|
|
n_symbols=len(symbols),
|
|
symbols_embedding_dim=512,
|
|
|
|
# Encoder parameters
|
|
encoder_kernel_size=5,
|
|
encoder_n_convolutions=3,
|
|
encoder_embedding_dim=512,
|
|
|
|
# Decoder parameters
|
|
n_frames_per_step=1, # currently only 1 is supported
|
|
decoder_rnn_dim=1024,
|
|
prenet_dim=256,
|
|
max_decoder_steps=1000,
|
|
gate_threshold=0.5,
|
|
p_attention_dropout=0.1,
|
|
p_decoder_dropout=0.1,
|
|
|
|
# Attention parameters
|
|
attention_rnn_dim=1024,
|
|
attention_dim=128,
|
|
|
|
# Location Layer parameters
|
|
attention_location_n_filters=32,
|
|
attention_location_kernel_size=31,
|
|
|
|
# Mel-post processing network parameters
|
|
postnet_embedding_dim=512,
|
|
postnet_kernel_size=5,
|
|
postnet_n_convolutions=5,
|
|
|
|
################################
|
|
# Optimization Hyperparameters #
|
|
################################
|
|
use_saved_learning_rate=False,
|
|
learning_rate=1e-3,
|
|
weight_decay=1e-6,
|
|
grad_clip_thresh=1.0,
|
|
batch_size=64,
|
|
mask_padding=True # set model's padded outputs to padded values
|
|
)
|
|
|
|
return hparams |