# TorToiSe autoregressive (GPT) fine-tuning configuration template for
# DL-Art-School's ExtensibleTrainer. All `${...}` tokens are whole-value (or,
# under `path:`, whole-line) placeholders substituted by the generator script
# BEFORE this file is parsed as YAML.
name: ${name}
model: extensibletrainer
scale: 1
gpu_ids: [0] # Superfluous, redundant, unnecessary, the way you launch the training script will set this
start_step: 0
checkpointing_enabled: true
fp16: ${float16}
wandb: false
use_tb_logger: true

datasets:
  train:
    name: ${dataset_name}
    n_workers: ${workers}
    batch_size: ${batch_size}
    mode: paired_voice_audio
    path: ${dataset_path}
    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: true
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: true
    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: false
  val: # I really do not care about validation right now
    name: ${validation_name}
    n_workers: 1
    batch_size: 1
    mode: paired_voice_audio
    path: ${validation_path}
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: true
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: true
    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: false

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float ${learning_rate} # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]
    losses:
      text_ce:
        type: direct
        weight: ${text_ce_lr_weight}
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: false # missing in uv3/4
      use_mel_codes_as_input: true # ditto
      checkpointing: true
      # types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
      # only_alignment_head: False # uv3/4

path:
  # NOTE(review): the two `${...}` lines below are whole-line placeholders —
  # presumably the generator replaces each with a `key: value` line (e.g.
  # `pretrain_model_gpt: <ckpt>` / `resume_state: <state>`) or drops it; confirm
  # against the script that renders this template.
  ${pretrain_model_gpt}
  strict_load: true
  ${resume_state}

train:
  niter: ${iterations}
  warmup_iter: -1
  mega_batch_factor: ${gradient_accumulation_size}
  val_freq: ${validation_rate}

  ema_enabled: false # I really don't think EMA matters

  default_lr_scheme: MultiStepLR
  gen_lr_steps: ${gen_lr_steps} # [50000, 100000, 140000, 180000]
  lr_gamma: 0.5

eval:
  output_state: gen
  injectors:
    gen_inj_eval:
      type: generator
      generator: generator
      in: hq
      out: [gen, codebook_commitment_loss]

logger:
  print_freq: ${print_rate}
  save_checkpoint_freq: ${save_rate}
  visuals: [gen, mel]
  visual_debug_rate: ${print_rate}
  is_mel_spectrogram: true