Renamed "mega batch factor" to the actual, real term: gradient accumulation factor. Fixed halting training so it actually kills the training process and frees up its resources. Cleaned up some of the gradient accumulation logic (so many brain worms and wrong assumptions crept in from testing on low batch sizes). Read the training section in the wiki for more details.
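Most of the gradient accumulation cleanup amounts to sanity-checking the factor against the batch size: the factor cannot usefully exceed the batch size, and the batch size has to divide evenly by it. The following is a minimal, hypothetical sketch of that kind of check, for illustration only; the function name and the exact adjustment rule are assumptions, while the variable names and the user-facing message mirror the ones in the diff below.

def validate_gradient_accumulation(batch_size, gradient_accumulation_size, messages):
    # The factor cannot exceed the batch size, since each batch is split into
    # batch_size / gradient_accumulation_size micro-batches.
    if gradient_accumulation_size > batch_size:
        gradient_accumulation_size = batch_size
    # Shrink the factor until it divides the batch size cleanly, then report it.
    if batch_size % gradient_accumulation_size != 0:
        while batch_size % gradient_accumulation_size != 0:
            gradient_accumulation_size -= 1
        messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")
    return gradient_accumulation_size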
-gpu_ids: [0] # <-- unless you have multiple gpus, use this
+gpu_ids: [0] # Superfluous, redundant, unnecessary, the way you launch the training script will set this
 start_step: 0
-checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
+checkpointing_enabled: true
-fp16: ${float16} # might want to check this out
+fp16: ${float16}
-wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
+wandb: false
 use_tb_logger: true
 datasets:
   train:
     name: ${dataset_name}
-    n_workers: 8 # idk what this does
+    n_workers: 8
-    batch_size: ${batch_size} # This leads to ~16GB of vram usage on my 3090.
+    batch_size: ${batch_size}
     mode: paired_voice_audio
     path: ${dataset_path}
-    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
+    fetcher_mode: ['lj']
     phase: train
     max_wav_length: 255995
     max_text_length: 200
@@ -29,11 +29,11 @@ datasets:
   val: # I really do not care about validation right now
     name: ${validation_name}
     n_workers: 1
-    batch_size: 1 # this could be higher probably
+    batch_size: 1
     mode: paired_voice_audio
     path: ${validation_path}
     fetcher_mode: ['lj']
-    phase: val # might be broken idk
+    phase: val
     max_wav_length: 255995
     max_text_length: 200
     sample_rate: 22050
@@ -47,18 +47,18 @@ datasets:
 steps:
   gpt_train:
     training: gpt
-    loss_log_buffer: 500 # no idea what this does
+    loss_log_buffer: 500
     # Generally follows the recipe from the DALLE paper.
     optimizer: adamw # this should be adamw_zero if you're using distributed training
     optimizer_params:
-      lr: !!float ${learning_rate} # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
+      lr: !!float ${learning_rate} # originally: 1e-4
       weight_decay: !!float 1e-2
       beta1: 0.9
       beta2: 0.96
     clip_grad_eps: 4
     injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
     which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
     kwargs:
-      layers: 30 # WAS 8
+      layers: 30 # originally: 8
-      model_dim: 1024 # WAS 512
+      model_dim: 1024 # originally: 512
-      heads: 16 # WAS 8
+      heads: 16 # originally: 8
-      max_text_tokens: 402 # WAS 120
+      max_text_tokens: 402 # originally: 120
-      max_mel_tokens: 604 # WAS 250
+      max_mel_tokens: 604 # originally: 250
-      max_conditioning_inputs: 2 # WAS 1
+      max_conditioning_inputs: 2 # originally: 1
       mel_length_compression: 1024
       number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
       number_mel_codes: 8194
@@ -118,11 +118,10 @@ path:
   strict_load: true
   ${resume_state}
-# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
-train:
+train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
   niter: ${iterations}
   warmup_iter: -1
-  mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
+  mega_batch_factor: ${gradient_accumulation_size}
   val_freq: ${iterations}
   ema_enabled: false # I really don't think EMA matters
@@ -142,7 +141,7 @@ eval:
 logger:
   print_freq: ${print_rate}
   save_checkpoint_freq: ${save_rate} # CHANGEME: especially you should increase this, it's really slow
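For context on the mega_batch_factor / gradient accumulation setting above: judging by the OOM advice in its comment, it splits each training batch into that many micro-batches, accumulates gradients across them, and steps the optimizer once per full batch, trading a little compute for a much smaller peak memory footprint. Below is a generic PyTorch-style sketch of that idea, not the trainer's actual code; the messages that follow come from validating the factor against the batch size.

import torch

def accumulated_step(model, optimizer, loss_fn, inputs: torch.Tensor, targets: torch.Tensor, factor: int):
    # Split the full batch into `factor` micro-batches; this is why the batch
    # size is expected to divide evenly by the accumulation factor.
    optimizer.zero_grad()
    for x, y in zip(inputs.chunk(factor), targets.chunk(factor)):
        loss = loss_fn(model(x), y) / factor  # scale so the summed gradients match the full batch
        loss.backward()                       # gradients accumulate in .grad across micro-batches
    optimizer.step()                          # a single optimizer step per full batch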
messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")
messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
messages.append("Resume path specified, but does not exist. Disabling...")
messages.append("Resume path specified, but does not exist. Disabling...")
+    if bnb:
+        messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
     if half_p:
-        messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
-        if not os.path.exists(get_halfp_model_path()):
-            convert_to_halfp()
-    if bnb:
-        messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
+        if bnb:
+            half_p = False
+            messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...")
+        else:
+            messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
+            if not os.path.exists(get_halfp_model_path()):
+                convert_to_halfp()
messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations/epochs)} steps per epoch)")
messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations/epochs)} steps per epoch)")