1
0
Fork 0

Renamed "mega batch factor" to the proper term, "gradient accumulation factor"; fixed halting training not actually killing the training process and freeing up its resources; and cleaned up some of the gradient accumulation logic (it carried many brain worms and wrong assumptions from testing on low batch sizes). Read the training section in the wiki for more details.

master
mrq 2023-03-04 15:55:06 +07:00
parent 6d5e1e1a80
commit df24827b9a
3 changed files with 56 additions and 48 deletions

@ -1,21 +1,21 @@
name: ${name} name: ${name}
model: extensibletrainer model: extensibletrainer
scale: 1 scale: 1
gpu_ids: [0] # <-- unless you have multiple gpus, use this gpu_ids: [0] # Superfluous, redundant, unnecessary, the way you launch the training script will set this
start_step: 0 start_step: 0
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training. checkpointing_enabled: true
fp16: ${float16} # might want to check this out fp16: ${float16}
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled. wandb: false
use_tb_logger: true use_tb_logger: true
datasets: datasets:
train: train:
name: ${dataset_name} name: ${dataset_name}
n_workers: 8 # idk what this does n_workers: 8
batch_size: ${batch_size} # This leads to ~16GB of vram usage on my 3090. batch_size: ${batch_size}
mode: paired_voice_audio mode: paired_voice_audio
path: ${dataset_path} path: ${dataset_path}
fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format fetcher_mode: ['lj']
phase: train phase: train
max_wav_length: 255995 max_wav_length: 255995
max_text_length: 200 max_text_length: 200
@ -29,11 +29,11 @@ datasets:
val: # I really do not care about validation right now val: # I really do not care about validation right now
name: ${validation_name} name: ${validation_name}
n_workers: 1 n_workers: 1
batch_size: 1 # this could be higher probably batch_size: 1
mode: paired_voice_audio mode: paired_voice_audio
path: ${validation_path} path: ${validation_path}
fetcher_mode: ['lj'] fetcher_mode: ['lj']
phase: val # might be broken idk phase: val
max_wav_length: 255995 max_wav_length: 255995
max_text_length: 200 max_text_length: 200
sample_rate: 22050 sample_rate: 22050
@ -47,18 +47,18 @@ datasets:
steps: steps:
gpt_train: gpt_train:
training: gpt training: gpt
loss_log_buffer: 500 # no idea what this does loss_log_buffer: 500
# Generally follows the recipe from the DALLE paper. # Generally follows the recipe from the DALLE paper.
optimizer: adamw # this should be adamw_zero if you're using distributed training optimizer: adamw # this should be adamw_zero if you're using distributed training
optimizer_params: optimizer_params:
lr: !!float ${learning_rate} # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value** lr: !!float ${learning_rate} # originally: 1e-4
weight_decay: !!float 1e-2 weight_decay: !!float 1e-2
beta1: 0.9 beta1: 0.9
beta2: 0.96 beta2: 0.96
clip_grad_eps: 4 clip_grad_eps: 4
injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector injectors:
paired_to_mel: paired_to_mel:
type: torch_mel_spectrogram type: torch_mel_spectrogram
mel_norm_file: ./models/tortoise/clips_mel_norms.pth mel_norm_file: ./models/tortoise/clips_mel_norms.pth
@ -74,7 +74,7 @@ steps:
type: discrete_token type: discrete_token
in: paired_mel in: paired_mel
out: paired_mel_codes out: paired_mel_codes
dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
paired_fwd_text: paired_fwd_text:
type: generator type: generator
generator: gpt generator: gpt
@ -95,12 +95,12 @@ networks:
type: generator type: generator
which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter. which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
kwargs: kwargs:
layers: 30 # WAS 8 layers: 30 # originally: 8
model_dim: 1024 # WAS 512 model_dim: 1024 # originally: 512
heads: 16 # WAS 8 heads: 16 # originally: 8
max_text_tokens: 402 # WAS 120 max_text_tokens: 402 # originally: 120
max_mel_tokens: 604 # WAS 250 max_mel_tokens: 604 # originally: 250
max_conditioning_inputs: 2 # WAS 1 max_conditioning_inputs: 2 # originally: 1
mel_length_compression: 1024 mel_length_compression: 1024
number_text_tokens: 256 # supposed to be 255 for newer unified_voice files number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
number_mel_codes: 8194 number_mel_codes: 8194
@ -118,11 +118,10 @@ path:
strict_load: true strict_load: true
${resume_state} ${resume_state}
# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit) train:
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
niter: ${iterations} niter: ${iterations}
warmup_iter: -1 warmup_iter: -1
mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8]. mega_batch_factor: ${gradient_accumulation_size}
val_freq: ${iterations} val_freq: ${iterations}
ema_enabled: false # I really don't think EMA matters ema_enabled: false # I really don't think EMA matters
@ -142,7 +141,7 @@ eval:
logger: logger:
print_freq: ${print_rate} print_freq: ${print_rate}
save_checkpoint_freq: ${save_rate} # CHANGEME: especially you should increase this it's really slow save_checkpoint_freq: ${save_rate}
visuals: [gen, mel] visuals: [gen, mel]
visual_debug_rate: ${print_rate} visual_debug_rate: ${print_rate}
is_mel_spectrogram: true is_mel_spectrogram: true

@ -573,6 +573,9 @@ class TrainingState():
infos = {} infos = {}
highest_step = self.last_info_check_at highest_step = self.last_info_check_at
if not update:
self.losses = []
if use_tensorboard: if use_tensorboard:
logs = sorted([f'{self.dataset_dir}/tb_logger/{d}' for d in os.listdir(f'{self.dataset_dir}/tb_logger/') if d[:6] == "events" ]) logs = sorted([f'{self.dataset_dir}/tb_logger/{d}' for d in os.listdir(f'{self.dataset_dir}/tb_logger/') if d[:6] == "events" ])
if update: if update:
@ -816,6 +819,7 @@ def update_training_dataplot(config_path=None):
del training_state del training_state
training_state = None training_state = None
elif training_state.losses: elif training_state.losses:
training_state.load_losses()
update = gr.LinePlot.update(value=pd.DataFrame(training_state.losses)) update = gr.LinePlot.update(value=pd.DataFrame(training_state.losses))
return update return update
@ -837,7 +841,8 @@ def stop_training():
print("Killing training process...") print("Killing training process...")
training_state.killed = True training_state.killed = True
training_state.process.stdout.close() training_state.process.stdout.close()
training_state.process.terminate() #training_state.process.terminate()
training_state.process.send_signal(signal.SIGINT)
return_code = training_state.process.wait() return_code = training_state.process.wait()
training_state = None training_state = None
return f"Training cancelled: {return_code}" return f"Training cancelled: {return_code}"
@ -938,7 +943,7 @@ EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ): def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ):
return [int(iterations * d) for d in schedule] return [int(iterations * d) for d in schedule]
def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ): def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
name = f"{voice}-finetune" name = f"{voice}-finetune"
dataset_name = f"{voice}-train" dataset_name = f"{voice}-train"
dataset_path = f"./training/{voice}/train.txt" dataset_path = f"./training/{voice}/train.txt"
@ -959,12 +964,11 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
batch_size = int(lines / nearest_slice) batch_size = int(lines / nearest_slice)
messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {batch_size} ({nearest_slice} steps per epoch)") messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {batch_size} ({nearest_slice} steps per epoch)")
if batch_size == 1 and mega_batch_factor != 1: if gradient_accumulation_size == 0:
mega_batch_factor = 1 gradient_accumulation_size = 1
messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}") elif batch_size % gradient_accumulation_size != 0:
elif batch_size / mega_batch_factor < 2: gradient_accumulation_size = int(batch_size / gradient_accumulation_size)
mega_batch_factor = int(batch_size / 2) messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")
messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
iterations = calc_iterations(epochs=epochs, lines=lines, batch_size=batch_size) iterations = calc_iterations(epochs=epochs, lines=lines, batch_size=batch_size)
@ -980,14 +984,18 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
resume_path = None resume_path = None
messages.append("Resume path specified, but does not exist. Disabling...") messages.append("Resume path specified, but does not exist. Disabling...")
if half_p:
messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
if not os.path.exists(get_halfp_model_path()):
convert_to_halfp()
if bnb: if bnb:
messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !") messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
if half_p:
if bnb:
half_p = False
messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...")
else:
messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
if not os.path.exists(get_halfp_model_path()):
convert_to_halfp()
messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)") messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")
return ( return (
@ -995,14 +1003,14 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
text_ce_lr_weight, text_ce_lr_weight,
learning_rate_schedule, learning_rate_schedule,
batch_size, batch_size,
mega_batch_factor, gradient_accumulation_size,
print_rate, print_rate,
save_rate, save_rate,
resume_path, resume_path,
messages messages
) )
def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ): def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, gradient_accumulation_size=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
if not source_model: if not source_model:
source_model = f"./models/tortoise/autoregressive{'_half' if half_p else ''}.pth" source_model = f"./models/tortoise/autoregressive{'_half' if half_p else ''}.pth"
@ -1011,8 +1019,8 @@ def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weig
"batch_size": batch_size if batch_size else 64, "batch_size": batch_size if batch_size else 64,
"learning_rate": learning_rate if learning_rate else 1e-5, "learning_rate": learning_rate if learning_rate else 1e-5,
"gen_lr_steps": learning_rate_schedule if learning_rate_schedule else EPOCH_SCHEDULE, "gen_lr_steps": learning_rate_schedule if learning_rate_schedule else EPOCH_SCHEDULE,
"mega_batch_factor": mega_batch_factor if mega_batch_factor else 4, "gradient_accumulation_size": gradient_accumulation_size if gradient_accumulation_size else 4,
"print_rate": print_rate if print_rate else 50, "print_rate": print_rate if print_rate else 1,
"save_rate": save_rate if save_rate else 50, "save_rate": save_rate if save_rate else 50,
"name": name if name else "finetune", "name": name if name else "finetune",
"dataset_name": dataset_name if dataset_name else "finetune", "dataset_name": dataset_name if dataset_name else "finetune",

@ -227,9 +227,8 @@ def import_training_settings_proxy( voice ):
messages.append(f"Basing epoch size to {lines} lines") messages.append(f"Basing epoch size to {lines} lines")
batch_size = config['datasets']['train']['batch_size'] batch_size = config['datasets']['train']['batch_size']
mega_batch_factor = config['train']['mega_batch_factor'] gradient_accumulation_size = config['train']['mega_batch_factor']
iterations = config['train']['niter'] iterations = config['train']['niter']
steps_per_iteration = int(lines / batch_size) steps_per_iteration = int(lines / batch_size)
epochs = int(iterations / steps_per_iteration) epochs = int(iterations / steps_per_iteration)
@ -276,7 +275,7 @@ def import_training_settings_proxy( voice ):
text_ce_lr_weight, text_ce_lr_weight,
learning_rate_schedule, learning_rate_schedule,
batch_size, batch_size,
mega_batch_factor, gradient_accumulation_size,
print_rate, print_rate,
save_rate, save_rate,
resume_path, resume_path,
@ -287,7 +286,7 @@ def import_training_settings_proxy( voice ):
) )
def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ): def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
name = f"{voice}-finetune" name = f"{voice}-finetune"
dataset_name = f"{voice}-train" dataset_name = f"{voice}-train"
dataset_path = f"./training/{voice}/train.txt" dataset_path = f"./training/{voice}/train.txt"
@ -318,7 +317,7 @@ def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, lear
learning_rate=learning_rate, learning_rate=learning_rate,
text_ce_lr_weight=text_ce_lr_weight, text_ce_lr_weight=text_ce_lr_weight,
learning_rate_schedule=learning_rate_schedule, learning_rate_schedule=learning_rate_schedule,
mega_batch_factor=mega_batch_factor, gradient_accumulation_size=gradient_accumulation_size,
print_rate=print_rate, print_rate=print_rate,
save_rate=save_rate, save_rate=save_rate,
name=name, name=name,
@ -489,7 +488,7 @@ def setup_gradio():
with gr.Row(): with gr.Row():
training_settings = training_settings + [ training_settings = training_settings + [
gr.Number(label="Batch Size", value=128, precision=0), gr.Number(label="Batch Size", value=128, precision=0),
gr.Number(label="Mega Batch Factor", value=4, precision=0), gr.Number(label="Gradient Accumulation Size", value=4, precision=0),
] ]
with gr.Row(): with gr.Row():
training_settings = training_settings + [ training_settings = training_settings + [
@ -534,8 +533,10 @@ def setup_gradio():
with gr.Column(): with gr.Column():
training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
verbose_training = gr.Checkbox(label="Verbose Console Output", value=True) verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1) with gr.Row():
training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
training_gpu_count = gr.Number(label="GPUs", value=1) training_gpu_count = gr.Number(label="GPUs", value=1)
with gr.Row(): with gr.Row():
start_training_button = gr.Button(value="Train") start_training_button = gr.Button(value="Train")