Renamed "mega batch factor" to the actual, real term: gradient accumulation factor; fixed halting training not actually killing the training process and freeing up resources; cleaned up some of the gradient accumulation logic (so many brain worms and wrong assumptions from testing on low batch sizes). Read the training section in the wiki for more details.

2023-03-04 15:55:06 +00:00 · 2023-03-04 15:55:06 +00:00 · df24827b9a
commit df24827b9a
parent 6d5e1e1a80
3 changed files with 56 additions and 48 deletions
--- a/models/.template.yaml
+++ b/models/.template.yaml
@ -1,21 +1,21 @@
 name: ${name}
 model: extensibletrainer
 scale: 1
-gpu_ids: [0] # <-- unless you have multiple gpus, use this
+gpu_ids: [0] # Superfluous, redundant, unnecessary, the way you launch the training script will set this
 start_step: 0
-checkpointing_enabled: true  # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
-fp16: ${float16} # might want to check this out
-wandb: false  # <-- enable to log to wandb. tensorboard logging is always enabled.
+checkpointing_enabled: true 
+fp16: ${float16}
+wandb: false 
 use_tb_logger: true

 datasets:
  train:
    name: ${dataset_name}
-    n_workers: 8 # idk what this does
-    batch_size: ${batch_size} # This leads to ~16GB of vram usage on my 3090.
+    n_workers: 8
+    batch_size: ${batch_size}
    mode: paired_voice_audio
    path: ${dataset_path}
-    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
+    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995
    max_text_length: 200
@ -29,11 +29,11 @@ datasets:
  val: # I really do not care about validation right now
    name: ${validation_name}
    n_workers: 1
-    batch_size: 1 # this could be higher probably
+    batch_size: 1
    mode: paired_voice_audio
    path: ${validation_path}
    fetcher_mode: ['lj']
-    phase: val # might be broken idk
+    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
@ -47,18 +47,18 @@ datasets:
 steps:        
  gpt_train:
    training: gpt
-    loss_log_buffer: 500 # no idea what this does
+    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
-      lr: !!float ${learning_rate} # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
+      lr: !!float ${learning_rate} # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

-    injectors:  # TODO: replace this entire sequence with the GptVoiceLatentInjector
+    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
@ -74,7 +74,7 @@ steps:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
-        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
+        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
@ -95,12 +95,12 @@ networks:
    type: generator 
    which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
    kwargs:
-      layers: 30 # WAS 8
-      model_dim: 1024 # WAS 512
-      heads: 16 # WAS 8
-      max_text_tokens: 402 # WAS 120
-      max_mel_tokens: 604 # WAS 250
-      max_conditioning_inputs: 2 # WAS 1
+      layers: 30 # originally: 8
+      model_dim: 1024 # originally: 512
+      heads: 16 # originally: 8
+      max_text_tokens: 402 # originally: 120
+      max_mel_tokens: 604 # originally: 250
+      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files 
      number_mel_codes: 8194
@ -118,11 +118,10 @@ path:
  strict_load: true
  ${resume_state}

-# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
-train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
+train:
  niter: ${iterations}
  warmup_iter: -1
-  mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
+  mega_batch_factor: ${gradient_accumulation_size}
  val_freq: ${iterations}

  ema_enabled: false # I really don't think EMA matters
@ -142,7 +141,7 @@ eval:

 logger: 
  print_freq: ${print_rate}
-  save_checkpoint_freq: ${save_rate} # CHANGEME: especially you should increase this it's really slow
+  save_checkpoint_freq: ${save_rate}
  visuals: [gen, mel]
  visual_debug_rate: ${print_rate}
  is_mel_spectrogram: true
--- a/src/utils.py
+++ b/src/utils.py
@ -573,6 +573,9 @@ class TrainingState():
 		infos = {}
 		highest_step = self.last_info_check_at

+		if not update:
+			self.losses = []
+
 		if use_tensorboard:
 			logs = sorted([f'{self.dataset_dir}/tb_logger/{d}' for d in os.listdir(f'{self.dataset_dir}/tb_logger/') if d[:6] == "events" ])
 			if update:
@ -816,6 +819,7 @@ def update_training_dataplot(config_path=None):
 			del training_state
 			training_state = None
 	elif training_state.losses:
+		training_state.load_losses()
 		update = gr.LinePlot.update(value=pd.DataFrame(training_state.losses))

 	return update
@ -837,7 +841,8 @@ def stop_training():
 	print("Killing training process...")
 	training_state.killed = True
 	training_state.process.stdout.close()
-	training_state.process.terminate()
+	#training_state.process.terminate()
+	training_state.process.send_signal(signal.SIGINT)
 	return_code = training_state.process.wait()
 	training_state = None
 	return f"Training cancelled: {return_code}"
@ -938,7 +943,7 @@ EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
 def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ):
 	return [int(iterations * d) for d in schedule]

-def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
+def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
 	name = f"{voice}-finetune"
 	dataset_name = f"{voice}-train"
 	dataset_path = f"./training/{voice}/train.txt"
@ -959,12 +964,11 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
 		batch_size = int(lines / nearest_slice)
 		messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {batch_size} ({nearest_slice} steps per epoch)")
 	
-	if batch_size == 1 and mega_batch_factor != 1:
-		mega_batch_factor = 1
-		messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
-	elif batch_size / mega_batch_factor < 2:
-		mega_batch_factor = int(batch_size / 2)
-		messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
+	if gradient_accumulation_size == 0:
+		gradient_accumulation_size = 1
+	elif batch_size % gradient_accumulation_size != 0:
+		gradient_accumulation_size = int(batch_size / gradient_accumulation_size)
+		messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")

 	iterations = calc_iterations(epochs=epochs, lines=lines, batch_size=batch_size)

@ -980,14 +984,18 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
 		resume_path = None
 		messages.append("Resume path specified, but does not exist. Disabling...")

+	if bnb:
+		messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
+
 	if half_p:
+		if bnb:
+			half_p = False
+			messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...")
+		else:
 			messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
 			if not os.path.exists(get_halfp_model_path()):
 				convert_to_halfp()	

-	if bnb:
-		messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
-
 	messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")

 	return (
@ -995,14 +1003,14 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
 		text_ce_lr_weight,
 		learning_rate_schedule,
 		batch_size,
-		mega_batch_factor,
+		gradient_accumulation_size,
 		print_rate,
 		save_rate,
 		resume_path,
 		messages
 	)

-def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
+def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, gradient_accumulation_size=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
 	if not source_model:
 		source_model = f"./models/tortoise/autoregressive{'_half' if half_p else ''}.pth"

@ -1011,8 +1019,8 @@ def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weig
 		"batch_size": batch_size if batch_size else 64,
 		"learning_rate": learning_rate if learning_rate else 1e-5,
 		"gen_lr_steps": learning_rate_schedule if learning_rate_schedule else EPOCH_SCHEDULE,
-		"mega_batch_factor": mega_batch_factor if mega_batch_factor else 4,
-		"print_rate": print_rate if print_rate else 50,
+		"gradient_accumulation_size": gradient_accumulation_size if gradient_accumulation_size else 4,
+		"print_rate": print_rate if print_rate else 1,
 		"save_rate": save_rate if save_rate else 50,
 		"name": name if name else "finetune",
 		"dataset_name": dataset_name if dataset_name else "finetune",
--- a/src/webui.py
+++ b/src/webui.py
@ -227,8 +227,7 @@ def import_training_settings_proxy( voice ):
 		messages.append(f"Basing epoch size to {lines} lines")

 	batch_size = config['datasets']['train']['batch_size']
-	mega_batch_factor = config['train']['mega_batch_factor']
-
+	gradient_accumulation_size = config['train']['mega_batch_factor']

 	iterations = config['train']['niter']
 	steps_per_iteration = int(lines / batch_size)
@ -276,7 +275,7 @@ def import_training_settings_proxy( voice ):
 		text_ce_lr_weight,
 		learning_rate_schedule,
 		batch_size,
-		mega_batch_factor,
+		gradient_accumulation_size,
 		print_rate,
 		save_rate,
 		resume_path,
@ -287,7 +286,7 @@ def import_training_settings_proxy( voice ):
 	)


-def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
+def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
 	name = f"{voice}-finetune"
 	dataset_name = f"{voice}-train"
 	dataset_path = f"./training/{voice}/train.txt"
@ -318,7 +317,7 @@ def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, lear
 		learning_rate=learning_rate,
 		text_ce_lr_weight=text_ce_lr_weight,
 		learning_rate_schedule=learning_rate_schedule,
-		mega_batch_factor=mega_batch_factor,
+		gradient_accumulation_size=gradient_accumulation_size,
 		print_rate=print_rate,
 		save_rate=save_rate,
 		name=name,
@ -489,7 +488,7 @@ def setup_gradio():
 						with gr.Row():
 							training_settings = training_settings + [
 								gr.Number(label="Batch Size", value=128, precision=0),
-								gr.Number(label="Mega Batch Factor", value=4, precision=0),
+								gr.Number(label="Gradient Accumulation Size", value=4, precision=0),
 							]
 						with gr.Row():
 							training_settings = training_settings + [
@ -534,6 +533,8 @@ def setup_gradio():
 					with gr.Column():
 						training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
 						verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
+						
+						with gr.Row():
 							training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
 							training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
 						training_gpu_count = gr.Number(label="GPUs", value=1)