diff --git a/models/.template.yaml b/models/.template.yaml
index 40909f1..825ac0c 100755
--- a/models/.template.yaml
+++ b/models/.template.yaml
@@ -1,21 +1,21 @@
 name: ${name}
 model: extensibletrainer
 scale: 1
-gpu_ids: [0] # <-- unless you have multiple gpus, use this
+gpu_ids: [0] # Superfluous; the way you launch the training script will set this
 start_step: 0
-checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
-fp16: ${float16} # might want to check this out
-wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
+checkpointing_enabled: true
+fp16: ${float16}
+wandb: false
 use_tb_logger: true
 
 datasets:
   train:
     name: ${dataset_name}
-    n_workers: 8 # idk what this does
-    batch_size: ${batch_size} # This leads to ~16GB of vram usage on my 3090.
+    n_workers: 8
+    batch_size: ${batch_size}
     mode: paired_voice_audio
     path: ${dataset_path}
-    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
+    fetcher_mode: ['lj']
     phase: train
     max_wav_length: 255995
     max_text_length: 200
@@ -29,11 +29,11 @@ datasets:
   val: # I really do not care about validation right now
     name: ${validation_name}
     n_workers: 1
-    batch_size: 1 # this could be higher probably
+    batch_size: 1
     mode: paired_voice_audio
     path: ${validation_path}
     fetcher_mode: ['lj']
-    phase: val # might be broken idk
+    phase: val
     max_wav_length: 255995
     max_text_length: 200
     sample_rate: 22050
@@ -47,18 +47,18 @@ datasets:
 steps:
   gpt_train:
     training: gpt
-    loss_log_buffer: 500 # no idea what this does
+    loss_log_buffer: 500
 
     # Generally follows the recipe from the DALLE paper.
     optimizer: adamw # this should be adamw_zero if you're using distributed training
     optimizer_params:
-      lr: !!float ${learning_rate} # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
+      lr: !!float ${learning_rate} # originally: 1e-4
       weight_decay: !!float 1e-2
       beta1: 0.9
       beta2: 0.96
     clip_grad_eps: 4
 
-    injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
+    injectors:
       paired_to_mel:
         type: torch_mel_spectrogram
         mel_norm_file: ./models/tortoise/clips_mel_norms.pth
@@ -74,7 +74,7 @@ steps:
         type: discrete_token
         in: paired_mel
         out: paired_mel_codes
-        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
+        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
       paired_fwd_text:
         type: generator
         generator: gpt
@@ -95,12 +95,12 @@ networks:
     type: generator
     which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
     kwargs:
-      layers: 30 # WAS 8
-      model_dim: 1024 # WAS 512
-      heads: 16 # WAS 8
-      max_text_tokens: 402 # WAS 120
-      max_mel_tokens: 604 # WAS 250
-      max_conditioning_inputs: 2 # WAS 1
+      layers: 30 # originally: 8
+      model_dim: 1024 # originally: 512
+      heads: 16 # originally: 8
+      max_text_tokens: 402 # originally: 120
+      max_mel_tokens: 604 # originally: 250
+      max_conditioning_inputs: 2 # originally: 1
       mel_length_compression: 1024
       number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
       number_mel_codes: 8194
@@ -118,11 +118,10 @@ path:
   strict_load: true
   ${resume_state}
 
-# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
-train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
+train:
   niter: ${iterations}
   warmup_iter: -1
-  mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
+  mega_batch_factor: ${gradient_accumulation_size}
   val_freq: ${iterations}
 
   ema_enabled: false # I really don't think EMA matters
@@ -142,7 +141,7 @@ eval:
 
 logger:
   print_freq: ${print_rate}
-  save_checkpoint_freq: ${save_rate} # CHANGEME: especially you should increase this it's really slow
+  save_checkpoint_freq: ${save_rate}
   visuals: [gen, mel]
   visual_debug_rate: ${print_rate}
   is_mel_spectrogram: true
\ No newline at end of file
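Context for the `train:` hunk above: in DL-Art-School, `mega_batch_factor` is a gradient accumulation factor (the deleted inline comment advised raising it to 2, 4, or 8 when running out of VRAM). The rename is surface-level; the template still fills the same `mega_batch_factor:` key, just from a `${gradient_accumulation_size}` variable. For readers unfamiliar with the technique, here is a minimal, illustrative sketch of gradient accumulation in PyTorch. The names are hypothetical and this is not the ExtensibleTrainer's actual loop:

```python
import torch

def train_step(model, optimizer, loss_fn, inputs, targets, accumulation_size=4):
    """Hypothetical sketch: one optimizer step spread over micro-batches."""
    optimizer.zero_grad()
    # Split the logical batch into `accumulation_size` micro-batches so that
    # only one micro-batch's activations are resident in VRAM at a time.
    for micro_in, micro_tgt in zip(inputs.chunk(accumulation_size),
                                   targets.chunk(accumulation_size)):
        loss = loss_fn(model(micro_in), micro_tgt)
        # Scale so the summed gradients match a full-batch backward pass.
        (loss / accumulation_size).backward()
    optimizer.step()  # one weight update per logical batch
```

The accumulated gradient is mathematically the same as a full-batch step, which is why raising the factor trades speed for memory without changing the effective batch size.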
diff --git a/src/utils.py b/src/utils.py
index 5c173a4..46abc92 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -573,6 +573,9 @@ class TrainingState():
         infos = {}
         highest_step = self.last_info_check_at
 
+        if not update:
+            self.losses = []
+
         if use_tensorboard:
             logs = sorted([f'{self.dataset_dir}/tb_logger/{d}' for d in os.listdir(f'{self.dataset_dir}/tb_logger/') if d[:6] == "events" ])
             if update:
@@ -816,6 +819,7 @@ def update_training_dataplot(config_path=None):
             del training_state
             training_state = None
     elif training_state.losses:
+        training_state.load_losses()
         update = gr.LinePlot.update(value=pd.DataFrame(training_state.losses))
 
     return update
@@ -837,7 +841,8 @@ def stop_training():
     print("Killing training process...")
    training_state.killed = True
     training_state.process.stdout.close()
-    training_state.process.terminate()
+    #training_state.process.terminate()
+    training_state.process.send_signal(signal.SIGINT)
     return_code = training_state.process.wait()
     training_state = None
     return f"Training cancelled: {return_code}"
@@ -938,7 +943,7 @@ EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
 def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ):
     return [int(iterations * d) for d in schedule]
 
-def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
+def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
     name = f"{voice}-finetune"
     dataset_name = f"{voice}-train"
     dataset_path = f"./training/{voice}/train.txt"
@@ -959,12 +964,11 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
         batch_size = int(lines / nearest_slice)
         messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {batch_size} ({nearest_slice} steps per epoch)")
 
-    if batch_size == 1 and mega_batch_factor != 1:
-        mega_batch_factor = 1
-        messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
-    elif batch_size / mega_batch_factor < 2:
-        mega_batch_factor = int(batch_size / 2)
-        messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
+    if gradient_accumulation_size == 0:
+        gradient_accumulation_size = 1
+    elif batch_size % gradient_accumulation_size != 0:
+        gradient_accumulation_size = int(batch_size / gradient_accumulation_size)
+        messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")
 
     iterations = calc_iterations(epochs=epochs, lines=lines, batch_size=batch_size)
 
@@ -980,14 +984,18 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
         resume_path = None
         messages.append("Resume path specified, but does not exist. Disabling...")
 
-    if half_p:
-        messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
-        if not os.path.exists(get_halfp_model_path()):
-            convert_to_halfp()
-
     if bnb:
         messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
 
+    if half_p:
+        if bnb:
+            half_p = False
+            messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...")
+        else:
+            messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
+            if not os.path.exists(get_halfp_model_path()):
+                convert_to_halfp()
+
     messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")
 
     return (
@@ -995,14 +1003,14 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weigh
         text_ce_lr_weight,
         learning_rate_schedule,
         batch_size,
-        mega_batch_factor,
+        gradient_accumulation_size,
         print_rate,
         save_rate,
         resume_path,
         messages
     )
 
-def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
+def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, gradient_accumulation_size=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
     if not source_model:
         source_model = f"./models/tortoise/autoregressive{'_half' if half_p else ''}.pth"
@@ -1011,8 +1019,8 @@ def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weig
         "batch_size": batch_size if batch_size else 64,
         "learning_rate": learning_rate if learning_rate else 1e-5,
         "gen_lr_steps": learning_rate_schedule if learning_rate_schedule else EPOCH_SCHEDULE,
-        "mega_batch_factor": mega_batch_factor if mega_batch_factor else 4,
-        "print_rate": print_rate if print_rate else 50,
+        "gradient_accumulation_size": gradient_accumulation_size if gradient_accumulation_size else 4,
+        "print_rate": print_rate if print_rate else 1,
         "save_rate": save_rate if save_rate else 50,
         "name": name if name else "finetune",
         "dataset_name": dataset_name if dataset_name else "finetune",
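On the `stop_training()` change above: `Popen.terminate()` sends SIGTERM, which kills the trainer mid-step unless it installs its own handler, whereas a Python child receives SIGINT as `KeyboardInterrupt`, so its `except`/`finally` blocks still run and it can shut down cleanly. A sketch of the pattern, with a hypothetical `train.py`; note that on Windows, delivering Ctrl+C to a child generally requires `CREATE_NEW_PROCESS_GROUP` plus `CTRL_C_EVENT` rather than `SIGINT`:

```python
import signal
import subprocess

proc = subprocess.Popen(["python", "train.py"], stdout=subprocess.PIPE)

# Ask the trainer to stop gracefully: SIGINT surfaces in the child as
# KeyboardInterrupt, giving it a chance to flush logs and save state.
proc.send_signal(signal.SIGINT)
try:
    return_code = proc.wait(timeout=60)
except subprocess.TimeoutExpired:
    proc.terminate()            # fall back to SIGTERM if it refuses to exit
    return_code = proc.wait()
```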
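`calc_iterations()` itself is not part of this diff, but the log message appended at the end of `optimize_training_settings()` pins down its arithmetic: one step consumes one batch, so steps per epoch is `lines / batch_size`. A plausible implementation consistent with that message, offered only as a sketch:

```python
def calc_iterations(epochs, lines, batch_size):
    # One iteration (step) processes one batch; gradient accumulation splits
    # a step internally but does not change the step count.
    steps_per_epoch = int(lines / batch_size)
    return epochs * steps_per_epoch
```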
diff --git a/src/webui.py b/src/webui.py
index c94c59e..db90051 100755
--- a/src/webui.py
+++ b/src/webui.py
@@ -227,9 +226,8 @@ def import_training_settings_proxy( voice ):
     messages.append(f"Basing epoch size to {lines} lines")
 
     batch_size = config['datasets']['train']['batch_size']
-    mega_batch_factor = config['train']['mega_batch_factor']
+    gradient_accumulation_size = config['train']['mega_batch_factor']
 
-
     iterations = config['train']['niter']
     steps_per_iteration = int(lines / batch_size)
     epochs = int(iterations / steps_per_iteration)
@@ -276,7 +275,7 @@ def import_training_settings_proxy( voice ):
         text_ce_lr_weight,
         learning_rate_schedule,
         batch_size,
-        mega_batch_factor,
+        gradient_accumulation_size,
         print_rate,
         save_rate,
         resume_path,
@@ -287,7 +286,7 @@
     )
 
-def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
+def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
     name = f"{voice}-finetune"
     dataset_name = f"{voice}-train"
     dataset_path = f"./training/{voice}/train.txt"
@@ -318,7 +317,7 @@ def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, lear
         learning_rate=learning_rate,
         text_ce_lr_weight=text_ce_lr_weight,
         learning_rate_schedule=learning_rate_schedule,
-        mega_batch_factor=mega_batch_factor,
+        gradient_accumulation_size=gradient_accumulation_size,
         print_rate=print_rate,
         save_rate=save_rate,
         name=name,
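Note that the rename stops at the YAML boundary: `save_training_settings()` writes the new `gradient_accumulation_size` value into the template's unchanged `mega_batch_factor:` key, and `import_training_settings_proxy()` above reads `config['train']['mega_batch_factor']` back into the new variable, so existing configs keep working in both directions. Schematically, with illustrative values:

```python
# Saving: the template line `mega_batch_factor: ${gradient_accumulation_size}`
# fills the old YAML key from the new setting name.
settings = {"gradient_accumulation_size": 4}
yaml_line = f"mega_batch_factor: {settings['gradient_accumulation_size']}"

# Importing: the old YAML key is read back into the new variable.
config = {"train": {"mega_batch_factor": 4}}
gradient_accumulation_size = config["train"]["mega_batch_factor"]
```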
@@ -489,7 +488,7 @@ def setup_gradio():
                 with gr.Row():
                     training_settings = training_settings + [
                         gr.Number(label="Batch Size", value=128, precision=0),
-                        gr.Number(label="Mega Batch Factor", value=4, precision=0),
+                        gr.Number(label="Gradient Accumulation Size", value=4, precision=0),
                     ]
                 with gr.Row():
                     training_settings = training_settings + [
@@ -534,8 +533,10 @@ def setup_gradio():
                 with gr.Column():
                     training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
                     verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
-                    training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
-                    training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
+
+                    with gr.Row():
+                        training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
+                        training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
                     training_gpu_count = gr.Number(label="GPUs", value=1)
                     with gr.Row():
                         start_training_button = gr.Button(value="Train")
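The last hunk is purely cosmetic: components created directly inside a `gr.Column()` stack vertically, while wrapping them in a `gr.Row()` places them side by side. A standalone sketch of the layout idea (Gradio 3.x Blocks API, illustrative labels):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Column():
        gr.TextArea(label="Console Output", interactive=False, max_lines=8)  # stacked, full width
        with gr.Row():  # these two sliders now share one row
            gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
            gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
        gr.Number(label="GPUs", value=1)

demo.launch()
```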