added VRAM validation for a given batch:gradient accumulation size ratio (based empirically on 6GiB, 16GiB, and 16x2GiB; it would be nice to have more data on what's safe)

2023-03-09 02:08:06 +00:00 · 2023-03-09 02:08:06 +00:00 · 0e80e311b0
commit 0e80e311b0
parent ef7b957fff
3 changed files with 49 additions and 18 deletions
--- a/src/utils.py
+++ b/src/utils.py
@ -1257,10 +1257,11 @@ def optimize_training_settings( **kwargs ):
 		settings['batch_size'] = lines
 		messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}")	

-	if settings['batch_size'] % lines != 0:
-		nearest_slice = int(lines / settings['batch_size']) + 1
-		settings['batch_size'] = int(lines / nearest_slice)
-		messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']} ({nearest_slice} steps per epoch)")
+	if lines % settings['batch_size'] != 0:
+		settings['batch_size'] = int(lines / settings['batch_size'])
+		if settings['batch_size'] == 0:
+			settings['batch_size'] = 1
+		messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']}")
 	
 	if settings['gradient_accumulation_size'] == 0:
 		settings['gradient_accumulation_size'] = 1
@ -1271,6 +1272,7 @@ def optimize_training_settings( **kwargs ):
 			settings['gradient_accumulation_size'] = 1

 		messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}")
+	"""
 	elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0:
 		settings['gradient_accumulation_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size'])
 		if settings['gradient_accumulation_size'] == 0:
@ -1278,7 +1280,34 @@ def optimize_training_settings( **kwargs ):

 		messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")

-	print("VRAM", get_device_vram())
+	if settings['batch_size'] % settings['gpus'] != 0:
+		settings['batch_size'] = int(settings['batch_size'] / settings['gpus'])
+		if settings['batch_size'] == 0:
+			settings['batch_size'] = 1
+		messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}")
+	"""
+
+
+	def get_device_batch_size( vram ):
+		DEVICE_BATCH_SIZE_MAP = [
+			(32, 64), # based on my two 6800XTs, I can only really safely get a ratio of 156:2 = 78
+			(16, 8), # based on an A4000, I can do a ratio of 512:64 = 8:1
+			(8, 4), # interpolated
+			(6, 2), # based on my 2060, it only really lets me have a batch ratio of 2:1
+		]
+		for k, v in DEVICE_BATCH_SIZE_MAP:
+			if vram > (k-1):
+				return v
+		return 1
+	
+	# assuming you have equal GPUs
+	vram = get_device_vram() * settings['gpus']
+	batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size'])
+	batch_cap = get_device_batch_size(vram)
+
+	if batch_ratio > batch_cap:
+		settings['gradient_accumulation_size'] = int(settings['batch_size'] / batch_cap)
+		messages.append(f"Batch ratio ({batch_ratio}) is expected to exceed your VRAM capacity ({'{:.3f}'.format(vram)}GB, suggested {batch_cap} batch size cap), adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")

 	iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size'])

@ -1308,7 +1337,7 @@ def optimize_training_settings( **kwargs ):
 		else:
 			messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
 			if not os.path.exists(get_halfp_model_path()):
-				convert_to_halfp()
+				convert_to_halfp()	

 	messages.append(f"For {settings['epochs']} epochs with {lines} lines in batches of {settings['batch_size']}, iterating for {iterations} steps ({int(iterations / settings['epochs'])} steps per epoch)")

@ -1365,10 +1394,6 @@ def save_training_settings( **kwargs ):

 	if settings['gpus'] > get_device_count():
 		settings['gpus'] = get_device_count()
-	if settings['gpus'] < 1:
-		settings['gpus'] = 1
-
-	settings['optimizer'] = 'adamw' if settings['gpus'] == 1 else 'adamw_zero'

 	LEARNING_RATE_SCHEMES = ["MultiStepLR", "CosineAnnealingLR_Restart"]
 	if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES:
@ -1830,12 +1855,10 @@ def import_generate_settings(file="./config/generate.json"):
 	res = []
 	if GENERATE_SETTINGS_ARGS is not None:
 		for k in GENERATE_SETTINGS_ARGS:
-			if k not in defaults:
-				continue
-			res.append(defaults[k] if not settings or k not in settings or not settings[k] is None else settings[k])
+			res.append(defaults[k] if not settings or settings[k] is None else settings[k])
 	else:
 		for k in defaults:
-			res.append(defaults[k] if not settings or k not in settings or not settings[k] is None else settings[k])
+			res.append(defaults[k] if not settings or settings[k] is None else settings[k])

 	return tuple(res)

--- a/src/webui.py
+++ b/src/webui.py
@ -210,8 +210,14 @@ def import_training_settings_proxy( voice ):
 			settings['resume_state'] = f'{statedir}/{resumes[-1]}.state'
 			messages.append(f"Found most recent training state: {settings['resume_state']}")

-	output = list(settings.values())
+	output = {}
+	for k in TRAINING_SETTINGS:
+		if k not in settings:
+			continue
+		output[k] = settings[k]
+	output = list(output.values())
 	messages.append(f"Imported training settings: {injson}")
+
 	return output[:-1] + ["\n".join(messages)]

 def save_training_settings_proxy( *args ):
@ -403,8 +409,10 @@ def setup_gradio():
 							TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp)
 							TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb)

-						TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0)
-						TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0)
+						with gr.Row():
+							TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0)
+							TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0)
+							
 						TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0] )
 						TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/training_state/${last_state}.state")
 						
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit bbeee40ab3ee39dd16a1ebd2388bf82402776664
+Subproject commit 00be48670b5ba358e86aa5781131e9920d8f4def