added VRAM validation for a given batch:gradient accumulation size ratio (based empirically on 6GiB, 16GiB, and 16x2GiB setups; it would be nice to have more data on what's safe)

This commit is contained in:
mrq 2023-03-09 02:08:06 +00:00
parent ef7b957fff
commit 0e80e311b0
3 changed files with 49 additions and 18 deletions

View File

@ -1257,10 +1257,11 @@ def optimize_training_settings( **kwargs ):
settings['batch_size'] = lines settings['batch_size'] = lines
messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}") messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}")
if settings['batch_size'] % lines != 0: if lines % settings['batch_size'] != 0:
nearest_slice = int(lines / settings['batch_size']) + 1 settings['batch_size'] = int(lines / settings['batch_size'])
settings['batch_size'] = int(lines / nearest_slice) if settings['batch_size'] == 0:
messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']} ({nearest_slice} steps per epoch)") settings['batch_size'] = 1
messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']}")
if settings['gradient_accumulation_size'] == 0: if settings['gradient_accumulation_size'] == 0:
settings['gradient_accumulation_size'] = 1 settings['gradient_accumulation_size'] = 1
@ -1271,6 +1272,7 @@ def optimize_training_settings( **kwargs ):
settings['gradient_accumulation_size'] = 1 settings['gradient_accumulation_size'] = 1
messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}") messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}")
"""
elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0: elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0:
settings['gradient_accumulation_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size']) settings['gradient_accumulation_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size'])
if settings['gradient_accumulation_size'] == 0: if settings['gradient_accumulation_size'] == 0:
@ -1278,7 +1280,34 @@ def optimize_training_settings( **kwargs ):
messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}") messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")
print("VRAM", get_device_vram()) if settings['batch_size'] % settings['gpus'] != 0:
settings['batch_size'] = int(settings['batch_size'] / settings['gpus'])
if settings['batch_size'] == 0:
settings['batch_size'] = 1
messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}")
"""
def get_device_batch_size( vram ):
    """Suggest a cap for the batch size : gradient accumulation size ratio.

    The tiers below are scanned from the largest downwards and the cap of the
    first tier that `vram` clears is returned. A tier counts as cleared when
    `vram` is strictly greater than the tier value minus one, so a value just
    under the nominal size still lands in that tier.

    vram: amount of VRAM, a number in the same unit as the tier values
        (presumably GiB -- confirm against the caller).

    Returns the suggested cap, or 1 when `vram` clears no tier.
    """
    # (VRAM tier, suggested ratio cap), ordered from largest tier to smallest;
    # the first match wins, so this ordering matters
    tiers = (
        (32, 64),  # two 6800XTs: the safe ratio observed was 156:2 = 78
        (16, 8),   # A4000: a ratio of 512:64 = 8:1 worked
        (8, 4),    # interpolated
        (6, 2),    # 2060: only allows a batch ratio of 2:1
    )
    return next((cap for tier, cap in tiers if vram > tier - 1), 1)
# assuming you have equal GPUs
vram = get_device_vram() * settings['gpus']
batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size'])
batch_cap = get_device_batch_size(vram)
if batch_ratio > batch_cap:
settings['gradient_accumulation_size'] = int(settings['batch_size'] / batch_cap)
messages.append(f"Batch ratio ({batch_ratio}) is expected to exceed your VRAM capacity ({'{:.3f}'.format(vram)}GB, suggested {batch_cap} batch size cap), adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")
iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size'])
@ -1365,10 +1394,6 @@ def save_training_settings( **kwargs ):
if settings['gpus'] > get_device_count(): if settings['gpus'] > get_device_count():
settings['gpus'] = get_device_count() settings['gpus'] = get_device_count()
if settings['gpus'] < 1:
settings['gpus'] = 1
settings['optimizer'] = 'adamw' if settings['gpus'] == 1 else 'adamw_zero'
LEARNING_RATE_SCHEMES = ["MultiStepLR", "CosineAnnealingLR_Restart"] LEARNING_RATE_SCHEMES = ["MultiStepLR", "CosineAnnealingLR_Restart"]
if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES: if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES:
@ -1830,12 +1855,10 @@ def import_generate_settings(file="./config/generate.json"):
res = [] res = []
if GENERATE_SETTINGS_ARGS is not None: if GENERATE_SETTINGS_ARGS is not None:
for k in GENERATE_SETTINGS_ARGS: for k in GENERATE_SETTINGS_ARGS:
if k not in defaults: res.append(defaults[k] if not settings or settings[k] is None else settings[k])
continue
res.append(defaults[k] if not settings or k not in settings or not settings[k] is None else settings[k])
else: else:
for k in defaults: for k in defaults:
res.append(defaults[k] if not settings or k not in settings or not settings[k] is None else settings[k]) res.append(defaults[k] if not settings or settings[k] is None else settings[k])
return tuple(res) return tuple(res)

View File

@ -210,8 +210,14 @@ def import_training_settings_proxy( voice ):
settings['resume_state'] = f'{statedir}/{resumes[-1]}.state' settings['resume_state'] = f'{statedir}/{resumes[-1]}.state'
messages.append(f"Found most recent training state: {settings['resume_state']}") messages.append(f"Found most recent training state: {settings['resume_state']}")
output = list(settings.values()) output = {}
for k in TRAINING_SETTINGS:
if k not in settings:
continue
output[k] = settings[k]
output = list(output.values())
messages.append(f"Imported training settings: {injson}") messages.append(f"Imported training settings: {injson}")
return output[:-1] + ["\n".join(messages)] return output[:-1] + ["\n".join(messages)]
def save_training_settings_proxy( *args ): def save_training_settings_proxy( *args ):
@ -403,8 +409,10 @@ def setup_gradio():
TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp) TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp)
TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb) TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb)
TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0) with gr.Row():
TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0) TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0)
TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0)
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0] ) TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0] )
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/training_state/${last_state}.state") TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/training_state/${last_state}.state")

@ -1 +1 @@
Subproject commit bbeee40ab3ee39dd16a1ebd2388bf82402776664 Subproject commit 00be48670b5ba358e86aa5781131e9920d8f4def