added VRAM validation for a given batch:gradient accumulation size ratio (based empirically off of 6GiB, 16GiB, and 16x2GiB, would be nice to have more data on what's safe)
This commit is contained in:
parent
ef7b957fff
commit
0e80e311b0
51
src/utils.py
51
src/utils.py
|
@ -1257,10 +1257,11 @@ def optimize_training_settings( **kwargs ):
|
||||||
settings['batch_size'] = lines
|
settings['batch_size'] = lines
|
||||||
messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}")
|
messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}")
|
||||||
|
|
||||||
if settings['batch_size'] % lines != 0:
|
if lines % settings['batch_size'] != 0:
|
||||||
nearest_slice = int(lines / settings['batch_size']) + 1
|
settings['batch_size'] = int(lines / settings['batch_size'])
|
||||||
settings['batch_size'] = int(lines / nearest_slice)
|
if settings['batch_size'] == 0:
|
||||||
messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']} ({nearest_slice} steps per epoch)")
|
settings['batch_size'] = 1
|
||||||
|
messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']}")
|
||||||
|
|
||||||
if settings['gradient_accumulation_size'] == 0:
|
if settings['gradient_accumulation_size'] == 0:
|
||||||
settings['gradient_accumulation_size'] = 1
|
settings['gradient_accumulation_size'] = 1
|
||||||
|
@ -1271,6 +1272,7 @@ def optimize_training_settings( **kwargs ):
|
||||||
settings['gradient_accumulation_size'] = 1
|
settings['gradient_accumulation_size'] = 1
|
||||||
|
|
||||||
messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}")
|
messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}")
|
||||||
|
"""
|
||||||
elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0:
|
elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0:
|
||||||
settings['gradient_accumulation_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size'])
|
settings['gradient_accumulation_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size'])
|
||||||
if settings['gradient_accumulation_size'] == 0:
|
if settings['gradient_accumulation_size'] == 0:
|
||||||
|
@ -1278,7 +1280,34 @@ def optimize_training_settings( **kwargs ):
|
||||||
|
|
||||||
messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")
|
messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")
|
||||||
|
|
||||||
print("VRAM", get_device_vram())
|
if settings['batch_size'] % settings['gpus'] != 0:
|
||||||
|
settings['batch_size'] = int(settings['batch_size'] / settings['gpus'])
|
||||||
|
if settings['batch_size'] == 0:
|
||||||
|
settings['batch_size'] = 1
|
||||||
|
messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}")
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def get_device_batch_size( vram ):
|
||||||
|
DEVICE_BATCH_SIZE_MAP = [
|
||||||
|
(32, 64), # based on my two 6800XTs, I can only really safely get a ratio of 156:2 = 78
|
||||||
|
(16, 8), # based on an A4000, I can do a ratio of 512:64 = 8:1
|
||||||
|
(8, 4), # interpolated
|
||||||
|
(6, 2), # based on my 2060, it only really lets me have a batch ratio of 2:1
|
||||||
|
]
|
||||||
|
for k, v in DEVICE_BATCH_SIZE_MAP:
|
||||||
|
if vram > (k-1):
|
||||||
|
return v
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# assuming you have equal GPUs
|
||||||
|
vram = get_device_vram() * settings['gpus']
|
||||||
|
batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size'])
|
||||||
|
batch_cap = get_device_batch_size(vram)
|
||||||
|
|
||||||
|
if batch_ratio > batch_cap:
|
||||||
|
settings['gradient_accumulation_size'] = int(settings['batch_size'] / batch_cap)
|
||||||
|
messages.append(f"Batch ratio ({batch_ratio}) is expected to exceed your VRAM capacity ({'{:.3f}'.format(vram)}GB, suggested {batch_cap} batch size cap), adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")
|
||||||
|
|
||||||
iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size'])
|
iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size'])
|
||||||
|
|
||||||
|
@ -1308,7 +1337,7 @@ def optimize_training_settings( **kwargs ):
|
||||||
else:
|
else:
|
||||||
messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
|
messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
|
||||||
if not os.path.exists(get_halfp_model_path()):
|
if not os.path.exists(get_halfp_model_path()):
|
||||||
convert_to_halfp()
|
convert_to_halfp()
|
||||||
|
|
||||||
messages.append(f"For {settings['epochs']} epochs with {lines} lines in batches of {settings['batch_size']}, iterating for {iterations} steps ({int(iterations / settings['epochs'])} steps per epoch)")
|
messages.append(f"For {settings['epochs']} epochs with {lines} lines in batches of {settings['batch_size']}, iterating for {iterations} steps ({int(iterations / settings['epochs'])} steps per epoch)")
|
||||||
|
|
||||||
|
@ -1365,10 +1394,6 @@ def save_training_settings( **kwargs ):
|
||||||
|
|
||||||
if settings['gpus'] > get_device_count():
|
if settings['gpus'] > get_device_count():
|
||||||
settings['gpus'] = get_device_count()
|
settings['gpus'] = get_device_count()
|
||||||
if settings['gpus'] < 1:
|
|
||||||
settings['gpus'] = 1
|
|
||||||
|
|
||||||
settings['optimizer'] = 'adamw' if settings['gpus'] == 1 else 'adamw_zero'
|
|
||||||
|
|
||||||
LEARNING_RATE_SCHEMES = ["MultiStepLR", "CosineAnnealingLR_Restart"]
|
LEARNING_RATE_SCHEMES = ["MultiStepLR", "CosineAnnealingLR_Restart"]
|
||||||
if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES:
|
if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES:
|
||||||
|
@ -1830,12 +1855,10 @@ def import_generate_settings(file="./config/generate.json"):
|
||||||
res = []
|
res = []
|
||||||
if GENERATE_SETTINGS_ARGS is not None:
|
if GENERATE_SETTINGS_ARGS is not None:
|
||||||
for k in GENERATE_SETTINGS_ARGS:
|
for k in GENERATE_SETTINGS_ARGS:
|
||||||
if k not in defaults:
|
res.append(defaults[k] if not settings or settings[k] is None else settings[k])
|
||||||
continue
|
|
||||||
res.append(defaults[k] if not settings or k not in settings or not settings[k] is None else settings[k])
|
|
||||||
else:
|
else:
|
||||||
for k in defaults:
|
for k in defaults:
|
||||||
res.append(defaults[k] if not settings or k not in settings or not settings[k] is None else settings[k])
|
res.append(defaults[k] if not settings or settings[k] is None else settings[k])
|
||||||
|
|
||||||
return tuple(res)
|
return tuple(res)
|
||||||
|
|
||||||
|
|
14
src/webui.py
14
src/webui.py
|
@ -210,8 +210,14 @@ def import_training_settings_proxy( voice ):
|
||||||
settings['resume_state'] = f'{statedir}/{resumes[-1]}.state'
|
settings['resume_state'] = f'{statedir}/{resumes[-1]}.state'
|
||||||
messages.append(f"Found most recent training state: {settings['resume_state']}")
|
messages.append(f"Found most recent training state: {settings['resume_state']}")
|
||||||
|
|
||||||
output = list(settings.values())
|
output = {}
|
||||||
|
for k in TRAINING_SETTINGS:
|
||||||
|
if k not in settings:
|
||||||
|
continue
|
||||||
|
output[k] = settings[k]
|
||||||
|
output = list(output.values())
|
||||||
messages.append(f"Imported training settings: {injson}")
|
messages.append(f"Imported training settings: {injson}")
|
||||||
|
|
||||||
return output[:-1] + ["\n".join(messages)]
|
return output[:-1] + ["\n".join(messages)]
|
||||||
|
|
||||||
def save_training_settings_proxy( *args ):
|
def save_training_settings_proxy( *args ):
|
||||||
|
@ -403,8 +409,10 @@ def setup_gradio():
|
||||||
TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp)
|
TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp)
|
||||||
TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb)
|
TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb)
|
||||||
|
|
||||||
TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0)
|
with gr.Row():
|
||||||
TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0)
|
TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0)
|
||||||
|
TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0)
|
||||||
|
|
||||||
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0] )
|
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0] )
|
||||||
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/training_state/${last_state}.state")
|
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/training_state/${last_state}.state")
|
||||||
|
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit bbeee40ab3ee39dd16a1ebd2388bf82402776664
|
Subproject commit 00be48670b5ba358e86aa5781131e9920d8f4def
|
Loading…
Reference in New Issue
Block a user