Added very experimental float16 training for cards without enough VRAM (10GiB and below, maybe). !NOTE! This is VERY EXPERIMENTAL: I have zero free time to validate it right now, so I'll do it later.
This commit is contained in:
parent
ed2cf9f5ee
commit
8a1a48f31e
|
@ -4,7 +4,7 @@ scale: 1
|
||||||
gpu_ids: [0] # <-- unless you have multiple gpus, use this
|
gpu_ids: [0] # <-- unless you have multiple gpus, use this
|
||||||
start_step: -1
|
start_step: -1
|
||||||
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
|
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
|
||||||
fp16: false # might want to check this out
|
fp16: ${float16} # might want to check this out
|
||||||
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
|
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
|
||||||
use_tb_logger: true
|
use_tb_logger: true
|
||||||
|
|
||||||
|
@ -26,10 +26,10 @@ datasets:
|
||||||
use_bpe_tokenizer: True
|
use_bpe_tokenizer: True
|
||||||
tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
|
tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
|
||||||
load_aligned_codes: False
|
load_aligned_codes: False
|
||||||
val:
|
val: # I really do not care about validation right now
|
||||||
name: ${validation_name}
|
name: ${validation_name}
|
||||||
n_workers: 1
|
n_workers: 1
|
||||||
batch_size: 32 # this could be higher probably
|
batch_size: 1 # this could be higher probably
|
||||||
mode: paired_voice_audio
|
mode: paired_voice_audio
|
||||||
path: ${validation_path}
|
path: ${validation_path}
|
||||||
fetcher_mode: ['lj']
|
fetcher_mode: ['lj']
|
||||||
|
@ -125,6 +125,8 @@ train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
|
||||||
mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
|
mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
|
||||||
val_freq: ${iterations}
|
val_freq: ${iterations}
|
||||||
|
|
||||||
|
ema_enabled: false # I really don't think EMA matters
|
||||||
|
|
||||||
default_lr_scheme: MultiStepLR
|
default_lr_scheme: MultiStepLR
|
||||||
gen_lr_steps: ${gen_lr_steps} #[50000, 100000, 140000, 180000]
|
gen_lr_steps: ${gen_lr_steps} #[50000, 100000, 140000, 180000]
|
||||||
lr_gamma: 0.5
|
lr_gamma: 0.5
|
||||||
|
|
35
src/utils.py
35
src/utils.py
|
@ -490,16 +490,20 @@ def stop_training():
|
||||||
training_process.kill()
|
training_process.kill()
|
||||||
return "Training cancelled"
|
return "Training cancelled"
|
||||||
|
|
||||||
|
def get_halfp_model():
|
||||||
|
autoregressive_model_path = get_model_path('autoregressive.pth')
|
||||||
|
return autoregressive_model_path.replace(".pth", "_half.pth")
|
||||||
|
|
||||||
def convert_to_halfp():
|
def convert_to_halfp():
|
||||||
autoregressive_model_path = get_model_path('autoregressive.pth')
|
autoregressive_model_path = get_model_path('autoregressive.pth')
|
||||||
|
print(f'Converting model to half precision: {autoregressive_model_path}')
|
||||||
model = torch.load(autoregressive_model_path)
|
model = torch.load(autoregressive_model_path)
|
||||||
for k in model:
|
for k in model:
|
||||||
if re.findall(r'\.weight$', k):
|
|
||||||
print(f"Converting: {k}")
|
|
||||||
model[k] = model[k].half()
|
model[k] = model[k].half()
|
||||||
|
|
||||||
torch.save(model, './models/tortoise/autoregressive_half.pth')
|
outfile = get_halfp_model()
|
||||||
print('Converted model to half precision: ./models/tortoise/autoregressive_half.pth')
|
torch.save(model, outfile)
|
||||||
|
print(f'Converted model to half precision: {outfile}')
|
||||||
|
|
||||||
def prepare_dataset( files, outdir, language=None, progress=None ):
|
def prepare_dataset( files, outdir, language=None, progress=None ):
|
||||||
unload_tts()
|
unload_tts()
|
||||||
|
@ -555,7 +559,7 @@ EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
|
||||||
def schedule_learning_rate( iterations ):
|
def schedule_learning_rate( iterations ):
|
||||||
return [int(iterations * d) for d in EPOCH_SCHEDULE]
|
return [int(iterations * d) for d in EPOCH_SCHEDULE]
|
||||||
|
|
||||||
def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, voice ):
|
def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, half_p, voice ):
|
||||||
name = f"{voice}-finetune"
|
name = f"{voice}-finetune"
|
||||||
dataset_name = f"{voice}-train"
|
dataset_name = f"{voice}-train"
|
||||||
dataset_path = f"./training/{voice}/train.txt"
|
dataset_path = f"./training/{voice}/train.txt"
|
||||||
|
@ -594,6 +598,11 @@ def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate
|
||||||
resume_path = None
|
resume_path = None
|
||||||
messages.append("Resume path specified, but does not exist. Disabling...")
|
messages.append("Resume path specified, but does not exist. Disabling...")
|
||||||
|
|
||||||
|
if half_p:
|
||||||
|
messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
|
||||||
|
if not os.path.exists(get_halfp_model()):
|
||||||
|
convert_to_halfp()
|
||||||
|
|
||||||
messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")
|
messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")
|
||||||
|
|
||||||
return (
|
return (
|
||||||
|
@ -607,7 +616,7 @@ def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate
|
||||||
messages
|
messages
|
||||||
)
|
)
|
||||||
|
|
||||||
def save_training_settings( iterations=None, batch_size=None, learning_rate=None, learning_rate_schedule=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None ):
|
def save_training_settings( iterations=None, batch_size=None, learning_rate=None, learning_rate_schedule=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None ):
|
||||||
settings = {
|
settings = {
|
||||||
"iterations": iterations if iterations else 500,
|
"iterations": iterations if iterations else 500,
|
||||||
"batch_size": batch_size if batch_size else 64,
|
"batch_size": batch_size if batch_size else 64,
|
||||||
|
@ -622,10 +631,20 @@ def save_training_settings( iterations=None, batch_size=None, learning_rate=None
|
||||||
"validation_name": validation_name if validation_name else "finetune",
|
"validation_name": validation_name if validation_name else "finetune",
|
||||||
"validation_path": validation_path if validation_path else "./training/finetune/train.txt",
|
"validation_path": validation_path if validation_path else "./training/finetune/train.txt",
|
||||||
|
|
||||||
'resume_state': f"resume_state: '{resume_path}'" if resume_path else f"# resume_state: './training/{name if name else 'finetune'}/training_state/#.state'",
|
'resume_state': f"resume_state: '{resume_path}'",
|
||||||
'pretrain_model_gpt': "pretrain_model_gpt: './models/tortoise/autoregressive.pth'" if not resume_path else "# pretrain_model_gpt: './models/tortoise/autoregressive.pth'"
|
'pretrain_model_gpt': f"pretrain_model_gpt: './models/tortoise/autoregressive{'_half' if half_p else ''}.pth'",
|
||||||
|
|
||||||
|
'float16': 'true' if half_p else 'false'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if resume_path:
|
||||||
|
settings['pretrain_model_gpt'] = f"# {settings['pretrain_model_gpt']}"
|
||||||
|
else:
|
||||||
|
settings['resume_state'] = f"# resume_state: './training/{name if name else 'finetune'}/training_state/#.state'"
|
||||||
|
|
||||||
|
if half_p:
|
||||||
|
if not os.path.exists(get_halfp_model()):
|
||||||
|
convert_to_halfp()
|
||||||
|
|
||||||
if not output_name:
|
if not output_name:
|
||||||
output_name = f'{settings["name"]}.yaml'
|
output_name = f'{settings["name"]}.yaml'
|
||||||
|
|
13
src/webui.py
13
src/webui.py
|
@ -195,7 +195,7 @@ def optimize_training_settings_proxy( *args, **kwargs ):
|
||||||
"\n".join(tup[7])
|
"\n".join(tup[7])
|
||||||
)
|
)
|
||||||
|
|
||||||
def save_training_settings_proxy( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, voice ):
|
def save_training_settings_proxy( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, half_p, voice ):
|
||||||
name = f"{voice}-finetune"
|
name = f"{voice}-finetune"
|
||||||
dataset_name = f"{voice}-train"
|
dataset_name = f"{voice}-train"
|
||||||
dataset_path = f"./training/{voice}/train.txt"
|
dataset_path = f"./training/{voice}/train.txt"
|
||||||
|
@ -232,6 +232,7 @@ def save_training_settings_proxy( epochs, batch_size, learning_rate, learning_ra
|
||||||
validation_path=validation_path,
|
validation_path=validation_path,
|
||||||
output_name=f"{voice}/train.yaml",
|
output_name=f"{voice}/train.yaml",
|
||||||
resume_path=resume_path,
|
resume_path=resume_path,
|
||||||
|
half_p=half_p,
|
||||||
))
|
))
|
||||||
return "\n".join(messages)
|
return "\n".join(messages)
|
||||||
|
|
||||||
|
@ -373,19 +374,11 @@ def setup_gradio():
|
||||||
gr.Number(label="Print Frequency per Epoch", value=5, precision=0),
|
gr.Number(label="Print Frequency per Epoch", value=5, precision=0),
|
||||||
gr.Number(label="Save Frequency per Epoch", value=5, precision=0),
|
gr.Number(label="Save Frequency per Epoch", value=5, precision=0),
|
||||||
gr.Textbox(label="Resume State Path", placeholder="./training/${voice}-finetune/training_state/${last_state}.state"),
|
gr.Textbox(label="Resume State Path", placeholder="./training/${voice}-finetune/training_state/${last_state}.state"),
|
||||||
|
gr.Checkbox(label="Half Precision", value=False),
|
||||||
]
|
]
|
||||||
dataset_list = gr.Dropdown( get_dataset_list(), label="Dataset", type="value" )
|
dataset_list = gr.Dropdown( get_dataset_list(), label="Dataset", type="value" )
|
||||||
training_settings = training_settings + [ dataset_list ]
|
training_settings = training_settings + [ dataset_list ]
|
||||||
refresh_dataset_list = gr.Button(value="Refresh Dataset List")
|
refresh_dataset_list = gr.Button(value="Refresh Dataset List")
|
||||||
"""
|
|
||||||
training_settings = training_settings + [
|
|
||||||
gr.Textbox(label="Training Name", placeholder="finetune"),
|
|
||||||
gr.Textbox(label="Dataset Name", placeholder="finetune"),
|
|
||||||
gr.Textbox(label="Dataset Path", placeholder="./training/finetune/train.txt"),
|
|
||||||
gr.Textbox(label="Validation Name", placeholder="finetune"),
|
|
||||||
gr.Textbox(label="Validation Path", placeholder="./training/finetune/train.txt"),
|
|
||||||
]
|
|
||||||
"""
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
save_yaml_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
|
save_yaml_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
|
||||||
optimize_yaml_button = gr.Button(value="Validate Training Configuration")
|
optimize_yaml_button = gr.Button(value="Validate Training Configuration")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user