From 8a1a48f31e30957196213a193c9aab45f1c25520 Mon Sep 17 00:00:00 2001
From: mrq
Date: Tue, 21 Feb 2023 19:31:57 +0000
Subject: [PATCH] Added very experimental float16 training for cards with not
 enough VRAM (10GiB and below, maybe)

!NOTE! this is VERY EXPERIMENTAL, I have zero free time to validate it right now, I'll do it later
---
 models/.template.yaml |  8 +++++---
 src/utils.py          | 37 ++++++++++++++++++++++++++++---------
 src/webui.py          | 13 +++----------
 3 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/models/.template.yaml b/models/.template.yaml
index dc4b1ff..d89c889 100755
--- a/models/.template.yaml
+++ b/models/.template.yaml
@@ -4,7 +4,7 @@ scale: 1
 gpu_ids: [0] # <-- unless you have multiple gpus, use this
 start_step: -1
 checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
-fp16: false # might want to check this out
+fp16: ${float16} # might want to check this out
 wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
 use_tb_logger: true
@@ -26,10 +26,10 @@ datasets:
     use_bpe_tokenizer: True
     tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False
-  val:
+  val: # I really do not care about validation right now
     name: ${validation_name}
     n_workers: 1
-    batch_size: 32 # this could be higher probably
+    batch_size: 1 # this could be higher probably
     mode: paired_voice_audio
     path: ${validation_path}
     fetcher_mode: ['lj']
@@ -125,6 +125,8 @@ train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
   mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
   val_freq: ${iterations}
+  ema_enabled: false # I really don't think EMA matters
+
   default_lr_scheme: MultiStepLR
   gen_lr_steps: ${gen_lr_steps} #[50000, 100000, 140000, 180000]
   lr_gamma: 0.5
diff --git a/src/utils.py b/src/utils.py
index 7cb8c6f..c75d47a 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -490,16 +490,20 @@ def stop_training():
     training_process.kill()
     return "Training cancelled"
 
+def get_halfp_model():
+    autoregressive_model_path = get_model_path('autoregressive.pth')
+    return autoregressive_model_path.replace(".pth", "_half.pth")
+
 def convert_to_halfp():
     autoregressive_model_path = get_model_path('autoregressive.pth')
+    print(f'Converting model to half precision: {autoregressive_model_path}')
     model = torch.load(autoregressive_model_path)
 
     for k in model:
-        if re.findall(r'\.weight$', k):
-            print(f"Converting: {k}")
-            model[k] = model[k].half()
+        model[k] = model[k].half()
 
-    torch.save(model, './models/tortoise/autoregressive_half.pth')
-    print('Converted model to half precision: ./models/tortoise/autoregressive_half.pth')
+    outfile = get_halfp_model()
+    torch.save(model, outfile)
+    print(f'Converted model to half precision: {outfile}')
 
 def prepare_dataset( files, outdir, language=None, progress=None ):
     unload_tts()
@@ -555,7 +559,7 @@ EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
 def schedule_learning_rate( iterations ):
     return [int(iterations * d) for d in EPOCH_SCHEDULE]
 
-def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, voice ):
+def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, half_p, voice ):
     name = f"{voice}-finetune"
     dataset_name = f"{voice}-train"
     dataset_path = f"./training/{voice}/train.txt"
@@ -594,6 +598,11 @@ def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate
         resume_path = None
         messages.append("Resume path specified, but does not exist. Disabling...")
 
+    if half_p:
+        messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
+        if not os.path.exists(get_halfp_model()):
+            convert_to_halfp()
+
     messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")
 
     return (
@@ -607,7 +616,7 @@ def optimize_training_settings( epochs, batch_size, learning_rate, learning_rate
         messages
     )
 
-def save_training_settings( iterations=None, batch_size=None, learning_rate=None, learning_rate_schedule=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None ):
+def save_training_settings( iterations=None, batch_size=None, learning_rate=None, learning_rate_schedule=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None ):
     settings = {
         "iterations": iterations if iterations else 500,
         "batch_size": batch_size if batch_size else 64,
@@ -622,10 +631,20 @@ def save_training_settings( iterations=None, batch_size=None, learning_rate=None
         "validation_name": validation_name if validation_name else "finetune",
         "validation_path": validation_path if validation_path else "./training/finetune/train.txt",
 
-        'resume_state': f"resume_state: '{resume_path}'" if resume_path else f"# resume_state: './training/{name if name else 'finetune'}/training_state/#.state'",
-        'pretrain_model_gpt': "pretrain_model_gpt: './models/tortoise/autoregressive.pth'" if not resume_path else "# pretrain_model_gpt: './models/tortoise/autoregressive.pth'"
+        'resume_state': f"resume_state: '{resume_path}'",
+        'pretrain_model_gpt': f"pretrain_model_gpt: './models/tortoise/autoregressive{'_half' if half_p else ''}.pth'",
+
+        'float16': 'true' if half_p else 'false'
     }
 
+    if resume_path:
+        settings['pretrain_model_gpt'] = f"# {settings['pretrain_model_gpt']}"
+    else:
+        settings['resume_state'] = f"# resume_state: './training/{name if name else 'finetune'}/training_state/#.state'"
+
+    if half_p:
+        if not os.path.exists(get_halfp_model()):
+            convert_to_halfp()
 
     if not output_name:
         output_name = f'{settings["name"]}.yaml'
diff --git a/src/webui.py b/src/webui.py
index 49c6210..fb45620 100755
--- a/src/webui.py
+++ b/src/webui.py
@@ -195,7 +195,7 @@ def optimize_training_settings_proxy( *args, **kwargs ):
         "\n".join(tup[7])
     )
 
-def save_training_settings_proxy( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, voice ):
+def save_training_settings_proxy( epochs, batch_size, learning_rate, learning_rate_schedule, mega_batch_factor, print_rate, save_rate, resume_path, half_p, voice ):
     name = f"{voice}-finetune"
     dataset_name = f"{voice}-train"
     dataset_path = f"./training/{voice}/train.txt"
@@ -232,6 +232,7 @@ def save_training_settings_proxy( epochs, batch_size, learning_rate, learning_ra
         validation_path=validation_path,
         output_name=f"{voice}/train.yaml",
         resume_path=resume_path,
+        half_p=half_p,
     ))
 
     return "\n".join(messages)
@@ -373,19 +374,11 @@ def setup_gradio():
                         gr.Number(label="Print Frequency per Epoch", value=5, precision=0),
                         gr.Number(label="Save Frequency per Epoch", value=5, precision=0),
                         gr.Textbox(label="Resume State Path", placeholder="./training/${voice}-finetune/training_state/${last_state}.state"),
+                        gr.Checkbox(label="Half Precision", value=False),
                     ]
                     dataset_list = gr.Dropdown( get_dataset_list(), label="Dataset", type="value" )
                     training_settings = training_settings + [ dataset_list ]
                     refresh_dataset_list = gr.Button(value="Refresh Dataset List")
-                    """
-                    training_settings = training_settings + [
-                        gr.Textbox(label="Training Name", placeholder="finetune"),
-                        gr.Textbox(label="Dataset Name", placeholder="finetune"),
-                        gr.Textbox(label="Dataset Path", placeholder="./training/finetune/train.txt"),
-                        gr.Textbox(label="Validation Name", placeholder="finetune"),
-                        gr.Textbox(label="Validation Path", placeholder="./training/finetune/train.txt"),
-                    ]
-                    """
                 with gr.Column():
                     save_yaml_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
                     optimize_yaml_button = gr.Button(value="Validate Training Configuration")
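
For reference, the half-precision conversion this patch wires up amounts to the sketch below: load the autoregressive checkpoint, cast every tensor in its state dict to float16, and save it next to the original with a "_half" suffix, which save_training_settings() then points the generated config at via pretrain_model_gpt and float16: true. This is only an illustrative standalone version, not the patch itself: the hardcoded path and the convert_to_half name are assumptions, whereas the patched code resolves the path through get_model_path() and uses the get_halfp_model()/convert_to_halfp() helpers.

    import torch

    def convert_to_half(model_path='./models/tortoise/autoregressive.pth'):
        # Illustrative path; the patched code resolves it via get_model_path('autoregressive.pth').
        out_path = model_path.replace('.pth', '_half.pth')

        # Load the checkpoint on the CPU and cast every tensor in the state
        # dict to float16, mirroring the patched convert_to_halfp().
        state_dict = torch.load(model_path, map_location='cpu')
        for k in state_dict:
            state_dict[k] = state_dict[k].half()

        torch.save(state_dict, out_path)
        return out_path

Since float16 weights take half the bytes of float32, the "_half" checkpoint roughly halves the weight footprint on disk and in VRAM, which is what makes the 10GiB-and-below cards mentioned in the commit message plausible targets.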