diff --git a/models/.template.yaml b/models/.template.yaml
index 40909f1..825ac0c 100755
--- a/models/.template.yaml
+++ b/models/.template.yaml
@@ -1,21 +1,21 @@
 name: ${name}
 model: extensibletrainer
 scale: 1
-gpu_ids: [0] # <-- unless you have multiple gpus, use this
+gpu_ids: [0] # Superfluous; the way you launch the training script will set this
 start_step: 0
-checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
-fp16: ${float16} # might want to check this out
-wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
+checkpointing_enabled: true
+fp16: ${float16}
+wandb: false
 use_tb_logger: true
 
 datasets:
   train:
     name: ${dataset_name}
-    n_workers: 8 # idk what this does
-    batch_size: ${batch_size} # This leads to ~16GB of vram usage on my 3090.
+    n_workers: 8
+    batch_size: ${batch_size}
     mode: paired_voice_audio
     path: ${dataset_path}
-    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
+    fetcher_mode: ['lj']
     phase: train
     max_wav_length: 255995
     max_text_length: 200
@@ -29,11 +29,11 @@ datasets:
   val: # I really do not care about validation right now
     name: ${validation_name}
     n_workers: 1
-    batch_size: 1 # this could be higher probably
+    batch_size: 1
     mode: paired_voice_audio
     path: ${validation_path}
     fetcher_mode: ['lj']
-    phase: val # might be broken idk
+    phase: val
     max_wav_length: 255995
     max_text_length: 200
     sample_rate: 22050
@@ -47,18 +47,18 @@ datasets:
 steps:
   gpt_train:
     training: gpt
-    loss_log_buffer: 500 # no idea what this does
+    loss_log_buffer: 500
 
     # Generally follows the recipe from the DALLE paper.
     optimizer: adamw # this should be adamw_zero if you're using distributed training
     optimizer_params:
-      lr: !!float ${learning_rate} # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
+      lr: !!float ${learning_rate} # originally: 1e-4
       weight_decay: !!float 1e-2
       beta1: 0.9
       beta2: 0.96
     clip_grad_eps: 4
 
-    injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
+    injectors:
       paired_to_mel:
         type: torch_mel_spectrogram
         mel_norm_file: ./models/tortoise/clips_mel_norms.pth
@@ -74,7 +74,7 @@ steps:
         type: discrete_token
         in: paired_mel
         out: paired_mel_codes
-        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
+        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
       paired_fwd_text:
         type: generator
         generator: gpt
@@ -95,12 +95,12 @@ networks:
     type: generator
     which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
     kwargs:
-      layers: 30 # WAS 8
-      model_dim: 1024 # WAS 512
-      heads: 16 # WAS 8
-      max_text_tokens: 402 # WAS 120
-      max_mel_tokens: 604 # WAS 250
-      max_conditioning_inputs: 2 # WAS 1
+      layers: 30 # originally: 8
+      model_dim: 1024 # originally: 512
+      heads: 16 # originally: 8
+      max_text_tokens: 402 # originally: 120
+      max_mel_tokens: 604 # originally: 250
+      max_conditioning_inputs: 2 # originally: 1
       mel_length_compression: 1024
       number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
       number_mel_codes: 8194
@@ -118,11 +118,10 @@ path:
   strict_load: true
   ${resume_state}
 
-# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
-train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
+train:
   niter: ${iterations}
   warmup_iter: -1
-  mega_batch_factor: ${mega_batch_factor} # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
+  mega_batch_factor: ${gradient_accumulation_size}
   val_freq: ${iterations}
 
   ema_enabled: false # I really don't think EMA matters
@@ -142,7 +141,7 @@ eval:
 
 logger:
   print_freq: ${print_rate}
-  save_checkpoint_freq: ${save_rate} # CHANGEME: especially you should increase this it's really slow
+  save_checkpoint_freq: ${save_rate}
   visuals: [gen, mel]
   visual_debug_rate: ${print_rate}
   is_mel_spectrogram: true
\ No newline at end of file
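Context for the `train:` hunk above: in DL-Art-School, `mega_batch_factor` is a gradient accumulation factor (the deleted inline comment advised raising it to 2, 4, or 8 when running out of VRAM). The rename is surface-level; the template still fills the same `mega_batch_factor:` key, just from a `${gradient_accumulation_size}` variable. For readers unfamiliar with the technique, here is a minimal, illustrative sketch of gradient accumulation in PyTorch. The names are hypothetical and this is not the ExtensibleTrainer's actual loop:

```python
import torch

def train_step(model, optimizer, loss_fn, inputs, targets, accumulation_size=4):
    """Hypothetical sketch: one optimizer step spread over micro-batches."""
    optimizer.zero_grad()
    # Split the logical batch into `accumulation_size` micro-batches so that
    # only one micro-batch's activations are resident in VRAM at a time.
    for micro_in, micro_tgt in zip(inputs.chunk(accumulation_size),
                                   targets.chunk(accumulation_size)):
        loss = loss_fn(model(micro_in), micro_tgt)
        # Scale so the summed gradients match a full-batch backward pass.
        (loss / accumulation_size).backward()
    optimizer.step()  # one weight update per logical batch
```

The accumulated gradient is mathematically the same as a full-batch step, which is why raising the factor trades speed for memory without changing the effective batch size.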
diff --git a/src/utils.py b/src/utils.py
index 5c173a4..46abc92 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -573,6 +573,9 @@ class TrainingState():
         infos = {}
         highest_step = self.last_info_check_at
 
+        if not update:
+            self.losses = []
+
         if use_tensorboard:
             logs = sorted([f'{self.dataset_dir}/tb_logger/{d}' for d in os.listdir(f'{self.dataset_dir}/tb_logger/') if d[:6] == "events" ])
             if update:
@@ -816,6 +819,7 @@ def update_training_dataplot(config_path=None):
             del training_state
             training_state = None
     elif training_state.losses:
+        training_state.load_losses()
         update = gr.LinePlot.update(value=pd.DataFrame(training_state.losses))
 
     return update
@@ -837,7 +841,8 @@ def stop_training():
     print("Killing training process...")
    training_state.killed = True
     training_state.process.stdout.close()
-    training_state.process.terminate()
+    #training_state.process.terminate()
+    training_state.process.send_signal(signal.SIGINT)
     return_code = training_state.process.wait()
     training_state = None
     return f"Training cancelled: {return_code}"
@@ -938,7 +943,7 @@ EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
 def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ):
     return [int(iterations * d) for d in schedule]
 
-def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
+def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
     name = f"{voice}-finetune"
     dataset_name = f"{voice}-train"
     dataset_path = f"./training/{voice}/train.txt"
@@ -959,12 +964,11 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
         batch_size = int(lines / nearest_slice)
         messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {batch_size} ({nearest_slice} steps per epoch)")
 
-    if batch_size == 1 and mega_batch_factor != 1:
-        mega_batch_factor = 1
-        messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
-    elif batch_size / mega_batch_factor < 2:
-        mega_batch_factor = int(batch_size / 2)
-        messages.append(f"Mega batch factor is too large for the given batch size, clamping mega batch factor to: {mega_batch_factor}")
+    if gradient_accumulation_size == 0:
+        gradient_accumulation_size = 1
+    elif batch_size % gradient_accumulation_size != 0:
+        gradient_accumulation_size = int(batch_size / gradient_accumulation_size)
+        messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")
 
     iterations = calc_iterations(epochs=epochs, lines=lines, batch_size=batch_size)
 
@@ -980,14 +984,18 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weight, learni
         resume_path = None
         messages.append("Resume path specified, but does not exist. Disabling...")
 
-    if half_p:
-        messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
-        if not os.path.exists(get_halfp_model_path()):
-            convert_to_halfp()
-
     if bnb:
         messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !")
 
+    if half_p:
+        if bnb:
+            half_p = False
+            messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...")
+        else:
+            messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !")
+            if not os.path.exists(get_halfp_model_path()):
+                convert_to_halfp()
+
     messages.append(f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)")
 
     return (
@@ -995,14 +1003,14 @@ def optimize_training_settings( epochs, learning_rate, text_ce_lr_weigh
         text_ce_lr_weight,
         learning_rate_schedule,
         batch_size,
-        mega_batch_factor,
+        gradient_accumulation_size,
         print_rate,
         save_rate,
         resume_path,
         messages
     )
 
-def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, mega_batch_factor=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
+def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, batch_size=None, gradient_accumulation_size=None, print_rate=None, save_rate=None, name=None, dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, output_name=None, resume_path=None, half_p=None, bnb=None, source_model=None ):
     if not source_model:
         source_model = f"./models/tortoise/autoregressive{'_half' if half_p else ''}.pth"
@@ -1011,8 +1019,8 @@ def save_training_settings( iterations=None, learning_rate=None, text_ce_lr_weig
         "batch_size": batch_size if batch_size else 64,
         "learning_rate": learning_rate if learning_rate else 1e-5,
         "gen_lr_steps": learning_rate_schedule if learning_rate_schedule else EPOCH_SCHEDULE,
-        "mega_batch_factor": mega_batch_factor if mega_batch_factor else 4,
-        "print_rate": print_rate if print_rate else 50,
+        "gradient_accumulation_size": gradient_accumulation_size if gradient_accumulation_size else 4,
+        "print_rate": print_rate if print_rate else 1,
         "save_rate": save_rate if save_rate else 50,
         "name": name if name else "finetune",
         "dataset_name": dataset_name if dataset_name else "finetune",
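On the `stop_training()` change above: `Popen.terminate()` sends SIGTERM, which kills the trainer mid-step unless it installs its own handler, whereas a Python child receives SIGINT as `KeyboardInterrupt`, so its `except`/`finally` blocks still run and it can shut down cleanly. A sketch of the pattern, with a hypothetical `train.py`; note that on Windows, delivering Ctrl+C to a child generally requires `CREATE_NEW_PROCESS_GROUP` plus `CTRL_C_EVENT` rather than `SIGINT`:

```python
import signal
import subprocess

proc = subprocess.Popen(["python", "train.py"], stdout=subprocess.PIPE)

# Ask the trainer to stop gracefully: SIGINT surfaces in the child as
# KeyboardInterrupt, giving it a chance to flush logs and save state.
proc.send_signal(signal.SIGINT)
try:
    return_code = proc.wait(timeout=60)
except subprocess.TimeoutExpired:
    proc.terminate()            # fall back to SIGTERM if it refuses to exit
    return_code = proc.wait()
```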
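`calc_iterations()` itself is not part of this diff, but the log message appended at the end of `optimize_training_settings()` pins down its arithmetic: one step consumes one batch, so steps per epoch is `lines / batch_size`. A plausible implementation consistent with that message, offered only as a sketch:

```python
def calc_iterations(epochs, lines, batch_size):
    # One iteration (step) processes one batch; gradient accumulation splits
    # a step internally but does not change the step count.
    steps_per_epoch = int(lines / batch_size)
    return epochs * steps_per_epoch
```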
diff --git a/src/webui.py b/src/webui.py
index c94c59e..db90051 100755
--- a/src/webui.py
+++ b/src/webui.py
@@ -227,9 +226,8 @@ def import_training_settings_proxy( voice ):
     messages.append(f"Basing epoch size to {lines} lines")
 
     batch_size = config['datasets']['train']['batch_size']
-    mega_batch_factor = config['train']['mega_batch_factor']
+    gradient_accumulation_size = config['train']['mega_batch_factor']
 
-
     iterations = config['train']['niter']
     steps_per_iteration = int(lines / batch_size)
     epochs = int(iterations / steps_per_iteration)
@@ -276,7 +275,7 @@ def import_training_settings_proxy( voice ):
         text_ce_lr_weight,
         learning_rate_schedule,
         batch_size,
-        mega_batch_factor,
+        gradient_accumulation_size,
         print_rate,
         save_rate,
         resume_path,
@@ -287,7 +286,7 @@
     )
 
-def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, mega_batch_factor, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
+def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size, gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, source_model, voice ):
     name = f"{voice}-finetune"
     dataset_name = f"{voice}-train"
     dataset_path = f"./training/{voice}/train.txt"
@@ -318,7 +317,7 @@ def save_training_settings_proxy( epochs, learning_rate, text_ce_lr_weight, lear
         learning_rate=learning_rate,
         text_ce_lr_weight=text_ce_lr_weight,
         learning_rate_schedule=learning_rate_schedule,
-        mega_batch_factor=mega_batch_factor,
+        gradient_accumulation_size=gradient_accumulation_size,
         print_rate=print_rate,
         save_rate=save_rate,
         name=name,
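Note that the rename stops at the YAML boundary: `save_training_settings()` writes the new `gradient_accumulation_size` value into the template's unchanged `mega_batch_factor:` key, and `import_training_settings_proxy()` above reads `config['train']['mega_batch_factor']` back into the new variable, so existing configs keep working in both directions. Schematically, with illustrative values:

```python
# Saving: the template line `mega_batch_factor: ${gradient_accumulation_size}`
# fills the old YAML key from the new setting name.
settings = {"gradient_accumulation_size": 4}
yaml_line = f"mega_batch_factor: {settings['gradient_accumulation_size']}"

# Importing: the old YAML key is read back into the new variable.
config = {"train": {"mega_batch_factor": 4}}
gradient_accumulation_size = config["train"]["mega_batch_factor"]
```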
@@ -489,7 +488,7 @@ def setup_gradio():
                 with gr.Row():
                     training_settings = training_settings + [
                         gr.Number(label="Batch Size", value=128, precision=0),
-                        gr.Number(label="Mega Batch Factor", value=4, precision=0),
+                        gr.Number(label="Gradient Accumulation Size", value=4, precision=0),
                     ]
                 with gr.Row():
                     training_settings = training_settings + [
@@ -534,8 +533,10 @@ def setup_gradio():
                 with gr.Column():
                     training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
                     verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
-                    training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
-                    training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
+
+                    with gr.Row():
+                        training_buffer_size = gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
+                        training_keep_x_past_datasets = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
                     training_gpu_count = gr.Number(label="GPUs", value=1)
                     with gr.Row():
                         start_training_button = gr.Button(value="Train")
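The last hunk is purely cosmetic: components created directly inside a `gr.Column()` stack vertically, while wrapping them in a `gr.Row()` places them side by side. A standalone sketch of the layout idea (Gradio 3.x Blocks API, illustrative labels):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Column():
        gr.TextArea(label="Console Output", interactive=False, max_lines=8)  # stacked, full width
        with gr.Row():  # these two sliders now share one row
            gr.Slider(label="Console Buffer Size", minimum=4, maximum=32, value=8)
            gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
        gr.Number(label="GPUs", value=1)

demo.launch()
```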