forked from mrq/ai-voice-cloning
tab to generate the training YAML
parent 3a078df95e
commit f8249aa826
@@ -0,0 +1,35 @@
import os
import torch
import argparse

from ..dlas.codes import *
from ..dlas.codes.utils import util, options as option

parser = argparse.ArgumentParser()
parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_vit_latent.yml')
parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
args = parser.parse_args()
opt = option.parse(args.opt, is_train=True)
if args.launcher != 'none':
    # export CUDA_VISIBLE_DEVICES for running in distributed mode.
    if 'gpu_ids' in opt.keys():
        gpu_list = ','.join(str(x) for x in opt['gpu_ids'])
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
        print('export CUDA_VISIBLE_DEVICES=' + gpu_list)
trainer = Trainer()

#### distributed training settings
if args.launcher == 'none':  # disabled distributed training
    opt['dist'] = False
    trainer.rank = -1
    if len(opt['gpu_ids']) == 1:
        torch.cuda.set_device(opt['gpu_ids'][0])
    print('Disabled distributed training.')
else:
    opt['dist'] = True
    init_dist('nccl')
    trainer.world_size = torch.distributed.get_world_size()
    trainer.rank = torch.distributed.get_rank()
    torch.cuda.set_device(torch.distributed.get_rank())

trainer.init(args.opt, opt, args.launcher)
trainer.do_training()
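Note: Trainer and init_dist are not defined in this file; they come in through the wildcard import from ..dlas.codes. For the --launcher pytorch branch, the script assumes it is started once per GPU by torch.distributed.launch / torchrun, which set RANK, WORLD_SIZE and MASTER_ADDR in the environment. A minimal sketch of what an init_dist('nccl') helper of this kind typically does (an assumption for illustration, not the exact DLAS implementation):

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def init_dist(backend='nccl', **kwargs):
    # Assumed behaviour: switch to the 'spawn' start method, bind this process
    # to its GPU, then join the process group via the default env:// rendezvous
    # that the launcher configured through RANK/WORLD_SIZE/MASTER_ADDR.
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)
    rank = int(os.environ['RANK'])
    torch.cuda.set_device(rank % torch.cuda.device_count())
    dist.init_process_group(backend=backend, **kwargs)

With the default --launcher none, this branch is skipped entirely and training runs on the single device listed in gpu_ids.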
@@ -0,0 +1,144 @@
name: ${name}
model: extensibletrainer
scale: 1
gpu_ids: [0] # <-- use this unless you have multiple gpus
start_step: -1
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
fp16: false # might want to check this out
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
use_tb_logger: true

datasets:
  train:
    name: ${dataset_name}
    n_workers: 8 # idk what this does
    batch_size: ${batch_size} # This leads to ~16GB of vram usage on my 3090.
    mode: paired_voice_audio
    path: ${dataset_path}
    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
    phase: train
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    load_aligned_codes: False
  val:
    name: ${validation_name}
    n_workers: 1
    batch_size: 32 # this could probably be higher
    mode: paired_voice_audio
    path: ${validation_path}
    fetcher_mode: ['lj']
    phase: val # might be broken idk
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500 # no idea what this does

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float ${learning_rate} # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./experiments/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./experiments/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]
    losses:
      text_ce:
        type: direct
        weight: .01
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
    kwargs:
      layers: 30 # WAS 8
      model_dim: 1024 # WAS 512
      heads: 16 # WAS 8
      max_text_tokens: 402 # WAS 120
      max_mel_tokens: 604 # WAS 250
      max_conditioning_inputs: 2 # WAS 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      #types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
      #only_alignment_head: False # uv3/4

path:
  pretrain_model_gpt: './experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
  strict_load: true
  #resume_state: ./experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.

# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
  niter: 50000
  warmup_iter: -1
  mega_batch_factor: 4 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000]
  lr_gamma: 0.5

eval:
  output_state: gen
  injectors:
    gen_inj_eval:
      type: generator
      generator: generator
      in: hq
      out: [gen, codebook_commitment_loss]

logger:
  print_freq: 100
  save_checkpoint_freq: 500 # CHANGEME: you should increase this in particular; saving checkpoints is really slow
  visuals: [gen, mel]
  visual_debug_rate: 500
  is_mel_spectrogram: true
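The ${name}, ${dataset_name}, ${dataset_path}, ${validation_name}, ${validation_path}, ${batch_size} and ${learning_rate} placeholders above are what the YAML-generating tab fills in. A minimal sketch of that substitution, assuming Python's string.Template (whose ${...} syntax matches these placeholders); the paths and values are illustrative assumptions, not the project's actual file layout:

from pathlib import Path
from string import Template

# Illustrative settings -- replace with your own voice/dataset values.
settings = {
    'name': 'myvoice',
    'dataset_name': 'myvoice-train',
    'dataset_path': './training/myvoice/train.txt',
    'validation_name': 'myvoice-val',
    'validation_path': './training/myvoice/validation.txt',
    'batch_size': 128,
    'learning_rate': 1e-5,
}

# Hypothetical template location; read the YAML above with its ${...} markers intact.
template = Template(Path('./training/train_template.yaml').read_text())
Path('./training/myvoice/train.yaml').write_text(template.substitute(settings))

The generated train.yaml is then passed to the training script above via -opt.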