# ai-voice-cloning/models/.template.dlas.yaml

# Top-level trainer options. Tokens of the form ${...} are template
# placeholders that are meant to be substituted before the file is used.
name: '${voice}'
model: extensibletrainer
scale: 1
# Manually edit this if the GPU you want to train on is not your primary, as
# this will set the env var that exposes CUDA devices.
gpu_ids: [0]
start_step: 0
checkpointing_enabled: true
# Precision, bitsandbytes and GPU-count settings come from template variables.
fp16: ${half_p}
bitsandbytes: ${bitsandbytes}
gpus: ${gpus}

# Dataset definitions. `train` and `val` are deliberately parallel: they differ
# only in name, phase, batch size and dataset path.
# NOTE(review): the path/tokenizer placeholders are unquoted, so a rendered
# value containing " #" or ": " would change how the line parses — confirm the
# generator only emits safe values.
datasets:
  train:
    name: training
    n_workers: ${workers}
    batch_size: ${batch_size}
    mode: paired_voice_audio
    path: ${dataset_path}
    fetcher_mode: ['lj']
    phase: train
    # ~11.6 seconds at the 22050 Hz sample rate below
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: true
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: true
    # e.g. ./models/tortoise/bpe_lowercase_asr_256.json
    tokenizer_vocab: ${tokenizer_json}
    load_aligned_codes: false
  val:
    name: validation
    n_workers: ${workers}
    batch_size: ${validation_batch_size}
    mode: paired_voice_audio
    path: ${validation_path}
    fetcher_mode: ['lj']
    phase: val
    # Same clip-length and text-length limits as the training split.
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: true
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: true
    # e.g. ./models/tortoise/bpe_lowercase_asr_256.json
    tokenizer_vocab: ${tokenizer_json}
    load_aligned_codes: false

# Training steps. A single step, gpt_train, is defined; its `training` target
# is the `gpt` network.
steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    # The optimizer should be adamw_zero if you're using distributed training.
    optimizer: ${optimizer}
    optimizer_params:
      # originally: 1e-4
      lr: !!float ${learning_rate}
      # The explicit !!float tag is kept because exponent notation without a
      # decimal point is not resolved as a float by every YAML loader.
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    # Injectors are kept in their original order, which follows the data flow:
    # wav -> mel, conditioning -> mel, mel -> discrete codes, then the gpt
    # forward pass that emits the loss values.
    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        # other path noted by the author: ./models/tortoise/clips_mel_norms.pth
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        # other path noted by the author: ./models/tortoise/clips_mel_norms.pth
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: './models/tortoise/train_diffusion_vocoder_22k_level.yml'
      paired_fwd_text:
        type: generator
        generator: gpt
        in:
          - paired_conditioning_mel
          - padded_text
          - text_lengths
          - paired_mel_codes
          - wav_lengths
        out:
          - loss_text_ce
          - loss_mel_ce
          - logits
    # Each loss reads one of the keys emitted by paired_fwd_text above and is
    # scaled by its own template-supplied weight.
    losses:
      text_ce:
        type: direct
        weight: ${text_lr_weight}
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: ${mel_lr_weight}
        key: loss_mel_ce

# Network definitions. `gpt` is the generator that steps.gpt_train trains and
# that the paired_fwd_text injector calls.
networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      # Model size and sequence limits; "originally" values are the author's
      # notes on the earlier settings.
      layers: 30  # originally: 8
      model_dim: 1024  # originally: 512
      heads: 16  # originally: 8
      max_text_tokens: 402  # originally: 120
      max_mel_tokens: 604  # originally: 250
      max_conditioning_inputs: 2  # originally: 1
      mel_length_compression: 1024
      # Token id layout. Assuming 0-based ids, start_text_token is the last of
      # the 256 text ids and the mel start/stop tokens are the last two of the
      # 8194 mel ids.
      number_text_tokens: 256  # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: false  # missing in uv3/4
      use_mel_codes_as_input: true  # ditto
      checkpointing: true
      tortoise_compat: true
      # freeze_everything_but_position_embeddings: True

# Checkpoint loading options. The two bare placeholder lines below carry no
# key of their own, so this template is not valid YAML until it is rendered;
# each one has to expand to a complete `key: value` entry (or to nothing).
# NOTE(review): presumably these become the pretrained-model and resume-state
# entries — confirm against the code that renders this template.
path:
  strict_load: true
  ${source_model} 
  ${resume_state}

# Training schedule: iteration count, gradient accumulation and validation
# frequency are all supplied through template variables.
train:
  niter: ${iterations}
  warmup_iter: -1
  # Filled from the gradient accumulation size template variable.
  mega_batch_factor: ${gradient_accumulation_size}
  val_freq: ${validation_rate}

  ema_enabled: false # I really don't think EMA matters

  # Bare placeholder with no key of its own: it has to expand to one or more
  # complete entries. NOTE(review): presumably the expansion relies on this
  # two-space indent to stay inside `train` — confirm before re-indenting.
  ${learning_rate_scheme}

# Evaluation settings; `pure` is set from the validation_enabled template
# variable.
eval:
  pure: ${validation_enabled}
  output_state: gen

# Logging options. Checkpoint saving and visual debug output are both set from
# the same ${save_rate} template variable.
logger:
  save_checkpoint_freq: ${save_rate}
  visuals:
    - gen
    - mel
  visual_debug_rate: ${save_rate}
  is_mel_spectrogram: true
only God knows why the YAML spec lets you specify string values without quotes 2023-03-10 01:58:30 +00:00			`name: '${voice}'`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`model: extensibletrainer`
			`scale: 1`
added mel LR weight (as I finally understand when to adjust the text), added text validation on dataset creation 2023-03-13 18:51:53 +00:00			`gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices`
huge success 2023-02-23 06:24:54 +00:00			`start_step: 0`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`checkpointing_enabled: true`
big cleanup to make my life easier when i add more parameters 2023-03-09 00:26:47 +00:00			`fp16: ${half_p}`
			`bitsandbytes: ${bitsandbytes}`
			`gpus: ${gpus}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00
			`datasets:`
			`train:`
big cleanup to make my life easier when i add more parameters 2023-03-09 00:26:47 +00:00			`name: training`
added option to set worker size in training config generator (because the default is overkill), for whisper transcriptions, load a specialized language model if it exists (for now, only english), output transcription to web UI when done transcribing 2023-03-05 05:17:19 +00:00			`n_workers: ${workers}`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`batch_size: ${batch_size}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`mode: paired_voice_audio`
			`path: ${dataset_path}`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`fetcher_mode: ['lj']`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`phase: train`
added mel LR weight (as I finally understand when to adjust the text), added text validation on dataset creation 2023-03-13 18:51:53 +00:00			`max_wav_length: 255995 # ~11.6 seconds`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`max_text_length: 200`
			`sample_rate: 22050`
			`load_conditioning: True`
			`num_conditioning_candidates: 2`
			`conditioning_length: 44000`
			`use_bpe_tokenizer: True`
added options to pick tokenizer json and diffusion model (so I don't have to add it in later when I get bored and add in diffusion training) 2023-03-15 00:37:38 +00:00			`tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`load_aligned_codes: False`
added mel LR weight (as I finally understand when to adjust the text), added text validation on dataset creation 2023-03-13 18:51:53 +00:00			`val:`
big cleanup to make my life easier when i add more parameters 2023-03-09 00:26:47 +00:00			`name: validation`
made validation working (will document later) 2023-03-08 02:58:00 +00:00			`n_workers: ${workers}`
disable validation if validation dataset not found, clamp validation batch size to validation dataset size instead of simply reusing batch size, switch to adamw_zero optimizer when training with multi-gpus (because the yaml comment said to and I think it might be why I'm absolutely having garbage luck training this japanese dataset) 2023-03-08 04:47:05 +00:00			`batch_size: ${validation_batch_size}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`mode: paired_voice_audio`
			`path: ${validation_path}`
			`fetcher_mode: ['lj']`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`phase: val`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`max_wav_length: 255995`
			`max_text_length: 200`
			`sample_rate: 22050`
			`load_conditioning: True`
			`num_conditioning_candidates: 2`
			`conditioning_length: 44000`
			`use_bpe_tokenizer: True`
added options to pick tokenizer json and diffusion model (so I don't have to add it in later when I get bored and add in diffusion training) 2023-03-15 00:37:38 +00:00			`tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`load_aligned_codes: False`

			`steps:`
			`gpt_train:`
			`training: gpt`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`loss_log_buffer: 500`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00
			`# Generally follows the recipe from the DALLE paper.`
forgot template 2023-03-09 00:32:35 +00:00			`optimizer: ${optimizer} # this should be adamw_zero if you're using distributed training`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`optimizer_params:`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`lr: !!float ${learning_rate} # originally: 1e-4`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`weight_decay: !!float 1e-2`
			`beta1: 0.9`
			`beta2: 0.96`
			`clip_grad_eps: 4`

renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`injectors:`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`paired_to_mel:`
			`type: torch_mel_spectrogram`
removed redundant training data (they exist within tortoise itself anyways), added utility: view tokenized text 2023-03-14 21:51:27 +00:00			`mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`in: wav`
			`out: paired_mel`
			`paired_cond_to_mel:`
			`type: for_each`
			`subtype: torch_mel_spectrogram`
removed redundant training data (they exist within tortoise itself anyways), added utility: view tokenized text 2023-03-14 21:51:27 +00:00			`mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`in: conditioning`
			`out: paired_conditioning_mel`
			`to_codes:`
			`type: discrete_token`
			`in: paired_mel`
			`out: paired_mel_codes`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`paired_fwd_text:`
			`type: generator`
			`generator: gpt`
			`in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]`
			`out: [loss_text_ce, loss_mel_ce, logits]`
			`losses:`
			`text_ce:`
			`type: direct`
added mel LR weight (as I finally understand when to adjust the text), added text validation on dataset creation 2023-03-13 18:51:53 +00:00			`weight: ${text_lr_weight}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`key: loss_text_ce`
			`mel_ce:`
			`type: direct`
added mel LR weight (as I finally understand when to adjust the text), added text validation on dataset creation 2023-03-13 18:51:53 +00:00			`weight: ${mel_lr_weight}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`key: loss_mel_ce`

			`networks:`
			`gpt:`
added the mysterious tortoise_compat flag mentioned in DLAS repo 2023-03-09 03:41:40 +00:00			`type: generator`
			`which_model_G: unified_voice2`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`kwargs:`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`layers: 30 # originally: 8`
			`model_dim: 1024 # originally: 512`
			`heads: 16 # originally: 8`
			`max_text_tokens: 402 # originally: 120`
			`max_mel_tokens: 604 # originally: 250`
			`max_conditioning_inputs: 2 # originally: 1`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`mel_length_compression: 1024`
			`number_text_tokens: 256 # supposed to be 255 for newer unified_voice files`
			`number_mel_codes: 8194`
			`start_mel_token: 8192`
			`stop_mel_token: 8193`
			`start_text_token: 255`
			`train_solo_embeddings: False # missing in uv3/4`
			`use_mel_codes_as_input: True # ditto`
			`checkpointing: True`
added the mysterious tortoise_compat flag mentioned in DLAS repo 2023-03-09 03:41:40 +00:00			`tortoise_compat: True`
			`# freeze_everything_but_position_embeddings: True`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00
			`path:`
			`strict_load: true`
big cleanup to make my life easier when i add more parameters 2023-03-09 00:26:47 +00:00			`${source_model}`
added more safeties and parameters to training yaml generator, I think I tested it extensively enough 2023-02-19 16:16:44 +00:00			`${resume_state}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`train:`
oops 2023-02-18 15:50:51 +00:00			`niter: ${iterations}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`warmup_iter: -1`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`mega_batch_factor: ${gradient_accumulation_size}`
set validation to save rate and validation file if exists (need to test later) 2023-03-07 20:38:31 +00:00			`val_freq: ${validation_rate}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00
Added very experimental float16 training for cards with not enough VRAM (10GiB and below, maybe) \!NOTE\! this is VERY EXPERIMENTAL, I have zero free time to validate it right now, I'll do it later 2023-02-21 19:31:57 +00:00			`ema_enabled: false # I really don't think EMA matters`

actually make using adamw_zero optimizer for multi-gpus work 2023-03-08 15:31:33 +00:00			`${learning_rate_scheme}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00
			`eval:`
disable validation if validation dataset not found, clamp validation batch size to validation dataset size instead of simply reusing batch size, switch to adamw_zero optimizer when training with multi-gpus (because the yaml comment said to and I think it might be why I'm absolutely having garbage luck training this japanese dataset) 2023-03-08 04:47:05 +00:00			`pure: ${validation_enabled}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`output_state: gen`

			`logger:`
renamed mega batch factor to an actual real term: gradient accumulation factor, fixed halting training not actually killing the training process and freeing up resources, some logic cleanup for gradient accumulation (so many brain worms and wrong assumptions from testing on low batch sizes) (read the training section in the wiki for more details) 2023-03-04 15:55:06 +00:00			`save_checkpoint_freq: ${save_rate}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`visuals: [gen, mel]`
cleanups and fixes, fix DLAS throwing errors from '''too short of sound files''' by just culling them during transcription 2023-03-11 01:19:49 +00:00			`visual_debug_rate: ${save_rate}`
tab to generate the training YAML 2023-02-17 03:05:27 +00:00			`is_mel_spectrogram: true`