model: extensibletrainer
scale: 1
gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
start_step: 0
checkpointing_enabled: true
fp16: False
bitsandbytes: True
gpus: 1

datasets:
  train:
    name: training
    n_workers: 2
    batch_size: 128
    mode: paired_voice_audio
    path: ./training/vlaams/train.txt
    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995 # ~11.6 seconds
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./models/tokenizers/dutch_vl.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
    n_workers: 2
    batch_size: 4
    mode: paired_voice_audio
    path: ./training/vlaams/validation.txt
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./models/tokenizers/dutch_vl.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float 1e-05 # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]

    losses:
      text_ce:
        type: direct
        weight: 1
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      tortoise_compat: True
      # freeze_everything_but_position_embeddings: True

path:
  strict_load: true
  pretrain_model_gpt: './models/tortoise/autoregressive.pth'
  # resume_state: ''

train:
  niter: 3000
  warmup_iter: -1
  mega_batch_factor: 32
  val_freq: 200
  ema_enabled: false # I really don't think EMA matters
  default_lr_scheme: MultiStepLR
  gen_lr_steps: [400, 800, 1800, 3600, 5000, 6600, 10000]
  lr_gamma: 0.5

eval:
  pure: False
  output_state: gen

logger:
  save_checkpoint_freq: 200
  visuals: [gen, mel]
  visual_debug_rate: 200
  is_mel_spectrogram: true
```
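Both dataset entries use `fetcher_mode: ['lj']`, which points the loader at LJSpeech-style metadata, and they set hard limits of `max_wav_length: 255995` samples at 22050 Hz (~11.6 s) and `max_text_length: 200`. Before starting a run it can help to verify the data actually fits those limits. The sketch below is a minimal pre-flight check under the assumption that `train.txt` contains pipe-separated `wav_path|transcript` lines with paths relative to the metadata file and plain PCM WAV clips; adjust the parsing if your layout differs.

```python
# Minimal pre-flight check for the paired_voice_audio dataset configured above.
# Assumes LJSpeech-style "wav_path|transcript" lines (fetcher_mode: ['lj']) with
# paths relative to the metadata file, and plain PCM WAV clips -- adjust if needed.
import wave
from pathlib import Path

TRAIN_TXT = Path("./training/vlaams/train.txt")
SAMPLE_RATE = 22050        # sample_rate in the config
MAX_WAV_LENGTH = 255995    # max_wav_length, in samples (~11.6 s)
MAX_TEXT_LENGTH = 200      # max_text_length, in characters

for line in TRAIN_TXT.read_text(encoding="utf-8").splitlines():
    if not line.strip():
        continue
    wav_rel, text = line.split("|", maxsplit=1)
    wav_path = TRAIN_TXT.parent / wav_rel
    with wave.open(str(wav_path), "rb") as w:
        rate, n_samples = w.getframerate(), w.getnframes()
    if rate != SAMPLE_RATE:
        print(f"{wav_rel}: {rate} Hz, expected {SAMPLE_RATE} Hz")
    if n_samples > MAX_WAV_LENGTH:
        print(f"{wav_rel}: {n_samples} samples ({n_samples / rate:.1f} s) exceeds max_wav_length")
    if len(text) > MAX_TEXT_LENGTH:
        print(f"{wav_rel}: transcript is {len(text)} chars, exceeds max_text_length")
```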
"lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[SPACE]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": null, "pre_tokenizer": { "type": "Whitespace" }, "post_processor": null, "decoder": null, "model": { "type": "BPE", "language": "nl", "dropout": null, "unk_token": "[UNK]", "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "vocab": { "[STOP]": 0, "[UNK]": 1, "[SPACE]": 2, "!": 3, "\"": 4, "$": 5, "&": 6, "'": 7, "(": 8, ")": 9, "*": 10, ",": 11, "-": 12, ".": 13, "/": 14, "0": 15, "1": 16, "2": 17, "3": 18, "4": 19, "5": 20, "6": 21, "7": 22, "8": 23, "9": 24, ":": 25, ";": 26, "<": 27, "=": 28, ">": 29, "?": 30, "A": 31, "B": 32, "C": 33, "D": 34, "E": 35, "F": 36, "G": 37, "H": 38, "I": 39, "J": 40, "K": 41, "L": 42, "M": 43, "N": 44, "O": 45, "P": 46, "Q": 47, "R": 48, "S": 49, "T": 50, "U": 51, "V": 52, "W": 53, "X": 54, "Y": 55, "Z": 56, "a": 57, "b": 58, "c": 59, "d": 60, "e": 61, "f": 62, "g": 63, "h": 64, "i": 65, "j": 66, "k": 67, "l": 68, "m": 69, "n": 70, "o": 71, "p": 72, "q": 73, "r": 74, "s": 75, "t": 76, "u": 77, "v": 78, "w": 79, "x": 80, "y": 81, "z": 82, "©": 83, "«": 84, "°": 85, "»": 86, "¿": 87, "Ó": 88, "Ö": 89, "Ü": 90, "ß": 91, "à": 92, "á": 93, "ä": 94, "ç": 95, "è": 96, "é": 97, "ê": 98, "ë": 99, "í": 100, "î": 101, "ï": 102, "ñ": 103, "ò": 104, "ó": 105, "ô": 106, "ö": 107, "ú": 108, "û": 109, "ü": 110, "č": 111, "ę": 112, "ł": 113, "œ": 114, "ř": 115, "ś": 116, "ƒ": 117, "α": 118, "π": 119, "–": 120, "‘": 121, "’": 122, "“": 123, "”": 124, "•": 125, "…": 126, "Ω": 127, "ℵ": 128, "en": 129, "er": 130, "ij": 131, "de": 132, "et": 133, "aa": 134, "an": 135, "el": 136, "in": 137, "st": 138, "ch": 139, "aar": 140, "oo": 141, "at": 142, "een": 143, "ge": 144, "on": 145, "ie": 146, "te": 147, "het": 148, "al": 149, "ver": 150, "op": 151, "ijn": 152, "van": 153, "ze": 154, "gen": 155, "oe": 156, "wa": 157, "ee": 158, "it": 159, "den": 160, "oor": 161, "hij": 162, "dat": 163, "cht": 164, "der": 165, "is": 166, "iet": 167, "zijn": 168, "he": 169, "om": 170, "be": 171, "aan": 172, "je": 173, "ou": 174, "ken": 175, "niet": 176, "ik": 177, "ar": 178, "eer": 179, "or": 180, "sch": 181, "was": 182, "le": 183, "die": 184, "met": 185, "ad": 186, "ijk": 187, "zi": 188, "ing": 189, "re": 190, "ur": 191, "uit": 192, "we": 193, "had": 194, "il": 195, "to": 196, "ig": 197, "ven": 198, "voor": 199, "zei": 200, "ol": 201, "no": 202, "acht": 203, "am": 204, "maar": 205, "ten": 206, "als": 207, "naar": 208, "us": 209, "ien": 210, "gr": 211, "hem": 212, "gel": 213, "un": 214, "af": 215, "vr": 216, "over": 217, "id": 218, "haar": 219, "of": 220, "zo": 221, "ste": 222, "and": 223, "Hij": 224, "men": 225, "sp": 226, "dr": 227, "la": 228, "waar": 229, "arr": 230, "Harr": 231, "lijk": 232, "Harry": 233, "zich": 234, "ter": 235, "ond": 236, ".’": 237, "aal": 238, "ui": 239, "wer": 240, "ier": 241, "nog": 242, "door": 243, "Ik": 244, "dan": 245, "ro": 246, "ook": 247, "aat": 248, "heb": 249, "ben": 250, "bl": 251, "ag": 252, "bij": 253, "ak": 254 }, "merges": [ "e n", "e r", "i j", "d e", "e t", "a a", "a n", "e l", "i n", "s t", "c h", "aa r", "o o", "a t", "e en", "g e", "o n", "i e", "t e", "h et", "a l", "v er", "o p", "ij n", "v an", "z e", "g en", "o e", "w a", "e e", "i t", "d en", "oo r", "h 
ij", "d at", "ch t", "d er", "i s", "i et", "z ijn", "h e", "o m", "b e", "aa n", "j e", "o u", "k en", "n iet", "i k", "a r", "e er", "o r", "s ch", "wa s", "l e", "d ie", "m et", "a d", "ij k", "z i", "in g", "r e", "u r", "u it", "w e", "h ad", "i l", "t o", "i g", "v en", "v oor", "ze i", "o l", "n o", "a cht", "a m", "m aar", "t en", "al s", "n aar", "u s", "i en", "g r", "he m", "g el", "u n", "a f", "v r", "o ver", "i d", "h aar", "o f", "z o", "st e", "an d", "H ij", "m en", "s p", "d r", "l a", "w aar", "ar r", "H arr", "l ijk", "Harr y", "zi ch", "t er", "on d", ". ’", "aa l", "u i", "w er", "i er", "no g", "d oor", "I k", "d an", "r o", "oo k", "aa t", "he b", "b en", "b l", "a g", "b ij", "a k" ] } }