model: extensibletrainer
scale: 1
gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
start_step: 0
checkpointing_enabled: true
fp16: False
bitsandbytes: True
gpus: 1

datasets:
  train:
    name: training
    n_workers: 2
    batch_size: 128
    mode: paired_voice_audio
    path: ./training/vlaams/train.txt
    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995 # ~11.6 seconds
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./models/tokenizers/dutch_vl.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
    n_workers: 2
    batch_size: 4
    mode: paired_voice_audio
    path: ./training/vlaams/validation.txt
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./models/tokenizers/dutch_vl.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float 1e-05 # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]

    losses:
      text_ce:
        type: direct
        weight: 1
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      tortoise_compat: True
      # freeze_everything_but_position_embeddings: True

path:
  strict_load: true
  pretrain_model_gpt: './models/tortoise/autoregressive.pth'
  # resume_state: ''

train:
  niter: 3000
  warmup_iter: -1
  mega_batch_factor: 32
  val_freq: 200
  ema_enabled: false # I really don't think EMA matters
  default_lr_scheme: MultiStepLR
  gen_lr_steps: [400, 800, 1800, 3600, 5000, 6600, 10000]
  lr_gamma: 0.5

eval:
  pure: False
  output_state: gen

logger:
  save_checkpoint_freq: 200
  visuals: [gen, mel]
  visual_debug_rate: 200
  is_mel_spectrogram: true
```
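Both dataset entries use `fetcher_mode: ['lj']`, which points the loader at LJSpeech-style metadata, and they set hard limits of `max_wav_length: 255995` samples at 22050 Hz (~11.6 s) and `max_text_length: 200`. Before starting a run it can help to verify the data actually fits those limits. The sketch below is a minimal pre-flight check under the assumption that `train.txt` contains pipe-separated `wav_path|transcript` lines with paths relative to the metadata file and plain PCM WAV clips; adjust the parsing if your layout differs.

```python
# Minimal pre-flight check for the paired_voice_audio dataset configured above.
# Assumes LJSpeech-style "wav_path|transcript" lines (fetcher_mode: ['lj']) with
# paths relative to the metadata file, and plain PCM WAV clips -- adjust if needed.
import wave
from pathlib import Path

TRAIN_TXT = Path("./training/vlaams/train.txt")
SAMPLE_RATE = 22050        # sample_rate in the config
MAX_WAV_LENGTH = 255995    # max_wav_length, in samples (~11.6 s)
MAX_TEXT_LENGTH = 200      # max_text_length, in characters

for line in TRAIN_TXT.read_text(encoding="utf-8").splitlines():
    if not line.strip():
        continue
    wav_rel, text = line.split("|", maxsplit=1)
    wav_path = TRAIN_TXT.parent / wav_rel
    with wave.open(str(wav_path), "rb") as w:
        rate, n_samples = w.getframerate(), w.getnframes()
    if rate != SAMPLE_RATE:
        print(f"{wav_rel}: {rate} Hz, expected {SAMPLE_RATE} Hz")
    if n_samples > MAX_WAV_LENGTH:
        print(f"{wav_rel}: {n_samples} samples ({n_samples / rate:.1f} s) exceeds max_wav_length")
    if len(text) > MAX_TEXT_LENGTH:
        print(f"{wav_rel}: transcript is {len(text)} chars, exceeds max_text_length")
```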
"lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[SPACE]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": null, "pre_tokenizer": { "type": "Whitespace" }, "post_processor": null, "decoder": null, "model": { "type": "BPE", "language": "nl", "dropout": null, "unk_token": "[UNK]", "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "vocab": { "[STOP]": 0, "[UNK]": 1, "[SPACE]": 2, "!": 3, "\"": 4, "$": 5, "&": 6, "'": 7, "(": 8, ")": 9, "*": 10, ",": 11, "-": 12, ".": 13, "/": 14, "0": 15, "1": 16, "2": 17, "3": 18, "4": 19, "5": 20, "6": 21, "7": 22, "8": 23, "9": 24, ":": 25, ";": 26, "<": 27, "=": 28, ">": 29, "?": 30, "A": 31, "B": 32, "C": 33, "D": 34, "E": 35, "F": 36, "G": 37, "H": 38, "I": 39, "J": 40, "K": 41, "L": 42, "M": 43, "N": 44, "O": 45, "P": 46, "Q": 47, "R": 48, "S": 49, "T": 50, "U": 51, "V": 52, "W": 53, "X": 54, "Y": 55, "Z": 56, "a": 57, "b": 58, "c": 59, "d": 60, "e": 61, "f": 62, "g": 63, "h": 64, "i": 65, "j": 66, "k": 67, "l": 68, "m": 69, "n": 70, "o": 71, "p": 72, "q": 73, "r": 74, "s": 75, "t": 76, "u": 77, "v": 78, "w": 79, "x": 80, "y": 81, "z": 82, "©": 83, "«": 84, "°": 85, "»": 86, "¿": 87, "Ó": 88, "Ö": 89, "Ü": 90, "ß": 91, "à": 92, "á": 93, "ä": 94, "ç": 95, "è": 96, "é": 97, "ê": 98, "ë": 99, "í": 100, "î": 101, "ï": 102, "ñ": 103, "ò": 104, "ó": 105, "ô": 106, "ö": 107, "ú": 108, "û": 109, "ü": 110, "č": 111, "ę": 112, "ł": 113, "œ": 114, "ř": 115, "ś": 116, "ƒ": 117, "α": 118, "π": 119, "–": 120, "‘": 121, "’": 122, "“": 123, "”": 124, "•": 125, "…": 126, "Ω": 127, "ℵ": 128, "en": 129, "er": 130, "ij": 131, "de": 132, "et": 133, "aa": 134, "an": 135, "el": 136, "in": 137, "st": 138, "ch": 139, "aar": 140, "oo": 141, "at": 142, "een": 143, "ge": 144, "on": 145, "ie": 146, "te": 147, "het": 148, "al": 149, "ver": 150, "op": 151, "ijn": 152, "van": 153, "ze": 154, "gen": 155, "oe": 156, "wa": 157, "ee": 158, "it": 159, "den": 160, "oor": 161, "hij": 162, "dat": 163, "cht": 164, "der": 165, "is": 166, "iet": 167, "zijn": 168, "he": 169, "om": 170, "be": 171, "aan": 172, "je": 173, "ou": 174, "ken": 175, "niet": 176, "ik": 177, "ar": 178, "eer": 179, "or": 180, "sch": 181, "was": 182, "le": 183, "die": 184, "met": 185, "ad": 186, "ijk": 187, "zi": 188, "ing": 189, "re": 190, "ur": 191, "uit": 192, "we": 193, "had": 194, "il": 195, "to": 196, "ig": 197, "ven": 198, "voor": 199, "zei": 200, "ol": 201, "no": 202, "acht": 203, "am": 204, "maar": 205, "ten": 206, "als": 207, "naar": 208, "us": 209, "ien": 210, "gr": 211, "hem": 212, "gel": 213, "un": 214, "af": 215, "vr": 216, "over": 217, "id": 218, "haar": 219, "of": 220, "zo": 221, "ste": 222, "and": 223, "Hij": 224, "men": 225, "sp": 226, "dr": 227, "la": 228, "waar": 229, "arr": 230, "Harr": 231, "lijk": 232, "Harry": 233, "zich": 234, "ter": 235, "ond": 236, ".’": 237, "aal": 238, "ui": 239, "wer": 240, "ier": 241, "nog": 242, "door": 243, "Ik": 244, "dan": 245, "ro": 246, "ook": 247, "aat": 248, "heb": 249, "ben": 250, "bl": 251, "ag": 252, "bij": 253, "ak": 254 }, "merges": [ "e n", "e r", "i j", "d e", "e t", "a a", "a n", "e l", "i n", "s t", "c h", "aa r", "o o", "a t", "e en", "g e", "o n", "i e", "t e", "h et", "a l", "v er", "o p", "ij n", "v an", "z e", "g en", "o e", "w a", "e e", "i t", "d en", "oo r", "h 
ij", "d at", "ch t", "d er", "i s", "i et", "z ijn", "h e", "o m", "b e", "aa n", "j e", "o u", "k en", "n iet", "i k", "a r", "e er", "o r", "s ch", "wa s", "l e", "d ie", "m et", "a d", "ij k", "z i", "in g", "r e", "u r", "u it", "w e", "h ad", "i l", "t o", "i g", "v en", "v oor", "ze i", "o l", "n o", "a cht", "a m", "m aar", "t en", "al s", "n aar", "u s", "i en", "g r", "he m", "g el", "u n", "a f", "v r", "o ver", "i d", "h aar", "o f", "z o", "st e", "an d", "H ij", "m en", "s p", "d r", "l a", "w aar", "ar r", "H arr", "l ijk", "Harr y", "zi ch", "t er", "on d", ". ’", "aa l", "u i", "w er", "i er", "no g", "d oor", "I k", "d an", "r o", "oo k", "aa t", "he b", "b en", "b l", "a g", "b ij", "a k" ] } }