removed redundant training data (it already exists within tortoise itself anyway); added utility: view tokenized text

This commit is contained in:
mrq 2023-03-14 21:51:27 +00:00
parent 469dd47a44
commit 07b684c4e7
5 changed files with 36 additions and 541 deletions

View File

@@ -24,7 +24,7 @@ datasets:
     num_conditioning_candidates: 2
     conditioning_length: 44000
     use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False
   val:
     name: validation
@@ -41,7 +41,7 @@ datasets:
     num_conditioning_candidates: 2
     conditioning_length: 44000
     use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False

 steps:
@@ -61,13 +61,13 @@ steps:
     injectors:
       paired_to_mel:
         type: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
         in: wav
         out: paired_mel
       paired_cond_to_mel:
         type: for_each
         subtype: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
         in: conditioning
         out: paired_conditioning_mel
       to_codes:
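The retargeted paths point at data files that ship inside the tortoise-tts submodule. A quick pre-flight check along these lines (a hypothetical helper; paths copied from the diff above, adjust if the submodule lives elsewhere) can confirm they are present before training starts:

```python
import os

# Paths as referenced by the updated training config above (assumed layout).
REQUIRED_TORTOISE_ASSETS = [
    "./modules/tortoise-tts/tortoise/data/tokenizer.json",
    "./modules/tortoise-tts/tortoise/data/mel_norms.pth",
]

def missing_tortoise_assets(paths=REQUIRED_TORTOISE_ASSETS):
    """Return any required tortoise data files that cannot be found."""
    return [p for p in paths if not os.path.exists(p)]

if __name__ == "__main__":
    missing = missing_tortoise_assets()
    if missing:
        print("Missing tortoise assets:", ", ".join(missing))
    else:
        print("All tortoise assets found.")
```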

View File

@@ -1,527 +0,0 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens":
[
{
"id": 0,
"special": true,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 1,
"special": true,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 2,
"special": true,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
}
],
"normalizer": null,
"pre_tokenizer":
{
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model":
{
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab":
{
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"-": 8,
".": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"th": 40,
"in": 41,
"the": 42,
"an": 43,
"er": 44,
"ou": 45,
"re": 46,
"on": 47,
"at": 48,
"ed": 49,
"en": 50,
"to": 51,
"ing": 52,
"and": 53,
"is": 54,
"as": 55,
"al": 56,
"or": 57,
"of": 58,
"ar": 59,
"it": 60,
"es": 61,
"he": 62,
"st": 63,
"le": 64,
"om": 65,
"se": 66,
"be": 67,
"ad": 68,
"ow": 69,
"ly": 70,
"ch": 71,
"wh": 72,
"that": 73,
"you": 74,
"li": 75,
"ve": 76,
"ac": 77,
"ti": 78,
"ld": 79,
"me": 80,
"was": 81,
"gh": 82,
"id": 83,
"ll": 84,
"wi": 85,
"ent": 86,
"for": 87,
"ay": 88,
"ro": 89,
"ver": 90,
"ic": 91,
"her": 92,
"ke": 93,
"his": 94,
"no": 95,
"ut": 96,
"un": 97,
"ir": 98,
"lo": 99,
"we": 100,
"ri": 101,
"ha": 102,
"with": 103,
"ght": 104,
"out": 105,
"im": 106,
"ion": 107,
"all": 108,
"ab": 109,
"one": 110,
"ne": 111,
"ge": 112,
"ould": 113,
"ter": 114,
"mo": 115,
"had": 116,
"ce": 117,
"she": 118,
"go": 119,
"sh": 120,
"ur": 121,
"am": 122,
"so": 123,
"pe": 124,
"my": 125,
"de": 126,
"are": 127,
"but": 128,
"ome": 129,
"fr": 130,
"ther": 131,
"fe": 132,
"su": 133,
"do": 134,
"con": 135,
"te": 136,
"ain": 137,
"ere": 138,
"po": 139,
"if": 140,
"they": 141,
"us": 142,
"ag": 143,
"tr": 144,
"now": 145,
"oun": 146,
"this": 147,
"have": 148,
"not": 149,
"sa": 150,
"il": 151,
"up": 152,
"thing": 153,
"from": 154,
"ap": 155,
"him": 156,
"ack": 157,
"ation": 158,
"ant": 159,
"our": 160,
"op": 161,
"like": 162,
"ust": 163,
"ess": 164,
"bo": 165,
"ok": 166,
"ul": 167,
"ind": 168,
"ex": 169,
"com": 170,
"some": 171,
"there": 172,
"ers": 173,
"co": 174,
"res": 175,
"man": 176,
"ard": 177,
"pl": 178,
"wor": 179,
"way": 180,
"tion": 181,
"fo": 182,
"ca": 183,
"were": 184,
"by": 185,
"ate": 186,
"pro": 187,
"ted": 188,
"ound": 189,
"own": 190,
"would": 191,
"ts": 192,
"what": 193,
"qu": 194,
"ally": 195,
"ight": 196,
"ck": 197,
"gr": 198,
"when": 199,
"ven": 200,
"can": 201,
"ough": 202,
"ine": 203,
"end": 204,
"per": 205,
"ous": 206,
"od": 207,
"ide": 208,
"know": 209,
"ty": 210,
"very": 211,
"si": 212,
"ak": 213,
"who": 214,
"about": 215,
"ill": 216,
"them": 217,
"est": 218,
"red": 219,
"ye": 220,
"could": 221,
"ong": 222,
"your": 223,
"their": 224,
"em": 225,
"just": 226,
"other": 227,
"into": 228,
"any": 229,
"whi": 230,
"um": 231,
"tw": 232,
"ast": 233,
"der": 234,
"did": 235,
"ie": 236,
"been": 237,
"ace": 238,
"ink": 239,
"ity": 240,
"back": 241,
"ting": 242,
"br": 243,
"more": 244,
"ake": 245,
"pp": 246,
"then": 247,
"sp": 248,
"el": 249,
"use": 250,
"bl": 251,
"said": 252,
"over": 253,
"get": 254
},
"merges":
[
"t h",
"i n",
"th e",
"a n",
"e r",
"o u",
"r e",
"o n",
"a t",
"e d",
"e n",
"t o",
"in g",
"an d",
"i s",
"a s",
"a l",
"o r",
"o f",
"a r",
"i t",
"e s",
"h e",
"s t",
"l e",
"o m",
"s e",
"b e",
"a d",
"o w",
"l y",
"c h",
"w h",
"th at",
"y ou",
"l i",
"v e",
"a c",
"t i",
"l d",
"m e",
"w as",
"g h",
"i d",
"l l",
"w i",
"en t",
"f or",
"a y",
"r o",
"v er",
"i c",
"h er",
"k e",
"h is",
"n o",
"u t",
"u n",
"i r",
"l o",
"w e",
"r i",
"h a",
"wi th",
"gh t",
"ou t",
"i m",
"i on",
"al l",
"a b",
"on e",
"n e",
"g e",
"ou ld",
"t er",
"m o",
"h ad",
"c e",
"s he",
"g o",
"s h",
"u r",
"a m",
"s o",
"p e",
"m y",
"d e",
"a re",
"b ut",
"om e",
"f r",
"the r",
"f e",
"s u",
"d o",
"c on",
"t e",
"a in",
"er e",
"p o",
"i f",
"the y",
"u s",
"a g",
"t r",
"n ow",
"ou n",
"th is",
"ha ve",
"no t",
"s a",
"i l",
"u p",
"th ing",
"fr om",
"a p",
"h im",
"ac k",
"at ion",
"an t",
"ou r",
"o p",
"li ke",
"u st",
"es s",
"b o",
"o k",
"u l",
"in d",
"e x",
"c om",
"s ome",
"the re",
"er s",
"c o",
"re s",
"m an",
"ar d",
"p l",
"w or",
"w ay",
"ti on",
"f o",
"c a",
"w ere",
"b y",
"at e",
"p ro",
"t ed",
"oun d",
"ow n",
"w ould",
"t s",
"wh at",
"q u",
"al ly",
"i ght",
"c k",
"g r",
"wh en",
"v en",
"c an",
"ou gh",
"in e",
"en d",
"p er",
"ou s",
"o d",
"id e",
"k now",
"t y",
"ver y",
"s i",
"a k",
"wh o",
"ab out",
"i ll",
"the m",
"es t",
"re d",
"y e",
"c ould",
"on g",
"you r",
"the ir",
"e m",
"j ust",
"o ther",
"in to",
"an y",
"wh i",
"u m",
"t w",
"as t",
"d er",
"d id",
"i e",
"be en",
"ac e",
"in k",
"it y",
"b ack",
"t ing",
"b r",
"mo re",
"a ke",
"p p",
"the n",
"s p",
"e l",
"u se",
"b l",
"sa id",
"o ver",
"ge t"
]
}
}
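The removed bpe_lowercase_asr_256.json above is a standard HuggingFace `tokenizers` BPE definition (three special tokens, a Whitespace pre-tokenizer, and a small lowercase vocab plus merges); the tokenizer.json bundled with tortoise uses the same format. A minimal sketch of loading such a file directly, assuming the `tokenizers` package and the submodule path from the config above:

```python
# Sketch only: assumes the HuggingFace `tokenizers` package is installed and that
# the bundled tokenizer lives at the submodule path used in the config above.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("./modules/tortoise-tts/tortoise/data/tokenizer.json")

enc = tok.encode("this is a test")   # the vocab is lowercase-only, so lowercase the input
print(enc.tokens)                    # subword pieces chosen by the BPE merges
print(enc.ids)                       # their integer ids
print(tok.decode(enc.ids, skip_special_tokens=False))
```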

Binary file not shown.

View File

@@ -1994,6 +1994,15 @@ def save_args_settings():
     with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
         f.write(json.dumps(settings, indent='\t') )

+def tokenize_text( text ):
+    from tortoise.utils.tokenizer import VoiceBpeTokenizer
+
+    tokenizer = VoiceBpeTokenizer()
+    encoded = tokenizer.encode(text)
+    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
+
+    return "\n".join([ str(encoded), decoded ])
+
 # super kludgy )`;
 def import_generate_settings(file = None):
     if not file:
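The new tokenize_text() helper round-trips a string through tortoise's VoiceBpeTokenizer and returns two lines: the encoded id list and the ids decoded back with special tokens left visible. A rough usage sketch (the exact ids depend on the tokenizer.json in use):

```python
# Hypothetical interactive check of the new utility.
print(tokenize_text("hello world"))
# line 1: the list of token ids from VoiceBpeTokenizer.encode()
# line 2: the same ids decoded, with special tokens such as [SPACE] still shown
```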

View File

@@ -409,6 +409,7 @@ def setup_gradio():
                 history_audio = gr.Audio()
             history_copy_settings_button = gr.Button(value="Copy Settings")
         with gr.Tab("Utilities"):
+            with gr.Tab("Import / Analyze"):
                 with gr.Row():
                     with gr.Column():
                         audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
@@ -420,6 +421,13 @@ def setup_gradio():
                         metadata_out = gr.JSON(label="Audio Metadata")
                         copy_button = gr.Button(value="Copy Settings")
                         latents_out = gr.File(type="binary", label="Voice Latents")
+            with gr.Tab("Tokenizer"):
+                with gr.Row():
+                    text_tokenizier_input = gr.TextArea(label="Text", max_lines=4)
+                    text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4)
+
+                with gr.Row():
+                    text_tokenizier_button = gr.Button(value="Tokenize Text")
         with gr.Tab("Training"):
             with gr.Tab("Prepare Dataset"):
                 with gr.Row():
@@ -712,6 +720,11 @@ def setup_gradio():
             outputs=generate_settings
         )

+        text_tokenizier_button.click(tokenize_text,
+            inputs=text_tokenizier_input,
+            outputs=text_tokenizier_output
+        )
+
         refresh_configs.click(
             lambda: gr.update(choices=get_training_list()),
             inputs=None,
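The event wiring follows gradio's usual click(fn, inputs, outputs) pattern: the button runs tokenize_text on the input textbox and writes its return value into the output textbox. A stripped-down, standalone sketch of the same idea (component names here are illustrative, not the repo's):

```python
import gradio as gr

def tokenize_text_demo(text):
    # Stand-in for the repo's tokenize_text(); returns ids and decoded tokens as two lines.
    from tortoise.utils.tokenizer import VoiceBpeTokenizer
    tokenizer = VoiceBpeTokenizer()
    encoded = tokenizer.encode(text)
    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
    return "\n".join([str(encoded), decoded])

with gr.Blocks() as demo:
    with gr.Tab("Tokenizer"):
        with gr.Row():
            text_in = gr.TextArea(label="Text", max_lines=4)
            text_out = gr.TextArea(label="Tokenized Text", max_lines=4)
        button = gr.Button(value="Tokenize Text")
        # click(fn, inputs, outputs): run fn on the input component's value,
        # then write the return value into the output component.
        button.click(tokenize_text_demo, inputs=text_in, outputs=text_out)

demo.launch()
```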