removed redundant training data (they already exist within tortoise itself anyway), added utility: view tokenized text

remotes/1714672194237014644/master
mrq 2023-03-14 21:51:27 +07:00
parent 469dd47a44
commit 07b684c4e7
5 changed files with 36 additions and 541 deletions

@@ -24,7 +24,7 @@ datasets:
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
@@ -41,7 +41,7 @@ datasets:
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
steps:
@@ -61,13 +61,13 @@ steps:
    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
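
The hunks above repoint tokenizer_vocab and mel_norm_file from the copies under ./models/tortoise/ to the files shipped inside the tortoise-tts submodule. A minimal sketch to sanity-check that the new paths resolve and load, assuming the repo root as the working directory and that torch plus the Hugging Face tokenizers package are installed (this snippet is not part of the commit):

# Hypothetical sanity check for the repointed paths above.
import os
import torch
from tokenizers import Tokenizer

TOKENIZER_JSON = "./modules/tortoise-tts/tortoise/data/tokenizer.json"
MEL_NORMS_PTH = "./modules/tortoise-tts/tortoise/data/mel_norms.pth"

for path in (TOKENIZER_JSON, MEL_NORMS_PTH):
    assert os.path.exists(path), f"missing: {path}"

tokenizer = Tokenizer.from_file(TOKENIZER_JSON)   # raises if the JSON is not a valid tokenizer definition
print(tokenizer.encode("hello world").ids)        # token ids for a quick smoke test
print(torch.load(MEL_NORMS_PTH).shape)            # mel normalization statistics tensor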

@@ -1,527 +0,0 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens":
[
{
"id": 0,
"special": true,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 1,
"special": true,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 2,
"special": true,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
}
],
"normalizer": null,
"pre_tokenizer":
{
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model":
{
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab":
{
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"-": 8,
".": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"th": 40,
"in": 41,
"the": 42,
"an": 43,
"er": 44,
"ou": 45,
"re": 46,
"on": 47,
"at": 48,
"ed": 49,
"en": 50,
"to": 51,
"ing": 52,
"and": 53,
"is": 54,
"as": 55,
"al": 56,
"or": 57,
"of": 58,
"ar": 59,
"it": 60,
"es": 61,
"he": 62,
"st": 63,
"le": 64,
"om": 65,
"se": 66,
"be": 67,
"ad": 68,
"ow": 69,
"ly": 70,
"ch": 71,
"wh": 72,
"that": 73,
"you": 74,
"li": 75,
"ve": 76,
"ac": 77,
"ti": 78,
"ld": 79,
"me": 80,
"was": 81,
"gh": 82,
"id": 83,
"ll": 84,
"wi": 85,
"ent": 86,
"for": 87,
"ay": 88,
"ro": 89,
"ver": 90,
"ic": 91,
"her": 92,
"ke": 93,
"his": 94,
"no": 95,
"ut": 96,
"un": 97,
"ir": 98,
"lo": 99,
"we": 100,
"ri": 101,
"ha": 102,
"with": 103,
"ght": 104,
"out": 105,
"im": 106,
"ion": 107,
"all": 108,
"ab": 109,
"one": 110,
"ne": 111,
"ge": 112,
"ould": 113,
"ter": 114,
"mo": 115,
"had": 116,
"ce": 117,
"she": 118,
"go": 119,
"sh": 120,
"ur": 121,
"am": 122,
"so": 123,
"pe": 124,
"my": 125,
"de": 126,
"are": 127,
"but": 128,
"ome": 129,
"fr": 130,
"ther": 131,
"fe": 132,
"su": 133,
"do": 134,
"con": 135,
"te": 136,
"ain": 137,
"ere": 138,
"po": 139,
"if": 140,
"they": 141,
"us": 142,
"ag": 143,
"tr": 144,
"now": 145,
"oun": 146,
"this": 147,
"have": 148,
"not": 149,
"sa": 150,
"il": 151,
"up": 152,
"thing": 153,
"from": 154,
"ap": 155,
"him": 156,
"ack": 157,
"ation": 158,
"ant": 159,
"our": 160,
"op": 161,
"like": 162,
"ust": 163,
"ess": 164,
"bo": 165,
"ok": 166,
"ul": 167,
"ind": 168,
"ex": 169,
"com": 170,
"some": 171,
"there": 172,
"ers": 173,
"co": 174,
"res": 175,
"man": 176,
"ard": 177,
"pl": 178,
"wor": 179,
"way": 180,
"tion": 181,
"fo": 182,
"ca": 183,
"were": 184,
"by": 185,
"ate": 186,
"pro": 187,
"ted": 188,
"ound": 189,
"own": 190,
"would": 191,
"ts": 192,
"what": 193,
"qu": 194,
"ally": 195,
"ight": 196,
"ck": 197,
"gr": 198,
"when": 199,
"ven": 200,
"can": 201,
"ough": 202,
"ine": 203,
"end": 204,
"per": 205,
"ous": 206,
"od": 207,
"ide": 208,
"know": 209,
"ty": 210,
"very": 211,
"si": 212,
"ak": 213,
"who": 214,
"about": 215,
"ill": 216,
"them": 217,
"est": 218,
"red": 219,
"ye": 220,
"could": 221,
"ong": 222,
"your": 223,
"their": 224,
"em": 225,
"just": 226,
"other": 227,
"into": 228,
"any": 229,
"whi": 230,
"um": 231,
"tw": 232,
"ast": 233,
"der": 234,
"did": 235,
"ie": 236,
"been": 237,
"ace": 238,
"ink": 239,
"ity": 240,
"back": 241,
"ting": 242,
"br": 243,
"more": 244,
"ake": 245,
"pp": 246,
"then": 247,
"sp": 248,
"el": 249,
"use": 250,
"bl": 251,
"said": 252,
"over": 253,
"get": 254
},
"merges":
[
"t h",
"i n",
"th e",
"a n",
"e r",
"o u",
"r e",
"o n",
"a t",
"e d",
"e n",
"t o",
"in g",
"an d",
"i s",
"a s",
"a l",
"o r",
"o f",
"a r",
"i t",
"e s",
"h e",
"s t",
"l e",
"o m",
"s e",
"b e",
"a d",
"o w",
"l y",
"c h",
"w h",
"th at",
"y ou",
"l i",
"v e",
"a c",
"t i",
"l d",
"m e",
"w as",
"g h",
"i d",
"l l",
"w i",
"en t",
"f or",
"a y",
"r o",
"v er",
"i c",
"h er",
"k e",
"h is",
"n o",
"u t",
"u n",
"i r",
"l o",
"w e",
"r i",
"h a",
"wi th",
"gh t",
"ou t",
"i m",
"i on",
"al l",
"a b",
"on e",
"n e",
"g e",
"ou ld",
"t er",
"m o",
"h ad",
"c e",
"s he",
"g o",
"s h",
"u r",
"a m",
"s o",
"p e",
"m y",
"d e",
"a re",
"b ut",
"om e",
"f r",
"the r",
"f e",
"s u",
"d o",
"c on",
"t e",
"a in",
"er e",
"p o",
"i f",
"the y",
"u s",
"a g",
"t r",
"n ow",
"ou n",
"th is",
"ha ve",
"no t",
"s a",
"i l",
"u p",
"th ing",
"fr om",
"a p",
"h im",
"ac k",
"at ion",
"an t",
"ou r",
"o p",
"li ke",
"u st",
"es s",
"b o",
"o k",
"u l",
"in d",
"e x",
"c om",
"s ome",
"the re",
"er s",
"c o",
"re s",
"m an",
"ar d",
"p l",
"w or",
"w ay",
"ti on",
"f o",
"c a",
"w ere",
"b y",
"at e",
"p ro",
"t ed",
"oun d",
"ow n",
"w ould",
"t s",
"wh at",
"q u",
"al ly",
"i ght",
"c k",
"g r",
"wh en",
"v en",
"c an",
"ou gh",
"in e",
"en d",
"p er",
"ou s",
"o d",
"id e",
"k now",
"t y",
"ver y",
"s i",
"a k",
"wh o",
"ab out",
"i ll",
"the m",
"es t",
"re d",
"y e",
"c ould",
"on g",
"you r",
"the ir",
"e m",
"j ust",
"o ther",
"in to",
"an y",
"wh i",
"u m",
"t w",
"as t",
"d er",
"d id",
"i e",
"be en",
"ac e",
"in k",
"it y",
"b ack",
"t ing",
"b r",
"mo re",
"a ke",
"p p",
"the n",
"s p",
"e l",
"u se",
"b l",
"sa id",
"o ver",
"ge t"
]
}
}
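
The removed file is a Hugging Face tokenizers-format BPE definition: "vocab" maps tokens to ids and "merges" lists the pair-merge rules applied in order, which is why common words like "the" (42) and "that" (73) appear as whole entries. A small illustration of how those merges behave, assuming the equivalent copy kept inside the tortoise-tts submodule and the tokenizers package (both assumptions, not part of this diff):

# Sketch: how the vocab/merges above drive byte-pair encoding.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("./modules/tortoise-tts/tortoise/data/tokenizer.json")

enc = tok.encode("the")
print(enc.tokens, enc.ids)   # expected ['the'] [42]: "t h" -> "th", then "th e" -> "the"

enc = tok.encode("that is")
print(enc.tokens, enc.ids)   # expected ['that', 'is'] [73, 54]; the Whitespace pre-tokenizer drops the space

The [SPACE] special token exists because tortoise's tokenizer wrapper substitutes it for literal spaces before encoding, so spacing survives the Whitespace pre-tokenizer.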

@@ -1994,6 +1994,15 @@ def save_args_settings():
    with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(settings, indent='\t') )

+def tokenize_text( text ):
+    from tortoise.utils.tokenizer import VoiceBpeTokenizer
+    tokenizer = VoiceBpeTokenizer()
+    encoded = tokenizer.encode(text)
+    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
+    return "\n".join([ str(encoded), decoded ])
+
# super kludgy )`;
def import_generate_settings(file = None):
    if not file:
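
A hypothetical round-trip through the new helper (assumes tortoise-tts is importable and its bundled tokenizer is found); the first line of the returned string is the raw id list, the second the decode with special tokens left visible:

# Hypothetical usage of tokenize_text() added above.
result = tokenize_text("hello world")
encoded_line, decoded_line = result.split("\n")
print(encoded_line)   # stringified list of token ids
print(decoded_line)   # decoded text; skip_special_tokens=False keeps tokens like [SPACE] visible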

@@ -409,17 +409,25 @@ def setup_gradio():
                    history_audio = gr.Audio()
                    history_copy_settings_button = gr.Button(value="Copy Settings")
        with gr.Tab("Utilities"):
-            with gr.Row():
-                with gr.Column():
-                    audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
-                    import_voice_name = gr.Textbox(label="Voice Name")
-                    import_voice_button = gr.Button(value="Import Voice")
-                with gr.Column(visible=False) as col:
-                    utilities_metadata_column = col
+            with gr.Tab("Import / Analyze"):
+                with gr.Row():
+                    with gr.Column():
+                        audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
+                        import_voice_name = gr.Textbox(label="Voice Name")
+                        import_voice_button = gr.Button(value="Import Voice")
+                    with gr.Column(visible=False) as col:
+                        utilities_metadata_column = col
+                        metadata_out = gr.JSON(label="Audio Metadata")
+                        copy_button = gr.Button(value="Copy Settings")
+                        latents_out = gr.File(type="binary", label="Voice Latents")
+            with gr.Tab("Tokenizer"):
+                with gr.Row():
+                    text_tokenizier_input = gr.TextArea(label="Text", max_lines=4)
+                    text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4)
-                    metadata_out = gr.JSON(label="Audio Metadata")
-                    copy_button = gr.Button(value="Copy Settings")
-                    latents_out = gr.File(type="binary", label="Voice Latents")
+                with gr.Row():
+                    text_tokenizier_button = gr.Button(value="Tokenize Text")
        with gr.Tab("Training"):
            with gr.Tab("Prepare Dataset"):
                with gr.Row():
@@ -712,6 +720,11 @@ def setup_gradio():
            outputs=generate_settings
        )

+        text_tokenizier_button.click(tokenize_text,
+            inputs=text_tokenizier_input,
+            outputs=text_tokenizier_output
+        )
+
        refresh_configs.click(
            lambda: gr.update(choices=get_training_list()),
            inputs=None,
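
For reference, a self-contained sketch of the same Blocks/Tab/click wiring pattern the hunks above add; this is a hypothetical standalone demo rather than the project's webui, and a stub stands in for the real tokenize_text helper:

# Standalone Gradio sketch mirroring the Tokenizer tab wiring above.
import gradio as gr

def tokenize_text(text):
    # stub: the real helper returns "<ids>\n<decoded text>"
    return text

with gr.Blocks() as demo:
    with gr.Tab("Tokenizer"):
        with gr.Row():
            text_in = gr.TextArea(label="Text", max_lines=4)
            text_out = gr.TextArea(label="Tokenized Text", max_lines=4)
        with gr.Row():
            button = gr.Button(value="Tokenize Text")
        # clicking runs tokenize_text on the input box and writes the result into the output box
        button.click(tokenize_text, inputs=text_in, outputs=text_out)

if __name__ == "__main__":
    demo.launch()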