removed redundant training data (it already exists within tortoise itself anyway); added utility: view tokenized text

This commit is contained in:
mrq 2023-03-14 21:51:27 +00:00
parent 469dd47a44
commit 07b684c4e7
5 changed files with 36 additions and 541 deletions

View File

@@ -24,7 +24,7 @@ datasets:
     num_conditioning_candidates: 2
     conditioning_length: 44000
     use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False
   val:
     name: validation
@@ -41,7 +41,7 @@ datasets:
     num_conditioning_candidates: 2
     conditioning_length: 44000
     use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False

 steps:
@@ -61,13 +61,13 @@ steps:
     injectors:
       paired_to_mel:
         type: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
         in: wav
         out: paired_mel
       paired_cond_to_mel:
         type: for_each
         subtype: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
         in: conditioning
         out: paired_conditioning_mel
       to_codes:
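The retargeted paths point at data files that ship inside the tortoise-tts submodule. A quick pre-flight check along these lines (a hypothetical helper; paths copied from the diff above, adjust if the submodule lives elsewhere) can confirm they are present before training starts:

```python
import os

# Paths as referenced by the updated training config above (assumed layout).
REQUIRED_TORTOISE_ASSETS = [
    "./modules/tortoise-tts/tortoise/data/tokenizer.json",
    "./modules/tortoise-tts/tortoise/data/mel_norms.pth",
]

def missing_tortoise_assets(paths=REQUIRED_TORTOISE_ASSETS):
    """Return any required tortoise data files that cannot be found."""
    return [p for p in paths if not os.path.exists(p)]

if __name__ == "__main__":
    missing = missing_tortoise_assets()
    if missing:
        print("Missing tortoise assets:", ", ".join(missing))
    else:
        print("All tortoise assets found.")
```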

View File

@@ -1,527 +0,0 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens":
[
{
"id": 0,
"special": true,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 1,
"special": true,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 2,
"special": true,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
}
],
"normalizer": null,
"pre_tokenizer":
{
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model":
{
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab":
{
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"-": 8,
".": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"th": 40,
"in": 41,
"the": 42,
"an": 43,
"er": 44,
"ou": 45,
"re": 46,
"on": 47,
"at": 48,
"ed": 49,
"en": 50,
"to": 51,
"ing": 52,
"and": 53,
"is": 54,
"as": 55,
"al": 56,
"or": 57,
"of": 58,
"ar": 59,
"it": 60,
"es": 61,
"he": 62,
"st": 63,
"le": 64,
"om": 65,
"se": 66,
"be": 67,
"ad": 68,
"ow": 69,
"ly": 70,
"ch": 71,
"wh": 72,
"that": 73,
"you": 74,
"li": 75,
"ve": 76,
"ac": 77,
"ti": 78,
"ld": 79,
"me": 80,
"was": 81,
"gh": 82,
"id": 83,
"ll": 84,
"wi": 85,
"ent": 86,
"for": 87,
"ay": 88,
"ro": 89,
"ver": 90,
"ic": 91,
"her": 92,
"ke": 93,
"his": 94,
"no": 95,
"ut": 96,
"un": 97,
"ir": 98,
"lo": 99,
"we": 100,
"ri": 101,
"ha": 102,
"with": 103,
"ght": 104,
"out": 105,
"im": 106,
"ion": 107,
"all": 108,
"ab": 109,
"one": 110,
"ne": 111,
"ge": 112,
"ould": 113,
"ter": 114,
"mo": 115,
"had": 116,
"ce": 117,
"she": 118,
"go": 119,
"sh": 120,
"ur": 121,
"am": 122,
"so": 123,
"pe": 124,
"my": 125,
"de": 126,
"are": 127,
"but": 128,
"ome": 129,
"fr": 130,
"ther": 131,
"fe": 132,
"su": 133,
"do": 134,
"con": 135,
"te": 136,
"ain": 137,
"ere": 138,
"po": 139,
"if": 140,
"they": 141,
"us": 142,
"ag": 143,
"tr": 144,
"now": 145,
"oun": 146,
"this": 147,
"have": 148,
"not": 149,
"sa": 150,
"il": 151,
"up": 152,
"thing": 153,
"from": 154,
"ap": 155,
"him": 156,
"ack": 157,
"ation": 158,
"ant": 159,
"our": 160,
"op": 161,
"like": 162,
"ust": 163,
"ess": 164,
"bo": 165,
"ok": 166,
"ul": 167,
"ind": 168,
"ex": 169,
"com": 170,
"some": 171,
"there": 172,
"ers": 173,
"co": 174,
"res": 175,
"man": 176,
"ard": 177,
"pl": 178,
"wor": 179,
"way": 180,
"tion": 181,
"fo": 182,
"ca": 183,
"were": 184,
"by": 185,
"ate": 186,
"pro": 187,
"ted": 188,
"ound": 189,
"own": 190,
"would": 191,
"ts": 192,
"what": 193,
"qu": 194,
"ally": 195,
"ight": 196,
"ck": 197,
"gr": 198,
"when": 199,
"ven": 200,
"can": 201,
"ough": 202,
"ine": 203,
"end": 204,
"per": 205,
"ous": 206,
"od": 207,
"ide": 208,
"know": 209,
"ty": 210,
"very": 211,
"si": 212,
"ak": 213,
"who": 214,
"about": 215,
"ill": 216,
"them": 217,
"est": 218,
"red": 219,
"ye": 220,
"could": 221,
"ong": 222,
"your": 223,
"their": 224,
"em": 225,
"just": 226,
"other": 227,
"into": 228,
"any": 229,
"whi": 230,
"um": 231,
"tw": 232,
"ast": 233,
"der": 234,
"did": 235,
"ie": 236,
"been": 237,
"ace": 238,
"ink": 239,
"ity": 240,
"back": 241,
"ting": 242,
"br": 243,
"more": 244,
"ake": 245,
"pp": 246,
"then": 247,
"sp": 248,
"el": 249,
"use": 250,
"bl": 251,
"said": 252,
"over": 253,
"get": 254
},
"merges":
[
"t h",
"i n",
"th e",
"a n",
"e r",
"o u",
"r e",
"o n",
"a t",
"e d",
"e n",
"t o",
"in g",
"an d",
"i s",
"a s",
"a l",
"o r",
"o f",
"a r",
"i t",
"e s",
"h e",
"s t",
"l e",
"o m",
"s e",
"b e",
"a d",
"o w",
"l y",
"c h",
"w h",
"th at",
"y ou",
"l i",
"v e",
"a c",
"t i",
"l d",
"m e",
"w as",
"g h",
"i d",
"l l",
"w i",
"en t",
"f or",
"a y",
"r o",
"v er",
"i c",
"h er",
"k e",
"h is",
"n o",
"u t",
"u n",
"i r",
"l o",
"w e",
"r i",
"h a",
"wi th",
"gh t",
"ou t",
"i m",
"i on",
"al l",
"a b",
"on e",
"n e",
"g e",
"ou ld",
"t er",
"m o",
"h ad",
"c e",
"s he",
"g o",
"s h",
"u r",
"a m",
"s o",
"p e",
"m y",
"d e",
"a re",
"b ut",
"om e",
"f r",
"the r",
"f e",
"s u",
"d o",
"c on",
"t e",
"a in",
"er e",
"p o",
"i f",
"the y",
"u s",
"a g",
"t r",
"n ow",
"ou n",
"th is",
"ha ve",
"no t",
"s a",
"i l",
"u p",
"th ing",
"fr om",
"a p",
"h im",
"ac k",
"at ion",
"an t",
"ou r",
"o p",
"li ke",
"u st",
"es s",
"b o",
"o k",
"u l",
"in d",
"e x",
"c om",
"s ome",
"the re",
"er s",
"c o",
"re s",
"m an",
"ar d",
"p l",
"w or",
"w ay",
"ti on",
"f o",
"c a",
"w ere",
"b y",
"at e",
"p ro",
"t ed",
"oun d",
"ow n",
"w ould",
"t s",
"wh at",
"q u",
"al ly",
"i ght",
"c k",
"g r",
"wh en",
"v en",
"c an",
"ou gh",
"in e",
"en d",
"p er",
"ou s",
"o d",
"id e",
"k now",
"t y",
"ver y",
"s i",
"a k",
"wh o",
"ab out",
"i ll",
"the m",
"es t",
"re d",
"y e",
"c ould",
"on g",
"you r",
"the ir",
"e m",
"j ust",
"o ther",
"in to",
"an y",
"wh i",
"u m",
"t w",
"as t",
"d er",
"d id",
"i e",
"be en",
"ac e",
"in k",
"it y",
"b ack",
"t ing",
"b r",
"mo re",
"a ke",
"p p",
"the n",
"s p",
"e l",
"u se",
"b l",
"sa id",
"o ver",
"ge t"
]
}
}
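The removed bpe_lowercase_asr_256.json above is a standard HuggingFace `tokenizers` BPE definition (three special tokens, a Whitespace pre-tokenizer, and a small lowercase vocab plus merges); the tokenizer.json bundled with tortoise uses the same format. A minimal sketch of loading such a file directly, assuming the `tokenizers` package and the submodule path from the config above:

```python
# Sketch only: assumes the HuggingFace `tokenizers` package is installed and that
# the bundled tokenizer lives at the submodule path used in the config above.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("./modules/tortoise-tts/tortoise/data/tokenizer.json")

enc = tok.encode("this is a test")   # the vocab is lowercase-only, so lowercase the input
print(enc.tokens)                    # subword pieces chosen by the BPE merges
print(enc.ids)                       # their integer ids
print(tok.decode(enc.ids, skip_special_tokens=False))
```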

Binary file not shown.

View File

@@ -1994,6 +1994,15 @@ def save_args_settings():
     with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
         f.write(json.dumps(settings, indent='\t') )

+def tokenize_text( text ):
+    from tortoise.utils.tokenizer import VoiceBpeTokenizer
+
+    tokenizer = VoiceBpeTokenizer()
+    encoded = tokenizer.encode(text)
+    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
+
+    return "\n".join([ str(encoded), decoded ])
+
 # super kludgy )`;
 def import_generate_settings(file = None):
     if not file:
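The new tokenize_text() helper round-trips a string through tortoise's VoiceBpeTokenizer and returns two lines: the encoded id list and the ids decoded back with special tokens left visible. A rough usage sketch (the exact ids depend on the tokenizer.json in use):

```python
# Hypothetical interactive check of the new utility.
print(tokenize_text("hello world"))
# line 1: the list of token ids from VoiceBpeTokenizer.encode()
# line 2: the same ids decoded, with special tokens such as [SPACE] still shown
```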

View File

@@ -409,6 +409,7 @@ def setup_gradio():
                 history_audio = gr.Audio()
             history_copy_settings_button = gr.Button(value="Copy Settings")
         with gr.Tab("Utilities"):
+            with gr.Tab("Import / Analyze"):
                 with gr.Row():
                     with gr.Column():
                         audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
@@ -420,6 +421,13 @@ def setup_gradio():
                         metadata_out = gr.JSON(label="Audio Metadata")
                         copy_button = gr.Button(value="Copy Settings")
                         latents_out = gr.File(type="binary", label="Voice Latents")
+            with gr.Tab("Tokenizer"):
+                with gr.Row():
+                    text_tokenizier_input = gr.TextArea(label="Text", max_lines=4)
+                    text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4)
+
+                with gr.Row():
+                    text_tokenizier_button = gr.Button(value="Tokenize Text")
         with gr.Tab("Training"):
             with gr.Tab("Prepare Dataset"):
                 with gr.Row():
@@ -712,6 +720,11 @@ def setup_gradio():
             outputs=generate_settings
         )

+        text_tokenizier_button.click(tokenize_text,
+            inputs=text_tokenizier_input,
+            outputs=text_tokenizier_output
+        )
+
         refresh_configs.click(
             lambda: gr.update(choices=get_training_list()),
             inputs=None,
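The event wiring follows gradio's usual click(fn, inputs, outputs) pattern: the button runs tokenize_text on the input textbox and writes its return value into the output textbox. A stripped-down, standalone sketch of the same idea (component names here are illustrative, not the repo's):

```python
import gradio as gr

def tokenize_text_demo(text):
    # Stand-in for the repo's tokenize_text(); returns ids and decoded tokens as two lines.
    from tortoise.utils.tokenizer import VoiceBpeTokenizer
    tokenizer = VoiceBpeTokenizer()
    encoded = tokenizer.encode(text)
    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
    return "\n".join([str(encoded), decoded])

with gr.Blocks() as demo:
    with gr.Tab("Tokenizer"):
        with gr.Row():
            text_in = gr.TextArea(label="Text", max_lines=4)
            text_out = gr.TextArea(label="Tokenized Text", max_lines=4)
        button = gr.Button(value="Tokenize Text")
        # click(fn, inputs, outputs): run fn on the input component's value,
        # then write the return value into the output component.
        button.click(tokenize_text_demo, inputs=text_in, outputs=text_out)

demo.launch()
```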