removed redundant training data (they already exist within tortoise itself anyway), added utility: view tokenized text

remotes/1714672194237014644/master
mrq 2023-03-14 21:51:27 +07:00
parent 469dd47a44
commit 07b684c4e7
5 changed files with 36 additions and 541 deletions

@@ -24,7 +24,7 @@ datasets:
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
@@ -41,7 +41,7 @@ datasets:
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
steps:
@@ -61,13 +61,13 @@ steps:
    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
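
The hunks above repoint tokenizer_vocab and mel_norm_file from the copies under ./models/tortoise/ to the files shipped inside the tortoise-tts submodule. A minimal sketch to sanity-check that the new paths resolve and load, assuming the repo root as the working directory and that torch plus the Hugging Face tokenizers package are installed (this snippet is not part of the commit):

# Hypothetical sanity check for the repointed paths above.
import os
import torch
from tokenizers import Tokenizer

TOKENIZER_JSON = "./modules/tortoise-tts/tortoise/data/tokenizer.json"
MEL_NORMS_PTH = "./modules/tortoise-tts/tortoise/data/mel_norms.pth"

for path in (TOKENIZER_JSON, MEL_NORMS_PTH):
    assert os.path.exists(path), f"missing: {path}"

tokenizer = Tokenizer.from_file(TOKENIZER_JSON)   # raises if the JSON is not a valid tokenizer definition
print(tokenizer.encode("hello world").ids)        # token ids for a quick smoke test
print(torch.load(MEL_NORMS_PTH).shape)            # mel normalization statistics tensor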

@@ -1,527 +0,0 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens":
[
{
"id": 0,
"special": true,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 1,
"special": true,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 2,
"special": true,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
}
],
"normalizer": null,
"pre_tokenizer":
{
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model":
{
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab":
{
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"-": 8,
".": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"th": 40,
"in": 41,
"the": 42,
"an": 43,
"er": 44,
"ou": 45,
"re": 46,
"on": 47,
"at": 48,
"ed": 49,
"en": 50,
"to": 51,
"ing": 52,
"and": 53,
"is": 54,
"as": 55,
"al": 56,
"or": 57,
"of": 58,
"ar": 59,
"it": 60,
"es": 61,
"he": 62,
"st": 63,
"le": 64,
"om": 65,
"se": 66,
"be": 67,
"ad": 68,
"ow": 69,
"ly": 70,
"ch": 71,
"wh": 72,
"that": 73,
"you": 74,
"li": 75,
"ve": 76,
"ac": 77,
"ti": 78,
"ld": 79,
"me": 80,
"was": 81,
"gh": 82,
"id": 83,
"ll": 84,
"wi": 85,
"ent": 86,
"for": 87,
"ay": 88,
"ro": 89,
"ver": 90,
"ic": 91,
"her": 92,
"ke": 93,
"his": 94,
"no": 95,
"ut": 96,
"un": 97,
"ir": 98,
"lo": 99,
"we": 100,
"ri": 101,
"ha": 102,
"with": 103,
"ght": 104,
"out": 105,
"im": 106,
"ion": 107,
"all": 108,
"ab": 109,
"one": 110,
"ne": 111,
"ge": 112,
"ould": 113,
"ter": 114,
"mo": 115,
"had": 116,
"ce": 117,
"she": 118,
"go": 119,
"sh": 120,
"ur": 121,
"am": 122,
"so": 123,
"pe": 124,
"my": 125,
"de": 126,
"are": 127,
"but": 128,
"ome": 129,
"fr": 130,
"ther": 131,
"fe": 132,
"su": 133,
"do": 134,
"con": 135,
"te": 136,
"ain": 137,
"ere": 138,
"po": 139,
"if": 140,
"they": 141,
"us": 142,
"ag": 143,
"tr": 144,
"now": 145,
"oun": 146,
"this": 147,
"have": 148,
"not": 149,
"sa": 150,
"il": 151,
"up": 152,
"thing": 153,
"from": 154,
"ap": 155,
"him": 156,
"ack": 157,
"ation": 158,
"ant": 159,
"our": 160,
"op": 161,
"like": 162,
"ust": 163,
"ess": 164,
"bo": 165,
"ok": 166,
"ul": 167,
"ind": 168,
"ex": 169,
"com": 170,
"some": 171,
"there": 172,
"ers": 173,
"co": 174,
"res": 175,
"man": 176,
"ard": 177,
"pl": 178,
"wor": 179,
"way": 180,
"tion": 181,
"fo": 182,
"ca": 183,
"were": 184,
"by": 185,
"ate": 186,
"pro": 187,
"ted": 188,
"ound": 189,
"own": 190,
"would": 191,
"ts": 192,
"what": 193,
"qu": 194,
"ally": 195,
"ight": 196,
"ck": 197,
"gr": 198,
"when": 199,
"ven": 200,
"can": 201,
"ough": 202,
"ine": 203,
"end": 204,
"per": 205,
"ous": 206,
"od": 207,
"ide": 208,
"know": 209,
"ty": 210,
"very": 211,
"si": 212,
"ak": 213,
"who": 214,
"about": 215,
"ill": 216,
"them": 217,
"est": 218,
"red": 219,
"ye": 220,
"could": 221,
"ong": 222,
"your": 223,
"their": 224,
"em": 225,
"just": 226,
"other": 227,
"into": 228,
"any": 229,
"whi": 230,
"um": 231,
"tw": 232,
"ast": 233,
"der": 234,
"did": 235,
"ie": 236,
"been": 237,
"ace": 238,
"ink": 239,
"ity": 240,
"back": 241,
"ting": 242,
"br": 243,
"more": 244,
"ake": 245,
"pp": 246,
"then": 247,
"sp": 248,
"el": 249,
"use": 250,
"bl": 251,
"said": 252,
"over": 253,
"get": 254
},
"merges":
[
"t h",
"i n",
"th e",
"a n",
"e r",
"o u",
"r e",
"o n",
"a t",
"e d",
"e n",
"t o",
"in g",
"an d",
"i s",
"a s",
"a l",
"o r",
"o f",
"a r",
"i t",
"e s",
"h e",
"s t",
"l e",
"o m",
"s e",
"b e",
"a d",
"o w",
"l y",
"c h",
"w h",
"th at",
"y ou",
"l i",
"v e",
"a c",
"t i",
"l d",
"m e",
"w as",
"g h",
"i d",
"l l",
"w i",
"en t",
"f or",
"a y",
"r o",
"v er",
"i c",
"h er",
"k e",
"h is",
"n o",
"u t",
"u n",
"i r",
"l o",
"w e",
"r i",
"h a",
"wi th",
"gh t",
"ou t",
"i m",
"i on",
"al l",
"a b",
"on e",
"n e",
"g e",
"ou ld",
"t er",
"m o",
"h ad",
"c e",
"s he",
"g o",
"s h",
"u r",
"a m",
"s o",
"p e",
"m y",
"d e",
"a re",
"b ut",
"om e",
"f r",
"the r",
"f e",
"s u",
"d o",
"c on",
"t e",
"a in",
"er e",
"p o",
"i f",
"the y",
"u s",
"a g",
"t r",
"n ow",
"ou n",
"th is",
"ha ve",
"no t",
"s a",
"i l",
"u p",
"th ing",
"fr om",
"a p",
"h im",
"ac k",
"at ion",
"an t",
"ou r",
"o p",
"li ke",
"u st",
"es s",
"b o",
"o k",
"u l",
"in d",
"e x",
"c om",
"s ome",
"the re",
"er s",
"c o",
"re s",
"m an",
"ar d",
"p l",
"w or",
"w ay",
"ti on",
"f o",
"c a",
"w ere",
"b y",
"at e",
"p ro",
"t ed",
"oun d",
"ow n",
"w ould",
"t s",
"wh at",
"q u",
"al ly",
"i ght",
"c k",
"g r",
"wh en",
"v en",
"c an",
"ou gh",
"in e",
"en d",
"p er",
"ou s",
"o d",
"id e",
"k now",
"t y",
"ver y",
"s i",
"a k",
"wh o",
"ab out",
"i ll",
"the m",
"es t",
"re d",
"y e",
"c ould",
"on g",
"you r",
"the ir",
"e m",
"j ust",
"o ther",
"in to",
"an y",
"wh i",
"u m",
"t w",
"as t",
"d er",
"d id",
"i e",
"be en",
"ac e",
"in k",
"it y",
"b ack",
"t ing",
"b r",
"mo re",
"a ke",
"p p",
"the n",
"s p",
"e l",
"u se",
"b l",
"sa id",
"o ver",
"ge t"
]
}
}
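
The removed file is a Hugging Face tokenizers-format BPE definition: "vocab" maps tokens to ids and "merges" lists the pair-merge rules applied in order, which is why common words like "the" (42) and "that" (73) appear as whole entries. A small illustration of how those merges behave, assuming the equivalent copy kept inside the tortoise-tts submodule and the tokenizers package (both assumptions, not part of this diff):

# Sketch: how the vocab/merges above drive byte-pair encoding.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("./modules/tortoise-tts/tortoise/data/tokenizer.json")

enc = tok.encode("the")
print(enc.tokens, enc.ids)   # expected ['the'] [42]: "t h" -> "th", then "th e" -> "the"

enc = tok.encode("that is")
print(enc.tokens, enc.ids)   # expected ['that', 'is'] [73, 54]; the Whitespace pre-tokenizer drops the space

The [SPACE] special token exists because tortoise's tokenizer wrapper substitutes it for literal spaces before encoding, so spacing survives the Whitespace pre-tokenizer.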

@@ -1994,6 +1994,15 @@ def save_args_settings():
    with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(settings, indent='\t') )

+def tokenize_text( text ):
+    from tortoise.utils.tokenizer import VoiceBpeTokenizer
+    tokenizer = VoiceBpeTokenizer()
+    encoded = tokenizer.encode(text)
+    decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
+    return "\n".join([ str(encoded), decoded ])
+
# super kludgy )`;
def import_generate_settings(file = None):
    if not file:
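
A hypothetical round-trip through the new helper (assumes tortoise-tts is importable and its bundled tokenizer is found); the first line of the returned string is the raw id list, the second the decode with special tokens left visible:

# Hypothetical usage of tokenize_text() added above.
result = tokenize_text("hello world")
encoded_line, decoded_line = result.split("\n")
print(encoded_line)   # stringified list of token ids
print(decoded_line)   # decoded text; skip_special_tokens=False keeps tokens like [SPACE] visible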

@@ -409,17 +409,25 @@ def setup_gradio():
                    history_audio = gr.Audio()
                    history_copy_settings_button = gr.Button(value="Copy Settings")
        with gr.Tab("Utilities"):
-            with gr.Row():
-                with gr.Column():
-                    audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
-                    import_voice_name = gr.Textbox(label="Voice Name")
-                    import_voice_button = gr.Button(value="Import Voice")
-                with gr.Column(visible=False) as col:
-                    utilities_metadata_column = col
+            with gr.Tab("Import / Analyze"):
+                with gr.Row():
+                    with gr.Column():
+                        audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
+                        import_voice_name = gr.Textbox(label="Voice Name")
+                        import_voice_button = gr.Button(value="Import Voice")
+                    with gr.Column(visible=False) as col:
+                        utilities_metadata_column = col
+                        metadata_out = gr.JSON(label="Audio Metadata")
+                        copy_button = gr.Button(value="Copy Settings")
+                        latents_out = gr.File(type="binary", label="Voice Latents")
+            with gr.Tab("Tokenizer"):
+                with gr.Row():
+                    text_tokenizier_input = gr.TextArea(label="Text", max_lines=4)
+                    text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4)
-                    metadata_out = gr.JSON(label="Audio Metadata")
-                    copy_button = gr.Button(value="Copy Settings")
-                    latents_out = gr.File(type="binary", label="Voice Latents")
+                with gr.Row():
+                    text_tokenizier_button = gr.Button(value="Tokenize Text")
        with gr.Tab("Training"):
            with gr.Tab("Prepare Dataset"):
                with gr.Row():
@@ -712,6 +720,11 @@ def setup_gradio():
            outputs=generate_settings
        )

+        text_tokenizier_button.click(tokenize_text,
+            inputs=text_tokenizier_input,
+            outputs=text_tokenizier_output
+        )
+
        refresh_configs.click(
            lambda: gr.update(choices=get_training_list()),
            inputs=None,
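
For reference, a self-contained sketch of the same Blocks/Tab/click wiring pattern the hunks above add; this is a hypothetical standalone demo rather than the project's webui, and a stub stands in for the real tokenize_text helper:

# Standalone Gradio sketch mirroring the Tokenizer tab wiring above.
import gradio as gr

def tokenize_text(text):
    # stub: the real helper returns "<ids>\n<decoded text>"
    return text

with gr.Blocks() as demo:
    with gr.Tab("Tokenizer"):
        with gr.Row():
            text_in = gr.TextArea(label="Text", max_lines=4)
            text_out = gr.TextArea(label="Tokenized Text", max_lines=4)
        with gr.Row():
            button = gr.Button(value="Tokenize Text")
        # clicking runs tokenize_text on the input box and writes the result into the output box
        button.click(tokenize_text, inputs=text_in, outputs=text_out)

if __name__ == "__main__":
    demo.launch()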