diff --git a/models/.template.dlas.yaml b/models/.template.dlas.yaml
index 1127f4f..f0779c5 100755
--- a/models/.template.dlas.yaml
+++ b/models/.template.dlas.yaml
@@ -24,7 +24,7 @@ datasets:
     num_conditioning_candidates: 2
     conditioning_length: 44000
     use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False
   val:
     name: validation
@@ -41,7 +41,7 @@ datasets:
     num_conditioning_candidates: 2
     conditioning_length: 44000
     use_bpe_tokenizer: True
-    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
+    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
     load_aligned_codes: False
 
 steps:
@@ -61,13 +61,13 @@ steps:
     injectors:
       paired_to_mel:
         type: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
         in: wav
         out: paired_mel
       paired_cond_to_mel:
         type: for_each
         subtype: torch_mel_spectrogram
-        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
+        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
         in: conditioning
         out: paired_conditioning_mel
       to_codes:
diff --git a/models/tortoise/bpe_lowercase_asr_256.json b/models/tortoise/bpe_lowercase_asr_256.json
deleted file mode 100755
index 1f32162..0000000
--- a/models/tortoise/bpe_lowercase_asr_256.json
+++ /dev/null
@@ -1,527 +0,0 @@
-{
-    "version": "1.0",
-    "truncation": null,
-    "padding": null,
-    "added_tokens":
-    [
-        {
-            "id": 0,
-            "special": true,
-            "content": "[STOP]",
-            "single_word": false,
-            "lstrip": false,
-            "rstrip": false,
-            "normalized": false
-        },
-        {
-            "id": 1,
-            "special": true,
-            "content": "[UNK]",
-            "single_word": false,
-            "lstrip": false,
-            "rstrip": false,
-            "normalized": false
-        },
-        {
-            "id": 2,
-            "special": true,
-            "content": "[SPACE]",
-            "single_word": false,
-            "lstrip": false,
-            "rstrip": false,
-            "normalized": false
-        }
-    ],
-    "normalizer": null,
-    "pre_tokenizer":
-    {
-        "type": "Whitespace"
-    },
-    "post_processor": null,
-    "decoder": null,
-    "model":
-    {
-        "type": "BPE",
-        "dropout": null,
-        "unk_token": "[UNK]",
-        "continuing_subword_prefix": null,
-        "end_of_word_suffix": null,
-        "fuse_unk": false,
-        "vocab":
-        {
-            "[STOP]": 0,
-            "[UNK]": 1,
-            "[SPACE]": 2,
-            "!": 3,
-            "'": 4,
-            "(": 5,
-            ")": 6,
-            ",": 7,
-            "-": 8,
-            ".": 9,
-            "/": 10,
-            ":": 11,
-            ";": 12,
-            "?": 13,
-            "a": 14,
-            "b": 15,
-            "c": 16,
-            "d": 17,
-            "e": 18,
-            "f": 19,
-            "g": 20,
-            "h": 21,
-            "i": 22,
-            "j": 23,
-            "k": 24,
-            "l": 25,
-            "m": 26,
-            "n": 27,
-            "o": 28,
-            "p": 29,
-            "q": 30,
-            "r": 31,
-            "s": 32,
-            "t": 33,
-            "u": 34,
-            "v": 35,
-            "w": 36,
-            "x": 37,
-            "y": 38,
-            "z": 39,
-            "th": 40,
-            "in": 41,
-            "the": 42,
-            "an": 43,
-            "er": 44,
-            "ou": 45,
-            "re": 46,
-            "on": 47,
-            "at": 48,
-            "ed": 49,
-            "en": 50,
-            "to": 51,
-            "ing": 52,
-            "and": 53,
-            "is": 54,
-            "as": 55,
-            "al": 56,
-            "or": 57,
-            "of": 58,
-            "ar": 59,
-            "it": 60,
-            "es": 61,
-            "he": 62,
-            "st": 63,
-            "le": 64,
-            "om": 65,
-            "se": 66,
-            "be": 67,
-            "ad": 68,
-            "ow": 69,
-            "ly": 70,
-            "ch": 71,
-            "wh": 72,
-            "that": 73,
-            "you": 74,
-            "li": 75,
-            "ve": 76,
-            "ac": 77,
-            "ti": 78,
-            "ld": 79,
-            "me": 80,
-            "was": 81,
-            "gh": 82,
-            "id": 83,
-            "ll": 84,
-            "wi": 85,
-            "ent": 86,
-            "for": 87,
-            "ay": 88,
-            "ro": 89,
-            "ver": 90,
-            "ic": 91,
-            "her": 92,
-            "ke": 93,
-            "his": 94,
- "no": 95, - "ut": 96, - "un": 97, - "ir": 98, - "lo": 99, - "we": 100, - "ri": 101, - "ha": 102, - "with": 103, - "ght": 104, - "out": 105, - "im": 106, - "ion": 107, - "all": 108, - "ab": 109, - "one": 110, - "ne": 111, - "ge": 112, - "ould": 113, - "ter": 114, - "mo": 115, - "had": 116, - "ce": 117, - "she": 118, - "go": 119, - "sh": 120, - "ur": 121, - "am": 122, - "so": 123, - "pe": 124, - "my": 125, - "de": 126, - "are": 127, - "but": 128, - "ome": 129, - "fr": 130, - "ther": 131, - "fe": 132, - "su": 133, - "do": 134, - "con": 135, - "te": 136, - "ain": 137, - "ere": 138, - "po": 139, - "if": 140, - "they": 141, - "us": 142, - "ag": 143, - "tr": 144, - "now": 145, - "oun": 146, - "this": 147, - "have": 148, - "not": 149, - "sa": 150, - "il": 151, - "up": 152, - "thing": 153, - "from": 154, - "ap": 155, - "him": 156, - "ack": 157, - "ation": 158, - "ant": 159, - "our": 160, - "op": 161, - "like": 162, - "ust": 163, - "ess": 164, - "bo": 165, - "ok": 166, - "ul": 167, - "ind": 168, - "ex": 169, - "com": 170, - "some": 171, - "there": 172, - "ers": 173, - "co": 174, - "res": 175, - "man": 176, - "ard": 177, - "pl": 178, - "wor": 179, - "way": 180, - "tion": 181, - "fo": 182, - "ca": 183, - "were": 184, - "by": 185, - "ate": 186, - "pro": 187, - "ted": 188, - "ound": 189, - "own": 190, - "would": 191, - "ts": 192, - "what": 193, - "qu": 194, - "ally": 195, - "ight": 196, - "ck": 197, - "gr": 198, - "when": 199, - "ven": 200, - "can": 201, - "ough": 202, - "ine": 203, - "end": 204, - "per": 205, - "ous": 206, - "od": 207, - "ide": 208, - "know": 209, - "ty": 210, - "very": 211, - "si": 212, - "ak": 213, - "who": 214, - "about": 215, - "ill": 216, - "them": 217, - "est": 218, - "red": 219, - "ye": 220, - "could": 221, - "ong": 222, - "your": 223, - "their": 224, - "em": 225, - "just": 226, - "other": 227, - "into": 228, - "any": 229, - "whi": 230, - "um": 231, - "tw": 232, - "ast": 233, - "der": 234, - "did": 235, - "ie": 236, - "been": 237, - "ace": 238, - "ink": 239, - "ity": 240, - "back": 241, - "ting": 242, - "br": 243, - "more": 244, - "ake": 245, - "pp": 246, - "then": 247, - "sp": 248, - "el": 249, - "use": 250, - "bl": 251, - "said": 252, - "over": 253, - "get": 254 - }, - "merges": - [ - "t h", - "i n", - "th e", - "a n", - "e r", - "o u", - "r e", - "o n", - "a t", - "e d", - "e n", - "t o", - "in g", - "an d", - "i s", - "a s", - "a l", - "o r", - "o f", - "a r", - "i t", - "e s", - "h e", - "s t", - "l e", - "o m", - "s e", - "b e", - "a d", - "o w", - "l y", - "c h", - "w h", - "th at", - "y ou", - "l i", - "v e", - "a c", - "t i", - "l d", - "m e", - "w as", - "g h", - "i d", - "l l", - "w i", - "en t", - "f or", - "a y", - "r o", - "v er", - "i c", - "h er", - "k e", - "h is", - "n o", - "u t", - "u n", - "i r", - "l o", - "w e", - "r i", - "h a", - "wi th", - "gh t", - "ou t", - "i m", - "i on", - "al l", - "a b", - "on e", - "n e", - "g e", - "ou ld", - "t er", - "m o", - "h ad", - "c e", - "s he", - "g o", - "s h", - "u r", - "a m", - "s o", - "p e", - "m y", - "d e", - "a re", - "b ut", - "om e", - "f r", - "the r", - "f e", - "s u", - "d o", - "c on", - "t e", - "a in", - "er e", - "p o", - "i f", - "the y", - "u s", - "a g", - "t r", - "n ow", - "ou n", - "th is", - "ha ve", - "no t", - "s a", - "i l", - "u p", - "th ing", - "fr om", - "a p", - "h im", - "ac k", - "at ion", - "an t", - "ou r", - "o p", - "li ke", - "u st", - "es s", - "b o", - "o k", - "u l", - "in d", - "e x", - "c om", - "s ome", - "the re", - "er s", - "c o", - "re s", - "m an", - "ar d", - "p 
l", - "w or", - "w ay", - "ti on", - "f o", - "c a", - "w ere", - "b y", - "at e", - "p ro", - "t ed", - "oun d", - "ow n", - "w ould", - "t s", - "wh at", - "q u", - "al ly", - "i ght", - "c k", - "g r", - "wh en", - "v en", - "c an", - "ou gh", - "in e", - "en d", - "p er", - "ou s", - "o d", - "id e", - "k now", - "t y", - "ver y", - "s i", - "a k", - "wh o", - "ab out", - "i ll", - "the m", - "es t", - "re d", - "y e", - "c ould", - "on g", - "you r", - "the ir", - "e m", - "j ust", - "o ther", - "in to", - "an y", - "wh i", - "u m", - "t w", - "as t", - "d er", - "d id", - "i e", - "be en", - "ac e", - "in k", - "it y", - "b ack", - "t ing", - "b r", - "mo re", - "a ke", - "p p", - "the n", - "s p", - "e l", - "u se", - "b l", - "sa id", - "o ver", - "ge t" - ] - } -} \ No newline at end of file diff --git a/models/tortoise/clips_mel_norms.pth b/models/tortoise/clips_mel_norms.pth deleted file mode 100755 index d8c7321..0000000 Binary files a/models/tortoise/clips_mel_norms.pth and /dev/null differ diff --git a/src/utils.py b/src/utils.py index 2fe2e7e..d7544fe 100755 --- a/src/utils.py +++ b/src/utils.py @@ -1994,6 +1994,15 @@ def save_args_settings(): with open(f'./config/exec.json', 'w', encoding="utf-8") as f: f.write(json.dumps(settings, indent='\t') ) +def tokenize_text( text ): + from tortoise.utils.tokenizer import VoiceBpeTokenizer + + tokenizer = VoiceBpeTokenizer() + encoded = tokenizer.encode(text) + decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False) + + return "\n".join([ str(encoded), decoded ]) + # super kludgy )`; def import_generate_settings(file = None): if not file: diff --git a/src/webui.py b/src/webui.py index d97138f..2d948b3 100755 --- a/src/webui.py +++ b/src/webui.py @@ -409,17 +409,25 @@ def setup_gradio(): history_audio = gr.Audio() history_copy_settings_button = gr.Button(value="Copy Settings") with gr.Tab("Utilities"): - with gr.Row(): - with gr.Column(): - audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"]) - import_voice_name = gr.Textbox(label="Voice Name") - import_voice_button = gr.Button(value="Import Voice") - with gr.Column(visible=False) as col: - utilities_metadata_column = col + with gr.Tab("Import / Analyze"): + with gr.Row(): + with gr.Column(): + audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"]) + import_voice_name = gr.Textbox(label="Voice Name") + import_voice_button = gr.Button(value="Import Voice") + with gr.Column(visible=False) as col: + utilities_metadata_column = col - metadata_out = gr.JSON(label="Audio Metadata") - copy_button = gr.Button(value="Copy Settings") - latents_out = gr.File(type="binary", label="Voice Latents") + metadata_out = gr.JSON(label="Audio Metadata") + copy_button = gr.Button(value="Copy Settings") + latents_out = gr.File(type="binary", label="Voice Latents") + with gr.Tab("Tokenizer"): + with gr.Row(): + text_tokenizier_input = gr.TextArea(label="Text", max_lines=4) + text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4) + + with gr.Row(): + text_tokenizier_button = gr.Button(value="Tokenize Text") with gr.Tab("Training"): with gr.Tab("Prepare Dataset"): with gr.Row(): @@ -712,6 +720,11 @@ def setup_gradio(): outputs=generate_settings ) + text_tokenizier_button.click(tokenize_text, + inputs=text_tokenizier_input, + outputs=text_tokenizier_output + ) + refresh_configs.click( lambda: gr.update(choices=get_training_list()), inputs=None,