From 6a11bc9cb6be603f7407a18d7e0a5f28ef150ab7 Mon Sep 17 00:00:00 2001 From: mrq Date: Mon, 29 Apr 2024 09:09:26 -0500 Subject: [PATCH] update tokenizer because, for some reason, it had the wrong order for the special tokens to where eos = unk --- data/tokenizer.json | 551 ++++++++++++++++++++++++++++++++++++++++++++ vall_e/data.py | 10 +- 2 files changed, 558 insertions(+), 3 deletions(-) create mode 100644 data/tokenizer.json diff --git a/data/tokenizer.json b/data/tokenizer.json new file mode 100644 index 0000000..f034fc9 --- /dev/null +++ b/data/tokenizer.json @@ -0,0 +1,551 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], + "pair": [ + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 1 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "!": 4, + "\"": 5, + "(": 6, + ")": 7, + ",": 8, + "-": 9, + ".": 10, + "1": 11, + ":": 12, + "?": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "h": 20, + "i": 21, + "j": 22, + "k": 23, + "l": 24, + "m": 25, + "n": 26, + "o": 27, + "p": 28, + "q": 29, + "r": 30, + "s": 31, + "t": 32, + "u": 33, + "v": 34, + "w": 35, + "x": 36, + "z": 37, + "¡": 38, + "«": 39, + "»": 40, + "¿": 41, + "æ": 42, + "ç": 43, + "ð": 44, + "ŋ": 45, + "ɐ": 46, + "ɑ": 47, + "ɔ": 48, + "ɕ": 49, + "ə": 50, + "ɚ": 51, + "ɛ": 52, + "ɜ": 53, + "ɟ": 54, + "ɡ": 55, + "ɪ": 56, + "ɬ": 57, + "ɯ": 58, + "ɹ": 59, + "ɾ": 60, + "ʃ": 61, + "ʈ": 62, + "ʊ": 63, + "ʋ": 64, + "ʌ": 65, + "ʑ": 66, + "ʒ": 67, + "ʔ": 68, + "ʲ": 69, + "ˈ": 70, + "ˌ": 71, + "ː": 72, + "̃": 73, + "̩": 74, + "θ": 75, + "ᵻ": 76, + "—": 77, + "…": 78, + "ˈɛ": 79, + "iː": 80, + "aɪ": 81, + "nd": 82, + "ˈɪ": 83, + "eɪ": 84, + "ˈæ": 85, + "oʊ": 86, + "ðə": 87, + "ɑː": 88, + "ˈeɪ": 89, + "ən": 90, + "uː": 91, + "ˈʌ": 92, + "ˈaɪ": 93, + "st": 94, + "ˈɔ": 95, + "ˈoʊ": 96, + "ˈiː": 97, + "ˈɑː": 98, + "ænd": 99, + "ːɹ": 100, + "ɪŋ": 101, + "ɜː": 102, + "ɪn": 103, + "tə": 104, + "ʌv": 105, + "aʊ": 106, + "əl": 107, + "ˈuː": 108, + "tʃ": 109, + "ɪz": 110, + "ˈɜː": 111, + "ˌʌ": 112, + "æt": 113, + "dʒ": 114, + "ˈɔː": 115, + "ɪt": 116, + "ˈaʊ": 117, + "ɚɹ": 118, + "ˈɛn": 119, + "wʌ": 120, + "li": 121, + "hiː": 122, + "ˌɛ": 123, + "wɪ": 124, + "ðæt": 125, + "wʌz": 126, + "juː": 127, + "oːɹ": 128, + "ðɪ": 129, + "sˈɛ": 130, + "ˈɑːɹ": 131, + "ˌɪ": 132, + "nt": 133, + "ˈʊ": 134, + "ənt": 135, + "hɪz": 136, + "hæ": 137, + "ˌɑː": 138, + "ɔːɹ": 139, + "ˈɛɹ": 140, + "wɪð": 141, + "ᵻd": 142, + "ˈoːɹ": 143, + "pɹ": 144, + "ˈɔːl": 145, + "mˌ": 146, + "ʃən": 147, + "kt": 148, + "ˌoʊ": 149, + "ˈɔːɹ": 150, + "fɹ": 151, + "æz": 152, + "ʃiː": 153, + "ˌʌt": 154, + "ˈɛl": 155, + "ˌaʊ": 156, + "ˈʌn": 157, + "əs": 158, + "hɜː": 159, + "lˈaɪ": 160, + "ˈæn": 161, + "ˈɪɹ": 162, + "ʊd": 163, + "ɹᵻ": 164, + "ld": 165, + "bˌʌt": 166, + "ks": 167, + "nˈoʊ": 168, + "ɾɚ": 169, + "hæd": 170, + "ɛɹ": 171, + "ˈɪŋ": 172, + "ɡɹ": 173, + "ɔn": 174, + "nˌɑː": 175, + "maɪ": 176, + "vɚ": 177, + "fɔːɹ": 178, + "ðɚ": 179, + "tʊ": 180, + "ðɛɹ": 181, + "ˈʌm": 182, + "nˌɑːt": 183, + "tɹ": 184, + "sˈiː": 185, + "ʌvðə": 186, + "mˈɪ": 187, + "ˈæp": 188, + "ˌɪm": 189, + "ɪk": 190, + "sp": 191, + "lˈeɪ": 192, + "hˌɪm": 193, + "ɐn": 194, + "ðeɪ": 195, + "lˈɪ": 196, + "ɾi": 197, + "bɹ": 198, + "lˈɛ": 199, + "kɹ": 200, + "ˈɪl": 201, + "jˈuː": 202, + "ʌm": 203, + "mˌiː": 204, + "lˈæ": 205, + "ˌɪn": 206, + "bᵻ": 207, + "wˈʌn": 208, + "ˈɪn": 209, + "ˈoʊn": 210, + "biː": 211, + "sˈɛd": 212, + "ˈɛd": 213, + "ˈaɪt": 214, + "fɹʌm": 215, + "baɪ": 216, + "ɪs": 217, + "ɚz": 218, + "ðɪs": 219, + "əns": 220, + "ɪf": 221, + "bəl": 222, + "ˈænd": 223, + "ɪnðə": 224, + "əm": 225, + "iːz": 226, + "ˌuː": 227, + "ᵻz": 228, + "wˈeɪ": 229, + "ft": 230, + "wiː": 231, + "lˈiː": 232, + "stɹ": 233, + "jʊ": 234, + "ɚd": 235, + "ˌaɪ": 236, + "kw": 237, + "ˌɔn": 238, + "ˈaɪd": 239, + "ts": 240, + "ɪm": 241, + "ˈʌst": 242, + "ˈoʊld": 243, + "ˌɪtʃ": 244, + "dˈɪ": 245, + "sˌoʊ": 246, + "ɑːɹ": 247, + "hɐ": 248, + "sˈeɪ": 249, + "ɾᵻd": 250, + "dᵻ": 251, + "wˌɪtʃ": 252, + "sˈɛl": 253, + "ɹi": 254, + "ˈʌðɚ": 255 + }, + "merges": [ + "ˈ ɛ", + "i ː", + "a ɪ", + "n d", + "ˈ ɪ", + "e ɪ", + "ˈ æ", + "o ʊ", + "ð ə", + "ɑ ː", + "ˈ eɪ", + "ə n", + "u ː", + "ˈ ʌ", + "ˈ aɪ", + "s t", + "ˈ ɔ", + "ˈ oʊ", + "ˈ iː", + "ˈ ɑː", + "æ nd", + "ː ɹ", + "ɪ ŋ", + "ɜ ː", + "ɪ n", + "t ə", + "ʌ v", + "a ʊ", + "ə l", + "ˈ uː", + "t ʃ", + "ɪ z", + "ˈ ɜː", + "ˌ ʌ", + "æ t", + "d ʒ", + "ˈɔ ː", + "ɪ t", + "ˈ aʊ", + "ɚ ɹ", + "ˈɛ n", + "w ʌ", + "l i", + "h iː", + "ˌ ɛ", + "w ɪ", + "ð æt", + "wʌ z", + "j uː", + "o ːɹ", + "ð ɪ", + "s ˈɛ", + "ˈɑː ɹ", + "ˌ ɪ", + "n t", + "ˈ ʊ", + "ən t", + "h ɪz", + "h æ", + "ˌ ɑː", + "ɔ ːɹ", + "ˈɛ ɹ", + "wɪ ð", + "ᵻ d", + "ˈ oːɹ", + "p ɹ", + "ˈɔː l", + "m ˌ", + "ʃ ən", + "k t", + "ˌ oʊ", + "ˈɔ ːɹ", + "f ɹ", + "æ z", + "ʃ iː", + "ˌʌ t", + "ˈɛ l", + "ˌ aʊ", + "ˈʌ n", + "ə s", + "h ɜː", + "l ˈaɪ", + "ˈæ n", + "ˈɪ ɹ", + "ʊ d", + "ɹ ᵻ", + "l d", + "b ˌʌt", + "k s", + "n ˈoʊ", + "ɾ ɚ", + "hæ d", + "ɛ ɹ", + "ˈɪ ŋ", + "ɡ ɹ", + "ɔ n", + "n ˌɑː", + "m aɪ", + "v ɚ", + "f ɔːɹ", + "ð ɚ", + "t ʊ", + "ð ɛɹ", + "ˈʌ m", + "nˌɑː t", + "t ɹ", + "s ˈiː", + "ʌv ðə", + "m ˈɪ", + "ˈæ p", + "ˌɪ m", + "ɪ k", + "s p", + "l ˈeɪ", + "h ˌɪm", + "ɐ n", + "ð eɪ", + "l ˈɪ", + "ɾ i", + "b ɹ", + "l ˈɛ", + "k ɹ", + "ˈɪ l", + "j ˈuː", + "ʌ m", + "mˌ iː", + "l ˈæ", + "ˌ ɪn", + "b ᵻ", + "w ˈʌn", + "ˈɪ n", + "ˈoʊ n", + "b iː", + "sˈɛ d", + "ˈɛ d", + "ˈaɪ t", + "fɹ ʌm", + "b aɪ", + "ɪ s", + "ɚ z", + "ðɪ s", + "ən s", + "ɪ f", + "b əl", + "ˈæ nd", + "ɪn ðə", + "ə m", + "iː z", + "ˌ uː", + "ᵻ z", + "w ˈeɪ", + "f t", + "w iː", + "l ˈiː", + "st ɹ", + "j ʊ", + "ɚ d", + "ˌ aɪ", + "k w", + "ˌ ɔn", + "ˈaɪ d", + "t s", + "ɪ m", + "ˈʌ st", + "ˈoʊ ld", + "ˌɪ tʃ", + "d ˈɪ", + "s ˌoʊ", + "ɑː ɹ", + "h ɐ", + "s ˈeɪ", + "ɾ ᵻd", + "d ᵻ", + "w ˌɪtʃ", + "sˈɛ l", + "ɹ i", + "ˈʌ ðɚ" + ] + } +} \ No newline at end of file diff --git a/vall_e/data.py b/vall_e/data.py index ca2a923..40d54fc 100755 --- a/vall_e/data.py +++ b/vall_e/data.py @@ -753,7 +753,7 @@ def create_train_val_dataloader(): return train_dl, subtrain_dl, val_dl # parse dataset into better to sample metadata -def create_dataset_metadata( skip_existing=False ): +def create_dataset_metadata( skip_existing=True ): symmap = get_phone_symmap() root = str(cfg.data_dir) @@ -828,7 +828,8 @@ def create_dataset_metadata( skip_existing=False ): metadata[id]["phones"] = len(phn) metadata[id]["transcription"] = txt except Exception as e: - raise e + #raise e + print(id, e) #pass with open(str(metadata_path), "w", encoding="utf-8") as f: @@ -885,8 +886,10 @@ def create_dataset_hdf5( skip_existing=True ): key = f'{type}/{name}/{id}' + """ if skip_existing and key in hf: continue + """ group = hf.create_group(key) if key not in hf else hf[key] @@ -948,7 +951,8 @@ def create_dataset_hdf5( skip_existing=True ): group.attrs['phonemes'] = 0 metadata[id]["phones"] = 0 except Exception as e: - raise e + #raise e + print(id, e) #pass with open(str(metadata_path), "w", encoding="utf-8") as f: