update tokenizer because, for some reason, it had the special tokens in the wrong order, to the point where eos = unk

mrq 2024-04-29 09:09:26 -05:00
parent 57810e4ba4
commit 6a11bc9cb6
2 changed files with 558 additions and 3 deletions
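The ordering bug described in the commit message (eos resolving to the id reserved for unk) is easy to check from Python; a minimal sketch, not part of this commit, assuming the `tokenizers` package and a checkout containing the fixed `data/tokenizer.json`:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer.json")

# After this fix, each special token maps to its intended id.
for token in ("<unk>", "<bos>", "</eos>", "<mask>"):
    print(token, tokenizer.token_to_id(token))
# expected: <unk> 0, <bos> 1, </eos> 2, <mask> 3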

data/tokenizer.json Normal file

@@ -0,0 +1,551 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
1
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
2
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<unk>": 0,
"<bos>": 1,
"</eos>": 2,
"<mask>": 3,
"!": 4,
"\"": 5,
"(": 6,
")": 7,
",": 8,
"-": 9,
".": 10,
"1": 11,
":": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"h": 20,
"i": 21,
"j": 22,
"k": 23,
"l": 24,
"m": 25,
"n": 26,
"o": 27,
"p": 28,
"q": 29,
"r": 30,
"s": 31,
"t": 32,
"u": 33,
"v": 34,
"w": 35,
"x": 36,
"z": 37,
"¡": 38,
"«": 39,
"»": 40,
"¿": 41,
"æ": 42,
"ç": 43,
"ð": 44,
"ŋ": 45,
"ɐ": 46,
"ɑ": 47,
"ɔ": 48,
"ɕ": 49,
"ə": 50,
"ɚ": 51,
"ɛ": 52,
"ɜ": 53,
"ɟ": 54,
"ɡ": 55,
"ɪ": 56,
"ɬ": 57,
"ɯ": 58,
"ɹ": 59,
"ɾ": 60,
"ʃ": 61,
"ʈ": 62,
"ʊ": 63,
"ʋ": 64,
"ʌ": 65,
"ʑ": 66,
"ʒ": 67,
"ʔ": 68,
"ʲ": 69,
"ˈ": 70,
"ˌ": 71,
"ː": 72,
"̃": 73,
"̩": 74,
"θ": 75,
"ᵻ": 76,
"—": 77,
"…": 78,
"ˈɛ": 79,
"iː": 80,
"aɪ": 81,
"nd": 82,
"ˈɪ": 83,
"eɪ": 84,
"ˈæ": 85,
"oʊ": 86,
"ðə": 87,
"ɑː": 88,
"ˈeɪ": 89,
"ən": 90,
"uː": 91,
"ˈʌ": 92,
"ˈaɪ": 93,
"st": 94,
"ˈɔ": 95,
"ˈoʊ": 96,
"ˈiː": 97,
"ˈɑː": 98,
"ænd": 99,
"ːɹ": 100,
"ɪŋ": 101,
"ɜː": 102,
"ɪn": 103,
"tə": 104,
"ʌv": 105,
"aʊ": 106,
"əl": 107,
"ˈuː": 108,
"tʃ": 109,
"ɪz": 110,
"ˈɜː": 111,
"ˌʌ": 112,
"æt": 113,
"dʒ": 114,
"ˈɔː": 115,
"ɪt": 116,
"ˈaʊ": 117,
"ɚɹ": 118,
"ˈɛn": 119,
"wʌ": 120,
"li": 121,
"hiː": 122,
"ˌɛ": 123,
"wɪ": 124,
"ðæt": 125,
"wʌz": 126,
"juː": 127,
"oːɹ": 128,
"ðɪ": 129,
"sˈɛ": 130,
"ˈɑːɹ": 131,
"ˌɪ": 132,
"nt": 133,
"ˈʊ": 134,
"ənt": 135,
"hɪz": 136,
"hæ": 137,
"ˌɑː": 138,
"ɔːɹ": 139,
"ˈɛɹ": 140,
"wɪð": 141,
"ᵻd": 142,
"ˈoːɹ": 143,
"pɹ": 144,
"ˈɔːl": 145,
"mˌ": 146,
"ʃən": 147,
"kt": 148,
"ˌoʊ": 149,
"ˈɔːɹ": 150,
"fɹ": 151,
"æz": 152,
"ʃiː": 153,
"ˌʌt": 154,
"ˈɛl": 155,
"ˌaʊ": 156,
"ˈʌn": 157,
"əs": 158,
"hɜː": 159,
"lˈaɪ": 160,
"ˈæn": 161,
"ˈɪɹ": 162,
"ʊd": 163,
"ɹᵻ": 164,
"ld": 165,
"bˌʌt": 166,
"ks": 167,
"nˈoʊ": 168,
"ɾɚ": 169,
"hæd": 170,
"ɛɹ": 171,
"ˈɪŋ": 172,
"ɡɹ": 173,
"ɔn": 174,
"nˌɑː": 175,
"maɪ": 176,
"vɚ": 177,
"fɔːɹ": 178,
"ðɚ": 179,
"tʊ": 180,
"ðɛɹ": 181,
"ˈʌm": 182,
"nˌɑːt": 183,
"tɹ": 184,
"sˈiː": 185,
"ʌvðə": 186,
"mˈɪ": 187,
"ˈæp": 188,
ɪm": 189,
"ɪk": 190,
"sp": 191,
"lˈeɪ": 192,
"hˌɪm": 193,
"ɐn": 194,
"ðeɪ": 195,
"lˈɪ": 196,
"ɾi": 197,
"bɹ": 198,
"lˈɛ": 199,
"kɹ": 200,
"ˈɪl": 201,
"jˈuː": 202,
"ʌm": 203,
"mˌiː": 204,
"lˈæ": 205,
ɪn": 206,
"bᵻ": 207,
"wˈʌn": 208,
"ˈɪn": 209,
"ˈoʊn": 210,
"biː": 211,
"sˈɛd": 212,
"ˈɛd": 213,
"ˈaɪt": 214,
"fɹʌm": 215,
"baɪ": 216,
"ɪs": 217,
"ɚz": 218,
ɪs": 219,
"əns": 220,
"ɪf": 221,
"bəl": 222,
"ˈænd": 223,
"ɪnðə": 224,
"əm": 225,
"iːz": 226,
"ˌuː": 227,
"ᵻz": 228,
"wˈeɪ": 229,
"ft": 230,
"wiː": 231,
"lˈiː": 232,
"stɹ": 233,
"jʊ": 234,
"ɚd": 235,
"ˌaɪ": 236,
"kw": 237,
"ˌɔn": 238,
"ˈaɪd": 239,
"ts": 240,
"ɪm": 241,
"ˈʌst": 242,
"ˈoʊld": 243,
ɪtʃ": 244,
"dˈɪ": 245,
"sˌoʊ": 246,
"ɑːɹ": 247,
"hɐ": 248,
"sˈeɪ": 249,
"ɾᵻd": 250,
"dᵻ": 251,
"wˌɪtʃ": 252,
"sˈɛl": 253,
"ɹi": 254,
"ˈʌðɚ": 255
},
"merges": [
"ˈ ɛ",
"i ː",
"a ɪ",
"n d",
"ˈ ɪ",
"e ɪ",
"ˈ æ",
"o ʊ",
"ð ə",
"ɑ ː",
"ˈ eɪ",
"ə n",
"u ː",
"ˈ ʌ",
"ˈ aɪ",
"s t",
"ˈ ɔ",
"ˈ oʊ",
"ˈ iː",
"ˈ ɑː",
"æ nd",
"ː ɹ",
"ɪ ŋ",
ː",
"ɪ n",
"t ə",
"ʌ v",
"a ʊ",
"ə l",
"ˈ uː",
"t ʃ",
"ɪ z",
"ˈ ɜː",
"ˌ ʌ",
"æ t",
"d ʒ",
"ˈɔ ː",
"ɪ t",
"ˈ aʊ",
"ɚ ɹ",
"ˈɛ n",
"w ʌ",
"l i",
"h iː",
"ˌ ɛ",
"w ɪ",
"ð æt",
"wʌ z",
"j uː",
"o ːɹ",
ɪ",
"s ˈɛ",
"ˈɑː ɹ",
ɪ",
"n t",
"ˈ ʊ",
"ən t",
"h ɪz",
"h æ",
ɑː",
"ɔ ːɹ",
"ˈɛ ɹ",
"wɪ ð",
"ᵻ d",
"ˈ oːɹ",
"p ɹ",
"ˈɔː l",
"m ˌ",
"ʃ ən",
"k t",
"ˌ oʊ",
"ˈɔ ːɹ",
"f ɹ",
"æ z",
"ʃ iː",
"ˌʌ t",
"ˈɛ l",
"ˌ aʊ",
"ˈʌ n",
"ə s",
"h ɜː",
"l ˈaɪ",
"ˈæ n",
"ˈɪ ɹ",
"ʊ d",
"ɹ ᵻ",
"l d",
"b ˌʌt",
"k s",
"n ˈoʊ",
"ɾ ɚ",
"hæ d",
"ɛ ɹ",
"ˈɪ ŋ",
"ɡ ɹ",
"ɔ n",
"n ˌɑː",
"m aɪ",
"v ɚ",
"f ɔːɹ",
"ð ɚ",
"t ʊ",
"ð ɛɹ",
"ˈʌ m",
"nˌɑː t",
"t ɹ",
"s ˈiː",
"ʌv ðə",
"m ˈɪ",
"ˈæ p",
"ˌɪ m",
"ɪ k",
"s p",
"l ˈeɪ",
"h ˌɪm",
"ɐ n",
"ð eɪ",
"l ˈɪ",
"ɾ i",
"b ɹ",
"l ˈɛ",
"k ɹ",
"ˈɪ l",
"j ˈuː",
"ʌ m",
"mˌ iː",
"l ˈæ",
ɪn",
"b ᵻ",
"w ˈʌn",
"ˈɪ n",
"ˈoʊ n",
"b iː",
"sˈɛ d",
"ˈɛ d",
"ˈaɪ t",
"fɹ ʌm",
"b aɪ",
"ɪ s",
"ɚ z",
"ðɪ s",
"ən s",
"ɪ f",
"b əl",
"ˈæ nd",
"ɪn ðə",
"ə m",
"iː z",
"ˌ uː",
"ᵻ z",
"w ˈeɪ",
"f t",
"w iː",
"l ˈiː",
"st ɹ",
"j ʊ",
"ɚ d",
"ˌ aɪ",
"k w",
"ˌ ɔn",
"ˈaɪ d",
"t s",
"ɪ m",
"ˈʌ st",
"ˈoʊ ld",
"ˌɪ tʃ",
"d ˈɪ",
"s ˌoʊ",
"ɑː ɹ",
"h ɐ",
"s ˈeɪ",
"ɾ ᵻd",
"d ᵻ",
"w ˌɪtʃ",
"sˈɛ l",
"ɹ i",
"ˈʌ ðɚ"
]
}
}
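As a usage note (not part of the commit), the TemplateProcessing post-processor above wraps every encoded single sequence in <bos> (id 1) and <eos> (id 2); a minimal sketch with an arbitrary sample phoneme string:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer.json")

encoded = tokenizer.encode("hˈɛloʊ wˈɜːld")
# the post_processor prepends <bos> and appends <eos>,
# so the ids start with 1 and end with 2
print(encoded.ids)
print(encoded.tokens)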


@@ -753,7 +753,7 @@ def create_train_val_dataloader():
 	return train_dl, subtrain_dl, val_dl
 
 # parse dataset into better to sample metadata
-def create_dataset_metadata( skip_existing=False ):
+def create_dataset_metadata( skip_existing=True ):
 	symmap = get_phone_symmap()
 
 	root = str(cfg.data_dir)
@@ -828,7 +828,8 @@ def create_dataset_metadata( skip_existing=False ):
 			metadata[id]["phones"] = len(phn)
 			metadata[id]["transcription"] = txt
 		except Exception as e:
-			raise e
+			#raise e
+			print(id, e)
 			#pass
 
 	with open(str(metadata_path), "w", encoding="utf-8") as f:
@@ -885,8 +886,10 @@ def create_dataset_hdf5( skip_existing=True ):
 				key = f'{type}/{name}/{id}'
 
+				"""
 				if skip_existing and key in hf:
 					continue
+				"""
 
 				group = hf.create_group(key) if key not in hf else hf[key]
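With the skip check stringified out, existing groups are reopened rather than skipped; a minimal sketch of that reopen-or-create pattern, assuming h5py and an illustrative path and key:

import h5py

# illustrative file name and key, not taken from the repo's config
with h5py.File("dataset.h5", "a") as hf:
    key = "training/speaker/0001"
    # mirrors the diff above: reuse the group if present, else create it
    group = hf.create_group(key) if key not in hf else hf[key]
    group.attrs["phonemes"] = 0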
@@ -948,7 +951,8 @@
 				group.attrs['phonemes'] = 0
 				metadata[id]["phones"] = 0
 			except Exception as e:
-				raise e
+				#raise e
+				print(id, e)
 				#pass
 
 	with open(str(metadata_path), "w", encoding="utf-8") as f:
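Both `raise e` hunks make the same change: a malformed entry is now reported by id and the loop keeps going instead of aborting the whole pass. A self-contained sketch of the resulting pattern, with illustrative data rather than the repo's:

metadata = {"good": {"phones": "h ˈɛ l oʊ"}, "bad": {}}

for id, entry in metadata.items():
    try:
        entry["phones"] = len(entry["phones"].split())
    except Exception as e:
        #raise e        # old behavior: the first bad entry killed the run
        print(id, e)    # new behavior: log the offending id and continue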