vall-e/data/tokenizer.json
2024-12-22 15:05:45 -06:00

583 lines
9.6 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
1
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
2
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": false,
"use_regex": false
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"<bos>": 1,
"</eos>": 2,
"<mask>": 3,
" ": 4,
"ᵝ": 4,
"!": 5,
"\"": 6,
"(": 7,
"{": 7,
"[": 7,
")": 8,
"}": 8,
"]": 8,
",": 9,
"-": 10,
".": 11,
"1": 211,
"—": 10,
"“": 6,
"”": 81,
"ˇ": 6,
"ˉ": 12,
"ˊ": 79,
"ˋ": 80,
"_": 81,
":": 13,
";": 14,
"?": 15,
"a": 16,
"ä": 16,
"ɒ": 16,
"b": 17,
"c": 18,
"d": 19,
"e": 20,
"f": 21,
"h": 22,
"i": 23,
"ĩ": 23,
"j": 24,
"k": 25,
"l": 26,
"m": 27,
"n": 28,
"ɴ": 28,
"ɲ": 28,
"o": 29,
"̞": 29,
"p": 30,
"ɸ": 30,
"q": 31,
"r": 32,
"ɽ": 32,
"ʁ": 32,
"s": 33,
"t": 34,
"u": 35,
"ø": 35,
"œ": 35,
"y": 35,
"ɣ": 35,
"ũ": 35,
"v": 36,
"w": 37,
"ʍ": 37,
"x": 38,
"z": 39,
"¡": 40,
"«": 41,
"»": 42,
"¿": 43,
"æ": 44,
"ç": 45,
"ð": 46,
"ŋ": 47,
"ɐ": 48,
"ɑ": 49,
"ɔ": 50,
"ɕ": 51,
"ə": 52,
"ɚ": 53,
"ɛ": 54,
"ɜ": 55,
"ɟ": 56,
"ɡ": 57,
"ɪ": 58,
"ɬ": 59,
"ɯ": 60,
"ɹ": 61,
"ɾ": 62,
"ʃ": 63,
"ʈ": 64,
"ʊ": 65,
"ʋ": 66,
"ʌ": 67,
"ʑ": 68,
"ʒ": 69,
"ʔ": 70,
"ʲ": 71,
"ˈ": 72,
"ˌ": 73,
"ˌ": 73,
"ː": 74,
"̃": 75,
"̩": 76,
"θ": 77,
"ᵻ": 78,
"…": 82,
"ˈɛ": 83,
"iː": 84,
"aɪ": 85,
"nd": 86,
"ˈɪ": 87,
"eɪ": 88,
"ˈæ": 89,
"ðə": 90,
"oʊ": 91,
"ɑː": 92,
"ˈeɪ": 93,
"ən": 94,
"uː": 95,
"ˈʌ": 96,
"ˈaɪ": 97,
"st": 98,
"ˈɔ": 99,
"ˈoʊ": 100,
"ˈiː": 101,
"ˈɑː": 102,
"ænd": 103,
"ːɹ": 104,
"ɪŋ": 105,
"ɜː": 106,
"ɪn": 107,
"tə": 108,
"ʌv": 109,
"aʊ": 110,
"əl": 111,
"ˈuː": 112,
"tʃ": 113,
"ɪz": 114,
"ˈɜː": 115,
"ˌʌ": 116,
"æt": 117,
"dʒ": 118,
"ˈɔː": 119,
"ɪt": 120,
"ˈaʊ": 121,
"ɚɹ": 122,
"ˈɛn": 123,
"wʌ": 124,
"li": 125,
"hiː": 126,
"ˌɛ": 127,
"wɪ": 128,
"wʌz": 129,
"ðæt": 130,
"juː": 131,
"oːɹ": 132,
"ðɪ": 133,
"sˈɛ": 134,
"ˌɪ": 135,
"ˈɑːɹ": 136,
"nt": 137,
"ˈʊ": 138,
"ənt": 139,
"hɪz": 140,
"ˌɑː": 141,
"hæ": 142,
"ɔːɹ": 143,
"ˈɛɹ": 144,
"wɪð": 145,
"ᵻd": 146,
"ˈoːɹ": 147,
"pɹ": 148,
"ˈɔːl": 149,
"mˌ": 150,
"ʃən": 151,
"kt": 152,
"ˌoʊ": 153,
"ˈɔːɹ": 154,
"fɹ": 155,
"æz": 156,
"ˌʌt": 157,
"ʃiː": 158,
"ˈɛl": 159,
"ˌaʊ": 160,
"ˈʌn": 161,
"əs": 162,
"hɜː": 163,
"lˈaɪ": 164,
"ˈæn": 165,
"ˈɪɹ": 166,
"ʊd": 167,
"ɹᵻ": 168,
"ld": 169,
"bˌʌt": 170,
"ks": 171,
"nˈoʊ": 172,
"hæd": 173,
"ɾɚ": 174,
"ɛɹ": 175,
"ˈɪŋ": 176,
"ɡɹ": 177,
"nˌɑː": 178,
"ɔn": 179,
"vɚ": 180,
"maɪ": 181,
"fɔːɹ": 182,
"ðɚ": 183,
"tʊ": 184,
"ðɛɹ": 185,
"nˌɑːt": 186,
"ˈʌm": 187,
"tɹ": 188,
"sˈiː": 189,
"ʌvðə": 190,
"mˈɪ": 191,
"hˈæ": 192,
"ˌɪm": 193,
"lˈeɪ": 194,
"ɪk": 195,
"sp": 196,
"hˌɪm": 197,
"ɐn": 198,
"ðeɪ": 199,
"lˈɪ": 200,
"ɾi": 201,
"lˈɛ": 202,
"bɹ": 203,
"kɹ": 204,
"lˈæ": 205,
"ˈɪl": 206,
"jˈuː": 207,
"ʌm": 208,
"mˌiː": 209,
"bᵻ": 210,
"wˈʌn": 211,
"ˌɪn": 212,
"ˈɪn": 213,
"ˈoʊn": 214,
"sˈɛd": 215,
"biː": 216,
"ˈɛd": 217,
"ˈaɪt": 218,
"baɪ": 219,
"fɹʌm": 220,
"ɪs": 221,
"ɚz": 222,
"ðɪs": 223,
"əns": 224,
"bəl": 225,
"ɪf": 226,
"ɪnðə": 227,
"əm": 228,
"ᵻz": 229,
"ˌuː": 230,
"wˈeɪ": 231,
"ft": 232,
"wiː": 233,
"stɹ": 234,
"lˈiː": 235,
"iːz": 236,
"pt": 237,
"jʊ": 238,
"ɚd": 239,
"ˌaɪ": 240,
"kw": 241,
"ˌɔn": 242,
"ˈaɪd": 243,
"ɪm": 244,
"ˈʌst": 245,
"ˈoʊld": 246,
"ts": 247,
"ˌɪtʃ": 248,
"sˌoʊ": 249,
"dˈɪ": 250,
"ɑːɹ": 251,
"hɐ": 252,
"sˈeɪ": 253,
"ɾᵻd": 254,
"wˌɪtʃ": 255
},
"merges": [
"ˈ ɛ",
"i ː",
"a ɪ",
"n d",
"ˈ ɪ",
"e ɪ",
"ˈ æ",
"ð ə",
"o ʊ",
"ɑ ː",
"ˈ eɪ",
"ə n",
"u ː",
"ˈ ʌ",
"ˈ aɪ",
"s t",
"ˈ ɔ",
"ˈ oʊ",
"ˈ iː",
"ˈ ɑː",
"æ nd",
"ː ɹ",
"ɪ ŋ",
"ɜ ː",
"ɪ n",
"t ə",
"ʌ v",
"a ʊ",
"ə l",
"ˈ uː",
"t ʃ",
"ɪ z",
"ˈ ɜː",
"ˌ ʌ",
"æ t",
"d ʒ",
"ˈɔ ː",
"ɪ t",
"ˈ aʊ",
"ɚ ɹ",
"ˈɛ n",
"w ʌ",
"l i",
"h iː",
"ˌ ɛ",
"w ɪ",
"wʌ z",
"ð æt",
"j uː",
"o ːɹ",
"ð ɪ",
"s ˈɛ",
"ˌ ɪ",
"ˈɑː ɹ",
"n t",
"ˈ ʊ",
"ən t",
"h ɪz",
"ˌ ɑː",
"h æ",
"ɔ ːɹ",
"ˈɛ ɹ",
"wɪ ð",
"ᵻ d",
"ˈ oːɹ",
"p ɹ",
"ˈɔː l",
"m ˌ",
"ʃ ən",
"k t",
"ˌ oʊ",
"ˈɔ ːɹ",
"f ɹ",
"æ z",
"ˌʌ t",
"ʃ iː",
"ˈɛ l",
"ˌ aʊ",
"ˈʌ n",
"ə s",
"h ɜː",
"l ˈaɪ",
"ˈæ n",
"ˈɪ ɹ",
"ʊ d",
"ɹ ᵻ",
"l d",
"b ˌʌt",
"k s",
"n ˈoʊ",
"hæ d",
"ɾ ɚ",
"ɛ ɹ",
"ˈɪ ŋ",
"ɡ ɹ",
"n ˌɑː",
"ɔ n",
"v ɚ",
"m aɪ",
"f ɔːɹ",
"ð ɚ",
"t ʊ",
"ð ɛɹ",
"nˌɑː t",
"ˈʌ m",
"t ɹ",
"s ˈiː",
"ʌv ðə",
"m ˈɪ",
"h ˈæ",
"ˌɪ m",
"l ˈeɪ",
"ɪ k",
"s p",
"h ˌɪm",
"ɐ n",
"ð eɪ",
"l ˈɪ",
"ɾ i",
"l ˈɛ",
"b ɹ",
"k ɹ",
"l ˈæ",
"ˈɪ l",
"j ˈuː",
"ʌ m",
"mˌ iː",
"b ᵻ",
"w ˈʌn",
"ˌ ɪn",
"ˈɪ n",
"ˈoʊ n",
"sˈɛ d",
"b iː",
"ˈɛ d",
"ˈaɪ t",
"b aɪ",
"fɹ ʌm",
"ɪ s",
"ɚ z",
"ðɪ s",
"ən s",
"b əl",
"ɪ f",
"ɪn ðə",
"ə m",
"ᵻ z",
"ˌ uː",
"w ˈeɪ",
"f t",
"w iː",
"st ɹ",
"l ˈiː",
"iː z",
"p t",
"j ʊ",
"ɚ d",
"ˌ aɪ",
"k w",
"ˌ ɔn",
"ˈaɪ d",
"ɪ m",
"ˈʌ st",
"ˈoʊ ld",
"t s",
"ˌɪ tʃ",
"s ˌoʊ",
"d ˈɪ",
"ɑː ɹ",
"h ɐ",
"s ˈeɪ",
"ɾ ᵻd",
"w ˌɪtʃ"
]
}
}