vall-e/data/tokenizer.json

583 lines
9.6 KiB
JSON
Raw Normal View History

{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
2024-05-16 04:04:19 +00:00
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
1
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
2
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": false,
"use_regex": false
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
2024-12-22 21:05:45 +00:00
"byte_fallback": false,
2024-05-16 04:04:19 +00:00
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"<bos>": 1,
"</eos>": 2,
"<mask>": 3,
2024-05-16 04:04:19 +00:00
" ": 4,
"ᵝ": 4,
2024-05-16 04:04:19 +00:00
"!": 5,
"\"": 6,
"(": 7,
2024-09-21 21:06:01 +00:00
"{": 7,
"[": 7,
2024-05-16 04:04:19 +00:00
")": 8,
"}": 8,
"]": 8,
2024-05-16 04:04:19 +00:00
",": 9,
"-": 10,
".": 11,
"1": 211,
"—": 10,
"“": 6,
"”": 81,
"ˇ": 6,
"ˉ": 12,
"ˊ": 79,
"ˋ": 80,
"_": 81,
2024-05-16 04:04:19 +00:00
":": 13,
";": 14,
"?": 15,
"a": 16,
"ä": 16,
"ɒ": 16,
2024-05-16 04:04:19 +00:00
"b": 17,
"c": 18,
"d": 19,
"e": 20,
"f": 21,
"h": 22,
"i": 23,
"ĩ": 23,
2024-05-16 04:04:19 +00:00
"j": 24,
"k": 25,
"l": 26,
"m": 27,
"n": 28,
"ɴ": 28,
"ɲ": 28,
2024-05-16 04:04:19 +00:00
"o": 29,
"̞": 29,
2024-05-16 04:04:19 +00:00
"p": 30,
"ɸ": 30,
2024-05-16 04:04:19 +00:00
"q": 31,
"r": 32,
"ɽ": 32,
"ʁ": 32,
2024-05-16 04:04:19 +00:00
"s": 33,
"t": 34,
"u": 35,
"ø": 35,
"œ": 35,
"y": 35,
"ɣ": 35,
"ũ": 35,
2024-05-16 04:04:19 +00:00
"v": 36,
"w": 37,
"ʍ": 37,
2024-05-16 04:04:19 +00:00
"x": 38,
"z": 39,
"¡": 40,
"«": 41,
"»": 42,
"¿": 43,
"æ": 44,
"ç": 45,
"ð": 46,
"ŋ": 47,
"ɐ": 48,
"ɑ": 49,
"ɔ": 50,
"ɕ": 51,
"ə": 52,
"ɚ": 53,
"ɛ": 54,
"ɜ": 55,
"ɟ": 56,
"ɡ": 57,
"ɪ": 58,
"ɬ": 59,
"ɯ": 60,
"ɹ": 61,
"ɾ": 62,
"ʃ": 63,
"ʈ": 64,
"ʊ": 65,
"ʋ": 66,
"ʌ": 67,
"ʑ": 68,
"ʒ": 69,
"ʔ": 70,
"ʲ": 71,
"ˈ": 72,
"ˌ": 73,
"ˌ": 73,
2024-05-16 04:04:19 +00:00
"ː": 74,
"̃": 75,
"̩": 76,
"θ": 77,
"ᵻ": 78,
2024-05-16 04:04:19 +00:00
"…": 82,
"ˈɛ": 83,
"iː": 84,
"aɪ": 85,
"nd": 86,
"ˈɪ": 87,
"eɪ": 88,
"ˈæ": 89,
"ðə": 90,
"oʊ": 91,
"ɑː": 92,
"ˈeɪ": 93,
"ən": 94,
"uː": 95,
"ˈʌ": 96,
"ˈaɪ": 97,
"st": 98,
"ˈɔ": 99,
"ˈoʊ": 100,
"ˈiː": 101,
"ˈɑː": 102,
"ænd": 103,
"ːɹ": 104,
"ɪŋ": 105,
"ɜː": 106,
"ɪn": 107,
"tə": 108,
"ʌv": 109,
"aʊ": 110,
"əl": 111,
"ˈuː": 112,
"tʃ": 113,
"ɪz": 114,
"ˈɜː": 115,
"ˌʌ": 116,
"æt": 117,
"dʒ": 118,
"ˈɔː": 119,
"ɪt": 120,
"ˈaʊ": 121,
"ɚɹ": 122,
"ˈɛn": 123,
"wʌ": 124,
"li": 125,
"hiː": 126,
"ˌɛ": 127,
"wɪ": 128,
"wʌz": 129,
"ðæt": 130,
"juː": 131,
"oːɹ": 132,
"ðɪ": 133,
"sˈɛ": 134,
"ˌɪ": 135,
"ˈɑːɹ": 136,
"nt": 137,
"ˈʊ": 138,
"ənt": 139,
"hɪz": 140,
"ˌɑː": 141,
"hæ": 142,
"ɔːɹ": 143,
"ˈɛɹ": 144,
"wɪð": 145,
"ᵻd": 146,
"ˈoːɹ": 147,
"pɹ": 148,
"ˈɔːl": 149,
"mˌ": 150,
"ʃən": 151,
"kt": 152,
"ˌoʊ": 153,
"ˈɔːɹ": 154,
"fɹ": 155,
"æz": 156,
"ˌʌt": 157,
"ʃiː": 158,
"ˈɛl": 159,
"ˌaʊ": 160,
"ˈʌn": 161,
"əs": 162,
"hɜː": 163,
"lˈaɪ": 164,
"ˈæn": 165,
"ˈɪɹ": 166,
"ʊd": 167,
"ɹᵻ": 168,
"ld": 169,
"bˌʌt": 170,
"ks": 171,
"nˈoʊ": 172,
"hæd": 173,
"ɾɚ": 174,
"ɛɹ": 175,
"ˈɪŋ": 176,
"ɡɹ": 177,
"nˌɑː": 178,
"ɔn": 179,
"vɚ": 180,
"maɪ": 181,
"fɔːɹ": 182,
"ðɚ": 183,
"tʊ": 184,
"ðɛɹ": 185,
"nˌɑːt": 186,
"ˈʌm": 187,
"tɹ": 188,
"sˈiː": 189,
"ʌvðə": 190,
"mˈɪ": 191,
"hˈæ": 192,
"ˌɪm": 193,
"lˈeɪ": 194,
"ɪk": 195,
"sp": 196,
"hˌɪm": 197,
"ɐn": 198,
"ðeɪ": 199,
"lˈɪ": 200,
"ɾi": 201,
"lˈɛ": 202,
"bɹ": 203,
"kɹ": 204,
"lˈæ": 205,
2024-05-16 04:04:19 +00:00
"ˈɪl": 206,
"jˈuː": 207,
"ʌm": 208,
"mˌiː": 209,
"bᵻ": 210,
"wˈʌn": 211,
"ˌɪn": 212,
"ˈɪn": 213,
"ˈoʊn": 214,
"sˈɛd": 215,
"biː": 216,
"ˈɛd": 217,
"ˈaɪt": 218,
"baɪ": 219,
"fɹʌm": 220,
"ɪs": 221,
"ɚz": 222,
"ðɪs": 223,
"əns": 224,
"bəl": 225,
"ɪf": 226,
"ɪnðə": 227,
"əm": 228,
"ᵻz": 229,
"ˌuː": 230,
"wˈeɪ": 231,
"ft": 232,
"wiː": 233,
"stɹ": 234,
"lˈiː": 235,
"iːz": 236,
"pt": 237,
"jʊ": 238,
"ɚd": 239,
"ˌaɪ": 240,
"kw": 241,
"ˌɔn": 242,
"ˈaɪd": 243,
"ɪm": 244,
"ˈʌst": 245,
"ˈoʊld": 246,
"ts": 247,
"ˌɪtʃ": 248,
"sˌoʊ": 249,
"dˈɪ": 250,
"ɑːɹ": 251,
"hɐ": 252,
"sˈeɪ": 253,
"ɾᵻd": 254,
"wˌɪtʃ": 255
},
"merges": [
"ˈ ɛ",
"i ː",
"a ɪ",
"n d",
"ˈ ɪ",
"e ɪ",
"ˈ æ",
"ð ə",
2024-05-16 04:04:19 +00:00
"o ʊ",
"ɑ ː",
"ˈ eɪ",
"ə n",
"u ː",
"ˈ ʌ",
"ˈ aɪ",
"s t",
"ˈ ɔ",
"ˈ oʊ",
"ˈ iː",
"ˈ ɑː",
"æ nd",
"ː ɹ",
"ɪ ŋ",
"ɜ ː",
"ɪ n",
"t ə",
"ʌ v",
"a ʊ",
"ə l",
"ˈ uː",
"t ʃ",
"ɪ z",
"ˈ ɜː",
"ˌ ʌ",
"æ t",
"d ʒ",
"ˈɔ ː",
"ɪ t",
"ˈ aʊ",
"ɚ ɹ",
"ˈɛ n",
"w ʌ",
"l i",
"h iː",
"ˌ ɛ",
"w ɪ",
"wʌ z",
2024-05-16 04:04:19 +00:00
"ð æt",
"j uː",
"o ːɹ",
"ð ɪ",
"s ˈɛ",
"ˌ ɪ",
2024-05-16 04:04:19 +00:00
"ˈɑː ɹ",
"n t",
"ˈ ʊ",
"ən t",
"h ɪz",
"ˌ ɑː",
2024-05-16 04:04:19 +00:00
"h æ",
"ɔ ːɹ",
"ˈɛ ɹ",
"wɪ ð",
"ᵻ d",
"ˈ oːɹ",
"p ɹ",
"ˈɔː l",
"m ˌ",
"ʃ ən",
"k t",
"ˌ oʊ",
"ˈɔ ːɹ",
"f ɹ",
"æ z",
"ˌʌ t",
2024-05-16 04:04:19 +00:00
"ʃ iː",
"ˈɛ l",
"ˌ aʊ",
"ˈʌ n",
"ə s",
"h ɜː",
"l ˈaɪ",
"ˈæ n",
"ˈɪ ɹ",
"ʊ d",
"ɹ ᵻ",
"l d",
"b ˌʌt",
"k s",
"n ˈoʊ",
"hæ d",
2024-05-16 04:04:19 +00:00
"ɾ ɚ",
"ɛ ɹ",
"ˈɪ ŋ",
"ɡ ɹ",
"n ˌɑː",
2024-05-16 04:04:19 +00:00
"ɔ n",
"v ɚ",
2024-05-16 04:04:19 +00:00
"m aɪ",
"f ɔːɹ",
"ð ɚ",
"t ʊ",
"ð ɛɹ",
"nˌɑː t",
2024-05-16 04:04:19 +00:00
"ˈʌ m",
"t ɹ",
"s ˈiː",
"ʌv ðə",
"m ˈɪ",
2024-05-16 04:04:19 +00:00
"h ˈæ",
"ˌɪ m",
2024-05-16 04:04:19 +00:00
"l ˈeɪ",
"ɪ k",
"s p",
"h ˌɪm",
"ɐ n",
"ð eɪ",
"l ˈɪ",
"ɾ i",
"l ˈɛ",
2024-05-16 04:04:19 +00:00
"b ɹ",
"k ɹ",
2024-05-16 04:04:19 +00:00
"l ˈæ",
"ˈɪ l",
"j ˈuː",
"ʌ m",
"mˌ iː",
"b ᵻ",
"w ˈʌn",
2024-05-16 04:04:19 +00:00
"ˌ ɪn",
"ˈɪ n",
"ˈoʊ n",
"sˈɛ d",
2024-05-16 04:04:19 +00:00
"b iː",
"ˈɛ d",
"ˈaɪ t",
"b aɪ",
2024-05-16 04:04:19 +00:00
"fɹ ʌm",
"ɪ s",
"ɚ z",
"ðɪ s",
"ən s",
"b əl",
2024-05-16 04:04:19 +00:00
"ɪ f",
"ɪn ðə",
"ə m",
"ᵻ z",
2024-05-16 04:04:19 +00:00
"ˌ uː",
"w ˈeɪ",
"f t",
"w iː",
"st ɹ",
2024-05-16 04:04:19 +00:00
"l ˈiː",
"iː z",
"p t",
"j ʊ",
"ɚ d",
"ˌ aɪ",
"k w",
"ˌ ɔn",
"ˈaɪ d",
"ɪ m",
"ˈʌ st",
"ˈoʊ ld",
2024-05-16 04:04:19 +00:00
"t s",
"ˌɪ tʃ",
"s ˌoʊ",
2024-05-16 04:04:19 +00:00
"d ˈɪ",
"ɑː ɹ",
"h ɐ",
"s ˈeɪ",
"ɾ ᵻd",
2024-05-16 04:04:19 +00:00
"w ˌɪtʃ"
]
}
}