vall-e/data/tokenizer.json

551 lines
9.2 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
1
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
2
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<unk>": 0,
"<bos>": 1,
"</eos>": 2,
"<mask>": 3,
"!": 4,
"\"": 5,
"(": 6,
")": 7,
",": 8,
"-": 9,
".": 10,
"1": 11,
":": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"h": 20,
"i": 21,
"j": 22,
"k": 23,
"l": 24,
"m": 25,
"n": 26,
"o": 27,
"p": 28,
"q": 29,
"r": 30,
"s": 31,
"t": 32,
"u": 33,
"v": 34,
"w": 35,
"x": 36,
"z": 37,
"¡": 38,
"«": 39,
"»": 40,
"¿": 41,
"æ": 42,
"ç": 43,
"ð": 44,
"ŋ": 45,
"ɐ": 46,
"ɑ": 47,
"ɔ": 48,
"ɕ": 49,
"ə": 50,
"ɚ": 51,
"ɛ": 52,
"ɜ": 53,
"ɟ": 54,
"ɡ": 55,
"ɪ": 56,
"ɬ": 57,
"ɯ": 58,
"ɹ": 59,
"ɾ": 60,
"ʃ": 61,
"ʈ": 62,
"ʊ": 63,
"ʋ": 64,
"ʌ": 65,
"ʑ": 66,
"ʒ": 67,
"ʔ": 68,
"ʲ": 69,
"ˈ": 70,
"ˌ": 71,
"ː": 72,
"̃": 73,
"̩": 74,
"θ": 75,
"ᵻ": 76,
"—": 77,
"…": 78,
"ˈɛ": 79,
"iː": 80,
"aɪ": 81,
"nd": 82,
"ˈɪ": 83,
"eɪ": 84,
"ˈæ": 85,
"oʊ": 86,
"ðə": 87,
"ɑː": 88,
"ˈeɪ": 89,
"ən": 90,
"uː": 91,
"ˈʌ": 92,
"ˈaɪ": 93,
"st": 94,
"ˈɔ": 95,
"ˈoʊ": 96,
"ˈiː": 97,
"ˈɑː": 98,
"ænd": 99,
"ːɹ": 100,
"ɪŋ": 101,
"ɜː": 102,
"ɪn": 103,
"tə": 104,
"ʌv": 105,
"aʊ": 106,
"əl": 107,
"ˈuː": 108,
"tʃ": 109,
"ɪz": 110,
"ˈɜː": 111,
"ˌʌ": 112,
"æt": 113,
"dʒ": 114,
"ˈɔː": 115,
"ɪt": 116,
"ˈaʊ": 117,
"ɚɹ": 118,
"ˈɛn": 119,
"wʌ": 120,
"li": 121,
"hiː": 122,
"ˌɛ": 123,
"wɪ": 124,
"ðæt": 125,
"wʌz": 126,
"juː": 127,
"oːɹ": 128,
"ðɪ": 129,
"sˈɛ": 130,
"ˈɑːɹ": 131,
"ˌɪ": 132,
"nt": 133,
"ˈʊ": 134,
"ənt": 135,
"hɪz": 136,
"hæ": 137,
"ˌɑː": 138,
"ɔːɹ": 139,
"ˈɛɹ": 140,
"wɪð": 141,
"ᵻd": 142,
"ˈoːɹ": 143,
"pɹ": 144,
"ˈɔːl": 145,
"mˌ": 146,
"ʃən": 147,
"kt": 148,
"ˌoʊ": 149,
"ˈɔːɹ": 150,
"fɹ": 151,
"æz": 152,
"ʃiː": 153,
"ˌʌt": 154,
"ˈɛl": 155,
"ˌaʊ": 156,
"ˈʌn": 157,
"əs": 158,
"hɜː": 159,
"lˈaɪ": 160,
"ˈæn": 161,
"ˈɪɹ": 162,
"ʊd": 163,
"ɹᵻ": 164,
"ld": 165,
"bˌʌt": 166,
"ks": 167,
"nˈoʊ": 168,
"ɾɚ": 169,
"hæd": 170,
"ɛɹ": 171,
"ˈɪŋ": 172,
"ɡɹ": 173,
"ɔn": 174,
"nˌɑː": 175,
"maɪ": 176,
"vɚ": 177,
"fɔːɹ": 178,
"ðɚ": 179,
"tʊ": 180,
"ðɛɹ": 181,
"ˈʌm": 182,
"nˌɑːt": 183,
"tɹ": 184,
"sˈiː": 185,
"ʌvðə": 186,
"mˈɪ": 187,
"ˈæp": 188,
"ˌɪm": 189,
"ɪk": 190,
"sp": 191,
"lˈeɪ": 192,
"hˌɪm": 193,
"ɐn": 194,
"ðeɪ": 195,
"lˈɪ": 196,
"ɾi": 197,
"bɹ": 198,
"lˈɛ": 199,
"kɹ": 200,
"ˈɪl": 201,
"jˈuː": 202,
"ʌm": 203,
"mˌiː": 204,
"lˈæ": 205,
"ˌɪn": 206,
"bᵻ": 207,
"wˈʌn": 208,
"ˈɪn": 209,
"ˈoʊn": 210,
"biː": 211,
"sˈɛd": 212,
"ˈɛd": 213,
"ˈaɪt": 214,
"fɹʌm": 215,
"baɪ": 216,
"ɪs": 217,
"ɚz": 218,
"ðɪs": 219,
"əns": 220,
"ɪf": 221,
"bəl": 222,
"ˈænd": 223,
"ɪnðə": 224,
"əm": 225,
"iːz": 226,
"ˌuː": 227,
"ᵻz": 228,
"wˈeɪ": 229,
"ft": 230,
"wiː": 231,
"lˈiː": 232,
"stɹ": 233,
"jʊ": 234,
"ɚd": 235,
"ˌaɪ": 236,
"kw": 237,
"ˌɔn": 238,
"ˈaɪd": 239,
"ts": 240,
"ɪm": 241,
"ˈʌst": 242,
"ˈoʊld": 243,
"ˌɪtʃ": 244,
"dˈɪ": 245,
"sˌoʊ": 246,
"ɑːɹ": 247,
"hɐ": 248,
"sˈeɪ": 249,
"ɾᵻd": 250,
"dᵻ": 251,
"wˌɪtʃ": 252,
"sˈɛl": 253,
"ɹi": 254,
"ˈʌðɚ": 255
},
"merges": [
"ˈ ɛ",
"i ː",
"a ɪ",
"n d",
"ˈ ɪ",
"e ɪ",
"ˈ æ",
"o ʊ",
"ð ə",
"ɑ ː",
"ˈ eɪ",
"ə n",
"u ː",
"ˈ ʌ",
"ˈ aɪ",
"s t",
"ˈ ɔ",
"ˈ oʊ",
"ˈ iː",
"ˈ ɑː",
"æ nd",
"ː ɹ",
"ɪ ŋ",
"ɜ ː",
"ɪ n",
"t ə",
"ʌ v",
"a ʊ",
"ə l",
"ˈ uː",
"t ʃ",
"ɪ z",
"ˈ ɜː",
"ˌ ʌ",
"æ t",
"d ʒ",
"ˈɔ ː",
"ɪ t",
"ˈ aʊ",
"ɚ ɹ",
"ˈɛ n",
"w ʌ",
"l i",
"h iː",
"ˌ ɛ",
"w ɪ",
"ð æt",
"wʌ z",
"j uː",
"o ːɹ",
"ð ɪ",
"s ˈɛ",
"ˈɑː ɹ",
"ˌ ɪ",
"n t",
"ˈ ʊ",
"ən t",
"h ɪz",
"h æ",
"ˌ ɑː",
"ɔ ːɹ",
"ˈɛ ɹ",
"wɪ ð",
"ᵻ d",
"ˈ oːɹ",
"p ɹ",
"ˈɔː l",
"m ˌ",
"ʃ ən",
"k t",
"ˌ oʊ",
"ˈɔ ːɹ",
"f ɹ",
"æ z",
"ʃ iː",
"ˌʌ t",
"ˈɛ l",
"ˌ aʊ",
"ˈʌ n",
"ə s",
"h ɜː",
"l ˈaɪ",
"ˈæ n",
"ˈɪ ɹ",
"ʊ d",
"ɹ ᵻ",
"l d",
"b ˌʌt",
"k s",
"n ˈoʊ",
"ɾ ɚ",
"hæ d",
"ɛ ɹ",
"ˈɪ ŋ",
"ɡ ɹ",
"ɔ n",
"n ˌɑː",
"m aɪ",
"v ɚ",
"f ɔːɹ",
"ð ɚ",
"t ʊ",
"ð ɛɹ",
"ˈʌ m",
"nˌɑː t",
"t ɹ",
"s ˈiː",
"ʌv ðə",
"m ˈɪ",
"ˈæ p",
"ˌɪ m",
"ɪ k",
"s p",
"l ˈeɪ",
"h ˌɪm",
"ɐ n",
"ð eɪ",
"l ˈɪ",
"ɾ i",
"b ɹ",
"l ˈɛ",
"k ɹ",
"ˈɪ l",
"j ˈuː",
"ʌ m",
"mˌ iː",
"l ˈæ",
"ˌ ɪn",
"b ᵻ",
"w ˈʌn",
"ˈɪ n",
"ˈoʊ n",
"b iː",
"sˈɛ d",
"ˈɛ d",
"ˈaɪ t",
"fɹ ʌm",
"b aɪ",
"ɪ s",
"ɚ z",
"ðɪ s",
"ən s",
"ɪ f",
"b əl",
"ˈæ nd",
"ɪn ðə",
"ə m",
"iː z",
"ˌ uː",
"ᵻ z",
"w ˈeɪ",
"f t",
"w iː",
"l ˈiː",
"st ɹ",
"j ʊ",
"ɚ d",
"ˌ aɪ",
"k w",
"ˌ ɔn",
"ˈaɪ d",
"t s",
"ɪ m",
"ˈʌ st",
"ˈoʊ ld",
"ˌɪ tʃ",
"d ˈɪ",
"s ˌoʊ",
"ɑː ɹ",
"h ɐ",
"s ˈeɪ",
"ɾ ᵻd",
"d ᵻ",
"w ˌɪtʃ",
"sˈɛ l",
"ɹ i",
"ˈʌ ðɚ"
]
}
}