added japanese tokenizer (experimental)

This commit is contained in:
mrq 2023-03-17 20:04:40 +00:00
parent f34cc382c5
commit b17260cddf

564
models/tokenizers/japanese.json Executable file
View File

@ -0,0 +1,564 @@
{
"version":"1.0",
"truncation":null,
"padding":null,
"normalizer":null,
"pre_tokenizer":{
"type":"Whitespace"
},
"post_processor":null,
"decoder":null,
"added_tokens":[
{
"id":0,
"special":true,
"content":"[STOP]",
"single_word":false,
"lstrip":false,
"rstrip":false,
"normalized":false
},
{
"id":1,
"special":true,
"content":"[UNK]",
"single_word":false,
"lstrip":false,
"rstrip":false,
"normalized":false
},
{
"id":2,
"special":true,
"content":"[SPACE]",
"single_word":false,
"lstrip":false,
"rstrip":false,
"normalized":false
}
],
"model":{
"type":"BPE",
"language": "ja",
"dropout":null,
"unk_token":"[UNK]",
"continuing_subword_prefix":null,
"end_of_word_suffix":null,
"fuse_unk":false,
"vocab":{
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"ー":255,
"ア": 14,
"イ": 22,
"ウ": 34,
"エ": 18,
"オ": 28,
"カ": 254,
"ガ": 230,
"キ": 253,
"キャ": 215,
"キュ": 214,
"キョ": 213,
"ギ": 229,
"ギャ": 194,
"ギュ": 193,
"ギョ": 192,
"ク": 252,
"グ": 228,
"ケ": 93,
"ゲ": 112,
"コ": 251,
"ゴ": 119,
"サ": 150,
"ザ": 227,
"シ": 250,
"シャ": 212,
"シュ": 211,
"ショ": 210,
"ジ": 226,
"ジャ": 191,
"ジュ": 190,
"ジョ": 189,
"ス": 133,
"ズ": 225,
"セ": 66,
"ゼ": 224,
"ソ": 123,
"ゾ": 223,
"タ": 249,
"ダ": 222,
"チ": 248,
"チャ": 209,
"チュ": 208,
"チョ": 207,
"ヂ": 226,
"ツ": 247,
"ヅ": 225,
"テ": 136,
"デ": 126,
"ト": 51,
"ド": 134,
"ナ": 246,
"ニ": 245,
"ニャ": 206,
"ニュ": 205,
"ニョ": 204,
"ヌ": 244,
"ネ": 111,
"": 95,
"ハ": 102,
"バ": 221,
"パ": 218,
"ヒ": 243,
"ヒャ": 203,
"ヒュ": 202,
"ヒョ": 201,
"ビ": 220,
"ビャ": 188,
"ビュ": 187,
"ビョ": 186,
"ピ": 217,
"ピャ": 185,
"ピュ": 184,
"ピョ": 183,
"フ": 242,
"ブ": 219,
"プ": 216,
"ヘ": 62,
"ベ": 67,
"ペ": 124,
"ホ": 241,
"ボ": 165,
"ポ": 139,
"マ": 240,
"ミ": 239,
"ミャ": 200,
"ミュ": 199,
"ミョ": 198,
"ム": 238,
"メ": 80,
"モ": 115,
"ヤ": 237,
"ユ": 236,
"ヨ": 235,
"ラ": 234,
"リ": 101,
"リャ": 197,
"リュ": 196,
"リョ": 195,
"ル": 233,
"レ": 46,
"ロ": 89,
"ワ": 232,
"ヲ": 231,
"ン": 27,
"ッ":182,
"ャ":181,
"ョ":180,
"ュ":179,
"あ": 14,
"い": 22,
"う": 34,
"え": 18,
"お": 28,
"か": 254,
"が": 230,
"き": 253,
"きゃ": 215,
"きゅ": 214,
"きょ": 213,
"ぎ": 229,
"ぎゃ": 194,
"ぎゅ": 193,
"ぎょ": 192,
"く": 252,
"ぐ": 228,
"け": 93,
"げ": 112,
"こ": 251,
"ご": 119,
"さ": 150,
"ざ": 227,
"し": 250,
"しゃ": 212,
"しゅ": 211,
"しょ": 210,
"じ": 226,
"じゃ": 191,
"じゅ": 190,
"じょ": 189,
"す": 133,
"ず": 225,
"せ": 66,
"ぜ": 224,
"そ": 123,
"ぞ": 223,
"た": 249,
"だ": 222,
"ち": 248,
"ちゃ": 209,
"ちゅ": 208,
"ちょ": 207,
"ぢ": 226,
"つ": 247,
"づ": 225,
"て": 136,
"で": 126,
"と": 51,
"ど": 134,
"な": 246,
"に": 245,
"にゃ": 206,
"にゅ": 205,
"にょ": 204,
"ぬ": 244,
"ね": 111,
"の": 95,
"は": 102,
"ば": 221,
"ぱ": 218,
"ひ": 243,
"ひゃ": 203,
"ひゅ": 202,
"ひょ": 201,
"び": 220,
"びゃ": 188,
"びゅ": 187,
"びょ": 186,
"ぴ": 217,
"ぴゃ": 185,
"ぴゅ": 184,
"ぴょ": 183,
"ふ": 242,
"ぶ": 219,
"ぷ": 216,
"へ": 62,
"べ": 67,
"ぺ": 124,
"ほ": 241,
"ぼ": 165,
"ぽ": 139,
"ま": 240,
"み": 239,
"みゃ": 200,
"みゅ": 199,
"みょ": 198,
"む": 238,
"め": 80,
"も": 115,
"や": 237,
"ゆ": 236,
"よ": 235,
"ら": 234,
"り": 101,
"りゃ": 197,
"りゅ": 196,
"りょ": 195,
"る": 233,
"れ": 46,
"ろ": 89,
"わ": 232,
"を": 231,
"ん": 27,
"っ":182,
"ゃ":181,
"ょ":180,
"ゅ":179,
"ba": 221,
"bi": 220,
"bu": 219,
"bya": 188,
"byo": 186,
"byu": 187,
"cha": 209,
"chi": 248,
"cho": 207,
"chu": 208,
"da": 222,
"fu": 242,
"ga": 230,
"gi": 229,
"gu": 228,
"gya": 194,
"gyo": 192,
"gyu": 193,
"hi": 243,
"ho": 241,
"hya": 203,
"hyo": 201,
"hyu": 202,
"ja": 191,
"ji": 226,
"jo": 189,
"ju": 190,
"ka": 254,
"ki": 253,
"ko": 251,
"ku": 252,
"kya": 215,
"kyo": 213,
"kyu": 214,
"ma": 240,
"mi": 239,
"mu": 238,
"mya": 200,
"myo": 198,
"myu": 199,
"na": 246,
"ni": 245,
"nu": 244,
"nya": 206,
"nyo": 204,
"nyu": 205,
"pa": 218,
"pi": 217,
"pu": 216,
"py": 183,
"pya": 185,
"pyu": 184,
"ra": 234,
"ru": 233,
"rya": 197,
"ryo": 195,
"ryu": 196,
"sha": 212,
"shi": 250,
"sho": 210,
"shu": 211,
"ta": 249,
"tsu": 247,
"wa": 232,
"wo": 231,
"ya": 237,
"yo": 235,
"yu": 236,
"za": 227,
"ze": 224,
"zo": 223,
"zu": 225,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"、": 7,
"-": 8,
".": 9,
"。": 9,
"…": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"th": 40,
"in": 41,
"the": 42,
"an": 43,
"er": 44,
"ou": 45,
"re": 46,
"on": 47,
"at": 48,
"ed": 49,
"en": 50,
"to": 51,
"ing": 52,
"and": 53,
"is": 54,
"as": 55,
"al": 56,
"or": 57,
"of": 58,
"ar": 59,
"it": 60,
"es": 61,
"he": 62,
"st": 63,
"le": 64,
"om": 65,
"se": 66,
"be": 67,
"ad": 68,
"ow": 69,
"ly": 70,
"ch": 71,
"wh": 72,
"that": 73,
"you": 74,
"li": 75,
"ve": 76,
"ac": 77,
"ti": 78,
"ld": 79,
"me": 80,
"was": 81,
"gh": 82,
"id": 83,
"ll": 84,
"wi": 85,
"ent": 86,
"for": 87,
"ay": 88,
"ro": 89,
"ver": 90,
"ic": 91,
"her": 92,
"ke": 93,
"his": 94,
"no": 95,
"ut": 96,
"un": 97,
"ir": 98,
"lo": 99,
"we": 100,
"ri": 101,
"ha": 102,
"with": 103,
"ght": 104,
"out": 105,
"im": 106,
"ion": 107,
"all": 108,
"ab": 109,
"one": 110,
"ne": 111,
"ge": 112,
"ould": 113,
"ter": 114,
"mo": 115,
"had": 116,
"ce": 117,
"she": 118,
"go": 119,
"sh": 120,
"ur": 121,
"am": 122,
"so": 123,
"pe": 124,
"my": 125,
"de": 126,
"are": 127,
"but": 128,
"ome": 129,
"fr": 130,
"ther": 131,
"fe": 132,
"su": 133,
"do": 134,
"con": 135,
"te": 136,
"ain": 137,
"ere": 138,
"po": 139,
"if": 140,
"they": 141,
"us": 142,
"ag": 143,
"tr": 144,
"now": 145,
"oun": 146,
"this": 147,
"have": 148,
"not": 149,
"sa": 150,
"il": 151,
"up": 152,
"thing": 153,
"from": 154,
"ap": 155,
"him": 156,
"ack": 157,
"ation": 158,
"ant": 159,
"our": 160,
"op": 161,
"like": 162,
"ust": 163,
"ess": 164,
"bo": 165,
"ok": 166,
"ul": 167,
"ind": 168,
"ex": 169,
"com": 170,
"some": 171,
"there": 172,
"ers": 173,
"co": 174,
"res": 175,
"man": 176,
"ard": 177,
"pl": 178
},
"merges":[
"キ ャ",
"キ ュ",
"キ ョ",
"シ ャ",
"シ ュ",
"シ ョ",
"チ ャ",
"チ ュ",
"チ ョ",
"ニ ャ",
"ニ ュ",
"ニ ョ",
"ヒ ャ",
"ヒ ュ",
"ヒ ョ",
"ミ ャ",
"ミ ュ",
"ミ ョ",
"リ ャ",
"リ ュ",
"リ ョ",
"ギ ャ",
"ギ ュ",
"ギ ョ",
"ジ ャ",
"ジ ュ",
"ジ ョ",
"ビ ャ",
"ビ ュ",
"ビ ョ",
"ピ ャ",
"ピ ュ",
"ピ ョ"
]
}
}