From b17260cddffd4578bf7e76b30f6b581fe9b540bd Mon Sep 17 00:00:00 2001 From: mrq Date: Fri, 17 Mar 2023 20:04:40 +0000 Subject: [PATCH] added japanese tokenizer (experimental) --- models/tokenizers/japanese.json | 564 ++++++++++++++++++++++++++++++++ 1 file changed, 564 insertions(+) create mode 100755 models/tokenizers/japanese.json diff --git a/models/tokenizers/japanese.json b/models/tokenizers/japanese.json new file mode 100755 index 0000000..116ce5e --- /dev/null +++ b/models/tokenizers/japanese.json @@ -0,0 +1,564 @@ +{ + "version":"1.0", + "truncation":null, + "padding":null, + "normalizer":null, + "pre_tokenizer":{ + "type":"Whitespace" + }, + "post_processor":null, + "decoder":null, + "added_tokens":[ + { + "id":0, + "special":true, + "content":"[STOP]", + "single_word":false, + "lstrip":false, + "rstrip":false, + "normalized":false + }, + { + "id":1, + "special":true, + "content":"[UNK]", + "single_word":false, + "lstrip":false, + "rstrip":false, + "normalized":false + }, + { + "id":2, + "special":true, + "content":"[SPACE]", + "single_word":false, + "lstrip":false, + "rstrip":false, + "normalized":false + } + ], + "model":{ + "type":"BPE", + "language": "ja", + "dropout":null, + "unk_token":"[UNK]", + "continuing_subword_prefix":null, + "end_of_word_suffix":null, + "fuse_unk":false, + "vocab":{ + "[STOP]": 0, + "[UNK]": 1, + "[SPACE]": 2, + + "ー":255, + + "ア": 14, + "イ": 22, + "ウ": 34, + "エ": 18, + "オ": 28, + "カ": 254, + "ガ": 230, + "キ": 253, + "キャ": 215, + "キュ": 214, + "キョ": 213, + "ギ": 229, + "ギャ": 194, + "ギュ": 193, + "ギョ": 192, + "ク": 252, + "グ": 228, + "ケ": 93, + "ゲ": 112, + "コ": 251, + "ゴ": 119, + "サ": 150, + "ザ": 227, + "シ": 250, + "シャ": 212, + "シュ": 211, + "ショ": 210, + "ジ": 226, + "ジャ": 191, + "ジュ": 190, + "ジョ": 189, + "ス": 133, + "ズ": 225, + "セ": 66, + "ゼ": 224, + "ソ": 123, + "ゾ": 223, + "タ": 249, + "ダ": 222, + "チ": 248, + "チャ": 209, + "チュ": 208, + "チョ": 207, + "ヂ": 226, + "ツ": 247, + "ヅ": 225, + "テ": 136, + "デ": 126, + "ト": 51, + "ド": 134, + 
"ナ": 246, + "ニ": 245, + "ニャ": 206, + "ニュ": 205, + "ニョ": 204, + "ヌ": 244, + "ネ": 111, + "ノ": 95, + "ハ": 102, + "バ": 221, + "パ": 218, + "ヒ": 243, + "ヒャ": 203, + "ヒュ": 202, + "ヒョ": 201, + "ビ": 220, + "ビャ": 188, + "ビュ": 187, + "ビョ": 186, + "ピ": 217, + "ピャ": 185, + "ピュ": 184, + "ピョ": 183, + "フ": 242, + "ブ": 219, + "プ": 216, + "ヘ": 62, + "ベ": 67, + "ペ": 124, + "ホ": 241, + "ボ": 165, + "ポ": 139, + "マ": 240, + "ミ": 239, + "ミャ": 200, + "ミュ": 199, + "ミョ": 198, + "ム": 238, + "メ": 80, + "モ": 115, + "ヤ": 237, + "ユ": 236, + "ヨ": 235, + "ラ": 234, + "リ": 101, + "リャ": 197, + "リュ": 196, + "リョ": 195, + "ル": 233, + "レ": 46, + "ロ": 89, + "ワ": 232, + "ヲ": 231, + "ン": 27, + "ッ":182, + "ャ":181, + "ョ":180, + "ュ":179, + + "あ": 14, + "い": 22, + "う": 34, + "え": 18, + "お": 28, + "か": 254, + "が": 230, + "き": 253, + "きゃ": 215, + "きゅ": 214, + "きょ": 213, + "ぎ": 229, + "ぎゃ": 194, + "ぎゅ": 193, + "ぎょ": 192, + "く": 252, + "ぐ": 228, + "け": 93, + "げ": 112, + "こ": 251, + "ご": 119, + "さ": 150, + "ざ": 227, + "し": 250, + "しゃ": 212, + "しゅ": 211, + "しょ": 210, + "じ": 226, + "じゃ": 191, + "じゅ": 190, + "じょ": 189, + "す": 133, + "ず": 225, + "せ": 66, + "ぜ": 224, + "そ": 123, + "ぞ": 223, + "た": 249, + "だ": 222, + "ち": 248, + "ちゃ": 209, + "ちゅ": 208, + "ちょ": 207, + "ぢ": 226, + "つ": 247, + "づ": 225, + "て": 136, + "で": 126, + "と": 51, + "ど": 134, + "な": 246, + "に": 245, + "にゃ": 206, + "にゅ": 205, + "にょ": 204, + "ぬ": 244, + "ね": 111, + "の": 95, + "は": 102, + "ば": 221, + "ぱ": 218, + "ひ": 243, + "ひゃ": 203, + "ひゅ": 202, + "ひょ": 201, + "び": 220, + "びゃ": 188, + "びゅ": 187, + "びょ": 186, + "ぴ": 217, + "ぴゃ": 185, + "ぴゅ": 184, + "ぴょ": 183, + "ふ": 242, + "ぶ": 219, + "ぷ": 216, + "へ": 62, + "べ": 67, + "ぺ": 124, + "ほ": 241, + "ぼ": 165, + "ぽ": 139, + "ま": 240, + "み": 239, + "みゃ": 200, + "みゅ": 199, + "みょ": 198, + "む": 238, + "め": 80, + "も": 115, + "や": 237, + "ゆ": 236, + "よ": 235, + "ら": 234, + "り": 101, + "りゃ": 197, + "りゅ": 196, + "りょ": 195, + "る": 233, + "れ": 46, + "ろ": 89, + "わ": 232, + "を": 231, + "ん": 27, + "っ":182, + "ゃ":181, + 
"ょ":180, + "ゅ":179, + + "ba": 221, + "bi": 220, + "bu": 219, + "bya": 188, + "byo": 186, + "byu": 187, + "cha": 209, + "chi": 248, + "cho": 207, + "chu": 208, + "da": 222, + "fu": 242, + "ga": 230, + "gi": 229, + "gu": 228, + "gya": 194, + "gyo": 192, + "gyu": 193, + "hi": 243, + "ho": 241, + "hya": 203, + "hyo": 201, + "hyu": 202, + "ja": 191, + "ji": 226, + "jo": 189, + "ju": 190, + "ka": 254, + "ki": 253, + "ko": 251, + "ku": 252, + "kya": 215, + "kyo": 213, + "kyu": 214, + "ma": 240, + "mi": 239, + "mu": 238, + "mya": 200, + "myo": 198, + "myu": 199, + "na": 246, + "ni": 245, + "nu": 244, + "nya": 206, + "nyo": 204, + "nyu": 205, + "pa": 218, + "pi": 217, + "pu": 216, + "pyo": 183, + "pya": 185, + "pyu": 184, + "ra": 234, + "ru": 233, + "rya": 197, + "ryo": 195, + "ryu": 196, + "sha": 212, + "shi": 250, + "sho": 210, + "shu": 211, + "ta": 249, + "tsu": 247, + "wa": 232, + "wo": 231, + "ya": 237, + "yo": 235, + "yu": 236, + "za": 227, + "ze": 224, + "zo": 223, + "zu": 225, + + "!": 3, + "'": 4, + "(": 5, + ")": 6, + ",": 7, + "、": 7, + "-": 8, + ".": 9, + "。": 9, + "…": 9, + "/": 10, + ":": 11, + ";": 12, + "?": 13, + + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "g": 20, + "h": 21, + "i": 22, + "j": 23, + "k": 24, + "l": 25, + "m": 26, + "n": 27, + "o": 28, + "p": 29, + "q": 30, + "r": 31, + "s": 32, + "t": 33, + "u": 34, + "v": 35, + "w": 36, + "x": 37, + "y": 38, + "z": 39, + + "th": 40, + "in": 41, + "the": 42, + "an": 43, + "er": 44, + "ou": 45, + "re": 46, + "on": 47, + "at": 48, + "ed": 49, + "en": 50, + "to": 51, + "ing": 52, + "and": 53, + "is": 54, + "as": 55, + "al": 56, + "or": 57, + "of": 58, + "ar": 59, + "it": 60, + "es": 61, + "he": 62, + "st": 63, + "le": 64, + "om": 65, + "se": 66, + "be": 67, + "ad": 68, + "ow": 69, + "ly": 70, + "ch": 71, + "wh": 72, + "that": 73, + "you": 74, + "li": 75, + "ve": 76, + "ac": 77, + "ti": 78, + "ld": 79, + "me": 80, + "was": 81, + "gh": 82, + "id": 83, + "ll": 84, + "wi": 85, + "ent": 86, + 
"for": 87, + "ay": 88, + "ro": 89, + "ver": 90, + "ic": 91, + "her": 92, + "ke": 93, + "his": 94, + "no": 95, + "ut": 96, + "un": 97, + "ir": 98, + "lo": 99, + "we": 100, + "ri": 101, + "ha": 102, + "with": 103, + "ght": 104, + "out": 105, + "im": 106, + "ion": 107, + "all": 108, + "ab": 109, + "one": 110, + "ne": 111, + "ge": 112, + "ould": 113, + "ter": 114, + "mo": 115, + "had": 116, + "ce": 117, + "she": 118, + "go": 119, + "sh": 120, + "ur": 121, + "am": 122, + "so": 123, + "pe": 124, + "my": 125, + "de": 126, + "are": 127, + "but": 128, + "ome": 129, + "fr": 130, + "ther": 131, + "fe": 132, + "su": 133, + "do": 134, + "con": 135, + "te": 136, + "ain": 137, + "ere": 138, + "po": 139, + "if": 140, + "they": 141, + "us": 142, + "ag": 143, + "tr": 144, + "now": 145, + "oun": 146, + "this": 147, + "have": 148, + "not": 149, + "sa": 150, + "il": 151, + "up": 152, + "thing": 153, + "from": 154, + "ap": 155, + "him": 156, + "ack": 157, + "ation": 158, + "ant": 159, + "our": 160, + "op": 161, + "like": 162, + "ust": 163, + "ess": 164, + "bo": 165, + "ok": 166, + "ul": 167, + "ind": 168, + "ex": 169, + "com": 170, + "some": 171, + "there": 172, + "ers": 173, + "co": 174, + "res": 175, + "man": 176, + "ard": 177, + "pl": 178 + }, + "merges":[ + "キ ャ", + "キ ュ", + "キ ョ", + "シ ャ", + "シ ュ", + "シ ョ", + "チ ャ", + "チ ュ", + "チ ョ", + "ニ ャ", + "ニ ュ", + "ニ ョ", + "ヒ ャ", + "ヒ ュ", + "ヒ ョ", + "ミ ャ", + "ミ ュ", + "ミ ョ", + "リ ャ", + "リ ュ", + "リ ョ", + "ギ ャ", + "ギ ュ", + "ギ ョ", + "ジ ャ", + "ジ ュ", + "ジ ョ", + "ビ ャ", + "ビ ュ", + "ビ ョ", + "ピ ャ", + "ピ ュ", + "ピ ョ" + ] + } +} \ No newline at end of file