Added Japanese preprocessor for tokenizer

This commit is contained in:
mrq 2023-03-17 20:03:02 +00:00
parent af78e3978a
commit d1ad634ea9

View File

@ -172,17 +172,33 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
class VoiceBpeTokenizer: class VoiceBpeTokenizer:
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
    """Set up the tokenizer from a vocab JSON file.

    Args:
        vocab_file: Path to the tokenizer JSON file, or None. When None,
            nothing is read from disk and ``self.tokenizer`` is not set.
        preprocess: Explicit value for ``self.preprocess``. When None, the
            value is taken from the vocab file's ``pre_tokenizer`` entry
            (False if there is no such entry or no vocab file).

    Sets ``self.language`` from ``vocab['model']['language']`` when that
    entry exists, otherwise None.
    """
    # Only touch the file when a path was actually given; the tokenizer
    # load below is guarded the same way, so None must not crash here.
    vocab = {}
    if vocab_file is not None:
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = json.load(f)

    # Tolerate a vocab without a 'model' section instead of raising KeyError.
    model = vocab.get('model')
    self.language = model.get('language') if isinstance(model, dict) else None

    if preprocess is None:
        self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
    else:
        self.preprocess = preprocess

    if vocab_file is not None:
        self.tokenizer = Tokenizer.from_file(vocab_file)
def preprocess_text(self, txt):
    """Clean ``txt`` before tokenization, according to ``self.language``.

    For ``'ja'`` the text is run through pykakasi, the ``'kana'`` field of
    each converted segment is joined with single spaces, and the result is
    passed to ``basic_cleaners``. For any other language the text is passed
    to ``english_cleaners``.

    Args:
        txt: The raw input text.

    Returns:
        The cleaned text.
    """
    if self.language != 'ja':
        return english_cleaners(txt)

    # Reuse one converter per tokenizer instead of building a new one on
    # every call. getattr keeps this working on instances that were created
    # without going through __init__.
    converter = getattr(self, '_kakasi', None)
    if converter is None:
        # Imported lazily so pykakasi is only required for Japanese vocabs.
        import pykakasi
        converter = pykakasi.kakasi()
        self._kakasi = converter

    kana = " ".join(segment['kana'] for segment in converter.convert(txt))
    return basic_cleaners(kana)
def encode(self, txt): def encode(self, txt):