From d1ad634ea9675b1294e0c7e99fd11dae29f404f4 Mon Sep 17 00:00:00 2001
From: mrq
Date: Fri, 17 Mar 2023 20:03:02 +0000
Subject: [PATCH] added japanese preprocessor for tokenizer

---
Review notes: the hunk below was revised during review (vocab_file=None is
tolerated again, the 'model'/'language' lookup no longer raises KeyError, and
the kana word list is built with a comprehension). The post-image blob id on
the "index" line was not recomputed.

 tortoise/utils/tokenizer.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/tortoise/utils/tokenizer.py b/tortoise/utils/tokenizer.py
index f775470..9e2d8c7 100644
--- a/tortoise/utils/tokenizer.py
+++ b/tortoise/utils/tokenizer.py
@@ -172,17 +172,35 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
 
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
+        # Read the vocab JSON once: it supplies both the optional language tag
+        # and the default for `preprocess`. Tolerate vocab_file=None, matching
+        # the guard on the tokenizer load below.
+        vocab = {}
+        if vocab_file is not None:
+            with open(vocab_file, 'r', encoding='utf-8') as f:
+                vocab = json.load(f)
+
+        # Language code stored under the vocab's 'model' section, if any.
+        self.language = vocab.get('model', {}).get('language')
+
         if preprocess is None:
-            with open(vocab_file, 'r', encoding='utf-8') as f:
-                vocab = json.load(f)
-                self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
+            self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
         else:
             self.preprocess = preprocess
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
 
     def preprocess_text(self, txt):
-        txt = english_cleaners(txt)
+        if self.language == 'ja':
+            # Imported lazily so pykakasi is only needed for Japanese vocabs.
+            import pykakasi
+
+            # Join the 'kana' field of each converted segment with spaces.
+            kks = pykakasi.kakasi()
+            words = [result['kana'] for result in kks.convert(txt)]
+            txt = basic_cleaners(" ".join(words))
+        else:
+            txt = english_cleaners(txt)
         return txt
 
     def encode(self, txt):