Forked from mrq/tortoise-tts.
Added a Japanese preprocessor for the tokenizer.
This commit is contained in:
parent
af78e3978a
commit
d1ad634ea9
|
@ -172,17 +172,33 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
|
||||||
|
|
||||||
class VoiceBpeTokenizer:
|
class VoiceBpeTokenizer:
|
||||||
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
    """Load a tokenizer vocab file and configure text preprocessing.

    Args:
        vocab_file: Path to a tokenizers-style JSON vocab file, or None to
            skip loading any vocab/tokenizer state.
        preprocess: Explicit override for the preprocessing flag. When None,
            the flag is derived from the vocab's 'pre_tokenizer' entry.
    """
    # Bug fix: the original opened and parsed vocab_file unconditionally,
    # then checked `vocab_file is not None` only before Tokenizer.from_file —
    # so passing None crashed on open(None). Guard ALL file-derived state.
    self.language = None
    self.preprocess = preprocess if preprocess is not None else False
    if vocab_file is not None:
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        # Optional language tag in the vocab (e.g. 'ja') selects which
        # cleaner pipeline preprocess_text applies.
        self.language = vocab['model']['language'] if 'language' in vocab['model'] else None
        if preprocess is None:
            # Truthy 'pre_tokenizer' entry in the vocab enables preprocessing.
            self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
        self.tokenizer = Tokenizer.from_file(vocab_file)
||||||
def preprocess_text(self, txt):
    """Clean *txt* before tokenization, based on the vocab's language.

    Japanese ('ja') input is converted to kana readings with pykakasi and
    space-joined before basic_cleaners; all other input goes straight
    through english_cleaners.
    """
    # Guard clause: the non-Japanese path is the simple common case.
    if self.language != 'ja':
        return english_cleaners(txt)
    # Imported lazily so non-Japanese vocabs never need pykakasi installed.
    import pykakasi
    converter = pykakasi.kakasi()
    # One kana reading per converted segment, joined with single spaces.
    kana = [segment['kana'] for segment in converter.convert(txt)]
    return basic_cleaners(" ".join(kana))
def encode(self, txt):
|
def encode(self, txt):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user