From 1f674a468f4202ac47feb8fb3587dc5837f2af2b Mon Sep 17 00:00:00 2001 From: mrq Date: Thu, 16 Mar 2023 04:33:03 +0000 Subject: [PATCH] added flag to disable preprocessing (because some IPA symbols would otherwise be converted to ASCII; preprocessing is implicitly disabled when using the ipa.json tokenizer vocab) --- tortoise/utils/tokenizer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tortoise/utils/tokenizer.py b/tortoise/utils/tokenizer.py index 3ab1c31..ad49e93 100644 --- a/tortoise/utils/tokenizer.py +++ b/tortoise/utils/tokenizer.py @@ -170,7 +170,11 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), ' class VoiceBpeTokenizer: - def __init__(self, vocab_file=DEFAULT_VOCAB_FILE): + def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None): + if preprocess is None: + self.preprocess = vocab_file[-8:] != "ipa.json" + else: + self.preprocess = preprocess if vocab_file is not None: self.tokenizer = Tokenizer.from_file(vocab_file) @@ -179,7 +183,8 @@ class VoiceBpeTokenizer: return txt def encode(self, txt): - txt = self.preprocess_text(txt) + if self.preprocess: + txt = self.preprocess_text(txt) txt = txt.replace(' ', '[SPACE]') return self.tokenizer.encode(txt).ids