forked from mrq/tortoise-tts
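Added a `preprocess` flag to VoiceBpeTokenizer so that text preprocessing can be disabled (preprocessing turns some IPA symbols into ASCII); when the flag is not given, preprocessing is disabled implicitly if the tokenizer vocab file name ends with ipa.json.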
This commit is contained in:
parent
42cb1f3674
commit
1f674a468f
|
@ -170,7 +170,11 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
|
||||||
|
|
||||||
|
|
||||||
class VoiceBpeTokenizer:
|
class VoiceBpeTokenizer:
|
||||||
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
|
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
|
||||||
|
if preprocess is None:
|
||||||
|
self.preprocess = vocab_file[-8:] != "ipa.json"
|
||||||
|
else:
|
||||||
|
self.preprocess = preprocess
|
||||||
if vocab_file is not None:
|
if vocab_file is not None:
|
||||||
self.tokenizer = Tokenizer.from_file(vocab_file)
|
self.tokenizer = Tokenizer.from_file(vocab_file)
|
||||||
|
|
||||||
|
@ -179,7 +183,8 @@ class VoiceBpeTokenizer:
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def encode(self, txt):
|
def encode(self, txt):
|
||||||
txt = self.preprocess_text(txt)
|
if self.preprocess:
|
||||||
|
txt = self.preprocess_text(txt)
|
||||||
txt = txt.replace(' ', '[SPACE]')
|
txt = txt.replace(' ', '[SPACE]')
|
||||||
return self.tokenizer.encode(txt).ids
|
return self.tokenizer.encode(txt).ids
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user