From 730a04708d2cb29f526c3397894950a2733e6e29 Mon Sep 17 00:00:00 2001 From: mrq Date: Thu, 16 Mar 2023 04:24:32 +0000 Subject: [PATCH] added flag to disable preprocessing (because some IPAs will turn into ASCII, implicitly enable for using the specific ipa.json tokenizer vocab) --- codes/data/audio/voice_tokenizer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py index 18188592..b1127664 100644 --- a/codes/data/audio/voice_tokenizer.py +++ b/codes/data/audio/voice_tokenizer.py @@ -29,17 +29,21 @@ def remove_extraneous_punctuation(word): class VoiceBpeTokenizer: - def __init__(self, vocab_file): + def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None): + if preprocess is None: + self.preprocess = vocab_file[-8:] != "ipa.json" + else: + self.preprocess = preprocess if vocab_file is not None: self.tokenizer = Tokenizer.from_file(vocab_file) def preprocess_text(self, txt): txt = english_cleaners(txt) - txt = remove_extraneous_punctuation(txt) return txt def encode(self, txt): - txt = self.preprocess_text(txt) + if self.preprocess: + txt = self.preprocess_text(txt) txt = txt.replace(' ', '[SPACE]') return self.tokenizer.encode(txt).ids @@ -50,7 +54,6 @@ class VoiceBpeTokenizer: txt = txt.replace('[SPACE]', ' ') txt = txt.replace('[STOP]', '') txt = txt.replace('[UNK]', '') - return txt