diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index ff7157aa..a1a7ef18 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -29,6 +29,37 @@ def remove_extraneous_punctuation(word):
     word = extraneous.sub('', word)
     return word
 
+
+def expand_numbers(text):
+    '''Return the result of running normalize_numbers over text.'''
+    return normalize_numbers(text)
+
+
+def lowercase(text):
+    '''Return text converted to lowercase via str.lower().'''
+    return text.lower()
+
+
+# Matches any run of one or more whitespace characters.
+_whitespace_re = re.compile(r'\s+')
+
+
+def collapse_whitespace(text):
+    '''Replace every run of whitespace in text with a single space.'''
+    return _whitespace_re.sub(' ', text)
+
+
+def convert_to_ascii(text):
+    '''Return the result of running unidecode over text.'''
+    return unidecode(text)
+
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
 
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file, preprocess=None):