From efd038c07676a91eca9ecff526b1fa54e85b03a2 Mon Sep 17 00:00:00 2001
From: mrq
Date: Fri, 17 Mar 2023 20:24:17 +0000
Subject: [PATCH] forgot the other things that were in tortoise implementation but not here

---
 codes/data/audio/voice_tokenizer.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index ff7157aa..a1a7ef18 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -29,6 +29,27 @@ def remove_extraneous_punctuation(word):
     word = extraneous.sub('', word)
     return word
 
+def expand_numbers(text):
+    return normalize_numbers(text)
+
+
+def lowercase(text):
+    return text.lower()
+
+_whitespace_re = re.compile(r'\s+')
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, ' ', text)
+
+
+def convert_to_ascii(text):
+    return unidecode(text)
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
 
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file, preprocess=None):