From efd038c07676a91eca9ecff526b1fa54e85b03a2 Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Fri, 17 Mar 2023 20:24:17 +0000
Subject: [PATCH] forgot the other things that were in tortoise implementation
 but not here

---
 codes/data/audio/voice_tokenizer.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index ff7157aa..a1a7ef18 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -29,6 +29,27 @@ def remove_extraneous_punctuation(word):
     word = extraneous.sub('', word)
     return word
 
+def expand_numbers(text):
+  return normalize_numbers(text)
+
+
+def lowercase(text):
+  return text.lower()
+
+_whitespace_re = re.compile(r'\s+')
+
+def collapse_whitespace(text):
+  return re.sub(_whitespace_re, ' ', text)
+
+
+def convert_to_ascii(text):
+  return unidecode(text)
+
+def basic_cleaners(text):
+  '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+  text = lowercase(text)
+  text = collapse_whitespace(text)
+  return text
 
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file, preprocess=None):