diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index ff7157aa..a1a7ef18 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -29,6 +29,27 @@ def remove_extraneous_punctuation(word):
     word = extraneous.sub('', word)
     return word
 
+def expand_numbers(text):
+    '''Return `text` after passing it through normalize_numbers (defined outside this hunk).'''
+    return normalize_numbers(text)
+
+def lowercase(text):  # Return the result of calling `text.lower()`.
+    return text.lower()
+
+_whitespace_re = re.compile(r'\s+')
+
+def collapse_whitespace(text):
+    '''Replace each run of whitespace in `text` with one space; leading/trailing runs are kept as a single space.'''
+    return _whitespace_re.sub(' ', text)
+
+def convert_to_ascii(text):  # Return whatever unidecode() yields for `text`; unidecode comes from outside this hunk.
+    return unidecode(text)
+
+def basic_cleaners(text):
+    '''Minimal cleaning pipeline: apply lowercase(), then collapse_whitespace(); no ASCII transliteration step.'''
+    lowered = lowercase(text)
+    collapsed = collapse_whitespace(lowered)
+    return collapsed
 
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file, preprocess=None):