From 730a04708d2cb29f526c3397894950a2733e6e29 Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Thu, 16 Mar 2023 04:24:32 +0000
Subject: [PATCH] added flag to disable preprocessing (because some IPA symbols
 will be turned into ASCII); preprocessing is implicitly disabled when using
 the ipa.json tokenizer vocab

---
 codes/data/audio/voice_tokenizer.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index 18188592..b1127664 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -29,17 +29,21 @@ def remove_extraneous_punctuation(word):
 
 
 class VoiceBpeTokenizer:
-    def __init__(self, vocab_file):
+    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
+        if preprocess is None:
+            self.preprocess = vocab_file[-8:] != "ipa.json"
+        else:
+            self.preprocess = preprocess
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
 
     def preprocess_text(self, txt):
         txt = english_cleaners(txt)
-        txt = remove_extraneous_punctuation(txt)
         return txt
 
     def encode(self, txt):
-        txt = self.preprocess_text(txt)
+        if self.preprocess:
+          txt = self.preprocess_text(txt)
         txt = txt.replace(' ', '[SPACE]')
         return self.tokenizer.encode(txt).ids
 
@@ -50,7 +54,6 @@ class VoiceBpeTokenizer:
         txt = txt.replace('[SPACE]', ' ')
         txt = txt.replace('[STOP]', '')
         txt = txt.replace('[UNK]', '')
-
         return txt