From 1f674a468f4202ac47feb8fb3587dc5837f2af2b Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Thu, 16 Mar 2023 04:33:03 +0000
Subject: [PATCH] added flag to disable text preprocessing (because it turns
 some IPA symbols into ASCII); preprocessing is implicitly disabled when
 using the specific ipa.json tokenizer vocab

---
 tortoise/utils/tokenizer.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tortoise/utils/tokenizer.py b/tortoise/utils/tokenizer.py
index 3ab1c31..ad49e93 100644
--- a/tortoise/utils/tokenizer.py
+++ b/tortoise/utils/tokenizer.py
@@ -170,7 +170,11 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
 
 
 class VoiceBpeTokenizer:
-    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
+    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
+        # preprocess=None -> auto: clean text unless the vocab file is "ipa.json"; None-safe.
+        if preprocess is None:
+            preprocess = vocab_file is None or not vocab_file.endswith("ipa.json")
+        self.preprocess = preprocess
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
 
@@ -179,7 +183,8 @@ class VoiceBpeTokenizer:
         return txt
 
     def encode(self, txt):
-        txt = self.preprocess_text(txt)
+        if self.preprocess:  # flag set in __init__; falsy skips text cleaning
+            txt = self.preprocess_text(txt)
         txt = txt.replace(' ', '[SPACE]')
         return self.tokenizer.encode(txt).ids