From 0db8ebc543db46c8f533393f39bc1c168f4ee8eb Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Thu, 16 Mar 2023 14:41:21 +0000
Subject: [PATCH] deduce whether to preprocess text by checking the vocab JSON
 itself instead of the filename

---
 codes/data/audio/voice_tokenizer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index 414b196a..86fd89fa 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -1,6 +1,8 @@
 import re
 
 import torch
+import json
+
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import Whitespace
@@ -31,7 +33,9 @@ def remove_extraneous_punctuation(word):
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file, preprocess=None):
         if preprocess is None:
-            self.preprocess = vocab_file[-8:] != "ipa.json"
+            with open(vocab_file, 'r', encoding='utf-8') as f:
+                vocab = json.load(f)
+                self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
         else:
             self.preprocess = preprocess
         if vocab_file is not None: