From 0db8ebc543db46c8f533393f39bc1c168f4ee8eb Mon Sep 17 00:00:00 2001 From: mrq Date: Thu, 16 Mar 2023 14:41:21 +0000 Subject: [PATCH] deduce if preprocessing text by checking the JSON itself instead --- codes/data/audio/voice_tokenizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py index 414b196a..86fd89fa 100644 --- a/codes/data/audio/voice_tokenizer.py +++ b/codes/data/audio/voice_tokenizer.py @@ -1,6 +1,8 @@ import re import torch +import json + from tokenizers import Tokenizer from tokenizers.models import BPE from tokenizers.pre_tokenizers import Whitespace @@ -31,7 +33,9 @@ def remove_extraneous_punctuation(word): class VoiceBpeTokenizer: def __init__(self, vocab_file, preprocess=None): if preprocess is None: - self.preprocess = vocab_file[-8:] != "ipa.json" + with open(vocab_file, 'r', encoding='utf-8') as f: + vocab = json.load(f) + self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer'] else: self.preprocess = preprocess if vocab_file is not None: