diff --git a/tortoise/utils/tokenizer.py b/tortoise/utils/tokenizer.py index ad49e93..f775470 100644 --- a/tortoise/utils/tokenizer.py +++ b/tortoise/utils/tokenizer.py @@ -1,5 +1,6 @@ import os import re +import json import inflect import torch @@ -172,7 +173,9 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), ' class VoiceBpeTokenizer: def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None): if preprocess is None: - self.preprocess = vocab_file[-8:] != "ipa.json" + with open(vocab_file, 'r', encoding='utf-8') as f: + vocab = json.load(f) + self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer'] else: self.preprocess = preprocess if vocab_file is not None: