From af78e3978a381e5c38aa83c6be8a9f09eb6efebf Mon Sep 17 00:00:00 2001
From: mrq
Date: Thu, 16 Mar 2023 14:41:04 +0000
Subject: [PATCH] deduce if preprocessing text by checking the JSON itself instead

---
 tortoise/utils/tokenizer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tortoise/utils/tokenizer.py b/tortoise/utils/tokenizer.py
index ad49e93..f775470 100644
--- a/tortoise/utils/tokenizer.py
+++ b/tortoise/utils/tokenizer.py
@@ -1,5 +1,6 @@
 import os
 import re
+import json
 
 import inflect
 import torch
@@ -172,7 +173,9 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
         if preprocess is None:
-            self.preprocess = vocab_file[-8:] != "ipa.json"
+            with open(vocab_file, 'r', encoding='utf-8') as f:
+                vocab = json.load(f)
+                self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
         else:
             self.preprocess = preprocess
         if vocab_file is not None: