Deduce whether to preprocess text by checking the vocab JSON itself instead of the filename

This commit is contained in:
mrq 2023-03-16 14:41:21 +00:00
parent 730f56aa87
commit 0db8ebc543

View File

@@ -1,6 +1,8 @@
import re
import torch
import json
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
@@ -31,7 +33,9 @@ def remove_extraneous_punctuation(word):
class VoiceBpeTokenizer:
def __init__(self, vocab_file, preprocess=None):
if preprocess is None:
self.preprocess = vocab_file[-8:] != "ipa.json"
with open(vocab_file, 'r', encoding='utf-8') as f:
vocab = json.load(f)
self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
else:
self.preprocess = preprocess
if vocab_file is not None: