Deduce whether to preprocess text by checking the vocab JSON itself (its 'pre_tokenizer' entry) instead of the filename suffix
This commit is contained in:
parent
730f56aa87
commit
0db8ebc543
|
@ -1,6 +1,8 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import json
|
||||||
|
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
from tokenizers.models import BPE
|
from tokenizers.models import BPE
|
||||||
from tokenizers.pre_tokenizers import Whitespace
|
from tokenizers.pre_tokenizers import Whitespace
|
||||||
|
@ -31,7 +33,9 @@ def remove_extraneous_punctuation(word):
|
||||||
class VoiceBpeTokenizer:
|
class VoiceBpeTokenizer:
|
||||||
def __init__(self, vocab_file, preprocess=None):
|
def __init__(self, vocab_file, preprocess=None):
|
||||||
if preprocess is None:
|
if preprocess is None:
|
||||||
self.preprocess = vocab_file[-8:] != "ipa.json"
|
with open(vocab_file, 'r', encoding='utf-8') as f:
|
||||||
|
vocab = json.load(f)
|
||||||
|
self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
|
||||||
else:
|
else:
|
||||||
self.preprocess = preprocess
|
self.preprocess = preprocess
|
||||||
if vocab_file is not None:
|
if vocab_file is not None:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user