forked from mrq/DL-Art-School
Taking another stab at a BPE tokenizer
This commit is contained in:
parent 9aa06542cd
commit f0c4cd6317
File diff suppressed because one or more lines are too long
@@ -235,14 +235,14 @@ if __name__ == '__main__':
     m = None
     for i, b in tqdm(enumerate(dl)):
         for ib in range(batch_sz):
-            #save(b, i, ib, 'paired_audio')
-            #save(b, i, ib, 'paired_audio_conditioning', 0)
-            #save(b, i, ib, 'paired_audio_conditioning', 1)
-            #print(f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
-            #print(f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
-            save(b, i, ib, 'speech_audio')
-            save(b, i, ib, 'speech_audio_conditioning', 0)
-            save(b, i, ib, 'speech_audio_conditioning', 1)
+            save(b, i, ib, 'paired_audio')
+            save(b, i, ib, 'paired_audio_conditioning', 0)
+            save(b, i, ib, 'paired_audio_conditioning', 1)
+            print(f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
+            print(f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
+            #save(b, i, ib, 'speech_audio')
+            #save(b, i, ib, 'speech_audio_conditioning', 0)
+            #save(b, i, ib, 'speech_audio_conditioning', 1)
             #print(f'Text: {b["text_text"][ib]}')
             #print(f'Text decoded: {decode(b, ib, "text_tokens")}')
         if i > 5:
@@ -51,7 +51,7 @@ def load_voxpopuli(filename):
 
 class CharacterTokenizer:
     def encode(self, txt):
-        return munchify({'ids': text_to_sequence(txt, ['english_cleaners'])})
+        return text_to_sequence(txt, ['english_cleaners'])
 
     def decode(self, seq):
         return sequence_to_text(seq)
@@ -99,7 +99,8 @@ class TextWavLoader(torch.utils.data.Dataset):
             assert self.max_wav_len is not None and self.max_text_len is not None
         self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], True)
         if self.use_bpe_tokenizer:
-            self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
+            from data.audio.voice_tokenizer import VoiceBpeTokenizer
+            self.tokenizer = VoiceBpeTokenizer(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
         else:
             self.tokenizer = CharacterTokenizer()
 
@@ -111,7 +112,7 @@ class TextWavLoader(torch.utils.data.Dataset):
         return (text_seq, wav, text, audiopath_and_text[0])
 
     def get_text(self, text):
-        tokens = self.tokenizer.encode(text.strip().lower()).ids
+        tokens = self.tokenizer.encode(text)
         tokens = torch.IntTensor(tokens)
         if self.use_bpe_tokenizer:
             # Assert if any UNK,start tokens encountered.
@@ -226,14 +227,14 @@ if __name__ == '__main__':
         'phase': 'train',
         'n_workers': 0,
         'batch_size': batch_sz,
-        'needs_collate': False,
+        'needs_collate': True,
         'max_wav_length': 255995,
         'max_text_length': 200,
         'sample_rate': 22050,
         'load_conditioning': True,
         'num_conditioning_candidates': 2,
         'conditioning_length': 44000,
-        'use_bpe_tokenizer': False,
+        'use_bpe_tokenizer': True,
     }
     from data import create_dataset, create_dataloader
 
@@ -1,6 +1,7 @@
 import re
 
 import datasets
+import torch
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import Whitespace
@@ -9,6 +10,25 @@ from tokenizers.trainers import BpeTrainer
 
 from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
 from models.tacotron2.taco_utils import load_filepaths_and_text
+from models.tacotron2.text.cleaners import english_cleaners
 
 
+class VoiceBpeTokenizer:
+    def __init__(self, vocab_file):
+        self.tokenizer = Tokenizer.from_file(vocab_file)
+
+    def encode(self, txt):
+        txt = english_cleaners(txt)
+        txt = remove_extraneous_punctuation(txt)
+        txt = txt.replace(' ', '[SPACE]')
+        return self.tokenizer.encode(txt).ids
+
+    def decode(self, seq):
+        if isinstance(seq, torch.Tensor):
+            seq = seq.cpu().numpy()
+        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '')
+        txt = txt.replace('[SPACE]', ' ')
+        return txt
+
+
 def build_text_file_from_priors(priors, output):
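For reference, a minimal round-trip sketch of the new VoiceBpeTokenizer added above (the vocab path and sample sentence are placeholders; it assumes a vocab file produced by the train() function further down already exists):

    from data.audio.voice_tokenizer import VoiceBpeTokenizer

    tok = VoiceBpeTokenizer('gpt_tts_tokenizer.json')  # placeholder path, e.g. the output of train()
    ids = tok.encode("The quick brown fox jumps over the lazy dog.")  # cleaned, spaces mapped to [SPACE], then BPE ids
    print(ids)
    print(tok.decode(ids))  # roughly the lowercased, cleaned input with normal spaces restored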
@@ -30,14 +50,33 @@ def build_text_file_from_priors(priors, output):
                 out.flush()
 
 
+def remove_extraneous_punctuation(word):
+    replacement_punctuation = {
+        '{': '(', '}': ')',
+        '[': '(', ']': ')',
+        '`': '\'', '—': '-',
+        '—': '-', '`': '\'',
+        'ʼ': '\''
+    }
+    replace = re.compile("|".join([re.escape(k) for k in sorted(replacement_punctuation, key=len, reverse=True)]), flags=re.DOTALL)
+    word = replace.sub(lambda x: replacement_punctuation[x.group(0)], word)
+
+    # TODO: some of these are spoken ('@', '%', '+', etc). Integrate them into the cleaners.
+    extraneous = re.compile(r'^[@#%_=\$\^&\*\+\\]$')
+    word = extraneous.sub('', word)
+    return word
+
+
 def train():
     with open('all_texts.txt', 'r', encoding='utf-8') as at:
         ttsd = at.readlines()
     #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']
 
-    allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
+    #allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
+    allowed_characters_re = re.compile(r'^[a-z!:;"/, \-\(\)\.\'\?ʼ]+$')
     def preprocess_word(word, report=False):
-        word = word.strip().lower()
+        word = english_cleaners(word)
+        word = remove_extraneous_punctuation(word)
         if not bool(allowed_characters_re.match(word)):
             if report and word:
                 print(f"REPORTING: '{word}'")
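As a quick illustration of what the new remove_extraneous_punctuation helper does (toy inputs, assuming the function above is importable from this module):

    print(remove_extraneous_punctuation('{hello}'))  # -> (hello)   braces and brackets become parentheses
    print(remove_extraneous_punctuation('`quote`'))  # -> 'quote'   backticks become apostrophes
    print(remove_extraneous_punctuation('@'))        # -> ''        a word that is only a bare symbol is dropped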
@@ -53,7 +92,7 @@ def train():
         #for i in range(0, len(bcd), batch_size):
         #    yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]
 
-    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=511, continuing_subword_prefix='$$$')
+    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]', '[SPACE]'], vocab_size=255)
     tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     tokenizer.pre_tokenizer = Whitespace()
     tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))
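The trainer change (vocab_size 511 -> 255, continuing_subword_prefix dropped, [SPACE] added to the special tokens) can be exercised in isolation with the Hugging Face tokenizers API; a minimal sketch on a toy corpus (corpus lines and output filename are illustrative only, not from the commit):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    corpus = ['the cat sat on the mat', 'a dog barked at the cat']  # stand-in for cleaned all_texts.txt lines
    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]', '[SPACE]'], vocab_size=255)
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator(corpus, trainer, length=len(corpus))
    tokenizer.save('toy_tokenizer.json')  # [SPACE] is registered as a special token, which encode() above relies on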
@@ -63,6 +102,18 @@ def train():
     tokenizer.save('gpt_tts_tokenizer.json')
 
 
+def test():
+    tok = VoiceBpeTokenizer('gpt_tts_tokenizer.json')
+    with open('all_texts.txt', 'r', encoding='utf-8') as at:
+        ttsd = at.readlines()
+    for line in ttsd:
+        line = line.strip()
+        seq = tok.encode(line)
+        out = tok.decode(seq)
+        print(f">>>{line}")
+        print(f"<<<{out}")
+
+
 if __name__ == '__main__':
     '''
     build_text_file_from_priors([('Y:\\bigasr_dataset\\libritts\\train-all.txt', 'libritts'),
@@ -73,4 +124,5 @@ if __name__ == '__main__':
                                  ('Y:\\clips\\books2-transcribed.tsv', 'tsv'),
                                  ('Y:\\clips\\podcasts-0-transcribed.tsv', 'tsv')], 'all_texts.txt')
     '''
-    train()
+    #train()
+    test()