# Source: DL-Art-School/codes/data/audio/voice_tokenizer_builder.py
# (77 lines, 3.1 KiB, Python; captured from the repository web view
#  "Raw / Normal View / History"; first listed timestamp 2021-12-23 00:46:18 +00:00)

import re
import datasets
# (web-view timestamp) 2021-12-22 22:06:14 +00:00
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
# (web-view timestamp) 2021-12-23 21:32:33 +00:00
from tokenizers.processors import ByteLevel
# (web-view timestamp) 2021-12-22 22:06:14 +00:00
from tokenizers.trainers import BpeTrainer
from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
from models.tacotron2.taco_utils import load_filepaths_and_text
def build_text_file_from_priors(priors, output):
    """Concatenate the transcripts of several datasets into one text file.

    Args:
        priors: Iterable of ``(path, format)`` pairs. ``format`` selects the
            loader used to read ``path`` and must be one of ``'lj'``,
            ``'libritts'``, ``'tsv'``, ``'mozilla_cv'`` or ``'voxpopuli'``.
        output: Path of the UTF-8 text file to (over)write. One transcript is
            written per line.

    Raises:
        NotImplementedError: If any entry uses an unsupported format. All
            entries are checked *before* ``output`` is opened, so a typo in a
            format name never truncates an existing output file or leaves a
            partially written one behind.
    """
    supported_formats = ('lj', 'libritts', 'tsv', 'mozilla_cv', 'voxpopuli')

    # Materialize so the entries can be validated first and then processed,
    # even when the caller passes a one-shot iterator.
    priors = list(priors)
    for _, fm in priors:
        if fm not in supported_formats:
            raise NotImplementedError(
                f"Unsupported prior format {fm!r}; expected one of {supported_formats}")

    with open(output, 'w', encoding='utf-8') as out:
        for p, fm in priors:
            # Loader names are looked up only in the branch that needs them.
            if fm == 'lj' or fm == 'libritts':
                fetcher_fn = load_filepaths_and_text
            elif fm == 'tsv':
                fetcher_fn = load_tsv
            elif fm == 'mozilla_cv':
                fetcher_fn = load_mozilla_cv
            else:  # 'voxpopuli' -- the only remaining validated format
                fetcher_fn = load_voxpopuli

            # Each loader is consumed as (path, text) pairs; only the text is kept.
            for _path, text in fetcher_fn(p):
                out.write(text + "\n")
            # Push each dataset's lines to disk as soon as it has been processed.
            out.flush()
def train(text_file='all_texts.txt', output_file='gpt_tts_tokenizer.json', vocab_size=511):
    """Train a BPE tokenizer on a text file and save it as JSON.

    Every line of ``text_file`` is stripped and lower-cased. Lines containing
    any character outside the allowed set are reported on stdout and replaced
    by an empty string before being handed to the trainer.

    Args:
        text_file: UTF-8 file with one training text per line.
        output_file: Destination path passed to ``Tokenizer.save``.
        vocab_size: ``vocab_size`` passed to ``BpeTrainer``.

    Calling ``train()`` with no arguments uses the original hard-coded values.
    """
    with open(text_file, 'r', encoding='utf-8') as at:
        ttsd = at.readlines()
    # Disabled: additional training text from the 'bookcorpus' dataset.
    #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']

    # A text is accepted only if it consists entirely of these characters.
    allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')

    def preprocess_word(word, report=False):
        """Normalize one text; return '' if it has disallowed characters.

        Despite the name, this is applied to whole lines, not single words.
        With ``report=True`` non-empty rejected texts are printed.
        """
        word = word.strip().lower()
        if not bool(allowed_characters_re.match(word)):
            if report and word:
                print(f"REPORTING: '{word}'")
            return ''
        return word

    def batch_iterator(batch_size=1000):
        """Yield the preprocessed texts in lists of at most ``batch_size``."""
        print("Processing ASR texts.")
        for i in range(0, len(ttsd), batch_size):
            yield [preprocess_word(t, True) for t in ttsd[i:i+batch_size]]

        #print("Processing bookcorpus.")
        #for i in range(0, len(bcd), batch_size):
        #    yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]

    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=vocab_size, continuing_subword_prefix='$$$')
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    # `length` tells the trainer how many texts the iterator will produce.
    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))

    # Smoke test: encode/decode a sample containing out-of-vocabulary junk.
    print(tokenizer.decode(tokenizer.encode("i was traveling throughhadslfghds the woods in 1235375t137{{}}").ids))

    tokenizer.save(output_file)
if __name__ == '__main__':
    # One-time corpus build, kept for reference but disabled. It writes the
    # 'all_texts.txt' file that train() reads by default.
    #
    # build_text_file_from_priors([('Y:\\bigasr_dataset\\libritts\\train-all.txt', 'libritts'),
    #                              ('Y:\\bigasr_dataset\\libritts\\test-clean_list.txt', 'libritts'),
    #                              #('Y:\\bigasr_dataset\\voxpopuli\\audio\\transcribed_data\\en\\asr_en.tsv', 'voxpopuli'),
    #                              ('Y:\\bigasr_dataset\\voxpopuli\\audio\\transcribed_data\\en\\asr_train.tsv', 'voxpopuli'),
    #                              ('Y:\\clips\\books1-transcribed.tsv', 'tsv'),
    #                              ('Y:\\clips\\books2-transcribed.tsv', 'tsv'),
    #                              ('Y:\\clips\\podcasts-0-transcribed.tsv', 'tsv')], 'all_texts.txt')

    train()