From 736c2626eeecdc6e3d8a0ab30948514e87282aa8 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Sat, 25 Dec 2021 15:21:01 -0700
Subject: [PATCH] build in character tokenizer

---
 codes/data/audio/paired_voice_audio_dataset.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py
index b7ea203b..fdc63a3a 100644
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@@ -13,6 +13,7 @@ from transformers import GPT2TokenizerFast
 from data.audio.unsupervised_audio_dataset import load_audio
 from data.util import find_files_of_type, is_audio_file
 from models.tacotron2.taco_utils import load_filepaths_and_text
+from models.tacotron2.text import text_to_sequence, sequence_to_text
 from utils.util import opt_get
 
 
@@ -46,6 +47,14 @@ def load_voxpopuli(filename):
     return filepaths_and_text
 
 
+class CharacterTokenizer:
+    def encode(self, txt):
+        return text_to_sequence(txt, ['english_cleaners'])
+
+    def decode(self, seq):
+        return sequence_to_text(seq)
+
+
 class TextWavLoader(torch.utils.data.Dataset):
     def __init__(self, hparams):
         self.path = hparams['path']
@@ -86,7 +95,10 @@ class TextWavLoader(torch.utils.data.Dataset):
         self.needs_collate = opt_get(hparams, ['needs_collate'], True)
         if not self.needs_collate:
             assert self.max_wav_len is not None and self.max_text_len is not None
-        self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
+        if opt_get(hparams, ['use_bpe_tokenizer'], True):
+            self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
+        else:
+            self.tokenizer = CharacterTokenizer()
 
     def get_wav_text_pair(self, audiopath_and_text):
         # separate filename and text
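
Usage sketch (not part of the patch), assuming the repo's codes/ directory
is on sys.path; apart from CharacterTokenizer and the use_bpe_tokenizer
flag taken from the diff above, the names here are hypothetical:

    # Opt out of the default BPE tokenizer by setting the new hparams flag
    # read in TextWavLoader.__init__ (defaults to True, i.e. BPE loaded
    # from 'tokenizer_vocab'):
    #   hparams['use_bpe_tokenizer'] = False
    from data.audio.paired_voice_audio_dataset import CharacterTokenizer

    # encode() runs tacotron2's 'english_cleaners' pipeline (lowercasing,
    # abbreviation expansion, whitespace collapsing) and maps each
    # character to a symbol id; decode() inverts the mapping.
    tok = CharacterTokenizer()
    ids = tok.encode('Hello, World!')
    print(ids)              # per-character symbol ids
    print(tok.decode(ids))  # cleaned text, e.g. 'hello, world!'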