From a9629f702284122f3c8e8fc6c326b6ebaa6f3342 Mon Sep 17 00:00:00 2001 From: James Betker Date: Wed, 22 Dec 2021 14:03:18 -0700 Subject: [PATCH] Try out using the GPT tokenizer rather than nv_tacotron. This results in a significant compression of the text domain; I'm curious what the effect on speech quality will be. --- codes/data/__init__.py | 9 +++++++++ codes/data/audio/paired_voice_audio_dataset.py | 16 +++++----------- codes/models/gpt_voice/gpt_tts_hf.py | 9 ++++----- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/codes/data/__init__.py b/codes/data/__init__.py index 1ede1635..3d6d32be 100644 --- a/codes/data/__init__.py +++ b/codes/data/__init__.py @@ -69,6 +69,15 @@ def create_dataset(dataset_opt, return_collate=False): dataset_opt = munchify(default_params) if opt_get(dataset_opt, ['needs_collate'], True): collate = C() + elif mode == 'paired_voice_audio': + from data.audio.paired_voice_audio_dataset import TextWavLoader as D + from data.audio.paired_voice_audio_dataset import TextMelCollate as C + from models.tacotron2.hparams import create_hparams + default_params = create_hparams() + default_params.update(dataset_opt) + dataset_opt = munchify(default_params) + if opt_get(dataset_opt, ['needs_collate'], True): + collate = C() elif mode == 'gpt_tts': from data.audio.gpt_tts_dataset import GptTtsDataset as D from data.audio.gpt_tts_dataset import GptTtsCollater as C diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py index b8adf4b7..54cda975 100644 --- a/codes/data/audio/paired_voice_audio_dataset.py +++ b/codes/data/audio/paired_voice_audio_dataset.py @@ -7,11 +7,11 @@ import torch.nn.functional as F import torch.utils.data import torchaudio from tqdm import tqdm +from transformers import GPT2TokenizerFast from data.audio.unsupervised_audio_dataset import load_audio from data.util import find_files_of_type, is_audio_file from models.tacotron2.taco_utils import 
load_filepaths_and_text -from models.tacotron2.text import text_to_sequence from utils.util import opt_get @@ -84,6 +84,7 @@ class TextWavLoader(torch.utils.data.Dataset): self.needs_collate = opt_get(hparams, ['needs_collate'], True) if not self.needs_collate: assert self.max_wav_len is not None and self.max_text_len is not None + self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') def get_wav_text_pair(self, audiopath_and_text): # separate filename and text @@ -93,8 +94,7 @@ class TextWavLoader(torch.utils.data.Dataset): return (text_seq, wav, text, audiopath_and_text[0]) def get_text(self, text): - text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) - return text_norm + return torch.IntTensor(self.tokenizer(text)['input_ids']) def load_conditioning_candidates(self, path): candidates = find_files_of_type('img', os.path.dirname(path), qualifier=is_audio_file)[0] @@ -213,7 +213,7 @@ class TextMelCollate(): if __name__ == '__main__': batch_sz = 8 params = { - 'mode': 'nv_tacotron', + 'mode': 'paired_voice_audio', 'path': ['Z:\\bigasr_dataset\\libritts\\test-clean_list.txt'], 'fetcher_mode': ['libritts'], 'phase': 'train', @@ -234,11 +234,5 @@ if __name__ == '__main__': i = 0 m = None for i, b in tqdm(enumerate(dl)): - if i > 5: - break - w = b['wav'] for ib in range(batch_sz): - print(f'{i} {ib} {b["real_text"][ib]}') - torchaudio.save(f'{i}_clip_{ib}.wav', b['wav'][ib], ds.sample_rate) - for c in range(3): - torchaudio.save(f'{i}_clip_{ib}_cond{c}.wav', b['conditioning'][ib, c], ds.sample_rate) + print(f"text_seq: {b['text_lengths'].max()}, speech_seq: {b['wav_lengths'].max()//1024}") diff --git a/codes/models/gpt_voice/gpt_tts_hf.py b/codes/models/gpt_voice/gpt_tts_hf.py index c74fdb3f..2d1b6b06 100644 --- a/codes/models/gpt_voice/gpt_tts_hf.py +++ b/codes/models/gpt_voice/gpt_tts_hf.py @@ -20,7 +20,7 @@ class ConditioningEncoder(nn.Module): def __init__(self, spec_dim, embedding_dim, - attn_blocks=4, + attn_blocks=6, num_attn_heads=4, 
do_checkpointing=False): super().__init__() @@ -39,14 +39,13 @@ class ConditioningEncoder(nn.Module): class GptTtsHf(nn.Module): - NUMBER_TEXT_TOKENS = len(symbols)+1 - START_TEXT_TOKEN = len(symbols) + NUMBER_TEXT_TOKENS = 50257 # The number of BPE tokens produced by the HF GPT2Tokenizer STOP_TEXT_TOKEN = 0 NUMBER_MEL_CODES = 8194 START_MEL_TOKEN = 8192 STOP_MEL_TOKEN = 8193 - def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=200, max_mel_tokens=250, max_conditioning_inputs=3, + def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=100, max_mel_tokens=250, max_conditioning_inputs=3, checkpointing=True, mel_length_compression=1024, max_conditioning_length=60): super().__init__() self.max_mel_tokens = max_mel_tokens @@ -54,7 +53,7 @@ class GptTtsHf(nn.Module): self.model_dim = model_dim self.max_conditioning_inputs = max_conditioning_inputs self.mel_length_compression = mel_length_compression - self.conditioning_encoder = ConditioningEncoder(80, model_dim) + self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads) self.text_embedding = nn.Embedding(self.NUMBER_TEXT_TOKENS, model_dim) seq_length = 2+self.max_symbols_per_phrase+self.max_conditioning_inputs+self.max_mel_tokens self.gpt_config = GPT2Config(vocab_size=self.NUMBER_MEL_CODES,