forked from mrq/DL-Art-School
Try out using the GPT tokenizer rather than nv_tacotron
This results in a significant compression of the text domain; I'm curious what the effect on speech quality will be.
This commit is contained in:
parent
ced81a760b
commit
a9629f7022
|
@ -69,6 +69,15 @@ def create_dataset(dataset_opt, return_collate=False):
|
||||||
dataset_opt = munchify(default_params)
|
dataset_opt = munchify(default_params)
|
||||||
if opt_get(dataset_opt, ['needs_collate'], True):
|
if opt_get(dataset_opt, ['needs_collate'], True):
|
||||||
collate = C()
|
collate = C()
|
||||||
|
elif mode == 'paired_voice_audio':
|
||||||
|
from data.audio.paired_voice_audio_dataset import TextWavLoader as D
|
||||||
|
from data.audio.paired_voice_audio_dataset import TextMelCollate as C
|
||||||
|
from models.tacotron2.hparams import create_hparams
|
||||||
|
default_params = create_hparams()
|
||||||
|
default_params.update(dataset_opt)
|
||||||
|
dataset_opt = munchify(default_params)
|
||||||
|
if opt_get(dataset_opt, ['needs_collate'], True):
|
||||||
|
collate = C()
|
||||||
elif mode == 'gpt_tts':
|
elif mode == 'gpt_tts':
|
||||||
from data.audio.gpt_tts_dataset import GptTtsDataset as D
|
from data.audio.gpt_tts_dataset import GptTtsDataset as D
|
||||||
from data.audio.gpt_tts_dataset import GptTtsCollater as C
|
from data.audio.gpt_tts_dataset import GptTtsCollater as C
|
||||||
|
|
|
@ -7,11 +7,11 @@ import torch.nn.functional as F
|
||||||
import torch.utils.data
|
import torch.utils.data
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from transformers import GPT2TokenizerFast
|
||||||
|
|
||||||
from data.audio.unsupervised_audio_dataset import load_audio
|
from data.audio.unsupervised_audio_dataset import load_audio
|
||||||
from data.util import find_files_of_type, is_audio_file
|
from data.util import find_files_of_type, is_audio_file
|
||||||
from models.tacotron2.taco_utils import load_filepaths_and_text
|
from models.tacotron2.taco_utils import load_filepaths_and_text
|
||||||
from models.tacotron2.text import text_to_sequence
|
|
||||||
from utils.util import opt_get
|
from utils.util import opt_get
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,6 +84,7 @@ class TextWavLoader(torch.utils.data.Dataset):
|
||||||
self.needs_collate = opt_get(hparams, ['needs_collate'], True)
|
self.needs_collate = opt_get(hparams, ['needs_collate'], True)
|
||||||
if not self.needs_collate:
|
if not self.needs_collate:
|
||||||
assert self.max_wav_len is not None and self.max_text_len is not None
|
assert self.max_wav_len is not None and self.max_text_len is not None
|
||||||
|
self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
|
||||||
|
|
||||||
def get_wav_text_pair(self, audiopath_and_text):
|
def get_wav_text_pair(self, audiopath_and_text):
|
||||||
# separate filename and text
|
# separate filename and text
|
||||||
|
@ -93,8 +94,7 @@ class TextWavLoader(torch.utils.data.Dataset):
|
||||||
return (text_seq, wav, text, audiopath_and_text[0])
|
return (text_seq, wav, text, audiopath_and_text[0])
|
||||||
|
|
||||||
def get_text(self, text):
|
def get_text(self, text):
|
||||||
text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
|
return torch.IntTensor(self.tokenizer(text)['input_ids'])
|
||||||
return text_norm
|
|
||||||
|
|
||||||
def load_conditioning_candidates(self, path):
|
def load_conditioning_candidates(self, path):
|
||||||
candidates = find_files_of_type('img', os.path.dirname(path), qualifier=is_audio_file)[0]
|
candidates = find_files_of_type('img', os.path.dirname(path), qualifier=is_audio_file)[0]
|
||||||
|
@ -213,7 +213,7 @@ class TextMelCollate():
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
batch_sz = 8
|
batch_sz = 8
|
||||||
params = {
|
params = {
|
||||||
'mode': 'nv_tacotron',
|
'mode': 'paired_voice_audio',
|
||||||
'path': ['Z:\\bigasr_dataset\\libritts\\test-clean_list.txt'],
|
'path': ['Z:\\bigasr_dataset\\libritts\\test-clean_list.txt'],
|
||||||
'fetcher_mode': ['libritts'],
|
'fetcher_mode': ['libritts'],
|
||||||
'phase': 'train',
|
'phase': 'train',
|
||||||
|
@ -234,11 +234,5 @@ if __name__ == '__main__':
|
||||||
i = 0
|
i = 0
|
||||||
m = None
|
m = None
|
||||||
for i, b in tqdm(enumerate(dl)):
|
for i, b in tqdm(enumerate(dl)):
|
||||||
if i > 5:
|
|
||||||
break
|
|
||||||
w = b['wav']
|
|
||||||
for ib in range(batch_sz):
|
for ib in range(batch_sz):
|
||||||
print(f'{i} {ib} {b["real_text"][ib]}')
|
print(f"text_seq: {b['text_lengths'].max()}, speech_seq: {b['wav_lengths'].max()//1024}")
|
||||||
torchaudio.save(f'{i}_clip_{ib}.wav', b['wav'][ib], ds.sample_rate)
|
|
||||||
for c in range(3):
|
|
||||||
torchaudio.save(f'{i}_clip_{ib}_cond{c}.wav', b['conditioning'][ib, c], ds.sample_rate)
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ class ConditioningEncoder(nn.Module):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
spec_dim,
|
spec_dim,
|
||||||
embedding_dim,
|
embedding_dim,
|
||||||
attn_blocks=4,
|
attn_blocks=6,
|
||||||
num_attn_heads=4,
|
num_attn_heads=4,
|
||||||
do_checkpointing=False):
|
do_checkpointing=False):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -39,14 +39,13 @@ class ConditioningEncoder(nn.Module):
|
||||||
|
|
||||||
|
|
||||||
class GptTtsHf(nn.Module):
|
class GptTtsHf(nn.Module):
|
||||||
NUMBER_TEXT_TOKENS = len(symbols)+1
|
NUMBER_TEXT_TOKENS = 50257 # The number of BPE tokens produced by the HF GPT2Tokenizer
|
||||||
START_TEXT_TOKEN = len(symbols)
|
|
||||||
STOP_TEXT_TOKEN = 0
|
STOP_TEXT_TOKEN = 0
|
||||||
NUMBER_MEL_CODES = 8194
|
NUMBER_MEL_CODES = 8194
|
||||||
START_MEL_TOKEN = 8192
|
START_MEL_TOKEN = 8192
|
||||||
STOP_MEL_TOKEN = 8193
|
STOP_MEL_TOKEN = 8193
|
||||||
|
|
||||||
def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=200, max_mel_tokens=250, max_conditioning_inputs=3,
|
def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=100, max_mel_tokens=250, max_conditioning_inputs=3,
|
||||||
checkpointing=True, mel_length_compression=1024, max_conditioning_length=60):
|
checkpointing=True, mel_length_compression=1024, max_conditioning_length=60):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.max_mel_tokens = max_mel_tokens
|
self.max_mel_tokens = max_mel_tokens
|
||||||
|
@ -54,7 +53,7 @@ class GptTtsHf(nn.Module):
|
||||||
self.model_dim = model_dim
|
self.model_dim = model_dim
|
||||||
self.max_conditioning_inputs = max_conditioning_inputs
|
self.max_conditioning_inputs = max_conditioning_inputs
|
||||||
self.mel_length_compression = mel_length_compression
|
self.mel_length_compression = mel_length_compression
|
||||||
self.conditioning_encoder = ConditioningEncoder(80, model_dim)
|
self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
|
||||||
self.text_embedding = nn.Embedding(self.NUMBER_TEXT_TOKENS, model_dim)
|
self.text_embedding = nn.Embedding(self.NUMBER_TEXT_TOKENS, model_dim)
|
||||||
seq_length = 2+self.max_symbols_per_phrase+self.max_conditioning_inputs+self.max_mel_tokens
|
seq_length = 2+self.max_symbols_per_phrase+self.max_conditioning_inputs+self.max_mel_tokens
|
||||||
self.gpt_config = GPT2Config(vocab_size=self.NUMBER_MEL_CODES,
|
self.gpt_config = GPT2Config(vocab_size=self.NUMBER_MEL_CODES,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user