ctc gen checkin
This commit is contained in:
parent
bb3d1ab03d
commit
5ae816bead
|
@ -115,34 +115,34 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
|
|||
|
||||
def get_ctc_metadata(self, codes):
    """Compress a frame-level CTC code sequence into per-character metadata.

    The input is run-length encoded. Runs of the value 0 are treated as
    separators (blanks); every other run contributes one raw code, the number
    of times it was repeated, and the number of separator frames that came
    immediately before it.

    Args:
        codes: object exposing ``tolist()`` that yields a flat list of integer
            codes (presumably a 1-D tensor -- confirm against the caller).

    Returns:
        dict with:
            'ctc_raw_codes':   long tensor of length ``self.max_text_len``,
                               zero-padded or truncated.
            'ctc_separators':  separator run length preceding each raw code,
                               clipped to at most 120, zero-padded or truncated.
            'ctc_repeats':     run length of each raw code, clipped to [1, 30],
                               padded with 1 or truncated.
            'ctc_raw_lengths': number of raw codes before padding/truncation.
    """
    rcodes, repeats, seps = [], [], [0]
    for val, group in groupby(codes.tolist()):
        run_length = sum(1 for _ in group)
        if val == 0:
            # This is a very important distinction: seps always holds one more
            # slot than rcodes, so a separator run is attributed to the
            # character that FOLLOWS it, not to the one before it.
            seps[-1] = run_length
        else:
            rcodes.append(val)
            repeats.append(run_length)
            seps.append(0)

    # An explicit dtype keeps the outputs integral even when no non-blank code
    # was found (torch.tensor([]) would otherwise default to float32).
    rcodes = torch.tensor(rcodes, dtype=torch.long)
    # These clip values are sane maximum values which were not observed in the
    # available datasets.
    repeats = torch.clip(torch.tensor(repeats, dtype=torch.long), min=1, max=30)
    # The last slot of seps is a trailing separator run that has no following
    # character, so it is dropped.
    seps = torch.clip(torch.tensor(seps[:-1], dtype=torch.long), max=120)

    # Pad or clip everything to exactly self.max_text_len.
    orig_lens = rcodes.shape[0]
    if orig_lens < self.max_text_len:
        gap = self.max_text_len - orig_lens
        rcodes = F.pad(rcodes, (0, gap))
        # The minimum value for repeats is 1, hence this is the pad value too.
        repeats = F.pad(repeats, (0, gap), value=1)
        seps = F.pad(seps, (0, gap))
    elif orig_lens > self.max_text_len:
        rcodes = rcodes[:self.max_text_len]
        # Fixed: this used to slice rcodes, replacing the repeat counts with
        # character codes whenever the sequence had to be truncated.
        repeats = repeats[:self.max_text_len]
        seps = seps[:self.max_text_len]

    return {
        'ctc_raw_codes': rcodes,
        'ctc_separators': seps,
        'ctc_repeats': repeats,
        'ctc_raw_lengths': orig_lens,
    }
|
||||
|
|
|
@ -1,29 +1,34 @@
|
|||
import functools
|
||||
import json
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import T5Config, T5Model
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers import T5Config, T5Model, T5PreTrainedModel, T5ForConditionalGeneration
|
||||
from transformers.file_utils import replace_return_docstrings
|
||||
from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutput
|
||||
from transformers.utils.model_parallel_utils import get_device_map, assert_device_map
|
||||
from x_transformers import Encoder, XTransformer
|
||||
|
||||
from models.gpt_voice.transformer_builders import null_position_embeddings
|
||||
from models.gpt_voice.unet_diffusion_tts6 import CheckpointedLayer
|
||||
from models.gpt_voice.unified_voice2 import ConditioningEncoder
|
||||
from models.tacotron2.text.cleaners import english_cleaners
|
||||
from trainer.networks import register_model
|
||||
from utils.util import opt_get
|
||||
|
||||
|
||||
class CtcCodeGenerator(nn.Module):
|
||||
def __init__(self, model_dim=512, layers=10, num_heads=8, dropout=.1, ctc_codes=36, max_pad=120, max_repeat=30, checkpointing=True):
|
||||
def __init__(self, model_dim=512, layers=10, num_heads=8, dropout=.1, ctc_codes=36, max_pad=121, max_repeat=30, checkpointing=True):
|
||||
super().__init__()
|
||||
self.max_pad = max_pad
|
||||
self.max_repeat = max_repeat
|
||||
self.start_token = (self.max_repeat+1)*(self.max_pad+1)+1
|
||||
self.start_token = self.max_repeat*self.max_pad+1
|
||||
self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=num_heads)
|
||||
self.embedding = nn.Embedding(ctc_codes, model_dim)
|
||||
self.dec_embedding = nn.Embedding(self.start_token+1, model_dim)
|
||||
self.config = T5Config(
|
||||
vocab_size=1, # T5 embedding will be removed and replaced with custom embedding.
|
||||
vocab_size=self.start_token+1,
|
||||
d_model=model_dim,
|
||||
d_kv=model_dim//num_heads,
|
||||
d_ff=model_dim*4,
|
||||
|
@ -32,29 +37,36 @@ class CtcCodeGenerator(nn.Module):
|
|||
dropout_rate=dropout,
|
||||
feed_forward_proj='gated-gelu',
|
||||
use_cache=not checkpointing,
|
||||
gradient_checkpointing=checkpointing
|
||||
gradient_checkpointing=checkpointing,
|
||||
tie_word_embeddings=False,
|
||||
tie_encoder_decoder=False,
|
||||
decoder_start_token_id=self.start_token,
|
||||
pad_token_id=0,
|
||||
)
|
||||
self.transformer = T5Model(self.config)
|
||||
self.transformer = T5ForConditionalGeneration(self.config)
|
||||
del self.transformer.encoder.embed_tokens
|
||||
del self.transformer.decoder.embed_tokens
|
||||
del self.transformer.shared
|
||||
self.transformer.encoder.embed_tokens = functools.partial(null_position_embeddings, dim=model_dim)
|
||||
self.transformer.decoder.embed_tokens = functools.partial(null_position_embeddings, dim=model_dim)
|
||||
self.output_layer = nn.Linear(model_dim, self.start_token+1)
|
||||
|
||||
|
||||
def forward(self, conditioning_input, codes, pads, repeats, unpadded_lengths):
|
||||
def forward(self, conditioning_input, codes, separators, repeats, unpadded_lengths):
|
||||
max_len = unpadded_lengths.max()
|
||||
codes = codes[:, :max_len]
|
||||
pads = pads[:, :max_len]
|
||||
separators = separators[:, :max_len]
|
||||
repeats = repeats[:, :max_len]
|
||||
|
||||
if pads.max() > self.max_pad:
|
||||
print(f"Got unexpectedly long pads. Max: {pads.max()}, {pads}")
|
||||
pads = torch.clip(pads, 0, self.max_pad)
|
||||
if separators.max() > self.max_pad:
|
||||
print(f"Got unexpectedly long separators. Max: {separators.max()}, {separators}")
|
||||
separators = torch.clip(separators, 0, self.max_pad)
|
||||
if repeats.max() > self.max_repeat:
|
||||
print(f"Got unexpectedly long repeats. Max: {repeats.max()}, {repeats}")
|
||||
repeats = torch.clip(repeats, 0, self.max_repeat)
|
||||
assert not torch.any(repeats < 1)
|
||||
repeats = repeats - 1 # Per above, min(repeats) is 1; make it 0 to avoid wasting a prediction slot.
|
||||
|
||||
assert codes.max() < 36, codes.max()
|
||||
labels = separators + repeats * self.max_pad
|
||||
labels = labels + 1 # We want '0' to be used as the EOS or padding token, so add 1.
|
||||
for i in range(unpadded_lengths.shape[0]):
|
||||
labels[i, unpadded_lengths[i]:] = 0
|
||||
|
||||
conditioning_input = conditioning_input.unsqueeze(1) if len(conditioning_input.shape) == 3 else conditioning_input
|
||||
conds = []
|
||||
|
@ -63,32 +75,99 @@ class CtcCodeGenerator(nn.Module):
|
|||
conds = torch.stack(conds, dim=1)
|
||||
h = torch.cat([conds, self.embedding(codes)], dim=1)
|
||||
|
||||
labels = pads + repeats * self.max_pad + 1
|
||||
for i in range(unpadded_lengths.shape[0]):
|
||||
labels[i, unpadded_lengths[i]:] = 0
|
||||
labels_in = F.pad(labels, (1,0), value=self.start_token)
|
||||
h_dec = self.dec_embedding(labels_in)
|
||||
|
||||
h = self.transformer(inputs_embeds=h, decoder_inputs_embeds=h_dec).last_hidden_state
|
||||
logits = self.output_layer(h)
|
||||
logits = logits.permute(0,2,1)[:,:,:-1] # Strip off the last token. There is no "stop" token here, so this is just an irrelevant prediction on some future that doesn't actually exist.
|
||||
loss = F.cross_entropy(logits, labels, reduction='none')
|
||||
|
||||
# Ignore the first predictions of the sequences. This corresponds to the padding for the first CTC character, which is pretty much random and cannot be predicted.
|
||||
#loss = loss[1:].mean()
|
||||
decoder_inputs = F.pad(labels, (1, 0), value=self.start_token)[:, :-1]
|
||||
loss = self.transformer(inputs_embeds=h, decoder_input_ids=decoder_inputs, labels=labels).loss
|
||||
return loss
|
||||
|
||||
def generate(self, speech_conditioning_inputs, texts, **hf_generate_kwargs):
    """Generate frame-level CTC code sequences for a batch of texts.

    Each text is cleaned, upper-cased and mapped to character codes. The
    transformer then predicts, for every character position, one token that
    jointly encodes a separator count and a repeat count. These are expanded
    back into a flat CTC sequence: separators (0) first, then the repeated
    character code.

    Args:
        speech_conditioning_inputs: conditioning tensor. A 3-D input gets an
            extra dimension inserted at index 1; every slice along dim 1 is fed
            to ``self.conditioning_encoder`` (presumably mel spectrograms --
            confirm against the caller).
        texts: iterable of strings, one per batch element.
        **hf_generate_kwargs: forwarded unchanged to ``self.transformer.generate``.

    Returns:
        Long tensor with one row per text, right-padded with 0 to the length
        of the longest expanded sequence in the batch.
    """
    device = speech_conditioning_inputs.device
    # Parsed once instead of once per text; the mapping never changes.
    vocab = json.loads('{" ": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "\'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}')

    # First, generate CTC character codes from the given texts.
    codes = []
    max_seq = 50  # Every text is padded to at least this many positions.
    for text in texts:
        text = english_cleaners(text).strip().upper()
        # Characters missing from the vocabulary are silently dropped.
        cd = [vocab[c] for c in text if c in vocab]
        # An explicit dtype keeps an all-unknown/empty text from producing a
        # float tensor, which would break stacking and the embedding lookup.
        codes.append(torch.tensor(cd, dtype=torch.long, device=device))
        max_seq = max(max_seq, codes[-1].shape[-1])

    # Collate: right-pad every code sequence with 0 up to max_seq.
    codes = torch.stack([F.pad(c, (0, max_seq - c.shape[-1])) for c in codes], dim=0)

    if len(speech_conditioning_inputs.shape) == 3:
        conditioning_input = speech_conditioning_inputs.unsqueeze(1)
    else:
        conditioning_input = speech_conditioning_inputs
    conds = torch.stack(
        [self.conditioning_encoder(conditioning_input[:, j]) for j in range(conditioning_input.shape[1])],
        dim=1)
    h = torch.cat([conds, self.embedding(codes)], dim=1)

    # The HF generate API returns a sequence with the BOS token included, hence
    # the +1 on the requested length; the BOS column is removed right after.
    target_len = codes.shape[-1] + 1
    tokens = self.transformer.generate(inputs_embeds=h, max_length=target_len, min_length=target_len,
                                       bos_token_id=self.start_token,
                                       bad_words_ids=[[0], [self.start_token]], **hf_generate_kwargs)
    tokens = tokens[:, 1:]

    # De-compress the codes from the generated output. Subtracting 1 undoes the
    # offset used to keep 0 free as the padding id, the same offset forward()
    # adds to its labels.
    tokens = tokens - 1
    pads = tokens % self.max_pad
    repeats = (tokens // self.max_pad) + 1

    ctc_batch = []
    max_seq = 0
    # Plain Python ints are used for the expansion so no per-element tensor
    # objects are created.
    for bc, bp, br in zip(codes.tolist(), pads.tolist(), repeats.tolist()):
        ctc = []
        for c, p, r in zip(bc, bp, br):
            ctc.extend([0] * p)
            ctc.extend([c] * r)
        ctc_batch.append(torch.tensor(ctc, dtype=torch.long, device=device))
        max_seq = max(max_seq, ctc_batch[-1].shape[-1])

    # Collate the batch.
    return torch.stack([F.pad(seq, (0, max_seq - seq.shape[-1])) for seq in ctc_batch], dim=0)
|
||||
|
||||
|
||||
@register_model
def register_ctc_code_generator2(opt_net, opt):
    """Construct a CtcCodeGenerator for the trainer's model registry.

    The constructor keyword arguments are fetched with
    ``opt_get(opt_net, ['kwargs'], {})`` (presumably the 'kwargs' entry of the
    network options, defaulting to an empty dict -- confirm against opt_get).
    ``opt`` is accepted but not used here.
    """
    generator_kwargs = opt_get(opt_net, ['kwargs'], {})
    return CtcCodeGenerator(**generator_kwargs)
|
||||
|
||||
|
||||
def inf():
    """Ad-hoc debugging entry point.

    Loads a saved generator checkpoint and a saved raw batch from hard-coded
    paths, builds a two-clip conditioning mel from the first conditioning
    sample, and runs a single forward pass on the first batch element.
    """
    checkpoint_path = 'D:\\dlas\\experiments\\train_encoder_build_ctc_alignments\\models\\24000_generator.pth'
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model = CtcCodeGenerator(layers=10, checkpointing=False).eval()
    model.load_state_dict(state_dict)
    raw_batch = torch.load('raw_batch.pth')

    def first(key):
        # First batch element with the batch dimension restored.
        return raw_batch[key][0].unsqueeze(0)

    with torch.no_grad():
        from data.audio.unsupervised_audio_dataset import load_audio
        from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
        # The same conditioning clip is converted twice and concatenated along
        # dim 0 before the batch dimension is added.
        mel_clips = [wav_to_mel(raw_batch['conditioning'][0])[:, :, :256] for _ in range(2)]
        ref_mel = torch.cat(mel_clips, dim=0).unsqueeze(0)
        loss = model(ref_mel,
                     first('ctc_raw_codes'),
                     first('ctc_pads'),
                     first('ctc_repeats'),
                     first('ctc_raw_lengths'))
        #ref_mel = torch.cat([wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:, :, :256],
        #                     wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\2.wav", 22050))[:, :, :256]], dim=0).unsqueeze(0)
        #ctc = model.generate(ref_mel, ["i suppose though it's too early for them"], num_beams=4, )
    print("Break")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
inf()
|
||||
|
||||
model = CtcCodeGenerator()
|
||||
conds = torch.randn(4,2,80,600)
|
||||
inps = torch.randint(0,36, (4, 300))
|
||||
pads = torch.randint(0,100, (4,300))
|
||||
repeats = torch.randint(0,20, (4,300))
|
||||
loss = model(conds, inps, pads, repeats, torch.tensor([250, 300, 280, 30]))
|
||||
print(loss.shape)
|
||||
#loss = model(conds, inps, pads, repeats, torch.tensor([250, 300, 280, 30]))
|
||||
#print(loss.shape)
|
||||
#model.generate(conds, ["Hello, world!", "Ahoi!", "KKKKKK", "what's going on??"])
|
169
codes/scripts/audio/gen/use_diffuse_voice_translation.py
Normal file
169
codes/scripts/audio/gen/use_diffuse_voice_translation.py
Normal file
|
@ -0,0 +1,169 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
from data.audio.unsupervised_audio_dataset import load_audio
|
||||
from scripts.audio.gen.speech_synthesis_utils import do_spectrogram_diffusion, \
|
||||
load_discrete_vocoder_diffuser, wav_to_mel, convert_mel_to_codes
|
||||
from utils.audio import plot_spectrogram
|
||||
from utils.util import load_model_from_config
|
||||
|
||||
|
||||
def ceil_multiple(base, multiple):
    """Round ``base`` up to the nearest multiple of ``multiple``.

    ``base`` is returned unchanged when it already is an exact multiple.
    """
    remainder = base % multiple
    return base if remainder == 0 else base + (multiple - remainder)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
conditioning_clips = {
|
||||
# Male
|
||||
'simmons': 'Y:\\clips\\books1\\754_Dan Simmons - The Rise Of Endymion 356 of 450\\00026.wav',
|
||||
'carlin': 'Y:\\clips\\books1\\12_dchha13 Bubonic Nukes\\00097.wav',
|
||||
'entangled': 'Y:\\clips\\books1\\3857_25_The_Entangled_Bank__000000000\\00123.wav',
|
||||
'snowden': 'Y:\\clips\\books1\\7658_Edward_Snowden_-_Permanent_Record__000000004\\00027.wav',
|
||||
# Female
|
||||
'the_doctor': 'Y:\\clips\\books2\\37062___The_Doctor__000000003\\00206.wav',
|
||||
'puppy': 'Y:\\clips\\books2\\17830___3_Puppy_Kisses__000000002\\00046.wav',
|
||||
'adrift': 'Y:\\clips\\books2\\5608_Gear__W_Michael_-_Donovan_1-5_(2018-2021)_(book_4_Gear__W_Michael_-_Donovan_5_-_Adrift_(2021)_Gear__W_Michael_-_Adrift_(Donovan_5)_—_82__000000000\\00019.wav',
|
||||
}
|
||||
|
||||
provided_codes = [
|
||||
# but facts within easy reach of any one who cares to know them go to say that the greater abstenence of women is in some part
|
||||
# due to an imperative conventionality and this conventionality is in a general way strongest were the patriarchal tradition
|
||||
# the tradition that the woman is a chattel has retained its hold in greatest vigor
|
||||
# 3570/5694/3570_5694_000008_000001.wav
|
||||
[0, 0, 24, 0, 16, 0, 6, 0, 4, 0, 0, 0, 0, 0, 20, 0, 7, 0, 0, 19, 19, 0, 0, 6, 0, 0, 12, 12, 0, 4, 4, 0, 18, 18,
|
||||
0, 10, 0, 6, 11, 11, 10, 10, 9, 9, 4, 4, 4, 5, 5, 0, 7, 0, 0, 0, 0, 12, 0, 22, 22, 0, 4, 4, 0, 13, 13, 5, 0, 7,
|
||||
7, 0, 0, 19, 11, 0, 4, 4, 8, 20, 4, 4, 4, 7, 0, 9, 9, 0, 22, 4, 4, 0, 8, 0, 9, 5, 4, 4, 18, 11, 11, 8, 4, 4, 0,
|
||||
0, 0, 19, 19, 7, 0, 0, 13, 5, 5, 0, 12, 12, 4, 4, 6, 6, 8, 8, 4, 4, 0, 26, 9, 9, 8, 0, 18, 0, 0, 4, 4, 6, 6,
|
||||
11, 5, 0, 17, 17, 0, 0, 4, 4, 4, 4, 0, 0, 0, 21, 0, 8, 0, 0, 0, 0, 4, 4, 6, 6, 8, 0, 4, 4, 0, 0, 12, 0, 7, 7,
|
||||
0, 0, 22, 0, 4, 4, 6, 11, 11, 7, 6, 6, 4, 4, 6, 11, 5, 4, 4, 4, 0, 21, 0, 13, 5, 5, 7, 7, 0, 0, 6, 6, 5, 0, 13,
|
||||
0, 4, 4, 0, 7, 0, 0, 0, 24, 0, 0, 12, 12, 0, 0, 6, 0, 5, 0, 0, 9, 9, 0, 5, 0, 9, 0, 0, 19, 5, 5, 4, 4, 8, 20,
|
||||
20, 4, 4, 4, 4, 0, 18, 18, 8, 0, 0, 0, 17, 0, 5, 0, 9, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 10, 0, 0, 12, 12, 4, 4, 0,
|
||||
10, 0, 9, 0, 4, 4, 0, 0, 12, 0, 0, 8, 0, 17, 5, 5, 4, 4, 0, 0, 0, 23, 23, 0, 7, 0, 13, 0, 0, 0, 6, 0, 4, 0, 0,
|
||||
0, 0, 14, 0, 16, 16, 0, 0, 5, 0, 4, 4, 0, 6, 8, 0, 4, 4, 7, 9, 4, 4, 4, 0, 10, 10, 17, 0, 0, 0, 23, 0, 5, 0, 0,
|
||||
13, 13, 0, 7, 0, 0, 6, 6, 0, 10, 0, 25, 5, 5, 4, 4, 0, 0, 0, 19, 19, 8, 8, 9, 0, 0, 0, 0, 0, 25, 0, 5, 0, 9, 0,
|
||||
0, 0, 6, 6, 10, 8, 8, 0, 9, 0, 0, 0, 7, 0, 0, 15, 0, 10, 0, 0, 0, 0, 6, 6, 0, 0, 22, 0, 0, 0, 4, 4, 4, 4, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
7, 0, 9, 14, 0, 4, 0, 0, 6, 11, 10, 0, 0, 0, 12, 0, 4, 4, 0, 19, 19, 8, 9, 9, 0, 0, 25, 0, 5, 0, 9, 0, 0, 6, 6,
|
||||
10, 8, 8, 9, 9, 0, 0, 7, 0, 0, 15, 0, 10, 0, 0, 0, 0, 6, 0, 22, 22, 0, 4, 4, 0, 0, 10, 0, 0, 0, 0, 12, 12, 0,
|
||||
0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 10, 0, 9, 4, 4, 4, 7, 4, 4, 4, 0, 21, 0, 5, 0, 9, 0, 5, 5, 13, 13, 7, 0, 15,
|
||||
15, 0, 0, 4, 4, 0, 18, 18, 0, 7, 0, 0, 22, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 0, 6, 6, 13, 13, 8, 0, 0, 9, 9, 0, 21, 0, 0, 5, 5, 0, 0, 0, 12, 12, 0, 0, 6,
|
||||
0, 0, 0, 4, 4, 0, 0, 0, 18, 0, 5, 0, 13, 0, 5, 4, 4, 6, 11, 5, 0, 4, 4, 23, 23, 7, 7, 0, 0, 0, 6, 0, 13, 13,
|
||||
10, 10, 0, 0, 0, 0, 7, 13, 13, 0, 19, 11, 11, 0, 0, 7, 15, 15, 0, 0, 4, 4, 0, 6, 13, 13, 7, 7, 0, 0, 0, 14, 10,
|
||||
10, 0, 0, 0, 0, 0, 6, 10, 10, 8, 8, 9, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 11, 5, 0, 4, 4, 0, 6, 13, 13, 7, 7, 0, 0, 0, 14, 10, 10, 0, 0, 0, 6, 10, 10,
|
||||
8, 9, 9, 0, 0, 4, 4, 0, 6, 11, 7, 0, 6, 4, 4, 6, 11, 5, 4, 4, 4, 18, 18, 8, 0, 0, 17, 7, 0, 9, 0, 4, 10, 0, 0,
|
||||
12, 12, 4, 4, 4, 7, 4, 4, 0, 0, 0, 19, 11, 0, 7, 0, 6, 0, 0, 0, 6, 0, 5, 0, 15, 15, 0, 0, 0, 4, 4, 4, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 7, 0, 0, 0, 12, 0, 0, 4, 4, 0, 13, 5, 5, 0, 0, 0, 0, 6, 6, 0, 0,
|
||||
7, 10, 10, 0, 9, 0, 5, 0, 14, 4, 4, 4, 0, 10, 0, 0, 0, 6, 0, 0, 0, 0, 0, 12, 0, 4, 4, 0, 0, 0, 11, 0, 0, 8, 0,
|
||||
0, 0, 15, 0, 0, 14, 0, 4, 4, 4, 0, 10, 0, 9, 4, 4, 4, 4, 4, 0, 21, 0, 13, 5, 5, 7, 7, 0, 0, 6, 0, 5, 0, 0, 12,
|
||||
0, 6, 0, 4, 0, 0, 25, 10, 0, 0, 0, 21, 0, 8, 0, 0, 13, 13, 0, 0, 4, 4, 4, 4, 0, 0, 0],
|
||||
# the competitor with whom the entertainer wishes to institute a comparison is by this method made to serve as a means to the end
|
||||
# 3570/5694/3570_5694_000011_000005.wav
|
||||
[0, 0, 6, 11, 5, 0, 4, 0, 19, 19, 8, 17, 0, 0, 0, 0, 23, 0, 5, 5, 0, 0, 6, 6, 10, 10, 0, 0, 6, 6, 0, 8, 0, 13,
|
||||
13, 0, 4, 4, 18, 18, 10, 0, 6, 11, 11, 4, 4, 4, 0, 0, 18, 18, 11, 0, 8, 0, 0, 0, 0, 17, 0, 0, 4, 0, 6, 11, 5,
|
||||
0, 4, 4, 0, 5, 9, 9, 0, 6, 5, 5, 13, 13, 0, 0, 6, 6, 0, 7, 0, 10, 0, 9, 0, 0, 5, 0, 13, 4, 4, 0, 18, 10, 10, 0,
|
||||
0, 12, 11, 11, 0, 5, 0, 0, 0, 12, 0, 0, 4, 4, 0, 0, 6, 6, 8, 0, 0, 4, 4, 4, 0, 10, 9, 9, 0, 0, 0, 0, 12, 0, 0,
|
||||
6, 0, 10, 0, 0, 0, 6, 0, 16, 16, 0, 6, 5, 0, 4, 4, 7, 4, 4, 19, 19, 8, 0, 17, 0, 0, 0, 0, 0, 23, 0, 0, 7, 0, 0,
|
||||
0, 13, 0, 10, 0, 0, 0, 0, 0, 12, 0, 0, 8, 0, 9, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 24, 0, 22, 0, 4, 4,
|
||||
0, 6, 11, 10, 0, 0, 0, 12, 0, 0, 4, 4, 0, 0, 17, 5, 5, 0, 0, 0, 6, 11, 11, 8, 0, 0, 14, 14, 0, 0, 4, 4, 4, 4,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 7, 0, 0, 0, 0, 14, 5, 0, 4, 4, 6, 8, 4,
|
||||
4, 0, 0, 0, 12, 12, 0, 5, 5, 0, 13, 13, 0, 25, 5, 4, 4, 7, 0, 12, 4, 4, 4, 7, 4, 4, 0, 17, 5, 0, 0, 7, 0, 0, 9,
|
||||
0, 0, 0, 0, 12, 0, 4, 4, 0, 6, 0, 8, 0, 4, 4, 6, 11, 5, 4, 4, 4, 0, 0, 5, 0, 9, 9, 0, 0, 0, 0, 14, 0, 0, 4, 4,
|
||||
4, 4, 4, 0, 0],
|
||||
# the livery becomes obnoxious to nearly all who are required to wear it
|
||||
# 3570/5694/3570_5694_000014_000021.wav
|
||||
[0, 0, 6, 11, 5, 0, 0, 4, 4, 0, 15, 10, 10, 0, 0, 25, 5, 0, 13, 13, 0, 22, 0, 0, 4, 0, 24, 24, 5, 0, 0, 0, 19,
|
||||
19, 0, 8, 0, 17, 5, 5, 0, 12, 0, 4, 4, 4, 0, 8, 0, 0, 24, 0, 0, 0, 9, 9, 0, 8, 0, 0, 0, 0, 0, 28, 0, 0, 0, 10,
|
||||
0, 8, 16, 0, 12, 12, 12, 0, 4, 0, 6, 6, 8, 0, 4, 4, 0, 9, 5, 0, 7, 7, 13, 0, 0, 15, 22, 22, 4, 4, 0, 0, 0, 0,
|
||||
0, 0, 0, 7, 0, 15, 0, 0, 15, 0, 4, 4, 4, 18, 11, 11, 8, 0, 4, 4, 0, 7, 0, 13, 5, 4, 4, 13, 13, 5, 0, 0, 0, 30,
|
||||
30, 16, 0, 0, 10, 0, 0, 0, 13, 5, 0, 14, 4, 4, 6, 6, 8, 0, 4, 4, 18, 18, 5, 5, 7, 7, 13, 13, 0, 4, 4, 0, 10, 0,
|
||||
0, 0, 0, 6, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0],
|
||||
# in the nature of things luxuries and the comforts of life belong to the leisure class
|
||||
# 3570/5694/3570_5694_000006_000007.wav
|
||||
[0, 0, 0, 0, 0, 10, 9, 0, 4, 4, 6, 11, 5, 4, 4, 4, 9, 9, 7, 7, 0, 0, 0, 0, 0, 0, 6, 0, 16, 16, 13, 13, 5, 0, 4, 4, 8, 0, 20, 4, 4, 4, 0, 6, 0, 11, 10, 0, 9, 0, 21, 0, 0, 0, 12, 12, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 0, 16, 16, 0, 0, 28, 0, 0, 0, 16, 16, 0, 13, 13, 0, 10, 0, 5, 0, 0, 0, 12, 0, 0, 4, 4, 4, 0, 0, 7, 0, 9, 0, 14, 4, 4, 6, 11, 5, 4, 4, 0, 0, 19, 0, 8, 17, 17, 0, 0, 0, 0, 0, 20, 0, 8, 0, 13, 0, 6, 0, 12, 4, 4, 8, 0, 20, 4, 4, 4, 0, 0, 15, 0, 10, 10, 0, 0, 0, 20, 5, 0, 4, 4, 0, 0, 24, 5, 0, 0, 0, 15, 8, 0, 9, 0, 21, 0, 0, 0, 4, 4, 6, 8, 4, 4, 4, 6, 11, 5, 4, 4, 15, 15, 5, 10, 0, 0, 12, 0, 16, 13, 5, 5, 4, 4, 0, 19, 0, 15, 15, 0, 0, 7, 0, 0, 12, 12, 0, 0, 0, 12, 12, 0, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0],
|
||||
# from arcaic times down through all the length of the patriarchal regime it has been the office of the women to
|
||||
# prepare and administer these luxuries and it has been the perquisite of the men of gentle birth and breeding
|
||||
# to consume them
|
||||
# 3570/5694/3570_5694_000007_000003.wav
|
||||
[0, 0, 0, 0, 0, 0, 20, 13, 8, 0, 17, 0, 4, 4, 0, 7, 0, 13, 0, 0, 0, 0, 0, 19, 0, 0, 0, 7, 0, 0, 0, 0, 10, 0, 19, 0, 0, 0, 4, 4, 0, 0, 0, 0, 6, 0, 0, 0, 10, 0, 0, 17, 5, 0, 0, 0, 12, 0, 4, 0, 0, 0, 0, 14, 0, 0, 8, 0, 18, 0, 0, 0, 9, 0, 0, 0, 0, 4, 4, 0, 0, 0, 6, 11, 13, 8, 0, 16, 21, 21, 11, 0, 4, 4, 7, 0, 15, 0, 15, 15, 4, 4, 6, 11, 5, 5, 4, 4, 0, 15, 0, 5, 0, 0, 9, 9, 0, 21, 0, 0, 6, 11, 0, 4, 4, 8, 8, 20, 4, 4, 4, 6, 11, 5, 4, 4, 0, 0, 0, 23, 0, 7, 7, 0, 0, 0, 0, 0, 6, 6, 13, 13, 13, 10, 0, 0, 0, 0, 0, 7, 13, 13, 0, 19, 11, 11, 11, 0, 0, 7, 15, 15, 0, 4, 4, 4, 13, 13, 5, 0, 0, 0, 0, 21, 21, 0, 0, 10, 0, 0, 0, 0, 17, 5, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 6, 4, 4, 0, 0, 11, 7, 7, 0, 0, 12, 0, 4, 4, 0, 24, 5, 0, 0, 5, 5, 9, 0, 4, 6, 6, 11, 5, 4, 4, 0, 0, 8, 0, 20, 0, 0, 0, 20, 0, 10, 0, 0, 0, 19, 5, 0, 4, 4, 8, 0, 20, 4, 4, 6, 11, 5, 4, 4, 4, 18, 8, 0, 0, 0, 17, 5, 0, 9, 9, 0, 0, 4, 4, 0, 6, 6, 8, 0, 0, 4, 4, 0, 23, 23, 13, 5, 5, 0, 0, 0, 0, 23, 23, 0, 7, 0, 0, 0, 13, 5, 0, 0, 0, 4, 4, 0, 7, 0, 9, 14, 0, 4, 4, 0, 0, 7, 0, 14, 0, 0, 0, 17, 17, 10, 0, 9, 0, 10, 10, 0, 0, 12, 12, 0, 0, 0, 6, 0, 5, 13, 13, 0, 0, 0, 0, 4, 4, 4, 6, 11, 11, 5, 0, 0, 0, 12, 5, 5, 4, 4, 15, 15, 0, 16, 0, 0, 0, 28, 0, 0, 0, 16, 0, 0, 13, 13, 10, 0, 5, 5, 0, 0, 12, 12, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 9, 0, 14, 4, 4, 10, 0, 6, 4, 4, 0, 11, 11, 7, 0, 0, 0, 12, 0, 4, 4, 0, 0, 0, 0, 24, 5, 0, 0, 5, 5, 9, 9, 4, 4, 4, 6, 11, 5, 4, 4, 0, 0, 0, 23, 0, 5, 0, 13, 0, 0, 0, 0, 0, 30, 30, 16, 10, 10, 0, 0, 0, 12, 0, 10, 0, 0, 6, 5, 0, 4, 4, 8, 20, 0, 4, 4, 6, 11, 5, 4, 4, 0, 17, 5, 0, 0, 0, 9, 0, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 20, 4, 4, 4, 0, 0, 21, 0, 5, 5, 0, 9, 9, 0, 0, 0, 6, 0, 15, 0, 5, 0, 4, 0, 0, 0, 24, 0, 10, 0, 13, 0, 0, 0, 0, 6, 11, 0, 0, 4, 0, 0, 7, 0, 9, 14, 14, 4, 4, 4, 0, 0, 24, 13, 5, 0, 0, 0, 5, 0, 0, 14, 10, 0, 9, 21, 21, 0, 4, 4, 0, 6, 8, 0, 4, 4, 0, 
19, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 12, 0, 16, 0, 17, 5, 0, 0, 4, 4, 6, 11, 5, 0, 17, 0, 4, 4, 4, 4, 0, 0],
|
||||
# yes it is perfection she declared
|
||||
# 1284/1180/1284_1180_000036_000000.wav
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 4, 4, 4, 4, 0, 0, 10, 0, 6, 0, 4, 4, 0, 0, 10, 0, 0, 0, 0, 0, 12, 0, 4, 4, 0, 0, 0, 23, 0, 5, 0, 13, 13, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 5, 0, 0, 0, 19, 0, 0, 6, 6, 0, 10, 0, 8, 0, 9, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 12, 11, 11, 5, 0, 4, 4, 0, 14, 0, 5, 0, 0, 0, 0, 19, 15, 15, 0, 0, 7, 0, 0, 0, 13, 0, 5, 0, 14, 4, 4, 4, 4, 0, 0, 0],
|
||||
# then it must be somewhere in the blue forest
|
||||
# 1284/1180/1284_1180_000016_000002.wav
|
||||
[0, 0, 0, 6, 11, 5, 0, 9, 0, 4, 4, 10, 6, 4, 4, 0, 17, 17, 16, 0, 0, 12, 0, 6, 4, 4, 0, 24, 5, 5, 0, 0, 4, 4, 0, 0, 12, 12, 0, 8, 0, 0, 17, 5, 5, 0, 0, 18, 18, 11, 5, 0, 13, 13, 5, 0, 4, 4, 10, 9, 4, 4, 6, 11, 5, 4, 4, 0, 24, 15, 15, 16, 16, 0, 5, 5, 0, 0, 4, 4, 0, 0, 0, 20, 8, 8, 8, 0, 0, 0, 13, 13, 0, 5, 5, 0, 0, 0, 0, 0, 12, 12, 0, 0, 6, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0],
|
||||
# happy youth that is ready to pack its valus and start for cathay on an hour's notice
|
||||
# 4970/29093/4970_29093_000044_000002.wav
|
||||
[0, 0, 0, 0, 11, 0, 7, 23, 0, 0, 0, 0, 23, 0, 22, 22, 0, 0, 0, 4, 4, 0, 0, 22, 8, 8, 16, 16, 0, 0, 0, 6, 6, 11, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 11, 7, 6, 0, 4, 4, 10, 0, 0, 12, 0, 4, 0, 13, 13, 5, 0, 7, 0, 0, 14, 22, 0, 0, 0, 4, 0, 6, 0, 8, 4, 4, 0, 0, 0, 0, 0, 0, 23, 0, 7, 0, 0, 19, 0, 0, 26, 4, 4, 4, 10, 0, 6, 0, 12, 4, 4, 0, 0, 0, 25, 0, 7, 0, 0, 0, 15, 0, 0, 16, 0, 0, 0, 0, 12, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 9, 0, 14, 4, 4, 0, 12, 12, 0, 6, 0, 7, 0, 13, 0, 0, 0, 6, 0, 0, 4, 4, 0, 0, 0, 0, 20, 8, 0, 13, 0, 4, 4, 4, 0, 0, 19, 0, 7, 7, 0, 0, 0, 0, 0, 6, 11, 0, 0, 7, 0, 0, 0, 22, 0, 0, 0, 0, 0, 4, 4, 0, 0, 8, 0, 9, 0, 4, 4, 7, 9, 4, 4, 4, 0, 0, 0, 11, 8, 8, 16, 0, 0, 13, 13, 0, 0, 0, 27, 0, 12, 0, 4, 4, 0, 9, 8, 8, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 19, 5, 5, 0, 0, 4, 4, 4, 4, 4, 0],
|
||||
# well then i must make some suggestions to you
|
||||
# 1580/141084/1580_141084_000057_000000.wav
|
||||
[0, 0, 0, 0, 0, 0, 0, 18, 0, 5, 0, 15, 0, 0, 15, 15, 4, 4, 0, 0, 6, 11, 5, 0, 0, 0, 9, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 10, 0, 4, 4, 0, 17, 0, 16, 0, 0, 12, 0, 6, 0, 4, 4, 0, 17, 17, 7, 0, 26, 5, 5, 4, 4, 0, 12, 12, 8, 8, 17, 17, 5, 0, 4, 4, 4, 12, 12, 16, 0, 21, 0, 0, 0, 0, 21, 21, 0, 5, 0, 0, 0, 12, 0, 0, 0, 6, 6, 0, 10, 0, 8, 8, 9, 0, 0, 0, 0, 0, 0, 12, 0, 0, 4, 4, 0, 0, 6, 0, 8, 0, 4, 4, 4, 0, 0, 22, 22, 0, 8, 16, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0],
|
||||
# some others too big cotton county
|
||||
# 1995/1826/1995_1826_000010_000002.wav
|
||||
[0, 0, 0, 0, 12, 0, 8, 0, 17, 5, 4, 4, 0, 8, 0, 0, 6, 11, 5, 0, 13, 13, 0, 0, 12, 0, 4, 4, 0, 0, 6, 0, 8, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 10, 0, 0, 0, 0, 21, 0, 0, 4, 4, 4, 0, 0, 0, 19, 0, 8, 0, 6, 6, 0, 0, 0, 6, 8, 0, 9, 9, 0, 0, 4, 0, 0, 0, 0, 19, 8, 8, 16, 0, 9, 9, 0, 0, 6, 6, 0, 0, 22, 0, 0, 0, 0, 4, 4, 0, 0, 0],
|
||||
]
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='X:\\dlas\\experiments\\train_diffusion_tts5_medium.yml')
|
||||
parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator')
|
||||
parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='X:\\dlas\\experiments\\train_diffusion_tts5_medium\\models\\73000_generator_ema.pth')
|
||||
parser.add_argument('-sr_opt', type=str, help='Path to options YAML file used to train the SR diffusion model', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample.yml')
|
||||
parser.add_argument('-sr_diffusion_model_name', type=str, help='Name of the SR diffusion model in opt.', default='generator')
|
||||
parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample\\models\\7000_generator_ema.pth')
|
||||
parser.add_argument('-cond', type=str, help='Type of conditioning voice', default='carlin')
|
||||
parser.add_argument('-diffusion_steps', type=int, help='Number of diffusion steps to perform to create the generate. Lower steps reduces quality, but >40 is generally pretty good.', default=100)
|
||||
parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='../results/use_diffuse_tts')
|
||||
parser.add_argument('-device', type=str, help='Device to run on', default='cuda')
|
||||
args = parser.parse_args()
|
||||
os.makedirs(args.output_path, exist_ok=True)
|
||||
|
||||
# Fixed parameters.
|
||||
base_sample_rate = 5500
|
||||
sr_sample_rate = 22050
|
||||
|
||||
print("Loading Diffusion Models..")
|
||||
diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False,
|
||||
load_path=args.diffusion_model_path, device='cpu').eval()
|
||||
diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=args.diffusion_steps, schedule='cosine')
|
||||
aligned_codes_compression_factor = base_sample_rate * 221 // 11025
|
||||
sr_diffusion = load_model_from_config(args.sr_opt, args.sr_diffusion_model_name, also_load_savepoint=False,
|
||||
load_path=args.sr_diffusion_model_path, device='cpu').eval()
|
||||
sr_diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=args.diffusion_steps, schedule='linear')
|
||||
sr_cond = load_audio(conditioning_clips[args.cond], sr_sample_rate).to(args.device)
|
||||
if sr_cond.shape[-1] > 88000:
|
||||
sr_cond = sr_cond[:,:88000]
|
||||
cond = audio = torchaudio.functional.resample(sr_cond, sr_sample_rate, base_sample_rate)
|
||||
torchaudio.save(os.path.join(args.output_path, 'cond_base.wav'), cond.cpu(), base_sample_rate)
|
||||
torchaudio.save(os.path.join(args.output_path, 'cond_sr.wav'), sr_cond.cpu(), sr_sample_rate)
|
||||
|
||||
with torch.no_grad():
    # For every provided aligned-code sequence: synthesize at the base sample
    # rate first, then upsample that result with the SR diffusion model. Only
    # one model is kept on the compute device at a time.
    for p, code in enumerate(provided_codes):
        print("Loading data..")
        aligned_codes = torch.tensor(code).to(args.device)

        print("Performing initial diffusion..")
        # The output length is rounded up to a multiple of 2048 samples.
        output_shape = (1, 1, ceil_multiple(aligned_codes.shape[-1]*aligned_codes_compression_factor, 2048))
        # Honor -device instead of hard-coding CUDA: the noise and inputs below
        # are created on args.device, so the model has to be on it too.
        diffusion = diffusion.to(args.device)
        output_base = diffuser.p_sample_loop(diffusion, output_shape, noise=torch.zeros(output_shape, device=args.device),
                                             model_kwargs={'tokens': aligned_codes.unsqueeze(0),
                                                           'conditioning_input': cond.unsqueeze(0)})
        diffusion = diffusion.cpu()
        torchaudio.save(os.path.join(args.output_path, f'{p}_output_mean_base.wav'), output_base.cpu().squeeze(0), base_sample_rate)

        print("Performing SR diffusion..")
        # Integer ratio of the two sample rates (22050 // 5500 == 4).
        output_shape = (1, 1, output_base.shape[-1] * (sr_sample_rate // base_sample_rate))
        sr_diffusion = sr_diffusion.to(args.device)
        # Fixed: sample with sr_diffuser (the linear-schedule diffuser built for
        # the SR model). The cosine-schedule `diffuser` of the base model was
        # used here before, leaving sr_diffuser unused.
        output = sr_diffuser.p_sample_loop(sr_diffusion, output_shape, noise=torch.zeros(output_shape, device=args.device),
                                           model_kwargs={'tokens': aligned_codes.unsqueeze(0),
                                                         'conditioning_input': sr_cond.unsqueeze(0),
                                                         'lr_input': output_base})
        sr_diffusion = sr_diffusion.cpu()
        torchaudio.save(os.path.join(args.output_path, f'{p}_output_mean_sr.wav'), output.cpu().squeeze(0), sr_sample_rate)
|
Loading…
Reference in New Issue
Block a user