DL-Art-School/codes/models/tacotron2/wave_tacotron.py

from math import sqrt
import torch
from munch import munchify
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F, Flatten

from models.arch_util import ConvGnSilu
from models.diffusion.unet_diffusion import UNetModel, AttentionPool2d
from models.tacotron2.layers import ConvNorm, LinearNorm
from models.tacotron2.hparams import create_hparams
from models.tacotron2.tacotron2 import Prenet, Attention, Encoder
from trainer.networks import register_model
from models.tacotron2.taco_utils import get_mask_from_lengths
from utils.util import opt_get, checkpoint


class WavDecoder(nn.Module):
    """Attention-based autoregressive decoder that conditions a 1D diffusion U-Net.

    The waveform is cut into chunks of ``K`` samples. For every chunk an
    attention/decoder LSTM pair (teacher-forced on the previous real chunk)
    produces a context vector, which is handed to ``self.clarifier`` (a
    ``UNetModel``) as the embedding used while denoising the matching noised
    chunk.
    """

    def __init__(self, dec_channels, K_ms=40, sample_rate=8000, dropout_probability=.1):
        """
        :param dec_channels: width of the RNN states and of the context vector fed to the U-Net.
        :param K_ms: chunk duration in milliseconds.
        :param sample_rate: audio sample rate in Hz; together with K_ms it fixes the chunk size K.
        :param dropout_probability: dropout applied to both LSTM hidden states.
        """
        super().__init__()
        self.dec_channels = dec_channels
        self.K = int(sample_rate * (K_ms/1000))
        # Checked before building anything so a bad configuration fails fast.
        assert self.K % 64 == 0, 'K must be a multiple of 64, otherwise the UNetModel breaks.'
        self.clarifier = UNetModel(image_size=self.K,
                                   in_channels=1,
                                   model_channels=dec_channels // 4,  # This is a requirement to enable to load the embedding produced by the decoder into the unet model.
                                   out_channels=2,  # 2 channels: eps_pred and variance_pred
                                   num_res_blocks=2,
                                   attention_resolutions=(8,),
                                   dims=1,
                                   dropout=.1,
                                   channel_mult=(1,2,4,8),
                                   use_raw_y_as_embedding=True)
        # Encodes one chunk of K raw samples into a single dec_channels vector for the RNN.
        # Three stride-4 convolutions give a total stride of 64, which is presumably why the
        # pooling layer is sized K//64 - TODO confirm against ConvGnSilu's padding.
        self.pre_rnn = nn.Sequential(ConvGnSilu(1,32,kernel_size=5,convnd=nn.Conv1d),
                                     ConvGnSilu(32,64,kernel_size=5,stride=4,convnd=nn.Conv1d),
                                     ConvGnSilu(64,128,kernel_size=5,stride=4,convnd=nn.Conv1d),
                                     ConvGnSilu(128,256,kernel_size=5,stride=4,convnd=nn.Conv1d),
                                     ConvGnSilu(256,dec_channels,kernel_size=1,convnd=nn.Conv1d),
                                     AttentionPool2d(self.K//64,dec_channels,dec_channels//4))
        self.attention_rnn = nn.LSTMCell(dec_channels*2, dec_channels)
        self.attention_layer = Attention(dec_channels, dec_channels, dec_channels)
        # The original passed a positional `1` here, which lands on the `bias` argument.
        self.decoder_rnn = nn.LSTMCell(dec_channels*2, dec_channels, bias=True)
        self.linear_projection = LinearNorm(dec_channels*2, self.dec_channels)
        self.gate_layer = LinearNorm(self.dec_channels*2, 1, bias=True, w_init_gain='sigmoid')
        self.dropout_probability = dropout_probability

    def chunk_wav(self, wav):
        """Split a waveform into K-sample chunks, zero-padding the final chunk.

        :param wav: tensor whose last dimension is the sample axis.
        :return: (chunks, padding_needed) where chunks has the chunk axis inserted at dim 1
                 (shape (b,s,K) for a (b,n) input) and padding_needed is the number of zeros
                 appended to the last chunk (0 when n is a multiple of K).
        """
        wavs = list(torch.split(wav, self.K, dim=-1))
        # Only the last chunk can be short; pad it on the right so all chunks stack.
        padding_needed = self.K - wavs[-1].shape[-1]
        if padding_needed > 0:
            wavs[-1] = F.pad(wavs[-1], (0,padding_needed))

        wavs = torch.stack(wavs, dim=1)  # wavs.shape = (b,s,K) where s=decoder sequence length
        return wavs, padding_needed

    def prepare_decoder_inputs(self, inp):
        """Shift the chunk sequence right by one step for teacher forcing.

        :param inp: (b,s,K) chunked waveform.
        :return: (b,s,K) tensor whose first chunk is all zeros, followed by inp[:, :-1].
        """
        b,s,K = inp.shape
        # new_zeros keeps the dtype as well as the device of the input.
        first_frame = inp.new_zeros(b,1,K)
        return torch.cat([first_frame, inp[:,:-1]], dim=1)

    def initialize_decoder_states(self, memory, mask):
        """ Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory
        PARAMS
        ------
        memory: Encoder outputs; dim 0 is batch and dim 1 is the encoder time axis.
        mask: Mask for padded data if training, expects None for inference
        """
        B = memory.size(0)
        MAX_TIME = memory.size(1)

        # All states start at zero, on the same device and dtype as the encoder outputs.
        self.attention_hidden = memory.new_zeros(B, self.dec_channels)
        self.attention_cell = memory.new_zeros(B, self.dec_channels)

        self.decoder_hidden = memory.new_zeros(B, self.dec_channels)
        self.decoder_cell = memory.new_zeros(B, self.dec_channels)

        self.attention_weights = memory.new_zeros(B, MAX_TIME)
        self.attention_weights_cum = memory.new_zeros(B, MAX_TIME)
        self.attention_context = memory.new_zeros(B, self.dec_channels)

        self.memory = memory
        self.processed_memory = checkpoint(self.attention_layer.memory_layer, memory)
        self.mask = mask

    def teardown_states(self):
        """Drop every per-sequence tensor so nothing is kept alive between forward passes."""
        self.attention_hidden = None
        self.attention_cell = None
        self.decoder_hidden = None
        self.decoder_cell = None
        self.attention_weights = None
        self.attention_weights_cum = None
        self.attention_context = None
        self.memory = None
        self.processed_memory = None
        self.mask = None

    def produce_context(self, decoder_input):
        """ Runs one decoder step and produces a context plus a stop token prediction.
        PARAMS
        ------
        decoder_input: encoding of the previous chunk; it is concatenated with the
                       (B, dec_channels) attention context, so it must be (B, dec_channels).

        RETURNS
        -------
        decoder_output: output of linear_projection for this step
        gate_output: gate output energies
        attention_weights: attention weights over the encoder memory for this step
        """
        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        self.attention_hidden, self.attention_cell = self.attention_rnn(cell_input, (self.attention_hidden, self.attention_cell))
        self.attention_hidden = F.dropout(self.attention_hidden, self.dropout_probability, self.training)

        # The attention layer sees both the previous and the accumulated attention weights.
        attention_weights_cat = torch.cat((self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1)
        self.attention_context, self.attention_weights = checkpoint(self.attention_layer, self.attention_hidden, self.memory,
                                                                    self.processed_memory, attention_weights_cat, self.mask)

        self.attention_weights_cum += self.attention_weights
        decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(decoder_input, (self.decoder_hidden, self.decoder_cell))
        self.decoder_hidden = F.dropout(self.decoder_hidden, self.dropout_probability, self.training)

        decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
        decoder_output = checkpoint(self.linear_projection, decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)
        return decoder_output, gate_prediction, self.attention_weights

    def recombine(self, diffusion_eps, gate_outputs, alignments, padding_added):
        """Stitch per-chunk outputs back into per-sample sequences and strip the padding.

        :param diffusion_eps: (B, S, 2, K) U-Net output for every chunk.
        :param gate_outputs: list of S tensors of shape (B,), one per decoder step.
        :param alignments: list of S tensors of shape (B, T_text), one per decoder step.
        :param padding_added: number of zero samples appended to the last chunk (may be 0).
        :return: (diffusion_eps, gate_outputs, alignments) with shapes (B, 2, n), (B, n) and
                 (B, n, T_text), where n = S*K - padding_added.
        """
        # Sample i of the stitched waveform belongs to chunk i // K, so each per-chunk value
        # must be repeated K times in place (repeat_interleave) rather than tiling the whole
        # sequence K times (repeat).
        # S x (B, T_text) -> (B, S*K, T_text)
        alignments = torch.stack(alignments, dim=1).repeat_interleave(self.K, dim=1)
        # S x (B,) -> (B, S*K)
        gate_outputs = torch.stack(gate_outputs, dim=1).repeat_interleave(self.K, dim=1)

        b,s,_,K = diffusion_eps.shape
        # (B, S, 2, K) -> (B, 2, S*K)
        diffusion_eps = diffusion_eps.permute(0,2,1,3).reshape(b, 2, s*K)

        # Slice by an explicit end index: `[:-padding_added]` would yield an empty tensor
        # whenever no padding was added.
        valid = s*K - padding_added
        return diffusion_eps[:,:,:valid], gate_outputs[:,:valid], alignments[:,:valid]

    def forward(self, wav_noised, wav_real, timesteps, text_enc, memory_lengths):
        '''
        Performs a training forward pass with the given data.
        :param wav_noised: (b,n) diffused waveform tensor on the interval [-1,1]
        :param wav_real: (b,n) actual waveform tensor; must have the same length as wav_noised
        :param timesteps: 1-D tensor with one diffusion timestep per batch element
        :param text_enc: encoder outputs used as attention memory; dim 1 is read as the encoder
                         time axis (presumably (b,t,e) with e=self.dec_channels - TODO confirm)
        :param memory_lengths: per-item valid lengths of text_enc, used to mask the attention
        :return: (diffusion_eps, gate_outputs, alignments) with shapes (b,2,n), (b,n), (b,n,t)
        '''

        # Start by splitting up the provided waveforms into discrete segments.
        wav_noised, padding_added = self.chunk_wav(wav_noised)
        wav_real, _ = self.chunk_wav(wav_real)
        wav_real = self.prepare_decoder_inputs(wav_real)
        b,s,K = wav_real.shape
        # Encode every chunk independently: fold the sequence axis into the batch axis.
        wav_real = checkpoint(self.pre_rnn, wav_real.reshape(b*s,1,K)).reshape(b,s,self.dec_channels)

        self.initialize_decoder_states(text_enc, mask=~get_mask_from_lengths(memory_lengths))
        decoder_contexts, gate_outputs, alignments = [], [], []
        try:
            for step in range(wav_real.size(1)):
                dec_context, gate_output, attention_weights = self.produce_context(wav_real[:, step])
                decoder_contexts.append(dec_context)
                gate_outputs.append(gate_output.squeeze(1))  # (b,1) -> (b,)
                alignments.append(attention_weights)
        finally:
            # Release the cached states even if a step raises.
            self.teardown_states()

        # diffusion_inputs and wavs needs to have the sequence and batch dimensions combined, and needs a channel dimension
        diffusion_emb = torch.stack(decoder_contexts, dim=1)
        b,s,c = diffusion_emb.shape
        diffusion_emb = diffusion_emb.reshape(b*s,c)
        wav_noised = wav_noised.reshape(b*s,1,self.K)
        # The flattened batch is ordered item-major ([b0s0, b0s1, ..., b1s0, ...]), so each item's
        # timestep has to be repeated s times consecutively to stay aligned with its chunks.
        chunk_timesteps = timesteps.repeat_interleave(s)
        diffusion_eps = self.clarifier(wav_noised, chunk_timesteps, diffusion_emb).reshape(b,s,2,self.K)
        # Recombine diffusion outputs across the sequence into a single prediction.
        diffusion_eps, gate_outputs, alignments = self.recombine(diffusion_eps, gate_outputs, alignments, padding_added)
        return diffusion_eps, gate_outputs, alignments


class WaveTacotron2(nn.Module):
    """Text-to-waveform diffusion model: symbol embedding -> ``Encoder`` -> ``WavDecoder``."""

    def __init__(self, hparams):
        """
        :param hparams: attribute-style hyperparameter object; this constructor reads
                        mask_padding, fp16_run, n_mel_channels, n_frames_per_step, n_symbols,
                        symbols_embedding_dim and encoder_embedding_dim, and also hands the
                        whole object to ``Encoder``.
        """
        super().__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step

        n_symbols = hparams.n_symbols
        embedding_dim = hparams.symbols_embedding_dim
        self.embedding = nn.Embedding(n_symbols, embedding_dim)
        # Uniform init on [-bound, bound], where bound is sqrt(3) times the target std.
        bound = sqrt(3.0) * sqrt(2.0 / (n_symbols + embedding_dim))
        self.embedding.weight.data.uniform_(-bound, bound)

        self.encoder = Encoder(hparams)
        self.decoder = WavDecoder(hparams.encoder_embedding_dim)

    def parse_output(self, outputs, output_lengths=None):
        """Mask the padded tail of the predictions in place.

        :param outputs: list [eps_pred, gate_outputs, alignments]; it is mutated and returned.
        :param output_lengths: valid length of each item, or None to skip masking.
        :return: the same list. Only when masking runs, eps_pred is zeroed past each length and
                 gains an extra dimension at index 1, and the gate energies past each length
                 are set to 1e3. Alignments are never touched.
        """
        if not self.mask_padding or output_lengths is None:
            return outputs

        eps_pred = outputs[0]
        # True wherever a position lies beyond the item's valid length.
        padded = ~get_mask_from_lengths(output_lengths, eps_pred.shape[-1])
        # Duplicate the mask across the two prediction channels.
        eps_mask = padded.unsqueeze(1).repeat(1, 2, 1)

        eps_pred.data.masked_fill_(eps_mask, 0.0)
        outputs[0] = eps_pred.unsqueeze(1)  # Re-add channel dimension.
        outputs[1].data.masked_fill_(padded, 1e3)  # gate energies
        return outputs

    def forward(self, wavs_diffused, wavs_corrected, timesteps, text_inputs, text_lengths, output_lengths):
        """Run a training pass and return [eps_pred, gate_outputs, alignments].

        :param wavs_diffused: noised waveforms with a singleton channel axis at dim 1.
        :param wavs_corrected: matching clean waveforms, same layout.
        :param timesteps: diffusion timesteps, forwarded untouched to the decoder.
        :param text_inputs: integer symbol ids fed to the embedding table.
        :param text_lengths: valid length of each text sequence.
        :param output_lengths: valid length of each waveform, used for output masking.
        """
        # Only single-channel audio is handled, so the channel axis is dropped up front.
        noised = wavs_diffused.squeeze(dim=1)
        clean = wavs_corrected.squeeze(dim=1)

        text_lengths = text_lengths.data
        output_lengths = output_lengths.data

        # The embedding's last two axes are swapped before it is passed to the encoder.
        symbols = self.embedding(text_inputs).transpose(1, 2)
        memory = checkpoint(self.encoder, symbols, text_lengths)

        decoded = self.decoder(noised, clean, timesteps, memory, memory_lengths=text_lengths)
        eps_pred, gate_outputs, alignments = decoded
        return self.parse_output([eps_pred, gate_outputs, alignments], output_lengths)


@register_model
def register_diffusion_wavetron(opt_net, opt):
    """Build a WaveTacotron2 from the default hparams overridden by ``opt_net``.

    :param opt_net: mapping of hyperparameter overrides applied on top of create_hparams().
    :param opt: accepted for the registration interface; not used here.
    """
    config = create_hparams()
    config.update(opt_net)
    # munchify wraps the mapping so the model can read settings as attributes.
    return WaveTacotron2(munchify(config))


if __name__ == '__main__':
    # Smoke test: push a random batch of two clips through the model and print output shapes.
    tron = register_diffusion_wavetron({}, {})
    out = tron(wavs_diffused=torch.randn(2, 1, 22000),
               wavs_corrected=torch.randn(2, 1, 22000),
               timesteps=torch.LongTensor([555, 543]),
               text_inputs=torch.randint(high=24, size=(2,12)),
               text_lengths=torch.tensor([12, 12]),
               # One length per batch item; a single entry only worked through mask broadcasting.
               output_lengths=torch.tensor([21995, 21995]))
    print([o.shape for o in out])
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`from math import sqrt`
			`import torch`
			`from munch import munchify`
			`from torch.autograd import Variable`
			`from torch import nn`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`from torch.nn import functional as F, Flatten`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`from models.arch_util import ConvGnSilu`
			`from models.diffusion.unet_diffusion import UNetModel, AttentionPool2d`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`from models.tacotron2.layers import ConvNorm, LinearNorm`
			`from models.tacotron2.hparams import create_hparams`
			`from models.tacotron2.tacotron2 import Prenet, Attention, Encoder`
			`from trainer.networks import register_model`
			`from models.tacotron2.taco_utils import get_mask_from_lengths`
			`from utils.util import opt_get, checkpoint`


More work on wave-diffusion 2021-07-27 11:36:17 +00:00
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`class WavDecoder(nn.Module):`
Allow audio sample rate interpolation for faster training 2021-07-26 23:44:06 +00:00			`def __init__(self, dec_channels, K_ms=40, sample_rate=8000, dropout_probability=.1):`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`super().__init__()`
			`self.dec_channels = dec_channels`
Allow audio sample rate interpolation for faster training 2021-07-26 23:44:06 +00:00			`self.K = int(sample_rate * (K_ms/1000))`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`self.clarifier = UNetModel(image_size=self.K,`
			`in_channels=1,`
			`model_channels=dec_channels // 4, # This is a requirement to enable to load the embedding produced by the decoder into the unet model.`
			`out_channels=2, # 2 channels: eps_pred and variance_pred`
			`num_res_blocks=2,`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`attention_resolutions=(8,),`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`dims=1,`
			`dropout=.1,`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`channel_mult=(1,2,4,8),`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`use_raw_y_as_embedding=True)`
			`assert self.K % 64 == 0 # Otherwise the UNetModel breaks.`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`self.pre_rnn = nn.Sequential(ConvGnSilu(1,32,kernel_size=5,convnd=nn.Conv1d),`
			`ConvGnSilu(32,64,kernel_size=5,stride=4,convnd=nn.Conv1d),`
			`ConvGnSilu(64,128,kernel_size=5,stride=4,convnd=nn.Conv1d),`
			`ConvGnSilu(128,256,kernel_size=5,stride=4,convnd=nn.Conv1d),`
			`ConvGnSilu(256,dec_channels,kernel_size=1,convnd=nn.Conv1d),`
			`AttentionPool2d(self.K//64,dec_channels,dec_channels//4))`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`self.attention_rnn = nn.LSTMCell(dec_channels*2, dec_channels)`
			`self.attention_layer = Attention(dec_channels, dec_channels, dec_channels)`
			`self.decoder_rnn = nn.LSTMCell(dec_channels*2, dec_channels, 1)`
			`self.linear_projection = LinearNorm(dec_channels*2, self.dec_channels)`
			`self.gate_layer = LinearNorm(self.dec_channels*2, 1, bias=True, w_init_gain='sigmoid')`
			`self.dropout_probability = dropout_probability`

			`def chunk_wav(self, wav):`
			`wavs = list(torch.split(wav, self.K, dim=-1))`
			`# Pad the last chunk as needed.`
			`padding_needed = self.K - wavs[-1].shape[-1]`
			`if padding_needed > 0:`
			`wavs[-1] = F.pad(wavs[-1], (0,padding_needed))`

			`wavs = torch.stack(wavs, dim=1) # wavs.shape = (b,s,K) where s=decoder sequence length`
			`return wavs, padding_needed`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`def prepare_decoder_inputs(self, inp):`
			`# inp.shape = (b,s,K) chunked waveform.`
			`b,s,K = inp.shape`
			`first_frame = torch.zeros(b,1,K).to(inp.device)`
			`x = torch.cat([first_frame, inp[:,:-1]], dim=1) # It is now aligned for teacher forcing.`
			`return x`

			`def initialize_decoder_states(self, memory, mask):`
			`""" Initializes attention rnn states, decoder rnn states, attention`
			`weights, attention cumulative weights, attention context, stores memory`
			`and stores processed memory`
			`PARAMS`
			`------`
			`memory: Encoder outputs`
			`mask: Mask for padded data if training, expects None for inference`
			`"""`
			`B = memory.size(0)`
			`MAX_TIME = memory.size(1)`

			`self.attention_hidden = Variable(memory.data.new(B, self.dec_channels).zero_())`
			`self.attention_cell = Variable(memory.data.new(B, self.dec_channels).zero_())`

			`self.decoder_hidden = Variable(memory.data.new(B, self.dec_channels).zero_())`
			`self.decoder_cell = Variable(memory.data.new(B, self.dec_channels).zero_())`

			`self.attention_weights = Variable(memory.data.new(B, MAX_TIME).zero_())`
			`self.attention_weights_cum = Variable(memory.data.new(B, MAX_TIME).zero_())`
			`self.attention_context = Variable(memory.data.new(B, self.dec_channels).zero_())`

			`self.memory = memory`
			`self.processed_memory = checkpoint(self.attention_layer.memory_layer, memory)`
			`self.mask = mask`

			`def teardown_states(self):`
			`self.attention_hidden = None`
			`self.attention_cell = None`
			`self.decoder_hidden = None`
			`self.decoder_cell = None`
			`self.attention_weights = None`
			`self.attention_weights_cum = None`
			`self.attention_context = None`
			`self.memory = None`
			`self.processed_memory = None`

			`def produce_context(self, decoder_input):`
			`""" Produces a context and a stop token prediction using the built-in RNN.`
			`PARAMS`
			`------`
			`decoder_input: prior diffusion step that has been resolved.`

			`RETURNS`
			`-------`
			`mel_output:`
			`gate_output: gate output energies`
			`attention_weights:`
			`"""`
			`cell_input = torch.cat((decoder_input, self.attention_context), -1)`
			`self.attention_hidden, self.attention_cell = self.attention_rnn(cell_input, (self.attention_hidden, self.attention_cell))`
			`self.attention_hidden = F.dropout(self.attention_hidden, self.dropout_probability, self.training)`

			`attention_weights_cat = torch.cat((self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1)`
			`self.attention_context, self.attention_weights = checkpoint(self.attention_layer, self.attention_hidden, self.memory,`
			`self.processed_memory, attention_weights_cat, self.mask)`

			`self.attention_weights_cum += self.attention_weights`
			`decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)`
			`self.decoder_hidden, self.decoder_cell = self.decoder_rnn(decoder_input, (self.decoder_hidden, self.decoder_cell))`
			`self.decoder_hidden = F.dropout(self.decoder_hidden, self.dropout_probability, self.training)`

			`decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)`
			`decoder_output = checkpoint(self.linear_projection, decoder_hidden_attention_context)`

			`gate_prediction = self.gate_layer(decoder_hidden_attention_context)`
			`return decoder_output, gate_prediction, self.attention_weights`

			`def recombine(self, diffusion_eps, gate_outputs, alignments, padding_added):`
			`# (T_out, B) -> (B, T_out)`
			`alignments = torch.stack(alignments, dim=1).repeat(1, self.K, 1)`
			`# (T_out, B) -> (B, T_out)`
			`gate_outputs = torch.stack(gate_outputs, dim=1).repeat(1, self.K)`

			`b,s,_,K = diffusion_eps.shape`
			`# (B, S, 2, K) -> (B, 2, S*K)`
			`diffusion_eps = diffusion_eps.permute(0,2,1,3).reshape(b, 2, s*K)`

			`return diffusion_eps[:,:,:-padding_added], gate_outputs[:,:-padding_added], alignments[:,:-padding_added]`

More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`def forward(self, wav_noised, wav_real, timesteps, text_enc, memory_lengths):`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`'''`
			`Performs a training forward pass with the given data.`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`:param wav_noised: (b,n) diffused waveform tensor on the interval [-1,1]`
			`:param wav_real: (b,n) actual waveform tensor`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`:param text_enc: (b,e) embedding post-encoder with e=self.dec_channels`
			`'''`

			`# Start by splitting up the provided waveforms into discrete segments.`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`wav_noised, padding_added = self.chunk_wav(wav_noised)`
			`wav_real, _ = self.chunk_wav(wav_real)`
			`wav_real = self.prepare_decoder_inputs(wav_real)`
			`b,s,K = wav_real.shape`
			`wav_real = checkpoint(self.pre_rnn, wav_real.reshape(b*s,1,K)).reshape(b,s,self.dec_channels)`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00
			`self.initialize_decoder_states(text_enc, mask=~get_mask_from_lengths(memory_lengths))`
			`decoder_contexts, gate_outputs, alignments = [], [], []`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`while len(decoder_contexts) < wav_real.size(1):`
			`decoder_input = wav_real[:, len(decoder_contexts)]`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`dec_context, gate_output, attention_weights = self.produce_context(decoder_input)`
			`decoder_contexts += [dec_context.squeeze(1)]`
			`gate_outputs += [gate_output.squeeze(1)]`
			`alignments += [attention_weights]`
			`self.teardown_states()`

			`# diffusion_inputs and wavs needs to have the sequence and batch dimensions combined, and needs a channel dimension`
			`diffusion_emb = torch.stack(decoder_contexts, dim=1)`
			`b,s,c = diffusion_emb.shape`
			`diffusion_emb = diffusion_emb.reshape(b*s,c)`
More work on wave-diffusion 2021-07-27 11:36:17 +00:00			`wav_noised = wav_noised.reshape(b*s,1,self.K)`
			`diffusion_eps = self.clarifier(wav_noised, timesteps.repeat(s), diffusion_emb).reshape(b,s,2,self.K)`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00			`# Recombine diffusion outputs across the sequence into a single prediction.`
			`diffusion_eps, gate_outputs, alignments = self.recombine(diffusion_eps, gate_outputs, alignments, padding_added)`
			`return diffusion_eps, gate_outputs, alignments`


			`class WaveTacotron2(nn.Module):`
			`def __init__(self, hparams):`
			`super().__init__()`
			`self.mask_padding = hparams.mask_padding`
			`self.fp16_run = hparams.fp16_run`
			`self.n_mel_channels = hparams.n_mel_channels`
			`self.n_frames_per_step = hparams.n_frames_per_step`
			`self.embedding = nn.Embedding(`
			`hparams.n_symbols, hparams.symbols_embedding_dim)`
			`std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))`
			`val = sqrt(3.0) * std # uniform bounds for std`
			`self.embedding.weight.data.uniform_(-val, val)`
			`self.encoder = Encoder(hparams)`
			`self.decoder = WavDecoder(hparams.encoder_embedding_dim)`

			`def parse_output(self, outputs, output_lengths=None):`
			`if self.mask_padding and output_lengths is not None:`
			`mask_fill = outputs[0].shape[-1]`
			`mask = ~get_mask_from_lengths(output_lengths, mask_fill)`
Allow audio sample rate interpolation for faster training 2021-07-26 23:44:06 +00:00			`mask = mask.unsqueeze(1).repeat(1,2,1)`
Add support for a gaussian-diffusion-based wave tacotron 2021-07-26 22:27:31 +00:00
			`outputs[0].data.masked_fill_(mask, 0.0)`
			`outputs[0] = outputs[0].unsqueeze(1) # Re-add channel dimension.`
			`outputs[1].data.masked_fill_(mask[:,0], 1e3) # gate energies`

			`return outputs`

			`def forward(self, wavs_diffused, wavs_corrected, timesteps, text_inputs, text_lengths, output_lengths):`
			`# Squeeze the channel dimension out of the input wavs - we only handle single-channel audio here.`
			`wavs_diffused = wavs_diffused.squeeze(dim=1)`
			`wavs_corrected = wavs_corrected.squeeze(dim=1)`

			`text_lengths, output_lengths = text_lengths.data, output_lengths.data`
			`embedded_inputs = self.embedding(text_inputs).transpose(1, 2)`
			`encoder_outputs = checkpoint(self.encoder, embedded_inputs, text_lengths)`
			`eps_pred, gate_outputs, alignments = self.decoder(`
			`wavs_diffused, wavs_corrected, timesteps, encoder_outputs, memory_lengths=text_lengths)`

			`return self.parse_output([eps_pred, gate_outputs, alignments], output_lengths)`


			`@register_model`
			`def register_diffusion_wavetron(opt_net, opt):`
			`hparams = create_hparams()`
			`hparams.update(opt_net)`
			`hparams = munchify(hparams)`
			`return WaveTacotron2(hparams)`


			`if __name__ == '__main__':`
			`tron = register_diffusion_wavetron({}, {})`
			`out = tron(wavs_diffused=torch.randn(2, 1, 22000),`
			`wavs_corrected=torch.randn(2, 1, 22000),`
			`timesteps=torch.LongTensor([555, 543]),`
			`text_inputs=torch.randint(high=24, size=(2,12)),`
			`text_lengths=torch.tensor([12, 12]),`
			`output_lengths=torch.tensor([21995]))`
			`print([o.shape for o in out])`