diff --git a/codes/data/audio/fast_paired_dataset.py b/codes/data/audio/fast_paired_dataset.py index 20811f8c..074cd647 100644 --- a/codes/data/audio/fast_paired_dataset.py +++ b/codes/data/audio/fast_paired_dataset.py @@ -144,6 +144,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset): return self[rv] orig_output = wav.shape[-1] orig_text_len = tseq.shape[0] + orig_aligned_code_length = aligned_codes.shape[0] if wav.shape[-1] != self.max_wav_len: wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1])) # These codes are aligned to audio inputs, so make sure to pad them as well. @@ -154,6 +155,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset): 'real_text': text, 'padded_text': tseq, 'aligned_codes': aligned_codes, + 'aligned_code_lengths': orig_aligned_code_length, 'text_lengths': torch.tensor(orig_text_len, dtype=torch.long), 'wav': wav, 'wav_lengths': torch.tensor(orig_output, dtype=torch.long), diff --git a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py index 91d88986..b092180d 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py +++ b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py @@ -9,6 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from models.gpt_voice.mini_encoder import AudioMiniEncoder, EmbeddingCombiner +from scripts.audio.gen.use_diffuse_tts import ceil_multiple from trainer.networks import register_model from utils.util import get_mask_from_lengths from utils.util import checkpoint @@ -295,7 +296,12 @@ class DiffusionTts(nn.Module): :param tokens: an aligned text input. :return: an [N x C x ...] Tensor of outputs. """ - assert x.shape[-1] % 4096 == 0 # This model operates at base//4096 at it's bottom levels, thus this requirement. + orig_x_shape = x.shape[-1] + cm = ceil_multiple(x.shape[-1], 4096) + if cm != 0: + pc = (cm-x.shape[-1])/x.shape[-1] + x = F.pad(x, (0,cm-x.shape[-1])) + tokens = F.pad(tokens, (0,int(pc*tokens.shape[-1]))) if self.conditioning_enabled: assert conditioning_input is not None @@ -320,7 +326,8 @@ class DiffusionTts(nn.Module): h = torch.cat([h, hs.pop()], dim=1) h = module(h, emb) h = h.type(x.dtype) - return self.out(h) + out = self.out(h) + return out[:, :, :orig_x_shape] def benchmark(self, x, timesteps, tokens, conditioning_input): profile = OrderedDict() diff --git a/codes/trainer/ExtensibleTrainer.py b/codes/trainer/ExtensibleTrainer.py index 1df4b507..b7079941 100644 --- a/codes/trainer/ExtensibleTrainer.py +++ b/codes/trainer/ExtensibleTrainer.py @@ -183,12 +183,36 @@ class ExtensibleTrainer(BaseModel): o.zero_grad() torch.cuda.empty_cache() + sort_key = opt_get(self.opt, ['train', 'sort_key'], None) + if sort_key is not None: + sort_indices = torch.sort(data[sort_key]).indices + else: + sort_indices = None + batch_factor = self.batch_factor if perform_micro_batching else 1 self.dstate = {} for k, v in data.items(): + if sort_indices is not None: + if isinstance(v, list): + v = [v[i] for i in sort_indices] + else: + v = v[sort_indices] if isinstance(v, torch.Tensor): self.dstate[k] = [t.to(self.device) for t in torch.chunk(v, chunks=batch_factor, dim=0)] + if opt_get(self.opt, ['train', 'auto_collate'], False): + for k, v in self.dstate.items(): + if f'{k}_lengths' in self.dstate.keys(): + for c in range(len(v)): + maxlen = self.dstate[f'{k}_lengths'][c].max() + if len(v[c].shape) == 2: + self.dstate[k][c] = self.dstate[k][c][:, :maxlen] + elif len(v[c].shape) == 3: + self.dstate[k][c] = self.dstate[k][c][:, :, :maxlen] + elif len(v[c].shape) == 4: + self.dstate[k][c] = self.dstate[k][c][:, :, :, :maxlen] + + def optimize_parameters(self, step, optimize=True): # Some models need to make parametric adjustments per-step. Do that here. for net in self.networks.values():