From 8e2439f50d192660e6df3a2bfdfafa6104ee0562 Mon Sep 17 00:00:00 2001 From: James Betker Date: Thu, 20 Jan 2022 11:27:49 -0700 Subject: [PATCH] Decrease resolution requirements to 2048 --- codes/models/gpt_voice/unet_diffusion_tts_experimental.py | 2 +- codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py | 2 +- codes/scripts/audio/gen/speech_synthesis_utils.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py index b092180d..3621a8bc 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py +++ b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py @@ -297,7 +297,7 @@ class DiffusionTts(nn.Module): :return: an [N x C x ...] Tensor of outputs. """ orig_x_shape = x.shape[-1] - cm = ceil_multiple(x.shape[-1], 4096) + cm = ceil_multiple(x.shape[-1], 2048) if cm != 0: pc = (cm-x.shape[-1])/x.shape[-1] x = F.pad(x, (0,cm-x.shape[-1])) diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py index 7cfc3ce5..4dbc8b96 100644 --- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py +++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py @@ -310,7 +310,7 @@ class DiffusionVocoderWithRef(nn.Module): :param y: an [N] Tensor of labels, if class-conditional. :return: an [N x C x ...] Tensor of outputs. """ - assert x.shape[-1] % 4096 == 0 # This model operates at base//4096 at it's bottom levels, thus this requirement. + assert x.shape[-1] % 2048 == 0 # This model operates at base//2048 at its bottom levels, thus this requirement.
if self.conditioning_enabled: assert conditioning_input is not None diff --git a/codes/scripts/audio/gen/speech_synthesis_utils.py b/codes/scripts/audio/gen/speech_synthesis_utils.py index de72bf5a..6fc2ad31 100644 --- a/codes/scripts/audio/gen/speech_synthesis_utils.py +++ b/codes/scripts/audio/gen/speech_synthesis_utils.py @@ -65,9 +65,9 @@ def do_spectrogram_diffusion(diffusion_model, dvae_model, diffuser, mel_codes, c if plt_spec: plot_spectrogram(mel[0].cpu()) - # Pad MEL to multiples of 4096//spectrogram_compression_factor + # Pad MEL to multiples of 2048//spectrogram_compression_factor msl = mel.shape[-1] - dsl = 4096 // spectrogram_compression_factor + dsl = 2048 // spectrogram_compression_factor gap = dsl - (msl % dsl) if gap > 0: mel = torch.nn.functional.pad(mel, (0, gap))