From 8e2439f50d192660e6df3a2bfdfafa6104ee0562 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Thu, 20 Jan 2022 11:27:49 -0700
Subject: [PATCH] Decrease resolution requirements to 2048

---
 codes/models/gpt_voice/unet_diffusion_tts_experimental.py | 2 +-
 codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py | 2 +-
 codes/scripts/audio/gen/speech_synthesis_utils.py         | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
index b092180d..3621a8bc 100644
--- a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
+++ b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
@@ -297,7 +297,7 @@ class DiffusionTts(nn.Module):
         :return: an [N x C x ...] Tensor of outputs.
         """
         orig_x_shape = x.shape[-1]
-        cm = ceil_multiple(x.shape[-1], 4096)
+        cm = ceil_multiple(x.shape[-1], 2048)
         if cm != 0:
             pc = (cm-x.shape[-1])/x.shape[-1]
             x = F.pad(x, (0,cm-x.shape[-1]))
diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
index 7cfc3ce5..4dbc8b96 100644
--- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
+++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
@@ -310,7 +310,7 @@ class DiffusionVocoderWithRef(nn.Module):
         :param y: an [N] Tensor of labels, if class-conditional.
         :return: an [N x C x ...] Tensor of outputs.
         """
-        assert x.shape[-1] % 4096 == 0  # This model operates at base//4096 at it's bottom levels, thus this requirement.
+        assert x.shape[-1] % 2048 == 0  # This model operates at base//2048 at its bottom levels, thus this requirement.
         if self.conditioning_enabled:
             assert conditioning_input is not None
 
diff --git a/codes/scripts/audio/gen/speech_synthesis_utils.py b/codes/scripts/audio/gen/speech_synthesis_utils.py
index de72bf5a..6fc2ad31 100644
--- a/codes/scripts/audio/gen/speech_synthesis_utils.py
+++ b/codes/scripts/audio/gen/speech_synthesis_utils.py
@@ -65,9 +65,9 @@ def do_spectrogram_diffusion(diffusion_model, dvae_model, diffuser, mel_codes, c
         if plt_spec:
             plot_spectrogram(mel[0].cpu())
 
-        # Pad MEL to multiples of 4096//spectrogram_compression_factor
+        # Pad MEL to multiples of 2048//spectrogram_compression_factor
         msl = mel.shape[-1]
-        dsl = 4096 // spectrogram_compression_factor
+        dsl = 2048 // spectrogram_compression_factor
         gap = dsl - (msl % dsl)
         if gap > 0:
             mel = torch.nn.functional.pad(mel, (0, gap))