Decrease resolution requirements to 2048

2022-01-20 11:27:49 -07:00 · 2022-01-20 11:27:49 -07:00 · 8e2439f50d
commit 8e2439f50d
parent 4af8525dc3
3 changed files with 4 additions and 4 deletions
--- a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
+++ b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
@@ -297,7 +297,7 @@ class DiffusionTts(nn.Module):
        :return: an [N x C x ...] Tensor of outputs.
        """
        orig_x_shape = x.shape[-1]
-        cm = ceil_multiple(x.shape[-1], 4096)
+        cm = ceil_multiple(x.shape[-1], 2048)
        if cm != 0:
            pc = (cm-x.shape[-1])/x.shape[-1]
            x = F.pad(x, (0,cm-x.shape[-1]))
--- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
+++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
@@ -310,7 +310,7 @@ class DiffusionVocoderWithRef(nn.Module):
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        """
-        assert x.shape[-1] % 4096 == 0  # This model operates at base//4096 at it's bottom levels, thus this requirement.
+        assert x.shape[-1] % 2048 == 0  # This model operates at base//2048 at it's bottom levels, thus this requirement.
        if self.conditioning_enabled:
            assert conditioning_input is not None
--- a/codes/scripts/audio/gen/speech_synthesis_utils.py
+++ b/codes/scripts/audio/gen/speech_synthesis_utils.py
@@ -65,9 +65,9 @@ def do_spectrogram_diffusion(diffusion_model, dvae_model, diffuser, mel_codes, c
        if plt_spec:
            plot_spectrogram(mel[0].cpu())
-        # Pad MEL to multiples of 4096//spectrogram_compression_factor
+        # Pad MEL to multiples of 2048//spectrogram_compression_factor
        msl = mel.shape[-1]
-        dsl = 4096 // spectrogram_compression_factor
+        dsl = 2048 // spectrogram_compression_factor
        gap = dsl - (msl % dsl)
        if gap > 0:
            mel = torch.nn.functional.pad(mel, (0, gap))