From 0fc877cbc87a1aabf2ea660109d7f4d8a8480431 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Tue, 15 Mar 2022 21:43:14 -0600
Subject: [PATCH] tts9: compute alignment size from channel_mult (was fixed 2048)

---
 codes/models/audio/tts/unet_diffusion_tts9.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/codes/models/audio/tts/unet_diffusion_tts9.py b/codes/models/audio/tts/unet_diffusion_tts9.py
index 1eeca530..243feab3 100644
--- a/codes/models/audio/tts/unet_diffusion_tts9.py
+++ b/codes/models/audio/tts/unet_diffusion_tts9.py
@@ -187,6 +187,7 @@ class DiffusionTts(nn.Module):
         self.super_sampling_max_noising_factor = super_sampling_max_noising_factor
         self.unconditioned_percentage = unconditioned_percentage
         self.enable_fp16 = use_fp16
+        self.alignment_size = 2 ** (len(channel_mult)+1)
         padding = 1 if kernel_size == 3 else 2
         down_kernel = 1 if efficient_convs else 3
 
@@ -414,7 +415,7 @@ class DiffusionTts(nn.Module):
 
         # Fix input size to the proper multiple of 2 so we don't get alignment errors going down and back up the U-net.
         orig_x_shape = x.shape[-1]
-        cm = ceil_multiple(x.shape[-1], 2048)
+        cm = ceil_multiple(x.shape[-1], self.alignment_size)
         if cm != 0:
             pc = (cm-x.shape[-1])/x.shape[-1]
             x = F.pad(x, (0,cm-x.shape[-1]))