From 9e9ae328f2c1f9300defbf8ba93e03200c313b4d Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Tue, 8 Feb 2022 23:51:17 -0700
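Subject: [PATCH] Mild updates: CTC inference demo, SR noising, SR checkpoint

- ctc_code_generator.py: inf() now loads the 24000-step checkpoint from
  train_encoder_build_ctc_alignments_medium into
  CtcCodeGenerator(model_dim=1024, layers=32), and calls generate() on a
  batch of four reference mels (each cut to the first 450 frames) with
  four text prompts. The commented-out raw_batch code is removed.
- unet_diffusion_tts6.py: random noise is added to lr_input only when
  self.training is set, so it is no longer applied in eval mode.
- use_diffuse_voice_translation.py: the default for
  -sr_diffusion_model_path now points at the 41000-step checkpoint of
  train_diffusion_tts6_upsample_continued.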

---
 codes/models/gpt_voice/ctc_code_generator.py    | 17 +++++++----------
 codes/models/gpt_voice/unet_diffusion_tts6.py   |  2 +-
 .../audio/gen/use_diffuse_voice_translation.py  |  2 +-
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/codes/models/gpt_voice/ctc_code_generator.py b/codes/models/gpt_voice/ctc_code_generator.py
index f2bb7da6..1cfe723a 100644
--- a/codes/models/gpt_voice/ctc_code_generator.py
+++ b/codes/models/gpt_voice/ctc_code_generator.py
@@ -132,20 +132,17 @@ def register_ctc_code_generator(opt_net, opt):
 
 
 def inf():
-    sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments\\models\\11000_generator.pth', map_location='cpu')
-    model = CtcCodeGenerator(layers=10).eval()
+    sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments_medium\\models\\24000_generator.pth', map_location='cpu')
+    model = CtcCodeGenerator(model_dim=1024,layers=32).eval()
     model.load_state_dict(sd)
-    #raw_batch = torch.load('raw_batch.pth')
     with torch.no_grad():
         from data.audio.unsupervised_audio_dataset import load_audio
         from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
-        #loss = model(wav_to_mel(raw_batch['conditioning'][0]),
-        #             raw_batch['ctc_raw_codes'][0].unsqueeze(0),
-        #             raw_batch['ctc_pads'][0].unsqueeze(0),
-        #             raw_batch['ctc_repeats'][0].unsqueeze(0),
-        #             raw_batch['ctc_raw_lengths'][0].unsqueeze(0),)
-        ref_mel = wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))
-        ctc = model.generate(ref_mel, ["i suppose though it's too early for them"])
+        ref_mel = torch.cat([wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450],
+                             wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\kennard\\1.wav", 22050))[:,:,:450],
+                             wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\grace\\1.wav", 22050))[:,:,:450],
+                             wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450]], dim=0)
+        ctc = model.generate(ref_mel, (["i suppose though it's too early for them"] * 3) + ["i suppose though it's too early for them, dear"])
     print("Break")
 
 
diff --git a/codes/models/gpt_voice/unet_diffusion_tts6.py b/codes/models/gpt_voice/unet_diffusion_tts6.py
index d518fbc3..6dd59254 100644
--- a/codes/models/gpt_voice/unet_diffusion_tts6.py
+++ b/codes/models/gpt_voice/unet_diffusion_tts6.py
@@ -379,7 +379,7 @@ class DiffusionTts(nn.Module):
         assert conditioning_input is not None
         if self.super_sampling_enabled:
             assert lr_input is not None
-            if self.super_sampling_max_noising_factor > 0:
+            if self.training and self.super_sampling_max_noising_factor > 0:
                 noising_factor = random.uniform(0,self.super_sampling_max_noising_factor)
                 lr_input = torch.randn_like(lr_input) * noising_factor + lr_input
             lr_input = F.interpolate(lr_input, size=(x.shape[-1],), mode='nearest')
diff --git a/codes/scripts/audio/gen/use_diffuse_voice_translation.py b/codes/scripts/audio/gen/use_diffuse_voice_translation.py
index 5f440a9c..a333d6e9 100644
--- a/codes/scripts/audio/gen/use_diffuse_voice_translation.py
+++ b/codes/scripts/audio/gen/use_diffuse_voice_translation.py
@@ -53,7 +53,7 @@ if __name__ == '__main__':
     parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='X:\\dlas\\experiments\\train_diffusion_tts5_medium\\models\\73000_generator_ema.pth')
     parser.add_argument('-sr_opt', type=str, help='Path to options YAML file used to train the SR diffusion model', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample.yml')
     parser.add_argument('-sr_diffusion_model_name', type=str, help='Name of the SR diffusion model in opt.', default='generator')
-    parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample\\models\\26500_generator_ema.pth')
+    parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample_continued\\models\\41000_generator_ema.pth')
     parser.add_argument('-voice', type=str, help='Type of conditioning voice', default='puppy')
     parser.add_argument('-diffusion_steps', type=int, help='Number of diffusion steps to perform to create the generate. Lower steps reduces quality, but >40 is generally pretty good.', default=100)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='../results/use_diffuse_voice_translation')