mild updates

2022-02-08 23:51:17 -07:00 · 2022-02-08 23:51:17 -07:00 · 9e9ae328f2
commit 9e9ae328f2
parent ff35d13b99
3 changed files with 9 additions and 12 deletions
--- a/codes/models/gpt_voice/ctc_code_generator.py
+++ b/codes/models/gpt_voice/ctc_code_generator.py
@@ -132,20 +132,17 @@ def register_ctc_code_generator(opt_net, opt):


 def inf():
-    sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments\\models\\11000_generator.pth', map_location='cpu')
-    model = CtcCodeGenerator(layers=10).eval()
+    sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments_medium\\models\\24000_generator.pth', map_location='cpu')
+    model = CtcCodeGenerator(model_dim=1024,layers=32).eval()
    model.load_state_dict(sd)
-    #raw_batch = torch.load('raw_batch.pth')
    with torch.no_grad():
        from data.audio.unsupervised_audio_dataset import load_audio
        from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
-        #loss = model(wav_to_mel(raw_batch['conditioning'][0]),
-        #             raw_batch['ctc_raw_codes'][0].unsqueeze(0),
-        #             raw_batch['ctc_pads'][0].unsqueeze(0),
-        #             raw_batch['ctc_repeats'][0].unsqueeze(0),
-        #             raw_batch['ctc_raw_lengths'][0].unsqueeze(0),)
-        ref_mel = wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))
-        ctc = model.generate(ref_mel, ["i suppose though it's too early for them"])
+        ref_mel = torch.cat([wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450],
+                             wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\kennard\\1.wav", 22050))[:,:,:450],
+                             wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\grace\\1.wav", 22050))[:,:,:450],
+                             wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450]], dim=0)
+        ctc = model.generate(ref_mel, (["i suppose though it's too early for them"] * 3) + ["i suppose though it's too early for them, dear"])
    print("Break")


--- a/codes/models/gpt_voice/unet_diffusion_tts6.py
+++ b/codes/models/gpt_voice/unet_diffusion_tts6.py
@@ -379,7 +379,7 @@ class DiffusionTts(nn.Module):
        assert conditioning_input is not None
        if self.super_sampling_enabled:
            assert lr_input is not None
-            if self.super_sampling_max_noising_factor > 0:
+            if self.training and self.super_sampling_max_noising_factor > 0:
                noising_factor = random.uniform(0,self.super_sampling_max_noising_factor)
                lr_input = torch.randn_like(lr_input) * noising_factor + lr_input
            lr_input = F.interpolate(lr_input, size=(x.shape[-1],), mode='nearest')
--- a/codes/scripts/audio/gen/use_diffuse_voice_translation.py
+++ b/codes/scripts/audio/gen/use_diffuse_voice_translation.py
@@ -53,7 +53,7 @@ if __name__ == '__main__':
    parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='X:\\dlas\\experiments\\train_diffusion_tts5_medium\\models\\73000_generator_ema.pth')
    parser.add_argument('-sr_opt', type=str, help='Path to options YAML file used to train the SR diffusion model', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample.yml')
    parser.add_argument('-sr_diffusion_model_name', type=str, help='Name of the SR diffusion model in opt.', default='generator')
-    parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample\\models\\26500_generator_ema.pth')
+    parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample_continued\\models\\41000_generator_ema.pth')
    parser.add_argument('-voice', type=str, help='Type of conditioning voice', default='puppy')
    parser.add_argument('-diffusion_steps', type=int, help='Number of diffusion steps to perform to create the generate. Lower steps reduces quality, but >40 is generally pretty good.', default=100)
    parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='../results/use_diffuse_voice_translation')