From 9e9ae328f2c1f9300defbf8ba93e03200c313b4d Mon Sep 17 00:00:00 2001 From: James Betker Date: Tue, 8 Feb 2022 23:51:17 -0700 Subject: [PATCH] mild updates --- codes/models/gpt_voice/ctc_code_generator.py | 17 +++++++---------- codes/models/gpt_voice/unet_diffusion_tts6.py | 2 +- .../audio/gen/use_diffuse_voice_translation.py | 2 +- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/codes/models/gpt_voice/ctc_code_generator.py b/codes/models/gpt_voice/ctc_code_generator.py index f2bb7da6..1cfe723a 100644 --- a/codes/models/gpt_voice/ctc_code_generator.py +++ b/codes/models/gpt_voice/ctc_code_generator.py @@ -132,20 +132,17 @@ def register_ctc_code_generator(opt_net, opt): def inf(): - sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments\\models\\11000_generator.pth', map_location='cpu') - model = CtcCodeGenerator(layers=10).eval() + sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments_medium\\models\\24000_generator.pth', map_location='cpu') + model = CtcCodeGenerator(model_dim=1024,layers=32).eval() model.load_state_dict(sd) - #raw_batch = torch.load('raw_batch.pth') with torch.no_grad(): from data.audio.unsupervised_audio_dataset import load_audio from scripts.audio.gen.speech_synthesis_utils import wav_to_mel - #loss = model(wav_to_mel(raw_batch['conditioning'][0]), - # raw_batch['ctc_raw_codes'][0].unsqueeze(0), - # raw_batch['ctc_pads'][0].unsqueeze(0), - # raw_batch['ctc_repeats'][0].unsqueeze(0), - # raw_batch['ctc_raw_lengths'][0].unsqueeze(0),) - ref_mel = wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050)) - ctc = model.generate(ref_mel, ["i suppose though it's too early for them"]) + ref_mel = torch.cat([wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450], + wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\kennard\\1.wav", 22050))[:,:,:450], + wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\grace\\1.wav", 22050))[:,:,:450], + wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450]], dim=0) + ctc = model.generate(ref_mel, (["i suppose though it's too early for them"] * 3) + ["i suppose though it's too early for them, dear"]) print("Break") diff --git a/codes/models/gpt_voice/unet_diffusion_tts6.py b/codes/models/gpt_voice/unet_diffusion_tts6.py index d518fbc3..6dd59254 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts6.py +++ b/codes/models/gpt_voice/unet_diffusion_tts6.py @@ -379,7 +379,7 @@ class DiffusionTts(nn.Module): assert conditioning_input is not None if self.super_sampling_enabled: assert lr_input is not None - if self.super_sampling_max_noising_factor > 0: + if self.training and self.super_sampling_max_noising_factor > 0: noising_factor = random.uniform(0,self.super_sampling_max_noising_factor) lr_input = torch.randn_like(lr_input) * noising_factor + lr_input lr_input = F.interpolate(lr_input, size=(x.shape[-1],), mode='nearest') diff --git a/codes/scripts/audio/gen/use_diffuse_voice_translation.py b/codes/scripts/audio/gen/use_diffuse_voice_translation.py index 5f440a9c..a333d6e9 100644 --- a/codes/scripts/audio/gen/use_diffuse_voice_translation.py +++ b/codes/scripts/audio/gen/use_diffuse_voice_translation.py @@ -53,7 +53,7 @@ if __name__ == '__main__': parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='X:\\dlas\\experiments\\train_diffusion_tts5_medium\\models\\73000_generator_ema.pth') parser.add_argument('-sr_opt', type=str, help='Path to options YAML file used to train the SR diffusion model', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample.yml') parser.add_argument('-sr_diffusion_model_name', type=str, help='Name of the SR diffusion model in opt.', default='generator') - parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample\\models\\26500_generator_ema.pth') + parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample_continued\\models\\41000_generator_ema.pth') parser.add_argument('-voice', type=str, help='Type of conditioning voice', default='puppy') parser.add_argument('-diffusion_steps', type=int, help='Number of diffusion steps to perform to create the generate. Lower steps reduces quality, but >40 is generally pretty good.', default=100) parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='../results/use_diffuse_voice_translation')