mild updates

This commit is contained in:
James Betker 2022-02-08 23:51:17 -07:00
parent ff35d13b99
commit 9e9ae328f2
3 changed files with 9 additions and 12 deletions

View File

@ -132,20 +132,17 @@ def register_ctc_code_generator(opt_net, opt):
def inf():
sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments\\models\\11000_generator.pth', map_location='cpu')
model = CtcCodeGenerator(layers=10).eval()
sd = torch.load('D:\\dlas\\experiments\\train_encoder_build_ctc_alignments_medium\\models\\24000_generator.pth', map_location='cpu')
model = CtcCodeGenerator(model_dim=1024,layers=32).eval()
model.load_state_dict(sd)
#raw_batch = torch.load('raw_batch.pth')
with torch.no_grad():
from data.audio.unsupervised_audio_dataset import load_audio
from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
#loss = model(wav_to_mel(raw_batch['conditioning'][0]),
# raw_batch['ctc_raw_codes'][0].unsqueeze(0),
# raw_batch['ctc_pads'][0].unsqueeze(0),
# raw_batch['ctc_repeats'][0].unsqueeze(0),
# raw_batch['ctc_raw_lengths'][0].unsqueeze(0),)
ref_mel = wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))
ctc = model.generate(ref_mel, ["i suppose though it's too early for them"])
ref_mel = torch.cat([wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450],
wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\kennard\\1.wav", 22050))[:,:,:450],
wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\grace\\1.wav", 22050))[:,:,:450],
wav_to_mel(load_audio("D:\\tortoise-tts\\voices\\atkins\\1.wav", 22050))[:,:,:450]], dim=0)
ctc = model.generate(ref_mel, (["i suppose though it's too early for them"] * 3) + ["i suppose though it's too early for them, dear"])
print("Break")

View File

@ -379,7 +379,7 @@ class DiffusionTts(nn.Module):
assert conditioning_input is not None
if self.super_sampling_enabled:
assert lr_input is not None
if self.super_sampling_max_noising_factor > 0:
if self.training and self.super_sampling_max_noising_factor > 0:
noising_factor = random.uniform(0,self.super_sampling_max_noising_factor)
lr_input = torch.randn_like(lr_input) * noising_factor + lr_input
lr_input = F.interpolate(lr_input, size=(x.shape[-1],), mode='nearest')

View File

@ -53,7 +53,7 @@ if __name__ == '__main__':
parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='X:\\dlas\\experiments\\train_diffusion_tts5_medium\\models\\73000_generator_ema.pth')
parser.add_argument('-sr_opt', type=str, help='Path to options YAML file used to train the SR diffusion model', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample.yml')
parser.add_argument('-sr_diffusion_model_name', type=str, help='Name of the SR diffusion model in opt.', default='generator')
parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample\\models\\26500_generator_ema.pth')
parser.add_argument('-sr_diffusion_model_path', type=str, help='Path to saved model weights for the SR diffuser', default='X:\\dlas\\experiments\\train_diffusion_tts6_upsample_continued\\models\\41000_generator_ema.pth')
parser.add_argument('-voice', type=str, help='Type of conditioning voice', default='puppy')
parser.add_argument('-diffusion_steps', type=int, help='Number of diffusion steps to perform to create the generate. Lower steps reduces quality, but >40 is generally pretty good.', default=100)
parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='../results/use_diffuse_voice_translation')