diff --git a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
index 3621a8bc..aba7d926 100644
--- a/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
+++ b/codes/models/gpt_voice/unet_diffusion_tts_experimental.py
@@ -11,7 +11,6 @@ import torch.nn.functional as F
 from models.gpt_voice.mini_encoder import AudioMiniEncoder, EmbeddingCombiner
 from scripts.audio.gen.use_diffuse_tts import ceil_multiple
 from trainer.networks import register_model
-from utils.util import get_mask_from_lengths
 from utils.util import checkpoint
 
 
diff --git a/codes/scripts/audio/gen/use_diffuse_tts.py b/codes/scripts/audio/gen/use_diffuse_tts.py
index d6121441..a6105146 100644
--- a/codes/scripts/audio/gen/use_diffuse_tts.py
+++ b/codes/scripts/audio/gen/use_diffuse_tts.py
@@ -20,9 +20,9 @@ def ceil_multiple(base, multiple):
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='../options/train_diffusion_tts.yml')
+    parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model', default='../options/train_diffusion_tts_medium.yml')
     parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator')
-    parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='../experiments/train_diffusion_tts_experimental_fp16\\models\\17800_generator_ema.pth')
+    parser.add_argument('-diffusion_model_path', type=str, help='Path to saved model weights', default='X:\\dlas\\experiments\\train_diffusion_tts_medium\\models\\5200_generator.pth')
     parser.add_argument('-aligned_codes', type=str, help='Comma-delimited list of integer codes that defines text & prosody. Get this by apply W2V to an existing audio clip or from a bespoke generator.', default='0,0,0,0,10,10,0,4,0,7,0,17,4,4,0,25,5,0,13,13,0,22,4,4,0,21,15,15,7,0,0,14,4,4,6,8,4,4,0,0,12,5,0,0,5,0,4,4,22,22,8,16,16,0,4,4,4,0,0,0,0,0,0,0')  # Default: 'i am very glad to see you', libritts/train-clean-100/103/1241/103_1241_000017_000001.wav.
     parser.add_argument('-cond', type=str, help='Path to the conditioning input audio file.', default='Y:\\clips\\books1\\754_Dan Simmons - The Rise Of Endymion 356 of 450\\00026.wav')
diff --git a/codes/scripts/audio/gen/use_gpt_tts.py b/codes/scripts/audio/gen/use_gpt_tts.py
index 8bfff791..1ffb7b60 100644
--- a/codes/scripts/audio/gen/use_gpt_tts.py
+++ b/codes/scripts/audio/gen/use_gpt_tts.py
@@ -82,7 +82,7 @@ if __name__ == '__main__':
         'ed_sheeran': ['D:\\data\\audio\\sample_voices\\ed_sheeran.wav'],
         'simmons': ['Y:\\clips\\books1\\754_Dan Simmons - The Rise Of Endymion 356 of 450\\00026.wav'],
         'news_girl': ['Y:\\clips\\podcasts-0\\8288_20210113-Is More Violence Coming_\\00022.wav', 'Y:\\clips\\podcasts-0\\8288_20210113-Is More Violence Coming_\\00016.wav'],
-        'dan_carlin': ['Y:\\clips\\books1\\5_dchha06 Shield of the West\\00476.wav'],
+        'dan_carlin': ['Y:\\clips\\books1\\5_dchha06 Shield of the West\\00476.wav', 'Y:\\clips\\books1\\15_dchha16 Nazi Tidbits\\00036.wav'],
         'libri_test': ['Y:\\libritts\\test-clean\\672\\122797\\672_122797_000057_000002.wav'],
         'myself': ['D:\\data\\audio\\sample_voices\\myself1.wav', 'D:\\data\\audio\\sample_voices\\myself2.wav'],
     }
@@ -90,7 +90,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-opt_diffuse', type=str, help='Path to options YAML file used to train the diffusion model', default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level.yml')
     parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator')
-    parser.add_argument('-diffusion_model_path', type=str, help='Diffusion model checkpoint to load.', default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level\\models\\12000_generator_ema.pth')
+    parser.add_argument('-diffusion_model_path', type=str, help='Diffusion model checkpoint to load.', default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level\\models\\15000_generator_ema.pth')
     parser.add_argument('-dvae_model_name', type=str, help='Name of the DVAE model in opt.', default='dvae')
     parser.add_argument('-opt_gpt_tts', type=str, help='Path to options YAML file used to train the GPT-TTS model', default='X:\\dlas\\experiments\\train_gpt_tts_unified.yml')
     parser.add_argument('-gpt_tts_model_name', type=str, help='Name of the GPT TTS model in opt.', default='gpt')
@@ -131,8 +131,8 @@ if __name__ == '__main__':
     print("Performing GPT inference..")
     samples = []
     for b in tqdm(range(args.num_batches)):
-        codes = gpt.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=20, top_p=.95,
-                                     num_return_sequences=args.num_samples//args.num_batches, length_penalty=1)
+        codes = gpt.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,
+                                     temperature=.9, num_return_sequences=args.num_samples//args.num_batches, length_penalty=1)
         padding_needed = 250 - codes.shape[1]
         codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
         samples.append(codes)
diff --git a/codes/trainer/injectors/base_injectors.py b/codes/trainer/injectors/base_injectors.py
index d82a8a84..5acb61f6 100644
--- a/codes/trainer/injectors/base_injectors.py
+++ b/codes/trainer/injectors/base_injectors.py
@@ -78,6 +78,7 @@ class GeneratorInjector(Injector):
         self.grad = opt['grad'] if 'grad' in opt.keys() else True
         self.method = opt_get(opt, ['method'], None)  # If specified, this method is called instead of __call__()
         self.args = opt_get(opt, ['args'], {})
+        self.fp16_override = opt_get(opt, ['fp16'], True)
 
     def forward(self, state):
         gen = self.env['generators'][self.opt['generator']]
@@ -86,7 +87,7 @@ class GeneratorInjector(Injector):
 
             gen = gen.module  # Dereference DDP wrapper.
         method = gen if self.method is None else getattr(gen, self.method)
-        with autocast(enabled=self.env['opt']['fp16']):
+        with autocast(enabled=self.env['opt']['fp16'] and self.fp16_override):
             if isinstance(self.input, list):
                 params = extract_params_from_state(self.input, state)
             else:
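
Note on the `base_injectors.py` change: `GeneratorInjector` now reads an optional per-injector `fp16` key (defaulting to `True`), so putting `fp16: false` in a single injector's options block keeps that generator in full precision even when the trainer-wide `fp16` flag is on. Below is a minimal sketch of the gating logic, assuming a CUDA device; the model, tensors, and flag variables are placeholders for illustration, not DLAS code:

```python
import torch
from torch.cuda.amp import autocast

global_fp16 = True     # stands in for env['opt']['fp16'] (trainer-wide flag)
injector_fp16 = False  # stands in for opt_get(opt, ['fp16'], True); i.e. `fp16: false` in the injector block

model = torch.nn.Linear(16, 16).cuda()
x = torch.randn(4, 16, device='cuda')

# Autocast engages only when BOTH flags are true, mirroring
# `autocast(enabled=self.env['opt']['fp16'] and self.fp16_override)`.
with autocast(enabled=global_fp16 and injector_fp16):
    y = model(x)

print(y.dtype)  # torch.float32 here; torch.float16 if both flags were true
```

This keeps the global `fp16` setting authoritative (the override can only opt an injector out of mixed precision, never force it on when fp16 training is disabled).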