From ba6e46c02ac697c341f0ca9326777115acdb8061 Mon Sep 17 00:00:00 2001 From: James Betker Date: Tue, 26 Oct 2021 08:54:30 -0600 Subject: [PATCH] Further simplify diffusion_vocoder and make noise_surfer work --- codes/models/gpt_voice/mini_encoder.py | 18 ++++- .../unet_diffusion_vocoder_with_ref.py | 7 +- .../diffusion/diffusion_noise_surfer.py | 70 +++++++++++-------- 3 files changed, 60 insertions(+), 35 deletions(-) diff --git a/codes/models/gpt_voice/mini_encoder.py b/codes/models/gpt_voice/mini_encoder.py index 5d05cebb..8dd99883 100644 --- a/codes/models/gpt_voice/mini_encoder.py +++ b/codes/models/gpt_voice/mini_encoder.py @@ -90,7 +90,17 @@ class ResBlock(nn.Module): class AudioMiniEncoder(nn.Module): - def __init__(self, spec_dim, embedding_dim, base_channels=128, depth=2, resnet_blocks=2, attn_blocks=4, num_attn_heads=4, dropout=0, downsample_factor=2, kernel_size=3): + def __init__(self, spec_dim, + embedding_dim, + base_channels=128, + depth=2, + resnet_blocks=2, + attn_blocks=4, + num_attn_heads=4, + dropout=0, + downsample_factor=2, + kernel_size=3, + do_checkpointing=False): super().__init__() self.init = nn.Sequential( conv_nd(1, spec_dim, base_channels, 3, padding=1) @@ -113,12 +123,16 @@ class AudioMiniEncoder(nn.Module): attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim + self.do_checkpointing = do_checkpointing def forward(self, x): h = self.init(x) h = self.res(h) h = self.final(h) - h = checkpoint(self.attn, h) + if self.do_checkpointing: + h = checkpoint(self.attn, h) + else: + h = self.attn(h) return h[:, :, 0] diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py index 925a00fd..ed676ad4 100644 --- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py +++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py @@ -121,9 +121,6 @@ class DiffusionVocoderWithRef(nn.Module): self.conditioning_enabled = conditioning_inputs_provided if conditioning_inputs_provided: self.contextual_embedder = AudioMiniEncoder(conditioning_input_dim, time_embed_dim) - self.query_gen = AudioMiniEncoder(in_channels, time_embed_dim, base_channels=32, depth=6, resnet_blocks=1, - attn_blocks=2, num_attn_heads=2, dropout=dropout, downsample_factor=4, kernel_size=5) - self.embedding_combiner = EmbeddingCombiner(time_embed_dim, attn_blocks=1) self.input_blocks = nn.ModuleList( [ @@ -302,8 +299,8 @@ class DiffusionVocoderWithRef(nn.Module): hs = [] emb1 = self.time_embed(timestep_embedding(timesteps, self.model_channels)) if self.conditioning_enabled: - emb2 = torch.stack([self.contextual_embedder(ci.squeeze(1)) for ci in list(torch.chunk(conditioning_inputs, conditioning_inputs.shape[1], dim=1))], dim=1) - emb2 = self.embedding_combiner(emb2, None, self.query_gen(x)) + #emb2 = torch.stack([self.contextual_embedder(ci.squeeze(1)) for ci in list(torch.chunk(conditioning_inputs, conditioning_inputs.shape[1], dim=1))], dim=1) + emb2 = self.contextual_embedder(conditioning_inputs[:, 0]) emb = emb1 + emb2 else: emb = emb1 diff --git a/codes/scripts/diffusion/diffusion_noise_surfer.py b/codes/scripts/diffusion/diffusion_noise_surfer.py index b934369a..c2194f8b 100644 --- a/codes/scripts/diffusion/diffusion_noise_surfer.py +++ b/codes/scripts/diffusion/diffusion_noise_surfer.py @@ -23,6 +23,7 @@ import numpy as np # A rough copy of test.py that "surfs" along a set of random noise priors to show the affect of gaussian noise on the results. + def forward_pass(model, data, output_dir, spacing, audio_mode): with torch.no_grad(): model.feed_data(data, 0) @@ -44,38 +45,15 @@ def forward_pass(model, data, output_dir, spacing, audio_mode): util.save_img(util.tensor2img(sr_img), save_img_path) -if __name__ == "__main__": - # Set seeds - torch.manual_seed(5555) - random.seed(5555) - np.random.seed(5555) - - #### options - audio_mode = True # Whether to render audio or images. - torch.backends.cudnn.benchmark = True - want_metrics = False - parser = argparse.ArgumentParser() - parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-20.yml') - opt = option.parse(parser.parse_args().opt, is_train=False) - opt = option.dict_to_nonedict(opt) - utils.util.loaded_options = opt - - util.mkdirs( - (path for key, path in opt['path'].items() - if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) - util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO, - screen=True, tofile=True) - logger = logging.getLogger('base') - logger.info(option.dict2str(opt)) - +def load_image(path, audio_mode): # Load test image if audio_mode: - im, sr = load_wav_to_torch(opt['image']) + im, sr = load_wav_to_torch(path) assert sr == 22050 im = im.unsqueeze(0) im = im[:, :(im.shape[1]//4096)*4096] else: - im = ToTensor()(Image.open(opt['image'])) * 2 - 1 + im = ToTensor()(Image.open(path)) * 2 - 1 _, h, w = im.shape if h % 2 == 1: im = im[:,1:,:] @@ -89,9 +67,43 @@ if __name__ == "__main__": if dw > 0: im = im[:,:,dw:-dw] im = im[:3].unsqueeze(0) + return im - # Build the corruption indexes we are going to use. - correction_factors = opt['correction_factor'] + +if __name__ == "__main__": + # Set seeds + torch.manual_seed(5555) + random.seed(5555) + np.random.seed(5555) + + #### options + audio_mode = True # Whether to render audio or images. + torch.backends.cudnn.benchmark = True + want_metrics = False + parser = argparse.ArgumentParser() + parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-25.yml') + opt = option.parse(parser.parse_args().opt, is_train=False) + opt = option.dict_to_nonedict(opt) + utils.util.loaded_options = opt + + util.mkdirs( + (path for key, path in opt['path'].items() + if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key)) + util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO, + screen=True, tofile=True) + logger = logging.getLogger('base') + logger.info(option.dict2str(opt)) + + im = load_image(opt['image'], audio_mode) + correction_factors = util.opt_get(opt, ['correction_factor'], None) + if 'ref_images' in opt.keys(): + refs = [load_image(r, audio_mode) for r in opt['ref_images']] + #min_len = min(r.shape[1] for r in refs) + min_len = opt['ref_images_len'] + refs = [r[:, :min_len] for r in refs] + refs = torch.stack(refs, dim=1) + else: + refs = torch.empty((1,1)) #opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False model = ExtensibleTrainer(opt) @@ -101,6 +113,8 @@ if __name__ == "__main__": if audio_mode: data = { 'clip': im.to('cuda'), + 'alt_clips': refs.to('cuda'), + 'num_alt_clips': torch.tensor([refs.shape[1]], dtype=torch.int32, device='cuda'), 'GT_path': opt['image'] } else: