From ba6e46c02ac697c341f0ca9326777115acdb8061 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Tue, 26 Oct 2021 08:54:30 -0600
Subject: [PATCH] Further simplify diffusion_vocoder and make noise_surfer work

---
 codes/models/gpt_voice/mini_encoder.py        | 18 ++++-
 .../unet_diffusion_vocoder_with_ref.py        |  7 +-
 .../diffusion/diffusion_noise_surfer.py       | 70 +++++++++++--------
 3 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/codes/models/gpt_voice/mini_encoder.py b/codes/models/gpt_voice/mini_encoder.py
index 5d05cebb..8dd99883 100644
--- a/codes/models/gpt_voice/mini_encoder.py
+++ b/codes/models/gpt_voice/mini_encoder.py
@@ -90,7 +90,17 @@ class ResBlock(nn.Module):
 
 
 class AudioMiniEncoder(nn.Module):
-    def __init__(self, spec_dim, embedding_dim, base_channels=128, depth=2, resnet_blocks=2, attn_blocks=4, num_attn_heads=4, dropout=0, downsample_factor=2, kernel_size=3):
+    def __init__(self, spec_dim,
+                 embedding_dim,
+                 base_channels=128,
+                 depth=2,
+                 resnet_blocks=2,
+                 attn_blocks=4,
+                 num_attn_heads=4,
+                 dropout=0,
+                 downsample_factor=2,
+                 kernel_size=3,
+                 do_checkpointing=False):
         super().__init__()
         self.init = nn.Sequential(
             conv_nd(1, spec_dim, base_channels, 3, padding=1)
@@ -113,12 +123,16 @@ class AudioMiniEncoder(nn.Module):
             attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False))
         self.attn = nn.Sequential(*attn)
         self.dim = embedding_dim
+        self.do_checkpointing = do_checkpointing
 
     def forward(self, x):
         h = self.init(x)
         h = self.res(h)
         h = self.final(h)
-        h = checkpoint(self.attn, h)
+        if self.do_checkpointing:
+            h = checkpoint(self.attn, h)
+        else:
+            h = self.attn(h)
         return h[:, :, 0]
 
 
diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
index 925a00fd..ed676ad4 100644
--- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
+++ b/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py
@@ -121,9 +121,6 @@ class DiffusionVocoderWithRef(nn.Module):
         self.conditioning_enabled = conditioning_inputs_provided
         if conditioning_inputs_provided:
             self.contextual_embedder = AudioMiniEncoder(conditioning_input_dim, time_embed_dim)
-            self.query_gen = AudioMiniEncoder(in_channels, time_embed_dim, base_channels=32, depth=6, resnet_blocks=1,
-                                              attn_blocks=2, num_attn_heads=2, dropout=dropout, downsample_factor=4, kernel_size=5)
-            self.embedding_combiner = EmbeddingCombiner(time_embed_dim, attn_blocks=1)
 
         self.input_blocks = nn.ModuleList(
             [
@@ -302,8 +299,8 @@ class DiffusionVocoderWithRef(nn.Module):
         hs = []
         emb1 = self.time_embed(timestep_embedding(timesteps, self.model_channels))
         if self.conditioning_enabled:
-            emb2 = torch.stack([self.contextual_embedder(ci.squeeze(1)) for ci in list(torch.chunk(conditioning_inputs, conditioning_inputs.shape[1], dim=1))], dim=1)
-            emb2 = self.embedding_combiner(emb2, None, self.query_gen(x))
+            #emb2 = torch.stack([self.contextual_embedder(ci.squeeze(1)) for ci in list(torch.chunk(conditioning_inputs, conditioning_inputs.shape[1], dim=1))], dim=1)
+            emb2 = self.contextual_embedder(conditioning_inputs[:, 0])
             emb = emb1 + emb2
         else:
             emb = emb1
diff --git a/codes/scripts/diffusion/diffusion_noise_surfer.py b/codes/scripts/diffusion/diffusion_noise_surfer.py
index b934369a..c2194f8b 100644
--- a/codes/scripts/diffusion/diffusion_noise_surfer.py
+++ b/codes/scripts/diffusion/diffusion_noise_surfer.py
@@ -23,6 +23,7 @@ import numpy as np
 
 # A rough copy of test.py that "surfs" along a set of random noise priors to show the affect of gaussian noise on the results.
 
+
 def forward_pass(model, data, output_dir, spacing, audio_mode):
     with torch.no_grad():
         model.feed_data(data, 0)
@@ -44,38 +45,15 @@ def forward_pass(model, data, output_dir, spacing, audio_mode):
             util.save_img(util.tensor2img(sr_img), save_img_path)
 
 
-if __name__ == "__main__":
-    # Set seeds
-    torch.manual_seed(5555)
-    random.seed(5555)
-    np.random.seed(5555)
-
-    #### options
-    audio_mode = True  # Whether to render audio or images.
-    torch.backends.cudnn.benchmark = True
-    want_metrics = False
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-20.yml')
-    opt = option.parse(parser.parse_args().opt, is_train=False)
-    opt = option.dict_to_nonedict(opt)
-    utils.util.loaded_options = opt
-
-    util.mkdirs(
-        (path for key, path in opt['path'].items()
-         if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
-    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
-                      screen=True, tofile=True)
-    logger = logging.getLogger('base')
-    logger.info(option.dict2str(opt))
-
+def load_image(path, audio_mode):
     # Load test image
     if audio_mode:
-        im, sr = load_wav_to_torch(opt['image'])
+        im, sr = load_wav_to_torch(path)
         assert sr == 22050
         im = im.unsqueeze(0)
         im = im[:, :(im.shape[1]//4096)*4096]
     else:
-        im = ToTensor()(Image.open(opt['image'])) * 2 - 1
+        im = ToTensor()(Image.open(path)) * 2 - 1
         _, h, w = im.shape
         if h % 2 == 1:
             im = im[:,1:,:]
@@ -89,9 +67,43 @@ if __name__ == "__main__":
         if dw > 0:
             im = im[:,:,dw:-dw]
         im = im[:3].unsqueeze(0)
+    return im
 
-        # Build the corruption indexes we are going to use.
-        correction_factors = opt['correction_factor']
+
+if __name__ == "__main__":
+    # Set seeds
+    torch.manual_seed(5555)
+    random.seed(5555)
+    np.random.seed(5555)
+
+    #### options
+    audio_mode = True  # Whether to render audio or images.
+    torch.backends.cudnn.benchmark = True
+    want_metrics = False
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-25.yml')
+    opt = option.parse(parser.parse_args().opt, is_train=False)
+    opt = option.dict_to_nonedict(opt)
+    utils.util.loaded_options = opt
+
+    util.mkdirs(
+        (path for key, path in opt['path'].items()
+         if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
+    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
+                      screen=True, tofile=True)
+    logger = logging.getLogger('base')
+    logger.info(option.dict2str(opt))
+
+    im = load_image(opt['image'], audio_mode)
+    correction_factors = util.opt_get(opt, ['correction_factor'], None)
+    if 'ref_images' in opt.keys():
+        refs = [load_image(r, audio_mode) for r in opt['ref_images']]
+        #min_len = min(r.shape[1] for r in refs)
+        min_len = opt['ref_images_len']
+        refs = [r[:, :min_len] for r in refs]
+        refs = torch.stack(refs, dim=1)
+    else:
+        refs = torch.empty((1,1))
 
     #opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False
     model = ExtensibleTrainer(opt)
@@ -101,6 +113,8 @@ if __name__ == "__main__":
         if audio_mode:
             data = {
                 'clip': im.to('cuda'),
+                'alt_clips': refs.to('cuda'),
+                'num_alt_clips': torch.tensor([refs.shape[1]], dtype=torch.int32, device='cuda'),
                 'GT_path': opt['image']
             }
         else: