Further simplify diffusion_vocoder and make noise_surfer work
This commit is contained in:
parent
c3421b7f6d
commit
ba6e46c02a
|
@ -90,7 +90,17 @@ class ResBlock(nn.Module):
|
||||||
|
|
||||||
|
|
||||||
class AudioMiniEncoder(nn.Module):
|
class AudioMiniEncoder(nn.Module):
|
||||||
def __init__(self, spec_dim, embedding_dim, base_channels=128, depth=2, resnet_blocks=2, attn_blocks=4, num_attn_heads=4, dropout=0, downsample_factor=2, kernel_size=3):
|
def __init__(self, spec_dim,
|
||||||
|
embedding_dim,
|
||||||
|
base_channels=128,
|
||||||
|
depth=2,
|
||||||
|
resnet_blocks=2,
|
||||||
|
attn_blocks=4,
|
||||||
|
num_attn_heads=4,
|
||||||
|
dropout=0,
|
||||||
|
downsample_factor=2,
|
||||||
|
kernel_size=3,
|
||||||
|
do_checkpointing=False):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.init = nn.Sequential(
|
self.init = nn.Sequential(
|
||||||
conv_nd(1, spec_dim, base_channels, 3, padding=1)
|
conv_nd(1, spec_dim, base_channels, 3, padding=1)
|
||||||
|
@ -113,12 +123,16 @@ class AudioMiniEncoder(nn.Module):
|
||||||
attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False))
|
attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False))
|
||||||
self.attn = nn.Sequential(*attn)
|
self.attn = nn.Sequential(*attn)
|
||||||
self.dim = embedding_dim
|
self.dim = embedding_dim
|
||||||
|
self.do_checkpointing = do_checkpointing
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
h = self.init(x)
|
h = self.init(x)
|
||||||
h = self.res(h)
|
h = self.res(h)
|
||||||
h = self.final(h)
|
h = self.final(h)
|
||||||
h = checkpoint(self.attn, h)
|
if self.do_checkpointing:
|
||||||
|
h = checkpoint(self.attn, h)
|
||||||
|
else:
|
||||||
|
h = self.attn(h)
|
||||||
return h[:, :, 0]
|
return h[:, :, 0]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -121,9 +121,6 @@ class DiffusionVocoderWithRef(nn.Module):
|
||||||
self.conditioning_enabled = conditioning_inputs_provided
|
self.conditioning_enabled = conditioning_inputs_provided
|
||||||
if conditioning_inputs_provided:
|
if conditioning_inputs_provided:
|
||||||
self.contextual_embedder = AudioMiniEncoder(conditioning_input_dim, time_embed_dim)
|
self.contextual_embedder = AudioMiniEncoder(conditioning_input_dim, time_embed_dim)
|
||||||
self.query_gen = AudioMiniEncoder(in_channels, time_embed_dim, base_channels=32, depth=6, resnet_blocks=1,
|
|
||||||
attn_blocks=2, num_attn_heads=2, dropout=dropout, downsample_factor=4, kernel_size=5)
|
|
||||||
self.embedding_combiner = EmbeddingCombiner(time_embed_dim, attn_blocks=1)
|
|
||||||
|
|
||||||
self.input_blocks = nn.ModuleList(
|
self.input_blocks = nn.ModuleList(
|
||||||
[
|
[
|
||||||
|
@ -302,8 +299,8 @@ class DiffusionVocoderWithRef(nn.Module):
|
||||||
hs = []
|
hs = []
|
||||||
emb1 = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
emb1 = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
||||||
if self.conditioning_enabled:
|
if self.conditioning_enabled:
|
||||||
emb2 = torch.stack([self.contextual_embedder(ci.squeeze(1)) for ci in list(torch.chunk(conditioning_inputs, conditioning_inputs.shape[1], dim=1))], dim=1)
|
#emb2 = torch.stack([self.contextual_embedder(ci.squeeze(1)) for ci in list(torch.chunk(conditioning_inputs, conditioning_inputs.shape[1], dim=1))], dim=1)
|
||||||
emb2 = self.embedding_combiner(emb2, None, self.query_gen(x))
|
emb2 = self.contextual_embedder(conditioning_inputs[:, 0])
|
||||||
emb = emb1 + emb2
|
emb = emb1 + emb2
|
||||||
else:
|
else:
|
||||||
emb = emb1
|
emb = emb1
|
||||||
|
|
|
@ -23,6 +23,7 @@ import numpy as np
|
||||||
|
|
||||||
# A rough copy of test.py that "surfs" along a set of random noise priors to show the effect of Gaussian noise on the results.
|
# A rough copy of test.py that "surfs" along a set of random noise priors to show the effect of Gaussian noise on the results.
|
||||||
|
|
||||||
|
|
||||||
def forward_pass(model, data, output_dir, spacing, audio_mode):
|
def forward_pass(model, data, output_dir, spacing, audio_mode):
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
model.feed_data(data, 0)
|
model.feed_data(data, 0)
|
||||||
|
@ -44,38 +45,15 @@ def forward_pass(model, data, output_dir, spacing, audio_mode):
|
||||||
util.save_img(util.tensor2img(sr_img), save_img_path)
|
util.save_img(util.tensor2img(sr_img), save_img_path)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def load_image(path, audio_mode):
|
||||||
# Set seeds
|
|
||||||
torch.manual_seed(5555)
|
|
||||||
random.seed(5555)
|
|
||||||
np.random.seed(5555)
|
|
||||||
|
|
||||||
#### options
|
|
||||||
audio_mode = True # Whether to render audio or images.
|
|
||||||
torch.backends.cudnn.benchmark = True
|
|
||||||
want_metrics = False
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-20.yml')
|
|
||||||
opt = option.parse(parser.parse_args().opt, is_train=False)
|
|
||||||
opt = option.dict_to_nonedict(opt)
|
|
||||||
utils.util.loaded_options = opt
|
|
||||||
|
|
||||||
util.mkdirs(
|
|
||||||
(path for key, path in opt['path'].items()
|
|
||||||
if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
|
|
||||||
util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
|
|
||||||
screen=True, tofile=True)
|
|
||||||
logger = logging.getLogger('base')
|
|
||||||
logger.info(option.dict2str(opt))
|
|
||||||
|
|
||||||
# Load test image
|
# Load test image
|
||||||
if audio_mode:
|
if audio_mode:
|
||||||
im, sr = load_wav_to_torch(opt['image'])
|
im, sr = load_wav_to_torch(path)
|
||||||
assert sr == 22050
|
assert sr == 22050
|
||||||
im = im.unsqueeze(0)
|
im = im.unsqueeze(0)
|
||||||
im = im[:, :(im.shape[1]//4096)*4096]
|
im = im[:, :(im.shape[1]//4096)*4096]
|
||||||
else:
|
else:
|
||||||
im = ToTensor()(Image.open(opt['image'])) * 2 - 1
|
im = ToTensor()(Image.open(path)) * 2 - 1
|
||||||
_, h, w = im.shape
|
_, h, w = im.shape
|
||||||
if h % 2 == 1:
|
if h % 2 == 1:
|
||||||
im = im[:,1:,:]
|
im = im[:,1:,:]
|
||||||
|
@ -89,9 +67,43 @@ if __name__ == "__main__":
|
||||||
if dw > 0:
|
if dw > 0:
|
||||||
im = im[:,:,dw:-dw]
|
im = im[:,:,dw:-dw]
|
||||||
im = im[:3].unsqueeze(0)
|
im = im[:3].unsqueeze(0)
|
||||||
|
return im
|
||||||
|
|
||||||
# Build the corruption indexes we are going to use.
|
|
||||||
correction_factors = opt['correction_factor']
|
if __name__ == "__main__":
|
||||||
|
# Set seeds
|
||||||
|
torch.manual_seed(5555)
|
||||||
|
random.seed(5555)
|
||||||
|
np.random.seed(5555)
|
||||||
|
|
||||||
|
#### options
|
||||||
|
audio_mode = True # Whether to render audio or images.
|
||||||
|
torch.backends.cudnn.benchmark = True
|
||||||
|
want_metrics = False
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_diffusion_vocoder_10-25.yml')
|
||||||
|
opt = option.parse(parser.parse_args().opt, is_train=False)
|
||||||
|
opt = option.dict_to_nonedict(opt)
|
||||||
|
utils.util.loaded_options = opt
|
||||||
|
|
||||||
|
util.mkdirs(
|
||||||
|
(path for key, path in opt['path'].items()
|
||||||
|
if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
|
||||||
|
util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
|
||||||
|
screen=True, tofile=True)
|
||||||
|
logger = logging.getLogger('base')
|
||||||
|
logger.info(option.dict2str(opt))
|
||||||
|
|
||||||
|
im = load_image(opt['image'], audio_mode)
|
||||||
|
correction_factors = util.opt_get(opt, ['correction_factor'], None)
|
||||||
|
if 'ref_images' in opt.keys():
|
||||||
|
refs = [load_image(r, audio_mode) for r in opt['ref_images']]
|
||||||
|
#min_len = min(r.shape[1] for r in refs)
|
||||||
|
min_len = opt['ref_images_len']
|
||||||
|
refs = [r[:, :min_len] for r in refs]
|
||||||
|
refs = torch.stack(refs, dim=1)
|
||||||
|
else:
|
||||||
|
refs = torch.empty((1,1))
|
||||||
|
|
||||||
#opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False
|
#opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False
|
||||||
model = ExtensibleTrainer(opt)
|
model = ExtensibleTrainer(opt)
|
||||||
|
@ -101,6 +113,8 @@ if __name__ == "__main__":
|
||||||
if audio_mode:
|
if audio_mode:
|
||||||
data = {
|
data = {
|
||||||
'clip': im.to('cuda'),
|
'clip': im.to('cuda'),
|
||||||
|
'alt_clips': refs.to('cuda'),
|
||||||
|
'num_alt_clips': torch.tensor([refs.shape[1]], dtype=torch.int32, device='cuda'),
|
||||||
'GT_path': opt['image']
|
'GT_path': opt['image']
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user