DL-Art-School/dlas/scripts/diffusion/diffusion_noise_surfer.py

124 lines
4.2 KiB
Python
Raw Normal View History

import argparse
import logging
import os
import os.path as osp
import random
import numpy as np
import torch
from PIL import Image
from scipy.io import wavfile
from torchvision.transforms import ToTensor
import dlas.utils
import dlas.utils.options as option
import dlas.utils.util as util
from dlas.data.audio.unsupervised_audio_dataset import load_audio
from dlas.trainer.ExtensibleTrainer import ExtensibleTrainer
# A rough copy of test.py that "surfs" along a set of random noise priors to show the effect of Gaussian noise on the results.
def forward_pass(model, data, output_dir, spacing, audio_mode, sample_rate=11025):
    """Run one inference pass and write the first result to ``output_dir``.

    The output file is named ``<stem>_<int(spacing)>`` plus ``.wav`` (audio
    mode) or ``.png`` (image mode), where ``<stem>`` is the extension-less
    base name of ``data['GT_path'][0]``.

    Args:
        model: Object exposing ``feed_data(data, step)``, ``test()`` and
            ``get_current_visuals()``; the latter must return a mapping with
            an ``'rlt'`` tensor.
        data: Input dict handed to ``model.feed_data``. ``data['GT_path']``
            must be a sequence whose first element is the source file path.
        output_dir: Directory the result is written into. It must already
            exist in audio mode.
        spacing: Number appended (truncated to ``int``) to the file name; the
            caller uses the iteration index.
        audio_mode: When true, write a WAV file; otherwise write a PNG.
        sample_rate: Sample rate written into the WAV header. Defaults to the
            previously hard-coded 11025.
    """
    with torch.no_grad():
        model.feed_data(data, 0)
        model.test()

    visuals = model.get_current_visuals()['rlt'].cpu()
    img_name = osp.splitext(osp.basename(data['GT_path'][0]))[0]
    # Only the first element of the batch is saved.
    sr_img = visuals[0]

    suffix = f'_{int(spacing)}'
    if audio_mode:
        # Join with output_dir exactly once: joining twice sends a relative
        # output_dir to a non-existent "<output_dir>/<output_dir>/..." path.
        save_path = osp.join(output_dir, img_name + suffix + '.wav')
        # NOTE(review): assumes sr_img[0] is a 1-D waveform - confirm.
        wavfile.write(save_path, sample_rate, sr_img[0].numpy())
    else:
        save_path = osp.join(output_dir, img_name + suffix + '.png')
        util.save_img(util.tensor2img(sr_img), save_path)
def load_image(path, audio_mode):
    """Load the test input at ``path`` and return it with a leading batch dim.

    Args:
        path: File to load.
        audio_mode: When true, ``path`` is read with ``load_audio`` at
            22050 Hz. Otherwise it is opened as an image, rescaled from
            ``ToTensor``'s output to ``x * 2 - 1``, cropped, and limited to
            its first three channels.

    Returns:
        The loaded tensor after ``unsqueeze(0)``.
    """
    if audio_mode:
        return load_audio(path, 22050).unsqueeze(0)

    # Close the file handle once the pixels have been copied into the tensor.
    with Image.open(path) as img:
        im = ToTensor()(img) * 2 - 1

    _, h, w = im.shape
    # Drop the first row/column so both spatial sizes are even; this keeps
    # the centre crop below symmetric.
    if h % 2 == 1:
        im = im[:, 1:, :]
        h -= 1
    if w % 2 == 1:
        im = im[:, :, 1:]
        w -= 1

    # Centre-crop each spatial size down to a multiple of 32.
    dh, dw = (h % 32) // 2, (w % 32) // 2
    if dh > 0:
        im = im[:, dh:-dh]
    if dw > 0:
        im = im[:, :, dw:-dw]

    # Keep at most three channels, then add the batch dim.
    return im[:3].unsqueeze(0)
if __name__ == "__main__":
    # Seed the torch, random and numpy RNGs.
    torch.manual_seed(5555)
    random.seed(5555)
    np.random.seed(5555)

    # options
    audio_mode = True  # Whether to render audio or images.
    torch.backends.cudnn.benchmark = True

    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to options YAML file.',
                        default='../options/test_diffusion_vocoder_dvae.yml')
    opt = option.parse(parser.parse_args().opt, is_train=False)
    opt = option.dict_to_nonedict(opt)
    # `util` is dlas.utils.util; the bare name `utils` is never imported in
    # this file, so `utils.util.loaded_options` raised NameError.
    util.loaded_options = opt

    # Create every configured path except the experiment root and any
    # pretrained-model / resume entries.
    util.mkdirs(
        (path for key, path in opt['path'].items()
         if key != 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
                      screen=True, tofile=True)
    logger = logging.getLogger('base')
    logger.info(option.dict2str(opt))

    im = load_image(opt['image'], audio_mode)
    correction_factors = util.opt_get(opt, ['correction_factor'], None)
    if 'ref_images' in opt:
        refs = [load_image(r, audio_mode) for r in opt['ref_images']]
        # min_len = min(r.shape[1] for r in refs)
        min_len = opt['ref_images_len']
        # Trim every reference to the configured length so they can be stacked.
        refs = torch.stack([r[:, :min_len] for r in refs], dim=1)
    else:
        # Placeholder used when no references are configured.
        refs = torch.empty((1, 1))

    # opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False
    model = ExtensibleTrainer(opt)
    results_dir = osp.join(opt['path']['results_root'],
                           osp.basename(opt['image']))
    util.mkdir(results_dir)

    # forward_pass reads data['GT_path'][0], so the path is wrapped in a
    # one-element list; a bare string would yield only its first character.
    gt_path = [opt['image']]

    # Run the same input ten times; the index i becomes the file-name suffix.
    for i in range(10):
        if audio_mode:
            data = {
                'clip': im.to('cuda'),
                'alt_clips': refs.to('cuda'),
                'num_alt_clips': torch.tensor([refs.shape[1]], dtype=torch.int32, device='cuda'),
                'GT_path': gt_path,
                'resampled_clip': refs[:, 0].to('cuda'),
            }
        else:
            data = {
                'hq': im.to('cuda'),
                'corruption_entropy': torch.tensor([correction_factors], device='cuda',
                                                   dtype=torch.float),
                'GT_path': gt_path,
            }
        forward_pass(model, data, results_dir, i, audio_mode)