import argparse
import logging
import os
import os.path as osp
import random

import numpy as np
import torch
from PIL import Image
from scipy.io import wavfile
from torchvision.transforms import ToTensor

import dlas.utils
import dlas.utils.options as option
import dlas.utils.util as util
from dlas.data.audio.unsupervised_audio_dataset import load_audio
from dlas.trainer.ExtensibleTrainer import ExtensibleTrainer

# A rough copy of test.py that "surfs" along a set of random noise priors to
# show the effect of gaussian noise on the results.


def forward_pass(model, data, output_dir, spacing, audio_mode):
    """Run one inference pass and save the 'rlt' visual to ``output_dir``.

    Args:
        model: Trainer exposing ``feed_data``, ``test`` and
            ``get_current_visuals`` (an ``ExtensibleTrainer`` in this script).
        data: Input dict handed to ``model.feed_data``. ``data['GT_path']``
            may be a single path or a sequence of paths (first one is used);
            it only determines the output file name.
        output_dir: Directory the result is written into.
        spacing: Number appended (as ``_<int>``) to the output file name.
        audio_mode: If true, write a ``.wav`` file; otherwise write a ``.png``.
    """
    with torch.no_grad():
        model.feed_data(data, 0)
        model.test()
        visuals = model.get_current_visuals()['rlt'].cpu()

    # The script passes a bare path string here, while a dataloader would pass
    # a batched list. Indexing a string with [0] would yield only its first
    # character, so only index when we really have a sequence of paths.
    gt_path = data['GT_path']
    if not isinstance(gt_path, (str, os.PathLike)):
        gt_path = gt_path[0]
    img_name = osp.splitext(osp.basename(gt_path))[0]
    sr_img = visuals[0]

    # Save the result.
    suffix = f'_{int(spacing)}'
    if audio_mode:
        save_img_path = osp.join(output_dir, img_name + suffix + '.wav')
        # NOTE(review): output is written at 11025 Hz while load_image() passes
        # 22050 to load_audio() - confirm this matches the model's output rate.
        wavfile.write(save_img_path, 11025, sr_img[0].numpy())
    else:
        save_img_path = osp.join(output_dir, img_name + suffix + '.png')
        util.save_img(util.tensor2img(sr_img), save_img_path)


def load_image(path, audio_mode):
    """Load the test input at ``path`` as a tensor with a leading batch dim.

    In audio mode the clip is loaded through ``load_audio(path, 22050)`` and
    unsqueezed once. In image mode the image is scaled to [-1, 1], trimmed so
    that height and width are multiples of 32, limited to its first three
    channels and unsqueezed once.

    Args:
        path: File to load.
        audio_mode: Whether ``path`` is an audio clip (true) or an image.

    Returns:
        The loaded tensor.
    """
    if audio_mode:
        return load_audio(path, 22050).unsqueeze(0)

    im = ToTensor()(Image.open(path)) * 2 - 1
    _, h, w = im.shape
    # Drop one leading row/column so both dimensions are even; this lets the
    # remainder below be split evenly between the two sides.
    if h % 2 == 1:
        im = im[:, 1:, :]
        h = h - 1
    if w % 2 == 1:
        im = im[:, :, 1:]
        w = w - 1
    # Center-crop away the remainder modulo 32.
    dh, dw = (h - 32 * (h // 32)) // 2, (w - 32 * (w // 32)) // 2
    if dh > 0:
        im = im[:, dh:-dh]
    if dw > 0:
        im = im[:, :, dw:-dw]
    # Keep at most three channels (e.g. discard alpha) and add the batch dim.
    return im[:3].unsqueeze(0)


if __name__ == "__main__":
    # Set seeds
    torch.manual_seed(5555)
    random.seed(5555)
    np.random.seed(5555)

    # options
    audio_mode = True  # Whether to render audio or images.
    torch.backends.cudnn.benchmark = True
    want_metrics = False
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to options YAML file.',
                        default='../options/test_diffusion_vocoder_dvae.yml')
    opt = option.parse(parser.parse_args().opt, is_train=False)
    opt = option.dict_to_nonedict(opt)
    # `utils` was never bound as a name in this module; `util` is the alias of
    # dlas.utils.util, which is the module that holds `loaded_options`.
    util.loaded_options = opt

    util.mkdirs(
        (path for key, path in opt['path'].items()
         if not key == 'experiments_root'
         and 'pretrain_model' not in key
         and 'resume' not in key))
    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'],
                      level=logging.INFO, screen=True, tofile=True)
    logger = logging.getLogger('base')
    logger.info(option.dict2str(opt))

    im = load_image(opt['image'], audio_mode)
    correction_factors = util.opt_get(opt, ['correction_factor'], None)
    if 'ref_images' in opt:
        refs = [load_image(r, audio_mode) for r in opt['ref_images']]
        # References are truncated along dim 1 to the configured length
        # (previously: min_len = min(r.shape[1] for r in refs)).
        min_len = opt['ref_images_len']
        refs = [r[:, :min_len] for r in refs]
        refs = torch.stack(refs, dim=1)
    else:
        refs = torch.empty((1, 1))

    # opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False
    model = ExtensibleTrainer(opt)
    results_dir = osp.join(opt['path']['results_root'],
                           os.path.basename(opt['image']))
    util.mkdir(results_dir)
    for i in range(10):
        if audio_mode:
            data = {
                'clip': im.to('cuda'),
                'alt_clips': refs.to('cuda'),
                'num_alt_clips': torch.tensor([refs.shape[1]],
                                              dtype=torch.int32,
                                              device='cuda'),
                'GT_path': opt['image'],
                'resampled_clip': refs[:, 0].to('cuda'),
            }
        else:
            data = {
                'hq': im.to('cuda'),
                'corruption_entropy': torch.tensor([correction_factors],
                                                   device='cuda',
                                                   dtype=torch.float),
                'GT_path': opt['image'],
            }
        forward_pass(model, data, results_dir, i, audio_mode)