import os
import os.path as osp
import logging
import random
import time
import argparse
from collections import OrderedDict

import numpy
from PIL import Image
from scipy.io import wavfile
from torchvision.transforms import ToTensor

import utils
import utils.options as option
import utils.util as util
from data.audio.unsupervised_audio_dataset import load_audio
from models.tacotron2.taco_utils import load_wav_to_torch
from trainer.ExtensibleTrainer import ExtensibleTrainer
from data import create_dataset, create_dataloader
from tqdm import tqdm
import torch
import numpy as np


# A rough copy of test.py that "surfs" along a set of random noise priors to show the effect of
# Gaussian noise on the results.


def forward_pass(model, data, output_dir, spacing, audio_mode):
    with torch.no_grad():
        model.feed_data(data, 0)
        model.test()

    # The trainer exposes the network output under the 'rlt' visual.
    visuals = model.get_current_visuals()['rlt'].cpu()
    img_path = data['GT_path'][0]
    img_name = osp.splitext(osp.basename(img_path))[0]
    sr_img = visuals[0]

    # Save the output; the suffix records which noise prior produced it.
    suffixes = [f'_{int(spacing)}']
    for suffix in suffixes:
        if audio_mode:
            save_img_path = osp.join(output_dir, img_name + suffix + '.wav')
            wavfile.write(save_img_path, 11025, sr_img[0].cpu().numpy())
        else:
            save_img_path = osp.join(output_dir, img_name + suffix + '.png')
            util.save_img(util.tensor2img(sr_img), save_img_path)


def load_image(path, audio_mode):
    # Load the test clip (audio mode) or test image.
    if audio_mode:
        im = load_audio(path, 22050)
        # Pad the clip out so its length is a multiple of 8192 samples.
        padding_needed = ((im.shape[1] // 8192) + 1) * 8192 - im.shape[1]
        im = torch.nn.functional.pad(im, (0, padding_needed))
        im = im[:, :(im.shape[1] // 8192) * 8192].unsqueeze(0)
    else:
        im = ToTensor()(Image.open(path)) * 2 - 1
        _, h, w = im.shape
        # Crop to even dimensions, then center-crop to a multiple of 32.
        if h % 2 == 1:
            im = im[:, 1:, :]
            h = h - 1
        if w % 2 == 1:
            im = im[:, :, 1:]
            w = w - 1
        dh, dw = (h - 32 * (h // 32)) // 2, (w - 32 * (w // 32)) // 2
        if dh > 0:
            im = im[:, dh:-dh]
        if dw > 0:
            im = im[:, :, dw:-dw]
        im = im[:3].unsqueeze(0)
    return im
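# The options YAML referenced below is not part of this file. In addition to the usual
# ExtensibleTrainer configuration (networks, steps, injectors, etc.), this script reads the
# keys sketched here; the key names are taken from how `opt` is accessed in this script, while
# the values are purely illustrative assumptions:
#
#   name: test_diffusion_vocoder_dvae
#   image: /path/to/input_clip.wav        # clip fed through the model on every pass
#   ref_images: ['/path/to/ref1.wav']     # optional conditioning clips
#   ref_images_len: 40960                 # length the reference clips are trimmed to
#   correction_factor: [0.0]              # only consulted when audio_mode is False
#   path:
#     log: ../experiments/test_logs
#     results_root: ../results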
if __name__ == "__main__":
    # Fix seeds so the "surfed" noise priors are reproducible across runs.
    torch.manual_seed(5555)
    random.seed(5555)
    np.random.seed(5555)

    #### options
    audio_mode = True  # Whether to render audio or images.
    torch.backends.cudnn.benchmark = True
    want_metrics = False
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to options YAML file.',
                        default='../options/test_diffusion_vocoder_dvae.yml')
    opt = option.parse(parser.parse_args().opt, is_train=False)
    opt = option.dict_to_nonedict(opt)
    utils.util.loaded_options = opt

    util.mkdirs(
        (path for key, path in opt['path'].items()
         if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
                      screen=True, tofile=True)
    logger = logging.getLogger('base')
    logger.info(option.dict2str(opt))

    im = load_image(opt['image'], audio_mode)
    correction_factors = util.opt_get(opt, ['correction_factor'], None)

    if 'ref_images' in opt.keys():
        refs = [load_image(r, audio_mode) for r in opt['ref_images']]
        #min_len = min(r.shape[1] for r in refs)
        min_len = opt['ref_images_len']
        refs = [r[:, :min_len] for r in refs]
        refs = torch.stack(refs, dim=1)
    else:
        refs = torch.empty((1, 1))

    #opt['steps']['generator']['injectors']['visual_debug']['zero_noise'] = False
    model = ExtensibleTrainer(opt)
    results_dir = osp.join(opt['path']['results_root'], os.path.basename(opt['image']))
    util.mkdir(results_dir)

    # Run the same input through the model ten times; each pass draws a fresh noise prior and
    # the pass index is appended to the output filename.
    for i in range(10):
        if audio_mode:
            data = {
                'clip': im.to('cuda'),
                'alt_clips': refs.to('cuda'),
                'num_alt_clips': torch.tensor([refs.shape[1]], dtype=torch.int32, device='cuda'),
                'GT_path': [opt['image']],  # forward_pass() expects a list of paths.
                'resampled_clip': refs[:, 0].to('cuda')
            }
        else:
            data = {
                'hq': im.to('cuda'),
                'corruption_entropy': torch.tensor([correction_factors], device='cuda',
                                                   dtype=torch.float),
                'GT_path': [opt['image']]  # forward_pass() expects a list of paths.
            }
        forward_pass(model, data, results_dir, i, audio_mode)
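# A rough sketch of how this script is typically invoked (the filename below is hypothetical;
# substitute whatever this file is named in the repository):
#
#   python test_noise_surfer.py -opt ../options/test_diffusion_vocoder_dvae.yml
#
# With audio_mode = True, ten outputs named <input_basename>_0.wav .. <input_basename>_9.wav are
# written to <results_root>/<basename of the input file>, one per sampled noise prior.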