import random

import torch.nn
import torchaudio.functional
from kornia.augmentation import RandomResizedCrop
from torch.cuda.amp import autocast

from data.audio.unsupervised_audio_dataset import load_audio
from trainer.inject import Injector, create_injector
from trainer.losses import extract_params_from_state
from utils.audio import plot_spectrogram
from utils.util import opt_get
from utils.weight_scheduler import get_scheduler_for_opt


def _results_to_state(output, results):
    """Maps the result(s) of a network call onto the configured output key(s).

    When `output` is a list, `results` must be a list or tuple and is matched to the keys by position.
    Otherwise `results` is stored whole under the single key `output`.
    """
    if not isinstance(output, list):
        return {output: results}
    # Only dereference tuples or lists, not tensors. IF YOU REACH THIS ERROR, REMOVE THE BRACES AROUND YOUR OUTPUTS IN THE YAML CONFIG
    assert isinstance(results, (list, tuple))
    return {k: results[i] for i, k in enumerate(output)}


# Right-pads the last dimension of [in] so that its length becomes a multiple of opt['multiple'].
# Inputs whose last dimension is already a multiple are passed through untouched.
class PadInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.multiple = opt['multiple']

    def forward(self, state):
        t = state[self.input]
        remainder = t.shape[-1] % self.multiple
        # Only pad by what is missing; an aligned input (remainder == 0) must not grow by a whole extra multiple.
        if remainder != 0:
            t = torch.nn.functional.pad(t, (0, self.multiple - remainder))
        return {self.output: t}


# Squeezes dimension opt['dim'] out of [in] and injects the result into [out].
class SqueezeInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.dim = opt['dim']

    def forward(self, state):
        return {self.output: state[self.input].squeeze(dim=self.dim)}


# Uses a generator to synthesize an image from [in] and injects the results into [out]
# Note that results are *not* detached.
class GeneratorInjector(Injector):
    def __init__(self, opt, env):
        super(GeneratorInjector, self).__init__(opt, env)
        self.grad = opt['grad'] if 'grad' in opt.keys() else True
        self.method = opt_get(opt, ['method'], None)  # If specified, this method is called instead of __call__()
        self.args = opt_get(opt, ['args'], {})

    def forward(self, state):
        gen = self.env['generators'][self.opt['generator']]

        if self.method is not None and hasattr(gen, 'module'):
            gen = gen.module  # Dereference DDP wrapper.
        method = gen if self.method is None else getattr(gen, self.method)

        with autocast(enabled=self.env['opt']['fp16']):
            # A list of inputs is resolved by the project helper; a single key is looked up directly.
            if isinstance(self.input, list):
                params = extract_params_from_state(self.input, state)
            else:
                params = [state[self.input]]
            if self.grad:
                results = method(*params, **self.args)
            else:
                with torch.no_grad():
                    results = method(*params, **self.args)

        return _results_to_state(self.output, results)


# Injects a result from a discriminator network into the state.
class DiscriminatorInjector(Injector):
    def __init__(self, opt, env):
        super(DiscriminatorInjector, self).__init__(opt, env)

    def forward(self, state):
        with autocast(enabled=self.env['opt']['fp16']):
            d = self.env['discriminators'][self.opt['discriminator']]
            if isinstance(self.input, list):
                params = [state[i] for i in self.input]
                results = d(*params)
            else:
                results = d(state[self.input])

        return _results_to_state(self.output, results)


# Injects a scalar that is modulated with a specified schedule. Useful for increasing or decreasing the influence
# of something over time.
class ScheduledScalarInjector(Injector):
    def __init__(self, opt, env):
        super(ScheduledScalarInjector, self).__init__(opt, env)
        self.scheduler = get_scheduler_for_opt(opt['scheduler'])

    def forward(self, state):
        # The weight is looked up for the current training step stored in env['step'].
        return {self.opt['out']: self.scheduler.get_weight_for_step(self.env['step'])}


# Adds gaussian noise to [in], scales it to [0,[scale]] and injects into [out]
class AddNoiseInjector(Injector):
    def __init__(self, opt, env):
        super(AddNoiseInjector, self).__init__(opt, env)
        self.mode = opt['mode'] if 'mode' in opt.keys() else 'normal'

    def forward(self, state):
        # Scale can be a fixed float, or a state key (e.g. from ScheduledScalarInjector).
        scale = self.opt['scale']
        if isinstance(scale, str):
            scale = state[scale]
        if scale is None:
            scale = 1

        ref = state[self.opt['in']]
        if self.mode == 'normal':
            noise = torch.randn_like(ref) * scale
        elif self.mode == 'uniform':
            noise = torch.FloatTensor(ref.shape).uniform_(0.0, scale).to(ref.device)
        else:
            # Previously an unknown mode fell through and failed on an unbound `noise` variable.
            raise NotImplementedError
        return {self.opt['out']: ref + noise}


# Averages the channel dimension (1) of [in] and saves to [out]. Dimensions are
# kept the same, the average is simply repeated.
class GreyInjector(Injector):
    def __init__(self, opt, env):
        super(GreyInjector, self).__init__(opt, env)

    def forward(self, state):
        mean = torch.mean(state[self.opt['in']], dim=1, keepdim=True)
        mean = mean.repeat(1, 3, 1, 1)
        return {self.opt['out']: mean}


# Resizes [in] with torch.nn.functional.interpolate and injects the result into [out].
# Configure either opt['scale_factor'] or opt['size'] (an int is expanded to a square (size, size));
# opt['mode'] selects the interpolation algorithm.
class InterpolateInjector(Injector):
    def __init__(self, opt, env):
        super(InterpolateInjector, self).__init__(opt, env)
        if 'scale_factor' in opt.keys() and opt['scale_factor'] is not None:
            self.scale_factor = opt['scale_factor']
            self.size = None
        else:
            self.scale_factor = None
            size = opt['size']
            # An explicit sequence is passed through untouched so existing configs keep working.
            self.size = (size, size) if isinstance(size, int) else size

    def forward(self, state):
        # Use the values resolved in __init__: interpolate() accepts only one of size/scale_factor, and reading
        # both keys straight from opt raised KeyError whenever just one of them was configured.
        scaled = torch.nn.functional.interpolate(state[self.opt['in']], scale_factor=self.scale_factor,
                                                 size=self.size, mode=self.opt['mode'])
        return {self.opt['out']: scaled}


# Extracts four patches from the input image, each a square of 'patch_size'.
# The input images are taken from each
# of the four corners of the image. The intent of this loss is that each patch shares some part of the input, which
# can then be used in the translation invariance loss.
#
# This injector is unique in that it does not only produce the specified output label into state. Instead it produces five
# outputs for the specified label, one for each corner of the input as well as the specified output, which is the top left
# corner. See the code below to find out how this works.
#
# Another note: this injector operates differently in eval mode (e.g. when env['training']=False) - in this case, it
# simply sets all the output state variables to the input. This is so that you can feed the output of this injector
# directly into your generator in training without affecting test performance.
class ImagePatchInjector(Injector):
    def __init__(self, opt, env):
        super(ImagePatchInjector, self).__init__(opt, env)
        self.patch_size = opt['patch_size']
        # If specified, the output is resized to a square with this size after patch extraction.
        self.resize = opt['resize'] if 'resize' in opt.keys() else None

    def forward(self, state):
        im = state[self.opt['in']]
        out = self.opt['out']
        p = self.patch_size
        if self.env['training']:
            # NOTE(review): only the primary output keeps the first 3 channels; the corner outputs keep all
            # channels. Presumably intentional - confirm before changing.
            res = {out: im[:, :3, :p, :p],
                   '%s_top_left' % (out,): im[:, :, :p, :p],
                   '%s_top_right' % (out,): im[:, :, :p, -p:],
                   '%s_bottom_left' % (out,): im[:, :, -p:, :p],
                   '%s_bottom_right' % (out,): im[:, :, -p:, -p:]}
        else:
            res = {out: im,
                   '%s_top_left' % (out,): im,
                   '%s_top_right' % (out,): im,
                   '%s_bottom_left' % (out,): im,
                   '%s_bottom_right' % (out,): im}
        if self.resize is not None:
            res = {k: torch.nn.functional.interpolate(v, size=(self.resize, self.resize), mode="nearest")
                   for k, v in res.items()}
        return res


# Concatenates a list of tensors on the specified dimension.
class ConcatenateInjector(Injector):
    def __init__(self, opt, env):
        super(ConcatenateInjector, self).__init__(opt, env)
        self.dim = opt['dim']

    def forward(self, state):
        tensors = [state[i] for i in self.input]
        return {self.opt['out']: torch.cat(tensors, dim=self.dim)}


# Removes margins from an image.
# When opt['random_shift_max'] > 0, every batch element is cropped with its own random offset drawn from
# [-random_shift_max, random_shift_max]. The shift should not exceed the margin, otherwise the crop window
# leaves the image.
class MarginRemoval(Injector):
    def __init__(self, opt, env):
        super(MarginRemoval, self).__init__(opt, env)
        self.margin = opt['margin']
        self.random_shift_max = opt['random_shift_max'] if 'random_shift_max' in opt.keys() else 0

    @staticmethod
    def _stop(trailing):
        # Slice stop that drops `trailing` elements from the end. A stop of -0 would produce an empty slice,
        # so "drop nothing" is expressed as None.
        return -trailing if trailing != 0 else None

    def forward(self, state):
        img = state[self.input]
        m = self.margin
        if self.random_shift_max > 0:
            crops = []
            # This is a really shitty way of doing this. If it works at all, I should reconsider using Resample2D, for example.
            for b in range(img.shape[0]):
                shiftleft = random.randint(-self.random_shift_max, self.random_shift_max)
                shifttop = random.randint(-self.random_shift_max, self.random_shift_max)
                crops.append(img[b, :,
                                 m + shiftleft:self._stop(m - shiftleft),
                                 m + shifttop:self._stop(m - shifttop)])
            output = torch.stack(crops, dim=0)
        else:
            output = img[:, :, m:self._stop(m), m:self._stop(m)]

        return {self.opt['out']: output}


# Produces an injection which is composed of applying a single injector multiple times across a single dimension.
class ForEachInjector(Injector):
    def __init__(self, opt, env):
        super(ForEachInjector, self).__init__(opt, env)
        # The wrapped injector is built from a copy of this config, rewired to the private '_in'/'_out' keys.
        o = opt.copy()
        o['type'] = opt['subtype']
        o['in'] = '_in'
        o['out'] = '_out'
        self.injector = create_injector(o, self.env)
        self.aslist = opt['aslist'] if 'aslist' in opt.keys() else False

    def forward(self, state):
        injs = []
        st = state.copy()
        inputs = state[self.opt['in']]
        # Iterate over dimension 1, feeding one slice at a time to the wrapped injector.
        for i in range(inputs.shape[1]):
            st['_in'] = inputs[:, i]
            injs.append(self.injector(st)['_out'])
        if self.aslist:
            return {self.output: injs}
        else:
            return {self.output: torch.stack(injs, dim=1)}


# Injects a constant tensor shaped like state[opt['like']]. Only 'zeroes' is currently supported.
class ConstantInjector(Injector):
    def __init__(self, opt, env):
        super(ConstantInjector, self).__init__(opt, env)
        self.constant_type = opt['constant_type']
        self.like = opt['like']  # This injector uses this tensor to determine what batch size and device to use.

    def forward(self, state):
        like = state[self.like]
        if self.constant_type == 'zeroes':
            out = torch.zeros_like(like)
        else:
            raise NotImplementedError
        return {self.opt['out']: out}


# Splits [in] along dimension 1, assigning index i to the i-th output key.
class IndicesExtractor(Injector):
    def __init__(self, opt, env):
        super(IndicesExtractor, self).__init__(opt, env)
        self.dim = opt['dim']
        assert self.dim == 1  # Honestly not sure how to support an abstract dim here, so just add yours when needed.

    def forward(self, state):
        results = {}
        for i, o in enumerate(self.output):
            if self.dim == 1:
                results[o] = state[self.input][:, i]
        return results


# Currently a pass-through: [in] is injected into [out] unchanged (no shift is applied).
class RandomShiftInjector(Injector):
    def __init__(self, opt, env):
        super(RandomShiftInjector, self).__init__(opt, env)

    def forward(self, state):
        img = state[self.input]
        return {self.output: img}


# Rolls [in] by one position along the batch dimension (0).
class BatchRotateInjector(Injector):
    def __init__(self, opt, env):
        super(BatchRotateInjector, self).__init__(opt, env)

    def forward(self, state):
        img = state[self.input]
        return {self.output: torch.roll(img, 1, 0)}


# Injector used to work with image deltas used in diff-SR
class SrDiffsInjector(Injector):
    def __init__(self, opt, env):
        super(SrDiffsInjector, self).__init__(opt, env)
        self.mode = opt['mode']
        assert self.mode in ['recombine', 'produce_diff']
        self.lq = opt['lq']
        self.hq = opt['hq']
        if self.mode == 'produce_diff':
            self.diff_key = opt['diff']
            self.include_combined = opt['include_combined']

    def forward(self, state):
        resampled_lq = state[self.lq]
        hq = state[self.hq]
        if self.mode == 'produce_diff':
            diff = hq - resampled_lq
            if self.include_combined:
                res = torch.cat([resampled_lq, diff, hq], dim=1)
            else:
                res = torch.cat([resampled_lq, diff], dim=1)
            return {self.output: res,
                    self.diff_key: diff}
        elif self.mode == 'recombine':
            # In this mode the [hq] key holds the diff that gets added back onto [lq].
            combined = resampled_lq + hq
            return {self.output: combined}


# Combines multiple frames into a single input, either by flow-aligning every frame to the center frame ("combine")
# or by repeating a single frame opt['dim'] times along the channel dimension ("synthesize").
class MultiFrameCombiner(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.mode = opt['mode']
        self.dim = opt['dim'] if 'dim' in opt.keys() else None
        self.flow = opt['flow']
        self.in_lq_key = opt['in']
        self.in_hq_key = opt['in_hq']
        self.out_lq_key = opt['out']
        self.out_hq_key = opt['out_hq']
        from models.flownet2.networks import Resample2d
        self.resampler = Resample2d()

    def combine(self, state):
        flow = self.env['generators'][self.flow]
        lq = state[self.in_lq_key]
        hq = state[self.in_hq_key]

        b, f, c, h, w = lq.shape
        center = f // 2
        center_img = lq[:, center, :, :, :]
        imgs = [center_img]
        with torch.no_grad():
            for i in range(f):
                if i == center:
                    continue
                nimg = lq[:, i, :, :, :]
                flowfield = flow(torch.stack([center_img, nimg], dim=2).float())
                nimg = self.resampler(nimg, flowfield)
                imgs.append(nimg)
        hq_out = hq[:, center, :, :, :]
        return {self.out_lq_key: torch.cat(imgs, dim=1),
                self.out_hq_key: hq_out,
                self.out_lq_key + "_flow_sample": torch.cat(imgs, dim=0)}

    def synthesize(self, state):
        lq = state[self.in_lq_key]
        return {
            self.out_lq_key: lq.repeat(1, self.dim, 1, 1)
        }

    def forward(self, state):
        if self.mode == "synthesize":
            return self.synthesize(state)
        elif self.mode == "combine":
            return self.combine(state)
        else:
            raise NotImplementedError


# Combines data from multiple different sources and mixes them along the batch dimension. Labels are then emitted
# according to how the mixing was performed.
class MixAndLabelInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.out_labels = opt['out_labels']

    def forward(self, state):
        input_tensors = [state[i] for i in self.input]
        num_inputs = len(input_tensors)
        bs = input_tensors[0].shape[0]
        labels = torch.randint(0, num_inputs, (bs,), device=input_tensors[0].device)
        # Read the labels back once instead of indexing with a tensor for every sample, and index only the batch
        # position so inputs of any rank are supported.
        sources = labels.tolist()
        output = torch.stack([input_tensors[src][b] for b, src in enumerate(sources)], dim=0)
        return {self.out_labels: labels, self.output: output}


# Randomly performs a uniform resize & crop from a base image.
# Never resizes below input resolution or messes with the aspect ratio.
class RandomCropInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        dim_in = opt['dim_in']
        dim_out = opt['dim_out']
        scale = dim_out / dim_in
        self.operator = RandomResizedCrop(size=(dim_out, dim_out), scale=(scale, 1),
                                          ratio=(.99,1),  # An aspect ratio range is required, but .99,1 is effectively "none".
                                          resample='NEAREST')

    def forward(self, state):
        return {self.output: self.operator(state[self.input])}


# Injects a tuple of gaussian latents of shape (batch, latent_dim): two of them with probability
# opt['mix_probability'] (default .9), otherwise one. Batch size and device are taken from [in].
class Stylegan2NoiseInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.mix_prob = opt_get(opt, ['mix_probability'], .9)
        self.latent_dim = opt_get(opt, ['latent_dim'], 512)

    def make_noise(self, batch, latent_dim, n_noise, device):
        return torch.randn(n_noise, batch, latent_dim, device=device).unbind(0)

    def forward(self, state):
        i = state[self.input]
        if self.mix_prob > 0 and random.random() < self.mix_prob:
            return {self.output: self.make_noise(i.shape[0], self.latent_dim, 2, i.device)}
        else:
            return {self.output: self.make_noise(i.shape[0], self.latent_dim, 1, i.device)}


# Injects gaussian noise of shape (batch,) + opt['shape'], with batch size and device taken from [in].
class NoiseInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.shape = tuple(opt['shape'])

    def forward(self, state):
        shape = (state[self.input].shape[0],) + self.shape
        return {self.output: torch.randn(shape, device=state[self.input].device)}


# Incorporates the specified dimension into the batch dimension.
class DecomposeDimensionInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.dim = opt['dim']
        self.cutoff_dim = opt_get(opt, ['cutoff_dim'], -1)
        assert self.dim != 0  # Cannot decompose the batch dimension

    def forward(self, state):
        inp = state[self.input]
        dims = list(range(len(inp.shape)))  # Looks like [0,1,2,3]
        shape = list(inp.shape)
        del dims[self.dim]
        del shape[self.dim]

        # Compute the reverse permutation and shape arguments needed to undo this operation.
        rev_shape = [inp.shape[self.dim]] + shape.copy()
        rev_permute = list(range(len(inp.shape)))[1:]  # Looks like [1,2,3]
        rev_permute = rev_permute[:self.dim] + [0] + (rev_permute[self.dim:] if self.dim < len(rev_permute) else [])

        out = inp.permute([self.dim] + dims).reshape((-1,) + tuple(shape[1:]))
        # Optionally keep only the first cutoff_dim rows of the merged batch.
        if self.cutoff_dim > -1:
            out = out[:self.cutoff_dim]

        return {self.output: out,
                f'{self.output}_reverse_shape': rev_shape,
                f'{self.output}_reverse_permute': rev_permute}


# Undoes a decompose.
class RecomposeDimensionInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.rev_shape_key = opt['reverse_shape']
        self.rev_permute_key = opt['reverse_permute']

    def forward(self, state):
        inp = state[self.input]
        rev_shape = state[self.rev_shape_key]
        rev_permute = state[self.rev_permute_key]
        out = inp.reshape(rev_shape)
        out = out.permute(rev_permute).contiguous()
        return {self.output: out}


# Performs normalization across fixed constants.
class NormalizeInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.shift = opt['shift']
        self.scale = opt['scale']

    def forward(self, state):
        inp = state[self.input]
        out = (inp - self.shift) / self.scale
        return {self.output: out}


# Performs frequency-bin normalization for spectrograms.
class FrequencyBinNormalizeInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        # NOTE: torch.load unpickles the file, so opt['stats_file'] must only ever point at trusted data.
        self.shift, self.scale = torch.load(opt['stats_file'])
        # Reshape to (1, bins, 1) so the statistics broadcast against the input.
        self.shift = self.shift.view(1,-1,1)
        self.scale = self.scale.view(1,-1,1)

    def forward(self, state):
        inp = state[self.input]
        self.shift = self.shift.to(inp.device)
        self.scale = self.scale.to(inp.device)
        out = (inp - self.shift) / self.scale
        return {self.output: out}


# Undoes normalization across fixed constants.
class DenormalizeInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.shift = opt['shift']
        self.scale = opt['scale']

    def forward(self, state):
        inp = state[self.input]
        out = inp * self.scale + self.shift
        return {self.output: out}


# Computes a MEL spectrogram of [in] using the tacotron2 STFT implementation.
class MelSpectrogramInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        from models.tacotron2.layers import TacotronSTFT
        # These are the default tacotron values for the MEL spectrogram.
        filter_length = opt_get(opt, ['filter_length'], 1024)
        hop_length = opt_get(opt, ['hop_length'], 256)
        win_length = opt_get(opt, ['win_length'], 1024)
        n_mel_channels = opt_get(opt, ['n_mel_channels'], 80)
        mel_fmin = opt_get(opt, ['mel_fmin'], 0)
        mel_fmax = opt_get(opt, ['mel_fmax'], 8000)
        sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
        self.stft = TacotronSTFT(filter_length, hop_length, win_length, n_mel_channels, sampling_rate, mel_fmin, mel_fmax)

    def forward(self, state):
        inp = state[self.input]
        if len(inp.shape) == 3:  # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
            inp = inp.squeeze(1)
        assert len(inp.shape) == 2
        self.stft = self.stft.to(inp.device)
        return {self.output: self.stft.mel_spectrogram(inp)}


# Computes a MEL spectrogram of [in] using torchaudio.transforms.MelSpectrogram.
class TorchMelSpectrogramInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        # These are the default tacotron values for the MEL spectrogram.
        self.filter_length = opt_get(opt, ['filter_length'], 1024)
        self.hop_length = opt_get(opt, ['hop_length'], 256)
        self.win_length = opt_get(opt, ['win_length'], 1024)
        self.n_mel_channels = opt_get(opt, ['n_mel_channels'], 80)
        self.mel_fmin = opt_get(opt, ['mel_fmin'], 0)
        self.mel_fmax = opt_get(opt, ['mel_fmax'], 8000)
        self.sampling_rate = opt_get(opt, ['sampling_rate'], 22050)
        norm = opt_get(opt, ['normalize'], False)
        self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length, hop_length=self.hop_length,
                                                             win_length=self.win_length, power=2, normalized=norm,
                                                             sample_rate=self.sampling_rate, f_min=self.mel_fmin,
                                                             f_max=self.mel_fmax, n_mels=self.n_mel_channels)

    def forward(self, state):
        inp = state[self.input]
        if len(inp.shape) == 3:  # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
            inp = inp.squeeze(1)
        assert len(inp.shape) == 2
        self.mel_stft = self.mel_stft.to(inp.device)
        mel = self.mel_stft(inp)
        return {self.output: mel}


# Manual smoke test: loads a clip from a hard-coded local path and plots its MEL spectrogram.
def test_torch_mel_injector():
    a = load_audio('D:\\data\\audio\\libritts\\train-clean-100\\19\\198\\19_198_000000_000000.wav', 22050)
    inj = TorchMelSpectrogramInjector({'in': 'in', 'out': 'out'}, {})
    f = inj({'in': a.unsqueeze(0)})['out']
    plot_spectrogram(f[0])
    print('Pause')


# Crops a random window of opt['crop_size'] samples from the last dimension of a 3-dimensional [in].
# random.randint raises ValueError when the input is shorter than the crop size.
class RandomAudioCropInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.crop_sz = opt['crop_size']

    def forward(self, state):
        inp = state[self.input]
        length = inp.shape[-1]
        margin = length - self.crop_sz
        start = random.randint(0, margin)
        return {self.output: inp[:, :, start:start+self.crop_sz]}


# Resamples [in] from opt['input_sample_rate'] to opt['output_sample_rate'].
class AudioResampleInjector(Injector):
    def __init__(self, opt, env):
        super().__init__(opt, env)
        self.input_sr = opt['input_sample_rate']
        self.output_sr = opt['output_sample_rate']

    def forward(self, state):
        inp = state[self.input]
        return {self.output: torchaudio.functional.resample(inp, self.input_sr, self.output_sr)}


# Manual smoke test: resamples random audio and prints the resulting shape.
def test_audio_resample_injector():
    # Sample rates must be numeric; the output rate used to be the string '1', which resample() cannot use.
    # NOTE(review): 1 is kept as the literal value of that string - confirm the rate that was actually intended.
    inj = AudioResampleInjector({'in': 'x', 'out': 'y', 'input_sample_rate': 22050, 'output_sample_rate': 1}, None)
    print(inj({'x':torch.rand(10,1,40800)})['y'].shape)


if __name__ == '__main__':
    test_torch_mel_injector()