diff --git a/codes/data/util.py b/codes/data/util.py index 521881b8..17b1690c 100644 --- a/codes/data/util.py +++ b/codes/data/util.py @@ -15,7 +15,7 @@ import cv2 #################### ###################### get image path list ###################### -IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP'] +IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.webp', '.WEBP'] def torch2cv(tensor): diff --git a/codes/models/lightweight_gan.py b/codes/models/lightweight_gan.py new file mode 100644 index 00000000..6eebba97 --- /dev/null +++ b/codes/models/lightweight_gan.py @@ -0,0 +1,914 @@ +import math +import multiprocessing +import random +from contextlib import contextmanager, ExitStack +from functools import partial +from math import log2, floor +from pathlib import Path +from random import random + +import torch +import torch.nn.functional as F +from gsa_pytorch import GSA + +import trainer.losses as L +import torchvision +from PIL import Image +from einops import rearrange, reduce +from kornia import filter2D +from torch import nn, einsum +from torch.utils.data import Dataset +from torchvision import transforms + +from models.stylegan.stylegan2_lucidrains import gradient_penalty +from trainer.networks import register_model +from utils.util import opt_get + + +def DiffAugment(x, types=[]): + for p in types: + for f in AUGMENT_FNS[p]: + x = f(x) + return x.contiguous() + + +# """ +# Augmentation functions got images as `x` +# where `x` is tensor with this dimensions: +# 0 - count of images +# 1 - channels +# 2 - width +# 3 - height of image +# """ + +def rand_brightness(x): + x = x + (torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) - 0.5) + return x + +def rand_saturation(x): + x_mean = x.mean(dim=1, keepdim=True) + x = (x - x_mean) * (torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) * 2) + x_mean + return x + +def rand_contrast(x): + x_mean = x.mean(dim=[1, 2, 3], keepdim=True) + x = (x - x_mean) * (torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) + 0.5) + x_mean + return x + +def rand_translation(x, ratio=0.125): + shift_x, shift_y = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5) + translation_x = torch.randint(-shift_x, shift_x + 1, size=[x.size(0), 1, 1], device=x.device) + translation_y = torch.randint(-shift_y, shift_y + 1, size=[x.size(0), 1, 1], device=x.device) + grid_batch, grid_x, grid_y = torch.meshgrid( + torch.arange(x.size(0), dtype=torch.long, device=x.device), + torch.arange(x.size(2), dtype=torch.long, device=x.device), + torch.arange(x.size(3), dtype=torch.long, device=x.device), + ) + grid_x = torch.clamp(grid_x + translation_x + 1, 0, x.size(2) + 1) + grid_y = torch.clamp(grid_y + translation_y + 1, 0, x.size(3) + 1) + x_pad = F.pad(x, [1, 1, 1, 1, 0, 0, 0, 0]) + x = x_pad.permute(0, 2, 3, 1).contiguous()[grid_batch, grid_x, grid_y].permute(0, 3, 1, 2) + return x + +def rand_offset(x, ratio=1, ratio_h=1, ratio_v=1): + w, h = x.size(2), x.size(3) + + imgs = [] + for img in x.unbind(dim = 0): + max_h = int(w * ratio * ratio_h) + max_v = int(h * ratio * ratio_v) + + value_h = random.randint(0, max_h) * 2 - max_h + value_v = random.randint(0, max_v) * 2 - max_v + + if abs(value_h) > 0: + img = torch.roll(img, value_h, 2) + + if abs(value_v) > 0: + img = torch.roll(img, value_v, 1) + + imgs.append(img) + + return torch.stack(imgs) + +def rand_offset_h(x, ratio=1): + return rand_offset(x, ratio=1, ratio_h=ratio, ratio_v=0) + +def rand_offset_v(x, ratio=1): + return rand_offset(x, ratio=1, ratio_h=0, ratio_v=ratio) + +def rand_cutout(x, ratio=0.5): + cutout_size = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5) + offset_x = torch.randint(0, x.size(2) + (1 - cutout_size[0] % 2), size=[x.size(0), 1, 1], device=x.device) + offset_y = torch.randint(0, x.size(3) + (1 - cutout_size[1] % 2), size=[x.size(0), 1, 1], device=x.device) + grid_batch, grid_x, grid_y = torch.meshgrid( + torch.arange(x.size(0), dtype=torch.long, device=x.device), + torch.arange(cutout_size[0], dtype=torch.long, device=x.device), + torch.arange(cutout_size[1], dtype=torch.long, device=x.device), + ) + grid_x = torch.clamp(grid_x + offset_x - cutout_size[0] // 2, min=0, max=x.size(2) - 1) + grid_y = torch.clamp(grid_y + offset_y - cutout_size[1] // 2, min=0, max=x.size(3) - 1) + mask = torch.ones(x.size(0), x.size(2), x.size(3), dtype=x.dtype, device=x.device) + mask[grid_batch, grid_x, grid_y] = 0 + x = x * mask.unsqueeze(1) + return x + +AUGMENT_FNS = { + 'color': [rand_brightness, rand_saturation, rand_contrast], + 'offset': [rand_offset], + 'offset_h': [rand_offset_h], + 'offset_v': [rand_offset_v], + 'translation': [rand_translation], + 'cutout': [rand_cutout], +} + +# constants + +NUM_CORES = multiprocessing.cpu_count() +EXTS = ['jpg', 'jpeg', 'png'] + + +# helpers + +def exists(val): + return val is not None + + +@contextmanager +def null_context(): + yield + + +def combine_contexts(contexts): + @contextmanager + def multi_contexts(): + with ExitStack() as stack: + yield [stack.enter_context(ctx()) for ctx in contexts] + + return multi_contexts + + +def is_power_of_two(val): + return log2(val).is_integer() + + +def default(val, d): + return val if exists(val) else d + + +def set_requires_grad(model, bool): + for p in model.parameters(): + p.requires_grad = bool + + +def cycle(iterable): + while True: + for i in iterable: + yield i + + +def raise_if_nan(t): + if torch.isnan(t): + raise NanException + + +def gradient_accumulate_contexts(gradient_accumulate_every, is_ddp, ddps): + if is_ddp: + num_no_syncs = gradient_accumulate_every - 1 + head = [combine_contexts(map(lambda ddp: ddp.no_sync, ddps))] * num_no_syncs + tail = [null_context] + contexts = head + tail + else: + contexts = [null_context] * gradient_accumulate_every + + for context in contexts: + with context(): + yield + + +def hinge_loss(real, fake): + return (F.relu(1 + real) + F.relu(1 - fake)).mean() + + +def evaluate_in_chunks(max_batch_size, model, *args): + split_args = list(zip(*list(map(lambda x: x.split(max_batch_size, dim=0), args)))) + chunked_outputs = [model(*i) for i in split_args] + if len(chunked_outputs) == 1: + return chunked_outputs[0] + return torch.cat(chunked_outputs, dim=0) + + +def slerp(val, low, high): + low_norm = low / torch.norm(low, dim=1, keepdim=True) + high_norm = high / torch.norm(high, dim=1, keepdim=True) + omega = torch.acos((low_norm * high_norm).sum(1)) + so = torch.sin(omega) + res = (torch.sin((1.0 - val) * omega) / so).unsqueeze(1) * low + (torch.sin(val * omega) / so).unsqueeze(1) * high + return res + + +def safe_div(n, d): + try: + res = n / d + except ZeroDivisionError: + prefix = '' if int(n >= 0) else '-' + res = float(f'{prefix}inf') + return res + + +# helper classes + +class NanException(Exception): + pass + + +class EMA(): + def __init__(self, beta): + super().__init__() + self.beta = beta + + def update_average(self, old, new): + if not exists(old): + return new + return old * self.beta + (1 - self.beta) * new + + +class EMAWrapper(nn.Module): + def __init__(self, wrapped_module, following_module, rate=.995, steps_per_ema=10, steps_per_reset=1000, steps_after_no_reset=25000, reset=True): + super().__init__() + self.wrapped = wrapped_module + self.following = following_module + self.ema_updater = EMA(rate) + self.steps_per_ema = steps_per_ema + self.steps_per_reset = steps_per_reset + self.steps_after_no_reset = steps_after_no_reset + if reset: + self.wrapped.load_state_dict(self.following.state_dict()) + for p in self.wrapped.parameters(): + p.DO_NOT_TRAIN = True + + def reset_parameter_averaging(self): + self.wrapped.load_state_dict(self.following.state_dict()) + + def update_moving_average(self): + for current_params, ma_params in zip(self.following.parameters(), self.wrapped.parameters()): + old_weight, up_weight = ma_params.data, current_params.data + ma_params.data = self.ema_updater.update_average(old_weight, up_weight) + + for current_buffer, ma_buffer in zip(self.following.buffers(), self.wrapped.buffers()): + new_buffer_value = self.ema_updater.update_average(ma_buffer, current_buffer) + ma_buffer.copy_(new_buffer_value) + + def custom_optimizer_step(self, step): + if step % self.steps_per_ema == 0: + self.update_moving_average() + if step % self.steps_per_reset and step < self.steps_after_no_reset: + self.reset_parameter_averaging() + + def forward(self, x): + with torch.no_grad(): + return self.wrapped(x) + + +class RandomApply(nn.Module): + def __init__(self, prob, fn, fn_else=lambda x: x): + super().__init__() + self.fn = fn + self.fn_else = fn_else + self.prob = prob + + def forward(self, x): + fn = self.fn if random() < self.prob else self.fn_else + return fn(x) + + +class Rezero(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + self.g = nn.Parameter(torch.tensor(1e-3)) + + def forward(self, x): + return self.g * self.fn(x) + + +class Residual(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x): + return self.fn(x) + x + + +class SumBranches(nn.Module): + def __init__(self, branches): + super().__init__() + self.branches = nn.ModuleList(branches) + + def forward(self, x): + return sum(map(lambda fn: fn(x), self.branches)) + + +class Blur(nn.Module): + def __init__(self): + super().__init__() + f = torch.Tensor([1, 2, 1]) + self.register_buffer('f', f) + + def forward(self, x): + f = self.f + f = f[None, None, :] * f[None, :, None] + return filter2D(x, f, normalized=True) + + +# dataset + +def convert_image_to(img_type, image): + if image.mode != img_type: + return image.convert(img_type) + return image + + +class identity(object): + def __call__(self, tensor): + return tensor + + +class expand_greyscale(object): + def __init__(self, transparent): + self.transparent = transparent + + def __call__(self, tensor): + channels = tensor.shape[0] + num_target_channels = 4 if self.transparent else 3 + + if channels == num_target_channels: + return tensor + + alpha = None + if channels == 1: + color = tensor.expand(3, -1, -1) + elif channels == 2: + color = tensor[:1].expand(3, -1, -1) + alpha = tensor[1:] + else: + raise Exception(f'image with invalid number of channels given {channels}') + + if not exists(alpha) and self.transparent: + alpha = torch.ones(1, *tensor.shape[1:], device=tensor.device) + + return color if not self.transparent else torch.cat((color, alpha)) + + +def resize_to_minimum_size(min_size, image): + if max(*image.size) < min_size: + return torchvision.transforms.functional.resize(image, min_size) + return image + + +class ImageDataset(Dataset): + def __init__( + self, + folder, + image_size, + transparent=False, + greyscale=False, + aug_prob=0. + ): + super().__init__() + self.folder = folder + self.image_size = image_size + self.paths = [p for ext in EXTS for p in Path(f'{folder}').glob(f'**/*.{ext}')] + assert len(self.paths) > 0, f'No images were found in {folder} for training' + + if transparent: + num_channels = 4 + pillow_mode = 'RGBA' + expand_fn = expand_greyscale(transparent) + elif greyscale: + num_channels = 1 + pillow_mode = 'L' + expand_fn = identity() + else: + num_channels = 3 + pillow_mode = 'RGB' + expand_fn = expand_greyscale(transparent) + + convert_image_fn = partial(convert_image_to, pillow_mode) + + self.transform = transforms.Compose([ + transforms.Lambda(convert_image_fn), + transforms.Lambda(partial(resize_to_minimum_size, image_size)), + transforms.Resize(image_size), + RandomApply(aug_prob, transforms.RandomResizedCrop(image_size, scale=(0.5, 1.0), ratio=(0.98, 1.02)), + transforms.CenterCrop(image_size)), + transforms.ToTensor(), + transforms.Lambda(expand_fn) + ]) + + def __len__(self): + return len(self.paths) + + def __getitem__(self, index): + path = self.paths[index] + img = Image.open(path) + return self.transform(img) + + +# augmentations + +def random_hflip(tensor, prob): + if prob > random(): + return tensor + return torch.flip(tensor, dims=(3,)) + + +class AugWrapper(nn.Module): + def __init__(self, D, image_size, prob, types): + super().__init__() + self.D = D + self.prob = prob + self.types = types + + def forward(self, images, detach=False, **kwargs): + context = torch.no_grad if detach else null_context + + with context(): + if random() < self.prob: + images = random_hflip(images, prob=0.5) + images = DiffAugment(images, types=self.types) + + return self.D(images, **kwargs) + + +# modifiable global variables + +norm_class = nn.BatchNorm2d + + +def upsample(scale_factor=2): + return nn.Upsample(scale_factor=scale_factor) + + +# squeeze excitation classes + +# global context network +# https://arxiv.org/abs/2012.13375 +# similar to squeeze-excite, but with a simplified attention pooling and a subsequent layer norm + +class GlobalContext(nn.Module): + def __init__( + self, + *, + chan_in, + chan_out + ): + super().__init__() + self.to_k = nn.Conv2d(chan_in, 1, 1) + chan_intermediate = max(3, chan_out // 2) + + self.net = nn.Sequential( + nn.Conv2d(chan_in, chan_intermediate, 1), + nn.LeakyReLU(0.1), + nn.Conv2d(chan_intermediate, chan_out, 1), + nn.Sigmoid() + ) + + def forward(self, x): + context = self.to_k(x) + context = context.flatten(2).softmax(dim=-1) + out = einsum('b i n, b c n -> b c i', context, x.flatten(2)) + out = out.unsqueeze(-1) + return self.net(out) + + +# frequency channel attention +# https://arxiv.org/abs/2012.11879 + +def get_1d_dct(i, freq, L): + result = math.cos(math.pi * freq * (i + 0.5) / L) / math.sqrt(L) + return result * (1 if freq == 0 else math.sqrt(2)) + + +def get_dct_weights(width, channel, fidx_u, fidx_v): + dct_weights = torch.zeros(1, channel, width, width) + c_part = channel // len(fidx_u) + + for i, (u_x, v_y) in enumerate(zip(fidx_u, fidx_v)): + for x in range(width): + for y in range(width): + coor_value = get_1d_dct(x, u_x, width) * get_1d_dct(y, v_y, width) + dct_weights[:, i * c_part: (i + 1) * c_part, x, y] = coor_value + + return dct_weights + + +class FCANet(nn.Module): + def __init__( + self, + *, + chan_in, + chan_out, + reduction=4, + width + ): + super().__init__() + + freq_w, freq_h = ([0] * 8), list(range(8)) # in paper, it seems 16 frequencies was ideal + dct_weights = get_dct_weights(width, chan_in, [*freq_w, *freq_h], [*freq_h, *freq_w]) + self.register_buffer('dct_weights', dct_weights) + + chan_intermediate = max(3, chan_out // reduction) + + self.net = nn.Sequential( + nn.Conv2d(chan_in, chan_intermediate, 1), + nn.LeakyReLU(0.1), + nn.Conv2d(chan_intermediate, chan_out, 1), + nn.Sigmoid() + ) + + def forward(self, x): + x = reduce(x * self.dct_weights, 'b c (h h1) (w w1) -> b c h1 w1', 'sum', h1=1, w1=1) + return self.net(x) + + +# generative adversarial network + +class Generator(nn.Module): + def __init__( + self, + *, + image_size, + latent_dim=256, + fmap_max=512, + fmap_inverse_coef=12, + transparent=False, + greyscale=False, + freq_chan_attn=False + ): + super().__init__() + resolution = log2(image_size) + assert is_power_of_two(image_size), 'image size must be a power of 2' + + if transparent: + init_channel = 4 + elif greyscale: + init_channel = 1 + else: + init_channel = 3 + + fmap_max = default(fmap_max, latent_dim) + + self.initial_conv = nn.Sequential( + nn.ConvTranspose2d(latent_dim, latent_dim * 2, 4), + norm_class(latent_dim * 2), + nn.GLU(dim=1) + ) + + num_layers = int(resolution) - 2 + features = list(map(lambda n: (n, 2 ** (fmap_inverse_coef - n)), range(2, num_layers + 2))) + features = list(map(lambda n: (n[0], min(n[1], fmap_max)), features)) + features = list(map(lambda n: 3 if n[0] >= 8 else n[1], features)) + features = [latent_dim, *features] + + in_out_features = list(zip(features[:-1], features[1:])) + + self.res_layers = range(2, num_layers + 2) + self.layers = nn.ModuleList([]) + self.res_to_feature_map = dict(zip(self.res_layers, in_out_features)) + + self.sle_map = ((3, 7), (4, 8), (5, 9), (6, 10)) + self.sle_map = list(filter(lambda t: t[0] <= resolution and t[1] <= resolution, self.sle_map)) + self.sle_map = dict(self.sle_map) + + self.num_layers_spatial_res = 1 + + for (res, (chan_in, chan_out)) in zip(self.res_layers, in_out_features): + attn = None + sle = None + if res in self.sle_map: + residual_layer = self.sle_map[res] + sle_chan_out = self.res_to_feature_map[residual_layer - 1][-1] + + if freq_chan_attn: + sle = FCANet( + chan_in=chan_out, + chan_out=sle_chan_out, + width=2 ** (res + 1) + ) + else: + sle = GlobalContext( + chan_in=chan_out, + chan_out=sle_chan_out + ) + + layer = nn.ModuleList([ + nn.Sequential( + upsample(), + Blur(), + nn.Conv2d(chan_in, chan_out * 2, 3, padding=1), + norm_class(chan_out * 2), + nn.GLU(dim=1) + ), + sle, + attn + ]) + self.layers.append(layer) + + self.out_conv = nn.Conv2d(features[-1], init_channel, 3, padding=1) + + for m in self.modules(): + if type(m) in {nn.Conv2d, nn.Linear}: + nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = rearrange(x, 'b c -> b c () ()') + x = self.initial_conv(x) + x = F.normalize(x, dim=1) + + residuals = dict() + + for (res, (up, sle, attn)) in zip(self.res_layers, self.layers): + if exists(attn): + x = attn(x) + x + + x = up(x) + + if exists(sle): + out_res = self.sle_map[res] + residual = sle(x) + residuals[out_res] = residual + + next_res = res + 1 + if next_res in residuals: + x = x * residuals[next_res] + + return self.out_conv(x) + + +class SimpleDecoder(nn.Module): + def __init__( + self, + *, + chan_in, + chan_out=3, + num_upsamples=4, + ): + super().__init__() + + self.layers = nn.ModuleList([]) + final_chan = chan_out + chans = chan_in + + for ind in range(num_upsamples): + last_layer = ind == (num_upsamples - 1) + chan_out = chans if not last_layer else final_chan * 2 + layer = nn.Sequential( + upsample(), + nn.Conv2d(chans, chan_out, 3, padding=1), + nn.GLU(dim=1) + ) + self.layers.append(layer) + chans //= 2 + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class Discriminator(nn.Module): + def __init__( + self, + *, + image_size, + fmap_max=512, + fmap_inverse_coef=12, + transparent=False, + greyscale=False, + disc_output_size=5, + attn_res_layers=[] + ): + super().__init__() + self.image_size = image_size + resolution = log2(image_size) + assert is_power_of_two(image_size), 'image size must be a power of 2' + assert disc_output_size in {1, 5}, 'discriminator output dimensions can only be 5x5 or 1x1' + + resolution = int(resolution) + + if transparent: + init_channel = 4 + elif greyscale: + init_channel = 1 + else: + init_channel = 3 + + num_non_residual_layers = max(0, int(resolution) - 8) + num_residual_layers = 8 - 3 + + non_residual_resolutions = range(min(8, resolution), 2, -1) + features = list(map(lambda n: (n, 2 ** (fmap_inverse_coef - n)), non_residual_resolutions)) + features = list(map(lambda n: (n[0], min(n[1], fmap_max)), features)) + + if num_non_residual_layers == 0: + res, _ = features[0] + features[0] = (res, init_channel) + + chan_in_out = list(zip(features[:-1], features[1:])) + + self.non_residual_layers = nn.ModuleList([]) + for ind in range(num_non_residual_layers): + first_layer = ind == 0 + last_layer = ind == (num_non_residual_layers - 1) + chan_out = features[0][-1] if last_layer else init_channel + + self.non_residual_layers.append(nn.Sequential( + Blur(), + nn.Conv2d(init_channel, chan_out, 4, stride=2, padding=1), + nn.LeakyReLU(0.1) + )) + + self.residual_layers = nn.ModuleList([]) + + for (res, ((_, chan_in), (_, chan_out))) in zip(non_residual_resolutions, chan_in_out): + attn = None + self.residual_layers.append(nn.ModuleList([ + SumBranches([ + nn.Sequential( + Blur(), + nn.Conv2d(chan_in, chan_out, 4, stride=2, padding=1), + nn.LeakyReLU(0.1), + nn.Conv2d(chan_out, chan_out, 3, padding=1), + nn.LeakyReLU(0.1) + ), + nn.Sequential( + Blur(), + nn.AvgPool2d(2), + nn.Conv2d(chan_in, chan_out, 1), + nn.LeakyReLU(0.1), + ) + ]), + attn + ])) + + last_chan = features[-1][-1] + if disc_output_size == 5: + self.to_logits = nn.Sequential( + nn.Conv2d(last_chan, last_chan, 1), + nn.LeakyReLU(0.1), + nn.Conv2d(last_chan, 1, 4) + ) + elif disc_output_size == 1: + self.to_logits = nn.Sequential( + Blur(), + nn.Conv2d(last_chan, last_chan, 3, stride=2, padding=1), + nn.LeakyReLU(0.1), + nn.Conv2d(last_chan, 1, 4) + ) + + self.to_shape_disc_out = nn.Sequential( + nn.Conv2d(init_channel, 64, 3, padding=1), + Residual(Rezero(GSA(dim=64, norm_queries=True, batch_norm=False))), + SumBranches([ + nn.Sequential( + Blur(), + nn.Conv2d(64, 32, 4, stride=2, padding=1), + nn.LeakyReLU(0.1), + nn.Conv2d(32, 32, 3, padding=1), + nn.LeakyReLU(0.1) + ), + nn.Sequential( + Blur(), + nn.AvgPool2d(2), + nn.Conv2d(64, 32, 1), + nn.LeakyReLU(0.1), + ) + ]), + Residual(Rezero(GSA(dim=32, norm_queries=True, batch_norm=False))), + nn.AdaptiveAvgPool2d((4, 4)), + nn.Conv2d(32, 1, 4) + ) + + self.decoder1 = SimpleDecoder(chan_in=last_chan, chan_out=init_channel) + self.decoder2 = SimpleDecoder(chan_in=features[-2][-1], chan_out=init_channel) if resolution >= 9 else None + + for m in self.modules(): + if type(m) in {nn.Conv2d, nn.Linear}: + nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x, calc_aux_loss=False): + orig_img = x + + for layer in self.non_residual_layers: + x = layer(x) + + layer_outputs = [] + + for (net, attn) in self.residual_layers: + if exists(attn): + x = attn(x) + x + + x = net(x) + layer_outputs.append(x) + + out = self.to_logits(x).flatten(1) + + img_32x32 = F.interpolate(orig_img, size=(32, 32)) + out_32x32 = self.to_shape_disc_out(img_32x32) + + if not calc_aux_loss: + return out, out_32x32, None + + # self-supervised auto-encoding loss + + layer_8x8 = layer_outputs[-1] + layer_16x16 = layer_outputs[-2] + + recon_img_8x8 = self.decoder1(layer_8x8) + + aux_loss = F.mse_loss( + recon_img_8x8, + F.interpolate(orig_img, size=recon_img_8x8.shape[2:]) + ) + + if exists(self.decoder2): + select_random_quadrant = lambda rand_quadrant, img: \ + rearrange(img, 'b c (m h) (n w) -> (m n) b c h w', m=2, n=2)[rand_quadrant] + crop_image_fn = partial(select_random_quadrant, floor(random() * 4)) + img_part, layer_16x16_part = map(crop_image_fn, (orig_img, layer_16x16)) + + recon_img_16x16 = self.decoder2(layer_16x16_part) + + aux_loss_16x16 = F.mse_loss( + recon_img_16x16, + F.interpolate(img_part, size=recon_img_16x16.shape[2:]) + ) + + aux_loss = aux_loss + aux_loss_16x16 + + return out, out_32x32, aux_loss + + +class LightweightGanDivergenceLoss(L.ConfigurableLoss): + def __init__(self, opt, env): + super().__init__(opt, env) + self.real = opt['real'] + self.fake = opt['fake'] + self.discriminator = opt['discriminator'] + self.for_gen = opt['gen_loss'] + self.gp_frequency = opt['gradient_penalty_frequency'] + self.noise = opt['noise'] if 'noise' in opt.keys() else 0 + # TODO: Implement generator top-k fractional loss compensation. + + def forward(self, net, state): + real_input = state[self.real] + fake_input = state[self.fake] + if self.noise != 0: + fake_input = fake_input + torch.rand_like(fake_input) * self.noise + real_input = real_input + torch.rand_like(real_input) * self.noise + + D = self.env['discriminators'][self.discriminator] + fake, fake32, _ = D(fake_input, detach=not self.for_gen) + if self.for_gen: + return fake.mean() + fake32.mean() + else: + real_input.requires_grad_() # <-- Needed to compute gradients on the input. + real, real32, real_aux = D(real_input, calc_aux_loss=True) + divergence_loss = hinge_loss(real, fake) + hinge_loss(real32, fake32) + real_aux + + # Apply gradient penalty. TODO: migrate this elsewhere. + if self.env['step'] % self.gp_frequency == 0: + gp = gradient_penalty(real_input, real) + self.metrics.append(("gradient_penalty", gp.clone().detach())) + divergence_loss = divergence_loss + gp + + real_input.requires_grad_(requires_grad=False) + return divergence_loss + + +@register_model +def register_lightweight_gan_g(opt_net, opt, other_nets): + gen = Generator(**opt_net['kwargs']) + if opt_get(opt_net, ['ema'], False): + following = other_nets[opt_net['following']] + return EMAWrapper(gen, following, opt_net['rate']) + return gen + + +@register_model +def register_lightweight_gan_d(opt_net, opt): + d = Discriminator(**opt_net['kwargs']) + if opt_net['aug']: + return AugWrapper(d, d.image_size, opt_net['aug_prob'], opt_net['aug_types']) + return d + + +if __name__ == '__main__': + g = Generator(image_size=256) + d = Discriminator(image_size=256) + j = torch.randn(1,256) + r = g(j) + a, b, c = d(r) + print(a.shape) diff --git a/codes/requirements.txt b/codes/requirements.txt index 0e218500..03656bb3 100644 --- a/codes/requirements.txt +++ b/codes/requirements.txt @@ -16,4 +16,5 @@ kornia linear_attention_transformer vector_quantize_pytorch orjson -einops \ No newline at end of file +einops +gsa-pytorch \ No newline at end of file diff --git a/codes/train.py b/codes/train.py index 2fdfb4b3..26329b50 100644 --- a/codes/train.py +++ b/codes/train.py @@ -295,7 +295,7 @@ class Trainer: if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_cats_stylegan2_rosinality.yml') + parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_lightweight_gan_pna.yml') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() diff --git a/codes/trainer/ExtensibleTrainer.py b/codes/trainer/ExtensibleTrainer.py index e3bcb3f6..f137cf83 100644 --- a/codes/trainer/ExtensibleTrainer.py +++ b/codes/trainer/ExtensibleTrainer.py @@ -59,11 +59,11 @@ class ExtensibleTrainer(BaseModel): new_net = None if net['type'] == 'generator': if new_net is None: - new_net = networks.create_model(opt, net).to(self.device) + new_net = networks.create_model(opt, net, self.netsG).to(self.device) self.netsG[name] = new_net elif net['type'] == 'discriminator': if new_net is None: - new_net = networks.create_model(opt, net).to(self.device) + new_net = networks.create_model(opt, net, self.netsD).to(self.device) self.netsD[name] = new_net else: raise NotImplementedError("Can only handle generators and discriminators") @@ -251,12 +251,18 @@ class ExtensibleTrainer(BaseModel): # And finally perform optimization. [e.before_optimize(state) for e in self.experiments] s.do_step(step) + # Some networks have custom steps, for example EMA + for net in self.networks: + if hasattr(net, "custom_optimizer_step"): + net.custom_optimizer_step(step) [e.after_optimize(state) for e in self.experiments] # Record visual outputs for usage in debugging and testing. if 'visuals' in self.opt['logger'].keys() and self.rank <= 0 and step % self.opt['logger']['visual_debug_rate'] == 0: denorm = opt_get(self.opt, ['logger', 'denormalize'], False) - denorm_range = tuple(opt_get(self.opt, ['logger', 'denormalize_range'], None)) + denorm_range = opt_get(self.opt, ['logger', 'denormalize_range'], None) + if denorm_range: + denorm_range = tuple(denorm_range) sample_save_path = os.path.join(self.opt['path']['models'], "..", "visual_dbg") for v in self.opt['logger']['visuals']: if v not in state.keys(): diff --git a/codes/trainer/eval/fid.py b/codes/trainer/eval/fid.py index 6c662593..eb591a5f 100644 --- a/codes/trainer/eval/fid.py +++ b/codes/trainer/eval/fid.py @@ -5,12 +5,9 @@ import os.path as osp import torchvision import trainer.eval.evaluator as evaluator from pytorch_fid import fid_score - - -# Evaluate that generates uniform noise to feed into a generator, then calculates a FID score on the results. from utils.util import opt_get - +# Evaluator that generates uniform noise to feed into a generator, then calculates a FID score on the results. class StyleTransferEvaluator(evaluator.Evaluator): def __init__(self, model, opt_eval, env): super().__init__(model, opt_eval, env) @@ -23,14 +20,14 @@ class StyleTransferEvaluator(evaluator.Evaluator): self.latent_dim = opt_get(opt_eval, ['latent_dim'], 512) # Not needed if using 'imgnoise' input. def perform_eval(self): - fid_fake_path = osp.join(self.env['base_path'], "../../models", "fid", str(self.env["step"])) + fid_fake_path = osp.join(self.env['base_path'], "../", "fid", str(self.env["step"])) os.makedirs(fid_fake_path, exist_ok=True) counter = 0 for i in range(self.batches_per_eval): if self.noise_type == 'imgnoise': batch = torch.FloatTensor(self.batch_sz, 3, self.im_sz, self.im_sz).uniform_(0., 1.).to(self.env['device']) elif self.noise_type == 'stylenoise': - batch = [torch.randn(self.batch_sz, self.latent_dim).to(self.env['device'])] + batch = torch.randn(self.batch_sz, self.latent_dim).to(self.env['device']) gen = self.model(batch) if not isinstance(gen, list) and not isinstance(gen, tuple): gen = [gen] @@ -39,5 +36,6 @@ class StyleTransferEvaluator(evaluator.Evaluator): torchvision.utils.save_image(gen[b], osp.join(fid_fake_path, "%i_.png" % (counter))) counter += 1 + print("Got all images, computing fid") return {"fid": fid_score.calculate_fid_given_paths([self.fid_real_samples, fid_fake_path], self.batch_sz, True, 2048)} diff --git a/codes/trainer/injectors/base_injectors.py b/codes/trainer/injectors/base_injectors.py index 2acfca6b..7463504e 100644 --- a/codes/trainer/injectors/base_injectors.py +++ b/codes/trainer/injectors/base_injectors.py @@ -402,4 +402,14 @@ class Stylegan2NoiseInjector(Injector): if self.mix_prob > 0 and random.random() < self.mix_prob: return {self.output: self.make_noise(i.shape[0], self.latent_dim, 2, i.device)} else: - return {self.output: self.make_noise(i.shape[0], self.latent_dim, 1, i.device)} \ No newline at end of file + return {self.output: self.make_noise(i.shape[0], self.latent_dim, 1, i.device)} + + +class NoiseInjector(Injector): + def __init__(self, opt, env): + super().__init__(opt, env) + self.shape = tuple(opt['shape']) + + def forward(self, state): + shape = (state[self.input].shape[0],) + self.shape + return {self.output: torch.randn(shape, device=state[self.input].device)} diff --git a/codes/trainer/losses.py b/codes/trainer/losses.py index 37e916cf..c703bd80 100644 --- a/codes/trainer/losses.py +++ b/codes/trainer/losses.py @@ -21,6 +21,9 @@ def create_loss(opt_loss, env): elif 'style_sr_' in type: from models.styled_sr import create_stylesr_loss return create_stylesr_loss(opt_loss, env) + elif 'lightweight_gan_divergence' == type: + from models.lightweight_gan import LightweightGanDivergenceLoss + return LightweightGanDivergenceLoss(opt_loss, env) elif type == 'crossentropy': return CrossEntropy(opt_loss, env) elif type == 'pix': diff --git a/codes/trainer/networks.py b/codes/trainer/networks.py index 0ef9beb9..ed64efb8 100644 --- a/codes/trainer/networks.py +++ b/codes/trainer/networks.py @@ -3,7 +3,7 @@ import logging import pkgutil import sys from collections import OrderedDict -from inspect import isfunction, getmembers +from inspect import isfunction, getmembers, signature import torch import models.feature_arch as feature_arch @@ -56,7 +56,7 @@ class CreateModelError(Exception): f'{available}') -def create_model(opt, opt_net): +def create_model(opt, opt_net, other_nets=None): which_model = opt_net['which_model'] # For backwards compatibility. if not which_model: @@ -66,7 +66,11 @@ def create_model(opt, opt_net): registered_fns = find_registered_model_fns() if which_model not in registered_fns.keys(): raise CreateModelError(which_model, list(registered_fns.keys())) - return registered_fns[which_model](opt_net, opt) + num_params = len(signature(registered_fns[which_model]).parameters) + if num_params == 2: + return registered_fns[which_model](opt_net, opt) + else: + return registered_fns[which_model](opt_net, opt, other_nets) # Define network used for perceptual loss