import logging import os import torch from apex import amp from torch.nn.parallel import DataParallel, DistributedDataParallel import torch.nn as nn import models.lr_scheduler as lr_scheduler import models.networks as networks from models.base_model import BaseModel from models.steps.steps import ConfigurableStep import torchvision.utils as utils logger = logging.getLogger('base') class ExtensibleTrainer(BaseModel): def __init__(self, opt): super(ExtensibleTrainer, self).__init__(opt) if opt['dist']: self.rank = torch.distributed.get_rank() else: self.rank = -1 # non dist training train_opt = opt['train'] self.mega_batch_factor = 1 # env is used as a global state to store things that subcomponents might need. env = {'device': self.device, 'rank': self.rank, 'opt': opt} self.netsG = {} self.netsD = {} self.netF = networks.define_F().to(self.device) # Used to compute feature loss. self.networks = [] self.visuals = {} for name, net in opt['networks'].items(): if net['type'] == 'generator': new_net = networks.define_G(net, None, opt['scale']).to(self.device) self.netsG[name] = new_net elif net['type'] == 'discriminator': new_net = networks.define_D_net(net, opt['datasets']['train']['target_size']).to(self.device) self.netsD[name] = new_net else: raise NotImplementedError("Can only handle generators and discriminators") self.networks.append(new_net) if self.is_train: self.mega_batch_factor = train_opt['mega_batch_factor'] if self.mega_batch_factor is None: self.mega_batch_factor = 1 # Initialize amp. amp_nets, amp_opts = amp.initialize(self.networks, self.optimizers, opt_level=opt['amp_opt_level'], num_losses=len(opt['steps'])) # self.networks is stored unwrapped. It should never be used for forward() or backward() passes, instead use # self.netG and self.netD for that. self.networks = amp_nets # DataParallel dnets = [] for anet in amp_nets: if opt['dist']: dnet = DistributedDataParallel(anet, device_ids=[torch.cuda.current_device()], find_unused_parameters=True) else: dnet = DataParallel(anet) if self.is_train: dnet.train() else: dnet.eval() dnets.append(dnet) # Backpush the wrapped networks into the network dicts.. found = 0 for dnet in dnets: for net_dict in [self.netsD, self.netsG]: for k, v in net_dict.items(): if v == dnet.module: net_dict[k] = dnet found += 1 assert found == len(self.networks) env['generators'] = self.netsG env['discriminators'] = self.netsD # Initialize the training steps self.steps = [] for step_name, step in opt['steps'].items(): step = ConfigurableStep(step, env) self.steps.append(step) self.optimizers.extend(step.get_optimizers()) # Find the optimizers that are using the default scheduler, then build them. def_opt = [] for s in self.steps: def_opt.extend(s.get_optimizers_with_default_scheduler()) lr_scheduler.get_scheduler_for_name(train_opt['default_lr_scheme'], def_opt, train_opt) self.print_network() # print network self.load() # load G and D if needed # Setting this to false triggers SRGAN to call the models update_model() function on the first iteration. self.updated = True def feed_data(self, data): self.lq = torch.chunk(corrupted_L, chunks=self.mega_batch_factor, dim=0) self.hq = [t.to(self.device) for t in torch.chunk(data['GT'], chunks=self.mega_batch_factor, dim=0)] input_ref = data['ref'] if 'ref' in data else data['GT'] self.ref = [t.to(self.device) for t in torch.chunk(input_ref, chunks=self.mega_batch_factor, dim=0)] def optimize_parameters(self, step): # Some models need to make parametric adjustments per-step. Do that here. for net in self.networks.values(): if hasattr(net, "update_for_step"): net.update_for_step(step, os.path.join(self.opt['path']['models'], "..")) # Iterate through the steps, performing them one at a time. self.visuals = {} state = {'lq': self.lq, 'hq': self.hq, 'ref': self.ref} for step_num, s in enumerate(self.steps): # Only set requires_grad=True for the network being trained. nets_to_train = s.get_networks_trained() for name, net in self.networks.items(): net_enabled = name in nets_to_train for p in self.netsG.parameters(): if p.dtype != torch.int64 and p.dtype != torch.bool: p.requires_grad = net_enabled else: p.requires_grad = False # Now do a forward and backward pass for each gradient accumulation step. new_states = {} for m in range(self.mega_batch_factor): ns = s.do_forward_backward(state, m, step_num) for k, v in ns.items(): if k not in new_states.keys(): new_states[k] = [v.detach()] else: new_states[k].append(v.detach()) # Push the detached new state tensors into the state map for use with the next step. for k, v in new_states.items(): # Overwriting existing state keys is not supported. assert k not in state.keys() state[k] = v # And finally perform optimization. s.do_step() # Record visual outputs for usage in debugging and testing. if 'visuals' in self.opt['train'].keys(): sample_save_path = os.path.join(self.opt['path']['models'], "..", "visual_dbg") for v in self.opt['train']['visuals']: self.visuals[v] = state[v].detach().cpu() if step % self.opt['train']['visual_debug_rate'] == 0: for i, dbgv in enumerate(self.visuals[v]): os.makedirs(os.path.join(sample_save_path, v), exist_ok=True) utils.save_image(dbgv, os.path.join(sample_save_path, v, "%05i_%02i.png" % (step, i))) # TODO: Do logging and image dumps def compute_fea_loss(self, real, fake): with torch.no_grad(): logits_real = self.netF(real) logits_fake = self.netF(fake) return nn.L1Loss().to(self.device)(logits_fake, logits_real) def test(self): for net in self.netsG.values(): net.eval() with torch.no_grad(): # Iterate through the steps, performing them one at a time. self.visuals = {} state = {'lq': self.lq, 'hq': self.hq, 'ref': self.ref} for step_num, s in enumerate(self.steps): ns = s.do_forward_backward(state, 0, step_num, backward=False) for k, v in ns.items(): state[k] = [v.detach()] self.eval_state = state for net in self.netsG.values(): net.train() # Fetches a summary of the log. def get_current_log(self, step): log = {} for s in self.steps: log.update(s.get_metrics()) # Some generators can do their own metric logging. for net in self.networks: if hasattr(net.module, "get_debug_values"): log.update(net.module.get_debug_values(step)) return log def get_current_visuals(self, need_GT=True): # Conforms to an archaic format from MMSR. return {'LQ': self.eval_state['lq'][0].float().cpu(), 'GT': self.eval_state['hq'][0].float().cpu(), 'rlt': self.eval_state[self.opt['eval']['output_state']][0].float().cpu()} def print_network(self): for net in self.networks: s, n = self.get_network_description(net) net_struc_str = '{}'.format(net.__class__.__name__) if self.rank <= 0: logger.info('Network structure: {}, with parameters: {:,d}'.format(net_struc_str, n)) logger.info(s) def load(self): for netdict in [self.netsG, self.netsD]: for name, net in netdict.items(): load_path = self.opt['path'][name] if load_path is not None: logger.info('Loading model for [%s]' % (load_path)) self.load_network(load_path, net) def save(self, iter_step): for name, net in self.networks.items(): self.save_network(net, name, iter_step)