# DL-Art-School/codes/trainer/ExtensibleTrainer.py

import copy
import logging
import os
from math import sqrt
from time import time
import torch
from torch import distributed
from torch.nn.parallel import DataParallel
import torch.nn as nn
import trainer.lr_scheduler as lr_scheduler
import trainer.networks as networks
from trainer.base_model import BaseModel
from trainer.batch_size_optimizer import create_batch_size_optimizer
from trainer.inject import create_injector
from trainer.injectors.audio_injectors import normalize_mel
from trainer.steps import ConfigurableStep
from trainer.experiments.experiments import get_experiment_for_name
import torchvision.utils as utils
from utils.loss_accumulator import LossAccumulator, InfStorageLossAccumulator
from utils.util import opt_get, denormalize
import torch_intermediary as ml
logger = logging.getLogger('base')
# State is immutable to reduce complexity. Overwriting existing state keys is not supported.
class OverwrittenStateError(Exception):
    def __init__(self, k, keys):
        super().__init__(f'Attempted to overwrite state key: {k}. The state should be considered '
                         f'immutable and keys should not be overwritten. Current keys: {keys}')
class ExtensibleTrainer(BaseModel):
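    """
    Drives training for an arbitrary collection of networks and steps described by the
    options dict: it instantiates the generators/discriminators in opt['networks'], builds a
    ConfigurableStep (with its optimizers) for each entry in opt['steps'], wraps the networks
    in DistributedDataParallel/DataParallel, and manages EMA copies, LR schedulers, gradient
    accumulation ("mega batches") and visual debugging output.

    Typical usage (illustrative sketch; the real loop lives in the training script):
        trainer = ExtensibleTrainer(opt)
        for it, batch in enumerate(dataloader):
            trainer.feed_data(batch, it)
            trainer.optimize_parameters(it)
            log = trainer.get_current_log(it)
    """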

    def __init__(self, opt, cached_networks={}):
        super(ExtensibleTrainer, self).__init__(opt)
        if opt['dist']:
            self.rank = torch.distributed.get_rank()
        else:
            self.rank = -1  # non dist training
        train_opt = opt['train']

        # env is used as a global state to store things that subcomponents might need.
        self.env = {'device': self.device,
                    'rank': self.rank,
                    'opt': opt,
                    'step': 0,
                    'dist': opt['dist']
                    }
        if opt['path']['models'] is not None:
            self.env['base_path'] = os.path.join(opt['path']['models'])

        self.mega_batch_factor = 1
        if self.is_train:
            self.mega_batch_factor = train_opt['mega_batch_factor']
            self.env['mega_batch_factor'] = self.mega_batch_factor
            self.batch_factor = self.mega_batch_factor
            self.ema_rate = opt_get(train_opt, ['ema_rate'], .999)
            # It is advantageous for large networks to do this to save an extra copy of the model weights.
            # It does come at the cost of a round trip to CPU memory at every batch.
            self.do_emas = opt_get(train_opt, ['ema_enabled'], True)
            self.ema_on_cpu = opt_get(train_opt, ['ema_on_cpu'], False)
        self.checkpointing_cache = opt['checkpointing_enabled']
        self.auto_recover = opt_get(opt, ['automatically_recover_nan_by_reverting_n_saves'], None)
        self.batch_size_optimizer = create_batch_size_optimizer(train_opt)
        self.auto_scale_grads = opt_get(opt, ['automatically_scale_grads_for_fanin'], False)
        self.auto_scale_basis = opt_get(opt, ['automatically_scale_base_layer_size'], 1024)
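
        # Illustrative sketch of the option keys this constructor consumes. Only keys that
        # actually appear in this file are shown; the network/step names below are
        # placeholders, and the authoritative schema is whatever the experiment YAML defines.
        #   dist: false
        #   checkpointing_enabled: true
        #   current_step: 0
        #   path: {models: ..., strict_load: true}
        #   networks:
        #     my_generator: {type: generator, trainable: true}
        #     my_discriminator: {type: discriminator, trainable: true, after: 1000}
        #   steps:
        #     my_gen_step: {...}
        #   train: {mega_batch_factor: 2, ema_enabled: true, ema_rate: 0.999, default_lr_scheme: ...}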

        self.netsG = {}
        self.netsD = {}
        for name, net in opt['networks'].items():
            # Trainable is a required parameter, but the default is simply true. Set it here.
            if 'trainable' not in net.keys():
                net['trainable'] = True

            if name in cached_networks.keys():
                new_net = cached_networks[name]
            else:
                new_net = None
            if net['type'] == 'generator':
                if new_net is None:
                    new_net = networks.create_model(opt, net, self.netsG).to(self.device)
                self.netsG[name] = new_net
            elif net['type'] == 'discriminator':
                if new_net is None:
                    new_net = networks.create_model(opt, net, self.netsD).to(self.device)
                self.netsD[name] = new_net
            else:
                raise NotImplementedError("Can only handle generators and discriminators")

            if not net['trainable']:
                new_net.eval()

            if net['wandb_debug'] and self.rank <= 0:
                import wandb
                wandb.watch(new_net, log='all', log_freq=3)

        # Initialize the train/eval steps
        self.step_names = []
        self.steps = []
        for step_name, step in opt['steps'].items():
            step = ConfigurableStep(step, self.env)
            self.step_names.append(step_name)  # This could be an OrderedDict, but it's a PITA to integrate with AMP below.
            self.steps.append(step)

        # step.define_optimizers() relies on the networks being placed in the env, so put them there. Even though
        # they aren't wrapped yet.
        self.env['generators'] = self.netsG
        self.env['discriminators'] = self.netsD

        # Define the optimizers from the steps
        for s in self.steps:
            s.define_optimizers()
            self.optimizers.extend(s.get_optimizers())

        if self.is_train:
            # Find the optimizers that are using the default scheduler, then build them.
            def_opt = []
            for s in self.steps:
                def_opt.extend(s.get_optimizers_with_default_scheduler())
            self.schedulers = lr_scheduler.get_scheduler_for_name(train_opt['default_lr_scheme'], def_opt, train_opt)

            # Set the starting step count for the scheduler.
            for sched in self.schedulers:
                sched.last_epoch = opt['current_step']
        else:
            self.schedulers = []

        # Wrap networks in distributed shells.
        dnets = []
        all_networks = [g for g in self.netsG.values()] + [d for d in self.netsD.values()]
        for anet in all_networks:
            has_any_trainable_params = False
            for p in anet.parameters():
                if not hasattr(p, 'DO_NOT_TRAIN'):
                    has_any_trainable_params = True
                    break
            if has_any_trainable_params and opt['dist']:
                if opt['dist_backend'] == 'apex':
                    # Use Apex to enable delay_allreduce, which is compatible with gradient checkpointing.
                    from apex.parallel import DistributedDataParallel
                    dnet = DistributedDataParallel(anet, delay_allreduce=True)
                else:
                    from torch.nn.parallel.distributed import DistributedDataParallel
                    # Do NOT be tempted to put find_unused_parameters=True here. It will not work when checkpointing is
                    # used and in a few other cases. But you can try it if you really want.
                    dnet = DistributedDataParallel(anet, device_ids=[torch.cuda.current_device()],
                                                   output_device=torch.cuda.current_device(),
                                                   find_unused_parameters=opt_get(opt, ['ddp_find_unused_parameters'], False))
                    # DDP graphs cannot be used with gradient checkpointing unless you use find_unused_parameters=True,
                    # which does not work with this trainer (as stated above). However, if the graph is not subject
                    # to control flow alterations, you can set this option to allow gradient checkpointing. Beware that
                    # if you are wrong about control flow, DDP will not train all your model parameters! User beware!
                    if opt_get(opt, ['ddp_static_graph'], False):
                        dnet._set_static_graph()
            else:
                dnet = DataParallel(anet, device_ids=[torch.cuda.current_device()])
            if self.is_train:
                dnet.train()
            else:
                dnet.eval()
            dnets.append(dnet)

        # Backpush the wrapped networks into the network dicts. Also build the EMA parameters.
        self.networks = {}
        self.emas = {}
        found = 0
        for dnet in dnets:
            for net_dict in [self.netsD, self.netsG]:
                for k, v in net_dict.items():
                    if v == dnet.module:
                        net_dict[k] = dnet
                        self.networks[k] = dnet
                        if self.is_train and self.do_emas:
                            self.emas[k] = copy.deepcopy(v)
                            if self.ema_on_cpu:
                                self.emas[k] = self.emas[k].cpu()
                        if hasattr(v, 'provide_ema'):
                            v.provide_ema(self.emas[k])
                        found += 1
        assert found == len(self.netsG) + len(self.netsD)

        # Replace the env networks with the wrapped networks
        self.env['generators'] = self.netsG
        self.env['discriminators'] = self.netsD
        self.env['emas'] = self.emas

        # self.print_network()  # print network
        self.load()  # load networks from save states as needed

        # Load experiments
        self.experiments = []
        if 'experiments' in opt.keys():
            self.experiments = [get_experiment_for_name(e) for e in opt['experiments']]

        # Setting this to false triggers SRGAN to call the model's update_model() function on the first iteration.
        self.updated = True

    def feed_data(self, data, step, need_GT=True, perform_micro_batching=True):
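        """
        Stages a batch for the next optimize_parameters()/test() call. Tensors are split
        into self.batch_factor micro-batches along dim 0 and moved onto the training device.
        Optional behaviors: sorting by a 'sort_key' entry, periodic batch-factor overrides
        ('train.mod_batch_factor*'), and 'auto_collate' trimming of tensors that have a
        matching '<key>_lengths' entry down to their longest element.
        """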
        self.env['step'] = step
        self.batch_factor = self.mega_batch_factor
        self.opt['checkpointing_enabled'] = self.checkpointing_cache
        # The batch factor can be adjusted on a period to allow known high-memory steps to fit in GPU memory.
        if 'train' in self.opt.keys() and \
                'mod_batch_factor' in self.opt['train'].keys() and \
                self.env['step'] % self.opt['train']['mod_batch_factor_every'] == 0:
            self.batch_factor = self.opt['train']['mod_batch_factor']
            if self.opt['train']['mod_batch_factor_also_disable_checkpointing']:
                self.opt['checkpointing_enabled'] = False

        self.eval_state = {}
        for o in self.optimizers:
            o.zero_grad()
        torch.cuda.empty_cache()

        sort_key = opt_get(self.opt, ['train', 'sort_key'], None)
        if sort_key is not None:
            sort_indices = torch.sort(data[sort_key], descending=True).indices
        else:
            sort_indices = None

        batch_factor = self.batch_factor if perform_micro_batching else 1
        self.dstate = {}
        for k, v in data.items():
            if sort_indices is not None:
                if isinstance(v, list):
                    v = [v[i] for i in sort_indices]
                else:
                    v = v[sort_indices]
            if isinstance(v, torch.Tensor):
                self.dstate[k] = [t.to(self.device) for t in torch.chunk(v, chunks=batch_factor, dim=0)]

        if opt_get(self.opt, ['train', 'auto_collate'], False):
            for k, v in self.dstate.items():
                if f'{k}_lengths' in self.dstate.keys():
                    for c in range(len(v)):
                        maxlen = self.dstate[f'{k}_lengths'][c].max()
                        if len(v[c].shape) == 2:
                            self.dstate[k][c] = self.dstate[k][c][:, :maxlen]
                        elif len(v[c].shape) == 3:
                            self.dstate[k][c] = self.dstate[k][c][:, :, :maxlen]
                        elif len(v[c].shape) == 4:
                            self.dstate[k][c] = self.dstate[k][c][:, :, :, :maxlen]

    def optimize_parameters(self, it, optimize=True, return_grad_norms=False):
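        """
        Runs every configured step against the staged batch: sets requires_grad only for the
        networks a step trains, does a forward/backward pass per micro-batch, optionally
        unscales/rescales gradients, and applies the optimizer step via consume_gradients().
        Returns a dict of gradient norms (populated only when return_grad_norms=True).
        """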
        grad_norms = {}

        # Some models need to make parametric adjustments per-step. Do that here.
        for net in self.networks.values():
            if hasattr(net.module, "update_for_step"):
                net.module.update_for_step(it, os.path.join(self.opt['path']['models'], ".."))

        # Iterate through the steps, performing them one at a time.
        state = self.dstate
        for step_num, step in enumerate(self.steps):
            train_step = True
            # 'every' is used to denote steps that should only occur at a certain integer factor rate. e.g. '2' occurs every 2 steps.
            # Note that the injection points for the step might still be required, so address this by setting train_step=False
            if 'every' in step.step_opt.keys() and it % step.step_opt['every'] != 0:
                train_step = False
            # Steps can opt out of early (or late) training, make sure that happens here.
            if 'after' in step.step_opt.keys() and it < step.step_opt['after'] or 'before' in step.step_opt.keys() and it > step.step_opt['before']:
                continue
            # Steps can choose to not execute if a state key is missing.
            if 'requires' in step.step_opt.keys():
                requirements_met = True
                for requirement in step.step_opt['requires']:
                    if requirement not in state.keys():
                        requirements_met = False
                if not requirements_met:
                    continue

            if train_step:
                # Only set requires_grad=True for the network being trained.
                nets_to_train = step.get_networks_trained()
                enabled = 0
                for name, net in self.networks.items():
                    net_enabled = name in nets_to_train
                    if net_enabled:
                        enabled += 1
                    # Networks can opt out of training before a certain iteration by declaring 'after' in their definition.
                    if 'after' in self.opt['networks'][name].keys() and it < self.opt['networks'][name]['after']:
                        net_enabled = False
                    for p in net.parameters():
                        do_not_train_flag = hasattr(p, "DO_NOT_TRAIN") or (hasattr(p, "DO_NOT_TRAIN_UNTIL") and it < p.DO_NOT_TRAIN_UNTIL)
                        if p.dtype != torch.int64 and p.dtype != torch.bool and not do_not_train_flag:
                            p.requires_grad = net_enabled
                        else:
                            p.requires_grad = False
                assert enabled == len(nets_to_train)

                # Update experiments
                [e.before_step(self.opt, self.step_names[step_num], self.env, nets_to_train, state) for e in self.experiments]

                for o in step.get_optimizers():
                    o.zero_grad()

            # Now do a forward and backward pass for each gradient accumulation step.
            new_states = {}
            self.batch_size_optimizer.focus(net)
            for m in range(self.batch_factor):
                ns = step.do_forward_backward(state, m, step_num, train=train_step, no_ddp_sync=(m+1 < self.batch_factor))
                # Call into post-backward hooks.
                for name, net in self.networks.items():
                    if hasattr(net.module, "after_backward"):
                        net.module.after_backward(it)

                for k, v in ns.items():
                    if k not in new_states.keys():
                        new_states[k] = [v]
                    else:
                        new_states[k].append(v)

            # Push the detached new state tensors into the state map for use with the next step.
            for k, v in new_states.items():
                if k in state.keys():
                    raise OverwrittenStateError(k, list(state.keys()))
                state[k] = v

            # (Maybe) perform a step.
            if train_step and optimize and self.batch_size_optimizer.should_step(it):
                # Unscale gradients within the step. (This is admittedly pretty messy but the API contract between step & ET is pretty much broken at this point)
                # This is needed to accurately log the grad norms.
                for opt in step.optimizers:
                    from torch.cuda.amp.grad_scaler import OptState
                    if step.scaler.is_enabled() and step.scaler._per_optimizer_states[id(opt)]["stage"] is not OptState.UNSCALED:
                        step.scaler.unscale_(opt)

                # Call into pre-step hooks.
                for name, net in self.networks.items():
                    if hasattr(net.module, "before_step"):
                        net.module.before_step(it)
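
                # Optionally rescale gradients by sqrt(auto_scale_basis / fan_in), so layers
                # that are wider than the reference width receive proportionally smaller
                # updates. Only ml.Linear and nn.Conv1d are handled below.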
                if self.auto_scale_grads:
                    asb = sqrt(self.auto_scale_basis)
                    for net in self.networks.values():
                        for mod in net.modules():
                            fan_in = -1
                            if isinstance(mod, ml.Linear):
                                fan_in = mod.weight.data.shape[1]
                            elif isinstance(mod, nn.Conv1d):
                                fan_in = mod.weight.data.shape[0]
                            elif isinstance(mod, nn.Conv2d) or isinstance(mod, nn.Conv3d):
                                assert False, "Not yet implemented!"
                            if fan_in != -1:
                                p = mod.weight
                                if hasattr(p, 'grad') and p.grad is not None:
                                    p.grad = p.grad * asb / sqrt(fan_in)

                if return_grad_norms and train_step:
                    for name in nets_to_train:
                        model = self.networks[name]
                        if hasattr(model.module, 'get_grad_norm_parameter_groups'):
                            pgroups = {f'{name}_{k}': v for k, v in model.module.get_grad_norm_parameter_groups().items()}
                        else:
                            pgroups = {f'{name}_all_parameters': list(model.parameters())}
                    for name in pgroups.keys():
                        stacked_grads = []
                        for p in pgroups[name]:
                            if hasattr(p, 'grad') and p.grad is not None:
                                stacked_grads.append(torch.norm(p.grad.detach(), 2))
                        if not stacked_grads:
                            continue
                        grad_norms[name] = torch.norm(torch.stack(stacked_grads), 2)
                        if distributed.is_available() and distributed.is_initialized():
                            # Gather the metric from all devices if in a distributed setting.
                            distributed.all_reduce(grad_norms[name], op=distributed.ReduceOp.SUM)
                            grad_norms[name] /= distributed.get_world_size()
                        grad_norms[name] = grad_norms[name].cpu()

                self.consume_gradients(state, step, it)

        # Record visual outputs for usage in debugging and testing.
        if 'visuals' in self.opt['logger'].keys() and self.rank <= 0 and it % self.opt['logger']['visual_debug_rate'] == 0:
            def fix_image(img):
                if opt_get(self.opt, ['logger', 'is_mel_spectrogram'], False):
                    if img.min() < -2:
                        img = normalize_mel(img)
                    img = img.unsqueeze(dim=1)
                if img.shape[1] > 3:
                    img = img[:, :3, :, :]
                if opt_get(self.opt, ['logger', 'reverse_n1_to_1'], False):
                    img = (img + 1) / 2
                if opt_get(self.opt, ['logger', 'reverse_imagenet_norm'], False):
                    img = denormalize(img)
                return img

            sample_save_path = os.path.join(self.opt['path']['models'], "..", "visual_dbg")
            for v in self.opt['logger']['visuals']:
                if v not in state.keys():
                    continue  # This can happen for several reasons (ex: 'after' defs), just ignore it.
                for i, dbgv in enumerate(state[v]):
                    if 'recurrent_visual_indices' in self.opt['logger'].keys() and len(dbgv.shape) == 5:
                        for rvi in self.opt['logger']['recurrent_visual_indices']:
                            rdbgv = fix_image(dbgv[:, rvi])
                            os.makedirs(os.path.join(sample_save_path, v), exist_ok=True)
                            utils.save_image(rdbgv.float(), os.path.join(sample_save_path, v, "%05i_%02i_%02i.png" % (it, rvi, i)))
                    else:
                        dbgv = fix_image(dbgv)
                        os.makedirs(os.path.join(sample_save_path, v), exist_ok=True)
                        utils.save_image(dbgv.float(), os.path.join(sample_save_path, v, "%05i_%02i.png" % (it, i)))
            # Some models have their own specific visual debug routines.
            for net_name, net in self.networks.items():
                if hasattr(net.module, "visual_dbg"):
                    model_vdbg_dir = os.path.join(sample_save_path, net_name)
                    os.makedirs(model_vdbg_dir, exist_ok=True)
                    net.module.visual_dbg(it, model_vdbg_dir)

        return grad_norms

    def consume_gradients(self, state, step, it):
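        """
        Applies the optimizer step for `step` (restoring and re-stashing optimizer state
        around it), then runs per-network after_step hooks and updates the EMA weights.
        """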
        [e.before_optimize(state) for e in self.experiments]
        self.restore_optimizers()
        step.do_step(it)
        self.stash_optimizers()

        # Call into custom step hooks as well as update EMA params.
        for name, net in self.networks.items():
            if hasattr(net.module, "after_step"):
                net.module.after_step(it)
            if self.do_emas:
                # When the EMA is on the CPU, only update every 10 steps to save processing time.
                if self.ema_on_cpu and it % 10 != 0:
                    continue
                ema_params = self.emas[name].parameters()
                net_params = net.parameters()
                for ep, np in zip(ema_params, net_params):
                    ema_rate = self.ema_rate
                    new_rate = 1 - ema_rate
                    if self.ema_on_cpu:
                        np = np.cpu()
                        ema_rate = ema_rate ** 10  # Because it only happens every 10 steps.
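                        # Renormalize so the (decayed) EMA weight and the update weight still
                        # sum to 1 after ema_rate has been raised to the 10th power.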
                        mid = (1 - (ema_rate+new_rate))/2
                        ema_rate += mid
                        new_rate += mid
                    ep.detach().mul_(ema_rate).add_(np, alpha=1 - ema_rate)
        [e.after_optimize(state) for e in self.experiments]

    def test(self):
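        """
        Evaluates the batch previously staged with feed_data(), either through the
        'eval.injectors' chain or by replaying all steps with train=False. Detached CPU
        copies of the resulting state are stored in self.eval_state; returns the
        accumulated metrics.
        """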
        for net in self.netsG.values():
            net.eval()

        accum_metrics = InfStorageLossAccumulator()
        with torch.no_grad():
            # This can happen in one of two ways: Either a 'validation injector' is provided, in which case we run that.
            # Or, we run the entire chain of steps in "train" mode and use eval.output_state.
            if 'injectors' in self.opt['eval'].keys():
                state = {}
                for inj in self.opt['eval']['injectors'].values():
                    # Need to move from mega_batch mode to batch mode (remove chunks)
                    for k, v in self.dstate.items():
                        state[k] = v[0]
                    inj = create_injector(inj, self.env)
                    state.update(inj(state))
            else:
                # Iterate through the steps, performing them one at a time.
                state = self.dstate
                for step_num, s in enumerate(self.steps):
                    ns = s.do_forward_backward(state, 0, step_num, train=False, loss_accumulator=accum_metrics)
                    for k, v in ns.items():
                        state[k] = [v]

            self.eval_state = {}
            for k, v in state.items():
                if isinstance(v, list):
                    self.eval_state[k] = [s.detach().cpu() if isinstance(s, torch.Tensor) else s for s in v]
                else:
                    self.eval_state[k] = [v.detach().cpu() if isinstance(v, torch.Tensor) else v]

        for net in self.netsG.values():
            net.train()
        return accum_metrics

    # Fetches a summary of the log.
    def get_current_log(self, step):
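        """
        Aggregates loggable values from the steps, experiments, networks, optimizers
        (learning rates) and the batch size optimizer. In distributed runs, float tensor
        metrics are averaged across all ranks before being returned.
        """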
        log = {}
        for s in self.steps:
            log.update(s.get_metrics())

        for e in self.experiments:
            log.update(e.get_log_data())

        # Some generators can do their own metric logging.
        for net_name, net in self.networks.items():
            if hasattr(net.module, "get_debug_values"):
                log.update(net.module.get_debug_values(step, net_name))

        # Log learning rate (from first param group) too.
        for o in self.optimizers:
            for pgi, pg in enumerate(o.param_groups):
                log['learning_rate_%s_%i' % (o._config['network'], pgi)] = pg['lr']

        # The batch size optimizer also outputs loggable data.
        log.update(self.batch_size_optimizer.get_statistics())

        # In distributed mode, get agreement on all single tensors.
        if distributed.is_available() and distributed.is_initialized():
            for k, v in log.items():
                if not isinstance(v, torch.Tensor):
                    continue
                if len(v.shape) != 1 or v.dtype != torch.float:
                    continue
                distributed.all_reduce(v, op=distributed.ReduceOp.SUM)
                log[k] = v / distributed.get_world_size()
        return log

    def get_current_visuals(self, need_GT=True):
        # Conforms to an archaic format from MMSR.
        res = {'rlt': self.eval_state[self.opt['eval']['output_state']][0].float().cpu()}
        if 'hq' in self.eval_state.keys():
            res['hq'] = self.eval_state['hq'][0].float().cpu()
        return res

    def print_network(self):
        for name, net in self.networks.items():
            s, n = self.get_network_description(net)
            net_struc_str = '{}'.format(net.__class__.__name__)
            if self.rank <= 0:
                logger.info('Network {} structure: {}, with parameters: {:,d}'.format(name, net_struc_str, n))
                logger.info(s)

    def load(self):
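        """
        Loads pretrained weights for every network that has a 'pretrain_model_<name>' path
        configured in opt['path'], plus the matching '_ema.pth' checkpoint when training
        with EMAs (falling back to a fresh EMA copy of the loaded model if none exists).
        """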
        for netdict in [self.netsG, self.netsD]:
            for name, net in netdict.items():
                load_path = self.opt['path']['pretrain_model_%s' % (name,)]
                if load_path is None:
                    return
                if self.rank <= 0:
                    logger.info('Loading model for [%s]' % (load_path,))
                self.load_network(load_path, net, self.opt['path']['strict_load'], opt_get(self.opt, ['path', f'pretrain_base_path_{name}']))
                load_path_ema = load_path.replace('.pth', '_ema.pth')
                if self.is_train and self.do_emas:
                    ema_model = self.emas[name]
                    if os.path.exists(load_path_ema):
                        self.load_network(load_path_ema, ema_model, self.opt['path']['strict_load'], opt_get(self.opt, ['path', f'pretrain_base_path_{name}']))
                    else:
                        print("WARNING! Unable to find EMA network! Starting a new EMA from given model parameters.")
                        self.emas[name] = copy.deepcopy(net)
                    if self.ema_on_cpu:
                        self.emas[name] = self.emas[name].cpu()
                if hasattr(net.module, 'network_loaded'):
                    net.module.network_loaded()

    def save(self, iter_step):
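        """
        Saves every trainable network for the given iteration, along with its EMA copy when
        EMAs are enabled.
        """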
        for name, net in self.networks.items():
            # Don't save non-trainable networks.
            if self.opt['networks'][name]['trainable']:
                self.save_network(net, name, iter_step)
                if self.do_emas:
                    self.save_network(self.emas[name], f'{name}_ema', iter_step)

    def force_restore_swapout(self):
        # Legacy method. Do nothing.
        pass