DL-Art-School/codes/trainer/steps.py

from torch.cuda.amp import GradScaler

from utils.loss_accumulator import LossAccumulator
from torch.nn import Module
import logging
from trainer.losses import create_loss
import torch
from collections import OrderedDict
from trainer.injectors import create_injector
from utils.util import recursively_detach

logger = logging.getLogger('base')


# Defines the expected API for a single training step
class ConfigurableStep(Module):

    def __init__(self, opt_step, env):
        """Build the step from its own config section and the shared trainer environment.

        opt_step: configuration mapping for this step. 'generator_outputs' is read unconditionally;
                  'min_total_loss', 'injectors' and 'losses' are optional.
        env: shared environment mapping. env['opt'] is kept as self.opt and must provide 'fp16', which
             decides whether the gradient scaler is enabled.
        """
        super().__init__()

        self.step_opt = opt_step
        self.env = env
        self.opt = env['opt']
        self.gen_outputs = opt_step['generator_outputs']
        self.loss_accumulator = LossAccumulator()
        self.optimizers = None  # Filled in by define_optimizers().
        self.scaler = GradScaler(enabled=self.opt['fp16'])
        self.grads_generated = False
        # Floor for the summed loss; do_forward_backward() discards the gradients when the total is below it.
        if 'min_total_loss' in opt_step:
            self.min_total_loss = opt_step['min_total_loss']
        else:
            self.min_total_loss = -999999999

        # Injectors are instantiated in the order they are declared in the config.
        self.injectors = []
        if 'injectors' in self.step_opt:
            seen_injectors = set()
            for inj_name, inj_opt in self.step_opt['injectors'].items():
                assert inj_name not in seen_injectors  # Repeated names are always an error case.
                seen_injectors.add(inj_name)
                self.injectors.append(create_injector(inj_opt, env))

        # Losses keep their declaration order; the per-loss weight is stored separately under the same name.
        built_losses = OrderedDict()
        self.weights = {}
        if 'losses' in self.step_opt:
            for loss_name, loss_opt in self.step_opt['losses'].items():
                assert loss_name not in self.weights  # Repeated names are always an error case.
                built_losses[loss_name] = create_loss(loss_opt, env)
                self.weights[loss_name] = loss_opt['weight']
        self.losses = built_losses

    def get_network_for_name(self, name):
        return self.env['generators'][name] if name in self.env['generators'].keys() \
                else self.env['discriminators'][name]

    # Subclasses should override this to define individual optimizers. They should all go into self.optimizers.
    #  This default implementation defines a single optimizer for all Generator parameters.
    #  Must be called after networks are initialized and wrapped.
    def define_optimizers(self):
        training = self.step_opt['training']
        training_net = self.get_network_for_name(training)
        # When only training one network, optimizer params can just embedded in the step params.
        if 'optimizer_params' not in self.step_opt.keys():
            opt_configs = [self.step_opt]
        else:
            opt_configs = [self.step_opt['optimizer_params']]
        nets = [training_net]
        training = [training]
        self.optimizers = []
        for net_name, net, opt_config in zip(training, nets, opt_configs):
            optim_params = []
            for k, v in net.named_parameters():  # can optimize for a part of the model
                # Make some inference about these parameters, which can be used by some optimizers to treat certain
                # parameters differently. For example, it is considered good practice to not do weight decay on
                # BN & bias parameters. TODO: process the module tree instead of the parameter tree to accomplish the
                # same thing, but in a more effective way.
                if k.endswith(".bias"):
                    v.is_bias = True
                if k.endswith(".weight"):
                    v.is_weight = True
                if ".bn" in k or '.batchnorm' in k or '.bnorm' in k:
                    v.is_bn = True
                if v.requires_grad:
                    optim_params.append(v)
                else:
                    if self.env['rank'] <= 0:
                        logger.warning('Params [{:s}] will not optimize.'.format(k))

            if 'optimizer' not in self.step_opt.keys() or self.step_opt['optimizer'] == 'adam':
                opt = torch.optim.Adam(optim_params, lr=opt_config['lr'],
                                       weight_decay=opt_config['weight_decay'],
                                       betas=(opt_config['beta1'], opt_config['beta2']))
            elif self.step_opt['optimizer'] == 'lars':
                from trainer.optimizers.larc import LARC
                from trainer.optimizers.sgd import SGDNoBiasMomentum
                optSGD = SGDNoBiasMomentum(optim_params, lr=opt_config['lr'], momentum=opt_config['momentum'],
                                           weight_decay=opt_config['weight_decay'])
                opt = LARC(optSGD, trust_coefficient=opt_config['lars_coefficient'])
            opt._config = opt_config  # This is a bit seedy, but we will need these configs later.
            opt._config['network'] = net_name
            self.optimizers.append(opt)

    # Returns all optimizers used in this step.
    def get_optimizers(self):
        assert self.optimizers is not None
        return self.optimizers

    # Returns optimizers which are opting in for default LR scheduling.
    def get_optimizers_with_default_scheduler(self):
        assert self.optimizers is not None
        return self.optimizers

    # Returns the names of the networks this step will train. Other networks will be frozen.
    def get_networks_trained(self):
        if isinstance(self.step_opt['training'], list):
            return self.step_opt['training']
        else:
            return [self.step_opt['training']]

    def get_training_network_name(self):
        if isinstance(self.step_opt['training'], list):
            return self.step_opt['training'][0]
        else:
            return self.step_opt['training']

    # Performs all forward and backward passes for this step given an input state. All input states are lists of
    # chunked tensors. Use grad_accum_step to dereference these steps. Should return a dict of tensors that later
    # steps might use. These tensors are automatically detached and accumulated into chunks.
    def do_forward_backward(self, state, grad_accum_step, amp_loss_id, train=True):
        """Run this step's injectors and, when training, its losses and backward pass for one chunk.

        state: mapping of name -> chunked values; element [grad_accum_step] of every value is used.
        grad_accum_step: index of the gradient-accumulation chunk to process.
        amp_loss_id: stored in env['amp_loss_id'] for losses that call backward() themselves.
        train: when True, injectors flagged 'eval' are skipped and losses/backward run; when False, injectors
               flagged 'train' are skipped and no loss is computed.

        Returns a dict holding the outputs of the injectors that ran, passed through recursively_detach().
        """
        def out_of_schedule(cfg):
            # True when the current step is outside the optional 'after' / 'before' / 'every' window of cfg.
            if 'after' in cfg.keys() and self.env['step'] < cfg['after']:
                return True
            if 'before' in cfg.keys() and self.env['step'] > cfg['before']:
                return True
            return 'every' in cfg.keys() and self.env['step'] % cfg['every'] != 0

        new_state = {}

        # Prepare a de-chunked state dict which will be used for the injectors & losses.
        local_state = {k: v[grad_accum_step] for k, v in state.items()}
        local_state['train_nets'] = str(self.get_networks_trained())

        # Some losses compute backward() internally. Accommodate this by stashing the amp_loss_id in env.
        self.env['amp_loss_id'] = amp_loss_id
        self.env['current_step_optimizers'] = self.optimizers
        self.env['training'] = train

        # Inject in any extra dependencies.
        for inj in self.injectors:
            # Don't do injections tagged with eval unless we are not in train mode.
            if train and 'eval' in inj.opt.keys() and inj.opt['eval']:
                continue
            # Likewise, don't do injections tagged with train unless we are not in eval.
            if not train and 'train' in inj.opt.keys() and inj.opt['train']:
                continue
            # Don't do injections tagged with 'after', 'before' or 'every' when we are out of spec.
            if out_of_schedule(inj.opt):
                continue
            injected = inj(local_state)
            local_state.update(injected)
            new_state.update(injected)

        if train and len(self.losses) > 0:
            # Finally, compute the losses.
            total_loss = 0
            for loss_name, loss in self.losses.items():
                # Some losses only activate after a set number of steps. For example, proto-discriminator losses can
                # be very disruptive to a generator.
                if out_of_schedule(loss.opt):
                    continue
                l = loss(self.get_network_for_name(self.step_opt['training']), local_state)
                total_loss += l * self.weights[loss_name]
                # Record metrics.
                if isinstance(l, torch.Tensor):
                    self.loss_accumulator.add_loss(loss_name, l)
                # Snapshot the extra metrics first and clear them only after all of them were recorded, so that
                # clearing can never disturb the iteration and drop metrics.
                extra_metrics = list(loss.extra_metrics())
                for n, v in extra_metrics:
                    self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)
                if extra_metrics:
                    loss.clear_metrics()

            # In some cases, the loss could not be set (e.g. all losses have 'after')
            if isinstance(total_loss, torch.Tensor):
                self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)
                reset_required = total_loss < self.min_total_loss

                # Scale the loss down by the accumulation factor.
                total_loss = total_loss / self.env['mega_batch_factor']

                # Get dem grads!
                self.scaler.scale(total_loss).backward()

                if reset_required:
                    # You might be scratching your head at this. Why would you zero grad as opposed to not doing a
                    # backwards? Because DDP uses the backward() pass as a synchronization point and there is not a good
                    # way to simply bypass backward. If you want a more efficient way to specify a min_loss, use or
                    # implement it at the loss level.
                    self.get_network_for_name(self.step_opt['training']).zero_grad()
                    self.loss_accumulator.increment_metric("%s_skipped_steps" % (self.get_training_network_name(),))

                self.grads_generated = True

        # Detach all state variables. Within the step, gradients can flow. Once these variables leave the step
        # we must release the gradients.
        new_state = recursively_detach(new_state)
        return new_state

    # Performs the optimizer step after all gradient accumulation is completed. Default implementation simply steps()
    # all self.optimizers.
    def do_step(self, step):
        if not self.grads_generated:
            return
        self.grads_generated = False
        for opt in self.optimizers:
            # Optimizers can be opted out in the early stages of training.
            after = opt._config['after'] if 'after' in opt._config.keys() else 0
            after_network = self.opt['networks'][opt._config['network']]['after'] if 'after' in self.opt['networks'][opt._config['network']].keys() else 0
            after = max(after, after_network)
            if self.env['step'] < after:
                continue
            before = opt._config['before'] if 'before' in opt._config.keys() else -1
            if before != -1 and self.env['step'] > before:
                continue
            self.scaler.step(opt)
            self.scaler.update()

    def get_metrics(self):
        return self.loss_accumulator.as_dict()
Large cleanup Removed a lot of old code that I won't be touching again. Refactored some code elements into more logical places. 2020-12-18 16:10:44 +00:00			`from torch.cuda.amp import GradScaler`
Move to torch.cuda.amp (not working) Running into OOM errors, needs diagnosing. Checkpointing here. 2020-10-22 19:58:05 +00:00
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`from utils.loss_accumulator import LossAccumulator`
			`from torch.nn import Module`
			`import logging`
More refactoring 2020-12-18 16:18:34 +00:00			`from trainer.losses import create_loss`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`import torch`
			`from collections import OrderedDict`
More refactoring 2020-12-18 16:18:34 +00:00			`from trainer.injectors import create_injector`
Recursively detach all outputs, even if they are nested in data structures 2020-09-20 03:47:34 +00:00			`from utils.util import recursively_detach`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`logger = logging.getLogger('base')`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00

ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`# Defines the expected API for a single training step`
			`class ConfigurableStep(Module):`

			`def __init__(self, opt_step, env):`
			`super(ConfigurableStep, self).__init__()`

			`self.step_opt = opt_step`
			`self.env = env`
			`self.opt = env['opt']`
			`self.gen_outputs = opt_step['generator_outputs']`
			`self.loss_accumulator = LossAccumulator()`
More ExtensibleTrainer work It runs now, just need to debug it to reach performance parity with SRGAN. Sweet. 2020-08-23 23:22:34 +00:00			`self.optimizers = None`
Move to torch.cuda.amp (not working) Running into OOM errors, needs diagnosing. Checkpointing here. 2020-10-22 19:58:05 +00:00			`self.scaler = GradScaler(enabled=self.opt['fp16'])`
Finish integration with autocast Note: autocast is broken when also using checkpoint(). Overcome this by modifying torch's checkpoint() function in place to also use autocast. 2020-10-22 20:39:19 +00:00			`self.grads_generated = False`
RRDB with latent 2020-11-05 17:04:17 +00:00			`self.min_total_loss = opt_step['min_total_loss'] if 'min_total_loss' in opt_step.keys() else -999999999`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00
			`self.injectors = []`
			`if 'injectors' in self.step_opt.keys():`
Don't let duplicate keys be used for injectors and losses 2020-09-29 22:59:44 +00:00			`injector_names = []`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`for inj_name, injector in self.step_opt['injectors'].items():`
Don't let duplicate keys be used for injectors and losses 2020-09-29 22:59:44 +00:00			`assert inj_name not in injector_names # Repeated names are always an error case.`
			`injector_names.append(inj_name)`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`self.injectors.append(create_injector(injector, env))`

			`losses = []`
			`self.weights = {}`
Enable testing in ExtensibleTrainer, fix it in SRGAN_model Also compute fea loss for this. 2020-08-31 15:41:48 +00:00			`if 'losses' in self.step_opt.keys():`
			`for loss_name, loss in self.step_opt['losses'].items():`
Don't let duplicate keys be used for injectors and losses 2020-09-29 22:59:44 +00:00			`assert loss_name not in self.weights.keys() # Repeated names are always an error case.`
Fix bugs in extensibletrainer 2020-09-29 04:09:42 +00:00			`losses.append((loss_name, create_loss(loss, env)))`
Enable testing in ExtensibleTrainer, fix it in SRGAN_model Also compute fea loss for this. 2020-08-31 15:41:48 +00:00			`self.weights[loss_name] = loss['weight']`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`self.losses = OrderedDict(losses)`

Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`def get_network_for_name(self, name):`
			`return self.env['generators'][name] if name in self.env['generators'].keys() \`
			`else self.env['discriminators'][name]`

ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`# Subclasses should override this to define individual optimizers. They should all go into self.optimizers.`
			`# This default implementation defines a single optimizer for all Generator parameters.`
More ExtensibleTrainer work It runs now, just need to debug it to reach performance parity with SRGAN. Sweet. 2020-08-23 23:22:34 +00:00			`# Must be called after networks are initialized and wrapped.`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`def define_optimizers(self):`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`training = self.step_opt['training']`
Fix DDP errors for discriminator - Don't define training_net in define_optimizers - this drops the shell and leads to problems downstream - Get rid of support for multiple training nets per opt. This was half baked and needs a better solution if needed downstream. 2020-12-07 19:50:57 +00:00			`training_net = self.get_network_for_name(training)`
			`# When only training one network, optimizer params can just embedded in the step params.`
			`if 'optimizer_params' not in self.step_opt.keys():`
			`opt_configs = [self.step_opt]`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`else:`
Fix DDP errors for discriminator - Don't define training_net in define_optimizers - this drops the shell and leads to problems downstream - Get rid of support for multiple training nets per opt. This was half baked and needs a better solution if needed downstream. 2020-12-07 19:50:57 +00:00			`opt_configs = [self.step_opt['optimizer_params']]`
			`nets = [training_net]`
			`training = [training]`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`self.optimizers = []`
More work in support of training flow networks in tandem with generators 2020-11-05 01:07:48 +00:00			`for net_name, net, opt_config in zip(training, nets, opt_configs):`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`optim_params = []`
			`for k, v in net.named_parameters(): # can optimize for a part of the model`
Add LARS optimizer & support for BYOL idiosyncrasies - Added LARS and SGD optimizer variants that support turning off certain features for BN and bias layers - Added a variant of pytorch's resnet model that supports gradient checkpointing. - Modify the trainer infrastructure to support above - Fix bug with BYOL (should have been nonfunctional) 2020-12-24 03:33:43 +00:00			`# Make some inference about these parameters, which can be used by some optimizers to treat certain`
			`# parameters differently. For example, it is considered good practice to not do weight decay on`
			`# BN & bias parameters. TODO: process the module tree instead of the parameter tree to accomplish the`
			`# same thing, but in a more effective way.`
			`if k.endswith(".bias"):`
			`v.is_bias = True`
			`if k.endswith(".weight"):`
			`v.is_weight = True`
			`if ".bn" in k or '.batchnorm' in k or '.bnorm' in k:`
			`v.is_bn = True`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`if v.requires_grad:`
			`optim_params.append(v)`
			`else:`
			`if self.env['rank'] <= 0:`
			`logger.warning('Params [{:s}] will not optimize.'.format(k))`

			`if 'optimizer' not in self.step_opt.keys() or self.step_opt['optimizer'] == 'adam':`
			`opt = torch.optim.Adam(optim_params, lr=opt_config['lr'],`
			`weight_decay=opt_config['weight_decay'],`
			`betas=(opt_config['beta1'], opt_config['beta2']))`
Add LARS optimizer & support for BYOL idiosyncrasies - Added LARS and SGD optimizer variants that support turning off certain features for BN and bias layers - Added a variant of pytorch's resnet model that supports gradient checkpointing. - Modify the trainer infrastructure to support above - Fix bug with BYOL (should have been nonfunctional) 2020-12-24 03:33:43 +00:00			`elif self.step_opt['optimizer'] == 'lars':`
			`from trainer.optimizers.larc import LARC`
			`from trainer.optimizers.sgd import SGDNoBiasMomentum`
			`optSGD = SGDNoBiasMomentum(optim_params, lr=opt_config['lr'], momentum=opt_config['momentum'],`
			`weight_decay=opt_config['weight_decay'])`
			`opt = LARC(optSGD, trust_coefficient=opt_config['lars_coefficient'])`
Add 'before' and 'after' defs to injections, steps and optimizers 2020-09-22 23:03:22 +00:00			`opt._config = opt_config # This is a bit seedy, but we will need these configs later.`
More work in support of training flow networks in tandem with generators 2020-11-05 01:07:48 +00:00			`opt._config['network'] = net_name`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`self.optimizers.append(opt)`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00
			`# Returns all optimizers used in this step.`
			`def get_optimizers(self):`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`assert self.optimizers is not None`
			`return self.optimizers`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00
			`# Returns optimizers which are opting in for default LR scheduling.`
			`def get_optimizers_with_default_scheduler(self):`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`assert self.optimizers is not None`
			`return self.optimizers`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00
			`# Returns the names of the networks this step will train. Other networks will be frozen.`
			`def get_networks_trained(self):`
Supporting infrastructure in ExtensibleTrainer to train spsr4 Need to be able to train 2 nets in one step: the backbone will be entirely separate with its own optimizer (for an extremely low LR). This functionality was already present, just not implemented correctly. 2020-09-12 04:57:06 +00:00			`if isinstance(self.step_opt['training'], list):`
			`return self.step_opt['training']`
			`else:`
			`return [self.step_opt['training']]`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00
Fix bugs in extensibletrainer 2020-09-29 04:09:42 +00:00			`def get_training_network_name(self):`
			`if isinstance(self.step_opt['training'], list):`
			`return self.step_opt['training'][0]`
			`else:`
			`return self.step_opt['training']`

ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`# Performs all forward and backward passes for this step given an input state. All input states are lists of`
			`# chunked tensors. Use grad_accum_step to dereference these steps. Should return a dict of tensors that later`
			`# steps might use. These tensors are automatically detached and accumulated into chunks.`
Support injectors that run in eval only 2020-09-05 13:59:45 +00:00			`def do_forward_backward(self, state, grad_accum_step, amp_loss_id, train=True):`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`new_state = {}`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`# Prepare a de-chunked state dict which will be used for the injectors & losses.`
			`local_state = {}`
			`for k, v in state.items():`
			`local_state[k] = v[grad_accum_step]`
			`local_state.update(new_state)`
Some convenience adjustments to ExtensibleTrainer 2020-09-18 03:05:32 +00:00			`local_state['train_nets'] = str(self.get_networks_trained())`
Extensible trainer (in progress) 2020-08-12 14:45:23 +00:00
Move to torch.cuda.amp (not working) Running into OOM errors, needs diagnosing. Checkpointing here. 2020-10-22 19:58:05 +00:00			`# Some losses compute backward() internally. Accommodate this by stashing the amp_loss_id in env.`
Tecogan implementation work 2020-09-25 22:38:23 +00:00			`self.env['amp_loss_id'] = amp_loss_id`
			`self.env['current_step_optimizers'] = self.optimizers`
Add ImagePatchInjector and TranslationalLoss 2020-09-27 03:25:32 +00:00			`self.env['training'] = train`
Tecogan implementation work 2020-09-25 22:38:23 +00:00
More adjustments to support distributed training with teco & on multi_modal_train 2020-10-28 02:58:03 +00:00			`# Inject in any extra dependencies.`
			`for inj in self.injectors:`
			`# Don't do injections tagged with eval unless we are not in train mode.`
			`if train and 'eval' in inj.opt.keys() and inj.opt['eval']:`
			`continue`
			`# Likewise, don't do injections tagged with train unless we are not in eval.`
			`if not train and 'train' in inj.opt.keys() and inj.opt['train']:`
			`continue`
			`# Don't do injections tagged with 'after' or 'before' when we are out of spec.`
			`if 'after' in inj.opt.keys() and self.env['step'] < inj.opt['after'] or \`
srflow_orig integration 2020-11-20 06:47:24 +00:00			`'before' in inj.opt.keys() and self.env['step'] > inj.opt['before'] or \`
Mods to tecogan to allow use of embeddings as input 2020-11-24 16:24:02 +00:00			`'every' in inj.opt.keys() and self.env['step'] % inj.opt['every'] != 0:`
More adjustments to support distributed training with teco & on multi_modal_train 2020-10-28 02:58:03 +00:00			`continue`
			`injected = inj(local_state)`
			`local_state.update(injected)`
			`new_state.update(injected)`

			`if train and len(self.losses) > 0:`
			`# Finally, compute the losses.`
			`total_loss = 0`
			`for loss_name, loss in self.losses.items():`
			`# Some losses only activate after a set number of steps. For example, proto-discriminator losses can`
			`# be very disruptive to a generator.`
More work on SSIM/PSNR approximators - Add a network that accommodates this style of approximator while retaining structure - Migrate to SSIM approximation - Add a tool to visualize how these approximators are working - Fix some issues that came up while doing this work 2020-11-03 15:09:58 +00:00			`if 'after' in loss.opt.keys() and loss.opt['after'] > self.env['step'] or \`
stylegan2 in ml art school! 2020-11-12 22:42:05 +00:00			`'before' in loss.opt.keys() and self.env['step'] > loss.opt['before'] or \`
			`'every' in loss.opt.keys() and self.env['step'] % loss.opt['every'] != 0:`
More mods 2020-09-09 02:36:27 +00:00			`continue`
Fix DDP errors for discriminator - Don't define training_net in define_optimizers - this drops the shell and leads to problems downstream - Get rid of support for multiple training nets per opt. This was half baked and needs a better solution if needed downstream. 2020-12-07 19:50:57 +00:00			`l = loss(self.get_network_for_name(self.step_opt['training']), local_state)`
More adjustments to support distributed training with teco & on multi_modal_train 2020-10-28 02:58:03 +00:00			`total_loss += l * self.weights[loss_name]`
			`# Record metrics.`
			`if isinstance(l, torch.Tensor):`
			`self.loss_accumulator.add_loss(loss_name, l)`
			`for n, v in loss.extra_metrics():`
			`self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)`
			`loss.clear_metrics()`

			`# In some cases, the loss could not be set (e.g. all losses have 'after')`
			`if isinstance(total_loss, torch.Tensor):`
			`self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)`
Add a min_loss that is DDP compatible 2020-10-28 21:46:59 +00:00			`reset_required = total_loss < self.min_total_loss`

More adjustments to support distributed training with teco & on multi_modal_train 2020-10-28 02:58:03 +00:00			`# Scale the loss down by the accumulation factor.`
			`total_loss = total_loss / self.env['mega_batch_factor']`

			`# Get dem grads!`
			`self.scaler.scale(total_loss).backward()`
Add a min_loss that is DDP compatible 2020-10-28 21:46:59 +00:00
			`if reset_required:`
			`# You might be scratching your head at this. Why would you zero grad as opposed to not doing a`
			`# backwards? Because DDP uses the backward() pass as a synchronization point and there is not a good`
			`# way to simply bypass backward. If you want a more efficient way to specify a min_loss, use or`
			`# implement it at the loss level.`
Fix DDP errors for discriminator - Don't define training_net in define_optimizers - this drops the shell and leads to problems downstream - Get rid of support for multiple training nets per opt. This was half baked and needs a better solution if needed downstream. 2020-12-07 19:50:57 +00:00			`self.get_network_for_name(self.step_opt['training']).zero_grad()`
Add a min_loss that is DDP compatible 2020-10-28 21:46:59 +00:00			`self.loss_accumulator.increment_metric("%s_skipped_steps" % (self.get_training_network_name(),))`

More adjustments to support distributed training with teco & on multi_modal_train 2020-10-28 02:58:03 +00:00			`self.grads_generated = True`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00
More ExtensibleTrainer work It runs now, just need to debug it to reach performance parity with SRGAN. Sweet. 2020-08-23 23:22:34 +00:00			`# Detach all state variables. Within the step, gradients can flow. Once these variables leave the step`
			`# we must release the gradients.`
Recursively detach all outputs, even if they are nested in data structures 2020-09-20 03:47:34 +00:00			`new_state = recursively_detach(new_state)`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`return new_state`

			`# Performs the optimizer step after all gradient accumulation is completed. Default implementation simply steps()`
			`# all self.optimizers.`
More work in support of training flow networks in tandem with generators 2020-11-05 01:07:48 +00:00			`def do_step(self, step):`
Finish integration with autocast Note: autocast is broken when also using checkpoint(). Overcome this by modifying torch's checkpoint() function in place to also use autocast. 2020-10-22 20:39:19 +00:00			`if not self.grads_generated:`
			`return`
			`self.grads_generated = False`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00			`for opt in self.optimizers:`
Add 'before' and 'after' defs to injections, steps and optimizers 2020-09-22 23:03:22 +00:00			`# Optimizers can be opted out in the early stages of training.`
			`after = opt._config['after'] if 'after' in opt._config.keys() else 0`
More work in support of training flow networks in tandem with generators 2020-11-05 01:07:48 +00:00			`after_network = self.opt['networks'][opt._config['network']]['after'] if 'after' in self.opt['networks'][opt._config['network']].keys() else 0`
			`after = max(after, after_network)`
Add 'before' and 'after' defs to injections, steps and optimizers 2020-09-22 23:03:22 +00:00			`if self.env['step'] < after:`
			`continue`
			`before = opt._config['before'] if 'before' in opt._config.keys() else -1`
			`if before != -1 and self.env['step'] > before:`
			`continue`
Move to torch.cuda.amp (not working) Running into OOM errors, needs diagnosing. Checkpointing here. 2020-10-22 19:58:05 +00:00			`self.scaler.step(opt)`
			`self.scaler.update()`
ExtensibleTrainer work 2020-08-22 14:24:34 +00:00
			`def get_metrics(self):`
More ExtensibleTrainer work 2020-08-22 19:08:33 +00:00			`return self.loss_accumulator.as_dict()`