From 2a3eec8fd7b0be92b5a6c5af8f3678cf1bc85047 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Tue, 27 Oct 2020 15:24:05 -0600
Subject: [PATCH] Fix some distributed training snafus

---
 codes/models/steps/losses.py |  1 +
 codes/models/steps/steps.py  | 79 ++++++++++++++++++------------------
 codes/requirements.txt       |  3 +-
 codes/train.py               |  9 ++--
 4 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/codes/models/steps/losses.py b/codes/models/steps/losses.py
index e31d5a64..830ff6b2 100644
--- a/codes/models/steps/losses.py
+++ b/codes/models/steps/losses.py
@@ -202,6 +202,7 @@ class DiscriminatorGanLoss(ConfigurableLoss):
         # generators and discriminators by essentially having them skip steps while their counterparts "catch up".
         self.min_loss = opt['min_loss'] if 'min_loss' in opt.keys() else 0
         if self.min_loss != 0:
+            assert self.env['rank'] == 0  # distributed training does not support 'min_loss' - it can result in backward() desync by design.
             self.loss_rotating_buffer = torch.zeros(10, requires_grad=False)
             self.rb_ptr = 0
             self.losses_computed = 0
diff --git a/codes/models/steps/steps.py b/codes/models/steps/steps.py
index a8e0bdb2..a6b570e2 100644
--- a/codes/models/steps/steps.py
+++ b/codes/models/steps/steps.py
@@ -126,48 +126,49 @@ class ConfigurableStep(Module):
         self.env['current_step_optimizers'] = self.optimizers
         self.env['training'] = train
 
-        # Inject in any extra dependencies.
-        for inj in self.injectors:
-            # Don't do injections tagged with eval unless we are not in train mode.
-            if train and 'eval' in inj.opt.keys() and inj.opt['eval']:
-                continue
-            # Likewise, don't do injections tagged with train unless we are not in eval.
-            if not train and 'train' in inj.opt.keys() and inj.opt['train']:
-                continue
-            # Don't do injections tagged with 'after' or 'before' when we are out of spec.
-            if 'after' in inj.opt.keys() and self.env['step'] < inj.opt['after'] or \
-               'before' in inj.opt.keys() and self.env['step'] > inj.opt['before']:
-                continue
-            injected = inj(local_state)
-            local_state.update(injected)
-            new_state.update(injected)
-
-        if train and len(self.losses) > 0:
-            # Finally, compute the losses.
-            total_loss = 0
-            for loss_name, loss in self.losses.items():
-                # Some losses only activate after a set number of steps. For example, proto-discriminator losses can
-                # be very disruptive to a generator.
-                if 'after' in loss.opt.keys() and loss.opt['after'] > self.env['step']:
+        with self.get_network_for_name(self.get_networks_trained()[0]).join():
+            # Inject in any extra dependencies.
+            for inj in self.injectors:
+                # Don't do injections tagged with eval unless we are not in train mode.
+                if train and 'eval' in inj.opt.keys() and inj.opt['eval']:
                     continue
-                l = loss(self.training_net, local_state)
-                total_loss += l * self.weights[loss_name]
-                # Record metrics.
-                if isinstance(l, torch.Tensor):
-                    self.loss_accumulator.add_loss(loss_name, l)
-                for n, v in loss.extra_metrics():
-                    self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)
-                loss.clear_metrics()
+                # Likewise, don't do injections tagged with train unless we are not in eval.
+                if not train and 'train' in inj.opt.keys() and inj.opt['train']:
+                    continue
+                # Don't do injections tagged with 'after' or 'before' when we are out of spec.
+                if 'after' in inj.opt.keys() and self.env['step'] < inj.opt['after'] or \
+                   'before' in inj.opt.keys() and self.env['step'] > inj.opt['before']:
+                    continue
+                injected = inj(local_state)
+                local_state.update(injected)
+                new_state.update(injected)
 
-            # In some cases, the loss could not be set (e.g. all losses have 'after'
-            if isinstance(total_loss, torch.Tensor):
-                self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)
-                # Scale the loss down by the accumulation factor.
-                total_loss = total_loss / self.env['mega_batch_factor']
+            if train and len(self.losses) > 0:
+                # Finally, compute the losses.
+                total_loss = 0
+                for loss_name, loss in self.losses.items():
+                    # Some losses only activate after a set number of steps. For example, proto-discriminator losses can
+                    # be very disruptive to a generator.
+                    if 'after' in loss.opt.keys() and loss.opt['after'] > self.env['step']:
+                        continue
+                    l = loss(self.training_net, local_state)
+                    total_loss += l * self.weights[loss_name]
+                    # Record metrics.
+                    if isinstance(l, torch.Tensor):
+                        self.loss_accumulator.add_loss(loss_name, l)
+                    for n, v in loss.extra_metrics():
+                        self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)
+                    loss.clear_metrics()
 
-                # Get dem grads!
-                self.scaler.scale(total_loss).backward()
-                self.grads_generated = True
+                # In some cases, the loss could not be set (e.g. all losses have 'after')
+                if isinstance(total_loss, torch.Tensor):
+                    self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)
+                    # Scale the loss down by the accumulation factor.
+                    total_loss = total_loss / self.env['mega_batch_factor']
+
+                    # Get dem grads!
+                    self.scaler.scale(total_loss).backward()
+                    self.grads_generated = True
 
         # Detach all state variables. Within the step, gradients can flow. Once these variables leave the step
         # we must release the gradients.
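Note on the steps.py hunk above: the injector and loss execution is now wrapped in the trained network's join() context, which is DistributedDataParallel's uneven-inputs context manager (the object returned by get_network_for_name() is presumably the DDP-wrapped module). It exists so that ranks which run a different number of forward/backward passes - here, because injectors and losses can be skipped via the 'eval'/'train'/'after'/'before' gates - do not desync the collectives behind backward(). A minimal sketch of the same pattern outside this repo's Configurable* machinery; the function, data, and loss below are placeholders, not names from this codebase:

    import torch
    from torch.nn.parallel import DistributedDataParallel as DDP

    def forward_backward(ddp_net: DDP, batches, optimizer):
        # Ranks may iterate a different number of times or skip a loss entirely;
        # join() shadows the missing allreduces so peers don't hang in backward().
        with ddp_net.join():
            for x, y in batches:
                optimizer.zero_grad()
                loss = torch.nn.functional.mse_loss(ddp_net(x), y)
                loss.backward()
                optimizer.step()

The losses.py assert earlier in this patch guards the same failure mode from the other direction: per its own comment, 'min_loss' can skip backward() on some ranks but not others, which is a desync by design and so is disallowed for distributed runs.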
diff --git a/codes/requirements.txt b/codes/requirements.txt
index 54c9ccfc..7bb339a4 100644
--- a/codes/requirements.txt
+++ b/codes/requirements.txt
@@ -1,6 +1,5 @@
 numpy
 opencv-python
-lmdb
 pyyaml
 tb-nightly
 future
@@ -11,4 +10,4 @@ scipy
 munch
 tqdm
 scp
-tensorboard
\ No newline at end of file
+tensorboard
diff --git a/codes/train.py b/codes/train.py
index 8a4fc14b..e4c9e884 100644
--- a/codes/train.py
+++ b/codes/train.py
@@ -46,7 +46,7 @@ class Trainer:
         else:
             opt['dist'] = True
-            self.init_dist()
+            self.init_dist('nccl')
             world_size = torch.distributed.get_world_size()
             self.rank = torch.distributed.get_rank()
@@ -117,11 +117,11 @@ class Trainer:
                 total_iters = int(opt['train']['niter'])
                 self.total_epochs = int(math.ceil(total_iters / train_size))
                 if opt['dist']:
-                    train_sampler = DistIterSampler(self.train_set, world_size, self.rank, dataset_ratio)
+                    self.train_sampler = DistIterSampler(self.train_set, world_size, self.rank, dataset_ratio)
                     self.total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
                 else:
-                    train_sampler = None
-                self.train_loader = create_dataloader(self.train_set, dataset_opt, opt, train_sampler)
+                    self.train_sampler = None
+                self.train_loader = create_dataloader(self.train_set, dataset_opt, opt, self.train_sampler)
                 if self.rank <= 0:
                     self.logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                         len(self.train_set), train_size))
@@ -284,6 +284,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_prog_imgset_multifaceted_chained.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()
     opt = option.parse(args.opt, is_train=True)
     trainer = Trainer()
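Note on the train.py changes: init_dist('nccl') pins the process-group backend to NCCL, the new --local_rank argument is the flag torch.distributed.launch passes to every worker it spawns (so the parser must accept it), and keeping the sampler on the trainer as self.train_sampler leaves it reachable from the epoch loop for distributed shuffling. A rough sketch of the plumbing these changes assume; the launch command, config path, and epoch-loop snippet are illustrative, not taken from this repo:

    # Typical launch, one process per GPU; torch.distributed.launch appends
    # --local_rank=<n> to each copy's argv, which is why train.py now parses it:
    #   python -m torch.distributed.launch --nproc_per_node=8 train.py \
    #       --launcher pytorch -opt ../options/some_config.yml

    import argparse
    import torch
    import torch.distributed as dist

    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    # Roughly what an init_dist('nccl') helper amounts to under env:// rendezvous
    # (MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE are set by the launcher).
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl')

    # With the sampler stored on the trainer, each epoch can re-seed it so ranks
    # draw different shards per epoch (assuming DistIterSampler exposes set_epoch
    # like torch.utils.data.distributed.DistributedSampler):
    #   if self.train_sampler is not None:
    #       self.train_sampler.set_epoch(epoch)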