Fix some distributed training snafus

2020-10-27 15:24:05 -06:00 · 2020-10-27 15:24:05 -06:00 · 2a3eec8fd7
commit 2a3eec8fd7
parent 11a9e223a6
4 changed files with 47 additions and 45 deletions
--- a/codes/models/steps/losses.py
+++ b/codes/models/steps/losses.py
@@ -202,6 +202,7 @@ class DiscriminatorGanLoss(ConfigurableLoss):
        # generators and discriminators by essentially having them skip steps while their counterparts "catch up".
        self.min_loss = opt['min_loss'] if 'min_loss' in opt.keys() else 0
        if self.min_loss != 0:
+            assert self.env['rank'] == 0   # distributed training does not support 'min_loss' - it can result in backward() desync by design.
            self.loss_rotating_buffer = torch.zeros(10, requires_grad=False)
            self.rb_ptr = 0
            self.losses_computed = 0
--- a/codes/models/steps/steps.py
+++ b/codes/models/steps/steps.py
@@ -126,6 +126,7 @@ class ConfigurableStep(Module):
        self.env['current_step_optimizers'] = self.optimizers
        self.env['training'] = train

+        with self.get_network_for_name(self.get_networks_trained()[0]).join():
            # Inject in any extra dependencies.
            for inj in self.injectors:
                # Don't do injections tagged with eval unless we are not in train mode.
@@ -159,7 +160,7 @@ class ConfigurableStep(Module):
                        self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)
                        loss.clear_metrics()

-            # In some cases, the loss could not be set (e.g. all losses have 'after'
+                # In some cases, the loss could not be set (e.g. all losses have 'after')
                if isinstance(total_loss, torch.Tensor):
                    self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)
                    # Scale the loss down by the accumulation factor.
--- a/codes/requirements.txt
+++ b/codes/requirements.txt
@@ -1,6 +1,5 @@
 numpy
 opencv-python
-lmdb
 pyyaml
 tb-nightly
 future
--- a/codes/train.py
+++ b/codes/train.py
@@ -46,7 +46,7 @@ class Trainer:

        else:
            opt['dist'] = True
-            self.init_dist()
+            self.init_dist('nccl')
            world_size = torch.distributed.get_world_size()
            self.rank = torch.distributed.get_rank()

@@ -117,11 +117,11 @@ class Trainer:
                total_iters = int(opt['train']['niter'])
                self.total_epochs = int(math.ceil(total_iters / train_size))
                if opt['dist']:
-                    train_sampler = DistIterSampler(self.train_set, world_size, self.rank, dataset_ratio)
+                    self.train_sampler = DistIterSampler(self.train_set, world_size, self.rank, dataset_ratio)
                    self.total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
                else:
-                    train_sampler = None
-                self.train_loader = create_dataloader(self.train_set, dataset_opt, opt, train_sampler)
+                    self.train_sampler = None
+                self.train_loader = create_dataloader(self.train_set, dataset_opt, opt, self.train_sampler)
                if self.rank <= 0:
                    self.logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                        len(self.train_set), train_size))
@@ -284,6 +284,7 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_prog_imgset_multifaceted_chained.yml')
    parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)
    trainer = Trainer()