Enable find_unused_parameters for DistributedDataParallel

attention_norm has some parameters that are not used when computing gradients, which causes failures when training with DistributedDataParallel.
2020-07-23 09:08:13 -06:00 · 2020-07-23 09:08:13 -06:00 · bba283776c
commit bba283776c
parent dbf6147504
1 changed file with 4 additions and 2 deletions
--- a/codes/models/SRGAN_model.py
+++ b/codes/models/SRGAN_model.py
@@ -136,13 +136,15 @@ class SRGANModel(BaseModel):

            # DataParallel
            if opt['dist']:
-                self.netG = DistributedDataParallel(self.netG, device_ids=[torch.cuda.current_device()])
+                self.netG = DistributedDataParallel(self.netG, device_ids=[torch.cuda.current_device()],
+                                                    find_unused_parameters=True)
            else:
                self.netG = DataParallel(self.netG)
            if self.is_train:
                if opt['dist']:
                    self.netD = DistributedDataParallel(self.netD,
-                                                        device_ids=[torch.cuda.current_device()])
+                                                        device_ids=[torch.cuda.current_device()],
+                                                        find_unused_parameters=True)
                else:
                    self.netD = DataParallel(self.netD)
                self.netG.train()