From bba283776c99b557e7fa3d286cab8f32fcda7e93 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Thu, 23 Jul 2020 09:08:13 -0600
Subject: [PATCH] Enable find_unused_parameters for DistributedDataParallel

attention_norm has some parameters which are not used to compute grad,
which is causing failures in the distributed case.
---
 codes/models/SRGAN_model.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/codes/models/SRGAN_model.py b/codes/models/SRGAN_model.py
index a1b2d5d4..411cc13a 100644
--- a/codes/models/SRGAN_model.py
+++ b/codes/models/SRGAN_model.py
@@ -136,13 +136,15 @@ class SRGANModel(BaseModel):
 
         # DataParallel
         if opt['dist']:
-            self.netG = DistributedDataParallel(self.netG, device_ids=[torch.cuda.current_device()])
+            self.netG = DistributedDataParallel(self.netG, device_ids=[torch.cuda.current_device()],
+                                                find_unused_parameters=True)
         else:
             self.netG = DataParallel(self.netG)
         if self.is_train:
             if opt['dist']:
                 self.netD = DistributedDataParallel(self.netD,
-                                                    device_ids=[torch.cuda.current_device()])
+                                                    device_ids=[torch.cuda.current_device()],
+                                                    find_unused_parameters=True)
             else:
                 self.netD = DataParallel(self.netD)
             self.netG.train()
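
For context, a minimal runnable sketch of the failure this patch works around
(ToyNet and its submodule names are hypothetical stand-ins for SRGANModel and
attention_norm; a single-process gloo group keeps it self-contained). A
registered parameter that never contributes to the loss leaves DDP's gradient
reducer waiting for a gradient that never arrives, and
find_unused_parameters=True tells the reducer to detect and skip such
parameters each iteration:

    # Minimal sketch (hypothetical names) of the failure mode: a submodule
    # whose parameters never receive gradients stalls DDP's default reducer.
    import os
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel

    class ToyNet(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.used = torch.nn.Linear(8, 8)
            # Analogous to attention_norm: registered parameters that are
            # skipped in forward(), so they never get gradients.
            self.unused = torch.nn.Linear(8, 8)

        def forward(self, x):
            return self.used(x)  # self.unused never participates

    if __name__ == "__main__":
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")
        dist.init_process_group("gloo", rank=0, world_size=1)
        # Without find_unused_parameters=True, backward() errors because the
        # reducer expects a gradient for every registered parameter.
        net = DistributedDataParallel(ToyNet(), find_unused_parameters=True)
        net(torch.randn(4, 8)).sum().backward()
        dist.destroy_process_group()

Note that find_unused_parameters=True adds a per-iteration traversal of the
autograd graph to find the unused parameters, so it is only worth enabling
when the model genuinely has them, as attention_norm does here.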