diff --git a/codes/models/ExtensibleTrainer.py b/codes/models/ExtensibleTrainer.py
index 2a8d9336..81d2746c 100644
--- a/codes/models/ExtensibleTrainer.py
+++ b/codes/models/ExtensibleTrainer.py
@@ -108,14 +108,14 @@ class ExtensibleTrainer(BaseModel):
                                                    device_ids=[torch.cuda.current_device()],
                                                    find_unused_parameters=False)
                 else:
-                    dnet = DataParallel(anet)
+                    dnet = DataParallel(anet, device_ids=opt['gpu_ids'])
                 if self.is_train:
                     dnet.train()
                 else:
                     dnet.eval()
                 dnets.append(dnet)
             if not opt['dist']:
-                self.netF = DataParallel(self.netF)
+                self.netF = DataParallel(self.netF, device_ids=opt['gpu_ids'])
 
             # Backpush the wrapped networks into the network dicts..
             self.networks = {}
diff --git a/codes/train.py b/codes/train.py
index 14f3f56c..50f6bf14 100644
--- a/codes/train.py
+++ b/codes/train.py
@@ -284,8 +284,9 @@ if __name__ == '__main__':
     if args.launcher == 'none':  # disabled distributed training
         opt['dist'] = False
         trainer.rank = -1
+        if len(opt['gpu_ids']) == 1:
+            torch.cuda.set_device(opt['gpu_ids'][0])
         print('Disabled distributed training.')
-
     else:
         opt['dist'] = True
         init_dist('nccl')
diff --git a/codes/train2.py b/codes/train2.py
index c6db77d1..ab63c7b8 100644
--- a/codes/train2.py
+++ b/codes/train2.py
@@ -284,8 +284,9 @@ if __name__ == '__main__':
     if args.launcher == 'none':  # disabled distributed training
         opt['dist'] = False
         trainer.rank = -1
+        if len(opt['gpu_ids']) == 1:
+            torch.cuda.set_device(opt['gpu_ids'][0])
         print('Disabled distributed training.')
-
     else:
         opt['dist'] = True
         init_dist('nccl')
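
Note on the change: by default DataParallel replicates the module across every visible GPU, so the patch threads the configured opt['gpu_ids'] through to DataParallel and, in the non-distributed single-GPU case, pins the default CUDA device to the chosen id. The snippet below is a minimal standalone sketch of that behavior, not code from the patched files; opt, the toy Linear module, and the literal gpu id are hypothetical stand-ins.

    import torch
    from torch.nn import DataParallel

    opt = {'dist': False, 'gpu_ids': [1]}   # hypothetical config mirroring opt['gpu_ids']
    net = torch.nn.Linear(16, 16)           # stand-in for one of the trainer's networks

    if not opt['dist']:
        if len(opt['gpu_ids']) == 1:
            # Pin the default device so .cuda() / new CUDA tensors land on the
            # configured GPU instead of cuda:0.
            torch.cuda.set_device(opt['gpu_ids'][0])
        # Restrict replication to the configured ids; without device_ids,
        # DataParallel would use every visible GPU.
        net = DataParallel(net.cuda(), device_ids=opt['gpu_ids'])

DataParallel requires the module's parameters to live on device_ids[0], which is why the default device is set (and the module moved) before wrapping.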