From db9e9e28a0ec187614fa22b46b20b9d402ad1349 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Thu, 12 Nov 2020 15:43:01 -0700
Subject: [PATCH] Fix an issue where GPU0 was always being used in non-ddp

Frankly, I don't understand how this has ever worked. WTF.
---
 codes/models/ExtensibleTrainer.py | 4 ++--
 codes/train.py                    | 3 ++-
 codes/train2.py                   | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/codes/models/ExtensibleTrainer.py b/codes/models/ExtensibleTrainer.py
index 2a8d9336..81d2746c 100644
--- a/codes/models/ExtensibleTrainer.py
+++ b/codes/models/ExtensibleTrainer.py
@@ -108,14 +108,14 @@ class ExtensibleTrainer(BaseModel):
                                                device_ids=[torch.cuda.current_device()],
                                                find_unused_parameters=False)
             else:
-                dnet = DataParallel(anet)
+                dnet = DataParallel(anet, device_ids=opt['gpu_ids'])
             if self.is_train:
                 dnet.train()
             else:
                 dnet.eval()
             dnets.append(dnet)
         if not opt['dist']:
-            self.netF = DataParallel(self.netF)
+            self.netF = DataParallel(self.netF, device_ids=opt['gpu_ids'])

         # Backpush the wrapped networks into the network dicts..
         self.networks = {}
diff --git a/codes/train.py b/codes/train.py
index 14f3f56c..50f6bf14 100644
--- a/codes/train.py
+++ b/codes/train.py
@@ -284,8 +284,9 @@ if __name__ == '__main__':
     if args.launcher == 'none':  # disabled distributed training
         opt['dist'] = False
         trainer.rank = -1
+        if len(opt['gpu_ids']) == 1:
+            torch.cuda.set_device(opt['gpu_ids'][0])
         print('Disabled distributed training.')
-
     else:
         opt['dist'] = True
         init_dist('nccl')
diff --git a/codes/train2.py b/codes/train2.py
index c6db77d1..ab63c7b8 100644
--- a/codes/train2.py
+++ b/codes/train2.py
@@ -284,8 +284,9 @@ if __name__ == '__main__':
     if args.launcher == 'none':  # disabled distributed training
         opt['dist'] = False
         trainer.rank = -1
+        if len(opt['gpu_ids']) == 1:
+            torch.cuda.set_device(opt['gpu_ids'][0])
         print('Disabled distributed training.')
-
     else:
         opt['dist'] = True
         init_dist('nccl')
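
For context, here is a minimal sketch (not part of the patch; it assumes a machine with at least two GPUs and a hypothetical gpu_ids value standing in for opt['gpu_ids']) of the behavior the change addresses: torch.nn.DataParallel with no device_ids argument defaults to every visible GPU and places its gather/scatter work on GPU 0, so training could touch GPU 0 even when the config selected a different device. Passing device_ids, and pinning the default device with torch.cuda.set_device in the single-GPU non-distributed case, keeps all work on the requested GPU.

    import torch
    import torch.nn as nn

    # Hypothetical config mirroring opt['gpu_ids']; assumes GPU 1 exists.
    gpu_ids = [1]

    # Without device_ids, DataParallel wraps all visible GPUs and uses GPU 0
    # as the output/scatter device. Restricting it keeps everything on GPU 1.
    model = nn.DataParallel(nn.Linear(16, 16).cuda(gpu_ids[0]), device_ids=gpu_ids)

    # Single-GPU, non-distributed case: pin the default CUDA device so that
    # bare .cuda() calls elsewhere in the trainer also land on the chosen GPU.
    if len(gpu_ids) == 1:
        torch.cuda.set_device(gpu_ids[0])

    x = torch.randn(8, 16).cuda()   # allocated on gpu_ids[0] thanks to set_device
    y = model(x)                    # replicas run only on the devices in gpu_ids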