forked from mrq/DL-Art-School
Revert "Get rid of CUDA_VISIBLE_DEVICES"
Setting CUDA_VISIBLE_DEVICES is actually necessary for training in distributed mode, so only do it then.
This commit is contained in:
parent
8de5a02a48
commit
0eb1f4dd67
|
@@ -298,6 +298,12 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('--local_rank', type=int, default=0)
|
parser.add_argument('--local_rank', type=int, default=0)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
opt = option.parse(args.opt, is_train=True)
|
opt = option.parse(args.opt, is_train=True)
|
||||||
|
if args.launcher != 'none':
|
||||||
|
# export CUDA_VISIBLE_DEVICES for running in distributed mode.
|
||||||
|
if 'gpu_ids' in opt.keys():
|
||||||
|
gpu_list = ','.join(str(x) for x in opt['gpu_ids'])
|
||||||
|
os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
|
||||||
|
print('export CUDA_VISIBLE_DEVICES=' + gpu_list)
|
||||||
trainer = Trainer()
|
trainer = Trainer()
|
||||||
|
|
||||||
#### distributed training settings
|
#### distributed training settings
|
||||||
|
@@ -309,7 +315,7 @@ if __name__ == '__main__':
|
||||||
print('Disabled distributed training.')
|
print('Disabled distributed training.')
|
||||||
else:
|
else:
|
||||||
opt['dist'] = True
|
opt['dist'] = True
|
||||||
init_dist('nccl')
|
init_dist('nccl', opt)
|
||||||
trainer.world_size = torch.distributed.get_world_size()
|
trainer.world_size = torch.distributed.get_world_size()
|
||||||
trainer.rank = torch.distributed.get_rank()
|
trainer.rank = torch.distributed.get_rank()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user