Fix distributed launch for large distributed runs

James Betker 2020-08-25 15:42:59 -06:00
parent 03eb29a4d9
commit 19487d9bbd


@@ -32,7 +32,7 @@ def init_dist(backend='nccl', **kwargs):
 def main():
     #### options
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_imgset_spsr_switched2_fullimgref.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_imgset_spsr_switched2_xlbatch_ragan.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none',
                         help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
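
For context, passing --launcher pytorch routes through init_dist (the function named in the hunk header above), which in MMSR-derived training scripts like this one reads the rank that torch.distributed.launch exports and pins each worker to one GPU. A minimal sketch of that function, assuming the usual MMSR body rather than this repo's exact code:

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def init_dist(backend='nccl', **kwargs):
    # Sketch of the MMSR-style initializer; treat the body as an
    # assumption about this repo, not verbatim code.
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    rank = int(os.environ['RANK'])  # exported by torch.distributed.launch
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)  # pin this process to one GPU
    dist.init_process_group(backend=backend, **kwargs)

A multi-GPU run would then be started along the lines of:

python -m torch.distributed.launch --nproc_per_node=8 train.py --launcher pytorch -opt path/to/options.yml

torch.distributed.launch sets RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT in each worker's environment and passes --local_rank on the command line, which is why the parser above accepts that argument.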
@@ -121,7 +121,7 @@ def main():
     # torch.backends.cudnn.deterministic = True
     #### create train and val dataloader
-    dataset_ratio = 200  # enlarge the size of each epoch
+    dataset_ratio = 1  # enlarge the size of each epoch
     for phase, dataset_opt in opt['datasets'].items():
         if phase == 'train':
             train_set = create_dataset(dataset_opt)
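
The dataset_ratio change is the substance of the fix for large runs: in MMSR-style train scripts this ratio feeds a distributed sampler that repeats the dataset, making each epoch ratio times longer. With many workers, a ratio of 200 stretches an epoch far past anything useful, so it drops to 1. A sketch of that mechanism, following the DistIterSampler from the MMSR lineage this script matches (the exact body is an assumption, not verbatim from this repo):

import math
import torch
from torch.utils.data.sampler import Sampler

class DistIterSampler(Sampler):
    # Splits shuffled indices across ranks and enlarges an epoch by
    # `ratio`; a sketch after MMSR, not verbatim from this repo.
    def __init__(self, dataset, num_replicas, rank, ratio=1):
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.num_samples = int(math.ceil(len(dataset) * ratio / num_replicas))
        self.total_size = self.num_samples * num_replicas

    def __iter__(self):
        g = torch.Generator()
        g.manual_seed(self.epoch)  # same shuffle on every rank
        indices = torch.randperm(self.total_size, generator=g).tolist()
        indices = [i % len(self.dataset) for i in indices]  # wrap into dataset range
        return iter(indices[self.rank:self.total_size:self.num_replicas])

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch

Downstream, the training loop would consume it roughly as:

train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio)
total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))

so with dataset_ratio = 1 each rank sees its share of the dataset exactly once per epoch, instead of 200 passes' worth.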