From 37cab14272d214bf744c87b4e365456581765949 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 4 Mar 2023 20:53:00 +0000
Subject: [PATCH] use torchrun instead for multigpu

---
 src/train.py | 3 ---
 train.sh     | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/train.py b/src/train.py
index ef3cb0c..a4e4148 100755
--- a/src/train.py
+++ b/src/train.py
@@ -18,12 +18,9 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_vit_latent.yml', nargs='+') # ugh
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
-    parser.add_argument('--local_rank', type=int, help='Rank Number')
     args = parser.parse_args()
     args.opt = " ".join(args.opt) # absolutely disgusting
 
-    os.environ['LOCAL_RANK'] = str(args.local_rank)
-
     with open(args.opt, 'r') as file:
         opt_config = yaml.safe_load(file)
 
diff --git a/train.sh b/train.sh
index 70f2651..7459658 100755
--- a/train.sh
+++ b/train.sh
@@ -6,7 +6,7 @@ CONFIG=$2
 PORT=1234
 
 if (( $GPUS > 1 )); then
-    python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT ./src/train.py -opt "$CONFIG" --launcher=pytorch
+    torchrun --nproc_per_node=$GPUS --master_port=$PORT ./src/train.py -opt "$CONFIG" --launcher=pytorch
else
     python3 ./src/train.py -opt "$CONFIG"
fi
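
Note: torchrun assigns each worker its rank through environment variables
(LOCAL_RANK, RANK, WORLD_SIZE) rather than a --local_rank CLI argument,
which is why the argparse flag and the manual os.environ assignment can be
dropped above. A minimal sketch of how a script reads these under torchrun
follows; the set_device/init_process_group pattern is a typical usage
assumption, not code from this repo:

    # Minimal sketch (assumed pattern): under torchrun, rank info comes
    # from the environment, not from a --local_rank argparse flag.
    import os

    import torch
    import torch.distributed as dist

    # torchrun exports LOCAL_RANK, RANK, and WORLD_SIZE for every worker.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))

    if int(os.environ.get("WORLD_SIZE", 1)) > 1:
        torch.cuda.set_device(local_rank)        # bind this process to its GPU
        dist.init_process_group(backend="nccl")  # join the process group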