From 776a7abfcc42a9a6c202ec7a8503f6705165d2db Mon Sep 17 00:00:00 2001
From: James Betker
Date: Sat, 25 Dec 2021 21:20:06 -0700
Subject: [PATCH] Support torch DDP _set_static_graph

---
 codes/trainer/ExtensibleTrainer.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/codes/trainer/ExtensibleTrainer.py b/codes/trainer/ExtensibleTrainer.py
index 24e5fa72..af131394 100644
--- a/codes/trainer/ExtensibleTrainer.py
+++ b/codes/trainer/ExtensibleTrainer.py
@@ -121,6 +121,12 @@ class ExtensibleTrainer(BaseModel):
                     # Do NOT be tempted to put find_unused_parameters=True here. It will not work in the current incarnation of this trainer.
                     # Use all of your parameters in training, or delete them!
                     dnet = DistributedDataParallel(anet, device_ids=[torch.cuda.current_device()])
+                    # DDP graphs cannot be used with gradient checkpointing unless you use find_unused_parameters=True,
+                    # which does not work with this trainer (as stated above). However, if the graph is not subject
+                    # to control flow alterations, you can set this option to allow gradient checkpointing. Beware that
+                    # if you are wrong about control flow, DDP will not train all your model parameters! User beware!
+                    if opt_get(opt, ['ddp_static_graph'], False):
+                        dnet._set_static_graph()
             else:
                 dnet = DataParallel(anet, device_ids=opt['gpu_ids'])
             if self.is_train:
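
Note on usage: the new option is read via opt_get(opt, ['ddp_static_graph'], False), so it
would presumably be enabled with a top-level ddp_static_graph: true entry in the trainer
option file; that spelling is an assumption about the config, not part of the patch itself.

The following is a minimal, self-contained sketch (not code from this repository) of what
the option ultimately does: wrap a module that uses gradient checkpointing in
DistributedDataParallel and then call _set_static_graph(), mirroring the patched branch of
ExtensibleTrainer. The model, tensor sizes, and single-process "gloo" setup are placeholders
chosen so the snippet runs on CPU.

    import os
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel
    from torch.utils.checkpoint import checkpoint


    class CheckpointedNet(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.block1 = torch.nn.Linear(32, 32)
            self.block2 = torch.nn.Linear(32, 32)

        def forward(self, x):
            x = self.block1(x)
            # Recompute block2's activations during backward instead of storing them.
            return checkpoint(self.block2, x)


    def main():
        # Single-process "gloo" process group, only so DDP can be constructed in this sketch.
        os.environ.setdefault("MASTER_ADDR", "localhost")
        os.environ.setdefault("MASTER_PORT", "29500")
        dist.init_process_group("gloo", rank=0, world_size=1)

        net = DistributedDataParallel(CheckpointedNet())
        # This is what the ddp_static_graph option turns on. It is only safe when the forward
        # pass has no data-dependent control flow; if that assumption is wrong, some
        # parameters silently stop receiving gradients.
        net._set_static_graph()

        loss = net(torch.randn(4, 32)).sum()
        loss.backward()
        dist.destroy_process_group()


    if __name__ == "__main__":
        main()

The sketch avoids find_unused_parameters=True for the reason stated in the patched comment:
with a static graph, DDP learns which parameters participate after the first iteration and
can coexist with checkpointed recomputation without that flag.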