From 2a9a25e6e738a3ed8a4808598874ebfafd2020d8 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Sat, 8 Jan 2022 18:24:58 -0700
Subject: [PATCH] Fix likely defective nan grad recovery

---
 codes/trainer/steps.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/codes/trainer/steps.py b/codes/trainer/steps.py
index f399beea..3f210edd 100644
--- a/codes/trainer/steps.py
+++ b/codes/trainer/steps.py
@@ -321,6 +321,10 @@ class ConfigurableStep(Module):
         if not nan_found:
             self.scaler.step(opt)
             self.scaler.update()
+        else:
+            for pg in opt.param_groups:
+                for p in pg['params']:
+                    p.grad = 0
 
     def get_metrics(self):
         return self.loss_accumulator.as_dict()
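
Context for the change above: the patch adds an else branch that clears gradients when non-finite values are detected, so the skipped optimizer step does not reuse them. The sketch below is a minimal reconstruction of the surrounding AMP training step, not the repository's actual method: only nan_found, self.scaler, opt, and the added else branch appear in the diff; the backward call, the unscale_ call, and the NaN-detection loop are assumptions about how ConfigurableStep likely arrives at nan_found.

import torch

def amp_step_with_nan_recovery(scaler, opt, loss):
    """Sketch of an AMP optimizer step that skips the update when NaN/inf
    gradients appear. In the repository this lives on ConfigurableStep
    (hence `self.scaler` in the patch); it is flattened here for clarity."""
    scaler.scale(loss).backward()
    scaler.unscale_(opt)  # expose raw gradients so they can be inspected

    # Assumed detection logic: the diff only shows the resulting flag.
    nan_found = any(
        p.grad is not None and not torch.isfinite(p.grad).all()
        for pg in opt.param_groups
        for p in pg['params']
    )

    if not nan_found:
        scaler.step(opt)
        scaler.update()
    else:
        # Mirrors the patch, which assigns the literal 0. A more conventional
        # way to discard bad gradients would be `p.grad = None` or zeroing the
        # gradient tensor in place.
        for pg in opt.param_groups:
            for p in pg['params']:
                p.grad = 0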