fixed grad norm and loss scale not reporting for local trainer

mrq 2025-02-23 19:08:26 -06:00
parent b640fabab5
commit cbf6b84e27
2 changed files with 9 additions and 2 deletions


@@ -248,7 +248,7 @@ class Engine():
 		self.global_samples += self.batch_size
 		if (self.micro_steps + 1) % max(1, self.gradient_accumulation_steps) == 0:
-			torch.nn.utils.clip_grad_norm_(self.module.parameters(), self.gradient_clipping)
+			self._global_grad_norm = torch.nn.utils.clip_grad_norm_(self.module.parameters(), self.gradient_clipping)
 			self.global_steps += 1
 			if self.loss_scaler is not None:
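This works because torch.nn.utils.clip_grad_norm_ clips the gradients in place and also returns the total norm, measured before clipping, so capturing the return value is all that's needed for reporting. A minimal sketch, with a toy module standing in for self.module:

```python
import torch

# toy module with gradients to clip (stand-in for self.module)
model = torch.nn.Linear(4, 4)
model(torch.randn(2, 4)).sum().backward()

# clip_grad_norm_ clips in place and returns the total norm measured
# *before* clipping, which is what gets stored for reporting above
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(total_norm.item())
```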
@@ -260,6 +260,7 @@ class Engine():
 		self._get_grad_norm()

+	# doesn't actually work
 	def _get_grad_norm(self):
 		t = [ param.grad.detach().flatten() for param in self.module.parameters() if param.grad is not None ]
 		self._global_grad_norm = torch.cat(t).norm().item() if len(t) else None
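The bypassed helper computes the same global L2 norm by hand. A standalone sketch of that computation; note it returns None whenever no parameter currently holds a gradient (for instance right after zero_grad()), which is one plausible way it ends up reporting nothing:

```python
import torch

def grad_norm(module: torch.nn.Module):
    # global L2 norm over every parameter that currently has a gradient;
    # returns None when no gradients exist (e.g. right after zero_grad())
    grads = [p.grad.detach().flatten() for p in module.parameters() if p.grad is not None]
    return torch.cat(grads).norm().item() if grads else None

model = torch.nn.Linear(4, 4)
print(grad_norm(model))   # None: no backward pass has run yet
model(torch.randn(2, 4)).sum().backward()
print(grad_norm(model))   # a finite float
```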
@@ -585,7 +586,9 @@ class Engines(dict[str, Engine]):
 				loss_scale = 1
 				if hasattr(engine.optimizer, "loss_scale") and engine.optimizer.loss_scale is not None:
 					loss_scale = engine.optimizer.loss_scale
+				elif engine.loss_scaler is not None:
+					loss_scale = engine.loss_scaler.get_scale()
 				if grad_norm is not None:
 					grad_norm /= loss_scale
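The division is there because fp16 loss scaling runs backward() on scale * loss, so a norm taken over the raw gradients is inflated by the current scale factor. A sketch assuming engine.loss_scaler is a torch GradScaler (get_scale() is its accessor); the raw norm value here is hypothetical:

```python
import torch

# assumption: the local trainer's loss_scaler is a torch GradScaler
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# with loss scaling, backward() runs on scale * loss, so a norm measured
# on the still-scaled gradients is inflated by the current scale factor
raw_grad_norm = 65536.0                            # hypothetical scaled-gradient norm
true_grad_norm = raw_grad_norm / scaler.get_scale()
print(true_grad_norm)
```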


@@ -126,6 +126,10 @@ class Muon(torch.optim.Optimizer):
 			# Muon #
 			############################
+			# this actually doesn't work with deepspeed for the same reason APOLLO required modifications:
+			# deepspeed's BF16/F16 optimizer wrapper modifies the tensors, so self.state loses the right mapping
+			# can't be assed to figure it out right now since it's not easy to fix like APOLLO
 			params = [p for p in group["params"] if self.state[p]["use_muon"]]
+			# import pdb; pdb.set_trace()
 			lr = group["lr"]
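A minimal illustration of the failure mode the comment describes, with hypothetical names: torch optimizers key self.state by the parameter object itself, so if a wrapper swaps a parameter for its own master copy (as deepspeed's BF16/FP16 optimizers do), the identity-based lookup stops finding the entry:

```python
import torch

p = torch.nn.Parameter(torch.zeros(2))
state = {p: {"use_muon": True}}   # optimizer state keyed by the Parameter object

# a wrapper that replaces the parameter with its own master copy breaks
# the identity-based lookup: the state entry still exists, but no key matches
master = torch.nn.Parameter(p.detach().clone().float())
print(p in state)        # True
print(master in state)   # False: self.state has lost the right mapping
```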