fixed grad norm and loss scale not reporting for local trainer

mrq 2025-02-23 19:08:26 -06:00
parent b640fabab5
commit cbf6b84e27
2 changed files with 9 additions and 2 deletions


@@ -248,7 +248,7 @@ class Engine():
 		self.global_samples += self.batch_size
 		if (self.micro_steps + 1) % max(1, self.gradient_accumulation_steps) == 0:
-			torch.nn.utils.clip_grad_norm_(self.module.parameters(), self.gradient_clipping)
+			self._global_grad_norm = torch.nn.utils.clip_grad_norm_(self.module.parameters(), self.gradient_clipping)
 			self.global_steps += 1
 			if self.loss_scaler is not None:
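This works because torch.nn.utils.clip_grad_norm_ clips the gradients in place and also returns the total norm, measured before clipping, so capturing the return value is all that's needed for reporting. A minimal sketch, with a toy module standing in for self.module:

```python
import torch

# toy module with gradients to clip (stand-in for self.module)
model = torch.nn.Linear(4, 4)
model(torch.randn(2, 4)).sum().backward()

# clip_grad_norm_ clips in place and returns the total norm measured
# *before* clipping, which is what gets stored for reporting above
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(total_norm.item())
```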
@@ -260,6 +260,7 @@ class Engine():
 		self._get_grad_norm()

+	# doesn't actually work
 	def _get_grad_norm(self):
 		t = [ param.grad.detach().flatten() for param in self.module.parameters() if param.grad is not None ]
 		self._global_grad_norm = torch.cat(t).norm().item() if len(t) else None
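The bypassed helper computes the same global L2 norm by hand. A standalone sketch of that computation; note it returns None whenever no parameter currently holds a gradient (for instance right after zero_grad()), which is one plausible way it ends up reporting nothing:

```python
import torch

def grad_norm(module: torch.nn.Module):
    # global L2 norm over every parameter that currently has a gradient;
    # returns None when no gradients exist (e.g. right after zero_grad())
    grads = [p.grad.detach().flatten() for p in module.parameters() if p.grad is not None]
    return torch.cat(grads).norm().item() if grads else None

model = torch.nn.Linear(4, 4)
print(grad_norm(model))   # None: no backward pass has run yet
model(torch.randn(2, 4)).sum().backward()
print(grad_norm(model))   # a finite float
```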
@@ -585,7 +586,9 @@ class Engines(dict[str, Engine]):
 				loss_scale = 1
 				if hasattr(engine.optimizer, "loss_scale") and engine.optimizer.loss_scale is not None:
 					loss_scale = engine.optimizer.loss_scale
+				elif engine.loss_scaler is not None:
+					loss_scale = engine.loss_scaler.get_scale()
 				if grad_norm is not None:
 					grad_norm /= loss_scale
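The division is there because fp16 loss scaling runs backward() on scale * loss, so a norm taken over the raw gradients is inflated by the current scale factor. A sketch assuming engine.loss_scaler is a torch GradScaler (get_scale() is its accessor); the raw norm value here is hypothetical:

```python
import torch

# assumption: the local trainer's loss_scaler is a torch GradScaler
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# with loss scaling, backward() runs on scale * loss, so a norm measured
# on the still-scaled gradients is inflated by the current scale factor
raw_grad_norm = 65536.0                            # hypothetical scaled-gradient norm
true_grad_norm = raw_grad_norm / scaler.get_scale()
print(true_grad_norm)
```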


@@ -126,6 +126,10 @@ class Muon(torch.optim.Optimizer):
 			# Muon #
 			############################
+			# this actually doesn't work with deepspeed for the same reason APOLLO required modifications:
+			# deepspeed's BF16/F16 optimizer wrapper modifies the tensors, so self.state loses the right mapping
+			# can't be assed to figure it out right now since it's not easy to fix like APOLLO
 			params = [p for p in group["params"] if self.state[p]["use_muon"]]
+			# import pdb; pdb.set_trace()
 			lr = group["lr"]
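A minimal illustration of the failure mode the comment describes, with hypothetical names: torch optimizers key self.state by the parameter object itself, so if a wrapper swaps a parameter for its own master copy (as deepspeed's BF16/FP16 optimizers do), the identity-based lookup stops finding the entry:

```python
import torch

p = torch.nn.Parameter(torch.zeros(2))
state = {p: {"use_muon": True}}   # optimizer state keyed by the Parameter object

# a wrapper that replaces the parameter with its own master copy breaks
# the identity-based lookup: the state entry still exists, but no key matches
master = torch.nn.Parameter(p.detach().clone().float())
print(p in state)        # True
print(master in state)   # False: self.state has lost the right mapping
```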