added the ability to export weights mid-training, so the weights don't have to be painfully yanked out while the training script is still running
parent: fc576010ce
commit: b105f6211e
@@ -200,6 +200,9 @@ class Engines(dict[str, Engine]):
 		self._global_step = 0
+		self._micro_step = 0
 
 		for name, engine in self.items():
 			engine.name = name
 
 	@property
 	def global_step(self):
 		return self._global_step
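(For context, the two counters usually diverge under gradient accumulation; the sketch below is illustrative only, not code from this commit.)

    # Hypothetical illustration of how micro_step and global_step can relate
    # under gradient accumulation; names and numbers are made up.
    accumulation_steps = 4
    micro_step, global_step = 0, 0
    for _ in range(12):                        # 12 micro-batches
        micro_step += 1                        # bumps on every forward/backward pass
        if micro_step % accumulation_steps == 0:
            global_step += 1                   # bumps only on an optimizer step
    assert (micro_step, global_step) == (12, 3)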
@@ -218,6 +221,18 @@ class Engines(dict[str, Engine]):
 		for engine in self.values():
 			engine.dispatch_attribute(*args, **kwargs)
 
+	def export(self, userdata={}):
+		for name, engine in self.items():
+			outpath = cfg.ckpt_dir / name / "fp32.pth"
+			state_dict = {
+				"global_step": engine.global_step,
+				"micro_step": engine.micro_step,
+				'module': engine.module.state_dict(),
+			}
+			state_dict.update(userdata)
+			torch.save(state_dict, outpath)
+			print(f"Exported {name} to {outpath}")
+
 	def save_checkpoint(self, tag=None):
 		if not tag:
 			tag = cfg.trainer.save_tag
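A minimal sketch of reading one of these exports back, assuming the keys written by export() above; the path and the consumer code here are illustrative, not part of this commit.

    import torch

    # "ckpt/model/fp32.pth" stands in for cfg.ckpt_dir / name / "fp32.pth"
    ckpt = torch.load("ckpt/model/fp32.pth", map_location="cpu")

    print(ckpt["global_step"], ckpt["micro_step"])  # training progress at export time
    weights = ckpt["module"]                        # plain state_dict of the module
    symmap = ckpt.get("symmap")                     # any userdata merged in at export time

    # model.load_state_dict(weights)                # hypothetical model instance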
@@ -246,7 +261,7 @@ class Engines(dict[str, Engine]):
 				p.unlink()
 			d.rmdir()
 
-	def load_checkpoint(self, tag=None):
+	def load_checkpoint(self, tag=None, module_only=False):
 		if not tag:
 			tag = cfg.trainer.load_tag
 
@@ -256,8 +271,9 @@ class Engines(dict[str, Engine]):
 				tag=tag,
 				load_dir=load_dir,
 				load_module_strict=cfg.trainer.strict_loading,
-				load_optimizer_states=cfg.trainer.load_states,
-				load_lr_scheduler_states=cfg.trainer.load_states,
+				load_optimizer_states=False if module_only else cfg.trainer.load_states,
+				load_lr_scheduler_states=False if module_only else cfg.trainer.load_states,
+				load_module_only=module_only,
 			)
 			if cfg.trainer.restart_step_count:
 				engine.global_steps = 0
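A short usage sketch of the new flag (the call sites are assumptions, not code from this commit): module_only=True asks the backend to restore only the weights, skipping optimizer and LR-scheduler state.

    # `engines` is an Engines instance as above; both calls are illustrative.
    engines.load_checkpoint()                    # full resume: weights + optimizer + scheduler
    engines.load_checkpoint(module_only=True)    # weights only, e.g. warm-starting a new run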
@@ -12,16 +12,7 @@ def main():
 	args = parser.parse_args()
 
 	engines = load_engines()
-	for name, engine in engines.items():
-		outpath = cfg.ckpt_dir / name / "fp32.pth"
-		torch.save({
-			"global_step": engine.global_step,
-			"micro_step": engine.micro_step,
-			'module': engine.module.to('cpu', dtype=torch.float32).state_dict(),
-			#'optimizer': engine.optimizer.state_dict(),
-			'symmap': get_phone_symmap(),
-		}, outpath)
-		print(f"Exported {name} to {outpath}")
+	engines.export(userdata={"symmap": get_phone_symmap()})
 
 if __name__ == "__main__":
 	main()
@@ -33,6 +33,7 @@ from ..models import get_models
 
 from .utils import to_device, do_gc
 from ..utils import wrapper as ml
+from ..data import get_phone_symmap # should decouple from this trainer script
 
 _logger = logging.getLogger(__name__)
 _engines: Engines
@@ -69,6 +70,9 @@ def load_engines():
 		optimizer = None
 		lr_scheduler = None
 
+		# yuck, should instead check be optimier == "adamw" and backend != "deepspeed"
+		# and then have ds_cfg pass in the config flag to use pytorch adamw
+		# I genuinely cannot validate if this ever actually gets used in DeepSpeed
 		if cfg.hyperparameters.optimizer.lower() == "adamw-torch":
 			optimizer = ml.AdamW(
 				model.parameters(),
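Regarding the comment's doubt about whether the torch AdamW gets honored under DeepSpeed: a client optimizer is used when it is passed to deepspeed.initialize and the ds_config does not define its own optimizer section. A hedged sketch of that pattern with toy stand-ins (not this repository's actual wiring):

    import deepspeed
    import torch

    # Toy stand-ins; the real model and config come from cfg / get_models().
    model = torch.nn.Linear(16, 16)
    ds_config = {"train_micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1}

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    # With a client optimizer passed in (and no "optimizer" block in ds_config),
    # DeepSpeed wraps and steps this optimizer instead of building its own.
    engine, optimizer, _, _ = deepspeed.initialize(
        model=model, optimizer=optimizer, config=ds_config
    )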
@@ -86,6 +90,9 @@ def load_engines():
 		if "module" in state:
 			state = state["module"]
 
+		# should decouple the following from this trainer script
+		# probably with passing a fun that defaults to a lambda x: x deal
+
 		# extend the proms_emb if we ever touch the n_prom_levels or n_prom_tokens (from adding tasks)
 		if model.proms_emb.weight.shape[0] > state['proms_emb.weight'].shape[0] or model.proms_emb.weight.shape[1] > state['proms_emb.weight'].shape[1]:
 			o_prom_levels, o_prom_tokens, d_model = state['proms_emb.weight'].shape
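The hunk cuts off before the actual resize; the usual technique for growing a saved embedding table is to allocate the new, larger tensor and copy the old values into its leading slice. A hedged sketch of that idea with made-up shapes (what the omitted lines really do is an assumption):

    import torch

    old = torch.randn(2, 256, 512)     # stands in for state["proms_emb.weight"]
    new = torch.zeros(8, 512, 512)     # stands in for the new model.proms_emb.weight shape

    o_prom_levels, o_prom_tokens, _ = old.shape
    new[:o_prom_levels, :o_prom_tokens, :] = old   # keep the already-trained entries
    # state["proms_emb.weight"] = new              # then load the patched state dict as usual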
@@ -301,8 +308,18 @@ def train(
 
 		if "lr" in command:
 			rate = float(command.split(" ")[-1])
-			engines.set_lr(rate)
-			print("Updating LR to:", rate)
+			try:
+				engines.set_lr(rate)
+				print("Updating LR to:", rate)
+			except Exception as e:
+				print("Failed to set LR rate to:", rate, str(e))
+
+		if "export" in command:
+			engines.save_checkpoint()
+			last_save_step = engines.global_step
+
+			if is_global_leader():
+				engines.export(userdata={"symmap": get_phone_symmap()})
 
 		save_ckpt_every = cfg.trainer.save_frequency or cfg.evaluation.frequency
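The is_global_leader() guard matters because, under multi-process training, every rank runs this command handler and only one of them should write the fp32.pth files. A hedged sketch of such a guard in plain torch.distributed terms (the project's own helper may be implemented differently):

    import torch.distributed as dist

    def is_global_leader() -> bool:
        # Rank 0 writes; a non-distributed run trivially counts as the leader.
        return not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0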