Wrapped saving the checkpoint in a try/catch so I can stop waking up to the damn trainer having crashed because it ran out of disk space; I'd much rather it keep training and give me time to eventually clear up disk space than have it silently restart on its own.

mrq 2023-08-20 06:29:17 -05:00
parent 2d1a9f10c0
commit fc576010ce
2 changed files with 10 additions and 5 deletions
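The change itself is a broad try/except around each engine's per-checkpoint save (see the second file below): if the save raises, the error is printed and the loop moves on instead of taking the whole trainer down. A minimal, self-contained sketch of that behavior, where save_checkpoint is a hypothetical stand-in that simulates a full disk and the engine names are made up:

def save_checkpoint(path, tag=None):
	# Hypothetical stand-in for engine.save_checkpoint(); simulate a full disk.
	raise OSError(28, "No space left on device", str(path))

for name in ("ar", "nar"):  # made-up engine names
	try:
		save_checkpoint(f"./ckpt/{name}", tag="step_1000")
	except Exception as e:
		# Log and keep going so the surrounding training loop survives.
		print(f"Failed to save checkpoint for engine {name}:", str(e))

Catching the broad Exception means any save failure is tolerated, not just a full disk, which fits the intent of letting the run limp along until someone can intervene.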

@@ -302,7 +302,7 @@ class Dataset(_Dataset):
 		if len(set(self.paths_by_spkr_name[spkr_name]) - {path}) < 4:
 			task = "tts"
-		noise_scale = 0.125
+		noise_scale = 0.25
 		# text-to-speech
 		if task == "tts":
 			proms = self.sample_prompts(spkr_name, ignore=path) if random.random() < cfg.dataset.random_utterance else resps
@@ -749,6 +749,8 @@ if __name__ == "__main__":
 	task = args.action
+	cfg.dataset.workers = 1
 	if args.action == "hdf5":
 		create_dataset_hdf5()
 	elif args.action == "sample":
@@ -776,7 +778,7 @@ if __name__ == "__main__":
 		if task not in cfg.dataset.tasks_list:
 			continue
-		print(text, task)
-		decode_to_file( proms, f"./.{task}.proms.wav", device="cpu" )
-		decode_to_file( resps, f"./.{task}.resps.wav", device="cpu" )
+		print(text, task, cfg.models.prom_levels)
+		decode_to_file( proms, f"./data/{task}.proms.wav", device="cpu" )
+		decode_to_file( resps, f"./data/{task}.resps.wav", device="cpu" )
 		break

@@ -228,7 +228,10 @@ class Engines(dict[str, Engine]):
 		cfg.ckpt_dir.mkdir(parents=True, exist_ok=True)
 		for name, engine in self.items():
 			save_dir = cfg.ckpt_dir / name
-			engine.save_checkpoint(save_dir, tag=tag)
+			try:
+				engine.save_checkpoint(save_dir, tag=tag)
+			except Exception as e:
+				print(f'Failed to save checkpoint for engine {name}:', str(e))
 		# might be better to prune before saving for safety, but [:0] returns an empty list, but I could do [:-cfg.trainer.keep_last_checkpoints - 1 if cfg.trainer.keep_last_checkpoints > 1 else None]
 		if cfg.trainer.keep_last_checkpoints > 0 and is_global_leader():
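On the slicing caveat in that trailing comment: in Python a slice stop of 0 selects nothing while a stop of None selects everything, so a computed negative bound that collapses to 0 silently prunes nothing. A quick illustration with a generic list and a guessed -(keep - 1) bound (not the trainer's actual pruning code):

checkpoints = ["ckpt_100", "ckpt_200", "ckpt_300", "ckpt_400"]  # oldest first
keep = 1  # i.e. cfg.trainer.keep_last_checkpoints

# A naive "prune before saving" bound: -(keep - 1) evaluates to 0, and [:0] is [].
print(checkpoints[:-(keep - 1)])  # [] -- nothing would get deleted

# Falling back to None when the bound would be 0 selects the whole list instead.
stop = -(keep - 1) if keep > 1 else None
print(checkpoints[:stop])  # ['ckpt_100', 'ckpt_200', 'ckpt_300', 'ckpt_400']

With keep = 3 the same bound gives checkpoints[:-2], i.e. the two oldest entries, which is what you would want to delete when one more checkpoint is about to be written.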