From fc576010ceb40a64ce943a1ff24593104b6997c4 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 20 Aug 2023 06:29:17 -0500
Subject: [PATCH] wrapped saving the checkpoint in a try/catch so I can stop
 waking up to the damn trainer crashing because it ran out of disk space; I'd
 much rather it keep training to give me time to eventually clear up disk
 space rather than it silently restarting on its own

---
 vall_e/data.py         | 10 ++++++----
 vall_e/engines/base.py |  5 ++++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/vall_e/data.py b/vall_e/data.py
index be81d6d..0cd7d5f 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -302,7 +302,7 @@ class Dataset(_Dataset):
 			if len(set(self.paths_by_spkr_name[spkr_name]) - {path}) < 4:
 				task = "tts"
 
-		noise_scale = 0.125
+		noise_scale = 0.25
 		# text-to-speech
 		if task == "tts":
 			proms = self.sample_prompts(spkr_name, ignore=path) if random.random() < cfg.dataset.random_utterance else resps
@@ -749,6 +749,8 @@ if __name__ == "__main__":
 
 	task = args.action
 
+	cfg.dataset.workers = 1
+
 	if args.action == "hdf5":
 		create_dataset_hdf5()
 	elif args.action == "sample":
@@ -776,7 +778,7 @@
 				if task not in cfg.dataset.tasks_list:
 					continue
 
-				print(text, task)
-				decode_to_file( proms, f"./.{task}.proms.wav", device="cpu" )
-				decode_to_file( resps, f"./.{task}.resps.wav", device="cpu" )
+				print(text, task, cfg.models.prom_levels)
+				decode_to_file( proms, f"./data/{task}.proms.wav", device="cpu" )
+				decode_to_file( resps, f"./data/{task}.resps.wav", device="cpu" )
 				break
\ No newline at end of file

diff --git a/vall_e/engines/base.py b/vall_e/engines/base.py
index 5e5ea42..5f11951 100755
--- a/vall_e/engines/base.py
+++ b/vall_e/engines/base.py
@@ -228,7 +228,10 @@ class Engines(dict[str, Engine]):
 		cfg.ckpt_dir.mkdir(parents=True, exist_ok=True)
 		for name, engine in self.items():
 			save_dir = cfg.ckpt_dir / name
-			engine.save_checkpoint(save_dir, tag=tag)
+			try:
+				engine.save_checkpoint(save_dir, tag=tag)
+			except Exception as e:
+				print(f'Failed to save checkpoint for engine {name}:', str(e))
 
 		# might be better to prune before saving for safety, but [:0] returns an empty list, but I could do [:-cfg.trainer.keep_last_checkpoints - 1 if cfg.trainer.keep_last_checkpoints > 1 else None]
 		if cfg.trainer.keep_last_checkpoints > 0 and is_global_leader():
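
Note on the engines/base.py change: the try/except only reports a failed save and lets training continue; it does not retry or avoid the write. A possible refinement, not part of this patch, would be to check free disk space before attempting the save at all. A minimal sketch, assuming a hypothetical MIN_FREE_BYTES threshold and using the standard-library shutil.disk_usage:

    import shutil

    MIN_FREE_BYTES = 10 * 2**30  # hypothetical threshold: 10 GiB

    def safe_to_save(path) -> bool:
        # shutil.disk_usage returns a named tuple (total, used, free) in bytes
        free = shutil.disk_usage(path).free
        if free < MIN_FREE_BYTES:
            print(f"Skipping checkpoint: only {free} bytes free under {path}")
            return False
        return True

This would avoid writing a truncated checkpoint in the first place, at the cost of occasionally skipping a save that would have fit.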
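
Note on the pruning comment at the end of the patch: the subtlety is that a negative-zero slice bound collapses to zero, so slicing with [:-keep] silently selects nothing when keep is 0, whereas None as a bound means "to the end". A minimal sketch of the behavior, with a hypothetical oldest-first checkpoints list:

    checkpoints = ["ckpt-100", "ckpt-200", "ckpt-300"]

    keep = 2
    print(checkpoints[:-keep])  # ['ckpt-100'] -- the entries that would be pruned

    keep = 0
    print(checkpoints[:-keep])  # [] -- [:-0] is [:0], so nothing is selected

    # None as the stop bound means "no bound", selecting the whole list:
    print(checkpoints[:-keep if keep > 0 else None])  # all three entries

That edge case is why the comment reaches for an `... if ... else None` conditional inside the slice rather than a bare `[:-n]`.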