Wrapped saving the checkpoint in a try/catch so I can stop waking up to the damn trainer having crashed because it ran out of disk space; I'd much rather it keep training and give me time to eventually clear up disk space than have it silently restart on its own.

mrq 2023-08-20 06:29:17 -05:00
parent 2d1a9f10c0
commit fc576010ce
2 changed files with 10 additions and 5 deletions
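The change itself is a broad try/except around each engine's per-checkpoint save (see the second file below): if the save raises, the error is printed and the loop moves on instead of taking the whole trainer down. A minimal, self-contained sketch of that behavior, where save_checkpoint is a hypothetical stand-in that simulates a full disk and the engine names are made up:

def save_checkpoint(path, tag=None):
	# Hypothetical stand-in for engine.save_checkpoint(); simulate a full disk.
	raise OSError(28, "No space left on device", str(path))

for name in ("ar", "nar"):  # made-up engine names
	try:
		save_checkpoint(f"./ckpt/{name}", tag="step_1000")
	except Exception as e:
		# Log and keep going so the surrounding training loop survives.
		print(f"Failed to save checkpoint for engine {name}:", str(e))

Catching the broad Exception means any save failure is tolerated, not just a full disk, which fits the intent of letting the run limp along until someone can intervene.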

@@ -302,7 +302,7 @@ class Dataset(_Dataset):
 		if len(set(self.paths_by_spkr_name[spkr_name]) - {path}) < 4:
 			task = "tts"
-		noise_scale = 0.125
+		noise_scale = 0.25
 		# text-to-speech
 		if task == "tts":
 			proms = self.sample_prompts(spkr_name, ignore=path) if random.random() < cfg.dataset.random_utterance else resps
@@ -749,6 +749,8 @@ if __name__ == "__main__":
 	task = args.action
+	cfg.dataset.workers = 1
 	if args.action == "hdf5":
 		create_dataset_hdf5()
 	elif args.action == "sample":
@@ -776,7 +778,7 @@ if __name__ == "__main__":
 		if task not in cfg.dataset.tasks_list:
 			continue
-		print(text, task)
-		decode_to_file( proms, f"./.{task}.proms.wav", device="cpu" )
-		decode_to_file( resps, f"./.{task}.resps.wav", device="cpu" )
+		print(text, task, cfg.models.prom_levels)
+		decode_to_file( proms, f"./data/{task}.proms.wav", device="cpu" )
+		decode_to_file( resps, f"./data/{task}.resps.wav", device="cpu" )
 		break

@@ -228,7 +228,10 @@ class Engines(dict[str, Engine]):
 		cfg.ckpt_dir.mkdir(parents=True, exist_ok=True)
 		for name, engine in self.items():
 			save_dir = cfg.ckpt_dir / name
-			engine.save_checkpoint(save_dir, tag=tag)
+			try:
+				engine.save_checkpoint(save_dir, tag=tag)
+			except Exception as e:
+				print(f'Failed to save checkpoint for engine {name}:', str(e))
 		# might be better to prune before saving for safety, but [:0] returns an empty list, but I could do [:-cfg.trainer.keep_last_checkpoints - 1 if cfg.trainer.keep_last_checkpoints > 1 else None]
 		if cfg.trainer.keep_last_checkpoints > 0 and is_global_leader():
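On the slicing caveat in that trailing comment: in Python a slice stop of 0 selects nothing while a stop of None selects everything, so a computed negative bound that collapses to 0 silently prunes nothing. A quick illustration with a generic list and a guessed -(keep - 1) bound (not the trainer's actual pruning code):

checkpoints = ["ckpt_100", "ckpt_200", "ckpt_300", "ckpt_400"]  # oldest first
keep = 1  # i.e. cfg.trainer.keep_last_checkpoints

# A naive "prune before saving" bound: -(keep - 1) evaluates to 0, and [:0] is [].
print(checkpoints[:-(keep - 1)])  # [] -- nothing would get deleted

# Falling back to None when the bound would be 0 selects the whole list instead.
stop = -(keep - 1) if keep > 1 else None
print(checkpoints[:stop])  # ['ckpt_100', 'ckpt_200', 'ckpt_300', 'ckpt_400']

With keep = 3 the same bound gives checkpoints[:-2], i.e. the two oldest entries, which is what you would want to delete when one more checkpoint is about to be written.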