wrapped saving the checkpoint in a try/except so I can stop waking up to the damn trainer having crashed because it ran out of disk space; I'd much rather it keep training and give me time to eventually clear up disk space than have it silently restart on its own
This commit is contained in:
parent 2d1a9f10c0
commit fc576010ce
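For context on the message above: the last hunk below wraps each engine.save_checkpoint(save_dir, tag=tag) call in a broad try/except so that any save failure is logged and training keeps going. A narrower variant (only a sketch, reusing the loop variables from that hunk and assuming the disk-full failure surfaces as an OSError from whatever backend implements save_checkpoint) would swallow just the out-of-space case and re-raise everything else, so genuine serialization bugs still fail loudly:

import errno

for name, engine in self.items():
    save_dir = cfg.ckpt_dir / name
    try:
        engine.save_checkpoint(save_dir, tag=tag)
    except OSError as e:
        # only ignore "No space left on device"; anything else is a real bug
        if e.errno != errno.ENOSPC:
            raise
        print(f'Failed to save checkpoint for engine {name}: disk full ({e})')

The committed change keeps the catch broad instead, which is the safer bet for not ending the run overnight; the trade-off is that it also hides save failures that have nothing to do with disk space.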
@@ -302,7 +302,7 @@ class Dataset(_Dataset):
         if len(set(self.paths_by_spkr_name[spkr_name]) - {path}) < 4:
             task = "tts"
 
-        noise_scale = 0.125
+        noise_scale = 0.25
         # text-to-speech
         if task == "tts":
             proms = self.sample_prompts(spkr_name, ignore=path) if random.random() < cfg.dataset.random_utterance else resps
@@ -749,6 +749,8 @@ if __name__ == "__main__":
 
     task = args.action
 
+    cfg.dataset.workers = 1
+
     if args.action == "hdf5":
         create_dataset_hdf5()
     elif args.action == "sample":
@@ -776,7 +778,7 @@ if __name__ == "__main__":
            if task not in cfg.dataset.tasks_list:
                continue
 
-           print(text, task)
-           decode_to_file( proms, f"./.{task}.proms.wav", device="cpu" )
-           decode_to_file( resps, f"./.{task}.resps.wav", device="cpu" )
+           print(text, task, cfg.models.prom_levels)
+           decode_to_file( proms, f"./data/{task}.proms.wav", device="cpu" )
+           decode_to_file( resps, f"./data/{task}.resps.wav", device="cpu" )
            break
@@ -228,7 +228,10 @@ class Engines(dict[str, Engine]):
         cfg.ckpt_dir.mkdir(parents=True, exist_ok=True)
         for name, engine in self.items():
             save_dir = cfg.ckpt_dir / name
-            engine.save_checkpoint(save_dir, tag=tag)
+            try:
+                engine.save_checkpoint(save_dir, tag=tag)
+            except Exception as e:
+                print(f'Failed to save checkpoint for engine {name}:', str(e))
 
         # might be better to prune before saving for safety, but [:0] returns an empty list, but I could do [:-cfg.trainer.keep_last_checkpoints - 1 if cfg.trainer.keep_last_checkpoints > 1 else None]
         if cfg.trainer.keep_last_checkpoints > 0 and is_global_leader():
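The pruning comment in that last hunk hinges on a Python slicing edge case: lst[:-n] drops the last n items, but since -0 == 0, lst[:-0] is the empty slice rather than the whole list, so the "keep nothing" case needs a None bound instead. A small standalone illustration (plain Python with made-up names, not code from this repo):

ckpts = ['ckpt_1', 'ckpt_2', 'ckpt_3', 'ckpt_4']  # oldest to newest
keep = 2

print(ckpts[:-keep])   # ['ckpt_1', 'ckpt_2'] -> everything except the newest `keep` entries
print(ckpts[:-0])      # [] -> -0 is just 0, so this selects nothing, not everything
print(ckpts[:None])    # whole list -> the bound to fall back on when keep == 0

# the kind of guard the comment is reaching for:
cutoff = -keep if keep > 0 else None
print(ckpts[:cutoff])  # checkpoints old enough to delete, whether or not keep is 0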