fixed issue with training from scratch (oops)

master
mrq 2023-10-21 09:55:38 +07:00
parent 3195026dba
commit 32d4271ca8
5 changed files with 54 additions and 12 deletions

@@ -482,6 +482,8 @@ class Trainer:
 	weight_dtype: str = "float16"
 	amp: bool = False
 
+	load_webui: bool = False
+
 	backend: str = "local"
 	deepspeed: DeepSpeed = field(default_factory=lambda: DeepSpeed)
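For context on the config hunk above: the `Trainer` dataclass carries its nested DeepSpeed settings via `dataclasses.field(default_factory=...)`, so each config instance gets its own object rather than a shared class-level default. A minimal sketch of that pattern, with illustrative field names rather than the repo's actual ones:

```python
from dataclasses import dataclass, field

@dataclass
class DeepSpeed:
	# illustrative knobs; the real options live in the repo's DeepSpeed dataclass
	zero_optimization_level: int = 0
	use_compression_training: bool = False

@dataclass
class Trainer:
	weight_dtype: str = "float16"
	amp: bool = False
	load_webui: bool = False
	backend: str = "local"
	# default_factory constructs a fresh DeepSpeed() per Trainer instance,
	# sidestepping Python's shared-mutable-default pitfall
	deepspeed: DeepSpeed = field(default_factory=DeepSpeed)

cfg = Trainer()
assert cfg.deepspeed is not Trainer().deepspeed
```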

@@ -21,6 +21,9 @@ try:
 except Exception as e:
 	pass
 
+from functools import cache
+
+@cache
 def load_engines():
 	models = get_models(cfg.models.get())
 	engines = dict()
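The new `functools.cache` decorator memoizes `load_engines()`, so a second call in the same process (for instance from a web UI spun up next to the trainer) gets the already-built engines back instead of re-instantiating models. A small sketch of the behaviour with a stand-in loader:

```python
from functools import cache

@cache
def load_engines():
	# stands in for constructing models/optimizers; the body runs only once per process
	print("building engines...")
	return {"ar+nar": object()}

first = load_engines()    # prints "building engines..."
second = load_engines()   # returned from the cache, nothing is rebuilt
assert first is second
```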
@@ -71,13 +74,13 @@ def load_engines():
 		lr_scheduler = None
 
 		# automatically load from state dict if one is provided, but no DeepSpeed checkpoint is present
-		if not loads_state_dict and backend == "deepspeed" and not (cfg.ckpt_dir / name / "latest").exists():
+		load_path = cfg.ckpt_dir / name / "fp32.pth"
+		if not loads_state_dict and backend == "deepspeed" and not (cfg.ckpt_dir / name / "latest").exists() and load_path.exists():
 			print("DeepSpeed checkpoint missing, but weights found.")
 			loads_state_dict = True
 
 		stats = None
 		if loads_state_dict:
-			load_path = cfg.ckpt_dir / name / "fp32.pth"
 			state = torch.load(load_path, map_location=torch.device(cfg.device))
 			# state dict is not just the module, extract the extra trainer details
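This hunk is the actual from-scratch fix: `load_path` is resolved up front, and the fallback to the exported `fp32.pth` weights now also requires that the file exists, so a brand-new run (no DeepSpeed checkpoint, no exported weights) no longer flips `loads_state_dict` and then fails on a missing file. A standalone sketch of the same guard, with a hypothetical helper and arguments:

```python
from pathlib import Path
import torch

def maybe_load_weights(ckpt_dir: Path, name: str, backend: str, loads_state_dict: bool, device: str = "cpu"):
	# hypothetical helper mirroring the guard inside load_engines()
	load_path = ckpt_dir / name / "fp32.pth"

	# fall back to the raw state dict only when there is no DeepSpeed checkpoint
	# AND the exported weights are actually on disk; a fresh run has neither,
	# so it simply trains from scratch instead of crashing here
	if (not loads_state_dict and backend == "deepspeed"
			and not (ckpt_dir / name / "latest").exists() and load_path.exists()):
		print("DeepSpeed checkpoint missing, but weights found.")
		loads_state_dict = True

	state = torch.load(load_path, map_location=torch.device(device)) if loads_state_dict else None
	return loads_state_dict, state
```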

@@ -147,7 +147,7 @@ def run_eval(engines, eval_name, dl):
 
 	_logger.info(f"Validation Metrics: {json.dumps(engines_stats)}.")
 
-def main():
+def train():
 	setup_logging(cfg.log_dir)
 
 	train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
@@ -165,6 +165,12 @@ def main():
 
 	qnt.unload_model()
 
+	"""
+	if cfg.trainer.load_webui:
+		from .webui import start
+		start(lock=False)
+	"""
+
 	trainer.train(
 		train_dl=train_dl,
 		train_feeder=train_feeder,
@@ -172,4 +178,4 @@ def main():
 	)
 
 if __name__ == "__main__":
-	main()
+	train()

@@ -173,7 +173,8 @@ def train(
 
 		elapsed_time = stats.get("elapsed_time", 0)
 
-		_logger.info(f"Training Metrics: {json.dumps(stats)}.")
+		metrics = json.dumps(stats)
+		_logger.info(f"Training Metrics: {metrics}.")
 
 		command = _non_blocking_input()
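The training-loop change above just serializes the stats dict once and logs the JSON string. A tiny sketch of what that log line looks like, with made-up keys:

```python
import json
import logging

logging.basicConfig(level=logging.INFO)
_logger = logging.getLogger("trainer")

stats = {"global_step": 1000, "elapsed_time": 42.0, "loss.nll": 3.1}  # illustrative keys only
metrics = json.dumps(stats)
_logger.info(f"Training Metrics: {metrics}.")
# anything tailing the log can json.loads() the payload back into a dict
```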

@@ -12,16 +12,18 @@ from time import perf_counter
 from pathlib import Path
 
 from .inference import TTS
+from .train import train
 
 tts = None
 
 layout = {}
 layout["inference"] = {}
-layout["inference"]["inputs"] = {
-	"progress": None
-}
-layout["inference"]["outputs"] = {}
-layout["inference"]["buttons"] = {}
+layout["training"] = {}
+
+for k in layout.keys():
+	layout[k]["inputs"] = { "progress": None }
+	layout[k]["outputs"] = {}
+	layout[k]["buttons"] = {}
 
 # there's got to be a better way to go about this
 def gradio_wrapper(inputs):
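The webui now builds its per-tab widget registry in a loop, so the new "training" tab gets the same `inputs`/`outputs`/`buttons` slots as "inference". A sketch of the resulting structure, runnable without Gradio:

```python
layout = {}
layout["inference"] = {}
layout["training"] = {}

for k in layout.keys():
	layout[k]["inputs"] = { "progress": None }
	layout[k]["outputs"] = {}
	layout[k]["buttons"] = {}

# every tab exposes identical slots; the actual widgets are filled in
# later when the gr.Blocks UI is constructed
assert layout["training"].keys() == layout["inference"].keys()
```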
@@ -123,6 +125,14 @@ def do_inference( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
 	wav = wav.squeeze(0).cpu().numpy()
 	return (sr, wav)
 
+"""
+@gradio_wrapper(inputs=layout["training"]["inputs"].keys())
+def do_training( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
+	while True:
+		metrics = next(it)
+		yield metrics
+"""
+
 def get_random_prompt():
 	harvard_sentences=[
 		"The birch canoe slid on the smooth planks.",
@@ -225,6 +235,22 @@ with ui:
 				inputs=[ x for x in layout["inference"]["inputs"].values() if x is not None],
 				outputs=[ x for x in layout["inference"]["outputs"].values() if x is not None]
 			)
+
+		"""
+		with gr.Tab("Training"):
+			with gr.Row():
+				with gr.Column(scale=1):
+					layout["training"]["outputs"]["console"] = gr.Textbox(lines=8, label="Console Log")
+
+			with gr.Row():
+				with gr.Column(scale=1):
+					layout["training"]["buttons"]["train"] = gr.Button(value="Train")
+
+			layout["training"]["buttons"]["train"].click(
+				fn=do_training,
+				outputs=[ x for x in layout["training"]["outputs"].values() if x is not None],
+			)
+		"""
 
 	if os.path.exists("README.md") and args.render_markdown:
 		md = open("README.md", "r", encoding="utf-8").read()
 		# remove HF's metadata
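Taken together, the two commented-out blocks sketch the planned Training tab: `do_training` is a generator, and Gradio streams each `yield`ed value into the component bound as its output, so the console Textbox would update as metrics arrive (the `it` iterator the stub reads from isn't defined anywhere in this commit). A self-contained sketch of that streaming pattern, with a fake metrics loop standing in for the trainer:

```python
import json
import time
import gradio as gr

def do_training():
	# stand-in for pulling metrics off a running trainer; each yield
	# streams a fresh value into the bound output component
	for step in range(5):
		time.sleep(0.5)
		yield json.dumps({"global_step": step, "loss": round(5.0 - step, 2)})

with gr.Blocks() as demo:
	with gr.Tab("Training"):
		console = gr.Textbox(lines=8, label="Console Log")
		gr.Button(value="Train").click(fn=do_training, outputs=[console])

demo.queue(max_size=8)

if __name__ == "__main__":
	demo.launch()
```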
@@ -232,5 +258,9 @@ with ui:
 		md = "".join(md.split("---")[2:])
 		gr.Markdown(md)
 
-ui.queue(max_size=8)
-ui.launch(share=args.share, server_name=args.listen_host, server_port=args.listen_port)
+def start( lock=True ):
+	ui.queue(max_size=8)
+	ui.launch(share=args.share, server_name=args.listen_host, server_port=args.listen_port, prevent_thread_lock=not lock)
+
+if __name__ == "__main__":
+	start()
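`start()` wraps the old module-level `queue()`/`launch()` calls: run as a script it blocks exactly as before (`lock=True`), while `lock=False` maps to Gradio's `prevent_thread_lock=True`, so a caller such as the trainer's commented-out `start(lock=False)` gets the UI server in a background thread and keeps running. A minimal sketch of that switch around a placeholder Blocks app:

```python
import gradio as gr

with gr.Blocks() as ui:
	gr.Markdown("placeholder UI")

def start(lock=True):
	ui.queue(max_size=8)
	# lock=True  -> launch() blocks, as when the webui is run directly
	# lock=False -> prevent_thread_lock=True: launch() returns immediately and
	#               the server keeps serving from a background thread
	ui.launch(prevent_thread_lock=not lock)

if __name__ == "__main__":
	start()
```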