From 9384900ce647cb8c0e6fdc3f8413b98ac1d67e21 Mon Sep 17 00:00:00 2001 From: mrq Date: Fri, 22 Sep 2023 13:04:17 -0500 Subject: [PATCH] revert the frankensteined "train one model but hotload the other" since it kept loading the last exported weights and I'm not supporting this usecase anymore anyways --- vall_e/train.py | 21 +++------------------ vall_e/utils/trainer.py | 19 +------------------ 2 files changed, 4 insertions(+), 36 deletions(-) diff --git a/vall_e/train.py b/vall_e/train.py index 8617ea5..9179553 100755 --- a/vall_e/train.py +++ b/vall_e/train.py @@ -45,7 +45,7 @@ def train_feeder(engine, batch): return loss, stats @torch.inference_mode() -def run_eval(engines, disabled_engines, eval_name, dl): +def run_eval(engines, eval_name, dl): AR = None NAR = None AR_NAR = None @@ -62,16 +62,6 @@ def run_eval(engines, disabled_engines, eval_name, dl): continue names.append(name) - # hotload the missing models - for name, engine in disabled_engines.items(): - if AR is None and name[:2] == "ar": - AR = engine - elif NAR is None and name[:3] == "nar": - NAR = engine - else: - continue - names.append(name) - stats = defaultdict(list) stats['loss'] = [] @@ -163,18 +153,13 @@ def main(): train_dl, subtrain_dl, val_dl = create_train_val_dataloader() def eval_fn(engines): - disabled_engines = load_engines(invert=True) if cfg.evaluation.load_disabled_engines else {} try: - run_eval(engines, disabled_engines, "subtrain", subtrain_dl) - run_eval(engines, disabled_engines, "val", val_dl) + run_eval(engines, "subtrain", subtrain_dl) + run_eval(engines, "val", val_dl) except Exception as e: print("Error occurred while performing eval:", str(e)) print(traceback.format_exc()) - if len(disabled_engines.keys()): - for name, engine in disabled_engines.items(): - engine = engine.to("cpu") - del disabled_engines qnt.unload_model() do_gc() diff --git a/vall_e/utils/trainer.py b/vall_e/utils/trainer.py index ace03b3..d789723 100755 --- a/vall_e/utils/trainer.py +++ b/vall_e/utils/trainer.py @@ -38,28 +38,11 @@ from ..data import get_phone_symmap # should decouple from this trainer script _logger = logging.getLogger(__name__) _command: str -def load_engines(invert=False): +def load_engines(): models = get_models(cfg.models.get()) engines = dict() for name, model in models.items(): - if cfg.mode != "inferencing": - # load only the models for training initially - # loads disabled models at evaluation time (to load updated weights if training separately) - # I'm sure there's a more elegant solution to this - if cfg.evaluation.load_disabled_engines: - if not invert and not model._cfg.training: - continue - if invert and model._cfg.training: - continue - # load only the models for training initially - # if load_disabled_engines, then models not marked for training will be loaded but ignored - # DeepSpeed has some weird quirks where loading an engine and moving it to CPU will have a memory leak or something - # I recommend not using this pathway - elif not cfg.trainer.load_disabled_engines: - if model._cfg.training: - continue - optimizer = None lr_scheduler = None