revert the frankensteined "train one model but hotload the other" pathway, since it kept loading the last exported weights and I'm not supporting this use case anymore anyway

mrq 2023-09-22 13:04:17 -05:00
parent e7da1eb90d
commit 9384900ce6
2 changed files with 4 additions and 36 deletions


@@ -45,7 +45,7 @@ def train_feeder(engine, batch):
     return loss, stats

 @torch.inference_mode()
-def run_eval(engines, disabled_engines, eval_name, dl):
+def run_eval(engines, eval_name, dl):
     AR = None
     NAR = None
     AR_NAR = None
@@ -62,16 +62,6 @@ def run_eval(engines, disabled_engines, eval_name, dl):
             continue
         names.append(name)

-    # hotload the missing models
-    for name, engine in disabled_engines.items():
-        if AR is None and name[:2] == "ar":
-            AR = engine
-        elif NAR is None and name[:3] == "nar":
-            NAR = engine
-        else:
-            continue
-        names.append(name)
-
     stats = defaultdict(list)
     stats['loss'] = []
@@ -163,18 +153,13 @@ def main():
     train_dl, subtrain_dl, val_dl = create_train_val_dataloader()

     def eval_fn(engines):
-        disabled_engines = load_engines(invert=True) if cfg.evaluation.load_disabled_engines else {}

         try:
-            run_eval(engines, disabled_engines, "subtrain", subtrain_dl)
-            run_eval(engines, disabled_engines, "val", val_dl)
+            run_eval(engines, "subtrain", subtrain_dl)
+            run_eval(engines, "val", val_dl)
         except Exception as e:
             print("Error occurred while performing eval:", str(e))
             print(traceback.format_exc())

-        if len(disabled_engines.keys()):
-            for name, engine in disabled_engines.items():
-                engine = engine.to("cpu")
-            del disabled_engines
-
         qnt.unload_model()
         do_gc()
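
For reference, reassembling the kept context lines and the two added calls from the hunk above gives the post-revert shape of the evaluation callback (a reading of the diff, not a verbatim copy of the file):

    def eval_fn(engines):
        # run both evaluation passes over the engines that were passed in;
        # failures are logged but never abort training
        try:
            run_eval(engines, "subtrain", subtrain_dl)
            run_eval(engines, "val", val_dl)
        except Exception as e:
            print("Error occurred while performing eval:", str(e))
            print(traceback.format_exc())

        qnt.unload_model()
        do_gc()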


@@ -38,28 +38,11 @@ from ..data import get_phone_symmap # should decouple from this trainer script

 _logger = logging.getLogger(__name__)
 _command: str

-def load_engines(invert=False):
+def load_engines():
     models = get_models(cfg.models.get())
     engines = dict()

     for name, model in models.items():
-        if cfg.mode != "inferencing":
-            # load only the models for training initially
-            # loads disabled models at evaluation time (to load updated weights if training separately)
-            # I'm sure there's a more elegant solution to this
-            if cfg.evaluation.load_disabled_engines:
-                if not invert and not model._cfg.training:
-                    continue
-                if invert and model._cfg.training:
-                    continue
-
-            # load only the models for training initially
-            # if load_disabled_engines, then models not marked for training will be loaded but ignored
-            # DeepSpeed has some weird quirks where loading an engine and moving it to CPU will have a memory leak or something
-            # I recommend not using this pathway
-            elif not cfg.trainer.load_disabled_engines:
-                if model._cfg.training:
-                    continue
-
         optimizer = None
         lr_scheduler = None
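
The removed branch is what implemented the "train one model but hotload the other" behavior: calling load_engines(invert=True) at evaluation time flipped the filter so only the models not currently being trained were returned. A minimal, self-contained sketch of that selection logic under the cfg.evaluation.load_disabled_engines pathway (ModelStub and select_models are illustrative stand-ins, not names from this repo):

    from dataclasses import dataclass

    @dataclass
    class ModelStub:
        training: bool  # True for the model currently being trained

    def select_models(models, invert=False):
        # mirror of the removed filter: keep the models marked for training,
        # or, with invert=True, keep only the ones that are not being trained
        return {name: m for name, m in models.items() if m.training != invert}

    models = {"ar": ModelStub(training=True), "nar": ModelStub(training=False)}
    print(list(select_models(models).keys()))               # ['ar']
    print(list(select_models(models, invert=True).keys()))  # ['nar']

Per the commit message, this pathway is dropped because the inverted call kept loading the last exported weights.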