revert the frankensteined "train one model but hotload the other" pathway, since it kept loading the last exported weights and I'm not supporting this use case anymore anyway
This commit is contained in:
parent e7da1eb90d
commit 9384900ce6
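For context, this is the pathway being reverted, condensed from the deleted lines in the hunks below (none of it survives the commit). The eval hook hotloaded the engines *not* marked for training from their last exported checkpoint, run_eval() used them to fill a missing AR/NAR slot by name prefix, and they were evicted again afterwards:

def eval_fn(engines):
	# hotload the models NOT marked for training (invert=True), pulled from
	# whatever checkpoint was last exported to disk -- hence the stale weights
	disabled_engines = load_engines(invert=True) if cfg.evaluation.load_disabled_engines else {}
	try:
		# inside run_eval(), a (now deleted) loop matched name[:2] == "ar" /
		# name[:3] == "nar" against disabled_engines to fill empty slots
		run_eval(engines, disabled_engines, "subtrain", subtrain_dl)
		run_eval(engines, disabled_engines, "val", val_dl)
	except Exception as e:
		print("Error occurred while performing eval:", str(e))
		print(traceback.format_exc())

	# evict the hotloaded engines again after eval
	if len(disabled_engines.keys()):
		for name, engine in disabled_engines.items():
			engine = engine.to("cpu")
		del disabled_engines
	qnt.unload_model()
	do_gc()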
@@ -45,7 +45,7 @@ def train_feeder(engine, batch):
 	return loss, stats
 
 @torch.inference_mode()
-def run_eval(engines, disabled_engines, eval_name, dl):
+def run_eval(engines, eval_name, dl):
 	AR = None
 	NAR = None
 	AR_NAR = None
@@ -62,16 +62,6 @@ def run_eval(engines, disabled_engines, eval_name, dl):
 			continue
 		names.append(name)
 
-	# hotload the missing models
-	for name, engine in disabled_engines.items():
-		if AR is None and name[:2] == "ar":
-			AR = engine
-		elif NAR is None and name[:3] == "nar":
-			NAR = engine
-		else:
-			continue
-		names.append(name)
-
 	stats = defaultdict(list)
 	stats['loss'] = []
 
@@ -163,18 +153,13 @@ def main():
 	train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
 
 	def eval_fn(engines):
-		disabled_engines = load_engines(invert=True) if cfg.evaluation.load_disabled_engines else {}
 		try:
-			run_eval(engines, disabled_engines, "subtrain", subtrain_dl)
-			run_eval(engines, disabled_engines, "val", val_dl)
+			run_eval(engines, "subtrain", subtrain_dl)
+			run_eval(engines, "val", val_dl)
 		except Exception as e:
 			print("Error occurred while performing eval:", str(e))
 			print(traceback.format_exc())
 
-		if len(disabled_engines.keys()):
-			for name, engine in disabled_engines.items():
-				engine = engine.to("cpu")
-			del disabled_engines
 		qnt.unload_model()
 		do_gc()
 
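For reference, the eval hook as it reads after the revert, assembled from the context and surviving lines of the hunk above:

def eval_fn(engines):
	try:
		run_eval(engines, "subtrain", subtrain_dl)
		run_eval(engines, "val", val_dl)
	except Exception as e:
		print("Error occurred while performing eval:", str(e))
		print(traceback.format_exc())

	qnt.unload_model()
	do_gc()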
@@ -38,28 +38,11 @@ from ..data import get_phone_symmap # should decouple from this trainer script
 _logger = logging.getLogger(__name__)
 _command: str
 
-def load_engines(invert=False):
+def load_engines():
 	models = get_models(cfg.models.get())
 	engines = dict()
 
 	for name, model in models.items():
-		if cfg.mode != "inferencing":
-			# load only the models for training initially
-			# loads disabled models at evaluation time (to load updated weights if training separately)
-			# I'm sure there's a more elegant solution to this
-			if cfg.evaluation.load_disabled_engines:
-				if not invert and not model._cfg.training:
-					continue
-				if invert and model._cfg.training:
-					continue
-			# load only the models for training initially
-			# if load_disabled_engines, then models not marked for training will be loaded but ignored
-			# DeepSpeed has some weird quirks where loading an engine and moving it to CPU will have a memory leak or something
-			# I recommend not using this pathway
-			elif not cfg.trainer.load_disabled_engines:
-				if model._cfg.training:
-					continue
-
 		optimizer = None
 		lr_scheduler = None
 