fixed not cleaning up states while training (oops)
This commit is contained in:
parent
b31bf1206e
commit
7b80f7a42f
|
@ -1 +1 @@
|
|||
Subproject commit 3fdf2a63aaf901f16763fa632269b823915199f4
|
||||
Subproject commit b253da6e353f0170c3eb60fe299c41d2fa21db50
|
12
src/utils.py
12
src/utils.py
|
@ -937,15 +937,19 @@ class TrainingState():
|
|||
|
||||
should_return = True
|
||||
else:
|
||||
# INFO: Training Metrics: {"loss_text_ce": 4.308311939239502, "loss_mel_ce": 2.1610655784606934, "loss_gpt_total": 2.204148769378662, "lr": 0.0001, "it": 2, "step": 1, "steps": 1, "epoch": 1, "iteration_rate": 0.10700102965037028}
|
||||
if line.find('INFO: Training Metrics:') >= 0:
|
||||
data = None
|
||||
if line.find('INFO: Saving models and training states.') >= 0:
|
||||
self.checkpoint += 1
|
||||
message = f"[{self.checkpoint}/{self.checkpoints}] Saving checkpoint..."
|
||||
percent = self.checkpoint / self.checkpoints
|
||||
|
||||
self.cleanup_old(keep=keep_x_past_checkpoints)
|
||||
elif line.find('INFO: Training Metrics:') >= 0:
|
||||
data = json.loads(line.split("INFO: Training Metrics:")[-1])
|
||||
data['mode'] = "training"
|
||||
elif line.find('INFO: Validation Metrics:') >= 0:
|
||||
data = json.loads(line.split("INFO: Validation Metrics:")[-1])
|
||||
data['mode'] = "validation"
|
||||
else:
|
||||
data = None
|
||||
|
||||
if data is not None:
|
||||
if ': nan' in line and not self.nan_detected:
|
||||
|
|
Loading…
Reference in New Issue
Block a user