adjust fp16 loss scaling since I fried a model overnight when it hit 8K scale

mrq 2024-11-14 09:23:52 -06:00
parent e412e98125
commit ef05c951ff


@@ -558,6 +558,8 @@ class DeepSpeed:
 "fp16": {
     "enabled": cfg.trainer.weight_dtype.lower() == "float16",
     "auto_cast": True, # ???
+    "loss_scale_window": 100, # raise every 100 consecutive good steps
+    "min_loss_scale": 32768.0, # loss scale hitting 8K fries the model, 16K is fine but 32K is comfy
     "loss_scale": 0.0 if cfg.trainer.scale_loss else 1.0,
 },
 "bf16": {