some possible sanity with deepspeed config

mrq 2024-05-09 22:48:42 -05:00
parent c4b696ebeb
commit b7bd885651


@@ -338,6 +338,7 @@ class DeepSpeed:
 	use_compression_training: bool = False
 	compression_bits: int = 8
 	inferencing: bool = False
+	amp: bool = True

 	@cached_property
 	def ds_cfg(self):
@@ -353,10 +354,6 @@ class DeepSpeed:
 		if 'total_num_steps' not in scheduler_params:
 			scheduler_params['total_num_steps'] = cfg.trainer.iterations

-		# documentation says neither can work
-		if cfg.trainer.weight_dtype.lower() == "float16":
-			cfg.trainer.amp = False
-
 		autotune_params = cfg.hyperparameters.autotune_params

 		if "enabled" not in autotune_params:
@@ -368,6 +365,14 @@ class DeepSpeed:
 		if "exps_dir" not in autotune_params:
 			autotune_params['exps_dir'] = str( cfg.relpath / "autotune" / "exps_" )

+		# DeepSpeed fp16 is incompatible with its AMP
+		if cfg.trainer.weight_dtype.lower() == "float16":
+			self.amp = False
+
+		# disable local AMP
+		if self.amp:
+			cfg.trainer.amp = False
+
 		ds_cfg = {
 			"train_micro_batch_size_per_gpu": cfg.hyperparameters.batch_size,
 			"gradient_accumulation_steps": cfg.hyperparameters.gradient_accumulation_steps,
@@ -382,13 +387,13 @@ class DeepSpeed:
 			"gradient_clipping": cfg.hyperparameters.gradient_clipping,
 			"fp16": {
 				"enabled": cfg.trainer.weight_dtype.lower() == "float16",
-				"auto_cast": False, # ???
-			},
+				"auto_cast": True, # ???
+			} if not self.amp else None,
 			"bf16": {
 				"enabled": cfg.trainer.weight_dtype.lower() == "bfloat16",
 			},
 			"amp": {
-				"enabled": cfg.trainer.amp,
+				"enabled": self.amp,
 			},
 			"autotuning": autotune_params if cfg.hyperparameters.autotune else None,
 			"compression_training": {
@@ -469,9 +474,6 @@ class DeepSpeed:
 			}
 		}

-		# disable local AMP
-		cfg.trainer.amp = False
-
 		null_keys = [ k for k in ds_cfg if not ds_cfg[k] ]
 		for k in null_keys:
 			del ds_cfg[k]
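
The null_keys sweep at the end is what actually removes entries like the fp16 block once it has been replaced by None (or autotuning when autotune is off): any top-level key with a falsy value is dropped, while nested falsy values such as { "enabled": False } survive because a non-empty dict is still truthy. A small sketch of that pruning on illustrative values:

ds_cfg = {
	"fp16": None,                   # collapsed to None because DeepSpeed AMP is in use
	"bf16": { "enabled": False },   # non-empty dict, truthy, so it is kept
	"amp": { "enabled": True },
	"autotuning": None,             # autotune disabled
}

# same pruning as in the hunk above
null_keys = [ k for k in ds_cfg if not ds_cfg[k] ]
for k in null_keys:
	del ds_cfg[k]

print( ds_cfg )
# {'bf16': {'enabled': False}, 'amp': {'enabled': True}}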