vall-e/vall_e/engines/deepspeed.py

"""
# https://github.com/enhuiz/pytorch-training-utilities
"""

# to-do: replace this
# to-do: swap out deepspeed

from ..config import cfg
from ..utils import dispatch_attribute, flatten_dict, gather_attribute, do_gc, to_device

import logging
import time
import torch
import torch.distributed

from torch import Tensor
from torch.distributed import all_reduce
from typing import Any, Protocol

from .base import TrainFeeder

_logger = logging.getLogger(__name__)

from deepspeed import DeepSpeedEngine, DeepSpeedConfig, comm as dist, init_distributed as init_deepspeed_dist
from deepspeed.accelerator import get_accelerator

from ..utils.distributed import init_distributed, distributed_initialized
from ..utils import wrapper as ml

from ..models.lora import freeze_non_lora_weights

# Bring up the distributed process group at import time, but only when
# nothing has initialized it yet and DeepSpeed is the configured backend.
if not distributed_initialized():
	if cfg.trainer.backend == "deepspeed":
		init_distributed(init_deepspeed_dist)

class Engine(DeepSpeedEngine):
	"""
	`DeepSpeedEngine` subclass that adds this project's bookkeeping on top:
	resumable step / sample / token counters, parameter freezing (including a
	LoRA-only mode), LoRA-aware checkpoint directories, and a one-call
	training step (`traverse`).
	"""

	def __init__(self, *args, **kwargs):
		"""
		Build the engine using the project-wide DeepSpeed configuration.

		Keyword arguments consumed here (everything else is forwarded to
		`DeepSpeedEngine.__init__`):
		  hyper_config: optional per-model config object; this class reads its
		    `training` and `frozen_params` attributes. Defaults to None.
		  stats: optional mapping with any of "global_step", "micro_step",
		    "global_samples", "tokens_processed" used to seed the counters.
		    Missing keys (or `stats=None`) fall back to 0.

		Any caller-supplied `config` / `config_class` is overridden with
		`cfg.trainer.deepspeed.ds_cfg`.
		"""
		# assigned before the parent constructor runs
		self.hyper_config = kwargs.pop("hyper_config", None)

		kwargs['config'] = cfg.trainer.deepspeed.ds_cfg
		kwargs['config_class'] = DeepSpeedConfig(kwargs['config'])

		stats = {
			"global_step": 0,
			"micro_step": 0,
			"global_samples": 0,
			"tokens_processed": 0,
		}

		# kwargs['stats'] = None is allowed and means "start from zero";
		# a partial mapping only overrides the keys it actually provides
		maybe_stats = kwargs.pop('stats', None)
		if maybe_stats is not None:
			stats.update(maybe_stats)

		# NOTE(review): the leading None fills the parent's first positional
		# parameter (presumably DeepSpeed's `args`) - confirm against the
		# installed DeepSpeed version
		super().__init__(None, *args, **kwargs)
		self._frozen_params = set()

		# written after the parent constructor so the seeded values are the
		# ones left on the instance
		self.global_steps = stats["global_step"]
		self.micro_steps = stats["micro_step"]
		self.global_samples = stats["global_samples"]
		self.tokens_processed = stats["tokens_processed"]

		# number of NaN losses `traverse` tolerates before raising
		self.max_nan_losses = 8
		# 0 means "unknown"; `batch_size` then falls back to the configured value
		self.current_batch_size = 0

	def freeze(self, freeze_all=True):
		"""
		Disable gradients on parameters and remember them for `unfreeze`.

		freeze_all=True: every parameter that currently requires grad is frozen.
		freeze_all=False:
		  * if `hyper_config.frozen_params` is empty/missing and a LoRA is
		    configured, `freeze_non_lora_weights` picks what to freeze;
		  * otherwise only parameters whose name is listed in
		    `hyper_config.frozen_params` are frozen.

		Raises:
		  Exception: freeze_all=False, no LoRA configured, and there is no
		    `hyper_config.frozen_params` to select parameters by.
		"""
		# tolerate a missing hyper_config or a hyper_config without the attribute
		frozen_names = getattr(self.hyper_config, "frozen_params", None)

		if not freeze_all:
			# freeze non-LoRA params if requested
			if not frozen_names and cfg.lora is not None:
				frozen_params = freeze_non_lora_weights( self.module, embeddings=cfg.lora.embeddings )
				self._frozen_params.update( frozen_params )
				return

			# selective freezing needs a list of names to match against
			if frozen_names is None:
				raise Exception("freeze_all=False yet self.hyper_config.frozen_params is None")

		for name, param in self.module.named_parameters():
			should_freeze = param.requires_grad if freeze_all else name in frozen_names
			if should_freeze:
				param.requires_grad_(False)
				self._frozen_params.add(param)

	def unfreeze(self):
		"""Re-enable gradients on every parameter recorded by `freeze`, then forget them."""
		for param in self._frozen_params:
			param.requires_grad_(True)
		self._frozen_params.clear()

	@property
	def _training(self):
		"""The `training` flag of the attached `hyper_config`."""
		return self.hyper_config.training

	@property
	def global_step(self):
		"""Alias for `global_steps`."""
		return self.global_steps

	@property
	def micro_step(self):
		"""Alias for `micro_steps`."""
		return self.micro_steps

	@property
	def batch_size(self):
		"""`current_batch_size` when it is positive, else `cfg.hyperparameters.batch_size`."""
		if self.current_batch_size > 0:
			return self.current_batch_size
		return cfg.hyperparameters.batch_size

	def gather_attribute(self, *args, **kwargs):
		"""Call the module-level `gather_attribute` helper on the wrapped module."""
		return gather_attribute(self.module, *args, **kwargs)

	def dispatch_attribute(self, *args, **kwargs):
		"""Call the module-level `dispatch_attribute` helper on the wrapped module."""
		return dispatch_attribute(self.module, *args, **kwargs)

	def set_lr(self, lr):
		"""
		Best-effort learning-rate override.

		For optimizers exposing `param_groups`, `lr` is written to each group's
		"d_coeff" entry when the group has one (presumably Prodigy's LR
		multiplier - confirm), otherwise to "lr". Optimizers without
		`param_groups` get `optimizer.set_lr(lr)`. Any failure is logged as a
		warning and swallowed rather than raised.
		"""
		try:
			if hasattr(self.optimizer, 'param_groups'):
				for param_group in self.optimizer.param_groups:
					param_group["d_coeff" if "d_coeff" in param_group else "lr"] = lr
			else:
				self.optimizer.set_lr(lr)
		except Exception as e:
			_logger.warning(str(e))

	# we'll just have to live with the LoRA weights living within our main weights
	# they're easy to extract anyways
	def load_checkpoint(self, load_dir, **kwargs ):
		"""
		Load a checkpoint through the parent class.

		When a LoRA is configured, `load_dir` is ignored and replaced with
		`cfg.ckpt_dir / cfg.lora.full_name`.
		"""
		# override to load the lora instead
		if cfg.lora is not None:
			load_dir = cfg.ckpt_dir / cfg.lora.full_name

		return super().load_checkpoint( load_dir, **kwargs )

	def save_checkpoint(self, save_dir, **kwargs ):
		"""
		Save a checkpoint through the parent class.

		When a LoRA is configured, `save_dir` is ignored and replaced with
		`cfg.ckpt_dir / cfg.lora.full_name`.
		"""
		# override to save the lora instead
		if cfg.lora is not None:
			save_dir = cfg.ckpt_dir / cfg.lora.full_name

		return super().save_checkpoint( save_dir, **kwargs )

	def traverse(self, *args, **kwargs):
		"""
		Run one training step: forward (under autocast), backward, optimizer step.

		All arguments are passed straight to `forward`. The loss is the sum of
		every tensor gathered under the "loss" attribute of the module.

		Returns:
		  dict with each individual loss as a Python number, merged with the
		  values gathered under the "scalar" attribute.

		Raises:
		  RuntimeError: the summed loss was NaN more often than the
		    `max_nan_losses` budget allows (the 9th NaN with the default of 8).
		"""
		with ml.autocast():
			self.forward(*args, **kwargs)

		losses = self.gather_attribute("loss")
		loss = torch.stack([*losses.values()]).sum()

		if torch.isnan(loss).any():
			self.max_nan_losses -= 1
			if self.max_nan_losses < 0:
				raise RuntimeError("Too many NaN losses detected.")
			# a NaN that is still within budget falls through: backward() and
			# step() below run with the NaN loss

		stats = {}
		stats |= {k: v.item() for k, v in losses.items()}
		stats |= self.gather_attribute("scalar")

		self.backward(loss)
		self.step()

		return stats
big cleanup 2023-08-04 01:26:36 +00:00			`"""`
			`# https://github.com/enhuiz/pytorch-training-utilities`
			`"""`

			`# to-do: replace this`
			`# to-do: swap out deepspeed`

			`from ..config import cfg`
			`from ..utils import dispatch_attribute, flatten_dict, gather_attribute, do_gc, to_device`

			`import logging`
			`import time`
			`import torch`
			`import torch.distributed`

			`from torch import Tensor`
			`from torch.distributed import all_reduce`
			`from typing import Any, Protocol`

			`from .base import TrainFeeder`

			`_logger = logging.getLogger(__name__)`

some fixes for the local framework 2023-08-05 03:22:15 +00:00			`from deepspeed import DeepSpeedEngine, DeepSpeedConfig, comm as dist, init_distributed as init_deepspeed_dist`
big cleanup 2023-08-04 01:26:36 +00:00			`from deepspeed.accelerator import get_accelerator`

some fixes for the local framework 2023-08-05 03:22:15 +00:00			`from ..utils.distributed import init_distributed, distributed_initialized`
added FP8 support through `NVIDIA/TransformerEngine`, added RetNet_HF through `syncdoth/RetNet` (as an alternative to branch away from torchscale) 2024-04-09 01:14:51 +00:00			`from ..utils import wrapper as ml`
some fixes for the local framework 2023-08-05 03:22:15 +00:00
added prom-less training / inferencing, some other things 2024-07-23 00:36:07 +00:00			`from ..models.lora import freeze_non_lora_weights`

some fixes for the local framework 2023-08-05 03:22:15 +00:00			`if not distributed_initialized() and cfg.trainer.backend == "deepspeed":`
			`init_distributed(init_deepspeed_dist)`
big cleanup 2023-08-04 01:26:36 +00:00
			`class Engine(DeepSpeedEngine):`
			`def __init__(self, args, *kwargs):`
feverish cleanup 2024-06-04 02:28:49 +00:00			`self.hyper_config = None`
			`if 'hyper_config' in kwargs:`
			`self.hyper_config = kwargs['hyper_config']`
			`kwargs.pop("hyper_config")`
nightmare of spaghetti that might break compat; mechanism to increase RVQ bins of an existing model without retraining, keeps sampled proms/resps at max RVQ level and trim off excess levels according to what model receives them, some other things I already forgot (I really hope no one else has weights being baked right now) 2023-08-19 20:06:33 +00:00
			`kwargs['config'] = cfg.trainer.deepspeed.ds_cfg`
big cleanup 2023-08-04 01:26:36 +00:00			`kwargs['config_class'] = DeepSpeedConfig(kwargs['config'])`

edge case 2023-09-21 00:20:17 +00:00			`stats = {`
fixed training stats not loading from exported weights, a bit of a readme cleanup, updated example training yaml 2023-09-24 00:59:00 +00:00			`"global_step": 0,`
			`"micro_step": 0,`
edge case 2023-09-21 00:20:17 +00:00			`"global_samples": 0,`
			`"tokens_processed": 0,`
			`}`

			`# kwargs['stats'] = None will return None when popped`
			`maybe_stats = kwargs.pop('stats', stats)`
			`if maybe_stats is not None:`
			`stats = maybe_stats`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00
big cleanup 2023-08-04 01:26:36 +00:00			`super().__init__(None, args, *kwargs)`
			`self._frozen_params = set()`

fixed training stats not loading from exported weights, a bit of a readme cleanup, updated example training yaml 2023-09-24 00:59:00 +00:00			`self.global_steps = stats["global_step"]`
			`self.micro_steps = stats["micro_step"]`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`self.global_samples = stats["global_samples"]`
			`self.tokens_processed = stats["tokens_processed"]`
added total samples processed and tokens processed (len of text tokens + len of target response tokens) 2023-08-28 16:02:45 +00:00
nan loss detection (should have added it earlier), loss scaling for local backend + fp16 2024-05-12 03:23:29 +00:00			`self.max_nan_losses = 8`
local training backend should be a bit more aware of variable batch sizes, maybe 2024-06-29 03:39:05 +00:00			`self.current_batch_size = 0`
nan loss detection (should have added it earlier), loss scaling for local backend + fp16 2024-05-12 03:23:29 +00:00
added option to specify parameters to freeze per-model in YAML (because I need to see about committing atrocities with convering an AR into an AR+NAR) 2023-09-07 23:19:51 +00:00			`def freeze(self, freeze_all=True):`
naive, rudimentary DeepSpeed support (just live with the LoRA weights living with the original weights, they can be split later) 2024-06-17 18:17:24 +00:00			`# freeze non-LoRA params if requested`
			`if not self.hyper_config.frozen_params and not freeze_all and cfg.lora is not None:`
added prom-less training / inferencing, some other things 2024-07-23 00:36:07 +00:00			`frozen_params = freeze_non_lora_weights( self.module, embeddings=cfg.lora.embeddings )`
			`for param in frozen_params:`
			`self._frozen_params.add( param )`

actually make deepspeed work with LoRAs 2024-06-17 18:55:37 +00:00			`return`
naive, rudimentary DeepSpeed support (just live with the LoRA weights living with the original weights, they can be split later) 2024-06-17 18:17:24 +00:00
enable LoRA for targetted RVQ levels (to experiment with, seems to help) 2024-06-18 02:45:03 +00:00			`if self.hyper_config is None or not hasattr(self.hyper_config, "frozen_params"):`
			`raise Exception("freeze_all=False yet self.hyper_config.frozen_params is None")`

added option to specify parameters to freeze per-model in YAML (because I need to see about committing atrocities with convering an AR into an AR+NAR) 2023-09-07 23:19:51 +00:00			`for name, param in self.module.named_parameters():`
feverish cleanup 2024-06-04 02:28:49 +00:00			`if (freeze_all and param.requires_grad) or (not freeze_all and name in self.hyper_config.frozen_params):`
added option to specify parameters to freeze per-model in YAML (because I need to see about committing atrocities with convering an AR into an AR+NAR) 2023-09-07 23:19:51 +00:00			`param.requires_grad_(False)`
			`self._frozen_params.add(param)`
big cleanup 2023-08-04 01:26:36 +00:00
			`def unfreeze(self):`
added option to specify parameters to freeze per-model in YAML (because I need to see about committing atrocities with convering an AR into an AR+NAR) 2023-09-07 23:19:51 +00:00			`for param in self._frozen_params:`
			`param.requires_grad_(True)`
big cleanup 2023-08-04 01:26:36 +00:00			`self._frozen_params.clear()`
added ability to mark models as disabled for training, and hotloading them for eval/validation (useful if training only one model, or training a model per GPU) 2023-08-27 17:26:12 +00:00
			`@property`
ugh 2024-06-07 02:57:11 +00:00			`def _training(self):`
feverish cleanup 2024-06-04 02:28:49 +00:00			`return self.hyper_config.training`
big cleanup 2023-08-04 01:26:36 +00:00
			`@property`
			`def global_step(self):`
			`return self.global_steps`

			`@property`
			`def micro_step(self):`
added total samples processed and tokens processed (len of text tokens + len of target response tokens) 2023-08-28 16:02:45 +00:00			`return self.micro_steps`
big cleanup 2023-08-04 01:26:36 +00:00
added ability to mark models as disabled for training, and hotloading them for eval/validation (useful if training only one model, or training a model per GPU) 2023-08-27 17:26:12 +00:00			`@property`
			`def batch_size(self):`
local training backend should be a bit more aware of variable batch sizes, maybe 2024-06-29 03:39:05 +00:00			`return self.current_batch_size if self.current_batch_size > 0 else cfg.hyperparameters.batch_size`
added ability to mark models as disabled for training, and hotloading them for eval/validation (useful if training only one model, or training a model per GPU) 2023-08-27 17:26:12 +00:00
big cleanup 2023-08-04 01:26:36 +00:00			`def gather_attribute(self, args, *kwargs):`
			`return gather_attribute(self.module, args, *kwargs)`

			`def dispatch_attribute(self, args, *kwargs):`
			`return dispatch_attribute(self.module, args, *kwargs)`

			`def set_lr(self, lr):`
			`try:`
			`if hasattr(self.optimizer, 'param_groups'):`
			`for param_group in self.optimizer.param_groups:`
added Mistral (non-Mixtral) backend, useless optimization when not training, proper adjustment of the LR for Prodigyopt through d_coeff (maybe), recurrent sampling for LLaMA/Mistral/Mixtral backends (again, doesn't actually work) 2024-02-01 03:48:36 +00:00			`param_group["d_coeff" if "d_coeff" in param_group else "lr"] = lr`
big cleanup 2023-08-04 01:26:36 +00:00			`else:`
			`self.optimizer.set_lr(lr)`
			`except Exception as e:`
moved prints to use logger, edited readme (fused_attn doesnt seem stable for training) 2024-08-29 18:27:16 +00:00			`_logger.warning(str(e))`
big cleanup 2023-08-04 01:26:36 +00:00
naive, rudimentary DeepSpeed support (just live with the LoRA weights living with the original weights, they can be split later) 2024-06-17 18:17:24 +00:00			`# we'll just have to live with the LoRA weights living within our main weights`
			`# they're easy to extract anyways`
			`def load_checkpoint(self, load_dir, **kwargs ):`
			`# override to load the lora instead`
			`if cfg.lora is not None:`
			`load_dir = cfg.ckpt_dir / cfg.lora.full_name`

			`return super().load_checkpoint( load_dir, **kwargs )`

			`def save_checkpoint(self, save_dir, **kwargs ):`
			`# override to save the lora instead`
			`if cfg.lora is not None:`
			`save_dir = cfg.ckpt_dir / cfg.lora.full_name`

			`return super().save_checkpoint( save_dir, **kwargs )`

big cleanup 2023-08-04 01:26:36 +00:00			`def traverse(self, args, *kwargs):`
added FP8 support through `NVIDIA/TransformerEngine`, added RetNet_HF through `syncdoth/RetNet` (as an alternative to branch away from torchscale) 2024-04-09 01:14:51 +00:00			`with ml.autocast():`
somewhat got recurrent forward working (it's as accurate as chunkwise forward: it's not accurate at all), added option to use AMP instead of blanket setting the weight's dtype 2023-09-02 01:58:29 +00:00			`self.forward(args, *kwargs)`
added FP8 support through `NVIDIA/TransformerEngine`, added RetNet_HF through `syncdoth/RetNet` (as an alternative to branch away from torchscale) 2024-04-09 01:14:51 +00:00
			`losses = self.gather_attribute("loss")`
			`loss = torch.stack([*losses.values()]).sum()`
big cleanup 2023-08-04 01:26:36 +00:00
nan loss detection (should have added it earlier), loss scaling for local backend + fp16 2024-05-12 03:23:29 +00:00			`if torch.isnan(loss).any():`
			`self.max_nan_losses = self.max_nan_losses - 1`
			`if self.max_nan_losses < 0:`
			`raise RuntimeError("Too many NaN losses detected.")`

big cleanup 2023-08-04 01:26:36 +00:00			`stats = {}`
			`stats \|= {k: v.item() for k, v in losses.items()}`
			`stats \|= self.gather_attribute("scalar")`

			`self.backward(loss)`
			`self.step()`

			`return stats`