vall-e/vall_e/utils/wrapper.py


from contextlib import contextmanager
import torch
import torch.nn.functional as F
from ..config import cfg
Embedding = torch.nn.Embedding
Linear = torch.nn.Linear
# https://github.com/kyegomez/BitNet
if cfg.bitsandbytes.bitnet:
    from bitnet import BitLinear
if cfg.bitsandbytes.enabled:
    import bitsandbytes as bnb

    if cfg.bitsandbytes.linear:
        if cfg.bitsandbytes.bitnet:
            Linear = BitLinear
        else:
            Linear = bnb.nn.Linear8bitLt
    if cfg.bitsandbytes.embedding:
        Embedding = bnb.nn.modules.Embedding
        """
        Embedding.forward = lambda self, input: ( self.norm(F.embedding(
            input,
            self.weight,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )).to(self.weight.dtype) )
        """
if cfg.bitsandbytes.enabled:
    import bitsandbytes as bnb

    Adam = bnb.optim.Adam8bit
    AdamW = bnb.optim.AdamW8bit
    SGD = bnb.optim.SGD8bit
else:
    Adam = torch.optim.Adam
    AdamW = torch.optim.AdamW
    SGD = torch.optim.SGD
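
# Example (editor's sketch, not part of the original module): the aliases above let
# callers build an optimizer without caring whether the 8-bit bitsandbytes variants
# are active; `_example_optimizer` and its arguments are illustrative only.
def _example_optimizer( params, lr=1.0e-4 ):
    # resolves to bnb.optim.AdamW8bit when cfg.bitsandbytes.enabled, else torch.optim.AdamW
    return AdamW( params, lr=lr )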
# handles generically converting to a specific tensor type and converting back (implemented solely for bfloat16)
@contextmanager
def autocast(input, from_dtype, to_dtype):
    if input.dtype == from_dtype:
        input = input.to(to_dtype)
        yield input
        # note: this only rebinds the local name; callers use the yielded tensor
        input = input.to(from_dtype)
    else:
        yield input
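
# Example (editor's sketch, not part of the original module): `autocast` is handy
# around ops that reject bfloat16, e.g. converting to numpy; the helper name is
# illustrative only.
def _example_to_numpy( x ):
    with autocast( x, torch.bfloat16, torch.float32 ) as k:
        return k.cpu().numpy()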
# like `autocast`, but converts when the tensor's dtype matches any entry of `from_dtype` (a list)
@contextmanager
def autocasts(input, from_dtype, to_dtype):
    if input.dtype in from_dtype:
        from_dtype = input.dtype
        input = input.to(to_dtype)
        yield input
        input = input.to(from_dtype)
    else:
        yield input
# handles temporarily upcasting 'index tensors' so torch will stop bitching
def autocast_forward( func ):
    def wrapper( self, input, *args, **kwargs ):
        with autocasts( input, [torch.int16, torch.int8, torch.uint8], torch.int32 ) as k:
            return func( self, k, *args, **kwargs )
    return wrapper
Embedding.forward = autocast_forward(Embedding.forward)
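
# Example (editor's sketch, not part of the original module): with the patched
# forward, low-precision index tensors are upcast to int32 just for the lookup;
# the sizes and names below are illustrative only.
def _example_embedding_lookup():
    emb = Embedding( 256, 16 )
    indices = torch.tensor( [1, 2, 3], dtype=torch.uint8 )
    return emb( indices )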
if cfg.bitsandbytes.injects and cfg.bitsandbytes.enabled:
    torch.nn.Linear = Linear
    torch.nn.Embedding = Embedding

    torch.optim.Adam = Adam
    torch.optim.AdamW = AdamW
    torch.optim.SGD = SGD
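
# Example (editor's sketch, not part of the original module): once injected, code
# elsewhere that instantiates torch.nn.Linear transparently gets the replacement
# class; the layer below is illustrative only.
def _example_injected_linear():
    return torch.nn.Linear( 8, 8 )  # resolves to BitLinear / Linear8bitLt when injection is active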
# disgusting kludge, but it works (just realized BitNet has its own replacement routine)
def replace_linear( model ):
    device = next(model.parameters()).device
    linears = [k.split('.') for k, m in model.named_modules() if isinstance(m, torch.nn.Linear)]
    for *parent, k in linears:
        name = '.'.join(parent)

        # copy the layer's shape / bias settings (the weights themselves are not copied)
        m = getattr( model.get_submodule(name), k )

        in_features = m.in_features
        out_features = m.out_features
        bias = m.bias is not None

        # overwrite with the replacement Linear class
        setattr(
            model.get_submodule(name), k,
            Linear( in_features=in_features, out_features=out_features, bias=bias )
        )

    return model.to(device) # because the replacement Linear is created on the CPU
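
# Example (editor's sketch, not part of the original module): `replace_linear` swaps
# every torch.nn.Linear in a model for whatever `Linear` resolved to above
# (BitLinear / Linear8bitLt / plain Linear); the toy model is illustrative only.
def _example_replace_linear():
    model = torch.nn.Sequential(
        torch.nn.Linear( 8, 8 ),
        torch.nn.ReLU(),
        torch.nn.Linear( 8, 4 ),
    )
    return replace_linear( model )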
# https://github.com/konstmish/prodigy
try:
    from prodigyopt import Prodigy
except Exception as e:
    pass