"""
# an AR + NAR model that handles:
* inferencing the primary RVQ level in an autoregressive manner (AR)
* inferencing the remaining RVQ levels in parallel (NAR)

This model can fully handle being trained as a unified model (AR + NAR) or as separate models (AR | NAR).
It's recommended to train as a unified model, then "distill" knowledge of each task separately, just in case.
"""
from .base import Base, list_to_tensor, Categorical
from ..config import cfg

import torch
from torch.nn.utils.rnn import pad_sequence

import random
import math

from einops import rearrange
from torch import Tensor
from tqdm import trange
from time import perf_counter

import logging

_logger = logging.getLogger(__name__)

from ..emb.qnt import trim, encode_as_embedding
from ..utils import get_devices, setup_logging, timer

from .lora import enable_lora

def clamp(n, lo, hi):
	return max(lo, min(n, hi))

class AR_NAR(Base):
	def forward(
		self,
		text_list: list[Tensor],
		proms_list: list[Tensor],
		resps_list: list[Tensor] | None = None,

		task_list: list[Tensor] | None = None,
		lang_list: list[Tensor] | None = None,
		tone_list: list[Tensor] | None = None,
		len_list: list[Tensor] | None = None,

		training: bool | None = None,

		max_steps: int = 1000,
		max_levels: int = 0,

		input_prompt_prefix: bool = False,

		sampling_temperature: float = 1.0,
		sampling_min_temperature: float = -1.0,
		sampling_top_k: int = -100,
		sampling_top_p: float = 1.0,
		sampling_repetition_penalty: float = 1.0,
		sampling_repetition_penalty_decay: float = 0.0,
		sampling_length_penalty: float = 0.0,
		sampling_beam_width: int = 0,
		sampling_mirostat_tau: float = 0.0,
		sampling_mirostat_eta: float = 0.1,
		sampling_dry_multiplier=0.0,
		sampling_dry_base=1.75,
		sampling_dry_allowed_length=2,

		disable_tqdm=False,
	):
		text_task = [ "stt" ]

		if text_list is not None:
			default_task = "tts"
			device = text_list[0].device
			batch_size = len(text_list)
		else:
			default_task = "stt"
			device = resps_list[0].device
			batch_size = len(resps_list)

		# generate task list if not provided
		if task_list is None:
			task_list = [ default_task for _ in range(batch_size) ]

		has_none = resps_list is None or text_list is None
		if not has_none:
			for i, task in enumerate( task_list ):
				if resps_list[i] is None or text_list[i] is None:
					has_none = True
					break

		# is training or NAR
		if not has_none:
			n_levels_set = { r.shape[-1] for r in resps_list }
			n_levels = next(iter(n_levels_set))

			if training is None:
				training = n_levels == self.n_resp_levels

			# is training
			if training:
				# specifies how to sample probabilities of which RVQ levels to train against
				p_rvq_levels = self.config.experimental.p_rvq_levels if self.config is not None else "equal"
				# determines which RVQ level to target per batch
				quant_level_range = self.config.experimental.rvq_level_range if self.config is not None and self.config.experimental.rvq_level_range else [ 0 if self.causal else 1, self.n_resp_levels - 1 ]
				# rate at which to apply token dropout errors
				token_dropout_error = self.config.experimental.token_dropout_error
				# RVQ levels to apply token dropout on
				token_dropout_rvq_levels = self.config.experimental.token_dropout_rvq_levels
				# implicitly set it to all levels
				if not token_dropout_rvq_levels:
					token_dropout_rvq_levels = [ 0, self.n_resp_levels - 1 ]
				# allow passing a specific distribution of RVQ levels; otherwise derive one from the mode string
				if not isinstance( p_rvq_levels, list ) or not p_rvq_levels:
					lo, hi = quant_level_range[0], quant_level_range[1] + 1
					# randomly select a target RVQ-bin level (0 being AR, 1+ being NAR)
					if p_rvq_levels == "equal":
						p_rvq_levels = [ i for i in range( lo, hi ) ]
					else:
						# weight lower RVQ levels more heavily
						p_rvq_levels = sum([ [ i for _ in range( hi - i ) ] for i in range( lo, hi ) ], [])
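						# e.g. with lo=0 and hi=4 this flattens to [0, 0, 0, 0, 1, 1, 1, 2, 2, 3],
						# so level 0 is drawn four times as often as level 3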
				# input RVQ levels
				quant_levels = [ random.choice( p_rvq_levels ) for i in range(batch_size) ]
				for i, task in enumerate( task_list ):
					if task in text_task:
						quant_levels[i] = 0 # self.n_resp_levels - 1
				# trim resps to only contain all levels below the target level
				resps_list = [ r if t in text_task else r[..., :l + 1] for r, l, t in zip( resps_list, quant_levels, task_list ) ]

				# tensor to cat for RVQ level 0
				text_stop_sequence = torch.tensor([ [ 2 ] * 1 ], device=device, dtype=torch.int16)
				audio_stop_sequence = torch.tensor([ [ self.stop_token ] * 1 ], device=device, dtype=torch.int16)

				# I hate python's value/reference semantics so much
				for i, quant_level, resps, proms, task in zip( range(batch_size), quant_levels, resps_list, proms_list, task_list ):
					# cap quant_level if it exceeds its corresponding resp/prom
					if quant_level >= resps.shape[-1]:
						quant_levels[i] = resps.shape[-1] - 1

					# proms could be a Tensor, list[Tensor], or None
					if isinstance( proms, torch.Tensor ):
						if quant_level >= proms.shape[-1]:
							quant_levels[i] = proms.shape[-1] - 1
					elif isinstance( proms, list ):
						for j, prom in enumerate( proms ):
							if not isinstance( prom, torch.Tensor ):
								continue
							if quant_level >= prom.shape[-1]:
								quant_levels[i] = prom.shape[-1] - 1
					# apply token dropout error compensation
					if token_dropout_error > 0 and (token_dropout_rvq_levels[0] <= quant_level and quant_level <= token_dropout_rvq_levels[1]):
						steps = resps.shape[0]
						for l in range( quant_level ):
							for t in range( steps ):
								token = resps[t, l].item()

								if random.random() < token_dropout_error:
									offset = 1 * ( 1 if random.random() < 0.5 else -1 )
									resps_list[i][t, l] = clamp(token + offset, 1, 1022) # +- 1
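					# NOTE: the dropout above simulates the model mispredicting codes in the lower
					# levels by nudging ground-truth tokens by ±1; the clamp to [1, 1022] presumably
					# keeps the perturbed code inside the 1024-entry codebook while avoiding its edge ids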
					# only apply stop token for RVQ level 0
					if quant_level <= 0:
						# append stop tokens for AR
						if task in text_task:
							#text_list[i] = torch.cat([ resps, text_stop_sequence ])
							...
						else:
							resps_list[i] = torch.cat([ resps, audio_stop_sequence ])

				inputs = self.inputs(
					text_list=text_list,
					proms_list=proms_list,
					resps_list=resps_list,
					lang_list=lang_list,
					tone_list=tone_list,
					task_list=task_list,
					quant_levels=quant_levels,
				)

				return super().forward(
					inputs=inputs,
					quant_levels=quant_levels, # could technically just grab this from the above inputs, since it's included as an RVQ level token
				)

			# is NAR
			if max_levels == 0:
				max_levels = self.n_max_levels - 1

			# expand if given a raw 1D tensor
			for i, resp in enumerate( resps_list ):
				if resp.dim() == 1:
					resps_list[i] = resp.unsqueeze(-1)

			prev_list = resps_list

			for n in trange( max_levels, desc="NAR", disable=disable_tqdm ):
				level = prev_list[0].shape[-1]
				if level >= max_levels + 1: # min(max_levels + 1, self.n_resp_levels): # commented out to experiment with exceeding trained levels
					break

				if cfg.lora is not None:
					enable_lora( self, cfg.lora.active_level( level ) )

				quant_levels = [ level for _ in range(batch_size) ] # torch.full((len(text_list),), level)

				inputs = self.inputs(
					text_list=text_list,
					proms_list=proms_list,
					resps_list=prev_list,
					lang_list=lang_list,
					tone_list=tone_list,
					quant_levels=quant_levels,
				)

				output = super().forward(
					inputs=inputs,
					quant_levels=quant_levels,
				)
				if not isinstance( output, tuple ):
					output = (output, None)
				logits, state = output
				resps_list = super().sample(
					logits=logits,
					prev_list=prev_list,
					quant_levels=quant_levels,

					temperature=sampling_temperature,
					min_temperature=sampling_min_temperature,
					top_p=sampling_top_p,
					top_k=sampling_top_k,
					#repetition_penalty=sampling_repetition_penalty,
					#repetition_penalty_decay=sampling_repetition_penalty_decay,
					#length_penalty=sampling_length_penalty,
					#beam_width=sampling_beam_width,
					#mirostat=mirostat,
				)

				prev_list = [ torch.cat([rs, r.unsqueeze(-1).to(device=device)], dim=-1) for rs, r in zip( prev_list, resps_list ) ]
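				# each NAR pass fills in exactly one more RVQ level: the newly sampled tokens are
				# appended as an extra column, growing prev_list from [t, 1] (the AR output) towards
				# [t, max_levels + 1]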

			if cfg.lora is not None:
				enable_lora( self )

			return prev_list

		# is AR
		if cfg.lora is not None:
			enable_lora( self, cfg.lora.active_level( 0 ) )

		# STT
		start_slice = [ 0 for _ in range(batch_size) ]

		sequence_list = [ torch.zeros(0, device=device).to(torch.int16) for _ in range(batch_size) ]
		stopped = torch.zeros(batch_size, device=device).bool()

		audio_stop_token = self.stop_token
		text_stop_token = 2

		state = None
		mirostat = [
			{ "n": 1024, "tau": sampling_mirostat_tau, "eta": sampling_mirostat_eta, "max_surprise": sampling_mirostat_eta * 2, "error_surprise": 0, "running_total_surprise": 0 }
		] * batch_size if sampling_mirostat_tau > 0.0 else None

		scores = [ 1.0 ] * sampling_beam_width

		for i, sequence in enumerate( sequence_list ):
			# add <bos> to text for STT
			if task_list[i] in text_task:
				start_slice[i] = 1
				sequence_list[i] = torch.cat([ sequence_list[i], torch.tensor([1], dtype=torch.int16, device=device) ])
			# treat input prompt as initial resp (by prefixing with the prompt instead)
			elif input_prompt_prefix:
				start_slice[i] = proms_list[i].shape[0]
				sequence_list[i], proms_list[i] = proms_list[i][:, 0], sequence_list[i]
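				# NOTE: the swap above seeds the AR with the prompt's RVQ level 0 codes as the
				# "response so far" and empties the prompt slot; start_slice later trims this
				# prefix back out of the returned sequence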

		# get next in sequence
		for n in trange( max_steps // max(1, self.causal_size), desc="AR", disable=disable_tqdm ):
			# it would technically be faster to just append the new token's embedding to the inputs,
			# but the performance gain from doing so is VERY small, so it's not worth it
			text_list = [ sequence_list[i] if task in text_task else text_list[i] for i, task in enumerate(task_list) ]
			resps_list = [ sequence_list[i] if task not in text_task else resps_list[i] for i, task in enumerate(task_list) ]

			inputs = self.inputs(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=resps_list,
				lang_list=lang_list,
				tone_list=tone_list,
				len_list=len_list,
				task_list=task_list,
				quant_levels=[ 0 for _ in range( max( batch_size, sampling_beam_width ) ) ]
			)

			# to-do: find an elegant way to write this
			output = super().forward(
				inputs=inputs,
				state=state,
			)
			if not isinstance( output, tuple ):
				output = (output, None)
			logits, state = output

			r = super().sample(
				logits=logits,
				prev_list=None if sampling_repetition_penalty == 1.0 and sampling_length_penalty == 0.0 else [ resps_list[i] if task not in text_task else text_list[i] for i, task in enumerate(task_list) ],

				temperature=sampling_temperature,
				min_temperature=sampling_min_temperature,
				top_p=sampling_top_p,
				top_k=sampling_top_k,
				repetition_penalty=sampling_repetition_penalty,
				repetition_penalty_decay=sampling_repetition_penalty_decay,
				length_penalty=sampling_length_penalty,
				beam_width=sampling_beam_width,

				mirostat=mirostat,

				dry_multiplier=sampling_dry_multiplier,
				dry_base=sampling_dry_base,
				dry_allowed_length=sampling_dry_allowed_length,
			)

			if mirostat is not None:
				# r is the state
				mirostat = r
				# extract token from state
				r = [ state["token"] for state in mirostat ]
			# we do it here because the sampler will already expand our logits list
			elif sampling_beam_width > 0:
				# expand tuple
				r, s = r
				# first step, expand batch
				if batch_size == 1:
					batch_size = sampling_beam_width
					text_list = text_list * sampling_beam_width
					proms_list = proms_list * sampling_beam_width
					sequence_list = sequence_list * sampling_beam_width
					stopped = torch.zeros(batch_size, device=device).bool()

				scores = [ scores[i] + score for i, score in enumerate(s) ]
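				# the sampler returns an incremental score per beam candidate; summing keeps a
				# running cumulative score per candidate for the pick after the loop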

			# append tokens
			for i, ri in enumerate( r ):
				task = task_list[i]
				stop_token = audio_stop_token if task not in text_task else text_stop_token

				if stop_token in ri:
					stopped[i] = True

				sequence_list[i] = torch.cat([ sequence_list[i], ri.to(device) ])

			# stop token found
			# stopped |= r == stop_token
			if stopped.all().item():
				break

		# pick the best scoring candidate
		# desu this is always going to be candidate 0
		if sampling_beam_width:
			sequence_list = [ sequence_list[0] ]

		# remove stop token
		sequence_list = [ self._prune( r, audio_stop_token if task_list[i] not in text_task else text_stop_token ) for i, r in enumerate( sequence_list ) ]
		# remove <bos> (or the input prompt prefix)
		sequence_list = [ sequence_list[i][start_slice[i]:] for i, task in enumerate( task_list ) ]

		return sequence_list

def example_usage():
	cfg.trainer.backend = "local"
	cfg.hyperparameters.gradient_accumulation_steps = 1
	if cfg.audio_backend == "dac":
		cfg.sample_rate = 44_100

	from functools import partial
	from einops import repeat
	from tqdm import tqdm

	from ..emb.qnt import decode_to_file, unload_model, trim_random, repeat_extend_audio, concat_audio, merge_audio
	from ..engines import Engine, Engines
	from ..utils import wrapper as ml
	from ..utils import setup_logging

	import numpy as np
	import re

	setup_logging()

	device = "cuda"

	# mamba seems to ONLY be usable as an AR (any NAR attempt lobotomizes it)
	"""
	if "mamba" in cfg.model.arch_type:
		cfg.model.resp_levels = 1
	"""

	# cfg.model.loss_factors = {}

	def tokenize( content ):
		return torch.tensor( cfg.tokenizer.encode( content ) )

	def _load_quants( path ) -> Tensor:
		qnt = np.load( path, allow_pickle=True )[()]
		return torch.from_numpy( qnt["codes"].astype(np.int16) )[0, :cfg.model.resp_levels, :].t().to(torch.int16)
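
	# NOTE: quantized codes are stored as [batch, level, frame]; _load_quants above takes the
	# first (only) batch, trims to the configured number of RVQ levels, and transposes to the
	# [frame, level] layout the model consumes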
	qnt = _load_quants( f"./data/qnt.{'dac' if cfg.audio_backend == 'dac' else 'enc'}" )
	noise = _load_quants( f"./data/noise.{'dac' if cfg.audio_backend == 'dac' else 'enc'}" )

	text_list = [
		tokenize("ˈaɪ wɪl nˌɑːt ˈæsk ɐ sˈɛkənd tˈaɪm").to(device),
		#tokenize("ˈaɪ wɪl nˌɑːt ˈæsk").to(device),
	]
	proms_list = [
		qnt[:cfg.dataset.frames_per_second, :].to(device),
		#qnt[:cfg.dataset.frames_per_second, :].to(device),
	]
	resps_list = [
		qnt[:, :].to(device),
		#qnt[:cfg.dataset.frames_per_second, :].to(device),
	]

	text_list = text_list[:1]
	proms_list = proms_list[:1]
	resps_list = resps_list[:1]

	batch_size = len(text_list)

	# retnet-full is the only configuration with BitNet's BitLinear that converges, despite the grad_norm saying otherwise
	kwargs = {
		'n_text_tokens': 256,
		'n_audio_tokens': 1024,

		'd_model': 1024, # 256, # 1024, # 1536
		'n_heads': 16, # 4, # 16, # 24
		'n_layers': 12, # 32
		'n_experts': 1 if not cfg.model else cfg.model.experts,

		'p_dropout': 0.1,

		'l_padding': 8 if cfg.optimizations.fp8 else 0,

		'config': cfg.model
	}

	"""
	try:
		kwargs['config'] = cfg.model
	except Exception as e:
		pass
	"""

	bos_id, space_id, eos_id = cfg.tokenizer.encode( " " )
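	# NOTE: this unpack assumes the tokenizer wraps a lone space as [<bos>, <space>, <eos>],
	# i.e. encoding " " yields exactly three token ids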
	#available_tasks = cfg.dataset.tasks_list
	available_tasks = [ "tts", "stt" ]

	model = AR_NAR(**kwargs).to(device)
	steps = 150 * len(available_tasks) # * cfg.model.experimental.causal_size

	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.yaml_path is not None else "prodigy"
	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.yaml_path is not None else ""
	learning_rate = cfg.hyperparameters.learning_rate if cfg.yaml_path is not None else None

	if cfg.optimizations.dadaptation:
		# do not combine the two
		if scheduler == "schedulefree":
			scheduler = ""

		learning_rate = 1.0

	if optimizer == "prodigy":
		if learning_rate is None:
			learning_rate = 1.0

		optimizer = ml.Prodigy
	elif optimizer == "adagrad":
		if learning_rate is None:
			learning_rate = 1.0e-2

		optimizer = ml.Adagrad
	elif optimizer == "adamw":
		if learning_rate is None:
			learning_rate = 1.0e-4

		optimizer = ml.AdamW
	elif optimizer == "sgd":
		if learning_rate is None:
			learning_rate = 1.0e-4

		optimizer = ml.SGD
	else:
		raise ValueError(f"Unrecognized optimizer: {optimizer}")

	_logger.info(f"Optimizer: {optimizer}\tLearning rate: {learning_rate}")

	optimizer = optimizer(model.parameters(), lr=learning_rate)

	if scheduler == "schedulefree":
		if isinstance(optimizer, ml.AdamW):
			scheduler = ml.schedulefree.AdamWScheduleFree
		elif isinstance(optimizer, ml.SGD):
			scheduler = ml.schedulefree.SGDScheduleFree
		else:
			scheduler = None

		if scheduler is not None:
			_logger.info(f"Scheduler: {scheduler}")
			optimizer = scheduler(model.parameters(), lr=learning_rate)

	if cfg.optimizations.replace and cfg.optimizations.linear:
		model = ml.replace_linear( model )

	if cfg.optimizations.replace and cfg.optimizations.embedding:
		model = ml.replace_embedding( model )

	"""
	cfg.optimizations.model_offloading = {
		"devices": ["cuda:0", "cpu"],
		# "limits": [ 0.9, -1 ],
		"assign": [[ f'layers.{i}.' for i in range(0, 10) ], [ f'layers.{i}.' for i in range(11, 12) ] + [ "model.norm" ]],
		# "limits": [ 256 * (1024 ** 2), -1 ]
	}
	"""

	engine = Engine(model=model, optimizer=optimizer)
	engines = Engines({"ar+nar": engine})
	engines.setup()

	"""
	if cfg.optimizations.model_offloading:
		model = ml.offload_model( model, policy=cfg.optimizations.model_offloading )
	"""

	"""
	torch.save( {
		'module': model.state_dict()
	}, f"./data/{cfg.model.arch_type}.pth" )
	"""

	_logger.info(f"AR+NAR ({cfg.model.arch_type}, {cfg.audio_backend}) parameter count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

	@torch.no_grad()
	def sample_data( t=None ):
		if isinstance( t, list ):
			tasks = t
			texts = [ text_list[0].to(device) if task != "stt" else None for i, task in enumerate( tasks ) ]
			proms = [ proms_list[0].to(device) if task != "stt" else [ "stt" ] for i, task in enumerate( tasks ) ]
			resps = [ None if task != "stt" else resps_list[0].to(device) for i, task in enumerate( tasks ) ]

			return texts, proms, resps, tasks

		texts = []
		proms = []
		resps = []
		tasks = []

		for i in range(batch_size):
			task = random.choice( available_tasks ) if t is None else t

			text = text_list[i].to(device)
			prom = proms_list[i].to(device)
			resp = resps_list[i].to(device)

			# do nothing
			if task == "tts":
				...
			elif task == "stt":
				prom = [
					task
				]
			# to-do: reimplement this from data.py
			"""
			elif task == "tts-c":
				trim_length = int(random.uniform(cfg.dataset.prompt_duration_range[0], cfg.dataset.prompt_duration_range[1]) * cfg.dataset.frames_per_second)

				prom = resp[:trim_length]
				resp = resp[trim_length:]

				prom = prom.to(device)
			elif task == "ns" or task == "sr":
				# extend the noise to fill the target audio
				noise_ext = repeat_extend_audio( noise, resp.shape[0] )
				# create the input prompt by merging the target audio with the noise
				prom = merge_audio( resp.cpu(), noise_ext, scale=[1, cfg.dataset.noise_scale], device=cfg.dataset.reencode_device )

				prom = prom.to(device)

				# set the target to just be the noise if <sr>
				if task == "sr":
					resp = noise_ext

				# set the text prompt to empty to train without a guided text prompt
				if random.random() < 0.5:
					text = torch.tensor([bos_id, eos_id], device=device, dtype=torch.uint8)

				prom = [
					task,
					prom,
				]
			"""

			texts.append( text )
			proms.append( prom )
			resps.append( resp )
			tasks.append( task )

		return texts, proms, resps, tasks

	@torch.inference_mode()
	def sample( name, steps=500, task=None ):
		engine.eval()

		texts, proms, resps, tasks = sample_data( task )

		if "ar" in cfg.model.capabilities:
			output = engine( texts, proms, resps, task_list=tasks, max_steps=steps, sampling_temperature=0.95 )

			text = [ cfg.tokenizer.decode( output[i] ) for i, task in enumerate( tasks ) if task == "stt" ]
			texts = [ texts[i] for i, task in enumerate( tasks ) if task != "stt" ]
			proms = [ proms[i] for i, task in enumerate( tasks ) if task != "stt" ]
			resps = [ output[i] for i, task in enumerate( tasks ) if task != "stt" ]
			tasks = [ tasks[i] for i, task in enumerate( tasks ) if task != "stt" ]

			print( "STT:", text )
		else:
			resps = [ resp[:, 0] for resp in resps ]

		if "nar" in cfg.model.capabilities:
			resps = engine( texts, proms, resps, task_list=tasks, sampling_temperature=0.2 )

		for i, o in enumerate( resps ):
			_ = decode_to_file( o.to(dtype=torch.int32), f"data/{cfg.model.arch_type}.{cfg.audio_backend}.{i}.{task}.{name}.wav", device=device )

		unload_model()

	def train():
		engine.train()
		t = trange(steps)
		for i in t:
			texts, proms, resps, tasks = sample_data()

			stats = {"step": i}
			stats |= engine.traverse(text_list=texts, proms_list=proms, resps_list=resps, task_list=tasks)
			stats |= {"grad_norm": engine.get_global_grad_norm()}

			tqdm.write(f"{stats}")

		"""
		torch.save( {
			'module': model.state_dict()
		}, f"./data/{cfg.model.arch_type}.pth" )
		"""

	#sample("init", 5)

	train()

	"""
	if cfg.optimizations.compile:
		model = ml.compile_model(model, backend=cfg.optimizations.compile)
	"""

	"""
	for task in available_tasks:
		sample("final", task=task)
	"""
	sample("final", task=available_tasks)

	engines.quit()

if __name__ == "__main__":
	example_usage()