vall-e/vall_e/models/ar_nar.py

"""
# an AR + NAR model that handles:
* inferencing the primary RVQ level in an autoregressive manner (AR)
* inferencing the remaining RVQ levels in parallel (NAR)

This model can fully handle being trained as a unified model (AR + NAR) or separate models (AR | NAR).
It's recommended to train as a unified model, then "distill" knowledge of each tasks separately, just in case.
"""
from .base import Base, list_to_tensor, Categorical
from ..config import cfg

import torch
from torch.nn.utils.rnn import pad_sequence

import random
import math
import time
from einops import rearrange
from torch import Tensor
from tqdm import trange, tqdm

import logging

_logger = logging.getLogger(__name__)

from ..emb.qnt import trim, encode_as_embedding, get_silence
from ..utils import get_devices, setup_logging, timer, clamp, convert_kwargs

from .lora import enable_lora
from ..samplers import cfg_logits

text_task = [ "stt" ]

class AR_NAR(Base):
	# parse inputs for training
	# a lot of this could be delegated back to the dataloader, but it's just easier to keep the task of the dataloader to provide sufficient data, and the model to process the data for training
	def forward_train(
		self,
		text_list: list[Tensor],
		proms_list: list[Tensor],
		resps_list: list[Tensor],

		task_list: list[Tensor] | None = None,
		lang_list: list[Tensor] | None = None,
		tone_list: list[Tensor] | None = None,
		len_list: list[Tensor] | None = None,
	):
		# deduce batch_size
		if text_list is not None:
			default_task = "tts"
			device = text_list[0].device
			batch_size = len(text_list)
		else:
			default_task = "stt"
			device = resps_list[0].device
			batch_size = len(resps_list)

		# specifies how to sample probabilities of which RVQ levels to train against
		rvq_levels_p = self.config.experimental.rvq_levels_p if self.config is not None else "equal"
		# determines which RVQ level to target per batch
		quant_level_range = self.config.experimental.rvq_level_range if self.config is not None and self.config.experimental.rvq_level_range else [ 0 if self.causal else 1, self.n_resp_levels - 1 ]
		# rate to perform token dropout errors
		token_dropout_error = self.config.experimental.token_dropout_error
		# RVQ levels to apply token dropout on
		token_dropout_rvq_levels = self.config.experimental.token_dropout_rvq_levels
		# RVQ levels to apply masking training on
		masking_train_rvq_levels = self.config.experimental.masking_train_rvq_levels
		# CFG
		cfg_text_dropout_p = self.config.experimental.cfg_text_dropout_p if self.config is not None else 0.0
		cfg_cond_dropout_p = self.config.experimental.cfg_cond_dropout_p if self.config is not None else 0.0
		cfg_prom_dropout_p = self.config.experimental.cfg_prom_dropout_p if self.config is not None else 0.0
		# rate to train RVQ level AR-ly or NAR-ly
		masking_train_p = self.config.experimental.masking_train_p if self.config is not None else 0.5
		masking_ratio = self.config.experimental.masking_ratio if self.config is not None else "random"
		# force set mask training
		if "len" not in self.capabilities:
			masking_train_p = 0.0
		elif "ar" not in self.capabilities:
			masking_train_p = 1.0
		# implicitly set it to all levels
		if not token_dropout_rvq_levels:
			token_dropout_rvq_levels = [0, self.resp_levels - 1]
		if not token_dropout_rvq_levels:
			token_dropout_rvq_levels = [0, 0]

		# allow passing a specific distribution of RVQ levels
		rvq_levels_p = rvq_levels_p if isinstance(rvq_levels_p, list) else []
		if not rvq_levels_p:
			lo, hi = quant_level_range[0], quant_level_range[1] + 1
			# randomly select a target RVQ-bin level (0 being AR, 1+ being NAR)
			if rvq_levels_p == "equal":
				rvq_levels_p = [ i for i in range( lo, hi ) ]
			else:
				# yuck
				rvq_levels_p = sum([[i for _ in range(hi - i)] for i in range( lo, hi ) ], [])

		# input RVQ levels
		quant_levels = [ random.choice( rvq_levels_p ) for i in range(batch_size) ]
		# timestep levels (for TTS NAR)
		timesteps = [ None for _ in range(batch_size) ]

		for i, task in enumerate( task_list ):
			lo, hi = masking_train_rvq_levels[0], masking_train_rvq_levels[1]
			if task in text_task:
				quant_levels[i] = 0 # self.n_resp_levels - 1
			elif lo <= quant_levels[i] and quant_levels[i] <= hi and random.random() < masking_train_p:
				# to-do: prioritize lower timesteps over later timesteps
				# ...except that the masking rate is still tied to the cosine scheduling, which does this already
				#r = random.random()
				#p = math.acos(r) / (math.pi * 0.5)
				#timesteps[i] = 1.0 - clamp(p, 0.0, 1.0)
				timesteps[i] = random.random()

				# instead make it between [0.2, 0.8]
				if masking_ratio == "rand":
					timesteps[i] = (timesteps[i] * 0.6) + 0.2

		# trim resps to only contain all levels below the target level
		resps_list = [r if t in text_task else r[..., :l+1] for r, l, t in zip(resps_list, quant_levels, task_list)]

		# tensor to cat for RVQ level 0
		text_stop_sequence = torch.tensor([2], device=device, dtype=torch.int16)
		text_start_stop_sequence = torch.tensor([1, 2], device=device, dtype=torch.int16)
		audio_stop_sequence = torch.tensor([[self.stop_token]], device=device, dtype=torch.int16)

		# final validations and stuff
		for i, quant_level, resps, proms, task in zip(range(batch_size), quant_levels, resps_list, proms_list, task_list):
			# cap quant_level if it exceeds its corresponding resp/prom
			# this was needed for when my DAC-encoded audio was erroneously trimmed to 8 RVQ levels instead of 9
			if quant_level >= resps.shape[-1]:
				quant_levels[i] = resps.shape[-1] - 1

			# proms could be a Tensor, list[Tensor], or None
			if isinstance( proms, torch.Tensor ):
				if quant_level >= proms.shape[-1]:
					quant_levels[i] = proms.shape[-1] - 1

			elif isinstance( proms, list ):
				for j, prom in enumerate( proms ):
					if not isinstance( prom, torch.Tensor ):
						continue
					if quant_level >= prom.shape[-1]:
						quant_levels[i] = prom.shape[-1] - 1

			# apply token dropout error compensation
			if token_dropout_error > 0 and (token_dropout_rvq_levels[0] <= quant_level and quant_level <= token_dropout_rvq_levels[1]):
				steps = resps.shape[0]
				for l in range( quant_level ):
					for t in range( steps ):
						token = resps[t, l].item()

						if random.random() < token_dropout_error:
							offset = 1 * ( 1 if random.random() < 0.5  else -1 )
							resps_list[i][t, l] = clamp(token + offset, 1, 1022) # +- 1

			# only apply stop token for RVQ level 0
			if quant_level <= 0 and timesteps[i] is None:
				# append stop tokens for AR
				if task in text_task:
					#text_list[i] = torch.cat([ resps, text_stop_sequence ])
					...
				else:
					resps_list[i] = torch.cat([ resps, audio_stop_sequence ])

			if task == "len":
				quant_levels[i] = 0

			# apply CFG (should probably only apply to NAR quant level 0)
			if task not in text_task + ["len"]:
				drop_text = False
				drop_audio = False

				if random.random() < cfg_prom_dropout_p:
					drop_audio = True

				if random.random() < cfg_cond_dropout_p:
					drop_audio = True
					drop_text = True

				if drop_text:
					text_list[i] = text_start_stop_sequence

				if drop_audio:
					proms_list[i] = None

		inputs = self.inputs(
			text_list=text_list,
			proms_list=proms_list,
			resps_list=resps_list,
			lang_list=lang_list,
			tone_list=tone_list,
			task_list=task_list,
			time_list=timesteps,

			quant_levels=quant_levels,
		)

		return super().forward(
			inputs=inputs,
			quant_levels=quant_levels,
		)

	def forward_nar_masked(
		self,

		text_list: list[Tensor],
		proms_list: list[Tensor],
		resps_list: list[Tensor] | None = None,

		task_list: list[Tensor] | None = None,
		lang_list: list[Tensor] | None = None,
		tone_list: list[Tensor] | None = None,
		len_list: list[Tensor] | None = None,

		disable_tqdm=False,
		use_lora=None,
		**sampling_kwargs,
	):
		device = text_list[0].device
		batch_size = len(text_list)

		# special "scheduling" to inference RVQ-level 0
		level = 0
		if cfg.lora is not None:
			enable_lora( self, cfg.lora.active_level( level ) if use_lora is None else use_lora )

		# to-do: check if gumbel sampling works / helps
		"""
		def log(x, eps = 1e-20):
			return torch.log(x.clamp(min = eps))

		def gumbel_sample(x, temperature = 1., dim = -1):
			return ((x / max(temperature, 1e-10)) + -log(-log(torch.zeros_like(x).uniform_(0, 1)))).argmax(dim = dim)
		"""

		def log(t, eps=1e-10):
			return torch.log(t + eps)


		def gumbel_noise(t):
			noise = torch.zeros_like(t).uniform_(0, 1)
			return -log(-log(noise))


		def gumbel_sample(t, temperature=1.0, dim=-1):
			return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)

		# convert (N)AR specific args
		sampling_kwargs = convert_kwargs( sampling_kwargs, "ar_" )

		min_length = sampling_kwargs.pop("min_duration", 1)
		max_length = sampling_kwargs.pop("max_duration", 500)
		max_steps = sampling_kwargs.get("max_steps", 25)
		refine_on_stop = sampling_kwargs.get("refine_on_stop", False)
		entropix_sampling = sampling_kwargs.get("entropix_sampling", False)

		# greedy sampling is very, very much preferred, but using greedy logit scores later helps enough
		temperature = sampling_kwargs.pop("temperature", 0.0)
		# this really helps keep audio coherent so far
		cfg_strength = sampling_kwargs.get("cfg_strength", 2.0)
		cfg_rescale = sampling_kwargs.pop("cfg_rescale", 0.75)
		start_noise = sampling_kwargs.get("denoise_start", 0.0)
		end_noise = sampling_kwargs.get("denoise_end", 1.0)
		max_steps = math.floor(max_steps * (end_noise - start_noise))

		len_list = [ clamp(l, min_length, max_length) for l in len_list ]

		# force set CFG because too low / no CFG causes issues
		cfg_strength = max( cfg_strength, 3.0 )

		# if we're denoising from an existing sequence
		if start_noise > 0.0 and resps_list is not None:
			# flatten if needed
			resps_list = [ resps if resps.dim() == 1 else resps[:, 0] for resps in resps_list ]
			# gen masking ratio
			noise_p = math.cos( start_noise * math.pi * 0.5 )
			# generate scoring mask (because the above mask will get masked off per the scores, so we do not need to mask beforehand)
			scores = [ torch.tensor( [ 1.0 if random.random() < noise_p else 0.0 for _ in range( seq_len ) ], dtype=torch.float32, device=device ) for seq_len in len_list ]
		# deduce that this is a prefix
		elif resps_list is not None:
			# number of remaining tokens
			tokens_to_mask = [ l - resps.shape[0] for resps, l in zip( resps_list, len_list ) ]
			# pad with masked tokens
			resps_list = [ torch.concat([ resps if resps.dim() == 1 else resps[:, 0], torch.tensor( [ self.stop_token ] * l, dtype=resps.dtype, device=resps.device ) ]) for resps, l in zip( resps_list, tokens_to_mask ) ]
			# update scores to ignore the prefix
			scores = [ torch.concat( [ torch.zeros((resps.shape[0],), dtype=torch.int16, device=device), torch.ones((l), dtype=torch.int16, device=device) ] ) for resps, l in zip( resps_list, tokens_to_mask ) ]
			# set start noise
			# only the first because we do not have variable noising at the moment
			# *technically* the prefix can be a fixed portion for all inputs in a batch, rather than a fixed length
			# this will set the starting noise_p with the right ratio
			start_noise = 2 / math.pi * math.acos(resps_list[0].shape[0] / len_list[0])
		else:
			# fill with masked tokens (even though they get masked anyways)
			resps_list = [ torch.ones((seq_len,), dtype=torch.int16, device=device) * self.stop_token for seq_len in len_list ]
			# fill scores
			scores = [ torch.ones((seq_len,), dtype=torch.float32, device=device) for seq_len in len_list ]

		quant_levels = [ level for _ in range(batch_size) ]
		null_text = [ torch.tensor([1, 2], device=device, dtype=torch.int16) for _ in range(batch_size) ]
		null_prom = [ None for _ in range(batch_size) ]

		for timestep in tqdm(torch.linspace(start_noise, end_noise, max_steps), desc="NAR Masked", disable=disable_tqdm):
			# update previous list of tokens
			prev_list = resps_list
			# ramp down over time
			annealing = 1.0 - timestep
			# get noise level, per cosine scheduling
			noise_p = math.cos( timestep * math.pi * 0.5 )
			# pick the worst scoring tokens to mask off
			masked_indices = [ score.topk( max(int( noise_p * seq_len ), 1), dim=-1 ).indices for score, seq_len in zip(scores, len_list) ]
			# mask off inputs
			resps_list = [ resp.scatter(0, indices, self.stop_token) for resp, indices in zip( resps_list, masked_indices ) ]
			# boolean mask
			is_masked = [ resps == self.stop_token for resps in resps_list ]
			# timestep inputs
			time_list = [ timestep for _ in range(batch_size) ]

			sampling_temperature = temperature * annealing
			sampling_cfg = cfg_strength * timestep

			# setup inputs
			inputs = super().inputs(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=resps_list,
				lang_list=lang_list,
				tone_list=tone_list,
				time_list=time_list,
				quant_levels=quant_levels,
			)
			output = super().forward(
				inputs=inputs,
				quant_levels=quant_levels,
				#layer_skip_variables=sampling_layer_skip_variables,
			)

			logits = output.logits

			if cfg_strength > 0:
				null_inputs = super().inputs(
					text_list=null_text,
					proms_list=null_prom,
					resps_list=resps_list,
					lang_list=lang_list,
					tone_list=tone_list,
					time_list=time_list,
					quant_levels=quant_levels,
				)
				null_output = super().forward(
					inputs=null_inputs,
					quant_levels=quant_levels,
					#layer_skip_variables=sampling_layer_skip_variables,
				)

				logits = cfg_logits( logits=output.logits, null=null_output.logits, strength=cfg_strength, rescale=cfg_rescale, lens=[ l for l in len_list ] )

			# sample with sampler settings
			filtered_sampled = super().sample(
				logits=logits,
				prev_list=prev_list,
				quant_levels=quant_levels,

				temperature=sampling_temperature,
				**sampling_kwargs,
			)

			# retrieves unfiltered logits
			unfiltered_sampled = super().sample(
				logits=logits,
				prev_list=prev_list,
				quant_levels=quant_levels,

				temperature=0.0,
				**sampling_kwargs,
			)
			# get sampled tokens
			sampled_ids = filtered_sampled.ids
			# keep unmasked tokens
			resps_list = [ torch.where( masked, input_ids, resps ) for masked, input_ids, resps in zip( is_masked, sampled_ids, resps_list ) ]
			# get probability scores
			scores = [
				# conjugate to have worse scoring tokens picked for topk
				1.0 -
					# only keep scores of tokens we are predicting (and ignore the tokens previously finalized)
					torch.where( masked, torch.tensor([score for index, score in enumerate(scores)], device=device), torch.ones(masked.shape, device=device) )
				# use unmodified logit scores for this, as it offers better stability
				for scores, masked in zip( unfiltered_sampled.scores, is_masked )
			]

		return resps_list

	def forward_nar(
		self,
		text_list: list[Tensor],
		proms_list: list[Tensor],
		resps_list: list[Tensor] | None = None,

		task_list: list[Tensor] | None = None,
		lang_list: list[Tensor] | None = None,
		tone_list: list[Tensor] | None = None,
		len_list: list[Tensor] | None = None,

		disable_tqdm=False,
		use_lora=None,
		**sampling_kwargs,
	):
		# deduce batch_size
		if text_list is not None:
			default_task = "tts"
			device = text_list[0].device
			batch_size = len(text_list)
		else:
			default_task = "stt"
			device = resps_list[0].device
			batch_size = len(resps_list)

		# convert NAR specific args
		sampling_kwargs = convert_kwargs( sampling_kwargs, "nar_" )

		max_levels = sampling_kwargs.get("max_levels", 0)
		cfg_strength = sampling_kwargs.get("cfg_strength", 0.0)
		cfg_rescale = sampling_kwargs.pop("cfg_rescale", 0.7)

		if max_levels == 0:
			max_levels = self.n_max_levels - 1

		"""
		sampling_layer_skip_variables = {} if sampling_layer_skip else None

		if sampling_layer_skip:
			if sampling_layer_skip_entropy_threshold >= 0:
				sampling_layer_skip_variables["entropy_threshold"] = sampling_layer_skip_entropy_threshold
			if sampling_layer_skip_varentropy_threshold >= 0:
				sampling_layer_skip_variables["varentropy_threshold"] = sampling_layer_skip_varentropy_threshold
			if sampling_layer_skip_exit_layer >= 0:
				sampling_layer_skip_variables["max_layer"] = sampling_layer_skip_exit_layer
		"""

		# inference NAR level 0
		if len_list is not None:
			resps_list = self.forward_nar_masked(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=resps_list,
				task_list=task_list,
				lang_list=lang_list,
				tone_list=tone_list,
				len_list=len_list,
				**sampling_kwargs,
			)

		# expand if given a raw 1D tensor
		for i, resp in enumerate(resps_list):
			if resp.dim() == 1:
				resps_list[i] = resp.unsqueeze(-1)

		prev_list = resps_list

		null_text = [ torch.tensor([1, 2], device=device, dtype=torch.int16) for _ in range(batch_size) ]
		null_prom = [ None for _ in range(batch_size) ]

		for n in trange( max_levels, desc="NAR", disable=disable_tqdm ):
			level = prev_list[0].shape[-1]
			if level >= max_levels + 1: # min(max_levels + 1, self.n_resp_levels): # commented out to experiment with exceeding trained levels
				break

			if cfg.lora is not None:
				enable_lora( self, cfg.lora.active_level( level ) if use_lora is None else use_lora )

			quant_levels = [ level for _ in range(batch_size) ] # torch.full((len(text_list),), level)

			inputs = self.inputs(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=prev_list,
				lang_list=lang_list,
				tone_list=tone_list,
				quant_levels=quant_levels,
			)

			output = super().forward(
				inputs=inputs,
				quant_levels=quant_levels,
				#layer_skip_variables=sampling_layer_skip_variables,
			)
			logits, state = output.logits, output.state

			if cfg_strength > 0:
				null_inputs = super().inputs(
					text_list=null_text,
					proms_list=null_prom,
					resps_list=prev_list,
					lang_list=lang_list,
					tone_list=tone_list,
					quant_levels=quant_levels,
				)
				null_output = super().forward(
					inputs=null_inputs,
					quant_levels=quant_levels,
					#layer_skip_variables=sampling_layer_skip_variables,
				)

				logits = cfg_logits( logits=output.logits, null=null_output.logits, strength=cfg_strength, rescale=cfg_rescale, lens=[ resp.shape[0] for resp in resps_list ] )

			sampled = super().sample(
				logits=logits,
				prev_list=prev_list,
				quant_levels=quant_levels,
				#temperature=0.0,
				**(sampling_kwargs | {"temperature": 0.0}),
			)

			resps_list = sampled.ids
			prev_list = [ torch.cat([rs, r.unsqueeze(-1).to(device=device)], dim=-1) for rs, r in zip(prev_list, resps_list) ]

		return prev_list

	def forward_ar(
		self,

		text_list: list[Tensor],
		proms_list: list[Tensor],
		resps_list: list[Tensor] | None = None,

		task_list: list[Tensor] | None = None,
		lang_list: list[Tensor] | None = None,
		tone_list: list[Tensor] | None = None,
		len_list: list[Tensor] | None = None,
		disable_tqdm=False,
		use_lora=None,
		**sampling_kwargs,
	):
		# deduce batch_size
		if text_list is not None:
			default_task = "tts"
			device = text_list[0].device
			batch_size = len(text_list)
		else:
			default_task = "stt"
			device = resps_list[0].device
			batch_size = len(resps_list)

		if cfg.lora is not None:
			enable_lora( self, cfg.lora.active_level( 0 ) if use_lora is None else use_lora )

		# convert AR specific args
		sampling_kwargs = convert_kwargs( sampling_kwargs, "ar_" )

		temperature = sampling_kwargs.get("temperature", 1.0)
		cfg_strength = sampling_kwargs.get("cfg_strength", 0.0)
		cfg_rescale = sampling_kwargs.pop("cfg_rescale", 0.7)
		min_temperature = sampling_kwargs.get("min_temperature", -1.0)
		max_duration = sampling_kwargs.get("max_duration", 500)
		beam_width = sampling_kwargs.get("beam_width", 0)
		entropix_sampling = sampling_kwargs.get("entropix_sampling", False)
		refine_on_stop = sampling_kwargs.get("refine_on_stop", False)
		input_prompt_prefix = sampling_kwargs.get("input_prompt_prefix", False)
		layer_skip = sampling_kwargs.get("layer_skip", False)
		prefix_silence = sampling_kwargs.get("prefix_silence", 0.0)
		mirostat_tau = sampling_kwargs.get("mirostat_tau", 0.0)
		mirostat_eta = sampling_kwargs.get("mirostat_eta", 0.0)

		# inference len
		if task_list is not None and task_list[0] == "len":
			sequence_list = [ torch.tensor([0], device=device,dtype=torch.int16) for _ in range(batch_size) ]
			stopped = torch.zeros(batch_size, device=device).bool()

			stop_token = 10
			task_list = [ "len" for _ in range(batch_size) ]
			quant_levels = [ 0 for _ in range( max( batch_size, beam_width ) ) ]

			for n in trange(10, desc="AR", disable=disable_tqdm):
				len_list = sequence_list

				inputs = self.inputs(
					text_list=text_list,
					proms_list=proms_list,
					resps_list=resps_list,
					lang_list=lang_list,
					tone_list=tone_list,
					len_list=len_list,
					task_list=task_list,
					quant_levels=quant_levels,
				)

				output = super().forward(
					inputs=inputs,
					quant_levels=quant_levels,
				)
				logits = output.logits

				r = [ logit[-1:].argmax(dim=1) for logit in logits ]
				# sanitize
				for i, token in enumerate(r):
					if token > 10:
						r[i][0] = stop_token

				# append tokens
				for i, ri in enumerate(r):
					if stop_token in ri:
						stopped[i] = True
					sequence_list[i] = torch.cat([sequence_list[i], ri.to(device)])

				# stop token found
				stopped |= r == stop_token
				if stopped.all().item():
					break

			# convert tokens into int
			return [ int("".join([ str(token.item()) for token in r if token != stop_token ])) for r in sequence_list ]

		# STT
		start_slice = [ 0 for _ in range(batch_size) ]
		sequence_list = [ torch.zeros(0, device=device).to(torch.int16) for _ in range(batch_size) ]
		stopped = torch.zeros(batch_size, device=device).bool()

		audio_stop_token = self.stop_token
		text_stop_token = 2

		state = None
		mirostat = [
			{"n": 1024, "tau": mirostat_tau, "eta": mirostat_eta, "max_surprise": mirostat_eta * 2, "error_surprise": 0, "running_total_surprise": 0}
		] * batch_size if mirostat_tau > 0.0 else None

		scores = [ 1.0 ] * beam_width
		metrics = []

		"""
		sampling_layer_skip_variables = {} if sampling_layer_skip else None

		if sampling_layer_skip:
			if sampling_layer_skip_entropy_threshold >= 0:
				sampling_layer_skip_variables["entropy_threshold"] = sampling_layer_skip_entropy_threshold
			if sampling_layer_skip_varentropy_threshold >= 0:
				sampling_layer_skip_variables["varentropy_threshold"] = sampling_layer_skip_varentropy_threshold
			if sampling_layer_skip_exit_layer >= 0:
				sampling_layer_skip_variables["max_layer"] = sampling_layer_skip_exit_layer
		"""

		for i, sequence in enumerate( sequence_list ):
			# add <bos> to text for STT
			if task_list[i] in text_task:
				start_slice[i] = 1
				sequence_list[i] = torch.cat([sequence_list[i], torch.tensor([1], dtype=torch.int16, device=device)])
			# treat input prompt as initial resp (by prefixing with the prompt instead)
			elif input_prompt_prefix:
				start_slice[i] = proms_list[i].shape[0]
				sequence_list[i], proms_list[i] = proms_list[i][:, 0], sequence_list[i]
			elif prefix_silence > 0:
				sequence_list[i] = get_silence(prefix_silence, device=sequence_list[i].device)
				sequence_list[i] = sequence_list[i][:, 0]
				# start_slice[i] = sequence_list[i].shape[0]

		null_text = [ torch.tensor([1, 2], device=device, dtype=torch.int16) for _ in range(batch_size) ]
		null_prom = [ None for _ in range(batch_size) ]

		# get next in sequence
		for n in trange(max_duration // max(1, self.causal_size), desc="AR", disable=disable_tqdm):
			# it would technically be faster to just append the new token's embedding to the inputs, but there's a VERY small performance gain from doing it, so it's not worth it
			text_list = [ sequence_list[i] if task in text_task else text_list[i] for i, task in enumerate(task_list) ]
			resps_list = [ sequence_list[i] if task not in text_task else resps_list[i] for i, task in enumerate(task_list) ]
			quant_levels = [ 0 for _ in range( max( batch_size, beam_width ) ) ]

			inputs = self.inputs(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=resps_list,
				lang_list=lang_list,
				tone_list=tone_list,
				len_list=len_list,
				task_list=task_list,
				quant_levels=quant_levels,
			)

			# to-do: find an elegant way to write this
			output = super().forward(
				inputs=inputs,
				state=state,
				#layer_skip_variables=sampling_layer_skip_variables,
				output_attentions=entropix_sampling,
			)

			if cfg_strength > 0:
				null_inputs = super().inputs(
					text_list=null_text,
					proms_list=null_prom,
					resps_list=resps_list,
					lang_list=lang_list,
					tone_list=tone_list,
					quant_levels=quant_levels,
				)
				null_output = super().forward(
					inputs=null_inputs,
					quant_levels=quant_levels,
					#layer_skip_variables=sampling_layer_skip_variables,
				)
				logits = cfg_logits( logits=output.logits, null=null_output.logits, strength=cfg_strength, rescale=cfg_rescale, lens=[ resp.shape[0] + 1 for resp in resps_list ] )

			logits, state = output.logits, output.state

			sampled = super().sample(
				logits=logits,
				prev_list=[ resps_list[i] if task not in text_task else text_list[i] for i, task in enumerate( task_list ) ],
				**(sampling_kwargs | {"attentions": output.attentions if entropix_sampling else None}),
			)

			ids = sampled.ids

			if cfg.experimental:
				if sampled.entropy:
					metrics.append( sampled.entropy )
				elif sampled.scores:
					#metrics.append( [ { "p": p[0], "exited_layer": output.exited_layer } for p in sampled.scores ] )
					metrics.append( [ { "p": p[0] } for p in sampled.scores ] )

			if mirostat is not None:
				mirostat = sampled.scores
			elif beam_width > 0:
				# expand tuple
				s = sampled.scores
				# first step, expand batch
				if batch_size == 1:
					batch_size = beam_width
					text_list = text_list * beam_width
					proms_list = proms_list * beam_width
					sequence_list = sequence_list * beam_width
					task_list = task_list * beam_width
					start_slice = start_slice * beam_width
					stopped = torch.zeros(batch_size, device=device).bool()

				scores = [ scores[i] + score for i, score in enumerate(s) ]

			# append tokens
			for i, token in enumerate(ids):
				task = task_list[i]
				stop_token = audio_stop_token if task not in text_task else text_stop_token
				if stop_token in token:
					stopped[i] = True
				sequence_list[i] = torch.cat([sequence_list[i], token.to(device)])

			# stop token found
			# stopped |= r == stop_token
			if stopped.all().item():
				break

		# to-do for layerskip / speculative sampling: rerun the last sequence again at max depth
		"""
		if metrics:
			from ..plot import plot_sample_metrics
			filename = "metrics"
			if entropix_sampling:
				filename += f'[entropix_sampling]'
			if sampling_layer_skip_exit_layer >= 0:
				filename += f'[{sampling_layer_skip_exit_layer+1}]'

			plot_sample_metrics( metrics, filename=f'{filename}.png' )
		"""

		# pick the best scoring candidate
		# desu this is always going to be candidate 0
		if beam_width:
			sequence_list = sequence_list[:1]
			task_list = task_list[:1]

		# remove stop token
		sequence_list = [self._prune(r, audio_stop_token if task_list[i] not in text_task else text_stop_token) for i, r in enumerate(sequence_list)]
		# remove <bos>
		sequence_list = [ sequence_list[i][start_slice[i]:] for i, task in enumerate( task_list ) ]

		if refine_on_stop:
			# get how much we need to slice from the end
			slice_lengths = [ sequence.shape[-1] for sequence in sequence_list ]
			# -1 for the stop token
			logits = [ logit[-length-1:-1] for logit, length in zip(logits, slice_lengths) ]
			# greedy sample from the sequence
			refined_list = [ logit.argmax(dim=-1) for logit in logits ]
			# to-do: compare scores
			# set the "refined" list as the output
			sequence_list = refined_list

		return sequence_list

	def forward(
		self,
		text_list: list[Tensor],
		proms_list: list[Tensor],
		resps_list: list[Tensor] | None = None,

		task_list: list[Tensor] | None = None,
		lang_list: list[Tensor] | None = None,
		tone_list: list[Tensor] | None = None,
		len_list: list[Tensor] | None = None,

		training: bool | int | None = None,

		disable_tqdm=False,
		use_lora=None,
		**sampling_kwargs,
	):
		# deduce batch_size
		if text_list is not None:
			default_task = "tts"
			device = text_list[0].device
			batch_size = len(text_list)
		else:
			default_task = "stt"
			device = resps_list[0].device
			batch_size = len(resps_list)

		# generate task list if not provided
		if task_list is None:
			task_list = [ default_task for _ in range(batch_size) ]

		# implicitly set for training
		if training is None and text_list is not None and resps_list is not None:
			n_levels_set = {r.shape[-1] for r in resps_list}
			n_levels = next(iter(n_levels_set))

			training = n_levels == self.n_resp_levels

		# is training
		if training:
			return self.forward_train(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=resps_list,
				task_list=task_list,
				lang_list=lang_list,
				tone_list=tone_list,
				len_list=len_list,
			)

		# is NAR
		if (len_list is not None or resps_list is not None) and text_list is not None:
			return self.forward_nar(
				text_list=text_list,
				proms_list=proms_list,
				resps_list=resps_list,
				task_list=task_list,
				lang_list=lang_list,
				tone_list=tone_list,
				len_list=len_list,
				**sampling_kwargs,
			)

		# is AR
		return self.forward_ar(
			text_list=text_list,
			proms_list=proms_list,
			resps_list=resps_list,
			task_list=task_list,
			lang_list=lang_list,
			tone_list=tone_list,
			len_list=len_list,
			**sampling_kwargs,
		)


def example_usage():
	cfg.device = "cuda"
	cfg.trainer.backend = "local"
	if cfg.audio_backend == "dac":
		cfg.sample_rate = 44_100

	from functools import partial
	from einops import repeat
	from tqdm import tqdm

	from ..emb.qnt import decode_to_file, unload_model, trim_random, repeat_extend_audio, concat_audio, merge_audio
	from ..engines import Engine, Engines
	from ..utils import wrapper as ml
	from ..utils import setup_logging

	import numpy as np
	import re

	# cfg.model.experimental.masking_train_p = 0.5
	cfg.hyperparameters.batch_size = 1
	cfg.hyperparameters.gradient_accumulation_steps = 1

	setup_logging()

	def load_artifact( path ):
		artifact = np.load(path, allow_pickle=True)[()]

		text = torch.tensor( cfg.tokenizer.encode( artifact["metadata"]["phonemes"] ) ).to(dtype=torch.uint8, device=cfg.device)
		audio = torch.from_numpy(artifact["codes"].astype(np.int16))[0, :, :].t().to(dtype=torch.int16, device=cfg.device)

		return text, audio

	text, audio = load_artifact(f"./data/qnt.{'dac' if cfg.audio_backend == 'dac' else 'enc'}")
	batch_size = cfg.hyperparameters.batch_size

	text_list = [ text ] * batch_size
	proms_list = [ audio[:cfg.dataset.frames_per_second, :] ] * batch_size
	resps_list = [ audio[:cfg.dataset.frames_per_second * 4, :] ] * batch_size

	kwargs = {
		'n_text_tokens': 256,
		'n_audio_tokens': 1024,

		'd_model': 1024, # 256, # 1024, # 1536
		'n_heads': 16, # 4, # 16, # 24
		'n_layers': 12, # 32
		'n_experts': 1 if not cfg.model else cfg.model.experts,

		'p_dropout': 0.1,

		'l_padding': 8 if cfg.optimizations.fp8 else 0,

		'config': cfg.model
	}

	bos_id, space_id, eos_id = cfg.tokenizer.encode( " " )
	available_tasks = [] + (["tts-ar"] if "ar" in cfg.model.capabilities else []) + (["tts-nar"] if "len" in cfg.model.capabilities else [])

	model = AR_NAR(**kwargs).to(cfg.device)
	steps = 1000 // batch_size

	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.yaml_path is not None else "prodigy"
	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.yaml_path is not None else ""
	learning_rate = cfg.hyperparameters.learning_rate if cfg.yaml_path is not None else None

	if cfg.optimizations.dadaptation:
		# do not combine the two
		if scheduler == "schedulefree":
			scheduler = ""

		learning_rate = 1.0

	if optimizer == "prodigy":
		if learning_rate is None:
			learning_rate = 1.0

		optimizer = ml.Prodigy
	elif optimizer == "adagrad":
		if learning_rate is None:
			learning_rate = 1.0e-2

		optimizer = ml.Adagrad
	elif optimizer == "adamw":
		if learning_rate is None:
			learning_rate = 1.0e-4

		optimizer = ml.AdamW
	elif optimizer == "sdg":
		if learning_rate is None:
			learning_rate = 1.0e-4

		optimizer = ml.SGD
	else:
		raise ValueError(f"Unrecognized optimizer: {optimizer}")

	_logger.info(f"Optimizer: {optimizer}\tLearning rate: {learning_rate}")

	optimizer = optimizer(model.parameters(), lr=learning_rate)

	if scheduler == "schedulefree":
		if isinstance(optimizer, ml.AdamW):
			scheduler = ml.schedulefree.AdamWScheduleFree
		elif isinstance(optimizer, ml.SGD):
			scheduler = ml.schedulefree.SGDScheduleFree
		else:
			scheduler = None

		if scheduler is not None:
			_logger.info(f"Scheduler: {scheduler}")
			optimizer = scheduler( model.parameters(), lr = learning_rate )

	if cfg.optimizations.replace and cfg.optimizations.linear:
		model = ml.replace_linear( model )

	if cfg.optimizations.replace and cfg.optimizations.embedding:
		model = ml.replace_embedding( model )

	"""
	cfg.optimizations.model_offloading = {
		"devices": ["cuda:0", "cpu"],
	#	"limits": [ 0.9, -1 ],
		"assign": [[ f'layers.{i}.' for i in range(0,10) ], [ f'layers.{i}.' for i in range(11,12) ] + [ "model.norm" ]],
	#	"limits": [ 256 * (1024 ** 2), -1 ]
	}
	"""

	engine = Engine(model=model, optimizer=optimizer)
	engines = Engines({"ar+nar": engine})
	engines.setup()

	"""
	if cfg.optimizations.model_offloading:
		model = ml.offload_model( model, policy=cfg.optimizations.model_offloading )
	"""

	"""
	torch.save( {
		'module': model.state_dict()
	}, f"./data/{cfg.model.arch_type}.pth" )
	"""

	_logger.info(f"AR+NAR ({cfg.model.arch_type}, {cfg.audio_backend}) parameter count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

	@torch.no_grad()
	def sample_data(t=None):
		if isinstance(t, list):
			tasks = t
			texts = [ text_list[0].to(cfg.device) if task not in text_task else None for i, task in enumerate( tasks ) ]
			proms = [ proms_list[0].to(cfg.device) if task not in text_task else [ "stt" ] for i, task in enumerate( tasks ) ]
			resps = [ None if task not in text_task else resps_list[0].to(cfg.device) for i, task in enumerate( tasks ) ]

			return texts, proms, resps, tasks

		texts = []
		proms = []
		resps = []
		tasks = []

		for i in range(batch_size):
			task = random.choice(available_tasks) if t is None else t

			text = text_list[i].to(cfg.device)
			prom = proms_list[i].to(cfg.device)
			resp = resps_list[i].to(cfg.device)

			# do nothing
			if task == "stt":
				prom = [ task ]
			else:
				task = "tts" if random.random() > 0.1 or "len" not in cfg.model.capabilities else "len"

			texts.append( text )
			proms.append( prom )
			resps.append( resp )
			tasks.append( task )

		return texts, proms, resps, tasks

	@torch.inference_mode()
	def sample( name, steps=500, task=None ):
		engine.eval()

		text_list, proms_list, resp_list, task_list = sample_data( task )

		if task == "tts-nar":
			len_list = engine(text_list, proms_list, task_list=["len"], max_steps=5, temperature=0.0 )
			len_list = [ resp_list[0].shape[0] for l in len_list ]
			resps_list = engine( text_list, proms_list, len_list=len_list )
		else:
			resps_list = engine( text_list, proms_list, task_list=["tts"], max_duration=steps, temperature=1.0 )
			resps_list = engine( text_list, proms_list, resps_list=resps_list, temperature=0.0 )

		for i, o in enumerate(resps_list):
			_ = decode_to_file(o.to(dtype=torch.int32), f"data/{cfg.model.arch_type}.{cfg.audio_backend}.{i}.{name}.{task}.wav", device=cfg.device)

		unload_model()

	def train():
		engine.train()
		t = trange(steps)
		for i in t:
			texts, proms, resps, tasks = sample_data()

			stats = {"step": i}
			stats |= engine.traverse(text_list=texts, proms_list=proms, resps_list=resps, task_list=tasks, training=True)
			stats |= {"grad_norm": engine.get_global_grad_norm()}

			tqdm.write(f"{stats}")

		"""
		torch.save( {
			'module': model.state_dict()
		}, f"./data/{cfg.model.arch_type}.pth" )
		"""

	#sample("init", 5)
	train()

	"""
	if cfg.optimizations.compile:
		model = ml.compile_model(model, backend=cfg.optimizations.compile)
	"""

	for task in available_tasks:
		sample("final", task=task)

	engines.quit()

if __name__ == "__main__":
	example_usage()