# vall-e/vall_e/data.py

# todo: clean this mess up
import copy
import h5py
import json
import re
import logging
import numpy as np
import os
import random
import torch
import itertools
from .config import cfg
from .emb.qnt import trim, trim_random, repeat_extend_audio, concat_audio, merge_audio, decode_to_file, decode as decode_qnt, encode as encode_qnt, pad_codes_with_silence
from .emb.g2p import encode as encode_phns
from .utils.sampler import PoolSampler, OrderedSampler, BatchedOrderedSampler, RandomSampler
from .utils.distributed import global_rank, local_rank, world_size
from .utils.io import torch_save, torch_load, json_read, json_write, json_stringify, json_parse
from .utils import setup_logging
from collections import defaultdict
from functools import cache, cached_property
from itertools import groupby, zip_longest
from pathlib import Path
from typing import Any
from torch import Tensor
from torch.utils.data import DataLoader, Dataset as _Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
# torch.multiprocessing.set_sharing_strategy("file_system")
_logger = logging.getLogger(__name__)
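# Builds the pool of random evaluation prompts: a fixed list of Harvard sentences, optionally
# extended with validation-set transcriptions whose duration falls within duration_range.
# With tokenized=True, every sentence is phonemized and tokenized into a uint8 tensor.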
@cache
def get_random_prompts( validation=True, min_length=0, tokenized=False ):
duration_range = [ 5.5, 12.0 ] # to-do: pull from cfg.dataset.duration_range
sentences = [
"The birch canoe slid on the smooth planks.",
"Glue the sheet to the dark blue background.",
"It's easy to tell the depth of a well.",
"These days a chicken leg is a rare dish.",
"Rice is often served in round bowls.",
"The juice of lemons makes fine punch.",
"The box was thrown beside the parked truck.",
"The hogs were fed chopped corn and garbage.",
"Four hours of steady work faced us.",
"A large size in stockings is hard to sell.",
"The boy was there when the sun rose.",
"A rod is used to catch pink salmon.",
"The source of the huge river is the clear spring.",
"Kick the ball straight and follow through.",
"Help the woman get back to her feet.",
"A pot of tea helps to pass the evening.",
"Smoky fires lack flame and heat.",
"The soft cushion broke the man's fall.",
"The salt breeze came across from the sea.",
"The girl at the booth sold fifty bonds.",
"The small pup gnawed a hole in the sock.",
"The fish twisted and turned on the bent hook.",
"Press the pants and sew a button on the vest.",
"The swan dive was far short of perfect.",
"The beauty of the view stunned the young boy.",
"Two blue fish swam in the tank.",
"Her purse was full of useless trash.",
"The colt reared and threw the tall rider.",
"It snowed, rained, and hailed the same morning.",
"Read verse out loud for pleasure.",
]
# Pull from validation dataset if existing + requested
if validation and cfg.dataset.validation:
paths = _load_paths(cfg.dataset.validation, type="validation", silent=True)
paths = list(itertools.chain.from_iterable(paths.values()))
for path in paths:
duration = 0
text_string = ""
if cfg.dataset.use_hdf5:
key = _get_hdf5_path(path)
metadata = { f'{k}': f'{v}' for k, v in cfg.hdf5[key].attrs.items() }
metadata = process_artifact_metadata( { "metadata": metadata } )
text_string = metadata["text"] if "text" in metadata else ""
duration = metadata['duration'] if "duration" in metadata else 0
else:
_, metadata = _load_quants(path, return_metadata=True)
metadata = process_artifact_metadata( { "metadata": metadata } )
text_string = metadata["text"] if "text" in metadata else ""
duration = metadata['duration'] if "duration" in metadata else 0
if len( text_string ) < min_length or not (duration_range[0] <= duration and duration <= duration_range[1]):
continue
sentences.append( text_string )
# tokenize here because our harvard sentences need to be phonemized anyways
if tokenized:
return [ torch.tensor( tokenize( encode_phns( text ) ) ).to(dtype=torch.uint8) for text in sentences ]
return sentences
# samples a random text prompt
def get_random_prompt( *args, **kwargs ):
# Harvard sentences
return random.choice(get_random_prompts( *args, **kwargs ))
# fold into a typical LLM sequence (one embedding rather than split embeddings)
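# The shared vocabulary is laid out as contiguous slices, in this order:
#   [ text | lang | rvq level | prom (audio_tokens * resp_levels) | task | tone | resp (audio_tokens * resp_levels) ]
# Illustrative example (sizes are assumptions, not the real config): with text_tokens=256, langs=4,
# resp_levels=8, audio_tokens=1024, a prom token of value 37 at RVQ level 2 folds to 256 + 4 + 8 + 1024*2 + 37.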
def fold_inputs(
text_list = [],
lang_list = [],
task_list = [],
tone_list = [],
prom_list = [],
resp_list = [],
targ_list = [],
ignore_index = None,
sep = 3,
stop = 3,
config = None,
quant_levels = None,
):
if config is None:
config = cfg.model
def _create_mask(l, device):
seq = torch.arange(max(l), device=device).unsqueeze(0) # (1 t)
stop = torch.tensor(l, device=device).unsqueeze(1) # (b 1)
return (seq < stop).float() # (b t)
def list_to_tensor(x_list: list[Tensor], mask=True):
l = list(map(len, x_list))
x = pad_sequence(x_list).t()
if not mask:
return x
m = _create_mask(l, x_list[0].device)
m = m.to(x)
return x, m
def process_prom_or_task(i, prom):
if prom is None:
return 0
if isinstance(prom, str):
task = get_task_symmap()[f'<{prom}>']
seq = torch.tensor([task_start + task], device=device, dtype=dtype)
input_ids[i].append( seq )
input_ids[i].append( sep )
return seq.shape[0] + 1
# deinterleaved
if quant_levels is not None:
quant_level = quant_levels[i]
if ignore_index is not None:
seq = torch.tensor( [ ignore_index for _ in range( prom.shape[0] ) ], device=device, dtype=dtype)
else:
seq = prom[:, quant_level].to(device=device, dtype=dtype).clone()
for idx, token in enumerate( seq ):
token += prom_start + ( config.audio_tokens * quant_level )
# interleaved
else:
if ignore_index is not None:
seq = torch.tensor( [ ignore_index for _ in range( prom.shape[0] * prom.shape[1] ) ], device=device, dtype=dtype)
else:
seq = prom.flatten().to(device=device, dtype=dtype)
for idx, token in enumerate( seq ):
token += prom_start + ( config.audio_tokens * ( idx % config.resp_levels ) )
input_ids[i].append( seq )
input_ids[i].append( sep )
return seq.shape[0] + 1
def generate_position_ids( length, sep=True ):
return [ i for i in range( length + (1 if sep else 0) ) ]
"""
if quant_levels is not None:
resps_list = [ [] if l == 0 else resp for l, resp in zip(quant_levels, resp_list) ]
"""
device = text_list[0].device
dtype = torch.int64
batch_size = len(text_list)
input_ids = [ [] for _ in range(batch_size) ]
position_ids = [ [] for _ in range(batch_size) ]
offset = 0
sep = torch.tensor([ sep ], device=device, dtype=dtype)
stop = torch.tensor([ stop ], device=device, dtype=dtype)
text_start = 0
text_end = text_start + config.text_tokens
lang_start = text_end
lang_end = lang_start + config.langs
rvq_start = lang_end
rvq_end = rvq_start + config.resp_levels
prom_start = rvq_end
prom_end = prom_start + config.audio_tokens * config.resp_levels
task_start = prom_end
task_end = task_start + config.tasks
tone_start = task_end
tone_end = tone_start + config.tones
resp_start = tone_end
resp_end = resp_start + config.audio_tokens * config.resp_levels
# text tokens
for i, text in enumerate(text_list):
if isinstance(text, torch.Tensor):
seq = text + text_start
else:
seq = torch.tensor([text_start + text], device=device, dtype=dtype)
input_ids[i].append( seq )
input_ids[i].append( sep )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# lang tokens
for i, lang in enumerate(lang_list):
if isinstance(lang, torch.Tensor):
seq = lang + lang_start
else:
seq = torch.tensor([lang_start + lang], device=device, dtype=dtype)
input_ids[i].append( seq )
input_ids[i].append( sep )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# inject target quant_level
if quant_levels is not None:
for i, rvq in enumerate( quant_levels ):
if isinstance(rvq, torch.Tensor):
seq = rvq + rvq_start
else:
seq = torch.tensor([rvq_start + rvq], device=device, dtype=dtype)
input_ids[i].append( seq )
input_ids[i].append( sep )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# prom / task tokens
for i, prom in enumerate(prom_list):
# list of proms with a possible task token
length = 0
if isinstance(prom, list):
for p in prom:
length += process_prom_or_task(i, p)
# raw tensor
else:
length += process_prom_or_task(i, prom)
position_ids[i].append( generate_position_ids( length, sep=False ) )
# tone tokens
for i, tone in enumerate(tone_list):
if isinstance(tone, torch.Tensor):
seq = tone + tone_start
else:
seq = torch.tensor([tone_start + tone], device=device, dtype=dtype)
input_ids[i].append( seq )
input_ids[i].append( sep )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# resp tokens
for i, resp in enumerate(resp_list):
# deinterleaved
if quant_levels is not None:
# grab the previous rvq level
quant_level = quant_levels[i] - 1
# way to signal we want to inference for rvq level 0
# without it, it's a random chance for any level to be selected again
if quant_level < 0:
continue
else:
# my shitcode keeps things as lists of tensors for each level, so this handles it because lists can't index by tuples
if isinstance(resp, list):
seq = resp[quant_level].to(device=device, dtype=dtype).clone()
else:
seq = resp[:, quant_level].to(device=device, dtype=dtype).clone()
for idx, token in enumerate( seq ):
token += resp_start + ( config.audio_tokens * quant_level )
input_ids[i].append( seq )
input_ids[i].append( stop )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# interleaved
else:
seq = resp.flatten().to(device=device, dtype=dtype)
for idx, token in enumerate( seq ):
token += resp_start + ( config.audio_tokens * ( idx % config.resp_levels ) )
input_ids[i].append( seq )
input_ids[i].append( stop )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# targ list
for i, resp in enumerate(targ_list):
# deinterleaved
if quant_levels is not None:
quant_level = quant_levels[i]
seq = resp[:, quant_level].to(device=device, dtype=dtype)
for idx, token in enumerate( seq ):
token += resp_start + ( config.audio_tokens * quant_level )
input_ids[i].append( seq )
input_ids[i].append( stop )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
# interleaved
else:
seq = resp.flatten().to(device=device, dtype=dtype)
for idx, token in enumerate( seq ):
token += resp_start + ( config.audio_tokens * ( idx % config.resp_levels ) )
input_ids[i].append( seq )
input_ids[i].append( stop )
position_ids[i].append( generate_position_ids( seq.shape[0] ) )
for i, batch in enumerate(input_ids):
input_ids[i] = torch.concat(input_ids[i], dim=-1).to(device=device, dtype=dtype)
position_ids[i] = torch.concat([ torch.tensor(ids, device=device, dtype=dtype) for ids in position_ids[i] ], dim=-1)
input_ids, attention_mask = list_to_tensor(input_ids)
position_ids = list_to_tensor(position_ids, mask=False)
return input_ids, attention_mask, position_ids
# unfold from one unified token ID space to separate token spaces
# to-do: unfold at a specific RVQ level instead if requested
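# Each output ID is assigned back to its modality by the [start, end) slice it falls in, then
# re-normalized by subtracting that slice's offset; interleaved audio streams are re-binned into
# per-RVQ-level columns by bin_to_rvqs below.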
def unfold_outputs(
output_ids,
sep = 3,
stop = 3,
config = None,
quant_levels = None,
):
def bin_to_rvqs( tokens ):
length = len(tokens)
"""
if length % config.resp_levels == 0:
tokens = torch.tensor(tokens).reshape( config.resp_levels, length // config.resp_levels ).t()
"""
# truncate to the nearest multiple of resp_levels so every RVQ bin ends up the same length
nearest = ( length // config.resp_levels ) * config.resp_levels
bins = [ [] for _ in range(config.resp_levels) ]
for pos in range( nearest ):
rvq = pos % config.resp_levels
bins[rvq].append( tokens[pos] )
return torch.tensor(bins, device=device, dtype=dtype).t()
if config is None:
config = cfg.model
device = output_ids.device
dtype = torch.int64
batch_size = output_ids.shape[0]
text_list = [ [] for _ in range(batch_size) ]
rvq_list = [ [] for _ in range(batch_size) ]
lang_list = [ [] for _ in range(batch_size) ]
task_list = [ [] for _ in range(batch_size) ]
tone_list = [ [] for _ in range(batch_size) ]
prom_list = [ [] for _ in range(batch_size) ]
resp_list = [ [] for _ in range(batch_size) ]
text_start = 0
text_end = text_start + config.text_tokens
lang_start = text_end
lang_end = lang_start + config.langs
rvq_start = lang_end
rvq_end = rvq_start + config.resp_levels
prom_start = rvq_end
prom_end = prom_start + config.audio_tokens * config.resp_levels
task_start = prom_end
task_end = task_start + config.tasks
tone_start = task_end
tone_end = tone_start + config.tones
resp_start = tone_end
resp_end = resp_start + config.audio_tokens * config.resp_levels
for i, batch in enumerate( output_ids ):
# cringe logic to handle prefix resp for rvq levels > 0
# a better way is to observe if the rvq level increased
should_flush = False
flushed = False
for idx, token in enumerate( batch ):
id = token.item()
if id == sep or id == stop:
if should_flush and quant_levels is not None and quant_levels[i] > 0:
resp_list[i] = []
should_flush = False
flushed = True
continue
# text tokens
if text_start <= id and id < text_end:
text_list[i].append( (id - text_start) % config.text_tokens )
# lang tokens
elif lang_start <= id and id < lang_end:
lang_list[i].append( (id - lang_start) % config.langs )
# rvq levels
elif rvq_start <= id and id < rvq_end:
rvq_list[i].append( (id - rvq_start) % config.resp_levels )
# prom tokens
elif prom_start <= id and id < prom_end:
prom_list[i].append( (id - prom_start) % config.audio_tokens )
# task tokens
elif task_start <= id and id < task_end:
task_list[i].append( (id - task_start) % config.tasks )
# lang tokens
elif tone_start <= id and id < tone_end:
tone_list[i].append( (id - tone_start) % config.tones )
# resp tokens
elif resp_start <= id and id < resp_end:
resp_list[i].append( (id - resp_start) % config.audio_tokens )
if not flushed:
should_flush = True
if quant_levels is not None:
prom_list[i] = torch.tensor(prom_list[i], device=device, dtype=dtype).t()
resp_list[i] = torch.tensor(resp_list[i], device=device, dtype=dtype).t()
else:
prom_list[i] = bin_to_rvqs( prom_list[i] )
resp_list[i] = bin_to_rvqs( resp_list[i] )
text_list[i] = torch.tensor( text_list[i], device=device, dtype=dtype )
task_list[i] = torch.tensor( task_list[i], device=device, dtype=dtype )
lang_list[i] = torch.tensor( lang_list[i], device=device, dtype=dtype )
tone_list[i] = torch.tensor( tone_list[i], device=device, dtype=dtype )
return dict(
text_list=text_list,
prom_list=prom_list,
resp_list=resp_list,
task_list=task_list,
lang_list=lang_list,
tone_list=tone_list,
)
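# Illustrative (untested) round trip, assuming a HF-style causal LM wrapper named `model`:
#   input_ids, attention_mask, position_ids = fold_inputs( text_list=text_list, prom_list=prom_list, resp_list=resp_list )
#   output_ids = model.generate( input_ids=input_ids, attention_mask=attention_mask )
#   resps = unfold_outputs( output_ids )["resp_list"]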
# to-do: clean up this symmap mess
def get_phone_symmap():
return cfg.tokenizer.get_vocab()
def tokenize( phones ):
if isinstance( phones, list ):
phones = "".join( phones )
return cfg.tokenizer.encode( phones )
def get_lang_symmap():
return {
"en": 0,
"ja": 1,
"de": 2,
"fr": 3,
}
def get_tone_symmap():
return {
"neutral": 0,
}
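# Task tokens, as consumed in Dataset.__getitem__ below:
#   <tts> base TTS, <tts-c> VALL-E continuous, <ns> noise suppression, <sr> speech removal,
#   <tse> target speech extraction, <soe>/<mask>/<eoe> speech-editing markers, <stt> speech-to-text.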
def get_task_symmap():
return {
"<tts>": 0,
"<tts-c>": 1,
"<ns>": 2,
"<sr>": 3,
"<tse>": 4,
"<soe>": 5,
"<mask>": 6,
"<eoe>": 7,
"<stt>": 8,
"<nse>": 6, # fake
"<cse>": 6, # fake
}
def _replace_file_extension(path, suffix):
if not isinstance( path, Path ):
path = Path(path)
return (path.parent / path.name.split(".")[0]).with_suffix(suffix)
def _get_quant_extension():
return ".dac" if cfg.audio_backend == "dac" else ".enc"
def _get_phone_extension():
return ".json" # if cfg.audio_backend == "dac" else ".phn.txt"
def _get_quant_path(path):
return _replace_file_extension(path, _get_quant_extension())
def _get_phone_path(path):
return _replace_file_extension(path, _get_phone_extension())
_durations_map = {}
# makeshift caching the above to disk
@cfg.diskcache()
def _get_duration_map( type="training" ):
return _durations_map[type] if type in _durations_map else {}
@cfg.diskcache()
def _load_paths(dataset, type="training", silent=False):
return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}", disable=silent) }
def _load_paths_from_metadata(group_name, type="training", validate=False):
data_dir = group_name if cfg.dataset.use_hdf5 else cfg.data_dir / group_name
_fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions
def key( id, entry=None ):
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" if cfg.dataset.use_hdf5 else data_dir / id
metadata_path = cfg.metadata_dir / f'{group_name}.json'
metadata = {}
if cfg.dataset.use_metadata and metadata_path.exists():
#metadata = json.loads(open( metadata_path, "r", encoding="utf-8" ).read())
metadata = json_read( metadata_path )
if len(metadata) == 0:
return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate )
def _validate( id, entry ):
phones = entry['phones'] if "phones" in entry else 0
duration = entry['duration'] if "duration" in entry else 0
# add to duration bucket
k = key(id, entry)
if type not in _durations_map:
_durations_map[type] = {}
_durations_map[type][k] = duration
if not validate:
return True
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration
return [ key(id, entry) for id, entry in metadata.items() if _validate(id, entry) ]
def _get_hdf5_path(path):
# to-do: better validation
return str(path)
def _get_hdf5_paths( data_dir, type="training", validate=False ):
data_dir = str(data_dir)
key = f"/{type}/{_get_hdf5_path(data_dir)}"
def _validate( id, entry ):
phones = entry.attrs['phonemes']
duration = entry.attrs['duration']
if type not in _durations_map:
_durations_map[type] = {}
_durations_map[type][f"{key}/{id}"] = duration
if not validate:
return True
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration
return [ Path(f"{key}/{id}") for id, entry in cfg.hdf5[key].items() if _validate(id, entry) ] if key in cfg.hdf5 else []
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
if isinstance(path, str):
path = Path(path)
def _validate(path):
if "".join(path.suffixes) not in extensions:
return False
if not _get_phone_path(path).exists() or not _get_quant_path(path).exists():
return False
if not validate:
return True
# to-do: find an easy way to determine size from pickled quants without loading
# to-do: find a consistent way to derive phoneme count from filesize (probably can't due to utf-8)
phones = len(_get_phones(_get_phone_path(path))) # _get_phone_path(path).stat().st_size // 2 + 1
return cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
return [ p for p in list(path.iterdir()) if _validate(p) ] if path.exists() and path.is_dir() else []
def _load_quants(path, return_metadata=False) -> Tensor:
qnt = np.load(_get_quant_path(path), allow_pickle=True)[()]
if return_metadata:
return torch.from_numpy(qnt["codes"].astype(int))[0][:, :].t().to(torch.int16), qnt["metadata"]
return torch.from_numpy(qnt["codes"].astype(int))[0][:, :].t().to(torch.int16)
# prune consecutive spaces
def _cleanup_phones( phones, targets=[" "]):
return [ p for i, p in enumerate(phones) if p not in targets or ( p in targets and p != phones[i-1] ) ]
@cache
def _get_phones(path):
phone_path = _get_phone_path(path)
quant_path = _get_quant_path(path)
if phone_path.exists():
#metadata = json.loads(open(phone_path, "r", encoding="utf-8").read())
metadata = json_read(phone_path)
elif quant_path.exists():
_, metadata = _load_quants( path, return_metadata=True )
else:
raise Exception(f"Could not load phonemes: {path}")
content = metadata["phonemes"]
return "".join(content)
def _interleaved_reorder(l, fn):
groups = defaultdict(list)
for e in l:
groups[fn(e)].append(e)
groups = {k: groups[k] for k in sorted(groups)}
for interleaved in zip_longest(*groups.values()):
for value in interleaved:
if value is not None:
yield value
class Dataset(_Dataset):
def __init__(
self,
phone_symmap=None,
training=False,
extra_paths_by_spkr_name: dict[str, list] = {},
):
super().__init__()
self._head = None
self.sampler = None
self.paths = []
self.training = training
self.dataset_type = "training" if self.training else "validation"
self.dataset = cfg.dataset.training if self.training else cfg.dataset.validation
self.sampler_type = cfg.dataset.sample_type if self.dataset_type == "training" else "path"
self.sampler_order = cfg.dataset.sample_order
self.sampler_shuffle = cfg.dataset.sample_shuffle if self.dataset_type == "training" else True
# to-do: do not do validation if there's nothing in the validation
# this just makes it be happy
if len(self.dataset) == 0:
self.dataset = cfg.dataset.training
# hard error because I kept getting tricked by this myself
if self.sampler_order == "duration" and self.sampler_type != "path":
raise Exception(f'Requesting sample_type={self.sampler_type} with sample_order={self.sampler_order}, but this combination will not give the expected results.')
# dict of paths keyed by speaker names
self.paths_by_spkr_name = _load_paths(self.dataset, self.dataset_type)
# cull speakers if they do not have enough utterances
if cfg.dataset.min_utterances > 0:
keys = list(self.paths_by_spkr_name.keys())
for key in keys:
if len(self.paths_by_spkr_name[key]) < cfg.dataset.min_utterances:
del self.paths_by_spkr_name[key]
# flatten paths
self.paths = list(itertools.chain.from_iterable(self.paths_by_spkr_name.values()))
# split dataset accordingly per GPU
if cfg.distributed and self.training:
"""
batches = len(self.paths) // world_size()
start = batches * global_rank()
end = batches * (global_rank() + 1)
self.paths = self.paths[start:end]
"""
self.paths = [ path for i, path in enumerate(self.paths) if i % world_size() == global_rank() ]
# recreate paths_by_spkr_name
self.paths_by_spkr_name = {}
for path in self.paths:
name = cfg.get_spkr( Path(path) )
if name not in self.paths_by_spkr_name:
self.paths_by_spkr_name[name] = []
self.paths_by_spkr_name[name].append( path )
# do it here due to the above
self.duration = 0
self.duration_map = _get_duration_map( self.dataset_type )
self.duration_buckets = {}
# store in corresponding bucket
for path in self.paths:
duration = self.duration_map[path]
self.duration += duration
# only calc duration if we're going to order by duration
if self.sampler_order != "duration":
continue
bucket = int(round(duration))
if bucket not in self.duration_buckets:
self.duration_buckets[bucket] = []
self.duration_buckets[bucket].append( ( Path(path), duration ) )
# ensure they're ordered
self.duration_buckets = dict(sorted(self.duration_buckets.items()))
# sort by duration
if self.sampler_order == "duration":
flattened = {}
# sort and interleave
for bucket in self.duration_buckets:
# sort by duration
self.duration_buckets[bucket].sort( key=lambda x: x[1] )
# split to retain tuples
flattened[bucket] = self.duration_buckets[bucket]
# replace with path
flattened[bucket] = [ x[0] for x in flattened[bucket] ]
# flatten by paths
flattened[bucket] = [*_interleaved_reorder(flattened[bucket], self.get_speaker)]
# flatten paths
self.paths = list(itertools.chain.from_iterable(flattened.values()))
elif self.sampler_order == "random":
random.shuffle( self.paths )
else:
# just interleave
self.paths = [*_interleaved_reorder(self.paths, self.get_speaker)]
# dict of speakers keyed by speaker group
self.spkrs_by_spkr_group = {}
for data_dir in self.dataset:
spkr = cfg.get_spkr( data_dir / "dummy" )
spkr_group = cfg.get_spkr_group( data_dir / "dummy" )
if spkr not in self.paths_by_spkr_name or len(self.paths_by_spkr_name[spkr]) < cfg.dataset.min_utterances:
continue
if spkr_group not in self.spkrs_by_spkr_group:
self.spkrs_by_spkr_group[spkr_group] = []
self.spkrs_by_spkr_group[spkr_group].append( spkr )
self.spkr_groups = list(self.spkrs_by_spkr_group.keys())
self.noise_paths = _load_paths(cfg.dataset.noise, "noise")
self.noise_paths = list(itertools.chain.from_iterable(self.noise_paths.values()))
self.phone_symmap = phone_symmap or self._get_phone_symmap()
self.spkr_symmap = self._get_spkr_symmap()
self.spkr_group_symmap = self._get_spkr_group_symmap()
self.lang_symmap = self._get_lang_symmap()
self.tone_symmap = self._get_tone_symmap()
self.task_symmap = self._get_task_symmap()
# grab IDs for bos, space, and eos for easy input creation later
self.empty_text = [ cfg.tokenizer._bos_token, cfg.tokenizer.get_vocab()[" "], cfg.tokenizer._eos_token ]
# have it fetch at training time if any is invalid, because the tokenizer obj might not have it easily fetchable ahead of time
# encoding before parallelizing things causes things to whine
if self.empty_text[0] is None or self.empty_text[-1] is None:
self.empty_text = None
# assert len(self.phone_symmap) < 256, "Unique token count should be [0,255] to fit within uint8"
self.text_dtype = torch.uint8 if len(self.phone_symmap) < 256 else torch.int16
if len(self.paths) == 0:
raise ValueError(f"No valid path is found for {self.dataset_type}")
if self.sampler_type == "path" and self.training:
if self.sampler_order == "duration" and cfg.dataset.sample_max_duration_batch > 0:
self.sampler = BatchedOrderedSampler(
self.duration_buckets if not self.sampler_state_dict_path.exists() else {}, # pass nothing if we're just going to load from a state anyways
max_duration=cfg.dataset.sample_max_duration_batch,
max_batch_size=cfg.hyperparameters.batch_size if self.training else cfg.evaluation.batch_size,
shuffle=self.sampler_shuffle
)
else:
self.sampler = OrderedSampler( len(self) ) if not self.sampler_shuffle else RandomSampler( len(self) )
self.samplers = {}
self.spkr_samplers = {}
else:
self.sampler = RandomSampler( len(self) )
self.samplers = { name: PoolSampler( paths, keep_all=True, shuffle=self.sampler_shuffle ) for name, paths in self.paths_by_spkr_name.items() }
self.spkr_samplers = { name: PoolSampler( [*set(speakers)], keep_all=True, shuffle=self.sampler_shuffle ) for name, speakers in self.spkrs_by_spkr_group.items() }
self.load_state_dict()
@cached_property
def sampler_state_dict_path(self):
return cfg.ckpt_dir / (cfg.lora.full_name if cfg.lora is not None else cfg.model.full_name) / f"sampler.{self.sampler_type}.rank{global_rank()}.pt"
def get_speaker(self, path):
if isinstance(path, str):
path = Path(path)
res = cfg.get_spkr(path)
return res
def get_speaker_group(self, path):
if isinstance(path, str):
path = Path(path)
res = cfg.get_spkr_group(path)
return res
# this isn't really necessary since our data/metadata contains markers for languages, but this is still here in case it's needed to force a language setting (for example, whisperX's lang isn't that accurate at times)
def get_language(self, speaker_group, lang="en"):
for k, v in cfg.dataset.speaker_languages.items():
if speaker_group in v:
lang = k
break
return lang.lower()
@cached_property
def spkrs(self):
return sorted({self.get_speaker(path) for path in self.paths})
@cached_property
def tasks(self):
return cfg.dataset.tasks_list # ["tts", "tts", "ns", "sr", "tse", "tts", "tts"] # , "cse", "nse"
def save_state_dict(self, path = None):
if path is None:
path = self.sampler_state_dict_path
if not path.parent.exists():
path.parent.mkdir(parents=True, exist_ok=True)
if self.sampler_type == "path":
state_dict = self.sampler.get_state()
else:
state_dict = {
"samplers": { name: sampler.get_state() for name, sampler in self.samplers.items() },
"spkr_samplers": { name: sampler.get_state() for name, sampler in self.spkr_samplers.items() },
}
torch_save(state_dict, path)
def load_state_dict(self, path = None):
if not self.training:
return
if path is None:
path = self.sampler_state_dict_path
if not path.exists():
return
state_dict = torch_load(path)
if self.sampler_type == "path":
state_dict = self.sampler.set_state(state_dict)
else:
for name, sampler in state_dict["samplers"].items():
if name not in self.samplers:
continue
self.samplers[name].set_state( sampler )
for name, sampler in state_dict["spkr_samplers"].items():
if name not in self.spkr_samplers:
continue
self.spkr_samplers[name].set_state( sampler )
def _get_phone_symmap(self):
return get_phone_symmap()
def _get_spkr_symmap(self):
return {s: i for i, s in enumerate(self.spkrs)}
def _get_spkr_group_symmap(self):
return {s: i for i, s in enumerate(self.spkr_groups)}
def _get_lang_symmap(self):
return get_lang_symmap()
def _get_tone_symmap(self):
return get_tone_symmap()
def _get_task_symmap(self):
return get_task_symmap()
def sample_noise(self):
path = random.choice(self.noise_paths)
if cfg.dataset.use_hdf5:
key = _get_hdf5_path(path)
qnt = torch.from_numpy(cfg.hdf5[key]["audio"][:, :]).to(torch.int16)
else:
qnt = _load_quants(path, return_metadata=False)
return qnt
def sample_speakers(self, ignore=[]):
choices = set(self.spkrs) - set(ignore)
return random.choice([*choices])
def sample_utterance(self, spkr_name, ignore=[]):
choices = [*(set(self.paths_by_spkr_name[spkr_name]) - set(ignore))]
if len(choices) == 0:
return None, None, None
path = random.choice(choices)
if cfg.dataset.use_hdf5:
key = _get_hdf5_path(path)
if key not in cfg.hdf5:
raise RuntimeError(f'Key of Path ({path}) not in HDF5: {key}')
#metadata = cfg.hdf5[key].attrs
metadata = { f'{k}': f'{v}' for k, v in cfg.hdf5[key].attrs.items() }
text = cfg.hdf5[key]["text"][:]
resps = cfg.hdf5[key]["audio"][:, :]
text = torch.from_numpy(text).to(self.text_dtype)
resps = torch.from_numpy(resps).to(torch.int16)
"""
lang = metadata["language"] if "language" in metadata else None
tone = metadata["tone"] if "tone" in metadata else None
"""
else:
resps, metadata = _load_quants(path, return_metadata=True)
text = torch.tensor(tokenize( metadata["phonemes"] )).to(self.text_dtype)
"""
lang = metadata["language"] if "language" in metadata else None
tone = metadata["tone"] if "tone" in metadata else None
"""
return path, text, resps
# icky slop
def get_similar_utterance(self, path, offset=None ):
if offset is None:
offset = cfg.dataset.prompt_similar_top_k_offset
reference = path.name
if cfg.dataset.use_hdf5:
root = Path( *path.parts[:-1] )
path = Path( *path.parts[2:-1] )
else:
root = Path( *path.parts[:-1] )
path = Path(*path.parts[len(cfg.data_dir.parts):-1])
metadata = json_read( cfg.metadata_dir / path.with_suffix(".json"), default={} )
if reference not in metadata:
return None
reference_metadata = metadata[reference]
if "similar" not in reference_metadata:
return None
# if the requested offset runs past the available similar utterances, fall back to the most similar ones
if len(reference_metadata["similar"]) <= offset:
offset = 0
metadata_keys = list(metadata.keys())
if cfg.dataset.prompt_similar_top_k > 1:
indices = reference_metadata["similar"][offset:offset+cfg.dataset.prompt_similar_top_k]
index = random.choice( indices )
else:
index = reference_metadata["similar"][offset]
name = metadata_keys[index]
return root / name
def sample_prompts(self, spkr_name, reference, should_trim=True):
if not cfg.dataset.prompt_duration_range or cfg.dataset.prompt_duration_range[-1] == 0:
return None
prom_list = []
choices = set(self.paths_by_spkr_name[spkr_name]) - {reference}
choices = [*choices]
# no other utterances, it'd make more sense to prune speakers with only one utterance in the validation step
if len(choices) == 0:
choices = [*set(self.paths_by_spkr_name[spkr_name])]
"""
raise ValueError(
f"Failed to find another different utterance for {spkr_name}."
)
"""
prom_length = 0
duration_lo, duration_hi = cfg.dataset.prompt_duration_range
trim_length = int(random.uniform(duration_lo, duration_hi) * cfg.dataset.frames_per_second) if should_trim else 0
for _ in range(cfg.dataset.prompt_max_samples):
if reference is not None:
# yuck
path = None
if random.random() < cfg.dataset.prompt_similar_p:
path = self.get_similar_utterance( reference, offset = len(prom_list) )
if not path:
path = random.choice(choices)
else:
path = random.choice(choices)
if cfg.dataset.use_hdf5:
key = _get_hdf5_path(path)
qnt = torch.from_numpy(cfg.hdf5[key]["audio"][:, :]).to(torch.int16)
else:
qnt = _load_quants(path, return_metadata=False)
if 0 < trim_length and trim_length < qnt.shape[0]:
qnt = trim( qnt, trim_length, reencode=cfg.dataset.reencode_on_concat, device=cfg.dataset.reencode_device )
prom_list.append(qnt)
prom_length += qnt.shape[0]
if prom_length >= trim_length:
break
# might be better to decode => concat waveforms with silence in between => reencode
# as you technically can't just append encodec sequences together like this without issues
prom = concat_audio( *prom_list, reencode=cfg.dataset.reencode_on_concat, device=cfg.dataset.reencode_device )
if 0 < trim_length and trim_length < prom.shape[0]:
prom = trim( prom, trim_length, reencode=cfg.dataset.reencode_on_concat, device=cfg.dataset.reencode_device )
return prom
def __getitem__(self, index):
if self.empty_text is None:
self.empty_text = tokenize(" ")
bos_id, space_id, eos_id = self.empty_text
if self.sampler_type == "group":
spkr_group = self.spkr_groups[index]
#spkr_group_id = self.spkr_group_symmap[spkr_group]
spkr_name = self.spkr_samplers[spkr_group].sample()
spkr_id = self.spkr_symmap[spkr_name]
path = self.samplers[spkr_name].sample()
elif self.sampler_type == "speaker":
spkr_name = self.spkrs[index]
spkr_id = self.spkr_symmap[spkr_name]
path = self.samplers[spkr_name].sample()
spkr_group = self.get_speaker_group(path)
#spkr_group_id = self.spkr_group_symmap[spkr_group]
else:
path = self.paths[index]
spkr_name = self.get_speaker(path)
spkr_id = self.spkr_symmap[spkr_name]
spkr_group = self.get_speaker_group(path)
#spkr_group_id = self.spkr_group_symmap[spkr_group]
if not isinstance( path, Path ):
path = Path( path )
if cfg.dataset.use_hdf5:
key = _get_hdf5_path(path)
if key not in cfg.hdf5:
raise RuntimeError(f'Key of Path ({path}) not in HDF5: {key}')
# I need to do some weird coercion to a normal dict because it'll bitch about Hdf5 objects not being pickleable in worker processes
metadata = { f'{k}': f'{v}' for k, v in cfg.hdf5[key].attrs.items() }
text = cfg.hdf5[key]["text"][:]
resps = cfg.hdf5[key]["audio"][:, :]
text = torch.from_numpy(text).to(self.text_dtype)
resps = torch.from_numpy(resps).to(torch.int16)
lang = metadata["language"] if "language" in metadata else None
tone = metadata["tone"] if "tone" in metadata else None
text_string = metadata["text"] if "text" in metadata else None
if cfg.dataset.retokenize_text and "phonemes" in metadata:
text = torch.tensor(tokenize( metadata["phonemes"] )).to(self.text_dtype)
else:
resps, metadata = _load_quants(path, return_metadata=True)
text = torch.tensor(tokenize( metadata["phonemes"] )).to(self.text_dtype)
lang = metadata["language"] if "language" in metadata else None
tone = metadata["tone"] if "tone" in metadata else None
text_string = metadata["text"] if "text" in metadata else None
lang = self.get_language(spkr_group) if not lang else lang.lower()
if not tone:
tone = "neutral"
lang = torch.tensor([self.lang_symmap[lang]]).to(torch.uint8)
tone = torch.tensor([self.tone_symmap[tone]]).to(torch.uint8)
# a bool to easily experiment with two mindsets later
naive = cfg.experimental
# append additional prompts in an attempt to artificially increase lengths / offer new data
if cfg.dataset.resps_max_samples > 1 and random.random() < cfg.dataset.resps_append_p:
ignore_paths = []
for _ in range( 1, cfg.dataset.resps_max_samples ):
path, txt, qnt = self.sample_utterance(spkr_name, ignore=ignore_paths)
ignore_paths.append(path)
# <s>[original text]</s><s>[new text]</s>
if naive:
text = torch.concat([ text, txt ])
# <s>[original text] [new text]</s>
# removes the original text's </s>, includes a space, and removes the new text's <s>
else:
text = torch.concat([ text[:-1], torch.tensor([self.phone_symmap[" "]]).to(torch.int16), txt[1:] ])
# might be better to decode => concat waveforms with silence in between => reencode
# as you technically can't just append encodec sequences together like this without issues
resps = concat_audio( resps, qnt, reencode=cfg.dataset.reencode_on_concat, device=cfg.dataset.reencode_device )
task = random.choice(self.tasks)
if f'<{task}>' not in self.task_symmap:
raise Exception(f'Task not defined: {task}')
# Base TTS (<text><prompt> => <resp>)
if task == "tts":
proms = self.sample_prompts(spkr_name, reference=path)
if cfg.dataset.prompt_inject_noise:
# sample random noise
noise = self.sample_noise()
# extend the noise to fill the target audio
noise = repeat_extend_audio(noise, proms.shape[0])
# create the input prompt by merging the target audio with the noise
proms = merge_audio( proms, noise, scale=[1, cfg.dataset.noise_scale], device=cfg.dataset.reencode_device )
# VALL-E Continuous (<text><partial resp> => <remaining resp> )
# (this could just be sampled as <text a><text b><audio a> => <audio b>, but I need to experiment with it)
elif task == "tts-c":
# trim a piece of the output response
if naive:
duration_lo, duration_hi = cfg.dataset.prompt_duration_range
trim_length = int(random.uniform(duration_lo, duration_hi) * cfg.dataset.frames_per_second)
proms = resps[:trim_length, :]
resps = resps[trim_length:, :]
else:
path, txt, qnt = self.sample_utterance(spkr_name)
# <s>[original text]</s><s>[new text]</s>
if naive:
text = torch.concat([ text, txt ])
# <s>[original text] [new text]</s>
# removes the original text's </s>, includes a space, and removes the new text's <s>
else:
text = torch.concat([ text[:-1], torch.tensor([space_id]).to(torch.int16), txt[1:] ])
# set prompt as initial response
proms = resps
# set target as newly sampled response
resps = qnt
# inject task token
proms = [
proms,
task,
]
# Base STT (<resp> => <text>)
elif task == "stt":
proms = [
task
]
# Duration prediction (<text><prompt> => len(<resp>))
elif task == "len":
proms = self.sample_prompts(spkr_name, reference=path)
# noise suppression (<text>? <resp+noise> => <resp>)
# speech removal (<text>?<resp+noise> => <noise>)
elif task == "ns" or task == "sr":
# sample random noise
noise = self.sample_noise()
# extend the noise to fill the target audio
noise = repeat_extend_audio(noise, resps.shape[0])
# create the input prompt by merging the target audio with the noise
proms = merge_audio( resps, noise, scale=[1, cfg.dataset.noise_scale], device=cfg.dataset.reencode_device )
# set the text prompt to empty to train without a guided text prompt
if random.random() < 0.5:
text = None
# inject task token
proms = [
task,
proms
]
# set the target to just be the noise if <sr>
if task == "sr":
resps = noise
# target speech extraction ( <text><prom><resp + other resp> => <resp> )
elif task == "tse":
# sample a prompt
proms = self.sample_prompts(spkr_name, reference=path)
# sample another speaker
_, __, other_resps = self.sample_utterance(self.sample_speakers(ignore=[spkr_name]))
# overlay the random speaker over the target audio
other_resps = merge_audio( resps, other_resps, scale=[1, random.uniform(0.5, 0.75)], device=cfg.dataset.reencode_device )
# set the text prompt to empty to train without a guided text prompt
if random.random() < 0.5:
text = None
# stitch together the proms
proms = [
proms,
task,
other_resps,
]
# clean speech editing
elif task == "cse" or task == "nse":
# speech editing would require higher quality transcription data (phoneme level/word level) unfortunately
# as I need to get a good clean point to trim into
# instead we'll just sample a bunch of utterances
samples = []
for _ in range( 4 ):
sampled = self.sample_utterance(spkr_name, ignore=[s[0] for s in samples])
samples.append( sampled )
pre_text, mid_text, post_text, edit_text = [ s[1][1:-1] for s in samples ]
pre_prom, mid_prom, post_prom, edit_prom = [ s[2] for s in samples ]
# randomly drop out pre
if random.random() < 0.125:
pre_text = None
pre_prom = None
# randomly drop out post
elif random.random() < 0.125:
post_text = None
post_prom = None
# create new text
text = concat_audio(
torch.tensor( [ bos_id ] ).to(dtype=self.text_dtype), # <s>
pre_text,
None if pre_text is None else torch.tensor( [ space_id ] ).to(dtype=self.text_dtype), # " "
edit_text,
None if post_text is None else torch.tensor( [ space_id ] ).to(dtype=self.text_dtype), # " "
post_text,
torch.tensor( [ eos_id ] ).to(dtype=self.text_dtype), # </s>
reencode=False,
)
if task == "nse":
# sample random noise
noise = self.sample_noise()
# it might be better to extend the noise to the sum of the pre+mid+post or pre+edit+post to keep the noise truly coherent
# but it's noise, it's supposed to be random
def noise_proms( p ):
# ignore if we turned it off
if p is None:
return None
# extend the noise to fill the target audio
n = repeat_extend_audio(noise, p.shape[0])
# merge the noise over the utterance
return merge_audio(p, n, scale=[1, cfg.dataset.noise_scale], device=cfg.dataset.reencode_device)
# apply noise to all pieces
pre_prom = noise_proms( pre_prom )
mid_prom = noise_proms( mid_prom )
post_prom = noise_proms( post_prom )
edit_prom = noise_proms( edit_prom )
# create new prom
proms = [
pre_prom,
"soe",
"mask" if task == "cse" else mid_prom,
"eoe",
post_prom,
]
# create new resp
resps = concat_audio(
pre_prom,
edit_prom,
post_prom,
reencode=cfg.dataset.reencode_on_concat,
device=cfg.dataset.reencode_device,
)
else:
raise Exception(f'Undefined task: {task}')
if text is None:
text = torch.tensor([bos_id, eos_id]).to(self.text_dtype)
# pad the target with silence
if random.random() < cfg.dataset.resps_pad_silence_p:
resps = pad_codes_with_silence( resps )
return dict(
index=index,
path=Path(path),
spkr_name=spkr_name,
spkr_id=spkr_id,
task=task,
lang=lang,
tone=tone,
text=text,
proms=proms,
resps=resps,
metadata=metadata,
)
def head_(self, n):
self._head = n
def training_(self, value):
self.training = value
def __len__(self):
if self.sampler_type == "group":
return min(len(self.spkr_groups), self._head or len(self.spkr_groups))
if self.sampler_type == "speaker":
return min(len(self.spkrs), self._head or len(self.spkrs))
return min(len(self.paths), self._head or len(self.paths))
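# collate_fn deliberately keeps every field as a plain python list (sequences are ragged);
# padding / tensor stacking is left to whatever consumes the batch (e.g. fold_inputs above).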
def collate_fn(samples: list[dict]):
batch: dict[str, Any] = {k: [s[k] for s in samples] for k in samples[0]}
return batch
def _seed_worker(worker_id):
worker_seed = torch.initial_seed() % 2**32
np.random.seed(worker_seed)
random.seed(worker_seed)
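# note: a BatchedOrderedSampler already yields whole batches, so it must be passed as
# batch_sampler and the batch_size / shuffle / drop_last / sampler arguments are omitted.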
def _create_dataloader(dataset, training):
kwargs = dict(
shuffle=not training,
batch_size=cfg.hyperparameters.batch_size if training else cfg.evaluation.batch_size,
drop_last=training,
sampler=dataset.sampler if training else None,
) if not isinstance(dataset.sampler, BatchedOrderedSampler) else dict(
batch_sampler=dataset.sampler,
)
return DataLoader(
dataset=dataset,
num_workers=cfg.dataset.workers,
collate_fn=collate_fn,
persistent_workers=cfg.dataset.workers > 1,
pin_memory=False,
worker_init_fn=_seed_worker,
**kwargs,
)
def create_datasets():
train_dataset = Dataset( training=True )
val_dataset = Dataset( phone_symmap=train_dataset.phone_symmap, training=False )
return train_dataset, val_dataset
def create_train_dataloader():
train_dataset = Dataset( training=True )
train_dl = _create_dataloader(train_dataset, training=True)
_logger.info(str(train_dataset.phone_symmap))
_logger.info(str(train_dataset.spkr_symmap))
_logger.info(str(train_dataset.spkr_group_symmap))
_logger.info(f"#samples (train): {len(train_dataset)}.")
_logger.info(f"#duration (train): {str(train_dataset.duration)}.")
return train_dl
def create_val_dataloader():
val_dataset = Dataset( training=False )
val_dl = _create_dataloader(val_dataset, training=False)
_logger.info(str(val_dataset.phone_symmap))
_logger.info(str(val_dataset.spkr_symmap))
_logger.info(str(val_dataset.spkr_group_symmap))
_logger.info(f"#samples (val): {len(val_dataset)}.")
_logger.info(f"#duration (val): {str(val_dataset.duration)}.")
return val_dl
def create_train_val_dataloader():
train_dataset, val_dataset = create_datasets()
# deepcopy is slow
subtrain_dataset = Dataset( training=True )
if subtrain_dataset.sampler_type == "path":
subtrain_dataset.head_(cfg.evaluation.size)
train_dl = _create_dataloader(train_dataset, training=True)
val_dl = _create_dataloader(val_dataset, training=False)
subtrain_dl = _create_dataloader(subtrain_dataset, training=False)
_logger.info(str(train_dataset.phone_symmap))
_logger.info(str(train_dataset.spkr_symmap))
_logger.info(str(train_dataset.spkr_group_symmap))
_logger.info(f"#samples (train): {len(train_dataset)}.")
_logger.info(f"#samples (val): {len(val_dataset)}.")
_logger.info(f"#samples (subtrain): {len(subtrain_dataset)}.")
_logger.info(f"#duration (train): {str(train_dataset.duration)}.")
_logger.info(f"#duration (val): {str(val_dataset.duration)}.")
_logger.info(f"#duration (subtrain): {str(subtrain_dataset.duration)}.")
assert isinstance(subtrain_dl.dataset, Dataset)
return train_dl, subtrain_dl, val_dl
# parse metadata from a numpy file (.enc/.dac) and validate it
def process_artifact_metadata( artifact ):
metadata = {}
# text transcription (just in case)
if "text" in artifact["metadata"]:
metadata["text"] = artifact["metadata"]["text"]
# phonemization of text transcription (just in case)
if "phonemes" in artifact["metadata"]:
metadata["phonemes"] = artifact["metadata"]["phonemes"]
# language for sampling / input creation
if "language" in artifact["metadata"]:
metadata["language"] = artifact["metadata"]["language"]
# top-k similar utterances for this utterance
if "similar" in artifact["metadata"]:
metadata["similar"] = artifact["metadata"]["similar"]
# duration for use of culling / sorting dataset
if "duration" in artifact["metadata"]:
metadata["duration"] = float(artifact["metadata"]["duration"])
# derive duration from sample count / sample rate
elif "original_length" in artifact["metadata"] and "sample_rate" in artifact["metadata"]:
metadata["duration"] = artifact["metadata"]["original_length"] / artifact["metadata"]["sample_rate"]
# rephonemize if required
if "phonemes" not in metadata and "text" in metadata:
metadata["phonemes"] = encode_phns( metadata["text"], language=metadata["language"] if "language" in metadata["language"] else "en" )
# clean up phonemes from espeak
# for example: Sonnenküste Update => zˈɔnənkˌystə (en)ˈʌpdeɪt(de)
# to-do: regex replace /([a-z]{2})/ to ""
if "phonemes" in metadata:
metadata["phonemes"] = metadata["phonemes"].replace("(en)", "")
if "language" in metadata:
metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "")
metadata["phonemes"] = re.sub(r'\([a-z]{2}\)', "", metadata["phonemes"])
return metadata
# yucky, but I would like to have the LibriTTS-R utterances remapped to their LibriSpeech counterpart
# to-do: allow this to be adjusted without having to regenerate metadata / HDF5 by remapping name during dataloader creation
def remap_speaker_name( name ):
# commented out because I don't want the LibriSpeech portion of the dataset to get added
"""
if "LibriTTS-R" in speaker_name:
name = name.replace("LibriTTS-R", "LibriVox")
"""
return name
# parse dataset into better to sample metadata
def create_dataset_metadata( skip_existing=True ):
symmap = get_phone_symmap()
root = str(cfg.data_dir)
metadata_root = str(cfg.metadata_dir)
cfg.metadata_dir.mkdir(parents=True, exist_ok=True)
def add( dir, type="training", audios=True, texts=True ):
name = str(dir)
name = name.replace(root, "")
speaker_name = remap_speaker_name( name )
metadata_path = Path(f"{metadata_root}/{speaker_name}.json")
metadata_path.parents[0].mkdir(parents=True, exist_ok=True)
metadata = json_read( metadata_path, default={} )
if not os.path.isdir(f'{root}/{name}/'):
return
files = os.listdir(f'{root}/{name}/')
# grab IDs for every file
ids = { file.replace(_get_quant_extension(), "").replace(_get_phone_extension(), "") for file in files }
wrote = False
for id in tqdm(ids, desc=f"Processing {name}", disable=True):
try:
quant_path = Path(f'{root}/{name}/{id}{_get_quant_extension()}')
if audios and not quant_path.exists():
continue
key = f'{type}/{speaker_name}/{id}'
if skip_existing and id in metadata:
continue
wrote = True
if id not in metadata:
metadata[id] = {}
utterance_metadata = {}
if audios:
artifact = np.load(quant_path, allow_pickle=True)[()]
qnt = torch.from_numpy(artifact["codes"].astype(int))[0].t().to(dtype=torch.int16)
utterance_metadata = process_artifact_metadata( artifact )
for k, v in utterance_metadata.items():
metadata[id][k] = v
except Exception as e:
tqdm.write(f'Error while processing {id}: {e}')
if wrote:
json_write( metadata, metadata_path )
# training
for data_dir in tqdm(sorted(cfg.dataset.training), desc="Processing Training"):
add( data_dir, type="training" )
# validation
for data_dir in tqdm(sorted(cfg.dataset.validation), desc='Processing Validation'):
add( data_dir, type="validation" )
# noise
for data_dir in tqdm(sorted(cfg.dataset.noise), desc='Processing Noise'):
add( data_dir, type="noise", texts=False )
# parse yaml to create an hdf5 file
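# HDF5 layout written here: one group per utterance keyed "{type}/{speaker}/{id}", holding an
# "audio" dataset (int16 codes, frames x levels), a "text" dataset (uint8 phoneme token ids),
# the utterance metadata as group attrs, and a top-level "symmap" dataset with the tokenizer vocab.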
def create_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=True)
hf = cfg.hdf5
symmap = get_phone_symmap()
root = str(cfg.data_dir)
metadata_root = str(cfg.metadata_dir)
def add( dir, type="training", audios=True, texts=True, verbose=False ):
name = str(dir)
name = name.replace(root, "")
speaker_name = remap_speaker_name( name )
metadata_path = Path(f"{metadata_root}/{speaker_name}.json")
metadata_path.parents[0].mkdir(parents=True, exist_ok=True)
metadata = json_read(metadata_path, default={})
if not os.path.isdir(f'{root}/{name}/'):
return
files = os.listdir(f'{root}/{name}/')
# grab IDs for every file
ids = { file.replace(_get_quant_extension(), "").replace(_get_phone_extension(), "") for file in files }
"""
# rephonemizes if you fuck up and use an old tokenizer...
for id, entry in tqdm(metadata.items(), desc=f"Processing {name}"):
key = f'{type}/{speaker_name}/{id}'
if key not in hf:
continue
group = hf[key]
if "phonemes" not in entry:
continue
if "text" not in group:
continue
txt = entry["phonemes"]
phn = "".join(txt)
phn = cfg.tokenizer.encode(phn)
phn = np.array(phn).astype(np.uint8)
del group["text"]
group.create_dataset('text', data=phn, compression='lzf')
"""
for id in tqdm(ids, desc=f"Processing {name}", disable=not verbose):
try:
quant_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True
if not quant_exists:
continue
2023-08-02 21:53:35 +00:00
key = f'{type}/{speaker_name}/{id}'
if skip_existing and key in hf:
continue
group = hf.create_group(key) if key not in hf else hf[key]
if id not in metadata:
metadata[id] = {}
utterance_metadata = {}
# audio
if audios:
artifact = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(artifact["codes"].astype(int))[0].t().to(dtype=torch.int16)
utterance_metadata = process_artifact_metadata( artifact )
if "audio" not in group:
group.create_dataset('audio', data=qnt.numpy().astype(np.int16), compression='lzf')
# text
# this is a relic from when I did have the quantized audio and phoneme transcription separate
# to-do: ensure I can remove this block
if texts:
if not utterance_metadata and text_exists:
utterance_metadata = json_read(f'{root}/{name}/{id}{_get_phone_extension()}')
phn = "".join(utterance_metadata["phonemes"])
phn = cfg.tokenizer.encode(phn)
phn = np.array(phn).astype(np.uint8)
if "text" not in group:
group.create_dataset('text', data=phn, compression='lzf')
for k, v in utterance_metadata.items():
group.attrs[k] = v
metadata[id][k] = v
except Exception as e:
tqdm.write(f'Error while processing {id}: {e}')
json_write( metadata, metadata_path )
# training
for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
add( data_dir, type="training" )
# validation
for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
add( data_dir, type="validation" )
# noise
for data_dir in tqdm(cfg.dataset.noise, desc='Processing Noise'):
add( data_dir, type="noise", texts=False )
# write symmap
if "symmap" in hf:
del hf['symmap']
hf.create_dataset('symmap', data=json_stringify(symmap))
hf.close()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser("Dataset preprocessing and inspection utilities.")
parser.add_argument("--action", type=str)
parser.add_argument("--tasks", type=str)
args, unknown = parser.parse_known_args()
task = args.action
setup_logging()
cfg.dataset.workers = 1
if args.action == "hdf5":
create_dataset_hdf5()
elif args.action == "list-dataset":
dataset = []
for group in os.listdir(cfg.data_dir):
for name in os.listdir(cfg.data_dir / group):
if len(os.listdir(cfg.data_dir / group / name)) == 0:
continue
dataset.append(f'{group}/{name}')
_logger.info(json_stringify(dataset))
elif args.action == "metadata":
create_dataset_metadata()
elif args.action == "sample":
train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
samples = {
"training": [ next(iter(train_dl)), next(iter(train_dl)) ],
"evaluation": [ next(iter(subtrain_dl)), next(iter(subtrain_dl)) ],
#"validation": [ next(iter(val_dl)), next(iter(val_dl)) ],
}
Path("./data/sample-test/").mkdir(parents=True, exist_ok=True)
for k, v in samples.items():
for i in range(len(v)):
for j in tqdm(range(len(v[i]['proms'])), desc="Decoding..."):
"""
"""
try:
decode_to_file( v[i]['proms'][j], f"./data/sample-test/{k}.{i}.{j}.proms.wav", device="cpu" )
except Exception as e:
_logger.info(f"Error while decoding prom {k}.{i}.{j}.wav: {str(e)}")
try:
decode_to_file( v[i]['resps'][j], f"./data/sample-test/{k}.{i}.{j}.resps.wav", device="cpu" )
except Exception as e:
_logger.info(f"Error while decoding resp {k}.{i}.{j}.wav: {str(e)}")
v[i]['proms'][j] = v[i]['proms'][j].shape
v[i]['resps'][j] = v[i]['resps'][j].shape
for k, v in samples.items():
for i in range(len(v)):
_logger.info(f'{k}[{i}]: {v[i]}')
elif args.action == "validate":
train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
dataset = train_dl.dataset
missing = []
symmap = get_phone_symmap()
for index in tqdm(range(len( dataset )), desc="Processing dataset..."):
if dataset.sampler_type == "group":
spkr_group = dataset.spkr_groups[index]
#spkr_group_id = dataset.spkr_group_symmap[spkr_group]
spkr_name = dataset.spkr_samplers[spkr_group].sample()
spkr_id = dataset.spkr_symmap[spkr_name]
path = dataset.samplers[spkr_name].sample()
elif dataset.sampler_type == "speaker":
spkr_name = dataset.spkrs[index]
spkr_id = dataset.spkr_symmap[spkr_name]
path = dataset.samplers[spkr_name].sample()
spkr_group = dataset.get_speaker_group(path)
#spkr_group_id = dataset.spkr_group_symmap[spkr_group]
else:
path = dataset.paths[index]
spkr_name = dataset.get_speaker(path)
spkr_id = dataset.spkr_symmap[spkr_name]
spkr_group = dataset.get_speaker_group(path)
#spkr_group_id = dataset.spkr_group_symmap[spkr_group]
if cfg.dataset.use_hdf5:
key = _get_hdf5_path(path)
if key not in cfg.hdf5:
continue
metadata = { f'{k}': f'{v}' for k, v in cfg.hdf5[key].attrs.items() }
else:
_, metadata = _load_quants(path, return_metadata=True)
phonemes = metadata["phonemes"]
for i, phone in enumerate( phonemes ):
if phone in symmap:
continue
if phone in missing:
continue
_logger.info( f"{path} | {phonemes}[{i}] | {phone}" )
missing.append( phone )
"""
text = tokenize( phonemes )[1:-1]
unk_token = tokenize("<unk>")[1]
if unk_token in text:
print( unk_token, text, phonemes )
for i, token in enumerate(text):
if token != unk_token:
continue
phone = phonemes[i]
if phone not in missing:
_logger.info( f"{path} | {phonemes}[{i}] | {phone}" )
missing |= set([phone])
"""
_logger.info( f"Missing tokens: {missing}" )
elif args.action == "tasks":
index = 0
cfg.dataset.tasks_list = args.tasks.split(",")
train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
batch = next(iter(train_dl))
for text, resps, proms, task in zip(batch["text"], batch["resps"], batch["proms"], batch["task"]):
if task not in cfg.dataset.tasks_list:
continue
_logger.info( f'{text} {task} {cfg.model.resp_levels}')
_logger.info( f'{proms.shape} {resps.shape}' )
tokens = 0
tokens += sum([ text.shape[0] for text in batch["text"] ])
tokens += sum([ resps.shape[0] for resps in batch["resps"] ])
_logger.info( f'{tokens}' )
decode_to_file( proms, f"./data/{task}.proms.wav", device="cpu" )
decode_to_file( resps, f"./data/{task}.resps.wav", device="cpu" )
break