added picking the final candidate based on the sum of scores instead of the first candidate (this changes nothing).

mrq 2023-09-13 13:19:11 -05:00
parent 23a5fdd645
commit 4aef798135
4 changed files with 85 additions and 36 deletions

View File

@@ -121,12 +121,14 @@ class AR(Base):
 		stopped = torch.zeros(batch_size, device=device).bool()
 		state = {} if cfg.inference.recurrent_forward else None
+		sampling_beam_width_use_logs = True
+		scores = [ 1.0 ] * sampling_beam_width
 
 		if self.interleave:
 			max_steps *= self.n_prom_levels
 
+		# get next in sequence
 		for n in trange(max_steps // max(1, self.recurrent_chunk_size)):
-			# get next in sequence
 			logits = super().forward(
 				text_list=text_list,
@@ -149,12 +151,23 @@ class AR(Base):
 				beam_width=sampling_beam_width,
 			)
 
-			# first step, expand batch
 			# we do it here because the sampler will already expand our logits list
-			if sampling_beam_width > 0 and batch_size == 1:
-				text_list = text_list * sampling_beam_width
-				proms_list = proms_list * sampling_beam_width
-				resps_list = resps_list * sampling_beam_width
+			if sampling_beam_width > 0:
+				# expand tuple
+				r, s = r
+				# first step, expand batch
+				if batch_size == 1:
+					batch_size *= sampling_beam_width
+					text_list = text_list * sampling_beam_width
+					proms_list = proms_list * sampling_beam_width
+					sequence_list = sequence_list * sampling_beam_width
+					stopped = torch.zeros(batch_size, device=device).bool()
+				# update scores
+				if sampling_beam_width_use_logs:
+					scores = [ (math.log(scores[i]) if scores[i] > 0 else 0) + math.log(score) for i, score in enumerate(s) ]
+				else:
+					scores = [ scores[i] * score for i, score in enumerate(s) ]
 
 			# append tokens
 			for i, ri in enumerate(r):
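The score update above keeps one running score per beam candidate; with sampling_beam_width_use_logs enabled it sums log-scores instead of multiplying raw values, which avoids underflow as sequences grow. A minimal standalone sketch of that accumulation pattern, using made-up per-step values rather than anything produced by the model:

import math

beam_width = 4
use_logs = True                              # mirrors sampling_beam_width_use_logs
scores = [ 1.0 ] * beam_width                # seed scores, as in the diff

# hypothetical per-candidate values for one decoding step
step_scores = [ 0.5, 0.25, 0.125, 0.0625 ]

if use_logs:
	# log(1.0) = 0 on the first step, then accumulate log-scores
	scores = [ (math.log(scores[i]) if scores[i] > 0 else 0) + math.log(s) for i, s in enumerate(step_scores) ]
else:
	scores = [ scores[i] * s for i, s in enumerate(step_scores) ]

print(scores)    # log space: closer to 0 means more likely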
@@ -168,6 +181,16 @@ class AR(Base):
 			if stopped.all().item():
 				break
 
+		# pick the best scoring candidate
+		# desu this is always going to be candidate 0
+		if sampling_beam_width and len(scores) > 0:
+			best_idx, best_score = (0, 0)
+			for idx, score in enumerate(scores):
+				if best_score > score:
+					best_idx, best_score = idx, score
+
+			sequence_list = [sequence_list[best_idx]]
+
 		res = [self._prune(r) for r in resps_list]
 		if self.interleave:
 			res = [self._deinterleave(r) for r in res]
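For reference, picking a final hypothesis from a list of accumulated log-scores is ordinarily an argmax over that list. A tiny standalone sketch; the names and values here are illustrative only and not part of this commit:

# hypothetical accumulated log-scores for four beam candidates
scores = [ -3.2, -1.7, -4.8, -2.5 ]
sequence_list = [ "cand_a", "cand_b", "cand_c", "cand_d" ]   # stand-ins for token tensors

# index of the highest (least negative) log-score
best_idx = max(range(len(scores)), key=lambda i: scores[i])

sequence_list = [ sequence_list[best_idx] ]
print(best_idx, sequence_list)    # 1 ['cand_b']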

View File

@@ -5,6 +5,7 @@ import torch
 from torch.nn.utils.rnn import pad_sequence
 
 import random
+import math
 from einops import rearrange
 from torch import Tensor
 from tqdm import trange
@@ -151,12 +152,14 @@ class AR_NAR(Base):
 		state = {} if cfg.inference.recurrent_forward else None
+		sampling_beam_width_use_logs = True
+		scores = [ 1.0 ] * sampling_beam_width
 
 		if self.interleave:
			max_steps *= self.n_prom_levels
 
+		# get next in sequence
 		for n in trange(max_steps // max(1, self.recurrent_chunk_size)):
-			# get next in sequence
 			resps_list = self._unsqueeze_list(sequence_list)
 			logits = super().forward(
 				text_list=text_list,
@@ -179,14 +182,23 @@ class AR_NAR(Base):
 				beam_width=sampling_beam_width,
 			)
 
-			# first step, expand batch
 			# we do it here because the sampler will already expand our logits list
-			if sampling_beam_width > 0 and batch_size == 1:
-				batch_size *= sampling_beam_width
-				text_list = text_list * sampling_beam_width
-				proms_list = proms_list * sampling_beam_width
-				sequence_list = sequence_list * sampling_beam_width
-				stopped = torch.zeros(batch_size, device=device).bool()
+			if sampling_beam_width > 0:
+				# expand tuple
+				r, s = r
+				# first step, expand batch
+				if batch_size == 1:
+					batch_size *= sampling_beam_width
+					text_list = text_list * sampling_beam_width
+					proms_list = proms_list * sampling_beam_width
+					sequence_list = sequence_list * sampling_beam_width
+					stopped = torch.zeros(batch_size, device=device).bool()
+				# update scores
+				if sampling_beam_width_use_logs:
+					scores = [ (math.log(scores[i]) if scores[i] > 0 else 0) + math.log(score) for i, score in enumerate(s) ]
+				else:
+					scores = [ scores[i] * score for i, score in enumerate(s) ]
 
 			# append tokens
 			for i, ri in enumerate(r):
@@ -199,9 +211,15 @@ class AR_NAR(Base):
 			if stopped.all().item():
 				break
 
-		# pick the first candidate
-		if sampling_beam_width:
-			sequence_list = sequence_list[:1]
+		# pick the best scoring candidate
+		# desu this is always going to be candidate 0
+		if sampling_beam_width and len(scores) > 0:
+			best_idx, best_score = (0, 0)
+			for idx, score in enumerate(scores):
+				if best_score > score:
+					best_idx, best_score = idx, score
+
+			sequence_list = [sequence_list[best_idx]]
 
 		return [self._prune(r) for r in sequence_list]

View File

@@ -119,6 +119,22 @@ def top_k_top_p_filtering( logits, top_k=0, top_p=1.0, filter_value=-float("Inf"
 	return logits
 
+# picks the top K tokens amongst a batch of logits
+# logits: [Tensor] list of logits
+# candidates: [(batch, token)] list, where batch indicates the index of the logits the given token is from
+def top_k_logits_list( logits_list, k ):
+	# ( batch, tokens ) => ( batch x tokens )
+	logits = torch.cat( logits_list )
+	candidates = list(torch.topk(logits.flatten(), k).indices.tolist()) # perform top-k across all logits
+	for i, index in enumerate(candidates):
+		t = []
+		N = np.prod(logits.size())
+		for n in logits.size():
+			N //= n
+			t.append(index // N)
+			index %= N
+		candidates[i] = tuple(t)
+	return candidates
+
 # automagically parses a batch-list and returns it as a list
 class Embedding(nn.Embedding):
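The new top_k_logits_list helper pools every candidate's logits into one (batch x tokens) tensor, takes a single top-k over the flattened values, and converts each flat index back into a (batch, token) pair by dividing out one dimension at a time. A small self-contained sketch of that unravel step on a toy tensor, cross-checked against numpy's unravel_index:

import torch
import numpy as np

# toy logits: 3 candidates, 5 tokens each
logits_list = [ torch.randn(1, 5) for _ in range(3) ]
logits = torch.cat( logits_list )            # shape (3, 5)

k = 2
flat_indices = torch.topk(logits.flatten(), k).indices.tolist()

candidates = []
for index in flat_indices:
	t = []
	N = int(np.prod(logits.size()))          # total element count
	for n in logits.size():                  # peel off one dimension at a time
		N //= n
		t.append(index // N)
		index %= N
	candidates.append(tuple(t))              # (batch, token)

print(candidates)
print([ tuple(int(x) for x in np.unravel_index(i, logits.shape)) for i in flat_indices ])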
@@ -128,7 +144,7 @@ class Embedding(nn.Embedding):
 		return super().forward(torch.cat(x_list)).split([*map(len, x_list)])
 
-class MultiEmbedding(nn.Embedding):
+class MultiEmbedding(nn.Module):
 	"""
 	This embedding sums embeddings on different levels.
 	"""
@@ -468,21 +484,13 @@ class Base(nn.Module):
 		# do beam search (naive implementation)
 		# picks the top-k across all batches, and re-batches those resultant tokens
-		# this doesn't do any other mumbo with previous logits
+		# returns the logit scores as well to be P-concatted with the previous scores
 		# to-do: not naively implement beam searching
 		if beam_width > 1:
-			# ( batch, tokens ) => ( batch x tokens )
-			flattened = torch.cat( logits )
-			candidates = list(torch.topk(flattened.flatten(), beam_width).indices.tolist()) # perform top-k across all logits
-			for i, index in enumerate(candidates):
-				t = []
-				N = np.prod(flattened.size())
-				for n in flattened.size():
-					N //= n
-					t.append(index // N)
-					index %= N
-				candidates[i] = tuple(t)
-			return [ torch.tensor(token, device=logits[batch].device, dtype=torch.int16).unsqueeze(dim=-1) for batch, token in candidates ] #, [ logits[batch] for batch, token in candidates ]
+			candidates = top_k_logits_list( logits, beam_width )
+			res = [ torch.tensor(token, device=logits[batch].device, dtype=torch.int16).unsqueeze(dim=-1) for batch, token in candidates ]
+			scores = [ logits[batch].flatten()[token] for batch, token in candidates ]
+			return res, scores
 
 		# and sample
 		# the original implementation used this instead of argmax; it's probably placebo but it performs better than argmax
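Because the top-k is taken over all candidates' logits pooled together, the beam_width survivors are not guaranteed to be one per parent; two survivors can descend from the same candidate, which is part of why the search is labelled naive. The scores handed back alongside the tokens are the raw logit values of the selected entries. A toy illustration of the pooled selection, with hypothetical numbers independent of the model:

import torch

# logits for two beam candidates over a four-token vocabulary
logits = torch.stack([
	torch.tensor([ 2.0, 1.9, -1.0, 0.0 ]),   # candidate 0
	torch.tensor([ 1.5, 0.2, -2.0, 0.3 ]),   # candidate 1
])
beam_width = 2

flat = torch.topk(logits.flatten(), beam_width).indices
pairs = [ (int(i) // logits.shape[1], int(i) % logits.shape[1]) for i in flat ]
scores = [ float(logits[b, t]) for b, t in pairs ]

print(pairs, scores)    # [(0, 0), (0, 1)]: both survivors come from candidate 0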

View File

@@ -152,10 +152,10 @@ def run_eval(engines, disabled_engines, eval_name, dl):
 	stats = {k: sum(v) / len(v) for k, v in stats.items()}
 
-	engines_stats.update({ f'{name}.{eval_name}': stats })
-
-	iteration = engines.global_step
-	engines_stats['it'] = iteration
+	engines_stats = {
+		f'{name}.{eval_name}': stats,
+		"it": engines.global_step,
+	}
 	#engines_stats['epoch'] = iteration * cfg.hyperparameters.gradient_accumulation_steps / len(dl)
 
 	_logger.info(f"Validation Metrics: {json.dumps(engines_stats)}.")