It's a miracle I was able to get a semblance of audio with the naive AudioEncoder (now it interleaves properly)

2025-02-24 14:39:12 -06:00 · 2025-02-24 14:39:12 -06:00 · 33d5a7109a
commit 33d5a7109a
parent 6e7b269147
1 changed files with 27 additions and 7 deletions
--- a/vall_e/models/base.py
+++ b/vall_e/models/base.py
@ -354,36 +354,55 @@ class AudioEncoder(nn.Module):
 		self.proj = nn.Linear(8 * token_dim, 1 * token_dim)
 	def forward(self, xi: Tensor, dropout_mask = None, dropout_token = None  ) -> Tensor:
 		"""
 		Embed every codebook level of `xi` and project the result down to one vector per timestep.
 		Each entry of `self.embs` embeds its own column `xi[:, l]`; the per-level embeddings are
 		concatenated along the feature axis and passed through `self.proj`.
 		Args:
 			xi: integer codes, indexed as `xi[:, level]` (presumably (seq_len, n_levels) -- confirm against caller).
 			dropout_mask: when not None, `xi` is first passed through `_dropout_codes` with this mask.
 			dropout_token: forwarded to `_dropout_codes` together with `dropout_mask`.
 		Returns:
 			Tensor with one row per row of `xi` and `self.proj.weight.shape[0]` columns.
 		"""
 		# empty
 		if xi.shape[0] == 0:
 			return torch.zeros((0, self.proj.weight.shape[0]), device=xi.device)
 		if dropout_mask is not None:
 			xi = _dropout_codes( xi, dropout_mask, dropout_token )
 		# one embedding tensor per level, each with one row per timestep
 		x = [ emb( xi[:, l] ) for l, emb in enumerate(self.embs) ]
 		# lay the levels side by side for every timestep
 		# NOTE: row i is [ x[0][i], x[1][i], ... ], which is exactly what building each row
 		# with a per-timestep cat and stacking the rows produces, minus the Python-level loop
 		x = torch.cat( x, dim=-1 )
 		# project the concatenated levels back down to the width of `self.proj`'s output
 		return self.proj(x)
 # Pseudo-MoE by doing additional decoding from the main transformer's last hidden output
 # ironically, not using a classifier to hidden_dim => audio_tokens causes problems with fitment
 class AudioDecoder(nn.Module):
 	def __init__(
 		self,
 		d_model,
 		hidden_size,
 		vocab_size,
 		resp_levels,
 	):
 		super().__init__()
 		self.vocab_size = vocab_size
 		self.up = nn.Linear( d_model, hidden_size )
-		self.down = nn.Linear( hidden_size, vocab_size )
+		self.down = nn.Linear( hidden_size, vocab_size * resp_levels )
 	def forward(self, x: Tensor, level: int | None = None, stack: bool = True, **kwargs ) -> Tensor:
 		x = self.up( x )
 		x = self.down( x )
 		# interleave by reshaping / permuting
 		# at least I hope this does it properly
 		batch_size, seq_len, dim = x.shape
 		x = x.reshape( batch_size, seq_len, 8, dim // 8 )
 		x = x.permute( 0, 2, 1, 3 )
@ -738,7 +757,8 @@ class Base(nn.Module):
 			self.audio_decoder = AudioDecoder(
 				d_model,
 				d_model * 2,
-				(n_audio_tokens + 1) * self.n_resp_levels,
+				(n_audio_tokens + 1),
 				self.n_resp_levels,
 			)
 		if attention_backend == "auto":