From 476d87d4aae8a75c4f8a21a6674901379e35b7cb Mon Sep 17 00:00:00 2001
From: mrq
Date: Tue, 25 Mar 2025 21:33:01 -0500
Subject: [PATCH] cringe fix because I guess I moved which logit gets trained
 for len duration (I should probably rethink this)

---
 docs/models_v2.md        | 3 +++
 vall_e/models/base_v2.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/models_v2.md b/docs/models_v2.md
index 91eabbb..7ab3996 100644
--- a/docs/models_v2.md
+++ b/docs/models_v2.md
@@ -145,6 +145,9 @@ These settings should be avoided:
 To be evaluated thoroughly.
 
 * The smaller model seems to have hit its capacity limit, while the larger model is slowly improving (although objective metrics are not noted).
+* The model seems pretty quick, even for the large model.
+* The smaller model seems small enough for CPU-only inferencing
+  * Despite its poor zero-shot performance, it could be perfectly fine for finetuning.
 
 At a glance, compared to the prior model setup, this implementation allows for the model to better represent speech as it's able to see the entire signal and account for it in its latent space, rather than only specific levels of it.
 
diff --git a/vall_e/models/base_v2.py b/vall_e/models/base_v2.py
index 258205f..f7b4309 100644
--- a/vall_e/models/base_v2.py
+++ b/vall_e/models/base_v2.py
@@ -765,7 +765,7 @@ class Base_V2(nn.Module):
 
         # needed, cringe
         if task_type == "len":
-            batch[-1] = torch.cat( [ batch[-1], self.sep[None] ] )
+            batch[-1] = torch.cat( [ batch[-1], self.sep[None], self.sep[None] ] )
 
         x_list.append( _join( batch, self.sep ) )
 
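Note: the sketch below is a rough illustration of what the base_v2.py change does, not the repository's actual training code. For the "len" task, the last segment in batch now gets two trailing separator embeddings instead of one before _join, so the assembled input is one position longer and the position whose logit is trained to predict the duration shifts by one step. The sep tensor, the segment shapes, and the _join helper here are all stand-in assumptions.

import torch

def _join(segments, sep):
    # stand-in for the real _join: interleave a separator embedding between segments
    out = []
    for i, seg in enumerate(segments):
        out.append(seg)
        if i < len(segments) - 1:
            out.append(sep[None])
    return torch.cat(out, dim=0)

d_model = 8
sep = torch.zeros(d_model)                                   # stand-in separator embedding
batch = [torch.randn(5, d_model), torch.randn(3, d_model)]   # e.g. text + prompt segments

# old behaviour: one trailing separator appended to the last segment
old = batch[:-1] + [torch.cat([batch[-1], sep[None]])]
# new behaviour: two trailing separators, shifting the position whose logit is supervised
new = batch[:-1] + [torch.cat([batch[-1], sep[None], sep[None]])]

print(_join(old, sep).shape[0], _join(new, sep).shape[0])    # the new input is one token longer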