From 98b357cc535dc055860a34b35c9fecf783b7ee67 Mon Sep 17 00:00:00 2001 From: mrq Date: Fri, 30 May 2025 22:56:07 -0500 Subject: [PATCH] things i forgot to do last week now that some mental faculties were restored --- docs/models_v2.md | 8 +++++++- vall_e/inference.py | 2 ++ vall_e/models/ar_nar_v2.py | 4 +++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/models_v2.md b/docs/models_v2.md index aee359b..1767262 100644 --- a/docs/models_v2.md +++ b/docs/models_v2.md @@ -194,4 +194,10 @@ However, output leaves a lot to be desired: * both the small and the large model seemed to have hit a "capacity" limit * the "confidence" problem of the prior implementation seems to have emerged even for typical speakers * some other quirks and emergent behaviors inherent to the model I'm not aware of / can't recall - * such as the demasking sampler loop being quite particular \ No newline at end of file + * such as the demasking sampler loop being quite particular +* naturally, LoRAs are trainable: + * at a glance it seems to address the problems of poor/inconsistent zero-shot performance + * training a LoRA is agonizing because the loss doesn't progress anywhere near as nicely as it does against EnCodec-based models + * however, there seems to be a problem when predicting the duration that causes it to be too short (when the input prompt is of the speaker) or too long (when the input prompt is not of the speaker) + * simply disabling the LoRA specifically for duration prediction seems to fix this + * additional testing against LoRAs is necessary to draw further conclusions \ No newline at end of file diff --git a/vall_e/inference.py b/vall_e/inference.py index 87d9d37..7f41aa3 100644 --- a/vall_e/inference.py +++ b/vall_e/inference.py @@ -114,6 +114,8 @@ class TTS(): return text # check if tokenizes without any unks (for example, if already phonemized text is passes) + # to-do: properly fix this + # - i don't remember what specific situation arose where 
phonemized text is already passed in to warrant the need to detect it """ if precheck and "" in self.symmap: tokens = tokenize( text ) diff --git a/vall_e/models/ar_nar_v2.py b/vall_e/models/ar_nar_v2.py index 0a2639b..43f22ce 100644 --- a/vall_e/models/ar_nar_v2.py +++ b/vall_e/models/ar_nar_v2.py @@ -400,7 +400,9 @@ class AR_NAR_V2(Base_V2): batch_size = len(proms_list) if cfg.lora is not None: - enable_lora( self, cfg.lora.active_level( 0 ) if use_lora is None else use_lora ) + # enable_lora( self, cfg.lora.active_level( 0 ) if use_lora is None else use_lora ) + # force disable LoRAs for this + enable_lora( self, False ) task_list = [ "len" for _ in range( batch_size ) ] quant_levels = [ 0 for _ in range( batch_size ) ]