From eea70f569868ca2b3b5f91e6240272f7238ad9af Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Tue, 15 Oct 2024 19:25:03 -0500
Subject: [PATCH] kludge fix (repeat-extend the input prompt instead of
 trimming it) for an oversight in the model when trying to train for longer
 input prompt durations

---
 vall_e/inference.py | 10 ++++++++--
 vall_e/webui.py     |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/vall_e/inference.py b/vall_e/inference.py
index cc57114..dc5a8f0 100755
--- a/vall_e/inference.py
+++ b/vall_e/inference.py
@@ -11,7 +11,7 @@ from einops import rearrange
 from pathlib import Path
 
 from .emb import g2p, qnt
-from .emb.qnt import trim, trim_random, unload_model
+from .emb.qnt import trim, trim_random, unload_model, repeat_extend_audio
 from .utils import to_device, set_seed, wrapper as ml
 
 from .config import cfg, Config
@@ -103,7 +103,7 @@ class TTS():
 		return torch.tensor([ id ])
 
 	# to-do: trim before quantizing, instead of after
-	def encode_audio( self, paths, trim_length=0.0 ):
+	def encode_audio( self, paths, trim_length=9.0 ):
 		# already a tensor, return it
 		if isinstance( paths, Tensor ):
 			return paths
@@ -126,8 +126,14 @@ class TTS():
 
 		res = torch.cat(proms)
 		
+		# kludge: call repeat_extend_audio here instead of trim (old trim path kept disabled below) to correct an oversight in training
+		if trim_length:
+			res = repeat_extend_audio( res, cfg.dataset.frames_per_second * trim_length )
+
+		"""
 		if trim_length:
 			res = trim( res, int( cfg.dataset.frames_per_second * trim_length ) )
+		"""
 		
 		return res
 
diff --git a/vall_e/webui.py b/vall_e/webui.py
index 589af0a..97f166e 100644
--- a/vall_e/webui.py
+++ b/vall_e/webui.py
@@ -346,7 +346,7 @@ with ui:
 						with gr.Row():
 							layout["inference_tts"]["inputs"]["max-seconds"] = gr.Slider(value=12, minimum=1, maximum=32, step=0.1, label="Maximum Seconds", info="Limits how many steps to perform in the AR pass.")
 							#layout["inference_tts"]["inputs"]["max-nar-levels"] = gr.Slider(value=7, minimum=0, maximum=7, step=1, label="Max NAR Levels", info="Limits how many steps to perform in the NAR pass.")
-							layout["inference_tts"]["inputs"]["input-prompt-length"] = gr.Slider(value=3.0, minimum=0.0, maximum=12.0, step=0.05, label="Input Prompt Trim Length", info="Trims the input prompt down to X seconds. Set 0 to disable.")
+							layout["inference_tts"]["inputs"]["input-prompt-length"] = gr.Slider(value=9.0, minimum=0.0, maximum=12.0, step=0.05, label="Input Prompt Trim Length", info="Trims the input prompt down to X seconds. Set 0 to disable.")
 						with gr.Row():
 							layout["inference_tts"]["inputs"]["ar-temp"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR)", info="Modifies the randomness from the samples in the AR. (0 to greedy sample)")
 							layout["inference_tts"]["inputs"]["nar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (NAR)", info="Modifies the randomness from the samples in the NAR. (0 to greedy sample)")