From 2a084544e8859fa8db4880b2ea85acc358c2c2b4 Mon Sep 17 00:00:00 2001 From: mrq Date: Thu, 21 Nov 2024 13:04:07 -0600 Subject: [PATCH] moved duration padding for NAR-len to be a scalar instead (since it seems longer utterances need it much more so than shorter utterances) --- vall_e/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vall_e/inference.py b/vall_e/inference.py index fbc220f..d65aa68 100755 --- a/vall_e/inference.py +++ b/vall_e/inference.py @@ -272,13 +272,13 @@ class TTS(): with torch.autocast("cuda", dtype=self.dtype, enabled=self.amp): if model_len is not None: # extra kwargs - duration_padding = sampling_kwargs.pop("duration_padding", 1) + duration_padding = sampling_kwargs.pop("duration_padding", 1.05) nar_len_prefix_length = sampling_kwargs.pop("nar_len_prefix_length", 0) len_list = model_len( text_list=[phns], proms_list=[prom], task_list=["len"], disable_tqdm=not tqdm, **{"max_duration": 5} ) # don't need more than that # add an additional X seconds - len_list = [ l + duration_padding * cfg.dataset.frames_per_second for l in len_list ] + len_list = [ l * duration_padding for l in len_list ] kwargs = {} # nasty hardcode to load a reference file and have that as the input target