moved duration padding for NAR-len to be a scalar instead (since longer utterances seem to need it much more than shorter ones do)

mrq 2024-11-21 13:04:07 -06:00
parent 6aee08f9c0
commit 2a084544e8


@@ -272,13 +272,13 @@ class TTS():
 	with torch.autocast("cuda", dtype=self.dtype, enabled=self.amp):
 		if model_len is not None:
 			# extra kwargs
-			duration_padding = sampling_kwargs.pop("duration_padding", 1)
+			duration_padding = sampling_kwargs.pop("duration_padding", 1.05)
 			nar_len_prefix_length = sampling_kwargs.pop("nar_len_prefix_length", 0)
 			len_list = model_len( text_list=[phns], proms_list=[prom], task_list=["len"], disable_tqdm=not tqdm, **{"max_duration": 5} ) # don't need more than that
 			# add an additional X seconds
-			len_list = [ l + duration_padding * cfg.dataset.frames_per_second for l in len_list ]
+			len_list = [ l * duration_padding * cfg.dataset.frames_per_second for l in len_list ]
 			kwargs = {}
 			# nasty hardcode to load a reference file and have that as the input target
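For context, a minimal sketch of the difference between the old additive padding and the new scalar padding. This is illustrative only, not the project's code: the frame rate and predicted durations are made-up values, and it assumes the predicted length is a duration that gets converted to codec frames.

    # illustrative only: hypothetical frame rate and predicted durations
    frames_per_second = 75                      # assumed codec frame rate
    predicted_seconds = [1.0, 10.0]             # a short and a long utterance

    # old behaviour: always add one extra second of frames, regardless of length
    additive = [int((s + 1.0) * frames_per_second) for s in predicted_seconds]   # [150, 825]

    # new behaviour: scale the duration by 1.05, so longer utterances gain more headroom
    scalar = [int(s * 1.05 * frames_per_second) for s in predicted_seconds]      # [78, 787]

Under the additive scheme both utterances gain the same 75 frames of headroom; under the scalar scheme the short one gains roughly 4 frames while the long one gains roughly 38, which matches the reasoning in the commit message.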