From 2a084544e8859fa8db4880b2ea85acc358c2c2b4 Mon Sep 17 00:00:00 2001 From: mrq Date: Thu, 21 Nov 2024 13:04:07 -0600 Subject: [PATCH] moved duration padding for NAR-len to be a scalar instead (since it seems longer utterances need it much more so than shorter utterances) --- vall_e/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vall_e/inference.py b/vall_e/inference.py index fbc220f..d65aa68 100755 --- a/vall_e/inference.py +++ b/vall_e/inference.py @@ -272,13 +272,13 @@ class TTS(): with torch.autocast("cuda", dtype=self.dtype, enabled=self.amp): if model_len is not None: # extra kwargs - duration_padding = sampling_kwargs.pop("duration_padding", 1) + duration_padding = sampling_kwargs.pop("duration_padding", 1.05) nar_len_prefix_length = sampling_kwargs.pop("nar_len_prefix_length", 0) len_list = model_len( text_list=[phns], proms_list=[prom], task_list=["len"], disable_tqdm=not tqdm, **{"max_duration": 5} ) # don't need more than that # add an additional X seconds - len_list = [ l + duration_padding * cfg.dataset.frames_per_second for l in len_list ] + len_list = [ l * duration_padding for l in len_list ] kwargs = {} # nasty hardcode to load a reference file and have that as the input target