added prefixing with silence (was to test something, currently hidden under cfg.experimental=True)
This commit is contained in:
parent
6b04c13c56
commit
71731ed785
|
@ -184,6 +184,7 @@ class TTS():
|
|||
#
|
||||
input_prompt_length=0.0,
|
||||
input_prompt_prefix=False,
|
||||
prefix_silence=0.0,
|
||||
#
|
||||
ar_temp=0.0,
|
||||
nar_temp=0.0,
|
||||
|
@ -295,6 +296,7 @@ class TTS():
|
|||
resps_list = model_ar(
|
||||
text_list=[phns], proms_list=[prom], lang_list=[lang], max_steps=max_ar_steps,
|
||||
input_prompt_prefix=input_prompt_prefix,
|
||||
prefix_silence=prefix_silence,
|
||||
sampling_temperature=ar_temp,
|
||||
sampling_min_temperature=min_ar_temp,
|
||||
sampling_top_p=top_p, sampling_top_k=top_k, sampling_min_p=min_p,
|
||||
|
|
|
@ -43,6 +43,9 @@ class AR(Base):
|
|||
max_steps: int = 1000,
|
||||
max_levels: int = 0,
|
||||
|
||||
input_prompt_prefix: bool = False,
|
||||
prefix_silence: float = 1.0,
|
||||
|
||||
sampling_temperature: float = 1.0,
|
||||
sampling_min_temperature: float = -1.0,
|
||||
sampling_top_k: int = -100,
|
||||
|
|
|
@ -23,7 +23,7 @@ import logging
|
|||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
from ..emb.qnt import trim, encode_as_embedding
|
||||
from ..emb.qnt import trim, encode_as_embedding, get_silence
|
||||
from ..utils import get_devices, setup_logging, timer
|
||||
|
||||
from .lora import enable_lora
|
||||
|
@ -49,6 +49,7 @@ class AR_NAR(Base):
|
|||
max_levels: int = 0,
|
||||
|
||||
input_prompt_prefix: bool = False,
|
||||
prefix_silence: float = 1.0,
|
||||
|
||||
sampling_temperature: float = 1.0,
|
||||
sampling_min_temperature: float = -1.0,
|
||||
|
@ -284,6 +285,10 @@ class AR_NAR(Base):
|
|||
elif input_prompt_prefix:
|
||||
start_slice[i] = proms_list[i].shape[0]
|
||||
sequence_list[i], proms_list[i] = proms_list[i][:, 0], sequence_list[i]
|
||||
elif prefix_silence > 0:
|
||||
sequence_list[i] = get_silence(prefix_silence, device=sequence_list[i].device)
|
||||
sequence_list[i] = sequence_list[i][:, 0]
|
||||
# start_slice[i] = sequence_list[i].shape[0]
|
||||
|
||||
# get next in sequence
|
||||
for n in trange(max_steps // max(1, self.causal_size), desc="AR", disable=disable_tqdm):
|
||||
|
@ -295,7 +300,6 @@ class AR_NAR(Base):
|
|||
# naturally, rep pen wrangles this initial burst of noise, but naively relying on rep_pen is no good, as it fails after ~6 seconds of audio
|
||||
# however, switching to a default sampling temperature with "clean greedy sampled codes" will make the rest of sequence sound as if it were greedy sampled
|
||||
# to-do: tune these values, maybe have it factor based on confidence scores or something
|
||||
# to-do: see if instead just prefixing with blank audio overcomes the initla noise anyways
|
||||
if low_temperature:
|
||||
enabled = n < low_temperature_range
|
||||
sampling_repetition_penalty = 1.35 if enabled else original_sampling_repetition_penalty
|
||||
|
|
|
@ -147,14 +147,14 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|||
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
||||
parser.add_argument("--language", type=str, default=kwargs["language"])
|
||||
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
||||
#parser.add_argument("--input-prompt-prefix", action='store_true', default=kwargs["input-prompt-prefix"])
|
||||
parser.add_argument("--input-prompt-prefix", action='store_true')
|
||||
parser.add_argument("--input-prompt-prefix", action='store_true', default=kwargs["input-prompt-prefix"] if cfg.experimental else False)
|
||||
parser.add_argument("--max-ar-steps", type=int, default=int(kwargs["max-seconds"]*cfg.dataset.frames_per_second))
|
||||
parser.add_argument("--max-nar-levels", type=int, default=0), # kwargs["max-nar-levels"])
|
||||
parser.add_argument("--max-nar-levels", type=int, default=kwargs["max-nar-levels"] if cfg.experimental else 0)
|
||||
parser.add_argument("--ar-temp", type=float, default=kwargs["ar-temp"])
|
||||
parser.add_argument("--nar-temp", type=float, default=kwargs["nar-temp"])
|
||||
parser.add_argument("--min-ar-temp", type=float, default=kwargs["min-ar-temp"])
|
||||
parser.add_argument("--min-nar-temp", type=float, default=kwargs["min-nar-temp"])
|
||||
parser.add_argument("--prefix-silence", type=float, default=kwargs["prefix-silence"] if cfg.experimental else 0)
|
||||
parser.add_argument("--top-p", type=float, default=kwargs["top-p"])
|
||||
parser.add_argument("--top-k", type=int, default=kwargs["top-k"])
|
||||
parser.add_argument("--min-p", type=float, default=kwargs["min-p"])
|
||||
|
@ -195,6 +195,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|||
max_nar_levels=args.max_nar_levels,
|
||||
input_prompt_length=args.input_prompt_length,
|
||||
input_prompt_prefix=args.input_prompt_prefix,
|
||||
prefix_silence=args.prefix_silence,
|
||||
ar_temp=args.ar_temp,
|
||||
nar_temp=args.nar_temp,
|
||||
min_ar_temp=args.min_ar_temp,
|
||||
|
@ -345,13 +346,16 @@ with ui:
|
|||
with gr.Tab("Basic Settings"):
|
||||
with gr.Row():
|
||||
layout["inference_tts"]["inputs"]["max-seconds"] = gr.Slider(value=12, minimum=1, maximum=32, step=0.1, label="Maximum Seconds", info="Limits how many steps to perform in the AR pass.")
|
||||
#layout["inference_tts"]["inputs"]["max-nar-levels"] = gr.Slider(value=7, minimum=0, maximum=7, step=1, label="Max NAR Levels", info="Limits how many steps to perform in the NAR pass.")
|
||||
if cfg.experimental:
|
||||
layout["inference_tts"]["inputs"]["max-nar-levels"] = gr.Slider(value=7, minimum=0, maximum=7, step=1, label="Max NAR Levels", info="Limits how many steps to perform in the NAR pass.")
|
||||
layout["inference_tts"]["inputs"]["input-prompt-length"] = gr.Slider(value=5.0, minimum=0.0, maximum=12.0, step=0.05, label="Input Prompt Repeat/Trim Length", info="Repeats and trims the input prompt down to X seconds. Set 0 to disable.")
|
||||
with gr.Row():
|
||||
layout["inference_tts"]["inputs"]["ar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR)", info="Modifies the randomness from the samples in the AR. (0 to greedy* sample)")
|
||||
layout["inference_tts"]["inputs"]["nar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (NAR)", info="Modifies the randomness from the samples in the NAR. (0 to greedy sample)")
|
||||
with gr.Row():
|
||||
#layout["inference_tts"]["inputs"]["input-prompt-prefix"] = gr.Checkbox(label="Input Prompt as Prefix", info="Treats the input prompt clip as the prefix of the generated sequence.")
|
||||
if cfg.experimental:
|
||||
layout["inference_tts"]["inputs"]["input-prompt-prefix"] = gr.Checkbox(label="Input Prompt as Prefix", info="Treats the input prompt clip as the prefix of the generated sequence.")
|
||||
layout["inference_tts"]["inputs"]["prefix-silence"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.0, step=0.05, label="Silence Prefix Duration", info="Amount of silence to prefix to the output response before beginning inference.")
|
||||
layout["inference_tts"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|
||||
if cfg.experimental:
|
||||
layout["inference_tts"]["inputs"]["entropix-sampling"] = gr.Checkbox(label="Entropix Sampling", info="Dynamically samples based on entropy/varentropy values from the logits / attention scores.")
|
||||
|
|
Loading…
Reference in New Issue
Block a user