added language selection in web UI, tweaked demo script
This commit is contained in:
parent
10df2ef5f3
commit
2f1dca3089
|
@ -43,7 +43,7 @@ def main():
|
||||||
parser.add_argument("--demo-dir", type=Path, default=None)
|
parser.add_argument("--demo-dir", type=Path, default=None)
|
||||||
parser.add_argument("--skip-existing", action="store_true")
|
parser.add_argument("--skip-existing", action="store_true")
|
||||||
parser.add_argument("--sample-from-dataset", action="store_true")
|
parser.add_argument("--sample-from-dataset", action="store_true")
|
||||||
parser.add_argument("--load-from-dataloader", action="store_true")
|
parser.add_argument("--skip-loading-dataloader", action="store_true")
|
||||||
parser.add_argument("--dataset-samples", type=int, default=0)
|
parser.add_argument("--dataset-samples", type=int, default=0)
|
||||||
parser.add_argument("--audio-path-root", type=str, default=None)
|
parser.add_argument("--audio-path-root", type=str, default=None)
|
||||||
parser.add_argument("--preamble", type=str, default=None)
|
parser.add_argument("--preamble", type=str, default=None)
|
||||||
|
@ -89,7 +89,7 @@ def main():
|
||||||
if not args.preamble:
|
if not args.preamble:
|
||||||
args.preamble = "<br>".join([
|
args.preamble = "<br>".join([
|
||||||
'Below are some samples from my VALL-E implementation: <a href="https://git.ecker.tech/mrq/vall-e/">https://git.ecker.tech/mrq/vall-e/</a>.',
|
'Below are some samples from my VALL-E implementation: <a href="https://git.ecker.tech/mrq/vall-e/">https://git.ecker.tech/mrq/vall-e/</a>.',
|
||||||
'I do not consider these to be state of the art, as the model does not follow close to the prompt as I would like for general speakers.',
|
'Unlike the original VALL-E demo page, I\'m placing emphasis on the input prompt, as the model adheres to it stronger than others.',
|
||||||
])
|
])
|
||||||
|
|
||||||
# read html template
|
# read html template
|
||||||
|
@ -115,13 +115,14 @@ def main():
|
||||||
"librispeech": args.demo_dir / "librispeech",
|
"librispeech": args.demo_dir / "librispeech",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (args.demo_dir / "dataset").exists():
|
||||||
|
samples_dirs["dataset"] = args.demo_dir / "dataset"
|
||||||
|
|
||||||
# pull from dataset samples
|
# pull from dataset samples
|
||||||
if args.sample_from_dataset:
|
if args.sample_from_dataset:
|
||||||
cfg.dataset.cache = False
|
cfg.dataset.cache = False
|
||||||
|
|
||||||
samples_dirs["dataset"] = args.demo_dir / "dataset"
|
samples_dirs["dataset"] = args.demo_dir / "dataset"
|
||||||
|
|
||||||
if args.load_from_dataloader:
|
|
||||||
_logger.info("Loading dataloader...")
|
_logger.info("Loading dataloader...")
|
||||||
dataloader = create_train_dataloader()
|
dataloader = create_train_dataloader()
|
||||||
_logger.info("Loaded dataloader.")
|
_logger.info("Loaded dataloader.")
|
||||||
|
@ -182,6 +183,7 @@ def main():
|
||||||
if args.skip_existing and out_path.exists():
|
if args.skip_existing and out_path.exists():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
tts.inference(
|
tts.inference(
|
||||||
text=text,
|
text=text,
|
||||||
references=[prompt],
|
references=[prompt],
|
||||||
|
@ -199,6 +201,8 @@ def main():
|
||||||
seed=args.seed,
|
seed=args.seed,
|
||||||
tqdm=False,
|
tqdm=False,
|
||||||
)
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(f'Error while processing {out_path}: {e}')
|
||||||
|
|
||||||
# collate entries into HTML
|
# collate entries into HTML
|
||||||
samples = [
|
samples = [
|
||||||
|
|
|
@ -22,6 +22,7 @@ from .train import train
|
||||||
from .utils import get_devices, setup_logging
|
from .utils import get_devices, setup_logging
|
||||||
from .utils.io import json_read, json_stringify
|
from .utils.io import json_read, json_stringify
|
||||||
from .emb.qnt import decode_to_wave
|
from .emb.qnt import decode_to_wave
|
||||||
|
from .data import get_lang_symmap
|
||||||
|
|
||||||
tts = None
|
tts = None
|
||||||
|
|
||||||
|
@ -100,6 +101,9 @@ def load_model( yaml, device, dtype, attention ):
|
||||||
def get_speakers():
|
def get_speakers():
|
||||||
return cfg.dataset.training
|
return cfg.dataset.training
|
||||||
|
|
||||||
|
def get_languages():
	"""Return the language identifiers available for selection.

	The identifiers are the keys of the mapping returned by
	``get_lang_symmap()``. They are materialised into a ``list`` rather than
	handed back as a live ``dict_keys`` view, so the result is a plain,
	indexable snapshot that can be passed directly as the ``choices`` of a
	UI dropdown.
	"""
	return list(get_lang_symmap().keys())
|
||||||
|
|
||||||
#@gradio_wrapper(inputs=layout["dataset"]["inputs"].keys())
|
#@gradio_wrapper(inputs=layout["dataset"]["inputs"].keys())
|
||||||
def load_sample( speaker ):
|
def load_sample( speaker ):
|
||||||
metadata_path = cfg.metadata_dir / f'{speaker}.json'
|
metadata_path = cfg.metadata_dir / f'{speaker}.json'
|
||||||
|
@ -158,7 +162,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
||||||
parser.add_argument("--text", type=str, default=kwargs["text"])
|
parser.add_argument("--text", type=str, default=kwargs["text"])
|
||||||
parser.add_argument("--task", type=str, default="tts")
|
parser.add_argument("--task", type=str, default="tts")
|
||||||
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
||||||
parser.add_argument("--language", type=str, default="en")
|
parser.add_argument("--language", type=str, default=kwargs["language"])
|
||||||
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
||||||
parser.add_argument("--max-ar-steps", type=int, default=int(kwargs["max-seconds"]*cfg.dataset.frames_per_second))
|
parser.add_argument("--max-ar-steps", type=int, default=int(kwargs["max-seconds"]*cfg.dataset.frames_per_second))
|
||||||
parser.add_argument("--max-nar-levels", type=int, default=0), # kwargs["max-nar-levels"])
|
parser.add_argument("--max-nar-levels", type=int, default=0), # kwargs["max-nar-levels"])
|
||||||
|
@ -231,7 +235,7 @@ def do_inference_stt( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
||||||
parser = argparse.ArgumentParser(allow_abbrev=False)
|
parser = argparse.ArgumentParser(allow_abbrev=False)
|
||||||
# I'm very sure I can procedurally generate this list
|
# I'm very sure I can procedurally generate this list
|
||||||
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
||||||
parser.add_argument("--language", type=str, default="en")
|
parser.add_argument("--language", type=str, default=kwargs["language"])
|
||||||
parser.add_argument("--max-ar-steps", type=int, default=0)
|
parser.add_argument("--max-ar-steps", type=int, default=0)
|
||||||
parser.add_argument("--ar-temp", type=float, default=kwargs["ar-temp"])
|
parser.add_argument("--ar-temp", type=float, default=kwargs["ar-temp"])
|
||||||
parser.add_argument("--min-ar-temp", type=float, default=kwargs["min-ar-temp"])
|
parser.add_argument("--min-ar-temp", type=float, default=kwargs["min-ar-temp"])
|
||||||
|
@ -381,6 +385,7 @@ with ui:
|
||||||
layout["inference_tts"]["inputs"]["nar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (NAR)", info="Modifies the randomness from the samples in the NAR. (0 to greedy sample)")
|
layout["inference_tts"]["inputs"]["nar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (NAR)", info="Modifies the randomness from the samples in the NAR. (0 to greedy sample)")
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
layout["inference_tts"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|
layout["inference_tts"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|
||||||
|
layout["inference_tts"]["inputs"]["language"] = gr.Dropdown(choices=get_languages(), label="Language", value="en")
|
||||||
with gr.Tab("Sampler Settings"):
|
with gr.Tab("Sampler Settings"):
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
layout["inference_tts"]["inputs"]["top-p"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.0, step=0.05, label="Top P", info=r"Limits the samples that are outside the top P% of probabilities.")
|
layout["inference_tts"]["inputs"]["top-p"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.0, step=0.05, label="Top P", info=r"Limits the samples that are outside the top P% of probabilities.")
|
||||||
|
@ -419,7 +424,7 @@ with ui:
|
||||||
layout["inference_stt"]["inputs"]["ar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR)", info="Modifies the randomness from the samples in the AR. (0 to greedy sample)")
|
layout["inference_stt"]["inputs"]["ar-temp"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR)", info="Modifies the randomness from the samples in the AR. (0 to greedy sample)")
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
layout["inference_stt"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|
layout["inference_stt"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|
||||||
|
layout["inference_stt"]["inputs"]["language"] = gr.Dropdown(choices=get_languages(), label="Language", value="en")
|
||||||
with gr.Tab("Sampler Settings"):
|
with gr.Tab("Sampler Settings"):
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
layout["inference_stt"]["inputs"]["top-p"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.0, step=0.05, label="Top P", info=r"Limits the samples that are outside the top P% of probabilities.")
|
layout["inference_stt"]["inputs"]["top-p"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.0, step=0.05, label="Top P", info=r"Limits the samples that are outside the top P% of probabilities.")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user