added option to playback audio directly, removed no-phonemize option since I swear it worked in testing but it doesn't actually work
This commit is contained in:
parent
9fa87c417a
commit
1a26f789a5
1
setup.py
1
setup.py
@ -80,6 +80,7 @@ setup(
|
|||||||
"gradio",
|
"gradio",
|
||||||
"nltk", # for parsing text inputs down to pieces
|
"nltk", # for parsing text inputs down to pieces
|
||||||
"langdetect", # for detecting the language of a text
|
"langdetect", # for detecting the language of a text
|
||||||
|
"sounddevice", # for raw playback
|
||||||
],
|
],
|
||||||
extras_require = {
|
extras_require = {
|
||||||
"all": [
|
"all": [
|
||||||
|
|||||||
@ -74,6 +74,7 @@ def main():
|
|||||||
parser.add_argument("--amp", action="store_true")
|
parser.add_argument("--amp", action="store_true")
|
||||||
parser.add_argument("--dtype", type=str, default=None)
|
parser.add_argument("--dtype", type=str, default=None)
|
||||||
parser.add_argument("--attention", type=str, default=None)
|
parser.add_argument("--attention", type=str, default=None)
|
||||||
|
parser.add_argument("--play", action="store_true")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
config = None
|
config = None
|
||||||
@ -122,6 +123,7 @@ def main():
|
|||||||
task=args.task,
|
task=args.task,
|
||||||
modality=args.modality,
|
modality=args.modality,
|
||||||
out_path=args.out_path,
|
out_path=args.out_path,
|
||||||
|
play=args.play,
|
||||||
|
|
||||||
input_prompt_length=args.input_prompt_length,
|
input_prompt_length=args.input_prompt_length,
|
||||||
load_from_artifact=args.load_from_artifact,
|
load_from_artifact=args.load_from_artifact,
|
||||||
|
|||||||
@ -29,6 +29,11 @@ from .models import download_model, DEFAULT_MODEL_PATH
|
|||||||
if deepspeed_available:
|
if deepspeed_available:
|
||||||
import deepspeed
|
import deepspeed
|
||||||
|
|
||||||
|
try:
|
||||||
|
import sounddevice as sd
|
||||||
|
except Exception as e:
|
||||||
|
sd = None
|
||||||
|
|
||||||
class TTS():
|
class TTS():
|
||||||
def __init__( self, config=None, lora=None, device=None, amp=None, dtype=None, attention=None ):
|
def __init__( self, config=None, lora=None, device=None, amp=None, dtype=None, attention=None ):
|
||||||
self.loading = True
|
self.loading = True
|
||||||
@ -110,7 +115,7 @@ class TTS():
|
|||||||
return torch.tensor( tokens )
|
return torch.tensor( tokens )
|
||||||
|
|
||||||
if not phonemize:
|
if not phonemize:
|
||||||
return torch.tensor( text_tokenize( content ) )
|
return torch.tensor( text_tokenize( text ) )
|
||||||
|
|
||||||
return torch.tensor( tokenize( g2p.encode(text, language=language) ) )
|
return torch.tensor( tokenize( g2p.encode(text, language=language) ) )
|
||||||
|
|
||||||
@ -352,8 +357,12 @@ class TTS():
|
|||||||
text_language=None,
|
text_language=None,
|
||||||
task="tts",
|
task="tts",
|
||||||
out_path=None,
|
out_path=None,
|
||||||
|
play=False,
|
||||||
**sampling_kwargs,
|
**sampling_kwargs,
|
||||||
):
|
):
|
||||||
|
if sd is None:
|
||||||
|
play = False
|
||||||
|
|
||||||
input_prompt_length = sampling_kwargs.pop("input_prompt_length", 0)
|
input_prompt_length = sampling_kwargs.pop("input_prompt_length", 0)
|
||||||
modality = sampling_kwargs.pop("modality", "auto")
|
modality = sampling_kwargs.pop("modality", "auto")
|
||||||
seed = sampling_kwargs.pop("seed", None)
|
seed = sampling_kwargs.pop("seed", None)
|
||||||
@ -560,6 +569,11 @@ class TTS():
|
|||||||
# add utterances
|
# add utterances
|
||||||
wavs.append(wav)
|
wavs.append(wav)
|
||||||
|
|
||||||
|
if play:
|
||||||
|
sd.play(wav.cpu().numpy()[0], sr)
|
||||||
|
sd.wait()
|
||||||
|
|
||||||
|
|
||||||
# combine all utterances
|
# combine all utterances
|
||||||
return (torch.concat(wavs, dim=-1), sr)
|
return (torch.concat(wavs, dim=-1), sr)
|
||||||
|
|
||||||
|
|||||||
@ -716,7 +716,10 @@ class AR_NAR(Base):
|
|||||||
text_list = [ sequence_list[i] if task in ["phn"] else text_list[i] for i, task in enumerate(task_list) ]
|
text_list = [ sequence_list[i] if task in ["phn"] else text_list[i] for i, task in enumerate(task_list) ]
|
||||||
raw_text_list = [ sequence_list[i] if task in ["un-phn"] else raw_text_list[i] for i, task in enumerate(task_list) ]
|
raw_text_list = [ sequence_list[i] if task in ["un-phn"] else raw_text_list[i] for i, task in enumerate(task_list) ]
|
||||||
else:
|
else:
|
||||||
text_list = [ sequence_list[i] if task in text_task else text_list[i] for i, task in enumerate(task_list) ]
|
if raw_text_list is not None:
|
||||||
|
raw_text_list = [ sequence_list[i] if task in text_task else raw_text_list[i] for i, task in enumerate(task_list) ]
|
||||||
|
else:
|
||||||
|
text_list = [ sequence_list[i] if task in text_task else text_list[i] for i, task in enumerate(task_list) ]
|
||||||
resps_list = [ sequence_list[i] if task not in text_task else resps_list[i] for i, task in enumerate(task_list) ]
|
resps_list = [ sequence_list[i] if task not in text_task else resps_list[i] for i, task in enumerate(task_list) ]
|
||||||
|
|
||||||
quant_levels = [ 0 for _ in range( max( batch_size, beam_width ) ) ]
|
quant_levels = [ 0 for _ in range( max( batch_size, beam_width ) ) ]
|
||||||
|
|||||||
@ -218,6 +218,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|||||||
parser.add_argument("--language", type=str, default=kwargs["language"])
|
parser.add_argument("--language", type=str, default=kwargs["language"])
|
||||||
parser.add_argument("--text-language", type=str, default=kwargs["text-language"])
|
parser.add_argument("--text-language", type=str, default=kwargs["text-language"])
|
||||||
parser.add_argument("--no-phonemize", action="store_true")
|
parser.add_argument("--no-phonemize", action="store_true")
|
||||||
|
parser.add_argument("--play", action="store_true")
|
||||||
parser.add_argument("--split-text-by", type=str, default=kwargs["split-text-by"])
|
parser.add_argument("--split-text-by", type=str, default=kwargs["split-text-by"])
|
||||||
parser.add_argument("--context-history", type=int, default=kwargs["context-history"])
|
parser.add_argument("--context-history", type=int, default=kwargs["context-history"])
|
||||||
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
||||||
@ -274,7 +275,10 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|||||||
args.refine_on_stop = True
|
args.refine_on_stop = True
|
||||||
|
|
||||||
if kwargs.pop("no-phonemize", False):
|
if kwargs.pop("no-phonemize", False):
|
||||||
args.no_phonemize = False
|
args.no_phonemize = True
|
||||||
|
|
||||||
|
if kwargs.pop("play", False):
|
||||||
|
args.play = True
|
||||||
|
|
||||||
if args.split_text_by == "lines":
|
if args.split_text_by == "lines":
|
||||||
args.split_text_by = "\n"
|
args.split_text_by = "\n"
|
||||||
@ -324,6 +328,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|||||||
language=args.language,
|
language=args.language,
|
||||||
text_language=args.text_language,
|
text_language=args.text_language,
|
||||||
task=args.task,
|
task=args.task,
|
||||||
|
play=args.play,
|
||||||
modality=args.modality.lower(),
|
modality=args.modality.lower(),
|
||||||
references=args.references.split(";") if args.references is not None else [],
|
references=args.references.split(";") if args.references is not None else [],
|
||||||
**sampling_kwargs,
|
**sampling_kwargs,
|
||||||
@ -472,7 +477,11 @@ with ui:
|
|||||||
with gr.Row():
|
with gr.Row():
|
||||||
layout["inference_tts"]["inputs"]["split-text-by"] = gr.Dropdown(choices=["sentences", "lines"], label="Text Delimiter", info="How to split the text into utterances.", value="sentences")
|
layout["inference_tts"]["inputs"]["split-text-by"] = gr.Dropdown(choices=["sentences", "lines"], label="Text Delimiter", info="How to split the text into utterances.", value="sentences")
|
||||||
layout["inference_tts"]["inputs"]["context-history"] = gr.Slider(value=0, minimum=0, maximum=4, step=1, label="(Rolling) Context History", info="How many prior lines to serve as the context/prefix (0 to disable).")
|
layout["inference_tts"]["inputs"]["context-history"] = gr.Slider(value=0, minimum=0, maximum=4, step=1, label="(Rolling) Context History", info="How many prior lines to serve as the context/prefix (0 to disable).")
|
||||||
|
"""
|
||||||
|
with gr.Row():
|
||||||
layout["inference_tts"]["inputs"]["no-phonemize"] = gr.Checkbox(label="No Phonemize", info="Use raw text rather than phonemize the text as the input prompt.")
|
layout["inference_tts"]["inputs"]["no-phonemize"] = gr.Checkbox(label="No Phonemize", info="Use raw text rather than phonemize the text as the input prompt.")
|
||||||
|
layout["inference_tts"]["inputs"]["play"] = gr.Checkbox(label="Auto Play", info="Auto play on generation (using sounddevice).")
|
||||||
|
"""
|
||||||
with gr.Tab("Sampler Settings"):
|
with gr.Tab("Sampler Settings"):
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
layout["inference_tts"]["inputs"]["ar-temperature"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR/NAR-len)", info="Adjusts the probabilities in the AR/NAR-len. (0 to greedy* sample)")
|
layout["inference_tts"]["inputs"]["ar-temperature"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR/NAR-len)", info="Adjusts the probabilities in the AR/NAR-len. (0 to greedy* sample)")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user