2023-08-02 21:53:35 +00:00
|
|
|
import argparse
|
|
|
|
from pathlib import Path
|
|
|
|
from .inference import TTS
|
2024-06-13 00:49:47 +00:00
|
|
|
from .config import cfg
|
2023-08-02 21:53:35 +00:00
|
|
|
|
2023-08-21 02:36:02 +00:00
|
|
|
def path_list(arg: str):
    """Convert a semicolon-separated string into a list of ``Path`` objects.

    Intended for use as an argparse ``type=`` converter, e.g.
    ``"a.wav;b.wav"`` becomes ``[Path("a.wav"), Path("b.wav")]``.

    Empty segments (from a trailing, leading, or doubled ``;`` or from an
    empty string) are skipped: ``Path("")`` would otherwise silently turn
    into ``Path(".")``, i.e. the current directory.

    Args:
        arg: The raw command-line value, with paths separated by ``;``.

    Returns:
        A list of ``Path`` objects, one per non-empty segment, in input order.
    """
    return [Path(segment) for segment in arg.split(";") if segment]
|
|
|
|
|
2023-08-02 21:53:35 +00:00
|
|
|
def main():
    """Command-line entry point for running TTS inference.

    Parses the command line, constructs a ``TTS`` object from the
    ``--yaml``, ``--device``, ``--dtype`` and ``--amp`` options, and forwards
    every remaining option to ``TTS.inference`` as a keyword argument.
    The value returned by ``inference`` is discarded.
    """
    # NOTE: the first positional parameter of ArgumentParser is ``prog``,
    # so this string is what appears as the program name in usage output.
    cli = argparse.ArgumentParser("VALL-E TTS")

    # Required positionals: the text to speak and the reference path(s),
    # the latter given as a single ';'-separated string.
    cli.add_argument("text")
    cli.add_argument("references", type=path_list)

    # General input/output options.
    cli.add_argument("--language", type=str, default="en")
    cli.add_argument("--out-path", type=Path, default=None)
    cli.add_argument("--yaml", type=Path, default=None)

    # Generation length limits. The AR step default is read from the
    # imported ``cfg`` at the moment this argument is registered, i.e.
    # before the command line (and therefore ``--yaml``) has been parsed.
    cli.add_argument(
        "--max-ar-steps",
        type=int,
        default=12 * cfg.dataset.frames_per_second,
    )
    cli.add_argument("--max-nar-levels", type=int, default=7)
    cli.add_argument("--max-ar-context", type=int, default=-1)

    # Sampling temperatures.
    cli.add_argument("--ar-temp", type=float, default=1.0)
    cli.add_argument("--nar-temp", type=float, default=0.01)
    cli.add_argument("--min-ar-temp", type=float, default=-1.0)
    cli.add_argument("--min-nar-temp", type=float, default=-1.0)

    cli.add_argument("--input-prompt-length", type=float, default=3.0)

    # Remaining sampler options.
    cli.add_argument("--top-p", type=float, default=1.0)
    cli.add_argument("--top-k", type=int, default=16)
    cli.add_argument("--repetition-penalty", type=float, default=1.0)
    cli.add_argument("--repetition-penalty-decay", type=float, default=0.0)
    cli.add_argument("--length-penalty", type=float, default=0.0)
    cli.add_argument("--beam-width", type=int, default=0)
    cli.add_argument("--mirostat-tau", type=float, default=0)
    cli.add_argument("--mirostat-eta", type=float, default=0)

    # Options consumed by the TTS constructor.
    cli.add_argument("--device", type=str, default=None)
    cli.add_argument("--amp", action="store_true")
    cli.add_argument("--dtype", type=str, default=None)

    opts = cli.parse_args()

    engine = TTS(
        config=opts.yaml,
        device=opts.device,
        dtype=opts.dtype,
        amp=opts.amp,
    )

    # Gather the inference keyword arguments first, then make a single call.
    inference_kwargs = dict(
        text=opts.text,
        references=opts.references,
        language=opts.language,
        out_path=opts.out_path,
        input_prompt_length=opts.input_prompt_length,
        max_ar_steps=opts.max_ar_steps,
        max_nar_levels=opts.max_nar_levels,
        max_ar_context=opts.max_ar_context,
        ar_temp=opts.ar_temp,
        nar_temp=opts.nar_temp,
        min_ar_temp=opts.min_ar_temp,
        min_nar_temp=opts.min_nar_temp,
        top_p=opts.top_p,
        top_k=opts.top_k,
        repetition_penalty=opts.repetition_penalty,
        repetition_penalty_decay=opts.repetition_penalty_decay,
        length_penalty=opts.length_penalty,
        beam_width=opts.beam_width,
        mirostat_tau=opts.mirostat_tau,
        mirostat_eta=opts.mirostat_eta,
    )
    engine.inference(**inference_kwargs)
|
2023-08-02 21:53:35 +00:00
|
|
|
|
|
|
|
# Run the command-line interface only when this module is executed directly;
# importing it must not trigger argument parsing or inference.
if __name__ == "__main__":
    main()
|