2023-08-02 21:53:35 +00:00
|
|
|
import argparse
|
|
|
|
from pathlib import Path
|
|
|
|
from .inference import TTS
|
2024-06-13 00:49:47 +00:00
|
|
|
from .config import cfg
|
2023-08-02 21:53:35 +00:00
|
|
|
|
2023-08-21 02:36:02 +00:00
|
|
|
def path_list(arg):
    """Convert a ``;``-separated string into a list of ``Path`` objects.

    Used as an argparse ``type=`` converter for the reference-audio argument.

    Args:
        arg: A string such as ``"a.wav;b.wav"``; may be empty or ``None``.

    Returns:
        A list with one ``Path`` per non-empty segment, or ``None`` when
        ``arg`` is empty/``None`` or contains no non-empty segment.
    """
    if not arg:
        return None

    # Skip empty segments (e.g. from a trailing or doubled ";"): Path("") is
    # Path("."), which would silently pass the current directory as a file.
    paths = [Path(part.strip()) for part in arg.split(";") if part.strip()]
    return paths or None
|
|
|
|
|
2023-08-02 21:53:35 +00:00
|
|
|
def main():
    """Command-line entry point for running inference.

    Parses the command-line options, builds a ``TTS`` instance from the
    selected config/model, LoRA, device, dtype, AMP and attention settings,
    calls ``tts.inference`` with the text, references and sampling options,
    and prints the result when it is a string.
    """
    parser = argparse.ArgumentParser("VALL-E TTS")
    parser.add_argument("text")
    # NOTE(review): argparse only honours `default` on a positional when
    # nargs is "?" or "*"; as written this argument is required. Confirm
    # whether references are meant to be optional before changing it.
    parser.add_argument("references", type=path_list, default=None)
    parser.add_argument("--language", type=str, default="en")
    parser.add_argument("--task", type=str, default="tts")
    parser.add_argument("--modality", type=str, default="auto")
    parser.add_argument("--out-path", type=Path, default=None)

    parser.add_argument("--yaml", type=Path, default=None)
    parser.add_argument("--model", type=Path, default=None)
    parser.add_argument("--lora", type=Path, default=None)

    parser.add_argument("--max-duration", type=int, default=12 * cfg.dataset.frames_per_second)
    parser.add_argument("--max-steps", type=int, default=25)
    parser.add_argument("--max-levels", type=int, default=7)

    parser.add_argument("--ar-temperature", type=float, default=1.0)
    parser.add_argument("--nar-temperature", type=float, default=0.0)
    parser.add_argument("--min-ar-temperature", type=float, default=-1.0)
    parser.add_argument("--min-nar-temperature", type=float, default=-1.0)
    parser.add_argument("--input-prompt-length", type=float, default=3.0)
    parser.add_argument("--input-prompt-prefix", action="store_true")
    parser.add_argument("--prefix-silence", type=float, default=0.0)
    parser.add_argument("--cfg-strength", type=float, default=0.0)
    parser.add_argument("--cfg-rescale", type=float, default=0.75)

    parser.add_argument("--top-p", type=float, default=1.0)
    parser.add_argument("--top-k", type=int, default=0)
    parser.add_argument("--top-no", type=float, default=0.0)
    parser.add_argument("--min-p", type=float, default=0.0)
    parser.add_argument("--repetition-penalty", type=float, default=1.0)
    parser.add_argument("--repetition-penalty-decay", type=float, default=0.0)
    parser.add_argument("--length-penalty", type=float, default=0.0)
    parser.add_argument("--beam-width", type=int, default=0)

    parser.add_argument("--mirostat-tau", type=float, default=0)
    parser.add_argument("--mirostat-eta", type=float, default=0)

    parser.add_argument("--dry-multiplier", type=float, default=0)
    parser.add_argument("--dry-base", type=float, default=1.75)
    parser.add_argument("--dry-allowed-length", type=int, default=2)

    parser.add_argument("--entropix-sampling", action="store_true")

    parser.add_argument("--layer-skip", action="store_true")
    parser.add_argument("--layer-skip-exit-layer", type=int, default=None)
    # These thresholds default to 0.1, so they must be parsed as floats;
    # with type=int any fractional command-line value would be rejected.
    parser.add_argument("--layer-skip-entropy-threshold", type=float, default=0.1)
    parser.add_argument("--layer-skip-varentropy-threshold", type=float, default=0.1)
    parser.add_argument("--refine-on-stop", action="store_true")

    # experimental settings
    parser.add_argument("--load-from-artifact", type=Path, default=None)
    parser.add_argument("--denoise-start", type=float, default=0.0)

    parser.add_argument("--seed", type=int, default=None)

    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--amp", action="store_true")
    parser.add_argument("--dtype", type=str, default=None)
    parser.add_argument("--attention", type=str, default=None)
    args = parser.parse_args()

    # --yaml takes precedence over --model; with neither, config stays None.
    config = None

    if args.yaml:
        config = args.yaml
    elif args.model:
        config = args.model

    tts = TTS(
        config=config,
        lora=args.lora,
        device=args.device,
        dtype=args.dtype,
        amp=args.amp,
        attention=args.attention,
    )

    # Sampling-related options are bundled and handed over as one dict.
    sampling_kwargs = dict(
        max_steps=args.max_steps,
        max_levels=args.max_levels,
        max_duration=args.max_duration,
        ar_temperature=args.ar_temperature,
        nar_temperature=args.nar_temperature,
        min_ar_temperature=args.min_ar_temperature,
        min_nar_temperature=args.min_nar_temperature,
        top_p=args.top_p,
        top_k=args.top_k,
        top_no=args.top_no,
        min_p=args.min_p,
        repetition_penalty=args.repetition_penalty,
        repetition_penalty_decay=args.repetition_penalty_decay,
        length_penalty=args.length_penalty,
        beam_width=args.beam_width,
        mirostat_tau=args.mirostat_tau,
        mirostat_eta=args.mirostat_eta,
        dry_multiplier=args.dry_multiplier,
        dry_base=args.dry_base,
        dry_allowed_length=args.dry_allowed_length,
        entropix_sampling=args.entropix_sampling,
        layer_skip=args.layer_skip,
        layer_skip_exit_layer=args.layer_skip_exit_layer,
        layer_skip_entropy_threshold=args.layer_skip_entropy_threshold,
        layer_skip_varentropy_threshold=args.layer_skip_varentropy_threshold,
        refine_on_stop=args.refine_on_stop,
        denoise_start=args.denoise_start,
        input_prompt_length=args.input_prompt_length,
        input_prompt_prefix=args.input_prompt_prefix,
        prefix_silence=args.prefix_silence,
        cfg_strength=args.cfg_strength,
        cfg_rescale=args.cfg_rescale,
    )

    output = tts.inference(
        text=args.text,
        references=args.references,
        language=args.language,
        task=args.task,
        modality=args.modality,
        out_path=args.out_path,

        # input_prompt_length is passed here as well as inside sampling_kwargs.
        input_prompt_length=args.input_prompt_length,
        load_from_artifact=args.load_from_artifact,

        sampling_kwargs=sampling_kwargs,

        seed=args.seed,
    )

    # Only string results are echoed to stdout; other return values are ignored here.
    if isinstance(output, str):
        print(output)
|
2023-08-02 21:53:35 +00:00
|
|
|
|
|
|
|
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|