2022-01-28 06:19:29 +00:00
import argparse
import os
2022-05-17 18:11:18 +00:00
import torch
2022-01-28 06:19:29 +00:00
import torchaudio
2023-02-02 21:13:28 +00:00
import time
2022-02-04 05:18:21 +00:00
2022-05-19 11:31:02 +00:00
from api import TextToSpeech , MODELS_DIR
2022-05-22 11:28:15 +00:00
from utils . audio import load_voices
2022-02-04 05:18:21 +00:00
2022-01-28 06:19:29 +00:00
def str_to_bool(value):
    """Parse a command-line boolean.

    ``type=bool`` is a trap with argparse: ``bool('False')`` is ``True``
    because any non-empty string is truthy, which made it impossible to turn
    a default-true flag off. This converter understands the usual spellings.

    Args:
        value: The raw argument string (or an actual ``bool``).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string is kept falsy, matching what ``bool('')`` used to give.
    if lowered in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(f'Expected a boolean value, got {value!r}.')


def generation_settings(args):
    """Build the preset/override keyword arguments for ``tts_with_preset``.

    If ``--autoregressive_samples`` and/or ``--diffusion_iterations`` was
    given, only the options that were actually supplied are returned and the
    preset is left out (explicit overrides take the place of ``--preset``).
    Otherwise the selected preset is returned.

    Options the user did not supply are omitted rather than forwarded as
    ``None`` so the callee can fall back to its own defaults.

    Args:
        args: Parsed arguments exposing ``preset``, ``autoregressive_samples``
            and ``diffusion_iterations``.

    Returns:
        A dict of keyword arguments to splat into ``tts_with_preset``.
    """
    overrides = {}
    if args.autoregressive_samples is not None:
        overrides['num_autoregressive_samples'] = args.autoregressive_samples
    if args.diffusion_iterations is not None:
        overrides['diffusion_iterations'] = args.diffusion_iterations
    if overrides:
        return overrides
    return {'preset': args.preset}


def build_parser():
    """Create the argument parser describing this script's command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--text', type=str, help='Text to speak.',
                        default="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.")
    parser.add_argument('--voice', type=str,
                        help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                             'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.',
                        default='random')
    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
    parser.add_argument('--model_dir', type=str,
                        help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this '
                             'should only be specified if you have custom checkpoints.',
                        default=MODELS_DIR)
    parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice.', default=3)
    parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None)
    # str_to_bool instead of bool: see its docstring for why type=bool is wrong.
    parser.add_argument('--produce_debug_state', type=str_to_bool,
                        help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.',
                        default=True)
    parser.add_argument('--cvvp_amount', type=float,
                        help='How much the CVVP model should influence the output. '
                             'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)',
                        default=0.0)
    parser.add_argument('--temperature', type=float, help='The softmax temperature of the autoregressive model.', default=0.8)
    parser.add_argument('--autoregressive_samples', type=int,
                        help='Number of samples taken from the autoregressive model, all of which are filtered using CLVP. '
                             'As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".')
    parser.add_argument('--diffusion_iterations', type=int,
                        help='Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to '
                             'iteratively refine the output, which should theoretically mean a higher quality output. '
                             'Generally a value above 250 is not noticeably better, however.')
    return parser


def main():
    """Synthesize ``--text`` for every requested voice and save the results.

    For each comma-separated voice a directory
    ``<output_path>/<voice>/<unix timestamp>/`` is created containing
    ``input.txt`` (the spoken text) and one ``.wav`` file per candidate.
    When ``--produce_debug_state`` is true, the state returned next to the
    audio is saved under ``debug_states/``.
    """
    args = build_parser().parse_args()
    os.makedirs(args.output_path, exist_ok=True)

    tts = TextToSpeech(models_dir=args.model_dir)
    # Identical for every voice, so work it out once.
    settings = generation_settings(args)

    for k, selected_voice in enumerate(args.voice.split(',')):
        # 'a&b' blends several voices; without '&' this is a one-element list.
        voice_sel = selected_voice.split('&')
        voice_samples, conditioning_latents = load_voices(voice_sel)

        gen, dbg_state = tts.tts_with_preset(
            args.text, k=args.candidates, voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            use_deterministic_seed=args.seed, return_deterministic_state=True,
            cvvp_amount=args.cvvp_amount, temperature=args.temperature,
            **settings)

        timestamp = int(time.time())
        outdir = os.path.join(args.output_path, selected_voice, str(timestamp))
        os.makedirs(outdir, exist_ok=True)
        with open(os.path.join(outdir, 'input.txt'), 'w', encoding='utf-8') as f:
            f.write(args.text)

        # The result is either a list of candidates or a single clip.
        if isinstance(gen, list):
            for j, g in enumerate(gen):
                torchaudio.save(os.path.join(outdir, f'{k}_{j}.wav'), g.squeeze(0).cpu(), 24000)
        else:
            torchaudio.save(os.path.join(outdir, f'{k}.wav'), gen.squeeze(0).cpu(), 24000)

        if args.produce_debug_state:
            os.makedirs('debug_states', exist_ok=True)
            torch.save(dbg_state, f'debug_states/do_tts_debug_{selected_voice}.pth')


if __name__ == '__main__':
    main()
2022-05-17 18:11:18 +00:00