@ -43,7 +43,7 @@ MODELS = {
' vocoder.pth ' : ' https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth ' ,
' rlg_auto.pth ' : ' https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth ' ,
' rlg_diffuser.pth ' : ' https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth ' ,
' bigvgan_base_24khz_100band.pth ' : ' https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_base_24khz_100band.pth ' ,
' bigvgan_24khz_100band.pth ' : ' https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.pth ' ,
@ -51,6 +51,7 @@ MODELS = {
' bigvgan_24khz_100band.json ' : ' https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.json ' ,
}
def hash_file ( path , algo = " md5 " , buffer_size = 0 ) :
import hashlib
@ -77,12 +78,14 @@ def hash_file(path, algo="md5", buffer_size=0):
return " {0} " . format ( hash . hexdigest ( ) )
def check_for_kill_signal():
    """
    Abort the current operation if a stop has been requested.

    Reads the module-level ``STOP_SIGNAL`` flag. When the flag is truthy it is
    reset to ``False`` and an exception is raised so the caller unwinds;
    otherwise the function returns without doing anything.

    :raises Exception: with the message "Kill signal detected" when the flag was set.
    """
    global STOP_SIGNAL
    if STOP_SIGNAL:
        # Clear the flag before raising so one stop request interrupts exactly one run.
        STOP_SIGNAL = False
        raise Exception("Kill signal detected")
def download_models ( specific_models = None ) :
"""
Call to download all the models that Tortoise uses .
@ -102,6 +105,7 @@ def download_models(specific_models=None):
else :
pbar . finish ( )
pbar = None
for model_name , url in MODELS . items ( ) :
if specific_models is not None and model_name not in specific_models :
continue
@ -112,7 +116,7 @@ def download_models(specific_models=None):
proxy = ProxyHandler ( { } )
opener = build_opener ( proxy )
opener . addheaders = [ ( ' User-Agent ' , ' mrq/AI-Voice-Cloning ' ) ]
opener . addheaders = [ ( ' User-Agent ' , ' mrq/AI-Voice-Cloning ' ) ]
install_opener ( opener )
request . urlretrieve ( url , model_path , show_progress )
print ( ' Done. ' )
@ -137,19 +141,23 @@ def pad_or_truncate(t, length):
if t . shape [ - 1 ] == length :
return t
elif t . shape [ - 1 ] < length :
return F . pad ( t , ( 0 , length - t . shape [ - 1 ] ) )
return F . pad ( t , ( 0 , length - t . shape [ - 1 ] ) )
else :
return t [ . . . , : length ]
def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True,
                                   cond_free_k=1):
    """
    Helper function to load a GaussianDiffusion instance configured for use as a vocoder.

    :param trained_diffusion_steps: Step count handed to both the timestep respacing and the linear beta schedule.
    :param desired_diffusion_steps: Step count to respace to; passed to `space_timesteps` as a one-element list.
    :param cond_free: Forwarded as `conditioning_free`.
    :param cond_free_k: Forwarded as `conditioning_free_k`.
    :return: The configured `SpacedDiffusion` instance.
    """
    return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]),
                           model_mean_type='epsilon',
                           model_var_type='learned_range', loss_type='mse',
                           betas=get_named_beta_schedule('linear', trained_diffusion_steps),
                           conditioning_free=cond_free, conditioning_free_k=cond_free_k)
@torch.inference_mode ( )
def format_conditioning ( clip , cond_length = 132300 , device = ' cuda ' , sampling_rate = 22050 ) :
"""
@ -165,6 +173,7 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=2
mel_clip = mel_clip . unsqueeze ( 0 )
return migrate_to_device ( mel_clip , device )
def fix_autoregressive_output ( codes , stop_token , complain = True ) :
"""
This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
@ -194,23 +203,27 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
return codes
@torch.inference_mode()
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True,
                             desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
    """
    Uses the specified diffusion model to convert discrete codes into a spectrogram.

    :param diffusion_model: Model providing `timestep_independent(...)`; also handed to `diffuser.sample_loop`.
    :param diffuser: Sampler object; its `sampler` attribute is overwritten with the lower-cased `sampler` argument.
    :param latents: Autoregressive latents; `shape[0]` is used as the batch size and `shape[1]` as the code length.
    :param conditioning_latents: Conditioning passed through to `timestep_independent`.
    :param temperature: Scale factor applied to the initial Gaussian noise.
    :param verbose: Accepted for interface compatibility; not read inside this function.
    :param desc: Progress description forwarded to `diffuser.sample_loop`.
    :param sampler: Sampler name; lower-cased before being stored on the diffuser.
    :param input_sample_rate: Sample rate the codes correspond to.
    :param output_sample_rate: Sample rate of the produced spectrogram signal.
    :return: The denormalized mel tensor, truncated to the computed output length (moved to CPU on DirectML).
    """
    with torch.no_grad():
        # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
        output_seq_len = latents.shape[1] * 4 * output_sample_rate // input_sample_rate
        # NOTE(review): 100 is presumably the mel channel count the diffusion model emits -- confirm.
        output_shape = (latents.shape[0], 100, output_seq_len)
        # The meaning of the trailing False is defined by the model and is not visible from here.
        precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len,
                                                                      False)

        noise = torch.randn(output_shape, device=latents.device) * temperature

        diffuser.sampler = sampler.lower()
        mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise,
                                   model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc)

        mel = denormalize_tacotron_mel(mel)[:, :, :output_seq_len]
        if get_device_name() == "dml":
            # Hand the result back on the CPU when running on the DirectML backend.
            mel = mel.cpu()
        return mel
@ -230,7 +243,8 @@ def classify_audio_clip(clip):
results = F . softmax ( classifier ( clip ) , dim = - 1 )
return results [ 0 ] [ 0 ]
def migrate_to_device ( t , device ) :
def migrate_to_device ( t , device ) :
if t is None :
return t
@ -244,23 +258,23 @@ def migrate_to_device( t, device ):
t . device = device
t = t . to ( device )
do_gc ( )
return t
class TextToSpeech :
"""
Main entry point into Tortoise .
"""
def __init__ ( self , autoregressive_batch_size = None , models_dir = MODELS_DIR , enable_redaction = True , device = None ,
minor_optimizations = True ,
unsqueeze_sample_batches = False ,
input_sample_rate = 22050 , output_sample_rate = 24000 ,
autoregressive_model_path = None , diffusion_model_path = None , vocoder_model = None , tokenizer_json = None ,
# ):
use_deepspeed = False ) : # Add use_deepspeed parameter
minor_optimizations = True ,
unsqueeze_sample_batches = False ,
input_sample_rate = 22050 , output_sample_rate = 24000 ,
autoregressive_model_path = None , diffusion_model_path = None , vocoder_model = None , tokenizer_json = None ,
) :
"""
Constructor
: param autoregressive_batch_size : Specifies how many samples to generate per batch . Lower this if you are seeing
@ -271,22 +285,21 @@ class TextToSpeech:
( but are still rendered by the model ) . This can be used for prompt engineering .
Default is true .
: param device : Device to use when running the model . If omitted , the device will be automatically chosen .
"""
"""
self . loading = True
if device is None :
device = get_device ( verbose = True )
self . version = [ 2 , 4 , 4 ] # to-do, autograb this from setup.py, or have setup.py autograb this
self . version = [ 2 , 4 , 4 ] # to-do, autograb this from setup.py, or have setup.py autograb this
self . input_sample_rate = input_sample_rate
self . output_sample_rate = output_sample_rate
self . minor_optimizations = minor_optimizations
self . unsqueeze_sample_batches = unsqueeze_sample_batches
self . use_deepspeed = use_deepspeed # Store use_deepspeed as an instance variable
print ( f ' use_deepspeed api_debug { use_deepspeed } ' )
# for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations
self . preloaded_tensors = minor_optimizations
self . use_kv_cache = minor_optimizations
if get_device_name ( ) == " dml " : # does not work with DirectML
if get_device_name ( ) == " dml " : # does not work with DirectML
print ( " KV caching requested but not supported with the DirectML backend, disabling... " )
self . use_kv_cache = False
@ -315,13 +328,12 @@ class TextToSpeech:
self . load_diffusion_model ( diffusion_model_path )
self . clvp = CLVP ( dim_text = 768 , dim_speech = 768 , dim_latent = 768 , num_text_tokens = 256 , text_enc_depth = 20 ,
text_seq_len = 350 , text_heads = 12 ,
num_speech_tokens = 8192 , speech_enc_depth = 20 , speech_heads = 12 , speech_seq_len = 430 ,
use_xformers = True ) . cpu ( ) . eval ( )
self . clvp . load_state_dict ( torch . load ( get_model_path ( ' clvp2.pth ' , models_dir ) ) )
self . cvvp = None # CVVP model is only loaded if used.
self . cvvp = None # CVVP model is only loaded if used.
self . vocoder_model = vocoder_model
self . load_vocoder_model ( self . vocoder_model )
@ -331,21 +343,23 @@ class TextToSpeech:
self . rlg_diffusion = None
if self . preloaded_tensors :
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
self . diffusion = migrate_to_device ( self . diffusion , self . device )
self . clvp = migrate_to_device ( self . clvp , self . device )
self . vocoder = migrate_to_device ( self . vocoder , self . device )
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
self . diffusion = migrate_to_device ( self . diffusion , self . device )
self . clvp = migrate_to_device ( self . clvp , self . device )
self . vocoder = migrate_to_device ( self . vocoder , self . device )
self . loading = False
def load_autoregressive_model ( self , autoregressive_model_path ) :
if hasattr ( self , " autoregressive_model_path " ) and os . path . samefile ( self . autoregressive_model_path , autoregressive_model_path ) :
if hasattr ( self , " autoregressive_model_path " ) and os . path . samefile ( self . autoregressive_model_path ,
autoregressive_model_path ) :
return
self . autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os . path . exists ( autoregressive_model_path ) else get_model_path ( ' autoregressive.pth ' , self . models_dir )
self . autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os . path . exists (
autoregressive_model_path ) else get_model_path ( ' autoregressive.pth ' , self . models_dir )
new_hash = hash_file ( self . autoregressive_model_path )
if hasattr ( self , " autoregressive_model_hash " ) and self . autoregressive_model_hash == new_hash :
if hasattr ( self , " autoregressive_model_hash " ) and self . autoregressive_model_hash == new_hash :
return
self . autoregressive_model_hash = new_hash
@ -356,42 +370,44 @@ class TextToSpeech:
if hasattr ( self , ' autoregressive ' ) :
del self . autoregressive
self . autoregressive = UnifiedVoice ( max_mel_tokens = 604 , max_text_tokens = 402 , max_conditioning_inputs = 2 , layers = 30 ,
model_dim = 1024 ,
heads = 16 , number_text_tokens = 255 , start_text_token = 255 , checkpointing = False ,
train_solo_embeddings = False ) . cpu ( ) . eval ( )
self . autoregressive = UnifiedVoice ( max_mel_tokens = 604 , max_text_tokens = 402 , max_conditioning_inputs = 2 ,
layers = 30 ,
model_dim = 1024 ,
heads = 16 , number_text_tokens = 255 , start_text_token = 255 , checkpointing = False ,
train_solo_embeddings = False ) . cpu ( ) . eval ( )
self . autoregressive . load_state_dict ( torch . load ( self . autoregressive_model_path ) )
self . autoregressive . post_init_gpt2_config ( use_deepspeed= self . use_deepspeed , kv_cache= self . use_kv_cache )
self . autoregressive . post_init_gpt2_config ( kv_cache= self . use_kv_cache )
if self . preloaded_tensors :
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
self . loading = False
print ( f " Loaded autoregressive model " )
def load_diffusion_model(self, diffusion_model_path):
    """
    (Re)load the diffusion decoder weights.

    Does nothing when `diffusion_model_path` names an existing file that is the same file as the one
    already recorded in `self.diffusion_model_path`. Otherwise the path is resolved (falling back to the
    bundled 'diffusion_decoder.pth' when the argument is empty or does not exist), its hash is stored in
    `self.diffusion_model_hash`, a fresh `DiffusionTts` is built on the CPU, and the weights are loaded
    from the resolved path. `self.loading` is True for the duration of the reload.

    :param diffusion_model_path: Path to a diffusion checkpoint, or None/empty to use the default model.
    """
    requested_exists = bool(diffusion_model_path) and os.path.exists(diffusion_model_path)

    # os.path.samefile raises on None and on missing files, so only compare when the
    # requested checkpoint really exists; every other case takes the fallback path below.
    if requested_exists and hasattr(self, "diffusion_model_path") and os.path.samefile(self.diffusion_model_path,
                                                                                      diffusion_model_path):
        return

    self.loading = True

    if requested_exists:
        self.diffusion_model_path = diffusion_model_path
    else:
        self.diffusion_model_path = get_model_path('diffusion_decoder.pth', self.models_dir)
    self.diffusion_model_hash = hash_file(self.diffusion_model_path)

    # Drop the previous model first so both copies are not held at the same time.
    if hasattr(self, 'diffusion'):
        del self.diffusion

    self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
                                  in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
                                  layer_drop=0, unconditioned_percentage=0).cpu().eval()
    # Load from the resolved path (not unconditionally from the default checkpoint) so the weights
    # match the path and hash recorded above; stage on the CPU like the freshly built module.
    self.diffusion.load_state_dict(torch.load(self.diffusion_model_path, map_location=torch.device('cpu')))
    if self.preloaded_tensors:
        self.diffusion = migrate_to_device(self.diffusion, self.device)
    self.loading = False
    print("Loaded diffusion model")
def load_vocoder_model ( self , vocoder_model ) :
if hasattr ( self , " vocoder_model_path " ) and os . path . samefile ( self . vocoder_model_path , vocoder_model ) :
if hasattr ( self , " vocoder_model_path " ) and os . path . samefile ( self . vocoder_model_path , vocoder_model ) :
return
self . loading = True
@ -415,27 +431,30 @@ class TextToSpeech:
vocoder_config = get_model_path ( vocoder_config , self . models_dir )
self . vocoder = BigVGAN ( config = vocoder_config ) . cpu ( )
# elif vocoder_model == "univnet":
# elif vocoder_model == "univnet":
else :
vocoder_key = ' model_g '
self . vocoder_model_path = ' vocoder.pth '
self . vocoder = UnivNetGenerator ( ) . cpu ( )
print ( f " Loading vocoder model: { self . vocoder_model_path } " )
self . vocoder . load_state_dict ( torch . load ( get_model_path ( self . vocoder_model_path , self . models_dir ) , map_location = torch . device ( ' cpu ' ) ) [ vocoder_key ] )
self . vocoder . load_state_dict (
torch . load ( get_model_path ( self . vocoder_model_path , self . models_dir ) , map_location = torch . device ( ' cpu ' ) ) [
vocoder_key ] )
self . vocoder . eval ( inference = True )
if self . preloaded_tensors :
self . vocoder = migrate_to_device ( self . vocoder , self . device )
self . vocoder = migrate_to_device ( self . vocoder , self . device )
self . loading = False
print ( f " Loaded vocoder model " )
def load_tokenizer_json ( self , tokenizer_json ) :
if hasattr ( self , " tokenizer_json " ) and os . path . samefile ( self . tokenizer_json , tokenizer_json ) :
if hasattr ( self , " tokenizer_json " ) and os . path . samefile ( self . tokenizer_json , tokenizer_json ) :
return
self . loading = True
self . tokenizer_json = tokenizer_json if tokenizer_json else os . path . join ( os . path . dirname ( os . path . realpath ( __file__ ) ) , ' ../tortoise/data/tokenizer.json ' )
self . tokenizer_json = tokenizer_json if tokenizer_json else os . path . join (
os . path . dirname ( os . path . realpath ( __file__ ) ) , ' ../tortoise/data/tokenizer.json ' )
print ( " Loading tokenizer JSON: " , self . tokenizer_json )
if hasattr ( self , ' tokenizer ' ) :
@ -448,20 +467,32 @@ class TextToSpeech:
def load_cvvp(self):
    """
    Load CVVP model.

    Builds the `CVVP` module on the CPU in eval mode, loads 'cvvp.pth' from `self.models_dir` into it and,
    when `self.preloaded_tensors` is set, moves the module to `self.device`. The result is stored in
    `self.cvvp`.
    """
    self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8,
                     cond_mask_percentage=0,
                     speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
    # Stage the checkpoint on the CPU, matching the module above and the vocoder/RLG loads in this file.
    self.cvvp.load_state_dict(
        torch.load(get_model_path('cvvp.pth', self.models_dir), map_location=torch.device('cpu')))
    if self.preloaded_tensors:
        self.cvvp = migrate_to_device(self.cvvp, self.device)
@torch.inference_mode ( )
def get_conditioning_latents ( self , voice_samples , return_mels = False , verbose = False , slices = 1 , max_chunk_size = None , force_cpu = False , original_ar = False , original_diffusion = False ) :
def get_conditioning_latents (
self , voice_samples , return_mels = False , verbose = False , slices = 1 , max_chunk_size = None , force_cpu = False ,
original_ar = False , original_diffusion = False
) :
"""
Transforms one or more voice_samples into a tuple ( autoregressive_conditioning_latent , diffusion_conditioning_latent ) .
These are expressive learned latents that encode aspects of the provided clips like voice , intonation , and acoustic
properties .
: param voice_samples : List of 2 or more ~ 10 second reference clips , which should be torch tensors containing 22.05 kHz waveform data .
: param force_cpu :
: param max_chunk_size :
: param slices :
: param verbose :
: param return_mels :
: param original_diffusion :
: param original_ar :
: param voice_samples : List of 2 or more ~ 10 second reference clips ,
which should be torch tensors containing 22.05 kHz waveform data .
"""
with torch . no_grad ( ) :
@ -472,7 +503,7 @@ class TextToSpeech:
if not isinstance ( voice_samples , list ) :
voice_samples = [ voice_samples ]
resampler_22K = torchaudio . transforms . Resample (
self . input_sample_rate ,
22050 ,
@ -491,7 +522,7 @@ class TextToSpeech:
beta = 8.555504641634386 ,
) . to ( device )
voice_samples = [ migrate_to_device ( v , device ) for v in voice_samples ]
voice_samples = [ migrate_to_device ( v , device ) for v in voice_samples ]
auto_conds = [ ]
diffusion_conds = [ ]
@ -499,7 +530,8 @@ class TextToSpeech:
if original_ar :
samples = [ resampler_22K ( sample ) for sample in voice_samples ]
for sample in tqdm ( samples , desc = " Computing AR conditioning latents... " ) :
auto_conds . append ( format_conditioning ( sample , device = device , sampling_rate = self . input_sample_rate , cond_length = 132300 ) )
auto_conds . append ( format_conditioning ( sample , device = device , sampling_rate = self . input_sample_rate ,
cond_length = 132300 ) )
else :
samples = [ resampler_22K ( sample ) for sample in voice_samples ]
concat = torch . cat ( samples , dim = - 1 )
@ -516,32 +548,35 @@ class TextToSpeech:
chunk_size = chunks [ 0 ] . shape [ - 1 ]
for chunk in tqdm ( chunks , desc = " Computing AR conditioning latents... " ) :
auto_conds . append ( format_conditioning ( chunk , device = device , sampling_rate = self . input_sample_rate , cond_length = chunk_size ) )
auto_conds . append ( format_conditioning ( chunk , device = device , sampling_rate = self . input_sample_rate ,
cond_length = chunk_size ) )
if original_diffusion :
samples = [ resampler_24K ( sample ) for sample in voice_samples ]
for sample in tqdm ( samples , desc = " Computing diffusion conditioning latents... " ) :
sample = pad_or_truncate ( sample , 102400 )
cond_mel = wav_to_univnet_mel ( migrate_to_device ( sample , device ) , do_normalization = False , device = self . device )
cond_mel = wav_to_univnet_mel ( migrate_to_device ( sample , device ) , do_normalization = False ,
device = self . device )
diffusion_conds . append ( cond_mel )
else :
samples = [ resampler_24K ( sample ) for sample in voice_samples ]
for chunk in tqdm ( chunks , desc = " Computing diffusion conditioning latents... " ) :
check_for_kill_signal ( )
chunk = pad_or_truncate ( chunk , chunk_size )
cond_mel = wav_to_univnet_mel ( migrate_to_device ( chunk , device ) , do_normalization = False , device = device )
cond_mel = wav_to_univnet_mel ( migrate_to_device ( chunk , device ) , do_normalization = False ,
device = device )
diffusion_conds . append ( cond_mel )
auto_conds = torch . stack ( auto_conds , dim = 1 )
self . autoregressive = migrate_to_device ( self . autoregressive , device )
self . autoregressive = migrate_to_device ( self . autoregressive , device )
auto_latent = self . autoregressive . get_conditioning ( auto_conds )
self . autoregressive = migrate_to_device ( self . autoregressive , self . device if self . preloaded_tensors else ' cpu ' )
self . autoregressive = migrate_to_device ( self . autoregressive ,
self . device if self . preloaded_tensors else ' cpu ' )
diffusion_conds = torch . stack ( diffusion_conds , dim = 1 )
self . diffusion = migrate_to_device ( self . diffusion , device )
self . diffusion = migrate_to_device ( self . diffusion , device )
diffusion_latent = self . diffusion . get_conditioning ( diffusion_conds )
self . diffusion = migrate_to_device ( self . diffusion , self . device if self . preloaded_tensors else ' cpu ' )
self . diffusion = migrate_to_device ( self . diffusion , self . device if self . preloaded_tensors else ' cpu ' )
if return_mels :
return auto_latent , diffusion_latent , auto_conds , diffusion_conds
@ -552,9 +587,11 @@ class TextToSpeech:
# Lazy-load the RLG models.
if self . rlg_auto is None :
self . rlg_auto = RandomLatentConverter ( 1024 ) . eval ( )
self . rlg_auto . load_state_dict ( torch . load ( get_model_path ( ' rlg_auto.pth ' , self . models_dir ) , map_location = torch . device ( ' cpu ' ) ) )
self . rlg_auto . load_state_dict (
torch . load ( get_model_path ( ' rlg_auto.pth ' , self . models_dir ) , map_location = torch . device ( ' cpu ' ) ) )
self . rlg_diffusion = RandomLatentConverter ( 2048 ) . eval ( )
self . rlg_diffusion . load_state_dict ( torch . load ( get_model_path ( ' rlg_diffuser.pth ' , self . models_dir ) , map_location = torch . device ( ' cpu ' ) ) )
self . rlg_diffusion . load_state_dict (
torch . load ( get_model_path ( ' rlg_diffuser.pth ' , self . models_dir ) , map_location = torch . device ( ' cpu ' ) ) )
with torch . no_grad ( ) :
return self . rlg_auto ( torch . tensor ( [ 0.0 ] ) ) , self . rlg_diffusion ( torch . tensor ( [ 0.0 ] ) )
@ -576,16 +613,19 @@ class TextToSpeech:
' fast ' : { ' num_autoregressive_samples ' : 96 , ' diffusion_iterations ' : 80 } ,
' standard ' : { ' num_autoregressive_samples ' : 256 , ' diffusion_iterations ' : 200 } ,
' high_quality ' : { ' num_autoregressive_samples ' : 256 , ' diffusion_iterations ' : 400 } ,
' narration ' : { ' num_autoregressive_samples ' : 30 , ' diffusion_iterations ' : 80 , " diffusion_sampler " : " DDIM " } ,
' dialogue ' : { ' num_autoregressive_samples ' : 60 , ' diffusion_iterations ' : 120 , " diffusion_sampler " : " DDIM " }
}
settings . update ( presets [ preset ] )
settings . update ( kwargs ) # allow overriding of preset settings with kwargs
settings . update ( kwargs ) # allow overriding of preset settings with kwargs
return self . tts ( text , * * settings )
@torch.inference_mode ( )
def tts ( self , text , voice_samples = None , conditioning_latents = None , k = 1 , verbose = True , use_deterministic_seed = None ,
return_deterministic_state = False ,
# autoregressive generation parameters follow
num_autoregressive_samples = 512 , temperature = .8 , length_penalty = 1 , repetition_penalty = 2.0 , top_p = .8 , max_mel_tokens = 500 ,
num_autoregressive_samples = 512 , temperature = .8 , length_penalty = 1 , repetition_penalty = 2.0 , top_p = .8 ,
max_mel_tokens = 500 ,
sample_batch_size = None ,
autoregressive_model = None ,
diffusion_model = None ,
@ -667,14 +707,17 @@ class TextToSpeech:
self . load_tokenizer_json ( tokenizer_json )
text_tokens = torch . IntTensor ( self . tokenizer . encode ( text ) ) . unsqueeze ( 0 )
text_tokens = migrate_to_device ( text_tokens , self . device )
text_tokens = migrate_to_device ( text_tokens , self . device )
text_tokens = F . pad ( text_tokens , ( 0 , 1 ) ) # This may not be necessary.
assert text_tokens . shape [ - 1 ] < 400 , ' Too much text provided. Break the text up into separate segments and re-try inference. '
assert text_tokens . shape [
- 1 ] < 400 , ' Too much text provided. Break the text up into separate segments and re-try inference. '
auto_conds = None
if voice_samples is not None :
auto_conditioning , diffusion_conditioning , auto_conds , _ = self . get_conditioning_latents ( voice_samples , return_mels = True , verbose = True )
auto_conditioning , diffusion_conditioning , auto_conds , _ = self . get_conditioning_latents ( voice_samples ,
return_mels = True ,
verbose = True )
elif conditioning_latents is not None :
latent_tuple = conditioning_latents
if len ( latent_tuple ) == 2 :
@ -684,7 +727,8 @@ class TextToSpeech:
else :
auto_conditioning , diffusion_conditioning = self . get_random_conditioning_latents ( )
diffuser = load_discrete_vocoder_diffuser ( desired_diffusion_steps = diffusion_iterations , cond_free = cond_free , cond_free_k = cond_free_k )
diffuser = load_discrete_vocoder_diffuser ( desired_diffusion_steps = diffusion_iterations , cond_free = cond_free ,
cond_free_k = cond_free_k )
self . autoregressive_batch_size = get_device_batch_size ( ) if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
@ -696,12 +740,12 @@ class TextToSpeech:
stop_mel_token = self . autoregressive . stop_mel_token
calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
auto_conditioning = migrate_to_device ( auto_conditioning , self . device )
text_tokens = migrate_to_device ( text_tokens , self . device )
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
auto_conditioning = migrate_to_device ( auto_conditioning , self . device )
text_tokens = migrate_to_device ( text_tokens , self . device )
with torch . autocast ( device_type = ' cuda ' , dtype = torch . float16 , enabled = half_p ) :
for b in tqdm ( range ( num_batches ) , desc = " Generating autoregressive samples " ):
for b in tqdm ( range ( num_batches ) , desc = " Generating autoregressive samples " , disable = not verbose ):
check_for_kill_signal ( )
codes = self . autoregressive . inference_speech ( auto_conditioning , text_tokens ,
do_sample = True ,
@ -717,76 +761,75 @@ class TextToSpeech:
samples . append ( codes )
if not self . preloaded_tensors :
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
if self . unsqueeze_sample_batches :
new_samples = [ ]
for batch in samples :
for i in range ( batch . shape [ 0 ] ) :
for i in range ( batch . shape [ 0 ] ) :
new_samples . append ( batch [ i ] . unsqueeze ( 0 ) )
samples = new_samples
clip_results = [ ]
if auto_conds is not None :
auto_conditioning = migrate_to_device ( auto_conditioning , self . device )
auto_conditioning = migrate_to_device ( auto_conditioning , self . device )
with torch . autocast ( device_type = ' cuda ' , dtype = torch . float16 , enabled = half_p ) :
if not self . preloaded_tensors :
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
self . clvp = migrate_to_device ( self . clvp , self . device )
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
self . clvp = migrate_to_device ( self . clvp , self . device )
if cvvp_amount > 0 :
if self . cvvp is None :
self . load_cvvp ( )
if not self . preloaded_tensors :
self . cvvp = migrate_to_device ( self . cvvp , self . device )
desc = " Computing best candidates "
self . cvvp = migrate_to_device ( self . cvvp , self . device )
desc = " Computing best candidates "
if verbose :
if self . cvvp is None :
desc = " Computing best candidates using CLVP "
else :
desc = f " Computing best candidates using CLVP { ( ( 1 - cvvp_amount ) * 100 ) : 2.0f } % and CVVP { ( cvvp_amount * 100 ) : 2.0f } % "
desc = f " Computing best candidates using CLVP { ( ( 1 - cvvp_amount ) * 100 ) : 2.0f } % and CVVP { ( cvvp_amount * 100 ) : 2.0f } % "
for batch in tqdm ( samples , desc = desc ) :
for batch in tqdm ( samples , desc = desc , disable = not verbose ) :
check_for_kill_signal ( )
for i in range ( batch . shape [ 0 ] ) :
batch [ i ] = fix_autoregressive_output ( batch [ i ] , stop_mel_token )
if cvvp_amount != 1 :
clvp = self . clvp ( text_tokens . repeat ( batch . shape [ 0 ] , 1 ) , batch , return_loss = False )
if auto_conds is not None and cvvp_amount > 0 :
cvvp_accumulator = 0
for cl in range ( auto_conds . shape [ 1 ] ) :
cvvp_accumulator = cvvp_accumulator + self . cvvp ( auto_conds [ : , cl ] . repeat ( batch . shape [ 0 ] , 1 , 1 ) , batch , return_loss = False )
cvvp_accumulator = cvvp_accumulator + self . cvvp (
auto_conds [ : , cl ] . repeat ( batch . shape [ 0 ] , 1 , 1 ) , batch , return_loss = False )
cvvp = cvvp_accumulator / auto_conds . shape [ 1 ]
if cvvp_amount == 1 :
clip_results . append ( cvvp )
else :
clip_results . append ( cvvp * cvvp_amount + clvp * ( 1 - cvvp_amount ) )
clip_results . append ( cvvp * cvvp_amount + clvp * ( 1 - cvvp_amount ) )
else :
clip_results . append ( clvp )
if not self . preloaded_tensors and auto_conds is not None :
auto_conds = migrate_to_device ( auto_conds , ' cpu ' )
auto_conds = migrate_to_device ( auto_conds , ' cpu ' )
clip_results = torch . cat ( clip_results , dim = 0 )
samples = torch . cat ( samples , dim = 0 )
best_results = samples [ torch . topk ( clip_results , k = k ) . indices ]
if not self . preloaded_tensors :
self . clvp = migrate_to_device ( self . clvp , ' cpu ' )
self . cvvp = migrate_to_device ( self . cvvp , ' cpu ' )
self . clvp = migrate_to_device ( self . clvp , ' cpu ' )
self . cvvp = migrate_to_device ( self . cvvp , ' cpu ' )
if get_device_name ( ) == " dml " :
text_tokens = migrate_to_device ( text_tokens , ' cpu ' )
best_results = migrate_to_device ( best_results , ' cpu ' )
auto_conditioning = migrate_to_device ( auto_conditioning , ' cpu ' )
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
text_tokens = migrate_to_device ( text_tokens , ' cpu ' )
best_results = migrate_to_device ( best_results , ' cpu ' )
auto_conditioning = migrate_to_device ( auto_conditioning , ' cpu ' )
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
else :
auto_conditioning = auto_conditioning . to ( self . device )
self . autoregressive = self . autoregressive . to ( self . device )
@ -797,24 +840,27 @@ class TextToSpeech:
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
# results, but will increase memory usage.
best_latents = self . autoregressive ( auto_conditioning . repeat ( k , 1 ) , text_tokens . repeat ( k , 1 ) ,
torch . tensor ( [ text_tokens . shape [ - 1 ] ] , device = text_tokens . device ) , best_results ,
torch . tensor ( [ best_results . shape [ - 1 ] * self . autoregressive . mel_length_compression ] , device = text_tokens . device ) ,
torch . tensor ( [ text_tokens . shape [ - 1 ] ] , device = text_tokens . device ) ,
best_results ,
torch . tensor ( [ best_results . shape [
- 1 ] * self . autoregressive . mel_length_compression ] ,
device = text_tokens . device ) ,
return_latent = True , clip_inputs = False )
diffusion_conditioning = migrate_to_device ( diffusion_conditioning , self . device )
diffusion_conditioning = migrate_to_device ( diffusion_conditioning , self . device )
if get_device_name ( ) == " dml " :
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
best_results = migrate_to_device ( best_results , self . device )
best_latents = migrate_to_device ( best_latents , self . device )
self . vocoder = migrate_to_device ( self . vocoder , ' cpu ' )
self . autoregressive = migrate_to_device ( self . autoregressive , self . device )
best_results = migrate_to_device ( best_results , self . device )
best_latents = migrate_to_device ( best_latents , self . device )
self . vocoder = migrate_to_device ( self . vocoder , ' cpu ' )
else :
if not self . preloaded_tensors :
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
self . autoregressive = migrate_to_device ( self . autoregressive , ' cpu ' )
self . diffusion = migrate_to_device ( self . diffusion , self . device )
self . vocoder = migrate_to_device ( self . vocoder , self . device )
self . diffusion = migrate_to_device ( self . diffusion , self . device )
self . vocoder = migrate_to_device ( self . vocoder , self . device )
del text_tokens
del auto_conditioning
@ -835,22 +881,26 @@ class TextToSpeech:
break
mel = do_spectrogram_diffusion ( self . diffusion , diffuser , latents , diffusion_conditioning ,
temperature = diffusion_temperature , desc = " Transforming autoregressive outputs into audio.. " , sampler = diffusion_sampler ,
input_sample_rate = self . input_sample_rate , output_sample_rate = self . output_sample_rate )
temperature = diffusion_temperature ,
desc = " Transforming autoregressive outputs into audio.. " ,
sampler = diffusion_sampler ,
input_sample_rate = self . input_sample_rate ,
output_sample_rate = self . output_sample_rate )
wav = self . vocoder . inference ( mel )
wav_candidates . append ( wav )
if not self . preloaded_tensors :
self . diffusion = migrate_to_device ( self . diffusion , ' cpu ' )
self . vocoder = migrate_to_device ( self . vocoder , ' cpu ' )
self . diffusion = migrate_to_device ( self . diffusion , ' cpu ' )
self . vocoder = migrate_to_device ( self . vocoder , ' cpu ' )
def potentially_redact(clip, text):
    """
    Pass a generated clip through the aligner's redaction when redaction is enabled.

    Closure over the enclosing `self`. When `self.enable_redaction` is falsy the clip is returned unchanged.

    :param clip: Generated audio tensor; dimension 1 is squeezed before redaction and restored afterwards.
    :param text: The text handed to `self.aligner.redact` together with the clip.
    :return: The redacted clip, or the original clip when redaction is disabled.
    """
    if not self.enable_redaction:
        return clip

    squeezed = clip.squeeze(1)
    # The redaction input is kept on the CPU when running on the DirectML backend.
    squeezed = migrate_to_device(squeezed, 'cpu' if get_device_name() == "dml" else self.device)
    return self.aligner.redact(squeezed, text, self.output_sample_rate).unsqueeze(1)
wav_candidates = [ potentially_redact ( wav_candidate , text ) for wav_candidate in wav_candidates ]
if len ( wav_candidates ) > 1 :
@ -876,4 +926,4 @@ class TextToSpeech:
# Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
# torch.use_deterministic_algorithms(True)
return seed
return seed