Update tortoise/api.py

My changed versions with more presets
commit 9ddfcb57aa (parent d7e6914fb8)
HarkonCollider, 2023-09-09 22:00:21 +00:00


@@ -51,6 +51,7 @@ MODELS = {
     'bigvgan_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.json',
 }
 
+
 def hash_file(path, algo="md5", buffer_size=0):
     import hashlib
@@ -77,12 +78,14 @@ def hash_file(path, algo="md5", buffer_size=0):
     return "{0}".format(hash.hexdigest())
 
+
 def check_for_kill_signal():
     global STOP_SIGNAL
 
     if STOP_SIGNAL:
         STOP_SIGNAL = False
         raise Exception("Kill signal detected")
 
+
 def download_models(specific_models=None):
     """
     Call to download all the models that Tortoise uses.
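
A quick usage sketch for hash_file, not part of this commit (the checkpoint path is hypothetical): the model loaders further down hash a checkpoint so that re-selecting the same weights skips a reload.

    from tortoise.api import hash_file

    print(hash_file("models/autoregressive.pth"))          # md5 by default
    print(hash_file("models/autoregressive.pth", "sha1"))  # algo selects the digest; "sha1" assumed supported
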
@@ -102,6 +105,7 @@ def download_models(specific_models=None):
         else:
             pbar.finish()
             pbar = None
+
     for model_name, url in MODELS.items():
         if specific_models is not None and model_name not in specific_models:
             continue
@@ -112,7 +116,7 @@ def download_models(specific_models=None):
         proxy = ProxyHandler({})
         opener = build_opener(proxy)
-        opener.addheaders = [('User-Agent','mrq/AI-Voice-Cloning')]
+        opener.addheaders = [('User-Agent', 'mrq/AI-Voice-Cloning')]
         install_opener(opener)
         request.urlretrieve(url, model_path, show_progress)
     print('Done.')
@@ -137,19 +141,23 @@ def pad_or_truncate(t, length):
     if t.shape[-1] == length:
         return t
     elif t.shape[-1] < length:
-        return F.pad(t, (0, length-t.shape[-1]))
+        return F.pad(t, (0, length - t.shape[-1]))
     else:
         return t[..., :length]
 
-def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True, cond_free_k=1):
+
+def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True,
+                                   cond_free_k=1):
     """
     Helper function to load a GaussianDiffusion instance configured for use as a vocoder.
     """
-    return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), model_mean_type='epsilon',
-                           model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
+    return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]),
+                           model_mean_type='epsilon',
+                           model_var_type='learned_range', loss_type='mse',
+                           betas=get_named_beta_schedule('linear', trained_diffusion_steps),
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
+
 @torch.inference_mode()
 def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     """
@@ -165,6 +173,7 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     mel_clip = mel_clip.unsqueeze(0)
     return migrate_to_device(mel_clip, device)
 
+
 def fix_autoregressive_output(codes, stop_token, complain=True):
     """
     This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
@@ -194,23 +203,27 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
+
 @torch.inference_mode()
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True,
+                             desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
-        output_seq_len = latents.shape[1] * 4 * output_sample_rate // input_sample_rate # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_seq_len = latents.shape[
+                             1] * 4 * output_sample_rate // input_sample_rate # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
         output_shape = (latents.shape[0], 100, output_seq_len)
-        precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
+        precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len,
+                                                                      False)
 
         noise = torch.randn(output_shape, device=latents.device) * temperature
         diffuser.sampler = sampler.lower()
         mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise,
                                    model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc)
 
-        mel = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
+        mel = denormalize_tacotron_mel(mel)[:, :, :output_seq_len]
         if get_device_name() == "dml":
             mel = mel.cpu()
         return mel
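
The reflowed output_seq_len expression keeps the original arithmetic: the factor of 4 expands each latent frame into output mel frames at the 22.05 kHz basis, and the 24000/22050 ratio rescales that to the output rate. A worked check with a hypothetical latent length:

    latent_frames = 100
    output_seq_len = latent_frames * 4 * 24000 // 22050
    print(output_seq_len)  # 435 - the generated mel is later trimmed back to this length
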
@@ -230,7 +243,8 @@ def classify_audio_clip(clip):
     results = F.softmax(classifier(clip), dim=-1)
     return results[0][0]
 
-def migrate_to_device( t, device ):
+
+def migrate_to_device(t, device):
     if t is None:
         return t
@@ -249,18 +263,18 @@ def migrate_to_device( t, device ):
     return t
 
+
 class TextToSpeech:
     """
     Main entry point into Tortoise.
     """
 
     def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None,
                  minor_optimizations=True,
                  unsqueeze_sample_batches=False,
                  input_sample_rate=22050, output_sample_rate=24000,
                  autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None,
-                 # ):
-                 use_deepspeed=False): # Add use_deepspeed parameter
+                 ):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -276,17 +290,16 @@ class TextToSpeech:
         if device is None:
             device = get_device(verbose=True)
 
-        self.version = [2,4,4] # to-do, autograb this from setup.py, or have setup.py autograb this
+        self.version = [2, 4, 4] # to-do, autograb this from setup.py, or have setup.py autograb this
         self.input_sample_rate = input_sample_rate
         self.output_sample_rate = output_sample_rate
         self.minor_optimizations = minor_optimizations
         self.unsqueeze_sample_batches = unsqueeze_sample_batches
-        self.use_deepspeed = use_deepspeed # Store use_deepspeed as an instance variable
-        print(f'use_deepspeed api_debug {use_deepspeed}')
 
         # for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations
         self.preloaded_tensors = minor_optimizations
         self.use_kv_cache = minor_optimizations
         if get_device_name() == "dml": # does not work with DirectML
             print("KV caching requested but not supported with the DirectML backend, disabling...")
             self.use_kv_cache = False
@@ -315,13 +328,12 @@ class TextToSpeech:
             self.load_diffusion_model(diffusion_model_path)
 
         self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
                          text_seq_len=350, text_heads=12,
                          num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
         self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
         self.cvvp = None # CVVP model is only loaded if used.
 
         self.vocoder_model = vocoder_model
         self.load_vocoder_model(self.vocoder_model)
@@ -331,21 +343,23 @@ class TextToSpeech:
         self.rlg_diffusion = None
         if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.clvp = migrate_to_device( self.clvp, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
+            self.clvp = migrate_to_device(self.clvp, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         self.loading = False
 
     def load_autoregressive_model(self, autoregressive_model_path):
-        if hasattr(self,"autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path, autoregressive_model_path):
+        if hasattr(self, "autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path,
+                                                                           autoregressive_model_path):
             return
 
-        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
+        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(
+            autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
         new_hash = hash_file(self.autoregressive_model_path)
 
-        if hasattr(self,"autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
+        if hasattr(self, "autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
             return
 
         self.autoregressive_model_hash = new_hash
@@ -356,42 +370,44 @@ class TextToSpeech:
         if hasattr(self, 'autoregressive'):
             del self.autoregressive
 
-        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                           model_dim=1024,
-                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                           train_solo_embeddings=False).cpu().eval()
+        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2,
+                                           layers=30,
+                                           model_dim=1024,
+                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
+                                           train_solo_embeddings=False).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(self.autoregressive_model_path))
-        self.autoregressive.post_init_gpt2_config(use_deepspeed=self.use_deepspeed, kv_cache=self.use_kv_cache)
+        self.autoregressive.post_init_gpt2_config(kv_cache=self.use_kv_cache)
         if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
 
         self.loading = False
         print(f"Loaded autoregressive model")
 
     def load_diffusion_model(self, diffusion_model_path):
-        if hasattr(self,"diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
+        if hasattr(self, "diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
             return
 
         self.loading = True
-        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
+        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(
+            diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
         self.diffusion_model_hash = hash_file(self.diffusion_model_path)
 
         if hasattr(self, 'diffusion'):
             del self.diffusion
 
         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
                                       in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
                                       layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', self.models_dir)))
         if self.preloaded_tensors:
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
 
         self.loading = False
         print(f"Loaded diffusion model")
 
     def load_vocoder_model(self, vocoder_model):
-        if hasattr(self,"vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
+        if hasattr(self, "vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
             return
 
         self.loading = True
@@ -415,27 +431,30 @@ class TextToSpeech:
             vocoder_config = get_model_path(vocoder_config, self.models_dir)
             self.vocoder = BigVGAN(config=vocoder_config).cpu()
-        #elif vocoder_model == "univnet":
+        # elif vocoder_model == "univnet":
         else:
             vocoder_key = 'model_g'
             self.vocoder_model_path = 'vocoder.pth'
             self.vocoder = UnivNetGenerator().cpu()
 
         print(f"Loading vocoder model: {self.vocoder_model_path}")
-        self.vocoder.load_state_dict(torch.load(get_model_path(self.vocoder_model_path, self.models_dir), map_location=torch.device('cpu'))[vocoder_key])
+        self.vocoder.load_state_dict(
+            torch.load(get_model_path(self.vocoder_model_path, self.models_dir), map_location=torch.device('cpu'))[
+                vocoder_key])
         self.vocoder.eval(inference=True)
 
         if self.preloaded_tensors:
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         self.loading = False
         print(f"Loaded vocoder model")
 
     def load_tokenizer_json(self, tokenizer_json):
-        if hasattr(self,"tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
+        if hasattr(self, "tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
             return
 
         self.loading = True
-        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
+        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
         print("Loading tokenizer JSON:", self.tokenizer_json)
 
         if hasattr(self, 'tokenizer'):
@@ -448,20 +467,32 @@ class TextToSpeech:
     def load_cvvp(self):
         """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
+        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8,
+                         cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
         self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
 
         if self.preloaded_tensors:
-            self.cvvp = migrate_to_device( self.cvvp, self.device )
+            self.cvvp = migrate_to_device(self.cvvp, self.device)
 
     @torch.inference_mode()
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
+    def get_conditioning_latents(
+            self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False,
+            original_ar=False, original_diffusion=False
+    ):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
         properties.
-        :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
+        :param force_cpu:
+        :param max_chunk_size:
+        :param slices:
+        :param verbose:
+        :param return_mels:
+        :param original_diffusion:
+        :param original_ar:
+        :param voice_samples: List of 2 or more ~10 second reference clips,
+            which should be torch tensors containing 22.05kHz waveform data.
         """
         with torch.no_grad():
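
A usage sketch, not part of this commit (the clip paths are hypothetical): with return_mels=False the method returns a 2-tuple, which is exactly the len(latent_tuple) == 2 case tts() handles below, so latents can be computed once and reused across calls.

    from tortoise.api import TextToSpeech
    from tortoise.utils.audio import load_audio

    tts = TextToSpeech()
    clips = [load_audio(p, 22050) for p in ("ref_a.wav", "ref_b.wav")]
    latents = tts.get_conditioning_latents(clips)  # (auto_latent, diffusion_latent)
    wav = tts.tts_with_preset("Reusing cached latents.", preset="fast",
                              conditioning_latents=latents)
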
@@ -491,7 +522,7 @@ class TextToSpeech:
                 beta=8.555504641634386,
             ).to(device)
 
             voice_samples = [migrate_to_device(v, device) for v in voice_samples]
 
             auto_conds = []
             diffusion_conds = []
@@ -499,7 +530,8 @@ class TextToSpeech:
             if original_ar:
                 samples = [resampler_22K(sample) for sample in voice_samples]
                 for sample in tqdm(samples, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate, cond_length=132300))
+                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate,
+                                                          cond_length=132300))
             else:
                 samples = [resampler_22K(sample) for sample in voice_samples]
                 concat = torch.cat(samples, dim=-1)
@@ -516,32 +548,35 @@ class TextToSpeech:
                 chunk_size = chunks[0].shape[-1]
 
                 for chunk in tqdm(chunks, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
+                    auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate,
+                                                          cond_length=chunk_size))
 
             if original_diffusion:
                 samples = [resampler_24K(sample) for sample in voice_samples]
                 for sample in tqdm(samples, desc="Computing diffusion conditioning latents..."):
                     sample = pad_or_truncate(sample, 102400)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False, device=self.device)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False,
+                                                  device=self.device)
                     diffusion_conds.append(cond_mel)
             else:
                 samples = [resampler_24K(sample) for sample in voice_samples]
                 for chunk in tqdm(chunks, desc="Computing diffusion conditioning latents..."):
                     check_for_kill_signal()
                     chunk = pad_or_truncate(chunk, chunk_size)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(chunk, device), do_normalization=False,
+                                                  device=device)
                     diffusion_conds.append(cond_mel)
 
             auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = migrate_to_device( self.autoregressive, device )
+            self.autoregressive = migrate_to_device(self.autoregressive, device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive,
+                                                    self.device if self.preloaded_tensors else 'cpu')
 
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
-            self.diffusion = migrate_to_device( self.diffusion, device )
+            self.diffusion = migrate_to_device(self.diffusion, device)
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = migrate_to_device( self.diffusion, self.device if self.preloaded_tensors else 'cpu' )
+            self.diffusion = migrate_to_device(self.diffusion, self.device if self.preloaded_tensors else 'cpu')
 
         if return_mels:
             return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@@ -552,9 +587,11 @@ class TextToSpeech:
         # Lazy-load the RLG models.
         if self.rlg_auto is None:
             self.rlg_auto = RandomLatentConverter(1024).eval()
-            self.rlg_auto.load_state_dict(torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
+            self.rlg_auto.load_state_dict(
+                torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
             self.rlg_diffusion = RandomLatentConverter(2048).eval()
-            self.rlg_diffusion.load_state_dict(torch.load(get_model_path('rlg_diffuser.pth', self.models_dir), map_location=torch.device('cpu')))
+            self.rlg_diffusion.load_state_dict(
+                torch.load(get_model_path('rlg_diffuser.pth', self.models_dir), map_location=torch.device('cpu')))
         with torch.no_grad():
             return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
@@ -576,16 +613,19 @@ class TextToSpeech:
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
+            'narration': {'num_autoregressive_samples': 30, 'diffusion_iterations': 80, "diffusion_sampler": "DDIM"},
+            'dialogue': {'num_autoregressive_samples': 60, 'diffusion_iterations': 120, "diffusion_sampler": "DDIM"}
         }
         settings.update(presets[preset])
         settings.update(kwargs) # allow overriding of preset settings with kwargs
         return self.tts(text, **settings)
 
     @torch.inference_mode()
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
             return_deterministic_state=False,
             # autoregressive generation parameters follow
-            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
+            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8,
+            max_mel_tokens=500,
             sample_batch_size=None,
             autoregressive_model=None,
             diffusion_model=None,
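
The two added presets are the point of the commit: far fewer autoregressive samples than 'standard' (30 and 60 versus 256), moderate diffusion iteration counts, and the DDIM sampler, trading candidate diversity for turnaround time. A usage sketch, not part of this commit (the voice name is illustrative):

    import torchaudio
    from tortoise.api import TextToSpeech
    from tortoise.utils.audio import load_voice

    tts = TextToSpeech()
    voice_samples, conditioning_latents = load_voice("train_dotrice")
    for preset in ("narration", "dialogue"):
        wav = tts.tts_with_preset("A quick test line.", preset=preset,
                                  voice_samples=voice_samples,
                                  conditioning_latents=conditioning_latents)
        torchaudio.save(f"out_{preset}.wav", wav.squeeze(0).cpu(), 24000)
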
@@ -667,14 +707,17 @@ class TextToSpeech:
             self.load_tokenizer_json(tokenizer_json)
 
         text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0)
-        text_tokens = migrate_to_device( text_tokens, self.device )
+        text_tokens = migrate_to_device(text_tokens, self.device)
         text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.
-        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        assert text_tokens.shape[
+                   -1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
 
         auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True, verbose=True)
+            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples,
+                                                                                                     return_mels=True,
+                                                                                                     verbose=True)
         elif conditioning_latents is not None:
             latent_tuple = conditioning_latents
             if len(latent_tuple) == 2:
@@ -684,7 +727,8 @@ class TextToSpeech:
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
 
-        diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
+        diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free,
+                                                  cond_free_k=cond_free_k)
 
         self.autoregressive_batch_size = get_device_batch_size() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
@@ -696,12 +740,12 @@ class TextToSpeech:
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
 
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            auto_conditioning = migrate_to_device( auto_conditioning, self.device )
-            text_tokens = migrate_to_device( text_tokens, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            auto_conditioning = migrate_to_device(auto_conditioning, self.device)
+            text_tokens = migrate_to_device(text_tokens, self.device)
 
             with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
-                for b in tqdm(range(num_batches), desc="Generating autoregressive samples"):
+                for b in tqdm(range(num_batches), desc="Generating autoregressive samples", disable=not verbose):
                     check_for_kill_signal()
                     codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                  do_sample=True,
@@ -717,40 +761,39 @@ class TextToSpeech:
                     samples.append(codes)
 
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
 
         if self.unsqueeze_sample_batches:
             new_samples = []
             for batch in samples:
                 for i in range(batch.shape[0]):
                     new_samples.append(batch[i].unsqueeze(0))
             samples = new_samples
 
         clip_results = []
         if auto_conds is not None:
-            auto_conditioning = migrate_to_device( auto_conditioning, self.device )
+            auto_conditioning = migrate_to_device(auto_conditioning, self.device)
 
         with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-                self.clvp = migrate_to_device( self.clvp, self.device )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
+                self.clvp = migrate_to_device(self.clvp, self.device)
 
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
 
                 if not self.preloaded_tensors:
-                    self.cvvp = migrate_to_device( self.cvvp, self.device )
+                    self.cvvp = migrate_to_device(self.cvvp, self.device)
 
-            desc="Computing best candidates"
+            desc = "Computing best candidates"
             if verbose:
                 if self.cvvp is None:
                     desc = "Computing best candidates using CLVP"
                 else:
-                    desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+                    desc = f"Computing best candidates using CLVP {((1 - cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
 
-            for batch in tqdm(samples, desc=desc):
+            for batch in tqdm(samples, desc=desc, disable=not verbose):
                 check_for_kill_signal()
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
@@ -761,32 +804,32 @@ class TextToSpeech:
                 if auto_conds is not None and cvvp_amount > 0:
                     cvvp_accumulator = 0
                     for cl in range(auto_conds.shape[1]):
-                        cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                        cvvp_accumulator = cvvp_accumulator + self.cvvp(
+                            auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
                     cvvp = cvvp_accumulator / auto_conds.shape[1]
                     if cvvp_amount == 1:
                         clip_results.append(cvvp)
                     else:
-                        clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
+                        clip_results.append(cvvp * cvvp_amount + clvp * (1 - cvvp_amount))
                 else:
                     clip_results.append(clvp)
 
         if not self.preloaded_tensors and auto_conds is not None:
-            auto_conds = migrate_to_device( auto_conds, 'cpu' )
+            auto_conds = migrate_to_device(auto_conds, 'cpu')
 
         clip_results = torch.cat(clip_results, dim=0)
         samples = torch.cat(samples, dim=0)
         best_results = samples[torch.topk(clip_results, k=k).indices]
 
         if not self.preloaded_tensors:
-            self.clvp = migrate_to_device( self.clvp, 'cpu' )
-            self.cvvp = migrate_to_device( self.cvvp, 'cpu' )
+            self.clvp = migrate_to_device(self.clvp, 'cpu')
+            self.cvvp = migrate_to_device(self.cvvp, 'cpu')
 
         if get_device_name() == "dml":
-            text_tokens = migrate_to_device( text_tokens, 'cpu' )
-            best_results = migrate_to_device( best_results, 'cpu' )
-            auto_conditioning = migrate_to_device( auto_conditioning, 'cpu' )
-            self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+            text_tokens = migrate_to_device(text_tokens, 'cpu')
+            best_results = migrate_to_device(best_results, 'cpu')
+            auto_conditioning = migrate_to_device(auto_conditioning, 'cpu')
+            self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
         else:
             auto_conditioning = auto_conditioning.to(self.device)
             self.autoregressive = self.autoregressive.to(self.device)
@@ -797,23 +840,26 @@ class TextToSpeech:
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
         best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                           torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                           torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
+                                           best_results,
+                                           torch.tensor([best_results.shape[
+                                                             -1] * self.autoregressive.mel_length_compression],
+                                                        device=text_tokens.device),
                                            return_latent=True, clip_inputs=False)
 
-        diffusion_conditioning = migrate_to_device( diffusion_conditioning, self.device )
+        diffusion_conditioning = migrate_to_device(diffusion_conditioning, self.device)
 
         if get_device_name() == "dml":
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            best_results = migrate_to_device( best_results, self.device )
-            best_latents = migrate_to_device( best_latents, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            best_results = migrate_to_device(best_results, self.device)
+            best_latents = migrate_to_device(best_latents, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, 'cpu')
         else:
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         del text_tokens
         del auto_conditioning
@@ -835,22 +881,26 @@ class TextToSpeech:
                         break
 
                 mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                                               temperature=diffusion_temperature, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
-                                               input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
+                                               temperature=diffusion_temperature,
+                                               desc="Transforming autoregressive outputs into audio..",
+                                               sampler=diffusion_sampler,
+                                               input_sample_rate=self.input_sample_rate,
+                                               output_sample_rate=self.output_sample_rate)
 
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav)
 
         if not self.preloaded_tensors:
-            self.diffusion = migrate_to_device( self.diffusion, 'cpu' )
-            self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+            self.diffusion = migrate_to_device(self.diffusion, 'cpu')
+            self.vocoder = migrate_to_device(self.vocoder, 'cpu')
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
                 t = clip.squeeze(1)
-                t = migrate_to_device( t, 'cpu' if get_device_name() == "dml" else self.device)
+                t = migrate_to_device(t, 'cpu' if get_device_name() == "dml" else self.device)
                 return self.aligner.redact(t, text, self.output_sample_rate).unsqueeze(1)
             return clip
 
         wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
 
         if len(wav_candidates) > 1: