Update tortoise/api.py
My changed version with more presets: adds 'narration' and 'dialogue', drops the use_deepspeed plumbing, and reformats api.py toward PEP8.
This commit is contained in:
parent d7e6914fb8
commit 9ddfcb57aa

tortoise/api.py (266 lines changed)
@@ -51,6 +51,7 @@ MODELS = {
     'bigvgan_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.json',
 }
 
+
 def hash_file(path, algo="md5", buffer_size=0):
     import hashlib
 
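hash_file is used later in this file to detect when a checkpoint on disk has changed between loads (load_autoregressive_model compares hashes before reloading). A minimal usage sketch, with a placeholder path:

from tortoise.api import hash_file

checkpoint_hash = hash_file('./models/autoregressive.pth')  # md5 by default
print(checkpoint_hash)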
@@ -77,12 +78,14 @@ def hash_file(path, algo="md5", buffer_size=0):
 
     return "{0}".format(hash.hexdigest())
 
+
 def check_for_kill_signal():
     global STOP_SIGNAL
     if STOP_SIGNAL:
         STOP_SIGNAL = False
         raise Exception("Kill signal detected")
 
+
 def download_models(specific_models=None):
     """
     Call to download all the models that Tortoise uses.
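Per the docstring, a bare call fetches every checkpoint listed in MODELS, while specific_models filters by key (see the loop in the next hunk). A small usage sketch; the two keys shown are ones that appear elsewhere in this file:

from tortoise.api import download_models

download_models()  # fetch everything listed in MODELS
download_models(specific_models=['autoregressive.pth', 'diffusion_decoder.pth'])  # or only these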
@@ -102,6 +105,7 @@ def download_models(specific_models=None):
         else:
             pbar.finish()
             pbar = None
+
     for model_name, url in MODELS.items():
         if specific_models is not None and model_name not in specific_models:
             continue
@@ -112,7 +116,7 @@ def download_models(specific_models=None):
 
         proxy = ProxyHandler({})
         opener = build_opener(proxy)
-        opener.addheaders = [('User-Agent','mrq/AI-Voice-Cloning')]
+        opener.addheaders = [('User-Agent', 'mrq/AI-Voice-Cloning')]
         install_opener(opener)
         request.urlretrieve(url, model_path, show_progress)
         print('Done.')
@@ -137,19 +141,23 @@ def pad_or_truncate(t, length):
     if t.shape[-1] == length:
         return t
     elif t.shape[-1] < length:
-        return F.pad(t, (0, length-t.shape[-1]))
+        return F.pad(t, (0, length - t.shape[-1]))
     else:
         return t[..., :length]
 
 
-def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True, cond_free_k=1):
+def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True,
+                                   cond_free_k=1):
     """
     Helper function to load a GaussianDiffusion instance configured for use as a vocoder.
     """
-    return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), model_mean_type='epsilon',
-                           model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
+    return SpacedDiffusion(use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]),
+                           model_mean_type='epsilon',
+                           model_var_type='learned_range', loss_type='mse',
+                           betas=get_named_beta_schedule('linear', trained_diffusion_steps),
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
 
 @torch.inference_mode()
 def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     """
@@ -165,6 +173,7 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     mel_clip = mel_clip.unsqueeze(0)
     return migrate_to_device(mel_clip, device)
 
+
 def fix_autoregressive_output(codes, stop_token, complain=True):
     """
     This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
@@ -194,23 +203,27 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
 
     return codes
 
 
 @torch.inference_mode()
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True,
+                             desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
-        output_seq_len = latents.shape[1] * 4 * output_sample_rate // input_sample_rate # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_seq_len = latents.shape[
+                             1] * 4 * output_sample_rate // input_sample_rate  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
         output_shape = (latents.shape[0], 100, output_seq_len)
-        precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
+        precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len,
+                                                                      False)
 
         noise = torch.randn(output_shape, device=latents.device) * temperature
 
         diffuser.sampler = sampler.lower()
         mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise,
                                    model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc)
 
-    mel = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
+    mel = denormalize_tacotron_mel(mel)[:, :, :output_seq_len]
     if get_device_name() == "dml":
         mel = mel.cpu()
     return mel
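To make the output length arithmetic above concrete: each latent frame maps to four mel frames at the 22.05kHz code rate, rescaled to the 24kHz output rate. A worked example with an illustrative frame count:

latent_frames = 100  # stands in for latents.shape[1]
output_seq_len = latent_frames * 4 * 24000 // 22050
print(output_seq_len)  # -> 435 output mel frames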
@@ -230,7 +243,8 @@ def classify_audio_clip(clip):
     results = F.softmax(classifier(clip), dim=-1)
     return results[0][0]
 
-def migrate_to_device( t, device ):
+
+def migrate_to_device(t, device):
     if t is None:
         return t
 
@@ -249,18 +263,18 @@ def migrate_to_device( t, device ):
 
     return t
 
 
 class TextToSpeech:
     """
     Main entry point into Tortoise.
     """
 
     def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None,
                  minor_optimizations=True,
                  unsqueeze_sample_batches=False,
                  input_sample_rate=22050, output_sample_rate=24000,
                  autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None,
-                 # ):
-                 use_deepspeed=False): # Add use_deepspeed parameter
+                 ):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
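For reference, a minimal construction sketch built only from the keyword arguments visible in this signature (the values shown are the defaults above):

tts = TextToSpeech(
    minor_optimizations=True,        # also drives preloaded_tensors and use_kv_cache below
    unsqueeze_sample_batches=False,
    input_sample_rate=22050,
    output_sample_rate=24000,
    autoregressive_model_path=None,  # None falls back to get_model_path('autoregressive.pth', models_dir)
    diffusion_model_path=None,
    vocoder_model=None,
    tokenizer_json=None,
)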
@@ -276,17 +290,16 @@ class TextToSpeech:
         if device is None:
             device = get_device(verbose=True)
 
-        self.version = [2,4,4] # to-do, autograb this from setup.py, or have setup.py autograb this
+        self.version = [2, 4, 4]  # to-do, autograb this from setup.py, or have setup.py autograb this
         self.input_sample_rate = input_sample_rate
         self.output_sample_rate = output_sample_rate
         self.minor_optimizations = minor_optimizations
         self.unsqueeze_sample_batches = unsqueeze_sample_batches
-        self.use_deepspeed = use_deepspeed # Store use_deepspeed as an instance variable
-        print(f'use_deepspeed api_debug {use_deepspeed}')
         # for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations
         self.preloaded_tensors = minor_optimizations
         self.use_kv_cache = minor_optimizations
         if get_device_name() == "dml": # does not work with DirectML
             print("KV caching requested but not supported with the DirectML backend, disabling...")
             self.use_kv_cache = False
 
@@ -315,13 +328,12 @@ class TextToSpeech:
 
         self.load_diffusion_model(diffusion_model_path)
 
-
         self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
                          text_seq_len=350, text_heads=12,
                          num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
                          use_xformers=True).cpu().eval()
         self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
         self.cvvp = None # CVVP model is only loaded if used.
 
         self.vocoder_model = vocoder_model
         self.load_vocoder_model(self.vocoder_model)
@@ -331,21 +343,23 @@ class TextToSpeech:
         self.rlg_diffusion = None
 
         if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.clvp = migrate_to_device( self.clvp, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
+            self.clvp = migrate_to_device(self.clvp, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         self.loading = False
 
     def load_autoregressive_model(self, autoregressive_model_path):
-        if hasattr(self,"autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path, autoregressive_model_path):
+        if hasattr(self, "autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path,
+                                                                           autoregressive_model_path):
             return
 
-        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
+        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(
+            autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
         new_hash = hash_file(self.autoregressive_model_path)
 
-        if hasattr(self,"autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
+        if hasattr(self, "autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
             return
 
         self.autoregressive_model_hash = new_hash
@@ -356,42 +370,44 @@ class TextToSpeech:
         if hasattr(self, 'autoregressive'):
             del self.autoregressive
 
-        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
-                                           model_dim=1024,
-                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                           train_solo_embeddings=False).cpu().eval()
+        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2,
+                                           layers=30,
+                                           model_dim=1024,
+                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
+                                           train_solo_embeddings=False).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(self.autoregressive_model_path))
-        self.autoregressive.post_init_gpt2_config(use_deepspeed=self.use_deepspeed, kv_cache=self.use_kv_cache)
+        self.autoregressive.post_init_gpt2_config(kv_cache=self.use_kv_cache)
         if self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
 
         self.loading = False
         print(f"Loaded autoregressive model")
 
     def load_diffusion_model(self, diffusion_model_path):
-        if hasattr(self,"diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
+        if hasattr(self, "diffusion_model_path") and os.path.samefile(self.diffusion_model_path, diffusion_model_path):
             return
 
         self.loading = True
 
-        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
+        self.diffusion_model_path = diffusion_model_path if diffusion_model_path and os.path.exists(
+            diffusion_model_path) else get_model_path('diffusion_decoder.pth', self.models_dir)
         self.diffusion_model_hash = hash_file(self.diffusion_model_path)
 
         if hasattr(self, 'diffusion'):
             del self.diffusion
 
         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
                                       in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
                                       layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', self.models_dir)))
         if self.preloaded_tensors:
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
 
         self.loading = False
         print(f"Loaded diffusion model")
 
     def load_vocoder_model(self, vocoder_model):
-        if hasattr(self,"vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
+        if hasattr(self, "vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
             return
 
         self.loading = True
@@ -415,27 +431,30 @@ class TextToSpeech:
             vocoder_config = get_model_path(vocoder_config, self.models_dir)
 
             self.vocoder = BigVGAN(config=vocoder_config).cpu()
-        #elif vocoder_model == "univnet":
+        # elif vocoder_model == "univnet":
         else:
             vocoder_key = 'model_g'
             self.vocoder_model_path = 'vocoder.pth'
             self.vocoder = UnivNetGenerator().cpu()
 
         print(f"Loading vocoder model: {self.vocoder_model_path}")
-        self.vocoder.load_state_dict(torch.load(get_model_path(self.vocoder_model_path, self.models_dir), map_location=torch.device('cpu'))[vocoder_key])
+        self.vocoder.load_state_dict(
+            torch.load(get_model_path(self.vocoder_model_path, self.models_dir), map_location=torch.device('cpu'))[
+                vocoder_key])
 
         self.vocoder.eval(inference=True)
         if self.preloaded_tensors:
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
         self.loading = False
         print(f"Loaded vocoder model")
 
     def load_tokenizer_json(self, tokenizer_json):
-        if hasattr(self,"tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
+        if hasattr(self, "tokenizer_json") and os.path.samefile(self.tokenizer_json, tokenizer_json):
             return
 
         self.loading = True
-        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
+        self.tokenizer_json = tokenizer_json if tokenizer_json else os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')
         print("Loading tokenizer JSON:", self.tokenizer_json)
 
         if hasattr(self, 'tokenizer'):
@@ -448,20 +467,32 @@ class TextToSpeech:
 
     def load_cvvp(self):
         """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
+        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8,
+                         cond_mask_percentage=0,
                          speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
         self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
 
         if self.preloaded_tensors:
-            self.cvvp = migrate_to_device( self.cvvp, self.device )
+            self.cvvp = migrate_to_device(self.cvvp, self.device)
 
     @torch.inference_mode()
-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
+    def get_conditioning_latents(
+            self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False,
+            original_ar=False, original_diffusion=False
+    ):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
         properties.
-        :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
+        :param force_cpu:
+        :param max_chunk_size:
+        :param slices:
+        :param verbose:
+        :param return_mels:
+        :param original_diffusion:
+        :param original_ar:
+        :param voice_samples: List of 2 or more ~10 second reference clips,
+            which should be torch tensors containing 22.05kHz waveform data.
         """
 
         with torch.no_grad():
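A usage sketch for the reworked signature above, assuming a TextToSpeech instance tts as constructed earlier; load_audio from tortoise.utils.audio is assumed as the clip loader, and the file names are placeholders:

from tortoise.utils.audio import load_audio

clips = [load_audio('ref_a.wav', 22050), load_audio('ref_b.wav', 22050)]

# The default call returns the two latents; return_mels=True additionally
# returns the stacked conditioning mels, which is how tts() calls it below.
auto_latent, diffusion_latent = tts.get_conditioning_latents(clips)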
@@ -491,7 +522,7 @@ class TextToSpeech:
                 beta=8.555504641634386,
             ).to(device)
 
             voice_samples = [migrate_to_device(v, device) for v in voice_samples]
 
             auto_conds = []
             diffusion_conds = []
@@ -499,7 +530,8 @@ class TextToSpeech:
             if original_ar:
                 samples = [resampler_22K(sample) for sample in voice_samples]
                 for sample in tqdm(samples, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate, cond_length=132300))
+                    auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate,
+                                                          cond_length=132300))
             else:
                 samples = [resampler_22K(sample) for sample in voice_samples]
                 concat = torch.cat(samples, dim=-1)
@@ -516,32 +548,35 @@ class TextToSpeech:
                 chunk_size = chunks[0].shape[-1]
 
                 for chunk in tqdm(chunks, desc="Computing AR conditioning latents..."):
-                    auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
+                    auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate,
+                                                          cond_length=chunk_size))
 
             if original_diffusion:
                 samples = [resampler_24K(sample) for sample in voice_samples]
                 for sample in tqdm(samples, desc="Computing diffusion conditioning latents..."):
                     sample = pad_or_truncate(sample, 102400)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False, device=self.device)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False,
+                                                  device=self.device)
                     diffusion_conds.append(cond_mel)
             else:
                 samples = [resampler_24K(sample) for sample in voice_samples]
                 for chunk in tqdm(chunks, desc="Computing diffusion conditioning latents..."):
                     check_for_kill_signal()
                     chunk = pad_or_truncate(chunk, chunk_size)
-                    cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
+                    cond_mel = wav_to_univnet_mel(migrate_to_device(chunk, device), do_normalization=False,
+                                                  device=device)
                     diffusion_conds.append(cond_mel)
 
             auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = migrate_to_device( self.autoregressive, device )
+            self.autoregressive = migrate_to_device(self.autoregressive, device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive,
+                                                    self.device if self.preloaded_tensors else 'cpu')
 
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
-            self.diffusion = migrate_to_device( self.diffusion, device )
+            self.diffusion = migrate_to_device(self.diffusion, device)
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
-            self.diffusion = migrate_to_device( self.diffusion, self.device if self.preloaded_tensors else 'cpu' )
+            self.diffusion = migrate_to_device(self.diffusion, self.device if self.preloaded_tensors else 'cpu')
 
         if return_mels:
             return auto_latent, diffusion_latent, auto_conds, diffusion_conds
@@ -552,9 +587,11 @@ class TextToSpeech:
         # Lazy-load the RLG models.
         if self.rlg_auto is None:
             self.rlg_auto = RandomLatentConverter(1024).eval()
-            self.rlg_auto.load_state_dict(torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
+            self.rlg_auto.load_state_dict(
+                torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
             self.rlg_diffusion = RandomLatentConverter(2048).eval()
-            self.rlg_diffusion.load_state_dict(torch.load(get_model_path('rlg_diffuser.pth', self.models_dir), map_location=torch.device('cpu')))
+            self.rlg_diffusion.load_state_dict(
+                torch.load(get_model_path('rlg_diffuser.pth', self.models_dir), map_location=torch.device('cpu')))
         with torch.no_grad():
             return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
 
@@ -576,16 +613,19 @@ class TextToSpeech:
             'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
+            'narration': {'num_autoregressive_samples': 30, 'diffusion_iterations': 80, "diffusion_sampler": "DDIM"},
+            'dialogue': {'num_autoregressive_samples': 60, 'diffusion_iterations': 120, "diffusion_sampler": "DDIM"}
         }
         settings.update(presets[preset])
         settings.update(kwargs) # allow overriding of preset settings with kwargs
         return self.tts(text, **settings)
 
     @torch.inference_mode()
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
             return_deterministic_state=False,
             # autoregressive generation parameters follow
-            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
+            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8,
+            max_mel_tokens=500,
             sample_batch_size=None,
             autoregressive_model=None,
             diffusion_model=None,
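The two new presets trade autoregressive sample count and diffusion iterations for speed, and switch the diffusion sampler to DDIM. Because settings.update(kwargs) runs after the preset is applied, any preset field can still be overridden per call. A usage sketch — in upstream tortoise this dict lives in tts_with_preset, which is assumed here:

tts = TextToSpeech()

# 30 AR samples / 80 DDIM iterations for long-form narration.
gen = tts.tts_with_preset("A quick narration test.", voice_samples=clips, preset='narration')

# kwargs win over preset fields, so individual settings remain tunable.
gen = tts.tts_with_preset("Some dialogue.", voice_samples=clips, preset='dialogue',
                          diffusion_iterations=200)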
@@ -667,14 +707,17 @@ class TextToSpeech:
         self.load_tokenizer_json(tokenizer_json)
 
         text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0)
-        text_tokens = migrate_to_device( text_tokens, self.device )
+        text_tokens = migrate_to_device(text_tokens, self.device)
 
         text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.
-        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        assert text_tokens.shape[
+                   -1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
 
         auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True, verbose=True)
+            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples,
+                                                                                                     return_mels=True,
+                                                                                                     verbose=True)
         elif conditioning_latents is not None:
             latent_tuple = conditioning_latents
             if len(latent_tuple) == 2:
@@ -684,7 +727,8 @@ class TextToSpeech:
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
 
-        diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
+        diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free,
+                                                  cond_free_k=cond_free_k)
 
         self.autoregressive_batch_size = get_device_batch_size() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
 
@@ -696,12 +740,12 @@ class TextToSpeech:
         stop_mel_token = self.autoregressive.stop_mel_token
         calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
 
-        self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-        auto_conditioning = migrate_to_device( auto_conditioning, self.device )
-        text_tokens = migrate_to_device( text_tokens, self.device )
+        self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+        auto_conditioning = migrate_to_device(auto_conditioning, self.device)
+        text_tokens = migrate_to_device(text_tokens, self.device)
 
         with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
-            for b in tqdm(range(num_batches), desc="Generating autoregressive samples"):
+            for b in tqdm(range(num_batches), desc="Generating autoregressive samples", disable=not verbose):
                 check_for_kill_signal()
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              do_sample=True,
@@ -717,40 +761,39 @@ class TextToSpeech:
                 samples.append(codes)
 
         if not self.preloaded_tensors:
-            self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
 
         if self.unsqueeze_sample_batches:
             new_samples = []
             for batch in samples:
                 for i in range(batch.shape[0]):
                     new_samples.append(batch[i].unsqueeze(0))
             samples = new_samples
 
         clip_results = []
         if auto_conds is not None:
-            auto_conditioning = migrate_to_device( auto_conditioning, self.device )
+            auto_conditioning = migrate_to_device(auto_conditioning, self.device)
 
         with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
-                self.clvp = migrate_to_device( self.clvp, self.device )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
+                self.clvp = migrate_to_device(self.clvp, self.device)
 
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
 
                 if not self.preloaded_tensors:
-                    self.cvvp = migrate_to_device( self.cvvp, self.device )
+                    self.cvvp = migrate_to_device(self.cvvp, self.device)
 
-            desc="Computing best candidates"
+            desc = "Computing best candidates"
             if verbose:
                 if self.cvvp is None:
                     desc = "Computing best candidates using CLVP"
                 else:
-                    desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+                    desc = f"Computing best candidates using CLVP {((1 - cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
 
-            for batch in tqdm(samples, desc=desc):
+            for batch in tqdm(samples, desc=desc, disable=not verbose):
                 check_for_kill_signal()
                 for i in range(batch.shape[0]):
                     batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
@@ -761,32 +804,32 @@ class TextToSpeech:
                 if auto_conds is not None and cvvp_amount > 0:
                     cvvp_accumulator = 0
                     for cl in range(auto_conds.shape[1]):
-                        cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                        cvvp_accumulator = cvvp_accumulator + self.cvvp(
+                            auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
                     cvvp = cvvp_accumulator / auto_conds.shape[1]
                     if cvvp_amount == 1:
                         clip_results.append(cvvp)
                     else:
-                        clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount))
+                        clip_results.append(cvvp * cvvp_amount + clvp * (1 - cvvp_amount))
                 else:
                     clip_results.append(clvp)
 
            if not self.preloaded_tensors and auto_conds is not None:
-                auto_conds = migrate_to_device( auto_conds, 'cpu' )
+                auto_conds = migrate_to_device(auto_conds, 'cpu')
 
             clip_results = torch.cat(clip_results, dim=0)
             samples = torch.cat(samples, dim=0)
             best_results = samples[torch.topk(clip_results, k=k).indices]
 
         if not self.preloaded_tensors:
-            self.clvp = migrate_to_device( self.clvp, 'cpu' )
-            self.cvvp = migrate_to_device( self.cvvp, 'cpu' )
+            self.clvp = migrate_to_device(self.clvp, 'cpu')
+            self.cvvp = migrate_to_device(self.cvvp, 'cpu')
 
 
         if get_device_name() == "dml":
-            text_tokens = migrate_to_device( text_tokens, 'cpu' )
-            best_results = migrate_to_device( best_results, 'cpu' )
-            auto_conditioning = migrate_to_device( auto_conditioning, 'cpu' )
-            self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+            text_tokens = migrate_to_device(text_tokens, 'cpu')
+            best_results = migrate_to_device(best_results, 'cpu')
+            auto_conditioning = migrate_to_device(auto_conditioning, 'cpu')
+            self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
         else:
             auto_conditioning = auto_conditioning.to(self.device)
             self.autoregressive = self.autoregressive.to(self.device)
@@ -797,23 +840,26 @@ class TextToSpeech:
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
         best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                           torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                           torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                           torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
+                                           best_results,
+                                           torch.tensor([best_results.shape[
+                                                             -1] * self.autoregressive.mel_length_compression],
+                                                        device=text_tokens.device),
                                            return_latent=True, clip_inputs=False)
 
-        diffusion_conditioning = migrate_to_device( diffusion_conditioning, self.device )
+        diffusion_conditioning = migrate_to_device(diffusion_conditioning, self.device)
 
         if get_device_name() == "dml":
-            self.autoregressive = migrate_to_device( self.autoregressive, self.device )
-            best_results = migrate_to_device( best_results, self.device )
-            best_latents = migrate_to_device( best_latents, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+            self.autoregressive = migrate_to_device(self.autoregressive, self.device)
+            best_results = migrate_to_device(best_results, self.device)
+            best_latents = migrate_to_device(best_latents, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, 'cpu')
         else:
             if not self.preloaded_tensors:
-                self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
+                self.autoregressive = migrate_to_device(self.autoregressive, 'cpu')
 
-            self.diffusion = migrate_to_device( self.diffusion, self.device )
-            self.vocoder = migrate_to_device( self.vocoder, self.device )
+            self.diffusion = migrate_to_device(self.diffusion, self.device)
+            self.vocoder = migrate_to_device(self.vocoder, self.device)
 
         del text_tokens
         del auto_conditioning
@@ -835,22 +881,26 @@ class TextToSpeech:
                         break
 
                 mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                                               temperature=diffusion_temperature, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
-                                               input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
+                                               temperature=diffusion_temperature,
+                                               desc="Transforming autoregressive outputs into audio..",
+                                               sampler=diffusion_sampler,
+                                               input_sample_rate=self.input_sample_rate,
+                                               output_sample_rate=self.output_sample_rate)
 
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav)
 
         if not self.preloaded_tensors:
-            self.diffusion = migrate_to_device( self.diffusion, 'cpu' )
-            self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
+            self.diffusion = migrate_to_device(self.diffusion, 'cpu')
+            self.vocoder = migrate_to_device(self.vocoder, 'cpu')
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
                 t = clip.squeeze(1)
-                t = migrate_to_device( t, 'cpu' if get_device_name() == "dml" else self.device)
+                t = migrate_to_device(t, 'cpu' if get_device_name() == "dml" else self.device)
                 return self.aligner.redact(t, text, self.output_sample_rate).unsqueeze(1)
             return clip
 
         wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
 
         if len(wav_candidates) > 1: