Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e650800447 |
@ -7,9 +7,9 @@ progressbar
|
||||
einops
|
||||
unidecode
|
||||
scipy
|
||||
librosa==0.8.1
|
||||
librosa
|
||||
torchaudio
|
||||
threadpoolctl
|
||||
appdirs
|
||||
numpy<=1.23.5
|
||||
numpy
|
||||
numba
|
||||
2
setup.py
2
setup.py
@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
||||
setuptools.setup(
|
||||
name="TorToiSe",
|
||||
packages=setuptools.find_packages(),
|
||||
version="2.4.5",
|
||||
version="2.4.4",
|
||||
author="James Betker",
|
||||
author_email="james@adamant.ai",
|
||||
description="A high quality multi-voice text-to-speech library",
|
||||
|
||||
434
tortoise/api.py
434
tortoise/api.py
@ -29,7 +29,7 @@ from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named
|
||||
from tortoise.utils.tokenizer import VoiceBpeTokenizer
|
||||
from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
|
||||
|
||||
from tortoise.utils.device import get_device, get_device_name, get_device_batch_size, print_stats, do_gc
|
||||
from tortoise.utils.device import get_device, get_device_name, get_device_batch_size
|
||||
|
||||
pbar = None
|
||||
STOP_SIGNAL = False
|
||||
@ -43,12 +43,8 @@ MODELS = {
|
||||
'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
|
||||
'rlg_auto.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth',
|
||||
'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
|
||||
|
||||
'bigvgan_base_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_base_24khz_100band.pth',
|
||||
'bigvgan_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.pth',
|
||||
|
||||
'bigvgan_base_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_base_24khz_100band.json',
|
||||
'bigvgan_24khz_100band.json': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.json',
|
||||
#'bigvgan_24khz_100band.pth': 'https://huggingface.co/ecker/tortoise-tts-models/resolve/main/models/bigvgan_24khz_100band.pth',
|
||||
}
|
||||
|
||||
def hash_file(path, algo="md5", buffer_size=0):
|
||||
@ -83,6 +79,16 @@ def check_for_kill_signal():
|
||||
STOP_SIGNAL = False
|
||||
raise Exception("Kill signal detected")
|
||||
|
||||
def tqdm_override(arr, verbose=False, progress=None, desc=None):
    """
    Wraps an iterable in a progress iterator, preferring an externally supplied tracker.

    :param arr: The iterable to wrap.
    :param verbose: When True, `desc` is printed (if given) and the console tqdm bar is enabled.
    :param progress: Optional object exposing `tqdm(iterable, desc=..., track_tqdm=...)`.
                     Presumably a gradio Progress instance -- confirm against the callers.
                     If it has a `msg_prefix` attribute, that prefix is prepended to `desc`.
    :param desc: Optional description shown alongside the progress indicator.
    :return: The wrapped iterable: a console `tqdm` when `progress` is None, otherwise whatever
             `progress.tqdm` returns.
    :raises Exception: Propagated from `check_for_kill_signal()`.
    """
    # Checked once, when the iterator is created; callers re-check inside their loops.
    check_for_kill_signal()

    if verbose and desc is not None:
        print(desc)

    if progress is None:
        return tqdm(arr, disable=not verbose)

    label = desc
    if hasattr(progress, 'msg_prefix'):
        # Only join prefix and description when there is a description; formatting a missing
        # one would otherwise show the literal text "None" to the user.
        label = f'{progress.msg_prefix} {desc}' if desc is not None else f'{progress.msg_prefix}'

    return progress.tqdm(arr, desc=label, track_tqdm=True)
|
||||
|
||||
def download_models(specific_models=None):
|
||||
"""
|
||||
Call to download all the models that Tortoise uses.
|
||||
@ -150,7 +156,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
|
||||
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
|
||||
conditioning_free=cond_free, conditioning_free_k=cond_free_k)
|
||||
|
||||
@torch.inference_mode()
|
||||
|
||||
def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
|
||||
"""
|
||||
Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
|
||||
@ -162,8 +168,8 @@ def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=2
|
||||
rand_start = random.randint(0, gap)
|
||||
clip = clip[:, rand_start:rand_start + cond_length]
|
||||
mel_clip = TorchMelSpectrogram(sampling_rate=sampling_rate)(clip.unsqueeze(0)).squeeze(0)
|
||||
mel_clip = mel_clip.unsqueeze(0)
|
||||
return migrate_to_device(mel_clip, device)
|
||||
return mel_clip.unsqueeze(0).to(device)
|
||||
|
||||
|
||||
def fix_autoregressive_output(codes, stop_token, complain=True):
|
||||
"""
|
||||
@ -194,8 +200,8 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
|
||||
|
||||
return codes
|
||||
|
||||
@torch.inference_mode()
|
||||
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
|
||||
|
||||
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
|
||||
"""
|
||||
Uses the specified diffusion model to convert discrete codes into a spectrogram.
|
||||
"""
|
||||
@ -208,7 +214,8 @@ def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_la
|
||||
|
||||
diffuser.sampler = sampler.lower()
|
||||
mel = diffuser.sample_loop(diffusion_model, output_shape, noise=noise,
|
||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings}, desc=desc)
|
||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||
verbose=verbose, progress=progress, desc=desc)
|
||||
|
||||
mel = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
|
||||
if get_device_name() == "dml":
|
||||
@ -230,37 +237,12 @@ def classify_audio_clip(clip):
|
||||
results = F.softmax(classifier(clip), dim=-1)
|
||||
return results[0][0]
|
||||
|
||||
def migrate_to_device(t, device):
    """
    Moves `t` to `device` via `t.to(device)`, skipping the move when it is already there.

    Objects that do not expose a `device` attribute get one assigned here (and are flagged with
    `manually_track_device`) so that later calls can detect that no move is needed.

    :param t: Object to move; must provide `.to(device)`. None is passed straight through.
    :param device: Target device, as a string or a device object.
    :return: The result of `t.to(device)`, or `t` itself when no move was necessary.
    """
    if t is None:
        return t

    if not hasattr(t, 'device'):
        # The object cannot report its own placement, so it is recorded by hand below.
        t.manually_track_device = True
    elif t.device == device or str(t.device) == str(device):
        # A device object and its string spelling do not necessarily compare equal with `==`;
        # comparing the string forms as well avoids a redundant transfer and GC pass.
        return t

    if getattr(t, 'manually_track_device', False):
        t.device = device

    t = t.to(device)

    # Runs after every actual move; presumably reclaims memory on the source device -- confirm in do_gc.
    do_gc()

    return t
|
||||
|
||||
class TextToSpeech:
|
||||
"""
|
||||
Main entry point into Tortoise.
|
||||
"""
|
||||
|
||||
def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None,
|
||||
minor_optimizations=True,
|
||||
unsqueeze_sample_batches=False,
|
||||
input_sample_rate=22050, output_sample_rate=24000,
|
||||
autoregressive_model_path=None, diffusion_model_path=None, vocoder_model=None, tokenizer_json=None,
|
||||
# ):
|
||||
use_deepspeed=False): # Add use_deepspeed parameter
|
||||
def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None, minor_optimizations=True, input_sample_rate=22050, output_sample_rate=24000, autoregressive_model_path=None, vocoder_model=None):
|
||||
"""
|
||||
Constructor
|
||||
:param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
|
||||
@ -280,9 +262,7 @@ class TextToSpeech:
|
||||
self.input_sample_rate = input_sample_rate
|
||||
self.output_sample_rate = output_sample_rate
|
||||
self.minor_optimizations = minor_optimizations
|
||||
self.unsqueeze_sample_batches = unsqueeze_sample_batches
|
||||
self.use_deepspeed = use_deepspeed # Store use_deepspeed as an instance variable
|
||||
print(f'use_deepspeed api_debug {use_deepspeed}')
|
||||
|
||||
# for clarity, it's simpler to split these up and just predicate them on requesting VRAM-consuming optimizations
|
||||
self.preloaded_tensors = minor_optimizations
|
||||
self.use_kv_cache = minor_optimizations
|
||||
@ -297,23 +277,23 @@ class TextToSpeech:
|
||||
if self.enable_redaction:
|
||||
self.aligner = Wav2VecAlignment(device='cpu' if get_device_name() == "dml" else self.device)
|
||||
|
||||
self.load_tokenizer_json(tokenizer_json)
|
||||
self.tokenizer = VoiceBpeTokenizer()
|
||||
|
||||
|
||||
if os.path.exists(f'{models_dir}/autoregressive.ptt'):
|
||||
# Assume this is a traced directory.
|
||||
self.autoregressive = torch.jit.load(f'{models_dir}/autoregressive.ptt')
|
||||
self.diffusion = torch.jit.load(f'{models_dir}/diffusion_decoder.ptt')
|
||||
else:
|
||||
if not autoregressive_model_path or not os.path.exists(autoregressive_model_path):
|
||||
autoregressive_model_path = get_model_path('autoregressive.pth', models_dir)
|
||||
|
||||
self.load_autoregressive_model(autoregressive_model_path)
|
||||
|
||||
if os.path.exists(f'{models_dir}/diffusion_decoder.ptt'):
|
||||
self.diffusion = torch.jit.load(f'{models_dir}/diffusion_decoder.ptt')
|
||||
else:
|
||||
if not diffusion_model_path or not os.path.exists(diffusion_model_path):
|
||||
diffusion_model_path = get_model_path('diffusion_decoder.pth', models_dir)
|
||||
|
||||
self.load_diffusion_model(diffusion_model_path)
|
||||
self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
|
||||
in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
|
||||
layer_drop=0, unconditioned_percentage=0).cpu().eval()
|
||||
self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
|
||||
|
||||
|
||||
self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
|
||||
@ -331,107 +311,39 @@ class TextToSpeech:
|
||||
self.rlg_diffusion = None
|
||||
|
||||
if self.preloaded_tensors:
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, self.device )
|
||||
self.diffusion = migrate_to_device( self.diffusion, self.device )
|
||||
self.clvp = migrate_to_device( self.clvp, self.device )
|
||||
self.vocoder = migrate_to_device( self.vocoder, self.device )
|
||||
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
self.diffusion = self.diffusion.to(self.device)
|
||||
self.clvp = self.clvp.to(self.device)
|
||||
self.vocoder = self.vocoder.to(self.device)
|
||||
self.loading = False
|
||||
|
||||
def load_autoregressive_model(self, autoregressive_model_path, is_xtts=False):
|
||||
if hasattr(self,"autoregressive_model_path") and os.path.samefile(self.autoregressive_model_path, autoregressive_model_path):
|
||||
def load_autoregressive_model(self, autoregressive_model_path):
|
||||
if hasattr(self,"autoregressive_model_path") and self.autoregressive_model_path == autoregressive_model_path:
|
||||
return
|
||||
|
||||
self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
|
||||
new_hash = hash_file(self.autoregressive_model_path)
|
||||
|
||||
if hasattr(self,"autoregressive_model_hash") and self.autoregressive_model_hash == new_hash:
|
||||
return
|
||||
|
||||
self.autoregressive_model_hash = new_hash
|
||||
|
||||
self.loading = True
|
||||
|
||||
self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
|
||||
self.autoregressive_model_hash = hash_file(self.autoregressive_model_path)
|
||||
print(f"Loading autoregressive model: {self.autoregressive_model_path}")
|
||||
|
||||
if hasattr(self, 'autoregressive'):
|
||||
del self.autoregressive
|
||||
|
||||
# XTTS requires a different "dimensionality" for its autoregressive model
|
||||
if new_hash == "e4ce21eae0043f7691d6a6c8540b74b8" or is_xtts:
|
||||
dimensionality = {
|
||||
"max_mel_tokens": 605,
|
||||
"max_text_tokens": 402,
|
||||
"max_prompt_tokens": 70,
|
||||
"max_conditioning_inputs": 1,
|
||||
"layers": 30,
|
||||
"model_dim": 1024,
|
||||
"heads": 16,
|
||||
"number_text_tokens": 5023, # -1
|
||||
"start_text_token": 261,
|
||||
"stop_text_token": 0,
|
||||
"number_mel_codes": 8194,
|
||||
"start_mel_token": 8192,
|
||||
"stop_mel_token": 8193,
|
||||
}
|
||||
else:
|
||||
dimensionality = {
|
||||
"max_mel_tokens": 604,
|
||||
"max_text_tokens": 402,
|
||||
"max_conditioning_inputs": 2,
|
||||
"layers": 30,
|
||||
"model_dim": 1024,
|
||||
"heads": 16,
|
||||
"number_text_tokens": 255,
|
||||
"start_text_token": 255,
|
||||
"checkpointing": False,
|
||||
"train_solo_embeddings": False
|
||||
}
|
||||
|
||||
self.autoregressive = UnifiedVoice(**dimensionality).cpu().eval()
|
||||
self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
|
||||
model_dim=1024,
|
||||
heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
|
||||
train_solo_embeddings=False).cpu().eval()
|
||||
self.autoregressive.load_state_dict(torch.load(self.autoregressive_model_path))
|
||||
self.autoregressive.post_init_gpt2_config(use_deepspeed=self.use_deepspeed, kv_cache=self.use_kv_cache)
|
||||
self.autoregressive.post_init_gpt2_config(kv_cache=self.use_kv_cache)
|
||||
if self.preloaded_tensors:
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, self.device )
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
|
||||
self.loading = False
|
||||
print(f"Loaded autoregressive model")
|
||||
|
||||
def load_diffusion_model(self, diffusion_model_path):
    """
    Loads the diffusion decoder weights and stores the model on `self.diffusion`.

    :param diffusion_model_path: Path to a diffusion checkpoint. When falsy or not present on disk,
                                 the stock 'diffusion_decoder.pth' resolved through `get_model_path`
                                 is used instead.
    :return: None. Does nothing when the resolved checkpoint is the file that is already loaded.
    """
    # Resolve the checkpoint first so the "already loaded" check and the loader agree on the
    # path, and so a None / missing path never reaches os.path.samefile (which would raise).
    if not diffusion_model_path or not os.path.exists(diffusion_model_path):
        diffusion_model_path = get_model_path('diffusion_decoder.pth', self.models_dir)

    current_path = getattr(self, "diffusion_model_path", None)
    if (current_path and os.path.exists(current_path) and os.path.exists(diffusion_model_path)
            and os.path.samefile(current_path, diffusion_model_path)):
        return

    self.loading = True
    try:
        self.diffusion_model_path = diffusion_model_path
        self.diffusion_model_hash = hash_file(self.diffusion_model_path)

        # Drop the previous model before building the new one.
        if hasattr(self, 'diffusion'):
            del self.diffusion

        # XTTS does not require a different "dimensionality" for its diffusion model
        dimensionality = {
            "model_channels": 1024,
            "num_layers": 10,
            "in_channels": 100,
            "out_channels": 200,
            "in_latent_channels": 1024,
            "in_tokens": 8193,
            "dropout": 0,
            "use_fp16": False,
            "num_heads": 16,
            "layer_drop": 0,
            "unconditioned_percentage": 0,
        }
        # Built on the CPU in eval mode, matching how the autoregressive model is constructed.
        self.diffusion = DiffusionTts(**dimensionality).cpu().eval()
        # Load the checkpoint that was actually requested, not unconditionally the stock one.
        self.diffusion.load_state_dict(torch.load(self.diffusion_model_path))
        if self.preloaded_tensors:
            self.diffusion = migrate_to_device(self.diffusion, self.device)
    finally:
        # Never leave the instance flagged as "loading" if anything above fails.
        self.loading = False

    print("Loaded diffusion model")
|
||||
|
||||
def load_vocoder_model(self, vocoder_model):
|
||||
if hasattr(self,"vocoder_model_path") and os.path.samefile(self.vocoder_model_path, vocoder_model):
|
||||
if hasattr(self,"vocoder_model_path") and self.vocoder_model_path == vocoder_model:
|
||||
return
|
||||
|
||||
self.loading = True
|
||||
@ -439,7 +351,7 @@ class TextToSpeech:
|
||||
if hasattr(self, 'vocoder'):
|
||||
del self.vocoder
|
||||
|
||||
print("Loading vocoder model:", vocoder_model)
|
||||
print(vocoder_model)
|
||||
if vocoder_model is None:
|
||||
vocoder_model = 'bigvgan_24khz_100band'
|
||||
|
||||
@ -449,12 +361,7 @@ class TextToSpeech:
|
||||
self.vocoder_model_path = 'bigvgan_24khz_100band.pth'
|
||||
if f'{vocoder_model}.pth' in MODELS:
|
||||
self.vocoder_model_path = f'{vocoder_model}.pth'
|
||||
vocoder_config = 'bigvgan_24khz_100band.json'
|
||||
if f'{vocoder_model}.json' in MODELS:
|
||||
vocoder_config = f'{vocoder_model}.json'
|
||||
vocoder_config = get_model_path(vocoder_config, self.models_dir)
|
||||
|
||||
self.vocoder = BigVGAN(config=vocoder_config).cpu()
|
||||
self.vocoder = BigVGAN().cpu()
|
||||
#elif vocoder_model == "univnet":
|
||||
else:
|
||||
vocoder_key = 'model_g'
|
||||
@ -466,26 +373,10 @@ class TextToSpeech:
|
||||
|
||||
self.vocoder.eval(inference=True)
|
||||
if self.preloaded_tensors:
|
||||
self.vocoder = migrate_to_device( self.vocoder, self.device )
|
||||
self.vocoder = self.vocoder.to(self.device)
|
||||
self.loading = False
|
||||
print(f"Loaded vocoder model")
|
||||
|
||||
def load_tokenizer_json(self, tokenizer_json):
    """
    Loads the tokenizer vocabulary and stores the tokenizer on `self.tokenizer`.

    :param tokenizer_json: Path to a tokenizer JSON file. When falsy, the bundled
                           '../tortoise/data/tokenizer.json' (relative to this file) is used.
    :return: None. Does nothing when the resolved file is the one that is already loaded.
    """
    # Resolve the default first so that passing None while a tokenizer is already loaded does
    # not hand None to os.path.samefile (which raises TypeError).
    if not tokenizer_json:
        tokenizer_json = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../tortoise/data/tokenizer.json')

    current_json = getattr(self, "tokenizer_json", None)
    if (current_json and os.path.exists(current_json) and os.path.exists(tokenizer_json)
            and os.path.samefile(current_json, tokenizer_json)):
        return

    self.loading = True
    try:
        print("Loading tokenizer JSON:", tokenizer_json)

        # Build the replacement before touching the current tokenizer, so a bad vocabulary
        # file leaves the instance with its previous, working tokenizer.
        new_tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_json)

        self.tokenizer_json = tokenizer_json
        self.tokenizer = new_tokenizer
    finally:
        # Never leave the instance flagged as "loading" if construction fails.
        self.loading = False

    print("Loaded tokenizer")
|
||||
|
||||
def load_cvvp(self):
|
||||
"""Load CVVP model."""
|
||||
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
||||
@ -493,17 +384,15 @@ class TextToSpeech:
|
||||
self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
|
||||
|
||||
if self.preloaded_tensors:
|
||||
self.cvvp = migrate_to_device( self.cvvp, self.device )
|
||||
self.cvvp = self.cvvp.to(self.device)
|
||||
|
||||
@torch.inference_mode()
|
||||
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
|
||||
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, slices=1, max_chunk_size=None, force_cpu=False):
|
||||
"""
|
||||
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
||||
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
|
||||
properties.
|
||||
:param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
|
||||
"""
|
||||
|
||||
with torch.no_grad():
|
||||
# computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions
|
||||
if get_device_name() == "dml":
|
||||
@ -513,75 +402,70 @@ class TextToSpeech:
|
||||
if not isinstance(voice_samples, list):
|
||||
voice_samples = [voice_samples]
|
||||
|
||||
resampler_22K = torchaudio.transforms.Resample(
|
||||
voice_samples = [v.to(device) for v in voice_samples]
|
||||
|
||||
resampler = torchaudio.transforms.Resample(
|
||||
self.input_sample_rate,
|
||||
22050,
|
||||
self.output_sample_rate,
|
||||
lowpass_filter_width=16,
|
||||
rolloff=0.85,
|
||||
resampling_method="kaiser_window",
|
||||
beta=8.555504641634386,
|
||||
).to(device)
|
||||
|
||||
resampler_24K = torchaudio.transforms.Resample(
|
||||
self.input_sample_rate,
|
||||
24000,
|
||||
lowpass_filter_width=16,
|
||||
rolloff=0.85,
|
||||
resampling_method="kaiser_window",
|
||||
beta=8.555504641634386,
|
||||
).to(device)
|
||||
|
||||
voice_samples = [migrate_to_device(v, device) for v in voice_samples]
|
||||
)
|
||||
|
||||
samples = []
|
||||
auto_conds = []
|
||||
diffusion_conds = []
|
||||
|
||||
if original_ar:
|
||||
samples = [resampler_22K(sample) for sample in voice_samples]
|
||||
for sample in tqdm(samples, desc="Computing AR conditioning latents..."):
|
||||
auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate, cond_length=132300))
|
||||
else:
|
||||
samples = [resampler_22K(sample) for sample in voice_samples]
|
||||
concat = torch.cat(samples, dim=-1)
|
||||
chunk_size = concat.shape[-1]
|
||||
|
||||
if slices == 0:
|
||||
slices = 1
|
||||
elif max_chunk_size is not None and chunk_size > max_chunk_size:
|
||||
slices = 1
|
||||
while int(chunk_size / slices) > max_chunk_size:
|
||||
slices = slices + 1
|
||||
|
||||
chunks = torch.chunk(concat, slices, dim=1)
|
||||
chunk_size = chunks[0].shape[-1]
|
||||
|
||||
for chunk in tqdm(chunks, desc="Computing AR conditioning latents..."):
|
||||
auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
|
||||
|
||||
|
||||
if original_diffusion:
|
||||
samples = [resampler_24K(sample) for sample in voice_samples]
|
||||
for sample in tqdm(samples, desc="Computing diffusion conditioning latents..."):
|
||||
sample = pad_or_truncate(sample, 102400)
|
||||
cond_mel = wav_to_univnet_mel(migrate_to_device(sample, device), do_normalization=False, device=self.device)
|
||||
diffusion_conds.append(cond_mel)
|
||||
else:
|
||||
samples = [resampler_24K(sample) for sample in voice_samples]
|
||||
for chunk in tqdm(chunks, desc="Computing diffusion conditioning latents..."):
|
||||
check_for_kill_signal()
|
||||
chunk = pad_or_truncate(chunk, chunk_size)
|
||||
cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
|
||||
diffusion_conds.append(cond_mel)
|
||||
for sample in voice_samples:
|
||||
auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate))
|
||||
samples.append(resampler(sample.cpu()).to(device)) # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU
|
||||
|
||||
auto_conds = torch.stack(auto_conds, dim=1)
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, device )
|
||||
|
||||
|
||||
self.autoregressive = self.autoregressive.to(device)
|
||||
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
|
||||
if self.preloaded_tensors:
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
else:
|
||||
self.autoregressive = self.autoregressive.cpu()
|
||||
|
||||
|
||||
diffusion_conds = []
|
||||
chunks = []
|
||||
|
||||
concat = torch.cat(samples, dim=-1)
|
||||
chunk_size = concat.shape[-1]
|
||||
|
||||
if slices == 0:
|
||||
slices = 1
|
||||
elif max_chunk_size is not None and chunk_size > max_chunk_size:
|
||||
slices = 1
|
||||
while int(chunk_size / slices) > max_chunk_size:
|
||||
slices = slices + 1
|
||||
|
||||
chunks = torch.chunk(concat, slices, dim=1)
|
||||
chunk_size = chunks[0].shape[-1]
|
||||
|
||||
# expand / truncate samples to match the common size
|
||||
# required, as tensors need to be of the same length
|
||||
for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
|
||||
check_for_kill_signal()
|
||||
chunk = pad_or_truncate(chunk, chunk_size)
|
||||
cond_mel = wav_to_univnet_mel(chunk.to(device), do_normalization=False, device=device)
|
||||
diffusion_conds.append(cond_mel)
|
||||
|
||||
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
||||
self.diffusion = migrate_to_device( self.diffusion, device )
|
||||
|
||||
self.diffusion = self.diffusion.to(device)
|
||||
|
||||
diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
|
||||
self.diffusion = migrate_to_device( self.diffusion, self.device if self.preloaded_tensors else 'cpu' )
|
||||
|
||||
if self.preloaded_tensors:
|
||||
self.diffusion = self.diffusion.to(self.device)
|
||||
else:
|
||||
self.diffusion = self.diffusion.cpu()
|
||||
|
||||
|
||||
|
||||
if return_mels:
|
||||
return auto_latent, diffusion_latent, auto_conds, diffusion_conds
|
||||
@ -621,15 +505,12 @@ class TextToSpeech:
|
||||
settings.update(kwargs) # allow overriding of preset settings with kwargs
|
||||
return self.tts(text, **settings)
|
||||
|
||||
@torch.inference_mode()
|
||||
def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
|
||||
return_deterministic_state=False,
|
||||
# autoregressive generation parameters follow
|
||||
num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
|
||||
sample_batch_size=None,
|
||||
autoregressive_model=None,
|
||||
diffusion_model=None,
|
||||
tokenizer_json=None,
|
||||
# CVVP parameters follow
|
||||
cvvp_amount=.0,
|
||||
# diffusion generation parameters follow
|
||||
@ -637,6 +518,7 @@ class TextToSpeech:
|
||||
diffusion_sampler="P",
|
||||
breathing_room=8,
|
||||
half_p=False,
|
||||
progress=None,
|
||||
**hf_generate_kwargs):
|
||||
"""
|
||||
Produces an audio clip of the given text being spoken with the given reference voice.
|
||||
@ -696,19 +578,7 @@ class TextToSpeech:
|
||||
elif autoregressive_model != self.autoregressive_model_path:
|
||||
self.load_autoregressive_model(autoregressive_model)
|
||||
|
||||
if diffusion_model is None:
|
||||
diffusion_model = self.diffusion_model_path
|
||||
elif diffusion_model != self.diffusion_model_path:
|
||||
self.load_diffusion_model(diffusion_model)
|
||||
|
||||
if tokenizer_json is None:
|
||||
tokenizer_json = self.tokenizer_json
|
||||
elif tokenizer_json != self.tokenizer_json:
|
||||
self.load_tokenizer_json(tokenizer_json)
|
||||
|
||||
text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0)
|
||||
text_tokens = migrate_to_device( text_tokens, self.device )
|
||||
|
||||
text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
|
||||
text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.
|
||||
assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
|
||||
|
||||
@ -736,12 +606,12 @@ class TextToSpeech:
|
||||
stop_mel_token = self.autoregressive.stop_mel_token
|
||||
calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
|
||||
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, self.device )
|
||||
auto_conditioning = migrate_to_device( auto_conditioning, self.device )
|
||||
text_tokens = migrate_to_device( text_tokens, self.device )
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
auto_conditioning = auto_conditioning.to(self.device)
|
||||
text_tokens = text_tokens.to(self.device)
|
||||
|
||||
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
|
||||
for b in tqdm(range(num_batches), desc="Generating autoregressive samples"):
|
||||
for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
|
||||
check_for_kill_signal()
|
||||
codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
|
||||
do_sample=True,
|
||||
@ -757,30 +627,24 @@ class TextToSpeech:
|
||||
samples.append(codes)
|
||||
|
||||
if not self.preloaded_tensors:
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
|
||||
|
||||
if self.unsqueeze_sample_batches:
|
||||
new_samples = []
|
||||
for batch in samples:
|
||||
for i in range(batch.shape[0]):
|
||||
new_samples.append(batch[i].unsqueeze(0))
|
||||
samples = new_samples
|
||||
self.autoregressive = self.autoregressive.cpu()
|
||||
auto_conditioning = auto_conditioning.cpu()
|
||||
|
||||
clip_results = []
|
||||
|
||||
if auto_conds is not None:
|
||||
auto_conditioning = migrate_to_device( auto_conditioning, self.device )
|
||||
auto_conds = auto_conds.to(self.device)
|
||||
|
||||
with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
|
||||
if not self.preloaded_tensors:
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
|
||||
self.clvp = migrate_to_device( self.clvp, self.device )
|
||||
if not self.minor_optimizations:
|
||||
self.autoregressive = self.autoregressive.cpu()
|
||||
self.clvp = self.clvp.to(self.device)
|
||||
|
||||
if cvvp_amount > 0:
|
||||
if self.cvvp is None:
|
||||
self.load_cvvp()
|
||||
|
||||
if not self.preloaded_tensors:
|
||||
self.cvvp = migrate_to_device( self.cvvp, self.device )
|
||||
if not self.minor_optimizations:
|
||||
self.cvvp = self.cvvp.to(self.device)
|
||||
|
||||
desc="Computing best candidates"
|
||||
if verbose:
|
||||
@ -789,8 +653,7 @@ class TextToSpeech:
|
||||
else:
|
||||
desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
|
||||
|
||||
|
||||
for batch in tqdm(samples, desc=desc):
|
||||
for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
|
||||
check_for_kill_signal()
|
||||
for i in range(batch.shape[0]):
|
||||
batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
|
||||
@ -811,31 +674,30 @@ class TextToSpeech:
|
||||
clip_results.append(clvp)
|
||||
|
||||
if not self.preloaded_tensors and auto_conds is not None:
|
||||
auto_conds = migrate_to_device( auto_conds, 'cpu' )
|
||||
auto_conds = auto_conds.cpu()
|
||||
|
||||
clip_results = torch.cat(clip_results, dim=0)
|
||||
samples = torch.cat(samples, dim=0)
|
||||
if k < num_autoregressive_samples:
|
||||
best_results = samples[torch.topk(clip_results, k=k).indices]
|
||||
else:
|
||||
best_results = samples
|
||||
best_results = samples[torch.topk(clip_results, k=k).indices]
|
||||
|
||||
if not self.preloaded_tensors:
|
||||
self.clvp = migrate_to_device( self.clvp, 'cpu' )
|
||||
self.cvvp = migrate_to_device( self.cvvp, 'cpu' )
|
||||
|
||||
|
||||
if get_device_name() == "dml":
|
||||
text_tokens = migrate_to_device( text_tokens, 'cpu' )
|
||||
best_results = migrate_to_device( best_results, 'cpu' )
|
||||
auto_conditioning = migrate_to_device( auto_conditioning, 'cpu' )
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
|
||||
else:
|
||||
auto_conditioning = auto_conditioning.to(self.device)
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
self.clvp = self.clvp.cpu()
|
||||
if self.cvvp is not None:
|
||||
self.cvvp = self.cvvp.cpu()
|
||||
|
||||
del samples
|
||||
|
||||
if get_device_name() == "dml":
|
||||
text_tokens = text_tokens.cpu()
|
||||
best_results = best_results.cpu()
|
||||
auto_conditioning = auto_conditioning.cpu()
|
||||
self.autoregressive = self.autoregressive.cpu()
|
||||
else:
|
||||
#text_tokens = text_tokens.to(self.device)
|
||||
#best_results = best_results.to(self.device)
|
||||
auto_conditioning = auto_conditioning.to(self.device)
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
|
||||
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
||||
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
||||
# results, but will increase memory usage.
|
||||
@ -844,19 +706,21 @@ class TextToSpeech:
|
||||
torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
|
||||
return_latent=True, clip_inputs=False)
|
||||
|
||||
diffusion_conditioning = migrate_to_device( diffusion_conditioning, self.device )
|
||||
diffusion_conditioning = diffusion_conditioning.to(self.device)
|
||||
|
||||
if get_device_name() == "dml":
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, self.device )
|
||||
best_results = migrate_to_device( best_results, self.device )
|
||||
best_latents = migrate_to_device( best_latents, self.device )
|
||||
self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
|
||||
self.autoregressive = self.autoregressive.to(self.device)
|
||||
best_results = best_results.to(self.device)
|
||||
best_latents = best_latents.to(self.device)
|
||||
|
||||
self.vocoder = self.vocoder.cpu()
|
||||
else:
|
||||
if not self.preloaded_tensors:
|
||||
self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
|
||||
self.autoregressive = self.autoregressive.cpu()
|
||||
|
||||
self.diffusion = self.diffusion.to(self.device)
|
||||
self.vocoder = self.vocoder.to(self.device)
|
||||
|
||||
self.diffusion = migrate_to_device( self.diffusion, self.device )
|
||||
self.vocoder = migrate_to_device( self.vocoder, self.device )
|
||||
|
||||
del text_tokens
|
||||
del auto_conditioning
|
||||
@ -878,21 +742,19 @@ class TextToSpeech:
|
||||
break
|
||||
|
||||
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
|
||||
temperature=diffusion_temperature, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
|
||||
temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
|
||||
input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
|
||||
|
||||
wav = self.vocoder.inference(mel)
|
||||
wav_candidates.append(wav)
|
||||
|
||||
if not self.preloaded_tensors:
|
||||
self.diffusion = migrate_to_device( self.diffusion, 'cpu' )
|
||||
self.vocoder = migrate_to_device( self.vocoder, 'cpu' )
|
||||
self.diffusion = self.diffusion.cpu()
|
||||
self.vocoder = self.vocoder.cpu()
|
||||
|
||||
def potentially_redact(clip, text):
|
||||
if self.enable_redaction:
|
||||
t = clip.squeeze(1)
|
||||
t = migrate_to_device( t, 'cpu' if get_device_name() == "dml" else self.device)
|
||||
return self.aligner.redact(t, text, self.output_sample_rate).unsqueeze(1)
|
||||
return self.aligner.redact(clip.squeeze(1).to('cpu' if get_device_name() == "dml" else self.device), text, self.output_sample_rate).unsqueeze(1)
|
||||
return clip
|
||||
wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
|
||||
|
||||
@ -901,7 +763,7 @@ class TextToSpeech:
|
||||
else:
|
||||
res = wav_candidates[0]
|
||||
|
||||
do_gc()
|
||||
gc.collect()
|
||||
|
||||
if return_deterministic_state:
|
||||
return res, (deterministic_seed, text, voice_samples, conditioning_latents)
|
||||
|
||||
@ -14,7 +14,6 @@ if __name__ == '__main__':
|
||||
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
|
||||
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
|
||||
parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
|
||||
parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
|
||||
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
|
||||
parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
|
||||
'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
|
||||
@ -38,8 +37,8 @@ if __name__ == '__main__':
|
||||
|
||||
|
||||
os.makedirs(args.output_path, exist_ok=True)
|
||||
#print(f'use_deepspeed do_tts_debug {use_deepspeed}')
|
||||
tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
|
||||
|
||||
tts = TextToSpeech(models_dir=args.model_dir)
|
||||
|
||||
selected_voices = args.voice.split(',')
|
||||
for k, selected_voice in enumerate(selected_voices):
|
||||
|
||||
@ -283,9 +283,9 @@ class MelEncoder(nn.Module):
|
||||
|
||||
|
||||
class UnifiedVoice(nn.Module):
|
||||
def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_prompt_tokens=2, max_mel_tokens=250, max_conditioning_inputs=1,
|
||||
def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_mel_tokens=250, max_conditioning_inputs=1,
|
||||
mel_length_compression=1024, number_text_tokens=256,
|
||||
start_text_token=None, stop_text_token=0, number_mel_codes=8194, start_mel_token=8192,
|
||||
start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
|
||||
stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
|
||||
checkpointing=True, types=1):
|
||||
"""
|
||||
@ -295,7 +295,6 @@ class UnifiedVoice(nn.Module):
|
||||
heads: Number of transformer heads. Must be divisible by model_dim. Recommend model_dim//64
|
||||
max_text_tokens: Maximum number of text tokens that will be encountered by model.
|
||||
max_mel_tokens: Maximum number of MEL tokens that will be encountered by model.
|
||||
max_prompt_tokens: compat set to 2, 70 for XTTS
|
||||
max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s).
|
||||
mel_length_compression: The factor between <number_input_samples> and <mel_tokens>. Used to compute MEL code padding given wav input length.
|
||||
number_text_tokens:
|
||||
@ -312,7 +311,7 @@ class UnifiedVoice(nn.Module):
|
||||
|
||||
self.number_text_tokens = number_text_tokens
|
||||
self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token
|
||||
self.stop_text_token = stop_text_token
|
||||
self.stop_text_token = 0
|
||||
self.number_mel_codes = number_mel_codes
|
||||
self.start_mel_token = start_mel_token
|
||||
self.stop_mel_token = stop_mel_token
|
||||
@ -320,7 +319,6 @@ class UnifiedVoice(nn.Module):
|
||||
self.heads = heads
|
||||
self.max_mel_tokens = max_mel_tokens
|
||||
self.max_text_tokens = max_text_tokens
|
||||
self.max_prompt_tokens = max_prompt_tokens
|
||||
self.model_dim = model_dim
|
||||
self.max_conditioning_inputs = max_conditioning_inputs
|
||||
self.mel_length_compression = mel_length_compression
|
||||
@ -354,8 +352,8 @@ class UnifiedVoice(nn.Module):
|
||||
for module in embeddings:
|
||||
module.weight.data.normal_(mean=0.0, std=.02)
|
||||
|
||||
def post_init_gpt2_config(self, use_deepspeed=False, kv_cache=False):
|
||||
seq_length = self.max_mel_tokens + self.max_text_tokens + self.max_prompt_tokens
|
||||
def post_init_gpt2_config(self, kv_cache=False):
|
||||
seq_length = self.max_mel_tokens + self.max_text_tokens + 2
|
||||
gpt_config = GPT2Config(vocab_size=self.max_mel_tokens,
|
||||
n_positions=seq_length,
|
||||
n_ctx=seq_length,
|
||||
@ -365,17 +363,6 @@ class UnifiedVoice(nn.Module):
|
||||
gradient_checkpointing=False,
|
||||
use_cache=True)
|
||||
self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head, kv_cache=kv_cache)
|
||||
#print(f'use_deepspeed autoregressive_debug {use_deepspeed}')
|
||||
if use_deepspeed and torch.cuda.is_available():
|
||||
import deepspeed
|
||||
self.ds_engine = deepspeed.init_inference(model=self.inference_model,
|
||||
mp_size=1,
|
||||
replace_with_kernel_inject=True,
|
||||
dtype=torch.float32)
|
||||
self.inference_model = self.ds_engine.module.eval()
|
||||
else:
|
||||
self.inference_model = self.inference_model.eval()
|
||||
|
||||
self.gpt.wte = self.mel_embedding
|
||||
|
||||
def build_aligned_inputs_and_targets(self, input, start_token, stop_token):
|
||||
@ -496,9 +483,9 @@ class UnifiedVoice(nn.Module):
|
||||
|
||||
def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
|
||||
max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
|
||||
seq_length = self.max_mel_tokens + self.max_text_tokens + self.max_prompt_tokens
|
||||
seq_length = self.max_mel_tokens + self.max_text_tokens + 2
|
||||
if not hasattr(self, 'inference_model'):
|
||||
self.post_init_gpt2_config(kv_cache=self.kv_cache)
|
||||
self.post_init_gpt2_config(kv_cache=self.kv_cachepost_init_gpt2_config)
|
||||
|
||||
|
||||
text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
|
||||
|
||||
@ -129,27 +129,14 @@ class AttrDict(dict):
|
||||
|
||||
class BigVGAN(nn.Module):
|
||||
# this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
|
||||
def __init__(self, config=None, data=None):
|
||||
def __init__(self):
|
||||
super(BigVGAN, self).__init__()
|
||||
|
||||
"""
|
||||
with open(os.path.join(os.path.dirname(__file__), 'config.json'), 'r') as f:
|
||||
data = f.read()
|
||||
"""
|
||||
if config and data is None:
|
||||
with open(config, 'r') as f:
|
||||
data = f.read()
|
||||
jsonConfig = json.loads(data)
|
||||
elif data is not None:
|
||||
if isinstance(data, str):
|
||||
jsonConfig = json.loads(data)
|
||||
else:
|
||||
jsonConfig = data
|
||||
else:
|
||||
raise Exception("no config specified")
|
||||
|
||||
|
||||
global h
|
||||
jsonConfig = json.loads(data)
|
||||
h = AttrDict(jsonConfig)
|
||||
|
||||
self.mel_channel = h.num_mels
|
||||
|
||||
@ -9,8 +9,6 @@ from tortoise.models.xtransformers import Encoder
|
||||
|
||||
import tortoise.utils.torch_intermediary as ml
|
||||
|
||||
from tortoise.utils.device import print_stats, do_gc
|
||||
|
||||
def exists(val):
|
||||
return val is not None
|
||||
|
||||
@ -126,13 +124,14 @@ class CLVP(nn.Module):
|
||||
text_emb += self.text_pos_emb(torch.arange(text.shape[1], device=device))
|
||||
speech_emb += self.speech_pos_emb(torch.arange(speech_emb.shape[1], device=device))
|
||||
|
||||
|
||||
text_latents = self.to_text_latent(masked_mean(self.text_transformer(text_emb, mask=text_mask), text_mask, dim=1))
|
||||
enc_text = self.text_transformer(text_emb, mask=text_mask)
|
||||
enc_speech = self.speech_transformer(speech_emb, mask=voice_mask)
|
||||
|
||||
# on ROCm at least, allocated VRAM spikes here
|
||||
do_gc()
|
||||
speech_latents = self.to_speech_latent(masked_mean(self.speech_transformer(speech_emb, mask=voice_mask), voice_mask, dim=1))
|
||||
do_gc()
|
||||
text_latents = masked_mean(enc_text, text_mask, dim=1)
|
||||
speech_latents = masked_mean(enc_speech, voice_mask, dim=1)
|
||||
|
||||
text_latents = self.to_text_latent(text_latents)
|
||||
speech_latents = self.to_speech_latent(speech_latents)
|
||||
|
||||
text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents))
|
||||
|
||||
|
||||
46
tortoise/models/config.json
Normal file
46
tortoise/models/config.json
Normal file
@ -0,0 +1,46 @@
|
||||
{
|
||||
"resblock": "1",
|
||||
"num_gpus": 0,
|
||||
"batch_size": 32,
|
||||
"learning_rate": 0.0001,
|
||||
"adam_b1": 0.8,
|
||||
"adam_b2": 0.99,
|
||||
"lr_decay": 0.999,
|
||||
"seed": 1234,
|
||||
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"upsample_initial_channel": 512,
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
|
||||
"activation": "snakebeta",
|
||||
"snake_logscale": true,
|
||||
|
||||
"discriminator": "mrd",
|
||||
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
|
||||
"mpd_reshapes": [2, 3, 5, 7, 11],
|
||||
"use_spectral_norm": false,
|
||||
"discriminator_channel_mult": 1,
|
||||
|
||||
"segment_size": 8192,
|
||||
"num_mels": 100,
|
||||
"num_freq": 1025,
|
||||
"n_fft": 1024,
|
||||
"hop_size": 256,
|
||||
"win_size": 1024,
|
||||
|
||||
"sampling_rate": 24000,
|
||||
|
||||
"fmin": 0,
|
||||
"fmax": 12000,
|
||||
"fmax_for_loss": null,
|
||||
|
||||
"num_workers": 4,
|
||||
|
||||
"dist_config": {
|
||||
"dist_backend": "nccl",
|
||||
"dist_url": "tcp://localhost:54321",
|
||||
"world_size": 1
|
||||
}
|
||||
}
|
||||
@ -17,7 +17,6 @@ if __name__ == '__main__':
|
||||
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='pat')
|
||||
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
|
||||
parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
|
||||
parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
|
||||
parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
|
||||
parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually.', default=1)
|
||||
parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
|
||||
@ -26,7 +25,7 @@ if __name__ == '__main__':
|
||||
parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
|
||||
|
||||
args = parser.parse_args()
|
||||
tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
|
||||
tts = TextToSpeech(models_dir=args.model_dir)
|
||||
|
||||
outpath = args.output_path
|
||||
selected_voices = args.voice.split(',')
|
||||
|
||||
@ -2,7 +2,6 @@ import os
|
||||
from glob import glob
|
||||
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
import torch
|
||||
import torchaudio
|
||||
import numpy as np
|
||||
@ -25,9 +24,6 @@ def load_audio(audiopath, sampling_rate):
|
||||
elif audiopath[-4:] == '.mp3':
|
||||
audio, lsr = librosa.load(audiopath, sr=sampling_rate)
|
||||
audio = torch.FloatTensor(audio)
|
||||
elif audiopath[-5:] == '.flac':
|
||||
audio, lsr = sf.read(audiopath)
|
||||
audio = torch.FloatTensor(audio)
|
||||
else:
|
||||
assert False, f"Unsupported audio format provided: {audiopath[-4:]}"
|
||||
|
||||
@ -89,77 +85,17 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
|
||||
for sub in subs:
|
||||
subj = os.path.join(d, sub)
|
||||
if os.path.isdir(subj):
|
||||
voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.flac'))
|
||||
voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3'))
|
||||
if load_latents:
|
||||
voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth'))
|
||||
return voices
|
||||
|
||||
def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ):
|
||||
subj = f'{dir}/{name}/'
|
||||
if not os.path.isdir(subj):
|
||||
return
|
||||
files = os.listdir(subj)
|
||||
|
||||
if load_latents:
|
||||
extensions.append("pth")
|
||||
|
||||
voice = []
|
||||
for file in files:
|
||||
ext = os.path.splitext(file)[-1][1:]
|
||||
if ext not in extensions:
|
||||
continue
|
||||
|
||||
voice.append(f'{subj}/{file}')
|
||||
|
||||
return sorted( voice )
|
||||
|
||||
def get_voice_list(dir=get_voice_dir(), append_defaults=False, load_latents=True, extensions=["wav", "mp3", "flac"]):
|
||||
defaults = [ "random", "microphone" ]
|
||||
os.makedirs(dir, exist_ok=True)
|
||||
#res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ])
|
||||
|
||||
res = []
|
||||
for name in os.listdir(dir):
|
||||
if name in defaults:
|
||||
continue
|
||||
if not os.path.isdir(f'{dir}/{name}'):
|
||||
continue
|
||||
if len(os.listdir(os.path.join(dir, name))) == 0:
|
||||
continue
|
||||
files = get_voice( name, dir=dir, extensions=extensions, load_latents=load_latents )
|
||||
|
||||
if len(files) > 0:
|
||||
res.append(name)
|
||||
else:
|
||||
for subdir in os.listdir(f'{dir}/{name}'):
|
||||
if not os.path.isdir(f'{dir}/{name}/{subdir}'):
|
||||
continue
|
||||
files = get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions, load_latents=load_latents )
|
||||
if len(files) == 0:
|
||||
continue
|
||||
res.append(f'{name}/{subdir}')
|
||||
|
||||
res = sorted(res)
|
||||
|
||||
if append_defaults:
|
||||
res = res + defaults
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def _get_voices( dirs=[get_voice_dir()], load_latents=True ):
|
||||
voices = {}
|
||||
for dir in dirs:
|
||||
voice_list = get_voice_list(dir=dir)
|
||||
voices |= { name: get_voice(name=name, dir=dir, load_latents=load_latents) for name in voice_list }
|
||||
|
||||
return voices
|
||||
|
||||
def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
|
||||
if voice == 'random':
|
||||
return None, None
|
||||
|
||||
voices = _get_voices(dirs=[get_voice_dir()] + extra_voice_dirs, load_latents=load_latents)
|
||||
voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents)
|
||||
|
||||
paths = voices[voice]
|
||||
mtime = 0
|
||||
|
||||
@ -3,30 +3,6 @@ import psutil
|
||||
import importlib
|
||||
|
||||
DEVICE_OVERRIDE = None
|
||||
DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)]
|
||||
|
||||
from inspect import currentframe, getframeinfo
|
||||
import gc
|
||||
|
||||
def do_gc():
|
||||
gc.collect()
|
||||
try:
|
||||
torch.cuda.empty_cache()
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
def print_stats(collect=False):
|
||||
cf = currentframe().f_back
|
||||
msg = f'{getframeinfo(cf).filename}:{cf.f_lineno}'
|
||||
|
||||
if collect:
|
||||
do_gc()
|
||||
|
||||
tot = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
|
||||
res = torch.cuda.memory_reserved(0) / (1024 ** 3)
|
||||
alloc = torch.cuda.memory_allocated(0) / (1024 ** 3)
|
||||
print("[{}] Total: {:.3f} | Reserved: {:.3f} | Allocated: {:.3f} | Free: {:.3f}".format( msg, tot, res, alloc, tot-res ))
|
||||
|
||||
|
||||
def has_dml():
|
||||
loader = importlib.find_loader('torch_directml')
|
||||
@ -40,7 +16,7 @@ def set_device_name(name):
|
||||
global DEVICE_OVERRIDE
|
||||
DEVICE_OVERRIDE = name
|
||||
|
||||
def get_device_name(attempt_gc=True):
|
||||
def get_device_name():
|
||||
global DEVICE_OVERRIDE
|
||||
if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
|
||||
return DEVICE_OVERRIDE
|
||||
@ -49,8 +25,6 @@ def get_device_name(attempt_gc=True):
|
||||
|
||||
if torch.cuda.is_available():
|
||||
name = 'cuda'
|
||||
if attempt_gc:
|
||||
torch.cuda.empty_cache() # may have performance implications
|
||||
elif has_dml():
|
||||
name = 'dml'
|
||||
|
||||
@ -71,30 +45,37 @@ def get_device(verbose=False):
|
||||
|
||||
return torch.device(name)
|
||||
|
||||
def get_device_vram( name=get_device_name() ):
|
||||
def get_device_batch_size():
|
||||
available = 1
|
||||
|
||||
if name == "cuda":
|
||||
_, available = torch.cuda.mem_get_info()
|
||||
name = get_device_name()
|
||||
|
||||
if name == "dml":
|
||||
# there's nothing publicly accessible in the DML API that exposes this
|
||||
# there's a method to get currently used RAM statistics... as tiles
|
||||
available = 1
|
||||
elif name == "cuda":
|
||||
_,available = torch.cuda.mem_get_info()
|
||||
elif name == "cpu":
|
||||
available = psutil.virtual_memory()[4]
|
||||
|
||||
return available / (1024 ** 3)
|
||||
|
||||
def get_device_batch_size(name=get_device_name()):
|
||||
vram = get_device_vram(name)
|
||||
|
||||
if vram > 14:
|
||||
availableGb = available / (1024 ** 3)
|
||||
|
||||
print(f"Total device memory available: {availableGb}")
|
||||
if availableGb > 18:
|
||||
print(f"Setting AutoRegressive Batch Size to: 32")
|
||||
print(f"Damn. Nice GPU Dude.")
|
||||
return 32
|
||||
elif availableGb > 14:
|
||||
print(f"Setting AutoRegressive Batch Size to: 16")
|
||||
return 16
|
||||
elif vram > 10:
|
||||
elif availableGb > 10:
|
||||
print(f"Setting AutoRegressive Batch Size to: 8")
|
||||
return 8
|
||||
elif vram > 7:
|
||||
elif availableGb > 7:
|
||||
print(f"Setting AutoRegressive Batch Size to: 4")
|
||||
return 4
|
||||
"""
|
||||
for k, v in DEVICE_BATCH_SIZE_MAP:
|
||||
if vram > k:
|
||||
return v
|
||||
"""
|
||||
print(f"Setting AutoRegressive Batch Size to: 1")
|
||||
print(f"Don't cry about it if it doesn't work.")
|
||||
return 1
|
||||
|
||||
def get_device_count(name=get_device_name()):
|
||||
@ -107,8 +88,6 @@ def get_device_count(name=get_device_name()):
|
||||
return 1
|
||||
|
||||
|
||||
# if you're getting errors make sure you've updated your torch-directml, and if you're still getting errors then you can uncomment the below block
|
||||
"""
|
||||
if has_dml():
|
||||
_cumsum = torch.cumsum
|
||||
_repeat_interleave = torch.repeat_interleave
|
||||
@ -126,5 +105,4 @@ if has_dml():
|
||||
torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||
torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||
torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||
torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||
"""
|
||||
torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||
@ -13,7 +13,15 @@ import math
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch as th
|
||||
from tqdm.auto import tqdm
|
||||
from tqdm import tqdm
|
||||
|
||||
def tqdm_override(arr, verbose=False, progress=None, desc=None):
|
||||
if verbose and desc is not None:
|
||||
print(desc)
|
||||
|
||||
if progress is None:
|
||||
return tqdm(arr, disable=not verbose)
|
||||
return progress.tqdm(arr, desc=f'{progress.msg_prefix} {desc}' if hasattr(progress, 'msg_prefix') else desc, track_tqdm=True)
|
||||
|
||||
def normal_kl(mean1, logvar1, mean2, logvar2):
|
||||
"""
|
||||
@ -548,6 +556,7 @@ class GaussianDiffusion:
|
||||
model_kwargs=None,
|
||||
device=None,
|
||||
verbose=False,
|
||||
progress=None,
|
||||
desc=None
|
||||
):
|
||||
"""
|
||||
@ -580,6 +589,7 @@ class GaussianDiffusion:
|
||||
model_kwargs=model_kwargs,
|
||||
device=device,
|
||||
verbose=verbose,
|
||||
progress=progress,
|
||||
desc=desc
|
||||
):
|
||||
final = sample
|
||||
@ -596,6 +606,7 @@ class GaussianDiffusion:
|
||||
model_kwargs=None,
|
||||
device=None,
|
||||
verbose=False,
|
||||
progress=None,
|
||||
desc=None
|
||||
):
|
||||
"""
|
||||
@ -615,7 +626,7 @@ class GaussianDiffusion:
|
||||
img = th.randn(*shape, device=device)
|
||||
indices = list(range(self.num_timesteps))[::-1]
|
||||
|
||||
for i in tqdm(indices, desc=desc):
|
||||
for i in tqdm_override(indices, verbose=verbose, desc=desc, progress=progress):
|
||||
t = th.tensor([i] * shape[0], device=device)
|
||||
with th.no_grad():
|
||||
out = self.p_sample(
|
||||
@ -730,6 +741,7 @@ class GaussianDiffusion:
|
||||
device=None,
|
||||
verbose=False,
|
||||
eta=0.0,
|
||||
progress=None,
|
||||
desc=None,
|
||||
):
|
||||
"""
|
||||
@ -749,6 +761,7 @@ class GaussianDiffusion:
|
||||
device=device,
|
||||
verbose=verbose,
|
||||
eta=eta,
|
||||
progress=progress,
|
||||
desc=desc
|
||||
):
|
||||
final = sample
|
||||
@ -766,6 +779,7 @@ class GaussianDiffusion:
|
||||
device=None,
|
||||
verbose=False,
|
||||
eta=0.0,
|
||||
progress=None,
|
||||
desc=None,
|
||||
):
|
||||
"""
|
||||
@ -784,7 +798,10 @@ class GaussianDiffusion:
|
||||
indices = list(range(self.num_timesteps))[::-1]
|
||||
|
||||
if verbose:
|
||||
indices = tqdm(indices, desc=desc)
|
||||
# Lazy import so that we don't depend on tqdm.
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
indices = tqdm_override(indices, verbose=verbose, desc=desc, progress=progress)
|
||||
|
||||
for i in indices:
|
||||
t = th.tensor([i] * shape[0], device=device)
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
import inflect
|
||||
import torch
|
||||
@ -171,39 +170,16 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
|
||||
|
||||
|
||||
class VoiceBpeTokenizer:
|
||||
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, preprocess=None):
|
||||
with open(vocab_file, 'r', encoding='utf-8') as f:
|
||||
vocab = json.load(f)
|
||||
|
||||
self.language = vocab['model']['language'] if 'language' in vocab['model'] else None
|
||||
|
||||
if preprocess is None:
|
||||
self.preprocess = 'pre_tokenizer' in vocab and vocab['pre_tokenizer']
|
||||
else:
|
||||
self.preprocess = preprocess
|
||||
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
|
||||
if vocab_file is not None:
|
||||
self.tokenizer = Tokenizer.from_file(vocab_file)
|
||||
|
||||
def preprocess_text(self, txt):
|
||||
if self.language == 'ja':
|
||||
import pykakasi
|
||||
|
||||
kks = pykakasi.kakasi()
|
||||
results = kks.convert(txt)
|
||||
words = []
|
||||
|
||||
for result in results:
|
||||
words.append(result['kana'])
|
||||
|
||||
txt = " ".join(words)
|
||||
txt = basic_cleaners(txt)
|
||||
else:
|
||||
txt = english_cleaners(txt)
|
||||
txt = english_cleaners(txt)
|
||||
return txt
|
||||
|
||||
def encode(self, txt):
|
||||
if self.preprocess:
|
||||
txt = self.preprocess_text(txt)
|
||||
txt = self.preprocess_text(txt)
|
||||
txt = txt.replace(' ', '[SPACE]')
|
||||
return self.tokenizer.encode(txt).ids
|
||||
|
||||
|
||||
@ -22,19 +22,17 @@ import os
|
||||
|
||||
USE_STABLE_EMBEDDING = False
|
||||
try:
|
||||
import bitsandbytes as bnb
|
||||
OVERRIDE_LINEAR = False
|
||||
OVERRIDE_EMBEDDING = False
|
||||
OVERRIDE_ADAM = False
|
||||
OVERRIDE_ADAMW = False
|
||||
OVERRIDE_EMBEDDING = True
|
||||
OVERRIDE_ADAM = True
|
||||
OVERRIDE_ADAMW = True
|
||||
|
||||
USE_STABLE_EMBEDDING = os.environ.get('BITSANDBYTES_USE_STABLE_EMBEDDING', '1' if USE_STABLE_EMBEDDING else '0') == '1'
|
||||
OVERRIDE_LINEAR = os.environ.get('BITSANDBYTES_OVERRIDE_LINEAR', '1' if OVERRIDE_LINEAR else '0') == '1'
|
||||
OVERRIDE_EMBEDDING = os.environ.get('BITSANDBYTES_OVERRIDE_EMBEDDING', '1' if OVERRIDE_EMBEDDING else '0') == '1'
|
||||
OVERRIDE_ADAM = os.environ.get('BITSANDBYTES_OVERRIDE_ADAM', '1' if OVERRIDE_ADAM else '0') == '1'
|
||||
OVERRIDE_ADAMW = os.environ.get('BITSANDBYTES_OVERRIDE_ADAMW', '1' if OVERRIDE_ADAMW else '0') == '1'
|
||||
|
||||
if OVERRIDE_LINEAR or OVERRIDE_EMBEDDING or OVERRIDE_ADAM or OVERRIDE_ADAMW:
|
||||
import bitsandbytes as bnb
|
||||
except Exception as e:
|
||||
OVERRIDE_LINEAR = False
|
||||
OVERRIDE_EMBEDDING = False
|
||||
|
||||
@ -144,7 +144,7 @@ class Wav2VecAlignment:
|
||||
non_redacted_intervals = []
|
||||
last_point = 0
|
||||
for i in range(len(fully_split)):
|
||||
if i % 2 == 0 and fully_split[i] != "": # Check for empty string fixes index error
|
||||
if i % 2 == 0:
|
||||
end_interval = max(0, last_point + len(fully_split[i]) - 1)
|
||||
non_redacted_intervals.append((last_point, end_interval))
|
||||
last_point += len(fully_split[i])
|
||||
|
||||
Loading…
Reference in New Issue
Block a user