forked from mrq/tortoise-tts
update to v2 models (clvp pending)
This commit is contained in:
parent
a578697287
commit
ad0f3fdd58
31
api.py
31
api.py
|
@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
|
||||||
pbar = None
|
pbar = None
|
||||||
def download_models():
|
def download_models():
|
||||||
MODELS = {
|
MODELS = {
|
||||||
'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-clip/resolve/main/pytorch-model.bin',
|
'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
|
||||||
'diffusion.pth': 'https://huggingface.co/jbetker/tortoise-tts-diffusion-v1/resolve/main/pytorch-model.bin',
|
'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
|
||||||
'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-autoregressive/resolve/main/pytorch-model.bin'
|
'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
|
||||||
|
'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
|
||||||
|
'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
|
||||||
}
|
}
|
||||||
os.makedirs('.models', exist_ok=True)
|
os.makedirs('.models', exist_ok=True)
|
||||||
def show_progress(block_num, block_size, total_size):
|
def show_progress(block_num, block_size, total_size):
|
||||||
|
@ -162,25 +164,12 @@ class TextToSpeech:
|
||||||
train_solo_embeddings=False,
|
train_solo_embeddings=False,
|
||||||
average_conditioning_embeddings=True).cpu().eval()
|
average_conditioning_embeddings=True).cpu().eval()
|
||||||
self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
|
self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
|
||||||
'''
|
|
||||||
self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
|
|
||||||
model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
|
|
||||||
average_conditioning_embeddings=True, types=2).cpu().eval()
|
|
||||||
self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
|
|
||||||
'''
|
|
||||||
|
|
||||||
self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
|
|
||||||
model_dim=1024,
|
|
||||||
heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
|
|
||||||
train_solo_embeddings=False,
|
|
||||||
average_conditioning_embeddings=True).cpu().eval()
|
|
||||||
self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
|
|
||||||
|
|
||||||
self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
|
self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
|
||||||
text_seq_len=350, text_heads=8,
|
text_seq_len=350, text_heads=8,
|
||||||
num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
|
num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
|
||||||
use_xformers=True).cpu().eval()
|
use_xformers=True).cpu().eval()
|
||||||
self.clvp.load_state_dict(torch.load('.models/clip.pth'))
|
self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
|
||||||
|
|
||||||
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
||||||
speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
|
speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
|
||||||
|
@ -281,11 +270,11 @@ class TextToSpeech:
|
||||||
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
||||||
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
||||||
# results, but will increase memory usage.
|
# results, but will increase memory usage.
|
||||||
self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cuda()
|
self.autoregressive = self.autoregressive.cuda()
|
||||||
best_latents = self.autoregressive_for_diffusion(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
|
best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
|
||||||
torch.tensor([best_results.shape[-1]*self.autoregressive_for_diffusion.mel_length_compression], device=conds.device),
|
torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
|
||||||
return_latent=True, clip_inputs=False)
|
return_latent=True, clip_inputs=False)
|
||||||
self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cpu()
|
self.autoregressive = self.autoregressive.cpu()
|
||||||
|
|
||||||
print("Performing vocoding..")
|
print("Performing vocoding..")
|
||||||
wav_candidates = []
|
wav_candidates = []
|
||||||
|
|
Loading…
Reference in New Issue
Block a user