forked from mrq/tortoise-tts
248 lines
9.8 KiB
Plaintext
248 lines
9.8 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "tortoise-tts.ipynb",
|
|
"provenance": [],
|
|
"collapsed_sections": []
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
},
|
|
"accelerator": "GPU"
|
|
},
|
|
"cells": [
|
|
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
    "id": "JrK20I32grP6"
  },
  "outputs": [],
  "source": [
    "# Fetch the Tortoise TTS sources and install their Python dependencies.\n",
    "# NOTE(review): the clone follows the repository's default branch with no pinned revision, so the\n",
    "# imports in the following cells depend on whatever layout upstream has today -- consider pinning.\n",
    "!git clone https://github.com/neonbjb/tortoise-tts.git\n",
    "%cd tortoise-tts\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -r requirements.txt"
  ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Imports used through the rest of the notebook.\n",
|
|
"import torch\n",
|
|
"import torchaudio\n",
|
|
"import torch.nn as nn\n",
|
|
"import torch.nn.functional as F\n",
|
|
"from tqdm import tqdm\n",
|
|
"\n",
|
|
"from utils.tokenizer import VoiceBpeTokenizer\n",
|
|
"from models.discrete_diffusion_vocoder import DiscreteDiffusionVocoder\n",
|
|
"from models.text_voice_clip import VoiceCLIP\n",
|
|
"from models.dvae import DiscreteVAE\n",
|
|
"from models.autoregressive import UnifiedVoice\n",
|
|
"\n",
|
|
"# These have some fairly interesting code that is hidden in the colab. Consider checking it out.\n",
|
|
"from do_tts import download_models, load_discrete_vocoder_diffuser, load_conditioning, fix_autoregressive_output, do_spectrogram_diffusion"
|
|
],
|
|
"metadata": {
|
|
"id": "Gen09NM4hONQ"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Fetch the pretrained model weights, then register the bundled reference voices.\n",
    "# Every voice maps to two reference clips stored as voices/<name>/1.wav and voices/<name>/2.wav.\n",
    "# To add your own voice, upload two WAV files cropped to 5-10 seconds of someone speaking and\n",
    "# register them, e.g. preselected_cond_voices['myvoice'] = ['path/to/a.wav', 'path/to/b.wav'].\n",
    "download_models()\n",
    "preselected_cond_voices = {\n",
    "    name: [f'voices/{name}/{clip}.wav' for clip in (1, 2)]\n",
    "    for name in (\n",
    "        # Male voices\n",
    "        'dotrice', 'harris', 'lescault', 'otto',\n",
    "        # Female voices\n",
    "        'atkins', 'grace', 'kennard', 'mol',\n",
    "    )\n",
    "}"
  ],
  "metadata": {
    "id": "SSleVnRAiEE2"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Sentence to synthesize.\n",
    "text = \"And took the other as just as fair, and having perhaps the better claim, because it was grassy and wanted wear.\"\n",
    "# Key of the reference voice to imitate; it is looked up in preselected_cond_voices.\n",
    "voice = 'atkins'\n",
    "# How many candidates to draw from the DALLE-style autoregressive model. A larger pool produces\n",
    "# better results but takes longer; going below 128 is not recommended. The sampling cell draws\n",
    "# candidates in batches of 16, so pick a multiple of 16.\n",
    "num_autoregressive_samples = 128"
  ],
  "metadata": {
    "id": "bt_aoxONjfL2"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Prepare data.\n",
    "tokenizer = VoiceBpeTokenizer()\n",
    "# `text` is rebound from the raw string to its token tensor. Tokenize only while it is still a\n",
    "# string, so re-running this cell does not feed the already-encoded tensor back into the tokenizer.\n",
    "if isinstance(text, str):\n",
    "    text = torch.IntTensor(tokenizer.encode(text)).unsqueeze(0).cuda()\n",
    "    text = F.pad(text, (0,1))  # This may not be necessary.\n",
    "\n",
    "# Load every reference clip of the chosen voice and stack the conditioning inputs along dim 1.\n",
    "cond_paths = preselected_cond_voices[voice]\n",
    "conds = []\n",
    "for cond_path in cond_paths:\n",
    "    c, cond_wav = load_conditioning(cond_path)\n",
    "    conds.append(c)\n",
    "# `cond_wav` keeps the value from the last clip; that one is used for the diffusion model.\n",
    "conds = torch.stack(conds, dim=1)"
  ],
  "metadata": {
    "id": "KEXOKjIvn6NW"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Build the autoregressive model on the GPU in eval mode, then restore its pretrained weights.\n",
    "autoregressive = UnifiedVoice(\n",
    "    max_mel_tokens=300,\n",
    "    max_text_tokens=200,\n",
    "    max_conditioning_inputs=2,\n",
    "    layers=30,\n",
    "    model_dim=1024,\n",
    "    heads=16,\n",
    "    number_text_tokens=256,\n",
    "    start_text_token=255,\n",
    "    checkpointing=False,\n",
    "    train_solo_embeddings=False,\n",
    ").cuda().eval()\n",
    "autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))\n",
    "# Keep the stop token in its own variable: later cells still use it after the model is deleted.\n",
    "stop_mel_token = autoregressive.stop_mel_token"
  ],
  "metadata": {
    "id": "Z15xFT_uhP8v"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Perform inference with the autoregressive model, generating num_autoregressive_samples candidates\n",
    "# in batches of 16. A remainder smaller than one full batch is dropped by the floor division.\n",
    "num_batches = num_autoregressive_samples // 16\n",
    "if num_batches < 1:\n",
    "    # With zero batches `samples` would stay empty and the CLIP re-ranking cell would only fail\n",
    "    # later, inside torch.cat, with a far less helpful message.\n",
    "    raise ValueError(f'num_autoregressive_samples must be at least 16, got {num_autoregressive_samples}')\n",
    "\n",
    "with torch.no_grad():\n",
    "    samples = []\n",
    "    for b in tqdm(range(num_batches)):\n",
    "        codes = autoregressive.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,\n",
    "                                                 temperature=.9, num_return_sequences=16, length_penalty=1)\n",
    "        # Right-pad every batch with the stop token to a fixed width of 250, the speech_seq_len the\n",
    "        # CLIP model in this notebook is constructed with.\n",
    "        # NOTE(review): for outputs longer than 250 the pad amount goes negative, which F.pad treats\n",
    "        # as trimming -- confirm that truncation is the intended behaviour.\n",
    "        padding_needed = 250 - codes.shape[1]\n",
    "        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)\n",
    "        samples.append(codes)\n",
    "\n",
    "# Delete model weights to conserve memory.\n",
    "del autoregressive"
  ],
  "metadata": {
    "id": "xajqWiEik-j0"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Build the CLIP model on the GPU in eval mode, then restore its pretrained weights.\n",
    "clip = VoiceCLIP(\n",
    "    dim_text=512,\n",
    "    dim_speech=512,\n",
    "    dim_latent=512,\n",
    "    num_text_tokens=256,\n",
    "    text_enc_depth=8,\n",
    "    text_seq_len=120,\n",
    "    text_heads=8,\n",
    "    num_speech_tokens=8192,\n",
    "    speech_enc_depth=10,\n",
    "    speech_heads=8,\n",
    "    speech_seq_len=250,\n",
    ").cuda().eval()\n",
    "clip.load_state_dict(torch.load('.models/clip.pth'))"
  ],
  "metadata": {
    "id": "KNgYSyuyliMs"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Use the CLIP model to select the best autoregressive output to match the given text.\n",
    "with torch.no_grad():\n",
    "    # CLIP was not trained to handle long text, so it only sees the first 120 tokens. The slice does\n",
    "    # not change between batches, so it is taken once, under its own name, leaving `text` untouched.\n",
    "    clip_text = text[:, :120]\n",
    "    clip_results = []\n",
    "    for batch in samples:\n",
    "        # Pass every candidate through fix_autoregressive_output, writing the result back in place.\n",
    "        for i in range(batch.shape[0]):\n",
    "            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)\n",
    "        clip_results.append(clip(clip_text.repeat(batch.shape[0], 1),\n",
    "                                 torch.full((batch.shape[0],), fill_value=clip_text.shape[1]-1, dtype=torch.long, device='cuda'),\n",
    "                                 batch, torch.full((batch.shape[0],), fill_value=batch.shape[1]*1024, dtype=torch.long, device='cuda'),\n",
    "                                 return_loss=False))\n",
    "    # Flatten the per-batch lists and keep the single candidate with the highest CLIP result.\n",
    "    clip_results = torch.cat(clip_results, dim=0)\n",
    "    samples = torch.cat(samples, dim=0)\n",
    "    best_results = samples[torch.topk(clip_results, k=1).indices]\n",
    "\n",
    "# Save samples to CPU memory, delete clip to conserve memory.\n",
    "samples = samples.cpu()\n",
    "del clip"
  ],
  "metadata": {
    "id": "DDXkM0lclp4U"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Build the DVAE on the GPU in eval mode and restore its weights.\n",
    "dvae = DiscreteVAE(\n",
    "    positional_dims=1,\n",
    "    channels=80,\n",
    "    hidden_dim=512,\n",
    "    num_resnet_blocks=3,\n",
    "    codebook_dim=512,\n",
    "    num_tokens=8192,\n",
    "    num_layers=2,\n",
    "    record_codes=True,\n",
    "    kernel_size=3,\n",
    "    use_transposed_convs=False,\n",
    ").cuda().eval()\n",
    "# strict=False: load without requiring the checkpoint keys to match the model exactly.\n",
    "dvae.load_state_dict(torch.load('.models/dvae.pth'), strict=False)\n",
    "\n",
    "# Build the diffusion vocoder the same way, then create the diffuser configured for 100 steps.\n",
    "diffusion = DiscreteDiffusionVocoder(\n",
    "    model_channels=128,\n",
    "    dvae_dim=80,\n",
    "    channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8],\n",
    "    num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],\n",
    "    spectrogram_conditioning_resolutions=[2,512],\n",
    "    attention_resolutions=[512,1024],\n",
    "    num_heads=4,\n",
    "    kernel_size=3,\n",
    "    scale_factor=2,\n",
    "    conditioning_inputs_provided=True,\n",
    "    time_embed_dim_multiplier=4,\n",
    ").cuda().eval()\n",
    "diffusion.load_state_dict(torch.load('.models/diffusion.pth'))\n",
    "diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)"
  ],
  "metadata": {
    "id": "97acSnBal8Q2"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Turn each selected code sequence into a waveform and write it to '<voice>_<index>.wav'.\n",
    "with torch.no_grad():\n",
    "    for b, best_codes in enumerate(best_results):\n",
    "        code = best_codes.unsqueeze(0)\n",
    "        wav = do_spectrogram_diffusion(diffusion, dvae, diffuser, code, cond_wav,\n",
    "                                       spectrogram_compression_factor=256, mean=True)\n",
    "        output_path = f'{voice}_{b}.wav'\n",
    "        torchaudio.save(output_path, wav.squeeze(0).cpu(), 22050)"
  ],
  "metadata": {
    "id": "HEDABTrdl_kM"
  },
  "execution_count": null,
  "outputs": []
},
|
|
{
  "cell_type": "code",
  "source": [
    "# Play the most recently generated waveform inline (generation above is the slow part).\n",
    "from IPython.display import Audio\n",
    "\n",
    "waveform = wav.squeeze(0).cpu().numpy()\n",
    "Audio(data=waveform, rate=22050)"
  ],
  "metadata": {
    "id": "EyHmcdqBmSvf"
  },
  "execution_count": null,
  "outputs": []
}
|
|
]
|
|
} |