diff --git a/tortoise_tts.ipynb b/tortoise_tts.ipynb new file mode 100644 index 0000000..f7513b6 --- /dev/null +++ b/tortoise_tts.ipynb @@ -0,0 +1,248 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "tortoise-tts.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JrK20I32grP6" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/neonbjb/tortoise-tts.git\n", + "%cd tortoise-tts\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "source": [ + "# Imports used through the rest of the notebook.\n", + "import torch\n", + "import torchaudio\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from tqdm import tqdm\n", + "\n", + "from utils.tokenizer import VoiceBpeTokenizer\n", + "from models.discrete_diffusion_vocoder import DiscreteDiffusionVocoder\n", + "from models.text_voice_clip import VoiceCLIP\n", + "from models.dvae import DiscreteVAE\n", + "from models.autoregressive import UnifiedVoice\n", + "\n", + "# These have some fairly interesting code that is hidden in the colab. Consider checking it out.\n", + "from do_tts import download_models, load_discrete_vocoder_diffuser, load_conditioning, fix_autoregressive_output, do_spectrogram_diffusion" + ], + "metadata": { + "id": "Gen09NM4hONQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Download pretrained models and set up pretrained voice bank. 
Feel free to upload and add your own voices here.\n", + "# To do so, upload two WAV files cropped to 5-10 seconds of someone speaking.\n", + "download_models()\n", + "preselected_cond_voices = {\n", + " # Male voices\n", + " 'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],\n", + " 'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],\n", + " 'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],\n", + " 'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],\n", + " # Female voices\n", + " 'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],\n", + " 'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],\n", + " 'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],\n", + " 'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],\n", + " }" + ], + "metadata": { + "id": "SSleVnRAiEE2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# This is the text that will be spoken.\n", + "text = \"And took the other as just as fair, and having perhaps the better claim, because it was grassy and wanted wear.\"\n", + "# This is the voice that will speak it.\n", + "voice = 'atkins'\n", + "# This is the number of samples we will generate from the DALLE-style model. 
More will produce better results, but will take longer to produce.\n", + "# I don't recommend going below 128.\n", + "num_autoregressive_samples = 128" + ], + "metadata": { + "id": "bt_aoxONjfL2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Prepare data.\n", + "tokenizer = VoiceBpeTokenizer()\n", + "text = torch.IntTensor(tokenizer.encode(text)).unsqueeze(0).cuda()\n", + "text = F.pad(text, (0,1)) # This may not be necessary.\n", + "cond_paths = preselected_cond_voices[voice]\n", + "conds = []\n", + "for cond_path in cond_paths:\n", + " c, cond_wav = load_conditioning(cond_path)\n", + " conds.append(c)\n", + "conds = torch.stack(conds, dim=1) # And just use the last cond_wav for the diffusion model." + ], + "metadata": { + "id": "KEXOKjIvn6NW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load the autoregressive model.\n", + "autoregressive = UnifiedVoice(max_mel_tokens=300, max_text_tokens=200, max_conditioning_inputs=2, layers=30, model_dim=1024,\n", + " heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False, train_solo_embeddings=False).cuda().eval()\n", + "autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))\n", + "stop_mel_token = autoregressive.stop_mel_token" + ], + "metadata": { + "id": "Z15xFT_uhP8v" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Perform inference with the autoregressive model, generating num_autoregressive_samples\n", + "with torch.no_grad():\n", + " samples = []\n", + " for b in tqdm(range(num_autoregressive_samples // 16)):\n", + " codes = autoregressive.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,\n", + " temperature=.9, num_return_sequences=16, length_penalty=1)\n", + " padding_needed = 250 - codes.shape[1]\n", + " codes = F.pad(codes, (0, padding_needed), 
value=stop_mel_token)\n", + " samples.append(codes)\n", + "\n", + "# Delete model weights to conserve memory.\n", + "del autoregressive" + ], + "metadata": { + "id": "xajqWiEik-j0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load the CLIP model.\n", + "clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=8, text_seq_len=120, text_heads=8,\n", + " num_speech_tokens=8192, speech_enc_depth=10, speech_heads=8, speech_seq_len=250).cuda().eval()\n", + "clip.load_state_dict(torch.load('.models/clip.pth'))" + ], + "metadata": { + "id": "KNgYSyuyliMs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Use the CLIP model to select the best autoregressive output to match the given text.\n", + "clip_results = []\n", + "with torch.no_grad():\n", + " for batch in samples:\n", + " for i in range(batch.shape[0]):\n", + " batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)\n", + " text = text[:, :120] # Ugly hack to fix the fact that I didn't train CLIP to handle long enough text.\n", + " clip_results.append(clip(text.repeat(batch.shape[0], 1),\n", + " torch.full((batch.shape[0],), fill_value=text.shape[1]-1, dtype=torch.long, device='cuda'),\n", + " batch, torch.full((batch.shape[0],), fill_value=batch.shape[1]*1024, dtype=torch.long, device='cuda'),\n", + " return_loss=False))\n", + " clip_results = torch.cat(clip_results, dim=0)\n", + " samples = torch.cat(samples, dim=0)\n", + " best_results = samples[torch.topk(clip_results, k=1).indices]\n", + "\n", + "# Save samples to CPU memory, delete clip to conserve memory.\n", + "samples = samples.cpu()\n", + "del clip" + ], + "metadata": { + "id": "DDXkM0lclp4U" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load the DVAE and diffusion model.\n", + "dvae = DiscreteVAE(positional_dims=1, channels=80, hidden_dim=512, 
num_resnet_blocks=3, codebook_dim=512, num_tokens=8192, num_layers=2,\n", + " record_codes=True, kernel_size=3, use_transposed_convs=False).cuda().eval()\n", + "dvae.load_state_dict(torch.load('.models/dvae.pth'), strict=False)\n", + "diffusion = DiscreteDiffusionVocoder(model_channels=128, dvae_dim=80, channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8], num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],\n", + " spectrogram_conditioning_resolutions=[2,512], attention_resolutions=[512,1024], num_heads=4, kernel_size=3, scale_factor=2,\n", + " conditioning_inputs_provided=True, time_embed_dim_multiplier=4).cuda().eval()\n", + "diffusion.load_state_dict(torch.load('.models/diffusion.pth'))\n", + "diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)" + ], + "metadata": { + "id": "97acSnBal8Q2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Decode the (best) discrete sequence created by the autoregressive model.\n", + "with torch.no_grad():\n", + " for b in range(best_results.shape[0]):\n", + " code = best_results[b].unsqueeze(0)\n", + " wav = do_spectrogram_diffusion(diffusion, dvae, diffuser, code, cond_wav, spectrogram_compression_factor=256, mean=True)\n", + " torchaudio.save(f'{voice}_{b}.wav', wav.squeeze(0).cpu(), 22050)" + ], + "metadata": { + "id": "HEDABTrdl_kM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Listen to your text! (Told you that'd take a long time...)\n", + "from IPython.display import Audio\n", + "Audio(data=wav.squeeze(0).cpu().numpy(), rate=22050)" + ], + "metadata": { + "id": "EyHmcdqBmSvf" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file