From b462b4cfb6bb9753e30efea6eea2ae9d126f33b0 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Thu, 10 Mar 2022 23:21:01 -0700
Subject: [PATCH] Add colab notebook

---
 tortoise_tts.ipynb | 248 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 tortoise_tts.ipynb

diff --git a/tortoise_tts.ipynb b/tortoise_tts.ipynb
new file mode 100644
index 0000000..f7513b6
--- /dev/null
+++ b/tortoise_tts.ipynb
@@ -0,0 +1,248 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "tortoise-tts.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "JrK20I32grP6"
+      },
+      "outputs": [],
+      "source": [
+        "!git clone https://github.com/neonbjb/tortoise-tts.git\n",
+        "%cd tortoise-tts\n",
+        "!pip install -r requirements.txt"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Imports used through the rest of the notebook.\n",
+        "import torch\n",
+        "import torchaudio\n",
+        "import torch.nn as nn\n",
+        "import torch.nn.functional as F\n",
+        "from tqdm import tqdm\n",
+        "\n",
+        "from utils.tokenizer import VoiceBpeTokenizer\n",
+        "from models.discrete_diffusion_vocoder import DiscreteDiffusionVocoder\n",
+        "from models.text_voice_clip import VoiceCLIP\n",
+        "from models.dvae import DiscreteVAE\n",
+        "from models.autoregressive import UnifiedVoice\n",
+        "\n",
+        "# These have some fairly interesting code that is hidden in the colab. Consider checking it out.\n",
+        "from do_tts import download_models, load_discrete_vocoder_diffuser, load_conditioning, fix_autoregressive_output, do_spectrogram_diffusion"
+      ],
+      "metadata": {
+        "id": "Gen09NM4hONQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Download pretrained models and set up pretrained voice bank. Feel free to upload and add your own voices here.\n",
+        "# To do so, upload two WAV files cropped to 5-10 seconds of someone speaking.\n",
+        "download_models()\n",
+        "preselected_cond_voices = {\n",
+        "        # Male voices\n",
+        "        'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],\n",
+        "        'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],\n",
+        "        'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],\n",
+        "        'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],\n",
+        "        # Female voices\n",
+        "        'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],\n",
+        "        'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],\n",
+        "        'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],\n",
+        "        'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],\n",
+        "    }"
+      ],
+      "metadata": {
+        "id": "SSleVnRAiEE2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# This is the text that will be spoken.\n",
+        "text = \"And took the other as just as fair, and having perhaps the better claim, because it was grassy and wanted wear.\"\n",
+        "# This is the voice that will speak it.\n",
+        "voice = 'atkins'\n",
+        "# This is the number of samples we will generate from the DALLE-style model. More will produce better results, but will take longer to produce.\n",
+        "# I don't recommend going less than 128.\n",
+        "num_autoregressive_samples = 128"
+      ],
+      "metadata": {
+        "id": "bt_aoxONjfL2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Prepare data.\n",
+        "tokenizer = VoiceBpeTokenizer()\n",
+        "text = torch.IntTensor(tokenizer.encode(text)).unsqueeze(0).cuda()\n",
+        "text = F.pad(text, (0,1))  # This may not be necessary.\n",
+        "cond_paths = preselected_cond_voices[voice]\n",
+        "conds = []\n",
+        "for cond_path in cond_paths:\n",
+        "    c, cond_wav = load_conditioning(cond_path)\n",
+        "    conds.append(c)\n",
+        "conds = torch.stack(conds, dim=1)  # And just use the last cond_wav for the diffusion model."
+      ],
+      "metadata": {
+        "id": "KEXOKjIvn6NW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Load the autoregressive model.\n",
+        "autoregressive = UnifiedVoice(max_mel_tokens=300, max_text_tokens=200, max_conditioning_inputs=2, layers=30, model_dim=1024,\n",
+        "                                      heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False, train_solo_embeddings=False).cuda().eval()\n",
+        "autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))\n",
+        "stop_mel_token = autoregressive.stop_mel_token"
+      ],
+      "metadata": {
+        "id": "Z15xFT_uhP8v"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Perform inference with the autoregressive model, generating num_autoregressive_samples\n",
+        "with torch.no_grad():\n",
+        "    samples = []\n",
+        "    for b in tqdm(range(num_autoregressive_samples // 16)):\n",
+        "        codes = autoregressive.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,\n",
+        "                                                temperature=.9, num_return_sequences=16, length_penalty=1)\n",
+        "        padding_needed = 250 - codes.shape[1]\n",
+        "        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)\n",
+        "        samples.append(codes)\n",
+        "\n",
+        "# Delete model weights to conserve memory.\n",
+        "del autoregressive"
+      ],
+      "metadata": {
+        "id": "xajqWiEik-j0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Load the CLIP model.\n",
+        "clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=8, text_seq_len=120, text_heads=8,\n",
+        "                  num_speech_tokens=8192, speech_enc_depth=10, speech_heads=8, speech_seq_len=250).cuda().eval()\n",
+        "clip.load_state_dict(torch.load('.models/clip.pth'))"
+      ],
+      "metadata": {
+        "id": "KNgYSyuyliMs"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Use the CLIP model to select the best autoregressive output to match the given text.\n",
+        "clip_results = []\n",
+        "with torch.no_grad():\n",
+        "    for batch in samples:\n",
+        "        for i in range(batch.shape[0]):\n",
+        "            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)\n",
+        "        text = text[:, :120]  # Ugly hack to fix the fact that I didn't train CLIP to handle long enough text.\n",
+        "        clip_results.append(clip(text.repeat(batch.shape[0], 1),\n",
+        "                            torch.full((batch.shape[0],), fill_value=text.shape[1]-1, dtype=torch.long, device='cuda'),\n",
+        "                            batch, torch.full((batch.shape[0],), fill_value=batch.shape[1]*1024, dtype=torch.long, device='cuda'),\n",
+        "                            return_loss=False))\n",
+        "    clip_results = torch.cat(clip_results, dim=0)\n",
+        "    samples = torch.cat(samples, dim=0)\n",
+        "    best_results = samples[torch.topk(clip_results, k=1).indices]\n",
+        "\n",
+        "# Save samples to CPU memory, delete clip to conserve memory.\n",
+        "samples = samples.cpu()\n",
+        "del clip"
+      ],
+      "metadata": {
+        "id": "DDXkM0lclp4U"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Load the DVAE and diffusion model.\n",
+        "dvae = DiscreteVAE(positional_dims=1, channels=80, hidden_dim=512, num_resnet_blocks=3, codebook_dim=512, num_tokens=8192, num_layers=2,\n",
+        "                    record_codes=True, kernel_size=3, use_transposed_convs=False).cuda().eval()\n",
+        "dvae.load_state_dict(torch.load('.models/dvae.pth'), strict=False)\n",
+        "diffusion = DiscreteDiffusionVocoder(model_channels=128, dvae_dim=80, channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8], num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],\n",
+        "                                      spectrogram_conditioning_resolutions=[2,512], attention_resolutions=[512,1024], num_heads=4, kernel_size=3, scale_factor=2,\n",
+        "                                      conditioning_inputs_provided=True, time_embed_dim_multiplier=4).cuda().eval()\n",
+        "diffusion.load_state_dict(torch.load('.models/diffusion.pth'))\n",
+        "diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)"
+      ],
+      "metadata": {
+        "id": "97acSnBal8Q2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Decode the (best) discrete sequence created by the autoregressive model.\n",
+        "with torch.no_grad():\n",
+        "    for b in range(best_results.shape[0]):\n",
+        "        code = best_results[b].unsqueeze(0)\n",
+        "        wav = do_spectrogram_diffusion(diffusion, dvae, diffuser, code, cond_wav, spectrogram_compression_factor=256, mean=True)\n",
+        "        torchaudio.save(f'{voice}_{b}.wav', wav.squeeze(0).cpu(), 22050)"
+      ],
+      "metadata": {
+        "id": "HEDABTrdl_kM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Listen to your text! (told you that'd take a long time..)\n",
+        "from IPython.display import Audio\n",
+        "Audio(data=wav.squeeze(0).cpu().numpy(), rate=22050)"
+      ],
+      "metadata": {
+        "id": "EyHmcdqBmSvf"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file