2022-03-11 06:21:01 +00:00
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "tortoise-tts.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
2022-04-25 22:59:04 +00:00
{
"cell_type": "markdown",
"source": [
"Welcome to Tortoise! 🐢🐢🐢🐢\n",
"\n",
"Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
"\n",
"There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
],
"metadata": {
"id": "_pIZ3ZXNp7cf"
}
},
2022-03-11 06:21:01 +00:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {
2022-04-26 03:17:49 +00:00
"id": "JrK20I32grP6"
2022-03-11 06:21:01 +00:00
},
2022-04-26 03:17:49 +00:00
"outputs": [],
2022-03-11 06:21:01 +00:00
"source": [
"# Fetch the Tortoise sources and make the checkout the working directory:\n",
"# the cells below import `api` / `utils.audio` and read `voices/` relative to it.\n",
"!git clone https://github.com/neonbjb/tortoise-tts.git\n",
"%cd tortoise-tts\n",
"# Use the %pip magic (not !pip) so packages are installed into the environment\n",
"# of the kernel that is actually running this notebook.\n",
"%pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"source": [
"# Imports used through the rest of the notebook.\n",
"import torch\n",
"import torchaudio\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
2022-04-26 03:28:18 +00:00
"import IPython\n",
"\n",
2022-04-25 22:59:04 +00:00
"from api import TextToSpeech\n",
"from utils.audio import load_audio, get_voices\n",
2022-03-11 06:21:01 +00:00
"\n",
2022-04-25 22:59:04 +00:00
"# This will download all the models used by Tortoise from the HF hub.\n",
"tts = TextToSpeech()"
2022-03-11 06:21:01 +00:00
],
"metadata": {
2022-04-26 03:17:49 +00:00
"id": "Gen09NM4hONQ"
2022-03-11 06:21:01 +00:00
},
"execution_count": null,
2022-04-26 03:17:49 +00:00
"outputs": []
2022-03-11 06:21:01 +00:00
},
{
"cell_type": "code",
"source": [
2022-04-25 22:59:04 +00:00
"# List all the voices available. These are just some random clips I've gathered\n",
"# from the internet as well as a few voices from the training dataset.\n",
"# Feel free to add your own clips to the voices/ folder.\n",
"%ls voices"
2022-03-11 06:21:01 +00:00
],
"metadata": {
2022-04-26 03:17:49 +00:00
"id": "SSleVnRAiEE2"
2022-03-11 06:21:01 +00:00
},
"execution_count": null,
2022-04-26 03:17:49 +00:00
"outputs": []
2022-03-11 06:21:01 +00:00
},
{
"cell_type": "code",
"source": [
"# This is the text that will be spoken.\n",
"text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\"\n",
"\n",
"# Here's something for the poetically inclined. To hear it instead of the\n",
"# sentence above, add `text = poem_text` after this assignment.\n",
"poem_text = \"\"\"\n",
"Then took the other, as just as fair,\n",
"And having perhaps the better claim,\n",
"Because it was grassy and wanted wear;\n",
"Though as for that the passing there\n",
"Had worn them really about the same,\"\"\"\n",
"\n",
"# Pick one of the voices from above\n",
"voice = 'train_dotrice'\n",
"# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
"preset = \"fast\""
2022-03-11 06:21:01 +00:00
],
"metadata": {
"id": "bt_aoxONjfL2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
2022-04-25 22:59:04 +00:00
"# Fetch the voice references and forward execute!\n",
"voices = get_voices()\n",
"if voice not in voices:\n",
"    # Fail early and list the valid choices instead of surfacing a bare KeyError.\n",
"    raise KeyError(f\"Unknown voice {voice!r}; available voices: {sorted(voices)}\")\n",
"cond_paths = voices[voice]\n",
"\n",
"# Load every reference clip of the selected voice.\n",
"# NOTE(review): 22050 is presumably the sample rate load_audio resamples to - confirm in utils/audio.py.\n",
"conds = [load_audio(cond_path, 22050) for cond_path in cond_paths]\n",
"\n",
"gen = tts.tts_with_preset(text, conds, preset)\n",
"# The third argument of torchaudio.save is the sample rate of the written file.\n",
"torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
"# Last expression of the cell: renders an inline audio player for the result.\n",
"IPython.display.Audio('generated.wav')"
2022-03-11 06:21:01 +00:00
],
"metadata": {
2022-04-26 03:17:49 +00:00
"id": "KEXOKjIvn6NW"
2022-03-11 06:21:01 +00:00
},
"execution_count": null,
2022-04-26 03:17:49 +00:00
"outputs": []
2022-03-11 06:21:01 +00:00
},
{
"cell_type": "code",
"source": [
2022-04-25 22:59:04 +00:00
"# You can add as many conditioning voices as you want together. Combining\n",
"# clips from multiple voices takes the mean of the latent space for all\n",
"# voices. This creates a novel voice that is a combination of the two inputs.\n",
"#\n",
"# Let's see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:\n",
"# Pool the reference clips of both voices into one conditioning list.\n",
"# `voices` is the mapping returned by get_voices() in the previous cell.\n",
"conds = [\n",
"    load_audio(cond_path, 22050)\n",
"    for v in ['pat', 'william']\n",
"    for cond_path in voices[v]\n",
"]\n",
"\n",
"gen = tts.tts_with_preset(\"They used to say that if man was meant to fly, he’d have wings. But he did fly. He discovered he had to.\", conds, preset)\n",
"# The third argument of torchaudio.save is the sample rate of the written file.\n",
"torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)\n",
"IPython.display.Audio('captain_kirkard.wav')"
2022-03-11 06:21:01 +00:00
],
"metadata": {
2022-04-26 03:17:49 +00:00
"id": "fYTk8KUezUr5"
2022-03-11 06:21:01 +00:00
},
"execution_count": null,
2022-04-26 03:17:49 +00:00
"outputs": []
2022-03-11 06:21:01 +00:00
}
]
}