From e4e8ebfc55b1e8896151286a12564301637a2054 Mon Sep 17 00:00:00 2001 From: James Betker Date: Mon, 2 May 2022 20:20:50 -0600 Subject: [PATCH] getting ready for 2.1 release --- README.md | 22 ++++++++++++++++++++-- tortoise/api.py | 2 +- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 377f93a..12f27d9 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,15 @@ Tortoise is a text-to-speech program built with the following priorities: This repo contains all the code needed to run Tortoise TTS in inference mode. +### New features + +#### v2.1; 2022/5/2 +- Added ability to produce totally random voices. +- Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent. +- Added ability to use your own pretrained models. +- Refactored directory structures. +- Performance improvements & bug fixes. + ## What's in a name? I'm naming my speech-related repos after Mojave desert flora and fauna. Tortoise is a bit tongue in cheek: this model @@ -38,7 +47,7 @@ pip install -r requirements.txt This script allows you to speak a single phrase with one or more voices. ```shell -python do_tts.py --text "I'm going to speak this" --voice dotrice --preset fast +python do_tts.py --text "I'm going to speak this" --voice random --preset fast ``` ### read.py @@ -46,7 +55,7 @@ python do_tts.py --text "I'm going to speak this" --voice dotrice --preset fast This script provides tools for reading large amounts of text. ```shell -python read.py --textfile --voice dotrice +python read.py --textfile --voice random ``` This will break up the textfile into sentences, and then convert them to speech one at a time. It will output a series @@ -72,6 +81,15 @@ Tortoise was specifically trained to be a multi-speaker model. It accomplishes t These reference clips are recordings of a speaker that you provide to guide speech generation. 
These clips are used to determine many properties of the output, such as the pitch and tone of the voice, speaking speed, and even speaking defects like a lisp or stuttering. The reference clip is also used to determine non-voice related aspects of the audio output like volume, background noise, recording quality and reverb. +### Random voice + +I've included a feature which randomly generates a voice. These voices don't actually exist and will be random every time you run +it. The results are quite fascinating and I recommend you play around with it! + +You can use the random voice by passing in 'random' as the voice name. Tortoise will take care of the rest. + +For those in the ML space: this is created by projecting a random vector onto the voice conditioning latent space. + ### Provided voices This repo comes with several pre-packaged voices. You will be familiar with many of them. :) diff --git a/tortoise/api.py b/tortoise/api.py index 5d2165b..c4cce4d 100644 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -165,7 +165,7 @@ class TextToSpeech: Main entry point into Tortoise. """ - def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=True): + def __init__(self, autoregressive_batch_size=16, models_dir='.models', enable_redaction=False): """ Constructor :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing