diff --git a/README.md b/README.md index 15f8d32..ecf8968 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode. ### New features +#### v2.2; 2022/5/5 +- Added several new voices from the training set. +- Automated redaction. Wrap the text you want to use to prompt the model but not be spoken in brackets. +- Bug fixes + #### v2.1; 2022/5/2 - Added ability to produce totally random voices. - Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent. @@ -95,11 +100,9 @@ For the those in the ML space: this is created by projecting a random vector ont ### Provided voices -This repo comes with several pre-packaged voices. You will be familiar with many of them. :) - -Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set -produce more realistic outputs then those outside of the training set. Any voice prepended with "train" came from the -training set. +This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform +far better than the others. If your goal is high quality speech, I recommend you pick one of them. If you want to see +what Tortoise can do for zero-shot mimicking, take a look at the others. 
### Adding a new voice diff --git a/examples/prompting/angry.mp3 b/examples/prompting/angry.mp3 new file mode 100644 index 0000000..6a833d1 Binary files /dev/null and b/examples/prompting/angry.mp3 differ diff --git a/examples/prompting/happy.mp3 b/examples/prompting/happy.mp3 new file mode 100644 index 0000000..79868e2 Binary files /dev/null and b/examples/prompting/happy.mp3 differ diff --git a/examples/prompting/sad.mp3 b/examples/prompting/sad.mp3 new file mode 100644 index 0000000..8ea2610 Binary files /dev/null and b/examples/prompting/sad.mp3 differ diff --git a/examples/prompting/scared.mp3 b/examples/prompting/scared.mp3 new file mode 100644 index 0000000..8bdfcd6 Binary files /dev/null and b/examples/prompting/scared.mp3 differ diff --git a/examples/various/desktop.ini b/examples/various/desktop.ini deleted file mode 100644 index d957fd1..0000000 --- a/examples/various/desktop.ini +++ /dev/null @@ -1,4 +0,0 @@ -[ViewState] -Mode= -Vid= -FolderType=Generic diff --git a/setup.py b/setup.py index 019e48d..da80c6f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh: setuptools.setup( name="TorToiSe", packages=setuptools.find_packages(), - version="2.1.3", + version="2.2.0", author="James Betker", author_email="james@adamant.ai", description="A high quality multi-voice text-to-speech library", diff --git a/tortoise/models/vocoder.py b/tortoise/models/vocoder.py index d38fb56..346f381 100644 --- a/tortoise/models/vocoder.py +++ b/tortoise/models/vocoder.py @@ -284,8 +284,6 @@ class UnivNetGenerator(nn.Module): self.remove_weight_norm() def remove_weight_norm(self): - print('Removing weight norm...') - nn.utils.remove_weight_norm(self.conv_pre) for layer in self.conv_post: diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index e402910..f68e831 100644 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -137,7 +137,7 @@ class TacotronSTFT(torch.nn.Module): self.stft_fn = 
STFT(filter_length, hop_length, win_length) from librosa.filters import mel as librosa_mel_fn mel_basis = librosa_mel_fn( - sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) + sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax) mel_basis = torch.from_numpy(mel_basis).float() self.register_buffer('mel_basis', mel_basis) diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py index fe4a3fb..4a05659 100644 --- a/tortoise/utils/wav2vec_alignment.py +++ b/tortoise/utils/wav2vec_alignment.py @@ -66,7 +66,7 @@ class Wav2VecAlignment: logits = logits[0] pred_string = self.tokenizer.decode(logits.argmax(-1).tolist()) - fixed_expectation = max_alignment(expected_text, pred_string) + fixed_expectation = max_alignment(expected_text.lower(), pred_string) w2v_compression = orig_len // logits.shape[0] expected_tokens = self.tokenizer.encode(fixed_expectation) expected_chars = list(fixed_expectation) @@ -100,7 +100,10 @@ class Wav2VecAlignment: break pop_till_you_win() - assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks." + if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)): + torch.save([audio, expected_text], 'alignment_debug.pth') + assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to" \ + "your current working directory. Please report this along with the file so it can get fixed." # Now fix up alignments. Anything with -1 should be interpolated. alignments.append(orig_len) # This'll get removed but makes the algorithm below more readable. 
diff --git a/tortoise/voices/applejack/1.wav b/tortoise/voices/applejack/1.wav new file mode 100644 index 0000000..d82dce2 Binary files /dev/null and b/tortoise/voices/applejack/1.wav differ diff --git a/tortoise/voices/applejack/2.wav b/tortoise/voices/applejack/2.wav new file mode 100644 index 0000000..4ed4965 Binary files /dev/null and b/tortoise/voices/applejack/2.wav differ diff --git a/tortoise/voices/applejack/3.wav b/tortoise/voices/applejack/3.wav new file mode 100644 index 0000000..6cc51ea Binary files /dev/null and b/tortoise/voices/applejack/3.wav differ diff --git a/tortoise/voices/rainbow/1.wav b/tortoise/voices/rainbow/1.wav new file mode 100644 index 0000000..53cf781 Binary files /dev/null and b/tortoise/voices/rainbow/1.wav differ diff --git a/tortoise/voices/rainbow/2.wav b/tortoise/voices/rainbow/2.wav new file mode 100644 index 0000000..1eae99c Binary files /dev/null and b/tortoise/voices/rainbow/2.wav differ diff --git a/tortoise/voices/rainbow/3.wav b/tortoise/voices/rainbow/3.wav new file mode 100644 index 0000000..71bc300 Binary files /dev/null and b/tortoise/voices/rainbow/3.wav differ diff --git a/tortoise/voices/train_daws/1.mp3 b/tortoise/voices/train_daws/1.mp3 new file mode 100644 index 0000000..4f2dbb0 Binary files /dev/null and b/tortoise/voices/train_daws/1.mp3 differ diff --git a/tortoise/voices/train_daws/2.mp3 b/tortoise/voices/train_daws/2.mp3 new file mode 100644 index 0000000..f754f03 Binary files /dev/null and b/tortoise/voices/train_daws/2.mp3 differ diff --git a/tortoise/voices/train_daws/3.mp3 b/tortoise/voices/train_daws/3.mp3 new file mode 100644 index 0000000..d9dace8 Binary files /dev/null and b/tortoise/voices/train_daws/3.mp3 differ diff --git a/tortoise/voices/train_dreams/1.mp3 b/tortoise/voices/train_dreams/1.mp3 new file mode 100644 index 0000000..f820e28 Binary files /dev/null and b/tortoise/voices/train_dreams/1.mp3 differ diff --git a/tortoise/voices/train_dreams/2.mp3 b/tortoise/voices/train_dreams/2.mp3 new 
file mode 100644 index 0000000..fbdd0ff Binary files /dev/null and b/tortoise/voices/train_dreams/2.mp3 differ diff --git a/tortoise/voices/train_dreams/3.mp3 b/tortoise/voices/train_dreams/3.mp3 new file mode 100644 index 0000000..2b73e06 Binary files /dev/null and b/tortoise/voices/train_dreams/3.mp3 differ diff --git a/tortoise/voices/train_empire/1.mp3 b/tortoise/voices/train_empire/1.mp3 new file mode 100644 index 0000000..de570b8 Binary files /dev/null and b/tortoise/voices/train_empire/1.mp3 differ diff --git a/tortoise/voices/train_empire/2.mp3 b/tortoise/voices/train_empire/2.mp3 new file mode 100644 index 0000000..45aa4da Binary files /dev/null and b/tortoise/voices/train_empire/2.mp3 differ diff --git a/tortoise/voices/train_empire/3.mp3 b/tortoise/voices/train_empire/3.mp3 new file mode 100644 index 0000000..674ad22 Binary files /dev/null and b/tortoise/voices/train_empire/3.mp3 differ diff --git a/tortoise/voices/train_mouse/1.mp3 b/tortoise/voices/train_mouse/1.mp3 new file mode 100644 index 0000000..937f182 Binary files /dev/null and b/tortoise/voices/train_mouse/1.mp3 differ diff --git a/tortoise/voices/train_mouse/2.mp3 b/tortoise/voices/train_mouse/2.mp3 new file mode 100644 index 0000000..275d90f Binary files /dev/null and b/tortoise/voices/train_mouse/2.mp3 differ diff --git a/tortoise/voices/train_mouse/3.mp3 b/tortoise/voices/train_mouse/3.mp3 new file mode 100644 index 0000000..fe197c7 Binary files /dev/null and b/tortoise/voices/train_mouse/3.mp3 differ diff --git a/tortoise/voices/yannic/00045.mp3 b/tortoise/voices/yannic/00045.mp3 new file mode 100644 index 0000000..1af9f74 Binary files /dev/null and b/tortoise/voices/yannic/00045.mp3 differ diff --git a/tortoise/voices/yannic/00055.mp3 b/tortoise/voices/yannic/00055.mp3 new file mode 100644 index 0000000..fbdac60 Binary files /dev/null and b/tortoise/voices/yannic/00055.mp3 differ diff --git a/tortoise/voices/yannic/00203.mp3 b/tortoise/voices/yannic/00203.mp3 new file mode 100644 index 
0000000..0e7da0d Binary files /dev/null and b/tortoise/voices/yannic/00203.mp3 differ diff --git a/tortoise_v2_examples.html b/tortoise_v2_examples.html index 088c349..5702ed3 100644 --- a/tortoise_v2_examples.html +++ b/tortoise_v2_examples.html @@ -1,4 +1,16 @@ -These words were never spoken.

Handpicked results


+TorToiSe - These words were never spoken. + +

Introduction 🐢

+

TorToiSe is a text-to-speech program built in April 2022 by jbetker@. TorToiSe is open source, with trained model weights +available at https://github.com/neonbjb/tortoise-tts

+ +

This page demonstrates some of the results of TorToiSe.

+ +

Handpicked results 🐢

+

Following are several particularly good results generated by the model.

+ +

Short-form

+



@@ -16,14 +28,28 @@


-

Handpicked longform result:


-

Compared to Tacotron2 (with the LJSpeech voice):

Tacotron2+WaveglowTorToiSe

+ +

Short-form

+
+ +

Compared to Tacotron2 (with the LJSpeech voice): 🐢

+

LJSpeech is a popular dataset used to train small-scale TTS models. TorToiSe is a multi-voice model; following is how +it renders the LJSpeech voice with no fine-tuning, compared with results for the same text from the popular Tacotron2 +model paired with the Waveglow vocoder:

+
Tacotron2+WaveglowTorToiSe






-

Various spoken texts for all voices:

+
textangiedanieldeniroemmafreemangeralthallejlawljmyselfpatsnakestomtrain_atkinstrain_dotricetrain_kennardweaverwilliam
+ +

All Results 🐢

+

Following are all the results from which the hand-picked results were drawn. Also included is the reference + audio that the program is trying to mimic. This will give you a better sense of how TorToiSe really performs.

+ +

Short-form

+ @@ -44,19 +70,36 @@ -
textangiedanieldeniroemmafreemangeralthallejlawljmyselfpatsnakestomtrain_atkinstrain_dotricetrain_kennardweaverwilliam
reference clip
autoregressive_ml
bengio_it_needs_to_know_what_is_bad
tacotron2_sample3
tacotron2_sample4
watts_this_is_the_real_secret_of_life
wilde_nowadays_people_know_the_price

Longform result for all voices:


-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
wilde_nowadays_people_know_the_price
+ +

Long-form

+Angelina:
+Craig:
+Deniro:
+Emma:
+Freeman:
+Geralt:
+Halle:
+Jlaw:
+LJ:
+Myself:
+Pat:
+Snakes:
+Tom:
+Weaver:
+William:
+ +

Prompt Engineering 🐢

+

Tortoise is capable of "prompt-engineering" in that tone and prosody are affected by the emotions expressed in the words +fed to the program. For example, prompting the model with "[I am so angry,] I went to the park and threw a ball" will +result in it outputting "I went to the park and threw a ball" with an angry tone.

+ +

Following are a few examples of different prompts. The effect is subtle, but is definitely there. Many voices are +less affected by this.

+ +Angry:
+Sad:
+Happy:
+Scared:
+ \ No newline at end of file