v2.2

2022-05-06 00:11:10 -06:00 · 2022-05-06 00:11:10 -06:00 · e18428166d
commit e18428166d
parent 4704eb1cef
4 changed files with 14 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode.
 ### New features
 #### v2.2; 2022/5/5
 - Added several new voices from the training set.
 - Automated redaction. Wrap text in brackets if you want it to prompt the model but not be spoken.
 - Bug fixes
 #### v2.1; 2022/5/2
 - Added ability to produce totally random voices.
 - Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent.
@ -95,11 +100,9 @@ For the those in the ML space: this is created by projecting a random vector ont
 ### Provided voices
-This repo comes with several pre-packaged voices. You will be familiar with many of them. :)
+This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform
-
+far better than the others. If your goal is high quality speech, I recommend you pick one of them. If you want to see
-Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set
+what Tortoise can do for zero-shot mimicking, take a look at the others.
 produce more realistic outputs than those outside of the training set. Any voice prepended with "train" came from the
 training set.
 ### Adding a new voice
--- a/tortoise/models/vocoder.py
+++ b/tortoise/models/vocoder.py
@ -284,8 +284,6 @@ class UnivNetGenerator(nn.Module):
            self.remove_weight_norm()
    def remove_weight_norm(self):
        print('Removing weight norm...')
        nn.utils.remove_weight_norm(self.conv_pre)
        for layer in self.conv_post:
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@ -137,7 +137,7 @@ class TacotronSTFT(torch.nn.Module):
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        from librosa.filters import mel as librosa_mel_fn
        mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@ -66,7 +66,7 @@ class Wav2VecAlignment:
        logits = logits[0]
        pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
-        fixed_expectation = max_alignment(expected_text, pred_string)
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
        w2v_compression = orig_len // logits.shape[0]
        expected_tokens = self.tokenizer.encode(fixed_expectation)
        expected_chars = list(fixed_expectation)
@ -100,7 +100,10 @@ class Wav2VecAlignment:
                    break
        pop_till_you_win()
-        assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks."
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
            torch.save([audio, expected_text], 'alignment_debug.pth')
            assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to" \
                          "your current working directory. Please report this along with the file so it can get fixed."
        # Now fix up alignments. Anything with -1 should be interpolated.
        alignments.append(orig_len)  # This'll get removed but makes the algorithm below more readable.