diff --git a/README.md b/README.md
index 15f8d32..ecf8968 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode.
 
 ### New features
 
+#### v2.2; 2022/5/5
+- Added several new voices from the training set.
+- Automated redaction. Wrap the text you want to use to prompt the model but not be spoken in brackets.
+- Bug fixes
+
 #### v2.1; 2022/5/2
 - Added ability to produce totally random voices.
 - Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent.
@@ -95,11 +100,9 @@ For the those in the ML space: this is created by projecting a random vector ont
 
 ### Provided voices
 
-This repo comes with several pre-packaged voices. You will be familiar with many of them. :)
-
-Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set
-produce more realistic outputs then those outside of the training set. Any voice prepended with "train" came from the
-training set.
+This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform
+far better than the others. If your goal is high quality speech, I recommend you pick one of them. If you want to see
+what Tortoise can do for zero-shot mimicing, take a look at the others.
 
 ### Adding a new voice
 
diff --git a/tortoise/models/vocoder.py b/tortoise/models/vocoder.py
index d38fb56..346f381 100644
--- a/tortoise/models/vocoder.py
+++ b/tortoise/models/vocoder.py
@@ -284,8 +284,6 @@ class UnivNetGenerator(nn.Module):
         self.remove_weight_norm()
 
     def remove_weight_norm(self):
-        print('Removing weight norm...')
-
         nn.utils.remove_weight_norm(self.conv_pre)
 
         for layer in self.conv_post:
diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
index e402910..f68e831 100644
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -137,7 +137,7 @@ class TacotronSTFT(torch.nn.Module):
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         from librosa.filters import mel as librosa_mel_fn
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 
diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
index fe4a3fb..4a05659 100644
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@@ -66,7 +66,7 @@ class Wav2VecAlignment:
         logits = logits[0]
         pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
 
-        fixed_expectation = max_alignment(expected_text, pred_string)
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
         w2v_compression = orig_len // logits.shape[0]
         expected_tokens = self.tokenizer.encode(fixed_expectation)
         expected_chars = list(fixed_expectation)
@@ -100,7 +100,10 @@ class Wav2VecAlignment:
                     break
             pop_till_you_win()
 
-        assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks."
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+            torch.save([audio, expected_text], 'alignment_debug.pth')
+            assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to" \
+                          "your current working directory. Please report this along with the file so it can get fixed."
 
         # Now fix up alignments. Anything with -1 should be interpolated.
         alignments.append(orig_len) # This'll get removed but makes the algorithm below more readable.
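
For context, here is a minimal sketch of how the bracket-based redaction described in the v2.2 notes (and backed by the `wav2vec_alignment.py` changes above) could be exercised through the Python API. It is not part of this patch; the voice name `train_atkins` and the exact `tts_with_preset` arguments are illustrative assumptions based on the repo's standard API.

```python
# Illustrative sketch (not part of this patch): bracketed text conditions the
# delivery but is redacted from the spoken audio by the wav2vec2 alignment step.
import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()  # redaction of bracketed text is handled internally
voice_samples, conditioning_latents = load_voice('train_atkins')  # assumed voice name

# "[I am really sad,]" steers the emotion of the read but is not spoken aloud.
gen = tts.tts_with_preset("[I am really sad,] Please feed me.",
                          voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents,
                          preset='fast')
torchaudio.save('sad_feed_me.wav', gen.squeeze(0).cpu(), 24000)
```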