forked from mrq/tortoise-tts
This commit is contained in:
James Betker 2022-05-06 00:11:10 -06:00
parent 4704eb1cef
commit e18428166d
4 changed files with 14 additions and 10 deletions

View File

@@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode.
### New features
+#### v2.2; 2022/5/5
+- Added several new voices from the training set.
+- Automated redaction. Wrap text that should prompt the model, but not be spoken, in brackets (see the sketch below).
+- Bug fixes
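A minimal usage sketch for the redaction feature. The voice name, preset, and import paths here are assumptions about this repo's layout rather than part of the commit; the bracketed span conditions the delivery but is cut from the output audio.

```python
# Hedged sketch of automated redaction (paths/names are assumptions).
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()  # redaction is enabled by default
voice_samples, conditioning_latents = load_voice('train_dotrice')

# "[I am really sad,]" steers the emotional delivery but is never spoken.
gen = tts.tts_with_preset("[I am really sad,] Please feed the dog.",
                          voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents,
                          preset='fast')
torchaudio.save('sad_dog.wav', gen.squeeze(0).cpu(), 24000)
```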
#### v2.1; 2022/5/2
- Added ability to produce totally random voices.
- Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent.
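For the two v2.1 items above, a hedged sketch of how they might be used together. The `.pth` filename is a placeholder, and the assumption that omitting voice samples and latents yields a random speaker follows the random-voice feature described in the changelog.

```python
# Hedged sketch: random voices and user-provided conditioning latents.
import torch
from tortoise.api import TextToSpeech

tts = TextToSpeech()

# With no voice samples or latents supplied, a random speaker is drawn.
random_gen = tts.tts_with_preset("A totally random voice.", preset='fast')

# A conditioning latent saved earlier can be re-used directly.
latents = torch.load('my_voice_latents.pth')  # placeholder path
saved_gen = tts.tts_with_preset("Speaking with a saved conditioning latent.",
                                conditioning_latents=latents,
                                preset='fast')
```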
@@ -95,11 +100,9 @@ For those in the ML space: this is created by projecting a random vector ont
### Provided voices
-This repo comes with several pre-packaged voices. You will be familiar with many of them. :)
-Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set
-produce more realistic outputs then those outside of the training set. Any voice prepended with "train" came from the
-training set.
+This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform
+far better than the others. If your goal is high quality speech, I recommend you pick one of them. If you want to see
+what Tortoise can do for zero-shot mimicking, take a look at the others.
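For example, a training-set voice can be selected straight from the bundled CLI (the flags assume `do_tts.py` as shipped in this repo):

```shell
python tortoise/do_tts.py --text "I'm going to speak this" --voice train_dotrice --preset fast
```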
### Adding a new voice

View File

@@ -284,8 +284,6 @@ class UnivNetGenerator(nn.Module):
            self.remove_weight_norm()

    def remove_weight_norm(self):
        print('Removing weight norm...')
-        nn.utils.remove_weight_norm(self.conv_pre)
        for layer in self.conv_post:
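Context for the removed call, as I read it (the commit itself gives no rationale): PyTorch's `remove_weight_norm` raises a `ValueError` when the module has no weight-norm hook registered, so the call is only safe on layers that were actually wrapped.

```python
import torch.nn as nn

conv = nn.Conv1d(64, 64, kernel_size=7)  # plain conv, weight norm never applied
nn.utils.remove_weight_norm(conv)        # raises ValueError: weight_norm of 'weight' not found
```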

View File

@@ -137,7 +137,7 @@ class TacotronSTFT(torch.nn.Module):
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        from librosa.filters import mel as librosa_mel_fn
        mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)
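The rewritten call matches librosa's keyword-only signature: from librosa 0.10 onward, `librosa.filters.mel` rejects positional arguments. A quick comparison, with placeholder parameter values:

```python
from librosa.filters import mel as librosa_mel_fn

# Keyword form: accepted by both old and new librosa releases.
mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80, fmin=0.0, fmax=8000.0)

# Positional form: raises TypeError on librosa >= 0.10, where the
# arguments became keyword-only.
# mel_basis = librosa_mel_fn(22050, 1024, 80, 0.0, 8000.0)
```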

View File

@@ -66,7 +66,7 @@ class Wav2VecAlignment:
        logits = logits[0]
        pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
-        fixed_expectation = max_alignment(expected_text, pred_string)
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
        w2v_compression = orig_len // logits.shape[0]
        expected_tokens = self.tokenizer.encode(fixed_expectation)
        expected_chars = list(fixed_expectation)
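Why the added `.lower()` helps, as I read the diff: `max_alignment` matches characters between the expected text and the decoded prediction, and character comparison is case-sensitive, so a decode in a different case convention produces spurious mismatches.

```python
expected = "Hello There"
predicted = "hello there"  # e.g. a CTC decode in a different case convention

mismatches = [(a, b) for a, b in zip(expected, predicted) if a != b]
print(mismatches)                     # [('H', 'h'), ('T', 't')] -- spurious misses
print(expected.lower() == predicted)  # True once case is normalized
```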
@@ -100,7 +100,10 @@ class Wav2VecAlignment:
                break
        pop_till_you_win()
-        assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks."
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+            torch.save([audio, expected_text], 'alignment_debug.pth')
+            assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to " \
+                          "your current working directory. Please report this along with the file so it can get fixed."
        # Now fix up alignments. Anything with -1 should be interpolated.
        alignments.append(orig_len)  # This'll get removed but makes the algorithm below more readable.
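If that failure path fires, the dump can be inspected directly; per the `torch.save` call above, it holds the audio tensor and the expected text.

```python
import torch

# Inspect the debug artifact written by a failing alignment run.
audio, expected_text = torch.load('alignment_debug.pth')
print(expected_text)
print(audio.shape)
```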