forked from mrq/tortoise-tts

v2.2

commit e18428166d, parent 4704eb1cef

 README.md | 13
@@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode.
 
 ### New features
 
+#### v2.2; 2022/5/5
+- Added several new voices from the training set.
+- Automated redaction. Wrap text that should prompt the model but not be spoken in brackets.
+- Bug fixes
+
 #### v2.1; 2022/5/2
 - Added ability to produce totally random voices.
 - Added ability to download voice conditioning latents via a script, and then use a user-provided conditioning latent.
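A quick usage sketch of the new redaction feature. The bracket syntax is the part documented above; the voice name, preset, and save step are illustrative and follow the patterns in tortoise's `do_tts.py` as I understand them, so treat the exact signatures as assumptions:

```python
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
voice_samples, conditioning_latents = load_voice('train_dotrice')

# The bracketed span conditions the model's delivery but is redacted from
# the output audio, so only "I am really sad, please help me." is spoken.
gen = tts.tts_with_preset(
    "[I am really sad,] I am really sad, please help me.",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset='fast')
torchaudio.save('redacted.wav', gen.squeeze(0).cpu(), 24000)
```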
@@ -95,11 +100,9 @@ For those in the ML space: this is created by projecting a random vector ont
 
 ### Provided voices
 
-This repo comes with several pre-packaged voices. You will be familiar with many of them. :)
-
-Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set
-produce more realistic outputs than those outside of the training set. Any voice prepended with "train" came from the
-training set.
+This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform
+far better than the others. If your goal is high-quality speech, I recommend you pick one of them. If you want to see
+what Tortoise can do for zero-shot mimicking, take a look at the others.
 
 ### Adding a new voice
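Tying back to the v2.1 entries above, a sketch of working with conditioning latents directly. `get_conditioning_latents` and `get_random_conditioning_latents` exist in tortoise's `api.py`, but the exact call pattern here is an assumption:

```python
import torch
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

tts = TextToSpeech()

# Compute conditioning latents from your own reference clips (22.05 kHz mono),
# then persist them so the raw clips are no longer needed at generation time.
clips = [load_audio(p, 22050) for p in ['ref1.wav', 'ref2.wav']]
latents = tts.get_conditioning_latents(clips)
torch.save(latents, 'my_voice.pth')

# A "totally random voice" is a sample from the same conditioning latent space.
random_latents = tts.get_random_conditioning_latents()
```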
@@ -284,8 +284,6 @@ class UnivNetGenerator(nn.Module):
         self.remove_weight_norm()
 
     def remove_weight_norm(self):
-        print('Removing weight norm...')
-
         nn.utils.remove_weight_norm(self.conv_pre)
 
         for layer in self.conv_post:
@@ -137,7 +137,7 @@ class TacotronSTFT(torch.nn.Module):
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         from librosa.filters import mel as librosa_mel_fn
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 
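The switch to keyword arguments keeps this call working on newer librosa releases (0.10+ makes `librosa.filters.mel` keyword-only) while remaining valid on older ones. A self-contained sketch with illustrative parameter values:

```python
import torch
from librosa.filters import mel as librosa_mel_fn

# Illustrative values; the real ones come from TacotronSTFT's constructor.
sampling_rate, filter_length, n_mel_channels = 24000, 1024, 80
mel_fmin, mel_fmax = 0, 12000

mel_basis = librosa_mel_fn(
    sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
    fmin=mel_fmin, fmax=mel_fmax)

# One row per mel band: shape (n_mels, 1 + n_fft // 2).
mel_basis = torch.from_numpy(mel_basis).float()
print(mel_basis.shape)  # torch.Size([80, 513])
```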
@@ -66,7 +66,7 @@ class Wav2VecAlignment:
         logits = logits[0]
         pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
 
-        fixed_expectation = max_alignment(expected_text, pred_string)
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
         w2v_compression = orig_len // logits.shape[0]
         expected_tokens = self.tokenizer.encode(fixed_expectation)
         expected_chars = list(fixed_expectation)
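The `.lower()` matters because the wav2vec CTC tokenizer decodes to lowercase, so mixed-case prompt text could never fully match the prediction. A toy illustration of the mismatch (not tortoise's actual `max_alignment` output):

```python
expected_text = "Hello World"
pred_string = "hello world"  # CTC decoding yields lowercase text

# A naive comparison fails wherever the case differs...
assert expected_text != pred_string
# ...but case-folding the expectation first lets the alignment proceed.
assert expected_text.lower() == pred_string
```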
@@ -100,7 +100,10 @@ class Wav2VecAlignment:
                 break
 
         pop_till_you_win()
-        assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks."
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+            torch.save([audio, expected_text], 'alignment_debug.pth')
+            assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to " \
+                "your current working directory. Please report this along with the file so it can get fixed."
 
         # Now fix up alignments. Anything with -1 should be interpolated.
         alignments.append(orig_len)  # This'll get removed but makes the algorithm below more readable.
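If the new assertion fires, the dump can be reloaded for triage. The file name comes from the diff above; everything else here is illustrative:

```python
import torch

# torch.save stored an [audio, expected_text] list, so unpacking mirrors that.
audio, expected_text = torch.load('alignment_debug.pth')
print(expected_text)
print(getattr(audio, 'shape', type(audio)))
```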