forked from mrq/tortoise-tts
v2.2
commit e18428166d (parent 4704eb1cef)

README.md | 13
@@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode.
 
 ### New features
 
+#### v2.2; 2022/5/5
+- Added several new voices from the training set.
+- Automated redaction. Wrap the text you want to prompt the model with, but not have spoken, in brackets.
+- Bug fixes
+
 #### v2.1; 2022/5/2
 - Added ability to produce totally random voices.
 - Added ability to download voice conditioning latents via a script, and then use a user-provided conditioning latent.
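The redaction feature added in v2.2 is driven entirely by the prompt string. A minimal sketch of how it might be used, assuming the `TextToSpeech` API from `tortoise/api.py`; the voice name, preset, and output path are illustrative, and helper names may differ slightly between versions:

```python
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
voice_samples, conditioning_latents = load_voice('train_atkins')

# The bracketed prefix conditions the model's delivery (e.g. emotion),
# but the wav2vec alignment step cuts it out of the rendered audio.
gen = tts.tts_with_preset("[I am really sad,] Please feed me.",
                          voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents,
                          preset='fast')
torchaudio.save('fed.wav', gen.squeeze(0).cpu(), 24000)
```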
@@ -95,11 +100,9 @@ For those in the ML space: this is created by projecting a random vector ont
 
 ### Provided voices
 
-This repo comes with several pre-packaged voices. You will be familiar with many of them. :)
-
-Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set
-produce more realistic outputs than those outside of the training set. Any voice prepended with "train" came from the
-training set.
+This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform
+far better than the others. If your goal is high-quality speech, I recommend you pick one of them. If you want to see
+what Tortoise can do for zero-shot mimicry, take a look at the others.
 
 ### Adding a new voice
 
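For the conditioning-latent workflow mentioned in the v2.1 notes, usage might look like the sketch below. This assumes a latent file produced by the repo's download/extraction script, and that `tts_with_preset` accepts a `conditioning_latents` pair in place of reference clips; the file name is illustrative and the exact on-disk format is an assumption:

```python
import torch
from tortoise.api import TextToSpeech

tts = TextToSpeech()

# Hypothetical file: a user-provided conditioning latent pair
# (autoregressive latent, diffusion latent) saved with torch.save.
conditioning_latents = torch.load('my_voice_latents.pth')

gen = tts.tts_with_preset("Hello from a user-provided latent.",
                          voice_samples=None,
                          conditioning_latents=conditioning_latents,
                          preset='standard')
```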
@@ -284,8 +284,6 @@ class UnivNetGenerator(nn.Module):
         self.remove_weight_norm()
 
     def remove_weight_norm(self):
-        print('Removing weight norm...')
-
         nn.utils.remove_weight_norm(self.conv_pre)
 
         for layer in self.conv_post:
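For context on what `remove_weight_norm` does here: it folds the weight-norm reparametrization (`weight_g`/`weight_v`) back into a plain `.weight` tensor before inference, without changing the computed function. A self-contained PyTorch illustration (not tortoise code):

```python
import torch
import torch.nn as nn

conv = nn.utils.weight_norm(nn.Conv1d(4, 4, kernel_size=3))
x = torch.randn(1, 4, 16)
before = conv(x)

# Folds weight_g/weight_v back into a single .weight tensor; the module
# computes the same output, minus the reparametrization overhead.
nn.utils.remove_weight_norm(conv)
after = conv(x)
assert torch.allclose(before, after, atol=1e-6)
```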
@@ -137,7 +137,7 @@ class TacotronSTFT(torch.nn.Module):
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         from librosa.filters import mel as librosa_mel_fn
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
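The change above switches `librosa.filters.mel` to keyword arguments; recent librosa releases deprecated and then removed positional arguments for this function, so the positional call fails there. A standalone check with illustrative parameter values:

```python
from librosa.filters import mel as librosa_mel_fn

# On librosa >= 0.10 the positional form
#   librosa_mel_fn(22050, 1024, 80, 0.0, 8000.0)
# raises a TypeError; keyword arguments work on old and new versions alike.
mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80, fmin=0.0, fmax=8000.0)
print(mel_basis.shape)  # (80, 513): n_mels x (1 + n_fft // 2)
```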
@@ -66,7 +66,7 @@ class Wav2VecAlignment:
         logits = logits[0]
         pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
 
-        fixed_expectation = max_alignment(expected_text, pred_string)
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
         w2v_compression = orig_len // logits.shape[0]
         expected_tokens = self.tokenizer.encode(fixed_expectation)
         expected_chars = list(fixed_expectation)
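The `.lower()` fix matters because character-level alignment against the recognizer's transcript is case-sensitive: if the CTC decode comes back in a different case than the prompt, otherwise-identical characters stop matching. A toy illustration (the predicted string is hypothetical):

```python
expected = "Please feed me."
predicted = "please feed me"  # hypothetical lowercase CTC decode

raw = sum(a == b for a, b in zip(expected, predicted))
lowered = sum(a == b for a, b in zip(expected.lower(), predicted))
print(raw, lowered)  # 13 14 -- lowering recovers the mismatched 'P'
```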
@@ -100,7 +100,10 @@ class Wav2VecAlignment:
                 break
 
         pop_till_you_win()
-        assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks."
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+            torch.save([audio, expected_text], 'alignment_debug.pth')
+            assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to " \
+                          "your current working directory. Please report this along with the file so it can get fixed."
 
         # Now fix up alignments. Anything with -1 should be interpolated.
         alignments.append(orig_len)  # This'll get removed but makes the algorithm below more readable.
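If that assertion ever fires, the dump can be inspected before filing a report; the format follows directly from the `torch.save` call above (a two-element list of the input audio tensor and the text that failed to align):

```python
import torch

audio, expected_text = torch.load('alignment_debug.pth')
print(audio.shape)          # the audio tensor handed to the aligner
print(repr(expected_text))  # the prompt text that failed to align
```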