forked from mrq/tortoise-tts

DirectML: fixed redaction/aligner by forcing it to stay on CPU

mrq 2023-02-12 20:52:04 +00:00
parent 824ad38cca
commit 5b5e32338c
2 changed files with 11 additions and 10 deletions

View File

@@ -196,6 +196,7 @@ Below is a list of generation settings:
* `Samples`: analogous to samples in image generation. More samples = better resemblance / clone quality, at the cost of performance. This strictly affects clone quality.
* `Iterations`: influences the audio quality of the final output. More iterations = higher-quality sound. This step is relatively cheap, so do not be discouraged from increasing it. This strictly affects the quality of the actual sound.
* `Temperature`: how much randomness to introduce into the generated samples. Lower values = closer resemblance to the source samples, but some temperature is still required for good output.
  - I assume this affects the variance between candidates: very low temperatures yield very little variety between candidates, while very high temperatures yield large differences between them.
  - **!**NOTE**!**: This value is very inconsistent and depends entirely on the input voice. In other words, some voices will be receptive to playing with this value, while for others it won't make much of a difference.
  - **!**NOTE**!**: some voices are very receptive to this, speaking slowly at low temperatures, but nudge it a hair and they speak too fast.
+* `Pause Size`: governs how large the pauses at the end of a clip are (in token size, not seconds). Increase this if your output gets cut off at the end.
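
For orientation, these UI settings correspond roughly to keyword arguments on tortoise's Python API. Below is a minimal sketch of that mapping, assuming the upstream tortoise-tts names (`TextToSpeech.tts()`, `load_voice`, `num_autoregressive_samples`, `diffusion_iterations`, `temperature`); the exact wiring in this fork may differ, and `Pause Size` has no upstream keyword shown here.

```python
# Rough mapping from the UI settings above to tortoise's Python API
# (upstream parameter names; treat the mapping as an assumption).
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
voice_samples, _ = load_voice('myvoice')  # hypothetical voice folder
wav = tts.tts(
    "Hello there.",
    voice_samples=voice_samples,
    num_autoregressive_samples=256,  # `Samples`: resemblance vs. speed
    diffusion_iterations=200,        # `Iterations`: sound quality, relatively cheap
    temperature=0.8,                 # `Temperature`: randomness across candidates
)
```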

View File

@@ -242,7 +242,7 @@ class TextToSpeech:
        self.enable_redaction = enable_redaction
        self.device = device
        if self.enable_redaction:
-            self.aligner = Wav2VecAlignment(device=None)
+            self.aligner = Wav2VecAlignment(device='cpu' if get_device_name() == "dml" else self.device)
        self.tokenizer = VoiceBpeTokenizer()
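
The gist of the fix: `get_device_name()` is this fork's device helper, and it reports "dml" when running under torch-directml, where the wav2vec-based aligner apparently fails; in that case the aligner is pinned to CPU. A minimal sketch of the fallback, with the helper stubbed so the snippet stands alone:

```python
# Sketch of the DirectML fallback above; get_device_name() is stubbed here.
def get_device_name():
    return "dml"  # stand-in; the fork detects cuda/dml/cpu at runtime

def aligner_device(default_device):
    # the wav2vec aligner is assumed not to work under DirectML, so use CPU there
    return 'cpu' if get_device_name() == "dml" else default_device

print(aligner_device('cuda:0'))  # -> 'cpu' while get_device_name() reports "dml"
```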
@@ -310,12 +310,6 @@ class TextToSpeech:
        voice_samples = [v.to(device) for v in voice_samples]
-        samples = []
-        auto_conds = []
-        for vs in voice_samples:
-            auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate))
-        auto_conds = torch.stack(auto_conds, dim=1)
        resampler = torchaudio.transforms.Resample(
            self.input_sample_rate,
            self.output_sample_rate,
@@ -324,10 +318,16 @@ class TextToSpeech:
            resampling_method="kaiser_window",
            beta=8.555504641634386,
        )
+        # resample in its own pass to make things easier
+        samples = []
+        auto_conds = []
+        for sample in voice_samples:
+            auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate))
+            samples.append(resampler(sample.cpu()).to(device))  # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU
+        auto_conds = torch.stack(auto_conds, dim=1)
        self.autoregressive = self.autoregressive.to(device)
        auto_latent = self.autoregressive.get_conditioning(auto_conds)
        if self.preloaded_tensors:
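
The refactor above folds conditioning and resampling into a single pass over the voice samples and, per the inline comment, dodges GPU/DirectML resampling entirely by resampling on CPU and moving the result back. A self-contained sketch of that pattern on synthetic audio; the 22050 to 24000 rates match tortoise's input/output sample rates, and note that newer torchaudio spells the method "sinc_interp_kaiser" instead of "kaiser_window":

```python
# CPU-resample-then-move pattern from the loop above, on fake audio.
import torch
import torchaudio

resampler = torchaudio.transforms.Resample(
    22050, 24000,                    # tortoise's input/output sample rates
    resampling_method="kaiser_window",
    beta=8.555504641634386,
)
device = torch.device('cpu')         # stand-in; the fork picks cuda/dml/cpu
sample = torch.randn(1, 22050)       # one second of fake 22.05 kHz audio
resampled = resampler(sample.cpu()).to(device)  # resample on CPU, move back
print(resampled.shape)               # torch.Size([1, 24000])
```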
@@ -661,7 +661,7 @@ class TextToSpeech:
                input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
            wav = self.vocoder.inference(mel)
-            wav_candidates.append(wav.cpu())
+            wav_candidates.append(wav)
        if not self.preloaded_tensors:
            self.diffusion = self.diffusion.cpu()
@@ -669,7 +669,7 @@ class TextToSpeech:
        def potentially_redact(clip, text):
            if self.enable_redaction:
-                return self.aligner.redact(clip.squeeze(1), text, self.output_sample_rate).unsqueeze(1)
+                return self.aligner.redact(clip.squeeze(1).to('cpu' if get_device_name() == "dml" else self.device), text, self.output_sample_rate).unsqueeze(1)
            return clip
        wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
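
Taken together with the earlier `wav_candidates.append(wav)` change, candidates now stay on the generation device, and each clip is only moved to the aligner's device at redact time. A sketch of the tensor plumbing, with the actual `redact()` call stubbed out:

```python
# Shape/device plumbing of potentially_redact(), with redact() stubbed.
import torch

def redact_stub(flat):
    return flat  # stand-in for self.aligner.redact(flat, text, sample_rate)

clip = torch.randn(1, 1, 24000)            # one candidate: (batch, channel, samples)
aligner_device = 'cpu'                     # 'cpu' under DirectML, else the main device
flat = clip.squeeze(1).to(aligner_device)  # (batch, samples) on the aligner's device
redacted = redact_stub(flat).unsqueeze(1)  # restore channel dim: (batch, 1, samples)
```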