forked from mrq/tortoise-tts
DirectML: fixed redaction/aligner by forcing it to stay on CPU
This commit is contained in:
parent 824ad38cca
commit 5b5e32338c
@@ -196,6 +196,7 @@ Below are a list of generation settings:
 * `Samples`: analogous to samples in image generation. More samples = better resemblance / clone quality, at the cost of performance. This strictly affects clone quality.
 * `Iterations`: influences audio sound quality in the final output. More iterations = higher quality sound. This step is relatively cheap, so do not be discouraged from increasing this. This strictly affects quality in the actual sound.
 * `Temperature`: how much randomness to introduce to the generated samples. Lower values = better resemblance to the source samples, but some temperature is still required for great output.
+    - I assume this affects variance between candidates. Very low temperatures will have very low variety between candidates. Very high temperatures will have large nuances between candidates.
     - **!**NOTE**!**: This value is very inconsistent and entirely depends on the input voice. In other words, some voices will be receptive to playing with this value, while others won't make much of a difference.
     - **!**NOTE**!**: some voices will be very receptive to this, where it speaks slowly at low temperatures, but nudging it a hair and it speaks too fast.
 * `Pause Size`: Governs how large pauses are at the end of a clip (in token size, not seconds). Increase this if your output gets cut off at the end.

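For reference, a minimal sketch of how these settings plausibly map onto tortoise's `TextToSpeech.tts()` keyword arguments. Parameter names follow upstream tortoise-tts; the exact wiring in this fork's web UI, and the voice name, are assumptions (`Pause Size` appears to be this fork's own addition with no upstream kwarg):

```python
# Hypothetical mapping of the generation settings above onto tortoise's tts() API.
# Parameter names follow upstream tortoise-tts; values and voice name are illustrative.
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
voice_samples, conditioning_latents = load_voice('myvoice')  # assumed voice folder

wav = tts.tts(
    "Hello world.",
    voice_samples=voice_samples,
    num_autoregressive_samples=128,  # `Samples`: more = better clone quality, slower
    diffusion_iterations=128,        # `Iterations`: more = better sound quality, relatively cheap
    temperature=0.8,                 # `Temperature`: randomness across generated candidates
)
```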
@@ -242,7 +242,7 @@ class TextToSpeech:
         self.enable_redaction = enable_redaction
         self.device = device
         if self.enable_redaction:
-            self.aligner = Wav2VecAlignment(device=None)
+            self.aligner = Wav2VecAlignment(device='cpu' if get_device_name() == "dml" else self.device)

         self.tokenizer = VoiceBpeTokenizer()

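The fix itself: instead of letting `Wav2VecAlignment` choose its own device (`device=None`), the aligner is pinned to CPU whenever the active backend is DirectML, where wav2vec2 inference does not run. A minimal sketch of the pattern, assuming `get_device_name()` is this fork's helper that returns `"dml"` under torch-directml:

```python
# Sketch of the device-pinning pattern above; the import path is an assumption
# about where this fork keeps its backend probe.
from tortoise.utils.device import get_device_name  # assumed module path

def pick_aligner_device(default_device):
    # wav2vec2 alignment is unsupported on DirectML, so fall back to CPU there
    return 'cpu' if get_device_name() == "dml" else default_device

# usage, mirroring the diff:
#   self.aligner = Wav2VecAlignment(device=pick_aligner_device(self.device))
```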
@@ -310,12 +310,6 @@ class TextToSpeech:

         voice_samples = [v.to(device) for v in voice_samples]

-        samples = []
-        auto_conds = []
-        for vs in voice_samples:
-            auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate))
-        auto_conds = torch.stack(auto_conds, dim=1)
-
         resampler = torchaudio.transforms.Resample(
             self.input_sample_rate,
             self.output_sample_rate,
@@ -324,10 +318,16 @@ class TextToSpeech:
             resampling_method="kaiser_window",
             beta=8.555504641634386,
         )
-        # resample in its own pass to make things easier
+        samples = []
+        auto_conds = []
         for sample in voice_samples:
+            auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate))
             samples.append(resampler(sample.cpu()).to(device)) # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU

+        auto_conds = torch.stack(auto_conds, dim=1)

         self.autoregressive = self.autoregressive.to(device)
         auto_latent = self.autoregressive.get_conditioning(auto_conds)
         if self.preloaded_tensors:

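The two hunks above fold the separate conditioning pass into the existing resampling loop, so `voice_samples` is walked only once. A condensed sketch of the resulting flow (`format_conditioning` and its `sampling_rate` keyword are taken from the diff; the import path and standalone function shape are assumptions):

```python
import torch
import torchaudio

from tortoise.api import format_conditioning  # assumed import path

# Condensed sketch of the merged loop above. Resampling still round-trips
# through CPU, as the in-line comment in the diff notes.
def prepare_voice_samples(voice_samples, device, input_sr, output_sr):
    resampler = torchaudio.transforms.Resample(input_sr, output_sr)
    samples, auto_conds = [], []
    for sample in voice_samples:
        # conditioning for the autoregressive model, built at the input rate
        auto_conds.append(format_conditioning(sample, device=device, sampling_rate=input_sr))
        # resample on CPU, then move the result back to the target device
        samples.append(resampler(sample.cpu()).to(device))
    return samples, torch.stack(auto_conds, dim=1)
```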
@@ -661,7 +661,7 @@ class TextToSpeech:
                     input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)

                 wav = self.vocoder.inference(mel)
-                wav_candidates.append(wav.cpu())
+                wav_candidates.append(wav)

             if not self.preloaded_tensors:
                 self.diffusion = self.diffusion.cpu()

@@ -669,7 +669,7 @@ class TextToSpeech:

         def potentially_redact(clip, text):
             if self.enable_redaction:
-                return self.aligner.redact(clip.squeeze(1), text, self.output_sample_rate).unsqueeze(1)
+                return self.aligner.redact(clip.squeeze(1).to('cpu' if get_device_name() == "dml" else self.device), text, self.output_sample_rate).unsqueeze(1)
             return clip
         wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
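Together with the `wav_candidates.append(wav)` change above, candidates now stay on the generation device after the vocoder, and a clip is only moved when redaction actually runs: to CPU on DirectML, where the aligner now lives, or to the active device otherwise. A sketch of that flow, with names taken from the diff:

```python
# Sketch of the post-vocoder redaction flow after this commit; enable_redaction,
# aligner, and get_device_name are the fork's, per the hunks above.
def potentially_redact(self, clip, text):
    if not self.enable_redaction:
        return clip  # left wherever the vocoder produced it
    # the aligner is pinned to CPU under DirectML, so the clip must follow it there
    target = 'cpu' if get_device_name() == "dml" else self.device
    redacted = self.aligner.redact(clip.squeeze(1).to(target), text, self.output_sample_rate)
    return redacted.unsqueeze(1)
```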