diff --git a/README.md b/README.md index 636b5bf..6ccf610 100755 --- a/README.md +++ b/README.md @@ -196,6 +196,7 @@ Below are a list of generation settings: * `Samples`: analogous to samples in image generation. More samples = better resemblance / clone quality, at the cost of performance. This strictly affects clone quality. * `Iterations`: influences audio sound quality in the final output. More iterations = higher quality sound. This step is relatively cheap, so do not be discouraged from increasing this. This strictly affects quality in the actual sound. * `Temperature`: how much randomness to introduce to the generated samples. Lower values = better resemblance to the source samples, but some temperature is still required for great output. + - I assume this also affects the variance between candidates. Very low temperatures will produce very little variety between candidates, while very high temperatures will produce large differences between candidates. - **!**NOTE**!**: This value is very inconsistent and entirely depends on the input voice. In other words, some voices will be receptive to playing with this value, while others won't make much of a difference. - **!**NOTE**!**: some voices will be very receptive to this, where it speaks slowly at low temperatures, but nudging it a hair and it speaks too fast. * `Pause Size`: Governs how large pauses are at the end of a clip (in token size, not seconds). Increase this if your output gets cut off at the end. 
diff --git a/tortoise/api.py b/tortoise/api.py index 8dba13c..4d499c2 100755 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -242,7 +242,7 @@ class TextToSpeech: self.enable_redaction = enable_redaction self.device = device if self.enable_redaction: - self.aligner = Wav2VecAlignment(device=None) + self.aligner = Wav2VecAlignment(device='cpu' if get_device_name() == "dml" else self.device) self.tokenizer = VoiceBpeTokenizer() @@ -310,12 +310,6 @@ class TextToSpeech: voice_samples = [v.to(device) for v in voice_samples] - samples = [] - auto_conds = [] - for vs in voice_samples: - auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate)) - auto_conds = torch.stack(auto_conds, dim=1) - resampler = torchaudio.transforms.Resample( self.input_sample_rate, self.output_sample_rate, @@ -324,10 +318,16 @@ class TextToSpeech: resampling_method="kaiser_window", beta=8.555504641634386, ) - # resample in its own pass to make things easier + + samples = [] + auto_conds = [] for sample in voice_samples: + auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate)) samples.append(resampler(sample.cpu()).to(device)) # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU + auto_conds = torch.stack(auto_conds, dim=1) + + self.autoregressive = self.autoregressive.to(device) auto_latent = self.autoregressive.get_conditioning(auto_conds) if self.preloaded_tensors: @@ -661,7 +661,7 @@ class TextToSpeech: input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate) wav = self.vocoder.inference(mel) - wav_candidates.append(wav.cpu()) + wav_candidates.append(wav) if not self.preloaded_tensors: self.diffusion = self.diffusion.cpu() @@ -669,7 +669,7 @@ class TextToSpeech: def potentially_redact(clip, text): if self.enable_redaction: - return self.aligner.redact(clip.squeeze(1), text, self.output_sample_rate).unsqueeze(1) + return 
self.aligner.redact(clip.squeeze(1).to('cpu' if get_device_name() == "dml" else self.device), text, self.output_sample_rate).unsqueeze(1) return clip wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]