From 5b5e32338ce140118ed8f127214de0885117f09c Mon Sep 17 00:00:00 2001
From: mrq <barry.quiggles@protonmail.com>
Date: Sun, 12 Feb 2023 20:52:04 +0000
Subject: [PATCH] DirectML: fixed redaction/aligner by forcing it to stay on
 CPU

---
 README.md       |  1 +
 tortoise/api.py | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 636b5bf..6ccf610 100755
--- a/README.md
+++ b/README.md
@@ -196,6 +196,7 @@ Below are a list of generation settings:
 * `Samples`: analogous to samples in image generation. More samples = better resemblance / clone quality, at the cost of performance. This strictly affects clone quality.
 * `Iterations`: influences audio sound quality in the final output. More iterations = higher quality sound. This step is relatively cheap, so do not be discouraged from increasing this. This strictly affects quality in the actual sound.
 * `Temperature`: how much randomness to introduce to the generated samples. Lower values = better resemblance to the source samples, but some temperature is still required for great output.
+	- I assume this affects variance between candidates: very low temperatures produce very little variety between candidates, while very high temperatures produce large differences between candidates.
 	- **!**NOTE**!**: This value is very inconsistent and entirely depends on the input voice. In other words, some voices will be receptive to playing with this value, while others won't make much of a difference.
 	- **!**NOTE**!**: some voices will be very receptive to this, where it speaks slowly at low temperatures, but nudging it a hair and it speaks too fast.
 * `Pause Size`: Governs how large pauses are at the end of a clip (in token size, not seconds). Increase this if your output gets cut off at the end.
diff --git a/tortoise/api.py b/tortoise/api.py
index 8dba13c..4d499c2 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -242,7 +242,7 @@ class TextToSpeech:
         self.enable_redaction = enable_redaction
         self.device = device
         if self.enable_redaction:
-            self.aligner = Wav2VecAlignment(device=None)
+            self.aligner = Wav2VecAlignment(device='cpu' if get_device_name() == "dml" else self.device)
 
         self.tokenizer = VoiceBpeTokenizer()
 
@@ -310,12 +310,6 @@ class TextToSpeech:
             
             voice_samples = [v.to(device) for v in voice_samples]
 
-            samples = []
-            auto_conds = []
-            for vs in voice_samples:
-                auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate))
-            auto_conds = torch.stack(auto_conds, dim=1)
-
             resampler = torchaudio.transforms.Resample(
                 self.input_sample_rate,
                 self.output_sample_rate,
@@ -324,10 +318,16 @@ class TextToSpeech:
                 resampling_method="kaiser_window",
                 beta=8.555504641634386,
             )
-            # resample in its own pass to make things easier
+
+            samples = []
+            auto_conds = []
             for sample in voice_samples:
+                auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate))
                 samples.append(resampler(sample.cpu()).to(device)) # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU
 
+            auto_conds = torch.stack(auto_conds, dim=1)
+
+
             self.autoregressive = self.autoregressive.to(device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
             if self.preloaded_tensors:
@@ -661,7 +661,7 @@ class TextToSpeech:
                                                input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
 
                 wav = self.vocoder.inference(mel)
-                wav_candidates.append(wav.cpu())
+                wav_candidates.append(wav)
             
             if not self.preloaded_tensors:
                 self.diffusion = self.diffusion.cpu()
@@ -669,7 +669,7 @@ class TextToSpeech:
 
             def potentially_redact(clip, text):
                 if self.enable_redaction:
-                    return self.aligner.redact(clip.squeeze(1), text, self.output_sample_rate).unsqueeze(1)
+                    return self.aligner.redact(clip.squeeze(1).to('cpu' if get_device_name() == "dml" else self.device), text, self.output_sample_rate).unsqueeze(1)
                 return clip
             wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]