fixed regression with computing conditional latencies outside of the CPU

2023-02-12 17:44:39 +00:00 · 2023-02-12 17:44:39 +00:00 · 2210b49cb6
commit 2210b49cb6
parent a2d95fe208
2 changed files with 64 additions and 38 deletions
--- a/README.md
+++ b/README.md
@ -152,9 +152,11 @@ Unlike training embeddings for AI image generations, preparing a "dataset" for v

 As a general rule of thumb, try to source clips that aren't noisy, solely the subject you are trying to clone, and doesn't contain any non-words (like yells, guttural noises, etc.). If you must, run your source through a background music/noise remover (how to is an exercise left to the reader). It isn't entirely a detriment if you're unable to provide clean audio, however. Just be wary that you might have some headaches with getting acceptable output.

-Nine times out of ten, you should be fine using as many clips as possible. There's (now) no preference between combining your audio into one file, or leaving it split. However, if you're aiming for a specific delivery, it *should* be best for you to narrow down to just using that as your provided source (for example, changing one word in a line).
+Nine times out of ten, you should be fine using as many clips as possible. There's no hard specifics on how many, or how long, your sources should be.

-There's no hard specifics on how many, or how long, your sources should be.
+However, keep in mind how you combine/separate your clips; depending on the mode to calculate a voice's conditional latents:
+* you might suffer from reduced throughput, as the smallest voice file will be used as the size of best fit
+* a voice might get split mid-word, affecting how the latents are computed, as each batch is averaged together

 If you're looking to trim your clips, in my opinion, ~~Audacity~~ Tenacity works good enough, as you can easily output your clips into the proper format (22050 Hz sampling rate).

--- a/tortoise/api.py
+++ b/tortoise/api.py
@ -294,7 +294,7 @@ class TextToSpeech:
        if self.preloaded_tensors:
            self.cvvp = self.cvvp.to(self.device)

-    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True):
+    def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, chunk_size=None, max_chunk_size=None, chunk_tensors=True, calculation_mode=0):
        """
        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@ -303,34 +303,58 @@ class TextToSpeech:
        """
        with torch.no_grad():
            # computing conditional latents requires being done on the CPU if using DML because M$ still hasn't implemented some core functions
-            device = 'cpu' if get_device_name() == "dml" else self.device
+            device = torch.device('cpu') if get_device_name() == "dml" else self.device
+
+            if not isinstance(voice_samples, list):
+                voice_samples = [voice_samples]
            
            voice_samples = [v.to(device) for v in voice_samples]

+            samples = []
            auto_conds = []
-            if not isinstance(voice_samples, list):
-                voice_samples = [voice_samples]
            for vs in voice_samples:
                auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate))
-
            auto_conds = torch.stack(auto_conds, dim=1)

-            if get_device_name() == "dml":
-                self.autoregressive = self.autoregressive.cpu()
+            resampler = torchaudio.transforms.Resample(
+                self.input_sample_rate,
+                self.output_sample_rate,
+                lowpass_filter_width=16,
+                rolloff=0.85,
+                resampling_method="kaiser_window",
+                beta=8.555504641634386,
+            )
+            # resample in its own pass to make things easier
+            for sample in voice_samples:
+                samples.append(resampler(sample.cpu()).to(device)) # icky no good, easier to do the resampling on CPU than figure out how to do it on GPU

+            self.autoregressive = self.autoregressive.to(device)
            auto_latent = self.autoregressive.get_conditioning(auto_conds)
-
            if self.preloaded_tensors:
                self.autoregressive = self.autoregressive.to(self.device)
+            else:
+                self.autoregressive = self.autoregressive.cpu()
+
            
            diffusion_conds = []
+            chunks = []

-            samples = [] # resample in its own pass to make things easier
-            for sample in voice_samples:
-                # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
-                #samples.append(torchaudio.functional.resample(sample, 22050, 24000))
-                samples.append(torchaudio.functional.resample(sample, self.input_sample_rate, self.output_sample_rate))

+            # new behavior: combine all samples, and divide accordingly
+            # doesn't work, need to fix
+            if calculation_mode == 1:
+                concat = torch.cat(samples, dim=-1)
+                if chunk_size is None:
+                    chunk_size = concat.shape[-1]
+
+                if max_chunk_size is not None and chunk_size > max_chunk_size:
+                    while chunk_size > max_chunk_size:
+                        chunk_size = chunk_size / 2
+
+                print(f"Size of best fit: {chunk_size}")
+                chunks = torch.chunk(concat, int(concat.shape[-1] / chunk_size) + 1, dim=1)
+            # default new behavior: use the smallest voice sample as a common chunk size
+            else:
                if chunk_size is None:
                    for sample in tqdm_override(samples, verbose=verbose and len(samples) > 1, progress=progress if len(samples) > 1 else None, desc="Calculating size of best fit..."):
                        if chunk_tensors:
@ -343,7 +367,6 @@ class TextToSpeech:
                    chunk_size = max_chunk_size
                    print(f"Chunk size exceeded, clamping to: {max_chunk_size}")

-            chunks = []
                if chunk_tensors:
                    for sample in tqdm_override(samples, verbose=verbose, progress=progress, desc="Slicing samples into chunks..."):
                        sliced = torch.chunk(sample, int(sample.shape[-1] / chunk_size) + 1, dim=1)
@ -359,13 +382,14 @@ class TextToSpeech:

            diffusion_conds = torch.stack(diffusion_conds, dim=1)

-            if get_device_name() == "dml":
-                self.diffusion = self.diffusion.cpu()
+            self.diffusion = self.diffusion.to(device)
            
            diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)

            if self.preloaded_tensors:
                self.diffusion = self.diffusion.to(self.device)
+            else:
+                self.diffusion = self.diffusion.cpu()

        if return_mels:
            return auto_latent, diffusion_latent, auto_conds, diffusion_conds