From b23d6b4b4c07497702bcea9539c214cb2b069c7c Mon Sep 17 00:00:00 2001
From: mrq <barry.quiggles@protonmail.com>
Date: Thu, 9 Feb 2023 01:53:25 +0000
Subject: [PATCH] owari da...

---
 README.md                           |  8 ++-
 app.py                              |  2 +-
 tortoise/api.py                     | 87 +++++++++--------------------
 tortoise/utils/audio.py             |  6 +-
 tortoise/utils/device.py            | 71 +++++++++++++++++++++++
 tortoise/utils/wav2vec_alignment.py |  4 +-
 6 files changed, 109 insertions(+), 69 deletions(-)
 create mode 100755 tortoise/utils/device.py
 mode change 100644 => 100755 tortoise/utils/wav2vec_alignment.py

diff --git a/README.md b/README.md
index a5c2dff..c169d06 100755
--- a/README.md
+++ b/README.md
@@ -71,11 +71,17 @@ After installing Python, open the Start Menu and search for `Command Prompt`. Ty
 Paste `git clone https://git.ecker.tech/mrq/tortoise-tts` to download TorToiSe and additional scripts, then hit Enter. Inexperienced users can just download the repo as a ZIP, and extract.
 
 Afterwards, run the setup script, depending on your GPU, to automatically set things up.
-* AMD: `setup-directml.bat` (**!**NOTE**!**: DirectML support is currently being worked on)
+* ~~AMD: `setup-directml.bat`~~
 * NVIDIA: `setup-cuda.bat`
 
 If you've done everything right, you shouldn't have any errors.
 
+##### Note on DirectML Support
+
+At first, I thought it was just one simple problem that needed to be fixed, but as I picked at it and did a new install (having CUDA enabled too caused some things to silently "work" despite using DML instead), more problems cropped up, exposing that PyTorch-DirectML isn't quite ready yet.
+
+Even if the problems found so far were somehow fixed, I expect other problems would keep cropping up.
+
 #### Linux
 
 First, make sure you have both `python3.x` and `git` installed, as well as the required compute platform according to your GPU (ROCm or CUDA)
diff --git a/app.py b/app.py
index 4d85921..6e3b1d1 100755
--- a/app.py
+++ b/app.py
@@ -33,7 +33,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
     else:
         progress(0, desc="Loading voice...")
         voice_samples, conditioning_latents = load_voice(voice)
-    
+
     if voice_samples is not None:
         sample_voice = voice_samples[0]
         conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
diff --git a/tortoise/api.py b/tortoise/api.py
index a439076..7b99857 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -30,6 +30,8 @@ from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named
 from tortoise.utils.tokenizer import VoiceBpeTokenizer
 from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
 
+from tortoise.utils.device import get_device, get_device_name, get_device_batch_size
+
 pbar = None
 
 MODELS_DIR = os.environ.get('TORTOISE_MODELS_DIR')
@@ -191,57 +193,6 @@ def classify_audio_clip(clip):
     results = F.softmax(classifier(clip), dim=-1)
     return results[0][0]
 
-
-def pick_best_batch_size_for_gpu():
-    """
-    Tries to pick a batch size that will fit in your GPU. These sizes aren't guaranteed to work, but they should give
-    you a good shot.
-    """
-    if torch.cuda.is_available():
-        _, available = torch.cuda.mem_get_info()
-        availableGb = available / (1024 ** 3)
-        if availableGb > 14:
-            return 16
-        elif availableGb > 10:
-            return 8
-        elif availableGb > 7:
-            return 4
-    return 1
-
-def has_dml():
-    return False
-
-    # currently getting an error thrown during the autoregressive pass
-    # File "X:\programs\tortoise-tts\tortoise-venv\lib\site-packages\transformers\generation_utils.py", line 1905, in sample
-    # unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-    # RuntimeError: new(): expected key in DispatchKeySet(CPU, CUDA, HIP, XLA, MPS, IPU, XPU, HPU, Lazy, Meta) but got: PrivateUse1
-    # so I'll need to look into it more
-
-    """
-    import importlib
-    loader = importlib.find_loader('torch_directml')
-    return loader is not None
-    """
-
-def get_optimal_device():
-    name = 'cpu'
-
-    if has_dml():
-        name = 'dml'
-    elif torch.cuda.is_available():
-        name = 'cuda'
-
-    if name == 'cpu':
-        print("No hardware acceleration is available, falling back to CPU...")    
-    else:
-        print(f"Hardware acceleration found: {name}")
-
-    if name == "dml":
-        import torch_directml
-        return torch_directml.device()
-
-    return torch.device(name)
-
 class TextToSpeech:
     """
     Main entry point into Tortoise.
@@ -260,18 +211,18 @@ class TextToSpeech:
         :param device: Device to use when running the model. If omitted, the device will be automatically chosen.
         """ 
         if device is None:
-            device = get_optimal_device()
+            device = get_device(verbose=True)
 
         self.input_sample_rate = input_sample_rate
         self.output_sample_rate = output_sample_rate
         self.minor_optimizations = minor_optimizations
 
         self.models_dir = models_dir
-        self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size
+        self.autoregressive_batch_size = get_device_batch_size() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size
         self.enable_redaction = enable_redaction
         self.device = device
         if self.enable_redaction:
-            self.aligner = Wav2VecAlignment(device=self.device)
+            self.aligner = Wav2VecAlignment(device=None)
 
         self.tokenizer = VoiceBpeTokenizer()
 
@@ -331,13 +282,15 @@ class TextToSpeech:
         :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
         """
         with torch.no_grad():
-            voice_samples = [v.to(self.device) for v in voice_samples]
+            device = 'cpu' if get_device_name() == "dml" else self.device
+
+            voice_samples = [v.to(device) for v in voice_samples]
 
             auto_conds = []
             if not isinstance(voice_samples, list):
                 voice_samples = [voice_samples]
             for vs in voice_samples:
-                auto_conds.append(format_conditioning(vs, device=self.device, sampling_rate=self.input_sample_rate))
+                auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate))
 
             auto_conds = torch.stack(auto_conds, dim=1)
             
@@ -372,20 +325,30 @@ class TextToSpeech:
                 
             for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
                 chunk = pad_or_truncate(chunk, chunk_size)
-                cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device)
+                cond_mel = wav_to_univnet_mel(chunk.to(device), do_normalization=False, device=device)
                 diffusion_conds.append(cond_mel)
 
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
 
+            # required since DML implementation screams about falling back to CPU, but crashes anyways
             if self.minor_optimizations:
-                auto_latent = self.autoregressive.get_conditioning(auto_conds)
-                diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
+                if get_device_name() == "dml":
+                    self.autoregressive = self.autoregressive.cpu()
+                    auto_latent = self.autoregressive.get_conditioning(auto_conds)
+                    self.autoregressive = self.autoregressive.to(self.device)
+
+                    self.diffusion = self.diffusion.cpu()
+                    diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
+                    self.diffusion = self.diffusion.to(self.device)
+                else:
+                    auto_latent = self.autoregressive.get_conditioning(auto_conds)
+                    diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
             else:
-                self.autoregressive = self.autoregressive.to(self.device)
+                self.autoregressive = self.autoregressive.to(device)
                 auto_latent = self.autoregressive.get_conditioning(auto_conds)
                 self.autoregressive = self.autoregressive.cpu()
 
-                self.diffusion = self.diffusion.to(self.device)
+                self.diffusion = self.diffusion.to(device)
                 diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
                 self.diffusion = self.diffusion.cpu()
 
@@ -509,7 +472,7 @@ class TextToSpeech:
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
-        self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
+        self.autoregressive_batch_size = get_device_batch_size() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
 
         with torch.no_grad():
             samples = []
diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
index 82bbc9f..32839ad 100755
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -97,7 +97,7 @@ def get_voices(extra_voice_dirs=[]):
     return voices
 
 
-def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050):
+def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu'):
     if voice == 'random':
         return None, None
 
@@ -120,7 +120,7 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050)
     if load_latents and latent is not None:
         if os.path.getmtime(latent) > mtime:
             print(f"Reading from latent: {latent}")
-            return None, torch.load(latent)
+            return None, torch.load(latent, map_location=device)
         print(f"Latent file out of date: {latent}")
     
     conds = []
@@ -197,7 +197,7 @@ class TacotronSTFT(torch.nn.Module):
         return mel_output
 
 
-def wav_to_univnet_mel(wav, do_normalization=False, device='cuda', sample_rate=24000):
+def wav_to_univnet_mel(wav, do_normalization=False, device='cpu', sample_rate=24000):
     stft = TacotronSTFT(1024, 256, 1024, 100, sample_rate, 0, 12000)
     stft = stft.to(device)
     mel = stft.mel_spectrogram(wav)
diff --git a/tortoise/utils/device.py b/tortoise/utils/device.py
new file mode 100755
index 0000000..cb83926
--- /dev/null
+++ b/tortoise/utils/device.py
@@ -0,0 +1,71 @@
+import torch
+
+def has_dml():
+    """
+    Report whether the DirectML backend (torch_directml) should be used.
+
+    Always returns False: DirectML support is disabled because of the
+    problems recorded in the notes below.
+    """
+    # huggingface's transformers/GPT2 model leads to a long trail of problems.
+    #
+    # Note 1:
+    # self.inference_model.generate will lead to this error in torch.LongTensor.new:
+    #   RuntimeError: new(): expected key in DispatchKeySet(CPU, CUDA, HIP, XLA, MPS, IPU, XPU, HPU, Lazy, Meta) but got: PrivateUse1
+    # Patching "./venv/lib/site-packages/transformers/generation_utils.py:1906" with:
+    #   unfinished_sequences = input_ids.new_tensor(input_ids.shape[0], device=input_ids.device).fill_(1)
+    # "fixes" it, but then hits another error/crash about an unimplemented function.
+    #
+    # Note 2:
+    # torch.load() will complain about CUDA not existing;
+    # remedy this by passing map_location="cpu".
+    #
+    # Note 3:
+    # stft requires device='cpu', or it crashes with an error about an unimplemented function.
+    #
+    # Note 4:
+    # 'Tensor.multinomial' and 'Tensor.repeat_interleave' throw errors about being unimplemented,
+    # fall back to CPU, and crash.
+    #
+    # Detection to restore once the above is resolved (importlib.find_loader is
+    # deprecated; importlib.util.find_spec is its replacement):
+    #   import importlib.util
+    #   return importlib.util.find_spec('torch_directml') is not None
+    return False
+
+def get_device_name():
+    """Return the name of the preferred backend: 'dml', 'cuda', or 'cpu'."""
+    # DirectML takes priority over CUDA.
+    if has_dml():
+        return 'dml'
+    # CPU is the fallback when CUDA is not available either.
+    if torch.cuda.is_available():
+        return 'cuda'
+    return 'cpu'
+
+def get_device(verbose=False):
+    """Return the device for `get_device_name()`; print the choice when `verbose` is set."""
+    backend = get_device_name()
+    if verbose and backend == 'cpu':
+        print("No hardware acceleration is available, falling back to CPU...")
+    elif verbose:
+        print(f"Hardware acceleration found: {backend}")
+
+    if backend != "dml":
+        return torch.device(backend)
+
+    # torch_directml is imported only when the DirectML backend was selected.
+    import torch_directml
+    return torch_directml.device()
+
+def get_device_batch_size():
+    """Return a batch size scaled to GPU memory, or 1 when CUDA is unavailable."""
+    if not torch.cuda.is_available():
+        return 1
+    # NOTE(review): mem_get_info() is documented as (free, total); the 2nd value is used - confirm intent.
+    _, mem_bytes = torch.cuda.mem_get_info()
+    mem_gib = mem_bytes / (1024 ** 3)
+    for floor_gib, batch_size in ((14, 16), (10, 8), (7, 4)):  # exclusive GiB floor -> batch size
+        if mem_gib > floor_gib:
+            return batch_size
+    return 1
\ No newline at end of file
diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
old mode 100644
new mode 100755
index e398540..f11835f
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@@ -5,7 +5,7 @@ import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
 
 from tortoise.utils.audio import load_audio
-
+from tortoise.utils.device import get_device
 
 def max_alignment(s1, s2, skip_character='~', record=None):
     """
@@ -51,7 +51,7 @@ class Wav2VecAlignment:
     """
     def __init__(self, device=None):
         if device is None:
-            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            device = torch.device(get_device())
 
         self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h")