diff --git a/README.md b/README.md
index a5c2dff..c169d06 100755
--- a/README.md
+++ b/README.md
@@ -71,11 +71,17 @@ After installing Python, open the Start Menu and search for `Command Prompt`. Ty
 Paste `git clone https://git.ecker.tech/mrq/tortoise-tts` to download TorToiSe and additional scripts, then hit Enter. Inexperienced users can just download the repo as a ZIP, and extract.
 
 Afterwards, run the setup script, depending on your GPU, to automatically set things up.
-* AMD: `setup-directml.bat` (**!**NOTE**!**: DirectML support is currently being worked on)
+* ~~AMD: `setup-directml.bat`~~
 * NVIDIA: `setup-cuda.bat`
 
 If you've done everything right, you shouldn't have any errors.
 
+##### Note on DirectML Support
+
+At first, I thought it was just one simple problem that needed to be fixed, but as I picked at it and did a fresh install (having CUDA enabled as well caused some things to silently "work" despite DML being in use), more problems cropped up, exposing that PyTorch-DirectML isn't quite ready yet.
+
+Even sucking off a wizard probably wouldn't fix it; other problems would just keep cropping up.
+
 #### Linux
 
 First, make sure you have both `python3.x` and `git` installed, as well as the required compute platform according to your GPU (ROCm or CUDA).
diff --git a/app.py b/app.py
index 4d85921..6e3b1d1 100755
--- a/app.py
+++ b/app.py
@@ -33,7 +33,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
     else:
         progress(0, desc="Loading voice...")
         voice_samples, conditioning_latents = load_voice(voice)
-    
+
     if voice_samples is not None:
         sample_voice = voice_samples[0]
         conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
- """ - if torch.cuda.is_available(): - _, available = torch.cuda.mem_get_info() - availableGb = available / (1024 ** 3) - if availableGb > 14: - return 16 - elif availableGb > 10: - return 8 - elif availableGb > 7: - return 4 - return 1 - -def has_dml(): - return False - - # currently getting an error thrown during the autoregressive pass - # File "X:\programs\tortoise-tts\tortoise-venv\lib\site-packages\transformers\generation_utils.py", line 1905, in sample - # unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - # RuntimeError: new(): expected key in DispatchKeySet(CPU, CUDA, HIP, XLA, MPS, IPU, XPU, HPU, Lazy, Meta) but got: PrivateUse1 - # so I'll need to look into it more - - """ - import importlib - loader = importlib.find_loader('torch_directml') - return loader is not None - """ - -def get_optimal_device(): - name = 'cpu' - - if has_dml(): - name = 'dml' - elif torch.cuda.is_available(): - name = 'cuda' - - if name == 'cpu': - print("No hardware acceleration is available, falling back to CPU...") - else: - print(f"Hardware acceleration found: {name}") - - if name == "dml": - import torch_directml - return torch_directml.device() - - return torch.device(name) - class TextToSpeech: """ Main entry point into Tortoise. @@ -260,18 +211,18 @@ class TextToSpeech: :param device: Device to use when running the model. If omitted, the device will be automatically chosen. """ if device is None: - device = get_optimal_device() + device = get_device(verbose=True) self.input_sample_rate = input_sample_rate self.output_sample_rate = output_sample_rate self.minor_optimizations = minor_optimizations self.models_dir = models_dir - self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size + self.autoregressive_batch_size = get_device_batch_size() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size self.enable_redaction = enable_redaction self.device = device if self.enable_redaction: - self.aligner = Wav2VecAlignment(device=self.device) + self.aligner = Wav2VecAlignment(device=None) self.tokenizer = VoiceBpeTokenizer() @@ -331,13 +282,15 @@ class TextToSpeech: :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data. 
""" with torch.no_grad(): - voice_samples = [v.to(self.device) for v in voice_samples] + device = 'cpu' if get_device_name() == "dml" else self.device + + voice_samples = [v.to(device) for v in voice_samples] auto_conds = [] if not isinstance(voice_samples, list): voice_samples = [voice_samples] for vs in voice_samples: - auto_conds.append(format_conditioning(vs, device=self.device, sampling_rate=self.input_sample_rate)) + auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate)) auto_conds = torch.stack(auto_conds, dim=1) @@ -372,20 +325,30 @@ class TextToSpeech: for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."): chunk = pad_or_truncate(chunk, chunk_size) - cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device) + cond_mel = wav_to_univnet_mel(chunk.to(device), do_normalization=False, device=device) diffusion_conds.append(cond_mel) diffusion_conds = torch.stack(diffusion_conds, dim=1) + # required since DML implementation screams about falling back to CPU, but crashes anyways if self.minor_optimizations: - auto_latent = self.autoregressive.get_conditioning(auto_conds) - diffusion_latent = self.diffusion.get_conditioning(diffusion_conds) + if get_device_name() == "dml": + self.autoregressive = self.autoregressive.cpu() + auto_latent = self.autoregressive.get_conditioning(auto_conds) + self.autoregressive = self.autoregressive.to(self.device) + + self.diffusion = self.diffusion.cpu() + diffusion_latent = self.diffusion.get_conditioning(diffusion_conds) + self.diffusion = self.diffusion.to(self.device) + else: + auto_latent = self.autoregressive.get_conditioning(auto_conds) + diffusion_latent = self.diffusion.get_conditioning(diffusion_conds) else: - self.autoregressive = self.autoregressive.to(self.device) + self.autoregressive = self.autoregressive.to(device) auto_latent = self.autoregressive.get_conditioning(auto_conds) self.autoregressive = self.autoregressive.cpu() - self.diffusion = self.diffusion.to(self.device) + self.diffusion = self.diffusion.to(device) diffusion_latent = self.diffusion.get_conditioning(diffusion_conds) self.diffusion = self.diffusion.cpu() @@ -509,7 +472,7 @@ class TextToSpeech: diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k) - self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size + self.autoregressive_batch_size = get_device_batch_size() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size with torch.no_grad(): samples = [] diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index 82bbc9f..32839ad 100755 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -97,7 +97,7 @@ def get_voices(extra_voice_dirs=[]): return voices -def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050): +def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu'): if voice == 'random': return None, None @@ -120,7 +120,7 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050) if load_latents and latent is not None: if os.path.getmtime(latent) > mtime: print(f"Reading from latent: {latent}") - return None, torch.load(latent) + return None, torch.load(latent, map_location=device) print(f"Latent file out of date: {latent}") conds = [] @@ -197,7 +197,7 
diff --git a/tortoise/utils/device.py b/tortoise/utils/device.py
new file mode 100755
index 0000000..cb83926
--- /dev/null
+++ b/tortoise/utils/device.py
@@ -0,0 +1,71 @@
+import torch
+
+def has_dml():
+    """
+    # huggingface's transformers/GPT2 model will just lead to a long trail of problems
+    # I will suck off a wizard if he gets this remedied somehow
+    """
+    """
+    # Note 1:
+    # self.inference_model.generate will lead to this error in torch.LongTensor.new:
+    # RuntimeError: new(): expected key in DispatchKeySet(CPU, CUDA, HIP, XLA, MPS, IPU, XPU, HPU, Lazy, Meta) but got: PrivateUse1
+    # Patching "./venv/lib/site-packages/transformers/generation_utils.py:1906" with:
+    #   unfinished_sequences = input_ids.new_tensor(input_ids.shape[0], device=input_ids.device).fill_(1)
+    # "fixes" it, but then runs into another error/crash about an unimplemented function...
+    """
+    """
+    # Note 2:
+    # torch.load() will gripe about CUDA not existing
+    # remedy this by passing map_location="cpu"
+    """
+    """
+    # Note 3:
+    # stft requires device='cpu' or it'll crash with some error about an unimplemented function (I don't remember which)
+    """
+    """
+    # Note 4:
+    # 'Tensor.multinomial' and 'Tensor.repeat_interleave' throw errors about being unimplemented, fall back to CPU, then crash anyway
+    """
+    return False
+    """
+    import importlib
+    loader = importlib.find_loader('torch_directml')
+    return loader is not None
+    """
+
+def get_device_name():
+    name = 'cpu'
+
+    if has_dml():
+        name = 'dml'
+    elif torch.cuda.is_available():
+        name = 'cuda'
+
+    return name
+
+def get_device(verbose=False):
+    name = get_device_name()
+
+    if verbose:
+        if name == 'cpu':
+            print("No hardware acceleration is available, falling back to CPU...")
+        else:
+            print(f"Hardware acceleration found: {name}")
+
+    if name == "dml":
+        import torch_directml
+        return torch_directml.device()
+
+    return torch.device(name)
+
+def get_device_batch_size():
+    if torch.cuda.is_available():
+        _, available = torch.cuda.mem_get_info()
+        availableGb = available / (1024 ** 3)
+        if availableGb > 14:
+            return 16
+        elif availableGb > 10:
+            return 8
+        elif availableGb > 7:
+            return 4
+    return 1
\ No newline at end of file
diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
old mode 100644
new mode 100755
index e398540..f11835f
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@@ -5,7 +5,7 @@ import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
 
 from tortoise.utils.audio import load_audio
-
+from tortoise.utils.device import get_device
 
 def max_alignment(s1, s2, skip_character='~', record=None):
     """
@@ -51,7 +51,7 @@ class Wav2VecAlignment:
     """
    def __init__(self, device=None):
         if device is None:
-            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            device = get_device()
 
         self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h")
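If PyTorch-DirectML ever gets past the issues catalogued in `has_dml()`, re-enabling DML should only mean restoring the probe commented out at the bottom of that function. For what it's worth, the same check via `importlib.util.find_spec` (the non-deprecated sibling of the `find_loader` call in that comment) would look like:

```python
import importlib.util

def has_dml() -> bool:
    # Probe for the torch-directml package without importing it; find_spec
    # only consults the import machinery, so the DML runtime isn't loaded.
    return importlib.util.find_spec("torch_directml") is not None
```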