forked from mrq/tortoise-tts
owari da...
This commit is contained in:
parent
6255c98006
commit
a37546ad99
|
@ -71,11 +71,17 @@ After installing Python, open the Start Menu and search for `Command Prompt`. Ty
|
||||||
Paste `git clone https://git.ecker.tech/mrq/tortoise-tts` to download TorToiSe and additional scripts, then hit Enter. Inexperienced users can just download the repo as a ZIP, and extract.
|
Paste `git clone https://git.ecker.tech/mrq/tortoise-tts` to download TorToiSe and additional scripts, then hit Enter. Inexperienced users can just download the repo as a ZIP, and extract.
|
||||||
|
|
||||||
Afterwards, run the setup script, depending on your GPU, to automatically set things up.
|
Afterwards, run the setup script, depending on your GPU, to automatically set things up.
|
||||||
* AMD: `setup-directml.bat` (**!**NOTE**!**: DirectML support is currently being worked on)
|
* ~~AMD: `setup-directml.bat`~~
|
||||||
* NVIDIA: `setup-cuda.bat`
|
* NVIDIA: `setup-cuda.bat`
|
||||||
|
|
||||||
If you've done everything right, you shouldn't have any errors.
|
If you've done everything right, you shouldn't have any errors.
|
||||||
|
|
||||||
|
##### Note on DirectML Support
|
||||||
|
|
||||||
|
At first, I thought this was a single, simple problem to fix. However, as I picked at it and tested on a fresh install (having CUDA enabled as well had caused some things to silently "work" even though DML was selected), more problems cropped up, showing that PyTorch-DirectML isn't quite ready yet.
|
||||||
|
|
||||||
|
Even if the known issues were somehow fixed, I expect other problems would still crop up.
|
||||||
|
|
||||||
#### Linux
|
#### Linux
|
||||||
|
|
||||||
First, make sure you have both `python3.x` and `git` installed, as well as the required compute platform according to your GPU (ROCm or CUDA)
|
First, make sure you have both `python3.x` and `git` installed, as well as the required compute platform according to your GPU (ROCm or CUDA)
|
||||||
|
|
2
app.py
2
app.py
|
@ -33,7 +33,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
|
||||||
else:
|
else:
|
||||||
progress(0, desc="Loading voice...")
|
progress(0, desc="Loading voice...")
|
||||||
voice_samples, conditioning_latents = load_voice(voice)
|
voice_samples, conditioning_latents = load_voice(voice)
|
||||||
|
|
||||||
if voice_samples is not None:
|
if voice_samples is not None:
|
||||||
sample_voice = voice_samples[0]
|
sample_voice = voice_samples[0]
|
||||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
|
||||||
|
|
|
@ -30,6 +30,8 @@ from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named
|
||||||
from tortoise.utils.tokenizer import VoiceBpeTokenizer
|
from tortoise.utils.tokenizer import VoiceBpeTokenizer
|
||||||
from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
|
from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
|
||||||
|
|
||||||
|
from tortoise.utils.device import get_device, get_device_name, get_device_batch_size
|
||||||
|
|
||||||
pbar = None
|
pbar = None
|
||||||
|
|
||||||
MODELS_DIR = os.environ.get('TORTOISE_MODELS_DIR')
|
MODELS_DIR = os.environ.get('TORTOISE_MODELS_DIR')
|
||||||
|
@ -191,57 +193,6 @@ def classify_audio_clip(clip):
|
||||||
results = F.softmax(classifier(clip), dim=-1)
|
results = F.softmax(classifier(clip), dim=-1)
|
||||||
return results[0][0]
|
return results[0][0]
|
||||||
|
|
||||||
|
|
||||||
def pick_best_batch_size_for_gpu():
    """
    Pick an autoregressive batch size that is likely to fit on the GPU.

    The choice is a heuristic based on the memory figure CUDA reports; the
    returned size is not guaranteed to work.

    :return: 16, 8 or 4 depending on the reported memory, or 1 when CUDA is
             unavailable or the memory is at or below the smallest tier.
    """
    if not torch.cuda.is_available():
        return 1

    # NOTE(review): torch.cuda.mem_get_info() returns (free, total); the value
    # used here is the second element, i.e. total device memory - confirm that
    # sizing by total rather than free memory is intended.
    _free_bytes, total_bytes = torch.cuda.mem_get_info()
    total_gib = total_bytes / (1024 ** 3)

    # (exclusive lower bound in GiB, batch size), largest tier first.
    for floor_gib, batch_size in ((14, 16), (10, 8), (7, 4)):
        if total_gib > floor_gib:
            return batch_size
    return 1
|
|
||||||
|
|
||||||
def has_dml():
    """
    Report whether the DirectML backend should be used.

    Detection is deliberately disabled, so this always returns False.

    :return: False.
    """
    # DirectML currently fails during the autoregressive pass:
    #   File "X:\programs\tortoise-tts\tortoise-venv\lib\site-packages\transformers\generation_utils.py", line 1905, in sample
    #     unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
    #   RuntimeError: new(): expected key in DispatchKeySet(CPU, CUDA, HIP, XLA, MPS, IPU, XPU, HPU, Lazy, Meta) but got: PrivateUse1
    # This needs more investigation before detection can be turned back on.
    #
    # To re-enable detection, return
    #   importlib.util.find_spec('torch_directml') is not None
    # instead (the previously disabled code used importlib.find_loader, which
    # is deprecated).
    return False
|
|
||||||
|
|
||||||
def get_optimal_device():
    """
    Select the compute device to run on and announce the choice on stdout.

    DirectML is preferred when has_dml() reports it, then CUDA when
    torch.cuda.is_available(), otherwise the CPU.

    :return: torch_directml.device() for DirectML, else torch.device('cuda')
             or torch.device('cpu').
    """
    if has_dml():
        backend = 'dml'
    elif torch.cuda.is_available():
        backend = 'cuda'
    else:
        backend = 'cpu'

    message = (
        "No hardware acceleration is available, falling back to CPU..."
        if backend == 'cpu'
        else f"Hardware acceleration found: {backend}"
    )
    print(message)

    if backend != "dml":
        return torch.device(backend)

    # Imported lazily so torch_directml is only required when it is selected.
    import torch_directml
    return torch_directml.device()
|
|
||||||
|
|
||||||
class TextToSpeech:
|
class TextToSpeech:
|
||||||
"""
|
"""
|
||||||
Main entry point into Tortoise.
|
Main entry point into Tortoise.
|
||||||
|
@ -260,18 +211,18 @@ class TextToSpeech:
|
||||||
:param device: Device to use when running the model. If omitted, the device will be automatically chosen.
|
:param device: Device to use when running the model. If omitted, the device will be automatically chosen.
|
||||||
"""
|
"""
|
||||||
if device is None:
|
if device is None:
|
||||||
device = get_optimal_device()
|
device = get_device(verbose=True)
|
||||||
|
|
||||||
self.input_sample_rate = input_sample_rate
|
self.input_sample_rate = input_sample_rate
|
||||||
self.output_sample_rate = output_sample_rate
|
self.output_sample_rate = output_sample_rate
|
||||||
self.minor_optimizations = minor_optimizations
|
self.minor_optimizations = minor_optimizations
|
||||||
|
|
||||||
self.models_dir = models_dir
|
self.models_dir = models_dir
|
||||||
self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size
|
self.autoregressive_batch_size = get_device_batch_size() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size
|
||||||
self.enable_redaction = enable_redaction
|
self.enable_redaction = enable_redaction
|
||||||
self.device = device
|
self.device = device
|
||||||
if self.enable_redaction:
|
if self.enable_redaction:
|
||||||
self.aligner = Wav2VecAlignment(device=self.device)
|
self.aligner = Wav2VecAlignment(device=None)
|
||||||
|
|
||||||
self.tokenizer = VoiceBpeTokenizer()
|
self.tokenizer = VoiceBpeTokenizer()
|
||||||
|
|
||||||
|
@ -331,13 +282,15 @@ class TextToSpeech:
|
||||||
:param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
|
:param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
|
||||||
"""
|
"""
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
voice_samples = [v.to(self.device) for v in voice_samples]
|
device = 'cpu' if get_device_name() == "dml" else self.device
|
||||||
|
|
||||||
|
voice_samples = [v.to(device) for v in voice_samples]
|
||||||
|
|
||||||
auto_conds = []
|
auto_conds = []
|
||||||
if not isinstance(voice_samples, list):
|
if not isinstance(voice_samples, list):
|
||||||
voice_samples = [voice_samples]
|
voice_samples = [voice_samples]
|
||||||
for vs in voice_samples:
|
for vs in voice_samples:
|
||||||
auto_conds.append(format_conditioning(vs, device=self.device, sampling_rate=self.input_sample_rate))
|
auto_conds.append(format_conditioning(vs, device=device, sampling_rate=self.input_sample_rate))
|
||||||
|
|
||||||
auto_conds = torch.stack(auto_conds, dim=1)
|
auto_conds = torch.stack(auto_conds, dim=1)
|
||||||
|
|
||||||
|
@ -372,20 +325,30 @@ class TextToSpeech:
|
||||||
|
|
||||||
for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
|
for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
|
||||||
chunk = pad_or_truncate(chunk, chunk_size)
|
chunk = pad_or_truncate(chunk, chunk_size)
|
||||||
cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device)
|
cond_mel = wav_to_univnet_mel(chunk.to(device), do_normalization=False, device=device)
|
||||||
diffusion_conds.append(cond_mel)
|
diffusion_conds.append(cond_mel)
|
||||||
|
|
||||||
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
||||||
|
|
||||||
|
# required since the DML implementation warns that it is falling back to CPU, but then crashes anyway
|
||||||
if self.minor_optimizations:
|
if self.minor_optimizations:
|
||||||
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
if get_device_name() == "dml":
|
||||||
diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
|
self.autoregressive = self.autoregressive.cpu()
|
||||||
|
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
||||||
|
self.autoregressive = self.autoregressive.to(self.device)
|
||||||
|
|
||||||
|
self.diffusion = self.diffusion.cpu()
|
||||||
|
diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
|
||||||
|
self.diffusion = self.diffusion.to(self.device)
|
||||||
|
else:
|
||||||
|
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
||||||
|
diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
|
||||||
else:
|
else:
|
||||||
self.autoregressive = self.autoregressive.to(self.device)
|
self.autoregressive = self.autoregressive.to(device)
|
||||||
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
||||||
self.autoregressive = self.autoregressive.cpu()
|
self.autoregressive = self.autoregressive.cpu()
|
||||||
|
|
||||||
self.diffusion = self.diffusion.to(self.device)
|
self.diffusion = self.diffusion.to(device)
|
||||||
diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
|
diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
|
||||||
self.diffusion = self.diffusion.cpu()
|
self.diffusion = self.diffusion.cpu()
|
||||||
|
|
||||||
|
@ -509,7 +472,7 @@ class TextToSpeech:
|
||||||
|
|
||||||
diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
|
diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
|
||||||
|
|
||||||
self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
|
self.autoregressive_batch_size = get_device_batch_size() if sample_batch_size is None or sample_batch_size == 0 else sample_batch_size
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
samples = []
|
samples = []
|
||||||
|
|
|
@ -97,7 +97,7 @@ def get_voices(extra_voice_dirs=[]):
|
||||||
return voices
|
return voices
|
||||||
|
|
||||||
|
|
||||||
def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050):
|
def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu'):
|
||||||
if voice == 'random':
|
if voice == 'random':
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
@ -120,7 +120,7 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050)
|
||||||
if load_latents and latent is not None:
|
if load_latents and latent is not None:
|
||||||
if os.path.getmtime(latent) > mtime:
|
if os.path.getmtime(latent) > mtime:
|
||||||
print(f"Reading from latent: {latent}")
|
print(f"Reading from latent: {latent}")
|
||||||
return None, torch.load(latent)
|
return None, torch.load(latent, map_location=device)
|
||||||
print(f"Latent file out of date: {latent}")
|
print(f"Latent file out of date: {latent}")
|
||||||
|
|
||||||
conds = []
|
conds = []
|
||||||
|
@ -197,7 +197,7 @@ class TacotronSTFT(torch.nn.Module):
|
||||||
return mel_output
|
return mel_output
|
||||||
|
|
||||||
|
|
||||||
def wav_to_univnet_mel(wav, do_normalization=False, device='cuda', sample_rate=24000):
|
def wav_to_univnet_mel(wav, do_normalization=False, device='cpu', sample_rate=24000):
|
||||||
stft = TacotronSTFT(1024, 256, 1024, 100, sample_rate, 0, 12000)
|
stft = TacotronSTFT(1024, 256, 1024, 100, sample_rate, 0, 12000)
|
||||||
stft = stft.to(device)
|
stft = stft.to(device)
|
||||||
mel = stft.mel_spectrogram(wav)
|
mel = stft.mel_spectrogram(wav)
|
||||||
|
|
71
tortoise/utils/device.py
Executable file
71
tortoise/utils/device.py
Executable file
|
@ -0,0 +1,71 @@
|
||||||
|
import torch
|
||||||
|
|
||||||
|
def has_dml():
    """
    Report whether the DirectML backend should be used.

    Detection is deliberately disabled, so this always returns False.
    HuggingFace's transformers/GPT2 model leads to a long chain of problems on
    DirectML; the known ones are listed in the notes below.

    :return: False.
    """
    # Note 1:
    #   self.inference_model.generate leads to this error in torch.LongTensor.new:
    #     RuntimeError: new(): expected key in DispatchKeySet(CPU, CUDA, HIP, XLA, MPS, IPU, XPU, HPU, Lazy, Meta) but got: PrivateUse1
    #   Patching "./venv/lib/site-packages/transformers/generation_utils.py:1906" with:
    #     unfinished_sequences = input_ids.new_tensor(input_ids.shape[0], device=input_ids.device).fill_(1)
    #   "fixes" it, but then hits another error/crash about unimplemented functions.
    #
    # Note 2:
    #   torch.load() complains about CUDA not existing;
    #   remedy this by passing map_location="cpu".
    #
    # Note 3:
    #   stft requires device='cpu', otherwise it crashes with an error about
    #   an unimplemented function.
    #
    # Note 4:
    #   'Tensor.multinominal' and 'Tensor.repeat_interleave' throw errors about
    #   being unimplemented, fall back to CPU, and crash.
    #
    # To re-enable detection once the above is resolved, return
    #   importlib.util.find_spec('torch_directml') is not None
    # instead (the previously disabled code used importlib.find_loader, which
    # is deprecated).
    return False
|
||||||
|
|
||||||
|
def get_device_name():
    """
    Return the name of the backend to use.

    DirectML is checked first via has_dml(), then CUDA via
    torch.cuda.is_available(); the CPU is the fallback.

    :return: 'dml', 'cuda' or 'cpu'.
    """
    if has_dml():
        return 'dml'
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'
|
||||||
|
|
||||||
|
def get_device(verbose=False):
    """
    Build the device object for the backend reported by get_device_name().

    :param verbose: When true, print which backend was selected.
    :return: torch_directml.device() for 'dml', otherwise torch.device(name).
    """
    backend = get_device_name()

    if verbose:
        message = (
            "No hardware acceleration is available, falling back to CPU..."
            if backend == 'cpu'
            else f"Hardware acceleration found: {backend}"
        )
        print(message)

    if backend != "dml":
        return torch.device(backend)

    # Imported lazily so torch_directml is only required when it is selected.
    import torch_directml
    return torch_directml.device()
|
||||||
|
|
||||||
|
def get_device_batch_size():
    """
    Pick an autoregressive batch size that is likely to fit on the GPU.

    The choice is a heuristic based on the memory figure CUDA reports; the
    returned size is not guaranteed to work.

    :return: 16, 8 or 4 depending on the reported memory, or 1 when CUDA is
             unavailable or the memory is at or below the smallest tier.
    """
    if not torch.cuda.is_available():
        return 1

    # NOTE(review): torch.cuda.mem_get_info() returns (free, total); the value
    # used here is the second element, i.e. total device memory - confirm that
    # sizing by total rather than free memory is intended.
    _free_bytes, total_bytes = torch.cuda.mem_get_info()
    total_gib = total_bytes / (1024 ** 3)

    # (exclusive lower bound in GiB, batch size), largest tier first.
    for floor_gib, batch_size in ((14, 16), (10, 8), (7, 4)):
        if total_gib > floor_gib:
            return batch_size
    return 1
|
4
tortoise/utils/wav2vec_alignment.py
Normal file → Executable file
4
tortoise/utils/wav2vec_alignment.py
Normal file → Executable file
|
@ -5,7 +5,7 @@ import torchaudio
|
||||||
from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
|
from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
|
||||||
|
|
||||||
from tortoise.utils.audio import load_audio
|
from tortoise.utils.audio import load_audio
|
||||||
|
from tortoise.utils.device import get_device
|
||||||
|
|
||||||
def max_alignment(s1, s2, skip_character='~', record=None):
|
def max_alignment(s1, s2, skip_character='~', record=None):
|
||||||
"""
|
"""
|
||||||
|
@ -51,7 +51,7 @@ class Wav2VecAlignment:
|
||||||
"""
|
"""
|
||||||
def __init__(self, device=None):
|
def __init__(self, device=None):
|
||||||
if device is None:
|
if device is None:
|
||||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
device = torch.device(get_device())
|
||||||
|
|
||||||
self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
|
self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
|
||||||
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h")
|
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user