From cbd3c95c42ac1da9772f61b9895954ee693075c9 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 20 Aug 2023 22:32:01 -0500
Subject: [PATCH] possible speedup with one simple trick (it worked for valle inferencing), also backported the voice list loading from aivc

---
Review notes (text in this section is ignored when the patch is applied):
 - get_voice() stripped the dot from each file extension but appended ".pth"
   (with the dot), so latents were never listed; it now adds "pth".
 - get_voice() no longer appends to the `extensions` argument, which mutated
   the shared default list and the list passed in by get_voice_list().
 - get_voice() returns [] instead of None for a missing voice directory and
   no longer emits a double slash in the returned paths.
 - _get_voices() uses dict.update() instead of `|=` (Python 3.9+ only).

 tortoise/api.py         |  6 ++--
 tortoise/utils/audio.py | 76 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index 0293076..c8691d8 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -150,7 +150,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                            model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
-
+@torch.inference_mode()
 def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
@@ -194,7 +194,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-
+@torch.inference_mode()
 def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
@@ -453,6 +453,7 @@ class TextToSpeech:
         if self.preloaded_tensors:
             self.cvvp = migrate_to_device( self.cvvp, self.device )
 
+    @torch.inference_mode()
     def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -578,6 +579,7 @@ class TextToSpeech:
         settings.update(kwargs) # allow overriding of preset settings with kwargs
         return self.tts(text, **settings)
 
+    @torch.inference_mode()
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
             return_deterministic_state=False,
             # autoregressive generation parameters follow
diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
index e3885e5..4302c40 100755
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -94,12 +94,86 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
                     voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth'))
     return voices
 
+def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=("wav", "mp3", "flac") ):
+    """
+    Lists the files that belong to a single voice.
+
+    :param name: voice name, i.e. the sub-directory of `dir` to scan (may be nested, e.g. "group/voice").
+    :param dir: directory that contains the voice sub-directories.
+    :param load_latents: when True, "*.pth" files are listed in addition to `extensions`.
+    :param extensions: file extensions (without the leading dot) that count as voice files.
+    :return: sorted list of matching file paths; empty if the voice directory does not exist.
+    """
+    subj = f'{dir}/{name}'
+    if not os.path.isdir(subj):
+        return []
+
+    # Work on a copy: appending to `extensions` would mutate the caller's list (or the shared default).
+    allowed = set(extensions)
+    if load_latents:
+        # Extensions are compared without the leading dot, so this has to be "pth", not ".pth".
+        allowed.add("pth")
+
+    return sorted( f'{subj}/{file}' for file in os.listdir(subj) if os.path.splitext(file)[-1][1:] in allowed )
+
+def get_voice_list(dir=get_voice_dir(), append_defaults=False, extensions=("wav", "mp3", "flac", "pth")):
+    """
+    Lists the names of all voices found in `dir`.
+
+    A sub-directory counts as a voice when it directly contains at least one file with a matching
+    extension. Otherwise its own sub-directories are checked one level deep and reported as "name/subdir".
+
+    :param dir: directory that contains the voice sub-directories; created if it does not exist.
+    :param append_defaults: when True, the default entries "random" and "microphone" are appended to the result.
+    :param extensions: file extensions (without the leading dot) that count as voice files.
+    :return: sorted list of voice names, followed by the default entries if requested.
+    """
+    defaults = [ "random", "microphone" ]
+    os.makedirs(dir, exist_ok=True)
+
+    res = []
+    for name in os.listdir(dir):
+        # Entries named like a default, and plain files, are never voices.
+        if name in defaults or not os.path.isdir(f'{dir}/{name}'):
+            continue
+
+        if get_voice( name, dir=dir, extensions=extensions ):
+            res.append(name)
+            continue
+
+        # No matching files directly inside: look one level deeper (get_voice() returns [] for non-directories).
+        for subdir in os.listdir(f'{dir}/{name}'):
+            if get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions ):
+                res.append(f'{name}/{subdir}')
+
+    res = sorted(res)
+
+    if append_defaults:
+        res = res + defaults
+
+    return res
+
+
+def _get_voices( dirs=[get_voice_dir()], load_latents=True ):
+    """
+    Builds a mapping of voice name to its list of files across several voice directories.
+
+    :param dirs: directories to scan; on a name clash, directories later in the list win.
+    :param load_latents: forwarded to get_voice().
+    :return: dict mapping each voice name to the sorted list of its file paths.
+    """
+    voices = {}
+    for voice_dir in dirs:
+        # dict.update() instead of `|=` keeps this working on Python versions older than 3.9.
+        voices.update({ name: get_voice(name=name, dir=voice_dir, load_latents=load_latents) for name in get_voice_list(dir=voice_dir) })
+
+    return voices
 
 def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
     if voice == 'random':
         return None, None
 
-    voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents)
+    voices = _get_voices(dirs=[get_voice_dir()] + extra_voice_dirs, load_latents=load_latents)
 
     paths = voices[voice]
     mtime = 0