Possible speedup with one simple trick (it worked for VALL-E inference); also backported the voice-list loading from aivc.

This commit is contained in:
mrq 2023-08-20 22:32:01 -05:00
parent 9afa71542b
commit cbd3c95c42
2 changed files with 65 additions and 3 deletions

View File

@ -150,7 +150,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps), model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
conditioning_free=cond_free, conditioning_free_k=cond_free_k) conditioning_free=cond_free, conditioning_free_k=cond_free_k)
@torch.inference_mode()
def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050): def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
""" """
Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models. Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
@ -194,7 +194,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
return codes return codes
@torch.inference_mode()
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000): def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
""" """
Uses the specified diffusion model to convert discrete codes into a spectrogram. Uses the specified diffusion model to convert discrete codes into a spectrogram.
@ -453,6 +453,7 @@ class TextToSpeech:
if self.preloaded_tensors: if self.preloaded_tensors:
self.cvvp = migrate_to_device( self.cvvp, self.device ) self.cvvp = migrate_to_device( self.cvvp, self.device )
@torch.inference_mode()
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False): def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
""" """
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@ -578,6 +579,7 @@ class TextToSpeech:
settings.update(kwargs) # allow overriding of preset settings with kwargs settings.update(kwargs) # allow overriding of preset settings with kwargs
return self.tts(text, **settings) return self.tts(text, **settings)
@torch.inference_mode()
def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None, def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
return_deterministic_state=False, return_deterministic_state=False,
# autoregressive generation parameters follow # autoregressive generation parameters follow

View File

@ -94,12 +94,72 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth')) voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth'))
return voices return voices
def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ):
    """
    Lists the files that make up a single voice.

    name: sub-directory of `dir` to scan; it may contain a '/' to address a nested voice.
    dir: directory that holds the voice folders.
    load_latents: when True, "pth" files are accepted in addition to `extensions`.
    extensions: accepted file extensions, written without the leading dot.

    Returns the sorted list of matching file paths, or None when `dir`/`name` is not a directory.
    """
    # No trailing slash here, so the paths built below do not contain '//'.
    subj = f'{dir}/{name}'
    if not os.path.isdir(subj):
        return

    # Work on a private copy: neither the default list nor a list handed in by the caller
    # may grow as a side effect of calling this function.
    accepted = set(extensions)
    if load_latents:
        # The extension compared below has its leading dot stripped, so the entry must be
        # "pth"; the former ".pth" could never match any file.
        accepted.add("pth")

    voice = []
    for file in os.listdir(subj):
        ext = os.path.splitext(file)[-1][1:]
        if ext in accepted:
            voice.append(f'{subj}/{file}')
    return sorted( voice )
def get_voice_list(dir=get_voice_dir(), append_defaults=False, extensions=["wav", "mp3", "flac", "pth"]):
    """
    Lists the names of the voices available under `dir`.

    A sub-directory counts as a voice when it directly contains at least one file whose
    extension is in `extensions`. Otherwise its own sub-directories are checked one level
    deep, and each match is reported as "<name>/<subdir>".

    dir: directory that holds the voice folders; it is created if it does not exist yet.
    append_defaults: when True, "random" and "microphone" are appended after the sorted names.
    extensions: file extensions (without the leading dot) that qualify a folder as a voice.

    Returns the sorted list of voice names, optionally followed by the defaults.
    """
    defaults = [ "random", "microphone" ]
    os.makedirs(dir, exist_ok=True)

    res = []
    for name in os.listdir(dir):
        if name in defaults:
            continue
        path = os.path.join(dir, name)
        if not os.path.isdir(path):
            continue

        # load_latents=False: `extensions` is already the complete filter for this listing,
        # so get_voice must not extend it.
        if get_voice( name, dir=dir, load_latents=False, extensions=extensions ):
            res.append(name)
            continue

        # Nothing usable directly inside the folder: look one level deeper for nested voices.
        # An empty folder simply yields no entries here.
        for subdir in os.listdir(path):
            if not os.path.isdir(os.path.join(path, subdir)):
                continue
            if get_voice( f'{name}/{subdir}', dir=dir, load_latents=False, extensions=extensions ):
                res.append(f'{name}/{subdir}')

    res = sorted(res)
    if append_defaults:
        res = res + defaults
    return res
def _get_voices( dirs=[get_voice_dir()], load_latents=True ):
    """
    Builds a mapping of voice name -> file list across every directory in `dirs`.

    dirs: voice directories to scan, in order; a name found again in a later directory
          replaces the entry from an earlier one.
    load_latents: forwarded to get_voice for every voice.

    Returns a dict keyed by voice name whose values are whatever get_voice returns.
    """
    voices = {}
    for voice_dir in dirs:
        for voice_name in get_voice_list(dir=voice_dir):
            voices[voice_name] = get_voice(name=voice_name, dir=voice_dir, load_latents=load_latents)
    return voices
def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None): def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
if voice == 'random': if voice == 'random':
return None, None return None, None
voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents) voices = _get_voices(dirs=[get_voice_dir()] + extra_voice_dirs, load_latents=load_latents)
paths = voices[voice] paths = voices[voice]
mtime = 0 mtime = 0