forked from mrq/tortoise-tts
Possible speedup with one simple trick (it worked for VALL-E inferencing); also backported the voice list loading from aivc.
This commit is contained in:
parent
9afa71542b
commit
cbd3c95c42
|
@ -150,7 +150,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
|
|||
model_var_type='learned_range', loss_type='mse', betas=get_named_beta_schedule('linear', trained_diffusion_steps),
|
||||
conditioning_free=cond_free, conditioning_free_k=cond_free_k)
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
|
||||
"""
|
||||
Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
|
||||
|
@ -194,7 +194,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
|
|||
|
||||
return codes
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
|
||||
"""
|
||||
Uses the specified diffusion model to convert discrete codes into a spectrogram.
|
||||
|
@ -453,6 +453,7 @@ class TextToSpeech:
|
|||
if self.preloaded_tensors:
|
||||
self.cvvp = migrate_to_device( self.cvvp, self.device )
|
||||
|
||||
@torch.inference_mode()
|
||||
def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, slices=1, max_chunk_size=None, force_cpu=False, original_ar=False, original_diffusion=False):
|
||||
"""
|
||||
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
||||
|
@ -578,6 +579,7 @@ class TextToSpeech:
|
|||
settings.update(kwargs) # allow overriding of preset settings with kwargs
|
||||
return self.tts(text, **settings)
|
||||
|
||||
@torch.inference_mode()
|
||||
def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
|
||||
return_deterministic_state=False,
|
||||
# autoregressive generation parameters follow
|
||||
|
|
|
@ -94,12 +94,72 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
|
|||
voices[sub] = voices[sub] + list(glob(f'{subj}/*.pth'))
|
||||
return voices
|
||||
|
||||
def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ):
    """
    Lists the files belonging to the voice `name` inside `dir`.

    :param name: Voice folder name, relative to `dir`.
    :param dir: Directory that contains the voice folders.
    :param load_latents: When True, `pth` files are accepted in addition to `extensions`.
    :param extensions: Accepted file extensions, written without the leading dot.
                       The passed list is never modified.
    :return: Sorted list of matching file paths, or None when `{dir}/{name}/` is not a directory.
    """
    subj = f'{dir}/{name}/'
    if not os.path.isdir(subj):
        return

    # Copy before extending: appending to `extensions` itself would mutate the
    # caller's list (or the shared default list) and grow it on every call.
    accepted = list(extensions)
    if load_latents:
        # Extensions are compared with the leading dot stripped (see below),
        # so the entry has to be "pth"; ".pth" can never match.
        accepted.append("pth")

    # NOTE: `subj` already ends with '/', so the returned paths contain a
    # double slash. Kept as-is so callers keep receiving the same strings.
    return sorted(
        f'{subj}/{file}'
        for file in os.listdir(subj)
        if os.path.splitext(file)[-1][1:] in accepted
    )
|
||||
|
||||
def get_voice_list(dir=get_voice_dir(), append_defaults=False, extensions=["wav", "mp3", "flac", "pth"]):
    """
    Returns the sorted names of all voices found in `dir`.

    A folder directly under `dir` is reported as `name` when `get_voice` finds
    at least one matching file in it. Otherwise each of its sub-folders that
    holds matching files is reported as `name/subdir`.

    :param dir: Directory that contains the voice folders; created if it does not exist.
    :param append_defaults: When True, "random" and "microphone" are appended after the sorted names.
    :param extensions: File extensions (without the leading dot) forwarded to `get_voice`.
                       The passed list is never modified.
    :return: List of voice names.
    """
    defaults = [ "random", "microphone" ]
    os.makedirs(dir, exist_ok=True)

    res = []
    for name in os.listdir(dir):
        # Folders named like a built-in default are never listed as voices.
        if name in defaults:
            continue

        voice_path = f'{dir}/{name}'
        if not os.path.isdir(voice_path):
            continue

        # A fresh copy is handed to get_voice on every call so that nothing the
        # callee does to the list can leak into `extensions` (in particular the
        # shared default list, which would otherwise persist across calls).
        if get_voice( name, dir=dir, extensions=list(extensions) ):
            res.append(name)
            continue

        # No usable files directly inside: look one level deeper. An empty
        # folder simply yields no sub-folders here, so it needs no special case.
        for subdir in os.listdir(voice_path):
            if not os.path.isdir(f'{voice_path}/{subdir}'):
                continue
            if get_voice( f'{name}/{subdir}', dir=dir, extensions=list(extensions) ):
                res.append(f'{name}/{subdir}')

    res = sorted(res)

    if append_defaults:
        res = res + defaults

    return res
|
||||
|
||||
|
||||
def _get_voices( dirs=[get_voice_dir()], load_latents=True ):
    """
    Collects the voices of several voice directories into one dictionary.

    :param dirs: Voice directories to scan, in order. When the same voice name
                 appears in more than one directory, the later directory wins.
    :param load_latents: Forwarded to `get_voice` for every voice.
    :return: Dict mapping each voice name (as returned by `get_voice_list`) to
             the result of `get_voice` for that name.
    """
    found = {}
    for voice_dir in dirs:
        for voice_name in get_voice_list(dir=voice_dir):
            found[voice_name] = get_voice(name=voice_name, dir=voice_dir, load_latents=load_latents)

    return found
|
||||
|
||||
def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
|
||||
if voice == 'random':
|
||||
return None, None
|
||||
|
||||
voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents)
|
||||
voices = _get_voices(dirs=[get_voice_dir()] + extra_voice_dirs, load_latents=load_latents)
|
||||
|
||||
paths = voices[voice]
|
||||
mtime = 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user