forked from mrq/tortoise-tts
01b783fc02
- Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py?
31 lines
1.2 KiB
Python
31 lines
1.2 KiB
Python
import argparse
|
|
import os
|
|
import torch
|
|
|
|
from api import TextToSpeech
|
|
from tortoise.utils.audio import load_audio, get_voices
|
|
|
|
"""
|
|
Dumps the conditioning latents for the specified voice to disk. These are expressive latents which can be used for
|
|
other ML models, or can be augmented manually and fed back into Tortoise to affect vocal qualities.
|
|
"""
|
|
if __name__ == '__main__':
    # Command-line entry point: for each requested voice, load its reference
    # clips, compute the conditioning latents with TextToSpeech, and save them
    # to <output_path>/<voice>.pth via torch.save.
    parser = argparse.ArgumentParser()
    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat')
    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
    args = parser.parse_args()

    # --voice may list several voices separated by commas. Strip surrounding
    # whitespace and drop empty entries so "a, b" and "a,b," both work.
    selected_voices = [name.strip() for name in args.voice.split(',') if name.strip()]
    if not selected_voices:
        parser.error('--voice must name at least one voice')

    # Validate the requested names before constructing the model, so a typo
    # fails with a clear message instead of a bare KeyError after TextToSpeech()
    # has already been built.
    # NOTE(review): assumes get_voices() returns a mapping of voice name ->
    # iterable of clip paths, as implied by the voices[voice] lookup below.
    voices = get_voices()
    unknown = [name for name in selected_voices if name not in voices]
    if unknown:
        parser.error(f"Unknown voice(s): {', '.join(unknown)}")

    os.makedirs(args.output_path, exist_ok=True)

    tts = TextToSpeech()
    for voice in selected_voices:
        # 22050 is passed through to load_audio unchanged; presumably the
        # target sampling rate -- confirm against load_audio.
        conds = [load_audio(cond_path, 22050) for cond_path in voices[voice]]
        conditioning_latents = tts.get_conditioning_latents(conds)
        torch.save(conditioning_latents, os.path.join(args.output_path, f'{voice}.pth'))
|
|
|