forked from mrq/tortoise-tts
31 lines
1.2 KiB
Python
31 lines
1.2 KiB
Python
|
import argparse
|
||
|
import os
|
||
|
import torch
|
||
|
|
||
|
from api import TextToSpeech
|
||
|
from tortoise.utils.audio import load_audio, get_voices
|
||
|
|
||
|
"""
Dumps the conditioning latents for the specified voice to disk. These are expressive latents which can be used for
other ML models, or can be augmented manually and fed back into Tortoise to affect vocal qualities.
"""
|
||
|
if __name__ == '__main__':
    # Command-line entry point: for every requested voice, load its
    # conditioning clips, compute the conditioning latents with TextToSpeech,
    # and save them to <output_path>/<voice>.pth.
    parser = argparse.ArgumentParser()
    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat')
    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
    args = parser.parse_args()

    # --voice is a comma-separated list; tolerate spaces around the names and
    # stray commas so that "a, b" and "a,b," both select voices "a" and "b".
    selected_voices = [name.strip() for name in args.voice.split(',') if name.strip()]
    if not selected_voices:
        parser.error('--voice must name at least one voice')

    # Validate the requested names before constructing TextToSpeech, so a typo
    # is reported as a usage error instead of a bare KeyError raised after the
    # TTS object has already been built.
    voices = get_voices()
    unknown = [name for name in selected_voices if name not in voices]
    if unknown:
        parser.error(
            f"Unknown voice(s): {', '.join(unknown)}. "
            f"Available voices: {', '.join(sorted(voices))}"
        )

    os.makedirs(args.output_path, exist_ok=True)

    tts = TextToSpeech()
    for voice in selected_voices:
        # voices[voice] holds the paths of the clips for this voice; each clip
        # is loaded with the 22050 argument the original script passed.
        conds = [load_audio(cond_path, 22050) for cond_path in voices[voice]]
        conditioning_latents = tts.get_conditioning_latents(conds)
        torch.save(conditioning_latents, os.path.join(args.output_path, f'{voice}.pth'))
|
||
|
|