forked from mrq/tortoise-tts
Moved voices out of the tortoise folder because it kept being processed for setup.py
This commit is contained in:
parent
2bce24b9dd
commit
7471bc209c
|
@ -36,7 +36,7 @@ My fork boasts the following additions, fixes, and optimizations:
|
||||||
* uses the entire audio sample instead of the first four seconds of each sound file for better reproducing
|
* uses the entire audio sample instead of the first four seconds of each sound file for better reproducing
|
||||||
* activated unused DDIM sampler
|
* activated unused DDIM sampler
|
||||||
* use of some optimizations like `kv_cache`ing for the autoregression sample pass, and keeping data on GPU
|
* use of some optimizations like `kv_cache`ing for the autoregression sample pass, and keeping data on GPU
|
||||||
* compatability with DirectML
|
* compatibility with DirectML
|
||||||
* easy install scripts
|
* easy install scripts
|
||||||
* and more!
|
* and more!
|
||||||
|
|
||||||
|
@ -139,7 +139,9 @@ If you're looking to trim your clips, in my opinion, ~~Audacity~~ Tenacity works
|
||||||
|
|
||||||
Power users with FFMPEG already installed can simply use the provided conversion script in `.\convert\`.
|
Power users with FFMPEG already installed can simply use the provided conversion script in `.\convert\`.
|
||||||
|
|
||||||
After preparing your clips as WAV files at a sample rate of 22050 Hz, open up the `tortoise-tts` folder you're working in, navigate to `./tortoise/voice/`, create a new folder in whatever name you want, then dump your clips into that folder. While you're in the `voice` folder, you can take a look at the other provided voices.
|
After preparing your clips as WAV files at a sample rate of 22050 Hz, open up the `tortoise-tts` folder you're working in, navigate to the `voices` folder, create a new folder in whatever name you want, then dump your clips into that folder. While you're in the `voices` folder, you can take a look at the other provided voices.
|
||||||
|
|
||||||
|
**!**NOTE**!**: Before 2023.02.10, voices used to be stored under `.\tortoise\voices\`, but have been moved up one folder. Compatibility is maintained with the old voice folder, which will take priority if it exists.
|
||||||
|
|
||||||
## Using the Software
|
## Using the Software
|
||||||
|
|
||||||
|
@ -269,7 +271,7 @@ This was just a quick test for an adjustable setting, but this one turned out re
|
||||||
To me, I find a few problems with TorToiSe over 11.AI:
|
To me, I find a few problems with TorToiSe over 11.AI:
|
||||||
* computation time is quite an issue. Despite Stable Diffusion proving to be adequate on my 2060, TorToiSe takes quite some time with modest settings.
|
* computation time is quite an issue. Despite Stable Diffusion proving to be adequate on my 2060, TorToiSe takes quite some time with modest settings.
|
||||||
- However, on my 6800XT, performance was drastically uplifted due to having more VRAM for larger batch sizes (at the cost of Krashing).
|
- However, on my 6800XT, performance was drastically uplifted due to having more VRAM for larger batch sizes (at the cost of Krashing).
|
||||||
* reproducability in a voice depends on the "compatability" with the model TorToiSe was trained on.
|
* reproducibility in a voice depends on the "compatibility" with the model TorToiSe was trained on.
|
||||||
- However, this also appears to be similar to 11.AI, where it was mostly trained on audiobook readings.
|
- However, this also appears to be similar to 11.AI, where it was mostly trained on audiobook readings.
|
||||||
* the lack of an obvious analog to the "stability" and "similarity" sliders kind of sucks, but it's not the end of the world.
|
* the lack of an obvious analog to the "stability" and "similarity" sliders kind of sucks, but it's not the end of the world.
|
||||||
However, the `temperature` option seems to prove to be a proper analog to either of these.
|
However, the `temperature` option seems to prove to be a proper analog to either of these.
|
||||||
|
|
|
@ -10,8 +10,20 @@ from scipy.io.wavfile import read
|
||||||
from tortoise.utils.stft import STFT
|
from tortoise.utils.stft import STFT
|
||||||
|
|
||||||
|
|
||||||
BUILTIN_VOICES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../voices')
|
def _resolve_voice_dir():
    """Pick the directory that voices are stored in.

    Resolution order:
      1. the ``TORTOISE_VOICES_DIR`` environment variable, when set to a
         non-empty value;
      2. the ``../voices`` folder relative to this file, if it exists
         (kept for compatibility with the previous layout);
      3. ``./voices`` relative to the current working directory.
    """
    configured = os.environ.get('TORTOISE_VOICES_DIR')
    # An empty value is treated as "not set": passing '' on to os.makedirs()
    # below would raise instead of falling back to a usable folder.
    if configured:
        return configured

    legacy_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../voices')
    if os.path.exists(legacy_dir):
        return legacy_dir

    return './voices'


BUILTIN_VOICES_DIR = _resolve_voice_dir()
# Publish the resolved location through the environment variable as well,
# so that the variable and BUILTIN_VOICES_DIR always agree.
os.environ['TORTOISE_VOICES_DIR'] = BUILTIN_VOICES_DIR
os.makedirs(BUILTIN_VOICES_DIR, exist_ok=True)


def get_voice_dir():
    """Return the directory that voices are stored in."""
    return BUILTIN_VOICES_DIR
|
||||||
|
|
||||||
def load_wav_to_torch(full_path):
|
def load_wav_to_torch(full_path):
|
||||||
sampling_rate, data = read(full_path)
|
sampling_rate, data = read(full_path)
|
||||||
|
|
0
voices/.gitkeep
Executable file
0
voices/.gitkeep
Executable file
12
webui.py
12
webui.py
|
@ -17,7 +17,7 @@ from datetime import datetime
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
|
||||||
from tortoise.api import TextToSpeech
|
from tortoise.api import TextToSpeech
|
||||||
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir
|
||||||
from tortoise.utils.text import split_and_recombine_text
|
from tortoise.utils.text import split_and_recombine_text
|
||||||
|
|
||||||
args = None
|
args = None
|
||||||
|
@ -75,7 +75,7 @@ def generate(
|
||||||
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
||||||
|
|
||||||
if voice != "microphone":
|
if voice != "microphone":
|
||||||
torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth')
|
torch.save(conditioning_latents, f'./{get_voice_dir()}/{voice}/cond_latents.pth')
|
||||||
voice_samples = None
|
voice_samples = None
|
||||||
else:
|
else:
|
||||||
sample_voice = None
|
sample_voice = None
|
||||||
|
@ -235,7 +235,7 @@ def generate(
|
||||||
f.write(json.dumps(info, indent='\t') )
|
f.write(json.dumps(info, indent='\t') )
|
||||||
|
|
||||||
if voice is not None and conditioning_latents is not None:
|
if voice is not None and conditioning_latents is not None:
|
||||||
with open(f'./tortoise/voices/{voice}/cond_latents.pth', 'rb') as f:
|
with open(f'./{get_voice_dir()}/{voice}/cond_latents.pth', 'rb') as f:
|
||||||
info['latents'] = base64.b64encode(f.read()).decode("ascii")
|
info['latents'] = base64.b64encode(f.read()).decode("ascii")
|
||||||
|
|
||||||
if args.embed_output_metadata:
|
if args.embed_output_metadata:
|
||||||
|
@ -297,7 +297,7 @@ def read_generate_settings(file, save_latents=True, save_as_temp=True):
|
||||||
del j['latents']
|
del j['latents']
|
||||||
|
|
||||||
if latents and save_latents:
|
if latents and save_latents:
|
||||||
outdir=f'./tortoise/voices/{".temp" if save_as_temp else j["voice"]}/'
|
outdir=f'./{get_voice_dir()}/{".temp" if save_as_temp else j["voice"]}/'
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
with open(f'{outdir}/cond_latents.pth', 'wb') as f:
|
with open(f'{outdir}/cond_latents.pth', 'wb') as f:
|
||||||
f.write(latents)
|
f.write(latents)
|
||||||
|
@ -390,7 +390,7 @@ def reload_tts():
|
||||||
tts = setup_tortoise()
|
tts = setup_tortoise()
|
||||||
|
|
||||||
def update_voices():
    """Rebuild the choices of the voice dropdown.

    Only sub-directories of the voice directory are offered, because each
    voice lives in its own folder; stray files such as ``.gitkeep`` are
    skipped. The ``"microphone"`` pseudo-voice is always appended last.

    Returns:
        The value of ``gr.Dropdown.update(choices=...)`` with the sorted
        voice folder names followed by ``"microphone"``.
    """
    voice_dir = get_voice_dir()
    voices = sorted(
        name
        for name in os.listdir(voice_dir)
        if os.path.isdir(os.path.join(voice_dir, name))
    )
    return gr.Dropdown.update(choices=voices + ["microphone"])
|
||||||
|
|
||||||
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
|
||||||
args.share = share
|
args.share = share
|
||||||
|
@ -517,7 +517,7 @@ def setup_gradio():
|
||||||
)
|
)
|
||||||
prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)")
|
prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)")
|
||||||
voice = gr.Dropdown(
|
voice = gr.Dropdown(
|
||||||
sorted(os.listdir("./tortoise/voices")) + ["microphone"],
|
sorted(os.listdir(get_voice_dir())) + ["microphone"],
|
||||||
label="Voice",
|
label="Voice",
|
||||||
type="value",
|
type="value",
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user