diff --git a/README.md b/README.md index 17d5d30..e219625 100755 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ My fork boasts the following additions, fixes, and optimizations: * uses the entire audio sample instead of the first four seconds of each sound file for better reproducing * activated unused DDIM sampler * use of some optimizations like `kv_cache`ing for the autoregression sample pass, and keeping data on GPU -* compatability with DirectML +* compatibility with DirectML * easy install scripts * and more! @@ -139,7 +139,9 @@ If you're looking to trim your clips, in my opinion, ~~Audacity~~ Tenacity works Power users with FFMPEG already installed can simply used the provided conversion script in `.\convert\`. -After preparing your clips as WAV files at a sample rate of 22050 Hz, open up the `tortoise-tts` folder you're working in, navigate to `./tortoise/voice/`, create a new folder in whatever name you want, then dump your clips into that folder. While you're in the `voice` folder, you can take a look at the other provided voices. +After preparing your clips as WAV files at a sample rate of 22050 Hz, open up the `tortoise-tts` folder you're working in, navigate to the `voices` folder, create a new folder in whatever name you want, then dump your clips into that folder. While you're in the `voices` folder, you can take a look at the other provided voices. + +**!**NOTE**!**: Before 2023.02.10, voices used to be stored under `.\tortoise\voices\`, but they have been moved up one folder. Compatibility is maintained with the old voice folder, which takes priority if it still exists. ## Using the Software @@ -269,7 +271,7 @@ This was just a quick test for an adjustable setting, but this one turned out re To me, I find a few problems with TorToiSe over 11.AI: * computation time is quite an issue. Despite Stable Diffusion proving to be adequate on my 2060, TorToiSe takes quite some time with modest settings. 
- However, on my 6800XT, performance was drastically uplifted due to having more VRAM for larger batch sizes (at the cost of Krashing). -* reproducability in a voice depends on the "compatability" with the model TorToiSe was trained on. +* reproducibility in a voice depends on the "compatibility" with the model TorToiSe was trained on. - However, this also appears to be similar to 11.AI, where it was mostly trained on audiobook readings. * the lack of an obvious analog to the "stability" and "similarity" sliders kind of sucks, but it's not the end of the world. However, the `temperature` option seems to prove to be a proper analog to either of these. diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index 32839ad..7f3c8a9 100755 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -10,8 +10,20 @@ from scipy.io.wavfile import read from tortoise.utils.stft import STFT -BUILTIN_VOICES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../voices') +if 'TORTOISE_VOICES_DIR' not in os.environ: + voice_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../voices') + if not os.path.exists(voice_dir): + voice_dir = os.path.dirname('./voices/') + + os.environ['TORTOISE_VOICES_DIR'] = voice_dir + +BUILTIN_VOICES_DIR = os.environ.get('TORTOISE_VOICES_DIR') + +os.makedirs(BUILTIN_VOICES_DIR, exist_ok=True) + +def get_voice_dir(): + return BUILTIN_VOICES_DIR def load_wav_to_torch(full_path): sampling_rate, data = read(full_path) diff --git a/voices/.gitkeep b/voices/.gitkeep new file mode 100755 index 0000000..e69de29 diff --git a/webui.py b/webui.py index 3f9210d..773db7f 100755 --- a/webui.py +++ b/webui.py @@ -17,7 +17,7 @@ from datetime import datetime from fastapi import FastAPI from tortoise.api import TextToSpeech -from tortoise.utils.audio import load_audio, load_voice, load_voices +from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir from tortoise.utils.text import split_and_recombine_text 
args = None @@ -75,7 +75,7 @@ def generate( conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) if voice != "microphone": - torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth') + torch.save(conditioning_latents, f'./{get_voice_dir()}/{voice}/cond_latents.pth') voice_samples = None else: sample_voice = None @@ -235,7 +235,7 @@ def generate( f.write(json.dumps(info, indent='\t') ) if voice is not None and conditioning_latents is not None: - with open(f'./tortoise/voices/{voice}/cond_latents.pth', 'rb') as f: + with open(f'./{get_voice_dir()}/{voice}/cond_latents.pth', 'rb') as f: info['latents'] = base64.b64encode(f.read()).decode("ascii") if args.embed_output_metadata: @@ -297,7 +297,7 @@ def read_generate_settings(file, save_latents=True, save_as_temp=True): del j['latents'] if latents and save_latents: - outdir=f'./tortoise/voices/{".temp" if save_as_temp else j["voice"]}/' + outdir=f'./{get_voice_dir()}/{".temp" if save_as_temp else j["voice"]}/' os.makedirs(outdir, exist_ok=True) with open(f'{outdir}/cond_latents.pth', 'wb') as f: f.write(latents) @@ -390,7 +390,7 @@ def reload_tts(): tts = setup_tortoise() def update_voices(): - return gr.Dropdown.update(choices=sorted(os.listdir("./tortoise/voices")) + ["microphone"]) + return gr.Dropdown.update(choices=sorted(os.listdir(get_voice_dir())) + ["microphone"]) def export_exec_settings( share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume ): args.share = share @@ -517,7 +517,7 @@ def setup_gradio(): ) prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)") voice = gr.Dropdown( - sorted(os.listdir("./tortoise/voices")) + ["microphone"], + sorted(os.listdir(get_voice_dir())) + ["microphone"], label="Voice", type="value", )