From 96e9acdeecba423275e191597ed56f7a12dfd233 Mon Sep 17 00:00:00 2001 From: mrq Date: Fri, 17 Feb 2023 05:42:55 +0000 Subject: [PATCH] added preparation of LJSpeech-esque dataset --- README.md | 23 +++---------------- requirements.txt | 8 ++++--- src/utils.py | 60 +++++++++++++++++++++++++++++++++++++++++++----- src/webui.py | 22 ++++++++++++++++-- train.ipynb | 2 +- 5 files changed, 83 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 31b68ec..2983fb8 100755 --- a/README.md +++ b/README.md @@ -1,14 +1,12 @@ # AI Voice Cloning -This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts). +This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts). Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing. >\>Ugh... why bother when I can just abuse 11.AI? -I very much encourage (You) to use 11.AI while it's still viable to use. For the layman, it's easier to go through the hoops of coughing up the $5 or abusing the free trial over actually setting up a TorToiSe environment and dealing with its quirks. - -However, I also encourage your own experimentation with TorToiSe, as it's very, very promising, it just takes a little love and elbow grease. 
+You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon. This is not endorsed by [neonbjb](https://github.com/neonbjb/). I do not expect this to run into any ethical issues, as it seems (like me), this is mostly for making funny haha vidya characters say funny lines. @@ -302,19 +300,4 @@ I think this also highlights how just combining your entire source sample gung-h Output (`Is that really you, Mary?`, Ultra Fast preset, settings and latents embedded) * https://files.catbox.moe/gy1jvz.wav -This was just a quick test for an adjustable setting, but this one turned out really nice (for being a quick test) on the off chance. It's not the original delivery, and it definitely sounds robotic still, but it's on the Ultra Fast preset, as expected. - -## Caveats (and Upsides) - -To me, I find a few problems with TorToiSe over 11.AI: -* computation time is quite an issue. Despite Stable Diffusion proving to be adequate on my 2060, TorToiSe takes quite some time with modest settings. - - However, on my 6800XT, performance was drastically uplifted due to having more VRAM for larger batch sizes (at the cost of Krashing). -* reproducability in a voice depends on the "compatibilty" with the model TorToiSe was trained on. - - However, this also appears to be similar to 11.AI, where it was mostly trained on audiobook readings. -* the lack of an obvious analog to the "stability" and "similarity" sliders kind of sucks, but it's not the end of the world. - However, the `temperature` option seems to prove to be a proper analog to either of these. - -Although, I can look past these as TorToiSe offers, in comparison to 11.AI: -* the "speaking too fast" issue does not exist with TorToiSe. I don't need to fight with it by pretending I'm a Gaia user in the early 2000s by sprinkling ellipses. 
-* the overall delivery seems very natural, sometimes small, dramatic pauses gets added at the legitimately most convenient moments, and the inhales tend to be more natural. Many of vocaroos from 11.AI where it just does not seem properly delivered. -* being able to run it locally means I do not have to worry about some Polack seeing me use the "dick" word. \ No newline at end of file +This was just a quick test for an adjustable setting, but this one turned out really nice (for being a quick test) on the off chance. It's not the original delivery, and it definitely sounds robotic still, but it's on the Ultra Fast preset, as expected. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index afb90d2..19f11e0 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ git+https://git.ecker.tech/mrq/tortoise-tts.git # git+https://git.ecker.tech/mrq/DL-Art-School.git - -whisper +git+https://github.com/openai/whisper.git +more-itertools +ffmpeg-python gradio music-tag -voicefixer \ No newline at end of file +voicefixer +psutil \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 99a197a..4036fc6 100755 --- a/src/utils.py +++ b/src/utils.py @@ -29,12 +29,12 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_ from tortoise.utils.text import split_and_recombine_text from tortoise.utils.device import get_device_name, set_device_name +import whisper args = None tts = None webui = None voicefixer = None -whisper = None dlas = None def get_args(): @@ -108,6 +108,13 @@ def setup_args(): return args +def pad(num, zeroes): + s = "" + for i in range(zeroes,0,-1): + if num < 10 ** i: + s = f"{s}0" + return f"{s}{num}" + def generate( text, delimiter, @@ -253,11 +260,8 @@ def generate( idx = keys[-1] + 1 # I know there's something to pad I don't care - pad = "" - for i in range(4,0,-1): - if idx < 10 ** i: - pad = f"{pad}0" - idx = f"{pad}{idx}" + + idx = pad(idx, 4) def get_name(line=0, 
candidate=0, combined=False): name = f"{idx}" @@ -455,6 +459,50 @@ def save_training_settings( batch_size=None, learning_rate=None, print_rate=None with open(f'./training/{settings["name"]}.yaml', 'w', encoding="utf-8") as f: f.write(yaml) +whisper_model = None +def prepare_dataset( files, outdir ): + global whisper_model + if whisper_model is None: + whisper_model = whisper.load_model("base") + + os.makedirs(outdir, exist_ok=True) + + idx = 0 + results = {} + transcription = [] + + for file in files: + print(f"Transcribing file: {file}") + + result = whisper_model.transcribe(file) + results[os.path.basename(file)] = result + + print(f"Transcribed file: {file}, {len(result['segments'])} found.") + + waveform, sampling_rate = torchaudio.load(file) + num_channels, num_frames = waveform.shape + + for segment in result['segments']: + start = max(0, int(segment['start'] * sampling_rate)-1) + end = int(segment['end'] * sampling_rate)+1 + + print(segment['start'], segment['end']) + print(start, end) + + sliced_waveform = waveform[:, start:end] + sliced_name = f"{pad(idx, 4)}.wav" + + torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate) + + transcription.append(f"{sliced_name}|{segment['text'].strip()}") + idx = idx + 1 + + with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(results, indent='\t')) + + with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f: + f.write("\n".join(transcription)) + def reset_generation_settings(): with open(f'./config/generate.json', 'w', encoding="utf-8") as f: f.write(json.dumps({}, indent='\t') ) diff --git a/src/webui.py b/src/webui.py index fb8ce58..253dd4c 100755 --- a/src/webui.py +++ b/src/webui.py @@ -15,7 +15,7 @@ import gradio.utils from datetime import datetime import tortoise.api -from tortoise.utils.audio import get_voice_dir +from tortoise.utils.audio import get_voice_dir, get_voices from utils import * @@ -370,8 +370,24 @@ def setup_gradio(): ] ) with gr.Tab("Training"): - 
with gr.Tab("Configuration"): with gr.Row(): + with gr.Column(): + dataset_settings = [ + gr.Dropdown( get_voice_list(), label="Dataset Source", type="value" ), + ] + dataset_voices = dataset_settings[0] + + prepare_dataset_button = gr.Button(value="Prepare") + + def prepare_dataset_proxy( voice ): + return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/" ) + + prepare_dataset_button.click( + prepare_dataset_proxy, + inputs=dataset_settings, + outputs=None + ) + with gr.Column(): training_settings = [ gr.Slider(label="Batch Size", value=128), @@ -452,6 +468,7 @@ def setup_gradio(): # YUCK def update_voices(): return ( + gr.Dropdown.update(choices=get_voice_list()), gr.Dropdown.update(choices=get_voice_list()), gr.Dropdown.update(choices=get_voice_list("./results/")), ) @@ -463,6 +480,7 @@ def setup_gradio(): inputs=None, outputs=[ voice, + dataset_voices, history_voices ] ) diff --git a/train.ipynb b/train.ipynb index ca89a31..12320ec 100755 --- a/train.ipynb +++ b/train.ipynb @@ -31,6 +31,7 @@ "source":[ "!git clone https://git.ecker.tech/mrq/DL-Art-School\n", "%cd DL-Art-School\n", + "!rm -r experiments\n", "!pip install -r requirements.txt" ] }, @@ -40,7 +41,6 @@ "from google.colab import drive\n", "drive.mount('/content/drive')", "%cd /content/DL-Art-School/\n", - "#!rm -r experiments\n", "!ln -s /content/drive/MyDrive/experiments/\n", ], "metadata":{