added preparation of LJSpeech-esque dataset

This commit is contained in:
mrq 2023-02-17 05:42:55 +00:00
parent 9c0e4666d2
commit 96e9acdeec
5 changed files with 83 additions and 32 deletions

View File

@ -1,14 +1,12 @@
# AI Voice Cloning
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
>\>Ugh... why bother when I can just abuse 11.AI?
I very much encourage (You) to use 11.AI while it's still viable to use. For the layman, it's easier to go through the hoops of coughing up the $5 or abusing the free trial over actually setting up a TorToiSe environment and dealing with its quirks.
However, I also encourage your own experimentation with TorToiSe, as it's very, very promising, it just takes a little love and elbow grease.
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
This is not endorsed by [neonbjb](https://github.com/neonbjb/). I do not expect this to run into any ethical issues, as it seems (like me), this is mostly for making funny haha vidya characters say funny lines.
@ -302,19 +300,4 @@ I think this also highlights how just combining your entire source sample gung-h
Output (`Is that really you, Mary?`, Ultra Fast preset, settings and latents embedded)
* https://files.catbox.moe/gy1jvz.wav
This was just a quick test for an adjustable setting, but this one turned out really nice (for being a quick test) on the off chance. It's not the original delivery, and it definitely sounds robotic still, but it's on the Ultra Fast preset, as expected.
## Caveats (and Upsides)
To me, I find a few problems with TorToiSe over 11.AI:
* computation time is quite an issue. Despite Stable Diffusion proving to be adequate on my 2060, TorToiSe takes quite some time with modest settings.
- However, on my 6800XT, performance was drastically uplifted due to having more VRAM for larger batch sizes (at the cost of Krashing).
* reproducibility in a voice depends on the "compatibility" with the model TorToiSe was trained on.
- However, this also appears to be similar to 11.AI, where it was mostly trained on audiobook readings.
* the lack of an obvious analog to the "stability" and "similarity" sliders kind of sucks, but it's not the end of the world.
However, the `temperature` option seems to prove to be a proper analog to either of these.
Although, I can look past these as TorToiSe offers, in comparison to 11.AI:
* the "speaking too fast" issue does not exist with TorToiSe. I don't need to fight with it by pretending I'm a Gaia user in the early 2000s by sprinkling ellipses.
* the overall delivery seems very natural: sometimes small, dramatic pauses get added at the legitimately most convenient moments, and the inhales tend to be more natural. Many of the vocaroos from 11.AI just do not seem properly delivered.
* being able to run it locally means I do not have to worry about some Polack seeing me use the "dick" word.
This was just a quick test for an adjustable setting, but this one turned out really nice (for being a quick test) on the off chance. It's not the original delivery, and it definitely sounds robotic still, but it's on the Ultra Fast preset, as expected.

View File

@ -1,7 +1,9 @@
git+https://git.ecker.tech/mrq/tortoise-tts.git
# git+https://git.ecker.tech/mrq/DL-Art-School.git
whisper
git+https://github.com/openai/whisper.git
more-itertools
ffmpeg-python
gradio
music-tag
voicefixer
voicefixer
psutil

View File

@ -29,12 +29,12 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_
from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.device import get_device_name, set_device_name
import whisper
# Module-level placeholders, all initialised to None.
args = None
tts = None
webui = None
voicefixer = None
# NOTE: there is intentionally no `whisper = None` here. The name `whisper`
# is bound by `import whisper`; rebinding it to None would shadow the module
# and make `whisper.load_model(...)` fail. The loaded model is cached in the
# separate `whisper_model` global instead.
dlas = None
def get_args():
@ -108,6 +108,13 @@ def setup_args():
return args
def pad(num, zeroes):
    """Return ``num`` as text, prefixed with a run of "0" characters.

    One "0" is added for every exponent ``e`` in ``zeroes, zeroes - 1, ..., 1``
    for which ``num < 10 ** e`` holds; e.g. ``pad(5, 4)`` gives ``"00005"``.
    """
    zero_count = sum(
        1 for exponent in range(zeroes, 0, -1) if num < 10 ** exponent
    )
    prefix = "0" * zero_count
    return f"{prefix}{num}"
def generate(
text,
delimiter,
@ -253,11 +260,8 @@ def generate(
idx = keys[-1] + 1
# I know there's something to pad I don't care
pad = ""
for i in range(4,0,-1):
if idx < 10 ** i:
pad = f"{pad}0"
idx = f"{pad}{idx}"
idx = pad(idx, 4)
def get_name(line=0, candidate=0, combined=False):
name = f"{idx}"
@ -455,6 +459,50 @@ def save_training_settings( batch_size=None, learning_rate=None, print_rate=None
with open(f'./training/{settings["name"]}.yaml', 'w', encoding="utf-8") as f:
f.write(yaml)
# Cached Whisper model; loaded by prepare_dataset() the first time it runs.
whisper_model = None


def prepare_dataset( files, outdir ):
    """Transcribe audio files and slice them into an LJSpeech-esque dataset.

    Every file is transcribed with the cached Whisper model. Each segment of
    the transcription is cut out of the waveform and saved in ``outdir`` as a
    sequentially numbered ``.wav`` clip. Two index files are written at the
    end:

    * ``whisper.json`` - the transcription results, keyed by the basename of
      each source file.
    * ``train.txt`` - one ``<clip name>|<text>`` line per saved clip, covering
      the clips of all files.

    Args:
        files: iterable of paths to the audio files to process.
        outdir: directory that receives the clips and index files; it is
            created when missing.
    """
    global whisper_model
    # Load the model once and reuse it on later calls.
    if whisper_model is None:
        whisper_model = whisper.load_model("base")

    os.makedirs(outdir, exist_ok=True)

    idx = 0
    results = {}
    # Accumulated across ALL files. Initialising this inside the file loop
    # would drop every file but the last from train.txt and leave the name
    # undefined when `files` is empty.
    transcription = []

    for file in files:
        print(f"Transcribing file: {file}")

        result = whisper_model.transcribe(file)
        results[os.path.basename(file)] = result

        print(f"Transcribed file: {file}, {len(result['segments'])} found.")

        waveform, sampling_rate = torchaudio.load(file)
        num_frames = waveform.shape[-1]

        for segment in result['segments']:
            # Segment times are scaled by the sample rate to get frame
            # indices, widened by one frame on each side. Clamp them to the
            # waveform: an unclamped start of -1 (segment starting at 0)
            # would index from the END of the tensor.
            start = max(0, int(segment['start'] * sampling_rate) - 1)
            end = min(num_frames, int(segment['end'] * sampling_rate) + 1)

            print(segment['start'], segment['end'])
            print(start, end)

            sliced_waveform = waveform[:, start:end]
            sliced_name = f"{pad(idx, 4)}.wav"

            torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate)

            # str has no .trim(); .strip() removes the surrounding whitespace.
            transcription.append(f"{sliced_name}|{segment['text'].strip()}")
            idx = idx + 1

    with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(results, indent='\t'))

    with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f:
        f.write("\n".join(transcription))
def reset_generation_settings():
with open(f'./config/generate.json', 'w', encoding="utf-8") as f:
f.write(json.dumps({}, indent='\t') )

View File

@ -15,7 +15,7 @@ import gradio.utils
from datetime import datetime
import tortoise.api
from tortoise.utils.audio import get_voice_dir
from tortoise.utils.audio import get_voice_dir, get_voices
from utils import *
@ -370,8 +370,24 @@ def setup_gradio():
]
)
with gr.Tab("Training"):
with gr.Tab("Configuration"):
with gr.Row():
with gr.Column():
dataset_settings = [
gr.Dropdown( get_voice_list(), label="Dataset Source", type="value" ),
]
dataset_voices = dataset_settings[0]
prepare_dataset_button = gr.Button(value="Prepare")
def prepare_dataset_proxy( voice ):
    """Run prepare_dataset() for the voice picked in the dataset dropdown.

    Looks up ``voice`` in the mapping returned by
    ``get_voices(load_latents=False)`` and writes the dataset to
    ``./training/<voice>/``.
    """
    voice_files = get_voices(load_latents=False)[voice]
    target_dir = f"./training/{voice}/"
    return prepare_dataset( voice_files, outdir=target_dir )
prepare_dataset_button.click(
prepare_dataset_proxy,
inputs=dataset_settings,
outputs=None
)
with gr.Column():
training_settings = [
gr.Slider(label="Batch Size", value=128),
@ -452,6 +468,7 @@ def setup_gradio():
# YUCK
def update_voices():
    """Build refreshed choice lists for the three voice dropdowns.

    Returns a 3-tuple of ``gr.Dropdown.update`` objects: two built from
    ``get_voice_list()`` and one from ``get_voice_list("./results/")``.
    NOTE(review): the order presumably has to match the ``outputs`` list of
    the event this is wired to - confirm against the caller.
    """
    voice_update = gr.Dropdown.update(choices=get_voice_list())
    dataset_update = gr.Dropdown.update(choices=get_voice_list())
    history_update = gr.Dropdown.update(choices=get_voice_list("./results/"))
    return (voice_update, dataset_update, history_update)
@ -463,6 +480,7 @@ def setup_gradio():
inputs=None,
outputs=[
voice,
dataset_voices,
history_voices
]
)

View File

@ -31,6 +31,7 @@
"source":[
"!git clone https://git.ecker.tech/mrq/DL-Art-School\n",
"%cd DL-Art-School\n",
"!rm -r experiments\n",
"!pip install -r requirements.txt"
]
},
@ -40,7 +41,6 @@
"from google.colab import drive\n",
"drive.mount('/content/drive')",
"%cd /content/DL-Art-School/\n",
"#!rm -r experiments\n",
"!ln -s /content/drive/MyDrive/experiments/\n",
],
"metadata":{