Compare commits


No commits in common. "master" and "master" have entirely different histories.

19 changed files with 4217 additions and 5330 deletions

View File

@ -1,4 +0,0 @@
/models
/training
/voices
/bin

View File

@ -1,37 +0,0 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
ARG DEBIAN_FRONTEND=noninteractive
ARG TZ=UTC
ARG MINICONDA_VERSION=23.1.0-1
ARG PYTHON_VERSION=3.9.13
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt-get update
RUN apt-get install -y curl wget git ffmpeg
RUN adduser --disabled-password --gecos '' --shell /bin/bash user
USER user
ENV HOME=/home/user
WORKDIR $HOME
RUN mkdir $HOME/.cache $HOME/.config && chmod -R 777 $HOME
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh
RUN chmod +x Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh
RUN ./Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh -b -p /home/user/miniconda
ENV PATH="$HOME/miniconda/bin:$PATH"
RUN conda init
RUN conda install python=$PYTHON_VERSION
RUN python3 -m pip install --upgrade pip
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
RUN mkdir $HOME/ai-voice-cloning
WORKDIR $HOME/ai-voice-cloning
COPY --chown=user:user modules modules
RUN python3 -m pip install -r ./modules/tortoise-tts/requirements.txt
RUN python3 -m pip install -e ./modules/tortoise-tts/
RUN python3 -m pip install -r ./modules/dlas/requirements.txt
RUN python3 -m pip install -e ./modules/dlas/
ADD requirements.txt requirements.txt
RUN python3 -m pip install -r ./requirements.txt
ADD --chown=user:user . $HOME/ai-voice-cloning
CMD ["python", "./src/main.py", "--listen", "0.0.0.0:7680"]

View File

@ -1,9 +1,9 @@
# AI Voice Cloning
> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve both as a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux and as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
>\>Ugh... why bother when I can just abuse 11.AI?
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.

View File

@ -1,106 +1,13 @@
dataset:
  training: [
    "./training/${voice}/valle/",
  ]
  noise: [
    "./training/valle/data/Other/noise/",
  ]
  data_dirs: [./training/${voice}/valle/]
  spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
  speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  max_phones: 72
  use_hdf5: False
  hdf5_name: data.h5
  hdf5_flag: r
  validate: True
  models: '${models}'
  batch_size: ${batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_size}
  eval_batch_size: ${batch_size}
  workers: 4
  cache: False
  phones_range: [4, 64]
  duration_range: [1.0, 8.0]
  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 3.0
  sample_type: path
  tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]

models:
  _max_levels: 8
  _models:
    - name: "ar"
      size: "full"
      resp_levels: 1
      prom_levels: 2
      tasks: 8
      arch_type: "retnet"
    - name: "nar"
      size: "full"
      resp_levels: 3
      prom_levels: 4
      tasks: 8
      arch_type: "retnet"

hyperparameters:
  batch_size: ${batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_size}
  gradient_clipping: 100
  optimizer: AdamW
  learning_rate: 1.0e-4
  scheduler_type: ""

evaluation:
  batch_size: ${batch_size}
  frequency: ${validation_rate}
  size: 16
  steps: 300
  ar_temperature: 0.95
  nar_temperature: 0.25

trainer:
  iterations: ${iterations}
  save_tag: step
  save_on_oom: True
  save_on_quit: True
  export_on_save: True
  export_on_quit: True
  save_frequency: ${save_rate}
  keep_last_checkpoints: 4
  aggressive_optimizations: False
  load_state_dict: True
  #strict_loading: False
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True
  gc_mode: None # "global_step"
  weight_dtype: bfloat16
  backend: deepspeed
  deepspeed:
    zero_optimization_level: 2
    use_compression_training: True

inference:
  use_vocos: True
  normalize: False
  weight_dtype: float32

bitsandbytes:
  enabled: False
  injects: True
  linear: True
  embedding: True

max_iter: ${iterations}
save_ckpt_every: ${save_rate}
eval_every: ${validation_rate}
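The ${voice}, ${batch_size}, etc. tokens are not native YAML; they appear to be placeholders that get filled in when a concrete training config is written out. A minimal sketch of that substitution using Python's string.Template, whose $-placeholder syntax matches; the template path and all substituted values here are made-up examples:

from string import Template

with open("valle.yaml.template") as f:  # hypothetical template path
    template = Template(f.read())

# safe_substitute leaves any unrecognized ${...} tokens untouched
rendered = template.safe_substitute(
    voice="mitsuru",  # example values, not from the repo
    batch_size=8,
    gradient_accumulation_size=4,
    iterations=5000,
    save_rate=500,
    validation_rate=500,
    models="./models/valle/",
)

with open("./training/mitsuru/train.yaml", "w") as f:
    f.write(rendered)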

@ -1 +1 @@
Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
Subproject commit 0bcdf81d0444218b4dedaefa5c546d42f36b8130

View File

@ -38,24 +38,10 @@
],
"source":[
    "!apt install python3.10-venv\n",
    "!apt install python3.8-venv\n",
    "!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
    "%cd /content/ai-voice-cloning\n",
    "# get local dependencies\n",
    "!git submodule init\n",
    "!git submodule update --remote\n",
    "# setup venv\n",
    "!python3 -m venv venv\n",
    "!source ./venv/bin/activate\n",
    "!python3 -m pip install --upgrade pip # just to be safe\n",
    "# CUDA\n",
    "!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
    "# install requirements\n",
    "!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
    "!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
    "!python3 -m pip install -r ./modules/dlas/requirements.txt # install DLAS requirements, last, because whisperx will break a dependency here\n",
    "!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
    "!python3 -m pip install -r ./requirements.txt # install local requirements"
    "!./setup-cuda.sh"
]
},
{
@ -129,8 +115,7 @@
"cell_type":"code",
"source":[
"%cd /content/ai-voice-cloning/\n",
"!source ./venv/bin/activate\n",
"!python3 ./src/main.py --share"
"!./start.sh --share"
],
"metadata":{
"id":"QRA8jF3cF-YJ"

View File

@ -1,17 +1,9 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch>=2.1.0
torchvision
torchaudio
git+https://github.com/openai/whisper.git
openai-whisper
more-itertools
ffmpeg-python
gradio<=3.23.0
gradio
music-tag
voicefixer
psutil
phonemizer
pydantic==1.10.11
websockets
beartype==0.15.0
pykakasi

View File

@ -4,7 +4,7 @@ git submodule update --remote
python -m venv venv
call .\venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
python -m pip install -r .\modules\tortoise-tts\requirements.txt
python -m pip install -e .\modules\tortoise-tts\
python -m pip install -r .\modules\dlas\requirements.txt

View File

@ -7,7 +7,7 @@ python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install --upgrade pip # just to be safe
# CUDA
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
# install requirements
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe

View File

@ -4,7 +4,7 @@ git submodule update --remote
python -m venv venv
call .\venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install torch torchvision torchaudio torch-directml
python -m pip install torch==1.13.1 torchvision torchaudio torch-directml
python -m pip install -r .\modules\tortoise-tts\requirements.txt
python -m pip install -e .\modules\tortoise-tts\
python -m pip install -r .\modules\dlas\requirements.txt

View File

@ -1,4 +0,0 @@
#!/bin/bash
git submodule init
git submodule update --remote
docker build -t ai-voice-cloning .

View File

@ -7,7 +7,7 @@ python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install --upgrade pip # just to be safe
# ROCM
pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
# install requirements
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe

View File

@ -1,84 +0,0 @@
import asyncio
import json

from threading import Thread
from websockets.server import serve

from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer

# this is a not so nice workaround to set values to None if their string value is "None"
def replaceNoneStringWithNone(message):
    ignore_fields = ['text'] # list of fields which CAN have "None" as a literal string value
    for member in message:
        if message[member] == 'None' and member not in ignore_fields:
            message[member] = None
    return message

async def _handle_generate(websocket, message):
    # update args parameters which control the model settings
    if message.get('autoregressive_model'):
        update_autoregressive_model(message['autoregressive_model'])
    if message.get('diffusion_model'):
        update_diffusion_model(message['diffusion_model'])
    if message.get('tokenizer_json'):
        update_tokenizer(message['tokenizer_json'])
    if message.get('sample_batch_size'):
        global args
        args.sample_batch_size = message['sample_batch_size']

    message['result'] = generate(**message)
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))

async def _handle_get_autoregressive_models(websocket, message):
    message['result'] = get_autoregressive_models()
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))

async def _handle_get_voice_list(websocket, message):
    message['result'] = get_voice_list()
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))

async def _handle_message(websocket, message):
    message = replaceNoneStringWithNone(message)

    if message.get('action') and message['action'] == 'generate':
        await _handle_generate(websocket, message)
    elif message.get('action') and message['action'] == 'get_voices':
        await _handle_get_voice_list(websocket, message)
    elif message.get('action') and message['action'] == 'get_autoregressive_models':
        await _handle_get_autoregressive_models(websocket, message)
    else:
        print("websocket: unhandled message: " + json.dumps(message))

async def _handle_connection(websocket, path):
    print("websocket: client connected")

    async for message in websocket:
        try:
            await _handle_message(websocket, json.loads(message))
        except ValueError:
            print("websocket: malformed json received")

async def _run(host: str, port: int):
    print(f"websocket: server started on ws://{host}:{port}")

    async with serve(_handle_connection, host, port, ping_interval=None):
        await asyncio.Future()  # run forever

def _run_server(listen_address: str, port: int):
    asyncio.run(_run(host=listen_address, port=port))

def start_websocket_server(listen_address: str, port: int):
    Thread(target=_run_server, args=[listen_address, port], daemon=True).start()
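No matching client appears in this diff, so here is a minimal sketch of one, built from the handlers above. The address and port are placeholders (the real ones come from the websocket listen args wired up in src/main.py), and the generate payload keys are assumed to line up with the keyword arguments of utils.generate(), since the server calls generate(**message):

import asyncio
import json

from websockets import connect  # the same `websockets` package the server uses

async def main():
    # 127.0.0.1:8069 is a placeholder; use the configured listen address/port
    async with connect("ws://127.0.0.1:8069") as ws:
        # list the available voices
        await ws.send(json.dumps({"action": "get_voices"}))
        print(json.loads(await ws.recv())["result"])

        # request a generation; payload keys beyond "action" are assumptions
        await ws.send(json.dumps({"action": "generate", "text": "Hello there.", "voice": "random"}))
        print(json.loads(await ws.recv())["result"])

asyncio.run(main())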

View File

@ -1,66 +0,0 @@
import os
import argparse

if 'TORTOISE_MODELS_DIR' not in os.environ:
    os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/'))

if 'TRANSFORMERS_CACHE' not in os.environ:
    os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/'))

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

from utils import *

if __name__ == "__main__":
    args = setup_args(cli=True)
    default_arguments = import_generate_settings()

    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument("--text", default=default_arguments['text'])
    parser.add_argument("--delimiter", default=default_arguments['delimiter'])
    parser.add_argument("--emotion", default=default_arguments['emotion'])
    parser.add_argument("--prompt", default=default_arguments['prompt'])
    parser.add_argument("--voice", default=default_arguments['voice'])
    parser.add_argument("--mic_audio", default=default_arguments['mic_audio'])
    parser.add_argument("--voice_latents_chunks", default=default_arguments['voice_latents_chunks'])
    parser.add_argument("--candidates", default=default_arguments['candidates'])
    parser.add_argument("--seed", default=default_arguments['seed'])
    parser.add_argument("--num_autoregressive_samples", default=default_arguments['num_autoregressive_samples'])
    parser.add_argument("--diffusion_iterations", default=default_arguments['diffusion_iterations'])
    parser.add_argument("--temperature", default=default_arguments['temperature'])
    parser.add_argument("--diffusion_sampler", default=default_arguments['diffusion_sampler'])
    parser.add_argument("--breathing_room", default=default_arguments['breathing_room'])
    parser.add_argument("--cvvp_weight", default=default_arguments['cvvp_weight'])
    parser.add_argument("--top_p", default=default_arguments['top_p'])
    parser.add_argument("--diffusion_temperature", default=default_arguments['diffusion_temperature'])
    parser.add_argument("--length_penalty", default=default_arguments['length_penalty'])
    parser.add_argument("--repetition_penalty", default=default_arguments['repetition_penalty'])
    parser.add_argument("--cond_free_k", default=default_arguments['cond_free_k'])
    args, unknown = parser.parse_known_args()

    kwargs = {
        'text': args.text,
        'delimiter': args.delimiter,
        'emotion': args.emotion,
        'prompt': args.prompt,
        'voice': args.voice,
        'mic_audio': args.mic_audio,
        'voice_latents_chunks': args.voice_latents_chunks,
        'candidates': args.candidates,
        'seed': args.seed,
        'num_autoregressive_samples': args.num_autoregressive_samples,
        'diffusion_iterations': args.diffusion_iterations,
        'temperature': args.temperature,
        'diffusion_sampler': args.diffusion_sampler,
        'breathing_room': args.breathing_room,
        'cvvp_weight': args.cvvp_weight,
        'top_p': args.top_p,
        'diffusion_temperature': args.diffusion_temperature,
        'length_penalty': args.length_penalty,
        'repetition_penalty': args.repetition_penalty,
        'cond_free_k': args.cond_free_k,
        'experimentals': default_arguments['experimentals'],
    }

    tts = load_tts()
    generate(**kwargs)
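A hedged sketch of driving this CLI for batch generation, assuming it lives at src/cli.py as elsewhere in this diff and runs from the repo root with the venv active; only the --text and --voice flags from the argparse setup above are exercised, and "random" is an example voice name:

import subprocess

# generate one clip per line of text; unset flags fall back to the
# defaults pulled from import_generate_settings() above
lines = ["First test sentence.", "Second test sentence."]
for text in lines:
    subprocess.run(
        ["python3", "./src/cli.py", "--text", text, "--voice", "random"],
        check=True,  # raise if a generation fails
    )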

View File

@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from utils import *
from webui import *
from api.websocket_server import start_websocket_server

if __name__ == "__main__":
    args = setup_args()
@ -26,9 +23,6 @@ if __name__ == "__main__":
    if not args.defer_tts_load:
        tts = load_tts()
    if args.websocket_enabled:
        start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
    webui.block_thread()
elif __name__ == "main":
    from fastapi import FastAPI
@ -44,4 +38,3 @@ elif __name__ == "main":
    if not args.defer_tts_load:
        tts = load_tts()

File diff suppressed because it is too large.

View File

@ -78,8 +78,6 @@ def generate_proxy(
    repetition_penalty,
    cond_free_k,
    experimentals,
    voice_latents_original_ar,
    voice_latents_original_diffusion,
    progress=gr.Progress(track_tqdm=True)
):
    kwargs = locals()
@ -168,8 +166,8 @@ def reset_generate_settings_proxy():
    return tuple(res)

def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)):
    compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion )
def compute_latents_proxy(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm=True)):
    compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, progress=progress )
    return voice
@ -198,46 +196,7 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
def slice_dataset_proxy( voice, trim_silence, start_offset, end_offset, progress=gr.Progress(track_tqdm=True) ):
    return slice_dataset( voice, trim_silence=trim_silence, start_offset=start_offset, end_offset=end_offset, results=None, progress=progress )

def diarize_dataset( voice, progress=gr.Progress(track_tqdm=True) ):
    from pyannote.audio import Pipeline
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=args.hf_token)

    messages = []
    files = get_voice(voice, load_latents=False)
    for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
        diarization = pipeline(file)
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            message = f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}"
            print(message)
            messages.append(message)

    return "\n".join(messages)

def prepare_all_datasets( language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ):
    kwargs = locals()

    messages = []
    voices = get_voice_list()

    for voice in voices:
        print("Processing:", voice)
        message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
        messages.append(message)

    if slice_audio:
        for voice in voices:
            print("Processing:", voice)
            message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress )
            messages.append(message)

    for voice in voices:
        print("Processing:", voice)
        message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress )
        messages.append(message)

    return "\n".join(messages)

def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ):
def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=False) ):
    messages = []

    message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
@ -385,8 +344,6 @@ def setup_gradio():
GENERATE_SETTINGS["voice"] = gr.Dropdown(choices=voice_list_with_defaults, label="Voice", type="value", value=voice_list_with_defaults[0]) # it'd be very cash money if gradio was able to default to the first value in the list without this shit
GENERATE_SETTINGS["mic_audio"] = gr.Audio( label="Microphone Source", source="microphone", type="filepath", visible=False )
GENERATE_SETTINGS["voice_latents_chunks"] = gr.Number(label="Voice Chunks", precision=0, value=0, visible=args.tts_backend=="tortoise")
GENERATE_SETTINGS["voice_latents_original_ar"] = gr.Checkbox(label="Use Original Latents Method (AR)", visible=args.tts_backend=="tortoise")
GENERATE_SETTINGS["voice_latents_original_diffusion"] = gr.Checkbox(label="Use Original Latents Method (Diffusion)", visible=args.tts_backend=="tortoise")
with gr.Row():
refresh_voices = gr.Button(value="Refresh Voice List")
recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents")
@ -402,16 +359,15 @@ def setup_gradio():
    outputs=GENERATE_SETTINGS["mic_audio"],
)

with gr.Column():
    preset = None
    GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise")
    GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise")
    GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed")
    preset = gr.Radio( ["Ultra Fast", "Fast", "Standard", "High Quality"], label="Preset", type="value", value="Ultra Fast", visible=args.tts_backend=="tortoise" )
    preset = gr.Radio( ["Ultra Fast", "Fast", "Standard", "High Quality"], label="Preset", type="value", value="Ultra Fast" )
    GENERATE_SETTINGS["num_autoregressive_samples"] = gr.Slider(value=16, minimum=2, maximum=2048 if args.tts_backend=="vall-e" else 512, step=1, label="Samples", visible=args.tts_backend!="bark")
    GENERATE_SETTINGS["num_autoregressive_samples"] = gr.Slider(value=16, minimum=2, maximum=512, step=1, label="Samples")
    GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise")
    GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature")
    GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")

    show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise")
    reset_generate_settings_button = gr.Button(value="Reset to Default")
@ -512,8 +468,6 @@ def setup_gradio():
DATASET_SETTINGS['slice_end_offset'] = gr.Number(label="Slice End Offset", value=0)

transcribe_button = gr.Button(value="Transcribe and Process")
transcribe_all_button = gr.Button(value="Transcribe All")
diarize_button = gr.Button(value="Diarize", visible=False)

with gr.Row():
    slice_dataset_button = gr.Button(value="(Re)Slice Audio")
@ -526,7 +480,7 @@ def setup_gradio():
dataset_settings = list(DATASET_SETTINGS.values())
with gr.Column():
    prepare_dataset_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)

with gr.Tab("Generate Configuration", visible=args.tts_backend != "bark"):
with gr.Tab("Generate Configuration"):
    with gr.Row():
        with gr.Column():
            TRAINING_SETTINGS["epochs"] = gr.Number(label="Epochs", value=500, precision=0)
@ -578,7 +532,7 @@ def setup_gradio():
with gr.Row():
    training_optimize_configuration = gr.Button(value="Validate Training Configuration")
    training_save_configuration = gr.Button(value="Save Training Configuration")

with gr.Tab("Run Training", visible=args.tts_backend != "bark"):
with gr.Tab("Run Training"):
    with gr.Row():
        with gr.Column():
            training_configs = gr.Dropdown(label="Training Configuration", choices=training_list, value=training_list[0] if len(training_list) else "")
@ -589,10 +543,8 @@ def setup_gradio():
keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
with gr.Row():
    training_graph_x_min = gr.Number(label="X Min", precision=0, value=0)
    training_graph_x_max = gr.Number(label="X Max", precision=0, value=0)
    training_graph_y_min = gr.Number(label="Y Min", precision=0, value=0)
    training_graph_y_max = gr.Number(label="Y Max", precision=0, value=0)
    training_graph_x_lim = gr.Number(label="X Limit", precision=0, value=0)
    training_graph_y_lim = gr.Number(label="Y Limit", precision=0, value=0)

with gr.Row():
    start_training_button = gr.Button(value="Train")
@ -602,7 +554,7 @@ def setup_gradio():
with gr.Column():
    training_loss_graph = gr.LinePlot(label="Training Metrics",
        x="it", # x="epoch",
        x="epoch",
        y="value",
        title="Loss Metrics",
        color="type",
@ -611,7 +563,7 @@ def setup_gradio():
        height=350,
    )
    training_lr_graph = gr.LinePlot(label="Training Metrics",
        x="it", # x="epoch",
        x="epoch",
        y="value",
        title="Learning Rate",
        color="type",
@ -620,14 +572,14 @@ def setup_gradio():
        height=350,
    )
    training_grad_norm_graph = gr.LinePlot(label="Training Metrics",
        x="it", # x="epoch",
        x="epoch",
        y="value",
        title="Gradient Normals",
        color="type",
        tooltip=['epoch', 'it', 'value', 'type'],
        width=500,
        height=350,
        visible=False, # args.tts_backend=="vall-e"
        visible=args.tts_backend=="vall-e"
    )

    view_losses = gr.Button(value="View Losses")
@ -643,7 +595,6 @@ def setup_gradio():
EXEC_SETTINGS['embed_output_metadata'] = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata)
EXEC_SETTINGS['latents_lean_and_mean'] = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean)
EXEC_SETTINGS['voice_fixer'] = gr.Checkbox(label="Use Voice Fixer on Generated Output", value=args.voice_fixer)
EXEC_SETTINGS['use_deepspeed'] = gr.Checkbox(label="Use DeepSpeed for Speed Bump.", value=args.use_deepspeed)
EXEC_SETTINGS['voice_fixer_use_cuda'] = gr.Checkbox(label="Use CUDA for Voice Fixer", value=args.voice_fixer_use_cuda)
EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents)
EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load)
@ -659,9 +610,11 @@ def setup_gradio():
EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder)
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])

if args.tts_backend=="vall-e":
    with gr.Column():
        EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0])
with gr.Column(visible=args.tts_backend=="vall-e"):
    default_valle_model_choice = ""
    if len(valle_models):
        default_valle_model_choice = valle_models[0]
    EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else default_valle_model_choice)

with gr.Column(visible=args.tts_backend=="tortoise"):
    EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=["auto"] + autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else "auto")
@ -722,8 +675,7 @@ def setup_gradio():
EXEC_SETTINGS['autoregressive_model'].change(
    fn=update_autoregressive_model,
    inputs=EXEC_SETTINGS['autoregressive_model'],
    outputs=None,
    api_name="set_autoregressive_model"
    outputs=None
)

EXEC_SETTINGS['vocoder_model'].change(
@ -772,21 +724,18 @@ def setup_gradio():
    inputs=show_experimental_settings,
    outputs=experimental_column
)

if preset:
    preset.change(fn=update_presets,
        inputs=preset,
        outputs=[
            GENERATE_SETTINGS['num_autoregressive_samples'],
            GENERATE_SETTINGS['diffusion_iterations'],
        ],
    )
preset.change(fn=update_presets,
    inputs=preset,
    outputs=[
        GENERATE_SETTINGS['num_autoregressive_samples'],
        GENERATE_SETTINGS['diffusion_iterations'],
    ],
)

recompute_voice_latents.click(compute_latents_proxy,
    inputs=[
        GENERATE_SETTINGS['voice'],
        GENERATE_SETTINGS['voice_latents_chunks'],
        GENERATE_SETTINGS['voice_latents_original_ar'],
        GENERATE_SETTINGS['voice_latents_original_diffusion'],
    ],
    outputs=GENERATE_SETTINGS['voice'],
)
@ -870,10 +819,8 @@ def setup_gradio():
training_output.change(
    fn=update_training_dataplot,
    inputs=[
        training_graph_x_min,
        training_graph_x_max,
        training_graph_y_min,
        training_graph_y_max,
        training_graph_x_lim,
        training_graph_y_lim,
    ],
    outputs=[
        training_loss_graph,
@ -886,10 +833,8 @@ def setup_gradio():
view_losses.click(
    fn=update_training_dataplot,
    inputs=[
        training_graph_x_min,
        training_graph_x_max,
        training_graph_y_min,
        training_graph_y_max,
        training_graph_x_lim,
        training_graph_y_lim,
        training_configs,
    ],
    outputs=[
@ -914,16 +859,6 @@ def setup_gradio():
    inputs=dataset_settings,
    outputs=prepare_dataset_output #console_output
)
transcribe_all_button.click(
    prepare_all_datasets,
    inputs=dataset_settings[1:],
    outputs=prepare_dataset_output #console_output
)
diarize_button.click(
    diarize_dataset,
    inputs=dataset_settings[0],
    outputs=prepare_dataset_output #console_output
)
prepare_dataset_button.click(
    prepare_dataset,
    inputs=[

View File

@ -1,14 +0,0 @@
#!/bin/bash
CMD="python3 ./src/main.py $@"
# CMD="bash"
CPATH="/home/user/ai-voice-cloning"

docker run --rm --gpus all \
    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
    --workdir $CPATH \
    --user "$(id -u):$(id -g)" \
    --net host \
    -it ai-voice-cloning $CMD

View File

@ -1,15 +0,0 @@
#!/bin/bash
CMD="python3 ./src/train.py --yaml $1"
# ipc host is one way to increase the shared memory for the container
# more info here https://github.com/pytorch/pytorch#docker-image
CPATH="/home/user/ai-voice-cloning"

docker run --rm --gpus all \
    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
    --mount "type=bind,src=$PWD/src,dst=$CPATH/src" \
    --workdir $CPATH \
    --ipc host \
    --user "$(id -u):$(id -g)" \
    -it ai-voice-cloning $CMD