Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 2db2cdc1ff | | |
| | 38601033af | | |
| | 89d7642a0f | | |
| | a961141fe6 | | |
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
{
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Python: Debug Main.py",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceFolder}/src/main.py",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"env": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
"preLaunchTask": "Setup Environment",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
12
.vscode/tasks.json
vendored
Normal file
12
.vscode/tasks.json
vendored
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
{
|
||||||
|
"version": "2.0.0",
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"label": "Setup Environment",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "python",
|
||||||
|
"args": ["activate_env.py"],
|
||||||
|
"problemMatcher": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
9
activate_env.py
Normal file
9
activate_env.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
import subprocess
|
||||||
|
import os
|
||||||
|
|
||||||
|
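# Run the virtual environment's activation script. Note: it executes in a child shell, so the activation does not carry over to this Python process or to anything launched after it.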
|
||||||
|
subprocess.run([r'.\venv\Scripts\activate.bat'], shell=True)
|
||||||
|
|
||||||
|
# Set environment variables. Note: changes to os.environ affect only this Python process and any child processes it starts.
|
||||||
|
os.environ['PATH'] = r'.\bin;' + os.environ['PATH']
|
||||||
|
os.environ['PYTHONUTF8'] = '1'
|
|
@ -38,10 +38,24 @@
|
||||||
|
|
||||||
],
|
],
|
||||||
"source":[
|
"source":[
|
||||||
"!apt install python3.8-venv\n",
|
"!apt install python3.10-venv\n",
|
||||||
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
||||||
"%cd /content/ai-voice-cloning\n",
|
"%cd /content/ai-voice-cloning\n",
|
||||||
"!./setup-cuda.sh"
|
"# get local dependencies\n",
|
||||||
|
"!git submodule init\n",
|
||||||
|
"!git submodule update --remote\n",
|
||||||
|
"# setup venv\n",
|
||||||
|
"!python3 -m venv venv\n",
|
||||||
|
"!source ./venv/bin/activate\n",
|
||||||
|
"!python3 -m pip install --upgrade pip # just to be safe\n",
|
||||||
|
"# CUDA\n",
|
||||||
|
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
|
||||||
|
"# install requirements\n",
|
||||||
|
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
|
||||||
|
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
|
||||||
|
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
|
||||||
|
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
|
||||||
|
"!python3 -m pip install -r ./requirements.txt # install local requirements"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -115,7 +129,8 @@
|
||||||
"cell_type":"code",
|
"cell_type":"code",
|
||||||
"source":[
|
"source":[
|
||||||
"%cd /content/ai-voice-cloning/\n",
|
"%cd /content/ai-voice-cloning/\n",
|
||||||
"!./start.sh --share"
|
"!source ./venv/bin/activate\n",
|
||||||
|
"!python3 ./src/main.py --share"
|
||||||
],
|
],
|
||||||
"metadata":{
|
"metadata":{
|
||||||
"id":"QRA8jF3cF-YJ"
|
"id":"QRA8jF3cF-YJ"
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
git+https://github.com/openai/whisper.git
|
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
|
torch>=2.1.0
|
||||||
|
torchvision
|
||||||
|
torchaudio
|
||||||
|
|
||||||
|
openai-whisper
|
||||||
more-itertools
|
more-itertools
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
gradio<=3.23.0
|
gradio<=3.23.0
|
||||||
|
|
46
src/webui.py
46
src/webui.py
|
@ -92,7 +92,7 @@ def generate_proxy(
|
||||||
unload_tts()
|
unload_tts()
|
||||||
|
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
return (
|
return (
|
||||||
outputs[0],
|
outputs[0],
|
||||||
gr.update(value=sample, visible=sample is not None),
|
gr.update(value=sample, visible=sample is not None),
|
||||||
|
@ -131,7 +131,7 @@ def history_view_results( voice ):
|
||||||
metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False)
|
metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False)
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
values = []
|
values = []
|
||||||
for k in HISTORY_HEADERS:
|
for k in HISTORY_HEADERS:
|
||||||
v = file
|
v = file
|
||||||
|
@ -185,7 +185,7 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
with open(f'{outdir}/cond_latents.pth', 'wb') as f:
|
with open(f'{outdir}/cond_latents.pth', 'wb') as f:
|
||||||
f.write(latents)
|
f.write(latents)
|
||||||
|
|
||||||
latents = f'{outdir}/cond_latents.pth'
|
latents = f'{outdir}/cond_latents.pth'
|
||||||
|
|
||||||
return (
|
return (
|
||||||
|
@ -229,7 +229,7 @@ def prepare_all_datasets( language, validation_text_length, validation_audio_len
|
||||||
print("Processing:", voice)
|
print("Processing:", voice)
|
||||||
message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress )
|
message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress )
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
|
|
||||||
for voice in voices:
|
for voice in voices:
|
||||||
print("Processing:", voice)
|
print("Processing:", voice)
|
||||||
message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress )
|
message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress )
|
||||||
|
@ -239,7 +239,7 @@ def prepare_all_datasets( language, validation_text_length, validation_audio_len
|
||||||
|
|
||||||
def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ):
|
def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ):
|
||||||
messages = []
|
messages = []
|
||||||
|
|
||||||
message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
|
message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
|
|
||||||
|
@ -355,7 +355,7 @@ def setup_gradio():
|
||||||
voice_list_with_defaults = get_voice_list(append_defaults=True)
|
voice_list_with_defaults = get_voice_list(append_defaults=True)
|
||||||
voice_list = get_voice_list()
|
voice_list = get_voice_list()
|
||||||
result_voices = get_voice_list(args.results_folder)
|
result_voices = get_voice_list(args.results_folder)
|
||||||
|
|
||||||
valle_models = get_valle_models()
|
valle_models = get_valle_models()
|
||||||
|
|
||||||
autoregressive_models = get_autoregressive_models()
|
autoregressive_models = get_autoregressive_models()
|
||||||
|
@ -371,7 +371,9 @@ def setup_gradio():
|
||||||
arg = GENERATE_SETTINGS_ARGS[i]
|
arg = GENERATE_SETTINGS_ARGS[i]
|
||||||
GENERATE_SETTINGS[arg] = None
|
GENERATE_SETTINGS[arg] = None
|
||||||
|
|
||||||
with gr.Blocks() as ui:
|
with gr.Blocks(theme="freddyaboulton/dracula_revamped", css="footer { display: none!important}", title="Voice Clonning WebUI") as ui:
|
||||||
|
gr.Markdown("## 🤗🎙️ Voice clonning ")
|
||||||
|
gr.Markdown("Ai Voice clonning <a href='https://git.ecker.tech/terminator/ai-voice-cloning-terminator'>based on Tortoise</a>")
|
||||||
with gr.Tab("Generate"):
|
with gr.Tab("Generate"):
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
|
@ -402,7 +404,7 @@ def setup_gradio():
|
||||||
outputs=GENERATE_SETTINGS["mic_audio"],
|
outputs=GENERATE_SETTINGS["mic_audio"],
|
||||||
)
|
)
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
preset = None
|
preset = None
|
||||||
GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise")
|
GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise")
|
||||||
GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise")
|
GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise")
|
||||||
|
|
||||||
|
@ -412,7 +414,7 @@ def setup_gradio():
|
||||||
GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise")
|
GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise")
|
||||||
|
|
||||||
GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature")
|
GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature")
|
||||||
|
|
||||||
show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise")
|
show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise")
|
||||||
reset_generate_settings_button = gr.Button(value="Reset to Default")
|
reset_generate_settings_button = gr.Button(value="Reset to Default")
|
||||||
with gr.Column(visible=False) as col:
|
with gr.Column(visible=False) as col:
|
||||||
|
@ -514,7 +516,7 @@ def setup_gradio():
|
||||||
transcribe_button = gr.Button(value="Transcribe and Process")
|
transcribe_button = gr.Button(value="Transcribe and Process")
|
||||||
transcribe_all_button = gr.Button(value="Transcribe All")
|
transcribe_all_button = gr.Button(value="Transcribe All")
|
||||||
diarize_button = gr.Button(value="Diarize", visible=False)
|
diarize_button = gr.Button(value="Diarize", visible=False)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
slice_dataset_button = gr.Button(value="(Re)Slice Audio")
|
slice_dataset_button = gr.Button(value="(Re)Slice Audio")
|
||||||
prepare_dataset_button = gr.Button(value="(Re)Create Dataset")
|
prepare_dataset_button = gr.Button(value="(Re)Create Dataset")
|
||||||
|
@ -534,7 +536,7 @@ def setup_gradio():
|
||||||
TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6)
|
TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6)
|
||||||
TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1)
|
TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1)
|
||||||
TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1)
|
TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1)
|
||||||
|
|
||||||
with gr.Row(visible=args.tts_backend=="tortoise"):
|
with gr.Row(visible=args.tts_backend=="tortoise"):
|
||||||
lr_schemes = list(LEARNING_RATE_SCHEMES.keys())
|
lr_schemes = list(LEARNING_RATE_SCHEMES.keys())
|
||||||
TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value")
|
TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value")
|
||||||
|
@ -567,7 +569,7 @@ def setup_gradio():
|
||||||
|
|
||||||
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" )
|
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" )
|
||||||
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise")
|
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise")
|
||||||
|
|
||||||
TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" )
|
TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" )
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
|
@ -585,9 +587,9 @@ def setup_gradio():
|
||||||
refresh_configs = gr.Button(value="Refresh Configurations")
|
refresh_configs = gr.Button(value="Refresh Configurations")
|
||||||
training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
|
training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
|
||||||
verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
|
verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
|
||||||
|
|
||||||
keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
|
keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
training_graph_x_min = gr.Number(label="X Min", precision=0, value=0)
|
training_graph_x_min = gr.Number(label="X Min", precision=0, value=0)
|
||||||
training_graph_x_max = gr.Number(label="X Max", precision=0, value=0)
|
training_graph_x_max = gr.Number(label="X Max", precision=0, value=0)
|
||||||
|
@ -598,8 +600,8 @@ def setup_gradio():
|
||||||
start_training_button = gr.Button(value="Train")
|
start_training_button = gr.Button(value="Train")
|
||||||
stop_training_button = gr.Button(value="Stop")
|
stop_training_button = gr.Button(value="Stop")
|
||||||
reconnect_training_button = gr.Button(value="Reconnect")
|
reconnect_training_button = gr.Button(value="Reconnect")
|
||||||
|
|
||||||
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
training_loss_graph = gr.LinePlot(label="Training Metrics",
|
training_loss_graph = gr.LinePlot(label="Training Metrics",
|
||||||
x="it", # x="epoch",
|
x="it", # x="epoch",
|
||||||
|
@ -658,7 +660,7 @@ def setup_gradio():
|
||||||
|
|
||||||
EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder)
|
EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder)
|
||||||
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
|
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
|
||||||
|
|
||||||
if args.tts_backend=="vall-e":
|
if args.tts_backend=="vall-e":
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0])
|
EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0])
|
||||||
|
@ -668,7 +670,7 @@ def setup_gradio():
|
||||||
EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0])
|
EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0])
|
||||||
EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
|
EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
|
||||||
EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0])
|
EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0])
|
||||||
|
|
||||||
EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
|
EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
|
||||||
EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
|
EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
|
||||||
|
|
||||||
|
@ -718,7 +720,7 @@ def setup_gradio():
|
||||||
exec_inputs = list(EXEC_SETTINGS.values())
|
exec_inputs = list(EXEC_SETTINGS.values())
|
||||||
for k in EXEC_SETTINGS:
|
for k in EXEC_SETTINGS:
|
||||||
EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
|
EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
|
||||||
|
|
||||||
EXEC_SETTINGS['autoregressive_model'].change(
|
EXEC_SETTINGS['autoregressive_model'].change(
|
||||||
fn=update_autoregressive_model,
|
fn=update_autoregressive_model,
|
||||||
inputs=EXEC_SETTINGS['autoregressive_model'],
|
inputs=EXEC_SETTINGS['autoregressive_model'],
|
||||||
|
@ -790,7 +792,7 @@ def setup_gradio():
|
||||||
],
|
],
|
||||||
outputs=GENERATE_SETTINGS['voice'],
|
outputs=GENERATE_SETTINGS['voice'],
|
||||||
)
|
)
|
||||||
|
|
||||||
GENERATE_SETTINGS['emotion'].change(
|
GENERATE_SETTINGS['emotion'].change(
|
||||||
fn=lambda value: gr.update(visible=value == "Custom"),
|
fn=lambda value: gr.update(visible=value == "Custom"),
|
||||||
inputs=GENERATE_SETTINGS['emotion'],
|
inputs=GENERATE_SETTINGS['emotion'],
|
||||||
|
@ -944,7 +946,7 @@ def setup_gradio():
|
||||||
],
|
],
|
||||||
outputs=prepare_dataset_output
|
outputs=prepare_dataset_output
|
||||||
)
|
)
|
||||||
|
|
||||||
training_refresh_dataset.click(
|
training_refresh_dataset.click(
|
||||||
lambda: gr.update(choices=get_dataset_list()),
|
lambda: gr.update(choices=get_dataset_list()),
|
||||||
inputs=None,
|
inputs=None,
|
||||||
|
@ -966,7 +968,7 @@ def setup_gradio():
|
||||||
|
|
||||||
if os.path.isfile('./config/generate.json'):
|
if os.path.isfile('./config/generate.json'):
|
||||||
ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings)
|
ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings)
|
||||||
|
|
||||||
if args.check_for_updates:
|
if args.check_for_updates:
|
||||||
ui.load(check_for_updates)
|
ui.load(check_for_updates)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user