Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
2db2cdc1ff | |||
38601033af | |||
89d7642a0f | |||
a961141fe6 |
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python: Debug Main.py",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/src/main.py",
|
||||
"console": "integratedTerminal",
|
||||
"env": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
},
|
||||
"preLaunchTask": "Setup Environment"
|
||||
}
|
||||
]
|
||||
}
|
12
.vscode/tasks.json
vendored
Normal file
12
.vscode/tasks.json
vendored
Normal file
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"version": "2.0.0",
|
||||
"tasks": [
|
||||
{
|
||||
"label": "Setup Environment",
|
||||
"type": "shell",
|
||||
"command": "python",
|
||||
"args": ["activate_env.py"],
|
||||
"problemMatcher": []
|
||||
}
|
||||
]
|
||||
}
|
9
activate_env.py
Normal file
9
activate_env.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
"""Pre-launch helper: run the venv activation script and set environment variables.

Intended to be executed as a script (``python activate_env.py``).

NOTE(review): the activation batch file is executed in a child shell, and the
variables assigned to ``os.environ`` below live only in this Python process
(and any children it starts). Neither survives after this script exits, so
confirm that running it as a separate task has the intended effect on the
program launched afterwards.
"""
import os
import subprocess
import sys


def configure_environment(env=None):
    """Prepend the local ``bin`` folder to ``PATH`` and enable UTF-8 mode.

    Args:
        env: Mutable mapping to update. Defaults to ``os.environ``.

    Returns:
        The mapping that was updated.
    """
    if env is None:
        env = os.environ
    # Use .get() so a missing PATH does not abort the script with a KeyError.
    env['PATH'] = r'.\bin;' + env.get('PATH', '')
    env['PYTHONUTF8'] = '1'
    return env


# Activate the virtual environment (Windows batch script, run through the shell).
activation = subprocess.run([r'.\venv\Scripts\activate.bat'], shell=True)
if activation.returncode != 0:
    # Best effort: report the failure instead of ignoring it, but keep going
    # so the environment variables below are still configured.
    print(
        f"warning: venv activation exited with code {activation.returncode}",
        file=sys.stderr,
    )

# Configure the environment variables.
configure_environment()
|
|
@ -38,10 +38,24 @@
|
|||
|
||||
],
|
||||
"source":[
|
||||
"!apt install python3.8-venv\n",
|
||||
"!apt install python3.10-venv\n",
|
||||
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
||||
"%cd /content/ai-voice-cloning\n",
|
||||
"!./setup-cuda.sh"
|
||||
"# get local dependencies\n",
|
||||
"!git submodule init\n",
|
||||
"!git submodule update --remote\n",
|
||||
"# setup venv\n",
|
||||
"!python3 -m venv venv\n",
|
||||
"!source ./venv/bin/activate\n",
|
||||
"!python3 -m pip install --upgrade pip # just to be safe\n",
|
||||
"# CUDA\n",
|
||||
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
|
||||
"# install requirements\n",
|
||||
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
|
||||
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
|
||||
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
|
||||
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
|
||||
"!python3 -m pip install -r ./requirements.txt # install local requirements"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -115,7 +129,8 @@
|
|||
"cell_type":"code",
|
||||
"source":[
|
||||
"%cd /content/ai-voice-cloning/\n",
|
||||
"!./start.sh --share"
|
||||
"!source ./venv/bin/activate\n",
|
||||
"!python3 ./src/main.py --share"
|
||||
],
|
||||
"metadata":{
|
||||
"id":"QRA8jF3cF-YJ"
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
git+https://github.com/openai/whisper.git
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch>=2.1.0
|
||||
torchvision
|
||||
torchaudio
|
||||
|
||||
openai-whisper
|
||||
more-itertools
|
||||
ffmpeg-python
|
||||
gradio<=3.23.0
|
||||
|
|
46
src/webui.py
46
src/webui.py
|
@ -92,7 +92,7 @@ def generate_proxy(
|
|||
unload_tts()
|
||||
|
||||
raise e
|
||||
|
||||
|
||||
return (
|
||||
outputs[0],
|
||||
gr.update(value=sample, visible=sample is not None),
|
||||
|
@ -131,7 +131,7 @@ def history_view_results( voice ):
|
|||
metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False)
|
||||
if metadata is None:
|
||||
continue
|
||||
|
||||
|
||||
values = []
|
||||
for k in HISTORY_HEADERS:
|
||||
v = file
|
||||
|
@ -185,7 +185,7 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
|
|||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(f'{outdir}/cond_latents.pth', 'wb') as f:
|
||||
f.write(latents)
|
||||
|
||||
|
||||
latents = f'{outdir}/cond_latents.pth'
|
||||
|
||||
return (
|
||||
|
@ -229,7 +229,7 @@ def prepare_all_datasets( language, validation_text_length, validation_audio_len
|
|||
print("Processing:", voice)
|
||||
message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress )
|
||||
messages.append(message)
|
||||
|
||||
|
||||
for voice in voices:
|
||||
print("Processing:", voice)
|
||||
message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress )
|
||||
|
@ -239,7 +239,7 @@ def prepare_all_datasets( language, validation_text_length, validation_audio_len
|
|||
|
||||
def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ):
|
||||
messages = []
|
||||
|
||||
|
||||
message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
|
||||
messages.append(message)
|
||||
|
||||
|
@ -355,7 +355,7 @@ def setup_gradio():
|
|||
voice_list_with_defaults = get_voice_list(append_defaults=True)
|
||||
voice_list = get_voice_list()
|
||||
result_voices = get_voice_list(args.results_folder)
|
||||
|
||||
|
||||
valle_models = get_valle_models()
|
||||
|
||||
autoregressive_models = get_autoregressive_models()
|
||||
|
@ -371,7 +371,9 @@ def setup_gradio():
|
|||
arg = GENERATE_SETTINGS_ARGS[i]
|
||||
GENERATE_SETTINGS[arg] = None
|
||||
|
||||
with gr.Blocks() as ui:
|
||||
with gr.Blocks(theme="freddyaboulton/dracula_revamped", css="footer { display: none!important}", title="Voice Clonning WebUI") as ui:
|
||||
gr.Markdown("## 🤗🎙️ Voice clonning ")
|
||||
gr.Markdown("Ai Voice clonning <a href='https://git.ecker.tech/terminator/ai-voice-cloning-terminator'>based on Tortoise</a>")
|
||||
with gr.Tab("Generate"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
|
@ -402,7 +404,7 @@ def setup_gradio():
|
|||
outputs=GENERATE_SETTINGS["mic_audio"],
|
||||
)
|
||||
with gr.Column():
|
||||
preset = None
|
||||
preset = None
|
||||
GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise")
|
||||
GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise")
|
||||
|
||||
|
@ -412,7 +414,7 @@ def setup_gradio():
|
|||
GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise")
|
||||
|
||||
GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature")
|
||||
|
||||
|
||||
show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise")
|
||||
reset_generate_settings_button = gr.Button(value="Reset to Default")
|
||||
with gr.Column(visible=False) as col:
|
||||
|
@ -514,7 +516,7 @@ def setup_gradio():
|
|||
transcribe_button = gr.Button(value="Transcribe and Process")
|
||||
transcribe_all_button = gr.Button(value="Transcribe All")
|
||||
diarize_button = gr.Button(value="Diarize", visible=False)
|
||||
|
||||
|
||||
with gr.Row():
|
||||
slice_dataset_button = gr.Button(value="(Re)Slice Audio")
|
||||
prepare_dataset_button = gr.Button(value="(Re)Create Dataset")
|
||||
|
@ -534,7 +536,7 @@ def setup_gradio():
|
|||
TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6)
|
||||
TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1)
|
||||
TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1)
|
||||
|
||||
|
||||
with gr.Row(visible=args.tts_backend=="tortoise"):
|
||||
lr_schemes = list(LEARNING_RATE_SCHEMES.keys())
|
||||
TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value")
|
||||
|
@ -567,7 +569,7 @@ def setup_gradio():
|
|||
|
||||
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" )
|
||||
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise")
|
||||
|
||||
|
||||
TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" )
|
||||
|
||||
with gr.Row():
|
||||
|
@ -585,9 +587,9 @@ def setup_gradio():
|
|||
refresh_configs = gr.Button(value="Refresh Configurations")
|
||||
training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
|
||||
verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
|
||||
|
||||
|
||||
keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
|
||||
|
||||
|
||||
with gr.Row():
|
||||
training_graph_x_min = gr.Number(label="X Min", precision=0, value=0)
|
||||
training_graph_x_max = gr.Number(label="X Max", precision=0, value=0)
|
||||
|
@ -598,8 +600,8 @@ def setup_gradio():
|
|||
start_training_button = gr.Button(value="Train")
|
||||
stop_training_button = gr.Button(value="Stop")
|
||||
reconnect_training_button = gr.Button(value="Reconnect")
|
||||
|
||||
|
||||
|
||||
|
||||
with gr.Column():
|
||||
training_loss_graph = gr.LinePlot(label="Training Metrics",
|
||||
x="it", # x="epoch",
|
||||
|
@ -658,7 +660,7 @@ def setup_gradio():
|
|||
|
||||
EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder)
|
||||
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
|
||||
|
||||
|
||||
if args.tts_backend=="vall-e":
|
||||
with gr.Column():
|
||||
EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0])
|
||||
|
@ -668,7 +670,7 @@ def setup_gradio():
|
|||
EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0])
|
||||
EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
|
||||
EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0])
|
||||
|
||||
|
||||
EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
|
||||
EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
|
||||
|
||||
|
@ -718,7 +720,7 @@ def setup_gradio():
|
|||
exec_inputs = list(EXEC_SETTINGS.values())
|
||||
for k in EXEC_SETTINGS:
|
||||
EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
|
||||
|
||||
|
||||
EXEC_SETTINGS['autoregressive_model'].change(
|
||||
fn=update_autoregressive_model,
|
||||
inputs=EXEC_SETTINGS['autoregressive_model'],
|
||||
|
@ -790,7 +792,7 @@ def setup_gradio():
|
|||
],
|
||||
outputs=GENERATE_SETTINGS['voice'],
|
||||
)
|
||||
|
||||
|
||||
GENERATE_SETTINGS['emotion'].change(
|
||||
fn=lambda value: gr.update(visible=value == "Custom"),
|
||||
inputs=GENERATE_SETTINGS['emotion'],
|
||||
|
@ -944,7 +946,7 @@ def setup_gradio():
|
|||
],
|
||||
outputs=prepare_dataset_output
|
||||
)
|
||||
|
||||
|
||||
training_refresh_dataset.click(
|
||||
lambda: gr.update(choices=get_dataset_list()),
|
||||
inputs=None,
|
||||
|
@ -966,7 +968,7 @@ def setup_gradio():
|
|||
|
||||
if os.path.isfile('./config/generate.json'):
|
||||
ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings)
|
||||
|
||||
|
||||
if args.check_for_updates:
|
||||
ui.load(check_for_updates)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user