Compare commits

...

4 Commits

6 changed files with 84 additions and 26 deletions

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python: Debug Main.py",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/src/main.py",
"console": "integratedTerminal",
"env": {
"PYTHONPATH": "${workspaceFolder}"
},
"preLaunchTask": "Setup Environment"
}
]
}

12
.vscode/tasks.json vendored Normal file
View File

@ -0,0 +1,12 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "Setup Environment",
"type": "shell",
"command": "python",
"args": ["activate_env.py"],
"problemMatcher": []
}
]
}

9
activate_env.py Normal file
View File

@ -0,0 +1,9 @@
import subprocess
import os
# Run the virtual environment's activation batch script through the shell.
# NOTE(review): the script executes in a child shell, so whatever it sets is
# presumably lost when that shell exits -- confirm this has the intended
# effect on the debug session that follows.
venv_activate_script = r'.\venv\Scripts\activate.bat'
subprocess.run([venv_activate_script], shell=True)

# Configure the environment variables of this process: put the local bin
# directory at the front of PATH and set PYTHONUTF8 to '1'.
# NOTE(review): these assignments only affect this process and its children;
# verify the launched program actually inherits them.
os.environ.update({
    'PATH': r'.\bin;' + os.environ['PATH'],
    'PYTHONUTF8': '1',
})

View File

@ -38,10 +38,24 @@
], ],
"source":[ "source":[
"!apt install python3.8-venv\n", "!apt install python3.10-venv\n",
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n", "!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
"%cd /content/ai-voice-cloning\n", "%cd /content/ai-voice-cloning\n",
"!./setup-cuda.sh" "# get local dependencies\n",
"!git submodule init\n",
"!git submodule update --remote\n",
"# setup venv\n",
"!python3 -m venv venv\n",
"!source ./venv/bin/activate\n",
"!python3 -m pip install --upgrade pip # just to be safe\n",
"# CUDA\n",
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
"# install requirements\n",
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
"!python3 -m pip install -r ./requirements.txt # install local requirements"
] ]
}, },
{ {
@ -115,7 +129,8 @@
"cell_type":"code", "cell_type":"code",
"source":[ "source":[
"%cd /content/ai-voice-cloning/\n", "%cd /content/ai-voice-cloning/\n",
"!./start.sh --share" "!source ./venv/bin/activate\n",
"!python3 ./src/main.py --share"
], ],
"metadata":{ "metadata":{
"id":"QRA8jF3cF-YJ" "id":"QRA8jF3cF-YJ"

View File

@ -1,5 +1,9 @@
git+https://github.com/openai/whisper.git --extra-index-url https://download.pytorch.org/whl/cu118
torch>=2.1.0
torchvision
torchaudio
openai-whisper
more-itertools more-itertools
ffmpeg-python ffmpeg-python
gradio<=3.23.0 gradio<=3.23.0

View File

@ -92,7 +92,7 @@ def generate_proxy(
unload_tts() unload_tts()
raise e raise e
return ( return (
outputs[0], outputs[0],
gr.update(value=sample, visible=sample is not None), gr.update(value=sample, visible=sample is not None),
@ -131,7 +131,7 @@ def history_view_results( voice ):
metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False) metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False)
if metadata is None: if metadata is None:
continue continue
values = [] values = []
for k in HISTORY_HEADERS: for k in HISTORY_HEADERS:
v = file v = file
@ -185,7 +185,7 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
os.makedirs(outdir, exist_ok=True) os.makedirs(outdir, exist_ok=True)
with open(f'{outdir}/cond_latents.pth', 'wb') as f: with open(f'{outdir}/cond_latents.pth', 'wb') as f:
f.write(latents) f.write(latents)
latents = f'{outdir}/cond_latents.pth' latents = f'{outdir}/cond_latents.pth'
return ( return (
@ -229,7 +229,7 @@ def prepare_all_datasets( language, validation_text_length, validation_audio_len
print("Processing:", voice) print("Processing:", voice)
message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress ) message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress )
messages.append(message) messages.append(message)
for voice in voices: for voice in voices:
print("Processing:", voice) print("Processing:", voice)
message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress ) message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress )
@ -239,7 +239,7 @@ def prepare_all_datasets( language, validation_text_length, validation_audio_len
def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ): def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ):
messages = [] messages = []
message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress )
messages.append(message) messages.append(message)
@ -355,7 +355,7 @@ def setup_gradio():
voice_list_with_defaults = get_voice_list(append_defaults=True) voice_list_with_defaults = get_voice_list(append_defaults=True)
voice_list = get_voice_list() voice_list = get_voice_list()
result_voices = get_voice_list(args.results_folder) result_voices = get_voice_list(args.results_folder)
valle_models = get_valle_models() valle_models = get_valle_models()
autoregressive_models = get_autoregressive_models() autoregressive_models = get_autoregressive_models()
@ -371,7 +371,9 @@ def setup_gradio():
arg = GENERATE_SETTINGS_ARGS[i] arg = GENERATE_SETTINGS_ARGS[i]
GENERATE_SETTINGS[arg] = None GENERATE_SETTINGS[arg] = None
with gr.Blocks() as ui: with gr.Blocks(theme="freddyaboulton/dracula_revamped", css="footer { display: none!important}", title="Voice Clonning WebUI") as ui:
gr.Markdown("## 🤗🎙️ Voice clonning ")
gr.Markdown("Ai Voice clonning <a href='https://git.ecker.tech/terminator/ai-voice-cloning-terminator'>based on Tortoise</a>")
with gr.Tab("Generate"): with gr.Tab("Generate"):
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
@ -402,7 +404,7 @@ def setup_gradio():
outputs=GENERATE_SETTINGS["mic_audio"], outputs=GENERATE_SETTINGS["mic_audio"],
) )
with gr.Column(): with gr.Column():
preset = None preset = None
GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise") GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise")
GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise") GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise")
@ -412,7 +414,7 @@ def setup_gradio():
GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise") GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise")
GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature") GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature")
show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise") show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise")
reset_generate_settings_button = gr.Button(value="Reset to Default") reset_generate_settings_button = gr.Button(value="Reset to Default")
with gr.Column(visible=False) as col: with gr.Column(visible=False) as col:
@ -514,7 +516,7 @@ def setup_gradio():
transcribe_button = gr.Button(value="Transcribe and Process") transcribe_button = gr.Button(value="Transcribe and Process")
transcribe_all_button = gr.Button(value="Transcribe All") transcribe_all_button = gr.Button(value="Transcribe All")
diarize_button = gr.Button(value="Diarize", visible=False) diarize_button = gr.Button(value="Diarize", visible=False)
with gr.Row(): with gr.Row():
slice_dataset_button = gr.Button(value="(Re)Slice Audio") slice_dataset_button = gr.Button(value="(Re)Slice Audio")
prepare_dataset_button = gr.Button(value="(Re)Create Dataset") prepare_dataset_button = gr.Button(value="(Re)Create Dataset")
@ -534,7 +536,7 @@ def setup_gradio():
TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6) TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6)
TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1) TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1)
TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1) TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1)
with gr.Row(visible=args.tts_backend=="tortoise"): with gr.Row(visible=args.tts_backend=="tortoise"):
lr_schemes = list(LEARNING_RATE_SCHEMES.keys()) lr_schemes = list(LEARNING_RATE_SCHEMES.keys())
TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value") TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value")
@ -567,7 +569,7 @@ def setup_gradio():
TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" ) TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" )
TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise") TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise")
TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" ) TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" )
with gr.Row(): with gr.Row():
@ -585,9 +587,9 @@ def setup_gradio():
refresh_configs = gr.Button(value="Refresh Configurations") refresh_configs = gr.Button(value="Refresh Configurations")
training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
verbose_training = gr.Checkbox(label="Verbose Console Output", value=True) verbose_training = gr.Checkbox(label="Verbose Console Output", value=True)
keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1) keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1)
with gr.Row(): with gr.Row():
training_graph_x_min = gr.Number(label="X Min", precision=0, value=0) training_graph_x_min = gr.Number(label="X Min", precision=0, value=0)
training_graph_x_max = gr.Number(label="X Max", precision=0, value=0) training_graph_x_max = gr.Number(label="X Max", precision=0, value=0)
@ -598,8 +600,8 @@ def setup_gradio():
start_training_button = gr.Button(value="Train") start_training_button = gr.Button(value="Train")
stop_training_button = gr.Button(value="Stop") stop_training_button = gr.Button(value="Stop")
reconnect_training_button = gr.Button(value="Reconnect") reconnect_training_button = gr.Button(value="Reconnect")
with gr.Column(): with gr.Column():
training_loss_graph = gr.LinePlot(label="Training Metrics", training_loss_graph = gr.LinePlot(label="Training Metrics",
x="it", # x="epoch", x="it", # x="epoch",
@ -658,7 +660,7 @@ def setup_gradio():
EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder) EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder)
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0]) # EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
if args.tts_backend=="vall-e": if args.tts_backend=="vall-e":
with gr.Column(): with gr.Column():
EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0]) EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0])
@ -668,7 +670,7 @@ def setup_gradio():
EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0]) EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0])
EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1]) EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0]) EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0])
EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p'] EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes'] EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
@ -718,7 +720,7 @@ def setup_gradio():
exec_inputs = list(EXEC_SETTINGS.values()) exec_inputs = list(EXEC_SETTINGS.values())
for k in EXEC_SETTINGS: for k in EXEC_SETTINGS:
EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs ) EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
EXEC_SETTINGS['autoregressive_model'].change( EXEC_SETTINGS['autoregressive_model'].change(
fn=update_autoregressive_model, fn=update_autoregressive_model,
inputs=EXEC_SETTINGS['autoregressive_model'], inputs=EXEC_SETTINGS['autoregressive_model'],
@ -790,7 +792,7 @@ def setup_gradio():
], ],
outputs=GENERATE_SETTINGS['voice'], outputs=GENERATE_SETTINGS['voice'],
) )
GENERATE_SETTINGS['emotion'].change( GENERATE_SETTINGS['emotion'].change(
fn=lambda value: gr.update(visible=value == "Custom"), fn=lambda value: gr.update(visible=value == "Custom"),
inputs=GENERATE_SETTINGS['emotion'], inputs=GENERATE_SETTINGS['emotion'],
@ -944,7 +946,7 @@ def setup_gradio():
], ],
outputs=prepare_dataset_output outputs=prepare_dataset_output
) )
training_refresh_dataset.click( training_refresh_dataset.click(
lambda: gr.update(choices=get_dataset_list()), lambda: gr.update(choices=get_dataset_list()),
inputs=None, inputs=None,
@ -966,7 +968,7 @@ def setup_gradio():
if os.path.isfile('./config/generate.json'): if os.path.isfile('./config/generate.json'):
ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings) ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings)
if args.check_for_updates: if args.check_for_updates:
ui.load(check_for_updates) ui.load(check_for_updates)