forked from camenduru/ai-voice-cloning
Slight fix; getting close to being able to train directly from the web UI
This commit is contained in:
parent
8482131e10
commit
f87764e7d0
|
@ -447,9 +447,9 @@ def save_training_settings( batch_size=None, learning_rate=None, print_rate=None
|
||||||
"save_rate": save_rate if save_rate else 50,
|
"save_rate": save_rate if save_rate else 50,
|
||||||
"name": name if name else "finetune",
|
"name": name if name else "finetune",
|
||||||
"dataset_name": dataset_name if dataset_name else "finetune",
|
"dataset_name": dataset_name if dataset_name else "finetune",
|
||||||
"dataset_path": dataset_path if dataset_path else "./experiments/finetune/train.txt",
|
"dataset_path": dataset_path if dataset_path else "./training/finetune/train.txt",
|
||||||
"validation_name": validation_name if validation_name else "finetune",
|
"validation_name": validation_name if validation_name else "finetune",
|
||||||
"validation_path": validation_path if validation_path else "./experiments/finetune/val.txt",
|
"validation_path": validation_path if validation_path else "./training/finetune/train.txt",
|
||||||
}
|
}
|
||||||
|
|
||||||
with open(f'./training/.template.yaml', 'r', encoding="utf-8") as f:
|
with open(f'./training/.template.yaml', 'r', encoding="utf-8") as f:
|
||||||
|
@ -462,7 +462,7 @@ def save_training_settings( batch_size=None, learning_rate=None, print_rate=None
|
||||||
f.write(yaml)
|
f.write(yaml)
|
||||||
|
|
||||||
whisper_model = None
|
whisper_model = None
|
||||||
def prepare_dataset( files, outdir ):
|
def prepare_dataset( files, outdir, language=None ):
|
||||||
global whisper_model
|
global whisper_model
|
||||||
if whisper_model is None:
|
if whisper_model is None:
|
||||||
whisper_model = whisper.load_model(args.whisper_model)
|
whisper_model = whisper.load_model(args.whisper_model)
|
||||||
|
@ -476,7 +476,7 @@ def prepare_dataset( files, outdir ):
|
||||||
for file in files:
|
for file in files:
|
||||||
print(f"Transcribing file: {file}")
|
print(f"Transcribing file: {file}")
|
||||||
|
|
||||||
result = whisper_model.transcribe(file)
|
result = whisper_model.transcribe(file, language=language)
|
||||||
results[os.path.basename(file)] = result
|
results[os.path.basename(file)] = result
|
||||||
|
|
||||||
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
||||||
|
|
|
@ -375,14 +375,15 @@ def setup_gradio():
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
dataset_settings = [
|
dataset_settings = [
|
||||||
gr.Dropdown( get_voice_list(), label="Dataset Source", type="value" ),
|
gr.Dropdown( get_voice_list(), label="Dataset Source", type="value" ),
|
||||||
|
gr.Textbox(label="Language", placeholder="English")
|
||||||
]
|
]
|
||||||
dataset_voices = dataset_settings[0]
|
dataset_voices = dataset_settings[0]
|
||||||
|
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
prepare_dataset_button = gr.Button(value="Prepare")
|
prepare_dataset_button = gr.Button(value="Prepare")
|
||||||
|
|
||||||
def prepare_dataset_proxy( voice ):
|
def prepare_dataset_proxy( voice, language ):
|
||||||
return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/" )
|
return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language )
|
||||||
|
|
||||||
prepare_dataset_button.click(
|
prepare_dataset_button.click(
|
||||||
prepare_dataset_proxy,
|
prepare_dataset_proxy,
|
||||||
|
@ -403,9 +404,9 @@ def setup_gradio():
|
||||||
training_settings = training_settings + [
|
training_settings = training_settings + [
|
||||||
gr.Textbox(label="Training Name", placeholder="finetune"),
|
gr.Textbox(label="Training Name", placeholder="finetune"),
|
||||||
gr.Textbox(label="Dataset Name", placeholder="finetune"),
|
gr.Textbox(label="Dataset Name", placeholder="finetune"),
|
||||||
gr.Textbox(label="Dataset Path", placeholder="./experiments/finetune/train.txt"),
|
gr.Textbox(label="Dataset Path", placeholder="./training/finetune/train.txt"),
|
||||||
gr.Textbox(label="Validation Name", placeholder="finetune"),
|
gr.Textbox(label="Validation Name", placeholder="finetune"),
|
||||||
gr.Textbox(label="Validation Path", placeholder="./experiments/finetune/val.txt"),
|
gr.Textbox(label="Validation Path", placeholder="./training/finetune/train.txt"),
|
||||||
]
|
]
|
||||||
|
|
||||||
save_yaml_button.click(save_training_settings,
|
save_yaml_button.click(save_training_settings,
|
||||||
|
|
|
@ -24,6 +24,7 @@ datasets:
|
||||||
num_conditioning_candidates: 2
|
num_conditioning_candidates: 2
|
||||||
conditioning_length: 44000
|
conditioning_length: 44000
|
||||||
use_bpe_tokenizer: True
|
use_bpe_tokenizer: True
|
||||||
|
tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
|
||||||
load_aligned_codes: False
|
load_aligned_codes: False
|
||||||
val:
|
val:
|
||||||
name: ${validation_name}
|
name: ${validation_name}
|
||||||
|
@ -40,6 +41,7 @@ datasets:
|
||||||
num_conditioning_candidates: 2
|
num_conditioning_candidates: 2
|
||||||
conditioning_length: 44000
|
conditioning_length: 44000
|
||||||
use_bpe_tokenizer: True
|
use_bpe_tokenizer: True
|
||||||
|
tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
|
||||||
load_aligned_codes: False
|
load_aligned_codes: False
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
@ -59,20 +61,20 @@ steps:
|
||||||
injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
|
injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
|
||||||
paired_to_mel:
|
paired_to_mel:
|
||||||
type: torch_mel_spectrogram
|
type: torch_mel_spectrogram
|
||||||
mel_norm_file: ./experiments/clips_mel_norms.pth
|
mel_norm_file: ./models/tortoise/clips_mel_norms.pth
|
||||||
in: wav
|
in: wav
|
||||||
out: paired_mel
|
out: paired_mel
|
||||||
paired_cond_to_mel:
|
paired_cond_to_mel:
|
||||||
type: for_each
|
type: for_each
|
||||||
subtype: torch_mel_spectrogram
|
subtype: torch_mel_spectrogram
|
||||||
mel_norm_file: ./experiments/clips_mel_norms.pth
|
mel_norm_file: ./models/tortoise/clips_mel_norms.pth
|
||||||
in: conditioning
|
in: conditioning
|
||||||
out: paired_conditioning_mel
|
out: paired_conditioning_mel
|
||||||
to_codes:
|
to_codes:
|
||||||
type: discrete_token
|
type: discrete_token
|
||||||
in: paired_mel
|
in: paired_mel
|
||||||
out: paired_mel_codes
|
out: paired_mel_codes
|
||||||
dvae_config: "./experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
|
dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
|
||||||
paired_fwd_text:
|
paired_fwd_text:
|
||||||
type: generator
|
type: generator
|
||||||
generator: gpt
|
generator: gpt
|
||||||
|
@ -112,9 +114,9 @@ networks:
|
||||||
#only_alignment_head: False # uv3/4
|
#only_alignment_head: False # uv3/4
|
||||||
|
|
||||||
path:
|
path:
|
||||||
pretrain_model_gpt: './experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
|
pretrain_model_gpt: './models/tortoise/autoregressive.pth' # CHANGEME: copy this from tortoise cache
|
||||||
strict_load: true
|
strict_load: true
|
||||||
#resume_state: ./experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
|
#resume_state: ./models/tortoise/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
|
||||||
|
|
||||||
# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
|
# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
|
||||||
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
|
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
|
||||||
|
|
Loading…
Reference in New Issue
Block a user