forked from mrq/ai-voice-cloning
added dropdown to select which whisper model to use for transcription, added note that FFMPEG is required
parent 96e9acdeec
commit 12933cfd60

README.md: 16 lines changed
README.md

@@ -223,7 +223,17 @@ To import a voice, click `Import Voice`. Remember to click `Refresh Voice List`
 This tab will contain a collection of sub-tabs pertaining to training.
 
-#### Configuration
+#### Prepare Dataset
+
+This section will aid in preparing the dataset for fine-tuning.
+
+With it, you simply select a voice, then click the button, and wait for the console to tell you it's done. The results will be saved to `./training/{voice name}/`.
+
+The web UI will leverage [openai/whisper](https://github.com/openai/whisper) to transcribe a given sample source and split it into convenient pieces.
+
+**!**NOTE**!**: transcription leverages FFMPEG, so please make sure you either have FFmpeg installed and visible in your PATH, or drop its binary in the `./bin/` folder.
+
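To make the preparation step above concrete, here is a rough sketch of what it amounts to under the hood. This is an illustration, not the repo's exact code: the paths are hypothetical, and the web UI derives the real ones from the selected voice.

```python
import os
import subprocess

import whisper  # pip install openai-whisper

# Hypothetical paths for illustration; the UI derives these from the selected voice.
source = "./voices/myvoice/source.wav"
outdir = "./training/myvoice"
os.makedirs(outdir, exist_ok=True)

model = whisper.load_model("base")  # tiny / base / small / medium / large
result = model.transcribe(source)

for i, seg in enumerate(result["segments"]):
	# Cut each transcribed segment out of the source with FFmpeg;
	# this is why FFmpeg must be on your PATH (or dropped into ./bin/).
	subprocess.run([
		"ffmpeg", "-y", "-i", source,
		"-ss", str(seg["start"]), "-to", str(seg["end"]),
		os.path.join(outdir, f"{i}.wav"),
	], check=True)
	print(f"{seg['start']:.2f}-{seg['end']:.2f}: {seg['text'].strip()}")
```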
+#### Generate Configuration
 
 This will generate the YAML necessary to feed into training. For now, you can set:
 * `Batch Size`: size of batches for training; larger batches = faster training, at the cost of higher VRAM. Setting this to 1 will lead to problems.
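The generated YAML itself isn't part of this commit. Purely as an illustrative sketch, the knobs exposed in the UI (whose defaults appear in the Gradio code further down) map to entries along these lines; the key names here are hypothetical stand-ins, not the actual schema the trainer consumes:

```yaml
# Hypothetical sketch only; the real keys come from the training backend's schema.
name: finetune
batch_size: 128       # UI: Batch Size
print_freq: 50        # UI: Print Frequency
save_freq: 50         # UI: Save Frequency
validation_name: finetune
validation_path: ./experiments/finetune/val.txt
```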
@@ -255,6 +265,10 @@ Below are settings that override the default launch arguments. Some of these req
 * `Embed Output Metadata`: enables embedding the settings and latents used to generate an audio clip inside that clip. Metadata is stored as a JSON string in the `lyrics` tag.
 * `Slimmer Computed Latents`: falls back to the original, 12.9KiB way of storing latents (without the extra bits required for using the CVVP model).
 * `Voice Fixer`: runs each generated audio clip through `voicefixer`, if available and installed.
+* `Use CUDA for Voice Fixer`: allows voicefixer to use CUDA. Speeds up cleaning the output, but at the cost of more VRAM consumed. Disable this if you OOM.
+* `Device Override`: overrides the device name passed to PyTorch for hardware acceleration. You can use the accompanying `list_devices.py` script to map valid strings to GPU names (see the sketch below). You can also pass `cpu` if you want to fall back to software mode.
+* `Whisper Model`: the specific model to use for Whisper transcription, when preparing a dataset to fine-tune with.
 * `Voice Latent Max Chunk Size`: during the voice latents calculation pass, this limits how large, in bytes, a chunk can be. Large values can run into VRAM OOM errors.
 * `Sample Batch Size`: sets the batch size when generating autoregressive samples. Bigger batches result in faster compute, at the cost of increased VRAM consumption. Leave at 0 to calculate a "best" fit.
 * `Concurrency Count`: how many Gradio events the queue can process at once. Leave this over 1 if you want to modify settings in the UI that update other settings while generating audio clips.
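For `Device Override`, here's a minimal sketch of what mapping device strings to GPU names involves; this is the general idea, not necessarily what `list_devices.py` does verbatim:

```python
import torch

# Print every valid "cuda:N" device string alongside the GPU it names.
for i in range(torch.cuda.device_count()):
	print(f"cuda:{i} -> {torch.cuda.get_device_name(i)}")
print("cpu -> software fallback")
```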
@@ -57,6 +57,7 @@ def setup_args():
 		'voice-fixer-use-cuda': True,
 		'force-cpu-for-conditioning-latents': False,
 		'device-override': None,
+		'whisper-model': "base",
 		'concurrency-count': 2,
 		'output-sample-rate': 44100,
 		'output-volume': 1,
@@ -80,6 +81,7 @@ def setup_args():
 	parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
 	parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantly OOM on low chunk counts)")
 	parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to pass through to Torch as an override")
+	parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
 	parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
 	parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
 	parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
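A naming detail the rest of the commit leans on: argparse converts the dashed flag into an underscored attribute, so `--whisper-model` (and the `'whisper-model'` defaults key) surfaces as `args.whisper_model`. A self-contained illustration:

```python
import argparse

# Trimmed to the new key for brevity.
default_arguments = {'whisper-model': "base"}

parser = argparse.ArgumentParser()
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'],
	help="Specifies which whisper model to use for transcription.")

args = parser.parse_args(["--whisper-model", "medium"])
print(args.whisper_model)  # prints: medium
```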
@@ -463,7 +465,7 @@ whisper_model = None
 def prepare_dataset( files, outdir ):
 	global whisper_model
 	if whisper_model is None:
-		whisper_model = whisper.load_model("base")
+		whisper_model = whisper.load_model(args.whisper_model)
 
 	os.makedirs(outdir, exist_ok=True)
 
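Note that the Whisper model is cached in a module-level global and only loaded on first use, so switching the dropdown after a dataset has already been prepared won't take effect until a restart. A hypothetical refinement, not part of this commit, would key the cache on the model name:

```python
import whisper

whisper_model = None
whisper_model_name = None  # hypothetical extra state, not in this commit

def get_whisper_model(name):
	global whisper_model, whisper_model_name
	# Reload whenever the requested model differs from the cached one.
	if whisper_model is None or whisper_model_name != name:
		whisper_model = whisper.load_model(name)
		whisper_model_name = name
	return whisper_model
```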
@@ -653,7 +655,7 @@ def get_voice_list(dir=get_voice_dir()):
 	os.makedirs(dir, exist_ok=True)
 	return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ]) + ["microphone", "random"]
 
-def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, device_override, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
+def export_exec_settings( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, device_override, whisper_model, sample_batch_size, concurrency_count, output_sample_rate, output_volume ):
 	global args
 
 	args.listen = listen
@@ -663,6 +665,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on
 	args.low_vram = low_vram
 	args.force_cpu_for_conditioning_latents = force_cpu_for_conditioning_latents
 	args.device_override = device_override
+	args.whisper_model = whisper_model
 	args.sample_batch_size = sample_batch_size
 	args.embed_output_metadata = embed_output_metadata
 	args.latents_lean_and_mean = latents_lean_and_mean
@@ -680,6 +683,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on
 		'models-from-local-only':args.models_from_local_only,
 		'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
 		'device-override': args.device_override,
+		'whisper-model': args.whisper_model,
 		'sample-batch-size': args.sample_batch_size,
 		'embed-output-metadata': args.embed_output_metadata,
 		'latents-lean-and-mean': args.latents_lean_and_mean,
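Worth keeping in mind when threading a new setting through `export_exec_settings` like this: Gradio hands the `inputs` components to the handler positionally, so the new `whisper_model` parameter must sit at the same index in the signature as the new dropdown does in the settings component list. A toy reduction of that coupling (the component list and handler are trimmed down for illustration):

```python
import gradio as gr

def export_exec_settings(device_override, whisper_model):
	# Parameter order must mirror the component order in `inputs` below.
	print(device_override, whisper_model)

with gr.Blocks() as demo:
	components = [
		gr.Textbox(label="Device Override"),
		gr.Dropdown(label="Whisper Model", value="base",
			choices=["tiny", "base", "small", "medium", "large"]),
	]
	gr.Button("Save").click(export_exec_settings, inputs=components, outputs=None)
```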
@@ -370,6 +370,7 @@ def setup_gradio():
 				]
 			)
 		with gr.Tab("Training"):
+			with gr.Tab("Prepare Dataset"):
 				with gr.Row():
 					with gr.Column():
 						dataset_settings = [
@@ -377,6 +378,7 @@ def setup_gradio():
 						]
 						dataset_voices = dataset_settings[0]
 
+					with gr.Column():
 						prepare_dataset_button = gr.Button(value="Prepare")
 
 						def prepare_dataset_proxy( voice ):
@@ -387,7 +389,8 @@ def setup_gradio():
 					inputs=dataset_settings,
 					outputs=None
 				)
+			with gr.Tab("Generate Configuration"):
+				with gr.Row():
 					with gr.Column():
 						training_settings = [
 							gr.Slider(label="Batch Size", value=128),
@@ -395,6 +398,7 @@ def setup_gradio():
 							gr.Number(label="Print Frequency", value=50),
 							gr.Number(label="Save Frequency", value=50),
 						]
+						save_yaml_button = gr.Button(value="Save Training Configuration")
 					with gr.Column():
 						training_settings = training_settings + [
 							gr.Textbox(label="Training Name", placeholder="finetune"),
@@ -403,7 +407,7 @@ def setup_gradio():
 							gr.Textbox(label="Validation Name", placeholder="finetune"),
 							gr.Textbox(label="Validation Path", placeholder="./experiments/finetune/val.txt"),
 						]
-					save_yaml_button = gr.Button(value="Save Training Configuration")
+
 					save_yaml_button.click(save_training_settings,
 						inputs=training_settings,
 						outputs=None
@@ -424,6 +428,7 @@ def setup_gradio():
 						gr.Checkbox(label="Use CUDA for Voice Fixer", value=args.voice_fixer_use_cuda),
 						gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents),
 						gr.Textbox(label="Device Override", value=args.device_override),
+						gr.Dropdown(label="Whisper Model", value=args.whisper_model, choices=["tiny", "base", "small", "medium", "large"]),
 					]
 					gr.Button(value="Check for Updates").click(check_for_updates)
 					gr.Button(value="Reload TTS").click(reload_tts)
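As for which model to pick from the new dropdown: per the upstream Whisper README (approximate figures, subject to change), `tiny` and `base` want roughly 1 GB of VRAM, `small` about 2 GB, `medium` about 5 GB, and `large` about 10 GB, with transcription quality generally improving down the list. `base` is a sensible default; step up if your transcripts come out sloppy and your VRAM allows.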