forked from camenduru/ai-voice-cloning
Added checkboxes to use the original method for calculating latents (the original method ignores the voice chunk field)
This commit is contained in:
parent
9e3eca2261
commit
31da215c5f
|
@ -1 +1 @@
|
||||||
Subproject commit c90ee7c5296992ad96c8790b5b7cc3737062e1e6
|
Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0
|
|
@ -875,7 +875,6 @@ def generate_tortoise(**kwargs):
|
||||||
'k': parameters['candidates'],
|
'k': parameters['candidates'],
|
||||||
'diffusion_sampler': parameters['diffusion_sampler'],
|
'diffusion_sampler': parameters['diffusion_sampler'],
|
||||||
'breathing_room': parameters['breathing_room'],
|
'breathing_room': parameters['breathing_room'],
|
||||||
'progress': parameters['progress'],
|
|
||||||
'half_p': "Half Precision" in parameters['experimentals'],
|
'half_p': "Half Precision" in parameters['experimentals'],
|
||||||
'cond_free': "Conditioning-Free" in parameters['experimentals'],
|
'cond_free': "Conditioning-Free" in parameters['experimentals'],
|
||||||
'cvvp_amount': parameters['cvvp_weight'],
|
'cvvp_amount': parameters['cvvp_weight'],
|
||||||
|
@ -1256,7 +1255,7 @@ def update_baseline_for_latents_chunks( voice ):
|
||||||
return int(total_duration / total) if total > 0 else 1
|
return int(total_duration / total) if total > 0 else 1
|
||||||
return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1
|
return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1
|
||||||
|
|
||||||
def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, progress=None):
|
def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, original_ar=False, original_diffusion=False):
|
||||||
global tts
|
global tts
|
||||||
global args
|
global args
|
||||||
|
|
||||||
|
@ -1309,7 +1308,7 @@ def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, prog
|
||||||
if voice_samples is None:
|
if voice_samples is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents, original_ar=original_ar, original_diffusion=original_diffusion)
|
||||||
|
|
||||||
if len(conditioning_latents) == 4:
|
if len(conditioning_latents) == 4:
|
||||||
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)
|
||||||
|
|
10
src/webui.py
10
src/webui.py
|
@ -78,6 +78,8 @@ def generate_proxy(
|
||||||
repetition_penalty,
|
repetition_penalty,
|
||||||
cond_free_k,
|
cond_free_k,
|
||||||
experimentals,
|
experimentals,
|
||||||
|
voice_latents_original_ar,
|
||||||
|
voice_latents_original_diffusion,
|
||||||
progress=gr.Progress(track_tqdm=True)
|
progress=gr.Progress(track_tqdm=True)
|
||||||
):
|
):
|
||||||
kwargs = locals()
|
kwargs = locals()
|
||||||
|
@ -166,12 +168,12 @@ def reset_generate_settings_proxy():
|
||||||
|
|
||||||
return tuple(res)
|
return tuple(res)
|
||||||
|
|
||||||
def compute_latents_proxy(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm=True)):
|
def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)):
|
||||||
if args.tts_backend == "bark":
|
if args.tts_backend == "bark":
|
||||||
global tts
|
global tts
|
||||||
tts.create_voice( voice )
|
tts.create_voice( voice )
|
||||||
return voice
|
return voice
|
||||||
compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, progress=progress )
|
compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion )
|
||||||
return voice
|
return voice
|
||||||
|
|
||||||
|
|
||||||
|
@ -387,6 +389,8 @@ def setup_gradio():
|
||||||
GENERATE_SETTINGS["voice"] = gr.Dropdown(choices=voice_list_with_defaults, label="Voice", type="value", value=voice_list_with_defaults[0]) # it'd be very cash money if gradio was able to default to the first value in the list without this shit
|
GENERATE_SETTINGS["voice"] = gr.Dropdown(choices=voice_list_with_defaults, label="Voice", type="value", value=voice_list_with_defaults[0]) # it'd be very cash money if gradio was able to default to the first value in the list without this shit
|
||||||
GENERATE_SETTINGS["mic_audio"] = gr.Audio( label="Microphone Source", source="microphone", type="filepath", visible=False )
|
GENERATE_SETTINGS["mic_audio"] = gr.Audio( label="Microphone Source", source="microphone", type="filepath", visible=False )
|
||||||
GENERATE_SETTINGS["voice_latents_chunks"] = gr.Number(label="Voice Chunks", precision=0, value=0, visible=args.tts_backend=="tortoise")
|
GENERATE_SETTINGS["voice_latents_chunks"] = gr.Number(label="Voice Chunks", precision=0, value=0, visible=args.tts_backend=="tortoise")
|
||||||
|
GENERATE_SETTINGS["voice_latents_original_ar"] = gr.Checkbox(label="Use Original Latents Method (AR)", visible=args.tts_backend=="tortoise")
|
||||||
|
GENERATE_SETTINGS["voice_latents_original_diffusion"] = gr.Checkbox(label="Use Original Latents Method (Diffusion)", visible=args.tts_backend=="tortoise")
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
refresh_voices = gr.Button(value="Refresh Voice List")
|
refresh_voices = gr.Button(value="Refresh Voice List")
|
||||||
recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents")
|
recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents")
|
||||||
|
@ -783,6 +787,8 @@ def setup_gradio():
|
||||||
inputs=[
|
inputs=[
|
||||||
GENERATE_SETTINGS['voice'],
|
GENERATE_SETTINGS['voice'],
|
||||||
GENERATE_SETTINGS['voice_latents_chunks'],
|
GENERATE_SETTINGS['voice_latents_chunks'],
|
||||||
|
GENERATE_SETTINGS['voice_latents_original_ar'],
|
||||||
|
GENERATE_SETTINGS['voice_latents_original_diffusion'],
|
||||||
],
|
],
|
||||||
outputs=GENERATE_SETTINGS['voice'],
|
outputs=GENERATE_SETTINGS['voice'],
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user