forked from mrq/ai-voice-cloning
added options to pick tokenizer json and diffusion model (so I don't have to add it in later when I get bored and add in diffusion training)
This commit is contained in:
parent
07b684c4e7
commit
363d0b09b1
|
@ -24,7 +24,7 @@ datasets:
|
|||
num_conditioning_candidates: 2
|
||||
conditioning_length: 44000
|
||||
use_bpe_tokenizer: True
|
||||
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
|
||||
tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
|
||||
load_aligned_codes: False
|
||||
val:
|
||||
name: validation
|
||||
|
@ -41,7 +41,7 @@ datasets:
|
|||
num_conditioning_candidates: 2
|
||||
conditioning_length: 44000
|
||||
use_bpe_tokenizer: True
|
||||
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
|
||||
tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
|
||||
load_aligned_codes: False
|
||||
|
||||
steps:
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 65a43deb9e354342ba805214edf1283b8af6fa90
|
||||
Subproject commit 42cb1f36741aa3a24e7aab03e73b51becd182fa7
|
139
src/utils.py
139
src/utils.py
|
@ -192,7 +192,10 @@ def generate(**kwargs):
|
|||
'half_p': "Half Precision" in parameters['experimentals'],
|
||||
'cond_free': "Conditioning-Free" in parameters['experimentals'],
|
||||
'cvvp_amount': parameters['cvvp_weight'],
|
||||
|
||||
'autoregressive_model': args.autoregressive_model,
|
||||
'diffusion_model': args.diffusion_model,
|
||||
'tokenizer_json': args.tokenizer_json,
|
||||
}
|
||||
|
||||
# could be better to just do a ternary on everything above, but i am not a professional
|
||||
|
@ -211,6 +214,14 @@ def generate(**kwargs):
|
|||
settings['autoregressive_model'] = deduce_autoregressive_model(selected_voice)
|
||||
tts.load_autoregressive_model(settings['autoregressive_model'])
|
||||
|
||||
if settings['diffusion_model'] is not None:
|
||||
if settings['diffusion_model'] == "auto":
|
||||
settings['diffusion_model'] = deduce_diffusion_model(selected_voice)
|
||||
tts.load_diffusion_model(settings['diffusion_model'])
|
||||
|
||||
if settings['tokenizer_json'] is not None:
|
||||
tts.load_tokenizer_json(settings['tokenizer_json'])
|
||||
|
||||
settings['voice_samples'], settings['conditioning_latents'], _ = fetch_voice(voice=selected_voice)
|
||||
|
||||
# clamp it down for the insane users who want this
|
||||
|
@ -1547,6 +1558,7 @@ def save_training_settings( **kwargs ):
|
|||
settings['validation_batch_size'] = validation_lines
|
||||
messages.append(f"Batch size exceeds validation dataset size, clamping validation batch size to {validation_lines}")
|
||||
|
||||
settings['tokenizer_json'] = args.tokenizer_json
|
||||
|
||||
if settings['gpus'] > get_device_count():
|
||||
settings['gpus'] = get_device_count()
|
||||
|
@ -1679,6 +1691,9 @@ def import_voices(files, saveAs=None, progress=None):
|
|||
|
||||
print(f"Imported voice to {path}")
|
||||
|
||||
def relative_paths( dirs ):
    """Rewrite each path in ``dirs`` relative to the current working directory.

    Every result is prefixed with ``./`` and uses forward slashes, so the
    returned strings look the same regardless of the platform separator.
    """
    normalized = []
    for entry in dirs:
        relative = os.path.relpath( entry )
        normalized.append( './' + relative.replace("\\", "/") )
    return normalized
|
||||
|
||||
def get_voice_list(dir=get_voice_dir(), append_defaults=False):
|
||||
defaults = [ "random", "microphone" ]
|
||||
os.makedirs(dir, exist_ok=True)
|
||||
|
@ -1687,6 +1702,7 @@ def get_voice_list(dir=get_voice_dir(), append_defaults=False):
|
|||
res = res + defaults
|
||||
return res
|
||||
|
||||
|
||||
def get_autoregressive_models(dir="./models/finetunes/", prefixed=False):
|
||||
os.makedirs(dir, exist_ok=True)
|
||||
base = [get_model_path('autoregressive.pth')]
|
||||
|
@ -1702,9 +1718,6 @@ def get_autoregressive_models(dir="./models/finetunes/", prefixed=False):
|
|||
models = sorted([ int(d[:-8]) for d in os.listdir(f'./training/{training}/finetune/models/') if d[-8:] == "_gpt.pth" ])
|
||||
found = found + [ f'./training/{training}/finetune/models/{d}_gpt.pth' for d in models ]
|
||||
|
||||
if len(found) > 0 or len(additionals) > 0:
|
||||
base = ["auto"] + base
|
||||
|
||||
res = base + additionals + found
|
||||
|
||||
if prefixed:
|
||||
|
@ -1715,7 +1728,27 @@ def get_autoregressive_models(dir="./models/finetunes/", prefixed=False):
|
|||
|
||||
res[i] = f'[{shorthash}] {path}'
|
||||
|
||||
return res
|
||||
return ["auto"] + relative_paths(res)
|
||||
|
||||
def get_diffusion_models(dir="./models/finetunes/", prefixed=False):
    """List the diffusion model paths offered for selection.

    Only the ``diffusion_decoder.pth`` path resolved through ``get_model_path``
    is returned, normalized by ``relative_paths``. ``dir`` and ``prefixed`` are
    accepted but not read by this body.
    """
    stock_decoder = get_model_path('diffusion_decoder.pth')
    return relative_paths([ stock_decoder ])
|
||||
|
||||
def get_tokenizer_jsons( dir="./models/tokenizers/" ):
    """List the tokenizer JSON files that can be selected.

    The bundled tortoise tokenizer always comes first, followed by every
    ``*.json`` file found directly inside ``dir`` (sorted). A missing ``dir``
    simply contributes no extra entries.

    Returns:
        A list of paths normalized by ``relative_paths``.
    """
    if os.path.isdir(dir):
        # Build the candidate paths from `dir` itself; the previous code
        # referenced an undefined name here and raised NameError as soon as
        # the directory contained a .json file.
        additionals = sorted( f'{dir}/{d}' for d in os.listdir(dir) if d.endswith(".json") )
    else:
        additionals = []

    return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals)
|
||||
|
||||
def tokenize_text( text ):
    """Encode ``text`` with the loaded TTS tokenizer and decode it back.

    If no TTS instance is loaded yet, one is loaded first via ``load_tts``;
    while a load is already in progress an exception is raised instead.

    Returns:
        Two lines joined by a newline: the encoded token list rendered with
        ``str`` and the decoded text (special tokens kept).
    """
    from tortoise.utils.tokenizer import VoiceBpeTokenizer

    if not tts:
        if tts_loading:
            raise Exception("TTS is still initializing...")
        load_tts()

    bpe = tts.tokenizer
    token_ids = bpe.encode(text)
    round_trip = bpe.tokenizer.decode(token_ids, skip_special_tokens=False)

    lines = [ str(token_ids), round_trip ]
    return "\n".join(lines)
|
||||
|
||||
def get_dataset_list(dir="./training/"):
    """Return the sorted names of sub-directories of ``dir`` that hold a ``train.txt`` entry."""
    datasets = []
    for name in os.listdir(dir):
        candidate = os.path.join(dir, name)
        # Plain files at the top level are never datasets.
        if not os.path.isdir(candidate):
            continue
        if "train.txt" in os.listdir(candidate):
            datasets.append(name)
    datasets.sort()
    return datasets
|
||||
|
@ -1834,7 +1867,9 @@ def setup_args():
|
|||
'tts-backend': TTSES[0],
|
||||
|
||||
'autoregressive-model': None,
|
||||
'diffusion-model': None,
|
||||
'vocoder-model': VOCODERS[-1],
|
||||
'tokenizer-json': None,
|
||||
|
||||
'whisper-backend': 'openai/whisper',
|
||||
'whisper-model': "base",
|
||||
|
@ -1866,7 +1901,6 @@ def setup_args():
|
|||
parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
|
||||
parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
|
||||
parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation")
|
||||
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
|
||||
parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch")
|
||||
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
|
||||
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
|
||||
|
@ -1875,7 +1909,12 @@ def setup_args():
|
|||
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
|
||||
|
||||
parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.")
|
||||
|
||||
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
|
||||
parser.add_argument("--diffusion-model", default=default_arguments['diffusion-model'], help="Specifies which diffusion model to use for sampling.")
|
||||
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
|
||||
parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.")
|
||||
|
||||
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
|
||||
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
||||
|
||||
|
@ -1935,7 +1974,9 @@ def get_default_settings( hypenated=True ):
|
|||
'tts-backend': args.tts_backend,
|
||||
|
||||
'autoregressive-model': args.autoregressive_model,
|
||||
'diffusion-model': args.diffusion_model,
|
||||
'vocoder-model': args.vocoder_model,
|
||||
'tokenizer-json': args.tokenizer_json,
|
||||
|
||||
'whisper-backend': args.whisper_backend,
|
||||
'whisper-model': args.whisper_model,
|
||||
|
@ -1975,8 +2016,11 @@ def update_args( **kwargs ):
|
|||
args.output_volume = settings['output_volume']
|
||||
|
||||
args.tts_backend = settings['tts_backend']
|
||||
|
||||
args.autoregressive_model = settings['autoregressive_model']
|
||||
args.diffusion_model = settings['diffusion_model']
|
||||
args.vocoder_model = settings['vocoder_model']
|
||||
args.tokenizer_json = settings['tokenizer_json']
|
||||
|
||||
args.whisper_backend = settings['whisper_backend']
|
||||
args.whisper_model = settings['whisper_model']
|
||||
|
@ -1994,15 +2038,6 @@ def save_args_settings():
|
|||
with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
|
||||
f.write(json.dumps(settings, indent='\t') )
|
||||
|
||||
def tokenize_text( text ):
    """Encode ``text`` with a freshly constructed ``VoiceBpeTokenizer`` and decode it back.

    Returns:
        Two lines joined by a newline: the encoded token list rendered with
        ``str`` and the decoded text (special tokens kept).
    """
    from tortoise.utils.tokenizer import VoiceBpeTokenizer

    bpe = VoiceBpeTokenizer()
    token_ids = bpe.encode(text)
    round_trip = bpe.tokenizer.decode(token_ids, skip_special_tokens=False)

    lines = [ str(token_ids), round_trip ]
    return "\n".join(lines)
|
||||
|
||||
# super kludgy )`;
|
||||
def import_generate_settings(file = None):
|
||||
if not file:
|
||||
|
@ -2099,7 +2134,7 @@ def version_check_tts( min_version ):
|
|||
return True
|
||||
return False
|
||||
|
||||
def load_tts( restart=False, autoregressive_model=None ):
|
||||
def load_tts( restart=False, autoregressive_model=None, diffusion_model=None, vocoder_model=None, tokenizer_json=None ):
|
||||
global args
|
||||
global tts
|
||||
|
||||
|
@ -2114,13 +2149,27 @@ def load_tts( restart=False, autoregressive_model=None ):
|
|||
if autoregressive_model == "auto":
|
||||
autoregressive_model = deduce_autoregressive_model()
|
||||
|
||||
if diffusion_model:
|
||||
args.diffusion_model = diffusion_model
|
||||
else:
|
||||
diffusion_model = args.diffusion_model
|
||||
|
||||
if vocoder_model:
|
||||
args.vocoder_model = vocoder_model
|
||||
else:
|
||||
vocoder_model = args.vocoder_model
|
||||
|
||||
if tokenizer_json:
|
||||
args.tokenizer_json = tokenizer_json
|
||||
else:
|
||||
tokenizer_json = args.tokenizer_json
|
||||
|
||||
if get_device_name() == "cpu":
|
||||
print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
|
||||
|
||||
tts_loading = True
|
||||
print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
|
||||
tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
|
||||
print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {vocoder_model})")
|
||||
tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, diffusion_model_path=diffusion_model, vocoder_model=vocoder_model, tokenizer_json=tokenizer_json)
|
||||
tts_loading = False
|
||||
|
||||
get_model_path('dvae.pth')
|
||||
|
@ -2207,6 +2256,40 @@ def update_autoregressive_model(autoregressive_model_path):
|
|||
|
||||
return autoregressive_model_path
|
||||
|
||||
def update_diffusion_model(diffusion_model_path):
    """Persist the selected diffusion model and load it into the running TTS.

    Args:
        diffusion_model_path: A filesystem path, optionally prefixed with a
            ``[xxxxxxxx] `` short-hash tag, which is stripped before use.

    Returns:
        The path that was loaded, or ``None`` when the selection is invalid,
        when no TTS instance is loaded, or when the requested model is already
        the active one.

    Raises:
        Exception: if the TTS instance is still initializing.
    """
    global tts

    # Reject empty selections before touching the regex: re.findall raises
    # TypeError on None, which previously bypassed the "Invalid model" path.
    if not diffusion_model_path:
        print(f"Invalid model: {diffusion_model_path}")
        return

    # Strip a leading "[shorthash] " tag if the value carries one.
    match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', diffusion_model_path)
    if match:
        diffusion_model_path = match[0]

    if not os.path.exists(diffusion_model_path):
        print(f"Invalid model: {diffusion_model_path}")
        return

    args.diffusion_model = diffusion_model_path
    save_args_settings()
    print(f'Stored diffusion model to settings: {diffusion_model_path}')

    if not tts:
        if tts_loading:
            raise Exception("TTS is still initializing...")
        # Nothing loaded yet; the stored setting is all there is to update.
        return

    if hasattr(tts, "loading") and tts.loading:
        raise Exception("TTS is still initializing...")

    # NOTE(review): because of the os.path.exists check above, a literal
    # "auto" only reaches this point if a filesystem entry named "auto"
    # exists - confirm whether "auto" is meant to be accepted here.
    if diffusion_model_path == "auto":
        diffusion_model_path = deduce_diffusion_model()

    # Skip the reload when the requested model is already active.
    if diffusion_model_path == tts.diffusion_model_path:
        return

    tts.load_diffusion_model(diffusion_model_path)

    do_gc()

    return diffusion_model_path
|
||||
|
||||
def update_vocoder_model(vocoder_model):
|
||||
args.vocoder_model = vocoder_model
|
||||
save_args_settings()
|
||||
|
@ -2229,6 +2312,28 @@ def update_vocoder_model(vocoder_model):
|
|||
|
||||
return vocoder_model
|
||||
|
||||
def update_tokenizer(tokenizer_json):
    """Persist the selected tokenizer JSON and load it into the running TTS.

    Args:
        tokenizer_json: Path of the tokenizer JSON to store and load.

    Returns:
        ``tokenizer_json`` once it has been loaded, or ``None`` when no TTS
        instance is loaded (the setting is still stored in that case).

    Raises:
        Exception: if the TTS instance is still initializing.
    """
    global tts

    args.tokenizer_json = tokenizer_json
    save_args_settings()
    print(f'Stored tokenizer to settings: {tokenizer_json}')

    if not tts:
        if tts_loading:
            raise Exception("TTS is still initializing...")
        # Nothing loaded yet; the stored setting is all there is to update.
        return

    if hasattr(tts, "loading") and tts.loading:
        raise Exception("TTS is still initializing...")

    print(f"Loading model: {tokenizer_json}")
    tts.load_tokenizer_json(tokenizer_json)
    print(f"Loaded model: {tts.tokenizer_json}")

    do_gc()

    # Return the value this function was given; the previous code returned an
    # undefined name (copied from the vocoder updater) and raised NameError.
    return tokenizer_json
|
||||
|
||||
def load_voicefixer(restart=False):
|
||||
global voicefixer
|
||||
|
||||
|
|
42
src/webui.py
42
src/webui.py
|
@ -310,7 +310,11 @@ def setup_gradio():
|
|||
voice_list_with_defaults = get_voice_list(append_defaults=True)
|
||||
voice_list = get_voice_list()
|
||||
result_voices = get_voice_list("./results/")
|
||||
|
||||
autoregressive_models = get_autoregressive_models()
|
||||
diffusion_models = get_diffusion_models()
|
||||
tokenizer_jsons = get_tokenizer_jsons()
|
||||
|
||||
dataset_list = get_dataset_list()
|
||||
training_list = get_training_list()
|
||||
|
||||
|
@ -560,17 +564,20 @@ def setup_gradio():
|
|||
EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents)
|
||||
EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load)
|
||||
EXEC_SETTINGS['prune_nonfinal_outputs'] = gr.Checkbox(label="Delete Non-Final Output", value=args.prune_nonfinal_outputs)
|
||||
EXEC_SETTINGS['device_override'] = gr.Textbox(label="Device Override", value=args.device_override)
|
||||
with gr.Column():
|
||||
EXEC_SETTINGS['sample_batch_size'] = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size)
|
||||
EXEC_SETTINGS['concurrency_count'] = gr.Number(label="Gradio Concurrency Count", precision=0, value=args.concurrency_count)
|
||||
EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size)
|
||||
EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume)
|
||||
EXEC_SETTINGS['device_override'] = gr.Textbox(label="Device Override", value=args.device_override)
|
||||
|
||||
with gr.Column():
|
||||
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
|
||||
|
||||
EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
|
||||
EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0])
|
||||
EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
|
||||
EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0])
|
||||
|
||||
EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
|
||||
EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
|
||||
|
@ -585,16 +592,37 @@ def setup_gradio():
|
|||
)
|
||||
# kill_button = gr.Button(value="Close UI")
|
||||
|
||||
def update_model_list_proxy( val ):
|
||||
def update_model_list_proxy( autoregressive, diffusion, tokenizer ):
|
||||
autoregressive_models = get_autoregressive_models()
|
||||
if val not in autoregressive_models:
|
||||
val = autoregressive_models[0]
|
||||
return gr.update( choices=autoregressive_models, value=val )
|
||||
if autoregressive not in autoregressive_models:
|
||||
autoregressive = autoregressive_models[0]
|
||||
|
||||
diffusion_models = get_diffusion_models()
|
||||
if diffusion not in diffusion_models:
|
||||
diffusion = diffusion_models[0]
|
||||
|
||||
tokenizer_jsons = get_tokenizer_jsons()
|
||||
if tokenizer not in tokenizer_jsons:
|
||||
tokenizer = tokenizer_jsons[0]
|
||||
|
||||
return (
|
||||
gr.update( choices=autoregressive_models, value=autoregressive ),
|
||||
gr.update( choices=diffusion_models, value=diffusion ),
|
||||
gr.update( choices=tokenizer_jsons, value=tokenizer ),
|
||||
)
|
||||
|
||||
autoregressive_models_update_button.click(
|
||||
update_model_list_proxy,
|
||||
inputs=EXEC_SETTINGS['autoregressive_model'],
|
||||
outputs=EXEC_SETTINGS['autoregressive_model'],
|
||||
inputs=[
|
||||
EXEC_SETTINGS['autoregressive_model'],
|
||||
EXEC_SETTINGS['diffusion_model'],
|
||||
EXEC_SETTINGS['tokenizer_json'],
|
||||
],
|
||||
outputs=[
|
||||
EXEC_SETTINGS['autoregressive_model'],
|
||||
EXEC_SETTINGS['diffusion_model'],
|
||||
EXEC_SETTINGS['tokenizer_json'],
|
||||
],
|
||||
)
|
||||
|
||||
exec_inputs = list(EXEC_SETTINGS.values())
|
||||
|
|
Loading…
Reference in New Issue
Block a user