added options to pick tokenizer json and diffusion model (so I don't have to add it in later when I get bored and add in diffusion training)

This commit is contained in:
mrq 2023-03-15 00:37:38 +00:00
parent 07b684c4e7
commit 363d0b09b1
4 changed files with 160 additions and 27 deletions

View File

@ -24,7 +24,7 @@ datasets:
num_conditioning_candidates: 2 num_conditioning_candidates: 2
conditioning_length: 44000 conditioning_length: 44000
use_bpe_tokenizer: True use_bpe_tokenizer: True
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
load_aligned_codes: False load_aligned_codes: False
val: val:
name: validation name: validation
@ -41,7 +41,7 @@ datasets:
num_conditioning_candidates: 2 num_conditioning_candidates: 2
conditioning_length: 44000 conditioning_length: 44000
use_bpe_tokenizer: True use_bpe_tokenizer: True
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
load_aligned_codes: False load_aligned_codes: False
steps: steps:

@ -1 +1 @@
Subproject commit 65a43deb9e354342ba805214edf1283b8af6fa90 Subproject commit 42cb1f36741aa3a24e7aab03e73b51becd182fa7

View File

@ -192,7 +192,10 @@ def generate(**kwargs):
'half_p': "Half Precision" in parameters['experimentals'], 'half_p': "Half Precision" in parameters['experimentals'],
'cond_free': "Conditioning-Free" in parameters['experimentals'], 'cond_free': "Conditioning-Free" in parameters['experimentals'],
'cvvp_amount': parameters['cvvp_weight'], 'cvvp_amount': parameters['cvvp_weight'],
'autoregressive_model': args.autoregressive_model, 'autoregressive_model': args.autoregressive_model,
'diffusion_model': args.diffusion_model,
'tokenizer_json': args.tokenizer_json,
} }
# could be better to just do a ternary on everything above, but i am not a professional # could be better to just do a ternary on everything above, but i am not a professional
@ -211,6 +214,14 @@ def generate(**kwargs):
settings['autoregressive_model'] = deduce_autoregressive_model(selected_voice) settings['autoregressive_model'] = deduce_autoregressive_model(selected_voice)
tts.load_autoregressive_model(settings['autoregressive_model']) tts.load_autoregressive_model(settings['autoregressive_model'])
if settings['diffusion_model'] is not None:
if settings['diffusion_model'] == "auto":
settings['diffusion_model'] = deduce_diffusion_model(selected_voice)
tts.load_diffusion_model(settings['diffusion_model'])
if settings['tokenizer_json'] is not None:
tts.load_tokenizer_json(settings['tokenizer_json'])
settings['voice_samples'], settings['conditioning_latents'], _ = fetch_voice(voice=selected_voice) settings['voice_samples'], settings['conditioning_latents'], _ = fetch_voice(voice=selected_voice)
# clamp it down for the insane users who want this # clamp it down for the insane users who want this
@ -1547,6 +1558,7 @@ def save_training_settings( **kwargs ):
settings['validation_batch_size'] = validation_lines settings['validation_batch_size'] = validation_lines
messages.append(f"Batch size exceeds validation dataset size, clamping validation batch size to {validation_lines}") messages.append(f"Batch size exceeds validation dataset size, clamping validation batch size to {validation_lines}")
settings['tokenizer_json'] = args.tokenizer_json
if settings['gpus'] > get_device_count(): if settings['gpus'] > get_device_count():
settings['gpus'] = get_device_count() settings['gpus'] = get_device_count()
@ -1679,6 +1691,9 @@ def import_voices(files, saveAs=None, progress=None):
print(f"Imported voice to {path}") print(f"Imported voice to {path}")
def relative_paths( dirs ):
    """Rewrite every path in `dirs` relative to the current working directory.

    Each result is prefixed with './' and has backslashes replaced by forward
    slashes, so the strings look the same regardless of the separator that
    os.path.relpath emits.
    """
    normalized = []
    for entry in dirs:
        rel = os.path.relpath( entry )
        normalized.append( './' + rel.replace("\\", "/") )
    return normalized
def get_voice_list(dir=get_voice_dir(), append_defaults=False): def get_voice_list(dir=get_voice_dir(), append_defaults=False):
defaults = [ "random", "microphone" ] defaults = [ "random", "microphone" ]
os.makedirs(dir, exist_ok=True) os.makedirs(dir, exist_ok=True)
@ -1687,6 +1702,7 @@ def get_voice_list(dir=get_voice_dir(), append_defaults=False):
res = res + defaults res = res + defaults
return res return res
def get_autoregressive_models(dir="./models/finetunes/", prefixed=False): def get_autoregressive_models(dir="./models/finetunes/", prefixed=False):
os.makedirs(dir, exist_ok=True) os.makedirs(dir, exist_ok=True)
base = [get_model_path('autoregressive.pth')] base = [get_model_path('autoregressive.pth')]
@ -1702,9 +1718,6 @@ def get_autoregressive_models(dir="./models/finetunes/", prefixed=False):
models = sorted([ int(d[:-8]) for d in os.listdir(f'./training/{training}/finetune/models/') if d[-8:] == "_gpt.pth" ]) models = sorted([ int(d[:-8]) for d in os.listdir(f'./training/{training}/finetune/models/') if d[-8:] == "_gpt.pth" ])
found = found + [ f'./training/{training}/finetune/models/{d}_gpt.pth' for d in models ] found = found + [ f'./training/{training}/finetune/models/{d}_gpt.pth' for d in models ]
if len(found) > 0 or len(additionals) > 0:
base = ["auto"] + base
res = base + additionals + found res = base + additionals + found
if prefixed: if prefixed:
@ -1715,7 +1728,27 @@ def get_autoregressive_models(dir="./models/finetunes/", prefixed=False):
res[i] = f'[{shorthash}] {path}' res[i] = f'[{shorthash}] {path}'
return res return ["auto"] + relative_paths(res)
def get_diffusion_models(dir="./models/finetunes/", prefixed=False):
    """List the diffusion model checkpoints offered for selection.

    Only 'diffusion_decoder.pth', as resolved by get_model_path, is returned
    (passed through relative_paths). `dir` and `prefixed` are accepted but are
    not used by this function.
    """
    stock_decoder = get_model_path('diffusion_decoder.pth')
    return relative_paths([ stock_decoder ])
def get_tokenizer_jsons( dir="./models/tokenizers/" ):
    """List the tokenizer JSON files that can be selected.

    The bundled tokenizer.json always comes first, followed by every '*.json'
    file found directly inside `dir`, sorted by path. A missing `dir` simply
    yields no additional entries. All paths are passed through relative_paths.
    """
    additionals = []
    if os.path.isdir(dir):
        # Build the paths from `dir`; the earlier code interpolated an
        # undefined name here and raised NameError once a .json file existed.
        additionals = sorted( os.path.join(dir, d) for d in os.listdir(dir) if d.endswith(".json") )
    return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals)
def tokenize_text( text ):
    """Encode `text` with the tokenizer of the loaded TTS and show the round trip.

    If no TTS instance is loaded yet, load_tts() is called first; if one is
    currently being loaded, an Exception is raised instead.

    Returns a two-line string: the encoded token list, then the text decoded
    back from those tokens with special tokens kept.
    """
    # Imported but not referenced below; left in place to keep behavior as-is.
    from tortoise.utils.tokenizer import VoiceBpeTokenizer

    if not tts:
        if tts_loading:
            raise Exception("TTS is still initializing...")
        load_tts()

    bpe = tts.tokenizer
    token_ids = bpe.encode(text)
    round_trip = bpe.tokenizer.decode(token_ids, skip_special_tokens=False)
    return "\n".join([ str(token_ids), round_trip ])
def get_dataset_list(dir="./training/"): def get_dataset_list(dir="./training/"):
return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ]) return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ])
@ -1834,7 +1867,9 @@ def setup_args():
'tts-backend': TTSES[0], 'tts-backend': TTSES[0],
'autoregressive-model': None, 'autoregressive-model': None,
'diffusion-model': None,
'vocoder-model': VOCODERS[-1], 'vocoder-model': VOCODERS[-1],
'tokenizer-json': None,
'whisper-backend': 'openai/whisper', 'whisper-backend': 'openai/whisper',
'whisper-model': "base", 'whisper-model': "base",
@ -1866,7 +1901,6 @@ def setup_args():
parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)") parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model") parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation") parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation")
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch") parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch")
parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass") parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass")
parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once") parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
@ -1875,7 +1909,12 @@ def setup_args():
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output") parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.") parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.")
parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.") parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
parser.add_argument("--diffusion-model", default=default_arguments['diffusion-model'], help="Specifies which diffusion model to use for sampling.")
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.")
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)") parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.") parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
@ -1935,7 +1974,9 @@ def get_default_settings( hypenated=True ):
'tts-backend': args.tts_backend, 'tts-backend': args.tts_backend,
'autoregressive-model': args.autoregressive_model, 'autoregressive-model': args.autoregressive_model,
'diffusion-model': args.diffusion_model,
'vocoder-model': args.vocoder_model, 'vocoder-model': args.vocoder_model,
'tokenizer-json': args.tokenizer_json,
'whisper-backend': args.whisper_backend, 'whisper-backend': args.whisper_backend,
'whisper-model': args.whisper_model, 'whisper-model': args.whisper_model,
@ -1975,8 +2016,11 @@ def update_args( **kwargs ):
args.output_volume = settings['output_volume'] args.output_volume = settings['output_volume']
args.tts_backend = settings['tts_backend'] args.tts_backend = settings['tts_backend']
args.autoregressive_model = settings['autoregressive_model'] args.autoregressive_model = settings['autoregressive_model']
args.diffusion_model = settings['diffusion_model']
args.vocoder_model = settings['vocoder_model'] args.vocoder_model = settings['vocoder_model']
args.tokenizer_json = settings['tokenizer_json']
args.whisper_backend = settings['whisper_backend'] args.whisper_backend = settings['whisper_backend']
args.whisper_model = settings['whisper_model'] args.whisper_model = settings['whisper_model']
@ -1994,15 +2038,6 @@ def save_args_settings():
with open(f'./config/exec.json', 'w', encoding="utf-8") as f: with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(settings, indent='\t') ) f.write(json.dumps(settings, indent='\t') )
def tokenize_text( text ):
from tortoise.utils.tokenizer import VoiceBpeTokenizer
tokenizer = VoiceBpeTokenizer()
encoded = tokenizer.encode(text)
decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=False)
return "\n".join([ str(encoded), decoded ])
# super kludgy )`; # super kludgy )`;
def import_generate_settings(file = None): def import_generate_settings(file = None):
if not file: if not file:
@ -2099,7 +2134,7 @@ def version_check_tts( min_version ):
return True return True
return False return False
def load_tts( restart=False, autoregressive_model=None ): def load_tts( restart=False, autoregressive_model=None, diffusion_model=None, vocoder_model=None, tokenizer_json=None ):
global args global args
global tts global tts
@ -2114,13 +2149,27 @@ def load_tts( restart=False, autoregressive_model=None ):
if autoregressive_model == "auto": if autoregressive_model == "auto":
autoregressive_model = deduce_autoregressive_model() autoregressive_model = deduce_autoregressive_model()
if diffusion_model:
args.diffusion_model = diffusion_model
else:
diffusion_model = args.diffusion_model
if vocoder_model:
args.vocoder_model = vocoder_model
else:
vocoder_model = args.vocoder_model
if tokenizer_json:
args.tokenizer_json = tokenizer_json
else:
tokenizer_json = args.tokenizer_json
if get_device_name() == "cpu": if get_device_name() == "cpu":
print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.") print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
tts_loading = True tts_loading = True
print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})") print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {vocoder_model})")
tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model) tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, diffusion_model_path=diffusion_model, vocoder_model=vocoder_model, tokenizer_json=tokenizer_json)
tts_loading = False tts_loading = False
get_model_path('dvae.pth') get_model_path('dvae.pth')
@ -2207,6 +2256,40 @@ def update_autoregressive_model(autoregressive_model_path):
return autoregressive_model_path return autoregressive_model_path
def update_diffusion_model(diffusion_model_path):
    """Store the chosen diffusion model in the settings and load it into the TTS.

    `diffusion_model_path` may carry a '[xxxxxxxx] ' eight-hex-digit prefix,
    which is stripped before use.

    Returns the path that was loaded, or None when the path is empty or does
    not exist, when no TTS instance is loaded, or when the requested model is
    already the active one.

    Raises Exception while the TTS is still initializing.
    """
    # Reject None/empty before touching the regex: re.findall raises TypeError
    # on None, which previously made the emptiness check below unreachable.
    if not diffusion_model_path:
        print(f"Invalid model: {diffusion_model_path}")
        return

    match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', diffusion_model_path)
    if match:
        diffusion_model_path = match[0]

    if not os.path.exists(diffusion_model_path):
        print(f"Invalid model: {diffusion_model_path}")
        return

    args.diffusion_model = diffusion_model_path
    save_args_settings()
    print(f'Stored diffusion model to settings: {diffusion_model_path}')

    global tts
    if not tts:
        if tts_loading:
            raise Exception("TTS is still initializing...")
        # Nothing loaded yet; the stored setting is all that is needed here.
        return

    if hasattr(tts, "loading") and tts.loading:
        raise Exception("TTS is still initializing...")

    # NOTE(review): "auto" only gets this far if a file literally named "auto"
    # exists, because of the os.path.exists check above — confirm whether
    # "auto" is meant to bypass that validation.
    if diffusion_model_path == "auto":
        diffusion_model_path = deduce_diffusion_model()

    # Skip the reload when the requested model is already active.
    if diffusion_model_path == tts.diffusion_model_path:
        return

    tts.load_diffusion_model(diffusion_model_path)

    do_gc()

    return diffusion_model_path
def update_vocoder_model(vocoder_model): def update_vocoder_model(vocoder_model):
args.vocoder_model = vocoder_model args.vocoder_model = vocoder_model
save_args_settings() save_args_settings()
@ -2229,6 +2312,28 @@ def update_vocoder_model(vocoder_model):
return vocoder_model return vocoder_model
def update_tokenizer(tokenizer_json):
    """Store the chosen tokenizer JSON in the settings and load it into the TTS.

    Returns `tokenizer_json` once it has been loaded, or None when no TTS
    instance is loaded (the setting is still saved in that case).

    Raises Exception while the TTS is still initializing.
    """
    args.tokenizer_json = tokenizer_json
    save_args_settings()
    print(f'Stored tokenizer to settings: {tokenizer_json}')

    global tts
    if not tts:
        if tts_loading:
            raise Exception("TTS is still initializing...")
        # Nothing loaded yet; the stored setting is all that is needed here.
        return

    if hasattr(tts, "loading") and tts.loading:
        raise Exception("TTS is still initializing...")

    print(f"Loading model: {tokenizer_json}")
    tts.load_tokenizer_json(tokenizer_json)
    print(f"Loaded model: {tts.tokenizer_json}")

    do_gc()

    # Return the tokenizer that was applied; the earlier code returned
    # `vocoder_model`, a name that is not defined in this function.
    return tokenizer_json
def load_voicefixer(restart=False): def load_voicefixer(restart=False):
global voicefixer global voicefixer

View File

@ -310,7 +310,11 @@ def setup_gradio():
voice_list_with_defaults = get_voice_list(append_defaults=True) voice_list_with_defaults = get_voice_list(append_defaults=True)
voice_list = get_voice_list() voice_list = get_voice_list()
result_voices = get_voice_list("./results/") result_voices = get_voice_list("./results/")
autoregressive_models = get_autoregressive_models() autoregressive_models = get_autoregressive_models()
diffusion_models = get_diffusion_models()
tokenizer_jsons = get_tokenizer_jsons()
dataset_list = get_dataset_list() dataset_list = get_dataset_list()
training_list = get_training_list() training_list = get_training_list()
@ -560,17 +564,20 @@ def setup_gradio():
EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents) EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents)
EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load) EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load)
EXEC_SETTINGS['prune_nonfinal_outputs'] = gr.Checkbox(label="Delete Non-Final Output", value=args.prune_nonfinal_outputs) EXEC_SETTINGS['prune_nonfinal_outputs'] = gr.Checkbox(label="Delete Non-Final Output", value=args.prune_nonfinal_outputs)
EXEC_SETTINGS['device_override'] = gr.Textbox(label="Device Override", value=args.device_override)
with gr.Column(): with gr.Column():
EXEC_SETTINGS['sample_batch_size'] = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size) EXEC_SETTINGS['sample_batch_size'] = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size)
EXEC_SETTINGS['concurrency_count'] = gr.Number(label="Gradio Concurrency Count", precision=0, value=args.concurrency_count) EXEC_SETTINGS['concurrency_count'] = gr.Number(label="Gradio Concurrency Count", precision=0, value=args.concurrency_count)
EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size) EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size)
EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume) EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume)
EXEC_SETTINGS['device_override'] = gr.Textbox(label="Device Override", value=args.device_override)
with gr.Column():
# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0]) # EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0]) EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0])
EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1]) EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0])
EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p'] EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes'] EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
@ -585,16 +592,37 @@ def setup_gradio():
) )
# kill_button = gr.Button(value="Close UI") # kill_button = gr.Button(value="Close UI")
def update_model_list_proxy( val ): def update_model_list_proxy( autoregressive, diffusion, tokenizer ):
autoregressive_models = get_autoregressive_models() autoregressive_models = get_autoregressive_models()
if val not in autoregressive_models: if autoregressive not in autoregressive_models:
val = autoregressive_models[0] autoregressive = autoregressive_models[0]
return gr.update( choices=autoregressive_models, value=val )
diffusion_models = get_diffusion_models()
if diffusion not in diffusion_models:
diffusion = diffusion_models[0]
tokenizer_jsons = get_tokenizer_jsons()
if tokenizer not in tokenizer_jsons:
tokenizer = tokenizer_jsons[0]
return (
gr.update( choices=autoregressive_models, value=autoregressive ),
gr.update( choices=diffusion_models, value=diffusion ),
gr.update( choices=tokenizer_jsons, value=tokenizer ),
)
autoregressive_models_update_button.click( autoregressive_models_update_button.click(
update_model_list_proxy, update_model_list_proxy,
inputs=EXEC_SETTINGS['autoregressive_model'], inputs=[
outputs=EXEC_SETTINGS['autoregressive_model'], EXEC_SETTINGS['autoregressive_model'],
EXEC_SETTINGS['diffusion_model'],
EXEC_SETTINGS['tokenizer_json'],
],
outputs=[
EXEC_SETTINGS['autoregressive_model'],
EXEC_SETTINGS['diffusion_model'],
EXEC_SETTINGS['tokenizer_json'],
],
) )
exec_inputs = list(EXEC_SETTINGS.values()) exec_inputs = list(EXEC_SETTINGS.values())