diff --git a/models/.template.yaml b/models/.template.dlas.yaml
similarity index 100%
rename from models/.template.yaml
rename to models/.template.dlas.yaml
diff --git a/models/.template.valle.yaml b/models/.template.valle.yaml
new file mode 100755
index 0000000..58e7acf
--- /dev/null
+++ b/models/.template.valle.yaml
@@ -0,0 +1,9 @@
+data_dirs: [./training/${voice}/valle/]
+spkr_name_getter: "lambda p: p.parts[-3]"
+
+model: ${model_name}
+batch_size: ${batch_size}
+eval_batch_size: ${validation_batch_size}
+eval_every: ${validation_rate}
+
+sampling_temperature: 1.0
\ No newline at end of file
diff --git a/src/utils.py b/src/utils.py
index 445fb2d..d20b6b5 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -20,6 +20,8 @@ import subprocess
 import psutil
 import yaml
 import hashlib
+import io
+import gzip
 
 import tqdm
 import torch
@@ -45,6 +47,7 @@ WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
+TTSES = ['tortoise'] # + ['vall-e']
 
 GENERATE_SETTINGS_ARGS = None
@@ -56,6 +59,16 @@ RESAMPLERS = {}
 
 MIN_TRAINING_DURATION = 0.6
 MAX_TRAINING_DURATION = 11.6097505669
 
+VALLE_ENABLED = False
+
+try:
+	from vall_e.emb.qnt import encode as quantize
+	from vall_e.emb.g2p import encode as phonemize
+
+	VALLE_ENABLED = True
+except Exception as e:
+	pass
+
 args = None
 tts = None
 tts_loading = False
@@ -1195,7 +1208,7 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
 	messages.append(f"Sliced segments: {files} => {segments}.")
 	return "\n".join(messages)
 
-def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=False ):
+def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=True ):
 	indir = f'./training/{voice}/'
 	infile = f'{indir}/whisper.json'
 	messages = []
@@ -1273,6 +1286,8 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
 			continue
 
 		waveform, sample_rate = torchaudio.load(path)
+		num_channels, num_frames = waveform.shape
+		duration = num_frames / sample_rate
 
 		error = validate_waveform( waveform, sample_rate )
 		if error:
@@ -1281,21 +1296,43 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
 			messages.append(message)
 			errored += 1
 			continue
+
 		culled = len(text) < text_length
 		if not culled and audio_length > 0:
-			num_channels, num_frames = waveform.shape
-			duration = num_frames / sample_rate
 			culled = duration < audio_length
 
 		# for when i add in a little treat ;), as it requires normalized text
-		if normalize and length(normalized_text) < 200:
+		if normalize and len(normalized_text) < 200:
 			line = f'audio/{file}|{text}|{normalized_text}'
 		else:
 			line = f'audio/{file}|{text}'
 
 		lines['training' if not culled else 'validation'].append(line)
 
+		if culled or not VALLE_ENABLED:
+			continue
+
+		# VALL-E dataset
+		os.makedirs(f'{indir}/valle/', exist_ok=True)
+
+		try:
+			from vall_e.emb.qnt import encode as quantize
+			from vall_e.emb.g2p import encode as phonemize
+
+			if waveform.shape[0] == 2:
+				waveform = waveform[:1]
+
+			quantized = quantize( waveform, sample_rate ).cpu()
+			torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
+
+			phonemes = phonemize(normalized_text)
+			open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemes))
+		except Exception as e:
+			print(e)
+			pass
+
 	training_joined = "\n".join(lines['training'])
 	validation_joined = "\n".join(lines['validation'])
@@ -1538,21 +1575,27 @@ def save_training_settings( **kwargs ):
 		settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'"
 		settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'"
 
-	with open(f'./models/.template.yaml', 'r', encoding="utf-8") as f:
-		yaml = f.read()
+	def use_template(template, out):
+		with open(template, 'r', encoding="utf-8") as f:
+			yaml = f.read()
 
-	# i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
-	for k in settings:
-		if settings[k] is None:
-			continue
-		yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
+		# i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
+		for k in settings:
+			if settings[k] is None:
+				continue
+			yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
 
-	outyaml = f'./training/{settings["voice"]}/train.yaml'
-	with open(outyaml, 'w', encoding="utf-8") as f:
-		f.write(yaml)
+		with open(out, 'w', encoding="utf-8") as f:
+			f.write(yaml)
 
+	use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml')
 
-	messages.append(f"Saved training output to: {outyaml}")
+	settings['model_name'] = "ar"
+	use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/ar.yaml')
+	settings['model_name'] = "nar"
+	use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/nar.yaml')
+
+	messages.append(f"Saved training output")
 
 	return settings, messages
@@ -1743,17 +1786,22 @@ def setup_args():
 		'latents-lean-and-mean': True,
 		'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
 		'voice-fixer-use-cuda': True,
+		'force-cpu-for-conditioning-latents': False,
 		'defer-tts-load': False,
 		'device-override': None,
 		'prune-nonfinal-outputs': True,
-		'vocoder-model': VOCODERS[-1],
 		'concurrency-count': 2,
-		'autocalculate-voice-chunk-duration-size': 0,
+		'autocalculate-voice-chunk-duration-size': 10,
+		'output-sample-rate': 44100,
 		'output-volume': 1,
+		'tts-backend': TTSES[0],
+		'autoregressive-model': None,
+		'vocoder-model': VOCODERS[-1],
+		'whisper-backend': 'openai/whisper',
 		'whisper-model': "base",
@@ -1792,6 +1840,7 @@ def setup_args():
 	parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
 	parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
+	parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.")
 	parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
 	parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
 	parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
@@ -1828,10 +1877,48 @@ def setup_args():
 
 	return args
 
+def get_default_settings( hypenated=True ):
+	settings = {
+		'listen': None if not args.listen else args.listen,
+		'share': args.share,
+		'low-vram':args.low_vram,
+		'check-for-updates':args.check_for_updates,
+		'models-from-local-only':args.models_from_local_only,
+		'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
+		'defer-tts-load': args.defer_tts_load,
+		'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
+		'device-override': args.device_override,
+		'sample-batch-size': args.sample_batch_size,
+		'embed-output-metadata': args.embed_output_metadata,
+		'latents-lean-and-mean': args.latents_lean_and_mean,
+		'voice-fixer': args.voice_fixer,
+		'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
+		'concurrency-count': args.concurrency_count,
+		'output-sample-rate': args.output_sample_rate,
+		'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
+		'output-volume': args.output_volume,
+
+		'tts-backend': args.tts_backend,
+
+		'autoregressive-model': args.autoregressive_model,
+		'vocoder-model': args.vocoder_model,
+
+		'whisper-backend': args.whisper_backend,
+		'whisper-model': args.whisper_model,
+
+		'training-default-halfp': args.training_default_halfp,
+		'training-default-bnb': args.training_default_bnb,
+	}
+
+	res = {}
+	for k in settings:
+		res[k.replace("-", "_") if not hypenated else k] = settings[k]
+	return res
+
 def update_args( **kwargs ):
 	global args
 
-	settings = {}
+	settings = get_default_settings(hypenated=False)
 	settings.update(kwargs)
 
 	args.listen = settings['listen']
@@ -1853,8 +1940,10 @@ def update_args( **kwargs ):
 	args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
 	args.output_volume = settings['output_volume']
 
+	args.tts_backend = settings['tts_backend']
 	args.autoregressive_model = settings['autoregressive_model']
 	args.vocoder_model = settings['vocoder_model']
+
 	args.whisper_backend = settings['whisper_backend']
 	args.whisper_model = settings['whisper_model']
@@ -1865,34 +1954,7 @@ def update_args( **kwargs ):
 def save_args_settings():
 	global args
 
-	settings = {
-		'listen': None if not args.listen else args.listen,
-		'share': args.share,
-		'low-vram':args.low_vram,
-		'check-for-updates':args.check_for_updates,
-		'models-from-local-only':args.models_from_local_only,
-		'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
-		'defer-tts-load': args.defer_tts_load,
-		'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
-		'device-override': args.device_override,
-		'sample-batch-size': args.sample_batch_size,
-		'embed-output-metadata': args.embed_output_metadata,
-		'latents-lean-and-mean': args.latents_lean_and_mean,
-		'voice-fixer': args.voice_fixer,
-		'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
-		'concurrency-count': args.concurrency_count,
-		'output-sample-rate': args.output_sample_rate,
-		'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
-		'output-volume': args.output_volume,
-
-		'autoregressive-model': args.autoregressive_model,
-		'vocoder-model': args.vocoder_model,
-		'whisper-backend': args.whisper_backend,
-		'whisper-model': args.whisper_model,
-
-		'training-default-halfp': args.training_default_halfp,
-		'training-default-bnb': args.training_default_bnb,
-	}
+	settings = get_default_settings()
 
 	os.makedirs('./config/', exist_ok=True)
 	with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
@@ -2009,18 +2071,13 @@ def load_tts( restart=False, autoregressive_model=None ):
 	if autoregressive_model == "auto":
 		autoregressive_model = deduce_autoregressive_model()
 
-	print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
-
 	if get_device_name() == "cpu":
 		print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
 
 	tts_loading = True
-	try:
-		tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
-	except Exception as e:
-		tts = TextToSpeech(minor_optimizations=not args.low_vram)
-		load_autoregressive_model(autoregressive_model)
-
+	print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
+	tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
 	tts_loading = False
 
 	get_model_path('dvae.pth')
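# A minimal sketch (assuming the vall_e package behaves as the imports in the
# prepare_dataset() hunk above suggest; preprocess_clip and its arguments are
# hypothetical names for illustration) of the per-clip work the new VALL-E branch
# does: discrete audio codes go to <name>.qnt.pt, phonemes of the normalized
# transcript to <name>.phn.txt. The ./training/${voice}/valle/ layout is what
# lets the template's spkr_name_getter, "lambda p: p.parts[-3]", recover the
# voice name from each file's path.
import torch
import torchaudio
from vall_e.emb.qnt import encode as quantize
from vall_e.emb.g2p import encode as phonemize

def preprocess_clip( wav_path, normalized_text, outdir, name ):
	waveform, sample_rate = torchaudio.load(wav_path)
	if waveform.shape[0] == 2: # stereo: keep only the first channel
		waveform = waveform[:1]
	quantized = quantize( waveform, sample_rate ).cpu()
	torch.save(quantized, f'{outdir}/{name}.qnt.pt')
	phonemes = phonemize(normalized_text)
	open(f'{outdir}/{name}.phn.txt', 'w', encoding='utf-8').write(" ".join(phonemes))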
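# A minimal sketch of the ${key} substitution use_template() performs above:
# straight text replacement against the template, so keys set to None keep their
# ${placeholder} and no YAML parsing is involved. render() is a hypothetical
# standalone equivalent with illustrative values.
def render( template_text, settings ):
	for k in settings:
		if settings[k] is None:
			continue
		template_text = template_text.replace(f"${{{k}}}", str(settings[k]))
	return template_text

# Rendering .template.valle.yaml once with model_name "ar" and once with "nar"
# is how the two single-model configs above are produced:
render("model: ${model_name}\nbatch_size: ${batch_size}", { 'model_name': 'ar', 'batch_size': 8 })
# -> 'model: ar\nbatch_size: 8'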
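# A small sketch of the key remapping get_default_settings() performs via its
# flag (spelled "hypenated" in the patch): one dict serves both the exec.json
# written by save_args_settings(), which uses hyphenated keys, and update_args(),
# which expects underscored keys matching the argparse attribute names. remap()
# and its values are illustrative.
def remap( settings, hyphenated=True ):
	return { (k if hyphenated else k.replace("-", "_")): v for k, v in settings.items() }

remap({ 'output-sample-rate': 44100 })                   # {'output-sample-rate': 44100} -> ./config/exec.json
remap({ 'output-sample-rate': 44100 }, hyphenated=False) # {'output_sample_rate': 44100} -> base dict for update_args()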
diff --git a/src/webui.py b/src/webui.py
index a994cc2..3da2505 100755
--- a/src/webui.py
+++ b/src/webui.py
@@ -548,11 +548,11 @@ def setup_gradio():
 				EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size)
 				EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume)
 
+				# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
+
 				EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
-
 				EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
-
 				EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
 				EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
diff --git a/start.sh b/start.sh
index f1f9947..e0ac548 100755
--- a/start.sh
+++ b/start.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+ulimit -Sn `ulimit -Hn` # ROCm is a bitch
 source ./venv/bin/activate
 python3 ./src/main.py "$@"
 deactivate
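# The start.sh change raises the shell's soft open-file limit to the hard limit
# before launching, since ROCm can burn through file descriptors. A sketch of the
# same adjustment from inside Python (POSIX only), should the app be started
# without the wrapper script:
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))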