diff --git a/src/utils.py b/src/utils.py
index 435cd4d..e7cad07 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -1,3958 +1,3962 @@
-import os
-if 'XDG_CACHE_HOME' not in os.environ:
-    os.environ['XDG_CACHE_HOME'] = os.path.realpath(os.path.join(os.getcwd(), './models/'))
-
-if 'TORTOISE_MODELS_DIR' not in os.environ:
-    os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/'))
-
-if 'TRANSFORMERS_CACHE' not in os.environ:
-    os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/'))
-
-import argparse
-import time
-import math
-import json
-import base64
-import re
-import urllib.request
-import signal
-import gc
-import subprocess
-import psutil
-import yaml
-import hashlib
-import string
-import random
-
-from tqdm import tqdm
-import torch
-import torchaudio
-import music_tag
-import gradio as gr
-import gradio.utils
-import pandas as pd
-import numpy as np
-
-from glob import glob
-from datetime import datetime
-from datetime import timedelta
-
-from tortoise.api import TextToSpeech as TorToise_TTS, MODELS, get_model_path, pad_or_truncate
-from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices
-from tortoise.utils.text import split_and_recombine_text
-from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
-
-
-MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
-
-WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
-WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
-WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
-VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
-TTSES = ['tortoise']
-
-INFERENCING = False
-GENERATE_SETTINGS_ARGS = None
-
-LEARNING_RATE_SCHEMES = {"Multistep": "MultiStepLR", "Cos. Annealing": "CosineAnnealingLR_Restart"}
-LEARNING_RATE_SCHEDULE = [ 2, 4, 9, 18, 25, 33, 50 ]
-
-RESAMPLERS = {}
-
-MIN_TRAINING_DURATION = 0.6
-MAX_TRAINING_DURATION = 11.6097505669
-MAX_TRAINING_CHAR_LENGTH = 200
-
-VALLE_ENABLED = False
-BARK_ENABLED = False
-
-VERBOSE_DEBUG = True
-
-import traceback
-
-try:
-    from whisper.normalizers.english import EnglishTextNormalizer
-    from whisper.normalizers.basic import BasicTextNormalizer
-    from whisper.tokenizer import LANGUAGES
-
-    print("Whisper detected")
-except Exception as e:
-    if VERBOSE_DEBUG:
-        print(traceback.format_exc())
-    pass
-
-try:
-    from vall_e.emb.qnt import encode as valle_quantize
-    from vall_e.emb.g2p import encode as valle_phonemize
-
-    from vall_e.inference import TTS as VALLE_TTS
-
-    import soundfile
-
-    print("VALL-E detected")
-    VALLE_ENABLED = True
-except Exception as e:
-    if VERBOSE_DEBUG:
-        print(traceback.format_exc())
-    pass
-
-if VALLE_ENABLED:
-    TTSES.append('vall-e')
-
-# torchaudio.set_audio_backend('soundfile')
-
-try:
-    import bark
-    from bark import text_to_semantic
-    from bark.generation import SAMPLE_RATE as BARK_SAMPLE_RATE, ALLOWED_PROMPTS, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic, load_codec_model
-    from bark.api import generate_audio as bark_generate_audio
-    from encodec.utils import convert_audio
-
-    from scipy.io.wavfile import write as write_wav
-
-    print("Bark detected")
-    BARK_ENABLED = True
-except Exception as e:
-    if VERBOSE_DEBUG:
-        print(traceback.format_exc())
-    pass
-
-if BARK_ENABLED:
-    TTSES.append('bark')
-
-    def semantic_to_audio_tokens(
-        semantic_tokens,
-        history_prompt = None,
-        temp = 0.7,
-        silent = False,
-        output_full = False,
-    ):
-        coarse_tokens = generate_coarse(
-            semantic_tokens, history_prompt=history_prompt, temp=temp, silent=silent, use_kv_caching=True
-        )
-        fine_tokens = generate_fine(coarse_tokens, history_prompt=history_prompt, temp=0.5)
-
-        if output_full:
-            full_generation = {
-                "semantic_prompt": semantic_tokens,
-                "coarse_prompt": coarse_tokens,
-                "fine_prompt": fine_tokens,
-            }
-            return full_generation
-        return fine_tokens
-
-    class Bark_TTS():
-        def __init__(self, small=False):
-            self.input_sample_rate = BARK_SAMPLE_RATE
-            self.output_sample_rate = BARK_SAMPLE_RATE # args.output_sample_rate
-
-            preload_models(
-                text_use_gpu=True,
-                coarse_use_gpu=True,
-                fine_use_gpu=True,
-                codec_use_gpu=True,
-
-                text_use_small=small,
-                coarse_use_small=small,
-                fine_use_small=small,
-
-                force_reload=False
-            )
-
-            self.device = get_device_name()
-
-            try:
-                from vocos import Vocos
-                self.vocos_enabled = True
-                print("Vocos detected")
-            except Exception as e:
-                if VERBOSE_DEBUG:
-                    print(traceback.format_exc())
-                self.vocos_enabled = False
-
-            try:
-                from hubert.hubert_manager import HuBERTManager
-
-                hubert_manager = HuBERTManager()
-                hubert_manager.make_sure_hubert_installed()
-                hubert_manager.make_sure_tokenizer_installed()
-
-                self.hubert_enabled = True
-                print("HuBERT detected")
-            except Exception as e:
-                if VERBOSE_DEBUG:
-                    print(traceback.format_exc())
-                self.hubert_enabled = False
-
-            if self.vocos_enabled:
-                self.vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(self.device)
-
-        def create_voice( self, voice ):
-            transcription_json = f'./training/{voice}/whisper.json'
-            if not os.path.exists(transcription_json):
-                raise Exception(f"Transcription for voice not found: {voice}")
-
-            transcriptions = json.load(open(transcription_json, 'r', encoding="utf-8"))
-            candidates = []
-            for file in transcriptions:
-                result = transcriptions[file]
-                added = 0
-
-                for segment in result['segments']:
-                    path = file.replace(".wav", f"_{pad(segment['id'], 4)}.wav")
-                    # check if the slice actually exists
-                    if not os.path.exists(f'./training/{voice}/audio/{path}'):
-                        continue
-
-                    entry = (
-                        path,
-                        segment['end'] - segment['start'],
-                        segment['text']
-                    )
-                    candidates.append(entry)
-                    added = added + 1
-
-                # if nothing got added (assuming because nothing was sliced), use the master file
-                if added == 0: # added < len(result['segments']):
-                    start = 0
-                    end = 0
-                    for segment in result['segments']:
-                        start = max( start, segment['start'] )
-                        end = max( end, segment['end'] )
-
-                    entry = (
-                        file,
-                        end - start,
-                        result['text']
-                    )
-                    candidates.append(entry)
-
-            candidates.sort(key=lambda x: x[1])
-            candidate = random.choice(candidates)
-            audio_filepath = f'./training/{voice}/audio/{candidate[0]}'
-            text = candidate[-1]
-
-            print("Using as reference:", audio_filepath, text)
-
-            # Load and pre-process the audio waveform
-            model = load_codec_model(use_gpu=True)
-            wav, sr = torchaudio.load(audio_filepath)
-            wav = convert_audio(wav, sr, model.sample_rate, model.channels)
-
-            # generate semantic tokens
-
-            if self.hubert_enabled:
-                from hubert.pre_kmeans_hubert import CustomHubert
-                from hubert.customtokenizer import CustomTokenizer
-
-                wav = wav.to(self.device)
-
-                # Extract discrete codes from EnCodec
-                with torch.no_grad():
-                    encoded_frames = model.encode(wav.unsqueeze(0))
-                    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
-
-                # get seconds of audio
-                seconds = wav.shape[-1] / model.sample_rate
-
-                # Load the HuBERT model
-                hubert_model = CustomHubert(checkpoint_path='./data/models/hubert/hubert.pt').to(self.device)
-
-                # Load the CustomTokenizer model
-                tokenizer = CustomTokenizer.load_from_checkpoint('./data/models/hubert/tokenizer.pth').to(self.device)
-
-                semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
-                semantic_tokens = tokenizer.get_token(semantic_vectors)
-
-                # move codes to cpu
-                codes = codes.cpu().numpy()
-                # move semantic tokens to cpu
-                semantic_tokens = semantic_tokens.cpu().numpy()
-            else:
-                wav = wav.unsqueeze(0).to(self.device)
-
-                # Extract discrete codes from EnCodec
-                with torch.no_grad():
-                    encoded_frames = model.encode(wav)
-                    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze().cpu().numpy() # [n_q, T]
-
-                # get seconds of audio
-                seconds = wav.shape[-1] / model.sample_rate
-
-                # generate semantic tokens
-                semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)
-
-            # print(bark.__file__)
-            bark_location = os.path.dirname(os.path.relpath(bark.__file__)) # './modules/bark/bark/'
-            output_path = f'./{bark_location}/assets/prompts/' + voice.replace("/", "_") + '.npz'
-            np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
-
-        def inference( self, text, voice, text_temp=0.7, waveform_temp=0.7 ):
-            if voice == "random":
-                voice = None
-            else:
-                if not os.path.exists('./modules/bark/bark/assets/prompts/' + voice + '.npz'):
-                    self.create_voice( voice )
-                voice = voice.replace("/", "_")
-                if voice not in ALLOWED_PROMPTS:
-                    ALLOWED_PROMPTS.add( voice )
-
-            semantic_tokens = text_to_semantic(text, history_prompt=voice, temp=text_temp, silent=False)
-            audio_tokens = semantic_to_audio_tokens( semantic_tokens, history_prompt=voice, temp=waveform_temp, silent=False, output_full=False )
-
-            if self.vocos_enabled: - 
audio_tokens_torch = torch.from_numpy(audio_tokens).to(self.device) - features = self.vocos.codes_to_features(audio_tokens_torch) - wav = self.vocos.decode(features, bandwidth_id=torch.tensor([2], device=self.device)) - else: - wav = codec_decode( audio_tokens ) - - return ( wav, BARK_SAMPLE_RATE ) - # return (bark_generate_audio(text, history_prompt=voice, text_temp=text_temp, waveform_temp=waveform_temp), BARK_SAMPLE_RATE) - -args = None -tts = None -tts_loading = False -webui = None -voicefixer = None - -whisper_model = None -whisper_align_model = None - -training_state = None - -current_voice = None - -def cleanup_voice_name( name ): - return name.split("/")[-1] - -def resample( waveform, input_rate, output_rate=44100 ): - # mono-ize - waveform = torch.mean(waveform, dim=0, keepdim=True) - - if input_rate == output_rate: - return waveform, output_rate - - key = f'{input_rate}:{output_rate}' - if not key in RESAMPLERS: - RESAMPLERS[key] = torchaudio.transforms.Resample( - input_rate, - output_rate, - lowpass_filter_width=16, - rolloff=0.85, - resampling_method="kaiser_window", - beta=8.555504641634386, - ) - - return RESAMPLERS[key]( waveform ), output_rate - -def generate(**kwargs): - if args.tts_backend == "tortoise": - return generate_tortoise(**kwargs) - if args.tts_backend == "vall-e": - return generate_valle(**kwargs) - if args.tts_backend == "bark": - return generate_bark(**kwargs) - -def generate_bark(**kwargs): - parameters = {} - parameters.update(kwargs) - - voice = parameters['voice'] - progress = parameters['progress'] if 'progress' in parameters else None - if parameters['seed'] == 0: - parameters['seed'] = None - - usedSeed = parameters['seed'] - - global args - global tts - - unload_whisper() - unload_voicefixer() - - if not tts: - # should check if it's loading or unloaded, and load it if it's unloaded - if tts_loading: - raise Exception("TTS is still initializing...") - if progress is not None: - notify_progress("Initializing TTS...", progress=progress) - load_tts() - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - do_gc() - - voice_samples = None - conditioning_latents = None - sample_voice = None - - voice_cache = {} - - def get_settings( override=None ): - settings = { - 'voice': parameters['voice'], - 'text_temp': float(parameters['temperature']), - 'waveform_temp': float(parameters['temperature']), - } - - # could be better to just do a ternary on everything above, but i am not a professional - selected_voice = voice - if override is not None: - if 'voice' in override: - selected_voice = override['voice'] - - for k in override: - if k not in settings: - continue - settings[k] = override[k] - - return settings - - if not parameters['delimiter']: - parameters['delimiter'] = "\n" - elif parameters['delimiter'] == "\\n": - parameters['delimiter'] = "\n" - - if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: - texts = parameters['text'].split(parameters['delimiter']) - else: - texts = split_and_recombine_text(parameters['text']) - - full_start_time = time.time() - - outdir = f"{args.results_folder}/{voice}/" - os.makedirs(outdir, exist_ok=True) - - audio_cache = {} - - volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None - - idx = 0 - idx_cache = {} - for i, file in enumerate(os.listdir(outdir)): - filename = os.path.basename(file) - extension = os.path.splitext(filename)[-1][1:] - if 
extension != "json" and extension != "wav": - continue - match = re.findall(rf"^{cleanup_voice_name(voice)}_(\d+)(?:.+?)?{extension}$", filename) - if match and len(match) > 0: - key = int(match[0]) - idx_cache[key] = True - - if len(idx_cache) > 0: - keys = sorted(list(idx_cache.keys())) - idx = keys[-1] + 1 - - idx = pad(idx, 4) - - def get_name(line=0, candidate=0, combined=False): - name = f"{idx}" - if combined: - name = f"{name}_combined" - elif len(texts) > 1: - name = f"{name}_{line}" - if parameters['candidates'] > 1: - name = f"{name}_{candidate}" - return name - - def get_info( voice, settings = None, latents = True ): - info = {} - info.update(parameters) - - info['time'] = time.time()-full_start_time - info['datetime'] = datetime.now().isoformat() - - info['progress'] = None - del info['progress'] - - if info['delimiter'] == "\n": - info['delimiter'] = "\\n" - - if settings is not None: - for k in settings: - if k in info: - info[k] = settings[k] - return info - - INFERENCING = True - for line, cut_text in enumerate(texts): - tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' - print(f"{tqdm_prefix} Generating line: {cut_text}") - start_time = time.time() - - # do setting editing - match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) - override = None - if match and len(match) > 0: - match = match[0] - try: - override = json.loads(match[0]) - cut_text = match[1].strip() - except Exception as e: - raise Exception("Prompt settings editing requested, but received invalid JSON") - - settings = get_settings( override=override ) - - gen = tts.inference(cut_text, **settings ) - - run_time = time.time()-start_time - print(f"Generating line took {run_time} seconds") - - if not isinstance(gen, list): - gen = [gen] - - for j, g in enumerate(gen): - wav, sr = g - name = get_name(line=line, candidate=j) - - settings['text'] = cut_text - settings['time'] = run_time - settings['datetime'] = datetime.now().isoformat() - - # save here in case some error happens mid-batch - if tts.vocos_enabled: - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu(), sr) - else: - write_wav(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', sr, wav) - wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - - audio_cache[name] = { - 'audio': wav, - 'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) - } - - del gen - do_gc() - INFERENCING = False - - for k in audio_cache: - audio = audio_cache[k]['audio'] - - audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) - if volume_adjust is not None: - audio = volume_adjust(audio) - - audio_cache[k]['audio'] = audio - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) - - output_voices = [] - for candidate in range(parameters['candidates']): - if len(texts) > 1: - audio_clips = [] - for line in range(len(texts)): - name = get_name(line=line, candidate=candidate) - audio = audio_cache[name]['audio'] - audio_clips.append(audio) - - name = get_name(candidate=candidate, combined=True) - audio = torch.cat(audio_clips, dim=-1) - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) - - audio = audio.squeeze(0).cpu() - audio_cache[name] = { - 'audio': audio, - 'settings': get_info(voice=voice), - 'output': True - } - else: - try: - name = get_name(candidate=candidate) - audio_cache[name]['output'] = True - except Exception as e: - for name in audio_cache: - 
audio_cache[name]['output'] = True - - - if args.voice_fixer: - if not voicefixer: - notify_progress("Loading voicefix...", progress=progress) - load_voicefixer() - - try: - fixed_cache = {} - for name in tqdm(audio_cache, desc="Running voicefix..."): - del audio_cache[name]['audio'] - if 'output' not in audio_cache[name] or not audio_cache[name]['output']: - continue - - path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' - fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' - voicefixer.restore( - input=path, - output=fixed, - cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, - #mode=mode, - ) - - fixed_cache[f'{name}_fixed'] = { - 'settings': audio_cache[name]['settings'], - 'output': True - } - audio_cache[name]['output'] = False - - for name in fixed_cache: - audio_cache[name] = fixed_cache[name] - except Exception as e: - print(e) - print("\nFailed to run Voicefixer") - - for name in audio_cache: - if 'output' not in audio_cache[name] or not audio_cache[name]['output']: - if args.prune_nonfinal_outputs: - audio_cache[name]['pruned'] = True - os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - continue - - output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - - if not args.embed_output_metadata: - with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) - - if args.embed_output_metadata: - for name in tqdm(audio_cache, desc="Embedding metadata..."): - if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: - continue - - metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") - metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) - metadata.save() - - if sample_voice is not None: - sample_voice = (tts.input_sample_rate, sample_voice.numpy()) - - info = get_info(voice=voice, latents=False) - print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") - - info['seed'] = usedSeed - if 'latents' in info: - del info['latents'] - - os.makedirs('./config/', exist_ok=True) - with open(f'./config/generate.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(info, indent='\t') ) - - stats = [ - [ parameters['seed'], "{:.3f}".format(info['time']) ] - ] - - return ( - sample_voice, - output_voices, - stats, - ) - -def generate_valle(**kwargs): - parameters = {} - parameters.update(kwargs) - - voice = parameters['voice'] - progress = parameters['progress'] if 'progress' in parameters else None - if parameters['seed'] == 0: - parameters['seed'] = None - - usedSeed = parameters['seed'] - - global args - global tts - - unload_whisper() - unload_voicefixer() - - if not tts: - # should check if it's loading or unloaded, and load it if it's unloaded - if tts_loading: - raise Exception("TTS is still initializing...") - if progress is not None: - notify_progress("Initializing TTS...", progress=progress) - load_tts() - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - do_gc() - - voice_samples = None - conditioning_latents = None - sample_voice = None - - voice_cache = {} - def fetch_voice( voice ): - if voice in voice_cache: - return voice_cache[voice] - - """ - voice_dir = f'./training/{voice}/audio/' - - if not os.path.isdir(voice_dir) or len(os.listdir(voice_dir)) == 0: - voice_dir = f'./voices/{voice}/' - - files = [ f'{voice_dir}/{d}' for d in os.listdir(voice_dir) if d[-4:] == ".wav" ] - """ - - if 
os.path.isdir(f'./training/{voice}/audio/'): - files = get_voice(name="audio", dir=f"./training/{voice}/", load_latents=False) - else: - files = get_voice(name=voice, load_latents=False) - - # return files - voice_cache[voice] = random.sample(files, k=min(3, len(files))) - return voice_cache[voice] - - def get_settings( override=None ): - settings = { - 'ar_temp': float(parameters['temperature']), - 'nar_temp': float(parameters['temperature']), - 'max_ar_steps': parameters['num_autoregressive_samples'], - } - - # could be better to just do a ternary on everything above, but i am not a professional - selected_voice = voice - if override is not None: - if 'voice' in override: - selected_voice = override['voice'] - - for k in override: - if k not in settings: - continue - settings[k] = override[k] - - settings['references'] = fetch_voice(voice=selected_voice) # [ fetch_voice(voice=selected_voice) for _ in range(3) ] - return settings - - if not parameters['delimiter']: - parameters['delimiter'] = "\n" - elif parameters['delimiter'] == "\\n": - parameters['delimiter'] = "\n" - - if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: - texts = parameters['text'].split(parameters['delimiter']) - else: - texts = split_and_recombine_text(parameters['text']) - - full_start_time = time.time() - - outdir = f"{args.results_folder}/{voice}/" - os.makedirs(outdir, exist_ok=True) - - audio_cache = {} - - volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None - - idx = 0 - idx_cache = {} - for i, file in enumerate(os.listdir(outdir)): - filename = os.path.basename(file) - extension = os.path.splitext(filename)[-1][1:] - if extension != "json" and extension != "wav": - continue - match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename) - if match and len(match) > 0: - key = int(match[0]) - idx_cache[key] = True - - if len(idx_cache) > 0: - keys = sorted(list(idx_cache.keys())) - idx = keys[-1] + 1 - - idx = pad(idx, 4) - - def get_name(line=0, candidate=0, combined=False): - name = f"{idx}" - if combined: - name = f"{name}_combined" - elif len(texts) > 1: - name = f"{name}_{line}" - if parameters['candidates'] > 1: - name = f"{name}_{candidate}" - return name - - def get_info( voice, settings = None, latents = True ): - info = {} - info.update(parameters) - - info['time'] = time.time()-full_start_time - info['datetime'] = datetime.now().isoformat() - - info['progress'] = None - del info['progress'] - - if info['delimiter'] == "\n": - info['delimiter'] = "\\n" - - if settings is not None: - for k in settings: - if k in info: - info[k] = settings[k] - return info - - INFERENCING = True - for line, cut_text in enumerate(texts): - tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' - print(f"{tqdm_prefix} Generating line: {cut_text}") - start_time = time.time() - - # do setting editing - match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) - override = None - if match and len(match) > 0: - match = match[0] - try: - override = json.loads(match[0]) - cut_text = match[1].strip() - except Exception as e: - raise Exception("Prompt settings editing requested, but received invalid JSON") - - name = get_name(line=line, candidate=0) - - settings = get_settings( override=override ) - references = settings['references'] - settings.pop("references") - settings['out_path'] = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' - - gen = tts.inference(cut_text, references, **settings ) - - 
run_time = time.time()-start_time - print(f"Generating line took {run_time} seconds") - - if not isinstance(gen, list): - gen = [gen] - - for j, g in enumerate(gen): - wav, sr = g - name = get_name(line=line, candidate=j) - - settings['text'] = cut_text - settings['time'] = run_time - settings['datetime'] = datetime.now().isoformat() - - # save here in case some error happens mid-batch - #torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu(), sr) - #soundfile.write(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu()[0,0], sr) - wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - - audio_cache[name] = { - 'audio': wav, - 'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) - } - - del gen - do_gc() - INFERENCING = False - - for k in audio_cache: - audio = audio_cache[k]['audio'] - - audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) - if volume_adjust is not None: - audio = volume_adjust(audio) - - audio_cache[k]['audio'] = audio - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) - - output_voices = [] - for candidate in range(parameters['candidates']): - if len(texts) > 1: - audio_clips = [] - for line in range(len(texts)): - name = get_name(line=line, candidate=candidate) - audio = audio_cache[name]['audio'] - audio_clips.append(audio) - - name = get_name(candidate=candidate, combined=True) - audio = torch.cat(audio_clips, dim=-1) - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) - - audio = audio.squeeze(0).cpu() - audio_cache[name] = { - 'audio': audio, - 'settings': get_info(voice=voice), - 'output': True - } - else: - name = get_name(candidate=candidate) - audio_cache[name]['output'] = True - - - if args.voice_fixer: - if not voicefixer: - notify_progress("Loading voicefix...", progress=progress) - load_voicefixer() - - try: - fixed_cache = {} - for name in tqdm(audio_cache, desc="Running voicefix..."): - del audio_cache[name]['audio'] - if 'output' not in audio_cache[name] or not audio_cache[name]['output']: - continue - - path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' - fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' - voicefixer.restore( - input=path, - output=fixed, - cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, - #mode=mode, - ) - - fixed_cache[f'{name}_fixed'] = { - 'settings': audio_cache[name]['settings'], - 'output': True - } - audio_cache[name]['output'] = False - - for name in fixed_cache: - audio_cache[name] = fixed_cache[name] - except Exception as e: - print(e) - print("\nFailed to run Voicefixer") - - for name in audio_cache: - if 'output' not in audio_cache[name] or not audio_cache[name]['output']: - if args.prune_nonfinal_outputs: - audio_cache[name]['pruned'] = True - os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - continue - - output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - - if not args.embed_output_metadata: - with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) - - if args.embed_output_metadata: - for name in tqdm(audio_cache, desc="Embedding metadata..."): - if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: - continue - - metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") - 
metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) - metadata.save() - - if sample_voice is not None: - sample_voice = (tts.input_sample_rate, sample_voice.numpy()) - - info = get_info(voice=voice, latents=False) - print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") - - info['seed'] = usedSeed - if 'latents' in info: - del info['latents'] - - os.makedirs('./config/', exist_ok=True) - with open(f'./config/generate.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(info, indent='\t') ) - - stats = [ - [ parameters['seed'], "{:.3f}".format(info['time']) ] - ] - - return ( - sample_voice, - output_voices, - stats, - ) - -def generate_tortoise(**kwargs): - parameters = {} - parameters.update(kwargs) - - voice = parameters['voice'] - progress = parameters['progress'] if 'progress' in parameters else None - if parameters['seed'] == 0: - parameters['seed'] = None - - usedSeed = parameters['seed'] - - global args - global tts - - unload_whisper() - unload_voicefixer() - - if not tts: - # should check if it's loading or unloaded, and load it if it's unloaded - if tts_loading: - raise Exception("TTS is still initializing...") - load_tts() - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - do_gc() - - voice_samples = None - conditioning_latents = None - sample_voice = None - - voice_cache = {} - def fetch_voice( voice ): - cache_key = f'{voice}:{tts.autoregressive_model_hash[:8]}' - if cache_key in voice_cache: - return voice_cache[cache_key] - - print(f"Loading voice: {voice} with model {tts.autoregressive_model_hash[:8]}") - sample_voice = None - if voice == "microphone": - if parameters['mic_audio'] is None: - raise Exception("Please provide audio from mic when choosing `microphone` as a voice input") - voice_samples, conditioning_latents = [load_audio(parameters['mic_audio'], tts.input_sample_rate)], None - elif voice == "random": - voice_samples, conditioning_latents = None, tts.get_random_conditioning_latents() - else: - if progress is not None: - notify_progress(f"Loading voice: {voice}", progress=progress) - - voice_samples, conditioning_latents = load_voice(voice, model_hash=tts.autoregressive_model_hash) - - if voice_samples and len(voice_samples) > 0: - if conditioning_latents is None: - conditioning_latents = compute_latents(voice=voice, voice_samples=voice_samples, voice_latents_chunks=parameters['voice_latents_chunks']) - - sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu() - voice_samples = None - - voice_cache[cache_key] = (voice_samples, conditioning_latents, sample_voice) - return voice_cache[cache_key] - - def get_settings( override=None ): - settings = { - 'temperature': float(parameters['temperature']), - - 'top_p': float(parameters['top_p']), - 'diffusion_temperature': float(parameters['diffusion_temperature']), - 'length_penalty': float(parameters['length_penalty']), - 'repetition_penalty': float(parameters['repetition_penalty']), - 'cond_free_k': float(parameters['cond_free_k']), - - 'num_autoregressive_samples': parameters['num_autoregressive_samples'], - 'sample_batch_size': args.sample_batch_size, - 'diffusion_iterations': parameters['diffusion_iterations'], - - 'voice_samples': None, - 'conditioning_latents': None, - - 'use_deterministic_seed': parameters['seed'], - 'return_deterministic_state': True, - 'k': parameters['candidates'], - 'diffusion_sampler': parameters['diffusion_sampler'], - 'breathing_room': parameters['breathing_room'], - 'half_p': "Half Precision" 
in parameters['experimentals'], - 'cond_free': "Conditioning-Free" in parameters['experimentals'], - 'cvvp_amount': parameters['cvvp_weight'], - - 'autoregressive_model': args.autoregressive_model, - 'diffusion_model': args.diffusion_model, - 'tokenizer_json': args.tokenizer_json, - } - - # could be better to just do a ternary on everything above, but i am not a professional - selected_voice = voice - if override is not None: - if 'voice' in override: - selected_voice = override['voice'] - - for k in override: - if k not in settings: - continue - settings[k] = override[k] - - if settings['autoregressive_model'] is not None: - if settings['autoregressive_model'] == "auto": - settings['autoregressive_model'] = deduce_autoregressive_model(selected_voice) - tts.load_autoregressive_model(settings['autoregressive_model']) - - if settings['diffusion_model'] is not None: - if settings['diffusion_model'] == "auto": - settings['diffusion_model'] = deduce_diffusion_model(selected_voice) - tts.load_diffusion_model(settings['diffusion_model']) - - if settings['tokenizer_json'] is not None: - tts.load_tokenizer_json(settings['tokenizer_json']) - - settings['voice_samples'], settings['conditioning_latents'], _ = fetch_voice(voice=selected_voice) - - # clamp it down for the insane users who want this - # it would be wiser to enforce the sample size to the batch size, but this is what the user wants - settings['sample_batch_size'] = args.sample_batch_size - if not settings['sample_batch_size']: - settings['sample_batch_size'] = tts.autoregressive_batch_size - if settings['num_autoregressive_samples'] < settings['sample_batch_size']: - settings['sample_batch_size'] = settings['num_autoregressive_samples'] - - if settings['conditioning_latents'] is not None and len(settings['conditioning_latents']) == 2 and settings['cvvp_amount'] > 0: - print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. 
Please regenerate your voice latents with 'Slimmer voice latents' unchecked.") - settings['cvvp_amount'] = 0 - - return settings - - if not parameters['delimiter']: - parameters['delimiter'] = "\n" - elif parameters['delimiter'] == "\\n": - parameters['delimiter'] = "\n" - - if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: - texts = parameters['text'].split(parameters['delimiter']) - else: - texts = split_and_recombine_text(parameters['text']) - - full_start_time = time.time() - - outdir = f"{args.results_folder}/{voice}/" - os.makedirs(outdir, exist_ok=True) - - audio_cache = {} - - volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None - - idx = 0 - idx_cache = {} - for i, file in enumerate(os.listdir(outdir)): - filename = os.path.basename(file) - extension = os.path.splitext(filename)[-1][1:] - if extension != "json" and extension != "wav": - continue - match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename) - if match and len(match) > 0: - key = int(match[0]) - idx_cache[key] = True - - if len(idx_cache) > 0: - keys = sorted(list(idx_cache.keys())) - idx = keys[-1] + 1 - - idx = pad(idx, 4) - - def get_name(line=0, candidate=0, combined=False): - name = f"{idx}" - if combined: - name = f"{name}_combined" - elif len(texts) > 1: - name = f"{name}_{line}" - if parameters['candidates'] > 1: - name = f"{name}_{candidate}" - return name - - def get_info( voice, settings = None, latents = True ): - info = {} - info.update(parameters) - - info['time'] = time.time()-full_start_time - info['datetime'] = datetime.now().isoformat() - - info['model'] = tts.autoregressive_model_path - info['model_hash'] = tts.autoregressive_model_hash - - info['progress'] = None - del info['progress'] - - if info['delimiter'] == "\n": - info['delimiter'] = "\\n" - - if settings is not None: - for k in settings: - if k in info: - info[k] = settings[k] - - if 'half_p' in settings and 'cond_free' in settings: - info['experimentals'] = [] - if settings['half_p']: - info['experimentals'].append("Half Precision") - if settings['cond_free']: - info['experimentals'].append("Conditioning-Free") - - if latents and "latents" not in info: - voice = info['voice'] - model_hash = settings["model_hash"][:8] if settings is not None and "model_hash" in settings else tts.autoregressive_model_hash[:8] - - dir = f'{get_voice_dir()}/{voice}/' - latents_path = f'{dir}/cond_latents_{model_hash}.pth' - - if voice == "random" or voice == "microphone": - if latents and settings is not None and settings['conditioning_latents']: - os.makedirs(dir, exist_ok=True) - torch.save(conditioning_latents, latents_path) - - if latents_path and os.path.exists(latents_path): - try: - with open(latents_path, 'rb') as f: - info['latents'] = base64.b64encode(f.read()).decode("ascii") - except Exception as e: - pass - - return info - - INFERENCING = True - for line, cut_text in enumerate(texts): - if should_phonemize(): - cut_text = phonemizer( cut_text ) - - if parameters['emotion'] == "Custom": - if parameters['prompt'] and parameters['prompt'].strip() != "": - cut_text = f"[{parameters['prompt']},] {cut_text}" - elif parameters['emotion'] != "None" and parameters['emotion']: - cut_text = f"[I am really {parameters['emotion'].lower()},] {cut_text}" - - tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' - print(f"{tqdm_prefix} Generating line: {cut_text}") - start_time = time.time() - - # do setting editing - 
match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) - override = None - if match and len(match) > 0: - match = match[0] - try: - override = json.loads(match[0]) - cut_text = match[1].strip() - except Exception as e: - raise Exception("Prompt settings editing requested, but received invalid JSON") - - settings = get_settings( override=override ) - gen, additionals = tts.tts(cut_text, **settings ) - - parameters['seed'] = additionals[0] - run_time = time.time()-start_time - print(f"Generating line took {run_time} seconds") - - if not isinstance(gen, list): - gen = [gen] - - for j, g in enumerate(gen): - audio = g.squeeze(0).cpu() - name = get_name(line=line, candidate=j) - - settings['text'] = cut_text - settings['time'] = run_time - settings['datetime'] = datetime.now().isoformat() - if args.tts_backend == "tortoise": - settings['model'] = tts.autoregressive_model_path - settings['model_hash'] = tts.autoregressive_model_hash - - audio_cache[name] = { - 'audio': audio, - 'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) - } - # save here in case some error happens mid-batch - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, tts.output_sample_rate) - - del gen - do_gc() - INFERENCING = False - - for k in audio_cache: - audio = audio_cache[k]['audio'] - - audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) - if volume_adjust is not None: - audio = volume_adjust(audio) - - audio_cache[k]['audio'] = audio - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) - - output_voices = [] - for candidate in range(parameters['candidates']): - if len(texts) > 1: - audio_clips = [] - for line in range(len(texts)): - name = get_name(line=line, candidate=candidate) - audio = audio_cache[name]['audio'] - audio_clips.append(audio) - - name = get_name(candidate=candidate, combined=True) - audio = torch.cat(audio_clips, dim=-1) - torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) - - audio = audio.squeeze(0).cpu() - audio_cache[name] = { - 'audio': audio, - 'settings': get_info(voice=voice), - 'output': True - } - else: - name = get_name(candidate=candidate) - audio_cache[name]['output'] = True - - - if args.voice_fixer: - if not voicefixer: - notify_progress("Loading voicefix...", progress=progress) - load_voicefixer() - - try: - fixed_cache = {} - for name in tqdm(audio_cache, desc="Running voicefix..."): - del audio_cache[name]['audio'] - if 'output' not in audio_cache[name] or not audio_cache[name]['output']: - continue - - path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' - fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' - voicefixer.restore( - input=path, - output=fixed, - cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, - #mode=mode, - ) - - fixed_cache[f'{name}_fixed'] = { - 'settings': audio_cache[name]['settings'], - 'output': True - } - audio_cache[name]['output'] = False - - for name in fixed_cache: - audio_cache[name] = fixed_cache[name] - except Exception as e: - print(e) - print("\nFailed to run Voicefixer") - - for name in audio_cache: - if 'output' not in audio_cache[name] or not audio_cache[name]['output']: - if args.prune_nonfinal_outputs: - audio_cache[name]['pruned'] = True - os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - continue - - output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') - - if not 
args.embed_output_metadata: - with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) - - if args.embed_output_metadata: - for name in tqdm(audio_cache, desc="Embedding metadata..."): - if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: - continue - - metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") - metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) - metadata.save() - - if sample_voice is not None: - sample_voice = (tts.input_sample_rate, sample_voice.numpy()) - - info = get_info(voice=voice, latents=False) - print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") - - info['seed'] = usedSeed - if 'latents' in info: - del info['latents'] - - os.makedirs('./config/', exist_ok=True) - with open(f'./config/generate.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(info, indent='\t') ) - - stats = [ - [ parameters['seed'], "{:.3f}".format(info['time']) ] - ] - - return ( - sample_voice, - output_voices, - stats, - ) - -def cancel_generate(): - if not INFERENCING: - return - - import tortoise.api - - tortoise.api.STOP_SIGNAL = True - -def hash_file(path, algo="md5", buffer_size=0): - hash = None - if algo == "md5": - hash = hashlib.md5() - elif algo == "sha1": - hash = hashlib.sha1() - else: - raise Exception(f'Unknown hash algorithm specified: {algo}') - - if not os.path.exists(path): - raise Exception(f'Path not found: {path}') - - with open(path, 'rb') as f: - if buffer_size > 0: - while True: - data = f.read(buffer_size) - if not data: - break - hash.update(data) - else: - hash.update(f.read()) - - return "{0}".format(hash.hexdigest()) - -def update_baseline_for_latents_chunks( voice ): - global current_voice - current_voice = voice - - path = f'{get_voice_dir()}/{voice}/' - if not os.path.isdir(path): - return 1 - - dataset_file = f'./training/{voice}/train.txt' - if os.path.exists(dataset_file): - return 0 # 0 will leverage using the LJspeech dataset for computing latents - - files = os.listdir(path) - - total = 0 - total_duration = 0 - - for file in files: - if file[-4:] != ".wav": - continue - - metadata = torchaudio.info(f'{path}/{file}') - duration = metadata.num_frames / metadata.sample_rate - total_duration += duration - total = total + 1 - - - # brain too fried to figure out a better way - if args.autocalculate_voice_chunk_duration_size == 0: - return int(total_duration / total) if total > 0 else 1 - return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1 - -def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, original_ar=False, original_diffusion=False): - global tts - global args - - unload_whisper() - unload_voicefixer() - - if not tts: - if tts_loading: - raise Exception("TTS is still initializing...") - load_tts() - - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - if args.tts_backend == "bark": - tts.create_voice( voice ) - return - - if args.autoregressive_model == "auto": - tts.load_autoregressive_model(deduce_autoregressive_model(voice)) - - if voice: - load_from_dataset = voice_latents_chunks == 0 - - if load_from_dataset: - dataset_path = f'./training/{voice}/train.txt' - if not os.path.exists(dataset_path): - load_from_dataset = False - else: - with open(dataset_path, 'r', encoding="utf-8") as f: - lines = f.readlines() - - print("Leveraging dataset for 
computing latents") - - voice_samples = [] - max_length = 0 - for line in lines: - filename = f'./training/{voice}/{line.split("|")[0]}' - - waveform = load_audio(filename, 22050) - max_length = max(max_length, waveform.shape[-1]) - voice_samples.append(waveform) - - for i in range(len(voice_samples)): - voice_samples[i] = pad_or_truncate(voice_samples[i], max_length) - - voice_latents_chunks = len(voice_samples) - if voice_latents_chunks == 0: - print("Dataset is empty!") - load_from_dataset = True - if not load_from_dataset: - voice_samples, _ = load_voice(voice, load_latents=False) - - if voice_samples is None: - return - - conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents, original_ar=original_ar, original_diffusion=original_diffusion) - - if len(conditioning_latents) == 4: - conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) - - outfile = f'{get_voice_dir()}/{voice}/cond_latents_{tts.autoregressive_model_hash[:8]}.pth' - torch.save(conditioning_latents, outfile) - print(f'Saved voice latents: {outfile}') - - return conditioning_latents - -# superfluous, but it cleans up some things -class TrainingState(): - def __init__(self, config_path, keep_x_past_checkpoints=0, start=True): - self.killed = False - - self.training_dir = os.path.dirname(config_path) - with open(config_path, 'r') as file: - self.yaml_config = yaml.safe_load(file) - - self.json_config = json.load(open(f"{self.training_dir}/train.json", 'r', encoding="utf-8")) - self.dataset_path = f"{self.training_dir}/train.txt" - with open(self.dataset_path, 'r', encoding="utf-8") as f: - self.dataset_size = len(f.readlines()) - - self.batch_size = self.json_config["batch_size"] - self.save_rate = self.json_config["save_rate"] - - self.epoch = 0 - self.epochs = self.json_config["epochs"] - self.it = 0 - self.its = calc_iterations( self.epochs, self.dataset_size, self.batch_size ) - self.step = 0 - self.steps = int(self.its / self.dataset_size) - self.checkpoint = 0 - self.checkpoints = int((self.its - self.it) / self.save_rate) - - self.gpus = self.json_config['gpus'] - - self.buffer = [] - - self.open_state = False - self.training_started = False - - self.info = {} - - self.it_rate = "" - self.it_rates = 0 - - self.epoch_rate = "" - - self.eta = "?" - self.eta_hhmmss = "?" 
- - self.nan_detected = False - - self.last_info_check_at = 0 - self.statistics = { - 'loss': [], - 'lr': [], - 'grad_norm': [], - } - self.losses = [] - self.metrics = { - 'step': "", - 'rate': "", - 'loss': "", - } - - self.loss_milestones = [ 1.0, 0.15, 0.05 ] - - if args.tts_backend=="vall-e": - self.valle_last_it = 0 - self.valle_steps = 0 - - if keep_x_past_checkpoints > 0: - self.cleanup_old(keep=keep_x_past_checkpoints) - if start: - self.spawn_process(config_path=config_path, gpus=self.gpus) - - def spawn_process(self, config_path, gpus=1): - if args.tts_backend == "vall-e": - self.cmd = ['deepspeed', f'--num_gpus={gpus}', '--module', 'vall_e.train', f'yaml="{config_path}"'] - else: - self.cmd = ['train.bat', config_path] if os.name == "nt" else ['./train.sh', config_path] - - print("Spawning process: ", " ".join(self.cmd)) - self.process = subprocess.Popen(self.cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) - - def parse_metrics(self, data): - if isinstance(data, str): - if line.find('Training Metrics:') >= 0: - data = json.loads(line.split("Training Metrics:")[-1]) - data['mode'] = "training" - elif line.find('Validation Metrics:') >= 0: - data = json.loads(line.split("Validation Metrics:")[-1]) - data['mode'] = "validation" - else: - return - - self.info = data - if 'epoch' in self.info: - self.epoch = int(self.info['epoch']) - if 'it' in self.info: - self.it = int(self.info['it']) - if 'step' in self.info: - self.step = int(self.info['step']) - if 'steps' in self.info: - self.steps = int(self.info['steps']) - - if 'elapsed_time' in self.info: - self.info['iteration_rate'] = self.info['elapsed_time'] - del self.info['elapsed_time'] - - if 'iteration_rate' in self.info: - it_rate = self.info['iteration_rate'] - self.it_rate = f'{"{:.3f}".format(1/it_rate)}it/s' if 0 < it_rate and it_rate < 1 else f'{"{:.3f}".format(it_rate)}s/it' - self.it_rates += it_rate - - if self.it_rates > 0 and self.it * self.steps > 0: - epoch_rate = self.it_rates / self.it * self.steps - self.epoch_rate = f'{"{:.3f}".format(1/epoch_rate)}epoch/s' if 0 < epoch_rate and epoch_rate < 1 else f'{"{:.3f}".format(epoch_rate)}s/epoch' - - try: - self.eta = (self.its - self.it) * (self.it_rates / self.it) - eta = str(timedelta(seconds=int(self.eta))) - self.eta_hhmmss = eta - except Exception as e: - self.eta_hhmmss = "?" 
- pass - - self.metrics['step'] = [f"{self.epoch}/{self.epochs}"] - if self.epochs != self.its: - self.metrics['step'].append(f"{self.it}/{self.its}") - if self.steps > 1: - self.metrics['step'].append(f"{self.step}/{self.steps}") - self.metrics['step'] = ", ".join(self.metrics['step']) - - if args.tts_backend == "tortoise": - epoch = self.epoch + (self.step / self.steps) - else: - epoch = self.info['epoch'] if 'epoch' in self.info else self.it - - if self.it > 0: - # probably can double for-loop but whatever - keys = { - 'lrs': ['lr'], - 'losses': ['loss_text_ce', 'loss_mel_ce'], - 'accuracies': [], - 'precisions': [], - 'grad_norms': [], - } - if args.tts_backend == "vall-e": - keys['lrs'] = [ - 'ar.lr', 'nar.lr', - ] - keys['losses'] = [ - # 'ar.loss', 'nar.loss', 'ar+nar.loss', - 'ar.loss.nll', 'nar.loss.nll', - ] - - keys['accuracies'] = [ - 'ar.loss.acc', 'nar.loss.acc', - 'ar.stats.acc', 'nar.loss.acc', - ] - keys['precisions'] = [ 'ar.loss.precision', 'nar.loss.precision', ] - keys['grad_norms'] = ['ar.grad_norm', 'nar.grad_norm'] - - for k in keys['lrs']: - if k not in self.info: - continue - - self.statistics['lr'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) - - for k in keys['accuracies']: - if k not in self.info: - continue - - self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) - - for k in keys['precisions']: - if k not in self.info: - continue - - self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) - - for k in keys['losses']: - if k not in self.info: - continue - - prefix = "" - - if "mode" in self.info and self.info["mode"] == "validation": - prefix = f'{self.info["name"] if "name" in self.info else "val"}_' - - self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': f'{prefix}{k}' }) - - self.losses.append( self.statistics['loss'][-1] ) - - for k in keys['grad_norms']: - if k not in self.info: - continue - self.statistics['grad_norm'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) - - return data - - def get_status(self): - message = None - - self.metrics['rate'] = [] - if self.epoch_rate: - self.metrics['rate'].append(self.epoch_rate) - if self.it_rate and self.epoch_rate[:-7] != self.it_rate[:-4]: - self.metrics['rate'].append(self.it_rate) - self.metrics['rate'] = ", ".join(self.metrics['rate']) - - eta_hhmmss = self.eta_hhmmss if self.eta_hhmmss else "?" 
- - self.metrics['loss'] = [] - if 'lr' in self.info: - self.metrics['loss'].append(f'LR: {"{:.3e}".format(self.info["lr"])}') - - if len(self.losses) > 0: - self.metrics['loss'].append(f'Loss: {"{:.3f}".format(self.losses[-1]["value"])}') - - if False and len(self.losses) >= 2: - deriv = 0 - accum_length = len(self.losses)//2 # i *guess* this is fine when you think about it - loss_value = self.losses[-1]["value"] - - for i in range(accum_length): - d1_loss = self.losses[accum_length-i-1]["value"] - d2_loss = self.losses[accum_length-i-2]["value"] - dloss = (d2_loss - d1_loss) - - d1_step = self.losses[accum_length-i-1]["it"] - d2_step = self.losses[accum_length-i-2]["it"] - dstep = (d2_step - d1_step) - - if dstep == 0: - continue - - inst_deriv = dloss / dstep - deriv += inst_deriv - - deriv = deriv / accum_length - - print("Deriv: ", deriv) - - if deriv != 0: # dloss < 0: - next_milestone = None - for milestone in self.loss_milestones: - if loss_value > milestone: - next_milestone = milestone - break - - print(f"Loss value: {loss_value} | Next milestone: {next_milestone} | Distance: {loss_value - next_milestone}") - - if next_milestone: - # tfw can do simple calculus but not basic algebra in my head - est_its = (next_milestone - loss_value) / deriv * 100 - print(f"Estimated: {est_its}") - if est_its >= 0: - self.metrics['loss'].append(f'Est. milestone {next_milestone} in: {int(est_its)}its') - else: - est_loss = inst_deriv * (self.its - self.it) + loss_value - if est_loss >= 0: - self.metrics['loss'].append(f'Est. final loss: {"{:.3f}".format(est_loss)}') - - self.metrics['loss'] = ", ".join(self.metrics['loss']) - - message = f"[{self.metrics['step']}] [{self.metrics['rate']}] [ETA: {eta_hhmmss}] [{self.metrics['loss']}]" - if self.nan_detected: - message = f"[!NaN DETECTED! 
{self.nan_detected}] {message}" - - return message - - def load_statistics(self, update=False): - if not os.path.isdir(self.training_dir): - return - - if args.tts_backend == "tortoise": - logs = sorted([f'{self.training_dir}/finetune/{d}' for d in os.listdir(f'{self.training_dir}/finetune/') if d[-4:] == ".log" ]) - else: - log_dir = "logs" - logs = sorted([f'{self.training_dir}/{log_dir}/{d}/log.txt' for d in os.listdir(f'{self.training_dir}/{log_dir}/') ]) - - if update: - logs = [logs[-1]] - - infos = {} - highest_step = self.last_info_check_at - - if not update: - self.statistics['loss'] = [] - self.statistics['lr'] = [] - self.statistics['grad_norm'] = [] - self.it_rates = 0 - - unq = {} - averager = None - prev_state = 0 - - for log in logs: - with open(log, 'r', encoding="utf-8") as f: - lines = f.readlines() - - for line in lines: - line = line.strip() - if not line: - continue - - if line[-1] == ".": - line = line[:-1] - - if line.find('Training Metrics:') >= 0: - split = line.split("Training Metrics:")[-1] - data = json.loads(split) - - name = "train" - mode = "training" - prev_state = 0 - elif line.find('Validation Metrics:') >= 0: - data = json.loads(line.split("Validation Metrics:")[-1]) - if "it" not in data: - data['it'] = it - if "epoch" not in data: - data['epoch'] = epoch - - # name = data['name'] if 'name' in data else "val" - mode = "validation" - - if prev_state == 0: - name = "subtrain" - else: - name = "val" - - prev_state += 1 - else: - continue - - if "it" not in data: - continue - - it = data['it'] - epoch = data['epoch'] - - if args.tts_backend == "vall-e": - if not averager or averager['key'] != f'{it}_{name}' or averager['mode'] != mode: - averager = { - 'key': f'{it}_{name}', - 'name': name, - 'mode': mode, - "metrics": {} - } - for k in data: - if data[k] is None: - continue - averager['metrics'][k] = [ data[k] ] - else: - for k in data: - if data[k] is None: - continue - if k not in averager['metrics']: - averager['metrics'][k] = [ data[k] ] - else: - averager['metrics'][k].append( data[k] ) - - unq[f'{it}_{mode}_{name}'] = averager - else: - unq[f'{it}_{mode}_{name}'] = data - - if update and it <= self.last_info_check_at: - continue - - blacklist = [ "batch", "eval" ] - for it in unq: - if args.tts_backend == "vall-e": - stats = unq[it] - data = {k: sum(v) / len(v) for k, v in stats['metrics'].items() if k not in blacklist } - #data = {k: min(v) for k, v in stats['metrics'].items() if k not in blacklist } - #data = {k: max(v) for k, v in stats['metrics'].items() if k not in blacklist } - data['name'] = stats['name'] - data['mode'] = stats['mode'] - data['steps'] = len(stats['metrics']['it']) - else: - data = unq[it] - self.parse_metrics(data) - - self.last_info_check_at = highest_step - - def cleanup_old(self, keep=2): - if keep <= 0: - return - - if args.tts_backend == "vall-e": - return - - if not os.path.isdir(f'{self.training_dir}/finetune/'): - return - - models = sorted([ int(d[:-8]) for d in os.listdir(f'{self.training_dir}/finetune/models/') if d[-8:] == "_gpt.pth" ]) - states = sorted([ int(d[:-6]) for d in os.listdir(f'{self.training_dir}/finetune/training_state/') if d[-6:] == ".state" ]) - remove_models = models[:-keep] - remove_states = states[:-keep] - - for d in remove_models: - path = f'{self.training_dir}/finetune/models/{d}_gpt.pth' - print("Removing", path) - os.remove(path) - for d in remove_states: - path = f'{self.training_dir}/finetune/training_state/{d}.state' - print("Removing", path) - os.remove(path) - - def parse(self, line, 
verbose=False, keep_x_past_checkpoints=0, buffer_size=8, progress=None ): - self.buffer.append(f'{line}') - - data = None - percent = 0 - message = None - should_return = False - - MESSAGE_START = 'Start training from epoch' - MESSAGE_FINSIHED = 'Finished training' - MESSAGE_SAVING = 'Saving models and training states.' - - MESSAGE_METRICS_TRAINING = 'Training Metrics:' - MESSAGE_METRICS_VALIDATION = 'Validation Metrics:' - - if line.find(MESSAGE_FINSIHED) >= 0: - self.killed = True - # rip out iteration info - elif not self.training_started: - if line.find(MESSAGE_START) >= 0: - self.training_started = True # could just leverage the above variable, but this is python, and there's no point in these aggressive microoptimizations - - match = re.findall(r'epoch: ([\d,]+)', line) - if match and len(match) > 0: - self.epoch = int(match[0].replace(",", "")) - match = re.findall(r'iter: ([\d,]+)', line) - if match and len(match) > 0: - self.it = int(match[0].replace(",", "")) - - self.checkpoints = int((self.its - self.it) / self.save_rate) - - self.load_statistics() - - should_return = True - else: - if line.find(MESSAGE_SAVING) >= 0: - self.checkpoint += 1 - message = f"[{self.checkpoint}/{self.checkpoints}] Saving checkpoint..." - percent = self.checkpoint / self.checkpoints - - self.cleanup_old(keep=keep_x_past_checkpoints) - elif line.find(MESSAGE_METRICS_TRAINING) >= 0: - data = json.loads(line.split(MESSAGE_METRICS_TRAINING)[-1]) - data['mode'] = "training" - elif line.find(MESSAGE_METRICS_VALIDATION) >= 0: - data = json.loads(line.split(MESSAGE_METRICS_VALIDATION)[-1]) - data['mode'] = "validation" - - if data is not None: - if ': nan' in line and not self.nan_detected: - self.nan_detected = self.it - - self.parse_metrics( data ) - message = self.get_status() - - if message: - percent = self.it / float(self.its) # self.epoch / float(self.epochs) - if progress is not None: - progress(percent, message) - - self.buffer.append(f'[{"{:.3f}".format(percent*100)}%] {message}') - should_return = True - - if verbose and not self.training_started: - should_return = True - - self.buffer = self.buffer[-buffer_size:] - - result = None - if should_return: - result = "".join(self.buffer) if not self.training_started else message - - return ( - result, - percent, - message, - ) - -try: - import altair as alt - alt.data_transformers.enable('default', max_rows=None) -except Exception as e: - print(e) - pass - -def run_training(config_path, verbose=False, keep_x_past_checkpoints=0, progress=gr.Progress(track_tqdm=True)): - global training_state - if training_state and training_state.process: - return "Training already in progress" - - - # ensure we have the dvae.pth - if args.tts_backend == "tortoise": - get_model_path('dvae.pth') - - # I don't know if this is still necessary, as it was bitching at me for not doing this, despite it being in a separate process - torch.multiprocessing.freeze_support() - - unload_tts() - unload_whisper() - unload_voicefixer() - - training_state = TrainingState(config_path=config_path, keep_x_past_checkpoints=keep_x_past_checkpoints) - - for line in iter(training_state.process.stdout.readline, ""): - if training_state is None or training_state.killed: - return - - result, percent, message = training_state.parse( line=line, verbose=verbose, keep_x_past_checkpoints=keep_x_past_checkpoints, progress=progress ) - print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}") - if result: - yield result - - if progress is not None and message: - progress(percent, message) - - if 
training_state: - training_state.process.stdout.close() - return_code = training_state.process.wait() - training_state = None - -def update_training_dataplot(x_min=None, x_max=None, y_min=None, y_max=None, config_path=None): - global training_state - losses = None - lrs = None - grad_norms = None - - x_lim = [ x_min, x_max ] - y_lim = [ y_min, y_max ] - - if not training_state: - if config_path: - training_state = TrainingState(config_path=config_path, start=False) - training_state.load_statistics() - message = training_state.get_status() - - if training_state: - if not x_lim[-1]: - x_lim[-1] = training_state.epochs - - if not y_lim[-1]: - y_lim = None - - if len(training_state.statistics['loss']) > 0: - losses = gr.LinePlot.update( - value = pd.DataFrame(training_state.statistics['loss']), - x_lim=x_lim, y_lim=y_lim, - x="epoch", y="value", # x="it", - title="Loss Metrics", color="type", tooltip=['epoch', 'it', 'value', 'type'], - width=500, height=350 - ) - if len(training_state.statistics['lr']) > 0: - lrs = gr.LinePlot.update( - value = pd.DataFrame(training_state.statistics['lr']), - x_lim=x_lim, - x="epoch", y="value", # x="it", - title="Learning Rate", color="type", tooltip=['epoch', 'it', 'value', 'type'], - width=500, height=350 - ) - if len(training_state.statistics['grad_norm']) > 0: - grad_norms = gr.LinePlot.update( - value = pd.DataFrame(training_state.statistics['grad_norm']), - x_lim=x_lim, - x="epoch", y="value", # x="it", - title="Gradient Normals", color="type", tooltip=['epoch', 'it', 'value', 'type'], - width=500, height=350 - ) - - if config_path: - del training_state - training_state = None - - return (losses, lrs, grad_norms) - -def reconnect_training(verbose=False, progress=gr.Progress(track_tqdm=True)): - global training_state - if not training_state or not training_state.process: - return "Training not in progress" - - for line in iter(training_state.process.stdout.readline, ""): - result, percent, message = training_state.parse( line=line, verbose=verbose, progress=progress ) - print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}") - if result: - yield result - - if progress is not None and message: - progress(percent, message) - -def stop_training(): - global training_state - if training_state is None: - return "No training in progress" - print("Killing training process...") - training_state.killed = True - - children = [] - if args.tts_backend == "tortoise": - # wrapped in a try/catch in case for some reason this fails outside of Linux - try: - children = [p.info for p in psutil.process_iter(attrs=['pid', 'name', 'cmdline']) if './src/train.py' in p.info['cmdline']] - except Exception as e: - pass - - training_state.process.stdout.close() - training_state.process.terminate() - training_state.process.kill() - elif args.tts_backend == "vall-e": - print(training_state.process.communicate(input='quit')[0]) - - return_code = training_state.process.wait() - - for p in children: - os.kill( p['pid'], signal.SIGKILL ) - - training_state = None - print("Killed training process.") - return f"Training cancelled: {return_code}" - -def get_halfp_model_path(): - autoregressive_model_path = get_model_path('autoregressive.pth') - return autoregressive_model_path.replace(".pth", "_half.pth") - -def convert_to_halfp(): - autoregressive_model_path = get_model_path('autoregressive.pth') - print(f'Converting model to half precision: {autoregressive_model_path}') - model = torch.load(autoregressive_model_path) - for k in model: - model[k] = model[k].half() - - outfile = 
get_halfp_model_path() - torch.save(model, outfile) - print(f'Converted model to half precision: {outfile}') - - -# collapses short segments into the previous segment -def whisper_sanitize( results ): - sanitized = json.loads(json.dumps(results)) - sanitized['segments'] = [] - - for segment in results['segments']: - length = segment['end'] - segment['start'] - if length >= MIN_TRAINING_DURATION or len(sanitized['segments']) == 0: - sanitized['segments'].append(segment) - continue - - last_segment = sanitized['segments'][-1] - # segment already asimilitated it, somehow - if last_segment['end'] >= segment['end']: - continue - """ - # segment already asimilitated it, somehow - if last_segment['text'].endswith(segment['text']): - continue - """ - last_segment['text'] += segment['text'] - last_segment['end'] = segment['end'] - - for i in range(len(sanitized['segments'])): - sanitized['segments'][i]['id'] = i - - return sanitized - -def whisper_transcribe( file, language=None ): - # shouldn't happen, but it's for safety - global whisper_model - global whisper_align_model - - if not whisper_model: - load_whisper_model(language=language) - - if args.whisper_backend == "openai/whisper": - if not language: - language = None - - return whisper_model.transcribe(file, language=language) - - if args.whisper_backend == "lightmare/whispercpp": - res = whisper_model.transcribe(file) - segments = whisper_model.extract_text_and_timestamps( res ) - - result = { - 'text': [], - 'segments': [] - } - for segment in segments: - reparsed = { - 'start': segment[0] / 100.0, - 'end': segment[1] / 100.0, - 'text': segment[2], - 'id': len(result['segments']) - } - result['text'].append( segment[2] ) - result['segments'].append(reparsed) - - result['text'] = " ".join(result['text']) - return result - - if args.whisper_backend == "m-bain/whisperx": - import whisperx - - device = "cuda" if get_device_name() == "cuda" else "cpu" - result = whisper_model.transcribe(file, batch_size=args.whisper_batchsize) - - align_model, metadata = whisper_align_model - result_aligned = whisperx.align(result["segments"], align_model, metadata, file, device, return_char_alignments=False) - - result['segments'] = result_aligned['segments'] - result['text'] = [] - for segment in result['segments']: - segment['id'] = len(result['text']) - result['text'].append(segment['text'].strip()) - result['text'] = " ".join(result['text']) - - return result - -def validate_waveform( waveform, sample_rate, min_only=False ): - if not torch.any(waveform < 0): - return "Waveform is empty" - - num_channels, num_frames = waveform.shape - duration = num_frames / sample_rate - - if duration < MIN_TRAINING_DURATION: - return "Duration too short ({:.3f}s < {:.3f}s)".format(duration, MIN_TRAINING_DURATION) - - if not min_only: - if duration > MAX_TRAINING_DURATION: - return "Duration too long ({:.3f}s < {:.3f}s)".format(MAX_TRAINING_DURATION, duration) - - return - -def transcribe_dataset( voice, language=None, skip_existings=False, progress=None ): - unload_tts() - - global whisper_model - if whisper_model is None: - load_whisper_model(language=language) - - results = {} - - files = get_voice(voice, load_latents=False) - indir = f'./training/{voice}/' - infile = f'{indir}/whisper.json' - - quantize_in_memory = args.tts_backend == "vall-e" - - os.makedirs(f'{indir}/audio/', exist_ok=True) - - TARGET_SAMPLE_RATE = 22050 - if args.tts_backend != "tortoise": - TARGET_SAMPLE_RATE = 24000 - if tts: - TARGET_SAMPLE_RATE = tts.input_sample_rate - - if 
os.path.exists(infile): - results = json.load(open(infile, 'r', encoding="utf-8")) - - for file in tqdm(files, desc="Iterating through voice files"): - basename = os.path.basename(file) - - if basename in results and skip_existings: - print(f"Skipping already parsed file: {basename}") - continue - - try: - result = whisper_transcribe(file, language=language) - except Exception as e: - print("Failed to transcribe:", file, e) - continue - - results[basename] = result - - if not quantize_in_memory: - waveform, sample_rate = torchaudio.load(file) - # resample to the input rate, since it'll get resampled for training anyways - # this should also "help" increase throughput a bit when filling the dataloaders - waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE) - if waveform.shape[0] == 2: - waveform = waveform[:1] - - try: - kwargs = {} - if basename[-4:] == ".wav": - kwargs['encoding'] = "PCM_S" - kwargs['bits_per_sample'] = 16 - - torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, **kwargs) - except Exception as e: - print(e) - - with open(infile, 'w', encoding="utf-8") as f: - f.write(json.dumps(results, indent='\t')) - - do_gc() - - modified = False - for basename in results: - try: - sanitized = whisper_sanitize(results[basename]) - if len(sanitized['segments']) > 0 and len(sanitized['segments']) != len(results[basename]['segments']): - results[basename] = sanitized - modified = True - print("Segments sanizited: ", basename) - except Exception as e: - print("Failed to sanitize:", basename, e) - pass - - if modified: - os.rename(infile, infile.replace(".json", ".unsanitized.json")) - with open(infile, 'w', encoding="utf-8") as f: - f.write(json.dumps(results, indent='\t')) - - return f"Processed dataset to: {indir}" - -def slice_waveform( waveform, sample_rate, start, end, trim ): - start = int(start * sample_rate) - end = int(end * sample_rate) - - if start < 0: - start = 0 - if end >= waveform.shape[-1]: - end = waveform.shape[-1] - 1 - - sliced = waveform[:, start:end] - - error = validate_waveform( sliced, sample_rate, min_only=True ) - if trim and not error: - sliced = torchaudio.functional.vad( sliced, sample_rate ) - - return sliced, error - -def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, results=None, progress=gr.Progress() ): - indir = f'./training/{voice}/' - infile = f'{indir}/whisper.json' - messages = [] - - if not os.path.exists(infile): - message = f"Missing dataset: {infile}" - print(message) - return message - - if results is None: - results = json.load(open(infile, 'r', encoding="utf-8")) - - TARGET_SAMPLE_RATE = 22050 - if args.tts_backend != "tortoise": - TARGET_SAMPLE_RATE = 24000 - if tts: - TARGET_SAMPLE_RATE = tts.input_sample_rate - - files = 0 - segments = 0 - for filename in results: - path = f'./voices/{voice}/{filename}' - extension = os.path.splitext(filename)[-1][1:] - out_extension = extension # "wav" - - if not os.path.exists(path): - path = f'./training/{voice}/{filename}' - - if not os.path.exists(path): - message = f"Missing source audio: {filename}" - print(message) - messages.append(message) - continue - - files += 1 - result = results[filename] - waveform, sample_rate = torchaudio.load(path) - num_channels, num_frames = waveform.shape - duration = num_frames / sample_rate - - for segment in result['segments']: - file = filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}") - - sliced, error = slice_waveform( waveform, sample_rate, segment['start'] + 
start_offset, segment['end'] + end_offset, trim_silence ) - if error: - message = f"{error}, skipping... {file}" - print(message) - messages.append(message) - continue - - sliced, _ = resample( sliced, sample_rate, TARGET_SAMPLE_RATE ) - - if waveform.shape[0] == 2: - waveform = waveform[:1] - - kwargs = {} - if file[-4:] == ".wav": - kwargs['encoding'] = "PCM_S" - kwargs['bits_per_sample'] = 16 - - torchaudio.save(f"{indir}/audio/{file}", sliced, TARGET_SAMPLE_RATE, **kwargs) - - segments +=1 - - messages.append(f"Sliced segments: {files} => {segments}.") - return "\n".join(messages) - -# takes an LJSpeech-dataset-formatted .txt file and phonemize it -def phonemize_txt_file( path ): - with open(path, 'r', encoding='utf-8') as f: - lines = f.readlines() - - reparsed = [] - with open(path.replace(".txt", ".phn.txt"), 'a', encoding='utf-8') as f: - for line in tqdm(lines, desc='Phonemizing...'): - split = line.split("|") - audio = split[0] - text = split[2] - - phonemes = phonemizer( text ) - reparsed.append(f'{audio}|{phonemes}') - f.write(f'\n{audio}|{phonemes}') - - - joined = "\n".join(reparsed) - with open(path.replace(".txt", ".phn.txt"), 'w', encoding='utf-8') as f: - f.write(joined) - - return joined - -# takes an LJSpeech-dataset-formatted .txt (and phonemized .phn.txt from the above) and creates a JSON that should slot in as whisper.json -def create_dataset_json( path ): - with open(path, 'r', encoding='utf-8') as f: - lines = f.readlines() - - phonemes = None - phn_path = path.replace(".txt", ".phn.txt") - if os.path.exists(phn_path): - with open(phn_path, 'r', encoding='utf-8') as f: - phonemes = f.readlines() - - data = {} - - for line in lines: - split = line.split("|") - audio = split[0] - text = split[1] - - data[audio] = { - 'text': text.strip() - } - - for line in phonemes: - split = line.split("|") - audio = split[0] - text = split[1] - - data[audio]['phonemes'] = text.strip() - - with open(path.replace(".txt", ".json"), 'w', encoding='utf-8') as f: - f.write(json.dumps(data, indent="\t")) - - -cached_backends = {} - -def phonemizer( text, language="en-us" ): - from phonemizer import phonemize - from phonemizer.backend import BACKENDS - - def _get_backend( language="en-us", backend="espeak" ): - key = f'{language}_{backend}' - if key in cached_backends: - return cached_backends[key] - - if backend == 'espeak': - phonemizer = BACKENDS[backend]( language, preserve_punctuation=True, with_stress=True) - elif backend == 'espeak-mbrola': - phonemizer = BACKENDS[backend]( language ) - else: - phonemizer = BACKENDS[backend]( language, preserve_punctuation=True ) - - cached_backends[key] = phonemizer - return phonemizer - if language == "en": - language = "en-us" - - backend = _get_backend(language=language, backend=args.phonemizer_backend) - if backend is not None: - tokens = backend.phonemize( [text], strip=True ) - else: - tokens = phonemize( [text], language=language, strip=True, preserve_punctuation=True, with_stress=True ) - - return tokens[0] if len(tokens) == 0 else tokens - tokenized = " ".join( tokens ) - -def should_phonemize(): - if args.tts_backend == "vall-e": - return False - - should = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json" - if should: - try: - from phonemizer import phonemize - except Exception as e: - return False - return should - -def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ): - indir = f'./training/{voice}/' - infile = f'{indir}/whisper.json' - if not 
os.path.exists(infile): - message = f"Missing dataset: {infile}" - print(message) - return message - - results = json.load(open(infile, 'r', encoding="utf-8")) - - errored = 0 - messages = [] - normalize = False # True - phonemize = should_phonemize() - lines = { 'training': [], 'validation': [] } - segments = {} - - quantize_in_memory = args.tts_backend == "vall-e" - - if args.tts_backend != "tortoise": - text_length = 0 - audio_length = 0 - - start_offset = -0.1 - end_offset = 0.1 - trim_silence = False - - TARGET_SAMPLE_RATE = 22050 - if args.tts_backend != "tortoise": - TARGET_SAMPLE_RATE = 24000 - if tts: - TARGET_SAMPLE_RATE = tts.input_sample_rate - - for filename in tqdm(results, desc="Parsing results"): - use_segment = use_segments - - extension = os.path.splitext(filename)[-1][1:] - out_extension = extension # "wav" - result = results[filename] - lang = result['language'] - language = LANGUAGES[lang] if lang in LANGUAGES else lang - normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer() - - # check if unsegmented text exceeds 200 characters - if not use_segment: - if len(result['text']) > MAX_TRAINING_CHAR_LENGTH: - message = f"Text length too long ({MAX_TRAINING_CHAR_LENGTH} < {len(result['text'])}), using segments: {filename}" - print(message) - messages.append(message) - use_segment = True - - # check if unsegmented audio exceeds 11.6s - if not use_segment: - path = f'{indir}/audio/{filename}' - if not quantize_in_memory and not os.path.exists(path): - messages.append(f"Missing source audio: {filename}") - errored += 1 - continue - - duration = 0 - for segment in result['segments']: - duration = max(duration, segment['end']) - - if duration >= MAX_TRAINING_DURATION: - message = f"Audio too large, using segments: {filename}" - print(message) - messages.append(message) - use_segment = True - - # implicitly segment - if use_segment and not use_segments: - exists = True - for segment in result['segments']: - duration = segment['end'] - segment['start'] - if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration: - continue - - path = f'{indir}/audio/' + filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}") - if os.path.exists(path): - continue - exists = False - break - - if not quantize_in_memory and not exists: - tmp = {} - tmp[filename] = result - print(f"Audio not segmented, segmenting: {filename}") - message = slice_dataset( voice, results=tmp ) - print(message) - messages = messages + message.split("\n") - - waveform = None - - - if quantize_in_memory: - path = f'{indir}/audio/{filename}' - if not os.path.exists(path): - path = f'./voices/{voice}/{filename}' - - if not os.path.exists(path): - message = f"Audio not found: {path}" - print(message) - messages.append(message) - #continue - else: - waveform = torchaudio.load(path) - waveform = resample(waveform[0], waveform[1], TARGET_SAMPLE_RATE) - - if not use_segment: - segments[filename] = { - 'text': result['text'], - 'lang': lang, - 'language': language, - 'normalizer': normalizer, - 'phonemes': result['phonemes'] if 'phonemes' in result else None - } - - if waveform: - segments[filename]['waveform'] = waveform - else: - for segment in result['segments']: - duration = segment['end'] - segment['start'] - if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration: - continue - - file = filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}") - - segments[file] = { - 'text': segment['text'], - 'lang': 
lang, - 'language': language, - 'normalizer': normalizer, - 'phonemes': segment['phonemes'] if 'phonemes' in segment else None - } - - if waveform: - sliced, error = slice_waveform( waveform[0], waveform[1], segment['start'] + start_offset, segment['end'] + end_offset, trim_silence ) - if error: - message = f"{error}, skipping... {file}" - print(message) - messages.append(message) - segments[file]['error'] = error - #continue - else: - segments[file]['waveform'] = (sliced, waveform[1]) - - jobs = { - 'quantize': [[], []], - 'phonemize': [[], []], - } - - for file in tqdm(segments, desc="Parsing segments"): - extension = os.path.splitext(file)[-1][1:] - result = segments[file] - path = f'{indir}/audio/{file}' - - text = result['text'] - lang = result['lang'] - language = result['language'] - normalizer = result['normalizer'] - phonemes = result['phonemes'] - if phonemize and phonemes is None: - phonemes = phonemizer( text, language=lang ) - - normalized = normalizer(text) if normalize else text - - if len(text) > MAX_TRAINING_CHAR_LENGTH: - message = f"Text length too long ({MAX_TRAINING_CHAR_LENGTH} < {len(text)}), skipping... {file}" - print(message) - messages.append(message) - errored += 1 - continue - - # num_channels, num_frames = waveform.shape - #duration = num_frames / sample_rate - - - culled = len(text) < text_length - #if not culled and audio_length > 0: - # culled = duration < audio_length - - line = f'audio/{file}|{phonemes if phonemize and phonemes else text}' - - lines['training' if not culled else 'validation'].append(line) - - if culled or args.tts_backend != "vall-e": - continue - - os.makedirs(f'{indir}/valle/', exist_ok=True) - #os.makedirs(f'./training/valle/data/{voice}/', exist_ok=True) - - phn_file = f'{indir}/valle/{file.replace(f".{extension}",".phn.txt")}' - #phn_file = f'./training/valle/data/{voice}/{file.replace(f".{extension}",".phn.txt")}' - if not os.path.exists(phn_file): - jobs['phonemize'][0].append(phn_file) - jobs['phonemize'][1].append(normalized) - """ - phonemized = valle_phonemize( normalized ) - open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemized)) - print("Phonemized:", file, normalized, text) - """ - - qnt_file = f'{indir}/valle/{file.replace(f".{extension}",".qnt.pt")}' - #qnt_file = f'./training/valle/data/{voice}/{file.replace(f".{extension}",".qnt.pt")}' - if 'error' not in result: - if not quantize_in_memory and not os.path.exists(path): - message = f"Missing segment, skipping... {file}" - print(message) - messages.append(message) - errored += 1 - continue - - if not os.path.exists(qnt_file): - waveform = None - if 'waveform' in result: - waveform, sample_rate = result['waveform'] - elif os.path.exists(path): - waveform, sample_rate = torchaudio.load(path) - error = validate_waveform( waveform, sample_rate ) - if error: - message = f"{error}, skipping... 
{file}" - print(message) - messages.append(message) - errored += 1 - continue - - if waveform is not None: - jobs['quantize'][0].append(qnt_file) - jobs['quantize'][1].append((waveform, sample_rate)) - """ - quantized = valle_quantize( waveform, sample_rate ).cpu() - torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}') - print("Quantized:", file) - """ - - for i in tqdm(range(len(jobs['quantize'][0])), desc="Quantizing"): - qnt_file = jobs['quantize'][0][i] - waveform, sample_rate = jobs['quantize'][1][i] - - quantized = valle_quantize( waveform, sample_rate ).cpu() - torch.save(quantized, qnt_file) - #print("Quantized:", qnt_file) - - for i in tqdm(range(len(jobs['phonemize'][0])), desc="Phonemizing"): - phn_file = jobs['phonemize'][0][i] - normalized = jobs['phonemize'][1][i] - - try: - phonemized = valle_phonemize( normalized ) - open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized)) - #print("Phonemized:", phn_file) - except Exception as e: - message = f"Failed to phonemize: {phn_file}: {normalized}" - messages.append(message) - print(message) - - - training_joined = "\n".join(lines['training']) - validation_joined = "\n".join(lines['validation']) - - with open(f'{indir}/train.txt', 'w', encoding="utf-8") as f: - f.write(training_joined) - - with open(f'{indir}/validation.txt', 'w', encoding="utf-8") as f: - f.write(validation_joined) - - messages.append(f"Prepared {len(lines['training'])} lines (validation: {len(lines['validation'])}, culled: {errored}).\n{training_joined}\n\n{validation_joined}") - return "\n".join(messages) - -def calc_iterations( epochs, lines, batch_size ): - return int(math.ceil(epochs * math.ceil(lines / batch_size))) - -def schedule_learning_rate( iterations, schedule=LEARNING_RATE_SCHEDULE ): - return [int(iterations * d) for d in schedule] - -def optimize_training_settings( **kwargs ): - messages = [] - settings = {} - settings.update(kwargs) - - dataset_path = f"./training/{settings['voice']}/train.txt" - with open(dataset_path, 'r', encoding="utf-8") as f: - lines = len(f.readlines()) - - if lines == 0: - raise Exception("Empty dataset.") - - if settings['batch_size'] > lines: - settings['batch_size'] = lines - messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}") - - """ - if lines % settings['batch_size'] != 0: - settings['batch_size'] = int(lines / settings['batch_size']) - if settings['batch_size'] == 0: - settings['batch_size'] = 1 - messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']}") - """ - if settings['gradient_accumulation_size'] == 0: - settings['gradient_accumulation_size'] = 1 - - if settings['batch_size'] / settings['gradient_accumulation_size'] < 2: - settings['gradient_accumulation_size'] = int(settings['batch_size'] / 2) - if settings['gradient_accumulation_size'] == 0: - settings['gradient_accumulation_size'] = 1 - - messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}") - elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0: - settings['gradient_accumulation_size'] -= settings['batch_size'] % settings['gradient_accumulation_size'] - if settings['gradient_accumulation_size'] == 0: - settings['gradient_accumulation_size'] = 1 - - messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: 
{settings['gradient_accumulation_size']}") - - if settings['batch_size'] % settings['gpus'] != 0: - settings['batch_size'] -= settings['batch_size'] % settings['gpus'] - if settings['batch_size'] == 0: - settings['batch_size'] = 1 - messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}") - - - def get_device_batch_size( vram ): - DEVICE_BATCH_SIZE_MAP = [ - (70, 128), # based on an A100-80G, I can safely get a ratio of 4096:32 = 128 - (32, 64), # based on my two 6800XTs, I can only really safely get a ratio of 128:2 = 64 - (16, 8), # based on an A4000, I can do a ratio of 512:64 = 8:1 - (8, 4), # interpolated - (6, 2), # based on my 2060, it only really lets me have a batch ratio of 2:1 - ] - for k, v in DEVICE_BATCH_SIZE_MAP: - if vram > (k-1): - return v - return 1 - - if settings['gpus'] > get_device_count(): - settings['gpus'] = get_device_count() - messages.append(f"GPU count exceeds defacto GPU count, clamping to: {settings['gpus']}") - - if settings['gpus'] <= 1: - settings['gpus'] = 1 - else: - messages.append(f"! EXPERIMENTAL ! Multi-GPU training is extremely particular, expect issues.") - - # assuming you have equal GPUs - vram = get_device_vram() * settings['gpus'] - batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size']) - batch_cap = get_device_batch_size(vram) - - if batch_ratio > batch_cap: - settings['gradient_accumulation_size'] = int(settings['batch_size'] / batch_cap) - messages.append(f"Batch ratio ({batch_ratio}) is expected to exceed your VRAM capacity ({'{:.3f}'.format(vram)}GB, suggested {batch_cap} batch size cap), adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}") - - iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) - - if settings['epochs'] < settings['save_rate']: - settings['save_rate'] = settings['epochs'] - messages.append(f"Save rate is too small for the given iteration step, clamping save rate to: {settings['save_rate']}") - - if settings['epochs'] < settings['validation_rate']: - settings['validation_rate'] = settings['epochs'] - messages.append(f"Validation rate is too small for the given iteration step, clamping validation rate to: {settings['validation_rate']}") - - if settings['resume_state'] and not os.path.exists(settings['resume_state']): - settings['resume_state'] = None - messages.append("Resume path specified, but does not exist. Disabling...") - - if settings['bitsandbytes']: - messages.append("! EXPERIMENTAL ! BitsAndBytes requested.") - - if settings['half_p']: - if settings['bitsandbytes']: - settings['half_p'] = False - messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...") - else: - messages.append("! EXPERIMENTAL ! 
Half Precision requested.") - if not os.path.exists(get_halfp_model_path()): - convert_to_halfp() - - steps = int(iterations / settings['epochs']) - - messages.append(f"For {settings['epochs']} epochs with {lines} lines in batches of {settings['batch_size']}, iterating for {iterations} steps ({steps}) steps per epoch)") - - return settings, messages - -def save_training_settings( **kwargs ): - messages = [] - settings = {} - settings.update(kwargs) - - - outjson = f'./training/{settings["voice"]}/train.json' - with open(outjson, 'w', encoding="utf-8") as f: - f.write(json.dumps(settings, indent='\t') ) - - settings['dataset_path'] = f"./training/{settings['voice']}/train.txt" - settings['validation_path'] = f"./training/{settings['voice']}/validation.txt" - - with open(settings['dataset_path'], 'r', encoding="utf-8") as f: - lines = len(f.readlines()) - - settings['iterations'] = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) - - if not settings['source_model'] or settings['source_model'] == "auto": - settings['source_model'] = f"./models/tortoise/autoregressive{'_half' if settings['half_p'] else ''}.pth" - - if settings['half_p']: - if not os.path.exists(get_halfp_model_path()): - convert_to_halfp() - - messages.append(f"For {settings['epochs']} epochs with {lines} lines, iterating for {settings['iterations']} steps") - - iterations_per_epoch = settings['iterations'] / settings['epochs'] - - settings['save_rate'] = int(settings['save_rate'] * iterations_per_epoch) - settings['validation_rate'] = int(settings['validation_rate'] * iterations_per_epoch) - - iterations_per_epoch = int(iterations_per_epoch) - - if settings['save_rate'] < 1: - settings['save_rate'] = 1 - """ - if settings['validation_rate'] < 1: - settings['validation_rate'] = 1 - """ - """ - if settings['iterations'] % settings['save_rate'] != 0: - adjustment = int(settings['iterations'] / settings['save_rate']) * settings['save_rate'] - messages.append(f"Iteration rate is not evenly divisible by save rate, adjusting: {settings['iterations']} => {adjustment}") - settings['iterations'] = adjustment - """ - - settings['validation_batch_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size']) - if not os.path.exists(settings['validation_path']): - settings['validation_enabled'] = False - messages.append("Validation not found, disabling validation...") - elif settings['validation_batch_size'] == 0: - settings['validation_enabled'] = False - messages.append("Validation batch size == 0, disabling validation...") - else: - with open(settings['validation_path'], 'r', encoding="utf-8") as f: - validation_lines = len(f.readlines()) - - if validation_lines < settings['validation_batch_size']: - settings['validation_batch_size'] = validation_lines - messages.append(f"Batch size exceeds validation dataset size, clamping validation batch size to {validation_lines}") - - settings['tokenizer_json'] = args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0] - - if settings['gpus'] > get_device_count(): - settings['gpus'] = get_device_count() - - # what an utter mistake this was - settings['optimizer'] = 'adamw' # if settings['gpus'] == 1 else 'adamw_zero' - - if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES: - settings['learning_rate_scheme'] = "Multistep" - - settings['learning_rate_scheme'] = LEARNING_RATE_SCHEMES[settings['learning_rate_scheme']] - - learning_rate_schema = [f"default_lr_scheme: 
{settings['learning_rate_scheme']}"] - if settings['learning_rate_scheme'] == "MultiStepLR": - if not settings['learning_rate_schedule']: - settings['learning_rate_schedule'] = LEARNING_RATE_SCHEDULE - elif isinstance(settings['learning_rate_schedule'],str): - settings['learning_rate_schedule'] = json.loads(settings['learning_rate_schedule']) - - settings['learning_rate_schedule'] = schedule_learning_rate( iterations_per_epoch, settings['learning_rate_schedule'] ) - - learning_rate_schema.append(f" gen_lr_steps: {settings['learning_rate_schedule']}") - learning_rate_schema.append(f" lr_gamma: 0.5") - elif settings['learning_rate_scheme'] == "CosineAnnealingLR_Restart": - epochs = settings['epochs'] - restarts = settings['learning_rate_restarts'] - restart_period = int(epochs / restarts) - - if 'learning_rate_warmup' not in settings: - settings['learning_rate_warmup'] = 0 - if 'learning_rate_min' not in settings: - settings['learning_rate_min'] = 1e-08 - - if 'learning_rate_period' not in settings: - settings['learning_rate_period'] = [ iterations_per_epoch * restart_period for x in range(epochs) ] - - settings['learning_rate_restarts'] = [ iterations_per_epoch * (x+1) * restart_period for x in range(restarts) ] # [52, 104, 156, 208] - - if 'learning_rate_restart_weights' not in settings: - settings['learning_rate_restart_weights'] = [ ( restarts - x - 1 ) / restarts for x in range(restarts) ] # [.75, .5, .25, .125] - settings['learning_rate_restart_weights'][-1] = settings['learning_rate_restart_weights'][-2] * 0.5 - - learning_rate_schema.append(f" T_period: {settings['learning_rate_period']}") - learning_rate_schema.append(f" warmup: {settings['learning_rate_warmup']}") - learning_rate_schema.append(f" eta_min: !!float {settings['learning_rate_min']}") - learning_rate_schema.append(f" restarts: {settings['learning_rate_restarts']}") - learning_rate_schema.append(f" restart_weights: {settings['learning_rate_restart_weights']}") - settings['learning_rate_scheme'] = "\n".join(learning_rate_schema) - - if settings['resume_state']: - settings['source_model'] = f"# pretrain_model_gpt: '{settings['source_model']}'" - settings['resume_state'] = f"resume_state: '{settings['resume_state']}'" - else: - settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'" - settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'" - - def use_template(template, out): - with open(template, 'r', encoding="utf-8") as f: - yaml = f.read() - - # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals - for k in settings: - if settings[k] is None: - continue - yaml = yaml.replace(f"${{{k}}}", str(settings[k])) - - with open(out, 'w', encoding="utf-8") as f: - f.write(yaml) - - if args.tts_backend == "tortoise": - use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml') - elif args.tts_backend == "vall-e": - settings['model_name'] = "[ 'ar-quarter', 'nar-quarter' ]" - use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/config.yaml') - - messages.append(f"Saved training output") - return settings, messages - -def import_voices(files, saveAs=None, progress=None): - global args - - if not isinstance(files, list): - files = [files] - - for file in tqdm(files, desc="Importing voice files"): - j, latents = read_generate_settings(file, read_latents=True) - - if j is not None and saveAs is None: - saveAs = j['voice'] - if saveAs is None or saveAs == "": - raise 
Exception("Specify a voice name") - - outdir = f'{get_voice_dir()}/{saveAs}/' - os.makedirs(outdir, exist_ok=True) - - if latents: - print(f"Importing latents to {latents}") - with open(f'{outdir}/cond_latents.pth', 'wb') as f: - f.write(latents) - latents = f'{outdir}/cond_latents.pth' - print(f"Imported latents to {latents}") - else: - filename = file.name - if filename[-4:] != ".wav": - raise Exception("Please convert to a WAV first") - - path = f"{outdir}/{os.path.basename(filename)}" - print(f"Importing voice to {path}") - - waveform, sample_rate = torchaudio.load(filename) - - if args.voice_fixer: - if not voicefixer: - load_voicefixer() - - waveform, sample_rate = resample(waveform, sample_rate, 44100) - torchaudio.save(path, waveform, sample_rate) - - print(f"Running 'voicefixer' on voice sample: {path}") - voicefixer.restore( - input = path, - output = path, - cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, - #mode=mode, - ) - else: - torchaudio.save(path, waveform, sample_rate) - - print(f"Imported voice to {path}") - -def relative_paths( dirs ): - return [ './' + os.path.relpath( d ).replace("\\", "/") for d in dirs ] - -def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ): - subj = f'{dir}/{name}/' - if not os.path.isdir(subj): - return - files = os.listdir(subj) - - if load_latents: - extensions.append("pth") - - voice = [] - for file in files: - ext = os.path.splitext(file)[-1][1:] - if ext not in extensions: - continue - - voice.append(f'{subj}/{file}') - - return sorted( voice ) - -def get_voice_list(dir=get_voice_dir(), append_defaults=False, extensions=["wav", "mp3", "flac", "pth"]): - defaults = [ "random", "microphone" ] - os.makedirs(dir, exist_ok=True) - #res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ]) - - res = [] - for name in os.listdir(dir): - if name in defaults: - continue - if not os.path.isdir(f'{dir}/{name}'): - continue - if len(os.listdir(os.path.join(dir, name))) == 0: - continue - files = get_voice( name, dir=dir, extensions=extensions ) - - if len(files) > 0: - res.append(name) - else: - for subdir in os.listdir(f'{dir}/{name}'): - if not os.path.isdir(f'{dir}/{name}/{subdir}'): - continue - files = get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions ) - if len(files) == 0: - continue - res.append(f'{name}/{subdir}') - - res = sorted(res) - - if append_defaults: - res = res + defaults - - return res - -def get_valle_models(dir="./training/"): - return [ f'{dir}/{d}/config.yaml' for d in os.listdir(dir) if os.path.exists(f'{dir}/{d}/config.yaml') ] - -def get_autoregressive_models(dir="./models/finetunes/", prefixed=False, auto=False): - os.makedirs(dir, exist_ok=True) - base = [get_model_path('autoregressive.pth')] - halfp = get_halfp_model_path() - if os.path.exists(halfp): - base.append(halfp) - - additionals = sorted([f'{dir}/{d}' for d in os.listdir(dir) if d[-4:] == ".pth" ]) - found = [] - for training in os.listdir(f'./training/'): - if not os.path.isdir(f'./training/{training}/') or not os.path.isdir(f'./training/{training}/finetune/') or not os.path.isdir(f'./training/{training}/finetune/models/'): - continue - models = sorted([ int(d[:-8]) for d in os.listdir(f'./training/{training}/finetune/models/') if d[-8:] == "_gpt.pth" ]) - found = found + [ f'./training/{training}/finetune/models/{d}_gpt.pth' for d in models ] - - res = base + additionals + found - - if prefixed: - 
for i in range(len(res)): - path = res[i] - hash = hash_file(path) - shorthash = hash[:8] - - res[i] = f'[{shorthash}] {path}' - - paths = relative_paths(res) - if auto: - paths = ["auto"] + paths - - return paths - -def get_diffusion_models(dir="./models/finetunes/", prefixed=False): - return relative_paths([ get_model_path('diffusion_decoder.pth') ]) - -def get_tokenizer_jsons( dir="./models/tokenizers/" ): - additionals = sorted([ f'{dir}/{d}' for d in os.listdir(dir) if d[-5:] == ".json" ]) if os.path.isdir(dir) else [] - return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals) - -def tokenize_text( text, config=None, stringed=True, skip_specials=False ): - from tortoise.utils.tokenizer import VoiceBpeTokenizer - - if not config: - config = args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0] - - if not tts: - tokenizer = VoiceBpeTokenizer(config) - else: - tokenizer = tts.tokenizer - - encoded = tokenizer.encode(text) - decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=skip_specials).split(" ") - - if stringed: - return "\n".join([ str(encoded), str(decoded) ]) - - return decoded - -def get_dataset_list(dir="./training/"): - return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ]) - -def get_training_list(dir="./training/"): - if args.tts_backend == "tortoise": - return sorted([f'./training/{d}/train.yaml' for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.yaml" in os.listdir(os.path.join(dir, d)) ]) - else: - return sorted([f'./training/{d}/config.yaml' for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "config.yaml" in os.listdir(os.path.join(dir, d)) ]) - -def pad(num, zeroes): - return str(num).zfill(zeroes+1) - -def curl(url): - try: - req = urllib.request.Request(url, headers={'User-Agent': 'Python'}) - conn = urllib.request.urlopen(req) - data = conn.read() - data = data.decode() - data = json.loads(data) - conn.close() - return data - except Exception as e: - print(e) - return None - -def check_for_updates( dir = None ): - if dir is None: - check_for_updates("./.git/") - check_for_updates("./.git/modules/dlas/") - check_for_updates("./.git/modules/tortoise-tts/") - return - - git_dir = dir - if not os.path.isfile(f'{git_dir}/FETCH_HEAD'): - print(f"Cannot check for updates for {dir}: not from a git repo") - return False - - with open(f'{git_dir}/FETCH_HEAD', 'r', encoding="utf-8") as f: - head = f.read() - - match = re.findall(r"^([a-f0-9]+).+?https:\/\/(.+?)\/(.+?)\/(.+?)\n", head) - if match is None or len(match) == 0: - print(f"Cannot check for updates for {dir}: cannot parse FETCH_HEAD") - return False - - match = match[0] - - local = match[0] - host = match[1] - owner = match[2] - repo = match[3] - - res = curl(f"https://{host}/api/v1/repos/{owner}/{repo}/branches/") #this only works for gitea instances - - if res is None or len(res) == 0: - print(f"Cannot check for updates for {dir}: cannot fetch from remote") - return False - - remote = res[0]["commit"]["id"] - - if remote != local: - print(f"New version found for {dir}: {local[:8]} => {remote[:8]}") - return True - - return False - -def notify_progress(message, progress=None, verbose=True): - if verbose: - print(message) - - if progress is None: - tqdm.write(message) - else: - progress(0, desc=message) - -def get_args(): - global args - return args - -def setup_args(cli=False): - global args - - default_arguments = { - 'share': 
False, - 'listen': None, - 'check-for-updates': False, - 'models-from-local-only': False, - 'low-vram': False, - 'sample-batch-size': None, - 'unsqueeze-sample-batches': False, - 'embed-output-metadata': True, - 'latents-lean-and-mean': True, - 'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it - 'voice-fixer-use-cuda': True, - - - 'force-cpu-for-conditioning-latents': False, - 'defer-tts-load': False, - 'device-override': None, - 'prune-nonfinal-outputs': True, - 'concurrency-count': 2, - 'autocalculate-voice-chunk-duration-size': 10, - - 'output-sample-rate': 44100, - 'output-volume': 1, - 'results-folder': "./results/", - - 'hf-token': None, - 'tts-backend': TTSES[0], - - 'autoregressive-model': None, - 'diffusion-model': None, - 'vocoder-model': VOCODERS[-1], - 'tokenizer-json': None, - - 'phonemizer-backend': 'espeak', - - 'valle-model': None, - - 'whisper-backend': 'openai/whisper', - 'whisper-model': "base", - 'whisper-batchsize': 1, - - 'training-default-halfp': False, - 'training-default-bnb': True, - - 'websocket-listen-address': "127.0.0.1", - 'websocket-listen-port': 8069, - 'websocket-enabled': False - } - - if os.path.isfile('./config/exec.json'): - with open(f'./config/exec.json', 'r', encoding="utf-8") as f: - try: - overrides = json.load(f) - for k in overrides: - default_arguments[k] = overrides[k] - except Exception as e: - print(e) - pass - - parser = argparse.ArgumentParser(allow_abbrev=not cli) - parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere") - parser.add_argument("--listen", default=default_arguments['listen'], help="Path for Gradio to listen on") - parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup") - parser.add_argument("--models-from-local-only", action='store_true', default=default_arguments['models-from-local-only'], help="Only loads models from disk, does not check for updates for models") - parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage") - parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)") - parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.") - parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.") - parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.") - parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)") - parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model") - parser.add_argument("--prune-nonfinal-outputs", 
default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation") - parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch") - parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass") - parser.add_argument("--unsqueeze-sample-batches", default=default_arguments['unsqueeze-sample-batches'], action='store_true', help="Unsqueezes sample batches to process one by one after sampling") - parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once") - parser.add_argument("--autocalculate-voice-chunk-duration-size", type=float, default=default_arguments['autocalculate-voice-chunk-duration-size'], help="Number of seconds to suggest voice chunk size for (for example, 100 seconds of audio at 10 seconds per chunk will suggest 10 chunks)") - parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)") - parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output") - parser.add_argument("--results-folder", type=str, default=default_arguments['results-folder'], help="Sets output directory") - - parser.add_argument("--hf-token", type=str, default=default_arguments['hf-token'], help="HuggingFace Token") - parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.") - - parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.") - parser.add_argument("--diffusion-model", default=default_arguments['diffusion-model'], help="Specifies which diffusion model to use for sampling.") - parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use") - parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.") - - parser.add_argument("--phonemizer-backend", default=default_arguments['phonemizer-backend'], help="Specifies which phonemizer backend to use.") - - parser.add_argument("--valle-model", default=default_arguments['valle-model'], help="Specifies which VALL-E model to use for sampling.") - - parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)") - parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.") - parser.add_argument("--whisper-batchsize", type=int, default=default_arguments['whisper-batchsize'], help="Specifies batch size for WhisperX") - - parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp") - parser.add_argument("--training-default-bnb", action='store_true', default=default_arguments['training-default-bnb'], help="Training default: bnb") - - parser.add_argument("--websocket-listen-port", type=int, 
default=default_arguments['websocket-listen-port'], help="Websocket server listen port, default: 8069") - parser.add_argument("--websocket-listen-address", default=default_arguments['websocket-listen-address'], help="Websocket server listen address, default: 127.0.0.1") - parser.add_argument("--websocket-enabled", action='store_true', default=default_arguments['websocket-enabled'], help="Websocket API server enabled, default: false") - - if cli: - args, unknown = parser.parse_known_args() - else: - args = parser.parse_args() - - args.embed_output_metadata = not args.no_embed_output_metadata - - if not args.device_override: - set_device_name(args.device_override) - - if args.sample_batch_size == 0 and get_device_batch_size() == 1: - print("!WARNING! Automatically deduced sample batch size returned 1.") - - args.listen_host = None - args.listen_port = None - args.listen_path = None - if args.listen: - try: - match = re.findall(r"^(?:(.+?):(\d+))?(\/.*?)?$", args.listen)[0] - - args.listen_host = match[0] if match[0] != "" else "127.0.0.1" - args.listen_port = match[1] if match[1] != "" else None - args.listen_path = match[2] if match[2] != "" else "/" - except Exception as e: - pass - - if args.listen_port is not None: - args.listen_port = int(args.listen_port) - if args.listen_port == 0: - args.listen_port = None - - return args - -def get_default_settings( hypenated=True ): - settings = { - 'listen': None if not args.listen else args.listen, - 'share': args.share, - 'low-vram':args.low_vram, - 'check-for-updates':args.check_for_updates, - 'models-from-local-only':args.models_from_local_only, - 'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents, - 'defer-tts-load': args.defer_tts_load, - 'prune-nonfinal-outputs': args.prune_nonfinal_outputs, - 'device-override': args.device_override, - 'sample-batch-size': args.sample_batch_size, - 'unsqueeze-sample-batches': args.unsqueeze_sample_batches, - 'embed-output-metadata': args.embed_output_metadata, - 'latents-lean-and-mean': args.latents_lean_and_mean, - 'voice-fixer': args.voice_fixer, - 'voice-fixer-use-cuda': args.voice_fixer_use_cuda, - 'concurrency-count': args.concurrency_count, - 'output-sample-rate': args.output_sample_rate, - 'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size, - 'output-volume': args.output_volume, - 'results-folder': args.results_folder, - - 'hf-token': args.hf_token, - 'tts-backend': args.tts_backend, - - 'autoregressive-model': args.autoregressive_model, - 'diffusion-model': args.diffusion_model, - 'vocoder-model': args.vocoder_model, - 'tokenizer-json': args.tokenizer_json, - - 'phonemizer-backend': args.phonemizer_backend, - - 'valle-model': args.valle_model, - - 'whisper-backend': args.whisper_backend, - 'whisper-model': args.whisper_model, - 'whisper-batchsize': args.whisper_batchsize, - - 'training-default-halfp': args.training_default_halfp, - 'training-default-bnb': args.training_default_bnb, - } - - res = {} - for k in settings: - res[k.replace("-", "_") if not hypenated else k] = settings[k] - return res - -def update_args( **kwargs ): - global args - - settings = get_default_settings(hypenated=False) - settings.update(kwargs) - - args.listen = settings['listen'] - args.share = settings['share'] - args.check_for_updates = settings['check_for_updates'] - args.models_from_local_only = settings['models_from_local_only'] - args.low_vram = settings['low_vram'] - args.force_cpu_for_conditioning_latents = settings['force_cpu_for_conditioning_latents'] 
- args.defer_tts_load = settings['defer_tts_load'] - args.prune_nonfinal_outputs = settings['prune_nonfinal_outputs'] - args.device_override = settings['device_override'] - args.sample_batch_size = settings['sample_batch_size'] - args.unsqueeze_sample_batches = settings['unsqueeze_sample_batches'] - args.embed_output_metadata = settings['embed_output_metadata'] - args.latents_lean_and_mean = settings['latents_lean_and_mean'] - args.voice_fixer = settings['voice_fixer'] - args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda'] - args.concurrency_count = settings['concurrency_count'] - args.output_sample_rate = 44000 - args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size'] - args.output_volume = settings['output_volume'] - args.results_folder = settings['results_folder'] - - args.hf_token = settings['hf_token'] - args.tts_backend = settings['tts_backend'] - - args.autoregressive_model = settings['autoregressive_model'] - args.diffusion_model = settings['diffusion_model'] - args.vocoder_model = settings['vocoder_model'] - args.tokenizer_json = settings['tokenizer_json'] - - args.phonemizer_backend = settings['phonemizer_backend'] - - args.valle_model = settings['valle_model'] - - args.whisper_backend = settings['whisper_backend'] - args.whisper_model = settings['whisper_model'] - args.whisper_batchsize = settings['whisper_batchsize'] - - args.training_default_halfp = settings['training_default_halfp'] - args.training_default_bnb = settings['training_default_bnb'] - - save_args_settings() - -def save_args_settings(): - global args - settings = get_default_settings() - - os.makedirs('./config/', exist_ok=True) - with open(f'./config/exec.json', 'w', encoding="utf-8") as f: - f.write(json.dumps(settings, indent='\t') ) - -# super kludgy )`; -def import_generate_settings(file = None): - if not file: - file = "./config/generate.json" - - res = { - 'text': None, - 'delimiter': None, - 'emotion': None, - 'prompt': None, - 'voice': "random", - 'mic_audio': None, - 'voice_latents_chunks': None, - 'candidates': None, - 'seed': None, - 'num_autoregressive_samples': 16, - 'diffusion_iterations': 30, - 'temperature': 0.8, - 'diffusion_sampler': "DDIM", - 'breathing_room': 8 , - 'cvvp_weight': 0.0, - 'top_p': 0.8, - 'diffusion_temperature': 1.0, - 'length_penalty': 1.0, - 'repetition_penalty': 2.0, - 'cond_free_k': 2.0, - 'experimentals': None, - } - - settings, _ = read_generate_settings(file, read_latents=False) - - if settings is not None: - res.update(settings) - - return res - -def reset_generate_settings(): - with open(f'./config/generate.json', 'w', encoding="utf-8") as f: - f.write(json.dumps({}, indent='\t') ) - return import_generate_settings() - -def read_generate_settings(file, read_latents=True): - j = None - latents = None - - if isinstance(file, list) and len(file) == 1: - file = file[0] - - try: - if file is not None: - if hasattr(file, 'name'): - file = file.name - - if file[-4:] == ".wav": - metadata = music_tag.load_file(file) - if 'lyrics' in metadata: - j = json.loads(str(metadata['lyrics'])) - elif file[-5:] == ".json": - with open(file, 'r') as f: - j = json.load(f) - except Exception as e: - pass - - if j is not None: - if 'latents' in j: - if read_latents: - latents = base64.b64decode(j['latents']) - del j['latents'] - - - if "time" in j: - j["time"] = "{:.3f}".format(j["time"]) - - - - return ( - j, - latents, - ) - -def version_check_tts( min_version ): - global tts - if not tts: - raise Exception("TTS is not initialized") - - if 
not hasattr(tts, 'version'): - return False - - if min_version[0] > tts.version[0]: - return True - if min_version[1] > tts.version[1]: - return True - if min_version[2] >= tts.version[2]: - return True - return False - -def load_tts( restart=False, - # TorToiSe configs - autoregressive_model=None, diffusion_model=None, vocoder_model=None, tokenizer_json=None, - # VALL-E configs - valle_model=None, -): - global args - global tts - - if restart: - unload_tts() - - tts_loading = True - if args.tts_backend == "tortoise": - if autoregressive_model: - args.autoregressive_model = autoregressive_model - else: - autoregressive_model = args.autoregressive_model - - if autoregressive_model == "auto": - autoregressive_model = deduce_autoregressive_model() - - if diffusion_model: - args.diffusion_model = diffusion_model - else: - diffusion_model = args.diffusion_model - - if vocoder_model: - args.vocoder_model = vocoder_model - else: - vocoder_model = args.vocoder_model - - if tokenizer_json: - args.tokenizer_json = tokenizer_json - else: - tokenizer_json = args.tokenizer_json - - if get_device_name() == "cpu": - print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.") - - print(f"Loading TorToiSe... (AR: {autoregressive_model}, diffusion: {diffusion_model}, vocoder: {vocoder_model})") - tts = TorToise_TTS(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, diffusion_model_path=diffusion_model, vocoder_model=vocoder_model, tokenizer_json=tokenizer_json, unsqueeze_sample_batches=args.unsqueeze_sample_batches) - elif args.tts_backend == "vall-e": - if valle_model: - args.valle_model = valle_model - else: - valle_model = args.valle_model - - print(f"Loading VALL-E... (Config: {valle_model})") - tts = VALLE_TTS(config=args.valle_model) - elif args.tts_backend == "bark": - - print(f"Loading Bark...") - tts = Bark_TTS(small=args.low_vram) - - print("Loaded TTS, ready for generation.") - tts_loading = False - return tts - -def unload_tts(): - global tts - - if tts: - del tts - tts = None - print("Unloaded TTS") - do_gc() - -def reload_tts(): - unload_tts() - load_tts() - -def get_current_voice(): - global current_voice - if current_voice: - return current_voice - - settings, _ = read_generate_settings("./config/generate.json", read_latents=False) - - if settings and "voice" in settings['voice']: - return settings["voice"] - - return None - -def deduce_autoregressive_model(voice=None): - if not voice: - voice = get_current_voice() - - if voice: - if os.path.exists(f'./models/finetunes/{voice}.pth'): - return f'./models/finetunes/{voice}.pth' - - dir = f'./training/{voice}/finetune/models/' - if os.path.isdir(dir): - counts = sorted([ int(d[:-8]) for d in os.listdir(dir) if d[-8:] == "_gpt.pth" ]) - names = [ f'{dir}/{d}_gpt.pth' for d in counts ] - if len(names) > 0: - return names[-1] - - if args.autoregressive_model != "auto": - return args.autoregressive_model - - return get_model_path('autoregressive.pth') - -def update_autoregressive_model(autoregressive_model_path): - if args.tts_backend != "tortoise": - raise f"Unsupported backend: {args.tts_backend}" - - if autoregressive_model_path == "auto": - autoregressive_model_path = deduce_autoregressive_model() - else: - match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', autoregressive_model_path) - if match: - autoregressive_model_path = match[0] - - if not autoregressive_model_path or not os.path.exists(autoregressive_model_path): - print(f"Invalid model: {autoregressive_model_path}") - 
return - - args.autoregressive_model = autoregressive_model_path - save_args_settings() - print(f'Stored autoregressive model to settings: {autoregressive_model_path}') - - global tts - if not tts: - if tts_loading: - raise Exception("TTS is still initializing...") - return - - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - - if autoregressive_model_path == tts.autoregressive_model_path: - return - - tts.load_autoregressive_model(autoregressive_model_path) - - do_gc() - - return autoregressive_model_path - -def update_diffusion_model(diffusion_model_path): - if args.tts_backend != "tortoise": - raise f"Unsupported backend: {args.tts_backend}" - - match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', diffusion_model_path) - if match: - diffusion_model_path = match[0] - - if not diffusion_model_path or not os.path.exists(diffusion_model_path): - print(f"Invalid model: {diffusion_model_path}") - return - - args.diffusion_model = diffusion_model_path - save_args_settings() - print(f'Stored diffusion model to settings: {diffusion_model_path}') - - global tts - if not tts: - if tts_loading: - raise Exception("TTS is still initializing...") - return - - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - if diffusion_model_path == "auto": - diffusion_model_path = deduce_diffusion_model() - - if diffusion_model_path == tts.diffusion_model_path: - return - - tts.load_diffusion_model(diffusion_model_path) - - do_gc() - - return diffusion_model_path - -def update_vocoder_model(vocoder_model): - if args.tts_backend != "tortoise": - raise f"Unsupported backend: {args.tts_backend}" - - args.vocoder_model = vocoder_model - save_args_settings() - print(f'Stored vocoder model to settings: {vocoder_model}') - - global tts - if not tts: - if tts_loading: - raise Exception("TTS is still initializing...") - return - - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - print(f"Loading model: {vocoder_model}") - tts.load_vocoder_model(vocoder_model) - print(f"Loaded model: {tts.vocoder_model}") - - do_gc() - - return vocoder_model - -def update_tokenizer(tokenizer_json): - if args.tts_backend != "tortoise": - raise f"Unsupported backend: {args.tts_backend}" - - args.tokenizer_json = tokenizer_json - save_args_settings() - print(f'Stored tokenizer to settings: {tokenizer_json}') - - global tts - if not tts: - if tts_loading: - raise Exception("TTS is still initializing...") - return - - if hasattr(tts, "loading") and tts.loading: - raise Exception("TTS is still initializing...") - - print(f"Loading tokenizer vocab: {tokenizer_json}") - tts.load_tokenizer_json(tokenizer_json) - print(f"Loaded tokenizer vocab: {tts.tokenizer_json}") - - do_gc() - - return vocoder_model - -def load_voicefixer(restart=False): - global voicefixer - - if restart: - unload_voicefixer() - - try: - print("Loading Voicefixer") - from voicefixer import VoiceFixer - voicefixer = VoiceFixer() - print("Loaded Voicefixer") - except Exception as e: - print(f"Error occurred while tring to initialize voicefixer: {e}") - if voicefixer: - del voicefixer - voicefixer = None - -def unload_voicefixer(): - global voicefixer - - if voicefixer: - del voicefixer - voicefixer = None - print("Unloaded Voicefixer") - - do_gc() - -def load_whisper_model(language=None, model_name=None, progress=None): - global whisper_model - global whisper_align_model - - if args.whisper_backend not in WHISPER_BACKENDS: - raise 
Exception(f"unavailable backend: {args.whisper_backend}") - - if not model_name: - model_name = args.whisper_model - else: - args.whisper_model = model_name - save_args_settings() - - if language and f'{model_name}.{language}' in WHISPER_SPECIALIZED_MODELS: - model_name = f'{model_name}.{language}' - print(f"Loading specialized model for language: {language}") - - notify_progress(f"Loading Whisper model: {model_name}", progress=progress) - - if args.whisper_backend == "openai/whisper": - import whisper - try: - #is it possible for model to fit on vram but go oom later on while executing on data? - whisper_model = whisper.load_model(model_name) - except: - print("Out of VRAM memory. falling back to loading Whisper on CPU.") - whisper_model = whisper.load_model(model_name, device="cpu") - elif args.whisper_backend == "lightmare/whispercpp": - from whispercpp import Whisper - if not language: - language = 'auto' - - b_lang = language.encode('ascii') - whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang) - elif args.whisper_backend == "m-bain/whisperx": - import whisper, whisperx - device = "cuda" if get_device_name() == "cuda" else "cpu" - whisper_model = whisperx.load_model(model_name, device) - whisper_align_model = whisperx.load_align_model(model_name="WAV2VEC2_ASR_LARGE_LV60K_960H" if language=="en" else None, language_code=language, device=device) - - print("Loaded Whisper model") - -def unload_whisper(): - global whisper_model - global whisper_align_model - - if whisper_align_model: - del whisper_align_model - whisper_align_model = None - - if whisper_model: - del whisper_model - whisper_model = None - print("Unloaded Whisper") - - do_gc() - -# shamelessly borrowed from Voldy's Web UI: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/modules/extras.py#L74 -def merge_models( primary_model_name, secondary_model_name, alpha, progress=gr.Progress() ): - key_blacklist = [] - - def weighted_sum(theta0, theta1, alpha): - return ((1 - alpha) * theta0) + (alpha * theta1) - - def read_model( filename ): - print(f"Loading {filename}") - return torch.load(filename) - - theta_func = weighted_sum - - theta_0 = read_model(primary_model_name) - theta_1 = read_model(secondary_model_name) - - for key in tqdm(theta_0.keys(), desc="Merging..."): - if key in key_blacklist: - print("Skipping ignored key:", key) - continue - - a = theta_0[key] - b = theta_1[key] - - if a.dtype != torch.float32 and a.dtype != torch.float16: - print("Skipping key:", key, a.dtype) - continue - - if b.dtype != torch.float32 and b.dtype != torch.float16: - print("Skipping key:", key, b.dtype) - continue - - theta_0[key] = theta_func(a, b, alpha) - - del theta_1 - - primary_basename = os.path.splitext(os.path.basename(primary_model_name))[0] - secondary_basename = os.path.splitext(os.path.basename(secondary_model_name))[0] - suffix = "{:.3f}".format(alpha) - output_path = f'./models/finetunes/{primary_basename}_{secondary_basename}_{suffix}_merge.pth' - - torch.save(theta_0, output_path) - message = f"Saved to {output_path}" - print(message) +import os +if 'XDG_CACHE_HOME' not in os.environ: + os.environ['XDG_CACHE_HOME'] = os.path.realpath(os.path.join(os.getcwd(), './models/')) + +if 'TORTOISE_MODELS_DIR' not in os.environ: + os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/')) + +if 'TRANSFORMERS_CACHE' not in os.environ: + os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/')) + +import 
argparse +import time +import math +import json +import base64 +import re +import urllib.request +import signal +import gc +import subprocess +import psutil +import yaml +import hashlib +import string +import random + +from tqdm import tqdm +import torch +import torchaudio +import music_tag +import gradio as gr +import gradio.utils +import pandas as pd +import numpy as np + +from glob import glob +from datetime import datetime +from datetime import timedelta + +from tortoise.api import TextToSpeech as TorToise_TTS, MODELS, get_model_path, pad_or_truncate +from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices +from tortoise.utils.text import split_and_recombine_text +from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc + + +MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth" + +WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"] +WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"] +WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"] +VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band'] +TTSES = ['tortoise'] + +INFERENCING = False +GENERATE_SETTINGS_ARGS = None + +LEARNING_RATE_SCHEMES = {"Multistep": "MultiStepLR", "Cos. Annealing": "CosineAnnealingLR_Restart"} +LEARNING_RATE_SCHEDULE = [ 2, 4, 9, 18, 25, 33, 50 ] + +RESAMPLERS = {} + +MIN_TRAINING_DURATION = 0.6 +MAX_TRAINING_DURATION = 11.6097505669 +MAX_TRAINING_CHAR_LENGTH = 200 + +VALLE_ENABLED = False +BARK_ENABLED = False + +VERBOSE_DEBUG = True + +import traceback + +try: + from whisper.normalizers.english import EnglishTextNormalizer + from whisper.normalizers.basic import BasicTextNormalizer + from whisper.tokenizer import LANGUAGES + + print("Whisper detected") +except Exception as e: + if VERBOSE_DEBUG: + print(traceback.format_exc()) + pass + +try: + from vall_e.emb.qnt import encode as valle_quantize + from vall_e.emb.g2p import encode as valle_phonemize + + from vall_e.inference import TTS as VALLE_TTS + + import soundfile + + print("VALL-E detected") + VALLE_ENABLED = True +except Exception as e: + if VERBOSE_DEBUG: + print(traceback.format_exc()) + pass + +if VALLE_ENABLED: + TTSES.append('vall-e') + +# torchaudio.set_audio_backend('soundfile') + +try: + import bark + from bark import text_to_semantic + from bark.generation import SAMPLE_RATE as BARK_SAMPLE_RATE, ALLOWED_PROMPTS, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic, load_codec_model + from bark.api import generate_audio as bark_generate_audio + from encodec.utils import convert_audio + + from scipy.io.wavfile import write as write_wav + + print("Bark detected") + BARK_ENABLED = True +except Exception as e: + if VERBOSE_DEBUG: + print(traceback.format_exc()) + pass + +if BARK_ENABLED: + TTSES.append('bark') + + def semantic_to_audio_tokens( + semantic_tokens, + history_prompt = None, + temp = 0.7, + silent = False, + output_full = False, + ): + coarse_tokens = generate_coarse( + semantic_tokens, history_prompt=history_prompt, temp=temp, silent=silent, use_kv_caching=True + ) + fine_tokens = generate_fine(coarse_tokens, history_prompt=history_prompt, temp=0.5) + + if output_full: + full_generation = { + "semantic_prompt": semantic_tokens, + "coarse_prompt": coarse_tokens, + "fine_prompt": fine_tokens, + } + return 
full_generation
+		return fine_tokens
+
+	class Bark_TTS():
+		def __init__(self, small=False):
+			self.input_sample_rate = BARK_SAMPLE_RATE
+			self.output_sample_rate = BARK_SAMPLE_RATE # args.output_sample_rate
+
+			preload_models(
+				text_use_gpu=True,
+				coarse_use_gpu=True,
+				fine_use_gpu=True,
+				codec_use_gpu=True,
+
+				text_use_small=small,
+				coarse_use_small=small,
+				fine_use_small=small,
+
+				force_reload=False
+			)
+
+			self.device = get_device_name()
+
+			try:
+				from vocos import Vocos
+				self.vocos_enabled = True
+				print("Vocos detected")
+			except Exception as e:
+				if VERBOSE_DEBUG:
+					print(traceback.format_exc())
+				self.vocos_enabled = False
+
+			try:
+				from hubert.hubert_manager import HuBERTManager
+
+				hubert_manager = HuBERTManager()
+				hubert_manager.make_sure_hubert_installed()
+				hubert_manager.make_sure_tokenizer_installed()
+
+				self.hubert_enabled = True
+				print("HuBERT detected")
+			except Exception as e:
+				if VERBOSE_DEBUG:
+					print(traceback.format_exc())
+				self.hubert_enabled = False
+
+			if self.vocos_enabled:
+				self.vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(self.device)
+
+		def create_voice( self, voice ):
+			transcription_json = f'./training/{voice}/whisper.json'
+			if not os.path.exists(transcription_json):
+				raise Exception(f"Transcription for voice not found: {voice}")
+
+			transcriptions = json.load(open(transcription_json, 'r', encoding="utf-8"))
+			candidates = []
+			for file in transcriptions:
+				result = transcriptions[file]
+				added = 0
+
+				for segment in result['segments']:
+					path = file.replace(".wav", f"_{pad(segment['id'], 4)}.wav")
+					# check if the slice actually exists
+					if not os.path.exists(f'./training/{voice}/audio/{path}'):
+						continue
+
+					entry = (
+						path,
+						segment['end'] - segment['start'],
+						segment['text']
+					)
+					candidates.append(entry)
+					added = added + 1
+
+				# if nothing got added (assuming because nothing was sliced), use the master file
+				if added == 0: # added < len(result['segments']):
+					start = 0
+					end = 0
+					for segment in result['segments']:
+						start = max( start, segment['start'] )
+						end = max( end, segment['end'] )
+
+					entry = (
+						file,
+						end - start,
+						result['text']
+					)
+					candidates.append(entry)
+
+			candidates.sort(key=lambda x: x[1])
+			candidate = random.choice(candidates)
+			audio_filepath = f'./training/{voice}/audio/{candidate[0]}'
+			text = candidate[-1]
+
+			print("Using as reference:", audio_filepath, text)
+
+			# Load and pre-process the audio waveform
+			model = load_codec_model(use_gpu=True)
+			wav, sr = torchaudio.load(audio_filepath)
+			wav = convert_audio(wav, sr, model.sample_rate, model.channels)
+
+			# generate semantic tokens
+
+			if self.hubert_enabled:
+				from hubert.pre_kmeans_hubert import CustomHubert
+				from hubert.customtokenizer import CustomTokenizer
+
+				wav = wav.to(self.device)
+
+				# Extract discrete codes from EnCodec
+				with torch.no_grad():
+					encoded_frames = model.encode(wav.unsqueeze(0))
+				codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
+
+				# get seconds of audio
+				seconds = wav.shape[-1] / model.sample_rate
+
+				# Load the HuBERT model
+				hubert_model = CustomHubert(checkpoint_path='./data/models/hubert/hubert.pt').to(self.device)
+
+				# Load the CustomTokenizer model
+				tokenizer = CustomTokenizer.load_from_checkpoint('./data/models/hubert/tokenizer.pth').to(self.device)
+
+				semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
+				semantic_tokens = tokenizer.get_token(semantic_vectors)
+
+				# move codes to cpu
+				codes = codes.cpu().numpy()
+				# move
semantic tokens to cpu + semantic_tokens = semantic_tokens.cpu().numpy() + else: + wav = wav.unsqueeze(0).to(self.device) + + # Extract discrete codes from EnCodec + with torch.no_grad(): + encoded_frames = model.encode(wav) + codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze().cpu().numpy() # [n_q, T] + + # get seconds of audio + seconds = wav.shape[-1] / model.sample_rate + + # generate semantic tokens + semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7) + + # print(bark.__file__) + bark_location = os.path.dirname(os.path.relpath(bark.__file__)) # './modules/bark/bark/' + output_path = f'./{bark_location}/assets/prompts/' + voice.replace("/", "_") + '.npz' + np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens) + + def inference( self, text, voice, text_temp=0.7, waveform_temp=0.7 ): + if voice == "random": + voice = None + else: + if not os.path.exists('./modules/bark/bark/assets/prompts/' + voice + '.npz'): + self.create_voice( voice ) + voice = voice.replace("/", "_") + if voice not in ALLOWED_PROMPTS: + ALLOWED_PROMPTS.add( voice ) + + semantic_tokens = text_to_semantic(text, history_prompt=voice, temp=text_temp, silent=False) + audio_tokens = semantic_to_audio_tokens( semantic_tokens, history_prompt=voice, temp=waveform_temp, silent=False, output_full=False ) + + if self.vocos_enabled: + audio_tokens_torch = torch.from_numpy(audio_tokens).to(self.device) + features = self.vocos.codes_to_features(audio_tokens_torch) + wav = self.vocos.decode(features, bandwidth_id=torch.tensor([2], device=self.device)) + else: + wav = codec_decode( audio_tokens ) + + return ( wav, BARK_SAMPLE_RATE ) + # return (bark_generate_audio(text, history_prompt=voice, text_temp=text_temp, waveform_temp=waveform_temp), BARK_SAMPLE_RATE) + +args = None +tts = None +tts_loading = False +webui = None +voicefixer = None + +whisper_model = None +whisper_align_model = None + +training_state = None + +current_voice = None + +def cleanup_voice_name( name ): + return name.split("/")[-1] + +def resample( waveform, input_rate, output_rate=44100 ): + # mono-ize + waveform = torch.mean(waveform, dim=0, keepdim=True) + + if input_rate == output_rate: + return waveform, output_rate + + key = f'{input_rate}:{output_rate}' + if not key in RESAMPLERS: + RESAMPLERS[key] = torchaudio.transforms.Resample( + input_rate, + output_rate, + lowpass_filter_width=16, + rolloff=0.85, + resampling_method="kaiser_window", + beta=8.555504641634386, + ) + + return RESAMPLERS[key]( waveform ), output_rate + +def generate(**kwargs): + if args.tts_backend == "tortoise": + return generate_tortoise(**kwargs) + if args.tts_backend == "vall-e": + return generate_valle(**kwargs) + if args.tts_backend == "bark": + return generate_bark(**kwargs) + +def generate_bark(**kwargs): + parameters = {} + parameters.update(kwargs) + + voice = parameters['voice'] + progress = parameters['progress'] if 'progress' in parameters else None + if parameters['seed'] == 0: + parameters['seed'] = None + + usedSeed = parameters['seed'] + + global args + global tts + + unload_whisper() + unload_voicefixer() + + if not tts: + # should check if it's loading or unloaded, and load it if it's unloaded + if tts_loading: + raise Exception("TTS is still initializing...") + if progress is not None: + notify_progress("Initializing TTS...", progress=progress) + load_tts() + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still 
initializing...") + + do_gc() + + voice_samples = None + conditioning_latents = None + sample_voice = None + + voice_cache = {} + + def get_settings( override=None ): + settings = { + 'voice': parameters['voice'], + 'text_temp': float(parameters['temperature']), + 'waveform_temp': float(parameters['temperature']), + } + + # could be better to just do a ternary on everything above, but i am not a professional + selected_voice = voice + if override is not None: + if 'voice' in override: + selected_voice = override['voice'] + + for k in override: + if k not in settings: + continue + settings[k] = override[k] + + return settings + + if not parameters['delimiter']: + parameters['delimiter'] = "\n" + elif parameters['delimiter'] == "\\n": + parameters['delimiter'] = "\n" + + if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: + texts = parameters['text'].split(parameters['delimiter']) + else: + texts = split_and_recombine_text(parameters['text']) + + full_start_time = time.time() + + outdir = f"{args.results_folder}/{voice}/" + os.makedirs(outdir, exist_ok=True) + + audio_cache = {} + + volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None + + idx = 0 + idx_cache = {} + for i, file in enumerate(os.listdir(outdir)): + filename = os.path.basename(file) + extension = os.path.splitext(filename)[-1][1:] + if extension != "json" and extension != "wav": + continue + match = re.findall(rf"^{cleanup_voice_name(voice)}_(\d+)(?:.+?)?{extension}$", filename) + if match and len(match) > 0: + key = int(match[0]) + idx_cache[key] = True + + if len(idx_cache) > 0: + keys = sorted(list(idx_cache.keys())) + idx = keys[-1] + 1 + + idx = pad(idx, 4) + + def get_name(line=0, candidate=0, combined=False): + name = f"{idx}" + if combined: + name = f"{name}_combined" + elif len(texts) > 1: + name = f"{name}_{line}" + if parameters['candidates'] > 1: + name = f"{name}_{candidate}" + return name + + def get_info( voice, settings = None, latents = True ): + info = {} + info.update(parameters) + + info['time'] = time.time()-full_start_time + info['datetime'] = datetime.now().isoformat() + + info['progress'] = None + del info['progress'] + + if info['delimiter'] == "\n": + info['delimiter'] = "\\n" + + if settings is not None: + for k in settings: + if k in info: + info[k] = settings[k] + return info + + INFERENCING = True + for line, cut_text in enumerate(texts): + tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' + print(f"{tqdm_prefix} Generating line: {cut_text}") + start_time = time.time() + + # do setting editing + match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) + override = None + if match and len(match) > 0: + match = match[0] + try: + override = json.loads(match[0]) + cut_text = match[1].strip() + except Exception as e: + raise Exception("Prompt settings editing requested, but received invalid JSON") + + settings = get_settings( override=override ) + + gen = tts.inference(cut_text, **settings ) + + run_time = time.time()-start_time + print(f"Generating line took {run_time} seconds") + + if not isinstance(gen, list): + gen = [gen] + + for j, g in enumerate(gen): + wav, sr = g + name = get_name(line=line, candidate=j) + + settings['text'] = cut_text + settings['time'] = run_time + settings['datetime'] = datetime.now().isoformat() + + # save here in case some error happens mid-batch + if tts.vocos_enabled: + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', 
wav.cpu(), sr) + else: + write_wav(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', sr, wav) + wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + + audio_cache[name] = { + 'audio': wav, + 'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) + } + + del gen + do_gc() + INFERENCING = False + + for k in audio_cache: + audio = audio_cache[k]['audio'] + + audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) + if volume_adjust is not None: + audio = volume_adjust(audio) + + audio_cache[k]['audio'] = audio + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) + + output_voices = [] + for candidate in range(parameters['candidates']): + if len(texts) > 1: + audio_clips = [] + for line in range(len(texts)): + name = get_name(line=line, candidate=candidate) + audio = audio_cache[name]['audio'] + audio_clips.append(audio) + + name = get_name(candidate=candidate, combined=True) + audio = torch.cat(audio_clips, dim=-1) + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) + + audio = audio.squeeze(0).cpu() + audio_cache[name] = { + 'audio': audio, + 'settings': get_info(voice=voice), + 'output': True + } + else: + try: + name = get_name(candidate=candidate) + audio_cache[name]['output'] = True + except Exception as e: + for name in audio_cache: + audio_cache[name]['output'] = True + + + if args.voice_fixer: + if not voicefixer: + notify_progress("Loading voicefix...", progress=progress) + load_voicefixer() + + try: + fixed_cache = {} + for name in tqdm(audio_cache, desc="Running voicefix..."): + del audio_cache[name]['audio'] + if 'output' not in audio_cache[name] or not audio_cache[name]['output']: + continue + + path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' + fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' + voicefixer.restore( + input=path, + output=fixed, + cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, + #mode=mode, + ) + + fixed_cache[f'{name}_fixed'] = { + 'settings': audio_cache[name]['settings'], + 'output': True + } + audio_cache[name]['output'] = False + + for name in fixed_cache: + audio_cache[name] = fixed_cache[name] + except Exception as e: + print(e) + print("\nFailed to run Voicefixer") + + for name in audio_cache: + if 'output' not in audio_cache[name] or not audio_cache[name]['output']: + if args.prune_nonfinal_outputs: + audio_cache[name]['pruned'] = True + os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + continue + + output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + + if not args.embed_output_metadata: + with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) + + if args.embed_output_metadata: + for name in tqdm(audio_cache, desc="Embedding metadata..."): + if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: + continue + + metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") + metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) + metadata.save() + + if sample_voice is not None: + sample_voice = (tts.input_sample_rate, sample_voice.numpy()) + + info = get_info(voice=voice, latents=False) + print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") + + info['seed'] = usedSeed + if 'latents' in info: + del 
info['latents'] + + os.makedirs('./config/', exist_ok=True) + with open(f'./config/generate.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(info, indent='\t') ) + + stats = [ + [ parameters['seed'], "{:.3f}".format(info['time']) ] + ] + + return ( + sample_voice, + output_voices, + stats, + ) + +def generate_valle(**kwargs): + parameters = {} + parameters.update(kwargs) + + voice = parameters['voice'] + progress = parameters['progress'] if 'progress' in parameters else None + if parameters['seed'] == 0: + parameters['seed'] = None + + usedSeed = parameters['seed'] + + global args + global tts + + unload_whisper() + unload_voicefixer() + + if not tts: + # should check if it's loading or unloaded, and load it if it's unloaded + if tts_loading: + raise Exception("TTS is still initializing...") + if progress is not None: + notify_progress("Initializing TTS...", progress=progress) + load_tts() + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + do_gc() + + voice_samples = None + conditioning_latents = None + sample_voice = None + + voice_cache = {} + def fetch_voice( voice ): + if voice in voice_cache: + return voice_cache[voice] + + """ + voice_dir = f'./training/{voice}/audio/' + + if not os.path.isdir(voice_dir) or len(os.listdir(voice_dir)) == 0: + voice_dir = f'./voices/{voice}/' + + files = [ f'{voice_dir}/{d}' for d in os.listdir(voice_dir) if d[-4:] == ".wav" ] + """ + + if os.path.isdir(f'./training/{voice}/audio/'): + files = get_voice(name="audio", dir=f"./training/{voice}/", load_latents=False) + else: + files = get_voice(name=voice, load_latents=False) + + # return files + voice_cache[voice] = random.sample(files, k=min(3, len(files))) + return voice_cache[voice] + + def get_settings( override=None ): + settings = { + 'ar_temp': float(parameters['temperature']), + 'nar_temp': float(parameters['temperature']), + 'max_ar_steps': parameters['num_autoregressive_samples'], + } + + # could be better to just do a ternary on everything above, but i am not a professional + selected_voice = voice + if override is not None: + if 'voice' in override: + selected_voice = override['voice'] + + for k in override: + if k not in settings: + continue + settings[k] = override[k] + + settings['references'] = fetch_voice(voice=selected_voice) # [ fetch_voice(voice=selected_voice) for _ in range(3) ] + return settings + + if not parameters['delimiter']: + parameters['delimiter'] = "\n" + elif parameters['delimiter'] == "\\n": + parameters['delimiter'] = "\n" + + if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: + texts = parameters['text'].split(parameters['delimiter']) + else: + texts = split_and_recombine_text(parameters['text']) + + full_start_time = time.time() + + outdir = f"{args.results_folder}/{voice}/" + os.makedirs(outdir, exist_ok=True) + + audio_cache = {} + + volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None + + idx = 0 + idx_cache = {} + for i, file in enumerate(os.listdir(outdir)): + filename = os.path.basename(file) + extension = os.path.splitext(filename)[-1][1:] + if extension != "json" and extension != "wav": + continue + match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename) + if match and len(match) > 0: + key = int(match[0]) + idx_cache[key] = True + + if len(idx_cache) > 0: + keys = sorted(list(idx_cache.keys())) + idx = keys[-1] + 1 + + idx = pad(idx, 4) + + def 
get_name(line=0, candidate=0, combined=False): + name = f"{idx}" + if combined: + name = f"{name}_combined" + elif len(texts) > 1: + name = f"{name}_{line}" + if parameters['candidates'] > 1: + name = f"{name}_{candidate}" + return name + + def get_info( voice, settings = None, latents = True ): + info = {} + info.update(parameters) + + info['time'] = time.time()-full_start_time + info['datetime'] = datetime.now().isoformat() + + info['progress'] = None + del info['progress'] + + if info['delimiter'] == "\n": + info['delimiter'] = "\\n" + + if settings is not None: + for k in settings: + if k in info: + info[k] = settings[k] + return info + + INFERENCING = True + for line, cut_text in enumerate(texts): + tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' + print(f"{tqdm_prefix} Generating line: {cut_text}") + start_time = time.time() + + # do setting editing + match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) + override = None + if match and len(match) > 0: + match = match[0] + try: + override = json.loads(match[0]) + cut_text = match[1].strip() + except Exception as e: + raise Exception("Prompt settings editing requested, but received invalid JSON") + + name = get_name(line=line, candidate=0) + + settings = get_settings( override=override ) + references = settings['references'] + settings.pop("references") + settings['out_path'] = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' + + gen = tts.inference(cut_text, references, **settings ) + + run_time = time.time()-start_time + print(f"Generating line took {run_time} seconds") + + if not isinstance(gen, list): + gen = [gen] + + for j, g in enumerate(gen): + wav, sr = g + name = get_name(line=line, candidate=j) + + settings['text'] = cut_text + settings['time'] = run_time + settings['datetime'] = datetime.now().isoformat() + + # save here in case some error happens mid-batch + #torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu(), sr) + #soundfile.write(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu()[0,0], sr) + wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + + audio_cache[name] = { + 'audio': wav, + 'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) + } + + del gen + do_gc() + INFERENCING = False + + for k in audio_cache: + audio = audio_cache[k]['audio'] + + audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) + if volume_adjust is not None: + audio = volume_adjust(audio) + + audio_cache[k]['audio'] = audio + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) + + output_voices = [] + for candidate in range(parameters['candidates']): + if len(texts) > 1: + audio_clips = [] + for line in range(len(texts)): + name = get_name(line=line, candidate=candidate) + audio = audio_cache[name]['audio'] + audio_clips.append(audio) + + name = get_name(candidate=candidate, combined=True) + audio = torch.cat(audio_clips, dim=-1) + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) + + audio = audio.squeeze(0).cpu() + audio_cache[name] = { + 'audio': audio, + 'settings': get_info(voice=voice), + 'output': True + } + else: + name = get_name(candidate=candidate) + audio_cache[name]['output'] = True + + + if args.voice_fixer: + if not voicefixer: + notify_progress("Loading voicefix...", progress=progress) + load_voicefixer() + + try: + fixed_cache = {} + for name in tqdm(audio_cache, desc="Running voicefix..."): 
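+					# the in-memory waveform is dropped from the cache first; voicefixer below works on the WAV files already saved to disk and writes a *_fixed.wav alongside each final output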
+ del audio_cache[name]['audio'] + if 'output' not in audio_cache[name] or not audio_cache[name]['output']: + continue + + path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' + fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' + voicefixer.restore( + input=path, + output=fixed, + cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, + #mode=mode, + ) + + fixed_cache[f'{name}_fixed'] = { + 'settings': audio_cache[name]['settings'], + 'output': True + } + audio_cache[name]['output'] = False + + for name in fixed_cache: + audio_cache[name] = fixed_cache[name] + except Exception as e: + print(e) + print("\nFailed to run Voicefixer") + + for name in audio_cache: + if 'output' not in audio_cache[name] or not audio_cache[name]['output']: + if args.prune_nonfinal_outputs: + audio_cache[name]['pruned'] = True + os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + continue + + output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + + if not args.embed_output_metadata: + with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) + + if args.embed_output_metadata: + for name in tqdm(audio_cache, desc="Embedding metadata..."): + if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: + continue + + metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") + metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) + metadata.save() + + if sample_voice is not None: + sample_voice = (tts.input_sample_rate, sample_voice.numpy()) + + info = get_info(voice=voice, latents=False) + print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") + + info['seed'] = usedSeed + if 'latents' in info: + del info['latents'] + + os.makedirs('./config/', exist_ok=True) + with open(f'./config/generate.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(info, indent='\t') ) + + stats = [ + [ parameters['seed'], "{:.3f}".format(info['time']) ] + ] + + return ( + sample_voice, + output_voices, + stats, + ) + +def generate_tortoise(**kwargs): + parameters = {} + parameters.update(kwargs) + + voice = parameters['voice'] + progress = parameters['progress'] if 'progress' in parameters else None + if parameters['seed'] == 0: + parameters['seed'] = None + + usedSeed = parameters['seed'] + + global args + global tts + + unload_whisper() + unload_voicefixer() + + if not tts: + # should check if it's loading or unloaded, and load it if it's unloaded + if tts_loading: + raise Exception("TTS is still initializing...") + load_tts() + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + do_gc() + + voice_samples = None + conditioning_latents = None + sample_voice = None + + voice_cache = {} + def fetch_voice( voice ): + cache_key = f'{voice}:{tts.autoregressive_model_hash[:8]}' + if cache_key in voice_cache: + return voice_cache[cache_key] + + print(f"Loading voice: {voice} with model {tts.autoregressive_model_hash[:8]}") + sample_voice = None + if voice == "microphone": + if parameters['mic_audio'] is None: + raise Exception("Please provide audio from mic when choosing `microphone` as a voice input") + voice_samples, conditioning_latents = [load_audio(parameters['mic_audio'], tts.input_sample_rate)], None + elif voice == "random": + voice_samples, conditioning_latents = None, tts.get_random_conditioning_latents() + else: + if progress is not None: + 
notify_progress(f"Loading voice: {voice}", progress=progress) + + voice_samples, conditioning_latents = load_voice(voice, model_hash=tts.autoregressive_model_hash) + + if voice_samples and len(voice_samples) > 0: + if conditioning_latents is None: + conditioning_latents = compute_latents(voice=voice, voice_samples=voice_samples, voice_latents_chunks=parameters['voice_latents_chunks']) + + sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu() + voice_samples = None + + voice_cache[cache_key] = (voice_samples, conditioning_latents, sample_voice) + return voice_cache[cache_key] + + def get_settings( override=None ): + settings = { + 'temperature': float(parameters['temperature']), + + 'top_p': float(parameters['top_p']), + 'diffusion_temperature': float(parameters['diffusion_temperature']), + 'length_penalty': float(parameters['length_penalty']), + 'repetition_penalty': float(parameters['repetition_penalty']), + 'cond_free_k': float(parameters['cond_free_k']), + + 'num_autoregressive_samples': parameters['num_autoregressive_samples'], + 'sample_batch_size': args.sample_batch_size, + 'diffusion_iterations': parameters['diffusion_iterations'], + + 'voice_samples': None, + 'conditioning_latents': None, + + 'use_deterministic_seed': parameters['seed'], + 'return_deterministic_state': True, + 'k': parameters['candidates'], + 'diffusion_sampler': parameters['diffusion_sampler'], + 'breathing_room': parameters['breathing_room'], + 'half_p': "Half Precision" in parameters['experimentals'], + 'cond_free': "Conditioning-Free" in parameters['experimentals'], + 'cvvp_amount': parameters['cvvp_weight'], + + 'autoregressive_model': args.autoregressive_model, + 'diffusion_model': args.diffusion_model, + 'tokenizer_json': args.tokenizer_json, + } + + # could be better to just do a ternary on everything above, but i am not a professional + selected_voice = voice + if override is not None: + if 'voice' in override: + selected_voice = override['voice'] + + for k in override: + if k not in settings: + continue + settings[k] = override[k] + + if settings['autoregressive_model'] is not None: + if settings['autoregressive_model'] == "auto": + settings['autoregressive_model'] = deduce_autoregressive_model(selected_voice) + tts.load_autoregressive_model(settings['autoregressive_model']) + + if settings['diffusion_model'] is not None: + if settings['diffusion_model'] == "auto": + settings['diffusion_model'] = deduce_diffusion_model(selected_voice) + tts.load_diffusion_model(settings['diffusion_model']) + + if settings['tokenizer_json'] is not None: + tts.load_tokenizer_json(settings['tokenizer_json']) + + settings['voice_samples'], settings['conditioning_latents'], _ = fetch_voice(voice=selected_voice) + + # clamp it down for the insane users who want this + # it would be wiser to enforce the sample size to the batch size, but this is what the user wants + settings['sample_batch_size'] = args.sample_batch_size + if not settings['sample_batch_size']: + settings['sample_batch_size'] = tts.autoregressive_batch_size + if settings['num_autoregressive_samples'] < settings['sample_batch_size']: + settings['sample_batch_size'] = settings['num_autoregressive_samples'] + + if settings['conditioning_latents'] is not None and len(settings['conditioning_latents']) == 2 and settings['cvvp_amount'] > 0: + print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. 
Please regenerate your voice latents with 'Slimmer voice latents' unchecked.") + settings['cvvp_amount'] = 0 + + return settings + + if not parameters['delimiter']: + parameters['delimiter'] = "\n" + elif parameters['delimiter'] == "\\n": + parameters['delimiter'] = "\n" + + if parameters['delimiter'] and parameters['delimiter'] != "" and parameters['delimiter'] in parameters['text']: + texts = parameters['text'].split(parameters['delimiter']) + else: + texts = split_and_recombine_text(parameters['text']) + + full_start_time = time.time() + + outdir = f"{args.results_folder}/{voice}/" + os.makedirs(outdir, exist_ok=True) + + audio_cache = {} + + volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None + + idx = 0 + idx_cache = {} + for i, file in enumerate(os.listdir(outdir)): + filename = os.path.basename(file) + extension = os.path.splitext(filename)[-1][1:] + if extension != "json" and extension != "wav": + continue + match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename) + if match and len(match) > 0: + key = int(match[0]) + idx_cache[key] = True + + if len(idx_cache) > 0: + keys = sorted(list(idx_cache.keys())) + idx = keys[-1] + 1 + + idx = pad(idx, 4) + + def get_name(line=0, candidate=0, combined=False): + name = f"{idx}" + if combined: + name = f"{name}_combined" + elif len(texts) > 1: + name = f"{name}_{line}" + if parameters['candidates'] > 1: + name = f"{name}_{candidate}" + return name + + def get_info( voice, settings = None, latents = True ): + info = {} + info.update(parameters) + + info['time'] = time.time()-full_start_time + info['datetime'] = datetime.now().isoformat() + + info['model'] = tts.autoregressive_model_path + info['model_hash'] = tts.autoregressive_model_hash + + info['progress'] = None + del info['progress'] + + if info['delimiter'] == "\n": + info['delimiter'] = "\\n" + + if settings is not None: + for k in settings: + if k in info: + info[k] = settings[k] + + if 'half_p' in settings and 'cond_free' in settings: + info['experimentals'] = [] + if settings['half_p']: + info['experimentals'].append("Half Precision") + if settings['cond_free']: + info['experimentals'].append("Conditioning-Free") + + if latents and "latents" not in info: + voice = info['voice'] + model_hash = settings["model_hash"][:8] if settings is not None and "model_hash" in settings else tts.autoregressive_model_hash[:8] + + dir = f'{get_voice_dir()}/{voice}/' + latents_path = f'{dir}/cond_latents_{model_hash}.pth' + + if voice == "random" or voice == "microphone": + if latents and settings is not None and settings['conditioning_latents']: + os.makedirs(dir, exist_ok=True) + torch.save(conditioning_latents, latents_path) + + if latents_path and os.path.exists(latents_path): + try: + with open(latents_path, 'rb') as f: + info['latents'] = base64.b64encode(f.read()).decode("ascii") + except Exception as e: + pass + + return info + + INFERENCING = True + for line, cut_text in enumerate(texts): + if should_phonemize(): + cut_text = phonemizer( cut_text ) + + if parameters['emotion'] == "Custom": + if parameters['prompt'] and parameters['prompt'].strip() != "": + cut_text = f"[{parameters['prompt']},] {cut_text}" + elif parameters['emotion'] != "None" and parameters['emotion']: + cut_text = f"[I am really {parameters['emotion'].lower()},] {cut_text}" + + tqdm_prefix = f'[{str(line+1)}/{str(len(texts))}]' + print(f"{tqdm_prefix} Generating line: {cut_text}") + start_time = time.time() + + # do setting editing + 
match = re.findall(r'^(\{.+\}) (.+?)$', cut_text) + override = None + if match and len(match) > 0: + match = match[0] + try: + override = json.loads(match[0]) + cut_text = match[1].strip() + except Exception as e: + raise Exception("Prompt settings editing requested, but received invalid JSON") + + settings = get_settings( override=override ) + gen, additionals = tts.tts(cut_text, **settings ) + + parameters['seed'] = additionals[0] + run_time = time.time()-start_time + print(f"Generating line took {run_time} seconds") + + if not isinstance(gen, list): + gen = [gen] + + for j, g in enumerate(gen): + audio = g.squeeze(0).cpu() + name = get_name(line=line, candidate=j) + + settings['text'] = cut_text + settings['time'] = run_time + settings['datetime'] = datetime.now().isoformat() + if args.tts_backend == "tortoise": + settings['model'] = tts.autoregressive_model_path + settings['model_hash'] = tts.autoregressive_model_hash + + audio_cache[name] = { + 'audio': audio, + 'settings': get_info(voice=override['voice'] if override and 'voice' in override else voice, settings=settings) + } + # save here in case some error happens mid-batch + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, tts.output_sample_rate) + + del gen + do_gc() + INFERENCING = False + + for k in audio_cache: + audio = audio_cache[k]['audio'] + + audio, _ = resample(audio, tts.output_sample_rate, args.output_sample_rate) + if volume_adjust is not None: + audio = volume_adjust(audio) + + audio_cache[k]['audio'] = audio + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{k}.wav', audio, args.output_sample_rate) + + output_voices = [] + for candidate in range(parameters['candidates']): + if len(texts) > 1: + audio_clips = [] + for line in range(len(texts)): + name = get_name(line=line, candidate=candidate) + audio = audio_cache[name]['audio'] + audio_clips.append(audio) + + name = get_name(candidate=candidate, combined=True) + audio = torch.cat(audio_clips, dim=-1) + torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', audio, args.output_sample_rate) + + audio = audio.squeeze(0).cpu() + audio_cache[name] = { + 'audio': audio, + 'settings': get_info(voice=voice), + 'output': True + } + else: + name = get_name(candidate=candidate) + audio_cache[name]['output'] = True + + + if args.voice_fixer: + if not voicefixer: + notify_progress("Loading voicefix...", progress=progress) + load_voicefixer() + + try: + fixed_cache = {} + for name in tqdm(audio_cache, desc="Running voicefix..."): + del audio_cache[name]['audio'] + if 'output' not in audio_cache[name] or not audio_cache[name]['output']: + continue + + path = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav' + fixed = f'{outdir}/{cleanup_voice_name(voice)}_{name}_fixed.wav' + voicefixer.restore( + input=path, + output=fixed, + cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, + #mode=mode, + ) + + fixed_cache[f'{name}_fixed'] = { + 'settings': audio_cache[name]['settings'], + 'output': True + } + audio_cache[name]['output'] = False + + for name in fixed_cache: + audio_cache[name] = fixed_cache[name] + except Exception as e: + print(e) + print("\nFailed to run Voicefixer") + + for name in audio_cache: + if 'output' not in audio_cache[name] or not audio_cache[name]['output']: + if args.prune_nonfinal_outputs: + audio_cache[name]['pruned'] = True + os.remove(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + continue + + output_voices.append(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav') + + if not 
args.embed_output_metadata: + with open(f'{outdir}/{cleanup_voice_name(voice)}_{name}.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(audio_cache[name]['settings'], indent='\t') ) + + if args.embed_output_metadata: + for name in tqdm(audio_cache, desc="Embedding metadata..."): + if 'pruned' in audio_cache[name] and audio_cache[name]['pruned']: + continue + + metadata = music_tag.load_file(f"{outdir}/{cleanup_voice_name(voice)}_{name}.wav") + metadata['lyrics'] = json.dumps(audio_cache[name]['settings']) + metadata.save() + + if sample_voice is not None: + sample_voice = (tts.input_sample_rate, sample_voice.numpy()) + + info = get_info(voice=voice, latents=False) + print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n") + + info['seed'] = usedSeed + if 'latents' in info: + del info['latents'] + + os.makedirs('./config/', exist_ok=True) + with open(f'./config/generate.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(info, indent='\t') ) + + stats = [ + [ parameters['seed'], "{:.3f}".format(info['time']) ] + ] + + return ( + sample_voice, + output_voices, + stats, + ) + +def cancel_generate(): + if not INFERENCING: + return + + import tortoise.api + + tortoise.api.STOP_SIGNAL = True + +def hash_file(path, algo="md5", buffer_size=0): + hash = None + if algo == "md5": + hash = hashlib.md5() + elif algo == "sha1": + hash = hashlib.sha1() + else: + raise Exception(f'Unknown hash algorithm specified: {algo}') + + if not os.path.exists(path): + raise Exception(f'Path not found: {path}') + + with open(path, 'rb') as f: + if buffer_size > 0: + while True: + data = f.read(buffer_size) + if not data: + break + hash.update(data) + else: + hash.update(f.read()) + + return "{0}".format(hash.hexdigest()) + +def update_baseline_for_latents_chunks( voice ): + global current_voice + current_voice = voice + + path = f'{get_voice_dir()}/{voice}/' + if not os.path.isdir(path): + return 1 + + dataset_file = f'./training/{voice}/train.txt' + if os.path.exists(dataset_file): + return 0 # 0 will leverage using the LJspeech dataset for computing latents + + files = os.listdir(path) + + total = 0 + total_duration = 0 + + for file in files: + if file[-4:] != ".wav": + continue + + metadata = torchaudio.info(f'{path}/{file}') + duration = metadata.num_frames / metadata.sample_rate + total_duration += duration + total = total + 1 + + + # brain too fried to figure out a better way + if args.autocalculate_voice_chunk_duration_size == 0: + return int(total_duration / total) if total > 0 else 1 + return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1 + +def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, original_ar=False, original_diffusion=False): + global tts + global args + + unload_whisper() + unload_voicefixer() + + if not tts: + if tts_loading: + raise Exception("TTS is still initializing...") + load_tts() + + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + if args.tts_backend == "bark": + tts.create_voice( voice ) + return + + if args.autoregressive_model == "auto": + tts.load_autoregressive_model(deduce_autoregressive_model(voice)) + + if voice: + load_from_dataset = voice_latents_chunks == 0 + + if load_from_dataset: + dataset_path = f'./training/{voice}/train.txt' + if not os.path.exists(dataset_path): + load_from_dataset = False + else: + with open(dataset_path, 'r', encoding="utf-8") as f: + lines = f.readlines() + + print("Leveraging dataset for 
computing latents") + + voice_samples = [] + max_length = 0 + for line in lines: + filename = f'./training/{voice}/{line.split("|")[0]}' + + waveform = load_audio(filename, 22050) + max_length = max(max_length, waveform.shape[-1]) + voice_samples.append(waveform) + + for i in range(len(voice_samples)): + voice_samples[i] = pad_or_truncate(voice_samples[i], max_length) + + voice_latents_chunks = len(voice_samples) + if voice_latents_chunks == 0: + print("Dataset is empty!") + load_from_dataset = True + if not load_from_dataset: + voice_samples, _ = load_voice(voice, load_latents=False) + + if voice_samples is None: + return + + conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents, original_ar=original_ar, original_diffusion=original_diffusion) + + if len(conditioning_latents) == 4: + conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None) + + outfile = f'{get_voice_dir()}/{voice}/cond_latents_{tts.autoregressive_model_hash[:8]}.pth' + torch.save(conditioning_latents, outfile) + print(f'Saved voice latents: {outfile}') + + return conditioning_latents + +# superfluous, but it cleans up some things +class TrainingState(): + def __init__(self, config_path, keep_x_past_checkpoints=0, start=True): + self.killed = False + + self.training_dir = os.path.dirname(config_path) + with open(config_path, 'r') as file: + self.yaml_config = yaml.safe_load(file) + + self.json_config = json.load(open(f"{self.training_dir}/train.json", 'r', encoding="utf-8")) + self.dataset_path = f"{self.training_dir}/train.txt" + with open(self.dataset_path, 'r', encoding="utf-8") as f: + self.dataset_size = len(f.readlines()) + + self.batch_size = self.json_config["batch_size"] + self.save_rate = self.json_config["save_rate"] + + self.epoch = 0 + self.epochs = self.json_config["epochs"] + self.it = 0 + self.its = calc_iterations( self.epochs, self.dataset_size, self.batch_size ) + self.step = 0 + self.steps = int(self.its / self.dataset_size) + self.checkpoint = 0 + self.checkpoints = int((self.its - self.it) / self.save_rate) + + self.gpus = self.json_config['gpus'] + + self.buffer = [] + + self.open_state = False + self.training_started = False + + self.info = {} + + self.it_rate = "" + self.it_rates = 0 + + self.epoch_rate = "" + + self.eta = "?" + self.eta_hhmmss = "?" 
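+		# ETA starts out unknown; parse_metrics() below recomputes it from the iteration rate reported by the training process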
+ + self.nan_detected = False + + self.last_info_check_at = 0 + self.statistics = { + 'loss': [], + 'lr': [], + 'grad_norm': [], + } + self.losses = [] + self.metrics = { + 'step': "", + 'rate': "", + 'loss': "", + } + + self.loss_milestones = [ 1.0, 0.15, 0.05 ] + + if args.tts_backend=="vall-e": + self.valle_last_it = 0 + self.valle_steps = 0 + + if keep_x_past_checkpoints > 0: + self.cleanup_old(keep=keep_x_past_checkpoints) + if start: + self.spawn_process(config_path=config_path, gpus=self.gpus) + + def spawn_process(self, config_path, gpus=1): + if args.tts_backend == "vall-e": + self.cmd = ['deepspeed', f'--num_gpus={gpus}', '--module', 'vall_e.train', f'yaml="{config_path}"'] + else: + self.cmd = ['train.bat', config_path] if os.name == "nt" else ['./train.sh', config_path] + + print("Spawning process: ", " ".join(self.cmd)) + self.process = subprocess.Popen(self.cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) + + def parse_metrics(self, data): + if isinstance(data, str): + if line.find('Training Metrics:') >= 0: + data = json.loads(line.split("Training Metrics:")[-1]) + data['mode'] = "training" + elif line.find('Validation Metrics:') >= 0: + data = json.loads(line.split("Validation Metrics:")[-1]) + data['mode'] = "validation" + else: + return + + self.info = data + if 'epoch' in self.info: + self.epoch = int(self.info['epoch']) + if 'it' in self.info: + self.it = int(self.info['it']) + if 'step' in self.info: + self.step = int(self.info['step']) + if 'steps' in self.info: + self.steps = int(self.info['steps']) + + if 'elapsed_time' in self.info: + self.info['iteration_rate'] = self.info['elapsed_time'] + del self.info['elapsed_time'] + + if 'iteration_rate' in self.info: + it_rate = self.info['iteration_rate'] + self.it_rate = f'{"{:.3f}".format(1/it_rate)}it/s' if 0 < it_rate and it_rate < 1 else f'{"{:.3f}".format(it_rate)}s/it' + self.it_rates += it_rate + + if self.it_rates > 0 and self.it * self.steps > 0: + epoch_rate = self.it_rates / self.it * self.steps + self.epoch_rate = f'{"{:.3f}".format(1/epoch_rate)}epoch/s' if 0 < epoch_rate and epoch_rate < 1 else f'{"{:.3f}".format(epoch_rate)}s/epoch' + + try: + self.eta = (self.its - self.it) * (self.it_rates / self.it) + eta = str(timedelta(seconds=int(self.eta))) + self.eta_hhmmss = eta + except Exception as e: + self.eta_hhmmss = "?" 
+ pass + + self.metrics['step'] = [f"{self.epoch}/{self.epochs}"] + if self.epochs != self.its: + self.metrics['step'].append(f"{self.it}/{self.its}") + if self.steps > 1: + self.metrics['step'].append(f"{self.step}/{self.steps}") + self.metrics['step'] = ", ".join(self.metrics['step']) + + if args.tts_backend == "tortoise": + epoch = self.epoch + (self.step / self.steps) + else: + epoch = self.info['epoch'] if 'epoch' in self.info else self.it + + if self.it > 0: + # probably can double for-loop but whatever + keys = { + 'lrs': ['lr'], + 'losses': ['loss_text_ce', 'loss_mel_ce'], + 'accuracies': [], + 'precisions': [], + 'grad_norms': [], + } + if args.tts_backend == "vall-e": + keys['lrs'] = [ + 'ar.lr', 'nar.lr', + ] + keys['losses'] = [ + # 'ar.loss', 'nar.loss', 'ar+nar.loss', + 'ar.loss.nll', 'nar.loss.nll', + ] + + keys['accuracies'] = [ + 'ar.loss.acc', 'nar.loss.acc', + 'ar.stats.acc', 'nar.loss.acc', + ] + keys['precisions'] = [ 'ar.loss.precision', 'nar.loss.precision', ] + keys['grad_norms'] = ['ar.grad_norm', 'nar.grad_norm'] + + for k in keys['lrs']: + if k not in self.info: + continue + + self.statistics['lr'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) + + for k in keys['accuracies']: + if k not in self.info: + continue + + self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) + + for k in keys['precisions']: + if k not in self.info: + continue + + self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) + + for k in keys['losses']: + if k not in self.info: + continue + + prefix = "" + + if "mode" in self.info and self.info["mode"] == "validation": + prefix = f'{self.info["name"] if "name" in self.info else "val"}_' + + self.statistics['loss'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': f'{prefix}{k}' }) + + self.losses.append( self.statistics['loss'][-1] ) + + for k in keys['grad_norms']: + if k not in self.info: + continue + self.statistics['grad_norm'].append({'epoch': epoch, 'it': self.it, 'value': self.info[k], 'type': k}) + + return data + + def get_status(self): + message = None + + self.metrics['rate'] = [] + if self.epoch_rate: + self.metrics['rate'].append(self.epoch_rate) + if self.it_rate and self.epoch_rate[:-7] != self.it_rate[:-4]: + self.metrics['rate'].append(self.it_rate) + self.metrics['rate'] = ", ".join(self.metrics['rate']) + + eta_hhmmss = self.eta_hhmmss if self.eta_hhmmss else "?" 
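+		# falls back to "?" until parse_metrics() has produced an ETA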
+ + self.metrics['loss'] = [] + if 'lr' in self.info: + self.metrics['loss'].append(f'LR: {"{:.3e}".format(self.info["lr"])}') + + if len(self.losses) > 0: + self.metrics['loss'].append(f'Loss: {"{:.3f}".format(self.losses[-1]["value"])}') + + if False and len(self.losses) >= 2: + deriv = 0 + accum_length = len(self.losses)//2 # i *guess* this is fine when you think about it + loss_value = self.losses[-1]["value"] + + for i in range(accum_length): + d1_loss = self.losses[accum_length-i-1]["value"] + d2_loss = self.losses[accum_length-i-2]["value"] + dloss = (d2_loss - d1_loss) + + d1_step = self.losses[accum_length-i-1]["it"] + d2_step = self.losses[accum_length-i-2]["it"] + dstep = (d2_step - d1_step) + + if dstep == 0: + continue + + inst_deriv = dloss / dstep + deriv += inst_deriv + + deriv = deriv / accum_length + + print("Deriv: ", deriv) + + if deriv != 0: # dloss < 0: + next_milestone = None + for milestone in self.loss_milestones: + if loss_value > milestone: + next_milestone = milestone + break + + print(f"Loss value: {loss_value} | Next milestone: {next_milestone} | Distance: {loss_value - next_milestone}") + + if next_milestone: + # tfw can do simple calculus but not basic algebra in my head + est_its = (next_milestone - loss_value) / deriv * 100 + print(f"Estimated: {est_its}") + if est_its >= 0: + self.metrics['loss'].append(f'Est. milestone {next_milestone} in: {int(est_its)}its') + else: + est_loss = inst_deriv * (self.its - self.it) + loss_value + if est_loss >= 0: + self.metrics['loss'].append(f'Est. final loss: {"{:.3f}".format(est_loss)}') + + self.metrics['loss'] = ", ".join(self.metrics['loss']) + + message = f"[{self.metrics['step']}] [{self.metrics['rate']}] [ETA: {eta_hhmmss}] [{self.metrics['loss']}]" + if self.nan_detected: + message = f"[!NaN DETECTED! 
{self.nan_detected}] {message}" + + return message + + def load_statistics(self, update=False): + if not os.path.isdir(self.training_dir): + return + + if args.tts_backend == "tortoise": + logs = sorted([f'{self.training_dir}/finetune/{d}' for d in os.listdir(f'{self.training_dir}/finetune/') if d[-4:] == ".log" ]) + else: + log_dir = "logs" + logs = sorted([f'{self.training_dir}/{log_dir}/{d}/log.txt' for d in os.listdir(f'{self.training_dir}/{log_dir}/') ]) + + if update: + logs = [logs[-1]] + + infos = {} + highest_step = self.last_info_check_at + + if not update: + self.statistics['loss'] = [] + self.statistics['lr'] = [] + self.statistics['grad_norm'] = [] + self.it_rates = 0 + + unq = {} + averager = None + prev_state = 0 + + for log in logs: + with open(log, 'r', encoding="utf-8") as f: + lines = f.readlines() + + for line in lines: + line = line.strip() + if not line: + continue + + if line[-1] == ".": + line = line[:-1] + + if line.find('Training Metrics:') >= 0: + split = line.split("Training Metrics:")[-1] + data = json.loads(split) + + name = "train" + mode = "training" + prev_state = 0 + elif line.find('Validation Metrics:') >= 0: + data = json.loads(line.split("Validation Metrics:")[-1]) + if "it" not in data: + data['it'] = it + if "epoch" not in data: + data['epoch'] = epoch + + # name = data['name'] if 'name' in data else "val" + mode = "validation" + + if prev_state == 0: + name = "subtrain" + else: + name = "val" + + prev_state += 1 + else: + continue + + if "it" not in data: + continue + + it = data['it'] + epoch = data['epoch'] + + if args.tts_backend == "vall-e": + if not averager or averager['key'] != f'{it}_{name}' or averager['mode'] != mode: + averager = { + 'key': f'{it}_{name}', + 'name': name, + 'mode': mode, + "metrics": {} + } + for k in data: + if data[k] is None: + continue + averager['metrics'][k] = [ data[k] ] + else: + for k in data: + if data[k] is None: + continue + if k not in averager['metrics']: + averager['metrics'][k] = [ data[k] ] + else: + averager['metrics'][k].append( data[k] ) + + unq[f'{it}_{mode}_{name}'] = averager + else: + unq[f'{it}_{mode}_{name}'] = data + + if update and it <= self.last_info_check_at: + continue + + blacklist = [ "batch", "eval" ] + for it in unq: + if args.tts_backend == "vall-e": + stats = unq[it] + data = {k: sum(v) / len(v) for k, v in stats['metrics'].items() if k not in blacklist } + #data = {k: min(v) for k, v in stats['metrics'].items() if k not in blacklist } + #data = {k: max(v) for k, v in stats['metrics'].items() if k not in blacklist } + data['name'] = stats['name'] + data['mode'] = stats['mode'] + data['steps'] = len(stats['metrics']['it']) + else: + data = unq[it] + self.parse_metrics(data) + + self.last_info_check_at = highest_step + + def cleanup_old(self, keep=2): + if keep <= 0: + return + + if args.tts_backend == "vall-e": + return + + if not os.path.isdir(f'{self.training_dir}/finetune/'): + return + + models = sorted([ int(d[:-8]) for d in os.listdir(f'{self.training_dir}/finetune/models/') if d[-8:] == "_gpt.pth" ]) + states = sorted([ int(d[:-6]) for d in os.listdir(f'{self.training_dir}/finetune/training_state/') if d[-6:] == ".state" ]) + remove_models = models[:-keep] + remove_states = states[:-keep] + + for d in remove_models: + path = f'{self.training_dir}/finetune/models/{d}_gpt.pth' + print("Removing", path) + os.remove(path) + for d in remove_states: + path = f'{self.training_dir}/finetune/training_state/{d}.state' + print("Removing", path) + os.remove(path) + + def parse(self, line, 
verbose=False, keep_x_past_checkpoints=0, buffer_size=8, progress=None ): + self.buffer.append(f'{line}') + + data = None + percent = 0 + message = None + should_return = False + + MESSAGE_START = 'Start training from epoch' + MESSAGE_FINSIHED = 'Finished training' + MESSAGE_SAVING = 'Saving models and training states.' + + MESSAGE_METRICS_TRAINING = 'Training Metrics:' + MESSAGE_METRICS_VALIDATION = 'Validation Metrics:' + + if line.find(MESSAGE_FINSIHED) >= 0: + self.killed = True + # rip out iteration info + elif not self.training_started: + if line.find(MESSAGE_START) >= 0: + self.training_started = True # could just leverage the above variable, but this is python, and there's no point in these aggressive microoptimizations + + match = re.findall(r'epoch: ([\d,]+)', line) + if match and len(match) > 0: + self.epoch = int(match[0].replace(",", "")) + match = re.findall(r'iter: ([\d,]+)', line) + if match and len(match) > 0: + self.it = int(match[0].replace(",", "")) + + self.checkpoints = int((self.its - self.it) / self.save_rate) + + self.load_statistics() + + should_return = True + else: + if line.find(MESSAGE_SAVING) >= 0: + self.checkpoint += 1 + message = f"[{self.checkpoint}/{self.checkpoints}] Saving checkpoint..." + percent = self.checkpoint / self.checkpoints + + self.cleanup_old(keep=keep_x_past_checkpoints) + elif line.find(MESSAGE_METRICS_TRAINING) >= 0: + data = json.loads(line.split(MESSAGE_METRICS_TRAINING)[-1]) + data['mode'] = "training" + elif line.find(MESSAGE_METRICS_VALIDATION) >= 0: + data = json.loads(line.split(MESSAGE_METRICS_VALIDATION)[-1]) + data['mode'] = "validation" + + if data is not None: + if ': nan' in line and not self.nan_detected: + self.nan_detected = self.it + + self.parse_metrics( data ) + message = self.get_status() + + if message: + percent = self.it / float(self.its) # self.epoch / float(self.epochs) + if progress is not None: + progress(percent, message) + + self.buffer.append(f'[{"{:.3f}".format(percent*100)}%] {message}') + should_return = True + + if verbose and not self.training_started: + should_return = True + + self.buffer = self.buffer[-buffer_size:] + + result = None + if should_return: + result = "".join(self.buffer) if not self.training_started else message + + return ( + result, + percent, + message, + ) + +try: + import altair as alt + alt.data_transformers.enable('default', max_rows=None) +except Exception as e: + print(e) + pass + +def run_training(config_path, verbose=False, keep_x_past_checkpoints=0, progress=gr.Progress(track_tqdm=True)): + global training_state + if training_state and training_state.process: + return "Training already in progress" + + + # ensure we have the dvae.pth + if args.tts_backend == "tortoise": + get_model_path('dvae.pth') + + # I don't know if this is still necessary, as it was bitching at me for not doing this, despite it being in a separate process + torch.multiprocessing.freeze_support() + + unload_tts() + unload_whisper() + unload_voicefixer() + + training_state = TrainingState(config_path=config_path, keep_x_past_checkpoints=keep_x_past_checkpoints) + + for line in iter(training_state.process.stdout.readline, ""): + if training_state is None or training_state.killed: + return + + result, percent, message = training_state.parse( line=line, verbose=verbose, keep_x_past_checkpoints=keep_x_past_checkpoints, progress=progress ) + print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}") + if result: + yield result + + if progress is not None and message: + progress(percent, message) + + if 
training_state: + training_state.process.stdout.close() + return_code = training_state.process.wait() + training_state = None + +def update_training_dataplot(x_min=None, x_max=None, y_min=None, y_max=None, config_path=None): + global training_state + losses = None + lrs = None + grad_norms = None + + x_lim = [ x_min, x_max ] + y_lim = [ y_min, y_max ] + + if not training_state: + if config_path: + training_state = TrainingState(config_path=config_path, start=False) + training_state.load_statistics() + message = training_state.get_status() + + if training_state: + if not x_lim[-1]: + x_lim[-1] = training_state.epochs + + if not y_lim[-1]: + y_lim = None + + if len(training_state.statistics['loss']) > 0: + losses = gr.LinePlot.update( + value = pd.DataFrame(training_state.statistics['loss']), + x_lim=x_lim, y_lim=y_lim, + x="epoch", y="value", # x="it", + title="Loss Metrics", color="type", tooltip=['epoch', 'it', 'value', 'type'], + width=500, height=350 + ) + if len(training_state.statistics['lr']) > 0: + lrs = gr.LinePlot.update( + value = pd.DataFrame(training_state.statistics['lr']), + x_lim=x_lim, + x="epoch", y="value", # x="it", + title="Learning Rate", color="type", tooltip=['epoch', 'it', 'value', 'type'], + width=500, height=350 + ) + if len(training_state.statistics['grad_norm']) > 0: + grad_norms = gr.LinePlot.update( + value = pd.DataFrame(training_state.statistics['grad_norm']), + x_lim=x_lim, + x="epoch", y="value", # x="it", + title="Gradient Normals", color="type", tooltip=['epoch', 'it', 'value', 'type'], + width=500, height=350 + ) + + if config_path: + del training_state + training_state = None + + return (losses, lrs, grad_norms) + +def reconnect_training(verbose=False, progress=gr.Progress(track_tqdm=True)): + global training_state + if not training_state or not training_state.process: + return "Training not in progress" + + for line in iter(training_state.process.stdout.readline, ""): + result, percent, message = training_state.parse( line=line, verbose=verbose, progress=progress ) + print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}") + if result: + yield result + + if progress is not None and message: + progress(percent, message) + +def stop_training(): + global training_state + if training_state is None: + return "No training in progress" + print("Killing training process...") + training_state.killed = True + + children = [] + if args.tts_backend == "tortoise": + # wrapped in a try/catch in case for some reason this fails outside of Linux + try: + children = [p.info for p in psutil.process_iter(attrs=['pid', 'name', 'cmdline']) if './src/train.py' in p.info['cmdline']] + except Exception as e: + pass + + training_state.process.stdout.close() + training_state.process.terminate() + training_state.process.kill() + elif args.tts_backend == "vall-e": + print(training_state.process.communicate(input='quit')[0]) + + return_code = training_state.process.wait() + + for p in children: + os.kill( p['pid'], signal.SIGKILL ) + + training_state = None + print("Killed training process.") + return f"Training cancelled: {return_code}" + +def get_halfp_model_path(): + autoregressive_model_path = get_model_path('autoregressive.pth') + return autoregressive_model_path.replace(".pth", "_half.pth") + +def convert_to_halfp(): + autoregressive_model_path = get_model_path('autoregressive.pth') + print(f'Converting model to half precision: {autoregressive_model_path}') + model = torch.load(autoregressive_model_path) + for k in model: + model[k] = model[k].half() + + outfile = 
get_halfp_model_path() + torch.save(model, outfile) + print(f'Converted model to half precision: {outfile}') + + +# collapses short segments into the previous segment +def whisper_sanitize( results ): + sanitized = json.loads(json.dumps(results)) + sanitized['segments'] = [] + + for segment in results['segments']: + length = segment['end'] - segment['start'] + if length >= MIN_TRAINING_DURATION or len(sanitized['segments']) == 0: + sanitized['segments'].append(segment) + continue + + last_segment = sanitized['segments'][-1] + # segment already asimilitated it, somehow + if last_segment['end'] >= segment['end']: + continue + """ + # segment already asimilitated it, somehow + if last_segment['text'].endswith(segment['text']): + continue + """ + last_segment['text'] += segment['text'] + last_segment['end'] = segment['end'] + + for i in range(len(sanitized['segments'])): + sanitized['segments'][i]['id'] = i + + return sanitized + +def whisper_transcribe( file, language=None ): + # shouldn't happen, but it's for safety + global whisper_model + global whisper_align_model + + if not whisper_model: + load_whisper_model(language=language) + + if args.whisper_backend == "openai/whisper": + if not language: + language = None + + return whisper_model.transcribe(file, language=language) + + if args.whisper_backend == "lightmare/whispercpp": + res = whisper_model.transcribe(file) + segments = whisper_model.extract_text_and_timestamps( res ) + + result = { + 'text': [], + 'segments': [] + } + for segment in segments: + reparsed = { + 'start': segment[0] / 100.0, + 'end': segment[1] / 100.0, + 'text': segment[2], + 'id': len(result['segments']) + } + result['text'].append( segment[2] ) + result['segments'].append(reparsed) + + result['text'] = " ".join(result['text']) + return result + + if args.whisper_backend == "m-bain/whisperx": + import whisperx + + device = "cuda" if get_device_name() == "cuda" else "cpu" + result = whisper_model.transcribe(file, batch_size=args.whisper_batchsize) + + align_model, metadata = whisper_align_model + result_aligned = whisperx.align(result["segments"], align_model, metadata, file, device, return_char_alignments=False) + + result['segments'] = result_aligned['segments'] + result['text'] = [] + for segment in result['segments']: + segment['id'] = len(result['text']) + result['text'].append(segment['text'].strip()) + result['text'] = " ".join(result['text']) + + return result + +def validate_waveform( waveform, sample_rate, min_only=False ): + if not torch.any(waveform < 0): + return "Waveform is empty" + + num_channels, num_frames = waveform.shape + duration = num_frames / sample_rate + + if duration < MIN_TRAINING_DURATION: + return "Duration too short ({:.3f}s < {:.3f}s)".format(duration, MIN_TRAINING_DURATION) + + if not min_only: + if duration > MAX_TRAINING_DURATION: + return "Duration too long ({:.3f}s < {:.3f}s)".format(MAX_TRAINING_DURATION, duration) + + return + +def transcribe_dataset( voice, language=None, skip_existings=False, progress=None ): + unload_tts() + + global whisper_model + if whisper_model is None: + load_whisper_model(language=language) + + results = {} + + files = get_voice(voice, load_latents=False) + indir = f'./training/{voice}/' + infile = f'{indir}/whisper.json' + + quantize_in_memory = args.tts_backend == "vall-e" + + os.makedirs(f'{indir}/audio/', exist_ok=True) + + TARGET_SAMPLE_RATE = 22050 + if args.tts_backend != "tortoise": + TARGET_SAMPLE_RATE = 24000 + if tts: + TARGET_SAMPLE_RATE = tts.input_sample_rate + + if 
os.path.exists(infile): + results = json.load(open(infile, 'r', encoding="utf-8")) + + for file in tqdm(files, desc="Iterating through voice files"): + basename = os.path.basename(file) + + if basename in results and skip_existings: + print(f"Skipping already parsed file: {basename}") + continue + + try: + result = whisper_transcribe(file, language=language) + except Exception as e: + print("Failed to transcribe:", file, e) + continue + + results[basename] = result + + if not quantize_in_memory: + waveform, sample_rate = torchaudio.load(file) + # resample to the input rate, since it'll get resampled for training anyways + # this should also "help" increase throughput a bit when filling the dataloaders + waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE) + if waveform.shape[0] == 2: + waveform = waveform[:1] + + try: + kwargs = {} + if basename[-4:] == ".wav": + kwargs['encoding'] = "PCM_S" + kwargs['bits_per_sample'] = 16 + + torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, **kwargs) + except Exception as e: + print(e) + + with open(infile, 'w', encoding="utf-8") as f: + f.write(json.dumps(results, indent='\t')) + + do_gc() + + modified = False + for basename in results: + try: + sanitized = whisper_sanitize(results[basename]) + if len(sanitized['segments']) > 0 and len(sanitized['segments']) != len(results[basename]['segments']): + results[basename] = sanitized + modified = True + print("Segments sanizited: ", basename) + except Exception as e: + print("Failed to sanitize:", basename, e) + pass + + if modified: + os.rename(infile, infile.replace(".json", ".unsanitized.json")) + with open(infile, 'w', encoding="utf-8") as f: + f.write(json.dumps(results, indent='\t')) + + return f"Processed dataset to: {indir}" + +def slice_waveform( waveform, sample_rate, start, end, trim ): + start = int(start * sample_rate) + end = int(end * sample_rate) + + if start < 0: + start = 0 + if end >= waveform.shape[-1]: + end = waveform.shape[-1] - 1 + + sliced = waveform[:, start:end] + + error = validate_waveform( sliced, sample_rate, min_only=True ) + if trim and not error: + sliced = torchaudio.functional.vad( sliced, sample_rate ) + + return sliced, error + +def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, results=None, progress=gr.Progress() ): + indir = f'./training/{voice}/' + infile = f'{indir}/whisper.json' + messages = [] + + if not os.path.exists(infile): + message = f"Missing dataset: {infile}" + print(message) + return message + + if results is None: + results = json.load(open(infile, 'r', encoding="utf-8")) + + TARGET_SAMPLE_RATE = 22050 + if args.tts_backend != "tortoise": + TARGET_SAMPLE_RATE = 24000 + if tts: + TARGET_SAMPLE_RATE = tts.input_sample_rate + + files = 0 + segments = 0 + for filename in results: + path = f'./voices/{voice}/{filename}' + extension = os.path.splitext(filename)[-1][1:] + out_extension = extension # "wav" + + if not os.path.exists(path): + path = f'./training/{voice}/{filename}' + + if not os.path.exists(path): + message = f"Missing source audio: {filename}" + print(message) + messages.append(message) + continue + + files += 1 + result = results[filename] + waveform, sample_rate = torchaudio.load(path) + num_channels, num_frames = waveform.shape + duration = num_frames / sample_rate + + for segment in result['segments']: + file = filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}") + + sliced, error = slice_waveform( waveform, sample_rate, segment['start'] + 
start_offset, segment['end'] + end_offset, trim_silence ) + if error: + message = f"{error}, skipping... {file}" + print(message) + messages.append(message) + continue + + sliced, _ = resample( sliced, sample_rate, TARGET_SAMPLE_RATE ) + + if waveform.shape[0] == 2: + waveform = waveform[:1] + + kwargs = {} + if file[-4:] == ".wav": + kwargs['encoding'] = "PCM_S" + kwargs['bits_per_sample'] = 16 + + torchaudio.save(f"{indir}/audio/{file}", sliced, TARGET_SAMPLE_RATE, **kwargs) + + segments +=1 + + messages.append(f"Sliced segments: {files} => {segments}.") + return "\n".join(messages) + +# takes an LJSpeech-dataset-formatted .txt file and phonemize it +def phonemize_txt_file( path ): + with open(path, 'r', encoding='utf-8') as f: + lines = f.readlines() + + reparsed = [] + with open(path.replace(".txt", ".phn.txt"), 'a', encoding='utf-8') as f: + for line in tqdm(lines, desc='Phonemizing...'): + split = line.split("|") + audio = split[0] + text = split[2] + + phonemes = phonemizer( text ) + reparsed.append(f'{audio}|{phonemes}') + f.write(f'\n{audio}|{phonemes}') + + + joined = "\n".join(reparsed) + with open(path.replace(".txt", ".phn.txt"), 'w', encoding='utf-8') as f: + f.write(joined) + + return joined + +# takes an LJSpeech-dataset-formatted .txt (and phonemized .phn.txt from the above) and creates a JSON that should slot in as whisper.json +def create_dataset_json( path ): + with open(path, 'r', encoding='utf-8') as f: + lines = f.readlines() + + phonemes = None + phn_path = path.replace(".txt", ".phn.txt") + if os.path.exists(phn_path): + with open(phn_path, 'r', encoding='utf-8') as f: + phonemes = f.readlines() + + data = {} + + for line in lines: + split = line.split("|") + audio = split[0] + text = split[1] + + data[audio] = { + 'text': text.strip() + } + + for line in phonemes: + split = line.split("|") + audio = split[0] + text = split[1] + + data[audio]['phonemes'] = text.strip() + + with open(path.replace(".txt", ".json"), 'w', encoding='utf-8') as f: + f.write(json.dumps(data, indent="\t")) + + +cached_backends = {} + +def phonemizer( text, language="en-us" ): + from phonemizer import phonemize + from phonemizer.backend import BACKENDS + + def _get_backend( language="en-us", backend="espeak" ): + key = f'{language}_{backend}' + if key in cached_backends: + return cached_backends[key] + + if backend == 'espeak': + phonemizer = BACKENDS[backend]( language, preserve_punctuation=True, with_stress=True) + elif backend == 'espeak-mbrola': + phonemizer = BACKENDS[backend]( language ) + else: + phonemizer = BACKENDS[backend]( language, preserve_punctuation=True ) + + cached_backends[key] = phonemizer + return phonemizer + if language == "en": + language = "en-us" + + backend = _get_backend(language=language, backend=args.phonemizer_backend) + if backend is not None: + tokens = backend.phonemize( [text], strip=True ) + else: + tokens = phonemize( [text], language=language, strip=True, preserve_punctuation=True, with_stress=True ) + + return tokens[0] if len(tokens) == 0 else tokens + tokenized = " ".join( tokens ) + +def should_phonemize(): + if args.tts_backend == "vall-e": + return False + + should = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json" + if should: + try: + from phonemizer import phonemize + except Exception as e: + return False + return should + +def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ): + indir = f'./training/{voice}/' + infile = f'{indir}/whisper.json' + if not 
os.path.exists(infile): + message = f"Missing dataset: {infile}" + print(message) + return message + + results = json.load(open(infile, 'r', encoding="utf-8")) + + errored = 0 + messages = [] + normalize = False # True + phonemize = should_phonemize() + lines = { 'training': [], 'validation': [] } + segments = {} + + quantize_in_memory = args.tts_backend == "vall-e" + + if args.tts_backend != "tortoise": + text_length = 0 + audio_length = 0 + + start_offset = -0.1 + end_offset = 0.1 + trim_silence = False + + TARGET_SAMPLE_RATE = 22050 + if args.tts_backend != "tortoise": + TARGET_SAMPLE_RATE = 24000 + if tts: + TARGET_SAMPLE_RATE = tts.input_sample_rate + + for filename in tqdm(results, desc="Parsing results"): + use_segment = use_segments + + extension = os.path.splitext(filename)[-1][1:] + out_extension = extension # "wav" + result = results[filename] + lang = result['language'] + language = LANGUAGES[lang] if lang in LANGUAGES else lang + normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer() + + # check if unsegmented text exceeds 200 characters + if not use_segment: + if len(result['text']) > MAX_TRAINING_CHAR_LENGTH: + message = f"Text length too long ({MAX_TRAINING_CHAR_LENGTH} < {len(result['text'])}), using segments: {filename}" + print(message) + messages.append(message) + use_segment = True + + # check if unsegmented audio exceeds 11.6s + if not use_segment: + path = f'{indir}/audio/{filename}' + if not quantize_in_memory and not os.path.exists(path): + messages.append(f"Missing source audio: {filename}") + errored += 1 + continue + + duration = 0 + for segment in result['segments']: + duration = max(duration, segment['end']) + + if duration >= MAX_TRAINING_DURATION: + message = f"Audio too large, using segments: {filename}" + print(message) + messages.append(message) + use_segment = True + + # implicitly segment + if use_segment and not use_segments: + exists = True + for segment in result['segments']: + duration = segment['end'] - segment['start'] + if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration: + continue + + path = f'{indir}/audio/' + filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}") + if os.path.exists(path): + continue + exists = False + break + + if not quantize_in_memory and not exists: + tmp = {} + tmp[filename] = result + print(f"Audio not segmented, segmenting: {filename}") + message = slice_dataset( voice, results=tmp ) + print(message) + messages = messages + message.split("\n") + + waveform = None + + + if quantize_in_memory: + path = f'{indir}/audio/{filename}' + if not os.path.exists(path): + path = f'./voices/{voice}/{filename}' + + if not os.path.exists(path): + message = f"Audio not found: {path}" + print(message) + messages.append(message) + #continue + else: + waveform = torchaudio.load(path) + waveform = resample(waveform[0], waveform[1], TARGET_SAMPLE_RATE) + + if not use_segment: + segments[filename] = { + 'text': result['text'], + 'lang': lang, + 'language': language, + 'normalizer': normalizer, + 'phonemes': result['phonemes'] if 'phonemes' in result else None + } + + if waveform: + segments[filename]['waveform'] = waveform + else: + for segment in result['segments']: + duration = segment['end'] - segment['start'] + if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration: + continue + + file = filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}") + + segments[file] = { + 'text': segment['text'], + 'lang': 
lang, + 'language': language, + 'normalizer': normalizer, + 'phonemes': segment['phonemes'] if 'phonemes' in segment else None + } + + if waveform: + sliced, error = slice_waveform( waveform[0], waveform[1], segment['start'] + start_offset, segment['end'] + end_offset, trim_silence ) + if error: + message = f"{error}, skipping... {file}" + print(message) + messages.append(message) + segments[file]['error'] = error + #continue + else: + segments[file]['waveform'] = (sliced, waveform[1]) + + jobs = { + 'quantize': [[], []], + 'phonemize': [[], []], + } + + for file in tqdm(segments, desc="Parsing segments"): + extension = os.path.splitext(file)[-1][1:] + result = segments[file] + path = f'{indir}/audio/{file}' + + text = result['text'] + lang = result['lang'] + language = result['language'] + normalizer = result['normalizer'] + phonemes = result['phonemes'] + if phonemize and phonemes is None: + phonemes = phonemizer( text, language=lang ) + + normalized = normalizer(text) if normalize else text + + if len(text) > MAX_TRAINING_CHAR_LENGTH: + message = f"Text length too long ({MAX_TRAINING_CHAR_LENGTH} < {len(text)}), skipping... {file}" + print(message) + messages.append(message) + errored += 1 + continue + + # num_channels, num_frames = waveform.shape + #duration = num_frames / sample_rate + + + culled = len(text) < text_length + #if not culled and audio_length > 0: + # culled = duration < audio_length + + line = f'audio/{file}|{phonemes if phonemize and phonemes else text}' + + lines['training' if not culled else 'validation'].append(line) + + if culled or args.tts_backend != "vall-e": + continue + + os.makedirs(f'{indir}/valle/', exist_ok=True) + #os.makedirs(f'./training/valle/data/{voice}/', exist_ok=True) + + phn_file = f'{indir}/valle/{file.replace(f".{extension}",".phn.txt")}' + #phn_file = f'./training/valle/data/{voice}/{file.replace(f".{extension}",".phn.txt")}' + if not os.path.exists(phn_file): + jobs['phonemize'][0].append(phn_file) + jobs['phonemize'][1].append(normalized) + """ + phonemized = valle_phonemize( normalized ) + open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemized)) + print("Phonemized:", file, normalized, text) + """ + + qnt_file = f'{indir}/valle/{file.replace(f".{extension}",".qnt.pt")}' + #qnt_file = f'./training/valle/data/{voice}/{file.replace(f".{extension}",".qnt.pt")}' + if 'error' not in result: + if not quantize_in_memory and not os.path.exists(path): + message = f"Missing segment, skipping... {file}" + print(message) + messages.append(message) + errored += 1 + continue + + if not os.path.exists(qnt_file): + waveform = None + if 'waveform' in result: + waveform, sample_rate = result['waveform'] + elif os.path.exists(path): + waveform, sample_rate = torchaudio.load(path) + error = validate_waveform( waveform, sample_rate ) + if error: + message = f"{error}, skipping... 
{file}" + print(message) + messages.append(message) + errored += 1 + continue + + if waveform is not None: + jobs['quantize'][0].append(qnt_file) + jobs['quantize'][1].append((waveform, sample_rate)) + """ + quantized = valle_quantize( waveform, sample_rate ).cpu() + torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}') + print("Quantized:", file) + """ + + for i in tqdm(range(len(jobs['quantize'][0])), desc="Quantizing"): + qnt_file = jobs['quantize'][0][i] + waveform, sample_rate = jobs['quantize'][1][i] + + quantized = valle_quantize( waveform, sample_rate ).cpu() + torch.save(quantized, qnt_file) + #print("Quantized:", qnt_file) + + for i in tqdm(range(len(jobs['phonemize'][0])), desc="Phonemizing"): + phn_file = jobs['phonemize'][0][i] + normalized = jobs['phonemize'][1][i] + + try: + phonemized = valle_phonemize( normalized ) + open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized)) + #print("Phonemized:", phn_file) + except Exception as e: + message = f"Failed to phonemize: {phn_file}: {normalized}" + messages.append(message) + print(message) + + + training_joined = "\n".join(lines['training']) + validation_joined = "\n".join(lines['validation']) + + with open(f'{indir}/train.txt', 'w', encoding="utf-8") as f: + f.write(training_joined) + + with open(f'{indir}/validation.txt', 'w', encoding="utf-8") as f: + f.write(validation_joined) + + messages.append(f"Prepared {len(lines['training'])} lines (validation: {len(lines['validation'])}, culled: {errored}).\n{training_joined}\n\n{validation_joined}") + return "\n".join(messages) + +def calc_iterations( epochs, lines, batch_size ): + return int(math.ceil(epochs * math.ceil(lines / batch_size))) + +def schedule_learning_rate( iterations, schedule=LEARNING_RATE_SCHEDULE ): + return [int(iterations * d) for d in schedule] + +def optimize_training_settings( **kwargs ): + messages = [] + settings = {} + settings.update(kwargs) + + dataset_path = f"./training/{settings['voice']}/train.txt" + with open(dataset_path, 'r', encoding="utf-8") as f: + lines = len(f.readlines()) + + if lines == 0: + raise Exception("Empty dataset.") + + if settings['batch_size'] > lines: + settings['batch_size'] = lines + messages.append(f"Batch size is larger than your dataset, clamping batch size to: {settings['batch_size']}") + + """ + if lines % settings['batch_size'] != 0: + settings['batch_size'] = int(lines / settings['batch_size']) + if settings['batch_size'] == 0: + settings['batch_size'] = 1 + messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {settings['batch_size']}") + """ + if settings['gradient_accumulation_size'] == 0: + settings['gradient_accumulation_size'] = 1 + + if settings['batch_size'] / settings['gradient_accumulation_size'] < 2: + settings['gradient_accumulation_size'] = int(settings['batch_size'] / 2) + if settings['gradient_accumulation_size'] == 0: + settings['gradient_accumulation_size'] = 1 + + messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}") + elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0: + settings['gradient_accumulation_size'] -= settings['batch_size'] % settings['gradient_accumulation_size'] + if settings['gradient_accumulation_size'] == 0: + settings['gradient_accumulation_size'] = 1 + + messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: 
{settings['gradient_accumulation_size']}") + + if settings['batch_size'] % settings['gpus'] != 0: + settings['batch_size'] -= settings['batch_size'] % settings['gpus'] + if settings['batch_size'] == 0: + settings['batch_size'] = 1 + messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}") + + + def get_device_batch_size( vram ): + DEVICE_BATCH_SIZE_MAP = [ + (70, 128), # based on an A100-80G, I can safely get a ratio of 4096:32 = 128 + (32, 64), # based on my two 6800XTs, I can only really safely get a ratio of 128:2 = 64 + (16, 8), # based on an A4000, I can do a ratio of 512:64 = 8:1 + (8, 4), # interpolated + (6, 2), # based on my 2060, it only really lets me have a batch ratio of 2:1 + ] + for k, v in DEVICE_BATCH_SIZE_MAP: + if vram > (k-1): + return v + return 1 + + if settings['gpus'] > get_device_count(): + settings['gpus'] = get_device_count() + messages.append(f"GPU count exceeds defacto GPU count, clamping to: {settings['gpus']}") + + if settings['gpus'] <= 1: + settings['gpus'] = 1 + else: + messages.append(f"! EXPERIMENTAL ! Multi-GPU training is extremely particular, expect issues.") + + # assuming you have equal GPUs + vram = get_device_vram() * settings['gpus'] + batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size']) + batch_cap = get_device_batch_size(vram) + + if batch_ratio > batch_cap: + settings['gradient_accumulation_size'] = int(settings['batch_size'] / batch_cap) + messages.append(f"Batch ratio ({batch_ratio}) is expected to exceed your VRAM capacity ({'{:.3f}'.format(vram)}GB, suggested {batch_cap} batch size cap), adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}") + + iterations = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) + + if settings['epochs'] < settings['save_rate']: + settings['save_rate'] = settings['epochs'] + messages.append(f"Save rate is too small for the given iteration step, clamping save rate to: {settings['save_rate']}") + + if settings['epochs'] < settings['validation_rate']: + settings['validation_rate'] = settings['epochs'] + messages.append(f"Validation rate is too small for the given iteration step, clamping validation rate to: {settings['validation_rate']}") + + if settings['resume_state'] and not os.path.exists(settings['resume_state']): + settings['resume_state'] = None + messages.append("Resume path specified, but does not exist. Disabling...") + + if settings['bitsandbytes']: + messages.append("! EXPERIMENTAL ! BitsAndBytes requested.") + + if settings['half_p']: + if settings['bitsandbytes']: + settings['half_p'] = False + messages.append("Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...") + else: + messages.append("! EXPERIMENTAL ! 
Half Precision requested.") + if not os.path.exists(get_halfp_model_path()): + convert_to_halfp() + + steps = int(iterations / settings['epochs']) + + messages.append(f"For {settings['epochs']} epochs with {lines} lines in batches of {settings['batch_size']}, iterating for {iterations} steps ({steps}) steps per epoch)") + + return settings, messages + +def save_training_settings( **kwargs ): + messages = [] + settings = {} + settings.update(kwargs) + + + outjson = f'./training/{settings["voice"]}/train.json' + with open(outjson, 'w', encoding="utf-8") as f: + f.write(json.dumps(settings, indent='\t') ) + + settings['dataset_path'] = f"./training/{settings['voice']}/train.txt" + settings['validation_path'] = f"./training/{settings['voice']}/validation.txt" + + with open(settings['dataset_path'], 'r', encoding="utf-8") as f: + lines = len(f.readlines()) + + settings['iterations'] = calc_iterations(epochs=settings['epochs'], lines=lines, batch_size=settings['batch_size']) + + if not settings['source_model'] or settings['source_model'] == "auto": + settings['source_model'] = f"./models/tortoise/autoregressive{'_half' if settings['half_p'] else ''}.pth" + + if settings['half_p']: + if not os.path.exists(get_halfp_model_path()): + convert_to_halfp() + + messages.append(f"For {settings['epochs']} epochs with {lines} lines, iterating for {settings['iterations']} steps") + + iterations_per_epoch = settings['iterations'] / settings['epochs'] + + settings['save_rate'] = int(settings['save_rate'] * iterations_per_epoch) + settings['validation_rate'] = int(settings['validation_rate'] * iterations_per_epoch) + + iterations_per_epoch = int(iterations_per_epoch) + + if settings['save_rate'] < 1: + settings['save_rate'] = 1 + """ + if settings['validation_rate'] < 1: + settings['validation_rate'] = 1 + """ + """ + if settings['iterations'] % settings['save_rate'] != 0: + adjustment = int(settings['iterations'] / settings['save_rate']) * settings['save_rate'] + messages.append(f"Iteration rate is not evenly divisible by save rate, adjusting: {settings['iterations']} => {adjustment}") + settings['iterations'] = adjustment + """ + + settings['validation_batch_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size']) + if not os.path.exists(settings['validation_path']): + settings['validation_enabled'] = False + messages.append("Validation not found, disabling validation...") + elif settings['validation_batch_size'] == 0: + settings['validation_enabled'] = False + messages.append("Validation batch size == 0, disabling validation...") + else: + with open(settings['validation_path'], 'r', encoding="utf-8") as f: + validation_lines = len(f.readlines()) + + if validation_lines < settings['validation_batch_size']: + settings['validation_batch_size'] = validation_lines + messages.append(f"Batch size exceeds validation dataset size, clamping validation batch size to {validation_lines}") + + settings['tokenizer_json'] = args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0] + + if settings['gpus'] > get_device_count(): + settings['gpus'] = get_device_count() + + # what an utter mistake this was + settings['optimizer'] = 'adamw' # if settings['gpus'] == 1 else 'adamw_zero' + + if 'learning_rate_scheme' not in settings or settings['learning_rate_scheme'] not in LEARNING_RATE_SCHEMES: + settings['learning_rate_scheme'] = "Multistep" + + settings['learning_rate_scheme'] = LEARNING_RATE_SCHEMES[settings['learning_rate_scheme']] + + learning_rate_schema = [f"default_lr_scheme: 
{settings['learning_rate_scheme']}"] + if settings['learning_rate_scheme'] == "MultiStepLR": + if not settings['learning_rate_schedule']: + settings['learning_rate_schedule'] = LEARNING_RATE_SCHEDULE + elif isinstance(settings['learning_rate_schedule'],str): + settings['learning_rate_schedule'] = json.loads(settings['learning_rate_schedule']) + + settings['learning_rate_schedule'] = schedule_learning_rate( iterations_per_epoch, settings['learning_rate_schedule'] ) + + learning_rate_schema.append(f" gen_lr_steps: {settings['learning_rate_schedule']}") + learning_rate_schema.append(f" lr_gamma: 0.5") + elif settings['learning_rate_scheme'] == "CosineAnnealingLR_Restart": + epochs = settings['epochs'] + restarts = settings['learning_rate_restarts'] + restart_period = int(epochs / restarts) + + if 'learning_rate_warmup' not in settings: + settings['learning_rate_warmup'] = 0 + if 'learning_rate_min' not in settings: + settings['learning_rate_min'] = 1e-08 + + if 'learning_rate_period' not in settings: + settings['learning_rate_period'] = [ iterations_per_epoch * restart_period for x in range(epochs) ] + + settings['learning_rate_restarts'] = [ iterations_per_epoch * (x+1) * restart_period for x in range(restarts) ] # [52, 104, 156, 208] + + if 'learning_rate_restart_weights' not in settings: + settings['learning_rate_restart_weights'] = [ ( restarts - x - 1 ) / restarts for x in range(restarts) ] # [.75, .5, .25, .125] + settings['learning_rate_restart_weights'][-1] = settings['learning_rate_restart_weights'][-2] * 0.5 + + learning_rate_schema.append(f" T_period: {settings['learning_rate_period']}") + learning_rate_schema.append(f" warmup: {settings['learning_rate_warmup']}") + learning_rate_schema.append(f" eta_min: !!float {settings['learning_rate_min']}") + learning_rate_schema.append(f" restarts: {settings['learning_rate_restarts']}") + learning_rate_schema.append(f" restart_weights: {settings['learning_rate_restart_weights']}") + settings['learning_rate_scheme'] = "\n".join(learning_rate_schema) + + if settings['resume_state']: + settings['source_model'] = f"# pretrain_model_gpt: '{settings['source_model']}'" + settings['resume_state'] = f"resume_state: '{settings['resume_state']}'" + else: + settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'" + settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'" + + def use_template(template, out): + with open(template, 'r', encoding="utf-8") as f: + yaml = f.read() + + # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals + for k in settings: + if settings[k] is None: + continue + yaml = yaml.replace(f"${{{k}}}", str(settings[k])) + + with open(out, 'w', encoding="utf-8") as f: + f.write(yaml) + + if args.tts_backend == "tortoise": + use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml') + elif args.tts_backend == "vall-e": + settings['model_name'] = "[ 'ar-quarter', 'nar-quarter' ]" + use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/config.yaml') + + messages.append(f"Saved training output") + return settings, messages + +def import_voices(files, saveAs=None, progress=None): + global args + + if not isinstance(files, list): + files = [files] + + for file in tqdm(files, desc="Importing voice files"): + j, latents = read_generate_settings(file, read_latents=True) + + if j is not None and saveAs is None: + saveAs = j['voice'] + if saveAs is None or saveAs == "": + raise 
Exception("Specify a voice name") + + outdir = f'{get_voice_dir()}/{saveAs}/' + os.makedirs(outdir, exist_ok=True) + + if latents: + print(f"Importing latents to {latents}") + with open(f'{outdir}/cond_latents.pth', 'wb') as f: + f.write(latents) + latents = f'{outdir}/cond_latents.pth' + print(f"Imported latents to {latents}") + else: + filename = file.name + if filename[-4:] != ".wav": + raise Exception("Please convert to a WAV first") + + path = f"{outdir}/{os.path.basename(filename)}" + print(f"Importing voice to {path}") + + waveform, sample_rate = torchaudio.load(filename) + + if args.voice_fixer: + if not voicefixer: + load_voicefixer() + + waveform, sample_rate = resample(waveform, sample_rate, 44100) + torchaudio.save(path, waveform, sample_rate) + + print(f"Running 'voicefixer' on voice sample: {path}") + voicefixer.restore( + input = path, + output = path, + cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda, + #mode=mode, + ) + else: + torchaudio.save(path, waveform, sample_rate) + + print(f"Imported voice to {path}") + +def relative_paths( dirs ): + return [ './' + os.path.relpath( d ).replace("\\", "/") for d in dirs ] + +def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ): + subj = f'{dir}/{name}/' + if not os.path.isdir(subj): + return + files = os.listdir(subj) + + if load_latents: + extensions.append("pth") + + voice = [] + for file in files: + ext = os.path.splitext(file)[-1][1:] + if ext not in extensions: + continue + + voice.append(f'{subj}/{file}') + + return sorted( voice ) + +def get_voice_list(dir=get_voice_dir(), append_defaults=False, extensions=["wav", "mp3", "flac", "pth"]): + defaults = [ "random", "microphone" ] + os.makedirs(dir, exist_ok=True) + #res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ]) + + res = [] + for name in os.listdir(dir): + if name in defaults: + continue + if not os.path.isdir(f'{dir}/{name}'): + continue + if len(os.listdir(os.path.join(dir, name))) == 0: + continue + files = get_voice( name, dir=dir, extensions=extensions ) + + if len(files) > 0: + res.append(name) + else: + for subdir in os.listdir(f'{dir}/{name}'): + if not os.path.isdir(f'{dir}/{name}/{subdir}'): + continue + files = get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions ) + if len(files) == 0: + continue + res.append(f'{name}/{subdir}') + + res = sorted(res) + + if append_defaults: + res = res + defaults + + return res + +def get_valle_models(dir="./training/"): + return [ f'{dir}/{d}/config.yaml' for d in os.listdir(dir) if os.path.exists(f'{dir}/{d}/config.yaml') ] + +def get_autoregressive_models(dir="./models/finetunes/", prefixed=False, auto=False): + os.makedirs(dir, exist_ok=True) + base = [get_model_path('autoregressive.pth')] + halfp = get_halfp_model_path() + if os.path.exists(halfp): + base.append(halfp) + + additionals = sorted([f'{dir}/{d}' for d in os.listdir(dir) if d[-4:] == ".pth" ]) + found = [] + for training in os.listdir(f'./training/'): + if not os.path.isdir(f'./training/{training}/') or not os.path.isdir(f'./training/{training}/finetune/') or not os.path.isdir(f'./training/{training}/finetune/models/'): + continue + models = sorted([ int(d[:-8]) for d in os.listdir(f'./training/{training}/finetune/models/') if d[-8:] == "_gpt.pth" ]) + found = found + [ f'./training/{training}/finetune/models/{d}_gpt.pth' for d in models ] + + res = base + additionals + found + + if prefixed: + 
for i in range(len(res)): + path = res[i] + hash = hash_file(path) + shorthash = hash[:8] + + res[i] = f'[{shorthash}] {path}' + + paths = relative_paths(res) + if auto: + paths = ["auto"] + paths + + return paths + +def get_diffusion_models(dir="./models/finetunes/", prefixed=False): + return relative_paths([ get_model_path('diffusion_decoder.pth') ]) + +def get_tokenizer_jsons( dir="./models/tokenizers/" ): + additionals = sorted([ f'{dir}/{d}' for d in os.listdir(dir) if d[-5:] == ".json" ]) if os.path.isdir(dir) else [] + return relative_paths([ "./modules/tortoise-tts/tortoise/data/tokenizer.json" ] + additionals) + +def tokenize_text( text, config=None, stringed=True, skip_specials=False ): + from tortoise.utils.tokenizer import VoiceBpeTokenizer + + if not config: + config = args.tokenizer_json if args.tokenizer_json else get_tokenizer_jsons()[0] + + if not tts: + tokenizer = VoiceBpeTokenizer(config) + else: + tokenizer = tts.tokenizer + + encoded = tokenizer.encode(text) + decoded = tokenizer.tokenizer.decode(encoded, skip_special_tokens=skip_specials).split(" ") + + if stringed: + return "\n".join([ str(encoded), str(decoded) ]) + + return decoded + +def get_dataset_list(dir="./training/"): + return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ]) + +def get_training_list(dir="./training/"): + if args.tts_backend == "tortoise": + return sorted([f'./training/{d}/train.yaml' for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.yaml" in os.listdir(os.path.join(dir, d)) ]) + else: + return sorted([f'./training/{d}/config.yaml' for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "config.yaml" in os.listdir(os.path.join(dir, d)) ]) + +def pad(num, zeroes): + return str(num).zfill(zeroes+1) + +def curl(url): + try: + req = urllib.request.Request(url, headers={'User-Agent': 'Python'}) + conn = urllib.request.urlopen(req) + data = conn.read() + data = data.decode() + data = json.loads(data) + conn.close() + return data + except Exception as e: + print(e) + return None + +def check_for_updates( dir = None ): + if dir is None: + check_for_updates("./.git/") + check_for_updates("./.git/modules/dlas/") + check_for_updates("./.git/modules/tortoise-tts/") + return + + git_dir = dir + if not os.path.isfile(f'{git_dir}/FETCH_HEAD'): + print(f"Cannot check for updates for {dir}: not from a git repo") + return False + + with open(f'{git_dir}/FETCH_HEAD', 'r', encoding="utf-8") as f: + head = f.read() + + match = re.findall(r"^([a-f0-9]+).+?https:\/\/(.+?)\/(.+?)\/(.+?)\n", head) + if match is None or len(match) == 0: + print(f"Cannot check for updates for {dir}: cannot parse FETCH_HEAD") + return False + + match = match[0] + + local = match[0] + host = match[1] + owner = match[2] + repo = match[3] + + res = curl(f"https://{host}/api/v1/repos/{owner}/{repo}/branches/") #this only works for gitea instances + + if res is None or len(res) == 0: + print(f"Cannot check for updates for {dir}: cannot fetch from remote") + return False + + remote = res[0]["commit"]["id"] + + if remote != local: + print(f"New version found for {dir}: {local[:8]} => {remote[:8]}") + return True + + return False + +def notify_progress(message, progress=None, verbose=True): + if verbose: + print(message) + + if progress is None: + tqdm.write(message) + else: + progress(0, desc=message) + +def get_args(): + global args + return args + +def setup_args(cli=False): + global args + + default_arguments = { + 'share': 
False, + 'listen': None, + 'check-for-updates': False, + 'models-from-local-only': False, + 'low-vram': False, + 'sample-batch-size': None, + 'unsqueeze-sample-batches': False, + 'embed-output-metadata': True, + 'latents-lean-and-mean': True, + 'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it + 'use-deepspeed': True, + 'voice-fixer-use-cuda': True, + + + 'force-cpu-for-conditioning-latents': False, + 'defer-tts-load': False, + 'device-override': None, + 'prune-nonfinal-outputs': True, + 'concurrency-count': 2, + 'autocalculate-voice-chunk-duration-size': 10, + + 'output-sample-rate': 44100, + 'output-volume': 1, + 'results-folder': "./results/", + + 'hf-token': None, + 'tts-backend': TTSES[0], + + 'autoregressive-model': None, + 'diffusion-model': None, + 'vocoder-model': VOCODERS[-1], + 'tokenizer-json': None, + + 'phonemizer-backend': 'espeak', + + 'valle-model': None, + + 'whisper-backend': 'openai/whisper', + 'whisper-model': "base", + 'whisper-batchsize': 1, + + 'training-default-halfp': False, + 'training-default-bnb': True, + + 'websocket-listen-address': "127.0.0.1", + 'websocket-listen-port': 8069, + 'websocket-enabled': False + } + + if os.path.isfile('./config/exec.json'): + with open(f'./config/exec.json', 'r', encoding="utf-8") as f: + try: + overrides = json.load(f) + for k in overrides: + default_arguments[k] = overrides[k] + except Exception as e: + print(e) + pass + + parser = argparse.ArgumentParser(allow_abbrev=not cli) + parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere") + parser.add_argument("--listen", default=default_arguments['listen'], help="Path for Gradio to listen on") + parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup") + parser.add_argument("--models-from-local-only", action='store_true', default=default_arguments['models-from-local-only'], help="Only loads models from disk, does not check for updates for models") + parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage") + parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)") + parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.") + parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.") + parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.") + parser.add_argument("--use-deepspeed", action='store_true', default=default_arguments['use-deepspeed'], help="Use deepspeed for speed bump.") + parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)") + 
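+	# Each flag falls back to default_arguments, which may already have been overridden by
+	# ./config/exec.json above; values passed on the CLI take final precedence.
+	# A hypothetical ./config/exec.json override might look like:
+	#   { "listen": "0.0.0.0:7860", "low-vram": true, "whisper-model": "large-v2" }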
parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model") + parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation") + parser.add_argument("--device-override", default=default_arguments['device-override'], help="A device string to override pass through Torch") + parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many batches to use during the autoregressive samples pass") + parser.add_argument("--unsqueeze-sample-batches", default=default_arguments['unsqueeze-sample-batches'], action='store_true', help="Unsqueezes sample batches to process one by one after sampling") + parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once") + parser.add_argument("--autocalculate-voice-chunk-duration-size", type=float, default=default_arguments['autocalculate-voice-chunk-duration-size'], help="Number of seconds to suggest voice chunk size for (for example, 100 seconds of audio at 10 seconds per chunk will suggest 10 chunks)") + parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)") + parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output") + parser.add_argument("--results-folder", type=str, default=default_arguments['results-folder'], help="Sets output directory") + + parser.add_argument("--hf-token", type=str, default=default_arguments['hf-token'], help="HuggingFace Token") + parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.") + + parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.") + parser.add_argument("--diffusion-model", default=default_arguments['diffusion-model'], help="Specifies which diffusion model to use for sampling.") + parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use") + parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.") + + parser.add_argument("--phonemizer-backend", default=default_arguments['phonemizer-backend'], help="Specifies which phonemizer backend to use.") + + parser.add_argument("--valle-model", default=default_arguments['valle-model'], help="Specifies which VALL-E model to use for sampling.") + + parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)") + parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.") + parser.add_argument("--whisper-batchsize", type=int, default=default_arguments['whisper-batchsize'], help="Specifies batch size for WhisperX") + + parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp") + parser.add_argument("--training-default-bnb", 
action='store_true', default=default_arguments['training-default-bnb'], help="Training default: bnb") + + parser.add_argument("--websocket-listen-port", type=int, default=default_arguments['websocket-listen-port'], help="Websocket server listen port, default: 8069") + parser.add_argument("--websocket-listen-address", default=default_arguments['websocket-listen-address'], help="Websocket server listen address, default: 127.0.0.1") + parser.add_argument("--websocket-enabled", action='store_true', default=default_arguments['websocket-enabled'], help="Websocket API server enabled, default: false") + + if cli: + args, unknown = parser.parse_known_args() + else: + args = parser.parse_args() + + args.embed_output_metadata = not args.no_embed_output_metadata + + if args.device_override: + set_device_name(args.device_override) + + if args.sample_batch_size == 0 and get_device_batch_size() == 1: + print("!WARNING! Automatically deduced sample batch size returned 1.") + + args.listen_host = None + args.listen_port = None + args.listen_path = None + if args.listen: + try: + match = re.findall(r"^(?:(.+?):(\d+))?(\/.*?)?$", args.listen)[0] + + args.listen_host = match[0] if match[0] != "" else "127.0.0.1" + args.listen_port = match[1] if match[1] != "" else None + args.listen_path = match[2] if match[2] != "" else "/" + except Exception as e: + pass + + if args.listen_port is not None: + args.listen_port = int(args.listen_port) + if args.listen_port == 0: + args.listen_port = None + + return args + +def get_default_settings( hypenated=True ): + settings = { + 'listen': None if not args.listen else args.listen, + 'share': args.share, + 'low-vram': args.low_vram, + 'check-for-updates': args.check_for_updates, + 'models-from-local-only': args.models_from_local_only, + 'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents, + 'defer-tts-load': args.defer_tts_load, + 'prune-nonfinal-outputs': args.prune_nonfinal_outputs, + 'device-override': args.device_override, + 'sample-batch-size': args.sample_batch_size, + 'unsqueeze-sample-batches': args.unsqueeze_sample_batches, + 'embed-output-metadata': args.embed_output_metadata, + 'latents-lean-and-mean': args.latents_lean_and_mean, + 'voice-fixer': args.voice_fixer, + 'use-deepspeed': args.use_deepspeed, + 'voice-fixer-use-cuda': args.voice_fixer_use_cuda, + 'concurrency-count': args.concurrency_count, + 'output-sample-rate': args.output_sample_rate, + 'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size, + 'output-volume': args.output_volume, + 'results-folder': args.results_folder, + + 'hf-token': args.hf_token, + 'tts-backend': args.tts_backend, + + 'autoregressive-model': args.autoregressive_model, + 'diffusion-model': args.diffusion_model, + 'vocoder-model': args.vocoder_model, + 'tokenizer-json': args.tokenizer_json, + + 'phonemizer-backend': args.phonemizer_backend, + + 'valle-model': args.valle_model, + + 'whisper-backend': args.whisper_backend, + 'whisper-model': args.whisper_model, + 'whisper-batchsize': args.whisper_batchsize, + + 'training-default-halfp': args.training_default_halfp, + 'training-default-bnb': args.training_default_bnb, + } + + res = {} + for k in settings: + res[k.replace("-", "_") if not hypenated else k] = settings[k] + return res + +def update_args( **kwargs ): + global args + + settings = get_default_settings(hypenated=False) + settings.update(kwargs) + + args.listen = settings['listen'] + args.share = settings['share'] + args.check_for_updates = 
settings['check_for_updates'] + args.models_from_local_only = settings['models_from_local_only'] + args.low_vram = settings['low_vram'] + args.force_cpu_for_conditioning_latents = settings['force_cpu_for_conditioning_latents'] + args.defer_tts_load = settings['defer_tts_load'] + args.prune_nonfinal_outputs = settings['prune_nonfinal_outputs'] + args.device_override = settings['device_override'] + args.sample_batch_size = settings['sample_batch_size'] + args.unsqueeze_sample_batches = settings['unsqueeze_sample_batches'] + args.embed_output_metadata = settings['embed_output_metadata'] + args.latents_lean_and_mean = settings['latents_lean_and_mean'] + args.voice_fixer = settings['voice_fixer'] + args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda'] + args.use_deepspeed = settings['use_deepspeed'] + args.concurrency_count = settings['concurrency_count'] + args.output_sample_rate = settings['output_sample_rate'] + args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size'] + args.output_volume = settings['output_volume'] + args.results_folder = settings['results_folder'] + + args.hf_token = settings['hf_token'] + args.tts_backend = settings['tts_backend'] + + args.autoregressive_model = settings['autoregressive_model'] + args.diffusion_model = settings['diffusion_model'] + args.vocoder_model = settings['vocoder_model'] + args.tokenizer_json = settings['tokenizer_json'] + + args.phonemizer_backend = settings['phonemizer_backend'] + + args.valle_model = settings['valle_model'] + + args.whisper_backend = settings['whisper_backend'] + args.whisper_model = settings['whisper_model'] + args.whisper_batchsize = settings['whisper_batchsize'] + + args.training_default_halfp = settings['training_default_halfp'] + args.training_default_bnb = settings['training_default_bnb'] + + save_args_settings() + +def save_args_settings(): + global args + settings = get_default_settings() + + os.makedirs('./config/', exist_ok=True) + with open(f'./config/exec.json', 'w', encoding="utf-8") as f: + f.write(json.dumps(settings, indent='\t')) + +# super kludgy )`; +def import_generate_settings(file = None): + if not file: + file = "./config/generate.json" + + res = { + 'text': None, + 'delimiter': None, + 'emotion': None, + 'prompt': None, + 'voice': "random", + 'mic_audio': None, + 'voice_latents_chunks': None, + 'candidates': None, + 'seed': None, + 'num_autoregressive_samples': 16, + 'diffusion_iterations': 30, + 'temperature': 0.8, + 'diffusion_sampler': "DDIM", + 'breathing_room': 8, + 'cvvp_weight': 0.0, + 'top_p': 0.8, + 'diffusion_temperature': 1.0, + 'length_penalty': 1.0, + 'repetition_penalty': 2.0, + 'cond_free_k': 2.0, + 'experimentals': None, + } + + settings, _ = read_generate_settings(file, read_latents=False) + + if settings is not None: + res.update(settings) + + return res + +def reset_generate_settings(): + with open(f'./config/generate.json', 'w', encoding="utf-8") as f: + f.write(json.dumps({}, indent='\t')) + return import_generate_settings() + +def read_generate_settings(file, read_latents=True): + j = None + latents = None + + if isinstance(file, list) and len(file) == 1: + file = file[0] + + try: + if file is not None: + if hasattr(file, 'name'): + file = file.name + + if file[-4:] == ".wav": + metadata = music_tag.load_file(file) + if 'lyrics' in metadata: + j = json.loads(str(metadata['lyrics'])) + elif file[-5:] == ".json": + with open(file, 'r') as f: + j = json.load(f) + except Exception as e: + pass + + if j is not None: + if 'latents' in j: + if read_latents: + 
latents = base64.b64decode(j['latents']) + del j['latents'] + + + if "time" in j: + j["time"] = "{:.3f}".format(j["time"]) + + + + return ( + j, + latents, + ) + +def version_check_tts( min_version ): + global tts + if not tts: + raise Exception("TTS is not initialized") + + if not hasattr(tts, 'version'): + return False + + if min_version[0] > tts.version[0]: + return True + if min_version[1] > tts.version[1]: + return True + if min_version[2] >= tts.version[2]: + return True + return False + +def load_tts( restart=False, + # TorToiSe configs + autoregressive_model=None, diffusion_model=None, vocoder_model=None, tokenizer_json=None, + # VALL-E configs + valle_model=None, +): + global args + global tts + + if restart: + unload_tts() + + tts_loading = True + if args.tts_backend == "tortoise": + if autoregressive_model: + args.autoregressive_model = autoregressive_model + else: + autoregressive_model = args.autoregressive_model + + if autoregressive_model == "auto": + autoregressive_model = deduce_autoregressive_model() + + if diffusion_model: + args.diffusion_model = diffusion_model + else: + diffusion_model = args.diffusion_model + + if vocoder_model: + args.vocoder_model = vocoder_model + else: + vocoder_model = args.vocoder_model + + if tokenizer_json: + args.tokenizer_json = tokenizer_json + else: + tokenizer_json = args.tokenizer_json + + if get_device_name() == "cpu": + print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.") + + print(f"Loading TorToiSe... (AR: {autoregressive_model}, diffusion: {diffusion_model}, vocoder: {vocoder_model})") + tts = TorToise_TTS(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, diffusion_model_path=diffusion_model, vocoder_model=vocoder_model, tokenizer_json=tokenizer_json, unsqueeze_sample_batches=args.unsqueeze_sample_batches, use_deepspeed=args.use_deepspeed) + elif args.tts_backend == "vall-e": + if valle_model: + args.valle_model = valle_model + else: + valle_model = args.valle_model + + print(f"Loading VALL-E... 
(Config: {valle_model})") + tts = VALLE_TTS(config=args.valle_model) + elif args.tts_backend == "bark": + + print("Loading Bark...") + tts = Bark_TTS(small=args.low_vram) + + print("Loaded TTS, ready for generation.") + tts_loading = False + return tts + +def unload_tts(): + global tts + + if tts: + del tts + tts = None + print("Unloaded TTS") + do_gc() + +def reload_tts(): + unload_tts() + load_tts() + +def get_current_voice(): + global current_voice + if current_voice: + return current_voice + + settings, _ = read_generate_settings("./config/generate.json", read_latents=False) + + if settings and "voice" in settings: + return settings["voice"] + + return None + +def deduce_autoregressive_model(voice=None): + if not voice: + voice = get_current_voice() + + if voice: + if os.path.exists(f'./models/finetunes/{voice}.pth'): + return f'./models/finetunes/{voice}.pth' + + dir = f'./training/{voice}/finetune/models/' + if os.path.isdir(dir): + counts = sorted([ int(d[:-8]) for d in os.listdir(dir) if d[-8:] == "_gpt.pth" ]) + names = [ f'{dir}/{d}_gpt.pth' for d in counts ] + if len(names) > 0: + return names[-1] + + if args.autoregressive_model != "auto": + return args.autoregressive_model + + return get_model_path('autoregressive.pth') + +def update_autoregressive_model(autoregressive_model_path): + if args.tts_backend != "tortoise": + raise Exception(f"Unsupported backend: {args.tts_backend}") + + if autoregressive_model_path == "auto": + autoregressive_model_path = deduce_autoregressive_model() + else: + match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', autoregressive_model_path) + if match: + autoregressive_model_path = match[0] + + if not autoregressive_model_path or not os.path.exists(autoregressive_model_path): + print(f"Invalid model: {autoregressive_model_path}") + return + + args.autoregressive_model = autoregressive_model_path + save_args_settings() + print(f'Stored autoregressive model to settings: {autoregressive_model_path}') + + global tts + if not tts: + if tts_loading: + raise Exception("TTS is still initializing...") + return + + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + + if autoregressive_model_path == tts.autoregressive_model_path: + return + + tts.load_autoregressive_model(autoregressive_model_path) + + do_gc() + + return autoregressive_model_path + +def update_diffusion_model(diffusion_model_path): + if args.tts_backend != "tortoise": + raise Exception(f"Unsupported backend: {args.tts_backend}") + + match = re.findall(r'^\[[a-fA-F0-9]{8}\] (.+?)$', diffusion_model_path) + if match: + diffusion_model_path = match[0] + + if not diffusion_model_path or not os.path.exists(diffusion_model_path): + print(f"Invalid model: {diffusion_model_path}") + return + + args.diffusion_model = diffusion_model_path + save_args_settings() + print(f'Stored diffusion model to settings: {diffusion_model_path}') + + global tts + if not tts: + if tts_loading: + raise Exception("TTS is still initializing...") + return + + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + if diffusion_model_path == "auto": + diffusion_model_path = deduce_diffusion_model() + + if diffusion_model_path == tts.diffusion_model_path: + return + + tts.load_diffusion_model(diffusion_model_path) + + do_gc() + + return diffusion_model_path + +def update_vocoder_model(vocoder_model): + if args.tts_backend != "tortoise": + raise Exception(f"Unsupported backend: {args.tts_backend}") + + args.vocoder_model = vocoder_model + save_args_settings() + 
print(f'Stored vocoder model to settings: {vocoder_model}') + + global tts + if not tts: + if tts_loading: + raise Exception("TTS is still initializing...") + return + + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + print(f"Loading model: {vocoder_model}") + tts.load_vocoder_model(vocoder_model) + print(f"Loaded model: {tts.vocoder_model}") + + do_gc() + + return vocoder_model + +def update_tokenizer(tokenizer_json): + if args.tts_backend != "tortoise": + raise Exception(f"Unsupported backend: {args.tts_backend}") + + args.tokenizer_json = tokenizer_json + save_args_settings() + print(f'Stored tokenizer to settings: {tokenizer_json}') + + global tts + if not tts: + if tts_loading: + raise Exception("TTS is still initializing...") + return + + if hasattr(tts, "loading") and tts.loading: + raise Exception("TTS is still initializing...") + + print(f"Loading tokenizer vocab: {tokenizer_json}") + tts.load_tokenizer_json(tokenizer_json) + print(f"Loaded tokenizer vocab: {tts.tokenizer_json}") + + do_gc() + + return tokenizer_json + +def load_voicefixer(restart=False): + global voicefixer + + if restart: + unload_voicefixer() + + try: + print("Loading Voicefixer") + from voicefixer import VoiceFixer + voicefixer = VoiceFixer() + print("Loaded Voicefixer") + except Exception as e: + print(f"Error occurred while trying to initialize voicefixer: {e}") + if voicefixer: + del voicefixer + voicefixer = None + +def unload_voicefixer(): + global voicefixer + + if voicefixer: + del voicefixer + voicefixer = None + print("Unloaded Voicefixer") + + do_gc() + +def load_whisper_model(language=None, model_name=None, progress=None): + global whisper_model + global whisper_align_model + + if args.whisper_backend not in WHISPER_BACKENDS: + raise Exception(f"unavailable backend: {args.whisper_backend}") + + if not model_name: + model_name = args.whisper_model + else: + args.whisper_model = model_name + save_args_settings() + + if language and f'{model_name}.{language}' in WHISPER_SPECIALIZED_MODELS: + model_name = f'{model_name}.{language}' + print(f"Loading specialized model for language: {language}") + + notify_progress(f"Loading Whisper model: {model_name}", progress=progress) + + if args.whisper_backend == "openai/whisper": + import whisper + try: + #is it possible for model to fit on vram but go oom later on while executing on data? + whisper_model = whisper.load_model(model_name) + except: + print("Out of VRAM memory; 
falling back to loading Whisper on CPU.") + whisper_model = whisper.load_model(model_name, device="cpu") + elif args.whisper_backend == "lightmare/whispercpp": + from whispercpp import Whisper + if not language: + language = 'auto' + + b_lang = language.encode('ascii') + whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang) + elif args.whisper_backend == "m-bain/whisperx": + import whisper, whisperx + device = "cuda" if get_device_name() == "cuda" else "cpu" + whisper_model = whisperx.load_model(model_name, device) + whisper_align_model = whisperx.load_align_model(model_name="WAV2VEC2_ASR_LARGE_LV60K_960H" if language=="en" else None, language_code=language, device=device) + + print("Loaded Whisper model") + +def unload_whisper(): + global whisper_model + global whisper_align_model + + if whisper_align_model: + del whisper_align_model + whisper_align_model = None + + if whisper_model: + del whisper_model + whisper_model = None + print("Unloaded Whisper") + + do_gc() + +# shamelessly borrowed from Voldy's Web UI: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/modules/extras.py#L74 +def merge_models( primary_model_name, secondary_model_name, alpha, progress=gr.Progress() ): + key_blacklist = [] + + def weighted_sum(theta0, theta1, alpha): + return ((1 - alpha) * theta0) + (alpha * theta1) + + def read_model( filename ): + print(f"Loading {filename}") + return torch.load(filename) + + theta_func = weighted_sum + + theta_0 = read_model(primary_model_name) + theta_1 = read_model(secondary_model_name) + + for key in tqdm(theta_0.keys(), desc="Merging..."): + if key in key_blacklist: + print("Skipping ignored key:", key) + continue + + a = theta_0[key] + b = theta_1[key] + + if a.dtype != torch.float32 and a.dtype != torch.float16: + print("Skipping key:", key, a.dtype) + continue + + if b.dtype != torch.float32 and b.dtype != torch.float16: + print("Skipping key:", key, b.dtype) + continue + + theta_0[key] = theta_func(a, b, alpha) + + del theta_1 + + primary_basename = os.path.splitext(os.path.basename(primary_model_name))[0] + secondary_basename = os.path.splitext(os.path.basename(secondary_model_name))[0] + suffix = "{:.3f}".format(alpha) + output_path = f'./models/finetunes/{primary_basename}_{secondary_basename}_{suffix}_merge.pth' + + torch.save(theta_0, output_path) + message = f"Saved to {output_path}" + print(message) return message \ No newline at end of file diff --git a/src/webui.py b/src/webui.py index 402bb2b..560453d 100755 --- a/src/webui.py +++ b/src/webui.py @@ -1,977 +1,978 @@ -import os -import argparse -import time -import json -import base64 -import re -import inspect -import urllib.request - -import torch -import torchaudio -import music_tag -import gradio as gr -import gradio.utils - -from datetime import datetime - -import tortoise.api -from tortoise.utils.audio import get_voice_dir, get_voices -from tortoise.utils.device import get_device_count - -from utils import * - -args = setup_args() - -GENERATE_SETTINGS = {} -TRANSCRIBE_SETTINGS = {} -EXEC_SETTINGS = {} -TRAINING_SETTINGS = {} -MERGER_SETTINGS = {} -GENERATE_SETTINGS_ARGS = [] - -PRESETS = { - 'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False}, - 'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80}, - 'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200}, - 'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400}, -} - -HISTORY_HEADERS = { - "Name": "", - 
"Samples": "num_autoregressive_samples", - "Iterations": "diffusion_iterations", - "Temp.": "temperature", - "Sampler": "diffusion_sampler", - "CVVP": "cvvp_weight", - "Top P": "top_p", - "Diff. Temp.": "diffusion_temperature", - "Len Pen": "length_penalty", - "Rep Pen": "repetition_penalty", - "Cond-Free K": "cond_free_k", - "Time": "time", - "Datetime": "datetime", - "Model": "model", - "Model Hash": "model_hash", -} - -# can't use *args OR **kwargs if I want to retain the ability to use progress -def generate_proxy( - text, - delimiter, - emotion, - prompt, - voice, - mic_audio, - voice_latents_chunks, - candidates, - seed, - num_autoregressive_samples, - diffusion_iterations, - temperature, - diffusion_sampler, - breathing_room, - cvvp_weight, - top_p, - diffusion_temperature, - length_penalty, - repetition_penalty, - cond_free_k, - experimentals, - voice_latents_original_ar, - voice_latents_original_diffusion, - progress=gr.Progress(track_tqdm=True) -): - kwargs = locals() - - try: - sample, outputs, stats = generate(**kwargs) - except Exception as e: - message = str(e) - if message == "Kill signal detected": - unload_tts() - - raise e - - return ( - outputs[0], - gr.update(value=sample, visible=sample is not None), - gr.update(choices=outputs, value=outputs[0], visible=len(outputs) > 1, interactive=True), - gr.update(value=stats, visible=True), - ) - - -def update_presets(value): - if value in PRESETS: - preset = PRESETS[value] - return (gr.update(value=preset['num_autoregressive_samples']), gr.update(value=preset['diffusion_iterations'])) - else: - return (gr.update(), gr.update()) - -def get_training_configs(): - configs = [] - for i, file in enumerate(sorted(os.listdir(f"./training/"))): - if file[-5:] != ".yaml" or file[0] == ".": - continue - configs.append(f"./training/{file}") - - return configs - -def update_training_configs(): - return gr.update(choices=get_training_list()) - -def history_view_results( voice ): - results = [] - files = [] - outdir = f"{args.results_folder}/{voice}/" - for i, file in enumerate(sorted(os.listdir(outdir))): - if file[-4:] != ".wav": - continue - - metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False) - if metadata is None: - continue - - values = [] - for k in HISTORY_HEADERS: - v = file - if k != "Name": - v = metadata[HISTORY_HEADERS[k]] if HISTORY_HEADERS[k] in metadata else '?' 
- values.append(v) - - - files.append(file) - results.append(values) - - return ( - results, - gr.Dropdown.update(choices=sorted(files)) - ) - -def import_generate_settings_proxy( file=None ): - global GENERATE_SETTINGS_ARGS - settings = import_generate_settings( file ) - - res = [] - for k in GENERATE_SETTINGS_ARGS: - res.append(settings[k] if k in settings else None) - - return tuple(res) - -def reset_generate_settings_proxy(): - global GENERATE_SETTINGS_ARGS - settings = reset_generate_settings() - - res = [] - for k in GENERATE_SETTINGS_ARGS: - res.append(settings[k] if k in settings else None) - - return tuple(res) - -def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)): - compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion ) - return voice - - -def import_voices_proxy(files, name, progress=gr.Progress(track_tqdm=True)): - import_voices(files, name, progress) - return gr.update() - -def read_generate_settings_proxy(file, saveAs='.temp'): - j, latents = read_generate_settings(file) - - if latents: - outdir = f'{get_voice_dir()}/{saveAs}/' - os.makedirs(outdir, exist_ok=True) - with open(f'{outdir}/cond_latents.pth', 'wb') as f: - f.write(latents) - - latents = f'{outdir}/cond_latents.pth' - - return ( - gr.update(value=j, visible=j is not None), - gr.update(value=latents, visible=latents is not None), - None if j is None else j['voice'], - gr.update(visible=j is not None), - ) - -def slice_dataset_proxy( voice, trim_silence, start_offset, end_offset, progress=gr.Progress(track_tqdm=True) ): - return slice_dataset( voice, trim_silence=trim_silence, start_offset=start_offset, end_offset=end_offset, results=None, progress=progress ) - -def diarize_dataset( voice, progress=gr.Progress(track_tqdm=True) ): - from pyannote.audio import Pipeline - pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=args.hf_token) - - messages = [] - files = get_voice(voice, load_latents=False) - for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress): - diarization = pipeline(file) - for turn, _, speaker in diarization.itertracks(yield_label=True): - message = f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}" - print(message) - messages.append(message) - - return "\n".join(messages) - -def prepare_all_datasets( language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ): - kwargs = locals() - - messages = [] - voices = get_voice_list() - - for voice in voices: - print("Processing:", voice) - message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) - messages.append(message) - - if slice_audio: - for voice in voices: - print("Processing:", voice) - message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress ) - messages.append(message) - - for voice in voices: - print("Processing:", voice) - message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress ) - messages.append(message) - - return "\n".join(messages) - -def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, 
skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ): - messages = [] - - message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) - messages.append(message) - - if slice_audio: - message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress ) - messages.append(message) - - message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress ) - messages.append(message) - - return "\n".join(messages) - -def update_args_proxy( *args ): - kwargs = {} - keys = list(EXEC_SETTINGS.keys()) - for i in range(len(args)): - k = keys[i] - v = args[i] - kwargs[k] = v - - update_args(**kwargs) -def optimize_training_settings_proxy( *args ): - kwargs = {} - keys = list(TRAINING_SETTINGS.keys()) - for i in range(len(args)): - k = keys[i] - v = args[i] - kwargs[k] = v - - settings, messages = optimize_training_settings(**kwargs) - output = list(settings.values()) - return output[:-1] + ["\n".join(messages)] - -def import_training_settings_proxy( voice ): - messages = [] - injson = f'./training/{voice}/train.json' - statedir = f'./training/{voice}/finetune/training_state/' - output = {} - - try: - with open(injson, 'r', encoding="utf-8") as f: - settings = json.loads(f.read()) - except: - messages.append(f"Error import /{voice}/train.json") - - for k in TRAINING_SETTINGS: - output[k] = TRAINING_SETTINGS[k].value - - output = list(output.values()) - return output[:-1] + ["\n".join(messages)] - - if os.path.isdir(statedir): - resumes = sorted([int(d[:-6]) for d in os.listdir(statedir) if d[-6:] == ".state" ]) - - if len(resumes) > 0: - settings['resume_state'] = f'{statedir}/{resumes[-1]}.state' - messages.append(f"Found most recent training state: {settings['resume_state']}") - - output = {} - for k in TRAINING_SETTINGS: - if k not in settings: - output[k] = gr.update() - else: - output[k] = gr.update(value=settings[k]) - - output = list(output.values()) - - messages.append(f"Imported training settings: {injson}") - - return output[:-1] + ["\n".join(messages)] - -def save_training_settings_proxy( *args ): - kwargs = {} - keys = list(TRAINING_SETTINGS.keys()) - for i in range(len(args)): - k = keys[i] - v = args[i] - kwargs[k] = v - - settings, messages = save_training_settings(**kwargs) - return "\n".join(messages) - -def update_voices(): - return ( - gr.Dropdown.update(choices=get_voice_list(append_defaults=True)), - gr.Dropdown.update(choices=get_voice_list()), - gr.Dropdown.update(choices=get_voice_list(args.results_folder)), - ) - -def history_copy_settings( voice, file ): - return import_generate_settings( f"{args.results_folder}/{voice}/{file}" ) - -def setup_gradio(): - global args - global ui - - if not args.share: - def noop(function, return_value=None): - def wrapped(*args, **kwargs): - return return_value - return wrapped - gradio.utils.version_check = noop(gradio.utils.version_check) - gradio.utils.initiated_analytics = noop(gradio.utils.initiated_analytics) - gradio.utils.launch_analytics = noop(gradio.utils.launch_analytics) - gradio.utils.integration_analytics = noop(gradio.utils.integration_analytics) - gradio.utils.error_analytics = noop(gradio.utils.error_analytics) - gradio.utils.log_feature_analytics = noop(gradio.utils.log_feature_analytics) - #gradio.utils.get_local_ip_address = 
noop(gradio.utils.get_local_ip_address, 'localhost') - - if args.models_from_local_only: - os.environ['TRANSFORMERS_OFFLINE']='1' - - voice_list_with_defaults = get_voice_list(append_defaults=True) - voice_list = get_voice_list() - result_voices = get_voice_list(args.results_folder) - - valle_models = get_valle_models() - - autoregressive_models = get_autoregressive_models() - diffusion_models = get_diffusion_models() - tokenizer_jsons = get_tokenizer_jsons() - - dataset_list = get_dataset_list() - training_list = get_training_list() - - global GENERATE_SETTINGS_ARGS - GENERATE_SETTINGS_ARGS = list(inspect.signature(generate_proxy).parameters.keys())[:-1] - for i in range(len(GENERATE_SETTINGS_ARGS)): - arg = GENERATE_SETTINGS_ARGS[i] - GENERATE_SETTINGS[arg] = None - - with gr.Blocks() as ui: - with gr.Tab("Generate"): - with gr.Row(): - with gr.Column(): - GENERATE_SETTINGS["text"] = gr.Textbox(lines=4, value="Your prompt here.", label="Input Prompt") - with gr.Row(): - with gr.Column(): - GENERATE_SETTINGS["delimiter"] = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n") - - GENERATE_SETTINGS["emotion"] = gr.Radio( ["Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom", "None"], value="None", label="Emotion", type="value", interactive=True, visible=args.tts_backend=="tortoise" ) - GENERATE_SETTINGS["prompt"] = gr.Textbox(lines=1, label="Custom Emotion", visible=False) - GENERATE_SETTINGS["voice"] = gr.Dropdown(choices=voice_list_with_defaults, label="Voice", type="value", value=voice_list_with_defaults[0]) # it'd be very cash money if gradio was able to default to the first value in the list without this shit - GENERATE_SETTINGS["mic_audio"] = gr.Audio( label="Microphone Source", source="microphone", type="filepath", visible=False ) - GENERATE_SETTINGS["voice_latents_chunks"] = gr.Number(label="Voice Chunks", precision=0, value=0, visible=args.tts_backend=="tortoise") - GENERATE_SETTINGS["voice_latents_original_ar"] = gr.Checkbox(label="Use Original Latents Method (AR)", visible=args.tts_backend=="tortoise") - GENERATE_SETTINGS["voice_latents_original_diffusion"] = gr.Checkbox(label="Use Original Latents Method (Diffusion)", visible=args.tts_backend=="tortoise") - with gr.Row(): - refresh_voices = gr.Button(value="Refresh Voice List") - recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents") - - GENERATE_SETTINGS["voice"].change( - fn=update_baseline_for_latents_chunks, - inputs=GENERATE_SETTINGS["voice"], - outputs=GENERATE_SETTINGS["voice_latents_chunks"] - ) - GENERATE_SETTINGS["voice"].change( - fn=lambda value: gr.update(visible=value == "microphone"), - inputs=GENERATE_SETTINGS["voice"], - outputs=GENERATE_SETTINGS["mic_audio"], - ) - with gr.Column(): - preset = None - GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise") - GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise") - - preset = gr.Radio( ["Ultra Fast", "Fast", "Standard", "High Quality"], label="Preset", type="value", value="Ultra Fast", visible=args.tts_backend=="tortoise" ) - - GENERATE_SETTINGS["num_autoregressive_samples"] = gr.Slider(value=16, minimum=2, maximum=2048 if args.tts_backend=="vall-e" else 512, step=1, label="Samples", visible=args.tts_backend!="bark") - GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise") - - 
GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature") - - show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise") - reset_generate_settings_button = gr.Button(value="Reset to Default") - with gr.Column(visible=False) as col: - experimental_column = col - - GENERATE_SETTINGS["experimentals"] = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags") - GENERATE_SETTINGS["breathing_room"] = gr.Slider(value=8, minimum=1, maximum=32, step=1, label="Pause Size") - GENERATE_SETTINGS["diffusion_sampler"] = gr.Radio( - ["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"], - value="DDIM", label="Diffusion Samplers", type="value" - ) - GENERATE_SETTINGS["cvvp_weight"] = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight") - GENERATE_SETTINGS["top_p"] = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P") - GENERATE_SETTINGS["diffusion_temperature"] = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature") - GENERATE_SETTINGS["length_penalty"] = gr.Slider(value=1.0, minimum=0, maximum=8, label="Length Penalty") - GENERATE_SETTINGS["repetition_penalty"] = gr.Slider(value=2.0, minimum=0, maximum=8, label="Repetition Penalty") - GENERATE_SETTINGS["cond_free_k"] = gr.Slider(value=2.0, minimum=0, maximum=4, label="Conditioning-Free K") - with gr.Column(): - with gr.Row(): - submit = gr.Button(value="Generate") - stop = gr.Button(value="Stop") - - generation_results = gr.Dataframe(label="Results", headers=["Seed", "Time"], visible=False) - source_sample = gr.Audio(label="Source Sample", visible=False) - output_audio = gr.Audio(label="Output") - candidates_list = gr.Dropdown(label="Candidates", type="value", visible=False, choices=[""], value="") - - def change_candidate( val ): - if not val: - return - return val - - candidates_list.change( - fn=change_candidate, - inputs=candidates_list, - outputs=output_audio, - ) - with gr.Tab("History"): - with gr.Row(): - with gr.Column(): - history_info = gr.Dataframe(label="Results", headers=list(HISTORY_HEADERS.keys())) - with gr.Row(): - with gr.Column(): - history_voices = gr.Dropdown(choices=result_voices, label="Voice", type="value", value=result_voices[0] if len(result_voices) > 0 else "") - with gr.Column(): - history_results_list = gr.Dropdown(label="Results",type="value", interactive=True, value="") - with gr.Column(): - history_audio = gr.Audio() - history_copy_settings_button = gr.Button(value="Copy Settings") - with gr.Tab("Utilities"): - with gr.Tab("Import / Analyze"): - with gr.Row(): - with gr.Column(): - audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"]) - import_voice_name = gr.Textbox(label="Voice Name") - import_voice_button = gr.Button(value="Import Voice") - with gr.Column(visible=False) as col: - utilities_metadata_column = col - - metadata_out = gr.JSON(label="Audio Metadata") - copy_button = gr.Button(value="Copy Settings") - latents_out = gr.File(type="binary", label="Voice Latents") - with gr.Tab("Tokenizer"): - with gr.Row(): - text_tokenizier_input = gr.TextArea(label="Text", max_lines=4) - text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4) - - with gr.Row(): - text_tokenizier_button = gr.Button(value="Tokenize Text") - with gr.Tab("Model Merger"): - with gr.Column(): - with gr.Row(): - MERGER_SETTINGS["model_a"] = gr.Dropdown( choices=autoregressive_models, 
label="Model A", type="value", value=autoregressive_models[0] ) - MERGER_SETTINGS["model_b"] = gr.Dropdown( choices=autoregressive_models, label="Model B", type="value", value=autoregressive_models[0] ) - with gr.Row(): - MERGER_SETTINGS["weight_slider"] = gr.Slider(label="Weight (from A to B)", value=0.5, minimum=0, maximum=1) - with gr.Row(): - merger_button = gr.Button(value="Run Merger") - with gr.Column(): - merger_output = gr.TextArea(label="Console Output", max_lines=8) - with gr.Tab("Training"): - with gr.Tab("Prepare Dataset"): - with gr.Row(): - with gr.Column(): - DATASET_SETTINGS = {} - DATASET_SETTINGS['voice'] = gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" ) - with gr.Row(): - DATASET_SETTINGS['language'] = gr.Textbox(label="Language", value="en") - DATASET_SETTINGS['validation_text_length'] = gr.Number(label="Validation Text Length Threshold", value=12, precision=0, visible=args.tts_backend=="tortoise") - DATASET_SETTINGS['validation_audio_length'] = gr.Number(label="Validation Audio Length Threshold", value=1, visible=args.tts_backend=="tortoise" ) - with gr.Row(): - DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Existing", value=False) - DATASET_SETTINGS['slice'] = gr.Checkbox(label="Slice Segments", value=False) - DATASET_SETTINGS['trim_silence'] = gr.Checkbox(label="Trim Silence", value=False) - with gr.Row(): - DATASET_SETTINGS['slice_start_offset'] = gr.Number(label="Slice Start Offset", value=0) - DATASET_SETTINGS['slice_end_offset'] = gr.Number(label="Slice End Offset", value=0) - - transcribe_button = gr.Button(value="Transcribe and Process") - transcribe_all_button = gr.Button(value="Transcribe All") - diarize_button = gr.Button(value="Diarize", visible=False) - - with gr.Row(): - slice_dataset_button = gr.Button(value="(Re)Slice Audio") - prepare_dataset_button = gr.Button(value="(Re)Create Dataset") - - with gr.Row(): - EXEC_SETTINGS['whisper_backend'] = gr.Dropdown(WHISPER_BACKENDS, label="Whisper Backends", value=args.whisper_backend) - EXEC_SETTINGS['whisper_model'] = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model) - - dataset_settings = list(DATASET_SETTINGS.values()) - with gr.Column(): - prepare_dataset_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) - with gr.Tab("Generate Configuration", visible=args.tts_backend != "bark"): - with gr.Row(): - with gr.Column(): - TRAINING_SETTINGS["epochs"] = gr.Number(label="Epochs", value=500, precision=0) - with gr.Row(visible=args.tts_backend=="tortoise"): - TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6) - TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1) - TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1) - - with gr.Row(visible=args.tts_backend=="tortoise"): - lr_schemes = list(LEARNING_RATE_SCHEMES.keys()) - TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value") - TRAINING_SETTINGS["learning_rate_schedule"] = gr.Textbox(label="Learning Rate Schedule", placeholder=str(LEARNING_RATE_SCHEDULE), visible=True) - TRAINING_SETTINGS["learning_rate_restarts"] = gr.Number(label="Learning Rate Restarts", value=4, precision=0, visible=False) - - TRAINING_SETTINGS["learning_rate_scheme"].change( - fn=lambda x: ( gr.update(visible=x == 
lr_schemes[0]), gr.update(visible=x == lr_schemes[1]) ), - inputs=TRAINING_SETTINGS["learning_rate_scheme"], - outputs=[ - TRAINING_SETTINGS["learning_rate_schedule"], - TRAINING_SETTINGS["learning_rate_restarts"], - ] - ) - with gr.Row(): - TRAINING_SETTINGS["batch_size"] = gr.Number(label="Batch Size", value=128, precision=0) - TRAINING_SETTINGS["gradient_accumulation_size"] = gr.Number(label="Gradient Accumulation Size", value=4, precision=0) - with gr.Row(): - TRAINING_SETTINGS["save_rate"] = gr.Number(label="Save Frequency (in epochs)", value=5, precision=0) - TRAINING_SETTINGS["validation_rate"] = gr.Number(label="Validation Frequency (in epochs)", value=5, precision=0) - - with gr.Row(): - TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp, visible=args.tts_backend=="tortoise") - TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb, visible=args.tts_backend=="tortoise") - TRAINING_SETTINGS["validation_enabled"] = gr.Checkbox(label="Validation Enabled", value=False) - - with gr.Row(): - TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0, visible=args.tts_backend=="tortoise") - TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0) - - TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" ) - TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise") - - TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" ) - - with gr.Row(): - training_refresh_dataset = gr.Button(value="Refresh Dataset List") - training_import_settings = gr.Button(value="Reuse/Import Dataset") - with gr.Column(): - training_configuration_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) - with gr.Row(): - training_optimize_configuration = gr.Button(value="Validate Training Configuration") - training_save_configuration = gr.Button(value="Save Training Configuration") - with gr.Tab("Run Training", visible=args.tts_backend != "bark"): - with gr.Row(): - with gr.Column(): - training_configs = gr.Dropdown(label="Training Configuration", choices=training_list, value=training_list[0] if len(training_list) else "") - refresh_configs = gr.Button(value="Refresh Configurations") - training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) - verbose_training = gr.Checkbox(label="Verbose Console Output", value=True) - - keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1) - - with gr.Row(): - training_graph_x_min = gr.Number(label="X Min", precision=0, value=0) - training_graph_x_max = gr.Number(label="X Max", precision=0, value=0) - training_graph_y_min = gr.Number(label="Y Min", precision=0, value=0) - training_graph_y_max = gr.Number(label="Y Max", precision=0, value=0) - - with gr.Row(): - start_training_button = gr.Button(value="Train") - stop_training_button = gr.Button(value="Stop") - reconnect_training_button = gr.Button(value="Reconnect") - - - with gr.Column(): - training_loss_graph = gr.LinePlot(label="Training Metrics", - x="it", # x="epoch", - y="value", - title="Loss Metrics", - color="type", 
- tooltip=['epoch', 'it', 'value', 'type'], - width=500, - height=350, - ) - training_lr_graph = gr.LinePlot(label="Training Metrics", - x="it", # x="epoch", - y="value", - title="Learning Rate", - color="type", - tooltip=['epoch', 'it', 'value', 'type'], - width=500, - height=350, - ) - training_grad_norm_graph = gr.LinePlot(label="Training Metrics", - x="it", # x="epoch", - y="value", - title="Gradient Normals", - color="type", - tooltip=['epoch', 'it', 'value', 'type'], - width=500, - height=350, - visible=False, # args.tts_backend=="vall-e" - ) - view_losses = gr.Button(value="View Losses") - - with gr.Tab("Settings"): - with gr.Row(): - exec_inputs = [] - with gr.Column(): - EXEC_SETTINGS['listen'] = gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/") - EXEC_SETTINGS['share'] = gr.Checkbox(label="Public Share Gradio", value=args.share) - EXEC_SETTINGS['check_for_updates'] = gr.Checkbox(label="Check For Updates", value=args.check_for_updates) - EXEC_SETTINGS['models_from_local_only'] = gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only) - EXEC_SETTINGS['low_vram'] = gr.Checkbox(label="Low VRAM", value=args.low_vram) - EXEC_SETTINGS['embed_output_metadata'] = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata) - EXEC_SETTINGS['latents_lean_and_mean'] = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean) - EXEC_SETTINGS['voice_fixer'] = gr.Checkbox(label="Use Voice Fixer on Generated Output", value=args.voice_fixer) - EXEC_SETTINGS['voice_fixer_use_cuda'] = gr.Checkbox(label="Use CUDA for Voice Fixer", value=args.voice_fixer_use_cuda) - EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents) - EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load) - EXEC_SETTINGS['prune_nonfinal_outputs'] = gr.Checkbox(label="Delete Non-Final Output", value=args.prune_nonfinal_outputs) - with gr.Column(): - EXEC_SETTINGS['sample_batch_size'] = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size) - EXEC_SETTINGS['unsqueeze_sample_batches'] = gr.Checkbox(label="Unsqueeze Sample Batches", value=args.unsqueeze_sample_batches) - EXEC_SETTINGS['concurrency_count'] = gr.Number(label="Gradio Concurrency Count", precision=0, value=args.concurrency_count) - EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size) - EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume) - EXEC_SETTINGS['device_override'] = gr.Textbox(label="Device Override", value=args.device_override) - - EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder) - # EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0]) - - if args.tts_backend=="vall-e": - with gr.Column(): - EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0]) - - with gr.Column(visible=args.tts_backend=="tortoise"): - EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=["auto"] + autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if 
args.autoregressive_model else "auto") - EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0]) - EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1]) - EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0]) - - EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p'] - EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes'] - - with gr.Row(): - autoregressive_models_update_button = gr.Button(value="Refresh Model List") - gr.Button(value="Check for Updates").click(check_for_updates) - gr.Button(value="(Re)Load TTS").click( - reload_tts, - inputs=None, - outputs=None - ) - # kill_button = gr.Button(value="Close UI") - - def update_model_list_proxy( autoregressive, diffusion, tokenizer ): - autoregressive_models = get_autoregressive_models() - if autoregressive not in autoregressive_models: - autoregressive = autoregressive_models[0] - - diffusion_models = get_diffusion_models() - if diffusion not in diffusion_models: - diffusion = diffusion_models[0] - - tokenizer_jsons = get_tokenizer_jsons() - if tokenizer not in tokenizer_jsons: - tokenizer = tokenizer_jsons[0] - - return ( - gr.update( choices=autoregressive_models, value=autoregressive ), - gr.update( choices=diffusion_models, value=diffusion ), - gr.update( choices=tokenizer_jsons, value=tokenizer ), - ) - - autoregressive_models_update_button.click( - update_model_list_proxy, - inputs=[ - EXEC_SETTINGS['autoregressive_model'], - EXEC_SETTINGS['diffusion_model'], - EXEC_SETTINGS['tokenizer_json'], - ], - outputs=[ - EXEC_SETTINGS['autoregressive_model'], - EXEC_SETTINGS['diffusion_model'], - EXEC_SETTINGS['tokenizer_json'], - ], - ) - - exec_inputs = list(EXEC_SETTINGS.values()) - for k in EXEC_SETTINGS: - EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs ) - - EXEC_SETTINGS['autoregressive_model'].change( - fn=update_autoregressive_model, - inputs=EXEC_SETTINGS['autoregressive_model'], - outputs=None, - api_name="set_autoregressive_model" - ) - - EXEC_SETTINGS['vocoder_model'].change( - fn=update_vocoder_model, - inputs=EXEC_SETTINGS['vocoder_model'], - outputs=None - ) - - history_voices.change( - fn=history_view_results, - inputs=history_voices, - outputs=[ - history_info, - history_results_list, - ] - ) - history_results_list.change( - fn=lambda voice, file: f"{args.results_folder}/{voice}/{file}", - inputs=[ - history_voices, - history_results_list, - ], - outputs=history_audio - ) - audio_in.upload( - fn=read_generate_settings_proxy, - inputs=audio_in, - outputs=[ - metadata_out, - latents_out, - import_voice_name, - utilities_metadata_column, - ] - ) - - import_voice_button.click( - fn=import_voices_proxy, - inputs=[ - audio_in, - import_voice_name, - ], - outputs=import_voice_name #console_output - ) - show_experimental_settings.change( - fn=lambda x: gr.update(visible=x), - inputs=show_experimental_settings, - outputs=experimental_column - ) - if preset: - preset.change(fn=update_presets, - inputs=preset, - outputs=[ - GENERATE_SETTINGS['num_autoregressive_samples'], - GENERATE_SETTINGS['diffusion_iterations'], - ], - ) - - recompute_voice_latents.click(compute_latents_proxy, - inputs=[ - GENERATE_SETTINGS['voice'], - GENERATE_SETTINGS['voice_latents_chunks'], - 
GENERATE_SETTINGS['voice_latents_original_ar'], - GENERATE_SETTINGS['voice_latents_original_diffusion'], - ], - outputs=GENERATE_SETTINGS['voice'], - ) - - GENERATE_SETTINGS['emotion'].change( - fn=lambda value: gr.update(visible=value == "Custom"), - inputs=GENERATE_SETTINGS['emotion'], - outputs=GENERATE_SETTINGS['prompt'] - ) - GENERATE_SETTINGS['mic_audio'].change(fn=lambda value: gr.update(value="microphone"), - inputs=GENERATE_SETTINGS['mic_audio'], - outputs=GENERATE_SETTINGS['voice'] - ) - - refresh_voices.click(update_voices, - inputs=None, - outputs=[ - GENERATE_SETTINGS['voice'], - DATASET_SETTINGS['voice'], - history_voices - ] - ) - - generate_settings = list(GENERATE_SETTINGS.values()) - submit.click( - lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)), - outputs=[source_sample, candidates_list, generation_results], - ) - - submit_event = submit.click(generate_proxy, - inputs=generate_settings, - outputs=[output_audio, source_sample, candidates_list, generation_results], - api_name="generate", - ) - - - copy_button.click(import_generate_settings_proxy, - inputs=audio_in, # JSON elements cannot be used as inputs - outputs=generate_settings - ) - - reset_generate_settings_button.click( - fn=reset_generate_settings_proxy, - inputs=None, - outputs=generate_settings - ) - - history_copy_settings_button.click(history_copy_settings, - inputs=[ - history_voices, - history_results_list, - ], - outputs=generate_settings - ) - - text_tokenizier_button.click(tokenize_text, - inputs=text_tokenizier_input, - outputs=text_tokenizier_output - ) - - merger_button.click(merge_models, - inputs=list(MERGER_SETTINGS.values()), - outputs=merger_output - ) - - refresh_configs.click( - lambda: gr.update(choices=get_training_list()), - inputs=None, - outputs=training_configs - ) - start_training_button.click(run_training, - inputs=[ - training_configs, - verbose_training, - keep_x_past_checkpoints, - ], - outputs=[ - training_output, - ], - ) - training_output.change( - fn=update_training_dataplot, - inputs=[ - training_graph_x_min, - training_graph_x_max, - training_graph_y_min, - training_graph_y_max, - ], - outputs=[ - training_loss_graph, - training_lr_graph, - training_grad_norm_graph, - ], - show_progress=False, - ) - - view_losses.click( - fn=update_training_dataplot, - inputs=[ - training_graph_x_min, - training_graph_x_max, - training_graph_y_min, - training_graph_y_max, - training_configs, - ], - outputs=[ - training_loss_graph, - training_lr_graph, - training_grad_norm_graph, - ], - ) - - stop_training_button.click(stop_training, - inputs=None, - outputs=training_output #console_output - ) - reconnect_training_button.click(reconnect_training, - inputs=[ - verbose_training, - ], - outputs=training_output #console_output - ) - transcribe_button.click( - prepare_dataset_proxy, - inputs=dataset_settings, - outputs=prepare_dataset_output #console_output - ) - transcribe_all_button.click( - prepare_all_datasets, - inputs=dataset_settings[1:], - outputs=prepare_dataset_output #console_output - ) - diarize_button.click( - diarize_dataset, - inputs=dataset_settings[0], - outputs=prepare_dataset_output #console_output - ) - prepare_dataset_button.click( - prepare_dataset, - inputs=[ - DATASET_SETTINGS['voice'], - DATASET_SETTINGS['slice'], - DATASET_SETTINGS['validation_text_length'], - DATASET_SETTINGS['validation_audio_length'], - ], - outputs=prepare_dataset_output #console_output - ) - slice_dataset_button.click( - slice_dataset_proxy, - inputs=[ - 
DATASET_SETTINGS['voice'], - DATASET_SETTINGS['trim_silence'], - DATASET_SETTINGS['slice_start_offset'], - DATASET_SETTINGS['slice_end_offset'], - ], - outputs=prepare_dataset_output - ) - - training_refresh_dataset.click( - lambda: gr.update(choices=get_dataset_list()), - inputs=None, - outputs=TRAINING_SETTINGS["voice"], - ) - training_settings = list(TRAINING_SETTINGS.values()) - training_optimize_configuration.click(optimize_training_settings_proxy, - inputs=training_settings, - outputs=training_settings[:-1] + [training_configuration_output] #console_output - ) - training_import_settings.click(import_training_settings_proxy, - inputs=TRAINING_SETTINGS['voice'], - outputs=training_settings[:-1] + [training_configuration_output] #console_output - ) - training_save_configuration.click(save_training_settings_proxy, - inputs=training_settings, - outputs=training_configuration_output #console_output - ) - - if os.path.isfile('./config/generate.json'): - ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings) - - if args.check_for_updates: - ui.load(check_for_updates) - - stop.click(fn=cancel_generate, inputs=None, outputs=None) - - - ui.queue(concurrency_count=args.concurrency_count) - webui = ui +import os +import argparse +import time +import json +import base64 +import re +import inspect +import urllib.request + +import torch +import torchaudio +import music_tag +import gradio as gr +import gradio.utils + +from datetime import datetime + +import tortoise.api +from tortoise.utils.audio import get_voice_dir, get_voices +from tortoise.utils.device import get_device_count + +from utils import * + +args = setup_args() + +GENERATE_SETTINGS = {} +TRANSCRIBE_SETTINGS = {} +EXEC_SETTINGS = {} +TRAINING_SETTINGS = {} +MERGER_SETTINGS = {} +GENERATE_SETTINGS_ARGS = [] + +PRESETS = { + 'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False}, + 'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80}, + 'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200}, + 'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400}, +} + +HISTORY_HEADERS = { + "Name": "", + "Samples": "num_autoregressive_samples", + "Iterations": "diffusion_iterations", + "Temp.": "temperature", + "Sampler": "diffusion_sampler", + "CVVP": "cvvp_weight", + "Top P": "top_p", + "Diff. 
Temp.": "diffusion_temperature", + "Len Pen": "length_penalty", + "Rep Pen": "repetition_penalty", + "Cond-Free K": "cond_free_k", + "Time": "time", + "Datetime": "datetime", + "Model": "model", + "Model Hash": "model_hash", +} + +# can't use *args OR **kwargs if I want to retain the ability to use progress +def generate_proxy( + text, + delimiter, + emotion, + prompt, + voice, + mic_audio, + voice_latents_chunks, + candidates, + seed, + num_autoregressive_samples, + diffusion_iterations, + temperature, + diffusion_sampler, + breathing_room, + cvvp_weight, + top_p, + diffusion_temperature, + length_penalty, + repetition_penalty, + cond_free_k, + experimentals, + voice_latents_original_ar, + voice_latents_original_diffusion, + progress=gr.Progress(track_tqdm=True) +): + kwargs = locals() + + try: + sample, outputs, stats = generate(**kwargs) + except Exception as e: + message = str(e) + if message == "Kill signal detected": + unload_tts() + + raise e + + return ( + outputs[0], + gr.update(value=sample, visible=sample is not None), + gr.update(choices=outputs, value=outputs[0], visible=len(outputs) > 1, interactive=True), + gr.update(value=stats, visible=True), + ) + + +def update_presets(value): + if value in PRESETS: + preset = PRESETS[value] + return (gr.update(value=preset['num_autoregressive_samples']), gr.update(value=preset['diffusion_iterations'])) + else: + return (gr.update(), gr.update()) + +def get_training_configs(): + configs = [] + for i, file in enumerate(sorted(os.listdir(f"./training/"))): + if file[-5:] != ".yaml" or file[0] == ".": + continue + configs.append(f"./training/{file}") + + return configs + +def update_training_configs(): + return gr.update(choices=get_training_list()) + +def history_view_results( voice ): + results = [] + files = [] + outdir = f"{args.results_folder}/{voice}/" + for i, file in enumerate(sorted(os.listdir(outdir))): + if file[-4:] != ".wav": + continue + + metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False) + if metadata is None: + continue + + values = [] + for k in HISTORY_HEADERS: + v = file + if k != "Name": + v = metadata[HISTORY_HEADERS[k]] if HISTORY_HEADERS[k] in metadata else '?' 
+ values.append(v) + + + files.append(file) + results.append(values) + + return ( + results, + gr.Dropdown.update(choices=sorted(files)) + ) + +def import_generate_settings_proxy( file=None ): + global GENERATE_SETTINGS_ARGS + settings = import_generate_settings( file ) + + res = [] + for k in GENERATE_SETTINGS_ARGS: + res.append(settings[k] if k in settings else None) + + return tuple(res) + +def reset_generate_settings_proxy(): + global GENERATE_SETTINGS_ARGS + settings = reset_generate_settings() + + res = [] + for k in GENERATE_SETTINGS_ARGS: + res.append(settings[k] if k in settings else None) + + return tuple(res) + +def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)): + compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion ) + return voice + + +def import_voices_proxy(files, name, progress=gr.Progress(track_tqdm=True)): + import_voices(files, name, progress) + return gr.update() + +def read_generate_settings_proxy(file, saveAs='.temp'): + j, latents = read_generate_settings(file) + + if latents: + outdir = f'{get_voice_dir()}/{saveAs}/' + os.makedirs(outdir, exist_ok=True) + with open(f'{outdir}/cond_latents.pth', 'wb') as f: + f.write(latents) + + latents = f'{outdir}/cond_latents.pth' + + return ( + gr.update(value=j, visible=j is not None), + gr.update(value=latents, visible=latents is not None), + None if j is None else j['voice'], + gr.update(visible=j is not None), + ) + +def slice_dataset_proxy( voice, trim_silence, start_offset, end_offset, progress=gr.Progress(track_tqdm=True) ): + return slice_dataset( voice, trim_silence=trim_silence, start_offset=start_offset, end_offset=end_offset, results=None, progress=progress ) + +def diarize_dataset( voice, progress=gr.Progress(track_tqdm=True) ): + from pyannote.audio import Pipeline + pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=args.hf_token) + + messages = [] + files = get_voice(voice, load_latents=False) + for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress): + diarization = pipeline(file) + for turn, _, speaker in diarization.itertracks(yield_label=True): + message = f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}" + print(message) + messages.append(message) + + return "\n".join(messages) + +def prepare_all_datasets( language, validation_text_length, validation_audio_length, skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ): + kwargs = locals() + + messages = [] + voices = get_voice_list() + + for voice in voices: + print("Processing:", voice) + message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) + messages.append(message) + + if slice_audio: + for voice in voices: + print("Processing:", voice) + message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress ) + messages.append(message) + + for voice in voices: + print("Processing:", voice) + message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress ) + messages.append(message) + + return "\n".join(messages) + +def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, 
skip_existings, slice_audio, trim_silence, slice_start_offset, slice_end_offset, progress=gr.Progress(track_tqdm=True) ): + messages = [] + + message = transcribe_dataset( voice=voice, language=language, skip_existings=skip_existings, progress=progress ) + messages.append(message) + + if slice_audio: + message = slice_dataset( voice, trim_silence=trim_silence, start_offset=slice_start_offset, end_offset=slice_end_offset, results=None, progress=progress ) + messages.append(message) + + message = prepare_dataset( voice, use_segments=slice_audio, text_length=validation_text_length, audio_length=validation_audio_length, progress=progress ) + messages.append(message) + + return "\n".join(messages) + +def update_args_proxy( *args ): + kwargs = {} + keys = list(EXEC_SETTINGS.keys()) + for i in range(len(args)): + k = keys[i] + v = args[i] + kwargs[k] = v + + update_args(**kwargs) +def optimize_training_settings_proxy( *args ): + kwargs = {} + keys = list(TRAINING_SETTINGS.keys()) + for i in range(len(args)): + k = keys[i] + v = args[i] + kwargs[k] = v + + settings, messages = optimize_training_settings(**kwargs) + output = list(settings.values()) + return output[:-1] + ["\n".join(messages)] + +def import_training_settings_proxy( voice ): + messages = [] + injson = f'./training/{voice}/train.json' + statedir = f'./training/{voice}/finetune/training_state/' + output = {} + + try: + with open(injson, 'r', encoding="utf-8") as f: + settings = json.loads(f.read()) + except: + messages.append(f"Error import /{voice}/train.json") + + for k in TRAINING_SETTINGS: + output[k] = TRAINING_SETTINGS[k].value + + output = list(output.values()) + return output[:-1] + ["\n".join(messages)] + + if os.path.isdir(statedir): + resumes = sorted([int(d[:-6]) for d in os.listdir(statedir) if d[-6:] == ".state" ]) + + if len(resumes) > 0: + settings['resume_state'] = f'{statedir}/{resumes[-1]}.state' + messages.append(f"Found most recent training state: {settings['resume_state']}") + + output = {} + for k in TRAINING_SETTINGS: + if k not in settings: + output[k] = gr.update() + else: + output[k] = gr.update(value=settings[k]) + + output = list(output.values()) + + messages.append(f"Imported training settings: {injson}") + + return output[:-1] + ["\n".join(messages)] + +def save_training_settings_proxy( *args ): + kwargs = {} + keys = list(TRAINING_SETTINGS.keys()) + for i in range(len(args)): + k = keys[i] + v = args[i] + kwargs[k] = v + + settings, messages = save_training_settings(**kwargs) + return "\n".join(messages) + +def update_voices(): + return ( + gr.Dropdown.update(choices=get_voice_list(append_defaults=True)), + gr.Dropdown.update(choices=get_voice_list()), + gr.Dropdown.update(choices=get_voice_list(args.results_folder)), + ) + +def history_copy_settings( voice, file ): + return import_generate_settings( f"{args.results_folder}/{voice}/{file}" ) + +def setup_gradio(): + global args + global ui + + if not args.share: + def noop(function, return_value=None): + def wrapped(*args, **kwargs): + return return_value + return wrapped + gradio.utils.version_check = noop(gradio.utils.version_check) + gradio.utils.initiated_analytics = noop(gradio.utils.initiated_analytics) + gradio.utils.launch_analytics = noop(gradio.utils.launch_analytics) + gradio.utils.integration_analytics = noop(gradio.utils.integration_analytics) + gradio.utils.error_analytics = noop(gradio.utils.error_analytics) + gradio.utils.log_feature_analytics = noop(gradio.utils.log_feature_analytics) + #gradio.utils.get_local_ip_address = 
noop(gradio.utils.get_local_ip_address, 'localhost') + + if args.models_from_local_only: + os.environ['TRANSFORMERS_OFFLINE']='1' + + voice_list_with_defaults = get_voice_list(append_defaults=True) + voice_list = get_voice_list() + result_voices = get_voice_list(args.results_folder) + + valle_models = get_valle_models() + + autoregressive_models = get_autoregressive_models() + diffusion_models = get_diffusion_models() + tokenizer_jsons = get_tokenizer_jsons() + + dataset_list = get_dataset_list() + training_list = get_training_list() + + global GENERATE_SETTINGS_ARGS + GENERATE_SETTINGS_ARGS = list(inspect.signature(generate_proxy).parameters.keys())[:-1] + for i in range(len(GENERATE_SETTINGS_ARGS)): + arg = GENERATE_SETTINGS_ARGS[i] + GENERATE_SETTINGS[arg] = None + + with gr.Blocks() as ui: + with gr.Tab("Generate"): + with gr.Row(): + with gr.Column(): + GENERATE_SETTINGS["text"] = gr.Textbox(lines=4, value="Your prompt here.", label="Input Prompt") + with gr.Row(): + with gr.Column(): + GENERATE_SETTINGS["delimiter"] = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n") + + GENERATE_SETTINGS["emotion"] = gr.Radio( ["Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom", "None"], value="None", label="Emotion", type="value", interactive=True, visible=args.tts_backend=="tortoise" ) + GENERATE_SETTINGS["prompt"] = gr.Textbox(lines=1, label="Custom Emotion", visible=False) + GENERATE_SETTINGS["voice"] = gr.Dropdown(choices=voice_list_with_defaults, label="Voice", type="value", value=voice_list_with_defaults[0]) # it'd be very cash money if gradio was able to default to the first value in the list without this shit + GENERATE_SETTINGS["mic_audio"] = gr.Audio( label="Microphone Source", source="microphone", type="filepath", visible=False ) + GENERATE_SETTINGS["voice_latents_chunks"] = gr.Number(label="Voice Chunks", precision=0, value=0, visible=args.tts_backend=="tortoise") + GENERATE_SETTINGS["voice_latents_original_ar"] = gr.Checkbox(label="Use Original Latents Method (AR)", visible=args.tts_backend=="tortoise") + GENERATE_SETTINGS["voice_latents_original_diffusion"] = gr.Checkbox(label="Use Original Latents Method (Diffusion)", visible=args.tts_backend=="tortoise") + with gr.Row(): + refresh_voices = gr.Button(value="Refresh Voice List") + recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents") + + GENERATE_SETTINGS["voice"].change( + fn=update_baseline_for_latents_chunks, + inputs=GENERATE_SETTINGS["voice"], + outputs=GENERATE_SETTINGS["voice_latents_chunks"] + ) + GENERATE_SETTINGS["voice"].change( + fn=lambda value: gr.update(visible=value == "microphone"), + inputs=GENERATE_SETTINGS["voice"], + outputs=GENERATE_SETTINGS["mic_audio"], + ) + with gr.Column(): + preset = None + GENERATE_SETTINGS["candidates"] = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates", visible=args.tts_backend=="tortoise") + GENERATE_SETTINGS["seed"] = gr.Number(value=0, precision=0, label="Seed", visible=args.tts_backend=="tortoise") + + preset = gr.Radio( ["Ultra Fast", "Fast", "Standard", "High Quality"], label="Preset", type="value", value="Ultra Fast", visible=args.tts_backend=="tortoise" ) + + GENERATE_SETTINGS["num_autoregressive_samples"] = gr.Slider(value=16, minimum=2, maximum=2048 if args.tts_backend=="vall-e" else 512, step=1, label="Samples", visible=args.tts_backend!="bark") + GENERATE_SETTINGS["diffusion_iterations"] = gr.Slider(value=30, minimum=0, maximum=512, step=1, label="Iterations", visible=args.tts_backend=="tortoise") + + 
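# note: the Preset radio only rewrites the Samples and Iterations sliders (via update_presets); temperature below is left untouched + 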
GENERATE_SETTINGS["temperature"] = gr.Slider(value=0.95 if args.tts_backend=="vall-e" else 0.2, minimum=0, maximum=1, step=0.05, label="Temperature") + + show_experimental_settings = gr.Checkbox(label="Show Experimental Settings", visible=args.tts_backend=="tortoise") + reset_generate_settings_button = gr.Button(value="Reset to Default") + with gr.Column(visible=False) as col: + experimental_column = col + + GENERATE_SETTINGS["experimentals"] = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags") + GENERATE_SETTINGS["breathing_room"] = gr.Slider(value=8, minimum=1, maximum=32, step=1, label="Pause Size") + GENERATE_SETTINGS["diffusion_sampler"] = gr.Radio( + ["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"], + value="DDIM", label="Diffusion Samplers", type="value" + ) + GENERATE_SETTINGS["cvvp_weight"] = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight") + GENERATE_SETTINGS["top_p"] = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P") + GENERATE_SETTINGS["diffusion_temperature"] = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature") + GENERATE_SETTINGS["length_penalty"] = gr.Slider(value=1.0, minimum=0, maximum=8, label="Length Penalty") + GENERATE_SETTINGS["repetition_penalty"] = gr.Slider(value=2.0, minimum=0, maximum=8, label="Repetition Penalty") + GENERATE_SETTINGS["cond_free_k"] = gr.Slider(value=2.0, minimum=0, maximum=4, label="Conditioning-Free K") + with gr.Column(): + with gr.Row(): + submit = gr.Button(value="Generate") + stop = gr.Button(value="Stop") + + generation_results = gr.Dataframe(label="Results", headers=["Seed", "Time"], visible=False) + source_sample = gr.Audio(label="Source Sample", visible=False) + output_audio = gr.Audio(label="Output") + candidates_list = gr.Dropdown(label="Candidates", type="value", visible=False, choices=[""], value="") + + def change_candidate( val ): + if not val: + return + return val + + candidates_list.change( + fn=change_candidate, + inputs=candidates_list, + outputs=output_audio, + ) + with gr.Tab("History"): + with gr.Row(): + with gr.Column(): + history_info = gr.Dataframe(label="Results", headers=list(HISTORY_HEADERS.keys())) + with gr.Row(): + with gr.Column(): + history_voices = gr.Dropdown(choices=result_voices, label="Voice", type="value", value=result_voices[0] if len(result_voices) > 0 else "") + with gr.Column(): + history_results_list = gr.Dropdown(label="Results",type="value", interactive=True, value="") + with gr.Column(): + history_audio = gr.Audio() + history_copy_settings_button = gr.Button(value="Copy Settings") + with gr.Tab("Utilities"): + with gr.Tab("Import / Analyze"): + with gr.Row(): + with gr.Column(): + audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"]) + import_voice_name = gr.Textbox(label="Voice Name") + import_voice_button = gr.Button(value="Import Voice") + with gr.Column(visible=False) as col: + utilities_metadata_column = col + + metadata_out = gr.JSON(label="Audio Metadata") + copy_button = gr.Button(value="Copy Settings") + latents_out = gr.File(type="binary", label="Voice Latents") + with gr.Tab("Tokenizer"): + with gr.Row(): + text_tokenizier_input = gr.TextArea(label="Text", max_lines=4) + text_tokenizier_output = gr.TextArea(label="Tokenized Text", max_lines=4) + + with gr.Row(): + text_tokenizier_button = gr.Button(value="Tokenize Text") + with gr.Tab("Model Merger"): + with gr.Column(): + with gr.Row(): + MERGER_SETTINGS["model_a"] = gr.Dropdown( choices=autoregressive_models, 
label="Model A", type="value", value=autoregressive_models[0] ) + MERGER_SETTINGS["model_b"] = gr.Dropdown( choices=autoregressive_models, label="Model B", type="value", value=autoregressive_models[0] ) + with gr.Row(): + MERGER_SETTINGS["weight_slider"] = gr.Slider(label="Weight (from A to B)", value=0.5, minimum=0, maximum=1) + with gr.Row(): + merger_button = gr.Button(value="Run Merger") + with gr.Column(): + merger_output = gr.TextArea(label="Console Output", max_lines=8) + with gr.Tab("Training"): + with gr.Tab("Prepare Dataset"): + with gr.Row(): + with gr.Column(): + DATASET_SETTINGS = {} + DATASET_SETTINGS['voice'] = gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" ) + with gr.Row(): + DATASET_SETTINGS['language'] = gr.Textbox(label="Language", value="en") + DATASET_SETTINGS['validation_text_length'] = gr.Number(label="Validation Text Length Threshold", value=12, precision=0, visible=args.tts_backend=="tortoise") + DATASET_SETTINGS['validation_audio_length'] = gr.Number(label="Validation Audio Length Threshold", value=1, visible=args.tts_backend=="tortoise" ) + with gr.Row(): + DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Existing", value=False) + DATASET_SETTINGS['slice'] = gr.Checkbox(label="Slice Segments", value=False) + DATASET_SETTINGS['trim_silence'] = gr.Checkbox(label="Trim Silence", value=False) + with gr.Row(): + DATASET_SETTINGS['slice_start_offset'] = gr.Number(label="Slice Start Offset", value=0) + DATASET_SETTINGS['slice_end_offset'] = gr.Number(label="Slice End Offset", value=0) + + transcribe_button = gr.Button(value="Transcribe and Process") + transcribe_all_button = gr.Button(value="Transcribe All") + diarize_button = gr.Button(value="Diarize", visible=False) + + with gr.Row(): + slice_dataset_button = gr.Button(value="(Re)Slice Audio") + prepare_dataset_button = gr.Button(value="(Re)Create Dataset") + + with gr.Row(): + EXEC_SETTINGS['whisper_backend'] = gr.Dropdown(WHISPER_BACKENDS, label="Whisper Backends", value=args.whisper_backend) + EXEC_SETTINGS['whisper_model'] = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model) + + dataset_settings = list(DATASET_SETTINGS.values()) + with gr.Column(): + prepare_dataset_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) + with gr.Tab("Generate Configuration", visible=args.tts_backend != "bark"): + with gr.Row(): + with gr.Column(): + TRAINING_SETTINGS["epochs"] = gr.Number(label="Epochs", value=500, precision=0) + with gr.Row(visible=args.tts_backend=="tortoise"): + TRAINING_SETTINGS["learning_rate"] = gr.Slider(label="Learning Rate", value=1e-5, minimum=0, maximum=1e-4, step=1e-6) + TRAINING_SETTINGS["mel_lr_weight"] = gr.Slider(label="Mel LR Ratio", value=1.00, minimum=0, maximum=1) + TRAINING_SETTINGS["text_lr_weight"] = gr.Slider(label="Text LR Ratio", value=0.01, minimum=0, maximum=1) + + with gr.Row(visible=args.tts_backend=="tortoise"): + lr_schemes = list(LEARNING_RATE_SCHEMES.keys()) + TRAINING_SETTINGS["learning_rate_scheme"] = gr.Radio(lr_schemes, label="Learning Rate Scheme", value=lr_schemes[0], type="value") + TRAINING_SETTINGS["learning_rate_schedule"] = gr.Textbox(label="Learning Rate Schedule", placeholder=str(LEARNING_RATE_SCHEDULE), visible=True) + TRAINING_SETTINGS["learning_rate_restarts"] = gr.Number(label="Learning Rate Restarts", value=4, precision=0, visible=False) + + TRAINING_SETTINGS["learning_rate_scheme"].change( + fn=lambda x: ( gr.update(visible=x == 
lr_schemes[0]), gr.update(visible=x == lr_schemes[1]) ), + inputs=TRAINING_SETTINGS["learning_rate_scheme"], + outputs=[ + TRAINING_SETTINGS["learning_rate_schedule"], + TRAINING_SETTINGS["learning_rate_restarts"], + ] + ) + with gr.Row(): + TRAINING_SETTINGS["batch_size"] = gr.Number(label="Batch Size", value=128, precision=0) + TRAINING_SETTINGS["gradient_accumulation_size"] = gr.Number(label="Gradient Accumulation Size", value=4, precision=0) + with gr.Row(): + TRAINING_SETTINGS["save_rate"] = gr.Number(label="Save Frequency (in epochs)", value=5, precision=0) + TRAINING_SETTINGS["validation_rate"] = gr.Number(label="Validation Frequency (in epochs)", value=5, precision=0) + + with gr.Row(): + TRAINING_SETTINGS["half_p"] = gr.Checkbox(label="Half Precision", value=args.training_default_halfp, visible=args.tts_backend=="tortoise") + TRAINING_SETTINGS["bitsandbytes"] = gr.Checkbox(label="BitsAndBytes", value=args.training_default_bnb, visible=args.tts_backend=="tortoise") + TRAINING_SETTINGS["validation_enabled"] = gr.Checkbox(label="Validation Enabled", value=False) + + with gr.Row(): + TRAINING_SETTINGS["workers"] = gr.Number(label="Worker Processes", value=2, precision=0, visible=args.tts_backend=="tortoise") + TRAINING_SETTINGS["gpus"] = gr.Number(label="GPUs", value=get_device_count(), precision=0) + + TRAINING_SETTINGS["source_model"] = gr.Dropdown( choices=autoregressive_models, label="Source Model", type="value", value=autoregressive_models[0], visible=args.tts_backend=="tortoise" ) + TRAINING_SETTINGS["resume_state"] = gr.Textbox(label="Resume State Path", placeholder="./training/${voice}/finetune/training_state/${last_state}.state", visible=args.tts_backend=="tortoise") + + TRAINING_SETTINGS["voice"] = gr.Dropdown( choices=dataset_list, label="Dataset", type="value", value=dataset_list[0] if len(dataset_list) else "" ) + + with gr.Row(): + training_refresh_dataset = gr.Button(value="Refresh Dataset List") + training_import_settings = gr.Button(value="Reuse/Import Dataset") + with gr.Column(): + training_configuration_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) + with gr.Row(): + training_optimize_configuration = gr.Button(value="Validate Training Configuration") + training_save_configuration = gr.Button(value="Save Training Configuration") + with gr.Tab("Run Training", visible=args.tts_backend != "bark"): + with gr.Row(): + with gr.Column(): + training_configs = gr.Dropdown(label="Training Configuration", choices=training_list, value=training_list[0] if len(training_list) else "") + refresh_configs = gr.Button(value="Refresh Configurations") + training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8) + verbose_training = gr.Checkbox(label="Verbose Console Output", value=True) + + keep_x_past_checkpoints = gr.Slider(label="Keep X Previous States", minimum=0, maximum=8, value=0, step=1) + + with gr.Row(): + training_graph_x_min = gr.Number(label="X Min", precision=0, value=0) + training_graph_x_max = gr.Number(label="X Max", precision=0, value=0) + training_graph_y_min = gr.Number(label="Y Min", precision=0, value=0) + training_graph_y_max = gr.Number(label="Y Max", precision=0, value=0) + + with gr.Row(): + start_training_button = gr.Button(value="Train") + stop_training_button = gr.Button(value="Stop") + reconnect_training_button = gr.Button(value="Reconnect") + + + with gr.Column(): + training_loss_graph = gr.LinePlot(label="Training Metrics", + x="it", # x="epoch", + y="value", + title="Loss Metrics", + color="type", 
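+ # one colored series per metric "type", plotted against training iteration ("it")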
+ tooltip=['epoch', 'it', 'value', 'type'], + width=500, + height=350, + ) + training_lr_graph = gr.LinePlot(label="Training Metrics", + x="it", # x="epoch", + y="value", + title="Learning Rate", + color="type", + tooltip=['epoch', 'it', 'value', 'type'], + width=500, + height=350, + ) + training_grad_norm_graph = gr.LinePlot(label="Training Metrics", + x="it", # x="epoch", + y="value", + title="Gradient Normals", + color="type", + tooltip=['epoch', 'it', 'value', 'type'], + width=500, + height=350, + visible=False, # args.tts_backend=="vall-e" + ) + view_losses = gr.Button(value="View Losses") + + with gr.Tab("Settings"): + with gr.Row(): + exec_inputs = [] + with gr.Column(): + EXEC_SETTINGS['listen'] = gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/") + EXEC_SETTINGS['share'] = gr.Checkbox(label="Public Share Gradio", value=args.share) + EXEC_SETTINGS['check_for_updates'] = gr.Checkbox(label="Check For Updates", value=args.check_for_updates) + EXEC_SETTINGS['models_from_local_only'] = gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only) + EXEC_SETTINGS['low_vram'] = gr.Checkbox(label="Low VRAM", value=args.low_vram) + EXEC_SETTINGS['embed_output_metadata'] = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata) + EXEC_SETTINGS['latents_lean_and_mean'] = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean) + EXEC_SETTINGS['voice_fixer'] = gr.Checkbox(label="Use Voice Fixer on Generated Output", value=args.voice_fixer) + EXEC_SETTINGS['use_deepspeed'] = gr.Checkbox(label="Use DeepSpeed for Speed Bump.", value=args.use_deepspeed) + EXEC_SETTINGS['voice_fixer_use_cuda'] = gr.Checkbox(label="Use CUDA for Voice Fixer", value=args.voice_fixer_use_cuda) + EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents) + EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load) + EXEC_SETTINGS['prune_nonfinal_outputs'] = gr.Checkbox(label="Delete Non-Final Output", value=args.prune_nonfinal_outputs) + with gr.Column(): + EXEC_SETTINGS['sample_batch_size'] = gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size) + EXEC_SETTINGS['unsqueeze_sample_batches'] = gr.Checkbox(label="Unsqueeze Sample Batches", value=args.unsqueeze_sample_batches) + EXEC_SETTINGS['concurrency_count'] = gr.Number(label="Gradio Concurrency Count", precision=0, value=args.concurrency_count) + EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size) + EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume) + EXEC_SETTINGS['device_override'] = gr.Textbox(label="Device Override", value=args.device_override) + + EXEC_SETTINGS['results_folder'] = gr.Textbox(label="Results Folder", value=args.results_folder) + # EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0]) + + if args.tts_backend=="vall-e": + with gr.Column(): + EXEC_SETTINGS['valle_model'] = gr.Dropdown(choices=valle_models, label="VALL-E Model Config", value=args.valle_model if args.valle_model else valle_models[0]) + + with gr.Column(visible=args.tts_backend=="tortoise"): + EXEC_SETTINGS['autoregressive_model'] = 
gr.Dropdown(choices=["auto"] + autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else "auto") + EXEC_SETTINGS['diffusion_model'] = gr.Dropdown(choices=diffusion_models, label="Diffusion Model", value=args.diffusion_model if args.diffusion_model else diffusion_models[0]) + EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1]) + EXEC_SETTINGS['tokenizer_json'] = gr.Dropdown(tokenizer_jsons, label="Tokenizer JSON Path", value=args.tokenizer_json if args.tokenizer_json else tokenizer_jsons[0]) + + EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p'] + EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes'] + + with gr.Row(): + autoregressive_models_update_button = gr.Button(value="Refresh Model List") + gr.Button(value="Check for Updates").click(check_for_updates) + gr.Button(value="(Re)Load TTS").click( + reload_tts, + inputs=None, + outputs=None + ) + # kill_button = gr.Button(value="Close UI") + + def update_model_list_proxy( autoregressive, diffusion, tokenizer ): + autoregressive_models = get_autoregressive_models() + if autoregressive not in autoregressive_models: + autoregressive = autoregressive_models[0] + + diffusion_models = get_diffusion_models() + if diffusion not in diffusion_models: + diffusion = diffusion_models[0] + + tokenizer_jsons = get_tokenizer_jsons() + if tokenizer not in tokenizer_jsons: + tokenizer = tokenizer_jsons[0] + + return ( + gr.update( choices=autoregressive_models, value=autoregressive ), + gr.update( choices=diffusion_models, value=diffusion ), + gr.update( choices=tokenizer_jsons, value=tokenizer ), + ) + + autoregressive_models_update_button.click( + update_model_list_proxy, + inputs=[ + EXEC_SETTINGS['autoregressive_model'], + EXEC_SETTINGS['diffusion_model'], + EXEC_SETTINGS['tokenizer_json'], + ], + outputs=[ + EXEC_SETTINGS['autoregressive_model'], + EXEC_SETTINGS['diffusion_model'], + EXEC_SETTINGS['tokenizer_json'], + ], + ) + + exec_inputs = list(EXEC_SETTINGS.values()) + for k in EXEC_SETTINGS: + EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs ) + + EXEC_SETTINGS['autoregressive_model'].change( + fn=update_autoregressive_model, + inputs=EXEC_SETTINGS['autoregressive_model'], + outputs=None, + api_name="set_autoregressive_model" + ) + + EXEC_SETTINGS['vocoder_model'].change( + fn=update_vocoder_model, + inputs=EXEC_SETTINGS['vocoder_model'], + outputs=None + ) + + history_voices.change( + fn=history_view_results, + inputs=history_voices, + outputs=[ + history_info, + history_results_list, + ] + ) + history_results_list.change( + fn=lambda voice, file: f"{args.results_folder}/{voice}/{file}", + inputs=[ + history_voices, + history_results_list, + ], + outputs=history_audio + ) + audio_in.upload( + fn=read_generate_settings_proxy, + inputs=audio_in, + outputs=[ + metadata_out, + latents_out, + import_voice_name, + utilities_metadata_column, + ] + ) + + import_voice_button.click( + fn=import_voices_proxy, + inputs=[ + audio_in, + import_voice_name, + ], + outputs=import_voice_name #console_output + ) + show_experimental_settings.change( + fn=lambda x: gr.update(visible=x), + inputs=show_experimental_settings, + outputs=experimental_column + ) + if preset: + preset.change(fn=update_presets, + inputs=preset, + outputs=[ + GENERATE_SETTINGS['num_autoregressive_samples'], + GENERATE_SETTINGS['diffusion_iterations'], + ], + ) + + 
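# clicking (Re)Compute Voice Latents regenerates conditioning latents for the selected voice, using the chosen chunk count and the original-latents toggles + 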
recompute_voice_latents.click(compute_latents_proxy, + inputs=[ + GENERATE_SETTINGS['voice'], + GENERATE_SETTINGS['voice_latents_chunks'], + GENERATE_SETTINGS['voice_latents_original_ar'], + GENERATE_SETTINGS['voice_latents_original_diffusion'], + ], + outputs=GENERATE_SETTINGS['voice'], + ) + + GENERATE_SETTINGS['emotion'].change( + fn=lambda value: gr.update(visible=value == "Custom"), + inputs=GENERATE_SETTINGS['emotion'], + outputs=GENERATE_SETTINGS['prompt'] + ) + GENERATE_SETTINGS['mic_audio'].change(fn=lambda value: gr.update(value="microphone"), + inputs=GENERATE_SETTINGS['mic_audio'], + outputs=GENERATE_SETTINGS['voice'] + ) + + refresh_voices.click(update_voices, + inputs=None, + outputs=[ + GENERATE_SETTINGS['voice'], + DATASET_SETTINGS['voice'], + history_voices + ] + ) + + generate_settings = list(GENERATE_SETTINGS.values()) + submit.click( + lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)), + outputs=[source_sample, candidates_list, generation_results], + ) + + submit_event = submit.click(generate_proxy, + inputs=generate_settings, + outputs=[output_audio, source_sample, candidates_list, generation_results], + api_name="generate", + ) + + + copy_button.click(import_generate_settings_proxy, + inputs=audio_in, # JSON elements cannot be used as inputs + outputs=generate_settings + ) + + reset_generate_settings_button.click( + fn=reset_generate_settings_proxy, + inputs=None, + outputs=generate_settings + ) + + history_copy_settings_button.click(history_copy_settings, + inputs=[ + history_voices, + history_results_list, + ], + outputs=generate_settings + ) + + text_tokenizier_button.click(tokenize_text, + inputs=text_tokenizier_input, + outputs=text_tokenizier_output + ) + + merger_button.click(merge_models, + inputs=list(MERGER_SETTINGS.values()), + outputs=merger_output + ) + + refresh_configs.click( + lambda: gr.update(choices=get_training_list()), + inputs=None, + outputs=training_configs + ) + start_training_button.click(run_training, + inputs=[ + training_configs, + verbose_training, + keep_x_past_checkpoints, + ], + outputs=[ + training_output, + ], + ) + training_output.change( + fn=update_training_dataplot, + inputs=[ + training_graph_x_min, + training_graph_x_max, + training_graph_y_min, + training_graph_y_max, + ], + outputs=[ + training_loss_graph, + training_lr_graph, + training_grad_norm_graph, + ], + show_progress=False, + ) + + view_losses.click( + fn=update_training_dataplot, + inputs=[ + training_graph_x_min, + training_graph_x_max, + training_graph_y_min, + training_graph_y_max, + training_configs, + ], + outputs=[ + training_loss_graph, + training_lr_graph, + training_grad_norm_graph, + ], + ) + + stop_training_button.click(stop_training, + inputs=None, + outputs=training_output #console_output + ) + reconnect_training_button.click(reconnect_training, + inputs=[ + verbose_training, + ], + outputs=training_output #console_output + ) + transcribe_button.click( + prepare_dataset_proxy, + inputs=dataset_settings, + outputs=prepare_dataset_output #console_output + ) + transcribe_all_button.click( + prepare_all_datasets, + inputs=dataset_settings[1:], + outputs=prepare_dataset_output #console_output + ) + diarize_button.click( + diarize_dataset, + inputs=dataset_settings[0], + outputs=prepare_dataset_output #console_output + ) + prepare_dataset_button.click( + prepare_dataset, + inputs=[ + DATASET_SETTINGS['voice'], + DATASET_SETTINGS['slice'], + DATASET_SETTINGS['validation_text_length'], + 
DATASET_SETTINGS['validation_audio_length'], + ], + outputs=prepare_dataset_output #console_output + ) + slice_dataset_button.click( + slice_dataset_proxy, + inputs=[ + DATASET_SETTINGS['voice'], + DATASET_SETTINGS['trim_silence'], + DATASET_SETTINGS['slice_start_offset'], + DATASET_SETTINGS['slice_end_offset'], + ], + outputs=prepare_dataset_output + ) + + training_refresh_dataset.click( + lambda: gr.update(choices=get_dataset_list()), + inputs=None, + outputs=TRAINING_SETTINGS["voice"], + ) + training_settings = list(TRAINING_SETTINGS.values()) + training_optimize_configuration.click(optimize_training_settings_proxy, + inputs=training_settings, + outputs=training_settings[:-1] + [training_configuration_output] #console_output + ) + training_import_settings.click(import_training_settings_proxy, + inputs=TRAINING_SETTINGS['voice'], + outputs=training_settings[:-1] + [training_configuration_output] #console_output + ) + training_save_configuration.click(save_training_settings_proxy, + inputs=training_settings, + outputs=training_configuration_output #console_output + ) + + if os.path.isfile('./config/generate.json'): + ui.load(import_generate_settings_proxy, inputs=None, outputs=generate_settings) + + if args.check_for_updates: + ui.load(check_for_updates) + + stop.click(fn=cancel_generate, inputs=None, outputs=None) + + + ui.queue(concurrency_count=args.concurrency_count) + webui = ui return webui \ No newline at end of file