From 4f1593c8db8c4126313526b945e2694053d25ee0 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 12 May 2024 10:17:29 -0500
Subject: [PATCH] a bunch of changes to salvage my old encodec-quantized audio,
 because dac-encoded audio just does not want to converge

---
 scripts/process_dataset.py |  57 +++++++++++----
 vall_e/data.py             | 145 ++++++++++++++++++++++++++++++++++++-
 vall_e/emb/qnt.py          |   2 -
 3 files changed, 187 insertions(+), 17 deletions(-)

diff --git a/scripts/process_dataset.py b/scripts/process_dataset.py
index 52b2908..65036e4 100644
--- a/scripts/process_dataset.py
+++ b/scripts/process_dataset.py
@@ -5,15 +5,21 @@ import torchaudio
 from tqdm.auto import tqdm
 from pathlib import Path
 
+from vall_e.config import cfg
 from vall_e.emb.g2p import encode as valle_phonemize
 from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
 
-# to-do: use argparser
+# things that could be args
+cfg.sample_rate = 24_000
+cfg.inference.audio_backend = "encodec"
+
 input_audio = "voices"
-input_metadata = "training/metadata"
-output_dataset = "training/data"
+input_metadata = "./training/metadata"
+output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
 device = "cuda"
 
+audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
+
 slice = "auto"
 missing = {
     "transcription": [],
@@ -28,6 +34,9 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
     if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
         print("Is not dir:", f'./{input_audio}/{dataset_name}/')
         continue
+
+    if dataset_name in ["LibriVox", "Audiobooks"]:
+        continue
 
     for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
         if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
@@ -35,6 +44,23 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
             continue
 
         os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)
+
+        if speaker_id == "Noise":
+            for filename in sorted(os.listdir(f'./{input_audio}/{dataset_name}/{speaker_id}/')):
+                inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
+                outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{filename}')
+
+                if _replace_file_extension(outpath, audio_extension).exists():
+                    continue
+
+                waveform, sample_rate = torchaudio.load(inpath)
+                qnt = valle_quantize(waveform, sr=sample_rate, device=device)
+                if cfg.inference.audio_backend == "dac":
+                    qnt.save(_replace_file_extension(outpath, audio_extension))
+                else:
+                    torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
+
+            continue
 
         metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
         if not metadata_path.exists():
@@ -47,6 +73,9 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
             missing["transcription"].append(str(metadata_path))
             continue
 
+        if f'{dataset_name}/{speaker_id}' not in dataset:
+            dataset.append(f'{dataset_name}/{speaker_id}')
+
         txts = []
         wavs = []
 
@@ -64,9 +93,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
             waveform, sample_rate = None, None
             language = metadata[filename]["language"] if "language" in metadata[filename] else "english"
 
-            if f'{dataset_name}/{speaker_id}' not in dataset:
-                dataset.append(f'{dataset_name}/{speaker_id}')
-
             if len(metadata[filename]["segments"]) == 0 or not use_slices:
                 outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
                 text = metadata[filename]["text"]
@@ -74,7 +100,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
                 if len(text) == 0:
                     continue
 
-                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
+                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
                     continue
 
                 if not _replace_file_extension(outpath, ".json").exists():
@@ -84,7 +110,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
                         language,
                     ))
 
-                if not _replace_file_extension(outpath, ".dac").exists():
+                if not _replace_file_extension(outpath, audio_extension).exists():
                     if waveform is None:
                         waveform, sample_rate = torchaudio.load(inpath)
 
@@ -100,7 +126,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
                     i = i + 1
 
                 outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
-                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
+                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
                     continue
 
                 if not _replace_file_extension(outpath, ".json").exists():
@@ -110,7 +136,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
                         language,
                     ))
 
-                if not _replace_file_extension(outpath, ".dac").exists():
+                if not _replace_file_extension(outpath, audio_extension).exists():
                     if waveform is None:
                         waveform, sample_rate = torchaudio.load(inpath)
 
@@ -132,7 +158,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
                 ))
 
         if len(txts) > 0:
-            for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
+            for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True):
                 outpath, text, language = job
                 phones = valle_phonemize(text)
                 data = {
@@ -147,10 +173,13 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
             try:
                 outpath, waveform, sample_rate = job
                 qnt = valle_quantize(waveform, sr=sample_rate, device=device)
-                qnt.save(_replace_file_extension(outpath, ".dac"))
+                if cfg.inference.audio_backend == "dac":
+                    qnt.save(_replace_file_extension(outpath, audio_extension))
+                else:
+                    torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
             except Exception as e:
                 print(f"Failed to quantize: {outpath}:", e)
                 continue
 
-open("./training/missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
-open("./training/dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
\ No newline at end of file
+open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
+open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
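
For reference: with the encodec backend selected above, an .enc file is nothing more than the quantized code tensor run through a plain torch.save, while the dac branch goes through the codec's own .save() on the compressed file object. A minimal sketch of reading an .enc back, assuming only the layout this script writes; load_quantized and the example path are hypothetical, not part of the patch:

    import torch

    def load_quantized(path):
        # .enc files are plain torch-serialized encodec code tensors
        qnt = torch.load(path, map_location="cpu")
        # tolerate exports that were saved as numpy arrays rather than tensors
        if not torch.is_tensor(qnt):
            qnt = torch.from_numpy(qnt)
        return qnt

    # e.g. codes = load_quantized("./training/data-24KHz-encodec/speaker/utt.enc")
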
diff --git a/vall_e/data.py b/vall_e/data.py
index 8f7a229..80585db 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -145,7 +145,7 @@ def _get_hdf5_paths( data_dir, type="training", validate=False ):
         return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
 
     key = f"/{type}/{_get_hdf5_path(data_dir)}"
-    return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else []
+    return [ Path(f"{key}/{child}") for child in cfg.hdf5[key].keys() if not validate or _validate(cfg.hdf5[key][child]) ] if key in cfg.hdf5 else []
 
 def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
     if isinstance(path, str):
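
The _get_hdf5_paths change swaps values()/attrs['id'] for plain keys(): the id doubles as the group's key, so listing paths no longer has to open every child group just to read an attribute. A short h5py illustration of the difference, with a hypothetical archive and key:

    import h5py

    with h5py.File("dataset.h5", "r") as hf:          # hypothetical archive
        key = "/training/data/some_dataset/some_speaker"
        if key in hf:
            # keys() walks only the group's link names; no child is opened
            ids = list(hf[key].keys())
            # a child group is opened lazily, only when validation needs attrs
            for id in ids:
                child = hf[key][id]
                duration = child.attrs.get("duration", 0)
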
@@ -1003,6 +1003,145 @@ def create_dataset_hdf5( skip_existing=True ):
     hf.create_dataset('symmap', data=json.dumps(symmap))
     hf.close()
 
+def extract_dataset_hdf5( skip_existing=True ):
+    cfg.dataset.use_hdf5 = True
+    cfg.load_hdf5(write=False)
+    hf = cfg.hdf5
+
+    symmap = get_phone_symmap()
+
+    reverse_symmap = {"1":"","2":"","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
+
+    root = str(cfg.data_dir)
+
+    def add( dir, type="training", audios=True, texts=True ):
+        name = str(dir)
+        name = name.replace(root, "data/")
+
+        Path(f'{cfg.relpath}/{name}/').mkdir(parents=True, exist_ok=True)
+
+        if f'{type}/{name}' not in hf:
+            return
+
+        ids = [ key for key in hf[f'{type}/{name}'].keys() ]
+
+        for id in tqdm(ids, desc=f"Processing {name}"):
+            try:
+                key = f'{type}/{name}/{id}'
+
+                if key not in hf:
+                    tqdm.write(f'Missing key: {key}')
+                    continue
+
+                group = hf[key]
+                audio_exists = "audio" in group
+                text_exists = "text" in group
+
+                if not audio_exists or not text_exists:
+                    tqdm.write(f'Missing audio/text: {key}')
+                    continue
+
+                audio_path = Path(f'{cfg.relpath}/{name}/{id}.enc')
+                text_path = Path(f'{cfg.relpath}/{name}/{id}.json')
+
+                # audio
+                if audios and audio_exists and not audio_path.exists():
+                    qnt = torch.from_numpy( group["audio"][:, :] )
+                    torch.save( qnt, f'{cfg.relpath}/{name}/{id}.enc' )
+
+                # text
+                if texts and text_exists and not text_path.exists():
+                    tokens = group["text"][:][1:-1]
+                    phones = [ reverse_symmap[f'{token}'] for token in tokens ]
+                    phones = list("".join(phones).replace("  ", " "))
+
+                    j = {
+                        "text": "",
+                        "phonemes": phones,
+                        "language": "en"
+                    }
+
+                    with open(text_path, "w", encoding="utf-8") as f:
+                        f.write( json.dumps( j ) )
+
+            except Exception as e:
+                raise e
+
+    # training
+    for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
+        add( data_dir, type="training" )
+
+    # validation
+    for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
+        add( data_dir, type="validation" )
+
+    # noise
+    for data_dir in tqdm(cfg.dataset.noise, desc='Processing Noise'):
+        add( data_dir, type="noise", texts=False )
+
+    hf.close()
+
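
extract_dataset_hdf5 above and retokenize_dataset_hdf5 below share the same detokenize step: drop the first and last ids with [1:-1] (presumably BOS/EOS, which the table maps to empty strings anyway), map each id through reverse_symmap, then collapse the doubled spaces the join can produce. A toy round trip, using a made-up subset of the table:

    # toy illustration, not repo code: ids 1/2 map to "", 3 is a space
    reverse_symmap = {"1": "", "2": "", "3": " ", "7": "p", "18": "l", "27": "a"}

    stored = [1, 7, 18, 27, 3, 3, 27, 2]       # as read from group["text"][:]
    tokens = stored[1:-1]                      # strip the boundary ids
    phones = [reverse_symmap[f"{t}"] for t in tokens]
    text = "".join(phones).replace("  ", " ")  # collapse doubled spaces
    assert list(text) == ["p", "l", "a", " ", "a"]
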
+def retokenize_dataset_hdf5( skip_existing=True ):
+    cfg.dataset.use_hdf5 = True
+    cfg.load_hdf5(write=True)
+    hf = cfg.hdf5
+
+    symmap = get_phone_symmap()
+    reverse_symmap = {"1":"","2":"","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
+
+    root = str(cfg.data_dir)
+
+    def add( dir, type="training" ):
+        name = str(dir)
+        name = name.replace(root, "data/")
+
+        Path(f'{cfg.relpath}/{name}/').mkdir(parents=True, exist_ok=True)
+
+        if f'{type}/{name}' not in hf:
+            return
+
+        ids = [ key for key in hf[f'{type}/{name}'].keys() ]
+
+        for id in tqdm(ids, desc=f"Processing {name}"):
+            try:
+                key = f'{type}/{name}/{id}'
+
+                if key not in hf:
+                    tqdm.write(f'Missing key: {key}')
+                    continue
+
+                group = hf[key]
+                if "text" not in group:
+                    tqdm.write(f'Missing text: {key}')
+                    continue
+
+                tokens = group["text"][:][1:-1]
+                content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace("  ", " "))
+
+                tokens = cfg.tokenizer.encode("".join(content))
+                tokens = np.array(tokens).astype(np.uint8)
+
+                del group['text']
+                group.create_dataset('text', data=tokens, compression='lzf')
+
+            except Exception as e:
+                raise e
+
+    # training
+    for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
+        add( data_dir, type="training" )
+
+    # validation
+    for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
+        add( data_dir, type="validation" )
+
+    # write symmap
+    if "symmap" in hf:
+        del hf['symmap']
+
+    hf.create_dataset('symmap', data=json.dumps(symmap))
+    hf.close()
+
 if __name__ == "__main__":
     import argparse
 
@@ -1023,6 +1162,10 @@ if __name__ == "__main__":
 
     if args.action == "hdf5":
         create_dataset_hdf5()
+    elif args.action == "extract-hdf5":
+        extract_dataset_hdf5()
+    elif args.action == "retokenize-hdf5":
+        retokenize_dataset_hdf5()
     elif args.action == "metadata":
         create_dataset_metadata()
     elif args.action == "sample":
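
retokenize_dataset_hdf5 rewrites each text dataset in place, and HDF5 cannot retype or resize a dataset under an existing name, hence the delete-then-recreate pattern above. A minimal standalone sketch of that pattern (hypothetical helper, not patch code):

    import h5py
    import numpy as np

    def replace_text_dataset(hf, key, tokens):
        group = hf[key]
        # an existing dataset's dtype and shape are fixed, so remove it first
        if "text" in group:
            del group["text"]
        group.create_dataset("text", data=np.asarray(tokens), compression="lzf")

One caveat worth noting: storing the re-encoded ids as np.uint8 assumes the tokenizer vocabulary stays under 256 entries; anything larger would silently wrap.
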
diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py
index ea4531b..b222f89 100755
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -144,7 +144,6 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels):
 @cache
 def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
     kwargs = dict(model_type="24khz",model_bitrate="8kbps",tag="latest")
-    """
     if not cfg.variable_sample_rate:
         # yes there's a better way, something like f'{cfg.sample.rate//1000}hz'
         if cfg.sample_rate == 44_000:
@@ -155,7 +154,6 @@ def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
             kwargs["model_type"] = "16khz"
         else:
             raise Exception(f'unsupported sample rate: {cfg.sample_rate}')
-    """
 
     model = __load_dac_model(**kwargs)
     model = model.to(device)
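
The qnt.py hunks only delete the triple-quote pair that had the sample-rate dispatch commented out, re-enabling it. Restated as a standalone helper (the function name is mine; the comparisons mirror the surrounding context, descript-audio-codec shipping separate 44khz/24khz/16khz weights):

    def dac_model_type(sample_rate):
        # pick the DAC checkpoint matching the configured sample rate
        if sample_rate == 44_000:
            return "44khz"
        if sample_rate == 24_000:
            return "24khz"
        if sample_rate == 16_000:
            return "16khz"
        raise Exception(f"unsupported sample rate: {sample_rate}")
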