diff --git a/scripts/process_old_dataaset.py b/scripts/process_old_dataaset.py
index 80608a6..47d0941 100644
--- a/scripts/process_old_dataaset.py
+++ b/scripts/process_old_dataaset.py
@@ -1,62 +1,102 @@
 import os
 import json
 import torch
+import torchaudio
 from tqdm.auto import tqdm
 from pathlib import Path
 from vall_e.emb.g2p import encode as valle_phonemize
-from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension
+from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
 
-input_audio = "voices"
+input_audio = "voices_4"
 input_metadata = "metadata"
 output_dataset = "training"
 device = "cuda"
 
-txts = []
-wavs = []
+def pad(num, zeroes):
+	return str(num).zfill(zeroes+1)
 
 for dataset_name in os.listdir(f'./{input_audio}/'):
 	if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
+		print("Is not dir:", f'./{input_audio}/{dataset_name}/')
 		continue
 
 	for speaker_id in tqdm(os.listdir(f'./{input_audio}/{dataset_name}/'), desc="Processing speaker"):
 		if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
+			print("Is not dir:", f'./{input_audio}/{dataset_name}/{speaker_id}')
 			continue
 
 		os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)
-		for filename in os.listdir(f'./{input_audio}/{dataset_name}/{speaker_id}/'):
+
+		metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
+		if not metadata_path.exists():
+			print("Does not exist:", metadata_path)
+			continue
+
+		metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
+
+		txts = []
+		wavs = []
+
+		for filename in metadata.keys():
 			inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
-			outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{filename}')
-
-			metadata_json = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
-
-			if not metadata_json.exists() or not inpath.exist():
-				print("Does not exist:", metadata_json, inpath)
+			if not inpath.exists():
+				print("Does not exist:", inpath)
 				continue
-
-			if ".wav" not in filename and ".mp3" not in filename:
-				continue
-
-			if not _replace_file_extension(outpath, ".json").exists():
-				txts.push([ inpath, outpath ])
-			if not _replace_file_extension(outpath, ".dac").exists():
-				wavs.push([ inpath, outpath ])
+			extension = os.path.splitext(filename)[-1][1:]
+			fname = filename.replace(f'.{extension}', "")
 
-for paths in tqdm(txts, desc="Phonemizing..."):
-	text = open(paths[0], "r", encoding="utf-8").read()
-	phones = valle_phonemize(text)
-	data = {
-		"text": text,
-		"phonemes": phones,
-		"language": "english",
-	}
-	open(_replace_file_extension(paths[1], ".json"), 'w', encoding='utf-8').write(json.dumps(data))
-	#phones = valle_phonemize(open(paths[0], "r", encoding="utf-8").read())
-	#open(_replace_file_extension(paths[1], ".phn.txt"), "w", encoding="utf-8").write(" ".join(phones))
+			waveform, sample_rate = None, None
+			language = metadata[filename]["language"] if "language" in metadata[filename] else "english"
 
-for paths in tqdm(wavs, desc="Quantizing..."):
-	qnt = valle_quantize(paths[0], device=device)
-	qnt.save(_replace_file_extension(paths[1], ".dac"))
-	#torch.save(qnt.cpu(), _replace_file_extension(paths[1], ".qnt.pt"))
+			for segment in metadata[filename]["segments"]:
+				id = pad(segment['id'], 4)
+				outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
+
+				if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
+					continue
+
+				if waveform is None:
+					waveform, sample_rate = torchaudio.load(inpath)
+
+				start = int(segment['start'] * sample_rate)
+				end = int(segment['end'] * sample_rate)
+
+				if start < 0:
+					start = 0
+				if end >= waveform.shape[-1]:
+					end = waveform.shape[-1] - 1
+
+				if not _replace_file_extension(outpath, ".json").exists():
+					txts.append((
+						outpath,
+						segment["text"],
+						language,
+					))
+
+				if not _replace_file_extension(outpath, ".dac").exists():
+					wavs.append((
+						outpath,
+						waveform[:, start:end],
+						sample_rate
+					))
+
+		for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
+			outpath, text, language = job
+			phones = valle_phonemize(text)
+			data = {
+				"text": text,
+				"phonemes": phones,
+				"language": language,
+			}
+			open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data))
+
+		for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
+			try:
+				outpath, waveform, sample_rate = job
+				qnt = valle_quantize(waveform, sr=sample_rate, device=device)
+				qnt.save(_replace_file_extension(outpath, ".dac"))
+			except Exception as e:
+				print(f"Failed to quantize: {speaker_id}")
+				continue
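
For reference, the per-segment path added above expects each whisper.json to map a source filename to an entry with a "language" field and timestamped "segments" (each with id, start, end, text), and it slices the waveform by converting those second-based timestamps to sample indices. Below is a minimal standalone sketch of just that step; the example filename, timestamps, and texts are made up for illustration, and only the keys actually read by the script are assumed.

import torchaudio

# Hypothetical whisper.json contents for one speaker (values are illustrative).
metadata = {
	"clip.wav": {
		"language": "english",
		"segments": [
			{"id": 0, "start": 0.32, "end": 2.71, "text": "first sentence"},
			{"id": 1, "start": 2.94, "end": 5.10, "text": "second sentence"},
		],
	},
}

# Load once per source file, as the script does.
waveform, sample_rate = torchaudio.load("clip.wav")  # assumed example path

for segment in metadata["clip.wav"]["segments"]:
	# Same conversion as in the diff: seconds -> sample indices, clamped to the waveform bounds.
	start = max(int(segment["start"] * sample_rate), 0)
	end = min(int(segment["end"] * sample_rate), waveform.shape[-1] - 1)
	clip = waveform[:, start:end]  # this slice is what would be quantized to .dac
	print(segment["id"], segment["text"], clip.shape)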