Forgot to copy intermediary dataset conversion script

This commit is contained in:
mrq 2024-04-18 21:34:28 -05:00
parent 8214aa23d7
commit 00804a47e9

View File

@ -1,62 +1,102 @@
# Intermediary dataset conversion script: slices Whisper-transcribed source
# audio into per-segment clips, phonemizes each transcript, and DAC-quantizes
# the audio for VALL-E training.
import os
import json
from pathlib import Path

import torch
import torchaudio
from tqdm.auto import tqdm

from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension

# Expected layout: ./{input_audio}/{dataset}/{speaker}/<audio files>
input_audio = "voices_4"        # root of the source audio tree
input_metadata = "metadata"     # root of the per-speaker whisper.json transcripts
output_dataset = "training"     # destination tree for sliced clips and artifacts
device = "cuda"                 # device handed to the quantizer
def pad(num, zeroes):
    """Zero-pad *num* to a width of ``zeroes + 1`` characters.

    E.g. ``pad(7, 4)`` -> ``"00007"``; used to build stable, sortable
    per-segment filename suffixes.
    """
    width = zeroes + 1
    return str(num).zfill(width)
# Walk ./{input_audio}/{dataset}/{speaker}/, slice each source file into the
# segments listed in that speaker's whisper.json, then phonemize the text and
# DAC-quantize the audio for every segment not already present on disk.
for dataset_name in os.listdir(f'./{input_audio}/'):
    if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
        print("Is not dir:", f'./{input_audio}/{dataset_name}/')
        continue

    for speaker_id in tqdm(os.listdir(f'./{input_audio}/{dataset_name}/'), desc="Processing speaker"):
        if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
            print("Is not dir:", f'./{input_audio}/{dataset_name}/{speaker_id}')
            continue

        os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)

        metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
        if not metadata_path.exists():
            print("Does not exist:", metadata_path)
            continue

        # whisper.json maps source filename -> {"language": ..., "segments": [...]}
        # (schema assumed from the accesses below — confirm against the transcriber).
        with open(metadata_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        txts = []  # (outpath, text, language) jobs still needing phonemization
        wavs = []  # (outpath, waveform slice, sample_rate) jobs still needing quantization

        for filename in metadata:
            inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
            if not inpath.exists():
                print("Does not exist:", inpath)
                continue

            extension = os.path.splitext(filename)[-1][1:]
            fname = filename.replace(f'.{extension}', "")
            # Loaded lazily: only if at least one segment actually needs work.
            waveform, sample_rate = None, None
            language = metadata[filename]["language"] if "language" in metadata[filename] else "english"

            for segment in metadata[filename]["segments"]:
                # 'id' renamed to avoid shadowing the builtin.
                seg_id = pad(segment['id'], 4)
                outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{seg_id}.{extension}')

                # Skip segments whose phoneme JSON and DAC codes both exist already.
                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
                    continue

                if waveform is None:
                    waveform, sample_rate = torchaudio.load(inpath)

                # Clamp the segment window to the waveform bounds (sample indices).
                start = max(int(segment['start'] * sample_rate), 0)
                end = min(int(segment['end'] * sample_rate), waveform.shape[-1] - 1)

                if not _replace_file_extension(outpath, ".json").exists():
                    txts.append((
                        outpath,
                        segment["text"],
                        language,
                    ))
                if not _replace_file_extension(outpath, ".dac").exists():
                    wavs.append((
                        outpath,
                        waveform[:, start:end],
                        sample_rate,
                    ))

        for outpath, text, language in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
            phones = valle_phonemize(text)
            data = {
                "text": text,
                "phonemes": phones,
                "language": language,
            }
            # Close the handle explicitly (the original leaked it).
            with open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8') as f:
                f.write(json.dumps(data))

        for outpath, waveform, sample_rate in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
            try:
                qnt = valle_quantize(waveform, sr=sample_rate, device=device)
                qnt.save(_replace_file_extension(outpath, ".dac"))
            except Exception as e:
                # Best-effort per-segment: keep going, but report why it failed
                # (the original swallowed the exception detail).
                print(f"Failed to quantize: {speaker_id}", e)
                continue