Forgot to copy intermediary dataset conversion script
parent 8214aa23d7
commit 00804a47e9

@@ -1,62 +1,102 @@
import os
import json
import torch
import torchaudio

from tqdm.auto import tqdm
from pathlib import Path
from vall_e.emb.g2p import encode as valle_phonemize
-from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension
+from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension

-input_audio = "voices"
+input_audio = "voices_4"
input_metadata = "metadata"
output_dataset = "training"

device = "cuda"

-txts = []
-wavs = []
+def pad(num, zeroes):
+    return str(num).zfill(zeroes+1)

for dataset_name in os.listdir(f'./{input_audio}/'):
    if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
        print("Is not dir:", f'./{input_audio}/{dataset_name}/')
        continue

    for speaker_id in tqdm(os.listdir(f'./{input_audio}/{dataset_name}/'), desc="Processing speaker"):
        if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
            print("Is not dir:", f'./{input_audio}/{dataset_name}/{speaker_id}')
            continue

        os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)
-        for filename in os.listdir(f'./{input_audio}/{dataset_name}/{speaker_id}/'):

+        metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
+        if not metadata_path.exists():
+            print("Does not exist:", metadata_path)
+            continue

+        metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())

+        txts = []
+        wavs = []

+        for filename in metadata.keys():
            inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
-            outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{filename}')

-            metadata_json = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')

-            if not metadata_json.exists() or not inpath.exist():
-                print("Does not exist:", metadata_json, inpath)
+            if not inpath.exists():
+                print("Does not exist:", inpath)
                continue

-            if ".wav" not in filename and ".mp3" not in filename:
-                continue

-            if not _replace_file_extension(outpath, ".json").exists():
-                txts.push([ inpath, outpath ])

-            if not _replace_file_extension(outpath, ".dac").exists():
-                wavs.push([ inpath, outpath ])
+            extension = os.path.splitext(filename)[-1][1:]
+            fname = filename.replace(f'.{extension}', "")

-for paths in tqdm(txts, desc="Phonemizing..."):
-    text = open(paths[0], "r", encoding="utf-8").read()
-    phones = valle_phonemize(text)
-    data = {
-        "text": text,
-        "phonemes": phones,
-        "language": "english",
-    }
-    open(_replace_file_extension(paths[1], ".json"), 'w', encoding='utf-8').write(json.dumps(data))
-    #phones = valle_phonemize(open(paths[0], "r", encoding="utf-8").read())
-    #open(_replace_file_extension(paths[1], ".phn.txt"), "w", encoding="utf-8").write(" ".join(phones))
+            waveform, sample_rate = None, None
+            language = metadata[filename]["language"] if "language" in metadata[filename] else "english"

-for paths in tqdm(wavs, desc="Quantizing..."):
-    qnt = valle_quantize(paths[0], device=device)
-    qnt.save(_replace_file_extension(paths[1], ".dac"))
-    #torch.save(qnt.cpu(), _replace_file_extension(paths[1], ".qnt.pt"))
+            for segment in metadata[filename]["segments"]:
+                id = pad(segment['id'], 4)
+                outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')

+                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
+                    continue

+                if waveform is None:
+                    waveform, sample_rate = torchaudio.load(inpath)

+                start = int(segment['start'] * sample_rate)
+                end = int(segment['end'] * sample_rate)

+                if start < 0:
+                    start = 0
+                if end >= waveform.shape[-1]:
+                    end = waveform.shape[-1] - 1

+                if not _replace_file_extension(outpath, ".json").exists():
+                    txts.append((
+                        outpath,
+                        segment["text"],
+                        language,
+                    ))

+                if not _replace_file_extension(outpath, ".dac").exists():
+                    wavs.append((
+                        outpath,
+                        waveform[:, start:end],
+                        sample_rate
+                    ))
+        for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
+            outpath, text, language = job
+            phones = valle_phonemize(text)
+            data = {
+                "text": text,
+                "phonemes": phones,
+                "language": language,
+            }
+            open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data))

+        for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
+            try:
+                outpath, waveform, sample_rate = job
+                qnt = valle_quantize(waveform, sr=sample_rate, device=device)
+                qnt.save(_replace_file_extension(outpath, ".dac"))
+            except Exception as e:
+                print(f"Failed to quantize: {speaker_id}")
+                continue
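For reference, the rewritten script walks ./{input_audio}/{dataset}/{speaker}/, reads the matching whisper.json transcript, and per Whisper segment writes one {fname}_{id}.json (text, phonemes, language) and one {fname}_{id}.dac quantization under ./training/{dataset}/{speaker}/. A minimal sanity-check sketch of that output layout follows; the dataset, speaker, and file names are hypothetical and not part of the commit.

# Sanity-check sketch (illustrative paths, not from the commit):
# inspect one .json/.dac pair produced by the conversion script above.
import json
from pathlib import Path

out_dir = Path("./training/my_dataset/speaker_00")  # hypothetical dataset/speaker
stem = "clip_00001"                                  # {fname}_{pad(segment id, 4)}

meta = json.loads((out_dir / f"{stem}.json").read_text(encoding="utf-8"))
print(meta["language"], meta["text"])                # fields the script writes
print(len(meta["phonemes"]), "phonemes")

assert (out_dir / f"{stem}.dac").exists()            # quantized segment saved via qnt.save()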