Forgot to copy intermediary dataset conversion script
This commit is contained in:
parent
8214aa23d7
commit
00804a47e9
|
@@ -1,62 +1,102 @@
|
||||||
# Intermediary dataset conversion script: slices whisper-transcribed source
# recordings into per-segment clips, phonemizes the transcripts, and quantizes
# the audio into the VALL-E training-dataset layout.
import os
import json

import torch
import torchaudio

from tqdm.auto import tqdm
from pathlib import Path

from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension

# Folder layout (all relative to the working directory):
#   ./{input_audio}/{dataset}/{speaker}/*.wav|mp3        source recordings
#   ./{input_metadata}/{dataset}/{speaker}/whisper.json  whisper transcriptions
#   ./{output_dataset}/{dataset}/{speaker}/              emitted .json/.dac pairs
input_audio = "voices_4"
input_metadata = "metadata"
output_dataset = "training"

# Device handed to the quantizer; assumes a CUDA-capable host.
device = "cuda"
txts = []
|
def pad(num, zeroes):
    """Zero-pad ``num`` to a string of width ``zeroes + 1``.

    Used to build stable segment suffixes for output filenames,
    e.g. ``pad(1, 4)`` -> ``"00001"``.

    Note: the resulting width is ``zeroes + 1``, not ``zeroes`` —
    callers pass 4 to get 5-digit ids; preserved as-is.
    """
    return str(num).zfill(zeroes + 1)
||||||
# Walk ./{input_audio}/{dataset}/{speaker}/ and, per speaker:
#   1. read the whisper.json transcription metadata,
#   2. slice each source recording into whisper-delimited segments,
#   3. phonemize every segment's text into a .json sidecar,
#   4. quantize every segment's audio into a .dac file.
for dataset_name in os.listdir(f'./{input_audio}/'):
    if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
        print("Is not dir:", f'./{input_audio}/{dataset_name}/')
        continue

    for speaker_id in tqdm(os.listdir(f'./{input_audio}/{dataset_name}/'), desc="Processing speaker"):
        if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
            print("Is not dir:", f'./{input_audio}/{dataset_name}/{speaker_id}')
            continue

        os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)

        metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
        if not metadata_path.exists():
            print("Does not exist:", metadata_path)
            continue

        # read_text + loads instead of a bare open().read() so the file
        # handle is not leaked.
        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))

        # Work queues for this speaker:
        #   txts: (outpath, segment text, language) awaiting phonemization
        #   wavs: (outpath, waveform slice, sample_rate) awaiting quantization
        txts = []
        wavs = []

        for filename in metadata.keys():
            # NOTE(review): the diff rendering obscured this path component
            # ("(unknown)"); {filename} matches the enclosing loop — confirm.
            inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
            if not inpath.exists():
                print("Does not exist:", inpath)
                continue

            extension = os.path.splitext(filename)[-1][1:]
            fname = filename.replace(f'.{extension}', "")

            # Loaded lazily: only decode the source audio if at least one of
            # its segments still needs output.
            waveform, sample_rate = None, None
            language = metadata[filename]["language"] if "language" in metadata[filename] else "english"

            for segment in metadata[filename]["segments"]:
                # segment_id (not `id`) to avoid shadowing the builtin.
                segment_id = pad(segment['id'], 4)
                outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{segment_id}.{extension}')

                # Skip segments that were already fully processed on a
                # previous run (both sidecars present).
                if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
                    continue

                if waveform is None:
                    waveform, sample_rate = torchaudio.load(inpath)

                # Convert whisper's second-based timestamps to sample offsets
                # and clamp them to the actual recording length.
                start = int(segment['start'] * sample_rate)
                end = int(segment['end'] * sample_rate)

                if start < 0:
                    start = 0
                if end >= waveform.shape[-1]:
                    end = waveform.shape[-1] - 1

                if not _replace_file_extension(outpath, ".json").exists():
                    txts.append((
                        outpath,
                        segment["text"],
                        language,
                    ))

                if not _replace_file_extension(outpath, ".dac").exists():
                    wavs.append((
                        outpath,
                        waveform[:, start:end],
                        sample_rate
                    ))

        for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
            outpath, text, language = job
            phones = valle_phonemize(text)
            data = {
                "text": text,
                "phonemes": phones,
                "language": language,
            }
            # `with` so the output handle is flushed and closed deterministically.
            with open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8') as f:
                f.write(json.dumps(data))

        for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
            try:
                outpath, waveform, sample_rate = job
                qnt = valle_quantize(waveform, sr=sample_rate, device=device)
                qnt.save(_replace_file_extension(outpath, ".dac"))
            except Exception:
                # Best effort: one bad segment must not abort the whole speaker.
                print(f"Failed to quantize: {speaker_id}")
                continue
Loading…
Reference in New Issue
Block a user