# vall-e/scripts/parse_ppp.py
#
# (Repository viewer listing: 96 lines, 2.4 KiB, Python.)

import os
import json
import torch
from tqdm.auto import tqdm
from pathlib import Path
from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension
# Device string passed to valle_quantize when encoding audio.
device: str = "cuda"
# Name of the input directory (relative to the working directory) that is scanned
# for <season>/<episode>/<clip> files.
target: str = "in"

# "<episode>_<timestamp>" -> metadata dict for each .flac clip found.
audio_map: dict = dict()
# "<episode>_<timestamp>" -> contents of the matching .txt file.
text_map: dict = dict()
# speaker name -> {clip file name -> transcription record}, dumped to whisper.json.
data: dict = dict()
# Index every clip under ./{target}/<season>/<episode>/ by "<episode>_<timestamp>".
# File names are split on "_": fields 0-2 form the "<h>h<m>m<s>s" timestamp, and for
# .flac files fields 3-5 are stored as the speaker name, emotion and quality.
for season in os.listdir(f"./{target}/"):
    if not os.path.isdir(f"./{target}/{season}/"):
        continue

    for episode in os.listdir(f"./{target}/{season}/"):
        if not os.path.isdir(f"./{target}/{season}/{episode}/"):
            continue

        for filename in os.listdir(f"./{target}/{season}/{episode}/"):
            is_audio = filename.endswith(".flac")
            is_text = filename.endswith(".txt")
            # Skip unrelated files before parsing: their names need not contain
            # three "_"-separated fields, and indexing into them would abort the
            # whole scan with an IndexError even though they are never used.
            if not (is_audio or is_text):
                continue

            path = f'./{target}/{season}/{episode}/{filename}'
            attrs = filename.split("_")
            timestamp = f'{attrs[0]}h{attrs[1]}m{attrs[2]}s'
            # Audio and text files of the same clip share this key.
            key = f'{episode}_{timestamp}'

            if is_audio:
                audio_map[key] = {
                    "path": path,
                    'episode': episode,
                    "name": attrs[3],
                    "emotion": attrs[4],
                    "quality": attrs[5],
                    "timestamp": timestamp,
                }
            else:
                # Context manager so the handle is closed as soon as it is read.
                with open(path, encoding="utf-8") as handle:
                    text_map[key] = handle.read()
# txts: new clip path -> transcription, consumed by the phonemizing pass.
# wavs: new clip paths (as Path objects), consumed by the quantizing pass.
txts = {}
wavs = []

# Move every indexed clip to ./voices/<name>/<episode>_<timestamp>.flac and record
# its transcription and metadata under data[<name>][<clip file name>].
for key, entry in audio_map.items():
    # Check for the transcript before touching the filesystem: a clip without a
    # matching .txt is left where it is instead of raising KeyError after earlier
    # clips have already been moved.
    if key not in text_map:
        print(f"Skipping {entry['path']}: no transcription found for {key}")
        continue

    path = entry['path']
    name = entry['name']
    emotion = entry['emotion']
    quality = entry['quality']
    episode = entry['episode']
    timestamp = entry['timestamp']
    transcription = text_map[key]

    # First clip of a speaker: create its record and output directories.
    if name not in data:
        data[name] = {}
        os.makedirs(f'./training/{name}/', exist_ok=True)
        os.makedirs(f'./voices/{name}/', exist_ok=True)

    # Separate names so the loop variable `key` is not overwritten mid-iteration.
    clip_name = f'{episode}_{timestamp}.flac'
    voice_path = f'./voices/{name}/{clip_name}'
    os.rename(path, voice_path)

    data[name][clip_name] = {
        "segments": [],
        "language": "en",
        "text": transcription,
        "misc": {
            "emotion": emotion,
            "quality": quality,
            "timestamp": timestamp,
            "episode": episode,
        }
    }

    txts[voice_path] = transcription
    wavs.append(Path(voice_path))
# Dump each speaker's records to ./training/<name>/whisper.json (tab-indented JSON).
for name, entries in data.items():
    # Context manager guarantees the JSON is flushed and the handle closed before
    # the next file is opened, instead of relying on garbage collection.
    with open(f"./training/{name}/whisper.json", "w", encoding="utf-8") as handle:
        handle.write(json.dumps(entries, indent='\t'))
# Phonemize every transcription and write the phonemes, space-separated, to the
# path that _replace_file_extension returns for the clip with ".phn.txt".
for key, text in tqdm(txts.items(), desc="Phonemizing..."):
    path = Path(key)
    phones = valle_phonemize(text)
    # Context manager so each output file is flushed and closed deterministically.
    with open(_replace_file_extension(path, ".phn.txt"), "w", encoding="utf-8") as handle:
        handle.write(" ".join(phones))
# Run valle_quantize on every path in `wavs` using `device`, then save the CPU copy
# of the result at the path _replace_file_extension returns for ".qnt.pt".
for wav_path in tqdm(wavs, desc="Quantizing..."):
    codes = valle_quantize(wav_path, device=device)
    cpu_codes = codes.cpu()
    torch.save(cpu_codes, _replace_file_extension(wav_path, ".qnt.pt"))