vall-e/scripts/parse_ppp.py

"""
# Helper script to parse PPP dataset into a friendlier hierarchy
"""

import os
import json
import torch

from tqdm.auto import tqdm
from pathlib import Path
from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension

# Name of the input directory, relative to the working directory. It is walked
# as ./<target>/<season>/<episode>/<file>.
target = "in"

# "<episode>_<timestamp>" -> metadata parsed from a .flac filename
audio_map = {}
# "<episode>_<timestamp>" -> contents of the matching .txt file
text_map = {}

# speaker name -> { output filename -> entry later written to whisper.json }
data = {}

for season in os.listdir(f"./{target}/"):
	if not os.path.isdir(f"./{target}/{season}/"):
		continue

	for episode in os.listdir(f"./{target}/{season}/"):
		if not os.path.isdir(f"./{target}/{season}/{episode}/"):
			continue

		for filename in os.listdir(f"./{target}/{season}/{episode}/"):
			# Decide by extension *before* parsing the name, so unrelated files
			# (which may not contain enough "_"-separated fields) are ignored
			# instead of raising IndexError below.
			is_audio = filename.endswith(".flac")
			is_text = filename.endswith(".txt")
			if not (is_audio or is_text):
				continue

			path = f'./{target}/{season}/{episode}/{filename}'

			# Filenames are "_"-separated; the first three fields form the
			# timestamp, and for audio the next three are name/emotion/quality.
			attrs = filename.split("_")
			timestamp = f'{attrs[0]}h{attrs[1]}m{attrs[2]}s'

			# Audio and text files of the same clip share this key.
			key = f'{episode}_{timestamp}'

			if is_audio:
				audio_map[key] = {
					"path": path,
					"episode": episode,
					"name": attrs[3],
					"emotion": attrs[4],
					"quality": attrs[5],
					"timestamp": timestamp,
				}
			else:
				# Context manager so the handle is closed right after reading.
				with open(path, encoding="utf-8") as handle:
					text_map[key] = handle.read()
# output path of each moved audio file -> its transcription
txts = {}
# output path (as a Path) of each moved audio file
wavs = []

for key, entry in audio_map.items():
	# Check for the transcription before touching the filesystem: a KeyError
	# here would abort the run after earlier files were already moved and
	# before any whisper.json is written.
	if key not in text_map:
		print(f"Skipping {entry['path']}: no transcription found")
		continue

	transcription = text_map[key]
	name = entry['name']
	emotion = entry['emotion']
	quality = entry['quality']
	episode = entry['episode']
	timestamp = entry['timestamp']

	# First clip seen for this speaker: create its index and output folders.
	if name not in data:
		data[name] = {}
		os.makedirs(f'./training/{name}/', exist_ok=True)
		os.makedirs(f'./voices/{name}/', exist_ok=True)

	# Separate names for the output file so the loop variable `key` is not
	# silently repurposed mid-iteration.
	out_name = f'{episode}_{timestamp}.flac'
	out_path = f'./voices/{name}/{out_name}'

	# Moves (does not copy) the audio out of the input tree.
	os.rename(entry['path'], out_path)

	data[name][out_name] = {
		"segments": [],
		"language": "en",
		"text": transcription,
		"misc": {
			"emotion": emotion,
			"quality": quality,
			"timestamp": timestamp,
			"episode": episode,
		}
	}

	txts[out_path] = transcription
	wavs.append(Path(out_path))

# Write one whisper.json per speaker into ./training/<name>/.
for name, entries in data.items():
	# Context manager guarantees the file is flushed and closed before moving on.
	with open(f"./training/{name}/whisper.json", "w", encoding="utf-8") as handle:
		handle.write(json.dumps(entries, indent='\t'))

# TODO: update to "The Proper Way".
# For now it can just be fed back into "The Proper Way".
# NOTE: everything between the triple quotes below is a bare string literal,
# so the phonemize/quantize steps it contains are disabled and never run.
"""
device = "cuda"
for key, text in tqdm(txts.items(), desc="Phonemizing..."):
	path = Path(key)
	phones = valle_phonemize(text)
	open(_replace_file_extension(path, ".phn.txt"), "w", encoding="utf-8").write(" ".join(phones))

for path in tqdm(wavs, desc="Quantizing..."):
	qnt = valle_quantize(path, device=device)
	torch.save(qnt.cpu(), _replace_file_extension(path, ".qnt.pt"))
"""
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`"""`
			`# Helper script to parse PPP dataset into a friendlier hierarchy`
			`"""`

added sampling by speaker group name (might be better to de-emphasize the LibriVox/Audiobooks that are in large numbers, and emphasize the smaller pools), log cleanup 2023-10-17 00:30:38 +00:00			`import os`
			`import json`
			`import torch`

			`from tqdm.auto import tqdm`
			`from pathlib import Path`
			`from vall_e.emb.g2p import encode as valle_phonemize`
			`from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension`

			`target = "in"`

			`audio_map = {}`
			`text_map = {}`

			`data = {}`

			`for season in os.listdir(f"./{target}/"):`
			`if not os.path.isdir(f"./{target}/{season}/"):`
			`continue`

			`for episode in os.listdir(f"./{target}/{season}/"):`
			`if not os.path.isdir(f"./{target}/{season}/{episode}/"):`
			`continue`

			`for filename in os.listdir(f"./{target}/{season}/{episode}/"):`
			`path = f'./{target}/{season}/{episode}/{filename}'`
			`attrs = filename.split("_")`
			`timestamp = f'{attrs[0]}h{attrs[1]}m{attrs[2]}s'`

			`key = f'{episode}_{timestamp}'`

			`if filename[-5:] == ".flac":`
			`name = attrs[3]`
			`emotion = attrs[4]`
			`quality = attrs[5]`

			`audio_map[key] = {`
			`"path": path,`
			`'episode': episode,`
			`"name": name,`
			`"emotion": emotion,`
			`"quality": quality,`
			`"timestamp": timestamp,`
			`}`

			`elif filename[-4:] == ".txt":`
			`text_map[key] = open(path, encoding="utf-8").read()`
			`txts = {}`
			`wavs = []`

			`for key, entry in audio_map.items():`
			`path = entry['path']`
			`name = entry['name']`
			`emotion = entry['emotion']`
			`quality = entry['quality']`
			`episode = entry['episode']`
			`path = entry['path']`
			`timestamp = entry['timestamp']`
			`transcription = text_map[key]`
			`if name not in data:`
			`data[name] = {}`
			`os.makedirs(f'./training/{name}/', exist_ok=True)`
			`os.makedirs(f'./voices/{name}/', exist_ok=True)`

			`key = f'{episode}_{timestamp}.flac'`
			`os.rename(path, f'./voices/{name}/{key}')`

			`data[name][key] = {`
			`"segments": [],`
			`"language": "en",`
			`"text": transcription,`
			`"misc": {`
			`"emotion": emotion,`
			`"quality": quality,`
			`"timestamp": timestamp,`
			`"episode": episode,`
			`}`
			`}`

			`path = f'./voices/{name}/{key}'`
			`txts[path] = transcription`
			`wavs.append(Path(path))`

			`for name in data.keys():`
			`open(f"./training/{name}/whisper.json", "w", encoding="utf-8").write( json.dumps( data[name], indent='\t' ) )`

re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`# to-do: update to "The Proper Way"`
			`# for now it can just be fed back into "The Proper Way""`
			`"""`
			`device = "cuda"`
added sampling by speaker group name (might be better to de-emphasize the LibriVox/Audiobooks that are in large numbers, and emphasize the smaller pools), log cleanup 2023-10-17 00:30:38 +00:00			`for key, text in tqdm(txts.items(), desc="Phonemizing..."):`
			`path = Path(key)`
			`phones = valle_phonemize(text)`
			`open(_replace_file_extension(path, ".phn.txt"), "w", encoding="utf-8").write(" ".join(phones))`

			`for path in tqdm(wavs, desc="Quantizing..."):`
			`qnt = valle_quantize(path, device=device)`
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`torch.save(qnt.cpu(), _replace_file_extension(path, ".qnt.pt"))`
			`"""`