vall-e/scripts/prepare_librilight.py

"""
Reorganizes the unlabeled audio of `facebookresearch/libri-light` into a flatter, per-speaker hierarchy.
"""

import os
import json

# Input subset directories, looked up relative to the current working directory.
datasets = ["small", "medium", "large", "duplicate"]
# Directory that receives the flattened `<speaker>/<speaker>_<book id>_<n>` files.
output_dataset = "LibriLight-4K"


def process_dataset(input_dataset, output_dataset, root="."):
	"""Move every `.json`/`.flac` pair of one input subset into the output hierarchy.

	The subset is walked as `<root>/<input_dataset>/<speaker_id>/<book_name>/`.
	For each `<basename>.json` found there, the book id is read from
	`book_meta.id` inside the JSON, and both the JSON and its sibling
	`<basename>.flac` are renamed to
	`<root>/<output_dataset>/<speaker_id>/<speaker_id>_<book id>_<subid>.{json,flac}`.

	Args:
		input_dataset: Name of the subset directory under `root`.
		output_dataset: Name of the destination directory under `root`.
		root: Base directory both names are resolved against.

	Returns:
		The number of `.json`/`.flac` pairs that were moved. A missing
		subset directory yields 0.
	"""
	dataset_dir = os.path.join(root, input_dataset)
	if not os.path.isdir(dataset_dir):
		return 0

	moved = 0

	# Sorted listings make the subid assignment reproducible; os.listdir()
	# returns entries in arbitrary order.
	for speaker_id in sorted(os.listdir(dataset_dir)):
		speaker_dir = os.path.join(dataset_dir, speaker_id)
		if not os.path.isdir(speaker_dir):
			continue

		target_dir = os.path.join(root, output_dataset, speaker_id)

		for book_name in sorted(os.listdir(speaker_dir)):
			book_dir = os.path.join(speaker_dir, book_name)
			# A stray file next to the book directories would otherwise make
			# os.listdir() below raise NotADirectoryError.
			if not os.path.isdir(book_dir):
				continue

			subid = 0

			for filename in sorted(os.listdir(book_dir)):
				if not filename.endswith(".json"):
					continue

				basename = filename[:-5]
				json_path = os.path.join(book_dir, f'{basename}.json')
				flac_path = os.path.join(book_dir, f'{basename}.flac')

				# Check before touching anything: moving the JSON and then
				# failing on the audio would leave a half-moved pair behind.
				if not os.path.isfile(flac_path):
					print(f'Skipping {json_path}: no matching .flac file')
					continue

				# Close the handle before renaming the file.
				with open(json_path, 'r', encoding="utf-8") as f:
					book_id = json.load(f)['book_meta']['id']

				# subid restarts per book directory, and os.rename() silently
				# replaces an existing target on POSIX. Advance past names that
				# are already taken (earlier run, or another directory with the
				# same book id) so nothing gets overwritten.
				while True:
					stem = f'{speaker_id}_{book_id}_{subid}'
					json_id_path = os.path.join(target_dir, f'{stem}.json')
					flac_id_path = os.path.join(target_dir, f'{stem}.flac')
					if not (os.path.exists(json_id_path) or os.path.exists(flac_id_path)):
						break
					subid += 1

				os.makedirs(target_dir, exist_ok=True)
				os.rename(json_path, json_id_path)
				os.rename(flac_path, flac_id_path)

				subid += 1
				moved += 1

	return moved


# Runs on execution, relative to the current working directory.
for input_dataset in datasets:
	process_dataset(input_dataset, output_dataset)
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`"""`
			# Handles processing `facebookresearch/libri-light`'s unlabeled audio into a friendlier hierarchy
			`"""`

added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00			`import os`
			`import json`

re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`datasets = ["small", "medium", "large", "duplicate"]`
dataset preparation script updates, caved and am using HF tokenizer now 2024-04-21 19:49:18 +00:00			`output_dataset = "LibriLight-4K"`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`for input_dataset in datasets:`
			`if not os.path.isdir(f'./{input_dataset}/'):`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00			`continue`

re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`for speaker_id in os.listdir(f'./{input_dataset}/'):`
			`if not os.path.isdir(f'./{input_dataset}/{speaker_id}/'):`
			`continue`

			`for book_name in os.listdir(f'./{input_dataset}/{speaker_id}/'):`
			`subid = 0`

			`for filename in os.listdir(f'./{input_dataset}/{speaker_id}/{book_name}'):`
			`if filename[-5:] != ".json":`
			`continue`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`basename = filename[:-5]`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`json_path = f'./{input_dataset}/{speaker_id}/{book_name}/{basename}.json'`
			`flac_path = f'./{input_dataset}/{speaker_id}/{book_name}/{basename}.flac'`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`j = json.load(open(json_path, 'r', encoding="utf-8"))`
			`id = j['book_meta']['id']`

			`json_id_path = f'./{output_dataset}/{speaker_id}/{speaker_id}_{id}_{subid}.json'`
			`flac_id_path = f'./{output_dataset}/{speaker_id}/{speaker_id}_{id}_{subid}.flac'`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)`
			`os.rename(json_path, json_id_path)`
			`os.rename(flac_path, flac_id_path)`
added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`subid += 1`