vall-e/scripts/deduplicate_librilight_libritts.py

"""
# Helper script to try and detect any duplications between LibriLight and LibriTTS (I don't think there were any)
"""

import os
import json

# Dataset folders, resolved relative to the current working directory.
librilight_dir = "LibriLight-6K"
libritts_dir = "LibriTTS-Train"


def index_dataset(root):
	"""
	Index a dataset folder as `{speaker_id: {book_id: [subid, ...]}}`.

	Every sub-directory of `root` is treated as a speaker. Each filename inside
	it is split on "_"; field 1 is taken as the book id and field 2 as the sub id
	(0-based, so the sub id keeps whatever follows it up to the next "_").

	Entries of `root` that are not directories, and filenames with fewer than
	three "_"-separated fields, are skipped rather than aborting the whole scan.
	A speaker with no usable files does not appear in the result.

	Raises whatever `os.listdir` raises if `root` itself cannot be listed.
	"""
	data = {}
	for speaker_id in os.listdir(root):
		speaker_dir = os.path.join(root, speaker_id)
		# A stray file next to the speaker folders would make os.listdir() raise.
		if not os.path.isdir(speaker_dir):
			continue
		for filename in os.listdir(speaker_dir):
			parts = filename.split("_")
			# Files outside the naming scheme carry no book / sub id to record.
			if len(parts) < 3:
				continue
			book_id, subid = parts[1], parts[2]
			data.setdefault(speaker_id, {}).setdefault(book_id, []).append(subid)
	return data


def find_duplicates(librilight_data, libritts_data):
	"""
	Return the "speaker_id/book_id" pairs that occur in both indexes.

	Both arguments use the layout produced by `index_dataset`. Only the speaker
	and book ids are compared; the sub id lists are not inspected. Each match is
	printed as it is found, and the matches are returned in the iteration order
	of `libritts_data`.
	"""
	duplicates = []
	for speaker_id, books in libritts_data.items():
		librilight_books = librilight_data.get(speaker_id)
		if librilight_books is None:
			continue
		for book_id in books:
			if book_id not in librilight_books:
				continue
			print(f'Duplicate: {speaker_id}/{book_id}')
			duplicates.append(f'{speaker_id}/{book_id}')
	return duplicates


if __name__ == "__main__":
	librilight_data = index_dataset(f'./{librilight_dir}/')
	libritts_data = index_dataset(f'./{libritts_dir}/')

	duplicates = find_duplicates(librilight_data, libritts_data)

	print("Duplicates:", duplicates)
re-adapted process_libritts.py to a 'better' way (better because it processed without needing to shuffle a bunch of things and adapt to cope or something) 2024-08-06 01:34:58 +00:00			`"""`
			`# Helper script to try and detect any duplications between LibriLight and LibriTTS (I don't think there were any)`
			`"""`

added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+books between them, and script to directly phonemize and quantize LibriTTS 2023-08-26 15:21:12 +00:00			`import os`
			`import json`

			`librilight_dir = "LibriLight-6K"`
			`libritts_dir = "LibriTTS-Train"`

			`librilight_data = {}`
			`libritts_data = {}`

			`for speaker_id in os.listdir(f'./{librilight_dir}/'):`
			`for filename in os.listdir(f'./{librilight_dir}/{speaker_id}'):`
			`parts = filename.split("_")`
			`book_id = parts[1]`
			`subid = parts[2]`

			`if speaker_id not in librilight_data:`
			`librilight_data[speaker_id] = {}`
			`if book_id not in librilight_data[speaker_id]:`
			`librilight_data[speaker_id][book_id] = []`
			`librilight_data[speaker_id][book_id].append(subid)`

			`for speaker_id in os.listdir(f'./{libritts_dir}/'):`
			`for filename in os.listdir(f'./{libritts_dir}/{speaker_id}'):`
			`parts = filename.split("_")`
			`book_id = parts[1]`
			`subid = parts[2]`

			`if speaker_id not in libritts_data:`
			`libritts_data[speaker_id] = {}`
			`if book_id not in libritts_data[speaker_id]:`
			`libritts_data[speaker_id][book_id] = []`
			`libritts_data[speaker_id][book_id].append(subid)`

			`duplicates = []`

			`for speaker_id, books in libritts_data.items():`
			`if speaker_id not in librilight_data:`
			`continue`
			`for book_id, _ in books.items():`
			`if book_id not in librilight_data[speaker_id]:`
			`continue`
			`print(f'Duplicate: {speaker_id}/{book_id}')`
			`duplicates.append(f'{speaker_id}/{book_id}')`

			`print("Duplicates:", duplicates)`