"""
# Helper script to try to detect any duplicated speaker/book pairs between LibriLight and LibriTTS (I don't think there were any)
"""

import os
import json

librilight_dir = "LibriLight-6K"
libritts_dir = "LibriTTS-Train"


def collect_book_index(dataset_dir):
	"""Index the files of a dataset by speaker and book.

	The dataset is read as ``<dataset_dir>/<speaker_id>/<filename>``, where each
	file name is split on ``_``: the second field is taken as the book id and
	the third field as the sub id.

	Args:
		dataset_dir: Path of the dataset folder (relative paths are resolved
			against the current working directory).

	Returns:
		A dict mapping ``speaker_id -> book_id -> [subid, ...]``. The sub id is
		stored exactly as it appears in the file name, so it may still carry
		the file extension. Speakers without any matching file are omitted.

	Raises:
		FileNotFoundError: If ``dataset_dir`` does not exist.
	"""
	index = {}
	# os.listdir() returns entries in arbitrary order; sort so that the
	# resulting report is reproducible between runs and machines.
	for speaker_id in sorted(os.listdir(dataset_dir)):
		speaker_path = os.path.join(dataset_dir, speaker_id)
		# Stray files sitting next to the speaker folders cannot be listed
		# and would otherwise abort the scan with NotADirectoryError.
		if not os.path.isdir(speaker_path):
			continue
		for filename in sorted(os.listdir(speaker_path)):
			parts = filename.split("_")
			# A name without both a book id and a sub id field cannot be
			# indexed; skip it instead of failing with IndexError.
			if len(parts) < 3:
				continue
			book_id, subid = parts[1], parts[2]
			index.setdefault(speaker_id, {}).setdefault(book_id, []).append(subid)
	return index


def find_duplicate_books(reference, candidate):
	"""List the speaker/book pairs of ``candidate`` that also exist in ``reference``.

	Args:
		reference: Index as returned by ``collect_book_index``.
		candidate: Index as returned by ``collect_book_index``.

	Returns:
		A list of ``"<speaker_id>/<book_id>"`` strings, in the iteration order
		of ``candidate``. Only speaker and book ids are compared; sub ids are
		not taken into account.
	"""
	found = []
	for speaker_id, books in candidate.items():
		reference_books = reference.get(speaker_id)
		if reference_books is None:
			continue
		for book_id in books:
			if book_id in reference_books:
				found.append(f'{speaker_id}/{book_id}')
	return found


# Only touch the filesystem when executed as a script, so that importing this
# file does not fail when the dataset folders are absent.
if __name__ == "__main__":
	librilight_data = collect_book_index(librilight_dir)
	libritts_data = collect_book_index(libritts_dir)

	duplicates = find_duplicate_books(librilight_data, libritts_data)
	for duplicate in duplicates:
		print(f'Duplicate: {duplicate}')

	print("Duplicates:", duplicates)