2024-08-06 01:34:58 +00:00
|
|
|
"""
|
|
|
|
# Handles processing `facebookresearch/libri-light`'s unlabeled audio into a friendlier hierarchy
|
|
|
|
"""
|
|
|
|
|
2023-08-26 15:21:12 +00:00
|
|
|
import os
|
|
|
|
import json
|
|
|
|
|
2024-08-06 01:34:58 +00:00
|
|
|
# Names of the input dataset directories looked for in the current working
# directory; any that are not present are skipped.
datasets = ["small", "medium", "large", "duplicate"]
# Directory (relative to the current working directory) that receives the
# flattened per-speaker output.
output_dataset = "LibriLight-4K"


def relocate_book(book_dir, speaker_id, output_root):
    """Move every JSON/FLAC pair of one book directory into the speaker's output folder.

    Each ``<name>.json`` in ``book_dir`` is paired with ``<name>.flac`` from the
    same directory. Both files are renamed to
    ``<output_root>/<speaker_id>/<speaker_id>_<book id>_<n>.{json,flac}``, where
    the book id is read from ``['book_meta']['id']`` of the JSON file and ``n``
    counts up from 0 within this book directory.

    Args:
        book_dir: Directory holding the ``.json`` / ``.flac`` pairs of one book.
        speaker_id: Speaker identifier; used as the output sub-directory name
            and as the filename prefix.
        output_root: Root directory of the reorganized dataset.

    Returns:
        The number of JSON/FLAC pairs that were moved.

    Raises:
        FileNotFoundError: If a ``.json`` file has no matching ``.flac``. The
            check happens before either file is moved, so the pair is never
            left half-relocated.
    """
    speaker_out_dir = os.path.join(output_root, speaker_id)
    subid = 0
    moved = 0

    # Sorted so the index handed to each file is reproducible between runs;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(book_dir)):
        if not filename.endswith(".json"):
            continue

        basename = filename[:-5]
        json_path = os.path.join(book_dir, f'{basename}.json')
        flac_path = os.path.join(book_dir, f'{basename}.flac')

        # Validate the pair up front so a missing audio file cannot strand
        # its metadata in the output directory.
        if not os.path.isfile(flac_path):
            raise FileNotFoundError(f'Missing audio file for metadata: {flac_path}')

        # Context manager guarantees the handle is closed before the rename.
        with open(json_path, 'r', encoding="utf-8") as handle:
            book_id = json.load(handle)['book_meta']['id']

        os.makedirs(speaker_out_dir, exist_ok=True)

        # Advance past indices that are already taken (for example by another
        # directory that yields the same speaker and book id) instead of
        # letting os.rename() overwrite or fail on an existing destination.
        while True:
            target_stem = os.path.join(speaker_out_dir, f'{speaker_id}_{book_id}_{subid}')
            if not (os.path.exists(f'{target_stem}.json') or os.path.exists(f'{target_stem}.flac')):
                break
            subid += 1

        os.rename(json_path, f'{target_stem}.json')
        os.rename(flac_path, f'{target_stem}.flac')

        subid += 1
        moved += 1

    return moved


# Walk <dataset>/<speaker>/<book>/ and flatten it into <output_dataset>/<speaker>/.
for input_dataset in datasets:
    dataset_dir = f'./{input_dataset}/'
    if not os.path.isdir(dataset_dir):
        continue

    for speaker_id in os.listdir(dataset_dir):
        speaker_dir = os.path.join(dataset_dir, speaker_id)
        if not os.path.isdir(speaker_dir):
            continue

        for book_name in os.listdir(speaker_dir):
            book_dir = os.path.join(speaker_dir, book_name)
            # Same guard as for speakers: ignore stray files at the book level.
            if not os.path.isdir(book_dir):
                continue

            relocate_book(book_dir, speaker_id, f'./{output_dataset}/')