ai-voice-cloning/src/cull_dataset.py

35 lines
829 B
Python
Executable File

import os
import sys
# Split a voice's training list into a culled training set and a validation
# set, based on the length of each entry's text field.
#
# Usage: cull_dataset.py <voice> <cap>
#   <voice>  name of the sub-directory under ./training/ that holds train.txt
#   <cap>    entries whose text is shorter than this many characters are
#            written to validation.txt; the rest go to train_culled.txt


def partition_lines(lines, cap):
    """Split dataset lines by the length of their text field.

    Each non-blank line is expected to look like ``filename|text[|...]``;
    only the second ``|``-separated field is measured.

    Args:
        lines: Iterable of raw lines read from ``train.txt``.
        cap: Entries whose text field is shorter than ``cap`` characters go
            to the validation list; all others go to the training list.

    Returns:
        A ``(training, validation)`` tuple of lists holding the stripped
        lines, each in input order.

    Raises:
        ValueError: If a non-blank line contains no ``|`` separator.
    """
    training = []
    validation = []
    for number, line in enumerate(lines, start=1):
        # Strip before measuring so a trailing newline is never counted as
        # part of the text; otherwise the result would depend on whether the
        # entry happens to be the last line of the file.
        entry = line.strip()
        if not entry:
            # Blank lines (e.g. a trailing newline at EOF) carry no data.
            continue
        fields = entry.split("|")
        if len(fields) < 2:
            raise ValueError(
                f"Malformed dataset entry on line {number}: {entry!r}"
            )
        if len(fields[1]) < cap:
            validation.append(entry)
        else:
            training.append(entry)
    return training, validation


def main(argv=None):
    """Cull ``./training/<voice>/train.txt`` into two output files.

    Writes ``train_culled.txt`` and ``validation.txt`` next to the input
    file and prints how many entries were moved to the validation set.

    Args:
        argv: Command-line arguments without the program name, i.e.
            ``[voice, cap]``. Defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: If fewer than two arguments are given.
        ValueError: If ``cap`` is not an integer or a dataset line is
            malformed.
        Exception: If the voice directory or its ``train.txt`` is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        raise SystemExit(f"Usage: {sys.argv[0]} <voice> <cap>")

    indir = f'./training/{args[0]}/'
    cap = int(args[1])

    if not os.path.isdir(indir):
        raise Exception(f"Invalid directory: {indir}")

    dataset_path = os.path.join(indir, 'train.txt')
    if not os.path.exists(dataset_path):
        raise Exception(f"Missing dataset: {indir}/train.txt")

    # Partition fully before writing anything, so a malformed input never
    # leaves half-written output files behind.
    with open(dataset_path, 'r', encoding="utf-8") as f:
        training, validation = partition_lines(f, cap)

    outputs = (
        ('train_culled.txt', training),
        ('validation.txt', validation),
    )
    for name, entries in outputs:
        with open(os.path.join(indir, name), 'w', encoding="utf-8") as f:
            f.write("\n".join(entries))

    print(f"Culled {len(validation)} lines")


if __name__ == "__main__":
    main()