35 lines
829 B
Python
Executable File
35 lines
829 B
Python
Executable File
import os
|
|
import sys
|
|
|
|
def split_by_text_length(lines, cap):
    """Partition dataset lines by the length of their text field.

    Each non-blank entry is expected to look like ``filename|text[|...]``.

    Args:
        lines: Iterable of raw dataset lines. Surrounding whitespace,
            including the line terminator, is stripped before measuring.
        cap: Minimum text length, in characters, for an entry to be kept
            in the training list.

    Returns:
        A ``(training, validation)`` tuple of lists of stripped lines.
        Entries whose text is shorter than ``cap`` go to ``validation``;
        all others go to ``training``.

    Raises:
        ValueError: If a non-blank line contains no ``|`` separator.
    """
    training = []
    validation = []

    for number, raw in enumerate(lines, start=1):
        entry = raw.strip()
        if not entry:
            # Blank lines (e.g. an extra newline at EOF) carry no sample.
            continue

        fields = entry.split("|")
        if len(fields) < 2:
            raise ValueError(f"Malformed dataset line {number}: {entry!r}")

        # Measure the text of the stripped entry so the line terminator is
        # never counted; otherwise the final line of the file (which has no
        # newline) would be judged differently from every other line.
        text = fields[1]
        if len(text) < cap:
            validation.append(entry)
        else:
            training.append(entry)

    return training, validation


def main(argv=None):
    """Split ``./training/<name>/train.txt`` by text length.

    Writes the entries with text of at least ``cap`` characters to
    ``train_culled.txt`` and the shorter ones to ``validation.txt`` in the
    same directory, then prints how many lines were moved to validation.

    Args:
        argv: Command-line arguments ``[name, cap]``. Defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If fewer than two arguments are supplied.
        Exception: If the dataset directory or ``train.txt`` is missing.
        ValueError: If ``cap`` is not an integer or a line is malformed.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        raise SystemExit(
            f"Usage: {sys.argv[0]} <dataset name> <minimum text length>"
        )

    indir = f'./training/{args[0]}/'
    cap = int(args[1])

    if not os.path.isdir(indir):
        raise Exception(f"Invalid directory: {indir}")

    train_path = f'{indir}/train.txt'
    if not os.path.exists(train_path):
        raise Exception(f"Missing dataset: {train_path}")

    # Iterate the file object directly; no need to hold a second copy of
    # every line via readlines().
    with open(train_path, 'r', encoding="utf-8") as f:
        training, validation = split_by_text_length(f, cap)

    with open(f'{indir}/train_culled.txt', 'w', encoding="utf-8") as f:
        f.write("\n".join(training))

    with open(f'{indir}/validation.txt', 'w', encoding="utf-8") as f:
        f.write("\n".join(validation))

    print(f"Culled {len(validation)} lines")


if __name__ == "__main__":
    main()