44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
|
import argparse
|
||
|
import functools
|
||
|
import os
|
||
|
import pathlib
|
||
|
from multiprocessing.pool import ThreadPool
|
||
|
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
|
||
|
'''
|
||
|
Helper function for scripts that iterate over large sets of files. Defines command-line arguments
|
||
|
for operating over a large set of files, then handles setting up a worker queue system to operate
|
||
|
on those files. You need to provide your own process_file_fn.
|
||
|
|
||
|
process_file_fn expected signature:
|
||
|
(path, output_path)
|
||
|
'''
|
||
|
def do_to_files(process_file_fn):
    '''
    Run process_file_fn over every path under --path that matches --glob.

    Command-line arguments (parsed with argparse):
      --path         root directory that is searched recursively (required)
      --glob         pattern handed to pathlib.Path.rglob (required)
      --out          passed through unchanged to process_file_fn as output_path
      --resume       number of leading entries of the sorted file list to skip
                     (default 0)
      --num_workers  size of the thread pool; 0 or less processes the files one
                     by one in the calling thread (default 0)

    process_file_fn is called as process_file_fn(path, output_path=out), where
    path is a str. Its return value is ignored.
    '''
    parser = argparse.ArgumentParser()
    # --path and --glob are fed straight into pathlib, which cannot work with
    # None, so a missing value is reported as a usage error up front.
    parser.add_argument('--path', required=True)
    parser.add_argument('--glob', required=True)
    parser.add_argument('--out')
    # Parsed as int: a str cannot be used as a slice index.
    parser.add_argument('--resume', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=0)

    args = parser.parse_args()
    if args.resume < 0:
        # A negative slice start would silently keep only the tail of the list.
        parser.error('--resume must be >= 0')

    root = pathlib.Path(args.path)
    # Sorted so that --resume skips the same files on every run; the order in
    # which rglob yields its matches is not something to rely on.
    files = sorted(str(f) for f in root.rglob(args.glob))
    files = files[args.resume:]

    pfn = functools.partial(process_file_fn, output_path=args.out)

    if args.num_workers > 0:
        with ThreadPool(args.num_workers) as pool:
            # Drain the iterator only to drive the progress bar; results are
            # discarded instead of being collected into a list.
            for _ in tqdm(pool.imap(pfn, files), total=len(files)):
                pass
    else:
        for f in tqdm(files):
            pfn(f)
|