"""Helper for scripts that iterate over large sets of files.

Defines command-line arguments for operating over a large set of files, then
sets up a worker pool to run a caller-supplied function on each of them.

You need to provide your own ``process_file_fn``. Its expected signature is
``process_file_fn(path, output_path)``.
"""
import argparse
import functools
import os
import pathlib
from multiprocessing.pool import ThreadPool

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    def tqdm(iterable, **_kwargs):
        return iterable


def do_to_files(process_file_fn, argv=None):
    """Run ``process_file_fn`` on every file matching a recursive glob.

    Command-line arguments:
        --path         Root directory to search (required).
        --glob         Pattern handed to ``pathlib.Path.rglob`` (required).
        --out          Output directory; created if missing and passed to
                       ``process_file_fn`` as ``output_path`` (required).
        --resume       Number of leading entries of the sorted file list to
                       skip (default 0).
        --num_workers  Size of the thread pool; 0 or less processes the files
                       sequentially in the calling thread (default 0).

    Args:
        process_file_fn: Callable invoked as
            ``process_file_fn(path, output_path=<out>)`` where ``path`` is a
            ``str``.
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``.

    Returns:
        None. Return values of ``process_file_fn`` are discarded; exceptions
        it raises propagate to the caller.
    """
    parser = argparse.ArgumentParser(
        description='Apply a processing function to every file matching a glob.')
    parser.add_argument('--path', required=True,
                        help='Root directory to search recursively.')
    parser.add_argument('--glob', required=True,
                        help='Glob pattern, e.g. "*.wav".')
    parser.add_argument('--out', required=True,
                        help='Output directory (created if missing).')
    # type=int is essential: a string value cannot be used as a slice index.
    parser.add_argument('--resume', type=int, default=0,
                        help='Skip this many files from the start of the sorted list.')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='Thread pool size; 0 processes files sequentially.')
    args = parser.parse_args(argv)

    os.makedirs(args.out, exist_ok=True)

    # rglob yields entries in filesystem order, which is not stable between
    # runs; sort so that a --resume offset always refers to the same files.
    files = sorted(str(f) for f in pathlib.Path(args.path).rglob(args.glob))
    files = files[args.resume:]

    pfn = functools.partial(process_file_fn, output_path=args.out)
    if args.num_workers > 0:
        with ThreadPool(args.num_workers) as pool:
            # Drain the iterator to drive the pool (and surface worker
            # exceptions) without keeping the results in memory.
            for _ in tqdm(pool.imap(pfn, files), total=len(files)):
                pass
    else:
        for f in tqdm(files):
            pfn(f)