From 3c0f2fbb21c19795733373f31a2c7452f73a534f Mon Sep 17 00:00:00 2001
From: James Betker
Date: Sun, 7 Nov 2021 14:16:11 -0700
Subject: [PATCH] Add filtration script for finding resampled clips (or phone calls)

---
Reviewer note: the file contents below were revised after the original commit
(int-typed --resume/--num_workers, required path options, sorted file list,
docstrings), so the blob hashes on the "index" lines no longer match.

 .../filter_clips_with_no_hifreq_data.py       | 40 +++++++++++++++++
 codes/scripts/do_to_files.py                  | 55 +++++++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py
 create mode 100644 codes/scripts/do_to_files.py

diff --git a/codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py b/codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py
new file mode 100644
index 00000000..413be9d8
--- /dev/null
+++ b/codes/scripts/audio/preparation/filter_clips_with_no_hifreq_data.py
@@ -0,0 +1,40 @@
+import torch
+import torchaudio
+
+from data.audio.unsupervised_audio_dataset import load_audio
+from scripts.do_to_files import do_to_files
+
+
+def get_spec_mags(clip):
+    """Return STFT magnitudes for the highest-frequency bins of a clip.
+
+    The result of torch.stft is indexed as [0, freq, frame], so clip must be
+    2-D and only its first row is used. With n_fft=22000 the one-sided STFT
+    has 11001 frequency bins; the top 2000 are kept. Assuming the clip is
+    sampled at 22050 Hz (TODO confirm against load_audio), bins are about
+    1 Hz apart, so this is roughly the 9-11 kHz band.
+    """
+    stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
+    # abs() of a complex tensor is its magnitude: sqrt(real ** 2 + imag ** 2).
+    return stft[0, -2000:, :].abs()
+
+
+def filter_no_hifreq_data(path, output_path):
+    """Print the path of a clip whose top frequency bins are nearly empty.
+
+    Meant for finding clips that were resampled from a lower sample rate
+    (or phone calls). output_path is unused; it is accepted because
+    do_to_files passes it to every process_file_fn.
+    """
+    # NOTE(review): 22050 is presumably the target sample rate - confirm.
+    clip = load_audio(path, 22050)
+    # Ignore clips shorter than 22050 samples (presumably too short for the
+    # 22000-point STFT to be meaningful).
+    if clip.shape[-1] < 22050:
+        return
+    if get_spec_mags(clip).mean() < .08:
+        print(f"Ignore {path}")
+
+
+if __name__ == '__main__':
+    do_to_files(filter_no_hifreq_data)
diff --git a/codes/scripts/do_to_files.py b/codes/scripts/do_to_files.py
new file mode 100644
index 00000000..0edda8bb
--- /dev/null
+++ b/codes/scripts/do_to_files.py
@@ -0,0 +1,55 @@
+import argparse
+import functools
+import os
+import pathlib
+from multiprocessing.pool import ThreadPool
+
+from tqdm import tqdm
+
+
+def do_to_files(process_file_fn):
+    """Run process_file_fn over every path matching a glob under a directory.
+
+    Helper function for scripts that iterate over large sets of files. Defines
+    command-line arguments for operating over a large set of files, then
+    handles setting up a worker queue system to operate on those files. You
+    need to provide your own process_file_fn.
+
+    process_file_fn expected signature:
+        (path, output_path)
+
+    Command-line arguments:
+        --path         Directory that is searched recursively (required).
+        --glob         Pattern handed to pathlib.Path.rglob (required).
+        --out          Output directory (required); created if missing and
+                       passed to process_file_fn as output_path.
+        --resume       Number of entries to skip from the start of the
+                       sorted path list (default 0).
+        --num_workers  Size of the thread pool; 0 or less processes the
+                       paths serially in the calling thread (default 0).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--path', required=True)
+    parser.add_argument('--glob', required=True)
+    parser.add_argument('--out', required=True)
+    # Parse the numeric options as int: a string is not a valid slice index,
+    # and int(None) raises when the option is omitted.
+    parser.add_argument('--resume', type=int, default=0)
+    parser.add_argument('--num_workers', type=int, default=0)
+    args = parser.parse_args()
+
+    os.makedirs(args.out, exist_ok=True)
+    # rglob returns paths in no particular order; sort them so that --resume
+    # skips the same entries on every run.
+    files = sorted(str(f) for f in pathlib.Path(args.path).rglob(args.glob))
+    files = files[args.resume:]
+    pfn = functools.partial(process_file_fn, output_path=args.out)
+    if args.num_workers > 0:
+        with ThreadPool(args.num_workers) as pool:
+            # Consume the results so we wait for every path to finish before
+            # the pool is torn down, and so the progress bar advances.
+            for _ in tqdm(pool.imap(pfn, files), total=len(files)):
+                pass
+    else:
+        for f in tqdm(files):
+            pfn(f)