Add filtration script for finding resampled clips (or phone calls)
This commit is contained in:
parent
756b4dad09
commit
3c0f2fbb21
|
@ -0,0 +1,23 @@
|
|||
import torch
|
||||
import torchaudio
|
||||
|
||||
from data.audio.unsupervised_audio_dataset import load_audio
|
||||
from scripts.do_to_files import do_to_files
|
||||
|
||||
|
||||
def get_spec_mags(clip):
    '''
    Returns the magnitudes of the last 2000 STFT bins (dim 1 of the torch.stft output) of a clip.

    The STFT is computed with n_fft=22000 and hop_length=1024. Only index 0 of the leading
    dimension of the STFT result is kept, so any further entries along that dimension are ignored.

    clip: audio tensor; assumed to be shaped (channels, samples) - TODO confirm against load_audio.
    Returns a real-valued tensor of shape (2000, num_frames).
    '''
    spec = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
    upper_bins = spec[0, -2000:, :]
    real_part = upper_bins.real
    imag_part = upper_bins.imag
    # Magnitude of each complex coefficient: sqrt(re^2 + im^2).
    return torch.sqrt(real_part ** 2 + imag_part ** 2)
|
||||
|
||||
|
||||
def filter_no_hifreq_data(path, output_path):
    '''
    Prints "Ignore <path>" when the mean of get_spec_mags() for the clip at `path` is below 0.08.

    Per the script's purpose, such clips are candidates for being resampled audio (or phone
    calls). Clips whose last dimension holds fewer than 22050 elements are skipped silently.

    path: audio file to inspect; loaded via load_audio(path, 22050) (second argument is
          presumably the target sample rate - confirm against load_audio).
    output_path: unused; accepted so the function matches the (path, output_path) signature
                 that do_to_files expects.
    '''
    audio = load_audio(path, 22050)
    num_samples = audio.shape[-1]
    if num_samples >= 22050:
        hifreq_mags = get_spec_mags(audio)
        if hifreq_mags.mean() < .08:
            print(f"Ignore {path}")
|
||||
|
||||
# Script entry point: do_to_files parses the command line and applies the filter to every
# matching file.
if __name__ == "__main__":
    do_to_files(filter_no_hifreq_data)
|
44
codes/scripts/do_to_files.py
Normal file
44
codes/scripts/do_to_files.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
import argparse
|
||||
import functools
|
||||
import os
|
||||
import pathlib
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def do_to_files(process_file_fn):
    '''
    Helper function for scripts that iterate over large sets of files. Defines command-line arguments
    for operating over a large set of files, then handles setting up a worker queue system to operate
    on those files. You need to provide your own process_file_fn.

    process_file_fn expected signature:
      (path, output_path)

    Command-line arguments:
      --path         Root directory that is searched recursively (required).
      --glob         Glob pattern passed to pathlib.Path.rglob (required).
      --out          Output directory; created if missing. Optional - when omitted, no directory
                     is created and process_file_fn receives output_path=None.
      --resume       Number of files to skip from the start of the file list (default 0).
      --num_workers  Size of the thread pool. 0 (the default) processes files sequentially in
                     the calling thread.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', required=True)
    parser.add_argument('--glob', required=True)
    parser.add_argument('--out', default=None)
    # Parsed as ints up front: slicing with a str (files['5':]) and int(None) both raise TypeError.
    parser.add_argument('--resume', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=0)
    args = parser.parse_args()

    # Not every process_file_fn writes output, so only create the directory when one was given.
    if args.out is not None:
        os.makedirs(args.out, exist_ok=True)

    files = [str(f) for f in pathlib.Path(args.path).rglob(args.glob)]
    files = files[args.resume:]

    pfn = functools.partial(process_file_fn, output_path=args.out)
    if args.num_workers > 0:
        with ThreadPool(args.num_workers) as pool:
            # imap is lazy; list() drains it so every file is processed while tqdm reports progress.
            list(tqdm(pool.imap(pfn, files), total=len(files)))
    else:
        for f in tqdm(files):
            pfn(f)
|
Loading…
Reference in New Issue
Block a user