forked from mrq/DL-Art-School
Add filtration script for finding resampled clips (or phone calls)
This commit is contained in:
parent
756b4dad09
commit
3c0f2fbb21
|
@ -0,0 +1,23 @@
|
||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
|
||||||
|
from data.audio.unsupervised_audio_dataset import load_audio
|
||||||
|
from scripts.do_to_files import do_to_files
|
||||||
|
|
||||||
|
|
||||||
|
def get_spec_mags(clip):
    """Return STFT magnitudes for the 2000 highest frequency bins of a clip.

    The STFT is taken with ``n_fft=22000`` and ``hop_length=1024``; only the
    first entry along the leading dimension of the result is kept
    (presumably the first channel/batch item -- confirm against callers).

    Args:
        clip: Audio tensor accepted by ``torch.stft``; indexing below assumes
            a 2-D input such as ``(channels, samples)`` -- TODO confirm.

    Returns:
        Real tensor of shape ``(2000, frames)`` holding ``|STFT|`` for the
        top 2000 frequency bins.
    """
    spectrum = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
    # Keep only the uppermost 2000 frequency bins of the first leading entry.
    upper_band = spectrum[0, -2000:, :]
    # Magnitude of each complex coefficient: sqrt(re^2 + im^2).
    power = upper_band.real ** 2 + upper_band.imag ** 2
    return power.sqrt()
|
||||||
|
|
||||||
|
|
||||||
|
def filter_no_hifreq_data(path, output_path):
    """Report clips whose high-frequency spectral content is very low.

    Loads the file at ``path`` via ``load_audio(path, 22050)`` (the second
    argument is presumably the target sample rate -- confirm against
    ``load_audio``), and prints ``Ignore <path>`` when the mean magnitude
    returned by ``get_spec_mags`` is below 0.08. Clips with fewer than 22050
    samples along the last axis are skipped silently.

    Args:
        path: Path of the audio file to inspect.
        output_path: Unused here; accepted so the function matches the
            ``(path, output_path)`` signature that ``do_to_files`` calls.

    Returns:
        None. The only effect is the printed message.
    """
    audio = load_audio(path, 22050)
    has_enough_samples = audio.shape[-1] >= 22050
    if has_enough_samples:
        hifreq_mags = get_spec_mags(audio)
        # A low mean magnitude in the top bins flags the clip for exclusion.
        if hifreq_mags.mean() < .08:
            print(f"Ignore {path}")
|
||||||
|
|
||||||
|
# Script entry point: do_to_files reads the command-line arguments and calls
# filter_no_hifreq_data once per matching file.
if __name__ == "__main__":
    do_to_files(filter_no_hifreq_data)
|
44
codes/scripts/do_to_files.py
Normal file
44
codes/scripts/do_to_files.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
import argparse
|
||||||
|
import functools
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
Helper function for scripts that iterate over large sets of files. Defines command-line arguments
|
||||||
|
for operating over a large set of files, then handles setting up a worker queue system to operate
|
||||||
|
on those files. You need to provide your own process_file_fn.
|
||||||
|
|
||||||
|
process_file_fn expected signature:
|
||||||
|
(path, output_path)
|
||||||
|
'''
|
||||||
|
def do_to_files(process_file_fn):
    """Run ``process_file_fn`` on every file matched under a directory tree.

    The files to process and the execution mode are taken from the command
    line (``sys.argv``):

        --path         Root directory to search (required).
        --glob         Pattern handed to ``pathlib.Path.rglob`` (required).
        --out          Output directory. Created if missing and forwarded to
                       ``process_file_fn`` as ``output_path``. Optional; when
                       omitted ``None`` is forwarded and nothing is created.
        --resume       Number of files, in sorted path order, to skip before
                       processing starts. Defaults to 0.
        --num_workers  Size of the thread pool. With 0 (the default) or a
                       negative value the files are processed serially in
                       the calling thread.

    Invalid or missing required arguments make argparse print a usage
    message and exit with status 2.

    Args:
        process_file_fn: Callable invoked as
            ``process_file_fn(path, output_path=out)``, where ``path`` is the
            file path as a ``str``. Its return value is discarded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', required=True)
    parser.add_argument('--glob', required=True)
    parser.add_argument('--out')
    # Parse the numeric options as ints up front: a string --resume cannot be
    # used as a slice index, and an omitted --num_workers must not reach int().
    parser.add_argument('--resume', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=0)
    args = parser.parse_args()

    if args.out is not None:
        os.makedirs(args.out, exist_ok=True)

    # rglob yields entries in an unspecified order; sorting makes the --resume
    # offset refer to the same files on every run.
    files = sorted(str(f) for f in pathlib.Path(args.path).rglob(args.glob))
    files = files[args.resume:]

    pfn = functools.partial(process_file_fn, output_path=args.out)
    if args.num_workers > 0:
        with ThreadPool(args.num_workers) as pool:
            # imap is lazy: drain it so every file is processed and the
            # progress bar advances as results arrive.
            list(tqdm(pool.imap(pfn, files), total=len(files)))
    else:
        for f in tqdm(files):
            pfn(f)
|
Loading…
Reference in New Issue
Block a user