Improvements to spleeter_filter_noisy_clips

This commit is contained in:
James Betker 2021-10-07 21:28:00 -06:00
parent 33120cb35c
commit b94e587f46
3 changed files with 162 additions and 37 deletions

View File

@ -0,0 +1,27 @@
import os
import shutil
import argparse

from tqdm import tqdm

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', metavar='in', type=str)
    parser.add_argument('basis', metavar='basis', type=str)
    parser.add_argument('garbage', metavar='garbage', type=str)
    args = parser.parse_args()

    print(f"Moving files from {args.input} to {args.garbage}")
    os.makedirs(args.garbage, exist_ok=True)
    with open(args.input) as f:
        lines = f.readlines()
    for line in tqdm(lines):
        line = line.strip()
        assert args.basis in line
        # Re-root the path: strip the basis prefix (and its leading separator)
        # and join the remainder onto the garbage directory.
        movefile = os.path.join(args.garbage, line.replace(args.basis, '')[1:])
        print(f'{line} -> {movefile}')
        os.makedirs(os.path.dirname(movefile), exist_ok=True)
        shutil.move(line, movefile)
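
Assuming the new script above is saved as move_rejects.py (its filename is not shown in this view), a hypothetical invocation looks like:

    python move_rejects.py rejects.txt /data/audio /data/garbage

A listed file /data/audio/podcast/clip1.wav would then be moved to /data/garbage/podcast/clip1.wav: the basis prefix is stripped and the remaining relative path is preserved under the garbage directory.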

View File

@ -1,11 +1,64 @@
from typing import Optional
import torch
import torch.nn as nn
from scipy.signal.windows import hann
from spleeter.audio.adapter import AudioAdapter
from torch.utils.data import Dataset
import numpy as np
import librosa
from data.util import find_audio_files
def spleeter_stft(
        data: np.ndarray, inverse: bool = False, length: Optional[int] = None
) -> np.ndarray:
    """
    Single entrypoint for both stft and istft. This computes stft and
    istft with librosa on stereo data. The two channels are processed
    separately and are concatenated together in the result. The
    expected input formats are: (n_samples, 2) for stft and (T, F, 2)
    for istft.

    Parameters:
        data (numpy.array):
            Array with either the waveform or the complex spectrogram,
            depending on the parameter inverse.
        inverse (bool):
            (Optional) Whether to compute the istft (True) or the stft
            (False, default).
        length (Optional[int]):
            (Optional) Number of samples to which the inverse transform
            is trimmed. Required when inverse is True.

    Returns:
        numpy.ndarray:
            Stereo data as numpy array for the transform. The channels
            are stored in the last dimension.
    """
    assert not (inverse and length is None)
    data = np.asfortranarray(data)
    N = 4096
    H = 1024
    win = hann(N, sym=False)
    fstft = librosa.core.istft if inverse else librosa.core.stft
    win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N}
    n_channels = data.shape[-1]
    out = []
    for c in range(n_channels):
        # Zero-pad the waveform by one window on each side before the stft;
        # for the istft, transpose the (T, F) spectrogram into librosa's
        # (F, T) layout.
        d = (
            np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,))))
            if not inverse
            else data[:, :, c].T
        )
        s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
        if inverse:
            # Trim off the padding that was added before the forward stft.
            s = s[N: N + length]
        s = np.expand_dims(s.T, 2 - inverse)
        out.append(s)
    if len(out) == 1:
        return out[0]
    return np.concatenate(out, axis=2 - inverse)
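
As a quick sanity check of the shape contract described in the docstring, a round trip through the transform can be sketched as follows (a minimal sketch; the module name spleeter_dataset is assumed, since the file path is not shown in this view):

    import numpy as np
    from spleeter_dataset import spleeter_stft  # hypothetical module name

    wave = np.random.randn(22050, 2).astype(np.float32)  # 1s of stereo audio
    spec = spleeter_stft(wave)  # complex spectrogram, (T, 2049, 2) for n_fft=4096
    recon = spleeter_stft(spec, inverse=True, length=wave.shape[0])
    assert recon.shape == wave.shape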
class SpleeterDataset(Dataset):
    def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
        self.files = find_audio_files(src_dir, include_nonwav=True)
@ -17,22 +70,15 @@ class SpleeterDataset(Dataset):
    def __getitem__(self, item):
        file = self.files[item]
-       try:
-           wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
-           assert sample_rate == self.sample_rate
-           wave = wave[:, 0]  # strip off channels
-           wave = torch.tensor(wave)
-       except:
-           wave = torch.zeros(self.sample_rate * self.max_duration)
-           print(f"Error with {file}")
-       original_duration = wave.shape[0]
-       padding_needed = self.sample_rate * self.max_duration - original_duration
-       if padding_needed > 0:
-           wave = nn.functional.pad(wave, (0, padding_needed))
+       wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
+       assert sample_rate == self.sample_rate
+       stft = torch.tensor(spleeter_stft(wave))
+       # TODO: pad this up so it can be batched.
        return {
            'path': file,
-           'wave': wave,
-           'duration': original_duration,
+           'stft': stft,
+           #'duration': original_duration,
        }

    def __len__(self):

View File

@ -1,3 +1,5 @@
+from math import ceil
+
from scipy.io import wavfile
import os
@ -5,6 +7,7 @@ import argparse
import numpy as np
from scipy.io import wavfile
from spleeter.separator import Separator
+from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from spleeter.audio.adapter import AudioAdapter
from tqdm import tqdm
@ -74,6 +77,63 @@ def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file)
    return paths, sizes
class SpleeterDataset(Dataset):
    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            for i, f in enumerate(self.files):
                if resume in f:
                    break
            assert i < len(self.files)
            self.files = self.files[i:]

        self.loader = AudioAdapter.default()

    def __len__(self):
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        item = item * self.batch_sz
        wavs = None
        files = []
        ends = []
        for k in range(self.batch_sz):
            ind = k + item
            if ind >= len(self.files):
                break

            #try:
            wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
            assert sr == 22050
            # Get rid of all channels except one.
            if wav.shape[1] > 1:
                wav = wav[:, 0]

            # Concatenate this clip onto the batch waveform and record where it
            # ends, so the separated output can be split back up per file.
            if wavs is None:
                wavs = wav
            else:
                wavs = np.concatenate([wavs, wav])
            ends.append(wavs.shape[0])
            files.append(self.files[ind])
            #except:
            #    print(f'Error loading {self.files[ind]}')
        return {
            'audio': wavs,
            'files': files,
            'ends': ends
        }
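
The batching scheme concatenates every clip in a batch into one long waveform and records the cumulative end offset of each clip, so the separated output can be sliced back apart per file. A small illustration of the bookkeeping (sample values are made up):

    # A batch holding three clips of 5s, 3s and 8s at 22050 Hz:
    ends = [110250, 176400, 352800]  # cumulative sample offsets
    start = 0
    for end in ends:
        clip = audio[start:end]  # recovers one original clip's span
        start = end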
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path')
@ -86,37 +146,29 @@ def main():
    src_dir = args.path
    out_file = args.out
    output_sample_rate = 22050
-   waiting_for_file = args.resume is not None
    resume_file = args.resume
-   audio_loader = AudioAdapter.default()
-   files = find_audio_files(src_dir, include_nonwav=True)
+   loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
+                                       max_duration=10, partition=args.partition, partition_size=args.partition_size,
+                                       resume=resume_file), batch_size=1, num_workers=1)
-   # Partition files if needed.
-   if args.partition_size is not None:
-       psz = int(args.partition_size)
-       prt = int(args.partition)
-       files = files[prt*psz:(prt+1)*psz]
    #separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
    separator = Separator('spleeter:2stems')
    unacceptable_files = open(out_file, 'a')
-   for e, path in enumerate(tqdm(files)):
-       if waiting_for_file and resume_file not in path:
-           continue
-       waiting_for_file = False
-       print(f"{e}: Processing {path}")
-       spleeter_ld, sr = audio_loader.load(path, sample_rate=output_sample_rate)
-       sep = separator.separate(spleeter_ld)
+   for batch in tqdm(loader):
+       audio, files, ends = batch['audio'], batch['files'], batch['ends']
+       sep = separator.separate(audio.squeeze(0).numpy())
        vocals = sep['vocals']
        bg = sep['accompaniment']
-       vmax = np.abs(vocals).mean()
-       bmax = np.abs(bg).mean()
-       # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
-       ratio = vmax / (bmax+.0000001)
-       if ratio < 25:  # These values were derived empirically
-           unacceptable_files.write(f'{path}\n')
+       start = 0
+       for path, end in zip(files, ends):
+           vmax = np.abs(vocals[start:end]).mean()
+           bmax = np.abs(bg[start:end]).mean()
+           start = end
+           # Only keep a clip if the ratio of vocal signal to background noise is high enough.
+           ratio = vmax / (bmax + .0000001)
+           if ratio < 18:  # These values were derived empirically.
+               unacceptable_files.write(f'{path[0]}\n')
+           unacceptable_files.flush()
    unacceptable_files.close()
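
Taken together with the new mover script above, the intended workflow appears to be: run this filter over a dataset to append low vocal-ratio clips to the output list, then sweep those clips out of the dataset. A hypothetical session (the script names and the --out flag are assumptions, since the full argument parser is truncated in this view):

    python spleeter_filter_noisy_clips.py --path /data/audio --out rejects.txt
    python move_rejects.py rejects.txt /data/audio /data/garbage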