Improvements to spleeter_filter_noisy_clips

parent 33120cb35c
commit b94e587f46
@@ -0,0 +1,27 @@
import os
import shutil
import argparse
from tqdm import tqdm


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', metavar='in', type=str)
    parser.add_argument('basis', metavar='basis', type=str)
    parser.add_argument('garbage', metavar='garbage', type=str)
    args = parser.parse_args()
    print(f"Moving files from {args.input} to {args.garbage}")
    os.makedirs(args.garbage, exist_ok=True)

    with open(args.input) as f:
        lines = f.readlines()
    for line in tqdm(lines):
        line = line.strip()
        assert args.basis in line
        # Re-root the listed path under the garbage directory by stripping the
        # basis prefix and its leading separator.
        movefile = os.path.join(args.garbage, line.replace(args.basis, '')[1:])
        print(f'{line} -> {movefile}')
        os.makedirs(os.path.dirname(movefile), exist_ok=True)
        shutil.move(line, movefile)
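As a quick illustration (hypothetical paths, not part of the commit), this is how the script re-roots one listed file under the garbage directory:

import os

line = '/data/audio/clips/foo/bar.wav'   # hypothetical entry from the input list
basis = '/data/audio'                    # hypothetical basis prefix
garbage = '/data/garbage'
movefile = os.path.join(garbage, line.replace(basis, '')[1:])
print(movefile)                          # /data/garbage/clips/foo/bar.wav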
@@ -1,11 +1,64 @@
from typing import Optional

import torch
import torch.nn as nn
from scipy.signal.windows import hann
from spleeter.audio.adapter import AudioAdapter
from torch.utils.data import Dataset
import numpy as np
import librosa

from data.util import find_audio_files


def spleeter_stft(
    data: np.ndarray, inverse: bool = False, length: Optional[int] = None
) -> np.ndarray:
    """
    Single entrypoint for both stft and istft. This computes the stft or
    istft with librosa on stereo data. The two channels are processed
    separately and are concatenated together in the result. The
    expected input formats are: (n_samples, 2) for stft and (T, F, 2)
    for istft.

    Parameters:
        data (numpy.ndarray):
            Array with either the waveform or the complex spectrogram,
            depending on the parameter inverse.
        inverse (bool):
            (Optional) Whether to compute an istft instead of an stft.
        length (Optional[int]):
            (Optional) Length of the reconstructed waveform in samples;
            required when inverse is True.

    Returns:
        numpy.ndarray:
            Stereo data as numpy array for the transform. The channels
            are stored in the last dimension.
    """
    assert not (inverse and length is None)
    data = np.asfortranarray(data)
    N = 4096
    H = 1024
    win = hann(N, sym=False)
    fstft = librosa.core.istft if inverse else librosa.core.stft
    win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N}
    n_channels = data.shape[-1]
    out = []
    for c in range(n_channels):
        # Zero-pad the waveform by one window on each side for the forward
        # transform; transpose the spectrogram to (F, T) for the inverse.
        d = (
            np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,))))
            if not inverse
            else data[:, :, c].T
        )
        s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
        if inverse:
            # Trim the zero padding that was added before the forward stft.
            s = s[N: N + length]
        s = np.expand_dims(s.T, 2 - inverse)
        out.append(s)
    if len(out) == 1:
        return out[0]
    return np.concatenate(out, axis=2 - inverse)
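A minimal round-trip sketch (synthetic input, not part of the commit) showing how the two call forms pair up:

import numpy as np

# One second of hypothetical stereo audio at 22050 Hz, shape (n_samples, 2).
wave = np.random.randn(22050, 2).astype(np.float32)
spec = spleeter_stft(wave)                                       # (T, F, 2) complex spectrogram
recon = spleeter_stft(spec, inverse=True, length=wave.shape[0])  # back to (22050, 2)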

class SpleeterDataset(Dataset):
    def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
        self.files = find_audio_files(src_dir, include_nonwav=True)
@@ -17,22 +70,15 @@ class SpleeterDataset(Dataset):

    def __getitem__(self, item):
        file = self.files[item]
        try:
            wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
            assert sample_rate == self.sample_rate
            wave = wave[:, 0]  # strip off channels
            wave = torch.tensor(wave)
        except:
            wave = torch.zeros(self.sample_rate * self.max_duration)
            print(f"Error with {file}")
        original_duration = wave.shape[0]
        padding_needed = self.sample_rate * self.max_duration - original_duration
        if padding_needed > 0:
            wave = nn.functional.pad(wave, (0, padding_needed))
        wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
        assert sample_rate == self.sample_rate
        stft = torch.tensor(spleeter_stft(wave))
        # TODO: pad this up so it can be batched.
        return {
            'path': file,
            'wave': wave,
            'duration': original_duration,
            'stft': stft,
            #'duration': original_duration,
        }

    def __len__(self):
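The TODO above concerns batching variable-length spectrograms. One way to address it, sketched here as a hypothetical collate helper (not part of the commit), is to zero-pad each stft along the time axis before stacking:

import torch

def pad_time(stft, max_t):
    # Zero-pad a (T, F, 2) spectrogram along the time axis up to max_t frames.
    dt = max_t - stft.shape[0]
    if dt <= 0:
        return stft
    zeros = torch.zeros((dt, *stft.shape[1:]), dtype=stft.dtype)
    return torch.cat([stft, zeros], dim=0)

def collate_spleeter(items):
    # Usage sketch: DataLoader(dataset, batch_size=4, collate_fn=collate_spleeter)
    max_t = max(i['stft'].shape[0] for i in items)
    return {
        'path': [i['path'] for i in items],
        'wave': torch.stack([i['wave'] for i in items]),
        'duration': torch.tensor([i['duration'] for i in items]),
        'stft': torch.stack([pad_time(i['stft'], max_t) for i in items]),
    }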
@@ -1,3 +1,5 @@
from math import ceil

from scipy.io import wavfile
import os
@@ -5,6 +7,7 @@ import argparse
import numpy as np
from scipy.io import wavfile
from spleeter.separator import Separator
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from spleeter.audio.adapter import AudioAdapter
from tqdm import tqdm
@@ -74,6 +77,63 @@ def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file)
    return paths, sizes


class SpleeterDataset(Dataset):
    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            for i, f in enumerate(self.files):
                if resume in f:
                    break
            assert i < len(self.files)
            self.files = self.files[i:]
        self.loader = AudioAdapter.default()

    def __len__(self):
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        # Each item is a batch of batch_sz files concatenated into one long
        # waveform; 'ends' records where each file's audio stops.
        item = item * self.batch_sz
        wavs = None
        files = []
        ends = []
        for k in range(self.batch_sz):
            ind = k + item
            if ind >= len(self.files):
                break

            #try:
            wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
            assert sr == 22050
            # Get rid of all channels except one.
            if wav.shape[1] > 1:
                wav = wav[:, 0]

            if wavs is None:
                wavs = wav
            else:
                wavs = np.concatenate([wavs, wav])
            ends.append(wavs.shape[0])
            files.append(self.files[ind])
            #except:
            #    print(f'Error loading {self.files[ind]}')
        return {
            'audio': wavs,
            'files': files,
            'ends': ends
        }
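A toy illustration (synthetic lengths, not part of the commit) of the 'ends' bookkeeping: each entry marks where one file's audio stops in the concatenated batch, so the separated output can be sliced back per file.

import numpy as np

clips = [np.zeros(100), np.zeros(250), np.zeros(80)]
wavs, ends = None, []
for wav in clips:
    wavs = wav if wavs is None else np.concatenate([wavs, wav])
    ends.append(wavs.shape[0])
print(ends)  # [100, 350, 430]

start = 0
for end in ends:
    clip = wavs[start:end]  # recover the span belonging to one file
    start = end
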
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path')
@@ -86,37 +146,29 @@ def main():
    src_dir = args.path
    out_file = args.out
    output_sample_rate = 22050
    waiting_for_file = args.resume is not None
    resume_file = args.resume

    audio_loader = AudioAdapter.default()
    files = find_audio_files(src_dir, include_nonwav=True)
    loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
                                        max_duration=10, partition=args.partition, partition_size=args.partition_size,
                                        resume=resume_file), batch_size=1, num_workers=1)

    # Partition files if needed.
    if args.partition_size is not None:
        psz = int(args.partition_size)
        prt = int(args.partition)
        files = files[prt * psz:(prt + 1) * psz]

    #separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
    separator = Separator('spleeter:2stems')
    unacceptable_files = open(out_file, 'a')
    for e, path in enumerate(tqdm(files)):
        if waiting_for_file and resume_file not in path:
            continue
        waiting_for_file = False
        print(f"{e}: Processing {path}")
        spleeter_ld, sr = audio_loader.load(path, sample_rate=output_sample_rate)
        sep = separator.separate(spleeter_ld)
    for batch in tqdm(loader):
        audio, files, ends = batch['audio'], batch['files'], batch['ends']
        sep = separator.separate(audio.squeeze(0).numpy())
        vocals = sep['vocals']
        bg = sep['accompaniment']
        vmax = np.abs(vocals).mean()
        bmax = np.abs(bg).mean()

        # Keep the clip only if the ratio of vocal energy to background energy
        # is high enough.
        ratio = vmax / (bmax + .0000001)
        if ratio < 25:  # These values were derived empirically
            unacceptable_files.write(f'{path}\n')
        start = 0
        for path, end in zip(files, ends):
            vmax = np.abs(vocals[start:end]).mean()
            bmax = np.abs(bg[start:end]).mean()
            start = end

            # Record the clip as unacceptable if the ratio of vocal energy to
            # background energy is too low.
            ratio = vmax / (bmax + .0000001)
            if ratio < 18:  # These values were derived empirically
                unacceptable_files.write(f'{path[0]}\n')
                unacceptable_files.flush()

    unacceptable_files.close()
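A worked toy example (synthetic arrays, not from the commit) of the acceptance test applied to one clip's slice:

import numpy as np

# Loud hypothetical vocals against quiet accompaniment.
vocals = np.random.randn(22050) * 0.5
bg = np.random.randn(22050) * 0.01
ratio = np.abs(vocals).mean() / (np.abs(bg).mean() + 1e-7)
print(ratio >= 18)  # True: this clip would be kept rather than logged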