forked from mrq/DL-Art-School
Improvements to spleeter_filter_noisy_clips
This commit is contained in:
parent
33120cb35c
commit
b94e587f46
codes/scripts/audio/preparation
|
@ -0,0 +1,27 @@
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import argparse
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
def garbage_destination(path, basis, garbage):
    """Return where ``path`` should be moved to inside ``garbage``.

    The first occurrence of ``basis`` is removed from ``path`` and any path
    separators left at the front of the remainder are stripped, so the result
    is the same whether or not ``basis`` was given with a trailing separator.
    The remainder is then joined onto ``garbage``.

    Args:
        path: Path of the file to move, as read from the input list.
        basis: Portion of ``path`` that must not be reproduced under
            ``garbage``.
        garbage: Directory that receives the moved files.

    Returns:
        The destination path underneath ``garbage``.

    Raises:
        ValueError: If ``basis`` does not occur in ``path``.
    """
    if basis not in path:
        raise ValueError(f'{path!r} does not contain the basis {basis!r}')
    # Strip separators rather than blindly dropping one character: a basis
    # that already ends in a separator leaves nothing to drop.
    relative = path.replace(basis, '', 1).lstrip('/\\')
    return os.path.join(garbage, relative)


if __name__ == '__main__':
    # Usage: <script> <list file> <basis> <garbage dir>
    # Every path listed in the input file is moved underneath the garbage
    # directory, keeping the part of the path that follows the basis.
    parser = argparse.ArgumentParser()
    parser.add_argument('input', metavar='in', type=str)
    parser.add_argument('basis', metavar='basis', type=str)
    parser.add_argument('garbage', metavar='garbage', type=str)
    args = parser.parse_args()
    print(f"Moving files from {args.input} to {args.garbage}")
    os.makedirs(args.garbage, exist_ok=True)

    with open(args.input) as f:
        # Blank lines (e.g. a trailing newline at the end of the list) carry
        # no path and are skipped instead of aborting the run.
        lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

    for line in tqdm(lines):
        movefile = garbage_destination(line, args.basis, args.garbage)
        print(f'{line} -> {movefile}')
        os.makedirs(os.path.dirname(movefile), exist_ok=True)
        shutil.move(line, movefile)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,64 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
from scipy.signal.windows import hann
|
||||||
from spleeter.audio.adapter import AudioAdapter
|
from spleeter.audio.adapter import AudioAdapter
|
||||||
from torch.utils.data import Dataset
|
from torch.utils.data import Dataset
|
||||||
|
import numpy as np
|
||||||
|
import librosa
|
||||||
|
|
||||||
from data.util import find_audio_files
|
from data.util import find_audio_files
|
||||||
|
|
||||||
|
|
||||||
|
def spleeter_stft(
    data: np.ndarray, inverse: bool = False, length: Optional[int] = None
) -> np.ndarray:
    """
    Run librosa's stft (or istft when `inverse` is set) on every channel.

    Each channel found in the last dimension of `data` is transformed on
    its own with a 4096-sample periodic Hann window and a hop of 1024
    samples. The per-channel results are stacked back together along a
    channel axis. Expected input layouts are (n_samples, n_channels) for
    the forward transform and (T, F, n_channels) for the inverse one.

    Parameters:
        data (numpy.ndarray):
            Waveform (forward) or complex spectrogram (inverse).
        inverse (bool):
            (Optional) Compute the istft instead of the stft.
        length (Optional[int]):
            Required when `inverse` is set. Number of samples kept from
            the reconstructed signal after its first 4096 samples have
            been dropped (the forward transform pads 4096 zeros on both
            ends of each channel).

    Returns:
        numpy.ndarray:
            Transformed data with the channels in the last dimension.
            When there is a single channel, that channel's result is
            returned directly.
    """
    assert not (inverse and length is None)
    data = np.asfortranarray(data)
    frame_size = 4096
    hop = 1024
    window = hann(frame_size, sym=False)
    # Channels land on axis 2 for spectrograms and axis 1 for waveforms.
    channel_axis = 2 - inverse

    per_channel = []
    for ch in range(data.shape[-1]):
        if inverse:
            transformed = librosa.core.istft(
                data[:, :, ch].T,
                hop_length=hop,
                window=window,
                center=False,
                win_length=None,
                length=None,
            )
            # Drop the leading padding and keep the requested sample count.
            transformed = transformed[frame_size: frame_size + length]
        else:
            padded = np.concatenate(
                (np.zeros((frame_size,)), data[:, ch], np.zeros((frame_size,)))
            )
            transformed = librosa.core.stft(
                padded,
                hop_length=hop,
                window=window,
                center=False,
                n_fft=frame_size,
            )
        per_channel.append(np.expand_dims(transformed.T, channel_axis))

    if len(per_channel) == 1:
        return per_channel[0]
    return np.concatenate(per_channel, axis=channel_axis)
|
||||||
|
|
||||||
|
|
||||||
class SpleeterDataset(Dataset):
|
class SpleeterDataset(Dataset):
|
||||||
def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
|
def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
|
||||||
self.files = find_audio_files(src_dir, include_nonwav=True)
|
self.files = find_audio_files(src_dir, include_nonwav=True)
|
||||||
|
@ -17,22 +70,15 @@ class SpleeterDataset(Dataset):
|
||||||
|
|
||||||
def __getitem__(self, item):
|
def __getitem__(self, item):
|
||||||
file = self.files[item]
|
file = self.files[item]
|
||||||
try:
|
|
||||||
wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
|
wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
|
||||||
assert sample_rate == self.sample_rate
|
assert sample_rate == self.sample_rate
|
||||||
wave = wave[:,0] # strip off channels
|
stft = torch.tensor(spleeter_stft(wave))
|
||||||
wave = torch.tensor(wave)
|
# TODO: pad this up so it can be batched.
|
||||||
except:
|
|
||||||
wave = torch.zeros(self.sample_rate * self.max_duration)
|
|
||||||
print(f"Error with {file}")
|
|
||||||
original_duration = wave.shape[0]
|
|
||||||
padding_needed = self.sample_rate * self.max_duration - original_duration
|
|
||||||
if padding_needed > 0:
|
|
||||||
wave = nn.functional.pad(wave, (0, padding_needed))
|
|
||||||
return {
|
return {
|
||||||
'path': file,
|
'path': file,
|
||||||
'wave': wave,
|
'wave': wave,
|
||||||
'duration': original_duration,
|
'stft': stft,
|
||||||
|
#'duration': original_duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
from scipy.io import wavfile
|
from scipy.io import wavfile
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
@ -5,6 +7,7 @@ import argparse
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.io import wavfile
|
from scipy.io import wavfile
|
||||||
from spleeter.separator import Separator
|
from spleeter.separator import Separator
|
||||||
|
from torch.utils.data import Dataset, DataLoader
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from spleeter.audio.adapter import AudioAdapter
|
from spleeter.audio.adapter import AudioAdapter
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -74,6 +77,63 @@ def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file)
|
||||||
return paths, sizes
|
return paths, sizes
|
||||||
|
|
||||||
|
|
||||||
|
class SpleeterDataset(Dataset):
    """Dataset whose items are whole batches of clips joined end to end.

    Item ``i`` covers ``batch_sz`` consecutive entries of the file list
    (fewer for the final item). The clips are loaded at ``sample_rate`` and
    concatenated along the sample axis; the running end offsets are returned
    alongside so the caller can split the result back into per-file segments.

    Args:
        src_dir: Directory handed to ``find_audio_files``.
        batch_sz: Number of files joined into one item.
        max_duration: Stored on the instance; not otherwise used by this class.
        sample_rate: Rate requested from the audio loader.
        partition: Index of the partition to keep; only read when
            ``partition_size`` is given.
        partition_size: When given, only the files
            ``[partition * partition_size, (partition + 1) * partition_size)``
            are kept.
        resume: When given, files preceding the first path that contains this
            string are dropped.

    Raises:
        ValueError: If ``resume`` is given but no file path contains it.
    """

    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            start = self._find_resume_index(self.files, resume)
            if start is None:
                # Failing loudly beats silently processing the wrong files.
                raise ValueError(f'Resume point {resume!r} not found in the file list')
            self.files = self.files[start:]
        self.loader = AudioAdapter.default()

    @staticmethod
    def _find_resume_index(files, resume):
        """Return the index of the first path containing ``resume``, or None."""
        for index, path in enumerate(files):
            if resume in path:
                return index
        return None

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        """Load and concatenate the clips belonging to batch ``item``.

        Returns:
            A dict with ``'audio'`` (the concatenated samples), ``'files'``
            (the paths that were loaded) and ``'ends'`` (for each file, the
            length of ``'audio'`` right after that file was appended).

        Raises:
            IndexError: If ``item`` is not a valid batch index.
            ValueError: If the loader reports a sample rate other than
                ``self.sample_rate``.
        """
        if not 0 <= item < len(self):
            # Also lets plain ``for batch in dataset`` iteration terminate.
            raise IndexError(f'Batch index {item} is out of range')

        first = item * self.batch_sz
        wavs = None
        files = []
        ends = []
        for path in self.files[first:first + self.batch_sz]:
            wav, sr = self.loader.load(path, sample_rate=self.sample_rate)
            if sr != self.sample_rate:
                raise ValueError(f'{path} was loaded at {sr} Hz instead of {self.sample_rate} Hz')
            # Get rid of all channels except one.
            # NOTE(review): a single-channel clip keeps its (n, 1) shape while
            # multi-channel clips become 1-D, so a batch mixing both cannot be
            # concatenated - confirm which layout the consumer expects.
            if wav.shape[1] > 1:
                wav = wav[:, 0]

            wavs = wav if wavs is None else np.concatenate([wavs, wav])
            ends.append(wavs.shape[0])
            files.append(path)
        return {
            'audio': wavs,
            'files': files,
            'ends': ends
        }
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('--path')
|
parser.add_argument('--path')
|
||||||
|
@ -86,37 +146,29 @@ def main():
|
||||||
src_dir = args.path
|
src_dir = args.path
|
||||||
out_file = args.out
|
out_file = args.out
|
||||||
output_sample_rate=22050
|
output_sample_rate=22050
|
||||||
waiting_for_file = args.resume is not None
|
|
||||||
resume_file = args.resume
|
resume_file = args.resume
|
||||||
|
|
||||||
audio_loader = AudioAdapter.default()
|
loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
|
||||||
files = find_audio_files(src_dir, include_nonwav=True)
|
max_duration=10, partition=args.partition, partition_size=args.partition_size,
|
||||||
|
resume=resume_file), batch_size=1, num_workers=1)
|
||||||
|
|
||||||
# Partition files if needed.
|
|
||||||
if args.partition_size is not None:
|
|
||||||
psz = int(args.partition_size)
|
|
||||||
prt = int(args.partition)
|
|
||||||
files = files[prt*psz:(prt+1)*psz]
|
|
||||||
|
|
||||||
#separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
|
|
||||||
separator = Separator('spleeter:2stems')
|
separator = Separator('spleeter:2stems')
|
||||||
unacceptable_files = open(out_file, 'a')
|
unacceptable_files = open(out_file, 'a')
|
||||||
for e, path in enumerate(tqdm(files)):
|
for batch in tqdm(loader):
|
||||||
if waiting_for_file and resume_file not in path:
|
audio, files, ends = batch['audio'], batch['files'], batch['ends']
|
||||||
continue
|
sep = separator.separate(audio.squeeze(0).numpy())
|
||||||
waiting_for_file = False
|
|
||||||
print(f"{e}: Processing {path}")
|
|
||||||
spleeter_ld, sr = audio_loader.load(path, sample_rate=output_sample_rate)
|
|
||||||
sep = separator.separate(spleeter_ld)
|
|
||||||
vocals = sep['vocals']
|
vocals = sep['vocals']
|
||||||
bg = sep['accompaniment']
|
bg = sep['accompaniment']
|
||||||
vmax = np.abs(vocals).mean()
|
start = 0
|
||||||
bmax = np.abs(bg).mean()
|
for path, end in zip(files, ends):
|
||||||
|
vmax = np.abs(vocals[start:end]).mean()
|
||||||
|
bmax = np.abs(bg[start:end]).mean()
|
||||||
|
start = end
|
||||||
|
|
||||||
# Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
|
# Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
|
||||||
ratio = vmax / (bmax+.0000001)
|
ratio = vmax / (bmax+.0000001)
|
||||||
if ratio < 25: # These values were derived empirically
|
if ratio < 18: # These values were derived empirically
|
||||||
unacceptable_files.write(f'{path}\n')
|
unacceptable_files.write(f'{path[0]}\n')
|
||||||
unacceptable_files.flush()
|
unacceptable_files.flush()
|
||||||
|
|
||||||
unacceptable_files.close()
|
unacceptable_files.close()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user