diff --git a/codes/scripts/audio/preparation/process_spleeter_filter_outputs.py b/codes/scripts/audio/preparation/process_spleeter_filter_outputs.py
new file mode 100644
index 00000000..80138025
--- /dev/null
+++ b/codes/scripts/audio/preparation/process_spleeter_filter_outputs.py
@@ -0,0 +1,27 @@
+import os
+import shutil
+import argparse
+from tqdm import tqdm
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('input', metavar='in', type=str)
+    parser.add_argument('basis', metavar='basis', type=str)
+    parser.add_argument('garbage', metavar='garbage', type=str)
+    args = parser.parse_args()
+    print(f"Moving files from {args.input} to {args.garbage}")
+    os.makedirs(args.garbage, exist_ok=True)
+
+    with open(args.input) as f:
+        lines = f.readlines()
+        for line in tqdm(lines):
+            line = line.strip()
+            # Every path listed by spleeter_filter_noisy_clips.py must live under `basis`.
+            assert line.startswith(args.basis)
+            # Rebase the file from `basis` into `garbage`, preserving its subpath.
+            movefile = os.path.join(args.garbage, line[len(args.basis):].lstrip('/\\'))
+            print(f'{line} -> {movefile}')
+            os.makedirs(os.path.dirname(movefile), exist_ok=True)
+            shutil.move(line, movefile)
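A minimal sketch of the path rebasing the new script performs, with hypothetical paths (only the string handling mirrors the code above):

    import os

    basis = '/data/audio/clean'                      # hypothetical `basis` argument
    garbage = '/data/audio/garbage'                  # hypothetical `garbage` argument
    line = '/data/audio/clean/speaker1/clip042.wav'  # one entry from the filter list

    assert line.startswith(basis)
    movefile = os.path.join(garbage, line[len(basis):].lstrip('/\\'))
    print(movefile)  # /data/audio/garbage/speaker1/clip042.wav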
+ """ + assert not (inverse and length is None) + data = np.asfortranarray(data) + N = 4096 + H = 1024 + win = hann(N, sym=False) + fstft = librosa.core.istft if inverse else librosa.core.stft + win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N} + n_channels = data.shape[-1] + out = [] + for c in range(n_channels): + d = ( + np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,)))) + if not inverse + else data[:, :, c].T + ) + s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg) + if inverse: + s = s[N: N + length] + s = np.expand_dims(s.T, 2 - inverse) + out.append(s) + if len(out) == 1: + return out[0] + return np.concatenate(out, axis=2 - inverse) + + class SpleeterDataset(Dataset): def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0): self.files = find_audio_files(src_dir, include_nonwav=True) @@ -17,22 +70,15 @@ class SpleeterDataset(Dataset): def __getitem__(self, item): file = self.files[item] - try: - wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate) - assert sample_rate == self.sample_rate - wave = wave[:,0] # strip off channels - wave = torch.tensor(wave) - except: - wave = torch.zeros(self.sample_rate * self.max_duration) - print(f"Error with {file}") - original_duration = wave.shape[0] - padding_needed = self.sample_rate * self.max_duration - original_duration - if padding_needed > 0: - wave = nn.functional.pad(wave, (0, padding_needed)) + wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate) + assert sample_rate == self.sample_rate + stft = torch.tensor(spleeter_stft(wave)) + # TODO: pad this up so it can be batched. return { 'path': file, 'wave': wave, - 'duration': original_duration, + 'stft': stft, + #'duration': original_duration, } def __len__(self): diff --git a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py index a14778ae..f38f61e1 100644 --- a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py +++ b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py @@ -1,3 +1,5 @@ +from math import ceil + from scipy.io import wavfile import os @@ -5,6 +7,7 @@ import argparse import numpy as np from scipy.io import wavfile from spleeter.separator import Separator +from torch.utils.data import Dataset, DataLoader from tqdm import tqdm from spleeter.audio.adapter import AudioAdapter from tqdm import tqdm @@ -74,6 +77,63 @@ def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file) return paths, sizes +class SpleeterDataset(Dataset): + def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None): + self.batch_sz = batch_sz + self.max_duration = max_duration + self.files = find_audio_files(src_dir, include_nonwav=True) + self.sample_rate = sample_rate + + # Partition files if needed. + if partition_size is not None: + psz = int(partition_size) + prt = int(partition) + self.files = self.files[prt * psz:(prt + 1) * psz] + + # Find the resume point and carry on from there. 
diff --git a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py
index a14778ae..f38f61e1 100644
--- a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py
+++ b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py
@@ -1,3 +1,5 @@
+from math import ceil
+
 from scipy.io import wavfile
 
 import os
@@ -5,6 +7,7 @@ import argparse
 import numpy as np
 from scipy.io import wavfile
 from spleeter.separator import Separator
+from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm
 from spleeter.audio.adapter import AudioAdapter
 from tqdm import tqdm
@@ -74,6 +77,63 @@ def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file):
     return paths, sizes
 
 
+class SpleeterDataset(Dataset):
+    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
+        self.batch_sz = batch_sz
+        self.max_duration = max_duration
+        self.files = find_audio_files(src_dir, include_nonwav=True)
+        self.sample_rate = sample_rate
+
+        # Partition files if needed.
+        if partition_size is not None:
+            psz = int(partition_size)
+            prt = int(partition)
+            self.files = self.files[prt * psz:(prt + 1) * psz]
+
+        # Find the resume point and carry on from there.
+        if resume is not None:
+            found = False
+            for i, f in enumerate(self.files):
+                if resume in f:
+                    found = True
+                    break
+            assert found, f'Resume file {resume} not found in {src_dir}.'
+            self.files = self.files[i:]
+        self.loader = AudioAdapter.default()
+
+    def __len__(self):
+        return ceil(len(self.files) / self.batch_sz)
+
+    def __getitem__(self, item):
+        # Each item assembles batch_sz files into one concatenated waveform;
+        # `ends` records where each file stops so it can be split apart again.
+        item = item * self.batch_sz
+        wavs = None
+        files = []
+        ends = []
+        for k in range(self.batch_sz):
+            ind = k + item
+            if ind >= len(self.files):
+                break
+
+            wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
+            assert sr == self.sample_rate
+            # Keep only the first channel so every clip contributes an (n, 1) array.
+            if wav.shape[1] > 1:
+                wav = wav[:, :1]
+
+            if wavs is None:
+                wavs = wav
+            else:
+                wavs = np.concatenate([wavs, wav])
+            ends.append(wavs.shape[0])
+            files.append(self.files[ind])
+        return {
+            'audio': wavs,
+            'files': files,
+            'ends': ends
+        }
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--path')
@@ -86,37 +146,29 @@ def main():
     src_dir = args.path
     out_file = args.out
     output_sample_rate=22050
-    waiting_for_file = args.resume is not None
     resume_file = args.resume
-    audio_loader = AudioAdapter.default()
 
-    files = find_audio_files(src_dir, include_nonwav=True)
+    # The dataset batches internally (batch_sz files per item), so the DataLoader
+    # runs with batch_size=1 and only contributes background prefetching.
+    loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
+                                        max_duration=10, partition=args.partition,
+                                        partition_size=args.partition_size,
+                                        resume=resume_file), batch_size=1, num_workers=1)
 
-    # Partition files if needed.
-    if args.partition_size is not None:
-        psz = int(args.partition_size)
-        prt = int(args.partition)
-        files = files[prt*psz:(prt+1)*psz]
-
-    #separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
     separator = Separator('spleeter:2stems')
     unacceptable_files = open(out_file, 'a')
 
-    for e, path in enumerate(tqdm(files)):
-        if waiting_for_file and resume_file not in path:
-            continue
-        waiting_for_file = False
-        print(f"{e}: Processing {path}")
-        spleeter_ld, sr = audio_loader.load(path, sample_rate=output_sample_rate)
-        sep = separator.separate(spleeter_ld)
+    for batch in tqdm(loader):
+        audio, files, ends = batch['audio'], batch['files'], batch['ends']
+        sep = separator.separate(audio.squeeze(0).numpy())
         vocals = sep['vocals']
         bg = sep['accompaniment']
-        vmax = np.abs(vocals).mean()
-        bmax = np.abs(bg).mean()
-
-        # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
-        ratio = vmax / (bmax+.0000001)
-        if ratio < 25: # These values were derived empirically
-            unacceptable_files.write(f'{path}\n')
+        start = 0
+        for path, end in zip(files, ends):
+            vmax = np.abs(vocals[start:end]).mean()
+            bmax = np.abs(bg[start:end]).mean()
+            start = end
+
+            # Flag the clip when its vocal level is too weak relative to the
+            # background accompaniment.
+            ratio = vmax / (bmax + 1e-7)
+            if ratio < 18:  # This threshold was derived empirically.
+                # Default collation wraps each path in a 1-tuple, hence path[0].
+                unacceptable_files.write(f'{path[0]}\n')
         unacceptable_files.flush()
 
     unacceptable_files.close()
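The acceptance test reduces to a mean-amplitude ratio between Spleeter's two stems. A standalone sketch of the criterion, assuming `sep` is the dict returned by `Separator.separate()` for a single clip (the helper name `is_noisy` is illustrative, not part of this diff):

    import numpy as np

    def is_noisy(sep, threshold=18, eps=1e-7):
        # True when the vocal stem does not sufficiently dominate the
        # accompaniment stem, i.e. the clip carries too much background.
        vmax = np.abs(sep['vocals']).mean()
        bmax = np.abs(sep['accompaniment']).mean()
        return vmax / (bmax + eps) < threshold

Paths flagged this way are appended to the --out list, which process_spleeter_filter_outputs.py then consumes to sweep the clips into the garbage directory.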