diff --git a/codes/scripts/__init__.py b/codes/scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/codes/scripts/audio/__init__.py b/codes/scripts/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/codes/scripts/audio/preparation/__init__.py b/codes/scripts/audio/preparation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/codes/scripts/audio/preparation/spleeter_dataset.py b/codes/scripts/audio/preparation/spleeter_dataset.py
deleted file mode 100644
index b4dc6ffd..00000000
--- a/codes/scripts/audio/preparation/spleeter_dataset.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from typing import Optional
-
-import torch
-import torch.nn as nn
-from scipy.signal.windows import hann
-from spleeter.audio.adapter import AudioAdapter
-from torch.utils.data import Dataset
-import numpy as np
-import librosa
-
-from data.util import find_audio_files
-
-
-def spleeter_stft(
-        data: np.ndarray, inverse: bool = False, length: Optional[int] = None
-) -> np.ndarray:
-    """
-    Single entrypoint for both stft and istft. This computes stft and
-    istft with librosa on stereo data. The two channels are processed
-    separately and are concatenated together in the result. The
-    expected input formats are: (n_samples, 2) for stft and (T, F, 2)
-    for istft.
-
-    Parameters:
-        data (numpy.array):
-            Array with either the waveform or the complex spectrogram
-            depending on the parameter inverse
-        inverse (bool):
-            (Optional) Should a stft or an istft be computed.
-        length (Optional[int]):
-
-    Returns:
-        numpy.ndarray:
-            Stereo data as numpy array for the transform. The channels
-            are stored in the last dimension.
-    """
-    assert not (inverse and length is None)
-    data = np.asfortranarray(data)
-    N = 4096
-    H = 1024
-    win = hann(N, sym=False)
-    fstft = librosa.core.istft if inverse else librosa.core.stft
-    win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N}
-    n_channels = data.shape[-1]
-    out = []
-    for c in range(n_channels):
-        d = (
-            np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,))))
-            if not inverse
-            else data[:, :, c].T
-        )
-        s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
-        if inverse:
-            s = s[N: N + length]
-        s = np.expand_dims(s.T, 2 - inverse)
-        out.append(s)
-    if len(out) == 1:
-        return out[0]
-    return np.concatenate(out, axis=2 - inverse)
-
-
-class SpleeterDataset(Dataset):
-    def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
-        self.files = find_audio_files(src_dir, include_nonwav=True)
-        if skip > 0:
-            self.files = self.files[skip:]
-        self.audio_loader = AudioAdapter.default()
-        self.sample_rate = sample_rate
-        self.max_duration = max_duration
-
-    def __getitem__(self, item):
-        file = self.files[item]
-        wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
-        assert sample_rate == self.sample_rate
-        stft = torch.tensor(spleeter_stft(wave))
-        # TODO: pad this up so it can be batched.
-        return {
-            'path': file,
-            'wave': wave,
-            'stft': stft,
-            #'duration': original_duration,
-        }
-
-    def __len__(self):
-        return len(self.files)
\ No newline at end of file
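For context on the file removed above: the deleted spleeter_stft helper encodes a specific shape contract, (n_samples, 2) waveform in, (T, F, 2) complex spectrogram out, which appears to mirror the stft/istft interface of the retained spleeter_separator_mod. A minimal round-trip sketch of that contract, assuming spleeter_stft is importable and using a purely illustrative random waveform:

import numpy as np

# Forward: (n_samples, 2) stereo waveform -> (T, F, 2) complex spectrogram,
# where F = 4096 // 2 + 1 = 2049 bins at hop length 1024.
wave = np.random.randn(22050 * 2, 2)  # two seconds of stand-in stereo audio
spec = spleeter_stft(wave)

# Inverse: `length` is mandatory so the zero-padding added before the forward
# transform can be trimmed back off.
recon = spleeter_stft(spec, inverse=True, length=wave.shape[0])
assert recon.shape == wave.shape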
diff --git a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py
index 3bceeb56..cb1418b0 100644
--- a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py
+++ b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips.py
@@ -1,169 +1,14 @@
 import multiprocessing
-from math import ceil
-
-from scipy.io import wavfile
-import os
 import argparse
-import numpy as np
-from scipy.io import wavfile
-from torch.utils.data import Dataset, DataLoader
-from tqdm import tqdm
-from spleeter.audio.adapter import AudioAdapter
+from torch.utils.data import DataLoader
 from tqdm import tqdm
-from data.util import IMG_EXTENSIONS
-from scripts.audio.preparation.spleeter_separator_mod import Separator
+from scripts.audio.preparation.spleeter_utils.filter_noisy_clips_collector import invert_spectrogram_and_save
+from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset
+from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
 
-
-def is_image_file(filename):
-    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
-
-
-def is_wav_file(filename):
-    return filename.endswith('.wav')
-
-
-def is_audio_file(filename):
-    AUDIO_EXTENSIONS = ['.wav', '.mp3', '.wma', 'm4b']
-    return any(filename.endswith(extension) for extension in AUDIO_EXTENSIONS)
-
-
-def _get_paths_from_images(path, qualifier=is_image_file):
-    """get image path list from image folder"""
-    assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
-    images = []
-    for dirpath, _, fnames in sorted(os.walk(path)):
-        for fname in sorted(fnames):
-            if qualifier(fname) and 'ref.jpg' not in fname:
-                img_path = os.path.join(dirpath, fname)
-                images.append(img_path)
-    if not images:
-        print("Warning: {:s} has no valid image file".format(path))
-    return images
-
-
-def _get_paths_from_lmdb(dataroot):
-    """get image path list from lmdb meta info"""
-    meta_info = pickle.load(open(os.path.join(dataroot, 'meta_info.pkl'), 'rb'))
-    paths = meta_info['keys']
-    sizes = meta_info['resolution']
-    if len(sizes) == 1:
-        sizes = sizes * len(paths)
-    return paths, sizes
-
-
-def find_audio_files(dataroot, include_nonwav=False):
-    if include_nonwav:
-        return find_files_of_type(None, dataroot, qualifier=is_audio_file)[0]
-    else:
-        return find_files_of_type(None, dataroot, qualifier=is_wav_file)[0]
-
-
-def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file):
-    if isinstance(dataroot, list):
-        paths = []
-        for i in range(len(dataroot)):
-            r = dataroot[i]
-            extends = 1
-
-            # Weights have the effect of repeatedly adding the paths from the given root to the final product.
-            if weights:
-                extends = weights[i]
-            for j in range(extends):
-                paths.extend(_get_paths_from_images(r, qualifier))
-        paths = sorted(paths)
-        sizes = len(paths)
-    else:
-        paths = sorted(_get_paths_from_images(dataroot, qualifier))
-        sizes = len(paths)
-    return paths, sizes
-
-
-class SpleeterDataset(Dataset):
-    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
-        self.batch_sz = batch_sz
-        self.max_duration = max_duration
-        self.files = find_audio_files(src_dir, include_nonwav=True)
-        self.sample_rate = sample_rate
-        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
-
-        # Partition files if needed.
-        if partition_size is not None:
-            psz = int(partition_size)
-            prt = int(partition)
-            self.files = self.files[prt * psz:(prt + 1) * psz]
-
-        # Find the resume point and carry on from there.
-        if resume is not None:
-            for i, f in enumerate(self.files):
-                if resume in f:
-                    break
-            assert i < len(self.files)
-            self.files = self.files[i:]
-        self.loader = AudioAdapter.default()
-
-    def __len__(self):
-        return ceil(len(self.files) / self.batch_sz)
-
-    def __getitem__(self, item):
-        item = item * self.batch_sz
-        wavs = None
-        files = []
-        ends = []
-        for k in range(self.batch_sz):
-            ind = k+item
-            if ind >= len(self.files):
-                break
-
-            try:
-                wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
-                assert sr == 22050
-                # Get rid of all channels except one.
-                if wav.shape[1] > 1:
-                    wav = wav[:, 0]
-
-                if wavs is None:
-                    wavs = wav
-                else:
-                    wavs = np.concatenate([wavs, wav])
-                ends.append(wavs.shape[0])
-                files.append(self.files[ind])
-            except:
-                print(f'Error loading {self.files[ind]}')
-        stft = self.separator.stft(wavs)
-        return {
-            'audio': wavs,
-            'files': files,
-            'ends': ends,
-            'stft': stft
-        }
-
-def invert_spectrogram_and_save(args, queue):
-    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
-    out_file = args.out
-    unacceptable_files = open(out_file, 'a')
-
-    while True:
-        combo = queue.get()
-        if combo is None:
-            break
-        vocals, bg, wavlen, files, ends = combo
-        vocals = separator.stft(vocals, inverse=True, length=wavlen)
-        bg = separator.stft(vocals, inverse=True, length=wavlen)
-        start = 0
-        for path, end in zip(files, ends):
-            vmax = np.abs(vocals[start:end]).mean()
-            bmax = np.abs(bg[start:end]).mean()
-            start = end
-
-            # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
-            ratio = vmax / (bmax+.0000001)
-            if ratio < 18: # These values were derived empirically
-                unacceptable_files.write(f'{path[0]}\n')
-                unacceptable_files.flush()
-
-    unacceptable_files.close()
 
 
 def main():
@@ -180,20 +25,18 @@ def main():
     resume_file = args.resume
 
     worker_queue = multiprocessing.Queue()
-    from scripts.audio.preparation.useless import invert_spectrogram_and_save
     worker = multiprocessing.Process(target=invert_spectrogram_and_save, args=(args, worker_queue))
     worker.start()
 
     loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate, max_duration=10,
                                         partition=args.partition, partition_size=args.partition_size,
-                                        resume=resume_file), batch_size=1, num_workers=0)
+                                        resume=resume_file), batch_size=1, num_workers=1)
 
     separator = Separator('spleeter:2stems', multiprocess=False)
-    for k in range(100):
-        for batch in tqdm(loader):
-            audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
-            sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
-            worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
+    for batch in tqdm(loader):
+        audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
+        sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
+        worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
 
     worker_queue.put(None)
     worker.join()
diff --git a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips_with_pytorch.py b/codes/scripts/audio/preparation/spleeter_filter_noisy_clips_with_pytorch.py
deleted file mode 100644
index 6450c1f2..00000000
--- a/codes/scripts/audio/preparation/spleeter_filter_noisy_clips_with_pytorch.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from scipy.io import wavfile
-import os
-
-import numpy as np
-from scipy.io import wavfile
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-
-from models.spleeter.separator import Separator
-from scripts.audio.preparation.spleeter_dataset import SpleeterDataset
-
-
-# Note: The Pytorch implementation of Spleeter is not working correctly. Fixing this would significantly
-# speed up the script since we can separate out dataloading and do batch inference.
-def main():
-    src_dir = 'F:\\split\\joe_rogan'
-    output_sample_rate=22050
-    batch_size=16
-
-    dl = DataLoader(SpleeterDataset(src_dir, output_sample_rate, skip=batch_size*33000), batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)
-    separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
-    unacceptable_files = open('unacceptable.txt', 'a')
-    for batch in tqdm(dl):
-        waves = batch['wave']
-        paths = batch['path']
-        durations = batch['duration']
-
-        sep = separator.separate(waves)
-        for j in range(sep['vocals'].shape[0]):
-            vocals = sep['vocals'][j][:durations[j]]
-            bg = sep['accompaniment'][j][:durations[j]]
-            vmax = np.abs(vocals[output_sample_rate:-output_sample_rate]).mean()
-            bmax = np.abs(bg[output_sample_rate:-output_sample_rate]).mean()
-
-            # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
-            ratio = vmax / (bmax+.0000001)
-            if ratio < 4: # These values were derived empirically
-                unacceptable_files.write(f'{paths[j]}\n')
-                unacceptable_files.flush()
-    unacceptable_files.close()
-
-
-# Uses torch spleeter to divide audio clips into one of two bins:
-# 1. Audio has little to no background noise, saved to "output_dir"
-# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
-if __name__ == '__main__':
-    main()
diff --git a/codes/scripts/audio/preparation/spleeter_utils/__init__.py b/codes/scripts/audio/preparation/spleeter_utils/__init__.py
new file mode 100644
index 00000000..e69de29b
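The reworked main() above keeps model inference in the parent process and hands spectrogram inversion plus scoring to a worker process over a multiprocessing.Queue, with None as the shutdown sentinel. A stripped-down, self-contained sketch of that producer/consumer pattern (the print is only a stand-in for the istft-and-score work):

import multiprocessing

def consumer(queue):
    # Mirrors invert_spectrogram_and_save: drain items until the None sentinel arrives.
    while True:
        item = queue.get()
        if item is None:
            break
        print(f'processing {item}')  # stand-in for istft + ratio scoring

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=consumer, args=(queue,))
    worker.start()
    for item in range(4):  # stand-in for the separation loop in main()
        queue.put(item)
    queue.put(None)  # sentinel tells the worker to exit cleanly
    worker.join()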
diff --git a/codes/scripts/audio/preparation/spleeter_utils/filter_noisy_clips_collector.py b/codes/scripts/audio/preparation/spleeter_utils/filter_noisy_clips_collector.py
new file mode 100644
index 00000000..1f59367c
--- /dev/null
+++ b/codes/scripts/audio/preparation/spleeter_utils/filter_noisy_clips_collector.py
@@ -0,0 +1,28 @@
+from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
+import numpy as np
+
+def invert_spectrogram_and_save(args, queue):
+    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
+    out_file = args.out
+    unacceptable_files = open(out_file, 'a')
+
+    while True:
+        combo = queue.get()
+        if combo is None:
+            break
+        vocals, bg, wavlen, files, ends = combo
+        vocals = separator.stft(vocals, inverse=True, length=wavlen)
+        bg = separator.stft(bg, inverse=True, length=wavlen)
+        start = 0
+        for path, end in zip(files, ends):
+            vmax = np.abs(vocals[start:end]).mean()
+            bmax = np.abs(bg[start:end]).mean()
+            start = end
+
+            # Record the clip as unacceptable when the ratio of vocal energy to background energy is too low.
+            ratio = vmax / (bmax+.0000001)
+            if ratio < 18:  # This threshold was derived empirically.
+                unacceptable_files.write(f'{path[0]}\n')
+                unacceptable_files.flush()
+
+    unacceptable_files.close()
\ No newline at end of file
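The acceptance rule in the new collector reduces to a single ratio test: a clip survives only when its mean vocal amplitude dominates its mean background amplitude by a wide margin. A hedged restatement as a standalone helper (the function name is invented; the 18.0 threshold and the epsilon mirror the code above):

import numpy as np

def passes_vocal_ratio(vocals: np.ndarray, background: np.ndarray, threshold: float = 18.0) -> bool:
    # Keep the clip only when mean vocal amplitude dominates mean background amplitude.
    ratio = np.abs(vocals).mean() / (np.abs(background).mean() + 1e-7)
    return ratio >= threshold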
diff --git a/codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py b/codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py
new file mode 100644
index 00000000..e96befa4
--- /dev/null
+++ b/codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py
@@ -0,0 +1,69 @@
+from math import ceil
+
+import numpy as np
+
+from spleeter.audio.adapter import AudioAdapter
+from torch.utils.data import Dataset
+
+from data.util import find_audio_files
+from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
+
+
+class SpleeterDataset(Dataset):
+    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
+        self.batch_sz = batch_sz
+        self.max_duration = max_duration
+        self.files = find_audio_files(src_dir, include_nonwav=True)
+        self.sample_rate = sample_rate
+        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
+
+        # Partition files if needed.
+        if partition_size is not None:
+            psz = int(partition_size)
+            prt = int(partition)
+            self.files = self.files[prt * psz:(prt + 1) * psz]
+
+        # Find the resume point and carry on from there.
+        if resume is not None:
+            for i, f in enumerate(self.files):
+                if resume in f:
+                    break
+            assert i < len(self.files)
+            self.files = self.files[i:]
+        self.loader = AudioAdapter.default()
+
+    def __len__(self):
+        return ceil(len(self.files) / self.batch_sz)
+
+    def __getitem__(self, item):
+        item = item * self.batch_sz
+        wavs = None
+        files = []
+        ends = []
+        for k in range(self.batch_sz):
+            ind = k+item
+            if ind >= len(self.files):
+                break
+
+            try:
+                wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
+                assert sr == self.sample_rate
+                # Keep only the first channel of multi-channel audio.
+                if wav.shape[1] > 1:
+                    wav = wav[:, 0]
+
+                if wavs is None:
+                    wavs = wav
+                else:
+                    wavs = np.concatenate([wavs, wav])
+                ends.append(wavs.shape[0])
+                files.append(self.files[ind])
+            except Exception as e:
+                print(f'Error loading {self.files[ind]}: {e}')
+        stft = self.separator.stft(wavs)
+        return {
+            'audio': wavs,
+            'files': files,
+            'ends': ends,
+            'stft': stft
+        }
diff --git a/codes/scripts/audio/preparation/spleeter_separator_mod.py b/codes/scripts/audio/preparation/spleeter_utils/spleeter_separator_mod.py
similarity index 100%
rename from codes/scripts/audio/preparation/spleeter_separator_mod.py
rename to codes/scripts/audio/preparation/spleeter_utils/spleeter_separator_mod.py
diff --git a/codes/scripts/audio/spleeter_split_voice_and_background.py b/codes/scripts/audio/spleeter_split_voice_and_background.py
index cad9b525..ed152f74 100644
--- a/codes/scripts/audio/spleeter_split_voice_and_background.py
+++ b/codes/scripts/audio/spleeter_split_voice_and_background.py
@@ -9,7 +9,7 @@ from spleeter.audio.adapter import AudioAdapter
 import numpy as np
 
 
-# Uses spleeter to divide audio clips into one of two bins:
+# Uses spleeter_utils to divide audio clips into one of two bins:
 # 1. Audio has little to no background noise, saved to "output_dir"
 # 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
 if __name__ == '__main__':
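Finally, a minimal driver that mirrors the refactored main() in spleeter_filter_noisy_clips.py, for anyone smoke-testing the new spleeter_utils package. The source directory and output filename are placeholders, and batch_size stays at 1 because SpleeterDataset already packs batch_sz clips into each item:

import multiprocessing
from types import SimpleNamespace

from torch.utils.data import DataLoader
from tqdm import tqdm

from scripts.audio.preparation.spleeter_utils.filter_noisy_clips_collector import invert_spectrogram_and_save
from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator

if __name__ == '__main__':
    args = SimpleNamespace(out='unacceptable.txt')  # placeholder for the argparse namespace
    queue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=invert_spectrogram_and_save, args=(args, queue))
    worker.start()

    # batch_size=1: the dataset performs its own internal batching of batch_sz clips.
    loader = DataLoader(SpleeterDataset('/path/to/clips', batch_sz=16, max_duration=10),
                        batch_size=1, num_workers=1)
    separator = Separator('spleeter:2stems', multiprocess=False)
    for batch in tqdm(loader):
        sep = separator.separate_spectrogram(batch['stft'].squeeze(0).numpy())
        queue.put((sep['vocals'], sep['accompaniment'], batch['audio'].shape[1], batch['files'], batch['ends']))
    queue.put(None)  # shut the worker down once all batches are scored
    worker.join()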