More fixes

James Betker 2021-10-09 23:27:14 -06:00
parent 932ea29a83
commit 32ba496632
11 changed files with 107 additions and 299 deletions


@@ -1,85 +0,0 @@
from typing import Optional
import torch
import torch.nn as nn
from scipy.signal.windows import hann
from spleeter.audio.adapter import AudioAdapter
from torch.utils.data import Dataset
import numpy as np
import librosa
from data.util import find_audio_files
def spleeter_stft(
    data: np.ndarray, inverse: bool = False, length: Optional[int] = None
) -> np.ndarray:
    """
    Single entrypoint for both stft and istft. This computes stft and
    istft with librosa on stereo data. The two channels are processed
    separately and are concatenated together in the result. The
    expected input formats are: (n_samples, 2) for stft and (T, F, 2)
    for istft.

    Parameters:
        data (numpy.array):
            Array with either the waveform or the complex spectrogram,
            depending on the parameter inverse.
        inverse (bool):
            (Optional) Whether an stft or an istft should be computed.
        length (Optional[int]):
            (Optional) Length to trim the istft output to; required
            when inverse is True.

    Returns:
        numpy.ndarray:
            Stereo data as numpy array for the transform. The channels
            are stored in the last dimension.
    """
    assert not (inverse and length is None)
    data = np.asfortranarray(data)
    N = 4096
    H = 1024
    win = hann(N, sym=False)
    fstft = librosa.core.istft if inverse else librosa.core.stft
    win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N}
    n_channels = data.shape[-1]
    out = []
    for c in range(n_channels):
        d = (
            np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,))))
            if not inverse
            else data[:, :, c].T
        )
        s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
        if inverse:
            s = s[N: N + length]
        s = np.expand_dims(s.T, 2 - inverse)
        out.append(s)
    if len(out) == 1:
        return out[0]
    return np.concatenate(out, axis=2 - inverse)
class SpleeterDataset(Dataset):
    def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
        self.files = find_audio_files(src_dir, include_nonwav=True)
        if skip > 0:
            self.files = self.files[skip:]
        self.audio_loader = AudioAdapter.default()
        self.sample_rate = sample_rate
        self.max_duration = max_duration

    def __getitem__(self, item):
        file = self.files[item]
        wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
        assert sample_rate == self.sample_rate
        stft = torch.tensor(spleeter_stft(wave))
        # TODO: pad this up so it can be batched.
        return {
            'path': file,
            'wave': wave,
            'stft': stft,
            #'duration': original_duration,
        }

    def __len__(self):
        return len(self.files)
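
For context, a minimal round-trip sketch of the spleeter_stft helper deleted above (hypothetical shapes and values; assumes a stereo float waveform, as the docstring describes):

import numpy as np

# Hypothetical usage: forward STFT on 4 s of stereo noise, then inverse.
wave = np.random.randn(22050 * 4, 2).astype(np.float32)
spec = spleeter_stft(wave)                               # -> (T, F, 2) complex spectrogram
recon = spleeter_stft(spec, inverse=True, length=wave.shape[0])
assert recon.shape == wave.shape                         # istft output trimmed back to the input length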


@@ -1,169 +1,14 @@
import multiprocessing
from math import ceil
from scipy.io import wavfile
import os
import argparse
import numpy as np
from scipy.io import wavfile
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from spleeter.audio.adapter import AudioAdapter
from torch.utils.data import DataLoader
from tqdm import tqdm
from data.util import IMG_EXTENSIONS
from scripts.audio.preparation.spleeter_separator_mod import Separator
from scripts.audio.preparation.spleeter_utils.filter_noisy_clips_collector import invert_spectrogram_and_save
from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
def is_image_file(filename):
    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)


def is_wav_file(filename):
    return filename.endswith('.wav')


def is_audio_file(filename):
    AUDIO_EXTENSIONS = ['.wav', '.mp3', '.wma', 'm4b']
    return any(filename.endswith(extension) for extension in AUDIO_EXTENSIONS)


def _get_paths_from_images(path, qualifier=is_image_file):
    """get image path list from image folder"""
    assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
    images = []
    for dirpath, _, fnames in sorted(os.walk(path)):
        for fname in sorted(fnames):
            if qualifier(fname) and 'ref.jpg' not in fname:
                img_path = os.path.join(dirpath, fname)
                images.append(img_path)
    if not images:
        print("Warning: {:s} has no valid image file".format(path))
    return images


def _get_paths_from_lmdb(dataroot):
    """get image path list from lmdb meta info"""
    meta_info = pickle.load(open(os.path.join(dataroot, 'meta_info.pkl'), 'rb'))
    paths = meta_info['keys']
    sizes = meta_info['resolution']
    if len(sizes) == 1:
        sizes = sizes * len(paths)
    return paths, sizes


def find_audio_files(dataroot, include_nonwav=False):
    if include_nonwav:
        return find_files_of_type(None, dataroot, qualifier=is_audio_file)[0]
    else:
        return find_files_of_type(None, dataroot, qualifier=is_wav_file)[0]


def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file):
    if isinstance(dataroot, list):
        paths = []
        for i in range(len(dataroot)):
            r = dataroot[i]
            extends = 1
            # Weights have the effect of repeatedly adding the paths from the given root to the final product.
            if weights:
                extends = weights[i]
            for j in range(extends):
                paths.extend(_get_paths_from_images(r, qualifier))
        paths = sorted(paths)
        sizes = len(paths)
    else:
        paths = sorted(_get_paths_from_images(dataroot, qualifier))
        sizes = len(paths)
    return paths, sizes
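
The weights argument above oversamples whole roots rather than individual files; a hypothetical call against this removed helper (directory names are made up):

# Hypothetical: every path under 'dir_a' is appended twice, 'dir_b' once,
# so downstream samplers draw from dir_a twice as often.
paths, count = find_files_of_type(None, ['dir_a', 'dir_b'], weights=[2, 1], qualifier=is_wav_file)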
class SpleeterDataset(Dataset):
    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate
        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            for i, f in enumerate(self.files):
                if resume in f:
                    break
            assert i < len(self.files)
            self.files = self.files[i:]

        self.loader = AudioAdapter.default()

    def __len__(self):
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        item = item * self.batch_sz
        wavs = None
        files = []
        ends = []
        for k in range(self.batch_sz):
            ind = k + item
            if ind >= len(self.files):
                break
            try:
                wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
                assert sr == 22050
                # Get rid of all channels except one.
                if wav.shape[1] > 1:
                    wav = wav[:, 0]
                if wavs is None:
                    wavs = wav
                else:
                    wavs = np.concatenate([wavs, wav])
                ends.append(wavs.shape[0])
                files.append(self.files[ind])
            except:
                print(f'Error loading {self.files[ind]}')
        stft = self.separator.stft(wavs)
        return {
            'audio': wavs,
            'files': files,
            'ends': ends,
            'stft': stft
        }
def invert_spectrogram_and_save(args, queue):
    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
    out_file = args.out
    unacceptable_files = open(out_file, 'a')
    while True:
        combo = queue.get()
        if combo is None:
            break
        vocals, bg, wavlen, files, ends = combo
        vocals = separator.stft(vocals, inverse=True, length=wavlen)
        bg = separator.stft(vocals, inverse=True, length=wavlen)
        start = 0
        for path, end in zip(files, ends):
            vmax = np.abs(vocals[start:end]).mean()
            bmax = np.abs(bg[start:end]).mean()
            start = end
            # Flag the clip as unacceptable when the ratio of vocal level to background level is too low.
            ratio = vmax / (bmax + .0000001)
            if ratio < 18:  # These values were derived empirically
                unacceptable_files.write(f'{path[0]}\n')
        unacceptable_files.flush()
    unacceptable_files.close()
def main():
@@ -180,20 +25,18 @@ def main():
    resume_file = args.resume

    worker_queue = multiprocessing.Queue()
    from scripts.audio.preparation.useless import invert_spectrogram_and_save
    worker = multiprocessing.Process(target=invert_spectrogram_and_save, args=(args, worker_queue))
    worker.start()

    loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
                                        max_duration=10, partition=args.partition, partition_size=args.partition_size,
                                        resume=resume_file), batch_size=1, num_workers=0)
                                        resume=resume_file), batch_size=1, num_workers=1)
    separator = Separator('spleeter:2stems', multiprocess=False)
    for k in range(100):
        for batch in tqdm(loader):
            audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
            sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
            worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
    for batch in tqdm(loader):
        audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
        sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
        worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))

    worker_queue.put(None)
    worker.join()
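
The wiring above is a sentinel-terminated producer/consumer handoff: the main process feeds separated spectrograms into worker_queue, and a None sentinel tells the collector to exit before join(). A minimal self-contained sketch of the same pattern (generic names, not this script's API):

import multiprocessing

def consumer(queue):
    while True:
        item = queue.get()
        if item is None:   # sentinel: producer is finished
            break
        print('processing', item)

if __name__ == '__main__':
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=consumer, args=(q,))
    p.start()
    for i in range(3):
        q.put(i)           # producer enqueues work items
    q.put(None)            # send the sentinel
    p.join()               # wait for the consumer to drain and exit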


@ -1,47 +0,0 @@
from scipy.io import wavfile
import os
import numpy as np
from scipy.io import wavfile
from torch.utils.data import DataLoader
from tqdm import tqdm

from models.spleeter.separator import Separator
from scripts.audio.preparation.spleeter_dataset import SpleeterDataset


# Note: The PyTorch implementation of Spleeter is not working correctly. Fixing this would significantly
# speed up the script since we could separate out dataloading and do batch inference.
def main():
    src_dir = 'F:\\split\\joe_rogan'
    output_sample_rate = 22050
    batch_size = 16

    dl = DataLoader(SpleeterDataset(src_dir, output_sample_rate, skip=batch_size*33000), batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)
    separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
    unacceptable_files = open('unacceptable.txt', 'a')
    for batch in tqdm(dl):
        waves = batch['wave']
        paths = batch['path']
        durations = batch['duration']

        sep = separator.separate(waves)
        for j in range(sep['vocals'].shape[0]):
            vocals = sep['vocals'][j][:durations[j]]
            bg = sep['accompaniment'][j][:durations[j]]
            vmax = np.abs(vocals[output_sample_rate:-output_sample_rate]).mean()
            bmax = np.abs(bg[output_sample_rate:-output_sample_rate]).mean()
            # Flag the clip as unacceptable when the ratio of vocal level to background level is too low.
            ratio = vmax / (bmax + .0000001)
            if ratio < 4:  # These values were derived empirically
                unacceptable_files.write(f'{paths[j]}\n')
        unacceptable_files.flush()
    unacceptable_files.close()


# Uses torch spleeter to divide audio clips into one of two bins:
# 1. Audio has little to no background noise, saved to "output_dir"
# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
if __name__ == '__main__':
    main()


@@ -0,0 +1,28 @@
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
import numpy as np


def invert_spectrogram_and_save(args, queue):
    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
    out_file = args.out
    unacceptable_files = open(out_file, 'a')
    while True:
        combo = queue.get()
        if combo is None:
            break
        vocals, bg, wavlen, files, ends = combo
        vocals = separator.stft(vocals, inverse=True, length=wavlen)
        bg = separator.stft(bg, inverse=True, length=wavlen)
        start = 0
        for path, end in zip(files, ends):
            vmax = np.abs(vocals[start:end]).mean()
            bmax = np.abs(bg[start:end]).mean()
            start = end
            # Flag the clip as unacceptable when the ratio of vocal level to background level is too low.
            ratio = vmax / (bmax + .0000001)
            if ratio < 18:  # These values were derived empirically
                unacceptable_files.write(f'{path[0]}\n')
        unacceptable_files.flush()
    unacceptable_files.close()
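
The acceptance test reduces to a mean-amplitude ratio between the separated stems; a worked check with made-up levels (threshold 18 as above):

import numpy as np

# Made-up stems: vocals averaging 0.20, background averaging 0.05.
vocals = np.full(22050, 0.20)
bg = np.full(22050, 0.05)
ratio = np.abs(vocals).mean() / (np.abs(bg).mean() + .0000001)   # ~4.0
print(ratio < 18)   # True -> this clip would be written to the unacceptable list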


@@ -0,0 +1,69 @@
from math import ceil

import numpy as np
from spleeter.audio.adapter import AudioAdapter
from torch.utils.data import Dataset

from data.util import find_audio_files
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator


class SpleeterDataset(Dataset):
    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate
        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            for i, f in enumerate(self.files):
                if resume in f:
                    break
            assert i < len(self.files)
            self.files = self.files[i:]

        self.loader = AudioAdapter.default()

    def __len__(self):
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        item = item * self.batch_sz
        wavs = None
        files = []
        ends = []
        for k in range(self.batch_sz):
            ind = k + item
            if ind >= len(self.files):
                break
            try:
                wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
                assert sr == 22050
                # Get rid of all channels except one.
                if wav.shape[1] > 1:
                    wav = wav[:, 0]
                if wavs is None:
                    wavs = wav
                else:
                    wavs = np.concatenate([wavs, wav])
                ends.append(wavs.shape[0])
                files.append(self.files[ind])
            except:
                print(f'Error loading {self.files[ind]}')
        stft = self.separator.stft(wavs)
        return {
            'audio': wavs,
            'files': files,
            'ends': ends,
            'stft': stft
        }
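
Because __getitem__ concatenates a whole batch into one waveform, the ends list records cumulative sample boundaries; a hypothetical consumer recovers the per-file clips like so:

import numpy as np

# Hypothetical: three clips of 100, 250 and 80 samples were concatenated,
# so ends == [100, 350, 430]; slicing [start:end] recovers each clip.
batch = np.arange(430)
ends = [100, 350, 430]
start = 0
clips = []
for end in ends:
    clips.append(batch[start:end])
    start = end
assert [len(c) for c in clips] == [100, 250, 80]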


@@ -9,7 +9,7 @@ from spleeter.audio.adapter import AudioAdapter
import numpy as np
# Uses spleeter to divide audio clips into one of two bins:
# Uses spleeter_utils to divide audio clips into one of two bins:
# 1. Audio has little to no background noise, saved to "output_dir"
# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
if __name__ == '__main__':