More fixes

parent 932ea29a83
commit 32ba496632
codes/scripts/__init__.py (new file, empty)
codes/scripts/audio/__init__.py (new file, empty)
codes/scripts/audio/preparation/__init__.py (new file, empty)
@@ -1,85 +0,0 @@ (codes/scripts/audio/preparation/spleeter_dataset.py, deleted)
-from typing import Optional
-
-import torch
-import torch.nn as nn
-from scipy.signal.windows import hann
-from spleeter.audio.adapter import AudioAdapter
-from torch.utils.data import Dataset
-import numpy as np
-import librosa
-
-from data.util import find_audio_files
-
-
-def spleeter_stft(
-    data: np.ndarray, inverse: bool = False, length: Optional[int] = None
-) -> np.ndarray:
-    """
-    Single entrypoint for both stft and istft. This computes stft and
-    istft with librosa on stereo data. The two channels are processed
-    separately and are concatenated together in the result. The
-    expected input formats are: (n_samples, 2) for stft and (T, F, 2)
-    for istft.
-
-    Parameters:
-        data (numpy.array):
-            Array with either the waveform or the complex spectrogram,
-            depending on the parameter inverse.
-        inverse (bool):
-            (Optional) Whether an stft or an istft should be computed.
-        length (Optional[int]):
-            (Optional) Output length of the inverse transform, in samples.
-
-    Returns:
-        numpy.ndarray:
-            Stereo data as numpy array for the transform. The channels
-            are stored in the last dimension.
-    """
-    assert not (inverse and length is None)
-    data = np.asfortranarray(data)
-    N = 4096
-    H = 1024
-    win = hann(N, sym=False)
-    fstft = librosa.core.istft if inverse else librosa.core.stft
-    win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N}
-    n_channels = data.shape[-1]
-    out = []
-    for c in range(n_channels):
-        d = (
-            np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,))))
-            if not inverse
-            else data[:, :, c].T
-        )
-        s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
-        if inverse:
-            s = s[N: N + length]
-        s = np.expand_dims(s.T, 2 - inverse)
-        out.append(s)
-    if len(out) == 1:
-        return out[0]
-    return np.concatenate(out, axis=2 - inverse)
-
-
-class SpleeterDataset(Dataset):
-    def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
-        self.files = find_audio_files(src_dir, include_nonwav=True)
-        if skip > 0:
-            self.files = self.files[skip:]
-        self.audio_loader = AudioAdapter.default()
-        self.sample_rate = sample_rate
-        self.max_duration = max_duration
-
-    def __getitem__(self, item):
-        file = self.files[item]
-        wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
-        assert sample_rate == self.sample_rate
-        stft = torch.tensor(spleeter_stft(wave))
-        # TODO: pad this up so it can be batched.
-        return {
-            'path': file,
-            'wave': wave,
-            'stft': stft,
-            #'duration': original_duration,
-        }
-
-    def __len__(self):
-        return len(self.files)
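For orientation, the deleted spleeter_stft helper handled both directions through one signature. A minimal sketch of the intended round trip, assuming a stereo float waveform with the shapes described in its docstring (the input array here is made up for illustration):

import numpy as np

wav = np.random.randn(22050 * 4, 2).astype(np.float32)  # hypothetical 4-second stereo clip
spec = spleeter_stft(wav)                                # complex spectrogram, shape (T, F, 2)
recon = spleeter_stft(spec, inverse=True, length=wav.shape[0])  # waveform again, (n_samples, 2)
assert recon.shape == wav.shape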
@@ -1,169 +1,14 @@
 import multiprocessing
-from math import ceil
-
-from scipy.io import wavfile
-import os
-
 import argparse
-import numpy as np
-from scipy.io import wavfile
-from torch.utils.data import Dataset, DataLoader
-from tqdm import tqdm
-from spleeter.audio.adapter import AudioAdapter
+from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-from data.util import IMG_EXTENSIONS
-from scripts.audio.preparation.spleeter_separator_mod import Separator
-
-
-def is_image_file(filename):
-    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
-
-
-def is_wav_file(filename):
-    return filename.endswith('.wav')
-
-
-def is_audio_file(filename):
-    AUDIO_EXTENSIONS = ['.wav', '.mp3', '.wma', 'm4b']
-    return any(filename.endswith(extension) for extension in AUDIO_EXTENSIONS)
-
-
-def _get_paths_from_images(path, qualifier=is_image_file):
-    """get image path list from image folder"""
-    assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
-    images = []
-    for dirpath, _, fnames in sorted(os.walk(path)):
-        for fname in sorted(fnames):
-            if qualifier(fname) and 'ref.jpg' not in fname:
-                img_path = os.path.join(dirpath, fname)
-                images.append(img_path)
-    if not images:
-        print("Warning: {:s} has no valid image file".format(path))
-    return images
-
-
-def _get_paths_from_lmdb(dataroot):
-    """get image path list from lmdb meta info"""
-    meta_info = pickle.load(open(os.path.join(dataroot, 'meta_info.pkl'), 'rb'))
-    paths = meta_info['keys']
-    sizes = meta_info['resolution']
-    if len(sizes) == 1:
-        sizes = sizes * len(paths)
-    return paths, sizes
-
-
-def find_audio_files(dataroot, include_nonwav=False):
-    if include_nonwav:
-        return find_files_of_type(None, dataroot, qualifier=is_audio_file)[0]
-    else:
-        return find_files_of_type(None, dataroot, qualifier=is_wav_file)[0]
-
-
-def find_files_of_type(data_type, dataroot, weights=[], qualifier=is_image_file):
-    if isinstance(dataroot, list):
-        paths = []
-        for i in range(len(dataroot)):
-            r = dataroot[i]
-            extends = 1
-
-            # Weights have the effect of repeatedly adding the paths from the given root to the final product.
-            if weights:
-                extends = weights[i]
-            for j in range(extends):
-                paths.extend(_get_paths_from_images(r, qualifier))
-        paths = sorted(paths)
-        sizes = len(paths)
-    else:
-        paths = sorted(_get_paths_from_images(dataroot, qualifier))
-        sizes = len(paths)
-    return paths, sizes
-
-
-class SpleeterDataset(Dataset):
-    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
-        self.batch_sz = batch_sz
-        self.max_duration = max_duration
-        self.files = find_audio_files(src_dir, include_nonwav=True)
-        self.sample_rate = sample_rate
-        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
-
-        # Partition files if needed.
-        if partition_size is not None:
-            psz = int(partition_size)
-            prt = int(partition)
-            self.files = self.files[prt * psz:(prt + 1) * psz]
-
-        # Find the resume point and carry on from there.
-        if resume is not None:
-            for i, f in enumerate(self.files):
-                if resume in f:
-                    break
-            assert i < len(self.files)
-            self.files = self.files[i:]
-        self.loader = AudioAdapter.default()
-
-    def __len__(self):
-        return ceil(len(self.files) / self.batch_sz)
-
-    def __getitem__(self, item):
-        item = item * self.batch_sz
-        wavs = None
-        files = []
-        ends = []
-        for k in range(self.batch_sz):
-            ind = k + item
-            if ind >= len(self.files):
-                break
-
-            try:
-                wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
-                assert sr == 22050
-                # Get rid of all channels except one.
-                if wav.shape[1] > 1:
-                    wav = wav[:, 0]
-
-                if wavs is None:
-                    wavs = wav
-                else:
-                    wavs = np.concatenate([wavs, wav])
-                ends.append(wavs.shape[0])
-                files.append(self.files[ind])
-            except:
-                print(f'Error loading {self.files[ind]}')
-        stft = self.separator.stft(wavs)
-        return {
-            'audio': wavs,
-            'files': files,
-            'ends': ends,
-            'stft': stft
-        }
-
-
-def invert_spectrogram_and_save(args, queue):
-    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
-    out_file = args.out
-    unacceptable_files = open(out_file, 'a')
-
-    while True:
-        combo = queue.get()
-        if combo is None:
-            break
-        vocals, bg, wavlen, files, ends = combo
-        vocals = separator.stft(vocals, inverse=True, length=wavlen)
-        bg = separator.stft(vocals, inverse=True, length=wavlen)
-        start = 0
-        for path, end in zip(files, ends):
-            vmax = np.abs(vocals[start:end]).mean()
-            bmax = np.abs(bg[start:end]).mean()
-            start = end
-
-            # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
-            ratio = vmax / (bmax + .0000001)
-            if ratio < 18:  # These values were derived empirically
-                unacceptable_files.write(f'{path[0]}\n')
-                unacceptable_files.flush()
-
-    unacceptable_files.close()
-
-
+from scripts.audio.preparation.spleeter_utils.filter_noisy_clips_collector import invert_spectrogram_and_save
+from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset
+from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
 
 
 def main():
@@ -180,20 +25,18 @@ def main():
     resume_file = args.resume
 
     worker_queue = multiprocessing.Queue()
-    from scripts.audio.preparation.useless import invert_spectrogram_and_save
     worker = multiprocessing.Process(target=invert_spectrogram_and_save, args=(args, worker_queue))
     worker.start()
 
     loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
                                         max_duration=10, partition=args.partition, partition_size=args.partition_size,
-                                        resume=resume_file), batch_size=1, num_workers=0)
+                                        resume=resume_file), batch_size=1, num_workers=1)
 
     separator = Separator('spleeter:2stems', multiprocess=False)
-    for k in range(100):
-        for batch in tqdm(loader):
-            audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
-            sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
-            worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
+    for batch in tqdm(loader):
+        audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
+        sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
+        worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
 
     worker_queue.put(None)
     worker.join()
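The refactored script keeps its producer/consumer split: the main process separates spectrograms and pushes the results onto a multiprocessing.Queue, while a worker process inverts and scores them, exiting when it receives a None sentinel. A minimal, self-contained sketch of that pattern (names here are illustrative, not from the repository):

import multiprocessing


def consumer(queue):
    # Drain work items until the producer signals completion with a None sentinel.
    while True:
        item = queue.get()
        if item is None:
            break
        print(f'processed item {item}')


if __name__ == '__main__':
    q = multiprocessing.Queue()
    worker = multiprocessing.Process(target=consumer, args=(q,))
    worker.start()
    for i in range(4):  # stand-in for the separation loop over the DataLoader
        q.put(i)
    q.put(None)         # sentinel: tells the worker to exit its loop
    worker.join()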
@@ -1,47 +0,0 @@ (deleted file)
-from scipy.io import wavfile
-import os
-
-import numpy as np
-from scipy.io import wavfile
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-
-from models.spleeter.separator import Separator
-from scripts.audio.preparation.spleeter_dataset import SpleeterDataset
-
-
-# Note: The Pytorch implementation of Spleeter is not working correctly. Fixing this would significantly
-# speed up the script since we can separate out dataloading and do batch inference.
-def main():
-    src_dir = 'F:\\split\\joe_rogan'
-    output_sample_rate = 22050
-    batch_size = 16
-
-    dl = DataLoader(SpleeterDataset(src_dir, output_sample_rate, skip=batch_size*33000), batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)
-    separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
-    unacceptable_files = open('unacceptable.txt', 'a')
-    for batch in tqdm(dl):
-        waves = batch['wave']
-        paths = batch['path']
-        durations = batch['duration']
-
-        sep = separator.separate(waves)
-        for j in range(sep['vocals'].shape[0]):
-            vocals = sep['vocals'][j][:durations[j]]
-            bg = sep['accompaniment'][j][:durations[j]]
-            vmax = np.abs(vocals[output_sample_rate:-output_sample_rate]).mean()
-            bmax = np.abs(bg[output_sample_rate:-output_sample_rate]).mean()
-
-            # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
-            ratio = vmax / (bmax + .0000001)
-            if ratio < 4:  # These values were derived empirically
-                unacceptable_files.write(f'{paths[j]}\n')
-                unacceptable_files.flush()
-    unacceptable_files.close()
-
-
-# Uses torch spleeter to divide audio clips into one of two bins:
-# 1. Audio has little to no background noise, saved to "output_dir"
-# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
-if __name__ == '__main__':
-    main()
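Both the deleted script above and the new collector below apply the same acceptance test: the mean absolute amplitude of the separated vocals must sufficiently exceed that of the accompaniment. A sketch of that criterion as a standalone function; the thresholds 4 and 18 are the empirical values from the two scripts, everything else is illustrative:

import numpy as np


def clip_is_clean(vocals: np.ndarray, bg: np.ndarray, threshold: float = 18.0) -> bool:
    # Mean absolute amplitude stands in for signal energy.
    vmax = np.abs(vocals).mean()
    bmax = np.abs(bg).mean()
    ratio = vmax / (bmax + 1e-7)  # epsilon guards against a silent accompaniment track
    return ratio >= threshold     # clips below the threshold get logged as unacceptable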
@@ -0,0 +1,28 @@ (codes/scripts/audio/preparation/spleeter_utils/filter_noisy_clips_collector.py, new file)
+from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
+import numpy as np
+
+
+def invert_spectrogram_and_save(args, queue):
+    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
+    out_file = args.out
+    unacceptable_files = open(out_file, 'a')
+
+    while True:
+        combo = queue.get()
+        if combo is None:
+            break
+        vocals, bg, wavlen, files, ends = combo
+        vocals = separator.stft(vocals, inverse=True, length=wavlen)
+        bg = separator.stft(bg, inverse=True, length=wavlen)
+        start = 0
+        for path, end in zip(files, ends):
+            vmax = np.abs(vocals[start:end]).mean()
+            bmax = np.abs(bg[start:end]).mean()
+            start = end
+
+            # Only output to the "good" sample dir if the ratio of background noise to vocal noise is high enough.
+            ratio = vmax / (bmax + .0000001)
+            if ratio < 18:  # These values were derived empirically
+                unacceptable_files.write(f'{path[0]}\n')
+                unacceptable_files.flush()
+
+    unacceptable_files.close()
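One subtlety in the collector above: it writes path[0], not path. Because the driver's DataLoader runs with batch_size=1, torch's default collation wraps each path string from the dataset's 'files' list in a one-element tuple. A small demonstration of that behavior (the dataset and paths here are hypothetical):

from torch.utils.data import DataLoader, Dataset


class Tiny(Dataset):
    def __len__(self):
        return 1

    def __getitem__(self, _):
        return {'files': ['a.wav', 'b.wav']}


batch = next(iter(DataLoader(Tiny(), batch_size=1)))
print(batch['files'])  # [('a.wav',), ('b.wav',)] -- each path arrives wrapped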
@@ -0,0 +1,69 @@ (codes/scripts/audio/preparation/spleeter_utils/spleeter_dataset.py, new file)
+from math import ceil
+
+import numpy as np
+
+from spleeter.audio.adapter import AudioAdapter
+from torch.utils.data import Dataset
+
+from data.util import find_audio_files
+from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
+
+
+class SpleeterDataset(Dataset):
+    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
+        self.batch_sz = batch_sz
+        self.max_duration = max_duration
+        self.files = find_audio_files(src_dir, include_nonwav=True)
+        self.sample_rate = sample_rate
+        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
+
+        # Partition files if needed.
+        if partition_size is not None:
+            psz = int(partition_size)
+            prt = int(partition)
+            self.files = self.files[prt * psz:(prt + 1) * psz]
+
+        # Find the resume point and carry on from there.
+        if resume is not None:
+            for i, f in enumerate(self.files):
+                if resume in f:
+                    break
+            assert i < len(self.files)
+            self.files = self.files[i:]
+        self.loader = AudioAdapter.default()
+
+    def __len__(self):
+        return ceil(len(self.files) / self.batch_sz)
+
+    def __getitem__(self, item):
+        item = item * self.batch_sz
+        wavs = None
+        files = []
+        ends = []
+        for k in range(self.batch_sz):
+            ind = k + item
+            if ind >= len(self.files):
+                break
+
+            try:
+                wav, sr = self.loader.load(self.files[ind], sample_rate=self.sample_rate)
+                assert sr == 22050
+                # Get rid of all channels except one.
+                if wav.shape[1] > 1:
+                    wav = wav[:, 0]
+
+                if wavs is None:
+                    wavs = wav
+                else:
+                    wavs = np.concatenate([wavs, wav])
+                ends.append(wavs.shape[0])
+                files.append(self.files[ind])
+            except:
+                print(f'Error loading {self.files[ind]}')
+        stft = self.separator.stft(wavs)
+        return {
+            'audio': wavs,
+            'files': files,
+            'ends': ends,
+            'stft': stft
+        }
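The partition and resume parameters above let one large source directory be sharded across several concurrent invocations of the script, and let a crashed shard restart mid-stream. A small illustration of the slicing, with stand-in file names in place of discovered audio paths (all values hypothetical):

files = [f'clip_{i}.wav' for i in range(10)]  # stand-ins for discovered audio paths

# Partitioning: shard `prt` takes a contiguous slice of `psz` files.
psz, prt = 4, 1
shard = files[prt * psz:(prt + 1) * psz]      # clip_4.wav .. clip_7.wav

# Resuming: skip ahead to the first file whose path contains the resume token.
resume = 'clip_6'
for i, f in enumerate(shard):
    if resume in f:
        break
shard = shard[i:]                             # clip_6.wav, clip_7.wav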
@ -9,7 +9,7 @@ from spleeter.audio.adapter import AudioAdapter
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
# Uses spleeter to divide audio clips into one of two bins:
|
# Uses spleeter_utils to divide audio clips into one of two bins:
|
||||||
# 1. Audio has little to no background noise, saved to "output_dir"
|
# 1. Audio has little to no background noise, saved to "output_dir"
|
||||||
# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
|
# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
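Downstream, main() consumes the internally batched dataset through a DataLoader with batch_size=1 and strips the singleton dimension before handing each spectrogram to the separator. A hedged usage sketch of that pattern, matching the driver code in the diff (the source directory is a placeholder):

from torch.utils.data import DataLoader

from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset

ds = SpleeterDataset('/data/audio', batch_sz=16, max_duration=10)  # placeholder directory
loader = DataLoader(ds, batch_size=1, num_workers=1)
for batch in loader:
    stft = batch['stft'].squeeze(0).numpy()  # drop the DataLoader's singleton batch dim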