More fixes
This commit is contained in:
parent
932ea29a83
commit
32ba496632
0
codes/scripts/__init__.py
Normal file
0
codes/scripts/__init__.py
Normal file
0
codes/scripts/audio/__init__.py
Normal file
0
codes/scripts/audio/__init__.py
Normal file
0
codes/scripts/audio/preparation/__init__.py
Normal file
0
codes/scripts/audio/preparation/__init__.py
Normal file
|
@ -1,85 +0,0 @@
|
|||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from scipy.signal.windows import hann
|
||||
from spleeter.audio.adapter import AudioAdapter
|
||||
from torch.utils.data import Dataset
|
||||
import numpy as np
|
||||
import librosa
|
||||
|
||||
from data.util import find_audio_files
|
||||
|
||||
|
||||
def spleeter_stft(
    data: np.ndarray, inverse: bool = False, length: Optional[int] = None
) -> np.ndarray:
    """
    Run a forward or inverse STFT with librosa, one channel at a time.

    The channels are transformed independently and joined again in the
    result. The expected input formats are: (n_samples, 2) for stft and
    (T, F, 2) for istft.

    Parameters:
        data (numpy.array):
            Either the waveform or the complex spectrogram, depending on
            the parameter inverse.
        inverse (bool):
            (Optional) When True an istft is computed, otherwise a stft.
        length (Optional[int]):
            Number of samples kept from the reconstructed waveform once
            the leading padding has been dropped. Must be provided when
            inverse is True.

    Returns:
        numpy.ndarray:
            Data as numpy array for the transform. The channels are
            stored in the last dimension.
    """
    assert length is not None or not inverse
    data = np.asfortranarray(data)
    frame_len = 4096
    hop = 1024
    window = hann(frame_len, sym=False)
    if inverse:
        transform = librosa.core.istft
        extra_kwargs = {"win_length": None, "length": None}
    else:
        transform = librosa.core.stft
        extra_kwargs = {"n_fft": frame_len}
    # Axis on which the channel dimension is re-inserted: 2 for stft, 1 for istft.
    channel_axis = 2 - inverse
    per_channel = []
    for channel in range(data.shape[-1]):
        if inverse:
            chunk = data[:, :, channel].T
        else:
            # The forward transform pads one full frame of zeros on both sides.
            chunk = np.concatenate(
                (np.zeros((frame_len,)), data[:, channel], np.zeros((frame_len,)))
            )
        transformed = transform(
            chunk, hop_length=hop, window=window, center=False, **extra_kwargs
        )
        if inverse:
            # Drop the leading padding frame and keep `length` samples.
            transformed = transformed[frame_len: frame_len + length]
        per_channel.append(np.expand_dims(transformed.T, channel_axis))
    if len(per_channel) == 1:
        return per_channel[0]
    return np.concatenate(per_channel, axis=channel_axis)
|
||||
|
||||
|
||||
class SpleeterDataset(Dataset):
    """Dataset yielding one audio file per item together with its STFT.

    Files are discovered with ``find_audio_files(src_dir, include_nonwav=True)``.

    Parameters:
        src_dir: Directory (or whatever ``find_audio_files`` accepts) to scan.
        sample_rate: Sample rate requested from the audio loader.
        max_duration: Stored on the instance; not used by this class itself.
        skip: Number of leading files to drop from the discovered list.
    """

    def __init__(self, src_dir, sample_rate=22050, max_duration=20, skip=0):
        self.files = find_audio_files(src_dir, include_nonwav=True)
        if skip > 0:
            self.files = self.files[skip:]
        self.audio_loader = AudioAdapter.default()
        self.sample_rate = sample_rate
        self.max_duration = max_duration

    def __getitem__(self, item):
        """Load file ``item`` and return its path, waveform, STFT and length.

        The ``'duration'`` entry is the number of samples in the loaded
        waveform (its first dimension), so consumers can trim results back
        to the original length.
        """
        file = self.files[item]
        wave, sample_rate = self.audio_loader.load(file, sample_rate=self.sample_rate)
        assert sample_rate == self.sample_rate
        stft = torch.tensor(spleeter_stft(wave))
        # TODO: pad this up so it can be batched.
        return {
            'path': file,
            'wave': wave,
            'stft': stft,
            'duration': wave.shape[0],
        }

    def __len__(self):
        """Return the number of files left after ``skip`` was applied."""
        return len(self.files)
|
|
@ -1,169 +1,14 @@
|
|||
import multiprocessing
|
||||
from math import ceil
|
||||
|
||||
from scipy.io import wavfile
|
||||
import os
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
from scipy.io import wavfile
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from tqdm import tqdm
|
||||
from spleeter.audio.adapter import AudioAdapter
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from data.util import IMG_EXTENSIONS
|
||||
from scripts.audio.preparation.spleeter_separator_mod import Separator
|
||||
from scripts.audio.preparation.spleeter_utils.filter_noisy_clips_collector import invert_spectrogram_and_save
|
||||
from scripts.audio.preparation.spleeter_utils.spleeter_dataset import SpleeterDataset
|
||||
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
|
||||
|
||||
|
||||
def is_image_file(filename):
    """Return True when ``filename`` ends with one of ``IMG_EXTENSIONS``."""
    for extension in IMG_EXTENSIONS:
        if filename.endswith(extension):
            return True
    return False
|
||||
|
||||
|
||||
def is_wav_file(filename):
    """Return True when ``filename`` ends with the exact suffix ``.wav``."""
    wav_suffix = '.wav'
    return filename.endswith(wav_suffix)
|
||||
|
||||
|
||||
def is_audio_file(filename):
    """Return True when ``filename`` ends with a recognised audio extension.

    Recognised extensions are ``.wav``, ``.mp3``, ``.wma`` and ``.m4b``; the
    comparison is case-sensitive.
    """
    # Every entry carries its leading dot so only true extensions match
    # (a bare 'm4b' would also accept names such as 'albumm4b').
    audio_extensions = ('.wav', '.mp3', '.wma', '.m4b')
    return filename.endswith(audio_extensions)
|
||||
|
||||
|
||||
def _get_paths_from_images(path, qualifier=is_image_file):
    """Collect the paths of all qualifying files below ``path``.

    The directory tree is walked in sorted order. A file is kept when
    ``qualifier(name)`` is truthy and its name does not contain ``ref.jpg``.
    A warning is printed when nothing qualifies.
    """
    assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
    found = []
    for folder, _, names in sorted(os.walk(path)):
        for name in sorted(names):
            if not qualifier(name) or 'ref.jpg' in name:
                continue
            found.append(os.path.join(folder, name))
    if len(found) == 0:
        print("Warning: {:s} has no valid image file".format(path))
    return found
|
||||
|
||||
|
||||
def _get_paths_from_lmdb(dataroot):
|
||||
"""get image path list from lmdb meta info"""
|
||||
meta_info = pickle.load(open(os.path.join(dataroot, 'meta_info.pkl'), 'rb'))
|
||||
paths = meta_info['keys']
|
||||
sizes = meta_info['resolution']
|
||||
if len(sizes) == 1:
|
||||
sizes = sizes * len(paths)
|
||||
return paths, sizes
|
||||
|
||||
|
||||
def find_audio_files(dataroot, include_nonwav=False):
    """Return the sorted list of audio file paths found under ``dataroot``.

    With ``include_nonwav`` set, every file accepted by ``is_audio_file`` is
    returned; otherwise only files accepted by ``is_wav_file``.
    """
    qualifier = is_audio_file if include_nonwav else is_wav_file
    paths, _ = find_files_of_type(None, dataroot, qualifier=qualifier)
    return paths
|
||||
|
||||
|
||||
def find_files_of_type(data_type, dataroot, weights=None, qualifier=is_image_file):
    """Find all files under one or several roots that satisfy ``qualifier``.

    Parameters:
        data_type: Not used by this function.
        dataroot: A single directory, or a list of directories.
        weights: Optional per-root repeat counts, only consulted when
            ``dataroot`` is a list. The paths from root ``i`` are added
            ``weights[i]`` times; without weights each root is added once.
        qualifier: Predicate applied to each file name.

    Returns:
        tuple: ``(paths, count)`` with the sorted path list and its length.
    """
    if isinstance(dataroot, list):
        paths = []
        for index, root in enumerate(dataroot):
            # Weights have the effect of repeatedly adding the paths from the given root to the final product.
            repeats = weights[index] if weights else 1
            if repeats > 0:
                # Scan the directory once and repeat the result instead of
                # walking the same tree `repeats` times.
                paths.extend(_get_paths_from_images(root, qualifier) * repeats)
        paths = sorted(paths)
    else:
        paths = sorted(_get_paths_from_images(dataroot, qualifier))
    return paths, len(paths)
|
||||
|
||||
|
||||
class SpleeterDataset(Dataset):
    """Dataset whose items are whole batches of concatenated audio clips.

    Each item covers up to ``batch_sz`` consecutive files. Their waveforms
    are concatenated into one array and passed through ``separator.stft``.

    Parameters:
        src_dir: Location scanned with ``find_audio_files(..., include_nonwav=True)``.
        batch_sz: Number of files combined into one item.
        max_duration: Stored on the instance; not used by this class itself.
        sample_rate: Sample rate requested from the audio loader.
        partition: Index of the partition to keep (used with ``partition_size``).
        partition_size: Number of files per partition; ``None`` disables partitioning.
        resume: Substring of a file path; files before the first match are dropped.

    Raises:
        ValueError: If ``resume`` is given but matches none of the files.
    """

    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate
        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            self.files = self._files_from_resume_point(self.files, resume)
        self.loader = AudioAdapter.default()

    @staticmethod
    def _files_from_resume_point(files, resume):
        """Return ``files`` starting at the first path that contains ``resume``."""
        for index, path in enumerate(files):
            if resume in path:
                return files[index:]
        # Falling through means no match; resuming from an arbitrary file
        # would silently skip work, so fail loudly instead.
        raise ValueError(f'Resume point {resume!r} does not match any file')

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        """Load batch ``item`` and return its audio, file list, ends and STFT.

        Returns a dict with:
            'audio': the concatenated waveform of all files that loaded,
            'files': the paths that loaded successfully,
            'ends':  the cumulative end offset of each file inside 'audio',
            'stft':  the result of ``self.separator.stft`` on 'audio'.
        Files that fail to load are reported on stdout and skipped.
        """
        first = item * self.batch_sz
        last = min(first + self.batch_sz, len(self.files))
        wavs = None
        files = []
        ends = []
        for ind in range(first, last):
            path = self.files[ind]
            try:
                wav, sr = self.loader.load(path, sample_rate=self.sample_rate)
                if sr != self.sample_rate:
                    raise ValueError(f'Unexpected sample rate {sr}')
                # Get rid of all channels except one.
                # NOTE(review): a single-channel file keeps its 2-D shape here;
                # concatenating it with 1-D waveforms raises and the file is
                # then skipped by the handler below - confirm this is intended.
                if wav.shape[1] > 1:
                    wav = wav[:, 0]

                wavs = wav if wavs is None else np.concatenate([wavs, wav])
                ends.append(wavs.shape[0])
                files.append(path)
            except Exception:
                # Best effort: a bad file must not abort the whole batch, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print(f'Error loading {path}')
        # NOTE(review): `wavs` is still None here when every file in the batch
        # failed to load - verify how `separator.stft` reacts to that.
        stft = self.separator.stft(wavs)
        return {
            'audio': wavs,
            'files': files,
            'ends': ends,
            'stft': stft
        }
|
||||
|
||||
def invert_spectrogram_and_save(args, queue):
    """Worker loop that flags clips whose vocals do not dominate the background.

    Work items are read from ``queue`` until a ``None`` sentinel arrives. Each
    item is a tuple ``(vocals, bg, wavlen, files, ends)``. Both spectrograms
    are inverted with ``separator.stft(..., inverse=True, length=wavlen)`` and
    the result is split per file at the offsets in ``ends``. For every file
    whose vocal-to-background mean absolute amplitude ratio is below 18, the
    first element of its ``files`` entry is appended as a line to ``args.out``.

    Parameters:
        args: Object exposing ``out``, the path of the file to append to.
        queue: Queue delivering the work items and the final ``None``.
    """
    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
    out_file = args.out
    # The context manager guarantees the file is closed even if a batch raises.
    with open(out_file, 'a') as unacceptable_files:
        while True:
            combo = queue.get()
            if combo is None:
                break
            vocals, bg, wavlen, files, ends = combo
            vocals = separator.stft(vocals, inverse=True, length=wavlen)
            # Invert the background spectrogram itself, not the vocals.
            bg = separator.stft(bg, inverse=True, length=wavlen)
            start = 0
            for path, end in zip(files, ends):
                vocal_level = np.abs(vocals[start:end]).mean()
                bg_level = np.abs(bg[start:end]).mean()
                start = end

                # Record the clip as unacceptable when the vocals are not loud
                # enough relative to the background; the epsilon avoids a
                # division by zero for silent backgrounds.
                ratio = vocal_level / (bg_level+.0000001)
                if ratio < 18:  # These values were derived empirically
                    unacceptable_files.write(f'{path[0]}\n')
                    unacceptable_files.flush()
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -180,20 +25,18 @@ def main():
|
|||
resume_file = args.resume
|
||||
|
||||
worker_queue = multiprocessing.Queue()
|
||||
from scripts.audio.preparation.useless import invert_spectrogram_and_save
|
||||
worker = multiprocessing.Process(target=invert_spectrogram_and_save, args=(args, worker_queue))
|
||||
worker.start()
|
||||
|
||||
loader = DataLoader(SpleeterDataset(src_dir, batch_sz=16, sample_rate=output_sample_rate,
|
||||
max_duration=10, partition=args.partition, partition_size=args.partition_size,
|
||||
resume=resume_file), batch_size=1, num_workers=0)
|
||||
resume=resume_file), batch_size=1, num_workers=1)
|
||||
|
||||
separator = Separator('spleeter:2stems', multiprocess=False)
|
||||
for k in range(100):
|
||||
for batch in tqdm(loader):
|
||||
audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
|
||||
sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
|
||||
worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
|
||||
for batch in tqdm(loader):
|
||||
audio, files, ends, stft = batch['audio'], batch['files'], batch['ends'], batch['stft']
|
||||
sep = separator.separate_spectrogram(stft.squeeze(0).numpy())
|
||||
worker_queue.put((sep['vocals'], sep['accompaniment'], audio.shape[1], files, ends))
|
||||
worker_queue.put(None)
|
||||
worker.join()
|
||||
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
from scipy.io import wavfile
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from scipy.io import wavfile
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from models.spleeter.separator import Separator
|
||||
from scripts.audio.preparation.spleeter_dataset import SpleeterDataset
|
||||
|
||||
|
||||
# Note: The Pytorch implementation of Spleeter is not working correctly. Fixing this would significantly
|
||||
# speed up the script since we can separate out dataloading and do batch inference.
|
||||
def main():
    """Separate every clip under a fixed directory and log the noisy ones.

    Clips are loaded through ``SpleeterDataset`` and separated into vocals
    and accompaniment. The path of each clip whose vocal-to-background mean
    absolute amplitude ratio is below 4 is appended to ``unacceptable.txt``.
    The first and last second of every clip are excluded from the measurement.
    """
    src_dir = 'F:\\split\\joe_rogan'
    output_sample_rate = 22050
    batch_size = 16

    dl = DataLoader(SpleeterDataset(src_dir, output_sample_rate, skip=batch_size*33000), batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)
    separator = Separator('pretrained_models/2stems', input_sr=output_sample_rate)
    # The context manager closes the log even when a batch raises.
    with open('unacceptable.txt', 'a') as unacceptable_files:
        for batch in tqdm(dl):
            waves = batch['wave']
            paths = batch['path']
            # Relies on the dataset yielding a 'duration' entry per clip.
            durations = batch['duration']

            sep = separator.separate(waves)
            for j in range(sep['vocals'].shape[0]):
                vocals = sep['vocals'][j][:durations[j]]
                bg = sep['accompaniment'][j][:durations[j]]
                # Slicing by the sample rate trims one second from each end.
                vocal_level = np.abs(vocals[output_sample_rate:-output_sample_rate]).mean()
                bg_level = np.abs(bg[output_sample_rate:-output_sample_rate]).mean()

                # Record the clip as unacceptable when the vocals are not loud
                # enough relative to the background.
                ratio = vocal_level / (bg_level+.0000001)
                if ratio < 4:  # These values were derived empirically
                    unacceptable_files.write(f'{paths[j]}\n')
                    unacceptable_files.flush()
|
||||
|
||||
|
||||
# Uses torch spleeter to divide audio clips into one of two bins:
|
||||
# 1. Audio has little to no background noise, saved to "output_dir"
|
||||
# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -0,0 +1,28 @@
|
|||
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
|
||||
import numpy as np
|
||||
|
||||
def invert_spectrogram_and_save(args, queue):
    """Worker loop that flags clips whose vocals do not dominate the background.

    Work items are read from ``queue`` until a ``None`` sentinel arrives. Each
    item is a tuple ``(vocals, bg, wavlen, files, ends)``. Both spectrograms
    are inverted with ``separator.stft(..., inverse=True, length=wavlen)`` and
    the result is split per file at the offsets in ``ends``. For every file
    whose vocal-to-background mean absolute amplitude ratio is below 18, the
    first element of its ``files`` entry is appended as a line to ``args.out``.

    Parameters:
        args: Object exposing ``out``, the path of the file to append to.
        queue: Queue delivering the work items and the final ``None``.
    """
    separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)
    out_file = args.out
    # The context manager guarantees the file is closed even if a batch raises.
    with open(out_file, 'a') as unacceptable_files:
        while True:
            combo = queue.get()
            if combo is None:
                break
            vocals, bg, wavlen, files, ends = combo
            vocals = separator.stft(vocals, inverse=True, length=wavlen)
            bg = separator.stft(bg, inverse=True, length=wavlen)
            start = 0
            for path, end in zip(files, ends):
                vocal_level = np.abs(vocals[start:end]).mean()
                bg_level = np.abs(bg[start:end]).mean()
                start = end

                # Record the clip as unacceptable when the vocals are not loud
                # enough relative to the background; the epsilon avoids a
                # division by zero for silent backgrounds.
                ratio = vocal_level / (bg_level+.0000001)
                if ratio < 18:  # These values were derived empirically
                    unacceptable_files.write(f'{path[0]}\n')
                    unacceptable_files.flush()
|
|
@ -0,0 +1,69 @@
|
|||
from math import ceil
|
||||
|
||||
import numpy as np
|
||||
|
||||
from spleeter.audio.adapter import AudioAdapter
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
from data.util import find_audio_files
|
||||
from scripts.audio.preparation.spleeter_utils.spleeter_separator_mod import Separator
|
||||
|
||||
|
||||
class SpleeterDataset(Dataset):
    """Dataset whose items are whole batches of concatenated audio clips.

    Each item covers up to ``batch_sz`` consecutive files. Their waveforms
    are concatenated into one array and passed through ``separator.stft``.

    Parameters:
        src_dir: Location scanned with ``find_audio_files(..., include_nonwav=True)``.
        batch_sz: Number of files combined into one item.
        max_duration: Stored on the instance; not used by this class itself.
        sample_rate: Sample rate requested from the audio loader.
        partition: Index of the partition to keep (used with ``partition_size``).
        partition_size: Number of files per partition; ``None`` disables partitioning.
        resume: Substring of a file path; files before the first match are dropped.

    Raises:
        ValueError: If ``resume`` is given but matches none of the files.
    """

    def __init__(self, src_dir, batch_sz, max_duration, sample_rate=22050, partition=None, partition_size=None, resume=None):
        self.batch_sz = batch_sz
        self.max_duration = max_duration
        self.files = find_audio_files(src_dir, include_nonwav=True)
        self.sample_rate = sample_rate
        self.separator = Separator('spleeter:2stems', multiprocess=False, load_tf=False)

        # Partition files if needed.
        if partition_size is not None:
            psz = int(partition_size)
            prt = int(partition)
            self.files = self.files[prt * psz:(prt + 1) * psz]

        # Find the resume point and carry on from there.
        if resume is not None:
            self.files = self._files_from_resume_point(self.files, resume)
        self.loader = AudioAdapter.default()

    @staticmethod
    def _files_from_resume_point(files, resume):
        """Return ``files`` starting at the first path that contains ``resume``."""
        for index, path in enumerate(files):
            if resume in path:
                return files[index:]
        # Falling through means no match; resuming from an arbitrary file
        # would silently skip work, so fail loudly instead.
        raise ValueError(f'Resume point {resume!r} does not match any file')

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return ceil(len(self.files) / self.batch_sz)

    def __getitem__(self, item):
        """Load batch ``item`` and return its audio, file list, ends and STFT.

        Returns a dict with:
            'audio': the concatenated waveform of all files that loaded,
            'files': the paths that loaded successfully,
            'ends':  the cumulative end offset of each file inside 'audio',
            'stft':  the result of ``self.separator.stft`` on 'audio'.
        Files that fail to load are reported on stdout and skipped.
        """
        first = item * self.batch_sz
        last = min(first + self.batch_sz, len(self.files))
        wavs = None
        files = []
        ends = []
        for ind in range(first, last):
            path = self.files[ind]
            try:
                wav, sr = self.loader.load(path, sample_rate=self.sample_rate)
                if sr != self.sample_rate:
                    raise ValueError(f'Unexpected sample rate {sr}')
                # Get rid of all channels except one.
                # NOTE(review): a single-channel file keeps its 2-D shape here;
                # concatenating it with 1-D waveforms raises and the file is
                # then skipped by the handler below - confirm this is intended.
                if wav.shape[1] > 1:
                    wav = wav[:, 0]

                wavs = wav if wavs is None else np.concatenate([wavs, wav])
                ends.append(wavs.shape[0])
                files.append(path)
            except Exception:
                # Best effort: a bad file must not abort the whole batch, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print(f'Error loading {path}')
        # NOTE(review): `wavs` is still None here when every file in the batch
        # failed to load - verify how `separator.stft` reacts to that.
        stft = self.separator.stft(wavs)
        return {
            'audio': wavs,
            'files': files,
            'ends': ends,
            'stft': stft
        }
|
|
@ -9,7 +9,7 @@ from spleeter.audio.adapter import AudioAdapter
|
|||
import numpy as np
|
||||
|
||||
|
||||
# Uses spleeter to divide audio clips into one of two bins:
|
||||
# Uses spleeter_utils to divide audio clips into one of two bins:
|
||||
# 1. Audio has little to no background noise, saved to "output_dir"
|
||||
# 2. Audio has a lot of background noise, bg noise split off and saved to "output_dir_bg"
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in New Issue
Block a user