From e5d97dfd5635558efa566a446579945f3d6cc1a9 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Fri, 8 Jul 2022 00:37:53 -0600
Subject: [PATCH] misc

---
 .../play_with_spectral_representations.py       | 11 ++++++++
 .../scripts/audio/prep_music/demucs_notes.txt   | 12 ++++++++-
 .../audio/prep_music/phase_1_split_files.py     |  6 ++---
 codes/utils/music_utils.py                      | 25 +++++++++++++++++++
 4 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 codes/scripts/audio/play_with_spectral_representations.py

diff --git a/codes/scripts/audio/play_with_spectral_representations.py b/codes/scripts/audio/play_with_spectral_representations.py
new file mode 100644
index 00000000..94acd87e
--- /dev/null
+++ b/codes/scripts/audio/play_with_spectral_representations.py
@@ -0,0 +1,11 @@
+import torchvision.utils
+
+from utils.music_utils import music2mel, music2cqt
+from utils.util import load_audio
+
+if __name__ == '__main__':
+    clip = load_audio('Y:\\split\\yt-music-eval\\00001.wav', 22050)
+    mel = music2mel(clip)
+    cqt = music2cqt(clip)
+    torchvision.utils.save_image((mel.unsqueeze(1) + 1) / 2, 'mel.png')
+    torchvision.utils.save_image((cqt.unsqueeze(1) + 1) / 2, 'cqt.png')
diff --git a/codes/scripts/audio/prep_music/demucs_notes.txt b/codes/scripts/audio/prep_music/demucs_notes.txt
index f54f03be..b8d61993 100644
--- a/codes/scripts/audio/prep_music/demucs_notes.txt
+++ b/codes/scripts/audio/prep_music/demucs_notes.txt
@@ -5,4 +5,14 @@ https://github.com/neonbjb/demucs
 conda activate demucs
 python setup.py install
 CUDA_VISIBLE_DEVICES=0 python -m demucs /y/split/bt-music-5 --out=/y/separated/bt-music-5 --num_workers=2 --device cuda --two-stems=vocals
-``
\ No newline at end of file
+```
+
+Example usage of generate_long_cheaters and generate_long_mels, post demucs:
+
+```
+CUDA_VISIBLE_DEVICES=0 python generate_long_mels.py --path=/y/separated/mpm/1 --progress_file=/y/separated/large_mels/mpm/already_processed.txt \
+--output_path=/y/separated/large_mels/mpm/1 --num_threads=2
+
+CUDA_VISIBLE_DEVICES=2 python generate_long_cheaters.py --path=/y/separated/large_mels/mpm/3 --progress_file=/y/separated/large_mel_cheaters/mpm/already_processed.txt \
+--output_path=/y/separated/large_mel_cheaters/mpm/3 --num_threads=1
+```
\ No newline at end of file
diff --git a/codes/scripts/audio/prep_music/phase_1_split_files.py b/codes/scripts/audio/prep_music/phase_1_split_files.py
index e24195c3..ce12e570 100644
--- a/codes/scripts/audio/prep_music/phase_1_split_files.py
+++ b/codes/scripts/audio/prep_music/phase_1_split_files.py
@@ -47,9 +47,9 @@ def process_file(file, base_path, output_path, progress_file, duration_per_clip,
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-path', type=str, help='Path to search for files', default='C:\\Users\\James\\Downloads\\soundcloud-dl\\sc2')
-    parser.add_argument('-progress_file', type=str, help='Place to store all files that have already been processed', default='C:\\Users\\James\\Downloads\\soundcloud-dl\\sc2\\already_processed.txt')
-    parser.add_argument('-output_path', type=str, help='Path for output files', default='Y:\\split\\soundcloud_mixes\\bigmix1')
+    parser.add_argument('-path', type=str, help='Path to search for files', default='Y:\\sources\\soundcloud-mixes\\mixes2')
+    parser.add_argument('-progress_file', type=str, help='Place to store all files that have already been processed', default='Y:\\sources\\soundcloud-mixes\\mixes2\\already_processed.txt')
+    parser.add_argument('-output_path', type=str, help='Path for output files', default='Y:\\split\\soundcloud-mixes2')
     parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=4)
     parser.add_argument('-duration', type=int, help='Duration per clip in seconds', default=30)
     args = parser.parse_args()
diff --git a/codes/utils/music_utils.py b/codes/utils/music_utils.py
index 788c1bb5..ff40f805 100644
--- a/codes/utils/music_utils.py
+++ b/codes/utils/music_utils.py
@@ -1,6 +1,31 @@
 import torch
 
 
+def music2mel(clip):
+    if len(clip.shape) == 1:
+        clip = clip.unsqueeze(0)
+
+    from trainer.injectors.audio_injectors import TorchMelSpectrogramInjector
+    inj = TorchMelSpectrogramInjector({'n_mel_channels': 256, 'mel_fmax': 11000, 'filter_length': 16000,
+                                       'normalize': True, 'true_normalization': True, 'in': 'in', 'out': 'out'}, {})
+    return inj({'in': clip})['out']
+
+
+def music2cqt(clip):
+    def normalize_cqt(cqt):
+        # CQT_MIN = 0
+        CQT_MAX = 18
+        return 2 * cqt / CQT_MAX - 1
+
+    if len(clip.shape) == 1:
+        clip = clip.unsqueeze(0)
+    from nnAudio.features.cqt import CQT
+    # Visually, filter_scale=.25 seems to be the most descriptive representation, but loses frequency fidelity.
+    # It may be desirable to mix filter_scale=.25 with filter_scale=1.
+    cqt = CQT(sr=22050, hop_length=256, n_bins=256, bins_per_octave=32, filter_scale=.25, norm=1, verbose=False)
+    return normalize_cqt(cqt(clip))
+
+
 def get_mel2wav_model():
     from models.audio.music.unet_diffusion_waveform_gen_simple import DiffusionWaveformGen
     model = DiffusionWaveformGen(model_channels=256, in_channels=16, in_mel_channels=256, out_channels=32, channel_mult=[1,2,3,4,4],
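
The comment in music2cqt suggests mixing filter_scale=.25 with filter_scale=1. A minimal sketch of what that mixing might look like, assuming the two CQT magnitudes are stacked on a channel dimension; music2cqt_mixed is a hypothetical helper, not part of this patch, and it reuses the patch's empirical CQT_MAX = 18 normalization without verifying it for filter_scale=1:

```
import torch
from nnAudio.features.cqt import CQT


def music2cqt_mixed(clip, sr=22050):
    # Hypothetical helper (not in the repo): combines a short-filter CQT
    # (filter_scale=.25, better time resolution) with a standard one
    # (filter_scale=1, better frequency fidelity).
    if len(clip.shape) == 1:
        clip = clip.unsqueeze(0)
    # Same settings as the patch's music2cqt; only filter_scale differs.
    cqt_sharp = CQT(sr=sr, hop_length=256, n_bins=256, bins_per_octave=32,
                    filter_scale=.25, norm=1, verbose=False)
    cqt_wide = CQT(sr=sr, hop_length=256, n_bins=256, bins_per_octave=32,
                   filter_scale=1, norm=1, verbose=False)
    # Each CQT yields (b, n_bins, t); stacking gives (b, 2, n_bins, t).
    mixed = torch.stack([cqt_sharp(clip), cqt_wide(clip)], dim=1)
    # Reuse the patch's ad-hoc normalization (CQT_MAX = 18 was observed for
    # filter_scale=.25; assumed, not verified, for filter_scale=1).
    return 2 * mixed / 18 - 1
```

A downstream model could consume the resulting 2-channel representation directly, trading the time resolution of the short filters against the frequency fidelity of the long ones.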