From ed6eae407f18e6f15b211775633a812e3a9c69f8 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Mon, 30 Aug 2021 21:19:13 -0600
Subject: [PATCH] More scripts for splitting and formatting audio

---
 .../spleeter_split_voice_and_background.py    | 65 +++++++++++++++++++
 codes/scripts/audio/split_on_silence.py       | 42 ++++++++++++++
 codes/scripts/audio/test_audio_gen.py         |  2 +-
 .../audio/test_audio_speech_recognition.py    |  2 +-
 4 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 codes/scripts/audio/spleeter_split_voice_and_background.py
 create mode 100644 codes/scripts/audio/split_on_silence.py

diff --git a/codes/scripts/audio/spleeter_split_voice_and_background.py b/codes/scripts/audio/spleeter_split_voice_and_background.py
new file mode 100644
index 00000000..3ab3e37a
--- /dev/null
+++ b/codes/scripts/audio/spleeter_split_voice_and_background.py
@@ -0,0 +1,65 @@
+from scipy.io import wavfile
+from spleeter.separator import Separator
+from tqdm import tqdm
+
+from data.util import find_audio_files
+import os
+import os.path as osp
+from spleeter.audio.adapter import AudioAdapter
+import numpy as np
+
+
+# Uses spleeter to divide audio clips into one of two bins:
+# 1. Audio with little to no background noise: saved to "output_dir".
+# 2. Audio with a lot of background noise: the background stem is split off and saved to "output_dir_bg".
+if __name__ == '__main__':
+    src_dir = 'F:\\split\\books1'
+    output_dir = 'F:\\split\\cleaned\\books1'
+    output_dir_bg = 'F:\\split\\background-noise\\books1'
+    output_sample_rate = 22050
+
+    os.makedirs(output_dir_bg, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+
+    audio_loader = AudioAdapter.default()
+    separator = Separator('spleeter:2stems')
+    files = find_audio_files(src_dir, include_nonwav=True)
+    for e, file in enumerate(tqdm(files)):
+        # Build a flat, filesystem-safe output name from the file's relative path.
+        file_basis = osp.relpath(file, src_dir)\
+            .replace('/', '_')\
+            .replace('\\', '_')\
+            .replace('.', '_')\
+            .replace(' ', '_')\
+            .replace('!', '_')\
+            .replace(',', '_')
+        if len(file_basis) > 100:
+            file_basis = file_basis[:100]
+        try:
+            wave, sample_rate = audio_loader.load(file, sample_rate=output_sample_rate)
+        except Exception:
+            print(f"Error with {file}")
+            continue
+
+        sep = separator.separate(wave)
+        vocals = sep['vocals']
+        bg = sep['accompaniment']
+        vocal_level = np.abs(vocals).mean()
+        bg_level = np.abs(bg).mean()
+
+        # Only output to the "good" sample dir if the ratio of vocal signal to background noise is high enough.
+        ratio = vocal_level / (bg_level + .0000001)
+        if ratio >= 25:  # These thresholds were derived empirically.
+            od = output_dir
+            out = wave
+        elif ratio <= 1:
+            od = output_dir_bg
+            out = bg
+        else:
+            continue  # Ambiguous clips are discarded.
+
+        # Strip out extra channels; just use the first one.
+        if len(out.shape) > 1:
+            out = out[:, 0]
+
+        wavfile.write(osp.join(od, f'{e}_{file_basis}.wav'), output_sample_rate, out)
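
Note: the bin-selection logic above reduces to a mean-amplitude ratio test between the two spleeter stems. A minimal standalone sketch follows; the classify_clip helper and its keyword arguments are illustrative names, not part of the patch, and the default thresholds restate the empirically derived values used above.

    import numpy as np

    def classify_clip(separated, clean_threshold=25.0, noisy_threshold=1.0):
        # Mean absolute amplitude of each stem is a cheap proxy for its energy.
        vocal_level = np.abs(separated['vocals']).mean()
        bg_level = np.abs(separated['accompaniment']).mean()
        ratio = vocal_level / (bg_level + 1e-7)
        if ratio >= clean_threshold:
            return 'clean'       # vocals dominate: keep the original clip
        if ratio <= noisy_threshold:
            return 'background'  # background dominates: keep the background stem
        return 'discard'         # anything in between is ambiguous and dropped
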
diff --git a/codes/scripts/audio/split_on_silence.py b/codes/scripts/audio/split_on_silence.py
new file mode 100644
index 00000000..b5138d6d
--- /dev/null
+++ b/codes/scripts/audio/split_on_silence.py
@@ -0,0 +1,42 @@
+import argparse
+import os
+from pydub import AudioSegment
+from pydub.exceptions import CouldntDecodeError
+from pydub.silence import split_on_silence
+from data.util import find_audio_files
+from tqdm import tqdm
+
+
+# Uses pydub to process a directory of audio files, splitting them into clips at points where it detects a small
+# amount of silence.
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--path')
+    parser.add_argument('--out')
+    args = parser.parse_args()
+    minimum_duration = 5
+    maximum_duration = 20
+    files = find_audio_files(args.path, include_nonwav=True)
+    for e, wav_file in enumerate(tqdm(files)):
+        # Hardcoded resume point: skips files already handled by a previous, interrupted run.
+        if e < 4197:
+            continue
+        print(f"Processing {wav_file}..")
+        outdir = os.path.join(args.out, f'{e}_{os.path.splitext(os.path.basename(wav_file))[0]}').replace('.', '').strip()
+        os.makedirs(outdir, exist_ok=True)
+
+        try:
+            speech = AudioSegment.from_file(wav_file)
+        except CouldntDecodeError as err:
+            print(err)
+            continue
+        chunks = split_on_silence(speech, min_silence_len=300, silence_thresh=-40,
+                                  seek_step=100, keep_silence=50)
+
+        for i, chunk in enumerate(chunks):
+            if chunk.duration_seconds < minimum_duration or chunk.duration_seconds > maximum_duration:
+                continue
+            chunk.export(f"{outdir}/{i:05d}.wav", format='wav', parameters=["-ar", "22050", "-ac", "1"])
+
+if __name__ == '__main__':
+    main()
diff --git a/codes/scripts/audio/test_audio_gen.py b/codes/scripts/audio/test_audio_gen.py
index 0e65f7b1..e62b3c80 100644
--- a/codes/scripts/audio/test_audio_gen.py
+++ b/codes/scripts/audio/test_audio_gen.py
@@ -54,7 +54,7 @@ if __name__ == "__main__":
     torch.backends.cudnn.benchmark = True
     want_metrics = False
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_stop_pred_dataset.yml')
+    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_lrdvae_audio_clips.yml')
     opt = option.parse(parser.parse_args().opt, is_train=False)
     opt = option.dict_to_nonedict(opt)
     utils.util.loaded_options = opt
diff --git a/codes/scripts/audio/test_audio_speech_recognition.py b/codes/scripts/audio/test_audio_speech_recognition.py
index 4d1bfb8e..2de85809 100644
--- a/codes/scripts/audio/test_audio_speech_recognition.py
+++ b/codes/scripts/audio/test_audio_speech_recognition.py
@@ -44,7 +44,7 @@ if __name__ == "__main__":
     torch.backends.cudnn.benchmark = True
     want_metrics = False
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_gpt_asr_mozcv.yml')
+    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default='../options/test_gpt_asr_mass.yml')
     opt = option.parse(parser.parse_args().opt, is_train=False)
     opt = option.dict_to_nonedict(opt)
     utils.util.loaded_options = opt
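
Note: the pydub settings used in split_on_silence.py can be exercised on a single file with the minimal sketch below; 'input.wav' and 'out/' are placeholder paths, and the parameter values simply restate the ones hard-coded in the script.

    import os
    from pydub import AudioSegment
    from pydub.silence import split_on_silence

    speech = AudioSegment.from_file('input.wav')  # placeholder input path
    chunks = split_on_silence(
        speech,
        min_silence_len=300,  # a pause of at least 300 ms marks a split point
        silence_thresh=-40,   # anything quieter than -40 dBFS counts as silence
        seek_step=100,        # scan in 100 ms steps (faster, but coarser)
        keep_silence=50)      # keep 50 ms of silence at each clip boundary
    os.makedirs('out', exist_ok=True)  # placeholder output dir
    for i, chunk in enumerate(chunks):
        if 5 <= chunk.duration_seconds <= 20:  # same duration gate as the script
            # The ffmpeg flags resample to 22.05 kHz mono on export, as in the patch.
            chunk.export(f'out/{i:05d}.wav', format='wav', parameters=['-ar', '22050', '-ac', '1'])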