# Audio utilities introduced into codes/utils/util.py by the "misc audio support"
# patch (d2bdeb6f, James Betker, 2022-03-08), laid out as formatted Python.
import pathlib

import numpy as np
import torch

#''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
#''' AUDIO UTILS '''
#''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


def find_audio_files(base_path, globs=('*.wav', '*.mp3', '*.ogg', '*.flac')):
    """Recursively collect files under *base_path* that match any of *globs*.

    Args:
        base_path: Directory to search (anything ``pathlib.Path`` accepts).
        globs: Iterable of glob patterns handed to ``Path.rglob``. The default
            is an immutable tuple so it can never be mutated across calls.

    Returns:
        A list of path strings, grouped by pattern in the order the patterns
        were given. A file matching several patterns appears once per match.
    """
    root = pathlib.Path(base_path)
    return [str(match) for pattern in globs for match in root.rglob(pattern)]


def load_audio(audiopath, sampling_rate, raw_data=None):
    """Load an audio clip as a float tensor at *sampling_rate*, clipped to [-1, 1].

    Args:
        audiopath: Path of the file to read. Only used for reading when
            *raw_data* is None, but always used in diagnostic messages.
        sampling_rate: Target sampling rate; the clip is resampled with
            torchaudio when its native rate differs.
        raw_data: Optional in-memory WAV data. May be a file-like object
            (e.g. ``io.BytesIO``) or a raw ``bytes``-like buffer, which is
            wrapped in ``io.BytesIO`` automatically.

    Returns:
        The first channel of the clip as a ``torch.FloatTensor``.

    Raises:
        ValueError: If the decoded audio is multi-dimensional and neither of
            its first two dimensions looks like a channel axis (size < 5).
        NotImplementedError: Propagated from ``load_wav_to_torch`` for
            unsupported WAV sample dtypes.
    """
    if raw_data is not None:
        # Assume the data is wav format. SciPy's reader can read raw WAV data from a BytesIO wrapper.
        if isinstance(raw_data, (bytes, bytearray, memoryview)):
            import io
            raw_data = io.BytesIO(raw_data)
        audio, lsr = load_wav_to_torch(raw_data)
    elif str(audiopath).lower().endswith('.wav'):
        # str()/lower() so that pathlib.Path inputs and '.WAV' extensions also take the WAV path.
        audio, lsr = load_wav_to_torch(audiopath)
    else:
        # Imported lazily: audio2numpy is only needed for non-WAV formats.
        from audio2numpy import open_audio
        audio, lsr = open_audio(audiopath)
        audio = torch.FloatTensor(audio)

    # Remove any channel data. The channel axis is guessed to be whichever of the
    # first two dimensions is small (< 5); only the first channel is kept.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            audio = audio[0]
        elif audio.shape[1] < 5:
            audio = audio[:, 0]
        else:
            # Explicit raise instead of `assert`, which is stripped under `python -O`.
            raise ValueError(
                f"Cannot determine channel dimension for {audiopath}: shape={tuple(audio.shape)}")

    if lsr != sampling_rate:
        # Imported lazily: torchaudio is only needed when resampling is required.
        import torchaudio
        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)

    # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch,
    # but might not be in some edge cases, where we should squawk.
    # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
    if audio.numel() == 0:
        # max()/min() raise on empty tensors, so report emptiness separately.
        print(f"Error with {audiopath}. Audio is empty.")
    elif torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
    audio.clip_(-1, 1)

    return audio


def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples scaled by the integer full-scale value.

    Args:
        full_path: Filename or open file-like object accepted by
            ``scipy.io.wavfile.read``.

    Returns:
        A ``(audio, sampling_rate)`` tuple. ``audio`` is a ``torch.FloatTensor``
        with int16/int32 samples divided by 2**15 / 2**31 respectively and
        float16/float32 samples left unscaled.

    Raises:
        NotImplementedError: If the WAV sample dtype is not int16, int32,
            float16 or float32.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.io.wavfile` is loaded on every SciPy version.
    import scipy.io.wavfile

    sampling_rate, data = scipy.io.wavfile.read(full_path)
    if data.dtype == np.int32:
        norm_fix = 2 ** 31
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15
    elif data.dtype == np.float16 or data.dtype == np.float32:
        norm_fix = 1.
    else:
        # NotImplementedError is the exception class; `NotImplemented` is a
        # non-callable sentinel and would produce a confusing TypeError here.
        raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
    return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)