# Forked from mrq/tortoise-tts. Original listing: 44 lines, 1.6 KiB, Python.
|
import numpy as np
import torch
import torchaudio
from scipy.io.wavfile import read
|
||
|
|
||
|
|
||
|
def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples as a float tensor.

    Integer PCM samples are divided by the magnitude of the most negative
    value of their dtype (2**31 for int32, 2**15 for int16) so they land in
    [-1, 1). float16/float32 samples are passed through unscaled.

    Args:
        full_path: Location of the WAV file, handed unchanged to ``read``.

    Returns:
        A ``(samples, sampling_rate)`` tuple. ``samples`` is a
        ``torch.FloatTensor`` with the same shape as the decoded array and
        ``sampling_rate`` is the rate reported by ``read``.

    Raises:
        NotImplementedError: If the decoded array has a dtype other than
            int32, int16, float16 or float32.
    """
    sampling_rate, data = read(full_path)
    if data.dtype == np.int32:
        norm_fix = 2 ** 31
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15
    elif data.dtype == np.float16 or data.dtype == np.float32:
        norm_fix = 1.
    else:
        # NotImplemented is the rich-comparison sentinel and is not callable;
        # the exception class is NotImplementedError.
        raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
    return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
|
||
|
|
||
|
|
||
|
def load_audio(audiopath, sampling_rate):
    """Load a .wav or .mp3 file as a single-channel tensor at ``sampling_rate``.

    The decoder is chosen from the file extension (case-insensitive). The
    decoded audio is reduced to its first channel, resampled when the file's
    rate differs from ``sampling_rate``, and clipped in place to [-1, 1].

    Args:
        audiopath: String path of the audio file; must end in ``.wav`` or
            ``.mp3``.
        sampling_rate: Sample rate the returned audio should have.

    Returns:
        The audio tensor with a leading dimension of size 1 added.

    Raises:
        ValueError: If the extension is not supported, or if the decoded
            audio is multi-dimensional and neither of its first two
            dimensions is smaller than 5 (so no channel axis can be
            identified).
    """
    extension = audiopath.lower()
    if extension.endswith('.wav'):
        audio, lsr = load_wav_to_torch(audiopath)
    elif extension.endswith('.mp3'):
        # https://github.com/neonbjb/pyfastmp3decoder - Definitely worth it.
        # Imported lazily so the decoder is only required for mp3 input.
        from pyfastmp3decoder.mp3decoder import load_mp3
        audio, lsr = load_mp3(audiopath, sampling_rate)
        audio = torch.FloatTensor(audio)
    else:
        # Without this branch `audio` would be unbound below and the caller
        # would see an UnboundLocalError instead of the real problem.
        raise ValueError(
            f"Unsupported audio format for {audiopath!r}; expected a .wav or .mp3 file"
        )

    # Remove any channel data: keep only the first channel. The channel axis
    # is taken to be whichever of the first two dimensions is smaller than 5.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            audio = audio[0]
        else:
            # Explicit raise rather than assert so the check survives `python -O`.
            if audio.shape[1] >= 5:
                raise ValueError(
                    f"Cannot identify the channel axis of {audiopath}: shape={tuple(audio.shape)}"
                )
            audio = audio[:, 0]

    if lsr != sampling_rate:
        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)

    # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch,
    # but might not be in some edge cases, where we should squawk.
    # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
    # The second condition fires when no sample is negative.
    if torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
    # Clipping is unconditional so every returned tensor is bounded to [-1, 1].
    audio.clip_(-1, 1)

    return audio.unsqueeze(0)
|