Improve efficiency of audio_with_noise_dataset

James Betker 2022-03-08 15:50:13 -07:00
parent b3def182de
commit 38fd9fc985
2 changed files with 13 additions and 8 deletions


@@ -8,6 +8,7 @@ import torchaudio
from torch.utils.data import Dataset
from tqdm import tqdm
import torch.nn.functional as F
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
@@ -95,6 +96,8 @@ class AudioWithNoiseDataset(Dataset):
out = self.underlying_dataset[item]
clip = out['clip']
dlen = clip.shape[-1]
clip = clip[:, :out['clip_lengths']]
augpath = ''
augvol = 0
try:
@@ -149,15 +152,17 @@ class AudioWithNoiseDataset(Dataset):
# Apply the GSM codec to simulate cellular phone audio.
clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
except:
print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
print(sys.exc_info())
#print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
#print(sys.exc_info())
#raise # Uncomment to surface exceptions.
return self[item]
clip.clip_(-1, 1)
# Restore padding.
clip = F.pad(clip, (0, dlen-clip.shape[-1]))
out['clip'] = clip
out['label'] = label
out['aug'] = aug
#out['aug'] = aug
out['augpath'] = augpath
out['augvol'] = augvol
out['clipvol'] = clipvol
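
The efficiency gain in this hunk comes from trimming the clip to its true length before running augmentations (including the GSM codec), then restoring the fixed padded length with F.pad afterwards. A minimal sketch of that trim-then-repad pattern, assuming clip is a (channels, samples) tensor padded to a fixed size and true_len is the unpadded sample count (the function name and signature below are illustrative, not the repository's API):

import torch
import torch.nn.functional as F
import torchaudio

def augment_trimmed(clip: torch.Tensor, true_len: int, sampling_rate: int) -> torch.Tensor:
    padded_len = clip.shape[-1]
    # Work on the real audio only; augmenting zero padding wastes compute.
    clip = clip[:, :true_len]
    # Apply the GSM codec to simulate cellular phone audio.
    clip = torchaudio.functional.apply_codec(clip, sampling_rate, format="gsm")
    clip.clip_(-1, 1)
    # Restore the fixed length expected by downstream batching.
    return F.pad(clip, (0, padded_len - clip.shape[-1]))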
@@ -170,10 +175,10 @@ class AudioWithNoiseDataset(Dataset):
if __name__ == '__main__':
params = {
'mode': 'unsupervised_audio_with_noise',
'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
'cache_path': 'E:\\audio\\remote-cache3.pth',
'path': ['y:/clips/books1'],
'cache_path': 'E:\\audio\\remote-cache4.pth',
'sampling_rate': 22050,
'pad_to_samples': 80960,
'pad_to_samples': 400000,
'phase': 'train',
'n_workers': 0,
'batch_size': 4,
@@ -191,7 +196,7 @@ if __name__ == '__main__':
i = 0
for b in tqdm(dl):
for b_ in range(b['clip'].shape[0]):
#torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
#torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
i += 1
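
One thing worth noting in the except branch above: because the underlying dataset returns different data for the same index, return self[item] resamples a fresh clip rather than retrying the identical one, so transient augmentation failures are simply skipped. A sketch of that retry-on-failed-aug pattern (the wrapper class below is hypothetical, for illustration only):

import torch
from torch.utils.data import Dataset

class ResampleOnFailure(Dataset):
    def __init__(self, underlying, augment_fn):
        self.underlying = underlying
        self.augment_fn = augment_fn

    def __len__(self):
        return len(self.underlying)

    def __getitem__(self, item):
        out = self.underlying[item]
        try:
            out['clip'] = self.augment_fn(out['clip'])
        except Exception:
            # Augs fail occasionally (e.g. codec errors). Since the underlying
            # dataset is stochastic, indexing again yields a fresh sample; note
            # this recurses forever if failures are deterministic.
            return self[item]
        return out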


@@ -37,7 +37,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
In practice, this means two things:
1) Index {i} of this dataset means nothing: fetching from the same index will almost always return different data.
As a result, this dataset should not be used for validation or test runs.
As a result, this dataset should not be used for validation or test runs. Use the PairedVoiceAudio dataset instead.
2) This dataset has a slight bias for items with longer text or longer filenames.
The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
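
To make the indexing caveat concrete, here is a toy dataset (not the actual FastPairedVoiceDataset implementation) whose __getitem__ ignores the requested index; a validation loop that fetches ds[0] twice would see two different items:

import random
from torch.utils.data import Dataset

class RandomPairedDataset(Dataset):
    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        # Length is nominal; it only controls epoch size.
        return len(self.paths)

    def __getitem__(self, i):
        # The index is ignored: every fetch draws a fresh random item,
        # which is why fixed-index validation fetches are meaningless.
        return random.choice(self.paths)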