Improve efficiency of audio_with_noise_dataset

commit 38fd9fc985
parent b3def182de
James Betker, 2022-03-08 15:50:13 -07:00
2 changed files with 13 additions and 8 deletions

@@ -8,6 +8,7 @@ import torchaudio
 from torch.utils.data import Dataset
 from tqdm import tqdm
+import torch.nn.functional as F
 
 from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
 from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
@@ -95,6 +96,8 @@ class AudioWithNoiseDataset(Dataset):
         out = self.underlying_dataset[item]
         clip = out['clip']
+        dlen = clip.shape[-1]
+        clip = clip[:, :out['clip_lengths']]
         augpath = ''
         augvol = 0
         try:
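Note: the two added lines above are the core of the efficiency change. The underlying UnsupervisedAudioDataset pads every clip to a fixed length ('pad_to_samples'), and out['clip_lengths'] appears to hold the pre-padding sample count, so slicing the clip down to that length lets the augmentations below operate on real audio only instead of long runs of trailing zeros. A minimal sketch of the trim, with illustrative values (pad_to and true_len are assumptions, not values from this file):

    import torch

    pad_to = 400000                # fixed padded length, cf. 'pad_to_samples' below
    true_len = 66150               # hypothetical out['clip_lengths'] (~3s at 22050Hz)
    clip = torch.zeros(1, pad_to)  # padded clip as returned by the underlying dataset
    clip[:, :true_len] = torch.randn(1, true_len)

    trimmed = clip[:, :true_len]   # drop the trailing padding before augmenting
    assert trimmed.shape == (1, true_len)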
@@ -149,15 +152,17 @@ class AudioWithNoiseDataset(Dataset):
             # Apply the GSM codec to simulate cellular phone audio.
             clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
         except:
-            print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
-            print(sys.exc_info())
+            #print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
+            #print(sys.exc_info())
             #raise  # Uncomment to surface exceptions.
             return self[item]
         clip.clip_(-1, 1)
+        # Restore padding.
+        clip = F.pad(clip, (0, dlen-clip.shape[-1]))
         out['clip'] = clip
         out['label'] = label
-        out['aug'] = aug
+        #out['aug'] = aug
         out['augpath'] = augpath
         out['augvol'] = augvol
         out['clipvol'] = clipvol
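The F.pad call mirrors that trim: once augmentation finishes, the clip is zero-padded on the right back to dlen, the length it had on entry, so collated batches keep a uniform shape. Note also the retry pattern in the except branch: a failed augmentation simply re-fetches the same index, which re-rolls the random augmentation, and the prints are commented out because such failures are evidently routine. A sketch of the pad/trim round trip, reusing the hypothetical lengths above:

    import torch
    import torch.nn.functional as F

    dlen = 400000                # original padded length, captured before the trim
    aug = torch.randn(1, 66150)  # augmented, still-trimmed clip
    restored = F.pad(aug, (0, dlen - aug.shape[-1]))  # right-pad with zeros back to dlen
    assert restored.shape[-1] == dlen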
@@ -170,10 +175,10 @@ class AudioWithNoiseDataset(Dataset):
 if __name__ == '__main__':
     params = {
         'mode': 'unsupervised_audio_with_noise',
-        'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
-        'cache_path': 'E:\\audio\\remote-cache3.pth',
+        'path': ['y:/clips/books1'],
+        'cache_path': 'E:\\audio\\remote-cache4.pth',
         'sampling_rate': 22050,
-        'pad_to_samples': 80960,
+        'pad_to_samples': 400000,
         'phase': 'train',
         'n_workers': 0,
         'batch_size': 4,
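The jump in 'pad_to_samples' from 80960 to 400000 is what makes the trim-and-repad dance pay off: at large fixed pad lengths, most of each tensor would otherwise be zeros that the augmentations waste time processing. A back-of-envelope check, assuming a ~3s real clip at the configured 22050Hz:

    true_len = 3 * 22050             # hypothetical real clip length in samples
    for pad_to in (80960, 400000):
        print(pad_to, f'{1 - true_len / pad_to:.0%} of samples would be padding')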
@@ -191,7 +196,7 @@ if __name__ == '__main__':
     i = 0
     for b in tqdm(dl):
         for b_ in range(b['clip'].shape[0]):
-            #torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
+            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
             #torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
             print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
             i += 1
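With the first save call re-enabled, this harness writes every augmented clip to disk, which is a quick way to audition the noise and codec augmentations by ear and confirm the trimmed-then-repadded clips still sound right. The second save stays commented out, consistent with 'aug' no longer being returned above.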

@@ -37,7 +37,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
     In practice, this means two things:
     1) Index {i} of this dataset means nothing: fetching from the same index will almost always return different data.
-       As a result, this dataset should not be used for validation or test runs.
+       As a result, this dataset should not be used for validation or test runs. Use PairedVoiceAudio dataset instead.
     2) This dataset has a slight bias for items with longer text or longer filenames.
     The upshot is that this dataset loads extremely quickly and consumes almost no system memory.