forked from mrq/DL-Art-School
Improve efficiency of audio_with_noise_dataset
This commit is contained in:
parent b3def182de
commit 38fd9fc985
@@ -8,6 +8,7 @@ import torchaudio
 from torch.utils.data import Dataset
 from tqdm import tqdm
 
+import torch.nn.functional as F
 from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
 from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
 
@@ -95,6 +96,8 @@ class AudioWithNoiseDataset(Dataset):
 
         out = self.underlying_dataset[item]
         clip = out['clip']
+        dlen = clip.shape[-1]
+        clip = clip[:, :out['clip_lengths']]
         augpath = ''
         augvol = 0
         try:
@@ -149,15 +152,17 @@ class AudioWithNoiseDataset(Dataset):
                 # Apply the GSM codec to simulate cellular phone audio.
                 clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
         except:
-            print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
-            print(sys.exc_info())
+            #print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
+            #print(sys.exc_info())
             #raise  # Uncomment to surface exceptions.
             return self[item]
 
         clip.clip_(-1, 1)
+        # Restore padding.
+        clip = F.pad(clip, (0, dlen - clip.shape[-1]))
         out['clip'] = clip
         out['label'] = label
-        out['aug'] = aug
+        #out['aug'] = aug
         out['augpath'] = augpath
         out['augvol'] = augvol
         out['clipvol'] = clipvol
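The efficiency gain is the trim/re-pad pattern added above: the clip is cut down to its true length (out['clip_lengths']) before the augmentation and codec work, then padded back to its original size afterwards, presumably so batching still sees fixed-size tensors. A minimal standalone sketch of that pattern (the function name and the placeholder augmentation step are illustrative, not part of the commit):

import torch
import torch.nn.functional as F

def augment_unpadded(clip: torch.Tensor, clip_length: int) -> torch.Tensor:
    # Remember the padded length so the output keeps the same shape.
    dlen = clip.shape[-1]
    # Trim trailing padding so augmentation only touches real audio.
    clip = clip[:, :clip_length]
    # ... noise mixing / codec augmentation would run here on the shorter clip ...
    clip = clip.clip(-1, 1)
    # Restore the original padded length.
    return F.pad(clip, (0, dlen - clip.shape[-1]))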
@@ -170,10 +175,10 @@ class AudioWithNoiseDataset(Dataset):
 if __name__ == '__main__':
     params = {
         'mode': 'unsupervised_audio_with_noise',
-        'path': ['\\\\192.168.5.3\\rtx3080_audio\\split\\cleaned\\books0'],
-        'cache_path': 'E:\\audio\\remote-cache3.pth',
+        'path': ['y:/clips/books1'],
+        'cache_path': 'E:\\audio\\remote-cache4.pth',
         'sampling_rate': 22050,
-        'pad_to_samples': 80960,
+        'pad_to_samples': 400000,
         'phase': 'train',
         'n_workers': 0,
         'batch_size': 4,
@@ -191,7 +196,7 @@ if __name__ == '__main__':
     i = 0
     for b in tqdm(dl):
        for b_ in range(b['clip'].shape[0]):
-            #torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
+            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
             #torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
             print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
             i += 1
@@ -37,7 +37,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
 
     In practice, this means two things:
     1) Index {i} of this dataset means nothing: fetching from the same index will almost always return different data.
-       As a result, this dataset should not be used for validation or test runs.
+       As a result, this dataset should not be used for validation or test runs. Use PairedVoiceAudio dataset instead.
     2) This dataset has a slight bias for items with longer text or longer filenames.
 
     The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
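The "slight bias for items with longer text or longer filenames" noted in the docstring is characteristic of sampling by random byte offset into a flat, line-per-item index file (longer lines span more bytes and are hit more often), which would also explain why the same index rarely returns the same item. A minimal sketch of that idea, assuming such an index file exists; sample_random_line is a hypothetical helper, not the dataset's actual loader:

import random

def sample_random_line(index_path: str) -> str:
    with open(index_path, 'rb') as f:
        f.seek(0, 2)                   # seek to the end to learn the file size
        size = f.tell()
        f.seek(random.randint(0, max(size - 1, 0)))
        f.readline()                   # discard the (likely partial) line we landed in
        line = f.readline()            # take the next complete line
        if not line:                   # landed in the final line; wrap to the start
            f.seek(0)
            line = f.readline()
    return line.decode('utf-8', errors='ignore').strip()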