forked from mrq/DL-Art-School
Update noise augmentation dataset to include voices that are appended at the end of another clip.
parent c4e4cf91a0
commit 726e30c4f7
@@ -98,6 +98,7 @@ class AudioWithNoiseDataset(Dataset):
        clip = out['clip']
        dlen = clip.shape[-1]
        clip = clip[:, :out['clip_lengths']]
        padding_room = dlen - clip.shape[-1]
        augpath = ''
        augvol = 0
        try:
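The new padding_room line above records how many padded samples were trimmed away, so a later branch can append a second voice into that space. A minimal sketch of the bookkeeping, with made-up lengths rather than values from the repo:

import torch

dlen = 16000                            # fixed, collate-friendly length of the padded clip
clip = torch.zeros(1, dlen)
clip[:, :9000] = torch.randn(1, 9000)   # pretend the real audio occupies the first 9000 samples
clip_lengths = 9000

clip = clip[:, :clip_lengths]           # drop the batch padding before augmenting
padding_room = dlen - clip.shape[-1]    # samples that can later be filled with an appended voice
print(padding_room)                     # 7000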
@@ -106,9 +107,9 @@ class AudioWithNoiseDataset(Dataset):
            clip = clip * clipvol

            label = random.randint(0, 4)  # Current excludes GSM corruption.
            #label = 2
            aug = torch.zeros_like(clip)
            label = 3
            if label > 0 and label < 4:  # 0 is basically "leave it alone"
                aug_needed = True
                augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
                if label == 1:
                    # Add environmental noise.
@@ -120,22 +121,38 @@ class AudioWithNoiseDataset(Dataset):
                    intg_fns = [_integration_fn_fully_enabled]
                    augvol *= .5  # Music is often severely in the background.
                elif label == 3:
                    # Add another voice.
                    augpath = random.choice(self.underlying_dataset.audiopaths)
                    intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
                    aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
                    if aug.shape[1] > clip.shape[1]:
                        n, cn = aug.shape[1], clip.shape[1]
                        gap = n-cn
                        placement = random.randint(0, gap)
                        aug = aug[:, placement:placement+cn]
                    aug = random.choice(intg_fns)(aug.shape[1]) * aug
                    aug = aug * augvol
                    if aug.shape[1] < clip.shape[1]:
                        gap = clip.shape[1] - aug.shape[1]
                        placement = random.randint(0, gap-1)
                        aug = torch.nn.functional.pad(aug, (placement, gap-placement))
                    clip = clip + aug
                    # This can take two forms:
                    if padding_room < 22000 or random.random() < .5:
                        # (1) The voices talk over one another. If there is no padding room, we always take this choice.
                        intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
                    else:
                        # (2) There are simply two voices in the clip, separated from one another.
                        # This is a special case that does not use the same logic as the rest of the augmentations.
                        aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
                        # Pad with some random silence
                        aug = F.pad(aug, (random.randint(20,4000), 0))
                        # Fit what we can given the padding room we have.
                        aug = aug[:, :padding_room]
                        clip = torch.cat([clip, aug], dim=1)
                        # Restore some meta-parameters.
                        padding_room = dlen - clip.shape[-1]
                        out['clip_lengths'] = clip.shape[-1]
                        aug_needed = False
                if aug_needed:
                    aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
                    if aug.shape[1] > clip.shape[1]:
                        n, cn = aug.shape[1], clip.shape[1]
                        gap = n-cn
                        placement = random.randint(0, gap)
                        aug = aug[:, placement:placement+cn]
                    aug = random.choice(intg_fns)(aug.shape[1]) * aug
                    aug = aug * augvol
                    if aug.shape[1] < clip.shape[1]:
                        gap = clip.shape[1] - aug.shape[1]
                        placement = random.randint(0, gap-1)
                        aug = torch.nn.functional.pad(aug, (placement, gap-placement))
                    clip = clip + aug
            elif label == 4:
                # Perform reverb (to simulate being in a large room with an omni-mic). This is performed by convolving
                # impulse recordings from openair over the input clip.
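Taken together, the label == 3 branch now chooses between overlaying the second voice and appending it after the first. Below is a self-contained sketch of that choice, not the repo's code: add_second_voice and aug_volume are stand-ins for the dataset's own helpers (load_audio and the _integration_fn_* envelopes), and both inputs are assumed to be mono [1, T] tensors at the same sample rate.

import random
import torch
import torch.nn.functional as F

def add_second_voice(clip, second_voice, padding_room, aug_volume=0.5):
    if padding_room < 22000 or random.random() < .5:
        # Form (1): the voices talk over one another. Trim or place the second
        # voice so it matches the clip's length, scale it down, and mix it in.
        if second_voice.shape[-1] > clip.shape[-1]:
            start = random.randint(0, second_voice.shape[-1] - clip.shape[-1])
            second_voice = second_voice[:, start:start + clip.shape[-1]]
        else:
            gap = clip.shape[-1] - second_voice.shape[-1]
            start = random.randint(0, max(gap - 1, 0))
            second_voice = F.pad(second_voice, (start, gap - start))
        return clip + aug_volume * second_voice
    else:
        # Form (2): the second voice is appended after the first, separated by a
        # short stretch of silence, and truncated to whatever padding room is left.
        second_voice = F.pad(second_voice, (random.randint(20, 4000), 0))
        second_voice = second_voice[:, :padding_room]
        return torch.cat([clip, second_voice], dim=1)

# Example: 4 s clip, 2 s second voice, 3 s of padding room at 22.05 kHz.
mixed = add_second_voice(torch.randn(1, 88200), torch.randn(1, 44100), padding_room=66150)
print(mixed.shape)

In form (2) the appended audio is truncated to padding_room, so the resulting clip never exceeds the fixed length the collator expects.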
@@ -159,7 +176,7 @@ class AudioWithNoiseDataset(Dataset):

        clip.clip_(-1, 1)
        # Restore padding.
        clip = F.pad(clip, (0, dlen-clip.shape[-1]))
        clip = F.pad(clip, (0, padding_room))
        out['clip'] = clip
        out['label'] = label
        #out['aug'] = aug
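The final pad restores the clip to its fixed pre-trim length so batching still works after a voice may have been appended. A tiny sketch of that invariant, with illustrative values rather than anything from the repo:

import torch
import torch.nn.functional as F

dlen = 16000
clip = torch.randn(1, 11000)            # e.g. 9000 real samples plus 2000 samples of appended voice
padding_room = dlen - clip.shape[-1]    # recomputed after the append, as in the hunk above
clip = F.pad(clip, (0, padding_room))   # pad back out to the fixed length the collator expects
assert clip.shape[-1] == dlen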
@@ -196,7 +213,7 @@ if __name__ == '__main__':
    i = 0
    for b in tqdm(dl):
        for b_ in range(b['clip'].shape[0]):
            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_], ds.sampling_rate)
            torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
            #torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
            print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
            i += 1
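The inspection loop now writes only the un-padded portion of each clip; since an appended voice changes clip_lengths, trimming by it keeps the saved .wav files free of trailing silence. A stand-alone sketch of that trim-before-save step (the tensors and filename are placeholders):

import torch
import torchaudio

sampling_rate = 22050
clip = torch.nn.functional.pad(torch.randn(1, 60000), (0, 28200))  # padded to a fixed length
clip_length = 60000                                                 # true length, as tracked by the dataset
torchaudio.save('example_clip.wav', clip[:, :clip_length], sampling_rate)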