diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py
index 8f74e31f..6df37389 100644
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@@ -222,8 +222,8 @@ if __name__ == '__main__':
     batch_sz = 8
     params = {
         'mode': 'paired_voice_audio',
-        'path': ['Z:\\clips\\podcasts-0-transcribed.tsv'],
-        'fetcher_mode': ['tsv'],
+        'path': ['Y:\\bigasr_dataset\\hifi_tts_mp3\\test.txt'],
+        'fetcher_mode': ['libritts'],
         'phase': 'train',
         'n_workers': 0,
         'batch_size': batch_sz,
@@ -238,11 +238,20 @@ if __name__ == '__main__':
     }
     from data import create_dataset, create_dataloader
 
+    def save(b, i, ib, key, c=None):
+        if c is not None:
+            torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
+        else:
+            torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)
+
     ds, c = create_dataset(params, return_collate=True)
     dl = create_dataloader(ds, params, collate_fn=c)
     i = 0
     m = None
     for i, b in tqdm(enumerate(dl)):
         for ib in range(batch_sz):
-            print(f"text_seq: {b['text_lengths'].max()}, speech_seq: {b['wav_lengths'].max()//1024}")
+            print(f'{i} {ib} {b["real_text"][ib]}')
+            save(b, i, ib, 'wav')
+        if i > 5:
+            break
 
diff --git a/codes/data/audio/unsupervised_audio_dataset.py b/codes/data/audio/unsupervised_audio_dataset.py
index e82d857e..dd0ab5c1 100644
--- a/codes/data/audio/unsupervised_audio_dataset.py
+++ b/codes/data/audio/unsupervised_audio_dataset.py
@@ -38,9 +38,7 @@ def load_audio(audiopath, sampling_rate):
         audio = audio[:, 0]
 
     if lsr != sampling_rate:
-        #if lsr < sampling_rate:
-        #    warn(f'{audiopath} has a sample rate of {sampling_rate} which is lower than the requested sample rate of {sampling_rate}. This is not a good idea.')
-        audio = torch.nn.functional.interpolate(audio.unsqueeze(0).unsqueeze(1), scale_factor=sampling_rate/lsr, mode='nearest', recompute_scale_factor=False).squeeze()
+        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
 
     # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
     # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.