# Builds a "stop" evaluation dataset from LibriTTS: cuts fixed-length clips out of
# full-chapter audio and records the sample offsets at which each sentence starts
# and ends within every clip.
import os
import random

import audio2numpy
import torch
from scipy.io import wavfile
from tqdm import tqdm

from utils.audio_resampler import AudioResampler


def secs_to_frames(secs, sr):
    # Convert a time in seconds to a sample index at sample rate sr.
    return int(secs * sr)


def get_audio_clip(audio, sr, start, end):
    # Slice [start, end) seconds out of audio; returns None if the requested clip
    # runs past the end of the recording.
    start = secs_to_frames(start, sr)
    end = secs_to_frames(end, sr)
    assert end > start
    if end >= audio.shape[0]:
        return None
    return audio[start:end]


# Produces an audio clip that would produce a MEL spectrogram of length mel_length by parsing parsed_sentences,
# starting at starting_index and moving forward until the full length is consumed.
# Returns:
#  On failure: tuple (end_index, None, [], [])
#  On success: tuple (end_index, clip, start_points, end_points)
#   clip.shape = (n_samples,)
#   start_points = list of ints marking where each sentence in the clip starts
#   end_points = list of ints marking where each sentence in the clip ends
def gather_clip(audio, parsed_sentences, starting_index, sr, mel_length):
    # 256 is the MEL hop length. This is technically a hyperparameter, but I have no
    # intent of changing it. At the defaults (mel_length=2000, sr=22050) this works
    # out to 2000 * 256 / 22050 ~= 23.2 seconds of audio per clip.
    audio_length = (mel_length * 256) / sr
    starts = []
    ends = []
    # Columns 4 and 5 of each .book.tsv row hold the sentence's start/end time in seconds.
    start, end = parsed_sentences[starting_index][4:6]
    start = float(start)
    end = float(end)
    clipstart = max(start - random.random() * 2, 0)  # Offset start backwards by up to 2 seconds.
    clipend = start + audio_length
    clip = get_audio_clip(audio, sr, clipstart, clipend)
    if clip is not None:
        # Fetch the sentence start and end points that fall within this clip.
        starts.append(secs_to_frames(start - clipstart, sr))
        while end < clipend:
            ends.append(secs_to_frames(end - clipstart, sr))
            starting_index += 1
            if starting_index >= len(parsed_sentences):
                break
            start, end = parsed_sentences[starting_index][4:6]
            start = float(start)
            end = float(end)
            if start < clipend:
                starts.append(secs_to_frames(start - clipstart, sr))
    return starting_index + 1, clip, starts, ends


if __name__ == '__main__':
    full_book_root = 'D:\\data\\audio\\libritts\\full_books\\mp3'
    libri_root = 'D:\\data\\audio\\libritts\\test-clean'
    desired_mel_length = 2000
    desired_audio_sample_rate = 22050
    output_dir = 'D:\\data\\audio\\libritts\\stop_dataset_eval'
    os.makedirs(output_dir, exist_ok=True)

    j = 0
    readers = os.listdir(libri_root)
    for it, reader_dir in enumerate(tqdm(readers)):
        # Resume point; if you change this, change j to match.
        #if it <= 145:
        #    continue
        reader = os.path.join(libri_root, reader_dir)
        if not os.path.isdir(reader):
            continue
        for chapter_dir in os.listdir(reader):
            chapter = os.path.join(reader, chapter_dir)
            if not os.path.isdir(chapter):
                continue
            chapter_id = f'{os.path.basename(reader)}_{os.path.basename(chapter)}'
            book_file = os.path.join(chapter, f'{chapter_id}.book.tsv')
            if not os.path.exists(book_file):
                continue
            with open(book_file, encoding='utf-8') as f:
                full_chapter, sr = audio2numpy.open_audio(
                    os.path.join(full_book_root, reader_dir, chapter_dir, f'{chapter_dir}.mp3'))
                full_chapter = torch.tensor(full_chapter)
                if len(full_chapter.shape) > 1:
                    full_chapter = full_chapter[:, 0]  # Only use mono audio.
                resampler = AudioResampler(sr, desired_audio_sample_rate, dtype=torch.float)
                full_chapter = resampler(full_chapter.unsqueeze(0)).squeeze(0)
                parsed_sentences = [line.strip().split('\t') for line in f]
                i = 0
                while i < len(parsed_sentences):
                    i, clip, ns, ne = gather_clip(full_chapter, parsed_sentences, i,
                                                  desired_audio_sample_rate, desired_mel_length)
                    if clip is not None:
                        wavfile.write(os.path.join(output_dir, f'{j}.wav'), desired_audio_sample_rate,
                                      clip.cpu().numpy())
                        torch.save((ns, ne), os.path.join(output_dir, f'{j}_se.pth'))
                        j += 1
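
# Illustrative sketch (not executed by this script) of how a consumer might read
# one of the emitted wav/pth pairs back. The clip index 0 is hypothetical; any
# index written above works the same way:
#   sr, clip = wavfile.read(os.path.join(output_dir, '0.wav'))
#   starts, ends = torch.load(os.path.join(output_dir, '0_se.pth'))
#   # starts[k] / ends[k] are sample offsets into clip where sentence k begins / ends.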