# DL-Art-School/codes/scripts/audio/librivox/produce_libri_stretched_dataset.py

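# Cuts long clips out of full-chapter LibriTTS recordings, using the sentence timings in each chapter's .book.tsv, and
# writes every clip as a WAV file next to a .pth file holding the sentence start/end frame offsets inside that clip.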
import os
import random

import audio2numpy
import torch
from scipy.io import wavfile
from tqdm import tqdm

from utils.audio_resampler import AudioResampler


def secs_to_frames(secs, sr):
    """Convert a duration in seconds to a whole number of frames at sample rate ``sr``.

    The product is passed through ``int()``, so any fractional frame is truncated toward zero.
    """
    frame_position = secs * sr
    return int(frame_position)


def get_audio_clip(audio, sr, start, end):
    """Slice the frames lying between two timestamps out of an audio buffer.

    Args:
        audio: Audio buffer exposing ``shape``; it is sliced along its first axis.
        sr: Sample rate used to turn the timestamps into frame indices.
        start: Clip start, in seconds.
        end: Clip end, in seconds. After conversion it is used as an exclusive slice bound.

    Returns:
        ``audio[start_frame:end_frame]``, or ``None`` when the clip would run past the end of ``audio``.

    Raises:
        AssertionError: if the end frame does not come strictly after the start frame.
    """
    start_frame = secs_to_frames(start, sr)
    end_frame = secs_to_frames(end, sr)
    assert end_frame > start_frame
    # end_frame is an exclusive bound, so a clip ending exactly on the last frame (end_frame == length) still fits;
    # only a bound beyond the buffer length means there is not enough audio left.
    if end_frame > audio.shape[0]:
        return None
    return audio[start_frame:end_frame]


def gather_clip(audio, parsed_sentences, starting_index, sr, mel_length):
    """Cut one clip out of ``audio`` beginning near a sentence and collect the sentence boundaries inside it.

    The clip window opens up to 2 seconds (chosen at random) before the start of the sentence at
    ``starting_index``, clamped to 0, and closes ``mel_length * 256 / sr`` seconds after that sentence's start.
    The 256 is the MEL hop length, which is treated as fixed.

    NOTE(review): the original header stated ``clip.shape = (<mel_length*256>,)``, but because the window closes
    relative to the sentence start rather than the jittered clip start, the clip can be up to 2 seconds longer
    than that. Confirm which of the two is intended.

    Args:
        audio: Full audio buffer, forwarded to ``get_audio_clip``.
        parsed_sentences: Rows of fields; fields 4 and 5 of each row are parsed as the sentence start and end
            times in seconds.
        starting_index: Index of the sentence the clip is anchored on.
        sr: Sample rate of ``audio``.
        mel_length: Target MEL spectrogram length used to size the window.

    Returns:
        Tuple ``(end_index, clip, start_points, end_points)``.
        On failure (``get_audio_clip`` returned ``None``): ``(starting_index + 1, None, [], [])``.
        On success: ``end_index`` is one past the last sentence examined, ``start_points`` / ``end_points`` are
        lists of int frame offsets, relative to the clip start, where sentences inside the window start / end.
    """
    def sentence_bounds(index):
        # Fields 4 and 5 of a row carry the start/end timestamps as text.
        begin, finish = parsed_sentences[index][4:6]
        return float(begin), float(finish)

    window_secs = (mel_length * 256) / sr
    sent_start, sent_end = sentence_bounds(starting_index)
    backward_jitter = random.random() * 2  # Offset start backwards by up to 2 seconds
    clipstart = max(sent_start - backward_jitter, 0)
    clipend = sent_start + window_secs
    clip = get_audio_clip(audio, sr, clipstart, clipend)

    start_points = []
    end_points = []
    if clip is None:
        # Not enough audio left for this window; move on to the next sentence.
        return starting_index + 1, clip, start_points, end_points

    cursor = starting_index
    start_points.append(secs_to_frames(sent_start - clipstart, sr))
    # Walk forward while the current sentence finishes inside the window.
    while sent_end < clipend:
        end_points.append(secs_to_frames(sent_end - clipstart, sr))
        cursor += 1
        if cursor >= len(parsed_sentences):
            break
        sent_start, sent_end = sentence_bounds(cursor)
        # A sentence that begins inside the window contributes a start point even if it is cut off by the end.
        if sent_start < clipend:
            start_points.append(secs_to_frames(sent_start - clipstart, sr))

    return cursor + 1, clip, start_points, end_points


if __name__ == '__main__':
    # Hard-coded locations and target format for this run.
    full_book_root = 'D:\\data\\audio\\libritts\\full_books\\mp3'
    libri_root = 'D:\\data\\audio\\libritts\\test-clean'
    desired_mel_length = 2000
    desired_audio_sample_rate = 22050
    output_dir = 'D:\\data\\audio\\libritts\\stop_dataset_eval'

    os.makedirs(output_dir, exist_ok=True)
    # Running counter used to name the output files; it spans all readers and chapters.
    clip_counter = 0
    # To resume a partial run, skip the already-processed readers here AND restore clip_counter to match.
    for reader_dir in tqdm(os.listdir(libri_root)):
        reader_path = os.path.join(libri_root, reader_dir)
        if not os.path.isdir(reader_path):
            continue
        for chapter_dir in os.listdir(reader_path):
            chapter_path = os.path.join(reader_path, chapter_dir)
            if not os.path.isdir(chapter_path):
                continue
            chapter_id = f'{reader_dir}_{chapter_dir}'
            book_file = os.path.join(chapter_path, f'{chapter_id}.book.tsv')
            if not os.path.exists(book_file):
                continue

            # One row of tab-separated fields per line of the chapter's book file.
            with open(book_file, encoding='utf-8') as tsv:
                parsed_sentences = [row.strip().split('\t') for row in tsv]

            mp3_path = os.path.join(full_book_root, reader_dir, chapter_dir, f'{chapter_dir}.mp3')
            full_chapter, sr = audio2numpy.open_audio(mp3_path)
            full_chapter = torch.tensor(full_chapter)
            if full_chapter.dim() > 1:
                full_chapter = full_chapter[:, 0]  # Only use mono-audio.
            # Bring the chapter to the target sample rate before cutting clips.
            resampler = AudioResampler(sr, desired_audio_sample_rate, dtype=torch.float)
            full_chapter = resampler(full_chapter.unsqueeze(0)).squeeze(0)

            sentence_index = 0
            while sentence_index < len(parsed_sentences):
                sentence_index, clip, ns, ne = gather_clip(
                    full_chapter, parsed_sentences, sentence_index, desired_audio_sample_rate, desired_mel_length)
                if clip is None:
                    continue
                # Each clip gets a WAV plus a companion file with its sentence start/end offsets.
                wavfile.write(os.path.join(output_dir, f'{clip_counter}.wav'), desired_audio_sample_rate,
                              clip.cpu().numpy())
                torch.save((ns, ne), os.path.join(output_dir, f'{clip_counter}_se.pth'))
                clip_counter += 1