DL-Art-School/codes/scripts/audio/librivox/produce_libri_stretched_dataset.py

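# Cuts full-chapter LibriTTS recordings into clips sized for a target MEL length and saves each clip alongside the
# sample offsets at which the sentences inside it start and end.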
import os
import random

import audio2numpy
import torch
from scipy.io import wavfile
from tqdm import tqdm

from utils.audio_resampler import AudioResampler


def secs_to_frames(secs, sr):
    """Convert a time in seconds to a sample index at sample rate ``sr``.

    The product is passed through ``int()``, so fractional samples are
    truncated toward zero rather than rounded.
    """
    frame_position = secs * sr
    return int(frame_position)


def get_audio_clip(audio, sr, start, end):
    """Slice the samples lying between two timestamps out of an audio buffer.

    Args:
        audio: Sliceable audio samples whose first axis is time; only
            ``audio.shape[0]`` and ``audio[a:b]`` are used.
        sr: Sample rate used to convert the timestamps to sample indices.
        start: Beginning of the window, in seconds.
        end: End of the window, in seconds. Once converted to a sample index
            it is used as an exclusive slice bound.

    Returns:
        ``audio[start_frame:end_frame]``, or ``None`` when the window extends
        past the end of ``audio``.

    Raises:
        AssertionError: If the window is empty or inverted after conversion
            to sample indices.
    """
    start_frame = secs_to_frames(start, sr)
    end_frame = secs_to_frames(end, sr)
    assert end_frame > start_frame
    # The end index is exclusive, so a window that finishes exactly at the last
    # sample (end_frame == audio.shape[0]) is still fully in bounds. Only reject
    # windows that genuinely run past the available audio.
    if end_frame > audio.shape[0]:
        return None
    return audio[start_frame:end_frame]


def gather_clip(audio, parsed_sentences, starting_index, sr, mel_length):
    """Cut one clip beginning near a sentence and collect the sentence boundaries inside it.

    The clip window starts a random amount (up to 2 seconds, clamped at 0)
    before the start of sentence ``starting_index`` and ends
    ``mel_length * 256 / sr`` seconds after that sentence's start. 256 is the
    MEL hop length, deliberately hard-coded here.

    NOTE(review): the window end is anchored to the sentence start, not to the
    shifted clip start, so the returned clip is longer than
    ``mel_length * 256`` samples by the random offset. Confirm whether a fixed
    length clip was intended.

    Args:
        audio: Full chapter audio, passed straight to ``get_audio_clip``.
        parsed_sentences: Rows of tab-split fields; fields 4 and 5 of each row
            are read as the sentence start and end times in seconds.
        starting_index: Row of the sentence that anchors the clip.
        sr: Sample rate of ``audio``.
        mel_length: Target number of MEL frames for the clip.

    Returns:
        A tuple ``(next_index, clip, start_points, end_points)``.
        On failure (``get_audio_clip`` returned ``None``) this is
        ``(starting_index + 1, None, [], [])``.
        On success ``next_index`` is one past the last sentence examined,
        ``start_points`` holds the sample offset (relative to the clip start)
        of every sentence that starts before the clip end, and ``end_points``
        holds the offset of every sentence that ends before the clip end.
    """
    def sentence_bounds(row_index):
        # Fields 4 and 5 carry the start/end timestamps as text.
        begin_text, finish_text = parsed_sentences[row_index][4:6]
        return float(begin_text), float(finish_text)

    clip_seconds = (mel_length * 256) / sr
    sent_start, sent_end = sentence_bounds(starting_index)
    clipstart = max(sent_start - random.random() * 2, 0)  # Pull the start back by up to 2 seconds
    clipend = sent_start + clip_seconds
    clip = get_audio_clip(audio, sr, clipstart, clipend)
    if clip is None:
        # The window does not fit in the audio; move on to the next sentence.
        return starting_index + 1, None, [], []

    start_points = [secs_to_frames(sent_start - clipstart, sr)]
    end_points = []
    cursor = starting_index
    # Walk forward while the current sentence finishes inside the clip.
    while sent_end < clipend:
        end_points.append(secs_to_frames(sent_end - clipstart, sr))
        cursor += 1
        if cursor >= len(parsed_sentences):
            break
        sent_start, sent_end = sentence_bounds(cursor)
        if sent_start < clipend:
            start_points.append(secs_to_frames(sent_start - clipstart, sr))

    return cursor + 1, clip, start_points, end_points


if __name__ == '__main__':
    # Script entry point: walks <libri_root>/<reader>/<chapter>, and for every chapter that has a
    # '<reader>_<chapter>.book.tsv' file, loads the matching full-chapter MP3, resamples it and cuts it
    # into clips. Each clip is written as '<n>.wav' with its sentence offsets in '<n>_se.pth'.
    full_book_root = 'D:\\data\\audio\\libritts\\full_books\\mp3'
    libri_root = 'D:\\data\\audio\\libritts\\test-clean'
    output_dir = 'D:\\data\\audio\\libritts\\stop_dataset_eval'
    desired_mel_length = 2000
    desired_audio_sample_rate = 22050

    os.makedirs(output_dir, exist_ok=True)
    clip_counter = 0  # Running index used to name the output files.
    for it, reader_dir in enumerate(tqdm(os.listdir(libri_root))):
        # To resume an interrupted run, skip readers here (e.g. `if it <= 145: continue`) and start
        # clip_counter at the next unused output index so existing files are not overwritten.
        reader = os.path.join(libri_root, reader_dir)
        if not os.path.isdir(reader):
            continue
        for chapter_dir in os.listdir(reader):
            chapter = os.path.join(reader, chapter_dir)
            if not os.path.isdir(chapter):
                continue
            chapter_id = f'{os.path.basename(reader)}_{os.path.basename(chapter)}'
            book_file = os.path.join(chapter, f'{chapter_id}.book.tsv')
            if not os.path.exists(book_file):
                continue
            mp3_path = os.path.join(full_book_root, reader_dir, chapter_dir, f'{chapter_dir}.mp3')
            with open(book_file, encoding='utf-8') as f:
                waveform, source_sr = audio2numpy.open_audio(mp3_path)
                waveform = torch.tensor(waveform)
                if len(waveform.shape) > 1:
                    waveform = waveform[:, 0]  # Multi-channel input: keep the first channel only.
                resampler = AudioResampler(source_sr, desired_audio_sample_rate, dtype=torch.float)
                # The resampler is fed a tensor with a leading axis of size 1, which is dropped again afterwards.
                waveform = resampler(waveform.unsqueeze(0)).squeeze(0)
                parsed_sentences = [line.strip().split('\t') for line in f]
                sentence_index = 0
                while sentence_index < len(parsed_sentences):
                    sentence_index, clip, sentence_starts, sentence_ends = gather_clip(
                        waveform, parsed_sentences, sentence_index,
                        desired_audio_sample_rate, desired_mel_length)
                    if clip is None:
                        continue
                    wav_path = os.path.join(output_dir, f'{clip_counter}.wav')
                    boundaries_path = os.path.join(output_dir, f'{clip_counter}_se.pth')
                    wavfile.write(wav_path, desired_audio_sample_rate, clip.cpu().numpy())
                    torch.save((sentence_starts, sentence_ends), boundaries_path)
                    clip_counter += 1
Stop dataset - attempt #2 2021-08-19 00:29:38 +00:00			`# Combines all libriTTS WAV->text mappings into a single file`
			`import os`
			`import random`

			`import audio2numpy`
			`import torch`
			`from scipy.io import wavfile`
			`from tqdm import tqdm`

			`from utils.audio_resampler import AudioResampler`


			`def secs_to_frames(secs, sr):`
			`return int(secs*sr)`


			`def get_audio_clip(audio, sr, start, end):`
			`start = secs_to_frames(start, sr)`
			`end = secs_to_frames(end, sr)`
			`assert end > start`
			`if end >= audio.shape[0]:`
			`return None`
			`return audio[start:end]`


			`# Produces an audio clip that would produce a MEL spectrogram of length mel_length by parsing parsed_sentences starting`
			`# at starting_index and moving forwards until the full length is finished.`
			`# Returns:`
			`# On failure, returns tuple: (end_index, None, [], [])`
			`# On success: returns tuple: (end_index, clip, start_points, end_points)`
			`# clip.shape = (<mel_length*256>,)`
			`# start_points = list(ints) where each sentence in the clip starts`
			`# end_points = list(ints) where each sentence in the clip ends`
			`def gather_clip(audio, parsed_sentences, starting_index, sr, mel_length):`
			`audio_length = (mel_length * 256) / sr # This is technically a hyperparameter, but I have no intent of changing the MEL hop length.`
			`starts = []`
			`ends = []`
			`start, end = parsed_sentences[starting_index][4:6]`
			`start = float(start)`
			`end = float(end)`
			`clipstart = max(start - random.random() * 2, 0) # Offset start backwards by up to 2 seconds`
			`clipend = start + audio_length`
			`clip = get_audio_clip(audio, sr, clipstart, clipend)`
			`if clip is not None:`
			`# Fetch the start and endpoints that go along with this clip.`
			`starts.append(secs_to_frames(start-clipstart, sr))`
			`while end < clipend:`
			`ends.append(secs_to_frames(end-clipstart, sr))`
			`starting_index += 1`
			`if starting_index >= len(parsed_sentences):`
			`break`
			`start, end = parsed_sentences[starting_index][4:6]`
			`start = float(start)`
			`end = float(end)`
			`if start < clipend:`
			`starts.append(secs_to_frames(start-clipstart, sr))`

			`return starting_index+1, clip, starts, ends`


			`if __name__ == '__main__':`
			`full_book_root = 'D:\\data\\audio\\libritts\\full_books\\mp3'`
			`libri_root = 'D:\\data\\audio\\libritts\\test-clean'`
			`desired_mel_length = 2000`
			`desired_audio_sample_rate = 22050`
			`output_dir = 'D:\\data\\audio\\libritts\\stop_dataset_eval'`

			`os.makedirs(output_dir, exist_ok=True)`
			`j = 0`
			`readers = os.listdir(libri_root)`
			`for it, reader_dir in enumerate(tqdm(readers)):`
			`#if it <= 145: # Hey idiot! If you change this, change j too!`
			`# continue`
			`reader = os.path.join(libri_root, reader_dir)`
			`if not os.path.isdir(reader):`
			`continue`
			`for chapter_dir in os.listdir(reader):`
			`chapter = os.path.join(reader, chapter_dir)`
			`if not os.path.isdir(chapter):`
			`continue`
			`id = f'{os.path.basename(reader)}_{os.path.basename(chapter)}'`
			`book_file = os.path.join(chapter, f'{id}.book.tsv')`
			`if not os.path.exists(book_file):`
			`continue`
			`with open(book_file, encoding='utf-8') as f:`
			`full_chapter, sr = audio2numpy.open_audio(os.path.join(full_book_root, reader_dir, chapter_dir, f'{chapter_dir}.mp3'))`
			`full_chapter = torch.tensor(full_chapter)`
			`if len(full_chapter.shape) > 1:`
			`full_chapter = full_chapter[:, 0] # Only use mono-audio.`
			`resampler = AudioResampler(sr, desired_audio_sample_rate, dtype=torch.float)`
			`full_chapter = resampler(full_chapter.unsqueeze(0)).squeeze(0)`
			`parsed_sentences = [line.strip().split('\t') for line in f]`
			`i = 0`
			`while i < len(parsed_sentences):`
			`i, clip, ns, ne = gather_clip(full_chapter, parsed_sentences, i, desired_audio_sample_rate, desired_mel_length)`
			`if clip is not None:`
			`wavfile.write(os.path.join(output_dir, f'{j}.wav'), desired_audio_sample_rate, clip.cpu().numpy())`
			`torch.save((ns,ne), os.path.join(output_dir, f'{j}_se.pth'))`
			`j += 1`