Allow processing of multiple audio sources at once from nv_tacotron_dataset

This commit is contained in:
James Betker 2021-08-14 16:04:05 -06:00
parent 007976082b
commit d6a73acaed
3 changed files with 57 additions and 12 deletions

View File

@ -18,7 +18,8 @@ from utils.util import opt_get
def load_mozilla_cv(filename):
    """Load ``[audio_path, text]`` pairs from a Mozilla Common Voice TSV file.

    The first line of the file is treated as the header and discarded. For
    each remaining non-blank row, column 1 is taken as the clip file name and
    column 2 as the text. Clip names are resolved relative to the directory
    that contains ``filename``, under its ``clips/`` sub-directory.

    Args:
        filename: Path to the TSV file.

    Returns:
        A list of ``[audio_path, text]`` lists, in file order.

    Raises:
        IndexError: If a non-blank row has fewer than three tab-separated
            columns.
    """
    base = os.path.dirname(filename)
    with open(filename, encoding='utf-8') as f:
        next(f, None)  # First line is the header
        stripped = (line.strip() for line in f)
        # Blank rows carry no columns and would only raise IndexError below.
        rows = [row.split('\t') for row in stripped if row]
    return [[os.path.join(base, f'clips/{row[1]}'), row[2]] for row in rows]
@ -29,15 +30,24 @@ class TextMelLoader(torch.utils.data.Dataset):
3) computes mel-spectrograms from audio files.
"""
def __init__(self, hparams):
self.path = os.path.dirname(hparams['path'])
self.path = hparams['path']
if not isinstance(self.path, list):
self.path = [self.path]
fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
if fetcher_mode == 'lj':
if not isinstance(fetcher_mode, list):
fetcher_mode = [fetcher_mode]
assert len(self.path) == len(fetcher_mode)
self.audiopaths_and_text = []
for p, fm in zip(self.path, fetcher_mode):
if fm == 'lj' or fm == 'libritts':
fetcher_fn = load_filepaths_and_text
elif fetcher_mode == 'mozilla_cv':
elif fm == 'mozilla_cv':
fetcher_fn = load_mozilla_cv
else:
raise NotImplementedError()
self.audiopaths_and_text = fetcher_fn(hparams['path'])
self.audiopaths_and_text.extend(fetcher_fn(p))
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
@ -61,7 +71,6 @@ class TextMelLoader(torch.utils.data.Dataset):
def get_mel_text_pair(self, audiopath_and_text):
    """Build one sample from an ``[audiopath, text]`` entry.

    Args:
        audiopath_and_text: Sequence whose element 0 is the audio path and
            element 1 is the text.

    Returns:
        A tuple ``(text_seq, mel, text, audiopath)`` where ``text_seq`` is
        the result of ``self.get_text(text)`` and ``mel`` is the result of
        ``self.get_mel(audiopath)``.
    """
    # separate filename and text
    audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
    # The audio path is used as-is: the list-file loaders already prefix each
    # entry with the directory of its list file, and self.path may be a list
    # of several list files, so it cannot be joined onto the path here.
    text_seq = self.get_text(text)
    mel = self.get_mel(audiopath)
    return (text_seq, mel, text, audiopath_and_text[0])
@ -205,11 +214,11 @@ def load_mel_buffer_from_file(path):
def dump_mels_to_disk():
params = {
'mode': 'nv_tacotron',
'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv',
'path': ['E:\\audio\\MozillaCommonVoice\\en\\test.tsv', 'E:\\audio\\LibriTTS\\train-other-500_list.txt'],
'fetcher_mode': ['mozilla_cv', 'libritts'],
'phase': 'train',
'n_workers': 0,
'batch_size': 1,
'fetcher_mode': 'mozilla_cv',
'needs_collate': True,
'max_mel_length': 1000,
'max_text_length': 200,

View File

@ -1,3 +1,5 @@
import os.path
import numpy as np
from scipy.io.wavfile import read
import torch
@ -18,6 +20,9 @@ def load_wav_to_torch(full_path):
def load_filepaths_and_text(filename, split="|"):
    """Load delimiter-separated ``[filepath, text, ...]`` entries from a file.

    Each non-blank line is stripped of surrounding whitespace and split on
    ``split``. The first field of every entry is then joined onto the
    directory containing ``filename``, so paths in the list file are
    interpreted relative to the list file itself.

    Args:
        filename: Path to the list file.
        split: Field delimiter used within each line.

    Returns:
        A list of field lists, in file order, with field 0 rewritten to
        include the list file's directory.
    """
    base = os.path.dirname(filename)
    with open(filename, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Blank lines would otherwise become path-only entries with no text.
        entries = [row.split(split) for row in stripped if row]
    for entry in entries:
        entry[0] = os.path.join(base, entry[0])
    return entries

View File

@ -0,0 +1,31 @@
# Combines all libriTTS WAV->text mappings into a single file
import os
from tqdm import tqdm
if __name__ == '__main__':
    # Walks <libri_root>/<basis>/<reader>/<chapter>/ and writes one
    # "<relative wav path>|<normalized text>" line per transcript row into
    # <libri_root>/<basis>_list.txt.
    libri_root = 'E:\\audio\\LibriTTS'
    basis = 'train-other-500'

    readers = os.listdir(os.path.join(libri_root, basis))
    # Context manager so the list file is closed even if a transcript is
    # missing or malformed part-way through the walk.
    with open(os.path.join(libri_root, f'{basis}_list.txt'), 'w', encoding='utf-8') as ofile:
        for reader_dir in tqdm(readers):
            reader = os.path.join(libri_root, basis, reader_dir)
            if not os.path.isdir(reader):
                continue
            for chapter_dir in os.listdir(reader):
                chapter = os.path.join(reader, chapter_dir)
                if not os.path.isdir(chapter):
                    continue
                # Transcript file is named "<reader>_<chapter>.trans.tsv".
                utterance_prefix = f'{os.path.basename(reader)}_{os.path.basename(chapter)}'
                trans_file = f'{utterance_prefix}.trans.tsv'
                with open(os.path.join(chapter, trans_file), encoding='utf-8') as f:
                    trans_lines = [line.strip().split('\t') for line in f]
                for line in trans_lines:
                    # Each row must have exactly three columns; anything else
                    # raises ValueError here.
                    wav_file, raw_text, normalized_text = line
                    # Forward slashes keep the list file's relative paths
                    # independent of the platform separator.
                    wav_file = '/'.join([basis, reader_dir, chapter_dir, f'{wav_file}.wav'])
                    if not os.path.exists(os.path.join(libri_root, wav_file)):
                        # NOTE(review): the entry is still written after the
                        # warning; confirm whether missing wavs should be skipped.
                        print(f'!WARNING could not open {wav_file}')
                    ofile.write(f'{wav_file}|{normalized_text}\n')
                ofile.flush()