diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py
index 8257e080..75dec5e5 100644
--- a/codes/data/audio/nv_tacotron_dataset.py
+++ b/codes/data/audio/nv_tacotron_dataset.py
@@ -18,7 +18,8 @@ from utils.util import opt_get
 def load_mozilla_cv(filename):
     with open(filename, encoding='utf-8') as f:
         components = [line.strip().split('\t') for line in f][1:]  # First line is the header
-        filepaths_and_text = [[f'clips/{component[1]}', component[2]] for component in components]
+        base = os.path.dirname(filename)
+        filepaths_and_text = [[os.path.join(base, f'clips/{component[1]}'), component[2]] for component in components]
     return filepaths_and_text


@@ -29,15 +30,24 @@ class TextMelLoader(torch.utils.data.Dataset):
         3) computes mel-spectrograms from audio files.
     """
     def __init__(self, hparams):
-        self.path = os.path.dirname(hparams['path'])
+        self.path = hparams['path']
+        if not isinstance(self.path, list):
+            self.path = [self.path]
+
         fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
-        if fetcher_mode == 'lj':
-            fetcher_fn = load_filepaths_and_text
-        elif fetcher_mode == 'mozilla_cv':
-            fetcher_fn = load_mozilla_cv
-        else:
-            raise NotImplementedError()
-        self.audiopaths_and_text = fetcher_fn(hparams['path'])
+        if not isinstance(fetcher_mode, list):
+            fetcher_mode = [fetcher_mode]
+        assert len(self.path) == len(fetcher_mode)
+
+        self.audiopaths_and_text = []
+        for p, fm in zip(self.path, fetcher_mode):
+            if fm == 'lj' or fm == 'libritts':
+                fetcher_fn = load_filepaths_and_text
+            elif fm == 'mozilla_cv':
+                fetcher_fn = load_mozilla_cv
+            else:
+                raise NotImplementedError()
+            self.audiopaths_and_text.extend(fetcher_fn(p))
         self.text_cleaners = hparams.text_cleaners
         self.max_wav_value = hparams.max_wav_value
         self.sampling_rate = hparams.sampling_rate
@@ -61,7 +71,6 @@ class TextMelLoader(torch.utils.data.Dataset):
     def get_mel_text_pair(self, audiopath_and_text):
         # separate filename and text
         audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
-        audiopath = os.path.join(self.path, audiopath)
         text_seq = self.get_text(text)
         mel = self.get_mel(audiopath)
         return (text_seq, mel, text, audiopath_and_text[0])
@@ -205,11 +214,11 @@ def load_mel_buffer_from_file(path):
 def dump_mels_to_disk():
     params = {
         'mode': 'nv_tacotron',
-        'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv',
+        'path': ['E:\\audio\\MozillaCommonVoice\\en\\test.tsv', 'E:\\audio\\LibriTTS\\train-other-500_list.txt'],
+        'fetcher_mode': ['mozilla_cv', 'libritts'],
         'phase': 'train',
         'n_workers': 0,
         'batch_size': 1,
-        'fetcher_mode': 'mozilla_cv',
         'needs_collate': True,
         'max_mel_length': 1000,
         'max_text_length': 200,
diff --git a/codes/models/tacotron2/taco_utils.py b/codes/models/tacotron2/taco_utils.py
index 0ba729fc..a3b03a93 100644
--- a/codes/models/tacotron2/taco_utils.py
+++ b/codes/models/tacotron2/taco_utils.py
@@ -1,3 +1,5 @@
+import os.path
+
 import numpy as np
 from scipy.io.wavfile import read
 import torch
@@ -18,6 +20,9 @@ def load_wav_to_torch(full_path):
 def load_filepaths_and_text(filename, split="|"):
     with open(filename, encoding='utf-8') as f:
         filepaths_and_text = [line.strip().split(split) for line in f]
+    base = os.path.dirname(filename)
+    for j in range(len(filepaths_and_text)):
+        filepaths_and_text[j][0] = os.path.join(base, filepaths_and_text[j][0])
     return filepaths_and_text


diff --git a/codes/scripts/audio/preprocess_libritts.py b/codes/scripts/audio/preprocess_libritts.py
new file mode 100644
index 00000000..c22fcd7f
--- /dev/null
+++ b/codes/scripts/audio/preprocess_libritts.py
@@ -0,0 +1,31 @@
+# Combines all libriTTS WAV->text mappings into a single file
+import os
+
+from tqdm import tqdm
+
+if __name__ == '__main__':
+    libri_root = 'E:\\audio\\LibriTTS'
+    basis = 'train-other-500'
+
+    readers = os.listdir(os.path.join(libri_root, basis))
+    ofile = open(os.path.join(libri_root, f'{basis}_list.txt'), 'w', encoding='utf-8')
+    for reader_dir in tqdm(readers):
+        reader = os.path.join(libri_root, basis, reader_dir)
+        if not os.path.isdir(reader):
+            continue
+        for chapter_dir in os.listdir(reader):
+            chapter = os.path.join(reader, chapter_dir)
+            if not os.path.isdir(chapter):
+                continue
+            id = f'{os.path.basename(reader)}_{os.path.basename(chapter)}'
+            trans_file = f'{id}.trans.tsv'
+            with open(os.path.join(chapter, trans_file), encoding='utf-8') as f:
+                trans_lines = [line.strip().split('\t') for line in f]
+            for line in trans_lines:
+                wav_file, raw_text, normalized_text = line
+                wav_file = '/'.join([basis, reader_dir, chapter_dir, f'{wav_file}.wav'])
+                if not os.path.exists(os.path.join(libri_root, wav_file)):
+                    print(f'!WARNING could not open {wav_file}')
+                ofile.write(f'{wav_file}|{normalized_text}\n')
+                ofile.flush()
+    ofile.close()
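
For reference, the list file emitted by preprocess_libritts.py uses the same pipe-delimited wav_path|text layout that load_filepaths_and_text consumes, with WAV paths written relative to the directory holding the list file. Below is a minimal round-trip sketch of reading such a file back; the list-file path is an illustrative placeholder matching the script above, not part of the patch.

    import os

    # Illustrative placeholder: the list file produced by
    # preprocess_libritts.py above; adjust to your LibriTTS root.
    list_file = 'E:\\audio\\LibriTTS\\train-other-500_list.txt'
    base = os.path.dirname(list_file)

    with open(list_file, encoding='utf-8') as f:
        pairs = [line.strip().split('|') for line in f]

    # Resolve each relative WAV path against the list file's directory,
    # mirroring what the updated load_filepaths_and_text does internally
    # (which is why get_mel_text_pair no longer prepends self.path).
    for wav_rel, text in pairs[:5]:
        print(os.path.join(base, wav_rel), '->', text)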