Allow processing of multiple audio sources at once from nv_tacotron_dataset

This commit is contained in:
James Betker 2021-08-14 16:04:05 -06:00
parent 007976082b
commit d6a73acaed
3 changed files with 57 additions and 12 deletions

View File

@ -18,7 +18,8 @@ from utils.util import opt_get
def load_mozilla_cv(filename):
    """Load (audio path, transcript) pairs from a Mozilla Common Voice .tsv manifest.

    The manifest's first line is a column header and is skipped. Column 1 holds
    the clip filename (stored under a sibling ``clips/`` directory) and column 2
    holds the transcript. Each returned audio path is joined against the
    manifest's own directory so callers receive paths usable from any CWD.

    :param filename: path to the Common Voice ``.tsv`` manifest.
    :return: list of ``[audio_path, transcript]`` pairs.
    """
    with open(filename, encoding='utf-8') as f:
        components = [line.strip().split('\t') for line in f][1:]  # First line is the header
    base = os.path.dirname(filename)
    return [[os.path.join(base, f'clips/{component[1]}'), component[2]]
            for component in components]
@ -29,15 +30,24 @@ class TextMelLoader(torch.utils.data.Dataset):
3) computes mel-spectrograms from audio files.
"""
def __init__(self, hparams):
self.path = os.path.dirname(hparams['path'])
self.path = hparams['path']
if not isinstance(self.path, list):
self.path = [self.path]
fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
if fetcher_mode == 'lj':
fetcher_fn = load_filepaths_and_text
elif fetcher_mode == 'mozilla_cv':
fetcher_fn = load_mozilla_cv
else:
raise NotImplementedError()
self.audiopaths_and_text = fetcher_fn(hparams['path'])
if not isinstance(fetcher_mode, list):
fetcher_mode = [fetcher_mode]
assert len(self.path) == len(fetcher_mode)
self.audiopaths_and_text = []
for p, fm in zip(self.path, fetcher_mode):
if fm == 'lj' or fm == 'libritts':
fetcher_fn = load_filepaths_and_text
elif fm == 'mozilla_cv':
fetcher_fn = load_mozilla_cv
else:
raise NotImplementedError()
self.audiopaths_and_text.extend(fetcher_fn(p))
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
@ -61,7 +71,6 @@ class TextMelLoader(torch.utils.data.Dataset):
def get_mel_text_pair(self, audiopath_and_text):
    """Convert one ``[audiopath, text]`` manifest entry into a training pair.

    The fetcher functions already join each audio path against its manifest's
    directory, so the path is used as-is here. (The old
    ``os.path.join(self.path, audiopath)`` is removed: ``self.path`` is now a
    list of manifest paths and joining against it would raise ``TypeError``.)

    :param audiopath_and_text: ``[audio_path, transcript]`` pair.
    :return: tuple of (text sequence, mel spectrogram, raw text, audio path).
    """
    # separate filename and text
    audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
    text_seq = self.get_text(text)
    mel = self.get_mel(audiopath)
    return (text_seq, mel, text, audiopath_and_text[0])
@ -205,11 +214,11 @@ def load_mel_buffer_from_file(path):
def dump_mels_to_disk():
params = {
'mode': 'nv_tacotron',
'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv',
'path': ['E:\\audio\\MozillaCommonVoice\\en\\test.tsv', 'E:\\audio\\LibriTTS\\train-other-500_list.txt'],
'fetcher_mode': ['mozilla_cv', 'libritts'],
'phase': 'train',
'n_workers': 0,
'batch_size': 1,
'fetcher_mode': 'mozilla_cv',
'needs_collate': True,
'max_mel_length': 1000,
'max_text_length': 200,

View File

@ -1,3 +1,5 @@
import os.path
import numpy as np
from scipy.io.wavfile import read
import torch
@ -18,6 +20,9 @@ def load_wav_to_torch(full_path):
def load_filepaths_and_text(filename, split="|"):
    """Load (audio path, text, ...) rows from a delimited manifest file.

    Each line is stripped and split on ``split``; the first field of every row
    (the audio path) is joined against the manifest's own directory so callers
    receive paths usable from any CWD. Any additional fields are preserved.

    :param filename: path to the manifest file.
    :param split: field delimiter (default ``"|"``, the LJSpeech convention).
    :return: list of rows, each a list whose first element is an audio path.
    """
    base = os.path.dirname(filename)
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    # Rewrite the path field in place; iterate rows directly instead of by index.
    for row in filepaths_and_text:
        row[0] = os.path.join(base, row[0])
    return filepaths_and_text

View File

@ -0,0 +1,31 @@
# Combines all libriTTS WAV->text mappings into a single file
import os

from tqdm import tqdm

if __name__ == '__main__':
    libri_root = 'E:\\audio\\LibriTTS'
    basis = 'train-other-500'
    readers = os.listdir(os.path.join(libri_root, basis))
    # Context manager guarantees the manifest is closed even if a transcript
    # file is malformed mid-run.
    with open(os.path.join(libri_root, f'{basis}_list.txt'), 'w', encoding='utf-8') as ofile:
        for reader_dir in tqdm(readers):
            reader = os.path.join(libri_root, basis, reader_dir)
            if not os.path.isdir(reader):
                continue
            for chapter_dir in os.listdir(reader):
                chapter = os.path.join(reader, chapter_dir)
                if not os.path.isdir(chapter):
                    continue
                # LibriTTS transcripts are named '<reader>_<chapter>.trans.tsv'.
                # ('chapter_id' rather than 'id', which shadows the builtin.)
                chapter_id = f'{os.path.basename(reader)}_{os.path.basename(chapter)}'
                trans_file = f'{chapter_id}.trans.tsv'
                with open(os.path.join(chapter, trans_file), encoding='utf-8') as f:
                    trans_lines = [line.strip().split('\t') for line in f]
                for line in trans_lines:
                    wav_file, raw_text, normalized_text = line
                    # Manifest paths are relative to libri_root, '/'-separated.
                    wav_file = '/'.join([basis, reader_dir, chapter_dir, f'{wav_file}.wav'])
                    # Warn (but still record) entries whose audio is missing on disk,
                    # matching the original best-effort behavior.
                    if not os.path.exists(os.path.join(libri_root, wav_file)):
                        print(f'!WARNING could not open {wav_file}')
                    ofile.write(f'{wav_file}|{normalized_text}\n')
                    ofile.flush()