data and prep improvements

James Betker 2022-03-12 15:10:11 -07:00
parent 1e87b934db
commit 896accb71f
3 changed files with 44 additions and 37 deletions

@@ -9,6 +9,7 @@ from utils.util import opt_get

 def create_dataloader(dataset, dataset_opt, opt=None, sampler=None, collate_fn=None, shuffle=True):
     phase = dataset_opt['phase']
+    pin_memory = opt_get(dataset_opt, ['pin_memory'], True)
     if phase == 'train':
         if opt_get(opt, ['dist'], False):
             world_size = torch.distributed.get_world_size()
@@ -20,11 +21,11 @@ def create_dataloader(dataset, dataset_opt, opt=None, sampler=None, collate_fn=None, shuffle=True):
         batch_size = dataset_opt['batch_size']
         return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                                            num_workers=num_workers, sampler=sampler, drop_last=True,
-                                           pin_memory=True, collate_fn=collate_fn)
+                                           pin_memory=pin_memory, collate_fn=collate_fn)
     else:
         batch_size = dataset_opt['batch_size'] or 1
         return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0,
-                                           pin_memory=True, collate_fn=collate_fn)
+                                           pin_memory=pin_memory, collate_fn=collate_fn)


 def create_dataset(dataset_opt, return_collate=False):
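The dataloader now reads an optional pin_memory key from the dataset options instead of hard-coding True. A minimal sketch of how the lookup resolves, assuming the semantics of utils.util.opt_get (walk nested dict keys, fall back to a default on any miss); the opt_get body below is a stand-in for illustration, not the repository's implementation:

    # Stand-in for utils.util.opt_get: nested-key lookup with a default.
    def opt_get(opt, keys, default=None):
        cur = opt
        for k in keys:
            if not isinstance(cur, dict) or k not in cur:
                return default
            cur = cur[k]
        return cur

    # Configs that omit the key keep the old behavior (pinned memory on)...
    assert opt_get({'phase': 'train'}, ['pin_memory'], True) is True
    # ...while a config can now opt out, e.g. for CPU-only runs:
    assert opt_get({'phase': 'train', 'pin_memory': False}, ['pin_memory'], True) is False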

@@ -4,6 +4,7 @@ import os
 import shutil
 import sys
 from multiprocessing.pool import ThreadPool
+from random import shuffle

 import torch
 import torch.nn as nn
@@ -53,57 +54,61 @@ class AudioFolderDataset(torch.utils.data.Dataset):

 def process_folder(folder, output_path, base_path, progress_file, max_files):
     classifier = load_model_from_config(args.classifier_model_opt, model_name='classifier', also_load_savepoint=True).cuda().eval()
     dataset = AudioFolderDataset(folder, sampling_rate=22050, pad_to=600000)
+    if len(dataset) == 0:
+        return
     dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
     spec_injector = MelSpectrogramInjector({'in': 'clip', 'out': 'mel'}, {})
     with torch.no_grad():
         total_count = 0
         for batch in tqdm(dataloader):
-            max_len = max(batch['samples'])
-            clips = batch['clip'][:, :max_len].cuda()
-            paths = batch['path']
-            mels = spec_injector({'clip': clips})['mel']
-            padding = ceil_multiple(mels.shape[-1], 16)
-            mels = F.pad(mels, (0, padding))
+            try:
+                max_len = max(batch['samples'])
+                clips = batch['clip'][:, :max_len].cuda()
+                paths = batch['path']
+                mels = spec_injector({'clip': clips})['mel']

-            def get_spec_mags(clip):
-                stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
-                stft = stft[0, -2000:, :]
-                return (stft.real ** 2 + stft.imag ** 2).sqrt()
-            no_hifreq_data = get_spec_mags(clips).mean(dim=1) < .15
-            if torch.all(no_hifreq_data):
-                continue
+                def get_spec_mags(clip):
+                    stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
+                    stft = stft[0, -2000:, :]
+                    return (stft.real ** 2 + stft.imag ** 2).sqrt()
+                no_hifreq_data = get_spec_mags(clips).mean(dim=1) < .15
+                if torch.all(no_hifreq_data):
+                    continue

-            labels = torch.argmax(classifier(mels), dim=-1)
+                labels = torch.argmax(classifier(mels), dim=-1)

-            for b in range(clips.shape[0]):
-                if no_hifreq_data[b]:
-                    continue
-                if labels[b] != 0:
-                    continue
-                dirpath = paths[b].replace(os.path.basename(paths[b]), "")
-                path = os.path.relpath(dirpath, base_path)
-                opath = os.path.join(output_path, path)
-                os.makedirs(opath, exist_ok=True)
-                shutil.copy(paths[b], opath)
-                total_count += 1
-                if total_count >= max_files:
-                    break
-            if total_count >= max_files:
-                break
+                for b in range(clips.shape[0]):
+                    if no_hifreq_data[b]:
+                        continue
+                    if labels[b] != 0:
+                        continue
+                    dirpath = paths[b].replace(os.path.basename(paths[b]), "")
+                    path = os.path.relpath(dirpath, base_path)
+                    opath = os.path.join(output_path, path)
+                    os.makedirs(opath, exist_ok=True)
+                    shutil.copy(paths[b], opath)
+                    total_count += 1
+                    if total_count >= max_files:
+                        break
+                if total_count >= max_files:
+                    break
+            except:
+                print("Exception encountered. Will ignore and continue. Exception info follows.")
+                print(sys.exc_info())

     with open(progress_file, 'a', encoding='utf-8') as pf:
-        pf.write(output_path + "\n")
+        pf.write(folder + "\n")


 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-path', type=str, help='Path to search for split files (should be the direct output of phase 1)',
                         default='Y:\\split\\big_podcast')
-    parser.add_argument('-progress_file', type=str, help='Place to store all folders that have already been processed', default='Y:\\split\\big_podcast\\already_processed.txt')
+    parser.add_argument('-progress_file', type=str, help='Place to store all folders that have already been processed', default='Y:\\filtered\\big_podcast\\already_processed.txt')
     parser.add_argument('-output_path', type=str, help='Path where sampled&filtered files are sent', default='Y:\\filtered\\big_podcast')
-    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=1)
-    parser.add_argument('-max_samples_per_folder', type=int, help='Maximum number of clips that can be extracted from each folder.', default=2000)
+    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=6)
+    parser.add_argument('-max_samples_per_folder', type=int, help='Maximum number of clips that can be extracted from each folder.', default=1000)
     parser.add_argument('-classifier_model_opt', type=str, help='Train/test options file that configures the model used to classify the audio clips.',
                         default='../options/test_noisy_audio_clips_classifier.yml')
     args = parser.parse_args()
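For reference, the bandwidth gate inside process_folder can be restated as a standalone check. The n_fft, hop length, bin count, and 0.15 threshold come from the diff above; the roughly-9 kHz reading is an inference (the top 2000 of 11001 one-sided bins at a 22050 Hz sampling rate), and the helper name is hypothetical:

    import torch

    # Flags clips with real energy in the top 2000 STFT bins (roughly 9-11 kHz
    # at 22050 Hz). Narrowband audio upsampled to 22 kHz has almost nothing
    # there, averages below the 0.15 magnitude threshold, and gets skipped by
    # process_folder before classification.
    def has_hifreq_content(clip, threshold=0.15):
        stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
        mags = stft[0, -2000:, :].abs()  # same as sqrt(real**2 + imag**2)
        return bool(mags.mean() >= threshold)

    print(has_hifreq_content(torch.randn(1, 22050 * 5)))  # broadband noise: True
    print(has_hifreq_content(torch.zeros(1, 22050 * 5)))  # near-silence: False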
@@ -114,12 +119,13 @@ if __name__ == '__main__':
         fullpath = os.path.join(args.path, cast_dir)
         if os.path.isdir(fullpath):
             all_split_files.append(fullpath)
+    shuffle(all_split_files)
     all_split_files = set(all_split_files)

     # Load the already processed files, if present, and get the set difference.
     if os.path.exists(args.progress_file):
         with open(args.progress_file, 'r', encoding='utf-8') as pf:
-            processed = set(pf.readlines())
+            processed = set([l.strip() for l in pf.readlines()])
         orig_len = len(all_split_files)
         all_split_files = all_split_files - processed
         print(f'All folders: {orig_len}, processed files: {len(processed)}; {len(all_split_files)/orig_len}% of files remain to be processed.')
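The two progress-file fixes work together: process_folder now records the input folder rather than the output path, and the reader strips newlines so the set difference can match at all. A small illustration of why the strip matters, using hypothetical paths:

    all_split_files = {'Y:\\split\\big_podcast\\ep001', 'Y:\\split\\big_podcast\\ep002'}
    raw = ['Y:\\split\\big_podcast\\ep001\n']      # readlines() keeps the newline

    print(len(all_split_files - set(raw)))                  # 2: nothing matched
    print(len(all_split_files - {l.strip() for l in raw}))  # 1: ep001 is skipped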

@@ -18,5 +18,5 @@ if __name__ == '__main__':
     '''

     # Build tokenizer vocab
-    mapping = tacotron_symbol_mapping()
-    print(json.dumps(mapping))
+    #mapping = tacotron_symbol_mapping()
+    #print(json.dumps(mapping))