diff --git a/codes/data/__init__.py b/codes/data/__init__.py
index 3d43e1c0..a33e6142 100644
--- a/codes/data/__init__.py
+++ b/codes/data/__init__.py
@@ -9,6 +9,7 @@ from utils.util import opt_get
 
 def create_dataloader(dataset, dataset_opt, opt=None, sampler=None, collate_fn=None, shuffle=True):
     phase = dataset_opt['phase']
+    pin_memory = opt_get(dataset_opt, ['pin_memory'], True)
     if phase == 'train':
         if opt_get(opt, ['dist'], False):
             world_size = torch.distributed.get_world_size()
@@ -20,11 +21,11 @@ def create_dataloader(dataset, dataset_opt, opt=None, sampler=None, collate_fn=N
             batch_size = dataset_opt['batch_size']
         return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                                            num_workers=num_workers, sampler=sampler, drop_last=True,
-                                           pin_memory=True, collate_fn=collate_fn)
+                                           pin_memory=pin_memory, collate_fn=collate_fn)
     else:
         batch_size = dataset_opt['batch_size'] or 1
         return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0,
-                                           pin_memory=True, collate_fn=collate_fn)
+                                           pin_memory=pin_memory, collate_fn=collate_fn)
 
 
 def create_dataset(dataset_opt, return_collate=False):
diff --git a/codes/scripts/audio/preparation/phase_2_sample_and_filter.py b/codes/scripts/audio/preparation/phase_2_sample_and_filter.py
index 28633dc5..7e8d5801 100644
--- a/codes/scripts/audio/preparation/phase_2_sample_and_filter.py
+++ b/codes/scripts/audio/preparation/phase_2_sample_and_filter.py
@@ -4,6 +4,7 @@ import os
 import shutil
 import sys
 from multiprocessing.pool import ThreadPool
+from random import shuffle
 
 import torch
 import torch.nn as nn
@@ -53,57 +54,61 @@ class AudioFolderDataset(torch.utils.data.Dataset):
 def process_folder(folder, output_path, base_path, progress_file, max_files):
     classifier = load_model_from_config(args.classifier_model_opt, model_name='classifier', also_load_savepoint=True).cuda().eval()
     dataset = AudioFolderDataset(folder, sampling_rate=22050, pad_to=600000)
+    if len(dataset) == 0:
+        return
     dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
     spec_injector = MelSpectrogramInjector({'in': 'clip', 'out': 'mel'}, {})
     with torch.no_grad():
         total_count = 0
         for batch in tqdm(dataloader):
-            max_len = max(batch['samples'])
-            clips = batch['clip'][:, :max_len].cuda()
-            paths = batch['path']
-            mels = spec_injector({'clip': clips})['mel']
-            padding = ceil_multiple(mels.shape[-1], 16)
-            mels = F.pad(mels, (0, padding))
+            try:
+                max_len = max(batch['samples'])
+                clips = batch['clip'][:, :max_len].cuda()
+                paths = batch['path']
+                mels = spec_injector({'clip': clips})['mel']
 
-            def get_spec_mags(clip):
-                stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
-                stft = stft[0, -2000:, :]
-                return (stft.real ** 2 + stft.imag ** 2).sqrt()
-            no_hifreq_data = get_spec_mags(clips).mean(dim=1) < .15
-            if torch.all(no_hifreq_data):
-                continue
-
-            labels = torch.argmax(classifier(mels), dim=-1)
-
-            for b in range(clips.shape[0]):
-                if no_hifreq_data[b]:
+                def get_spec_mags(clip):
+                    stft = torch.stft(clip, n_fft=22000, hop_length=1024, return_complex=True)
+                    stft = stft[0, -2000:, :]
+                    return (stft.real ** 2 + stft.imag ** 2).sqrt()
+                no_hifreq_data = get_spec_mags(clips).mean(dim=1) < .15
+                if torch.all(no_hifreq_data):
                     continue
-                if labels[b] != 0:
-                    continue
-                dirpath = paths[b].replace(os.path.basename(paths[b]), "")
-                path = os.path.relpath(dirpath, base_path)
-                opath = os.path.join(output_path, path)
-                os.makedirs(opath, exist_ok=True)
-                shutil.copy(paths[b], opath)
-                total_count += 1
+
+                labels = torch.argmax(classifier(mels), dim=-1)
+
+                for b in range(clips.shape[0]):
+                    if no_hifreq_data[b]:
+                        continue
+                    if labels[b] != 0:
+                        continue
+                    dirpath = paths[b].replace(os.path.basename(paths[b]), "")
+                    path = os.path.relpath(dirpath, base_path)
+                    opath = os.path.join(output_path, path)
+                    os.makedirs(opath, exist_ok=True)
+                    shutil.copy(paths[b], opath)
+                    total_count += 1
+                    if total_count >= max_files:
+                        break
                 if total_count >= max_files:
                     break
-            if total_count >= max_files:
-                break
+            except:
+                print("Exception encountered. Will ignore and continue. Exception info follows.")
+                print(sys.exc_info())
 
     with open(progress_file, 'a', encoding='utf-8') as pf:
-        pf.write(output_path + "\n")
+        pf.write(folder + "\n")
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-path', type=str, help='Path to search for split files (should be the direct output of phase 1)', default='Y:\\split\\big_podcast')
-    parser.add_argument('-progress_file', type=str, help='Place to store all folders that have already been processed', default='Y:\\split\\big_podcast\\already_processed.txt')
+    parser.add_argument('-progress_file', type=str, help='Place to store all folders that have already been processed', default='Y:\\filtered\\big_podcast\\already_processed.txt')
     parser.add_argument('-output_path', type=str, help='Path where sampled&filtered files are sent', default='Y:\\filtered\\big_podcast')
-    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=1)
-    parser.add_argument('-max_samples_per_folder', type=int, help='Maximum number of clips that can be extracted from each folder.', default=2000)
+    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=6)
+    parser.add_argument('-max_samples_per_folder', type=int, help='Maximum number of clips that can be extracted from each folder.', default=1000)
     parser.add_argument('-classifier_model_opt', type=str, help='Train/test options file that configures the model used to classify the audio clips.', default='../options/test_noisy_audio_clips_classifier.yml')
     args = parser.parse_args()
 
@@ -114,12 +119,13 @@ if __name__ == '__main__':
             fullpath = os.path.join(args.path, cast_dir)
             if os.path.isdir(fullpath):
                 all_split_files.append(fullpath)
+    shuffle(all_split_files)
    all_split_files = set(all_split_files)
 
     # Load the already processed files, if present, and get the set difference.
     if os.path.exists(args.progress_file):
         with open(args.progress_file, 'r', encoding='utf-8') as pf:
-            processed = set(pf.readlines())
+            processed = set([l.strip() for l in pf.readlines()])
             orig_len = len(all_split_files)
             all_split_files = all_split_files - processed
             print(f'All folders: {orig_len}, processed files: {len(processed)}; {len(all_split_files)/orig_len}% of files remain to be processed.')
diff --git a/codes/scripts/hugging_face_hub_upload.py b/codes/scripts/hugging_face_hub_upload.py
index b05784c6..cadcce23 100644
--- a/codes/scripts/hugging_face_hub_upload.py
+++ b/codes/scripts/hugging_face_hub_upload.py
@@ -18,5 +18,5 @@ if __name__ == '__main__':
     '''
 
     # Build tokenizer vocab
-    mapping = tacotron_symbol_mapping()
-    print(json.dumps(mapping))
\ No newline at end of file
+    #mapping = tacotron_symbol_mapping()
+    #print(json.dumps(mapping))
\ No newline at end of file
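
The pin_memory change above hinges on opt_get reading an optional key out of the dataset options with a fallback. Below is a minimal sketch of the lookup semantics implied by the call sites (opt_get(dataset_opt, ['pin_memory'], True) and opt_get(opt, ['dist'], False)); the repo's actual utils.util.opt_get may differ in detail, and the sample dataset_opt dict is hypothetical.

# Sketch of the nested-options lookup that create_dataloader relies on.
# Assumption: mirrors utils.util.opt_get in spirit only.
def opt_get(opt, keys, default=None):
    # Walk `opt` down the chain of `keys`; return `default` if the dict
    # is None or any key along the path is missing.
    if opt is None:
        return default
    for k in keys:
        if not isinstance(opt, dict) or k not in opt:
            return default
        opt = opt[k]
    return opt

# Hypothetical dataset options: turn pinned memory off for a CPU-only run.
dataset_opt = {'phase': 'train', 'batch_size': 16, 'pin_memory': False}
print(opt_get(dataset_opt, ['pin_memory'], True))  # -> False
print(opt_get(dataset_opt, ['num_workers'], 0))    # missing key -> 0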