diff --git a/codes/scripts/audio/preparation/phase_2_sample_and_filter.py b/codes/scripts/audio/preparation/phase_2_sample_and_filter.py
index 7e8d5801..350c6fe8 100644
--- a/codes/scripts/audio/preparation/phase_2_sample_and_filter.py
+++ b/codes/scripts/audio/preparation/phase_2_sample_and_filter.py
@@ -104,9 +104,9 @@ def process_folder(folder, output_path, base_path, progress_file, max_files):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-path', type=str, help='Path to search for split files (should be the direct output of phase 1)',
-                        default='Y:\\split\\big_podcast')
+                        default='Y:\\split\\youtube')
     parser.add_argument('-progress_file', type=str, help='Place to store all folders that have already been processed', default='Y:\\filtered\\big_podcast\\already_processed.txt')
-    parser.add_argument('-output_path', type=str, help='Path where sampled&filtered files are sent', default='Y:\\filtered\\big_podcast')
+    parser.add_argument('-output_path', type=str, help='Path where sampled&filtered files are sent', default='Y:\\filtered\\youtube')
     parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=6)
     parser.add_argument('-max_samples_per_folder', type=int, help='Maximum number of clips that can be extracted from each folder.', default=1000)
     parser.add_argument('-classifier_model_opt', type=str, help='Train/test options file that configures the model used to classify the audio clips.',
diff --git a/codes/scripts/audio/use_vocoder.py b/codes/scripts/audio/use_vocoder.py
index 73a70a04..866087e0 100644
--- a/codes/scripts/audio/use_vocoder.py
+++ b/codes/scripts/audio/use_vocoder.py
@@ -10,7 +10,7 @@ class Vocoder:
         self.model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8, n_early_size=2, n_early_every=4, WN_config={'n_layers': 8, 'n_channels': 256, 'kernel_size': 3})
         sd = torch.load('../experiments/waveglow_256channels_universal_v5.pth')
         self.model.load_state_dict(sd)
-        self.model = self.model.to('cuda')
+        self.model = self.model.cpu()
         self.model.eval()
 
     def transform_mel_to_audio(self, mel):
@@ -22,8 +22,6 @@ class Vocoder:
 
 if __name__ == '__main__':
     vocoder = Vocoder()
-    m = torch.load('test_mels.pth')
-    for i, b in enumerate(m):
-        plot_spectrogram(b.cpu())
-        wav = vocoder.transform_mel_to_audio(b)
-        wavfile.write(f'{i}.wav', 22050, wav[0].cpu().numpy())
\ No newline at end of file
+    m = torch.load('C:\\Users\\jbetk\\Documents\\tmp\\some_audio\\00008.mel').cpu()
+    wav = vocoder.transform_mel_to_audio(m)
+    wavfile.write(f'0.wav', 22050, wav[0].cpu().numpy())
\ No newline at end of file
diff --git a/codes/sweep.py b/codes/sweep.py
index 67ad658d..dbe80fe5 100644
--- a/codes/sweep.py
+++ b/codes/sweep.py
@@ -42,7 +42,7 @@ if __name__ == '__main__':
         'less_heads': {'networks': {'generator': {'kwargs': {'num_heads': 2}}}},
         'eff_off': {'networks': {'generator': {'kwargs': {'efficient_convs': False}}}},
         'more_time': {'networks': {'generator': {'kwargs': {'time_embed_dim_multiplier': 8}}}},
-        'deeper_res': {'networks': {'generator': {'kwargs': {'num_res_blocks': [3, 3, 3, 3, 3, 4, 4]}}}},
+        'scale_shift_off': {'networks': {'generator': {'kwargs': {'use_scale_shift_norm': False}}}},
         'shallow_res': {'networks': {'generator': {'kwargs': {'num_res_blocks': [1, 1, 1, 1, 1, 2, 2]}}}},
     }
     opt = option.parse(base_opt, is_train=True)
diff --git a/codes/utils/util.py b/codes/utils/util.py
index 5fbb78c5..5e32832d 100644
--- a/codes/utils/util.py
+++ b/codes/utils/util.py
@@ -592,6 +592,7 @@ def load_audio(audiopath, sampling_rate, raw_data=None):
 
 
 def load_wav_to_torch(full_path):
+    import scipy.io.wavfile
     sampling_rate, data = scipy.io.wavfile.read(full_path)
     if data.dtype == np.int32:
         norm_fix = 2 ** 31
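
Note on the last hunk: the added local import makes load_wav_to_torch self-contained instead of relying on a module-level scipy import. A minimal sketch of how the full function plausibly reads after this change follows; only the int32 norm_fix (2 ** 31) is visible in the diff itself, so the int16 and float branches and the return signature are assumptions.

# Sketch, not part of the diff: assumed shape of load_wav_to_torch.
import numpy as np
import torch

def load_wav_to_torch(full_path):
    import scipy.io.wavfile  # local import, as added in the diff
    sampling_rate, data = scipy.io.wavfile.read(full_path)
    if data.dtype == np.int32:
        norm_fix = 2 ** 31   # full scale of int32 PCM (confirmed by the diff)
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15   # full scale of int16 PCM (assumed branch)
    else:
        norm_fix = 1         # float wavs assumed already in [-1, 1]
    return torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate

Normalizing by the dtype's full-scale value keeps the returned tensor in [-1, 1] regardless of the wav's bit depth. Separately, the use_vocoder.py change moves WaveGlow inference from CUDA to CPU; that removes the GPU requirement at the cost of much slower synthesis, but is otherwise functionally identical.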