From be5f052255df24adfc9b6c335f896b4bc4beb095 Mon Sep 17 00:00:00 2001
From: James Betker
Date: Tue, 22 Mar 2022 11:40:56 -0600
Subject: [PATCH] misc

---
 codes/scripts/audio/gen/w2v_patcher.py              | 15 +++++++++++++++
 .../preparation/phase3_generate_similarities.py     |  4 ++--
 codes/trainer/eval/audio_diffusion_fid.py           |  6 +++---
 3 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 codes/scripts/audio/gen/w2v_patcher.py

diff --git a/codes/scripts/audio/gen/w2v_patcher.py b/codes/scripts/audio/gen/w2v_patcher.py
new file mode 100644
index 00000000..848d0487
--- /dev/null
+++ b/codes/scripts/audio/gen/w2v_patcher.py
@@ -0,0 +1,15 @@
+import torch
+
+from utils.util import load_model_from_config
+
+if __name__ == '__main__':
+    config = "D:\\dlas\\options\\train_wav2vec_matcher.yml"
+    model_name = "generator"
+    model_path = "D:\\dlas\\experiments\\train_wav2vec_matcher\\models"
+    wav_dump_path = "FIXME"
+
+    model = load_model_from_config(config, model_name, also_load_savepoint=False, load_path=model_path, device='cuda').eval()
+    w2v_logits, audio_samples = torch.load(wav_dump_path)
+
+    w2v_logits_chunked = torch.chunk(w2v_logits, 32)
+    for chunk in w2v_logits_chunked:
diff --git a/codes/scripts/audio/preparation/phase3_generate_similarities.py b/codes/scripts/audio/preparation/phase3_generate_similarities.py
index 5b4d1772..9468219a 100644
--- a/codes/scripts/audio/preparation/phase3_generate_similarities.py
+++ b/codes/scripts/audio/preparation/phase3_generate_similarities.py
@@ -106,8 +106,8 @@ if __name__ == '__main__':
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', type=str, help='Path to the options YAML file used to train the CLIP model', default='../options/train_voice_voice_clip.yml')
-    parser.add_argument('--num_workers', type=int, help='Number concurrent processes to use', default=2)
-    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Y:\\filtered\\big_podcast')
+    parser.add_argument('--num_workers', type=int, help='Number concurrent processes to use', default=4)
+    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Y:\\filtered\\youtube')
     parser.add_argument('--clip_size', type=int, help='Amount of audio samples to pull from each file', default=22050)
     args = parser.parse_args()
 
diff --git a/codes/trainer/eval/audio_diffusion_fid.py b/codes/trainer/eval/audio_diffusion_fid.py
index c91a93da..bd8def3d 100644
--- a/codes/trainer/eval/audio_diffusion_fid.py
+++ b/codes/trainer/eval/audio_diffusion_fid.py
@@ -265,11 +265,11 @@ if __name__ == '__main__':
 if __name__ == '__main__':
     from utils.util import load_model_from_config
 
-    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9_mel.yml', 'generator',
+    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts_mel_flat.yml', 'generator',
                                        also_load_savepoint=False,
-                                       load_path='X:\\dlas\\experiments\\train_diffusion_tts9_mel\\models\\47500_generator_ema.pth').cuda()
+                                       load_path='X:\\dlas\\experiments\\train_diffusion_tts_mel_flat\\models\\6500_generator.pth').cuda()
     opt_eval = {'eval_tsv': 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv', 'diffusion_steps': 100,
-                'conditioning_free': True, 'conditioning_free_k': 1,
+                'conditioning_free': False, 'conditioning_free_k': 1,
                 'diffusion_schedule': 'linear', 'diffusion_type': 'tts9_mel'}
     env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval', 'step': 557, 'device': 'cuda', 'opt': {}}
     eval = AudioDiffusionFid(diffusion, opt_eval, env)
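
Note: the new w2v_patcher.py added by this patch ends at a bare `for` statement with no body, so the file is not runnable as committed. Below is a minimal sketch of one plausible completion. Everything up to the loop mirrors the committed script; the loop body, the torch.no_grad()/torch.cat handling, and the assumed call convention of the "generator" model are illustrative assumptions, not part of this commit.

import torch

from utils.util import load_model_from_config

if __name__ == '__main__':
    # Mirrors the committed script; wav_dump_path is left as "FIXME" in the commit.
    model = load_model_from_config("D:\\dlas\\options\\train_wav2vec_matcher.yml", "generator",
                                   also_load_savepoint=False,
                                   load_path="D:\\dlas\\experiments\\train_wav2vec_matcher\\models",
                                   device='cuda').eval()
    w2v_logits, audio_samples = torch.load("FIXME")

    # Split the logits into 32 pieces so each forward pass stays within GPU memory.
    w2v_logits_chunked = torch.chunk(w2v_logits, 32)

    patched_chunks = []
    with torch.no_grad():
        for chunk in w2v_logits_chunked:
            # Assumed call convention: the generator consumes a chunk of wav2vec logits
            # and returns the corresponding patched output for that chunk.
            patched_chunks.append(model(chunk.cuda()).cpu())
    patched = torch.cat(patched_chunks)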