diff --git a/codes/scripts/audio/speech_to_speech_clip.py b/codes/scripts/audio/preparation/phase3_generate_similarities.py
similarity index 52%
rename from codes/scripts/audio/speech_to_speech_clip.py
rename to codes/scripts/audio/preparation/phase3_generate_similarities.py
index 40e36523..67e87a6a 100644
--- a/codes/scripts/audio/speech_to_speech_clip.py
+++ b/codes/scripts/audio/preparation/phase3_generate_similarities.py
@@ -38,68 +38,71 @@ def process_subdir(subdir, options, clip_sz):
     global clip_model
     if clip_model is None:
         print('Loading CLIP model..')
-        clip_model = load_model_from_config(preloaded_options=options, model_name='clip', also_load_savepoint=True)
+        clip_model = load_model_from_config(preloaded_options=options, model_name='clip', also_load_savepoint=True).cuda()
+        clip_model.eval()

-    root, paths = subdir
-    if len(paths) == 0:
-        return
-    root = str(root)
-    output_file = os.path.join(root, 'similarities.pth')
-    if os.path.exists(output_file):
-        print(f'{root} already processed. Skipping.')
-        return
-    print(f'Processing {root}..')
+    with torch.no_grad():
+        root, paths = subdir
+        if len(paths) == 0:
+            return
+        root = str(root)
+        output_file = os.path.join(root, 'similarities.pth')
+        if os.path.exists(output_file):
+            print(f'{root} already processed. Skipping.')
+            return
+        print(f'Processing {root}..')

-    clips = []
-    for path in paths:
-        clip = load_audio(str(path), 22050)
-        padding = clip_sz - clip.shape[1]
-        if padding > 0:
-            clip = F.pad(clip, (0, padding))
-        elif padding < 0:
-            clip = clip[:, :clip_sz]
-        clips.append(clip)
-    sims = None
-    while len(clips) > 0:
-        stacked = torch.stack(clips[:256], dim=0).cuda()
-        clips = clips[256:]
-        mels = wav_to_mel(stacked)
-        outp = clip_model.inference(mels).cpu()
-        if sims is None:
-            sims = outp
-        else:
-            if outp.shape[-1] != 256:
-                outp = F.pad(outp, (0,256-outp.shape[-1]))
-            sims = torch.cat([sims, outp], dim=0)
+        clips = []
+        for path in paths:
+            clip = load_audio(str(path), 22050)
+            padding = clip_sz - clip.shape[1]
+            if padding > 0:
+                clip = F.pad(clip, (0, padding))
+            elif padding < 0:
+                clip = clip[:, :clip_sz]
+            clips.append(clip)
+        sims = None
+        while len(clips) > 0:
+            stacked = torch.stack(clips[:256], dim=0).cuda()
+            clips = clips[256:]
+            mels = wav_to_mel(stacked).cuda()
+            outp = clip_model.inference(mels).cpu()
+            if sims is None:
+                sims = outp
+            else:
+                if outp.shape[-1] != 256:
+                    outp = F.pad(outp, (0,256-outp.shape[-1]))
+                sims = torch.cat([sims, outp], dim=0)

-    simmap = {}
-    # TODO: this can be further improved. We're just taking the topk here but, there is no gaurantee that there is 3
-    # samples from the same speaker in any given folder.
-    for path, sim in zip(paths, sims):
-        n = min(4, len(sim))
-        top3 = torch.topk(sim, n)
-        rel = os.path.relpath(str(path), root)
-        simpaths = []
-        if n == 1:
-            simpaths.append(rel)
-        else:
-            for i in range(1,n):  # The first entry is always the file itself.
-                top_ind = top3.indices[i]
-                simpaths.append(os.path.relpath(paths[top_ind], root))
-        simmap[rel] = simpaths
-    torch.save(simmap, output_file)
+        simmap = {}
+        # TODO: this can be further improved. We're just taking the topk here but, there is no gaurantee that there is 3
+        # samples from the same speaker in any given folder.
+        for path, sim in zip(paths, sims):
+            n = min(4, len(sim))
+            top3 = torch.topk(sim, n)
+            rel = os.path.relpath(str(path), root)
+            simpaths = []
+            if n == 1:
+                simpaths.append(rel)
+            else:
+                for i in range(1,n):  # The first entry is always the file itself.
+                    top_ind = top3.indices[i]
+                    simpaths.append(os.path.relpath(paths[top_ind], root))
+            simmap[rel] = simpaths
+        torch.save(simmap, output_file)


 if __name__ == '__main__':
     """
     This script iterates within a directory filled with subdirs. Each subdir contains a list of audio files from the
     same source. The script uses an speech-to-speech clip model to find the most similar audio clips within each subdir for
-    each clip within that subdir.
+    each clip within that subdir. These similar files are recorded in a "similarities.pth" file in each subdirectory, which
+    is consumed during training when the dataset searches for conditioning clips.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', type=str, help='Path to the options YAML file used to train the CLIP model', default='../options/train_voice_voice_clip.yml')
     parser.add_argument('--num_workers', type=int, help='Number concurrent processes to use', default=6)
-    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Y:\\bigasr_dataset\\tedlium')
+    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Y:\\filtered\\big_podcast')
     parser.add_argument('--clip_size', type=int, help='Amount of audio samples to pull from each file', default=22050)
     args = parser.parse_args()
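
Note on the output format: the similarities.pth this script writes is a plain dict, mapping each clip's path (relative to its subdir) to a short list of its most similar neighbors in that same subdir, saved with torch.save. A minimal sketch of how a training dataset might consume it follows; the helper name find_conditioning_clip and the fallback behavior are illustrative assumptions, not the repository's actual dataset code.

import os
import random

import torch


def find_conditioning_clip(audio_path):
    """Hypothetical helper: pick a conditioning clip for audio_path using the
    precomputed similarities.pth in the same subdirectory."""
    subdir = os.path.dirname(audio_path)
    sim_file = os.path.join(subdir, 'similarities.pth')
    if not os.path.exists(sim_file):
        # Assumed fallback: if phase 3 has not been run on this subdir, condition on the clip itself.
        return audio_path
    simmap = torch.load(sim_file)  # dict: relative path -> list of similar relative paths
    rel = os.path.relpath(audio_path, subdir)
    candidates = simmap.get(rel, [rel])
    return os.path.join(subdir, random.choice(candidates))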