diff --git a/codes/scripts/audio/speech_to_speech_clip.py b/codes/scripts/audio/preparation/phase3_generate_similarities.py
similarity index 52%
rename from codes/scripts/audio/speech_to_speech_clip.py
rename to codes/scripts/audio/preparation/phase3_generate_similarities.py
index 40e36523..67e87a6a 100644
--- a/codes/scripts/audio/speech_to_speech_clip.py
+++ b/codes/scripts/audio/preparation/phase3_generate_similarities.py
@@ -38,68 +38,71 @@ def process_subdir(subdir, options, clip_sz):
     global clip_model
     if clip_model is None:
         print('Loading CLIP model..')
-        clip_model = load_model_from_config(preloaded_options=options, model_name='clip', also_load_savepoint=True)
+        clip_model = load_model_from_config(preloaded_options=options, model_name='clip', also_load_savepoint=True).cuda()
+        clip_model.eval()

-    root, paths = subdir
-    if len(paths) == 0:
-        return
-    root = str(root)
-    output_file = os.path.join(root, 'similarities.pth')
-    if os.path.exists(output_file):
-        print(f'{root} already processed. Skipping.')
-        return
-    print(f'Processing {root}..')
+    with torch.no_grad():
+        root, paths = subdir
+        if len(paths) == 0:
+            return
+        root = str(root)
+        output_file = os.path.join(root, 'similarities.pth')
+        if os.path.exists(output_file):
+            print(f'{root} already processed. Skipping.')
+            return
+        print(f'Processing {root}..')

-    clips = []
-    for path in paths:
-        clip = load_audio(str(path), 22050)
-        padding = clip_sz - clip.shape[1]
-        if padding > 0:
-            clip = F.pad(clip, (0, padding))
-        elif padding < 0:
-            clip = clip[:, :clip_sz]
-        clips.append(clip)
-    sims = None
-    while len(clips) > 0:
-        stacked = torch.stack(clips[:256], dim=0).cuda()
-        clips = clips[256:]
-        mels = wav_to_mel(stacked)
-        outp = clip_model.inference(mels).cpu()
-        if sims is None:
-            sims = outp
-        else:
-            if outp.shape[-1] != 256:
-                outp = F.pad(outp, (0,256-outp.shape[-1]))
-            sims = torch.cat([sims, outp], dim=0)
+        clips = []
+        for path in paths:
+            clip = load_audio(str(path), 22050)
+            padding = clip_sz - clip.shape[1]
+            if padding > 0:
+                clip = F.pad(clip, (0, padding))
+            elif padding < 0:
+                clip = clip[:, :clip_sz]
+            clips.append(clip)
+        sims = None
+        while len(clips) > 0:
+            stacked = torch.stack(clips[:256], dim=0).cuda()
+            clips = clips[256:]
+            mels = wav_to_mel(stacked).cuda()
+            outp = clip_model.inference(mels).cpu()
+            if sims is None:
+                sims = outp
+            else:
+                if outp.shape[-1] != 256:
+                    outp = F.pad(outp, (0,256-outp.shape[-1]))
+                sims = torch.cat([sims, outp], dim=0)

-    simmap = {}
-    # TODO: this can be further improved. We're just taking the topk here but, there is no gaurantee that there is 3
-    # samples from the same speaker in any given folder.
-    for path, sim in zip(paths, sims):
-        n = min(4, len(sim))
-        top3 = torch.topk(sim, n)
-        rel = os.path.relpath(str(path), root)
-        simpaths = []
-        if n == 1:
-            simpaths.append(rel)
-        else:
-            for i in range(1,n):  # The first entry is always the file itself.
-                top_ind = top3.indices[i]
-                simpaths.append(os.path.relpath(paths[top_ind], root))
-        simmap[rel] = simpaths
-    torch.save(simmap, output_file)
+        simmap = {}
+        # TODO: this can be further improved. We're just taking the topk here but, there is no gaurantee that there is 3
+        # samples from the same speaker in any given folder.
+        for path, sim in zip(paths, sims):
+            n = min(4, len(sim))
+            top3 = torch.topk(sim, n)
+            rel = os.path.relpath(str(path), root)
+            simpaths = []
+            if n == 1:
+                simpaths.append(rel)
+            else:
+                for i in range(1,n):  # The first entry is always the file itself.
+                    top_ind = top3.indices[i]
+                    simpaths.append(os.path.relpath(paths[top_ind], root))
+            simmap[rel] = simpaths
+        torch.save(simmap, output_file)


 if __name__ == '__main__':
     """
     This script iterates within a directory filled with subdirs. Each subdir contains a list of audio files from the
     same source. The script uses an speech-to-speech clip model to find the most similar audio clips within each subdir for
-    each clip within that subdir.
+    each clip within that subdir. These similar files are recorded in a "similarities.pth" file in each subdirectory, which
+    is consumed during training when the dataset searches for conditioning clips.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', type=str, help='Path to the options YAML file used to train the CLIP model', default='../options/train_voice_voice_clip.yml')
     parser.add_argument('--num_workers', type=int, help='Number concurrent processes to use', default=6)
-    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Y:\\bigasr_dataset\\tedlium')
+    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Y:\\filtered\\big_podcast')
     parser.add_argument('--clip_size', type=int, help='Amount of audio samples to pull from each file', default=22050)
     args = parser.parse_args()
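
Note on the output format: the similarities.pth this script writes is a plain dict, mapping each clip's path (relative to its subdir) to a short list of its most similar neighbors in that same subdir, saved with torch.save. A minimal sketch of how a training dataset might consume it follows; the helper name find_conditioning_clip and the fallback behavior are illustrative assumptions, not the repository's actual dataset code.

import os
import random

import torch


def find_conditioning_clip(audio_path):
    """Hypothetical helper: pick a conditioning clip for audio_path using the
    precomputed similarities.pth in the same subdirectory."""
    subdir = os.path.dirname(audio_path)
    sim_file = os.path.join(subdir, 'similarities.pth')
    if not os.path.exists(sim_file):
        # Assumed fallback: if phase 3 has not been run on this subdir, condition on the clip itself.
        return audio_path
    simmap = torch.load(sim_file)  # dict: relative path -> list of similar relative paths
    rel = os.path.relpath(audio_path, subdir)
    candidates = simmap.get(rel, [rel])
    return os.path.join(subdir, random.choice(candidates))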