Check in speech2speech CLIP inference tool
parent c1bef01dfa · commit b24a51f0aa
@@ -70,6 +70,14 @@ class VoiceCLIP(nn.Module):
         loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
         return loss
 
+    def inference(self, speech_mels):
+        emb = self.encoder(speech_mels)
+        latent = self.to_latent(emb)
+        latent = F.normalize(latent, p=2, dim=-1)
+        temp = self.temperature.exp()
+        sim = einsum('i d, j d -> i j', latent, latent) * temp
+        return sim
+
 
 @register_model
 def register_voice_to_voice_clip(opt_net, opt):
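The added inference method L2-normalizes the batch latents and returns the matrix of pairwise dot products, scaled by the learned temperature. A self-contained sketch of what that computes, using random stand-in latents (the sizes are made up and the temperature factor is omitted):

import torch
import torch.nn.functional as F

# Random latents stand in for self.to_latent(self.encoder(speech_mels));
# 8 clips with 512-dim embeddings are assumed sizes, not values from this diff.
latent = F.normalize(torch.randn(8, 512), p=2, dim=-1)
sim = latent @ latent.t()            # same result as einsum('i d, j d -> i j', latent, latent)
k = min(4, sim.shape[-1])
top = torch.topk(sim, k, dim=-1)
# The diagonal (self-similarity) is the largest entry in each row, so index 0 of
# every top-k result is the clip itself; downstream code drops it.
nearest = top.indices[:, 1:]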
codes/scripts/audio/speech_to_speech_clip.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import argparse
import functools
import os
from multiprocessing.pool import ThreadPool

import torch
import torch.nn as nn
import torch.nn.functional as F
import yaml
from tqdm import tqdm

from data.audio.unsupervised_audio_dataset import load_audio
from data.util import is_wav_file, find_files_of_type, is_audio_file
from models.audio_resnet import resnet34, resnet50
from models.tacotron2.taco_utils import load_wav_to_torch
from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
from utils.options import Loader
from utils.util import load_model_from_config

clip_model = None


def recursively_find_audio_directories(root):
    subdirs = []
    audio_files = []
    for f in os.scandir(root):
        if f.is_dir():
            subdirs.append(f)
        elif is_audio_file(f.path):
            audio_files.append(f.path)
    assert len(subdirs) == 0 or len(audio_files) == 0
    if len(subdirs) > 0:
        res = []
        for subdir in subdirs:
            res.extend(recursively_find_audio_directories(subdir.path))
        return res
    return [(root, audio_files)]


def process_subdir(subdir, options, clip_sz):
    global clip_model
    if clip_model is None:
        print('Loading CLIP model..')
        clip_model = load_model_from_config(preloaded_options=options, model_name='clip', also_load_savepoint=True)

    root, paths = subdir
    root = str(root)

    clips = []
    for path in paths:
        clip = load_audio(str(path), 22050)
        padding = clip_sz - clip.shape[1]
        if padding > 0:
            clip = F.pad(clip, (0, padding))
        elif padding < 0:
            clip = clip[:, :clip_sz]
        clips.append(clip)
    sims = None
    while len(clips) > 0:
        stacked = torch.stack(clips[:256], dim=0).cuda()
        clips = clips[256:]
        mels = wav_to_mel(stacked)
        outp = clip_model.inference(mels)
        if sims is None:
            sims = outp
        else:
            if outp.shape[-1] != 256:
                outp = F.pad(outp, (0, 256 - outp.shape[-1]))
            sims = torch.cat([sims, outp], dim=0)

    simmap = {}
    for path, sim in zip(paths, sims):
        n = min(4, len(sim))
        top3 = torch.topk(sim, n)
        rel = os.path.relpath(str(path), root)
        simpaths = []
        if n == 1:
            simpaths.append(rel)
        else:
            for i in range(1, n):  # The first entry is always the file itself.
                top_ind = top3.indices[i]
                simpaths.append(os.path.relpath(paths[top_ind], root))
        simmap[rel] = simpaths
    torch.save(simmap, os.path.join(root, 'similarities.pth'))


if __name__ == '__main__':
    """
    This script iterates within a directory filled with subdirs. Each subdir contains a list of audio files from the same
    source. The script uses a speech-to-speech CLIP model to find the <n> most similar audio clips within each subdir for
    each clip within that subdir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', type=str, help='Path to the options YAML file used to train the CLIP model', default='../options/train_voice_voice_clip.yml')
    parser.add_argument('--num_workers', type=int, help='Number of concurrent processes to use', default=1)
    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Z:\\clips\\podcasts-0\\7_Joe Rogan Experience #1004 - W. Kamau Bell')
    parser.add_argument('--clip_size', type=int, help='Amount of audio samples to pull from each file', default=22050)
    args = parser.parse_args()

    with open(args.o, mode='r') as f:
        opt = yaml.load(f, Loader=Loader)

    all_files = recursively_find_audio_directories(args.root_path)
    fn = functools.partial(process_subdir, options=opt, clip_sz=args.clip_size)
    if args.num_workers > 1:
        with ThreadPool(args.num_workers) as pool:
            tqdm(list(pool.imap(fn, all_files)), total=len(all_files))
    else:
        for subdir in tqdm(all_files):
            fn(subdir)
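The similarities.pth file written per subdir is a plain dict mapping each clip's path (relative to its subdir) to the relative paths of its most similar neighbours: up to three, or the clip itself when the subdir holds a single file. A hedged sketch of how the output might be inspected after a run; the directory below is a placeholder, not a path from this commit:

import os
import torch

# Placeholder directory; in practice this is one of the subdirs the script walked.
root = '/path/to/clips/some_subdir'
simmap = torch.load(os.path.join(root, 'similarities.pth'))
for rel_path, similar in simmap.items():
    # `similar` lists the relative paths of the nearest clips in the same subdir.
    print(rel_path, '->', similar)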
@@ -286,7 +286,7 @@ class Trainer:
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_asr_mass_hf.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_asr_mass_hf2.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()
@@ -485,5 +485,6 @@ def load_model_from_config(cfg_file=None, model_name=None, dev='cuda', also_load
         assert load_path is None
         load_path = opt['path'][f'pretrain_model_{model_name}']
     if load_path is not None:
+        print(f"Loading from {load_path}")
         model.load_state_dict(torch.load(load_path))
     return model
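The added print only reports which checkpoint load_model_from_config restores. For context, the new script drives this helper with preloaded options rather than a config-file path; a sketch of that call pattern, reusing the options file from the script's default argument (the YAML contents are assumed to match whatever trained the CLIP model):

import yaml
from utils.options import Loader
from utils.util import load_model_from_config

# Mirrors the call in speech_to_speech_clip.py; also_load_savepoint=True restores the
# pretrained weights, which now triggers the "Loading from ..." message added above.
with open('../options/train_voice_voice_clip.yml', mode='r') as f:
    opt = yaml.load(f, Loader=Loader)
clip_model = load_model_from_config(preloaded_options=opt, model_name='clip', also_load_savepoint=True)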