diff --git a/codes/models/gpt_voice/voice_voice_clip.py b/codes/models/gpt_voice/voice_voice_clip.py
index 83094271..c128be0e 100644
--- a/codes/models/gpt_voice/voice_voice_clip.py
+++ b/codes/models/gpt_voice/voice_voice_clip.py
@@ -70,6 +70,15 @@ class VoiceCLIP(nn.Module):
         loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
         return loss
 
+    def inference(self, speech_mels):
+        # Returns a [b,b] matrix of temperature-scaled pairwise similarities between the input mels.
+        emb = self.encoder(speech_mels)
+        latent = self.to_latent(emb)
+        latent = F.normalize(latent, p=2, dim=-1)
+        temp = self.temperature.exp()
+        sim = einsum('i d, j d -> i j', latent, latent) * temp
+        return sim
+
 
 @register_model
 def register_voice_to_voice_clip(opt_net, opt):
diff --git a/codes/scripts/audio/speech_to_speech_clip.py b/codes/scripts/audio/speech_to_speech_clip.py
new file mode 100644
index 00000000..fee778d8
--- /dev/null
+++ b/codes/scripts/audio/speech_to_speech_clip.py
@@ -0,0 +1,117 @@
+import argparse
+import functools
+import os
+from multiprocessing.pool import ThreadPool
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import yaml
+from tqdm import tqdm
+
+from data.audio.unsupervised_audio_dataset import load_audio
+from data.util import is_wav_file, find_files_of_type, is_audio_file
+from models.audio_resnet import resnet34, resnet50
+from models.tacotron2.taco_utils import load_wav_to_torch
+from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
+from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
+from utils.options import Loader
+from utils.util import load_model_from_config
+
+clip_model = None
+
+
+def recursively_find_audio_directories(root):
+    subdirs = []
+    audio_files = []
+    for f in os.scandir(root):
+        if f.is_dir():
+            subdirs.append(f)
+        elif is_audio_file(f.path):
+            audio_files.append(f.path)
+    # A directory is expected to hold either subdirs or audio files, never a mix of both.
+    assert len(subdirs) == 0 or len(audio_files) == 0
+    if len(subdirs) > 0:
+        res = []
+        for subdir in subdirs:
+            res.extend(recursively_find_audio_directories(subdir.path))
+        return res
+    return [(root, audio_files)]
+
+
+def process_subdir(subdir, options, clip_sz):
+    global clip_model
+    if clip_model is None:
+        print('Loading CLIP model...')
+        clip_model = load_model_from_config(preloaded_options=options, model_name='clip', also_load_savepoint=True)
+
+    root, paths = subdir
+    root = str(root)
+
+    # Load every clip in the subdir, padded or truncated to exactly clip_sz samples.
+    clips = []
+    for path in paths:
+        clip = load_audio(str(path), 22050)
+        padding = clip_sz - clip.shape[1]
+        if padding > 0:
+            clip = F.pad(clip, (0, padding))
+        elif padding < 0:
+            clip = clip[:, :clip_sz]
+        clips.append(clip)
+    # Feed the clips through the CLIP model in batches of 256, accumulating rows of similarities.
+    sims = None
+    while len(clips) > 0:
+        stacked = torch.stack(clips[:256], dim=0).cuda()
+        clips = clips[256:]
+        mels = wav_to_mel(stacked)
+        outp = clip_model.inference(mels)
+        if sims is None:
+            sims = outp
+        else:
+            if outp.shape[-1] != 256:
+                outp = F.pad(outp, (0, 256 - outp.shape[-1]))
+            sims = torch.cat([sims, outp], dim=0)
+
+    simmap = {}
+    for i, (path, sim) in enumerate(zip(paths, sims)):
+        # topk indices are relative to the 256-clip batch this row of sims came from.
+        batch_offset = (i // 256) * 256
+        n = min(4, len(sim))
+        topn = torch.topk(sim, n)
+        rel = os.path.relpath(str(path), root)
+        simpaths = []
+        if n == 1:
+            simpaths.append(rel)
+        else:
+            for j in range(1, n):  # The first entry is always the file itself.
+                top_ind = batch_offset + topn.indices[j]
+                simpaths.append(os.path.relpath(paths[top_ind], root))
+        simmap[rel] = simpaths
+    torch.save(simmap, os.path.join(root, 'similarities.pth'))
+
+
+if __name__ == '__main__':
+    """
+    This script iterates over a directory filled with subdirs. Each subdir contains a list of audio files from the same
+    source. The script uses a speech-to-speech CLIP model to find, for each clip in a subdir, the other clips in that
+    subdir which are most similar to it.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', type=str, help='Path to the options YAML file used to train the CLIP model', default='../options/train_voice_voice_clip.yml')
+    parser.add_argument('--num_workers', type=int, help='Number of concurrent processes to use', default=1)
+    parser.add_argument('--root_path', type=str, help='Root path to search for audio directories from', default='Z:\\clips\\podcasts-0\\7_Joe Rogan Experience #1004 - W. Kamau Bell')
+    parser.add_argument('--clip_size', type=int, help='Number of audio samples to pull from each file', default=22050)
+    args = parser.parse_args()
+
+    with open(args.o, mode='r') as f:
+        opt = yaml.load(f, Loader=Loader)
+
+    all_files = recursively_find_audio_directories(args.root_path)
+    fn = functools.partial(process_subdir, options=opt, clip_sz=args.clip_size)
+    if args.num_workers > 1:
+        with ThreadPool(args.num_workers) as pool:
+            list(tqdm(pool.imap(fn, all_files), total=len(all_files)))
+    else:
+        for subdir in tqdm(all_files):
+            fn(subdir)
+
diff --git a/codes/train.py b/codes/train.py
index 9a9567b9..9f7349ce 100644
--- a/codes/train.py
+++ b/codes/train.py
@@ -286,7 +286,7 @@ class Trainer:
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_asr_mass_hf.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_asr_mass_hf2.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()
diff --git a/codes/utils/util.py b/codes/utils/util.py
index 24cb59a7..d43faa8f 100644
--- a/codes/utils/util.py
+++ b/codes/utils/util.py
@@ -485,5 +485,6 @@ def load_model_from_config(cfg_file=None, model_name=None, dev='cuda', also_load
         assert load_path is None
         load_path = opt['path'][f'pretrain_model_{model_name}']
     if load_path is not None:
+        print(f"Loading from {load_path}")
         model.load_state_dict(torch.load(load_path))
     return model
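
For reference, a minimal sketch of how the similarities.pth file emitted by the new script might be consumed
downstream. This is not part of the patch; the root path is just the script's default, and the structure follows
from the torch.save call above (keys and values are paths relative to each subdir):

import os

import torch

root = 'Z:\\clips\\podcasts-0\\7_Joe Rogan Experience #1004 - W. Kamau Bell'
# simmap maps each clip's relative path to the relative paths of its most similar neighbors.
simmap = torch.load(os.path.join(root, 'similarities.pth'))
for clip, neighbors in simmap.items():
    print(f'{clip} -> {neighbors}')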