forked from mrq/DL-Art-School

Add voice2voice clip model

parent a9ee5b624f
commit 07c2b9907c
@@ -69,6 +69,7 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
         if self.pad_to is not None:
             self.pad_to *= self.sampling_rate
         self.pad_to = opt_get(opt, ['pad_to_samples'], self.pad_to)
+        self.min_length = opt_get(opt, ['min_length'], 0)
 
         # "Resampled clip" is audio data pulled from the basis of "clip" but with randomly different bounds. There are no
         # guarantees that "clip_resampled" is different from "clip": in fact, if "clip" is less than pad_to_seconds/samples,
@@ -79,9 +80,12 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
         self.extra_sample_len = opt_get(opt, ['extra_sample_length'], 2)
         self.extra_sample_len *= self.sampling_rate
 
+        self.debug_loading_failures = opt_get(opt, ['debug_loading_failures'], True)
+
     def get_audio_for_index(self, index):
         audiopath = self.audiopaths[index]
         audio = load_audio(audiopath, self.sampling_rate)
+        assert audio.shape[1] > self.min_length
         return audio, audiopath
 
     def get_related_audio_for_index(self, index):
@@ -121,7 +125,8 @@ class UnsupervisedAudioDataset(torch.utils.data.Dataset):
             audio_norm, filename = self.get_audio_for_index(index)
             alt_files, actual_samples = self.get_related_audio_for_index(index)
         except:
-            print(f"Error loading audio for file {self.audiopaths[index]} {sys.exc_info()}")
+            if self.debug_loading_failures:
+                print(f"Error loading audio for file {self.audiopaths[index]} {sys.exc_info()}")
             return self[index+1]
 
         # When generating resampled clips, skew is a bias that tries to spread them out from each other, reducing their
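For reference, both new dataset behaviors are driven by plain option keys read through opt_get. A minimal sketch of the two keys this commit introduces (other dataset options are unchanged and omitted; the values shown are hypothetical):

dataset_opt = {
    # ... existing UnsupervisedAudioDataset options ...
    'min_length': 2048,                # in samples; get_audio_for_index now asserts audio.shape[1] > this (defaults to 0)
    'debug_loading_failures': False,   # set False to silence the per-file error print on load failures (defaults to True)
}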
codes/models/gpt_voice/voice_voice_clip.py (new file, 83 lines)
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch import einsum
+
+from models.gpt_voice.mini_encoder import AudioMiniEncoder
+from models.lucidrains.dalle.transformer import Transformer
+from trainer.networks import register_model
+from utils.util import opt_get
+
+
+def exists(val):
+    return val is not None
+
+
+def masked_mean(t, mask, dim=1):
+    t = t.masked_fill(~mask[:, :, None], 0.)
+    return t.sum(dim = 1) / mask.sum(dim = 1)[..., None]
+
+
+class VoiceCLIP(nn.Module):
+    """
+    CLIP model modified to produce similarity scores from different views of the same audio clip.
+    """
+
+    def __init__(
+            self,
+            encoder_output=512,
+            dim_latent=512,
+            speech_max_seq_len=250,
+            mel_compression_ratio=256,
+            pretrained_encoder_dict_path=None
+    ):
+        super().__init__()
+        self.encoder = AudioMiniEncoder(80, encoder_output)
+        if pretrained_encoder_dict_path is not None:
+            self.encoder.load_state_dict(torch.load(pretrained_encoder_dict_path))
+        self.to_latent = nn.Linear(encoder_output, dim_latent, bias=False)
+        self.temperature = nn.Parameter(torch.tensor(1.))
+        self.mel_compression_ratio = mel_compression_ratio
+
+    def forward(
+        self,
+        speech_mels,
+        speech_lengths,
+        return_loss=True
+    ):
+        half_length = min(speech_mels.shape[-1], torch.min(speech_lengths).item() // self.mel_compression_ratio) // 2
+        half_length = (half_length // 4) * 4  # Must be a multiple of 4.
+
+        first_half = speech_mels[:, :, :half_length]
+        second_half = speech_mels[:, :, half_length:half_length*2]
+
+        first_emb = self.encoder(first_half)
+        first_latents = self.to_latent(first_emb)
+        second_emb = self.encoder(second_half)
+        second_latents = self.to_latent(second_emb)
+
+        first_latents, second_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (first_latents, second_latents))
+
+        temp = self.temperature.exp()
+
+        if not return_loss:
+            sim = einsum('n d, n d -> n', first_latents, second_latents) * temp
+            return sim
+
+        sim = einsum('i d, j d -> i j', first_latents, second_latents) * temp
+        labels = torch.arange(first_latents.shape[0], device=first_latents.device)
+        loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
+        return loss
+
+
+@register_model
+def register_voice_to_voice_clip(opt_net, opt):
+    return VoiceCLIP(**opt_get(opt_net, ['kwargs'], {}))
+
+
+if __name__ == '__main__':
+    clip = VoiceCLIP()
+    clip(torch.randn((2,80,200)),
+         torch.randint(0,200*1024,(2,)),
+         return_loss=True)
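Worth noting: the objective above is the standard symmetric CLIP/InfoNCE loss, with the two halves of each clip forming a positive pair and the other clips in the batch serving as negatives. A self-contained sketch of the same computation on random latents (batch size, dimension, and temperature are arbitrary examples, independent of the classes in this repo):

import torch
import torch.nn.functional as F
from torch import einsum

b, d = 4, 512
first = F.normalize(torch.randn(b, d), p=2, dim=-1)    # latents from the first half of each clip
second = F.normalize(torch.randn(b, d), p=2, dim=-1)   # latents from the second half
temp = torch.tensor(1.).exp()                           # learned temperature in VoiceCLIP; fixed here

sim = einsum('i d, j d -> i j', first, second) * temp   # (b, b) similarity matrix
labels = torch.arange(b)                                 # positive pairs sit on the diagonal
loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2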
@@ -257,14 +257,14 @@ class Trainer:
                 import wandb
                 wandb.log(eval_dict)
 
     def do_training(self):
         self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(self.start_epoch, self.current_step))
         for epoch in range(self.start_epoch, self.total_epochs + 1):
             self.epoch = epoch
             if opt['dist']:
                 self.train_sampler.set_epoch(epoch)
-            tq_ldr = tqdm(self.train_loader) if self.rank == 0 else self.train_loader
+            tq_ldr = tqdm(self.train_loader) if self.rank <= 0 else self.train_loader
 
             _t = time()
             for train_data in tq_ldr:
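The rank check was presumably loosened because, in trainers of this lineage, self.rank is commonly -1 for non-distributed runs and only takes values 0..world_size-1 under DDP, so the old '== 0' test hid the progress bar during single-process training. A minimal sketch of that assumed convention (not shown in this diff):

import torch.distributed as dist

# Assumed convention: rank is -1 in single-process runs, the process index under DDP.
rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else -1
show_progress_bar = rank <= 0   # True for single-process runs and for the DDP primary process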
@@ -286,7 +286,7 @@ class Trainer:
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_asr_mass_hf2.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_voice_voice_clip.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()
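With this new default, training the voice-to-voice CLIP model would presumably be launched as: python train.py -opt ../options/train_voice_voice_clip.yml — assuming the script is run from the directory the relative default path implies, and that the referenced options YAML exists in the repository (it is not part of this diff).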