import math
import traceback
from functools import partial
from typing import Literal, overload

import torch
import torch.nn.functional as F
from einops import rearrange
from torch import Tensor, einsum, nn
from torch.distributions import Categorical
from torch.nn.utils.rnn import pad_sequence
from torch.utils.checkpoint import checkpoint
from torchmetrics.classification import BinaryAccuracy, MulticlassAccuracy, MulticlassPrecision
from torchvision.models import resnet18, resnet34, resnet50, resnet101, resnet152

from ..data import get_symmap


class Model(nn.Module):
    """ResNet classifier that predicts a fixed-length token sequence per image.

    The ResNet's final fully-connected layer is replaced by a linear layer
    producing ``n_tokens * n_len`` values, which ``forward`` reshapes into
    one ``n_tokens``-way classification per sequence position.

    Attributes set by ``forward`` when ``text`` is given:
        loss:  ``{"nll": <mean per-position cross-entropy>}``.
        stats: ``{"acc": ..., "precision": ...}`` from the torchmetrics
               objects (left untouched if the metric computation fails).
    """

    def __init__(
        self,
        n_tokens: int = 0,  # number of token types (0 = take it from the symbol map)
        n_len: int = 12,  # how long a sequence can be (excluding start/stop)
        d_model: int = 512,
        d_resnet: int = 18,
    ):
        """Build the backbone, the classification head and the metrics.

        Args:
            n_tokens: Number of token classes; when 0 the size of the mapping
                returned by ``get_symmap()`` is used instead.
            n_len: Maximum sequence length; two extra positions are added
                for the start/stop tokens.
            d_model: Stored on the instance; not otherwise used in this class.
            d_resnet: ResNet depth (18, 34, 50, 101 or 152). Any other value
                falls back to resnet18.
        """
        super().__init__()

        _symmap = get_symmap()
        # Invert the symbol map: stringified index -> symbol.
        self.symmap = {f'{v}': k for k, v in _symmap.items()}
        # Index 0 decodes to the empty string.
        self.symmap['0'] = ""

        if n_tokens == 0:
            n_tokens = len(_symmap.keys())

        self.n_tokens = n_tokens
        self.n_len = n_len + 2  # start/stop tokens
        self.d_model = d_model
        self.d_resnet = d_resnet

        backbones = {
            18: resnet18,
            34: resnet34,
            50: resnet50,
            101: resnet101,
            152: resnet152,
        }
        ResNet = backbones.get(d_resnet)
        if ResNet is None:
            # Keep the historical fallback, but no longer do it silently.
            print(f"Unknown d_resnet={d_resnet!r}, falling back to resnet18")
            ResNet = resnet18
        else:
            print(f"Using resnet{d_resnet}")

        # NOTE(review): newer torchvision releases prefer ``weights=None``
        # over ``pretrained=False``; kept as-is for compatibility with the
        # version this project pins - confirm before changing.
        self.resnet = ResNet(pretrained=False)
        # Replace the classification head: one n_tokens-way output per position.
        self.resnet.fc = nn.Linear(
            self.resnet.fc.in_features,
            self.n_tokens * self.n_len,
        )

        self.accuracy_metric = MulticlassAccuracy(
            n_tokens,
            # top_k=10,
            average="micro",
            multidim_average="global",
        )
        self.precision_metric = MulticlassPrecision(
            n_tokens,
            # top_k=10,
            average="micro",
            multidim_average="global",
        )

    def forward(
        self,
        image,
        text=None,
        sampling_temperature: float = 1.0,
    ):
        """Classify a batch of images and decode the predictions to strings.

        Args:
            image: Sequence of equally-shaped image tensors; they are stacked
                along a new leading batch dimension.
            text: Optional sequence of 1-D target tensors, one per image
                (presumably integer class indices - confirm against the
                data loader). They are padded with 0 to a common length.
                When given, ``self.loss`` and ``self.stats`` are updated.
            sampling_temperature: Accepted for interface compatibility;
                decoding is a plain argmax, so the value is not used.

        Returns:
            A list with one decoded string per image in the batch.
        """
        logits = self.resnet(torch.stack(image, dim=0))
        # (batch, n_len * n_tokens) -> (n_len, batch, n_tokens)
        logits = logits.view(logits.size(0), self.n_len, self.n_tokens).permute(1, 0, 2)

        pred = logits.argmax(dim=2)  # (n_len, batch)

        if text is not None:
            # pad_sequence already yields (t, batch), matching the layout of
            # ``logits``/``pred``; no transpose round trip is needed.
            labels = pad_sequence(text)

            # Only positions present in both the prediction and the target
            # can be scored.
            steps = min(self.n_len, labels.shape[0])

            step_losses = [
                F.cross_entropy(logits[i], labels[i]) for i in range(steps)
            ]
            self.loss = dict(
                nll=sum(step_losses) / len(step_losses),
            )

            # Score the same common prefix as the loss so that a target
            # shorter or longer than n_len does not make the metrics raise
            # on a shape mismatch.
            scored_pred = pred[:steps]
            scored_labels = labels[:steps]
            try:
                self.stats = dict(
                    acc=self.accuracy_metric(scored_pred, scored_labels),
                    precision=self.precision_metric(scored_pred, scored_labels),
                )
            except Exception:
                # Metrics are best-effort and must not abort the step, but a
                # failure should be visible rather than silently dropped.
                traceback.print_exc()

        # ``pred`` is (n_len, batch); transpose so each decoded string holds
        # the token sequence of a single image.
        answer = [
            "".join(self.symmap[f'{token.item()}'] for token in sequence)
            for sequence in pred.t()
        ]

        return answer