111 lines
2.6 KiB
Python
Executable File
111 lines
2.6 KiB
Python
Executable File
import math
|
|
import torch
|
|
import torch.nn.functional as F
|
|
import traceback
|
|
|
|
from typing import Literal, overload
|
|
from functools import partial
|
|
from einops import rearrange
|
|
|
|
from torch import Tensor, einsum, nn
|
|
from torch.distributions import Categorical
|
|
from torch.nn.utils.rnn import pad_sequence
|
|
from torch.utils.checkpoint import checkpoint
|
|
from torchmetrics.classification import BinaryAccuracy, MulticlassAccuracy, MulticlassPrecision
|
|
from torchvision.models import resnet18, resnet34, resnet50, resnet101, resnet152
|
|
|
|
from ..data import get_symmap
|
|
|
|
class Model(nn.Module):
    """ResNet-backed classifier that predicts a fixed number of tokens per image.

    The final fully connected layer of a torchvision ResNet is replaced by a
    linear layer with ``n_tokens * n_len`` outputs. ``forward`` reshapes those
    outputs into one ``n_tokens``-way classification per sequence position.
    """

    def __init__(
        self,
        n_tokens: int = 0,  # number of token types
        n_len: int = 12,  # how long a sequence can be
        d_model: int = 512,
        d_resnet: int = 18,
    ):
        """Build the backbone, the decoding table and the metric objects.

        Args:
            n_tokens: Number of token classes. ``0`` means "use the number of
                entries returned by ``get_symmap()``".
            n_len: Maximum sequence length; two extra positions are added for
                the start/stop tokens.
            d_model: Accepted for interface compatibility; not used here.
            d_resnet: ResNet depth (18, 34, 50, 101 or 152). Any other value
                falls back to resnet18.
        """
        super().__init__()

        _symmap = get_symmap()
        # Inverted lookup: stringified value from get_symmap() -> its key.
        self.symmap = {f'{v}': k for k, v in _symmap.items()}
        # Index 0 (the value pad_sequence pads with) decodes to nothing.
        self.symmap['0'] = ""

        if n_tokens == 0:
            n_tokens = len(_symmap.keys())

        self.n_tokens = n_tokens
        self.n_len = n_len + 2  # start/stop tokens
        self.d_resnet = d_resnet

        backbones = {
            18: resnet18,
            34: resnet34,
            50: resnet50,
            101: resnet101,
            152: resnet152,
        }
        ResNet = backbones.get(d_resnet)
        if ResNet is None:
            # Keep the historical fallback, but make it visible.
            print(f"Unknown d_resnet={d_resnet}, falling back to resnet18")
            ResNet = resnet18
        else:
            print(f"Using resnet{d_resnet}")

        # `weights=None` is the supported spelling of the deprecated
        # `pretrained=False` (randomly initialised backbone).
        self.resnet = ResNet(weights=None)
        self.resnet.fc = nn.Linear(
            self.resnet.fc.in_features,
            self.n_tokens * self.n_len,
        )

        self.accuracy_metric = MulticlassAccuracy(
            n_tokens,
            average="micro",
            multidim_average="global",
        )

        self.precision_metric = MulticlassPrecision(
            n_tokens,
            average="micro",
            multidim_average="global",
        )

    def forward(
        self,
        image,
        text=None,
        sampling_temperature: float = 1.0,
    ):
        """Classify a batch of images and decode the arg-max predictions.

        Args:
            image: Sequence of image tensors; they are stacked along a new
                leading batch dimension before being fed to the ResNet.
            text: Optional sequence of 1-D target token tensors. When given,
                ``self.loss`` (key ``nll``) and, if the metrics succeed,
                ``self.stats`` (keys ``acc`` and ``precision``) are updated.
            sampling_temperature: Accepted for interface compatibility; not
                used by this method.

        Returns:
            A list of strings, one per row of the ``(n_len, batch)``
            prediction tensor (see the review note above the decode step).

        Raises:
            ValueError: If ``text`` is given but every target is empty, so
                there is no position to compute a loss on.
        """
        logits = self.resnet(torch.stack(image, dim=0))
        # (batch, n_len * n_tokens) -> (n_len, batch, n_tokens)
        logits = logits.view(logits.size(0), self.n_len, self.n_tokens).permute(1, 0, 2)

        pred = logits.argmax(dim=2)  # (n_len, batch)

        if text is not None:
            # pad_sequence already returns (longest_target, batch), padded with 0.
            labels = pad_sequence(text)

            # Only positions present in both the prediction and the targets
            # can be scored.
            steps = min(self.n_len, labels.shape[0])
            if steps == 0:
                raise ValueError("cannot compute a loss: all target sequences are empty")

            losses = [F.cross_entropy(logits[i], labels[i]) for i in range(steps)]

            self.loss = dict(
                nll=sum(losses) / len(losses),
            )

            # Metrics need identically shaped inputs, so score the same
            # positions the loss used. They stay best-effort: a failure is
            # reported, leaves self.stats untouched, and does not abort.
            scored_pred = pred[:steps]
            scored_labels = labels[:steps]
            try:
                self.stats = dict(
                    acc=self.accuracy_metric(scored_pred, scored_labels),
                    precision=self.precision_metric(scored_pred, scored_labels),
                )
            except Exception:
                traceback.print_exc()

        # NOTE(review): `pred` is (n_len, batch), so the outer loop walks
        # sequence positions and each string joins the symbols predicted for
        # every batch item at that position. Confirm with callers whether one
        # string per image (i.e. iterating `pred.t()`) was intended.
        answer = ["".join(self.symmap[f'{x.item()}'] for x in row) for row in pred]

        return answer
|