2022-01-09 05:18:25 +00:00
|
|
|
"""
|
|
|
|
A list of functions that map a unified set of arguments to a fully built transformer. Also includes some testing
utilities for measuring parameter count, FLOPS, and general performance of each type.

Every function takes the following arguments:
    layers: Total number of layers in the transformer.
    model_dim: Hidden dimensionality of the model.
    heads: Number of attention heads.
    max_mel_seq_len: Maximum mel sequence length to attend to.
    max_text_seq_len: Maximum text sequence length to attend to.
    checkpointing: Whether or not the underlying implementation should support gradient checkpointing.

Returns:
    (model, global_mel_pos_embedding, global_text_pos_embedding, local_mel_pos_embedding, local_text_pos_embedding)
    model: The transformer model.
    global_mel_pos_embedding: A global embedding function (that takes the MEL sequence as input) which should be added on to the MEL embeddings.
    global_text_pos_embedding: The global embedding function for text tokens.
    local_mel_pos_embedding: A local embedding function which, if not None, should be concatenated with the local text position embeddings and fed to the transformer.
    local_text_pos_embedding: The local embedding function for text positions, which will be None if local_mel_pos_embedding is None.
|
|
|
|
|
2022-01-09 05:18:25 +00:00
|
|
|
"""
|
|
|
|
import functools
|
2022-01-10 05:35:03 +00:00
|
|
|
from time import time
|
2022-01-09 05:18:25 +00:00
|
|
|
import torch
|
2022-01-10 15:14:41 +00:00
|
|
|
import torch.nn as nn
|
2022-01-10 05:35:03 +00:00
|
|
|
from tqdm import tqdm
|
2022-01-09 05:18:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
def null_position_embeddings(range, dim):
    """
    Positional-embedding stand-in that contributes nothing.

    Args:
        range: Tensor with at least two dimensions. Only its first two sizes and its device are read; the values
            it holds are ignored. (The name shadows the builtin but is kept because it is part of the signature.)
        dim: Size of the trailing dimension of the result.

    Returns:
        An all-zero tensor of shape (range.shape[0], range.shape[1], dim) on the same device as `range`, using
        torch's default dtype.
    """
    return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)
|
|
|
|
|
|
|
|
|
2022-01-10 15:14:41 +00:00
|
|
|
class LearnedPositionEmbeddings(nn.Module):
    """
    Learned absolute position embedding table.

    Args:
        seq_len: Number of rows in the embedding table, i.e. the number of distinct positions it can represent.
        model_dim: Width of each position embedding vector.
        init: Standard deviation of the zero-mean normal distribution used to initialize the table.
    """
    def __init__(self, seq_len, model_dim, init=.02):
        super().__init__()
        self.emb = nn.Embedding(seq_len, model_dim)
        # Initializing this way is standard for GPT-2
        self.emb.weight.data.normal_(mean=0.0, std=init)

    def forward(self, x):
        """
        Look up the embeddings for positions 0 .. x.shape[1] - 1.

        Only the size of dimension 1 and the device of `x` are used; its values are ignored.

        Returns:
            A tensor of shape (x.shape[1], model_dim). There is no batch dimension in the result.
        """
        sl = x.shape[1]
        return self.emb(torch.arange(0, sl, device=x.device))
|
|
|
|
|
|
|
|
|
|
|
|
def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
    """
    GPT-2 implemented by the HuggingFace library.

    The model's own position embedding is replaced by a function that returns zeros and its token embedding is
    removed, so position and token embeddings have to be supplied from outside the model.

    Args:
        layers: Passed to GPT2Config as n_layer.
        model_dim: Passed to GPT2Config as n_embd; also the width of the returned position embeddings.
        heads: Passed to GPT2Config as n_head.
        max_mel_seq_len: Size of the returned mel position embedding table.
        max_text_seq_len: Size of the returned text position embedding table.
        checkpointing: Passed to GPT2Config as gradient_checkpointing; use_cache is set to its negation.

    Returns:
        (gpt, mel_pos_embedding, text_pos_embedding, None, None) where the two embeddings are
        LearnedPositionEmbeddings instances and the two trailing entries are always None.
    """
    from transformers import GPT2Config, GPT2Model

    # The context window has to hold the mel and text sequences together.
    max_seq_len = max_mel_seq_len + max_text_seq_len
    gpt_config = GPT2Config(vocab_size=256,  # Unused.
                            n_positions=max_seq_len,
                            n_ctx=max_seq_len,
                            n_embd=model_dim,
                            n_layer=layers,
                            n_head=heads,
                            gradient_checkpointing=checkpointing,
                            use_cache=not checkpointing)
    gpt = GPT2Model(gpt_config)

    # Override the built in positional embeddings. The registered submodule is deleted first because nn.Module
    # refuses to assign a non-Module object over the name of an existing child module.
    del gpt.wpe
    gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim)

    # Built-in token embeddings are unused.
    del gpt.wte

    mel_pos_embedding = LearnedPositionEmbeddings(max_mel_seq_len, model_dim)
    text_pos_embedding = LearnedPositionEmbeddings(max_text_seq_len, model_dim)
    return gpt, mel_pos_embedding, text_pos_embedding, None, None
|
2022-01-09 05:18:25 +00:00
|
|
|
|
|
|
|
|
2022-01-10 15:14:41 +00:00
|
|
|
def build_lr_performer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
    """
    lucidrains Performer implementation, https://github.com/lucidrains/performer-pytorch

    Args:
        layers: Passed to Performer as depth.
        model_dim: Passed to Performer as dim (and, currently, as dim_head).
        heads: Passed to Performer as heads.
        max_mel_seq_len: Accepted for signature parity with the other builders; not used here.
        max_text_seq_len: Accepted for signature parity with the other builders; not used here.
        checkpointing: Accepted for signature parity with the other builders; not used here.

    Returns:
        The causal Performer model on its own.

    NOTE(review): the module docstring describes a 5-tuple return for every builder, but this one returns only
    the model and no position embeddings - confirm which contract callers should rely on.
    """
    from models.lucidrains.performer.performer_pytorch import Performer

    # NOTE(review): dim_head is the full model_dim rather than model_dim // heads - confirm this is intended.
    model = Performer(dim=model_dim, depth=layers, heads=heads, dim_head=model_dim, causal=True)
    return model
|
2022-01-09 05:18:25 +00:00
|
|
|
|
|
|
|
|
2022-01-10 15:14:41 +00:00
|
|
|
def build_lr_reformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
    """
    lucidrains Reformer implementation, https://github.com/lucidrains/reformer-pytorch

    Placeholder: nothing is built yet. All arguments are ignored and the function returns None.
    """
    pass
|
|
|
|
|
|
|
|
|
2022-01-10 15:14:41 +00:00
|
|
|
def build_lr_xformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
    """
    lucidrains x-transformer implementation, https://github.com/lucidrains/x-transformers

    Placeholder: nothing is built yet. All arguments are ignored and the function returns None.
    """
    pass
|
|
|
|
|
|
|
|
|
|
|
|
def test_all_performance(**kwargs):
    """
    Rough wall-clock timing of each enabled builder's model.

    For every builder in the list below, the model is built with **kwargs, called 10 times on one fixed random
    input, and the elapsed time is printed. The timed span includes creating the input tensor but not building
    the model.

    Args:
        **kwargs: Forwarded unchanged to every builder, so the keys must match the builders' parameter names.
    """
    transformer_builders = [#build_hf_gpt_transformer,
                            build_lr_performer,]
                            # build_lr_reformer,
                            # build_lr_xformer]
    for builder in transformer_builders:
        built = builder(**kwargs)
        # Builders return either the bare model or a tuple whose first element is the model; accept both.
        model = built[0] if isinstance(built, tuple) else built

        start = time()
        # NOTE(review): the input is a (16, 450) tensor of integer ids in [0, 8192) - confirm that every enabled
        # model accepts token ids rather than already-embedded inputs.
        args = torch.randint(0, 8192, (16,450))
        for _ in tqdm(range(10)):
            model(args)
        stop = time()
        print(f"Model: {str(builder)}; Elapsed: {stop-start}")
|
2022-01-09 05:18:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # The keyword names must match the builder signatures, because test_all_performance forwards them unchanged.
    # The previous num_tokens / max_seq_len keywords are not accepted by any builder and raised a TypeError.
    # NOTE(review): 1000 is carried over from the old max_seq_len for both limits - adjust if a different
    # mel/text split is wanted.
    test_all_performance(layers=12, model_dim=512, heads=8, max_mel_seq_len=1000, max_text_seq_len=1000,
                         checkpointing=False)
|