forked from mrq/DL-Art-School
70 lines
2.6 KiB
Python
70 lines
2.6 KiB
Python
|
"""
|
||
|
A list of functions that map a unified set of arguments to a fully built transformer. Also includes some testing
|
||
|
utilities for measuring parameter count, FLOPS, and general performance of each type.
|
||
|
|
||
|
Every function contains the following arguments:
|
||
|
|
||
|
layers: Net number of layers in the transformer.
|
||
|
model_dim: Hidden dimensionality of the model.
|
||
|
heads: Number of attention heads.
|
||
|
num_tokens: Number of possible tokens in the transformer's dictionary. Do not use this in future releases.
|
||
|
max_seq_len: Maximum sequence length to attend to.
|
||
|
checkpointing: Whether or not the underlying implementation should support gradient checkpointing.
|
||
|
"""
|
||
|
import functools
|
||
|
import torch
|
||
|
|
||
|
|
||
|
def null_position_embeddings(range, dim):
    """
    Stand-in for a positional embedding layer that produces an all-zero tensor.

    range: Tensor whose first two dimensions give the leading shape of the output (presumably a
           (batch, sequence) tensor of position ids - confirm against the caller).
    dim: Size of the trailing (embedding) dimension of the output.

    Returns a zero tensor of shape (range.shape[0], range.shape[1], dim) placed on range's device.
    """
    # NOTE: `range` shadows the builtin, but the name is part of the callable's signature and is kept as-is.
    batch_size = range.shape[0]
    seq_len = range.shape[1]
    return torch.zeros((batch_size, seq_len, dim), device=range.device)
|
||
|
|
||
|
|
||
|
def build_hf_gpt_transformer(layers, model_dim, heads, num_tokens, max_seq_len, checkpointing):
    """
    GPT-2 as implemented by the HuggingFace `transformers` library, with its positional embeddings
    replaced by a function that always returns zeros.

    layers: Number of transformer layers (passed as n_layer).
    model_dim: Hidden dimensionality of the model (passed as n_embd).
    heads: Number of attention heads (passed as n_head).
    num_tokens: Vocabulary size (passed as vocab_size).
    max_seq_len: Maximum sequence length (passed as both n_positions and n_ctx).
    checkpointing: Whether gradient checkpointing is enabled; use_cache is set to the opposite value.

    Returns the constructed GPT2Model.
    """
    # Imported lazily so that `transformers` is only required when this builder is actually used.
    from transformers import GPT2Config, GPT2Model

    config_kwargs = dict(
        vocab_size=num_tokens,
        n_positions=max_seq_len,
        n_ctx=max_seq_len,
        n_embd=model_dim,
        n_layer=layers,
        n_head=heads,
        gradient_checkpointing=checkpointing,
        use_cache=not checkpointing,
    )
    model = GPT2Model(GPT2Config(**config_kwargs))

    # Override the built in positional embeddings. The existing `wpe` submodule is deleted first because
    # torch.nn.Module does not allow a registered child module to be overwritten with a plain callable.
    del model.wpe
    model.wpe = functools.partial(null_position_embeddings, dim=model_dim)
    return model
|
||
|
|
||
|
|
||
|
def build_lr_performer(layers, model_dim, heads, num_tokens, max_seq_len, checkpointing):
    """
    lucidrains Performer implementation, https://github.com/lucidrains/performer-pytorch

    Placeholder: nothing is built yet. All arguments are ignored and None is returned.
    """
    return None
|
||
|
|
||
|
|
||
|
def build_lr_reformer(layers, model_dim, heads, num_tokens, max_seq_len, checkpointing):
    """
    lucidrains Reformer implementation, https://github.com/lucidrains/reformer-pytorch

    Placeholder: nothing is built yet. All arguments are ignored and None is returned.
    """
    return None
|
||
|
|
||
|
|
||
|
def build_lr_xformer(layers, model_dim, heads, num_tokens, max_seq_len, checkpointing):
    """
    lucidrains x-transformer implementation, https://github.com/lucidrains/x-transformers

    Placeholder: nothing is built yet. All arguments are ignored and None is returned.
    """
    return None
|
||
|
|
||
|
|
||
|
def test_all_performance(**kwargs):
    """
    Invoke every transformer builder in this module with the same keyword arguments.

    kwargs: Forwarded unchanged, by name, to each builder; the keys must therefore match the builders'
            parameter names (layers, model_dim, heads, num_tokens, max_seq_len, checkpointing).

    The built models are not measured or returned yet; this function returns None.
    """
    all_builders = (build_hf_gpt_transformer, build_lr_performer, build_lr_reformer, build_lr_xformer)
    for build_fn in all_builders:
        # The result is currently unused; it is only bound so each builder is exercised in turn.
        built_model = build_fn(**kwargs)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # test_all_performance is declared as (**kwargs) and forwards its arguments by name to each builder,
    # so the settings must be given as keywords; passing them positionally raises a TypeError.
    test_all_performance(layers=12, model_dim=512, heads=8, num_tokens=8192, max_seq_len=1000,
                         checkpointing=False)
|