commit 559b5fdf56
@@ -391,7 +391,7 @@ class ClassificationHead(nn.Module):
         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
         x = self.dropout(x)
         x = self.dense(x)
-        x = self.activation_fn(x)
+        x = self.activation_fn(x.float()).type_as(x)
         x = self.dropout(x)
         x = self.out_proj(x)
         return x
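The pattern added at each call site runs the activation in float32 and casts the result back to the input dtype, the usual guard against fp16/bf16 precision loss in GELU. A minimal sketch of the idea, assuming PyTorch (the shapes below are illustrative, not taken from the diff):

import torch
import torch.nn.functional as F

# Illustrative half-precision features, e.g. the <s> token row in ClassificationHead.
x = torch.randn(4, 768, dtype=torch.float16)

# Run the activation in float32, then cast the result back to the input dtype.
y = F.gelu(x.float()).type_as(x)

assert y.dtype == torch.float16  # dtype is preserved across the round trip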
@@ -418,7 +418,7 @@ class LMHead(nn.Module):
             features = features[masked_tokens, :]

         x = self.dense(features)
-        x = self.activation_fn(x)
+        x = self.activation_fn(x.float()).type_as(x)
         x = self.layer_norm(x)
         # project back to size of vocabulary with bias
         x = F.linear(x, self.weight) + self.bias
@@ -400,7 +400,7 @@ class Decoder(nn.Module):
         )
         x = x.transpose(0, 1)

-        # relative postion
+        # relative position
         self_attn_rel_pos_bias = None
         slen = prev_output_tokens.size(1)
         if self.self_attn_relative_position is not None:
@@ -85,7 +85,7 @@ def get_activation_fn(activation):
     if activation == "relu":
         return F.relu
     elif activation == "gelu":
-        return lambda x: F.gelu(x.float()).type_as(x)
+        return F.gelu
     else:
         raise NotImplementedError

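After this change, get_activation_fn("gelu") returns plain F.gelu, so the float32 round trip becomes the caller's responsibility, as in the call sites patched above and below. A short usage sketch under that assumption:

import torch
import torch.nn.functional as F

def get_activation_fn(activation):
    # Mirrors the post-commit version shown above.
    if activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu
    else:
        raise NotImplementedError

act = get_activation_fn("gelu")        # plain F.gelu, no dtype handling inside
h = torch.randn(2, 16, dtype=torch.float16)
out = act(h.float()).type_as(h)        # caller applies the float32 round trip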
@@ -121,7 +121,7 @@ class FeedForwardNetwork(nn.Module):
         x_shape = x.shape
         x = x.reshape(-1, x.size(-1))
         x = self.fc1(x)
-        x = self.activation_fn(x)
+        x = self.activation_fn(x.float()).type_as(x)
         x = self.activation_dropout_module(x)
         if self.ffn_layernorm is not None:
             x = self.ffn_layernorm(x)