diff --git a/torchscale/architecture/config.py b/torchscale/architecture/config.py index 4861c41..347ee23 100644 --- a/torchscale/architecture/config.py +++ b/torchscale/architecture/config.py @@ -112,10 +112,10 @@ class DecoderConfig(object): self.ddp_rank = kwargs.pop("ddp_rank", 0) if self.deepnorm: - self.encoder_normalize_before = False + self.decoder_normalize_before = False self.subln = False if self.subln: - self.encoder_normalize_before = True + self.decoder_normalize_before = True self.deepnorm = False if self.use_xmoe: self.moe_normalize_gate_prob_before_dropping = True diff --git a/torchscale/architecture/decoder.py b/torchscale/architecture/decoder.py index 704e438..b4a313f 100644 --- a/torchscale/architecture/decoder.py +++ b/torchscale/architecture/decoder.py @@ -92,11 +92,6 @@ class DecoderLayer(nn.Module): else: self.alpha = 1.0 - if args.subln: - self.ffn_layernorm = LayerNorm(self.ffn_dim) - else: - self.ffn_layernorm = None - def build_ffn(self, embed_dim, args): return FeedForwardNetwork( embed_dim,