Add features section

shumingma 2022-11-24 01:06:46 -08:00
parent afd9094fb5
commit 5cbb7980a9
2 changed files with 53 additions and 2 deletions

README.md

@@ -64,6 +64,39 @@ We also support the `Decoder` architecture and the `EncoderDecoder` architecture
>>> print(encdec)
```
## Key Features
- [DeepNorm to improve the training stability of Post-LayerNorm Transformers](https://arxiv.org/abs/2203.00555)
  * enabled by setting *deepnorm=True* in the `Config` class.
- [SubLN for the model generality and the training stability](https://arxiv.org/abs/2210.06423)
  * enabled by *subln=True*. This is enabled by default.
  * Note that SubLN and DeepNorm cannot be used in the same model.
- [X-MoE: efficient and finetunable sparse MoE modeling](https://arxiv.org/abs/2204.09179)
  * enabled by *use_xmoe=True*.
- [Multiway architecture for multimodality](https://arxiv.org/abs/2208.10442)
  * enabled by *multiway=True*.
- [Relative position bias](https://arxiv.org/abs/1910.10683)
  * enabled by adjusting *rel_pos_buckets* and *max_rel_pos*.
- [SparseClip: improving the gradient clipping for sparse MoE models](https://arxiv.org/abs/2211.13184)
  * we provide [sample code](examples/fairseq/utils/sparse_clip.py) that can be easily adapted to FairSeq (or other) repositories.
Most of the features above can be used by simply passing the corresponding parameters to the config. For example:
```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, multiway=True)
>>> model = Encoder(config)
>>> print(model)
```
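The same pattern applies to the other features. As a minimal sketch, relative position bias is switched on by giving it a positive number of buckets and a maximum distance; the values below are illustrative placeholders, not recommended settings:

```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder

>>> # rel_pos_buckets and max_rel_pos are illustrative values, not tuned recommendations.
>>> config = EncoderConfig(vocab_size=64000, rel_pos_buckets=32, max_rel_pos=128)
>>> model = Encoder(config)
>>> print(model)
```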
## Examples
We provide examples of how to use TorchScale in the following scenarios/tasks:

torchscale/architecture/config.py

@@ -45,8 +45,14 @@ class EncoderConfig(object):
        if self.deepnorm:
            self.encoder_normalize_before = False
            self.subln = False
        if self.subln:
            self.encoder_normalize_before = True
            self.deepnorm = False
        if self.use_xmoe:
            self.moe_normalize_gate_prob_before_dropping = True
            self.moe_second_expert_policy = "random"
            assert self.moe_freq > 0 and self.moe_expert_count > 0

    def override(self, args):
        for hp in self.__dict__.keys():
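Because the `deepnorm` branch runs first, it wins when both flags are set: it switches off `subln` and forces post-norm. A minimal sketch of the resulting config state, reusing the constructor arguments from the README example above:

```python
>>> from torchscale.architecture.config import EncoderConfig

>>> # deepnorm is checked before subln, so it disables subln and post-norm wins.
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, subln=True)
>>> config.deepnorm, config.subln, config.encoder_normalize_before
(True, False, False)
```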
@@ -93,9 +99,15 @@ class DecoderConfig(object):
        self.ddp_rank = kwargs.pop("ddp_rank", 0)

        if self.deepnorm:
            self.decoder_normalize_before = False
            self.encoder_normalize_before = False
            self.subln = False
        if self.subln:
            self.decoder_normalize_before = True
            self.encoder_normalize_before = True
            self.deepnorm = False
        if self.use_xmoe:
            self.moe_normalize_gate_prob_before_dropping = True
            self.moe_second_expert_policy = "random"
            assert self.moe_freq > 0 and self.moe_expert_count > 0

    def override(self, args):
        for hp in self.__dict__.keys():
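The `use_xmoe` branch shows what an X-MoE config implies: gate probabilities are normalized before dropping, the second-expert policy is set to `"random"`, and both `moe_freq` and `moe_expert_count` must be positive or the assert fires. A sketch with illustrative values:

```python
>>> from torchscale.architecture.config import DecoderConfig

>>> # moe_freq and moe_expert_count are placeholders; omitting them would
>>> # trip the assert in __init__.
>>> config = DecoderConfig(vocab_size=64000, use_xmoe=True, moe_freq=2, moe_expert_count=8)
>>> config.moe_second_expert_policy, config.moe_normalize_gate_prob_before_dropping
('random', True)
```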
@@ -151,9 +163,15 @@ class EncoderDecoderConfig(object):
        if self.deepnorm:
            self.encoder_normalize_before = False
            self.decoder_normalize_before = False
            self.subln = False
        if self.subln:
            self.encoder_normalize_before = True
            self.decoder_normalize_before = True
            self.deepnorm = False
        if self.use_xmoe:
            self.moe_normalize_gate_prob_before_dropping = True
            self.moe_second_expert_policy = "random"
            assert self.moe_freq > 0 and self.moe_expert_count > 0

    def override(self, args):
        for hp in self.__dict__.keys():