From 5cbb7980a942306bebced3bdbe0641370facbc08 Mon Sep 17 00:00:00 2001
From: shumingma
Date: Thu, 24 Nov 2022 01:06:46 -0800
Subject: [PATCH] Add features section

---
 README.md                         | 33 +++++++++++++++++++++++++++++++
 torchscale/architecture/config.py | 22 +++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 734678c..9ddda06 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,39 @@ We also support the `Decoder` architecture and the `EncoderDecoder` architecture
 >>> print(encdec)
 ```
 
+## Key Features
+
+- [DeepNorm to improve the training stability of Post-LayerNorm Transformers](https://arxiv.org/abs/2203.00555)
+  * enabled by setting *deepnorm=True* in the `Config` class.
+
+- [SubLN for the model generality and the training stability](https://arxiv.org/abs/2210.06423)
+  * enabled by *subln=True*, which is the default.
+  * Note that SubLN and DeepNorm cannot be used in the same model.
+
+- [X-MoE: efficient and finetunable sparse MoE modeling](https://arxiv.org/abs/2204.09179)
+  * enabled by *use_xmoe=True*.
+
+- [Multiway architecture for multimodality](https://arxiv.org/abs/2208.10442)
+  * enabled by *multiway=True*.
+
+- [Relative position bias](https://arxiv.org/abs/1910.10683)
+  * enabled by adjusting *rel_pos_buckets* and *max_rel_pos*.
+
+- [SparseClip: improving the gradient clipping for sparse MoE models](https://arxiv.org/abs/2211.13184)
+  * we provide [sample code](examples/fairseq/utils/sparse_clip.py) that can be easily adapted to FairSeq (or other) repos.
+
+Most of the features above can be used by simply passing the corresponding parameters to the config. For example:
+
+```python
+>>> from torchscale.architecture.config import EncoderConfig
+>>> from torchscale.architecture.encoder import Encoder
+
+>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, multiway=True)
+>>> model = Encoder(config)
+
+>>> print(model)
+```
+
 ## Examples
 
 We have the examples of how to use TorchScale in the following scenarios/tasks:
diff --git a/torchscale/architecture/config.py b/torchscale/architecture/config.py
index b424582..ad8c111 100644
--- a/torchscale/architecture/config.py
+++ b/torchscale/architecture/config.py
@@ -45,8 +45,14 @@ class EncoderConfig(object):
 
         if self.deepnorm:
             self.encoder_normalize_before = False
+            self.subln = False
         if self.subln:
             self.encoder_normalize_before = True
+            self.deepnorm = False
+        if self.use_xmoe:
+            self.moe_normalize_gate_prob_before_dropping = True
+            self.moe_second_expert_policy = "random"
+            assert self.moe_freq > 0 and self.moe_expert_count > 0
 
     def override(self, args):
         for hp in self.__dict__.keys():
@@ -93,9 +99,15 @@ class DecoderConfig(object):
         self.ddp_rank = kwargs.pop("ddp_rank", 0)
 
         if self.deepnorm:
-            self.decoder_normalize_before = False
+            self.encoder_normalize_before = False
+            self.subln = False
         if self.subln:
-            self.decoder_normalize_before = True
+            self.encoder_normalize_before = True
+            self.deepnorm = False
+        if self.use_xmoe:
+            self.moe_normalize_gate_prob_before_dropping = True
+            self.moe_second_expert_policy = "random"
+            assert self.moe_freq > 0 and self.moe_expert_count > 0
 
     def override(self, args):
         for hp in self.__dict__.keys():
@@ -151,9 +163,15 @@ class EncoderDecoderConfig(object):
         if self.deepnorm:
             self.encoder_normalize_before = False
             self.decoder_normalize_before = False
+            self.subln = False
         if self.subln:
             self.encoder_normalize_before = True
             self.decoder_normalize_before = True
+            self.deepnorm = False
+        if self.use_xmoe:
+            self.moe_normalize_gate_prob_before_dropping = True
+            self.moe_second_expert_policy = "random"
+            assert self.moe_freq > 0 and self.moe_expert_count > 0
 
     def override(self, args):
         for hp in self.__dict__.keys():
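
As the config hunks above show, DeepNorm and SubLN are mutually exclusive: requesting one silently disables the other instead of raising an error, and the LayerNorm placement flag is adjusted to match. A minimal sketch of how this resolution behaves (the printed values follow directly from the `EncoderConfig` logic in this patch):

```python
>>> from torchscale.architecture.config import EncoderConfig

# deepnorm=True forces subln off (even though subln defaults to True)
# and switches the encoder to Post-LayerNorm, so the subln branch
# below it is never entered.
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True)
>>> print(config.subln, config.encoder_normalize_before)
False False
```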
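The `use_xmoe` branches also assert that `moe_freq > 0` and `moe_expert_count > 0`, so an X-MoE model must set both explicitly. A sketch combining X-MoE with relative position bias, using the flag names from this patch; the specific values are illustrative, not repo defaults:

```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder

# use_xmoe=True triggers the assert above, so moe_freq and
# moe_expert_count must both be positive.
>>> config = EncoderConfig(
...     vocab_size=64000,
...     use_xmoe=True,
...     moe_freq=2,            # an MoE FFN every 2 blocks (illustrative)
...     moe_expert_count=8,    # 8 experts per MoE layer (illustrative)
...     rel_pos_buckets=32,    # relative position bias (illustrative)
...     max_rel_pos=128,
... )
>>> model = Encoder(config)
```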