From 5cbb7980a942306bebced3bdbe0641370facbc08 Mon Sep 17 00:00:00 2001
From: shumingma
Date: Thu, 24 Nov 2022 01:06:46 -0800
Subject: [PATCH] Add features section

---
 README.md                         | 33 +++++++++++++++++++++++++++++++
 torchscale/architecture/config.py | 22 +++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 734678c..9ddda06 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,39 @@ We also support the `Decoder` architecture and the `EncoderDecoder` architecture
 >>> print(encdec)
 ```
 
+## Key Features
+
+- [DeepNorm to improve the training stability of Post-LayerNorm Transformers](https://arxiv.org/abs/2203.00555)
+  * enabled by setting *deepnorm=True* in the `Config` class.
+
+- [SubLN for the model generality and the training stability](https://arxiv.org/abs/2210.06423)
+  * enabled by *subln=True*, which is the default.
+  * Note that SubLN and DeepNorm cannot be used in the same model.
+
+- [X-MoE: efficient and finetunable sparse MoE modeling](https://arxiv.org/abs/2204.09179)
+  * enabled by *use_xmoe=True*.
+
+- [Multiway architecture for multimodality](https://arxiv.org/abs/2208.10442)
+  * enabled by *multiway=True*.
+
+- [Relative position bias](https://arxiv.org/abs/1910.10683)
+  * enabled by adjusting *rel_pos_buckets* and *max_rel_pos*.
+
+- [SparseClip: improving the gradient clipping for sparse MoE models](https://arxiv.org/abs/2211.13184)
+  * we provide [sample code](examples/fairseq/utils/sparse_clip.py) that can be easily adapted to FairSeq (or other) repos.
+
+Most of the features above can be used by simply passing the corresponding parameters to the config. For example:
+
+```python
+>>> from torchscale.architecture.config import EncoderConfig
+>>> from torchscale.architecture.encoder import Encoder
+
+>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, multiway=True)
+>>> model = Encoder(config)
+
+>>> print(model)
+```
+
 ## Examples
 
 We have the examples of how to use TorchScale in the following scenarios/tasks:
diff --git a/torchscale/architecture/config.py b/torchscale/architecture/config.py
index b424582..ad8c111 100644
--- a/torchscale/architecture/config.py
+++ b/torchscale/architecture/config.py
@@ -45,8 +45,14 @@ class EncoderConfig(object):
 
         if self.deepnorm:
             self.encoder_normalize_before = False
+            self.subln = False
         if self.subln:
             self.encoder_normalize_before = True
+            self.deepnorm = False
+        if self.use_xmoe:
+            self.moe_normalize_gate_prob_before_dropping = True
+            self.moe_second_expert_policy = "random"
+            assert self.moe_freq > 0 and self.moe_expert_count > 0
 
     def override(self, args):
         for hp in self.__dict__.keys():
@@ -93,9 +99,15 @@ class DecoderConfig(object):
         self.ddp_rank = kwargs.pop("ddp_rank", 0)
 
         if self.deepnorm:
-            self.decoder_normalize_before = False
+            self.encoder_normalize_before = False
+            self.subln = False
         if self.subln:
-            self.decoder_normalize_before = True
+            self.encoder_normalize_before = True
+            self.deepnorm = False
+        if self.use_xmoe:
+            self.moe_normalize_gate_prob_before_dropping = True
+            self.moe_second_expert_policy = "random"
+            assert self.moe_freq > 0 and self.moe_expert_count > 0
 
     def override(self, args):
         for hp in self.__dict__.keys():
@@ -151,9 +163,15 @@ class EncoderDecoderConfig(object):
         if self.deepnorm:
             self.encoder_normalize_before = False
             self.decoder_normalize_before = False
+            self.subln = False
         if self.subln:
             self.encoder_normalize_before = True
             self.decoder_normalize_before = True
+            self.deepnorm = False
+        if self.use_xmoe:
+            self.moe_normalize_gate_prob_before_dropping = True
+            self.moe_second_expert_policy = "random"
+            assert self.moe_freq > 0 and self.moe_expert_count > 0
 
     def override(self, args):
         for hp in self.__dict__.keys():
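
As the config hunks above show, DeepNorm and SubLN are mutually exclusive: requesting one silently disables the other instead of raising an error, and the LayerNorm placement flag is adjusted to match. A minimal sketch of how this resolution behaves (the printed values follow directly from the `EncoderConfig` logic in this patch):

```python
>>> from torchscale.architecture.config import EncoderConfig

# deepnorm=True forces subln off (even though subln defaults to True)
# and switches the encoder to Post-LayerNorm, so the subln branch
# below it is never entered.
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True)
>>> print(config.subln, config.encoder_normalize_before)
False False
```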
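The `use_xmoe` branches also assert that `moe_freq > 0` and `moe_expert_count > 0`, so an X-MoE model must set both explicitly. A sketch combining X-MoE with relative position bias, using the flag names from this patch; the specific values are illustrative, not repo defaults:

```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder

# use_xmoe=True triggers the assert above, so moe_freq and
# moe_expert_count must both be positive.
>>> config = EncoderConfig(
...     vocab_size=64000,
...     use_xmoe=True,
...     moe_freq=2,            # an MoE FFN every 2 blocks (illustrative)
...     moe_expert_count=8,    # 8 experts per MoE layer (illustrative)
...     rel_pos_buckets=32,    # relative position bias (illustrative)
...     max_rel_pos=128,
... )
>>> model = Encoder(config)
```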