Add features section

shumingma 2022-11-24 01:06:46 -08:00
parent afd9094fb5
commit 5cbb7980a9
2 changed files with 53 additions and 2 deletions

README.md

@@ -64,6 +64,39 @@ We also support the `Decoder` architecture and the `EncoderDecoder` architecture
>>> print(encdec)
```
## Key Features
- [DeepNorm to improve the training stability of Post-LayerNorm Transformers](https://arxiv.org/abs/2203.00555)
  * enabled by setting *deepnorm=True* in the `Config` class.
- [SubLN for the model generality and the training stability](https://arxiv.org/abs/2210.06423)
  * enabled by *subln=True*. This is enabled by default.
  * Note that SubLN and DeepNorm cannot be used in the same model.
- [X-MoE: efficient and finetunable sparse MoE modeling](https://arxiv.org/abs/2204.09179)
  * enabled by *use_xmoe=True*.
- [Multiway architecture for multimodality](https://arxiv.org/abs/2208.10442)
  * enabled by *multiway=True*.
- [Relative position bias](https://arxiv.org/abs/1910.10683)
  * enabled by adjusting *rel_pos_buckets* and *max_rel_pos*.
- [SparseClip: improving the gradient clipping for sparse MoE models](https://arxiv.org/abs/2211.13184)
  * we provide [sample code](examples/fairseq/utils/sparse_clip.py) that can be easily adapted to FairSeq (or other) repositories.
Most of the features above can be used by simply passing the corresponding parameters to the config. For example:
```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, multiway=True)
>>> model = Encoder(config)
>>> print(model)
```
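The same pattern applies to the other features. As a minimal sketch, relative position bias is switched on by giving it a positive number of buckets and a maximum distance; the values below are illustrative placeholders, not recommended settings:

```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder

>>> # rel_pos_buckets and max_rel_pos are illustrative values, not tuned recommendations.
>>> config = EncoderConfig(vocab_size=64000, rel_pos_buckets=32, max_rel_pos=128)
>>> model = Encoder(config)
>>> print(model)
```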
## Examples
We provide examples of how to use TorchScale in the following scenarios/tasks:

torchscale/architecture/config.py

@@ -45,8 +45,14 @@ class EncoderConfig(object):
        if self.deepnorm:
            self.encoder_normalize_before = False
            self.subln = False
        if self.subln:
            self.encoder_normalize_before = True
            self.deepnorm = False
        if self.use_xmoe:
            self.moe_normalize_gate_prob_before_dropping = True
            self.moe_second_expert_policy = "random"
            assert self.moe_freq > 0 and self.moe_expert_count > 0

    def override(self, args):
        for hp in self.__dict__.keys():
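Because the `deepnorm` branch runs first, it wins when both flags are set: it switches off `subln` and forces post-norm. A minimal sketch of the resulting config state, reusing the constructor arguments from the README example above:

```python
>>> from torchscale.architecture.config import EncoderConfig

>>> # deepnorm is checked before subln, so it disables subln and post-norm wins.
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, subln=True)
>>> config.deepnorm, config.subln, config.encoder_normalize_before
(True, False, False)
```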
@@ -93,9 +99,15 @@ class DecoderConfig(object):
        self.ddp_rank = kwargs.pop("ddp_rank", 0)

        if self.deepnorm:
            self.decoder_normalize_before = False
            self.encoder_normalize_before = False
            self.subln = False
        if self.subln:
            self.decoder_normalize_before = True
            self.encoder_normalize_before = True
            self.deepnorm = False
        if self.use_xmoe:
            self.moe_normalize_gate_prob_before_dropping = True
            self.moe_second_expert_policy = "random"
            assert self.moe_freq > 0 and self.moe_expert_count > 0

    def override(self, args):
        for hp in self.__dict__.keys():
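The `use_xmoe` branch shows what an X-MoE config implies: gate probabilities are normalized before dropping, the second-expert policy is set to `"random"`, and both `moe_freq` and `moe_expert_count` must be positive or the assert fires. A sketch with illustrative values:

```python
>>> from torchscale.architecture.config import DecoderConfig

>>> # moe_freq and moe_expert_count are placeholders; omitting them would
>>> # trip the assert in __init__.
>>> config = DecoderConfig(vocab_size=64000, use_xmoe=True, moe_freq=2, moe_expert_count=8)
>>> config.moe_second_expert_policy, config.moe_normalize_gate_prob_before_dropping
('random', True)
```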
@@ -151,9 +163,15 @@ class EncoderDecoderConfig(object):
        if self.deepnorm:
            self.encoder_normalize_before = False
            self.decoder_normalize_before = False
            self.subln = False
        if self.subln:
            self.encoder_normalize_before = True
            self.decoder_normalize_before = True
            self.deepnorm = False
        if self.use_xmoe:
            self.moe_normalize_gate_prob_before_dropping = True
            self.moe_second_expert_policy = "random"
            assert self.moe_freq > 0 and self.moe_expert_count > 0

    def override(self, args):
        for hp in self.__dict__.keys():