Add features section
This commit is contained in:
parent
afd9094fb5
commit
5cbb7980a9
33
README.md
33
README.md
|
@ -64,6 +64,39 @@ We also support the `Decoder` architecture and the `EncoderDecoder` architecture
|
||||||
>>> print(encdec)
|
>>> print(encdec)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
- [DeepNorm to improve the training stability of Post-LayerNorm Transformers](https://arxiv.org/abs/2203.00555)
|
||||||
|
* enabled by setting *deepnorm=True* in the `Config` class.
|
||||||
|
|
||||||
|
- [SubLN for model generality and training stability](https://arxiv.org/abs/2210.06423)
|
||||||
|
* enabled by setting *subln=True*, which is the default.
|
||||||
|
* Note that SubLN and DeepNorm cannot be used together in a single model; setting *deepnorm=True* turns SubLN off.
|
||||||
|
|
||||||
|
- [X-MoE: efficient and finetunable sparse MoE modeling](https://arxiv.org/abs/2204.09179)
|
||||||
|
* enabled by *use_xmoe=True*.
|
||||||
|
|
||||||
|
- [Multiway architecture for multimodality](https://arxiv.org/abs/2208.10442)
|
||||||
|
* enabled by *multiway=True*.
|
||||||
|
|
||||||
|
- [Relative position bias](https://arxiv.org/abs/1910.10683)
|
||||||
|
* enabled by adjusting *rel_pos_buckets* and *max_rel_pos*.
|
||||||
|
|
||||||
|
- [SparseClip: improving the gradient clipping for sparse MoE models](https://arxiv.org/abs/2211.13184)
|
||||||
|
* we provide [sample code](examples/fairseq/utils/sparse_clip.py) that can be easily adapted to FairSeq (or other) repos.
|
||||||
|
|
||||||
|
Most of the features above can be used by simply passing the corresponding parameters to the config. For example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
>>> from torchscale.architecture.config import EncoderConfig
|
||||||
|
>>> from torchscale.architecture.encoder import Encoder
|
||||||
|
|
||||||
|
>>> config = EncoderConfig(vocab_size=64000, deepnorm=True, multiway=True)
|
||||||
|
>>> model = Encoder(config)
|
||||||
|
|
||||||
|
>>> print(model)
|
||||||
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
We provide examples of how to use TorchScale in the following scenarios/tasks:
|
We provide examples of how to use TorchScale in the following scenarios/tasks:
|
||||||
|
|
|
@ -45,8 +45,14 @@ class EncoderConfig(object):
|
||||||
|
|
||||||
if self.deepnorm:
|
if self.deepnorm:
|
||||||
self.encoder_normalize_before = False
|
self.encoder_normalize_before = False
|
||||||
|
self.subln = False
|
||||||
if self.subln:
|
if self.subln:
|
||||||
self.encoder_normalize_before = True
|
self.encoder_normalize_before = True
|
||||||
|
self.deepnorm = False
|
||||||
|
if self.use_xmoe:
|
||||||
|
self.moe_normalize_gate_prob_before_dropping = True
|
||||||
|
self.moe_second_expert_policy = "random"
|
||||||
|
assert self.moe_freq > 0 and self.moe_expert_count > 0
|
||||||
|
|
||||||
def override(self, args):
|
def override(self, args):
|
||||||
for hp in self.__dict__.keys():
|
for hp in self.__dict__.keys():
|
||||||
|
@ -93,9 +99,15 @@ class DecoderConfig(object):
|
||||||
self.ddp_rank = kwargs.pop("ddp_rank", 0)
|
self.ddp_rank = kwargs.pop("ddp_rank", 0)
|
||||||
|
|
||||||
if self.deepnorm:
|
if self.deepnorm:
|
||||||
self.decoder_normalize_before = False
|
self.encoder_normalize_before = False
|
||||||
|
self.subln = False
|
||||||
if self.subln:
|
if self.subln:
|
||||||
self.decoder_normalize_before = True
|
self.encoder_normalize_before = True
|
||||||
|
self.deepnorm = False
|
||||||
|
if self.use_xmoe:
|
||||||
|
self.moe_normalize_gate_prob_before_dropping = True
|
||||||
|
self.moe_second_expert_policy = "random"
|
||||||
|
assert self.moe_freq > 0 and self.moe_expert_count > 0
|
||||||
|
|
||||||
def override(self, args):
|
def override(self, args):
|
||||||
for hp in self.__dict__.keys():
|
for hp in self.__dict__.keys():
|
||||||
|
@ -151,9 +163,15 @@ class EncoderDecoderConfig(object):
|
||||||
if self.deepnorm:
|
if self.deepnorm:
|
||||||
self.encoder_normalize_before = False
|
self.encoder_normalize_before = False
|
||||||
self.decoder_normalize_before = False
|
self.decoder_normalize_before = False
|
||||||
|
self.subln = False
|
||||||
if self.subln:
|
if self.subln:
|
||||||
self.encoder_normalize_before = True
|
self.encoder_normalize_before = True
|
||||||
self.decoder_normalize_before = True
|
self.decoder_normalize_before = True
|
||||||
|
self.deepnorm = False
|
||||||
|
if self.use_xmoe:
|
||||||
|
self.moe_normalize_gate_prob_before_dropping = True
|
||||||
|
self.moe_second_expert_policy = "random"
|
||||||
|
assert self.moe_freq > 0 and self.moe_expert_count > 0
|
||||||
|
|
||||||
def override(self, args):
|
def override(self, args):
|
||||||
for hp in self.__dict__.keys():
|
for hp in self.__dict__.keys():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user