add --pad-to-max-length in bert+moe example
parent 95aea9c1b4
commit 5b0be94ab8
@@ -166,7 +166,7 @@ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=8 train.py ${PATH
 --moe-gating-use-fp32 --moe-second-expert-policy random --moe-normalize-gate-prob-before-dropping \
 --moe-eval-capacity-token-fraction -1.0 \
 --criterion moe_cross_entropy --moe-gate-loss-wt 0.01 --moe-gate-loss-combine-method sum \
---use-xmoe
+--use-xmoe --pad-to-max-length
 ```
 
 ## Example: GPT Pretraining
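For context: `--pad-to-max-length` pads every batch to the full sequence length, so each step feeds the MoE layers a constant number of tokens; with fixed-capacity routing (cf. `--moe-eval-capacity-token-fraction`), a uniform batch shape keeps per-expert capacity identical across steps. Below is a minimal sketch of the padding semantics, assuming fairseq-style conventions; the `pad_to_max_length` helper and the `pad_id=1` default are illustrative, not the flag's actual implementation.

```python
import torch

def pad_to_max_length(seqs, max_len, pad_id=1):
    """Right-pad variable-length token sequences to a fixed max_len.

    With a constant (batch, max_len) shape, the MoE gate sees the same
    number of tokens every step, so per-expert capacity
    (~ capacity_factor * tokens / num_experts) stays constant as well.
    pad_id=1 mirrors fairseq's default pad index; adjust for your vocab.
    """
    batch = torch.full((len(seqs), max_len), pad_id, dtype=torch.long)
    for i, seq in enumerate(seqs):
        seq = seq[:max_len]  # truncate anything over the limit
        batch[i, : len(seq)] = torch.tensor(seq, dtype=torch.long)
    return batch

# Two ragged sequences padded to a fixed length of 8.
print(pad_to_max_length([[5, 6, 7], [9, 10, 11, 12, 13]], max_len=8))
# tensor([[ 5,  6,  7,  1,  1,  1,  1,  1],
#         [ 9, 10, 11, 12, 13,  1,  1,  1]])
```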