From bc140c65bb10d2aba06cfbddf92d342780eb2e36 Mon Sep 17 00:00:00 2001 From: buaahsh Date: Sun, 5 Mar 2023 07:43:58 +0000 Subject: [PATCH 1/3] fix bert moe --- examples/fairseq/models/bert.py | 3 +++ examples/fairseq/tasks/data/mlm_loader.py | 3 +++ examples/fairseq/tasks/pretraining.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/examples/fairseq/models/bert.py b/examples/fairseq/models/bert.py index f804973..c7c652d 100644 --- a/examples/fairseq/models/bert.py +++ b/examples/fairseq/models/bert.py @@ -130,6 +130,9 @@ class BertConfig(FairseqDataclass): tpu: bool = II("common.tpu") rel_pos_buckets: int = field(default=0, metadata={"help": ""}) max_rel_pos: int = field(default=0, metadata={"help": ""}) + use_xmoe: Optional[bool] = field( + default=False, + ) moe_freq: int = field( default=0, metadata={"help": "Frequency at which we insert MoE Transformer layers"}, diff --git a/examples/fairseq/tasks/data/mlm_loader.py b/examples/fairseq/tasks/data/mlm_loader.py index eb9cd72..510f654 100644 --- a/examples/fairseq/tasks/data/mlm_loader.py +++ b/examples/fairseq/tasks/data/mlm_loader.py @@ -166,6 +166,9 @@ class MLMLoader(BaseBatchGen): mlm_target_max_length = max([len(x[1]) for x in batch]) s2s_source_max_length = max([len(x[2]) for x in batch]) s2s_target_max_length = max([len(x[3]) for x in batch]) + if self.args.pad_to_max_length: + mlm_source_max_length = self.args.tokens_per_sample + mlm_target_max_length = self.args.tokens_per_sample mlm_source_ids = np.full( shape=(batch_size, mlm_source_max_length), diff --git a/examples/fairseq/tasks/pretraining.py b/examples/fairseq/tasks/pretraining.py index 2d32127..2022907 100644 --- a/examples/fairseq/tasks/pretraining.py +++ b/examples/fairseq/tasks/pretraining.py @@ -117,6 +117,9 @@ class PretrainingConfig(FairseqDataclass): default="", metadata={"help": ""}, ) + pad_to_max_length: bool = field( + default=False, + ) @register_task("pretraining", dataclass=PretrainingConfig) From 
95aea9c1b46cbe3b33f7bede0e340f66a0214906 Mon Sep 17 00:00:00 2001 From: Shaohan Huang Date: Sun, 5 Mar 2023 19:36:07 +0800 Subject: [PATCH 2/3] set numpy version --- examples/fairseq/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fairseq/README.md b/examples/fairseq/README.md index bd3d50c..d2d42a3 100644 --- a/examples/fairseq/README.md +++ b/examples/fairseq/README.md @@ -10,7 +10,7 @@ pip install -e . pip install git+https://github.com/shumingma/fairseq.git@moe pip install git+https://github.com/shumingma/infinibatch.git pip install iopath -pip install --upgrade numpy +pip install numpy==1.23.0 ``` ## Example: BERT Pretraining From 5b0be94ab86e99748e2354e3ee7992790d1ea3ae Mon Sep 17 00:00:00 2001 From: Shaohan Huang Date: Sun, 5 Mar 2023 19:39:04 +0800 Subject: [PATCH 3/3] add --pad-to-max-length in bert+moe example --- examples/fairseq/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fairseq/README.md b/examples/fairseq/README.md index d2d42a3..d1c5500 100644 --- a/examples/fairseq/README.md +++ b/examples/fairseq/README.md @@ -166,7 +166,7 @@ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=8 train.py ${PATH --moe-gating-use-fp32 --moe-second-expert-policy random --moe-normalize-gate-prob-before-dropping \ --moe-eval-capacity-token-fraction -1.0 \ --criterion moe_cross_entropy --moe-gate-loss-wt 0.01 --moe-gate-loss-combine-method sum \ - --use-xmoe + --use-xmoe --pad-to-max-length ``` ## Example: GPT Pretraining