From bc140c65bb10d2aba06cfbddf92d342780eb2e36 Mon Sep 17 00:00:00 2001 From: buaahsh Date: Sun, 5 Mar 2023 07:43:58 +0000 Subject: [PATCH 1/3] fix bert moe --- examples/fairseq/models/bert.py | 3 +++ examples/fairseq/tasks/data/mlm_loader.py | 3 +++ examples/fairseq/tasks/pretraining.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/examples/fairseq/models/bert.py b/examples/fairseq/models/bert.py index f804973..c7c652d 100644 --- a/examples/fairseq/models/bert.py +++ b/examples/fairseq/models/bert.py @@ -130,6 +130,9 @@ class BertConfig(FairseqDataclass): tpu: bool = II("common.tpu") rel_pos_buckets: int = field(default=0, metadata={"help": ""}) max_rel_pos: int = field(default=0, metadata={"help": ""}) + use_xmoe: Optional[bool] = field( + default=False, + ) moe_freq: int = field( default=0, metadata={"help": "Frequency at which we insert MoE Transformer layers"}, diff --git a/examples/fairseq/tasks/data/mlm_loader.py b/examples/fairseq/tasks/data/mlm_loader.py index eb9cd72..510f654 100644 --- a/examples/fairseq/tasks/data/mlm_loader.py +++ b/examples/fairseq/tasks/data/mlm_loader.py @@ -166,6 +166,9 @@ class MLMLoader(BaseBatchGen): mlm_target_max_length = max([len(x[1]) for x in batch]) s2s_source_max_length = max([len(x[2]) for x in batch]) s2s_target_max_length = max([len(x[3]) for x in batch]) + if self.args.pad_to_max_length: + mlm_source_max_length = self.args.tokens_per_sample + mlm_target_max_length = self.args.tokens_per_sample mlm_source_ids = np.full( shape=(batch_size, mlm_source_max_length), diff --git a/examples/fairseq/tasks/pretraining.py b/examples/fairseq/tasks/pretraining.py index 2d32127..2022907 100644 --- a/examples/fairseq/tasks/pretraining.py +++ b/examples/fairseq/tasks/pretraining.py @@ -117,6 +117,9 @@ class PretrainingConfig(FairseqDataclass): default="", metadata={"help": ""}, ) + pad_to_max_length: bool = field( + default=False, + ) @register_task("pretraining", dataclass=PretrainingConfig) From 
95aea9c1b46cbe3b33f7bede0e340f66a0214906 Mon Sep 17 00:00:00 2001 From: Shaohan Huang Date: Sun, 5 Mar 2023 19:36:07 +0800 Subject: [PATCH 2/3] set numpy version --- examples/fairseq/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fairseq/README.md b/examples/fairseq/README.md index bd3d50c..d2d42a3 100644 --- a/examples/fairseq/README.md +++ b/examples/fairseq/README.md @@ -10,7 +10,7 @@ pip install -e . pip install git+https://github.com/shumingma/fairseq.git@moe pip install git+https://github.com/shumingma/infinibatch.git pip install iopath -pip install --upgrade numpy +pip install numpy==1.23.0 ``` ## Example: BERT Pretraining From 5b0be94ab86e99748e2354e3ee7992790d1ea3ae Mon Sep 17 00:00:00 2001 From: Shaohan Huang Date: Sun, 5 Mar 2023 19:39:04 +0800 Subject: [PATCH 3/3] add --pad-to-max-length in bert+moe example --- examples/fairseq/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fairseq/README.md b/examples/fairseq/README.md index d2d42a3..d1c5500 100644 --- a/examples/fairseq/README.md +++ b/examples/fairseq/README.md @@ -166,7 +166,7 @@ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=8 train.py ${PATH --moe-gating-use-fp32 --moe-second-expert-policy random --moe-normalize-gate-prob-before-dropping \ --moe-eval-capacity-token-fraction -1.0 \ --criterion moe_cross_entropy --moe-gate-loss-wt 0.01 --moe-gate-loss-combine-method sum \ - --use-xmoe + --use-xmoe --pad-to-max-length ``` ## Example: GPT Pretraining