From 1354614d4413a0dabdb02bd6c1cdc2d544afd00b Mon Sep 17 00:00:00 2001 From: shumingma Date: Sat, 26 Nov 2022 08:15:08 -0800 Subject: [PATCH] Update config file --- torchscale/architecture/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchscale/architecture/config.py b/torchscale/architecture/config.py index ad8c111..7fb5aca 100644 --- a/torchscale/architecture/config.py +++ b/torchscale/architecture/config.py @@ -22,7 +22,7 @@ class EncoderConfig(object): self.moe_eval_capacity_token_fraction = kwargs.pop("moe_eval_capacity_token_fraction", 0.25) self.moe_second_expert_policy = kwargs.pop("moe_second_expert_policy", "random") self.moe_normalize_gate_prob_before_dropping = kwargs.pop("moe_normalize_gate_prob_before_dropping", False) - self.use_xmoe = kwargs.pop("use_xmoe", True) + self.use_xmoe = kwargs.pop("use_xmoe", False) self.rel_pos_buckets = kwargs.pop("rel_pos_buckets", 0) self.max_rel_pos = kwargs.pop("max_rel_pos", 0) self.deepnorm = kwargs.pop("deepnorm", False) @@ -81,7 +81,7 @@ class DecoderConfig(object): self.moe_eval_capacity_token_fraction = kwargs.pop("moe_eval_capacity_token_fraction", 0.25) self.moe_second_expert_policy = kwargs.pop("moe_second_expert_policy", "random") self.moe_normalize_gate_prob_before_dropping = kwargs.pop("moe_normalize_gate_prob_before_dropping", False) - self.use_xmoe = kwargs.pop("use_xmoe", True) + self.use_xmoe = kwargs.pop("use_xmoe", False) self.rel_pos_buckets = kwargs.pop("rel_pos_buckets", 0) self.max_rel_pos = kwargs.pop("max_rel_pos", 0) self.deepnorm = kwargs.pop("deepnorm", False) @@ -141,7 +141,7 @@ class EncoderDecoderConfig(object): self.moe_eval_capacity_token_fraction = kwargs.pop("moe_eval_capacity_token_fraction", 0.25) self.moe_second_expert_policy = kwargs.pop("moe_second_expert_policy", "random") self.moe_normalize_gate_prob_before_dropping = kwargs.pop("moe_normalize_gate_prob_before_dropping", False) - self.use_xmoe = kwargs.pop("use_xmoe", True) + self.use_xmoe = kwargs.pop("use_xmoe", False) self.rel_pos_buckets = kwargs.pop("rel_pos_buckets", 0) self.max_rel_pos = kwargs.pop("max_rel_pos", 0) self.deepnorm = kwargs.pop("deepnorm", False)