Update MoE criterions
parent 8d8b80a731
commit 670113e446
@@ -127,7 +127,6 @@ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=8 train.py ${PATH
     --span-length 3.0 \
     --leave-unmasked-prob 0.0 \
     --random-token-prob 0.0 \
-    --criterion masked_lm \
     --arch mlm_base \
     --share-encoder-input-output-embed \
     --required-batch-size-multiple 8 \
@@ -165,7 +164,7 @@ python -m torch.distributed.launch --nproc_per_node=8 --nnodes=8 train.py ${PATH
     --moe-expert-count 64 --moe-freq 2 \
     --moe-gating-use-fp32 --moe-second-expert-policy random --moe-normalize-gate-prob-before-dropping \
     --moe-eval-capacity-token-fraction -1.0 \
-    --criterion moe_cross_entropy --moe-gate-loss-wt 0.01 --moe-gate-loss-combine-method sum \
+    --criterion masked_lm_moe_cross_entropy --moe-gate-loss-wt 0.01 --moe-gate-loss-combine-method sum \
     --use-xmoe --pad-to-max-length
 ```
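The command now selects the new `masked_lm_moe_cross_entropy` criterion, which keeps the usual masked-LM cross entropy as the "inner" loss and lets the MoE base criterion add the routers' load-balancing (gate) loss on top. The sketch below only illustrates how the two gate-loss flags interact; the names `inner_loss`, `gate_losses`, `gate_loss_wt`, and `combine_method` are stand-ins, and the real bookkeeping lives inside fairseq's `MoECriterion`, not in this commit.

```python
# Illustrative sketch: how an inner masked-LM loss and per-layer MoE gate
# losses might be combined under --moe-gate-loss-wt / --moe-gate-loss-combine-method.
def combine_losses(inner_loss, gate_losses, gate_loss_wt=0.01, combine_method="sum"):
    # gate_losses holds one load-balancing term per MoE layer (every second
    # layer with --moe-freq 2); it nudges the router toward spreading tokens
    # evenly across the 64 experts.
    if combine_method == "sum":
        gate_loss = sum(gate_losses)
    else:  # "average"
        gate_loss = sum(gate_losses) / len(gate_losses)
    return inner_loss + gate_loss_wt * gate_loss
```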
examples/fairseq/criterions/__init__.py (new file, 0 lines)
examples/fairseq/criterions/masked_lm_moe.py (new file, 73 lines)
examples/fairseq/criterions/masked_lm_moe.py
@@ -0,0 +1,73 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch
import torch.nn.functional as F
from fairseq import metrics, utils
from fairseq.criterions import MoECriterion, register_criterion, MoECriterionConfig


@register_criterion("masked_lm_moe_cross_entropy", dataclass=MoECriterionConfig)
class MaskedLMMoECrossEntropyCriterion(MoECriterion):

    def compute_inner_loss(self, model, sample, reduce=True):
        # Only masked positions carry a prediction target.
        masked_tokens = sample["target"].ne(self.padding_idx)
        sample_size = masked_tokens.int().sum()

        # Rare case: if no token is masked, project all tokens instead of an
        # empty selection (torch.where keeps the decision on-device).
        masked_tokens = torch.where(
            masked_tokens.any(),
            masked_tokens,
            masked_tokens.new([True]),
        )

        net_output = model(**sample["net_input"], masked_tokens=masked_tokens)
        sample_size = (
            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
        )
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        target = model.get_targets(sample, net_output).view(-1)

        if masked_tokens is not None:
            target = target[masked_tokens.view(-1)]

        nll_loss = F.nll_loss(
            lprobs,
            target,
            ignore_index=self.padding_idx,
            reduction="sum" if reduce else "none",
        )
        logging_output = {
            "inner_loss": nll_loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
        }
        return net_output, nll_loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""
        MoECriterion.reduce_moe_metrics(logging_outputs)

        loss_sum = sum(log.get("inner_loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

        # we divide by log(2) to convert the loss from base e to base 2
        metrics.log_scalar(
            "inner_loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        if sample_size != ntokens:
            metrics.log_scalar(
                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
            )
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
            )
        else:
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["inner_loss"].avg)
            )
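Registration happens as a side effect of importing the module, so a quick sanity check is to import it and look the name up in fairseq's criterion registry. A minimal sketch, assuming `examples/fairseq` is on `sys.path` (the empty `criterions/__init__.py` added by this commit makes the package importable) and that the installed fairseq fork exposes `CRITERION_REGISTRY` alongside the MoE criterion base classes:

```python
# Hypothetical smoke test: importing the module runs @register_criterion.
import criterions.masked_lm_moe  # noqa: F401

from fairseq.criterions import CRITERION_REGISTRY

assert "masked_lm_moe_cross_entropy" in CRITERION_REGISTRY
print(CRITERION_REGISTRY["masked_lm_moe_cross_entropy"].__name__)
# expected: MaskedLMMoECrossEntropyCriterion
```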