# torchscale/examples/fairseq/tasks/pretraining.py
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
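
"""Masked-LM pretraining task for the TorchScale fairseq example.

Summary inferred from the code below (the file itself carries no docstring):
the task loads a fairseq Dictionary and a SentencePiece model once in
``setup_task``, registers mask/sentinel symbols, and delegates all
tokenization and batching to the infinibatch-based ``MLMLoader``.
"""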

import json
import logging
import os
from argparse import Namespace

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from dataclasses import dataclass, field

import sentencepiece as spm
from fairseq import utils
from fairseq.data import Dictionary
from fairseq.dataclass import ChoiceEnum, FairseqDataclass
from fairseq.tasks import FairseqTask, register_task
from omegaconf import II, MISSING

from .data.mlm_loader import MLMLoader

logger = logging.getLogger(__name__)

SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])

@dataclass
class PretrainingConfig(FairseqDataclass):
    data: str = field(
        default=MISSING,
        metadata={
            "help": "colon-separated list of data directories, "
            "iterated over in round-robin order across epochs"
        },
    )
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="complete",
        metadata={
            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    mask_prob: float = field(
        default=0.15,
        metadata={"help": "probability of replacing a token with mask"},
    )
    leave_unmasked_prob: float = field(
        default=0.1,
        metadata={"help": "probability that a masked token is unmasked"},
    )
    random_token_prob: float = field(
        default=0.1,
        metadata={"help": "probability of replacing a token with a random token"},
    )
    freq_weighted_replacement: bool = field(
        default=False,
        metadata={"help": "sample random replacement words based on word frequencies"},
    )
    mask_whole_words: bool = field(
        default=False,
        metadata={"help": "mask whole words; you may also want to set --bpe"},
    )
    mask_multiple_length: int = field(
        default=1,
        metadata={"help": "repeat the mask indices multiple times"},
    )
    mask_stdev: float = field(
        default=0.0,
        metadata={"help": "stdev of the mask length"},
    )
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    seed: int = II("common.seed")
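
    # The next three options appear to configure a T5-style span-corruption
    # objective (consumed by MLMLoader); the grouping is inferred from the
    # field names, not documented in this file.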
    span_length: float = field(
        default=3.0,
        metadata={"help": "average span length for masking"},
    )
    remove_source_sentinel: bool = field(
        default=False,
        metadata={"help": "remove the source sentinel for the span corruption task"},
    )
    remove_target_sentinel: bool = field(
        default=False,
        metadata={"help": "remove the target sentinel for the span corruption task"},
    )
    batch_read_ahead: int = field(
        default=100000,
        metadata={"help": "batch read ahead size for infinibatch"},
    )
    required_batch_size_multiple: int = II("dataset.required_batch_size_multiple")
    spm_model: str = field(
        default="",
        metadata={"help": "sentencepiece model to tokenize the data"},
    )
    dict_file: str = field(
        default="",
        metadata={
            "help": "path to a fairseq dictionary file; if empty, dict.txt "
            "from the first data directory is used"
        },
    )
    pad_to_max_length: bool = field(
        default=False,
    )

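
# Registered with fairseq as `--task pretraining`. A typical launch (an
# illustrative sketch, not a command documented in this repo) might look like:
#
#   fairseq-train /path/to/data \
#       --task pretraining \
#       --spm-model /path/to/sentencepiece.model \
#       --dict-file /path/to/dict.txt \
#       --tokens-per-sample 1024
#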
@register_task("pretraining", dataclass=PretrainingConfig)
class PLMTask(FairseqTask):
    def __init__(self, cfg, dictionary, tokenizer):
        super().__init__(cfg)
        self.cfg = cfg
        self.dictionary = dictionary
        self.tokenizer = tokenizer
        self.seed = cfg.seed
        self.mask_idx = dictionary.index("<mask>")

    @classmethod
    def setup_task(cls, cfg, **kwargs):
        paths = utils.split_paths(cfg.data)
        assert len(paths) > 0
        if cfg.dict_file != "":
            dictionary = Dictionary.load(cfg.dict_file)
        else:
            dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        # add mask token
        dictionary.add_symbol("<mask>")
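        # The 100 numbered <mask_i> symbols below look like T5-style sentinel
        # tokens for span corruption (cf. remove_source_sentinel /
        # remove_target_sentinel in the config); this reading is an inference,
        # not something the file states.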
        for i in range(100):
            dictionary.add_symbol(f"<mask_{i}>")
        dictionary.pad_to_multiple_(cfg.required_batch_size_multiple)
        logger.info("dictionary: {} types".format(len(dictionary)))

        tokenizer = spm.SentencePieceProcessor()
        tokenizer.Load(cfg.spm_model)
        return cls(cfg, dictionary, tokenizer)

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        # Not a FairseqDataset: just a record describing the split, consumed
        # by MLMLoader in get_batch_iterator. Note that this path construction
        # assumes cfg.data is a single directory, even though the config help
        # allows a colon-separated list.
        with open(f"{self.cfg.data}/json/{split}.json") as f:
            data = json.load(f)
        self.datasets[split] = Namespace(
            data=data,
            data_dir=self.cfg.data,
            shuffle=split == "train",
        )

    def dataset(self, split):
        if split not in self.datasets:
            raise KeyError("Dataset not loaded: " + split)
        return self.datasets[split]
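
    # Rather than building fairseq's usual EpochBatchIterator, this task hands
    # the split record straight to the infinibatch-backed MLMLoader, which does
    # its own tokenization, masking, and batching; that is why several of the
    # standard arguments below (num_workers, epoch, data_buffer_size, ...) are
    # accepted but not forwarded.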
    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        disable_iterator_cache=False,
        **kwargs,
    ):
        return MLMLoader(
            self.cfg,
            dataset,
            self.dictionary,
            self.tokenizer,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            max_positions=max_positions,
            ignore_invalid_inputs=ignore_invalid_inputs,
            required_batch_size_multiple=required_batch_size_multiple,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
        )

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary
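

# Sketch of programmatic use (hypothetical paths, for orientation only; the
# normal entry point is fairseq-train as noted above). seed and
# required_batch_size_multiple normally come from omegaconf interpolation,
# so they are set explicitly here:
#
#   cfg = PretrainingConfig(
#       data="/data/corpus",
#       spm_model="/data/sp.model",
#       seed=1,
#       required_batch_size_multiple=8,
#   )
#   task = PLMTask.setup_task(cfg)
#   task.load_dataset("train")
#   batches = task.get_batch_iterator(task.dataset("train"), max_tokens=4096)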