the learning rate scheduler is a tough pill to swallow
This commit is contained in:
parent b97faa8173
commit ddc49c89c5
@@ -504,7 +504,6 @@ class Hyperparameters:
    warmup_steps: int = 0 # number of steps to warm up the optimizer before performing updates; I think this is just passed to deepspeed

    scheduler: str = "" # scheduler to use; currently one is never used by default, so this doesn't really matter
    scheduler_type: str = "" # deprecated
    scheduler_params: dict = field(default_factory=lambda: {}) # passed through to the deepspeed config

    autotune: bool = False # run deepspeed's autotuning
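For reference, a hedged sketch of how these fields might be filled in. The plain-dict shape and the values are illustrative assumptions (the repo loads its configs elsewhere); only the field names and the accepted scheduler names come from this commit.

# hypothetical hyperparameters block, not taken from the repo's actual config files
hyperparameters = {
    "warmup_steps": 250,          # consumed by the "noam"/"warmup" schedulers added below
    "scheduler": "warmup",        # "", "onecycle", "cosineannealing", "noam", or "warmup"
    "scheduler_params": {},       # extra kwargs forwarded verbatim to the scheduler class
}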
@@ -1063,10 +1062,6 @@ class Config(BaseConfig):
        if model.training:
            model.teacher = False

        if self.hyperparameters.scheduler_type and not self.hyperparameters.scheduler:
            self.hyperparameters.scheduler = self.hyperparameters.scheduler_type
            self.hyperparameters.scheduler_type = ""

        # do not combine the two
        if self.hyperparameters.scheduler == "schedulefree" and self.optimizations.dadaptation:
            self.hyperparameters.scheduler = ""
@@ -175,10 +175,32 @@ def load_engines(training=True, **model_kwargs):
                lr = params['lr'],
                warmup_steps = cfg.hyperparameters.warmup_steps
            )
        elif cfg.hyperparameters.scheduler:
            scheduler_kwargs = {}
            if cfg.hyperparameters.scheduler.lower() == "onecycle":
                scheduler_class = ml.OneCycleLR
                scheduler_kwargs["max_lr"] = params['lr']
            elif cfg.hyperparameters.scheduler.lower() == "cosineannealing":
                scheduler_class = ml.CosineAnnealingLR
            elif cfg.hyperparameters.scheduler.lower() == "noam":
                scheduler_class = ml.NoamLR
                scheduler_kwargs["d_model"] = model.d_model
                scheduler_kwargs["warmup_steps"] = cfg.hyperparameters.warmup_steps
            elif cfg.hyperparameters.scheduler.lower() == "warmup":
                scheduler_class = ml.WarmupLR
                scheduler_kwargs["warmup_steps"] = cfg.hyperparameters.warmup_steps
            else:
                raise ValueError(f'Scheduler specified not implemented: {cfg.hyperparameters.scheduler}')

            scheduler_kwargs.update(cfg.hyperparameters.scheduler_params)
            lr_scheduler = scheduler_class(
                optimizer,
                **scheduler_kwargs,
            )
        """
        # set up our LR scheduler here
        """

        """
        # set up our LR scheduler here
        """

        if inferencing:
            optimizer = None
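A minimal standalone sketch of the same selection logic, using only torch's built-in schedulers so it runs without the rest of the repo; scheduler_name and scheduler_params are stand-ins for the cfg.hyperparameters fields used above.

import torch

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0e-4)

scheduler_name = "cosineannealing"   # stand-in for cfg.hyperparameters.scheduler
scheduler_params = {"T_max": 1000}   # stand-in for cfg.hyperparameters.scheduler_params

scheduler_kwargs = {}
if scheduler_name == "onecycle":
    scheduler_class = torch.optim.lr_scheduler.OneCycleLR
    scheduler_kwargs["max_lr"] = 1.0e-4
elif scheduler_name == "cosineannealing":
    scheduler_class = torch.optim.lr_scheduler.CosineAnnealingLR
else:
    raise ValueError(f'Scheduler specified not implemented: {scheduler_name}')

# user-supplied params extend or override the defaults picked above
scheduler_kwargs.update(scheduler_params)
lr_scheduler = scheduler_class(optimizer, **scheduler_kwargs)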
@@ -64,21 +64,20 @@ class Engine():
        kwargs.pop("hyper_config")

        self.module = kwargs['model'].to(cfg.device).to(torch.float32 if cfg.trainer.amp else cfg.trainer.dtype)
        self.optimizer = kwargs['optimizer'] if 'optimizer' in kwargs else None
        self.lr_scheduler = kwargs['lr_scheduler'] if 'lr_scheduler' in kwargs else None

        stats = kwargs.pop("stats", {})
        if stats is None:
            stats = {}
        self.global_steps = stats.pop("global_step", 0)
        self.micro_steps = stats.pop("micro_step", 0)
        self.global_samples = stats.pop("global_samples", 0)
        self.tokens_processed = stats.pop("tokens_processed", 0)

        self._frozen_params = set()

        self.optimizer = kwargs.get('optimizer', None)
        self.lr_scheduler = kwargs.get('lr_scheduler', None)
        self.loss_scaler = torch.cuda.amp.GradScaler() if cfg.trainer.scale_loss else None

        stats = kwargs.get("stats", {})
        if stats is None:
            stats = {}

        self.global_steps = stats.get("global_step", 0)
        self.micro_steps = stats.get("micro_step", 0)
        self.global_samples = stats.get("global_samples", 0)
        self.tokens_processed = stats.get("tokens_processed", 0)

        self._frozen_params = set()
        self.current_batch_size = 0
        self._global_grad_norm = None
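A quick aside on the .pop(...) to .get(...) swap in the hunk above: .get() reads a value but leaves the key in the dict, while .pop() consumes it. Plain-Python sketch, not repo code:

kwargs = {"stats": {"global_step": 5}}

kept = dict(kwargs)
kept.get("stats", {})        # reads the value; "stats" stays in the dict
assert "stats" in kept

consumed = dict(kwargs)
consumed.pop("stats", {})    # reads the value and removes the key
assert "stats" not in consumed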
@@ -256,9 +255,11 @@ class Engine():
            self.loss_scaler.update()
        else:
            self.optimizer.step()

        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        self.optimizer.zero_grad()

        # self._get_grad_norm()

    # doesn't actually work
    def _get_grad_norm(self):
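For context, a minimal plain-torch sketch of the ordering the engine now follows: step the optimizer, then advance the scheduler once per optimizer update, then clear gradients. The model, optimizer, and scheduler here are placeholders, not the repo's own classes.

import torch

net = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(net.parameters(), lr=1.0e-2)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)

for _ in range(3):
    loss = net(torch.randn(2, 4)).sum()
    loss.backward()
    opt.step()       # update parameters first
    sched.step()     # then advance the LR schedule once per optimizer step
    opt.zero_grad()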
@@ -34,10 +34,7 @@ if not distributed_initialized() and cfg.trainer.backend == "deepspeed":

class Engine(DeepSpeedEngine):
    def __init__(self, *args, **kwargs):
        self.hyper_config = None
        if 'hyper_config' in kwargs:
            self.hyper_config = kwargs['hyper_config']
            kwargs.pop("hyper_config")
        self.hyper_config = kwargs.pop('hyper_config', None)

        kwargs['config'] = cfg.trainer.deepspeed.ds_cfg
        kwargs['config_class'] = DeepSpeedConfig(kwargs['config'])
@@ -50,18 +47,18 @@ class Engine(DeepSpeedEngine):
        }

        # kwargs['stats'] = None will return None when popped
        maybe_stats = kwargs.pop('stats', stats)
        maybe_stats = kwargs.get('stats', stats)
        if maybe_stats is not None:
            stats = maybe_stats

        super().__init__(None, *args, **kwargs)
        self._frozen_params = set()

        self.global_steps = stats["global_step"]
        self.micro_steps = stats["micro_step"]
        self.global_samples = stats["global_samples"]
        self.tokens_processed = stats["tokens_processed"]

        self._frozen_params = set()
        self.current_batch_size = 0

    def freeze(self, freeze_all=True):
@@ -952,6 +952,31 @@ def example_usage():
    if scheduler is not None:
        _logger.info(f"Scheduler: {scheduler}")
        optimizer = scheduler( model.parameters(), lr = learning_rate )
    elif cfg.hyperparameters.scheduler:
        scheduler_kwargs = {}
        if scheduler == "onecycle":
            scheduler_class = ml.OneCycleLR
            scheduler_kwargs["max_lr"] = params['lr']
        elif scheduler == "cosineannealing":
            scheduler_class = ml.CosineAnnealingLR
        elif scheduler == "noam":
            scheduler_class = ml.NoamLR
            scheduler_kwargs["d_model"] = model.d_model
            scheduler_kwargs["warmup_steps"] = cfg.hyperparameters.warmup_steps
        elif scheduler == "warmup":
            scheduler_class = ml.WarmupLR
            scheduler_kwargs["warmup_steps"] = cfg.hyperparameters.warmup_steps
        else:
            raise ValueError(f'Scheduler specified not implemented: {cfg.hyperparameters.scheduler}')

        scheduler_kwargs.update(cfg.hyperparameters.scheduler_params)
        scheduler = scheduler_class(
            optimizer,
            **scheduler_kwargs,
        )

    if isinstance(scheduler, str):
        scheduler = None

    if cfg.optimizations.replace and cfg.optimizations.linear:
        model = ml.replace_linear( model )
@@ -968,7 +993,7 @@ def example_usage():
    }
    """

    engine = Engine(model=model, optimizer=optimizer)
    engine = Engine(model=model, optimizer=optimizer, lr_scheduler=scheduler)
    engines = Engines({"ar+nar": engine})
    engines.setup()
@@ -1047,7 +1072,7 @@ def example_usage():
    for i in t:
        texts, proms, resps, tasks = sample_data()

        stats = {"step": i}
        stats = {"step": i, "lr": engine.get_lr()[0]}
        with torch.autograd.set_detect_anomaly(cfg.trainer.detect_grad_anomaly):
            stats |= engine.traverse(phns_list=texts, proms_list=proms, resps_list=resps, task_list=tasks, training=True)
            stats |= {"grad_norm": engine.get_global_grad_norm()}
@@ -1,9 +1,11 @@
from contextlib import contextmanager

import math
import logging

import torch
import torch.nn.functional as F
import logging
from torch.optim.lr_scheduler import _LRScheduler

from ..config import cfg
@@ -17,6 +19,43 @@ AdamW = torch.optim.AdamW
SGD = torch.optim.SGD
Adagrad = torch.optim.Adagrad

OneCycleLR = torch.optim.lr_scheduler.OneCycleLR
CosineAnnealingLR = torch.optim.lr_scheduler.CosineAnnealingLR
LambdaLR = torch.optim.lr_scheduler.LambdaLR

# implements Noam scheduling
# it's cringe
class NoamLR(_LRScheduler):
    def __init__(self, optimizer, warmup_steps, d_model=1024, last_epoch=-1):
        self.base_factor = d_model ** (-0.5)
        self.warmup_steps = warmup_steps

        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = max(1, self.last_epoch)
        scale = self.base_factor * min(step ** (-0.5), step * self.warmup_steps ** (-1.5))

        return [base_lr * scale for base_lr in self.base_lrs]

# gradually warms up LR then holds or decays
class WarmupLR(_LRScheduler):
    def __init__(self, optimizer, warmup_steps, decay_factor=0.0, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.decay_factor = decay_factor

        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = self.last_epoch + 1
        scale = 1
        if step < self.warmup_steps:
            scale = float(step) / float(max(1, self.warmup_steps))
        elif self.decay_factor != 0:
            scale = (1.0 - self.decay_factor) ** (step - self.warmup_steps)

        return [base_lr * scale for base_lr in self.base_lrs]

# https://github.com/kyegomez/BitNet
if cfg.optimizations.bitnet:
    from bitnet import BitLinear
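A small sanity-check sketch, not part of the commit, for the WarmupLR added above: assuming the class and its _LRScheduler import have been pasted into a session, the LR ramps linearly over the warmup window and then holds at the base LR when decay_factor is 0.

import torch

net = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(net.parameters(), lr=1.0e-3)
sched = WarmupLR(opt, warmup_steps=4)

for step in range(6):
    opt.step()
    sched.step()
    print(step, sched.get_last_lr())   # climbs during warmup, then holds at 1e-3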