133 lines
6.2 KiB
Python
133 lines
6.2 KiB
Python
import math
|
|
import random
|
|
|
|
import torch
|
|
from torch import distributed
|
|
from torch._C._distributed_c10d import ReduceOp
|
|
|
|
from utils.util import opt_get
|
|
|
|
|
|
def create_batch_size_optimizer(opt_train):
|
|
if 'batch_size_optimizer' in opt_train.keys():
|
|
if opt_train['batch_size_optimizer']['type'] == 'gradient_direction':
|
|
return GradientDirectionOptimizer(opt_train)
|
|
return MegabatchBatchSizeOptimizer(opt_train)
|
|
|
|
|
|
def grad(p):
|
|
if p.grad is None:
|
|
return torch.tensor(0)
|
|
return p.grad.detach().clone()
|
|
|
|
|
|
# Base class for BatchSizeOptimizers.
|
|
class BatchSizeOptimizer:
|
|
def focus(self, optimizer):
|
|
pass
|
|
|
|
def should_step(self, it):
|
|
raise NotImplementedError
|
|
|
|
def get_statistics(self):
|
|
return {}
|
|
|
|
|
|
# BatchSizeOptimizer that just steps every megabatch.
|
|
class MegabatchBatchSizeOptimizer(BatchSizeOptimizer):
|
|
def __init__(self, opt_train):
|
|
pass
|
|
|
|
def should_step(self, it):
|
|
return True
|
|
|
|
|
|
# BatchSizeOptimizer that uses the gradient direction of a few parameters to determine when to step.
|
|
# Very similar to what is described in https://aclanthology.org/2020.acl-main.323.pdf
|
|
# Special note: this class will ALWAYS accumulate, at a minimum, 3 batches. Plan accordingly.
|
|
class GradientDirectionOptimizer(BatchSizeOptimizer):
|
|
def __init__(self, opt_train):
|
|
self.opt = opt_train['batch_size_optimizer']
|
|
self.max_full_batches = opt_get(self.opt, ['max_full_batches'], 10)
|
|
self.parameters_to_poll = opt_get(self.opt, ['poll_parameters'], 8)
|
|
self.recalculate_directions_every = opt_get(self.opt, ['recalculate_directions_steps'], 1)
|
|
self.current_model = None
|
|
|
|
# Metrics
|
|
self.steps_taken = 0
|
|
self.last_number_iterations = torch.zeros((128,))
|
|
self.last_number_iterations_i = 0
|
|
self.last_number_iterations_filled = False
|
|
|
|
def vector_angle(self, v1, v2):
|
|
if torch.all(v1 == 0) or torch.all(v2 == 0):
|
|
return torch.tensor(0, device=v1.device)
|
|
with torch.no_grad():
|
|
v1 = v1.flatten()
|
|
v2 = v2.flatten()
|
|
v1_norm = (v1 ** 2).sum().sqrt()
|
|
v2_norm = (v2 ** 2).sum().sqrt()
|
|
angle = torch.arccos((torch.dot(v1, v2)) / (v1_norm * v2_norm))
|
|
return angle
|
|
|
|
def focus(self, model):
|
|
if not hasattr(model, '_gradient_direction_optimizer_finished') or model._gradient_direction_optimizer_finished:
|
|
all_params = list(filter(lambda t: '.weight' in t[0] and not hasattr(t[1].requires_grad, 'DO_NOT_TRAIN'),
|
|
list(model.named_parameters()))) # Extracts weight parameters. Who cares about biases anyways? :)
|
|
num_params = min(len(all_params), self.parameters_to_poll)
|
|
model._gradient_direction_optimizer_params = random.sample(all_params, num_params)
|
|
model._gradient_direction_optimizer_prior_directions = [0 for _ in range(num_params)]
|
|
model._gradient_direction_optimizer_stopped_decreasing = [False for _ in range(num_params)]
|
|
model._gradient_direction_optimizer_prior_grads = None
|
|
model._gradient_direction_optimizer_step = 0
|
|
model._gradient_direction_optimizer_finished = False
|
|
self.current_model = model
|
|
|
|
def should_step(self, it):
|
|
model = self.current_model
|
|
model._gradient_direction_optimizer_step += 1
|
|
cur_grads = [grad(p) for k, p in model._gradient_direction_optimizer_params]
|
|
for cg in cur_grads:
|
|
if torch.any(torch.isnan(cg)):
|
|
print("BSO: found NaN. Passing it off to the GradScaler..")
|
|
return True
|
|
if model._gradient_direction_optimizer_prior_grads is not None:
|
|
cur_dir = [self.vector_angle(lgrad, cgrad) for lgrad, cgrad in zip(model._gradient_direction_optimizer_prior_grads, cur_grads)]
|
|
delta_dir = [(cdir - ldir) for cdir, ldir in zip(cur_dir, model._gradient_direction_optimizer_prior_directions)]
|
|
model._gradient_direction_optimizer_prior_directions = cur_dir
|
|
model._gradient_direction_optimizer_stopped_decreasing = [sd or dd < 0 for sd, dd in zip(model._gradient_direction_optimizer_stopped_decreasing, delta_dir)]
|
|
all_finished = all(model._gradient_direction_optimizer_stopped_decreasing)
|
|
|
|
# For distributed optimizers, like ZeroRedundancyAdam, we need to reach a consensus as to whether or not to reduce.
|
|
if distributed.is_initialized() and distributed.get_world_size() > 1:
|
|
all_finished = torch.tensor(all_finished)
|
|
distributed.all_reduce(all_finished, ReduceOp.BAND)
|
|
all_finished = torch.all(all_finished)
|
|
|
|
if all_finished or model._gradient_direction_optimizer_step >= self.max_full_batches:
|
|
# <0 means the gradient direction is getting larger. Halt batch accumulation here.
|
|
model._gradient_direction_optimizer_finished = True
|
|
self.record_number_steps(model._gradient_direction_optimizer_step)
|
|
# Fix the gradients. We've accumulated _gradient_direction_optimizer_step steps total, so we need to divide the grads by that.
|
|
for p in model.parameters():
|
|
if p.requires_grad:
|
|
p.grad = p.grad / model._gradient_direction_optimizer_step
|
|
return True
|
|
model._gradient_direction_optimizer_prior_grads = cur_grads
|
|
return False
|
|
|
|
def record_number_steps(self, steps):
|
|
self.last_number_iterations[self.last_number_iterations_i] = steps
|
|
if self.last_number_iterations_i == self.last_number_iterations.shape[0]-1:
|
|
self.last_number_iterations_filled = True
|
|
self.last_number_iterations_i = (self.last_number_iterations_i + 1) % self.last_number_iterations.shape[0]
|
|
self.steps_taken += 1
|
|
|
|
def get_statistics(self):
|
|
res = {"batch_size_opt_total_steps": self.steps_taken}
|
|
if self.last_number_iterations_filled:
|
|
res["batch_size_opt_avg_iterations_per_step"] = self.last_number_iterations.mean().item()
|
|
else:
|
|
res["batch_size_opt_avg_iterations_per_step"] = self.last_number_iterations[:self.last_number_iterations_i].mean().item()
|
|
return res
|