Fix some distributed training snafus
This commit is contained in:
parent
11a9e223a6
commit
2a3eec8fd7
|
@ -202,6 +202,7 @@ class DiscriminatorGanLoss(ConfigurableLoss):
|
||||||
# generators and discriminators by essentially having them skip steps while their counterparts "catch up".
|
# generators and discriminators by essentially having them skip steps while their counterparts "catch up".
|
||||||
self.min_loss = opt['min_loss'] if 'min_loss' in opt.keys() else 0
|
self.min_loss = opt['min_loss'] if 'min_loss' in opt.keys() else 0
|
||||||
if self.min_loss != 0:
|
if self.min_loss != 0:
|
||||||
|
assert self.env['rank'] == 0 # distributed training does not support 'min_loss' - it can result in backward() desync by design.
|
||||||
self.loss_rotating_buffer = torch.zeros(10, requires_grad=False)
|
self.loss_rotating_buffer = torch.zeros(10, requires_grad=False)
|
||||||
self.rb_ptr = 0
|
self.rb_ptr = 0
|
||||||
self.losses_computed = 0
|
self.losses_computed = 0
|
||||||
|
|
|
@ -126,48 +126,49 @@ class ConfigurableStep(Module):
|
||||||
self.env['current_step_optimizers'] = self.optimizers
|
self.env['current_step_optimizers'] = self.optimizers
|
||||||
self.env['training'] = train
|
self.env['training'] = train
|
||||||
|
|
||||||
# Inject in any extra dependencies.
|
with self.get_network_for_name(self.get_networks_trained()[0]).join():
|
||||||
for inj in self.injectors:
|
# Inject in any extra dependencies.
|
||||||
# Don't do injections tagged with eval unless we are not in train mode.
|
for inj in self.injectors:
|
||||||
if train and 'eval' in inj.opt.keys() and inj.opt['eval']:
|
# Don't do injections tagged with eval unless we are not in train mode.
|
||||||
continue
|
if train and 'eval' in inj.opt.keys() and inj.opt['eval']:
|
||||||
# Likewise, don't do injections tagged with train unless we are not in eval.
|
|
||||||
if not train and 'train' in inj.opt.keys() and inj.opt['train']:
|
|
||||||
continue
|
|
||||||
# Don't do injections tagged with 'after' or 'before' when we are out of spec.
|
|
||||||
if 'after' in inj.opt.keys() and self.env['step'] < inj.opt['after'] or \
|
|
||||||
'before' in inj.opt.keys() and self.env['step'] > inj.opt['before']:
|
|
||||||
continue
|
|
||||||
injected = inj(local_state)
|
|
||||||
local_state.update(injected)
|
|
||||||
new_state.update(injected)
|
|
||||||
|
|
||||||
if train and len(self.losses) > 0:
|
|
||||||
# Finally, compute the losses.
|
|
||||||
total_loss = 0
|
|
||||||
for loss_name, loss in self.losses.items():
|
|
||||||
# Some losses only activate after a set number of steps. For example, proto-discriminator losses can
|
|
||||||
# be very disruptive to a generator.
|
|
||||||
if 'after' in loss.opt.keys() and loss.opt['after'] > self.env['step']:
|
|
||||||
continue
|
continue
|
||||||
l = loss(self.training_net, local_state)
|
# Likewise, don't do injections tagged with train unless we are not in eval.
|
||||||
total_loss += l * self.weights[loss_name]
|
if not train and 'train' in inj.opt.keys() and inj.opt['train']:
|
||||||
# Record metrics.
|
continue
|
||||||
if isinstance(l, torch.Tensor):
|
# Don't do injections tagged with 'after' or 'before' when we are out of spec.
|
||||||
self.loss_accumulator.add_loss(loss_name, l)
|
if 'after' in inj.opt.keys() and self.env['step'] < inj.opt['after'] or \
|
||||||
for n, v in loss.extra_metrics():
|
'before' in inj.opt.keys() and self.env['step'] > inj.opt['before']:
|
||||||
self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)
|
continue
|
||||||
loss.clear_metrics()
|
injected = inj(local_state)
|
||||||
|
local_state.update(injected)
|
||||||
|
new_state.update(injected)
|
||||||
|
|
||||||
# In some cases, the loss could not be set (e.g. all losses have 'after'
|
if train and len(self.losses) > 0:
|
||||||
if isinstance(total_loss, torch.Tensor):
|
# Finally, compute the losses.
|
||||||
self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)
|
total_loss = 0
|
||||||
# Scale the loss down by the accumulation factor.
|
for loss_name, loss in self.losses.items():
|
||||||
total_loss = total_loss / self.env['mega_batch_factor']
|
# Some losses only activate after a set number of steps. For example, proto-discriminator losses can
|
||||||
|
# be very disruptive to a generator.
|
||||||
|
if 'after' in loss.opt.keys() and loss.opt['after'] > self.env['step']:
|
||||||
|
continue
|
||||||
|
l = loss(self.training_net, local_state)
|
||||||
|
total_loss += l * self.weights[loss_name]
|
||||||
|
# Record metrics.
|
||||||
|
if isinstance(l, torch.Tensor):
|
||||||
|
self.loss_accumulator.add_loss(loss_name, l)
|
||||||
|
for n, v in loss.extra_metrics():
|
||||||
|
self.loss_accumulator.add_loss("%s_%s" % (loss_name, n), v)
|
||||||
|
loss.clear_metrics()
|
||||||
|
|
||||||
# Get dem grads!
|
# In some cases, the loss could not be set (e.g. all losses have 'after')
|
||||||
self.scaler.scale(total_loss).backward()
|
if isinstance(total_loss, torch.Tensor):
|
||||||
self.grads_generated = True
|
self.loss_accumulator.add_loss("%s_total" % (self.get_training_network_name(),), total_loss)
|
||||||
|
# Scale the loss down by the accumulation factor.
|
||||||
|
total_loss = total_loss / self.env['mega_batch_factor']
|
||||||
|
|
||||||
|
# Get dem grads!
|
||||||
|
self.scaler.scale(total_loss).backward()
|
||||||
|
self.grads_generated = True
|
||||||
|
|
||||||
# Detach all state variables. Within the step, gradients can flow. Once these variables leave the step
|
# Detach all state variables. Within the step, gradients can flow. Once these variables leave the step
|
||||||
# we must release the gradients.
|
# we must release the gradients.
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
numpy
|
numpy
|
||||||
opencv-python
|
opencv-python
|
||||||
lmdb
|
|
||||||
pyyaml
|
pyyaml
|
||||||
tb-nightly
|
tb-nightly
|
||||||
future
|
future
|
||||||
|
|
|
@ -46,7 +46,7 @@ class Trainer:
|
||||||
|
|
||||||
else:
|
else:
|
||||||
opt['dist'] = True
|
opt['dist'] = True
|
||||||
self.init_dist()
|
self.init_dist('nccl')
|
||||||
world_size = torch.distributed.get_world_size()
|
world_size = torch.distributed.get_world_size()
|
||||||
self.rank = torch.distributed.get_rank()
|
self.rank = torch.distributed.get_rank()
|
||||||
|
|
||||||
|
@ -117,11 +117,11 @@ class Trainer:
|
||||||
total_iters = int(opt['train']['niter'])
|
total_iters = int(opt['train']['niter'])
|
||||||
self.total_epochs = int(math.ceil(total_iters / train_size))
|
self.total_epochs = int(math.ceil(total_iters / train_size))
|
||||||
if opt['dist']:
|
if opt['dist']:
|
||||||
train_sampler = DistIterSampler(self.train_set, world_size, self.rank, dataset_ratio)
|
self.train_sampler = DistIterSampler(self.train_set, world_size, self.rank, dataset_ratio)
|
||||||
self.total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
|
self.total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
|
||||||
else:
|
else:
|
||||||
train_sampler = None
|
self.train_sampler = None
|
||||||
self.train_loader = create_dataloader(self.train_set, dataset_opt, opt, train_sampler)
|
self.train_loader = create_dataloader(self.train_set, dataset_opt, opt, self.train_sampler)
|
||||||
if self.rank <= 0:
|
if self.rank <= 0:
|
||||||
self.logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
|
self.logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
|
||||||
len(self.train_set), train_size))
|
len(self.train_set), train_size))
|
||||||
|
@ -284,6 +284,7 @@ if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_prog_imgset_multifaceted_chained.yml')
|
parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_prog_imgset_multifaceted_chained.yml')
|
||||||
parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
|
parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
|
||||||
|
parser.add_argument('--local_rank', type=int, default=0)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
opt = option.parse(args.opt, is_train=True)
|
opt = option.parse(args.opt, is_train=True)
|
||||||
trainer = Trainer()
|
trainer = Trainer()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user