Move log consensus to train for efficiency

pull/9/head
James Betker 2022-03-04 13:41:32 +07:00
parent ce6dfdf255
commit e1052a5e32
3 changed files with 15 additions and 5 deletions

@@ -195,11 +195,14 @@ class Trainer:
         #### log
         if self.dataset_debugger is not None:
             self.dataset_debugger.update(train_data)
         if will_log:
+            # Must be run by all instances to gather consensus.
+            current_model_logs = self.model.get_current_log(self.current_step)
+        if will_log and self.rank <= 0:
             logs = {'step': self.current_step,
                     'samples': self.total_training_data_encountered,
                     'megasamples': self.total_training_data_encountered / 1000000}
-            logs.update(self.model.get_current_log(self.current_step))
+            logs.update(current_model_logs)
             if self.dataset_debugger is not None:
                 logs.update(self.dataset_debugger.get_debugging_map())
             logs.update(gradient_norms_dict)

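Note: the get_current_log call is hoisted out of the rank gate because it now performs a collective all_reduce (see the next hunk), and collectives must be entered by every rank or the ranks that do call them block indefinitely. A minimal sketch of that constraint, assuming a standard torch.distributed setup (the helper name is hypothetical, not part of this repo):

    import torch
    import torch.distributed as distributed

    def consensus(value: torch.Tensor) -> torch.Tensor:
        # all_reduce is a collective op: every rank in the process group must
        # reach this call. Gating it behind `rank <= 0` would deadlock, which
        # is why the hunk above calls get_current_log on all ranks and only
        # gates the actual log emission.
        if distributed.is_available() and distributed.is_initialized():
            distributed.all_reduce(value, op=distributed.ReduceOp.SUM)
            value = value / distributed.get_world_size()
        return value
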
@@ -447,6 +447,17 @@ class ExtensibleTrainer(BaseModel):
         # The batch size optimizer also outputs loggable data.
         log.update(self.batch_size_optimizer.get_statistics())
+        # In distributed mode, get agreement on all single tensors.
+        if distributed.is_available() and distributed.is_initialized():
+            for k, v in log.items():
+                if not isinstance(v, torch.Tensor):
+                    continue
+                if len(v.shape) != 1 or v.dtype != torch.float:
+                    continue
+                distributed.all_reduce(v, op=distributed.ReduceOp.SUM)
+                log[k] = v / distributed.get_world_size()
         return log
 
     def get_current_visuals(self, need_GT=True):

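The reduction above uses ReduceOp.SUM followed by a division by the world size, which is just the mean of the per-rank values, so every rank ends up holding the same number before rank 0 logs it. A standalone check of that arithmetic (no process group needed; the example values are made up):

    import torch

    # Hypothetical per-rank metric values for a world size of 2.
    per_rank = [torch.tensor([0.4]), torch.tensor([0.6])]
    world_size = len(per_rank)

    # all_reduce(SUM) leaves the sum on every rank; dividing by world_size
    # turns it into the cross-rank mean.
    reduced = sum(per_rank) / world_size
    assert torch.allclose(reduced, torch.tensor([0.5]))
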
@@ -22,10 +22,6 @@ class LossAccumulator:
         if '_histogram' in name:
             buf[i] = torch.flatten(tensor.detach().cpu())
         elif isinstance(tensor, torch.Tensor):
-            if distributed.is_available() and distributed.is_initialized():
-                # Gather the metric from all devices before storing it locally.
-                distributed.all_reduce(tensor, op=distributed.ReduceOp.SUM)
-                tensor /= distributed.get_world_size()
             buf[i] = tensor.detach().cpu()
         else:
             buf[i] = tensor
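
With the accumulator-level reduce removed, the collective appears to run only inside get_current_log, i.e. once per metric per logging event instead of once per metric every time a loss is accumulated, which seems to be the efficiency gain the commit title refers to. Rough arithmetic with made-up numbers (not taken from the repo):

    # Hypothetical configuration: log every 100 steps, 8 scalar metrics.
    steps_between_logs = 100
    metrics = 8

    all_reduces_before = metrics * steps_between_logs  # one per metric per training step
    all_reduces_after = metrics                        # one per metric per logging event
    print(all_reduces_before, all_reduces_after)       # 800 vs 8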