From 008a1f5f8fe6ad8a45f524c11fc49f16ee47bdfe Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 11 Mar 2023 01:37:00 +0000
Subject: [PATCH] simplified spawning the training process by having it spawn
 the distributed training processes in the train.py script, so it should work
 on Windows too

---
 src/train.py | 17 +++++++----------
 src/utils.py | 12 +++++-------
 train.sh     | 11 +----------
 3 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/src/train.py b/src/train.py
index 4c5a6bb..39dee03 100755
--- a/src/train.py
+++ b/src/train.py
@@ -50,20 +50,18 @@ import torch
 import datetime
 from codes import train as tr
 from utils import util, options as option
+from torch.distributed.run import main
 
 # this is effectively just copy pasted and cleaned up from the __main__ section of training.py
 # I'll clean it up better
 
 def train(yaml, launcher='none'):
 	opt = option.parse(yaml, is_train=True)
-	if launcher != 'none':
-		# export CUDA_VISIBLE_DEVICES for running in distributed mode.
-		if 'gpu_ids' in opt.keys():
-			gpu_list = ','.join(str(x) for x in opt['gpu_ids'])
-			os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
-			print('export CUDA_VISIBLE_DEVICES=' + gpu_list)
-	trainer = tr.Trainer()
+	if launcher == 'none' and opt['gpus'] > 1:
+		return main([f"--nproc_per_node={opt['gpus']}", "--master_port=1234", "./src/train.py", "-opt", yaml, "--launcher=pytorch"])
+
+	trainer = tr.Trainer()
 
 	#### distributed training settings
 	if launcher == 'none': # disabled distributed training
 		opt['dist'] = False
@@ -82,13 +80,12 @@ def train(yaml, launcher='none'):
 	trainer.do_training()
 
 if __name__ == "__main__":
-	# simple check because I'm brain damaged and forgot I can't modify what a module exports by simply changing the booleans that decide what it exports after the fact
 	try:
 		import torch_intermediary
 		if torch_intermediary.OVERRIDE_ADAM:
-			print("Using BitsAndBytes ADAMW optimizations")
+			print("Using BitsAndBytes optimizations")
 		else:
-			print("NOT using BitsAndBytes ADAMW optimizations")
+			print("NOT using BitsAndBytes optimizations")
 	except Exception as e:
 		pass
 
diff --git a/src/utils.py b/src/utils.py
index d171571..fb39866 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -640,7 +640,7 @@ class TrainingState():
 		self.spawn_process(config_path=config_path, gpus=gpus)
 
 	def spawn_process(self, config_path, gpus=1):
-		self.cmd = ['train.bat', config_path] if os.name == "nt" else ['./train.sh', str(int(gpus)), config_path]
+		self.cmd = ['train.bat', config_path] if os.name == "nt" else ['./train.sh', config_path]
 		print("Spawning process: ", " ".join(self.cmd))
 		self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
 
@@ -671,8 +671,8 @@ class TrainingState():
 				self.it_rate = f'{"{:.3f}".format(1/it_rate)}it/s' if 0 < it_rate and it_rate < 1 else f'{"{:.3f}".format(it_rate)}s/it'
 				self.it_rates += it_rate
 
-				self.eta = (self.its - self.it) * (self.it_rates / self.its)
 				try:
+					self.eta = (self.its - self.it) * (self.it_rates / self.it)
 					eta = str(timedelta(seconds=int(self.eta)))
 					self.eta_hhmmss = eta
 				except Exception as e:
@@ -1218,20 +1218,18 @@ def optimize_training_settings( **kwargs ):
 		settings['gradient_accumulation_size'] = 1
 		messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {settings['gradient_accumulation_size']}")
-	"""
 	elif settings['batch_size'] % settings['gradient_accumulation_size'] != 0:
-		settings['gradient_accumulation_size'] = int(settings['batch_size'] / settings['gradient_accumulation_size'])
+		settings['gradient_accumulation_size'] -= settings['batch_size'] % settings['gradient_accumulation_size']
 		if settings['gradient_accumulation_size'] == 0:
 			settings['gradient_accumulation_size'] = 1
 		messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {settings['gradient_accumulation_size']}")
 
 	if settings['batch_size'] % settings['gpus'] != 0:
-		settings['batch_size'] = int(settings['batch_size'] / settings['gpus'])
+		settings['batch_size'] -= settings['batch_size'] % settings['gpus']
 		if settings['batch_size'] == 0:
 			settings['batch_size'] = 1
 		messages.append(f"Batch size not neatly divisible by GPU count, adjusting batch size to: {settings['batch_size']}")
-	"""
 
 	def get_device_batch_size( vram ):
@@ -1254,7 +1252,7 @@ def optimize_training_settings( **kwargs ):
 			settings['gpus'] = 1
 		else:
 			messages.append(f"! EXPERIMENTAL ! Multi-GPU training is extremely particular, expect issues.")
-	
+
 	# assuming you have equal GPUs
 	vram = get_device_vram() * settings['gpus']
 	batch_ratio = int(settings['batch_size'] / settings['gradient_accumulation_size'])
diff --git a/train.sh b/train.sh
index 7459658..5e83e27 100755
--- a/train.sh
+++ b/train.sh
@@ -1,13 +1,4 @@
 #!/bin/bash
 source ./venv/bin/activate
-
-GPUS=$1
-CONFIG=$2
-PORT=1234
-
-if (( $GPUS > 1 )); then
-	torchrun --nproc_per_node=$GPUS --master_port=$PORT ./src/train.py -opt "$CONFIG" --launcher=pytorch
-else
-	python3 ./src/train.py -opt "$CONFIG"
-fi
+python3 ./src/train.py -opt "$1"
 deactivate
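
A note on the train.py change above: single-GPU runs now go straight into tr.Trainer(), while multi-GPU runs re-launch the same script through torch.distributed.run (the module behind torchrun) instead of relying on a torchrun call in train.sh, which is why the shell script shrinks to a plain python3 invocation and the same flow can work on Windows through train.bat. The following is a minimal, self-contained sketch of that relaunch pattern only, not the project's actual code: GPUS, do_training(), and the use of __file__ are stand-ins for opt['gpus'], tr.Trainer(), and the hard-coded ./src/train.py path.

    # relaunch_sketch.py -- illustrative stand-in, not part of the patch
    import sys
    from torch.distributed.run import main as torchrun_main  # same entry point torchrun uses

    GPUS = 2  # stand-in for opt['gpus'] read from the YAML config

    def do_training():
        # stand-in for tr.Trainer() / trainer.do_training()
        print(f"training in one of {GPUS} worker processes")

    if __name__ == "__main__":
        if GPUS > 1 and "--launcher=pytorch" not in sys.argv:
            # not yet under a distributed launcher: re-invoke this same file via
            # torch.distributed.run, which spawns GPUS worker processes and sets up
            # the usual RANK/WORLD_SIZE/LOCAL_RANK environment for each of them
            torchrun_main([f"--nproc_per_node={GPUS}", "--master_port=1234", __file__, "--launcher=pytorch"])
        else:
            do_training()

In the patch itself the same check lives inside train() and is keyed off the launcher argument rather than sys.argv, so the webui can call train(yaml) directly and still get a multi-GPU launch when opt['gpus'] > 1.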