2023-02-17 16:29:27 +00:00
|
|
|
import os
|
|
|
|
import sys
|
2023-02-23 06:24:54 +00:00
|
|
|
import argparse
|
2023-02-26 01:57:56 +00:00
|
|
|
import yaml
|
2023-03-14 15:48:09 +00:00
|
|
|
import datetime
|
2023-02-24 23:13:13 +00:00
|
|
|
|
2023-03-14 15:48:09 +00:00
|
|
|
from torch.distributed.run import main as torchrun
|
2023-02-24 23:13:13 +00:00
|
|
|
|
2023-02-17 19:06:05 +00:00
|
|
|
# this is effectively just copy pasted and cleaned up from the __main__ section of training.py
|
2023-03-14 15:48:09 +00:00
|
|
|
def train(config_path, launcher='none'):
    """Run a training job described by the configuration file at ``config_path``.

    The configuration is parsed with ``option.parse``. When no launcher was
    requested but the configuration asks for more than one GPU, this script is
    re-launched through ``torchrun`` with ``--launcher=pytorch`` and that call's
    result is returned. Otherwise a ``tr.Trainer`` is built, set up for either
    single-process or distributed execution, and training is started.

    NOTE: ``option``, ``tr`` and ``torch`` are looked up as module globals. In
    this file they are only imported inside the ``__main__`` section, so they
    must already be bound when this function is called.

    Args:
        config_path: Path to the training configuration file.
        launcher: ``'none'`` for a plain single-process run; any other value
            takes the distributed path.

    Returns:
        Whatever ``torchrun`` returns when the job is re-launched, else ``None``.
    """
    settings = option.parse(config_path, is_train=True)
    standalone = launcher == 'none'

    # Several GPUs requested without a launcher: hand this same script over to
    # torchrun, one process per GPU.
    if standalone and settings['gpus'] > 1:
        torchrun_args = [
            f"--nproc_per_node={settings['gpus']}",
            "./src/train.py",
            "--yaml",
            config_path,
            "--launcher=pytorch",
        ]
        return torchrun(torchrun_args)

    runner = tr.Trainer()
    settings['dist'] = not standalone

    if standalone:
        runner.rank = -1
        gpu_ids = settings['gpu_ids']
        # Only pin the CUDA device when exactly one GPU id is configured.
        if len(gpu_ids) == 1:
            torch.cuda.set_device(gpu_ids[0])
        print('Disabled distributed training.')
    else:
        # Five-minute timeout for setting up the NCCL process group.
        tr.init_dist('nccl', timeout=datetime.timedelta(minutes=5))
        runner.world_size = torch.distributed.get_world_size()
        runner.rank = torch.distributed.get_rank()
        torch.cuda.set_device(torch.distributed.get_rank())

    runner.init(config_path, settings, launcher, '')
    runner.do_training()
|
|
|
|
|
2023-03-14 18:52:56 +00:00
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line, apply the configuration's
    # BitsAndBytes opt-out through environment variables, then import the
    # training stack and start training.
    parser = argparse.ArgumentParser()
    # nargs='+' lets a path containing spaces arrive as several tokens, which
    # are re-joined below. The default therefore has to be a list too: argparse
    # uses a default as-is, and " ".join() over a plain string would put a space
    # between every character of the path.
    parser.add_argument('--yaml', type=str, help='Path to training configuration file.', default=['./training/voice/train.yml'], nargs='+')
    parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='Job launcher')
    args = parser.parse_args()
    args.yaml = " ".join(args.yaml)
    config_path = args.yaml

    with open(config_path, 'r') as file:
        # An empty YAML document loads as None; fall back to an empty mapping
        # so the key lookup below does not raise.
        opt_config = yaml.safe_load(file) or {}

    # A falsy "bitsandbytes" entry in the configuration switches every
    # BitsAndBytes override off. This is done before torch_intermediary is
    # imported below; presumably that module reads these variables at import
    # time (TODO confirm).
    if "bitsandbytes" in opt_config and not opt_config["bitsandbytes"]:
        for override_flag in (
            'BITSANDBYTES_OVERRIDE_LINEAR',
            'BITSANDBYTES_OVERRIDE_EMBEDDING',
            'BITSANDBYTES_OVERRIDE_ADAM',
            'BITSANDBYTES_OVERRIDE_ADAMW',
        ):
            os.environ[override_flag] = '0'

    try:
        import torch_intermediary
        if torch_intermediary.OVERRIDE_ADAM:
            print("Using BitsAndBytes optimizations")
        else:
            print("NOT using BitsAndBytes optimizations")
    except Exception:
        # Best effort: this is only a status message, so a failure to import or
        # inspect torch_intermediary must not stop training.
        pass

    # These imports deliberately stay here, at module scope inside this block
    # and after the environment overrides above: train() resolves `torch`, `tr`
    # and `option` as module globals, so moving them into a helper function
    # would break it.
    import torch
    from dlas import train as tr
    from dlas.utils import util, options as option

    train(config_path, args.launcher)
|