ai-voice-cloning/src/train.py

71 lines
2.5 KiB
Python
Raw Normal View History

import os
import sys
2023-02-23 06:24:54 +00:00
import argparse
import yaml
2023-03-14 15:48:09 +00:00
import datetime
2023-03-14 15:48:09 +00:00
from torch.distributed.run import main as torchrun
2023-03-14 15:48:09 +00:00
# This file is a launch script, not a library: it parses sys.argv and starts
# training at module level, so refuse to run when it is imported.
if __name__ != "__main__":
    raise Exception("Do not invoke this from an import")
2023-03-14 15:48:09 +00:00
# Command-line interface.
# `--yaml` uses nargs='+', so argparse collects every token that follows the
# flag into a list; the tokens are re-joined with single spaces below, which
# lets a config path containing spaces be passed without quoting.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--yaml',
    type=str,
    nargs='+',
    # The default has to be a list as well: with a plain string default the
    # " ".join() below would iterate the string itself and insert a space
    # between every character of the path.
    default=['./training/voice/train.yml'],
    help='Path to training configuration file.',
)
parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='Job launcher')
args = parser.parse_args()

# Collapse the token list back into one path string; both `args.yaml` and
# `config_path` hold that string from here on.
args.yaml = " ".join(args.yaml)
config_path = args.yaml
2023-03-14 15:48:09 +00:00
# Read the raw YAML configuration once up front; `opt_config` is consulted
# below for the `bitsandbytes` switch.
with open(config_path, mode='r') as config_stream:
    opt_config = yaml.safe_load(config_stream)
2023-03-14 15:48:09 +00:00
# DLAS cannot be installed as a pip package, so its source directories are
# placed at the front of the import path instead (paths are relative to the
# current working directory).
# Resulting search order: './modules/dlas/' first, then './modules/dlas/codes/'.
sys.path[:0] = ['./modules/dlas/', './modules/dlas/codes/']
2023-03-14 15:48:09 +00:00
# An explicit, falsy `bitsandbytes` entry in the config switches every
# BitsAndBytes override flag to '0'. A missing key leaves the environment
# untouched. This is done ahead of the torch/DLAS imports that follow,
# presumably because the flags are read at import time (TODO confirm).
if "bitsandbytes" in opt_config and not opt_config["bitsandbytes"]:
    os.environ.update(dict.fromkeys(
        (
            'BITSANDBYTES_OVERRIDE_LINEAR',
            'BITSANDBYTES_OVERRIDE_EMBEDDING',
            'BITSANDBYTES_OVERRIDE_ADAM',
            'BITSANDBYTES_OVERRIDE_ADAMW',
        ),
        '0',
    ))
2023-02-23 06:24:54 +00:00
import torch
from codes import train as tr
from utils import util, options as option
# this is effectively just copy pasted and cleaned up from the __main__ section of training.py
2023-03-14 15:48:09 +00:00
def train(config_path, launcher='none'):
    """Run the training job described by the YAML file at `config_path`.

    With ``launcher='none'`` and more than one GPU requested in the parsed
    options, this script is re-launched through torchrun (``nproc_per_node``
    set to ``opt['gpus']``, each process started with ``--launcher=pytorch``)
    and torchrun's result is returned. In every other case a Trainer is set
    up and trained in the current process and nothing is returned.
    """
    opt = option.parse(config_path, is_train=True)
    standalone = launcher == 'none'

    # Plain invocation asking for several GPUs: hand off to torchrun, which
    # calls back into this script with the pytorch launcher selected.
    if standalone and opt['gpus'] > 1:
        relaunch_argv = [
            f"--nproc_per_node={opt['gpus']}",
            "./src/train.py",
            "--yaml",
            config_path,
            "--launcher=pytorch",
        ]
        return torchrun(relaunch_argv)

    trainer = tr.Trainer()

    if not standalone:
        # Started by a distributed launcher: initialise distributed mode with
        # the 'nccl' backend (5 minute timeout), then record world size and
        # rank on the trainer and select the CUDA device by rank.
        opt['dist'] = True
        tr.init_dist('nccl', timeout=datetime.timedelta(minutes=5))
        trainer.world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        trainer.rank = rank
        torch.cuda.set_device(rank)
    else:
        # Single-process run: rank -1 marks the trainer as non-distributed.
        opt['dist'] = False
        trainer.rank = -1
        gpu_ids = opt['gpu_ids']
        # Only pin a device when exactly one GPU id is configured.
        if len(gpu_ids) == 1:
            torch.cuda.set_device(gpu_ids[0])
        print('Disabled distributed training.')

    trainer.init(config_path, opt, launcher, '')
    trainer.do_training()
2023-03-14 15:48:09 +00:00
# Best-effort status report: say whether the torch_intermediary shim has the
# Adam override enabled. Any failure here (module missing, attribute missing,
# import-time error) is deliberately ignored so that it never blocks training.
try:
    import torch_intermediary
    print(
        "Using BitsAndBytes optimizations"
        if torch_intermediary.OVERRIDE_ADAM
        else "NOT using BitsAndBytes optimizations"
    )
except Exception:
    pass
2023-03-14 15:48:09 +00:00
# Script entry point: start training with the settings parsed from the
# command line above.
train(config_path, launcher=args.launcher)