Add oneAPI hacks for some training

Are they really hacks, though, if the existing code being "hacked" already had so many magic CUDA calls?
a-One-Fan 2023-05-04 21:12:25 +03:00
parent a4afad8837
commit ef5bd18f58
6 changed files with 42 additions and 14 deletions

View File

@@ -19,6 +19,8 @@ import dlas.trainer.losses as L
 from dlas.trainer.networks import register_model
 from dlas.utils.util import checkpoint, opt_get
+import os
 try:
     from apex import amp
@@ -26,7 +28,11 @@ try:
 except:
     APEX_AVAILABLE = False
-assert torch.cuda.is_available(), 'You need to have an Nvidia GPU with CUDA installed.'
+if os.environ.get("AIVC_TRAIN_ONEAPI"):
+    assert torch.xpu.is_available(), 'You have chosen to train with oneAPI, but no XPU is available.'
+else:
+    assert torch.cuda.is_available(), 'You need to have an Nvidia GPU with CUDA installed. Alternatively, you may train with oneAPI.'
 num_cores = multiprocessing.cpu_count()
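
The whole port is gated on a single environment variable, AIVC_TRAIN_ONEAPI. Only truthiness of the string returned by os.environ.get() is checked, so any non-empty value (even "0") selects the oneAPI path; the variable must be left unset to keep the CUDA path. A minimal sketch of the gate, assuming the XPU build of intel_extension_for_pytorch is installed:

    # Minimal sketch of the environment-variable gate used throughout this commit.
    # Only truthiness is checked, so any non-empty value (even "0") selects the
    # oneAPI path; unset the variable to fall back to CUDA.
    import os
    import torch

    if os.environ.get("AIVC_TRAIN_ONEAPI"):
        import intel_extension_for_pytorch  # registers the torch.xpu device backend
        assert torch.xpu.is_available(), 'You have chosen to train with oneAPI, but no XPU is available.'
    else:
        assert torch.cuda.is_available(), 'You need to have an Nvidia GPU with CUDA installed.'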

View File

@@ -55,8 +55,14 @@ def init_dist(backend, **kwargs):
     import torch.distributed as dist
     rank = int(os.environ['LOCAL_RANK'])
-    assert rank < torch.cuda.device_count()
-    torch.cuda.set_device(rank)
+    if os.environ.get("AIVC_TRAIN_ONEAPI"):
+        import intel_extension_for_pytorch
+        import oneccl_bindings_for_pytorch
+        assert rank < torch.xpu.device_count()
+        torch.xpu.set_device(rank)
+    else:
+        assert rank < torch.cuda.device_count()
+        torch.cuda.set_device(rank)
     dist.init_process_group(backend=backend, **kwargs)
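
oneccl_bindings_for_pytorch registers a "ccl" backend with torch.distributed, so whoever calls init_dist is expected to pass a different backend name on XPU than the usual "nccl". That call site is not part of this commit; the snippet below only illustrates the assumed pairing.

    # Hypothetical call site (not in this commit): choose the distributed backend
    # to match the device runtime. oneccl_bindings_for_pytorch provides "ccl";
    # NCCL remains the usual choice for CUDA.
    import os

    backend = "ccl" if os.environ.get("AIVC_TRAIN_ONEAPI") else "nccl"
    init_dist(backend)  # ends in dist.init_process_group(backend=backend, **kwargs)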

View File

@@ -70,6 +70,9 @@ class ExtensibleTrainer(BaseModel):
         self.auto_scale_basis = opt_get(
             opt, ['automatically_scale_base_layer_size'], 1024)
+        self.tdevice = "xpu:" + str(self.device) if os.environ.get("AIVC_TRAIN_ONEAPI") else "cuda:" + str(self.device)
+        self.tdevice = torch.device(self.tdevice)
         self.netsG = {}
         self.netsD = {}
         for name, net in opt['networks'].items():
@@ -84,12 +87,12 @@
             if net['type'] == 'generator':
                 if new_net is None:
                     new_net = networks.create_model(
-                        opt, net, self.netsG).to(self.device)
+                        opt, net, self.netsG).to(self.tdevice)
                 self.netsG[name] = new_net
             elif net['type'] == 'discriminator':
                 if new_net is None:
                     new_net = networks.create_model(
-                        opt, net, self.netsD).to(self.device)
+                        opt, net, self.netsD).to(self.tdevice)
                 self.netsD[name] = new_net
             else:
                 raise NotImplementedError(
@@ -155,8 +158,9 @@
                 # Do NOT be tempted to put find_unused_parameters=True here. It will not work when checkpointing is
                 # used and in a few other cases. But you can try it if you really want.
-                dnet = DistributedDataParallel(anet, device_ids=[torch.cuda.current_device()],
-                                               output_device=torch.cuda.current_device(),
+                dev_id = torch.xpu.current_device() if os.environ.get("AIVC_TRAIN_ONEAPI") else torch.cuda.current_device()
+                dnet = DistributedDataParallel(anet, device_ids=[dev_id], output_device=dev_id,
                                                find_unused_parameters=opt_get(opt, ['ddp_find_unused_parameters'], False))
                 # DDP graphs cannot be used with gradient checkpointing unless you use find_unused_parameters=True,
                 # which does not work with this trainer (as stated above). However, if the graph is not subject
@@ -241,7 +245,7 @@
             else:
                 v = v[sort_indices]
             if isinstance(v, torch.Tensor):
-                self.dstate[k] = [t.to(self.device) for t in torch.chunk(
+                self.dstate[k] = [t.to(self.tdevice) for t in torch.chunk(
                     v, chunks=batch_factor, dim=0)]
         if opt_get(self.opt, ['train', 'auto_collate'], False):
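
self.tdevice wraps the integer device index that BaseModel stores in self.device into an explicit torch.device, so the existing .to(...) calls become backend-agnostic. A rough sketch of the construction, assuming self.device is an integer index as returned by torch.cuda.current_device() / torch.xpu.current_device():

    # Sketch of the self.tdevice construction. Moving modules to an XPU device
    # additionally requires the XPU build of intel_extension_for_pytorch.
    import os
    import torch

    device_index = 0  # stand-in for self.device
    prefix = "xpu:" if os.environ.get("AIVC_TRAIN_ONEAPI") else "cuda:"
    tdevice = torch.device(prefix + str(device_index))  # e.g. device(type='xpu', index=0)

    net = torch.nn.Linear(4, 4).to(tdevice)  # same pattern as networks.create_model(...).to(self.tdevice)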

View File

@@ -17,8 +17,11 @@ class BaseModel():
             self.rank = torch.distributed.get_rank()
         else:
             self.rank = -1  # non dist training
-        self.device = torch.cuda.current_device(
-        ) if opt['gpu_ids'] else torch.device('cpu')
+        if os.environ.get("AIVC_TRAIN_ONEAPI"):
+            self.device = torch.xpu.current_device()
+        else:
+            self.device = torch.cuda.current_device() if opt['gpu_ids'] else torch.device('cpu')
         self.amp_level = 'O0' if opt['amp_opt_level'] is None else opt['amp_opt_level']
         self.is_train = opt['is_train']
         self.opt_in_cpu = opt_get(opt, ['keep_optimizer_states_on_cpu'], False)

View File

@@ -9,6 +9,8 @@ from dlas.trainer.inject import Injector
 from dlas.utils.music_utils import get_music_codegen
 from dlas.utils.util import load_model_from_config, opt_get, pad_or_truncate
+import os
 MEL_MIN = -11.512925148010254
 TACOTRON_MEL_MAX = 2.3143386840820312
 TORCH_MEL_MAX = 4.82  # FYI: this STILL isn't assertive enough...
@@ -185,8 +187,9 @@ class DiscreteTokenInjector(Injector):
         cfg = opt_get(
             opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
         dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
+        devstr = "xpu:" if os.environ.get("AIVC_TRAIN_ONEAPI") else "cuda:"
         self.dvae = load_model_from_config(
-            cfg, dvae_name, device=f'cuda:{env["device"]}').eval()
+            cfg, dvae_name, device=devstr + str(env["device"])).eval()
 
     def forward(self, state):
         inp = state[self.input]
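
The device string handed to load_model_from_config is now built from a backend prefix plus the device index exposed to injectors through env["device"]. A purely illustrative, equivalent one-liner (the env dict below is a stand-in):

    # Illustrative equivalent of devstr + str(env["device"]); env["device"] is
    # assumed to be the integer device index the trainer passes to injectors.
    import os

    env = {"device": 0}  # stand-in for the injector environment
    device = f'{"xpu" if os.environ.get("AIVC_TRAIN_ONEAPI") else "cuda"}:{env["device"]}'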

View File

@@ -533,10 +533,16 @@ def load_model_from_config(cfg_file=None, model_name=None, also_load_savepoint=T
 # Mapper for torch.load() that maps cuda devices to the correct CUDA device, but leaves CPU devices alone.
 def map_cuda_to_correct_device(storage, loc):
-    if str(loc).startswith('cuda'):
-        return storage.cuda(torch.cuda.current_device())
+    if os.environ.get("AIVC_TRAIN_ONEAPI"):
+        if str(loc).startswith('xpu'):
+            return storage.xpu(torch.xpu.current_device())
+        else:
+            return storage.cpu()
     else:
-        return storage.cpu()
+        if str(loc).startswith('cuda'):
+            return storage.cuda(torch.cuda.current_device())
+        else:
+            return storage.cpu()
 
 def list_to_device(l, dev):
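
map_cuda_to_correct_device is the map_location callable handed to torch.load(): under the CUDA path it pulls "cuda"-tagged storages onto the local CUDA device, under the oneAPI path it pulls "xpu"-tagged storages onto the local XPU, and anything else lands on the CPU. A usage sketch (the checkpoint path is made up):

    # torch.load() accepts a callable map_location with the signature
    # (storage, location) -> storage, which is what map_cuda_to_correct_device is.
    import torch

    state = torch.load("../experiments/example_checkpoint.pth",  # illustrative path
                       map_location=map_cuda_to_correct_device)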