forked from mrq/DL-Art-School
Add oneAPI hacks for some training
Are they really hacks, though, if the existing thing being "hacked" had so much magic CUDA in it?
This commit is contained in:
parent a4afad8837
commit ef5bd18f58
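Every hunk below follows the same pattern: a CUDA-specific call is gated behind the AIVC_TRAIN_ONEAPI environment variable and swapped for its torch.xpu equivalent (provided by intel_extension_for_pytorch) when the variable is set. As a standalone illustration only, this helper is hypothetical and not code from the repository:

import os

import torch


def pick_device() -> torch.device:
    # Hypothetical helper mirroring the env-var toggle used throughout this diff.
    if os.environ.get("AIVC_TRAIN_ONEAPI"):
        import intel_extension_for_pytorch  # noqa: F401  (provides/extends torch.xpu)
        assert torch.xpu.is_available(), 'AIVC_TRAIN_ONEAPI is set, but no XPU is available.'
        return torch.device("xpu:" + str(torch.xpu.current_device()))
    assert torch.cuda.is_available(), 'No CUDA device is available (and oneAPI was not requested).'
    return torch.device("cuda:" + str(torch.cuda.current_device()))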
@@ -19,6 +19,8 @@ import dlas.trainer.losses as L
 from dlas.trainer.networks import register_model
 from dlas.utils.util import checkpoint, opt_get
 
+import os
+
 try:
     from apex import amp
@@ -26,7 +28,11 @@ try:
 except:
     APEX_AVAILABLE = False
 
-assert torch.cuda.is_available(), 'You need to have an Nvidia GPU with CUDA installed.'
+if os.environ.get("AIVC_TRAIN_ONEAPI"):
+    assert torch.xpu.is_available(), 'You have chosen to train with oneAPI, but no XPU is available.'
+else:
+    assert torch.cuda.is_available(), 'You need to have an Nvidia GPU with CUDA installed. Alternatively, you may train with oneAPI.'
 
 num_cores = multiprocessing.cpu_count()
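Because os.environ.get() only checks that the variable is non-empty, any value opts into the XPU branch, and it has to be set before this module runs its import-time asserts. A hedged, hypothetical launcher snippet:

import os

os.environ["AIVC_TRAIN_ONEAPI"] = "1"  # any non-empty value takes the XPU branch

import torch
import intel_extension_for_pytorch  # noqa: F401  (enables torch.xpu on older PyTorch builds)

print("XPU available:", torch.xpu.is_available())
print("XPU devices:", torch.xpu.device_count())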
@@ -55,8 +55,14 @@ def init_dist(backend, **kwargs):
     import torch.distributed as dist
 
     rank = int(os.environ['LOCAL_RANK'])
-    assert rank < torch.cuda.device_count()
-    torch.cuda.set_device(rank)
+    if os.environ.get("AIVC_TRAIN_ONEAPI"):
+        import intel_extension_for_pytorch
+        import oneccl_bindings_for_pytorch
+        assert rank < torch.xpu.device_count()
+        torch.xpu.set_device(rank)
+    else:
+        assert rank < torch.cuda.device_count()
+        torch.cuda.set_device(rank)
     dist.init_process_group(backend=backend, **kwargs)
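The oneAPI branch relies on oneccl_bindings_for_pytorch registering a "ccl" backend with torch.distributed; the backend string actually passed by callers is not visible in this diff, so the "ccl" below is an assumption. A minimal sketch of the XPU side of this initialization:

import os

import torch
import torch.distributed as dist


def init_dist_xpu():
    import intel_extension_for_pytorch  # noqa: F401
    import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

    rank = int(os.environ['LOCAL_RANK'])
    assert rank < torch.xpu.device_count()
    torch.xpu.set_device(rank)
    # NCCL is CUDA-only; oneCCL ("ccl") is the usual choice for XPU collectives.
    dist.init_process_group(backend="ccl")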
@@ -70,6 +70,9 @@ class ExtensibleTrainer(BaseModel):
         self.auto_scale_basis = opt_get(
             opt, ['automatically_scale_base_layer_size'], 1024)
 
+        self.tdevice = "xpu:" + str(self.device) if os.environ.get("AIVC_TRAIN_ONEAPI") else "cuda:" + str(self.device)
+        self.tdevice = torch.device(self.tdevice)
+
         self.netsG = {}
         self.netsD = {}
         for name, net in opt['networks'].items():
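self.device on this path is an integer device index (see the BaseModel hunk further down), so the concatenation yields a string like "xpu:0" or "cuda:0" before torch.device() parses it. A tiny illustration with made-up values:

import torch

device_index = 0   # what torch.cuda / torch.xpu current_device() returns
use_oneapi = True  # stands in for os.environ.get("AIVC_TRAIN_ONEAPI")

tdevice = ("xpu:" if use_oneapi else "cuda:") + str(device_index)
tdevice = torch.device(tdevice)
print(tdevice)     # -> xpu:0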
@@ -84,12 +87,12 @@ class ExtensibleTrainer(BaseModel):
             if net['type'] == 'generator':
                 if new_net is None:
                     new_net = networks.create_model(
-                        opt, net, self.netsG).to(self.device)
+                        opt, net, self.netsG).to(self.tdevice)
                 self.netsG[name] = new_net
             elif net['type'] == 'discriminator':
                 if new_net is None:
                     new_net = networks.create_model(
-                        opt, net, self.netsD).to(self.device)
+                        opt, net, self.netsD).to(self.tdevice)
                 self.netsD[name] = new_net
             else:
                 raise NotImplementedError(
@@ -155,8 +158,9 @@ class ExtensibleTrainer(BaseModel):
 
                 # Do NOT be tempted to put find_unused_parameters=True here. It will not work when checkpointing is
                 # used and in a few other cases. But you can try it if you really want.
-                dnet = DistributedDataParallel(anet, device_ids=[torch.cuda.current_device()],
-                                               output_device=torch.cuda.current_device(),
+                dev_id = torch.xpu.current_device() if os.environ.get("AIVC_TRAIN_ONEAPI") else torch.cuda.current_device()
+                dnet = DistributedDataParallel(anet, device_ids=[dev_id], output_device=dev_id,
                                                find_unused_parameters=opt_get(opt, ['ddp_find_unused_parameters'], False))
                 # DDP graphs cannot be used with gradient checkpointing unless you use find_unused_parameters=True,
                 # which does not work with this trainer (as stated above). However, if the graph is not subject
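Outside the trainer, the same wrapping pattern looks roughly like the sketch below. It assumes the process group has already been initialized (as in init_dist above); the model name is a placeholder, not a trainer internal:

import os

import torch
from torch.nn.parallel import DistributedDataParallel


def wrap_ddp(model: torch.nn.Module) -> DistributedDataParallel:
    use_oneapi = bool(os.environ.get("AIVC_TRAIN_ONEAPI"))
    dev_id = torch.xpu.current_device() if use_oneapi else torch.cuda.current_device()
    model = model.to(torch.device(("xpu:" if use_oneapi else "cuda:") + str(dev_id)))
    # find_unused_parameters stays False: the trainer comments above explain that it
    # breaks gradient checkpointing in this codebase.
    return DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id,
                                   find_unused_parameters=False)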
@@ -241,7 +245,7 @@ class ExtensibleTrainer(BaseModel):
             else:
                 v = v[sort_indices]
             if isinstance(v, torch.Tensor):
-                self.dstate[k] = [t.to(self.device) for t in torch.chunk(
+                self.dstate[k] = [t.to(self.tdevice) for t in torch.chunk(
                     v, chunks=batch_factor, dim=0)]
 
         if opt_get(self.opt, ['train', 'auto_collate'], False):
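For reference, the rewritten line splits each batch tensor into batch_factor chunks along dim 0 and moves every chunk to the training device; a CPU-only illustration with made-up shapes:

import torch

batch_factor = 2
tdevice = torch.device("cpu")  # stands in for self.tdevice ("xpu:N" or "cuda:N")

v = torch.randn(8, 16)  # a batch of 8 items
chunks = [t.to(tdevice) for t in torch.chunk(v, chunks=batch_factor, dim=0)]
print([c.shape for c in chunks])  # -> [torch.Size([4, 16]), torch.Size([4, 16])]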
@@ -17,8 +17,11 @@ class BaseModel():
             self.rank = torch.distributed.get_rank()
         else:
             self.rank = -1  # non dist training
-        self.device = torch.cuda.current_device(
-        ) if opt['gpu_ids'] else torch.device('cpu')
+        if os.environ.get("AIVC_TRAIN_ONEAPI"):
+            self.device = torch.xpu.current_device()
+        else:
+            self.device = torch.cuda.current_device() if opt['gpu_ids'] else torch.device('cpu')
         self.amp_level = 'O0' if opt['amp_opt_level'] is None else opt['amp_opt_level']
         self.is_train = opt['is_train']
         self.opt_in_cpu = opt_get(opt, ['keep_optimizer_states_on_cpu'], False)
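Worth noting: torch.cuda.current_device() / torch.xpu.current_device() return a plain integer index, while the CPU fallback keeps a torch.device object, which is what the tdevice string concatenation earlier depends on. A quick check (no accelerator required for the first half):

import torch

cpu_fallback = torch.device('cpu')
print(type(cpu_fallback))     # <class 'torch.device'>

if torch.cuda.is_available():
    idx = torch.cuda.current_device()
    print(type(idx), idx)     # <class 'int'> 0 -- an index, not a torch.device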
@@ -9,6 +9,8 @@ from dlas.trainer.inject import Injector
 from dlas.utils.music_utils import get_music_codegen
 from dlas.utils.util import load_model_from_config, opt_get, pad_or_truncate
 
+import os
+
 MEL_MIN = -11.512925148010254
 TACOTRON_MEL_MAX = 2.3143386840820312
 TORCH_MEL_MAX = 4.82  # FYI: this STILL isn't assertive enough...
@@ -185,8 +187,9 @@ class DiscreteTokenInjector(Injector):
         cfg = opt_get(
             opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
         dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
+        devstr = "xpu:" if os.environ.get("AIVC_TRAIN_ONEAPI") else "cuda:"
         self.dvae = load_model_from_config(
-            cfg, dvae_name, device=f'cuda:{env["device"]}').eval()
+            cfg, dvae_name, device=devstr + str(env["device"])).eval()
 
     def forward(self, state):
         inp = state[self.input]
@@ -533,10 +533,16 @@ def load_model_from_config(cfg_file=None, model_name=None, also_load_savepoint=T
 
 # Mapper for torch.load() that maps cuda devices to the correct CUDA device, but leaves CPU devices alone.
 def map_cuda_to_correct_device(storage, loc):
-    if str(loc).startswith('cuda'):
-        return storage.cuda(torch.cuda.current_device())
+    if os.environ.get("AIVC_TRAIN_ONEAPI"):
+        if str(loc).startswith('xpu'):
+            return storage.xpu(torch.xpu.current_device())
+        else:
+            return storage.cpu()
     else:
-        return storage.cpu()
+        if str(loc).startswith('cuda'):
+            return storage.cuda(torch.cuda.current_device())
+        else:
+            return storage.cpu()
 
 
 def list_to_device(l, dev):
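map_cuda_to_correct_device is the kind of callable torch.load accepts as map_location: it receives (storage, location_string) for every storage in a checkpoint. A hedged usage sketch with a placeholder path and a CPU-only stand-in mapper:

import torch


def map_to_cpu(storage, loc):
    # CPU-only stand-in for the mapper above, just to show the (storage, loc) signature.
    return storage.cpu()


def load_checkpoint(path: str):
    # torch.load passes each storage and its serialized location string to the callable.
    return torch.load(path, map_location=map_to_cpu)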