From 44d2dcbb193f8ae813326a1a165cabf9c23e0db5 Mon Sep 17 00:00:00 2001
From: a-One-Fan
Date: Sun, 30 Apr 2023 23:05:24 +0300
Subject: [PATCH] Add initial oneAPI support

---
 tortoise/api.py                      | 12 ++++++++++--
 tortoise/models/diffusion_decoder.py |  6 +++++-
 tortoise/utils/device.py             | 25 +++++++++++++++++++++++++
 tortoise/utils/diffusion.py          |  5 ++++-
 4 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index 9b091ca..4d5bb15 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -680,9 +680,17 @@ class TextToSpeech:
             auto_conditioning = migrate_to_device( auto_conditioning, self.device )
             text_tokens = migrate_to_device( text_tokens, self.device )
 
-            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
+            if self.device.type == 'xpu': # The following autocasts were hardcoded for CUDA
+                _device_type = 'xpu'
+                _dtype = torch.bfloat16 # float16 support for oneAPI was missing / is worse?
+            else:
+                _device_type = 'cuda' # Should these be changed to just use the device directly? Unsure how this will do for dml/rocm
+                _dtype = torch.float16
+
+            with torch.autocast(device_type=_device_type, dtype=_dtype, enabled=half_p):
                 for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                     check_for_kill_signal()
+                    do_gc() # oneAPI VRAM
                     codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                         do_sample=True,
                         top_p=top_p,
@@ -710,7 +718,7 @@ class TextToSpeech:
             if auto_conds is not None:
                 auto_conditioning = migrate_to_device( auto_conditioning, self.device )
 
-            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
+            with torch.autocast(device_type=_device_type, dtype=_dtype, enabled=half_p):
                 if not self.preloaded_tensors:
                     self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
                     self.clvp = migrate_to_device( self.clvp, self.device )
diff --git a/tortoise/models/diffusion_decoder.py b/tortoise/models/diffusion_decoder.py
index 9d8bcde..8cace2b 100755
--- a/tortoise/models/diffusion_decoder.py
+++ b/tortoise/models/diffusion_decoder.py
@@ -309,7 +309,11 @@ class DiffusionTts(nn.Module):
             else:
                 # First and last blocks will have autocast disabled for improved precision.
                 # x.device.type
-                with autocast(device_type='cuda', enabled=self.enable_fp16 and i != 0):
+                if self.device.type == 'xpu': # The following autocast was hardcoded for CUDA
+                    _device_type = 'xpu'
+                else:
+                    _device_type = 'cuda' # Should these be changed to just use the device directly? Unsure how this will do for dml/rocm
+                with autocast(device_type=_device_type, enabled=self.enable_fp16 and i != 0):
                     x = lyr(x, time_emb)
 
         x = x.float()
diff --git a/tortoise/utils/device.py b/tortoise/utils/device.py
index c082c13..eddfcea 100755
--- a/tortoise/utils/device.py
+++ b/tortoise/utils/device.py
@@ -8,6 +8,10 @@ DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)]
 from inspect import currentframe, getframeinfo
 import gc
 
+def xpu_get_mem(device=0):
+    total_memory = ipex.xpu.get_device_properties(device).total_memory
+    return total_memory, total_memory - torch.xpu.memory_allocated(device)
+
 def do_gc():
     gc.collect()
     try:
@@ -15,6 +19,11 @@ def do_gc():
     except Exception as e:
         pass
 
+    try:
+        torch.xpu.empty_cache()
+    except Exception as e:
+        pass
+
 def print_stats(collect=False):
     cf = currentframe().f_back
     msg = f'{getframeinfo(cf).filename}:{cf.f_lineno}'
@@ -36,6 +45,16 @@ def has_dml():
     import torch_directml
     return torch_directml.is_available()
 
+def has_ipex():
+    loader = importlib.find_loader('intel_extension_for_pytorch')
+    if loader is None:
+        return False
+
+    import intel_extension_for_pytorch
+    global ipex
+    ipex = intel_extension_for_pytorch # Could doing this over and over be an issue?
+    return torch.xpu.is_available()
+
 def set_device_name(name):
     global DEVICE_OVERRIDE
     DEVICE_OVERRIDE = name
@@ -51,6 +70,10 @@ def get_device_name(attempt_gc=True):
         name = 'cuda'
         if attempt_gc:
             torch.cuda.empty_cache() # may have performance implications
+    elif has_ipex():
+        name = 'xpu'
+        if attempt_gc:
+            torch.xpu.empty_cache()
     elif has_dml():
         name = 'dml'
 
@@ -76,6 +99,8 @@ def get_device_vram( name=get_device_name() ):
 
     if name == "cuda":
         _, available = torch.cuda.mem_get_info()
+    elif name == "xpu":
+        _, available = xpu_get_mem()
     elif name == "cpu":
         available = psutil.virtual_memory()[4]
 
diff --git a/tortoise/utils/diffusion.py b/tortoise/utils/diffusion.py
index c706416..bd607ba 100755
--- a/tortoise/utils/diffusion.py
+++ b/tortoise/utils/diffusion.py
@@ -1271,7 +1271,10 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
                       dimension equal to the length of timesteps.
     :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
     """
-    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    if timesteps.device.type == 'xpu': # TODO: Arc currently does not support FP64 broadly, and this will change eventually. Remove when this happens?
+        res = th.from_numpy(arr).float().to(device=timesteps.device)[timesteps]
+    else:
+        res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
     while len(res.shape) < len(broadcast_shape):
         res = res[..., None]
     return res.expand(broadcast_shape)
\ No newline at end of file