Compare commits


5 Commits
main ... main

SHA1        Message                             Date
f05dfd0bea  Specify numpy 1.23                  2023-08-30 08:25:05 +03:00
            Newer numpy versions don't work.
e99a905d7c  Fix lowercasing of kernel           2023-07-13 22:14:16 +03:00
1271237d89  Workaround for WSL VRAM leaks       2023-07-13 10:16:57 +03:00
8618922a33  Implement correct XPU device count  2023-05-04 21:14:07 +03:00
            Forgot to do that
44d2dcbb19  Add initial oneAPI support          2023-04-30 23:05:24 +03:00
5 changed files with 52 additions and 5 deletions
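
Note: every Intel-specific path in the diffs below is gated on intel_extension_for_pytorch (IPEX) being importable and an XPU device being visible to PyTorch. A minimal standalone sketch of that gate (it restates the has_ipex() helper added further down; importlib.util.find_spec is used here instead of the deprecated importlib.find_loader, and the function name is illustrative only):

    import importlib.util
    import torch

    def xpu_available():
        # IPEX must be importable; importing it registers the 'xpu' backend on torch.
        if importlib.util.find_spec('intel_extension_for_pytorch') is None:
            return False
        import intel_extension_for_pytorch  # noqa: F401
        return torch.xpu.is_available()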

View File

@@ -11,5 +11,5 @@ librosa==0.8.1
 torchaudio
 threadpoolctl
 appdirs
-numpy
+numpy==1.23
 numba

View File

@@ -680,9 +680,17 @@ class TextToSpeech:
             auto_conditioning = migrate_to_device( auto_conditioning, self.device )
             text_tokens = migrate_to_device( text_tokens, self.device )
 
-            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
+            if self.device.type == 'xpu': # The following autocasts were hardcoded for CUDA
+                _device_type = 'xpu'
+                _dtype = torch.bfloat16 # float16 support for oneAPI was missing / is worse?
+            else:
+                _device_type = 'cuda' # Should these be changed to just use the device directly? Unsure how this will do for dml/rocm
+                _dtype = torch.float16
+
+            with torch.autocast(device_type=_device_type, dtype=_dtype, enabled=half_p):
                 for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
                     check_for_kill_signal()
+                    do_gc() # oneAPI VRAM
                     codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                         do_sample=True,
                         top_p=top_p,
@@ -710,7 +718,7 @@ class TextToSpeech:
         if auto_conds is not None:
             auto_conditioning = migrate_to_device( auto_conditioning, self.device )
 
-        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
+        with torch.autocast(device_type=_device_type, dtype=_dtype, enabled=half_p):
             if not self.preloaded_tensors:
                 self.autoregressive = migrate_to_device( self.autoregressive, 'cpu' )
                 self.clvp = migrate_to_device( self.clvp, self.device )
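
Both autocast calls in this file now share the _device_type/_dtype pair computed above. As the inline comment wonders, the selection could instead be derived from the device object directly; a hypothetical helper sketching that idea (not part of this changeset, keeping the same bfloat16-on-XPU assumption):

    def autocast_args(device, half_p):
        # bfloat16 on XPU, float16 on CUDA; enabled follows the half_p flag.
        if device.type == 'xpu':
            return dict(device_type='xpu', dtype=torch.bfloat16, enabled=half_p)
        return dict(device_type='cuda', dtype=torch.float16, enabled=half_p)

    # usage: with torch.autocast(**autocast_args(self.device, half_p)): ...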

View File

@@ -309,7 +309,11 @@ class DiffusionTts(nn.Module):
             else:
                 # First and last blocks will have autocast disabled for improved precision.
                 # x.device.type
-                with autocast(device_type='cuda', enabled=self.enable_fp16 and i != 0):
+                if self.device.type == 'xpu': # The following autocast was hardcoded for CUDA
+                    _device_type = 'xpu'
+                else:
+                    _device_type = 'cuda' # Should these be changed to just use the device directly? Unsure how this will do for dml/rocm
+                with autocast(device_type=_device_type, enabled=self.enable_fp16 and i != 0):
                     x = lyr(x, time_emb)
 
         x = x.float()

View File

@@ -1,6 +1,7 @@
 import torch
 import psutil
 import importlib
+import os
 
 DEVICE_OVERRIDE = None
 DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)]
@@ -8,6 +9,12 @@ DEVICE_BATCH_SIZE_MAP = [(14, 16), (10,8), (7,4)]
 from inspect import currentframe, getframeinfo
 import gc
 
+is_WSL = 'wsl' in os.popen("uname -r").read().lower()
+
+def xpu_get_mem(device=0):
+    total_memory = ipex.xpu.get_device_properties(device).total_memory
+    return total_memory, total_memory - torch.xpu.memory_allocated(device)
+
 def do_gc():
     gc.collect()
     try:
@@ -15,6 +22,12 @@ def do_gc():
     except Exception as e:
         pass
 
+    if not is_WSL:
+        try:
+            torch.xpu.empty_cache()
+        except Exception as e:
+            pass
+
 def print_stats(collect=False):
     cf = currentframe().f_back
     msg = f'{getframeinfo(cf).filename}:{cf.f_lineno}'
@@ -36,6 +49,16 @@ def has_dml():
     import torch_directml
     return torch_directml.is_available()
 
+def has_ipex():
+    loader = importlib.find_loader('intel_extension_for_pytorch')
+    if loader is None:
+        return False
+
+    import intel_extension_for_pytorch
+    global ipex
+    ipex = intel_extension_for_pytorch # Could doing this over and over be an issue?
+    return torch.xpu.is_available()
+
 def set_device_name(name):
     global DEVICE_OVERRIDE
     DEVICE_OVERRIDE = name
@@ -51,6 +74,10 @@ def get_device_name(attempt_gc=True):
         name = 'cuda'
         if attempt_gc:
             torch.cuda.empty_cache() # may have performance implications
+    elif has_ipex():
+        name = 'xpu'
+        if attempt_gc:
+            torch.xpu.empty_cache()
     elif has_dml():
         name = 'dml'
@@ -76,6 +103,8 @@ def get_device_vram( name=get_device_name() ):
     if name == "cuda":
         _, available = torch.cuda.mem_get_info()
+    elif name == "xpu":
+        _, available = xpu_get_mem()
     elif name == "cpu":
         available = psutil.virtual_memory()[4]
@@ -103,6 +132,9 @@ def get_device_count(name=get_device_name()):
     if name == "dml":
         import torch_directml
         return torch_directml.device_count()
+    if name == "xpu":
+        import intel_extension_for_pytorch
+        return torch.xpu.device_count()
 
     return 1
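
Unlike CUDA, which reports free memory directly through torch.cuda.mem_get_info(), the XPU path above estimates it as total device memory minus what PyTorch has allocated. A minimal sketch of that estimate on its own (assumes IPEX is installed and an XPU is present; the figure is optimistic because memory_allocated only tracks this process's PyTorch allocations):

    import torch
    import intel_extension_for_pytorch as ipex  # registers the 'xpu' device type

    def xpu_free_vram(device=0):
        # Free VRAM estimate: the device's total memory minus PyTorch's current allocations.
        total = ipex.xpu.get_device_properties(device).total_memory
        return total - torch.xpu.memory_allocated(device)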

View File

@@ -1271,7 +1271,10 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
                             dimension equal to the length of timesteps.
     :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
     """
-    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    if timesteps.device.type == 'xpu': # TODO: Arc currently does not support FP64 broadly, and this will change eventually. Remove when this happens?
+        res = th.from_numpy(arr).float().to(device=timesteps.device)[timesteps]
+    else:
+        res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
     while len(res.shape) < len(broadcast_shape):
         res = res[..., None]
     return res.expand(broadcast_shape)
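
The reordering matters because numpy arrays default to float64 and Arc GPUs do not broadly support FP64; casting to float32 before the move keeps a double-precision tensor from ever landing on the XPU. A small illustration of the pattern (hypothetical values, assumes an 'xpu' device is available):

    import numpy as np
    import torch

    arr = np.linspace(0.0, 1.0, 8)               # float64 by default
    t = torch.from_numpy(arr).float().to('xpu')  # cast on CPU first, then move
    # torch.from_numpy(arr).to('xpu').float() would briefly place a float64 tensor
    # on a device without full FP64 support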