Update 'tortoise/utils/device.py'

Noticed that the autoregressive batch size was being set off of VRAM size. Adjusted to scale for the VRAM capacity of 90 series GPUs. In this case, 16 -> 32 batches. 

Using the standard pre-set with ChungusVGAN, I went from 16 steps to 8.
Over an average of 3 runs, I achieved an average of 294 seconds with 16 batches, to 234 seconds with 32. Can't complain at a 1.2x speed increase with functionally 2 lines of code. Can't complain. 

I restarted tortoise each run, and executing ```torch.cuda.empty_cache()``` just before loading the autoregressive model to clean the memory cache each time.
This commit is contained in:
deviandice 2023-03-07 14:05:27 +00:00
parent 26133c2031
commit e650800447

View File

@ -1,97 +1,108 @@
import torch
import psutil
import importlib
DEVICE_OVERRIDE = None
def has_dml():
loader = importlib.find_loader('torch_directml')
if loader is None:
return False
import torch_directml
return torch_directml.is_available()
def set_device_name(name):
global DEVICE_OVERRIDE
DEVICE_OVERRIDE = name
def get_device_name():
global DEVICE_OVERRIDE
if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
return DEVICE_OVERRIDE
name = 'cpu'
if torch.cuda.is_available():
name = 'cuda'
elif has_dml():
name = 'dml'
return name
def get_device(verbose=False):
name = get_device_name()
if verbose:
if name == 'cpu':
print("No hardware acceleration is available, falling back to CPU...")
else:
print(f"Hardware acceleration found: {name}")
if name == "dml":
import torch_directml
return torch_directml.device()
return torch.device(name)
def get_device_batch_size():
available = 1
name = get_device_name()
if name == "dml":
# there's nothing publically accessible in the DML API that exposes this
# there's a method to get currently used RAM statistics... as tiles
available = 1
elif name == "cuda":
_, available = torch.cuda.mem_get_info()
elif name == "cpu":
available = psutil.virtual_memory()[4]
availableGb = available / (1024 ** 3)
if availableGb > 14:
return 16
elif availableGb > 10:
return 8
elif availableGb > 7:
return 4
return 1
def get_device_count(name=get_device_name()):
if name == "cuda":
return torch.cuda.device_count()
if name == "dml":
import torch_directml
return torch_directml.device_count()
return 1
if has_dml():
_cumsum = torch.cumsum
_repeat_interleave = torch.repeat_interleave
_multinomial = torch.multinomial
_Tensor_new = torch.Tensor.new
_Tensor_cumsum = torch.Tensor.cumsum
_Tensor_repeat_interleave = torch.Tensor.repeat_interleave
_Tensor_multinomial = torch.Tensor.multinomial
torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
import torch
import psutil
import importlib
DEVICE_OVERRIDE = None
def has_dml():
loader = importlib.find_loader('torch_directml')
if loader is None:
return False
import torch_directml
return torch_directml.is_available()
def set_device_name(name):
global DEVICE_OVERRIDE
DEVICE_OVERRIDE = name
def get_device_name():
global DEVICE_OVERRIDE
if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
return DEVICE_OVERRIDE
name = 'cpu'
if torch.cuda.is_available():
name = 'cuda'
elif has_dml():
name = 'dml'
return name
def get_device(verbose=False):
name = get_device_name()
if verbose:
if name == 'cpu':
print("No hardware acceleration is available, falling back to CPU...")
else:
print(f"Hardware acceleration found: {name}")
if name == "dml":
import torch_directml
return torch_directml.device()
return torch.device(name)
def get_device_batch_size():
available = 1
name = get_device_name()
if name == "dml":
# there's nothing publicly accessible in the DML API that exposes this
# there's a method to get currently used RAM statistics... as tiles
available = 1
elif name == "cuda":
_,available = torch.cuda.mem_get_info()
elif name == "cpu":
available = psutil.virtual_memory()[4]
availableGb = available / (1024 ** 3)
print(f"Total device memory available: {availableGb}")
if availableGb > 18:
print(f"Setting AutoRegressive Batch Size to: 32")
print(f"Damn. Nice GPU Dude.")
return 32
elif availableGb > 14:
print(f"Setting AutoRegressive Batch Size to: 16")
return 16
elif availableGb > 10:
print(f"Setting AutoRegressive Batch Size to: 8")
return 8
elif availableGb > 7:
print(f"Setting AutoRegressive Batch Size to: 4")
return 4
print(f"Setting AutoRegressive Batch Size to: 1")
print(f"Don't cry about it if it doesn't work.")
return 1
def get_device_count(name=get_device_name()):
if name == "cuda":
return torch.cuda.device_count()
if name == "dml":
import torch_directml
return torch_directml.device_count()
return 1
if has_dml():
_cumsum = torch.cumsum
_repeat_interleave = torch.repeat_interleave
_multinomial = torch.multinomial
_Tensor_new = torch.Tensor.new
_Tensor_cumsum = torch.Tensor.cumsum
_Tensor_repeat_interleave = torch.Tensor.repeat_interleave
_Tensor_multinomial = torch.Tensor.multinomial
torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )