Update 'tortoise/utils/device.py'

Noticed that the autoregressive batch size was being set based on available VRAM, but the scale topped out below the capacity of 90-series GPUs. Added a tier (> 18 GiB) for their 24 GB of VRAM: in this case, batch size 16 -> 32.
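
For reference, the probe this heuristic reads is `torch.cuda.mem_get_info()`, which returns a `(free, total)` tuple in bytes; the code unpacks the second element, so the tiers key off total VRAM rather than whatever happens to be free at the moment. A minimal sketch:

```python
import torch

# mem_get_info() -> (free_bytes, total_bytes); the heuristic uses the total,
# so a 24 GB 90-series card always lands in the > 18 GiB tier.
free_bytes, total_bytes = torch.cuda.mem_get_info()
available_gb = total_bytes / (1024 ** 3)   # e.g. ~24 on a 3090/4090
print(available_gb)
```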

Using the standard preset with ChungusVGAN, I went from 16 autoregressive steps to 8.
Averaged over 3 runs, generation time dropped from 294 seconds at a batch size of 16 to 234 seconds at 32. Can't complain at a ~1.25x speed increase from functionally 2 lines of code.
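
Back-of-the-envelope on those numbers, assuming the standard preset runs 256 autoregressive samples (the sample count is my assumption, not something this commit touches):

```python
samples = 256           # assumed "standard" preset sample count
print(samples // 16)    # 16 autoregressive steps at the old batch size
print(samples // 32)    # 8 steps at the new batch size
print(294 / 234)        # ~1.26, the speedup quoted above
```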

I restarted tortoise between runs and executed `torch.cuda.empty_cache()` just before loading the autoregressive model, to clear the memory cache each time.
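
In sketch form, each timed run looked roughly like this, where `load_autoregressive_model()` and `generate()` are hypothetical stand-ins for tortoise's own loading and generation calls:

```python
import time
import torch

torch.cuda.empty_cache()             # drop cached allocations before loading

model = load_autoregressive_model()  # hypothetical stand-in for the real loader
start = time.time()
generate(model)                      # hypothetical: one full generation pass
print(f"run took {time.time() - start:.0f}s")
```
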
deviandice 2023-03-07 14:05:27 +00:00
parent 26133c2031
commit e650800447

@@ -1,97 +1,108 @@
 import torch
 import psutil
 import importlib
 
 DEVICE_OVERRIDE = None
 
 def has_dml():
     loader = importlib.find_loader('torch_directml')
     if loader is None:
         return False
 
     import torch_directml
     return torch_directml.is_available()
 
 def set_device_name(name):
     global DEVICE_OVERRIDE
     DEVICE_OVERRIDE = name
 
 def get_device_name():
     global DEVICE_OVERRIDE
     if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
         return DEVICE_OVERRIDE
 
     name = 'cpu'
 
     if torch.cuda.is_available():
         name = 'cuda'
     elif has_dml():
         name = 'dml'
 
     return name
 
 def get_device(verbose=False):
     name = get_device_name()
 
     if verbose:
         if name == 'cpu':
             print("No hardware acceleration is available, falling back to CPU...")
         else:
             print(f"Hardware acceleration found: {name}")
 
     if name == "dml":
         import torch_directml
         return torch_directml.device()
 
     return torch.device(name)
 
 def get_device_batch_size():
     available = 1
     name = get_device_name()
     if name == "dml":
-        # there's nothing publically accessible in the DML API that exposes this
+        # there's nothing publicly accessible in the DML API that exposes this
         # there's a method to get currently used RAM statistics... as tiles
         available = 1
     elif name == "cuda":
-        _, available = torch.cuda.mem_get_info()
+        _,available = torch.cuda.mem_get_info()
     elif name == "cpu":
         available = psutil.virtual_memory()[4]
 
     availableGb = available / (1024 ** 3)
-    if availableGb > 14:
-        return 16
-    elif availableGb > 10:
-        return 8
-    elif availableGb > 7:
-        return 4
-    return 1
+
+    print(f"Total device memory available: {availableGb}")
+    if availableGb > 18:
+        print(f"Setting AutoRegressive Batch Size to: 32")
+        print(f"Damn. Nice GPU Dude.")
+        return 32
+    elif availableGb > 14:
+        print(f"Setting AutoRegressive Batch Size to: 16")
+        return 16
+    elif availableGb > 10:
+        print(f"Setting AutoRegressive Batch Size to: 8")
+        return 8
+    elif availableGb > 7:
+        print(f"Setting AutoRegressive Batch Size to: 4")
+        return 4
+    print(f"Setting AutoRegressive Batch Size to: 1")
+    print(f"Don't cry about it if it doesn't work.")
+    return 1
 
 def get_device_count(name=get_device_name()):
     if name == "cuda":
         return torch.cuda.device_count()
     if name == "dml":
         import torch_directml
         return torch_directml.device_count()
 
     return 1
 
 if has_dml():
     _cumsum = torch.cumsum
     _repeat_interleave = torch.repeat_interleave
     _multinomial = torch.multinomial
 
     _Tensor_new = torch.Tensor.new
     _Tensor_cumsum = torch.Tensor.cumsum
     _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
     _Tensor_multinomial = torch.Tensor.multinomial
 
     torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
     torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
     torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
 
     torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
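
To eyeball the new tiers without a GPU handy, here's the ladder re-stated as a pure function (a sketch for illustration, not code from this commit):

```python
# Hypothetical re-statement of the new tier logic for quick sanity checks.
def batch_size_for(available_gb: float) -> int:
    if available_gb > 18:
        return 32   # 24 GB 90-series cards land here
    if available_gb > 14:
        return 16   # e.g. 16 GB cards
    if available_gb > 10:
        return 8
    if available_gb > 7:
        return 4
    return 1

assert batch_size_for(24) == 32   # 3090/4090-class
assert batch_size_for(12) == 8
```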