forked from mrq/tortoise-tts
Update 'tortoise/utils/device.py'
Noticed that the autoregressive batch size is chosen based on available VRAM. Extended the scale to cover the VRAM capacity of 90-series GPUs; in this case the batch size goes from 16 to 32. Using the standard preset with ChungusVGAN, I went from 16 steps to 8. Averaged over 3 runs, generation took 294 seconds with a batch size of 16 versus 234 seconds with 32. Can't complain about a roughly 1.2x speed increase from functionally 2 lines of code. I restarted tortoise before each run and executed `torch.cuda.empty_cache()` just before loading the autoregressive model to clear the memory cache each time.
This commit is contained in:
parent
26133c2031
commit
e650800447
|
@ -1,97 +1,108 @@
|
||||||
import torch
|
import torch
|
||||||
import psutil
|
import psutil
|
||||||
import importlib
|
import importlib
|
||||||
|
|
||||||
DEVICE_OVERRIDE = None
|
DEVICE_OVERRIDE = None
|
||||||
|
|
||||||
def has_dml():
|
def has_dml():
|
||||||
loader = importlib.find_loader('torch_directml')
|
loader = importlib.find_loader('torch_directml')
|
||||||
if loader is None:
|
if loader is None:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
import torch_directml
|
import torch_directml
|
||||||
return torch_directml.is_available()
|
return torch_directml.is_available()
|
||||||
|
|
||||||
def set_device_name(name):
|
def set_device_name(name):
|
||||||
global DEVICE_OVERRIDE
|
global DEVICE_OVERRIDE
|
||||||
DEVICE_OVERRIDE = name
|
DEVICE_OVERRIDE = name
|
||||||
|
|
||||||
def get_device_name():
|
def get_device_name():
|
||||||
global DEVICE_OVERRIDE
|
global DEVICE_OVERRIDE
|
||||||
if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
|
if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
|
||||||
return DEVICE_OVERRIDE
|
return DEVICE_OVERRIDE
|
||||||
|
|
||||||
name = 'cpu'
|
name = 'cpu'
|
||||||
|
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
name = 'cuda'
|
name = 'cuda'
|
||||||
elif has_dml():
|
elif has_dml():
|
||||||
name = 'dml'
|
name = 'dml'
|
||||||
|
|
||||||
return name
|
return name
|
||||||
|
|
||||||
def get_device(verbose=False):
|
def get_device(verbose=False):
|
||||||
name = get_device_name()
|
name = get_device_name()
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
if name == 'cpu':
|
if name == 'cpu':
|
||||||
print("No hardware acceleration is available, falling back to CPU...")
|
print("No hardware acceleration is available, falling back to CPU...")
|
||||||
else:
|
else:
|
||||||
print(f"Hardware acceleration found: {name}")
|
print(f"Hardware acceleration found: {name}")
|
||||||
|
|
||||||
if name == "dml":
|
if name == "dml":
|
||||||
import torch_directml
|
import torch_directml
|
||||||
return torch_directml.device()
|
return torch_directml.device()
|
||||||
|
|
||||||
return torch.device(name)
|
return torch.device(name)
|
||||||
|
|
||||||
def get_device_batch_size():
|
def get_device_batch_size():
|
||||||
available = 1
|
available = 1
|
||||||
name = get_device_name()
|
name = get_device_name()
|
||||||
|
|
||||||
if name == "dml":
|
if name == "dml":
|
||||||
# there's nothing publically accessible in the DML API that exposes this
|
# there's nothing publicly accessible in the DML API that exposes this
|
||||||
# there's a method to get currently used RAM statistics... as tiles
|
# there's a method to get currently used RAM statistics... as tiles
|
||||||
available = 1
|
available = 1
|
||||||
elif name == "cuda":
|
elif name == "cuda":
|
||||||
_, available = torch.cuda.mem_get_info()
|
_,available = torch.cuda.mem_get_info()
|
||||||
elif name == "cpu":
|
elif name == "cpu":
|
||||||
available = psutil.virtual_memory()[4]
|
available = psutil.virtual_memory()[4]
|
||||||
|
|
||||||
availableGb = available / (1024 ** 3)
|
availableGb = available / (1024 ** 3)
|
||||||
if availableGb > 14:
|
|
||||||
return 16
|
print(f"Total device memory available: {availableGb}")
|
||||||
elif availableGb > 10:
|
if availableGb > 18:
|
||||||
return 8
|
print(f"Setting AutoRegressive Batch Size to: 32")
|
||||||
elif availableGb > 7:
|
print(f"Damn. Nice GPU Dude.")
|
||||||
return 4
|
return 32
|
||||||
return 1
|
elif availableGb > 14:
|
||||||
|
print(f"Setting AutoRegressive Batch Size to: 16")
|
||||||
def get_device_count(name=get_device_name()):
|
return 16
|
||||||
if name == "cuda":
|
elif availableGb > 10:
|
||||||
return torch.cuda.device_count()
|
print(f"Setting AutoRegressive Batch Size to: 8")
|
||||||
if name == "dml":
|
return 8
|
||||||
import torch_directml
|
elif availableGb > 7:
|
||||||
return torch_directml.device_count()
|
print(f"Setting AutoRegressive Batch Size to: 4")
|
||||||
|
return 4
|
||||||
return 1
|
print(f"Setting AutoRegressive Batch Size to: 1")
|
||||||
|
print(f"Don't cry about it if it doesn't work.")
|
||||||
|
return 1
|
||||||
if has_dml():
|
|
||||||
_cumsum = torch.cumsum
|
def get_device_count(name=get_device_name()):
|
||||||
_repeat_interleave = torch.repeat_interleave
|
if name == "cuda":
|
||||||
_multinomial = torch.multinomial
|
return torch.cuda.device_count()
|
||||||
|
if name == "dml":
|
||||||
_Tensor_new = torch.Tensor.new
|
import torch_directml
|
||||||
_Tensor_cumsum = torch.Tensor.cumsum
|
return torch_directml.device_count()
|
||||||
_Tensor_repeat_interleave = torch.Tensor.repeat_interleave
|
|
||||||
_Tensor_multinomial = torch.Tensor.multinomial
|
return 1
|
||||||
|
|
||||||
torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
|
|
||||||
torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
|
if has_dml():
|
||||||
torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
|
_cumsum = torch.cumsum
|
||||||
|
_repeat_interleave = torch.repeat_interleave
|
||||||
torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
|
_multinomial = torch.multinomial
|
||||||
torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
|
|
||||||
torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
|
_Tensor_new = torch.Tensor.new
|
||||||
|
_Tensor_cumsum = torch.Tensor.cumsum
|
||||||
|
_Tensor_repeat_interleave = torch.Tensor.repeat_interleave
|
||||||
|
_Tensor_multinomial = torch.Tensor.multinomial
|
||||||
|
|
||||||
|
torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
|
||||||
|
torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
|
||||||
|
torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
|
||||||
|
|
||||||
|
torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||||
|
torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||||
|
torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
|
||||||
torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
|
torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
|
Loading…
Reference in New Issue
Block a user