tortoise-tts/tortoise/utils/device.py

import torch
import psutil
import importlib

DEVICE_OVERRIDE = None

def has_dml():
    """Report whether the DirectML backend (``torch_directml``) is usable.

    Returns:
        ``False`` when the ``torch_directml`` package cannot be located.
        Otherwise the result of ``torch_directml.is_available()``.
    """
    # importlib.find_loader() was deprecated in Python 3.4 and removed in
    # 3.12. importlib.util.find_spec() is the supported way to probe for a
    # module without importing it. The import is local because a bare
    # ``import importlib`` does not guarantee the ``util`` submodule is loaded.
    from importlib.util import find_spec

    if find_spec('torch_directml') is None:
        return False

    # Only import the package once we know it is installed.
    import torch_directml
    return torch_directml.is_available()

def set_device_name(name):
    """Force the device name used by this module.

    Stores ``name`` in the module-level ``DEVICE_OVERRIDE``. While that value
    is neither ``None`` nor the empty string, ``get_device_name()`` returns it
    instead of auto-detecting a device.
    """
    global DEVICE_OVERRIDE
    DEVICE_OVERRIDE = name

def get_device_name():
    """Return the name of the compute device to use.

    A value stored in ``DEVICE_OVERRIDE`` takes precedence as long as it is
    neither ``None`` nor the empty string. Otherwise the name is detected:
    ``'cuda'`` when CUDA is available, else ``'dml'`` when DirectML is
    available, else ``'cpu'``.
    """
    forced = DEVICE_OVERRIDE
    if forced is not None and forced != "":
        return forced

    # Detection order matters: CUDA is preferred over DirectML.
    if torch.cuda.is_available():
        return 'cuda'
    if has_dml():
        return 'dml'
    return 'cpu'

def get_device(verbose=False):
    """Build the device object for the currently selected device name.

    Args:
        verbose: When true, print whether hardware acceleration was found.

    Returns:
        ``torch_directml.device()`` when the selected name is ``"dml"``;
        otherwise ``torch.device(name)``.
    """
    name = get_device_name()

    if verbose:
        print(
            "No hardware acceleration is available, falling back to CPU..."
            if name == 'cpu'
            else f"Hardware acceleration found: {name}"
        )

    if name != "dml":
        return torch.device(name)

    # torch_directml is imported lazily so the package is only needed when
    # the DirectML device has actually been selected.
    import torch_directml
    return torch_directml.device()

def get_device_batch_size():
    """Pick an autoregressive batch size from the memory of the active device.

    The memory figure depends on the device kind from ``get_device_name()``:

    * ``cuda``: total memory of the device (second value returned by
      ``torch.cuda.mem_get_info()``).
    * ``cpu``: system memory that ``psutil`` reports as available.
    * anything else (including ``dml``): unknown, so the smallest batch size
      is chosen.

    The amount in GiB maps to a batch size: more than 18 -> 32, more than
    14 -> 16, more than 10 -> 8, more than 7 -> 4, otherwise 1. The figure and
    the chosen size are printed.

    Returns:
        The batch size as an ``int``.
    """
    # get_device() hands the name to torch.device(), so an override may carry
    # a device index (e.g. "cuda:1"). Compare on the device kind only;
    # otherwise such names would silently fall through to a batch size of 1.
    kind, _, index = str(get_device_name()).partition(":")

    available = 1
    if kind == "cuda":
        # None makes torch query the current CUDA device.
        ordinal = int(index) if index.isdigit() else None
        _, available = torch.cuda.mem_get_info(ordinal)
    elif kind == "cpu":
        # The previous positional lookup ([4]) read the ``free`` field, which
        # psutil documents as not reflecting the memory actually available
        # (page cache counts as used). ``available`` is the intended figure.
        available = psutil.virtual_memory().available
    # "dml": there's nothing publicly accessible in the DML API that exposes
    # this; there's a method to get currently used RAM statistics... as tiles.

    available_gb = available / (1024 ** 3)
    print(f"Total device memory available: {available_gb}")

    # (exclusive lower bound in GiB, batch size), checked largest first.
    tiers = ((18, 32), (14, 16), (10, 8), (7, 4))
    batch_size = next(
        (size for floor_gb, size in tiers if available_gb > floor_gb), 1
    )

    print(f"Setting AutoRegressive Batch Size to: {batch_size}")
    if batch_size == 32:
        print("Damn. Nice GPU Dude.")
    elif batch_size == 1:
        print("Don't cry about it if it doesn't work.")
    return batch_size

def get_device_count(name=None):
    """Return how many devices of the given kind are present.

    Args:
        name: Device name to count. When omitted (``None``), the name is
            resolved with ``get_device_name()`` at call time. The previous
            default was evaluated once, when the module was imported, so any
            later ``set_device_name()`` call was ignored.

    Returns:
        ``torch.cuda.device_count()`` for ``"cuda"``,
        ``torch_directml.device_count()`` for ``"dml"``, and ``1`` for any
        other name.
    """
    if name is None:
        name = get_device_name()

    if name == "cuda":
        return torch.cuda.device_count()
    if name == "dml":
        # Imported lazily: the package is only needed on the DirectML path.
        import torch_directml
        return torch_directml.device_count()

    return 1


if has_dml():
    # When DirectML is present, a handful of torch ops are replaced with
    # wrappers that move the tensor to the CPU, run the stock op there, and
    # move the result back to the tensor's original device.
    # NOTE(review): presumably these ops are unsupported or unreliable on the
    # DirectML backend -- confirm against torch_directml before removing.

    # Stock implementations, captured before they are replaced below.
    _cumsum = torch.cumsum
    _repeat_interleave = torch.repeat_interleave
    _multinomial = torch.multinomial

    _Tensor_new = torch.Tensor.new
    _Tensor_cumsum = torch.Tensor.cumsum
    _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
    _Tensor_multinomial = torch.Tensor.multinomial

    # Function-style wrappers. The first parameter stays named ``input`` so
    # calls that pass it by keyword keep working.
    def _cumsum_via_cpu(input, *args, **kwargs):
        on_cpu = _cumsum(input.to("cpu"), *args, **kwargs)
        return on_cpu.to(input.device)

    def _repeat_interleave_via_cpu(input, *args, **kwargs):
        on_cpu = _repeat_interleave(input.to("cpu"), *args, **kwargs)
        return on_cpu.to(input.device)

    def _multinomial_via_cpu(input, *args, **kwargs):
        on_cpu = _multinomial(input.to("cpu"), *args, **kwargs)
        return on_cpu.to(input.device)

    # Method-style wrappers, bound on torch.Tensor.
    def _tensor_new_via_cpu(self, *args, **kwargs):
        on_cpu = _Tensor_new(self.to("cpu"), *args, **kwargs)
        return on_cpu.to(self.device)

    def _tensor_cumsum_via_cpu(self, *args, **kwargs):
        on_cpu = _Tensor_cumsum(self.to("cpu"), *args, **kwargs)
        return on_cpu.to(self.device)

    def _tensor_repeat_interleave_via_cpu(self, *args, **kwargs):
        on_cpu = _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs)
        return on_cpu.to(self.device)

    def _tensor_multinomial_via_cpu(self, *args, **kwargs):
        on_cpu = _Tensor_multinomial(self.to("cpu"), *args, **kwargs)
        return on_cpu.to(self.device)

    torch.cumsum = _cumsum_via_cpu
    torch.repeat_interleave = _repeat_interleave_via_cpu
    torch.multinomial = _multinomial_via_cpu

    torch.Tensor.new = _tensor_new_via_cpu
    torch.Tensor.cumsum = _tensor_cumsum_via_cpu
    torch.Tensor.repeat_interleave = _tensor_repeat_interleave_via_cpu
    torch.Tensor.multinomial = _tensor_multinomial_via_cpu
Update 'tortoise/utils/device.py' Noticed that the autoregressive batch size was being set based on VRAM size. Adjusted it to scale for the VRAM capacity of 90-series GPUs: in this case, 16 -> 32 batches. Using the standard preset with ChungusVGAN, I went from 16 steps to 8. Averaged over 3 runs, I measured 294 seconds with 16 batches versus 234 seconds with 32. Can't complain about a 1.2x speed increase from functionally 2 lines of code. I restarted tortoise before each run and executed `torch.cuda.empty_cache()` just before loading the autoregressive model to clear the memory cache each time. 2023-03-07 14:05:27 +00:00			`import torch`
			`import psutil`
			`import importlib`

			`DEVICE_OVERRIDE = None`

			`def has_dml():`
			`loader = importlib.find_loader('torch_directml')`
			`if loader is None:`
			`return False`

			`import torch_directml`
			`return torch_directml.is_available()`

			`def set_device_name(name):`
			`global DEVICE_OVERRIDE`
			`DEVICE_OVERRIDE = name`

			`def get_device_name():`
			`global DEVICE_OVERRIDE`
			`if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":`
			`return DEVICE_OVERRIDE`

			`name = 'cpu'`

			`if torch.cuda.is_available():`
			`name = 'cuda'`
			`elif has_dml():`
			`name = 'dml'`

			`return name`

			`def get_device(verbose=False):`
			`name = get_device_name()`

			`if verbose:`
			`if name == 'cpu':`
			`print("No hardware acceleration is available, falling back to CPU...")`
			`else:`
			`print(f"Hardware acceleration found: {name}")`

			`if name == "dml":`
			`import torch_directml`
			`return torch_directml.device()`

			`return torch.device(name)`

			`def get_device_batch_size():`
			`available = 1`
			`name = get_device_name()`

			`if name == "dml":`
			`# there's nothing publicly accessible in the DML API that exposes this`
			`# there's a method to get currently used RAM statistics... as tiles`
			`available = 1`
			`elif name == "cuda":`
			`_,available = torch.cuda.mem_get_info()`
			`elif name == "cpu":`
			`available = psutil.virtual_memory()[4]`

			`availableGb = available / (1024 ** 3)`

			`print(f"Total device memory available: {availableGb}")`
			`if availableGb > 18:`
			`print(f"Setting AutoRegressive Batch Size to: 32")`
			`print(f"Damn. Nice GPU Dude.")`
			`return 32`
			`elif availableGb > 14:`
			`print(f"Setting AutoRegressive Batch Size to: 16")`
			`return 16`
			`elif availableGb > 10:`
			`print(f"Setting AutoRegressive Batch Size to: 8")`
			`return 8`
			`elif availableGb > 7:`
			`print(f"Setting AutoRegressive Batch Size to: 4")`
			`return 4`
			`print(f"Setting AutoRegressive Batch Size to: 1")`
			`print(f"Don't cry about it if it doesn't work.")`
			`return 1`

			`def get_device_count(name=get_device_name()):`
			`if name == "cuda":`
			`return torch.cuda.device_count()`
			`if name == "dml":`
			`import torch_directml`
			`return torch_directml.device_count()`

			`return 1`


			`if has_dml():`
			`_cumsum = torch.cumsum`
			`_repeat_interleave = torch.repeat_interleave`
			`_multinomial = torch.multinomial`

			`_Tensor_new = torch.Tensor.new`
			`_Tensor_cumsum = torch.Tensor.cumsum`
			`_Tensor_repeat_interleave = torch.Tensor.repeat_interleave`
			`_Tensor_multinomial = torch.Tensor.multinomial`

			`torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )`
			`torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )`
			`torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )`

			`torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )`
			`torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )`
			`torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )`
I didn't have to suck off a wizard for DirectML support (courtesy of https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/7600 for leading the way) 2023-02-09 05:05:21 +00:00			`torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )`