From e6508004477b56bf358e0104f1d571e22f25d951 Mon Sep 17 00:00:00 2001
From: deviandice
Date: Tue, 7 Mar 2023 14:05:27 +0000
Subject: [PATCH] Update 'tortoise/utils/device.py'

Noticed that the autoregressive batch size is derived from available VRAM
but capped at 16. Adjusted the scaling to account for the VRAM capacity of
90 series GPUs; in this case, 16 -> 32 batches.

Using the standard preset with ChungusVGAN, I went from 16 steps to 8.
Averaged over 3 runs, generation took 294 seconds with 16 batches versus
234 seconds with 32. Can't complain about a ~1.25x speed increase from what
is functionally 2 lines of code.

I restarted tortoise for each run and executed ```torch.cuda.empty_cache()```
just before loading the autoregressive model, to clear the memory cache each
time.
---
 tortoise/utils/device.py | 203 +++++++++++++++++++++------------------
 1 file changed, 107 insertions(+), 96 deletions(-)

diff --git a/tortoise/utils/device.py b/tortoise/utils/device.py
index 3ab52e2..72fc142 100755
--- a/tortoise/utils/device.py
+++ b/tortoise/utils/device.py
@@ -1,97 +1,108 @@
-import torch
-import psutil
-import importlib
-
-DEVICE_OVERRIDE = None
-
-def has_dml():
-    loader = importlib.find_loader('torch_directml')
-    if loader is None:
-        return False
-
-    import torch_directml
-    return torch_directml.is_available()
-
-def set_device_name(name):
-    global DEVICE_OVERRIDE
-    DEVICE_OVERRIDE = name
-
-def get_device_name():
-    global DEVICE_OVERRIDE
-    if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
-        return DEVICE_OVERRIDE
-
-    name = 'cpu'
-
-    if torch.cuda.is_available():
-        name = 'cuda'
-    elif has_dml():
-        name = 'dml'
-
-    return name
-
-def get_device(verbose=False):
-    name = get_device_name()
-
-    if verbose:
-        if name == 'cpu':
-            print("No hardware acceleration is available, falling back to CPU...")
-        else:
-            print(f"Hardware acceleration found: {name}")
-
-    if name == "dml":
-        import torch_directml
-        return torch_directml.device()
-
-    return torch.device(name)
-
-def get_device_batch_size():
-    available = 1
-    name = get_device_name()
-
-    if name == "dml":
-        # there's nothing publically accessible in the DML API that exposes this
-        # there's a method to get currently used RAM statistics... as tiles
-        available = 1
-    elif name == "cuda":
-        _, available = torch.cuda.mem_get_info()
-    elif name == "cpu":
-        available = psutil.virtual_memory()[4]
-
-    availableGb = available / (1024 ** 3)
-    if availableGb > 14:
-        return 16
-    elif availableGb > 10:
-        return 8
-    elif availableGb > 7:
-        return 4
-    return 1
-
-def get_device_count(name=get_device_name()):
-    if name == "cuda":
-        return torch.cuda.device_count()
-    if name == "dml":
-        import torch_directml
-        return torch_directml.device_count()
-
-    return 1
-
-
-if has_dml():
-    _cumsum = torch.cumsum
-    _repeat_interleave = torch.repeat_interleave
-    _multinomial = torch.multinomial
-
-    _Tensor_new = torch.Tensor.new
-    _Tensor_cumsum = torch.Tensor.cumsum
-    _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
-    _Tensor_multinomial = torch.Tensor.multinomial
-
-    torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
-    torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
-    torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
-
-    torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
-    torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
-    torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
+import torch
+import psutil
+import importlib
+
+DEVICE_OVERRIDE = None
+
+def has_dml():
+    loader = importlib.find_loader('torch_directml')
+    if loader is None:
+        return False
+
+    import torch_directml
+    return torch_directml.is_available()
+
+def set_device_name(name):
+    global DEVICE_OVERRIDE
+    DEVICE_OVERRIDE = name
+
+def get_device_name():
+    global DEVICE_OVERRIDE
+    if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
+        return DEVICE_OVERRIDE
+
+    name = 'cpu'
+
+    if torch.cuda.is_available():
+        name = 'cuda'
+    elif has_dml():
+        name = 'dml'
+
+    return name
+
+def get_device(verbose=False):
+    name = get_device_name()
+
+    if verbose:
+        if name == 'cpu':
+            print("No hardware acceleration is available, falling back to CPU...")
+        else:
+            print(f"Hardware acceleration found: {name}")
+
+    if name == "dml":
+        import torch_directml
+        return torch_directml.device()
+
+    return torch.device(name)
+
+def get_device_batch_size():
+    available = 1
+    name = get_device_name()
+
+    if name == "dml":
+        # there's nothing publicly accessible in the DML API that exposes this
+        # there's a method to get currently used RAM statistics... as tiles
+        available = 1
+    elif name == "cuda":
+        _,available = torch.cuda.mem_get_info()
+    elif name == "cpu":
+        available = psutil.virtual_memory()[4]
+
+    availableGb = available / (1024 ** 3)
+
+    print(f"Total device memory available: {availableGb}")
+    if availableGb > 18:
+        print(f"Setting AutoRegressive Batch Size to: 32")
+        print(f"Damn. Nice GPU Dude.")
+        return 32
+    elif availableGb > 14:
+        print(f"Setting AutoRegressive Batch Size to: 16")
+        return 16
+    elif availableGb > 10:
+        print(f"Setting AutoRegressive Batch Size to: 8")
+        return 8
+    elif availableGb > 7:
+        print(f"Setting AutoRegressive Batch Size to: 4")
+        return 4
+    print(f"Setting AutoRegressive Batch Size to: 1")
+    print(f"Don't cry about it if it doesn't work.")
+    return 1
+
+def get_device_count(name=get_device_name()):
+    if name == "cuda":
+        return torch.cuda.device_count()
+    if name == "dml":
+        import torch_directml
+        return torch_directml.device_count()
+
+    return 1
+
+
+if has_dml():
+    _cumsum = torch.cumsum
+    _repeat_interleave = torch.repeat_interleave
+    _multinomial = torch.multinomial
+
+    _Tensor_new = torch.Tensor.new
+    _Tensor_cumsum = torch.Tensor.cumsum
+    _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
+    _Tensor_multinomial = torch.Tensor.multinomial
+
+    torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
+    torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
+    torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
+
+    torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
+    torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
+    torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
\ No newline at end of file
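
For context, a minimal usage sketch of the measurement setup described in the commit message: clear the CUDA cache just before loading the autoregressive model, then size the batch from whatever VRAM is actually free. The `build_autoregressive_model` callable is a hypothetical placeholder, not part of this patch; `torch.cuda.empty_cache()`, `get_device()`, and `get_device_batch_size()` are the real calls involved.

```python
# Hypothetical sketch, not part of the patch: combining empty_cache() with
# the new VRAM-based autoregressive batch sizing.
import torch

from tortoise.utils.device import get_device, get_device_batch_size

def prepare_autoregressive(build_autoregressive_model):
    # build_autoregressive_model is a placeholder for however the caller
    # constructs the model; it is not provided by this patch.
    if torch.cuda.is_available():
        # release cached allocator blocks so torch.cuda.mem_get_info()
        # reports more free VRAM before the batch size is chosen
        torch.cuda.empty_cache()

    batch_size = get_device_batch_size()  # 32 above 18 GB free, 16 above 14 GB, etc.
    model = build_autoregressive_model().to(get_device())
    return model, batch_size
```

Note that `empty_cache()` only releases blocks held by PyTorch's caching allocator; memory still owned by loaded models is unaffected, which is why the cache is cleared before the autoregressive model is loaded rather than after.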