From e6508004477b56bf358e0104f1d571e22f25d951 Mon Sep 17 00:00:00 2001
From: deviandice
Date: Tue, 7 Mar 2023 14:05:27 +0000
Subject: [PATCH] Update 'tortoise/utils/device.py'

Noticed that the autoregressive batch size is derived from available VRAM
but capped at 16. Adjusted the scaling to account for the VRAM capacity of
90 series GPUs; in this case, 16 -> 32 batches.

Using the standard preset with ChungusVGAN, I went from 16 steps to 8.
Averaged over 3 runs, generation took 294 seconds with 16 batches versus
234 seconds with 32. Can't complain about a ~1.25x speed increase from what
is functionally 2 lines of code.

I restarted tortoise for each run and executed ```torch.cuda.empty_cache()```
just before loading the autoregressive model, to clear the memory cache each
time.
---
 tortoise/utils/device.py | 203 +++++++++++++++++++++------------------
 1 file changed, 107 insertions(+), 96 deletions(-)

diff --git a/tortoise/utils/device.py b/tortoise/utils/device.py
index 3ab52e2..72fc142 100755
--- a/tortoise/utils/device.py
+++ b/tortoise/utils/device.py
@@ -1,97 +1,108 @@
-import torch
-import psutil
-import importlib
-
-DEVICE_OVERRIDE = None
-
-def has_dml():
-    loader = importlib.find_loader('torch_directml')
-    if loader is None:
-        return False
-
-    import torch_directml
-    return torch_directml.is_available()
-
-def set_device_name(name):
-    global DEVICE_OVERRIDE
-    DEVICE_OVERRIDE = name
-
-def get_device_name():
-    global DEVICE_OVERRIDE
-    if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
-        return DEVICE_OVERRIDE
-
-    name = 'cpu'
-
-    if torch.cuda.is_available():
-        name = 'cuda'
-    elif has_dml():
-        name = 'dml'
-
-    return name
-
-def get_device(verbose=False):
-    name = get_device_name()
-
-    if verbose:
-        if name == 'cpu':
-            print("No hardware acceleration is available, falling back to CPU...")
-        else:
-            print(f"Hardware acceleration found: {name}")
-
-    if name == "dml":
-        import torch_directml
-        return torch_directml.device()
-
-    return torch.device(name)
-
-def get_device_batch_size():
-    available = 1
-    name = get_device_name()
-
-    if name == "dml":
-        # there's nothing publically accessible in the DML API that exposes this
-        # there's a method to get currently used RAM statistics... as tiles
-        available = 1
-    elif name == "cuda":
-        _, available = torch.cuda.mem_get_info()
-    elif name == "cpu":
-        available = psutil.virtual_memory()[4]
-
-    availableGb = available / (1024 ** 3)
-    if availableGb > 14:
-        return 16
-    elif availableGb > 10:
-        return 8
-    elif availableGb > 7:
-        return 4
-    return 1
-
-def get_device_count(name=get_device_name()):
-    if name == "cuda":
-        return torch.cuda.device_count()
-    if name == "dml":
-        import torch_directml
-        return torch_directml.device_count()
-
-    return 1
-
-
-if has_dml():
-    _cumsum = torch.cumsum
-    _repeat_interleave = torch.repeat_interleave
-    _multinomial = torch.multinomial
-
-    _Tensor_new = torch.Tensor.new
-    _Tensor_cumsum = torch.Tensor.cumsum
-    _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
-    _Tensor_multinomial = torch.Tensor.multinomial
-
-    torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
-    torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
-    torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
-
-    torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
-    torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
-    torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
+import torch
+import psutil
+import importlib
+
+DEVICE_OVERRIDE = None
+
+def has_dml():
+    loader = importlib.find_loader('torch_directml')
+    if loader is None:
+        return False
+
+    import torch_directml
+    return torch_directml.is_available()
+
+def set_device_name(name):
+    global DEVICE_OVERRIDE
+    DEVICE_OVERRIDE = name
+
+def get_device_name():
+    global DEVICE_OVERRIDE
+    if DEVICE_OVERRIDE is not None and DEVICE_OVERRIDE != "":
+        return DEVICE_OVERRIDE
+
+    name = 'cpu'
+
+    if torch.cuda.is_available():
+        name = 'cuda'
+    elif has_dml():
+        name = 'dml'
+
+    return name
+
+def get_device(verbose=False):
+    name = get_device_name()
+
+    if verbose:
+        if name == 'cpu':
+            print("No hardware acceleration is available, falling back to CPU...")
+        else:
+            print(f"Hardware acceleration found: {name}")
+
+    if name == "dml":
+        import torch_directml
+        return torch_directml.device()
+
+    return torch.device(name)
+
+def get_device_batch_size():
+    available = 1
+    name = get_device_name()
+
+    if name == "dml":
+        # there's nothing publicly accessible in the DML API that exposes this
+        # there's a method to get currently used RAM statistics... as tiles
+        available = 1
+    elif name == "cuda":
+        _,available = torch.cuda.mem_get_info()
+    elif name == "cpu":
+        available = psutil.virtual_memory()[4]
+
+    availableGb = available / (1024 ** 3)
+
+    print(f"Total device memory available: {availableGb}")
+    if availableGb > 18:
+        print(f"Setting AutoRegressive Batch Size to: 32")
+        print(f"Damn. Nice GPU Dude.")
+        return 32
+    elif availableGb > 14:
+        print(f"Setting AutoRegressive Batch Size to: 16")
+        return 16
+    elif availableGb > 10:
+        print(f"Setting AutoRegressive Batch Size to: 8")
+        return 8
+    elif availableGb > 7:
+        print(f"Setting AutoRegressive Batch Size to: 4")
+        return 4
+    print(f"Setting AutoRegressive Batch Size to: 1")
+    print(f"Don't cry about it if it doesn't work.")
+    return 1
+
+def get_device_count(name=get_device_name()):
+    if name == "cuda":
+        return torch.cuda.device_count()
+    if name == "dml":
+        import torch_directml
+        return torch_directml.device_count()
+
+    return 1
+
+
+if has_dml():
+    _cumsum = torch.cumsum
+    _repeat_interleave = torch.repeat_interleave
+    _multinomial = torch.multinomial
+
+    _Tensor_new = torch.Tensor.new
+    _Tensor_cumsum = torch.Tensor.cumsum
+    _Tensor_repeat_interleave = torch.Tensor.repeat_interleave
+    _Tensor_multinomial = torch.Tensor.multinomial
+
+    torch.cumsum = lambda input, *args, **kwargs: ( _cumsum(input.to("cpu"), *args, **kwargs).to(input.device) )
+    torch.repeat_interleave = lambda input, *args, **kwargs: ( _repeat_interleave(input.to("cpu"), *args, **kwargs).to(input.device) )
+    torch.multinomial = lambda input, *args, **kwargs: ( _multinomial(input.to("cpu"), *args, **kwargs).to(input.device) )
+
+    torch.Tensor.new = lambda self, *args, **kwargs: ( _Tensor_new(self.to("cpu"), *args, **kwargs).to(self.device) )
+    torch.Tensor.cumsum = lambda self, *args, **kwargs: ( _Tensor_cumsum(self.to("cpu"), *args, **kwargs).to(self.device) )
+    torch.Tensor.repeat_interleave = lambda self, *args, **kwargs: ( _Tensor_repeat_interleave(self.to("cpu"), *args, **kwargs).to(self.device) )
     torch.Tensor.multinomial = lambda self, *args, **kwargs: ( _Tensor_multinomial(self.to("cpu"), *args, **kwargs).to(self.device) )
\ No newline at end of file
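
For context, a minimal usage sketch of the measurement setup described in the commit message: clear the CUDA cache just before loading the autoregressive model, then size the batch from whatever VRAM is actually free. The `build_autoregressive_model` callable is a hypothetical placeholder, not part of this patch; `torch.cuda.empty_cache()`, `get_device()`, and `get_device_batch_size()` are the real calls involved.

```python
# Hypothetical sketch, not part of the patch: combining empty_cache() with
# the new VRAM-based autoregressive batch sizing.
import torch

from tortoise.utils.device import get_device, get_device_batch_size

def prepare_autoregressive(build_autoregressive_model):
    # build_autoregressive_model is a placeholder for however the caller
    # constructs the model; it is not provided by this patch.
    if torch.cuda.is_available():
        # release cached allocator blocks so torch.cuda.mem_get_info()
        # reports more free VRAM before the batch size is chosen
        torch.cuda.empty_cache()

    batch_size = get_device_batch_size()  # 32 above 18 GB free, 16 above 14 GB, etc.
    model = build_autoregressive_model().to(get_device())
    return model, batch_size
```

Note that `empty_cache()` only releases blocks held by PyTorch's caching allocator; memory still owned by loaded models is unaffected, which is why the cache is cleared before the autoregressive model is loaded rather than after.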