From c9ec6b28ef7d2f7aada381b1ce7660db27be02f2 Mon Sep 17 00:00:00 2001
From: mrq
Date: Thu, 1 Aug 2024 20:56:28 -0500
Subject: [PATCH] it actually wasn't working because Engines.__init__()
 automatically moves the entire module to the requested device, which was
 being called after offloading the model in the test trainer (and it seems I
 cant do it without injecting a bunch of shit in modeling_llama.py)

---
 vall_e/config.py        |  1 +
 vall_e/models/ar_nar.py | 11 ++++++-----
 vall_e/utils/utils.py   | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/vall_e/config.py b/vall_e/config.py
index c0e0732..b026f6c 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -683,6 +683,7 @@ class Optimizations:
 	model_offloading: dict | None = None # automatically splits the model over a list of devices
 	# example: {"include":["model"], "limits": [ (6 * 1024) * (1024 ** 2), -1 ]} will have the GPU capped to 6GiB, and offload the remaining layers to CPU
 	# example: {"include":["model"], "device": ["cuda:0", "cuda:1"], "limits": [ 0.5, 0.5 ]} will have the GPU 1 try and use 50% of the model, and GPU 2 try and use the other 50%
+	#          | {"assign": [[ f'layers.{i}.' for i in range(0,6) ], [ f'layers.{i}.' for i in range(6,12) ]]} will assign layers 0-5 to device 1, and 6-12 to device 2

 @dataclass()
 class Config(BaseConfig):
diff --git a/vall_e/models/ar_nar.py b/vall_e/models/ar_nar.py
index d44ea2a..83d4f9d 100644
--- a/vall_e/models/ar_nar.py
+++ b/vall_e/models/ar_nar.py
@@ -526,17 +526,18 @@ def example_usage():
 	"""
 	cfg.optimizations.model_offloading = {
 		"devices": ["cuda:0", "cpu"],
-		"limits": [ 0.5, -1 ]
-		# "limits": [ 256 * (1024 ** 2), -1 ]
+		# "limits": [ 0.9, -1 ],
+		"assign": [[ f'layers.{i}.' for i in range(0,6) ], [ f'layers.{i}.' for i in range(6,12) ]],
+		"limits": [ 256 * (1024 ** 2), -1 ]
 	}
 	"""
-	if cfg.optimizations.model_offloading:
-		model = ml.offload_model( model, policy=cfg.optimizations.model_offloading )

 	engine = Engine(model=model, optimizer=optimizer)
-	
 	engines = Engines({"ar+nar": engine})
 	engines.setup()
+	
+	if cfg.optimizations.model_offloading:
+		model = ml.offload_model( model, policy=cfg.optimizations.model_offloading )

 	"""
 	torch.save( {
diff --git a/vall_e/utils/utils.py b/vall_e/utils/utils.py
index 366eb2f..4faead2 100755
--- a/vall_e/utils/utils.py
+++ b/vall_e/utils/utils.py
@@ -394,12 +394,18 @@ def get_model_offload_policy(module, policy=None):
 	# default to only include the core model, and not the other modules (embeddings) in the splitting policy
 	if "include" not in policy:
 		policy["include"] = ["model"]
+
 	if "limits" not in policy:
 		policy["limits"] = []

+	if "assign" not in policy:
+		policy["assign"] = []
+
 	if "devices" not in policy:
 		policy["devices"] = [f'{"cuda"}:{i}' for i in range(torch.cuda.device_count())] + ['cpu'] # + cpu to spill the remainder on CPU if overbudget

+	print( policy )
+
 	# create initial device info
 	devices = [ get_device_properties(device) | {"modules": []} for device in policy["devices"] ]
 	modules = [ (name, get_module_size(module)) for name, module in module.named_modules() if not [*module.named_children()] and passes_policy( policy, name ) ]
@@ -422,8 +428,42 @@ def get_model_offload_policy(module, policy=None):
 		# cap to requested size
 		devices[i]["free"] = cap

+	# assign if specific parts of the model are requested for assignment
+	if policy["assign"]:
+		discarded = []
+		# yuck, there has to be a better way
+		for device_index, includes in enumerate( policy["assign"] ):
+			device = devices[device_index]
+
+			buffered_modules = []
+			buffered_size = device["free"]
+
+			# iterate through list of modules to compare against includes
+			for name, size in modules:
+				# doesn't pass policy
+				if not passes_policy( {"include": includes}, name ):
+					continue
+				# check if within budget
+				if buffered_size - size >= 0:
+					# add to buffer
+					buffered_modules.append( name )
+					buffered_size -= size
+				# budget exceeded, flush buffer
+				else:
+					discarded += buffered_modules
+					buffered_modules = []
+					buffered_size = 0
+					break
+
+			if buffered_modules and buffered_size:
+				device["modules"] += buffered_modules
+				device["free"] = buffered_size
+
+		modules = discarded
+
 	device_index = 0
 	module_index = 0
+	# assign modules to each device
 	while module_index < len(modules) and device_index < len(devices):
 		device = devices[device_index]
 		name, size = modules[module_index]
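
A minimal, self-contained sketch (not part of the commit itself) of the two policy shapes get_model_offload_policy() accepts after this change. Only the dict keys ("include", "devices", "limits", "assign") and their meanings come from the patch; the variable names below are illustrative, and note that range(6,12) covers layers 6 through 11.

# split by size: cap cuda:0 at 256 MiB and spill the remainder onto the CPU
size_policy = {
	"include": ["model"],                  # default scope: only the core model, not the embeddings
	"devices": ["cuda:0", "cpu"],
	"limits": [ 256 * (1024 ** 2), -1 ],   # bytes or a 0..1 fraction per device; -1 = no cap
}

# split by name: pin layers 0-5 to the first device and layers 6-11 to the second,
# matching leaf modules by name prefix; "limits" still governs whatever "assign" leaves over
assign_policy = {
	"devices": ["cuda:0", "cpu"],
	"assign": [
		[ f'layers.{i}.' for i in range(0,6) ],
		[ f'layers.{i}.' for i in range(6,12) ],
	],
	"limits": [ 256 * (1024 ** 2), -1 ],
}

Per the commit message, ml.offload_model( model, policy=... ) has to run after Engines.setup(), since Engines.__init__() moves the whole module back onto a single device and would otherwise undo the split; the ar_nar.py hunk above reorders the test trainer accordingly.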
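
And a rough standalone sketch of the assignment pass itself, showing how the new block in get_model_offload_policy() distributes named leaf modules: each device greedily takes matching modules while they fit its free budget, and anything it had to buffer before running out is handed back for the existing size-based loop. This is an illustration only, not the patch's code; passes_policy() is approximated here by a plain prefix match, which is an assumption.

# toy re-implementation of the "assign" pass; mirrors its structure, not the real helpers
def assign_modules( modules, devices, assign ):
	# modules: list of (name, size); devices: dicts with "free" (bytes) and "modules" (names)
	discarded = []
	for device, includes in zip( devices, assign ):
		buffered, free = [], device["free"]
		for name, size in modules:
			# approximation of passes_policy(): match leaf modules by name prefix
			if not any( name.startswith( prefix ) for prefix in includes ):
				continue
			if free - size >= 0:
				buffered.append( name )
				free -= size
			else:
				# over budget: return the whole buffer to the general pool
				discarded += buffered
				buffered, free = [], 0
				break
		if buffered and free:
			device["modules"] += buffered
			device["free"] = free
	return discarded  # left for the size-based distribution loop

modules = [ (f'layers.{i}.', 10) for i in range(12) ]
devices = [ {"free": 100, "modules": []}, {"free": 1000, "modules": []} ]
assign  = [ [ f'layers.{i}.' for i in range(0,6) ], [ f'layers.{i}.' for i in range(6,12) ] ]
leftover = assign_modules( modules, devices, assign )
print( devices[0]["modules"] )  # ['layers.0.', 'layers.1.', ..., 'layers.5.']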