diff --git a/README.md b/README.md
index ff4249a..38fc2ca 100755
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ I ***do not*** plan on making this uber-user friendly like [mrq/ai-voice-cloning
 
 ## Caveats
 
-A local LM is quite slow. Things seem to be getting faster as llama.cpp is being developed.
+A local LM is quite slow. Things seem to be getting faster as llama.cpp is being developed. GPU offloading (and the OpenCL PR) looks promising enough that it may be worth scrapping the Python implementation and integrating this entirely in C++.
 
 Even using one that's more instruction-tuned like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` structure of prompts), it's still inconsistent.
 
diff --git a/src/ext/generative_agent.py b/src/ext/generative_agent.py
index d506eec..312bda7 100755
--- a/src/ext/generative_agent.py
+++ b/src/ext/generative_agent.py
@@ -122,14 +122,12 @@ class GenerativeAgent(BaseModel):
         memory = ""
 
         for mem in relevant_memories:
-            if mem in summary or mem in memory:
+            if mem in summary or mem in memory or mem in observation:
                 continue
             memory += f"\n{mem}"
         
         for mem in recent_memories:
-            if mem in summary:
-                continue
-            if mem is observation:
+            if mem in summary or mem in observation:
                 continue
             # erase it, move it to bottom
             if mem in memory:
@@ -169,7 +167,7 @@ class GenerativeAgent(BaseModel):
 
         if self.verbose:
             print(reaction)
-        return reaction
+        return f'{self.name}: {reaction}'
 
     def generate_response(self, observation: str) -> Tuple[bool, str]:
         """React to a given observation."""
@@ -195,7 +193,7 @@ class GenerativeAgent(BaseModel):
         summary = self.chain(prompt).run(stop=get_stop_tokens(), name=self.name, summary=self.summaries[-1] if len(self.summaries) else self.summary, queries=[f"{self.name}'s core characteristics"]).strip()
         if self.verbose:
             print(summary)
-        return summary
+        return f'{self.name} {summary}'
 
     def get_summary(self, force_refresh: bool = False) -> str:
         """Return a descriptive summary of the agent."""
diff --git a/src/ext/prompts.py b/src/ext/prompts.py
index 028681f..84f063b 100755
--- a/src/ext/prompts.py
+++ b/src/ext/prompts.py
@@ -61,8 +61,7 @@ PROMPTS = {
     },
     "generate_reaction": {
         "system": (
-            "[Write one reply. Always stay in character. Maintain a casual tone using beige prose. Be brief. Avoid repeating anything below.]"
-            "\nCurrent Time: {current_time}"
+            #"\nCurrent Time: {current_time}" # commented out: not necessary while I'm not passing the time anyway; I think only bigger LLMs would take advantage of it, and a changing timestamp would ruin llama's prompt caching
             "\n{summary}"
             "\n{memory}"
             "\n{observation}"
@@ -70,7 +69,7 @@ PROMPTS = {
         "user": (
             "{suffix}"
         ),
-        "assistant": ""
+        "assistant": "{name}: "
     },
 
     #
@@ -88,6 +87,7 @@ PROMPT_TUNES = {
     "supercot": "{role}:\n{query}",
     "alpasta": "{role}# {query}",
     "cocktail": "{role}: {query}",
+    "wizard-vicuna": "{role}: {query}",
 }
 PROMPT_ROLES = {
     "vicuna": {
@@ -100,6 +100,11 @@ PROMPT_ROLES = {
         "user": "### Input",
         "assistant": "### Response",
     },
+    "wizard-vicuna": {
+        "system": "### Instruction",
+        "user": "### Input",
+        "assistant": "### Response",
+    },
     "alpasta": {
         "system": "<|system|>",
         "user": "<|user|>",
@@ -173,4 +178,5 @@ def get_prompt( key, tune=LLM_PROMPT_TUNE ):
     output = "\n".join(outputs)
     #if LLM_PROMPT_TUNE == "cocktail":
     output = output.strip()
+    print([output[-1]])
     return output
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index 6639fe0..e862a1b 100755
--- a/src/main.py
+++ b/src/main.py
@@ -78,7 +78,7 @@ def get_summary_proxy( agents ):
 		messages.append(get_summary( agent, force_refresh = True ))
 	return "\n".join(messages)
 
-def run_conversation_proxy( agents, message, limit=4 ):
+def run_conversation_proxy( agents, message, limit=32 ):
 	agents = [ AGENTS[agent] for agent in agents ]
 
 	if len(agents) < 2:
@@ -95,7 +95,6 @@ def run_conversation_proxy( agents, message, limit=4 ):
 		importance_score = agent_observes( agent, [ message ], importance_score=importance_score )[0][0]
 	agents = agents[1:] + [agents[0]]
 
-	dialogue = []
 	while True:
 		for agent in agents:
 			message = agent_reacts( agent, [ message ] )[0]
diff --git a/src/utils.py b/src/utils.py
index 0a3fd94..2487d3a 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -27,28 +27,33 @@ from langchain.vectorstores import FAISS
 LLM_TYPE = os.environ.get('LLM_TYPE', "llamacpp") # options: llamacpp, oai
 LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL', 
 	#"./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin"
-	"./models/ggml-vicuna-13b-cocktail-v1-q5_0.bin"
+	#"./models/ggml-vicuna-13b-cocktail-v1-q5_0.bin"
+	#"./models/WizardML-Unc-13b-Q5_1.bin"
 	#"./models/llama-13b-supercot-ggml/ggml-model-q4_2.bin"
 	#"./models/llama-33b-supercot-ggml/ggml-model-q4_2.bin"
 	#"./models/gpt4-x-alpasta-30b-ggml-q4_1.bin"
+
+	"./models/Wizard-Vicuna-13B-Uncensored.ggml.q5_1.bin"
+	#"./models/wizardlm-13b-uncensored-ggml-q5_1.bin"
 )
 LLM_CONTEXT = int(os.environ.get('LLM_CONTEXT', '2048'))
 LLM_THREADS = int(os.environ.get('LLM_THREADS', '6'))
+LLM_GPU_LAYERS = int(os.environ.get('LLM_GPU_LAYERS', '99'))
 LLM_TEMPERATURE = float(os.environ.get('LLM_TEMPERATURE', '0.99'))
 EMBEDDING_TYPE = os.environ.get("LLM_EMBEDDING_TYPE", "hf") # options: llamacpp, oai, hf
 
-#LLM_TYPE="oai"
-#os.environ['OPENAI_API_BASE']="https://oai.ecker.tech/proxy/openai"
-#os.environ['OPENAI_API_KEY']=""
-
 # deduce a default given a model path
 if LLM_TYPE=="oai":
 	LLM_PROMPT_TUNE_DEFAULT = "oai"
 else:
 	if "supercot" in LLM_LOCAL_MODEL.lower():
 		LLM_PROMPT_TUNE_DEFAULT = "supercot"
+	if "wizard-vicuna" in LLM_LOCAL_MODEL.lower():
+		LLM_PROMPT_TUNE_DEFAULT = "wizard-vicuna"
 	elif "vicuna" in LLM_LOCAL_MODEL.lower():
 		LLM_PROMPT_TUNE_DEFAULT = "vicuna"
+	elif "wizard" in LLM_LOCAL_MODEL.lower():
+		LLM_PROMPT_TUNE_DEFAULT = "vicuna"
 	elif "alpasta" in LLM_LOCAL_MODEL.lower():
 		LLM_PROMPT_TUNE_DEFAULT = "alpasta"
 	elif "cocktail" in LLM_LOCAL_MODEL.lower():
@@ -72,6 +77,7 @@ if LLM_TYPE=="llamacpp":
 		callback_manager=callback_manager,
 		verbose=True,
 		n_ctx=LLM_CONTEXT,
+		n_gpu_layers=LLM_GPU_LAYERS,
 		temperature=LLM_TEMPERATURE,
 		#n_threads=LLM_THREADS,
 		#use_mlock=True,
@@ -176,8 +182,9 @@ def save_agent( agent ):
 		"status": agent.status,
 		"summary": agent.summary,
 		"summaries": agent.summaries,
-		"memory_retriever": agent.memory.memory_retriever,
+		"memories": [ memory for memory in agent.memory.memory_retriever.memory_stream ]
 	}
+
 	path = f"./agents/{agent.name}.pth"
 	pickle.dump(obj, open(path, 'wb'))
 	print(f"Saved agent:", path)
@@ -186,7 +193,11 @@ def load_agent( name ):
 	path = f"./agents/{name}.pth"
 	obj = pickle.load(open(path, 'rb'))
 	agent = create_agent(**obj)
-	agent.memory.memory_retriever = obj["memory_retriever"]
+	#agent.memory.memory_retriever.memory_stream = obj["memories"]
+	for memory in obj["memories"]:
+		print("Loaded memory:", memory)
+		agent.memory.memory_retriever.add_documents([memory])
+
 	print(f"Loaded agent:", path)
 	return agent