From f10ea1ec2a0c207d2420935ca246a72b5966b14e Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 30 Apr 2023 22:56:02 +0000
Subject: [PATCH] added prompt tuning for SuperCOT (the 33B variant seems to be the best approach for a local LLM)

---
 README.md                   |   8 +-
 src/ext/__init__.py         |   1 +
 src/ext/generative_agent.py |  42 +++-
 src/ext/memory.py           |   8 +-
 src/ext/prompts.py          | 389 +++++++++++++++++++++++++-----------
 src/main.py                 |  10 +-
 src/utils.py                |  40 ++--
 7 files changed, 343 insertions(+), 155 deletions(-)

diff --git a/README.md b/README.md
index 8cf0ebd..aeb8df8 100755
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Set your environment variables accordingly:
   - `OPENAI_API_MODEL`: target model
 * `LLM_MODEL`: (`./path/to/your/llama/model.bin`): path to your GGML-formatted LLaMA model, if using `llamacpp` as the LLM backend
 * `LLM_EMBEDDING_TYPE`: (`oai`, `llamacpp`, `hf`): the embedding model to use for similarity computing.
-* `LLM_PROMPT_TUNE`: (`oai`, `vicuna`): prompt formatting to use, for variants with specific finetunes for instructions, etc.
+* `LLM_PROMPT_TUNE`: (`oai`, `vicuna`, `supercot`): prompt formatting to use, for variants with specific finetunes for instructions, etc.
 * `LLM_CONTEXT`: sets maximum context size

 To run:
@@ -44,6 +44,10 @@ I ***do not*** plan on making this uber-user friendly like [mrq/ai-voice-cloning

 ## Caveats

-A local LM is quite slow. Even using one that's more instruction-tuned like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` structure of prompts) is still inconsistent.
+A local LM is quite slow.
+
+Even one that's more instruction-tuned, like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` prompt structure), is still inconsistent.
+
+However, I seem to be getting consistent results with SuperCOT 33B; it's just, well, slow.

 GPT4 seems to Just Work, unfortunately.
\ No newline at end of file
diff --git a/src/ext/__init__.py b/src/ext/__init__.py
index 9be1e02..ac5b5c7 100755
--- a/src/ext/__init__.py
+++ b/src/ext/__init__.py
@@ -25,5 +25,6 @@ THE SOFTWARE.
"""Generative Agents primitives.""" from .generative_agent import GenerativeAgent from .memory import GenerativeAgentMemory +from .prompts import get_prompt, get_roles __all__ = ["GenerativeAgent", "GenerativeAgentMemory"] diff --git a/src/ext/generative_agent.py b/src/ext/generative_agent.py index d855035..67efd2c 100755 --- a/src/ext/generative_agent.py +++ b/src/ext/generative_agent.py @@ -84,17 +84,25 @@ class GenerativeAgent(BaseModel): def _get_entity_from_observation(self, observation: str) -> str: prompt = PromptTemplate.from_template(get_prompt('entity_from_observation')) - return self.chain(prompt).run(observation=observation).strip() + response = self.chain(prompt).run(observation=observation).strip().replace("Entity=", "").replace("Entity: ", "") # OAI will keep this + if self.verbose: + print(response) + return response def _get_entity_action(self, observation: str, entity_name: str) -> str: prompt = PromptTemplate.from_template(get_prompt('entity_action')) - return self.chain(prompt).run(entity=entity_name, observation=observation).strip() + response = self.chain(prompt).run(entity=entity_name, observation=observation).strip() + if self.verbose: + print(response) + return response def summarize_related_memories(self, observation: str) -> str: """Summarize memories that are most relevant to an observation.""" prompt = PromptTemplate.from_template(get_prompt('summarize_related_memories')) - entity_name = self._get_entity_from_observation(observation).split("\n")[0] + entity_name = self._get_entity_from_observation(observation).split("\n")[0].strip() q1 = f"What is the relationship between {self.name} and {entity_name}" + if self.name.strip() == entity_name: + return "" # this is unused, so ignore for now """ @@ -103,6 +111,8 @@ class GenerativeAgent(BaseModel): summary = self.chain(prompt=prompt).run(q1=q1, queries=[q1, q2]).strip() """ summary = self.chain(prompt=prompt).run(q1=q1, queries=[q1]).strip() + if self.verbose: + print(summary) return summary #return self.chain(prompt=prompt).run(q1=q1, q2=q2).strip() @@ -128,7 +138,10 @@ class GenerativeAgent(BaseModel): consumed_tokens = self.llm.get_num_tokens(formatted_prompt) kwargs[self.memory.most_recent_memories_token_key] = consumed_tokens - return self.chain(prompt=prompt).run(**kwargs).strip() + reaction = self.chain(prompt=prompt).run(**kwargs).strip() + if self.verbose: + print(reaction) + return reaction def _clean_response(self, text: str) -> str: return re.sub(f"^{self.name} ", "", text.strip()).strip() @@ -138,18 +151,20 @@ class GenerativeAgent(BaseModel): full_result = self._generate_reaction(observation, get_prompt('suffix_generate_reaction')) candidates = full_result.replace(u"\u200B", "").strip().split("\n") - result = "" + response = "" results = [] for candidate in candidates: if "REACT:" in candidate or "SAY:" in candidate: - candidate = candidate.strip() - results.append(f'reacted by {candidate}'.replace("SAY:", "saying").replace("reacted by REACT: ", "")) + # can't be assed to iteratively replace + candidate = candidate.strip().replace("React:", "REACT:").replace("Say:", "SAY:") + results.append(f'{candidate}'.replace("SAY:", "said").replace(f"REACT: {self.name}", "").replace("REACT:", "")) if len(results) > 0: - result = "and".join(results) - response = f"reacted by {result}" + response = "and".join(results).strip().replace(" ", " ") + valid = True else: - response = f"did not react" + response = f"did not react in a relevant way" + valid = False # AAA self.memory.save_context( @@ -158,6 +173,10 @@ class 
GenerativeAgent(BaseModel): self.memory.add_memory_key: f"{self.name} observed {observation} and {response}" }, ) + + return valid, f"{self.name} {response}" + + """ if "REACT:" in result: reaction = self._clean_response(result.split("REACT:")[-1]) return True, f"{self.name} {reaction}" @@ -166,6 +185,7 @@ class GenerativeAgent(BaseModel): return True, f"{self.name} said {said_value}" else: return False, f"{self.name} did not react in a relevant way" + """ def generate_dialogue_response(self, observation: str) -> Tuple[bool, str]: """React to a given observation.""" @@ -206,6 +226,8 @@ class GenerativeAgent(BaseModel): # The agent seeks to think about their core characteristics. prompt = PromptTemplate.from_template(get_prompt('compute_agent_summary')) summary = self.chain(prompt).run(name=self.name, queries=[f"{self.name}'s core characteristics"]).strip() + if self.verbose: + print(summary) return summary def get_summary(self, force_refresh: bool = False) -> str: diff --git a/src/ext/memory.py b/src/ext/memory.py index fb79cb8..678c1a2 100755 --- a/src/ext/memory.py +++ b/src/ext/memory.py @@ -85,6 +85,9 @@ class GenerativeAgentMemory(BaseMemory): observations = self.memory_retriever.memory_stream[-last_k:] observation_str = "\n".join([o.page_content for o in observations]) result = self.chain(prompt).run(observations=observation_str) + if self.verbose: + print(result) + return self._parse_list(result) def _get_insights_on_topic(self, topic: str) -> List[str]: @@ -121,14 +124,15 @@ class GenerativeAgentMemory(BaseMemory): prompt = PromptTemplate.from_template(get_prompt("memory_importance")) score = self.chain(prompt).run(memory_content=memory_content).strip() if self.verbose: - logger.info(f"Importance score: {score}") + print(f"Importance score: {score}") try: match = re.search(r"(\d+)", score) if match: return (float(match.group(0)) / 10) * self.importance_weight except Exception as e: print(colored("[Scoring Error]", "red"), score) - return 0.0 + + return (float(2) / 10) * self.importance_weight def add_memory(self, memory_content: str) -> List[str]: """Add an observation or memory to the agent's memory.""" diff --git a/src/ext/prompts.py b/src/ext/prompts.py index 69ac3e6..b9f8555 100755 --- a/src/ext/prompts.py +++ b/src/ext/prompts.py @@ -2,127 +2,265 @@ import os LLM_PROMPT_TUNE = os.environ.get('LLM_PROMPT_TUNE', "vicuna") # oai, vicuna -PROMPTS = { - "entity_from_observation": { - "system": ( - "What is the observed entity in the following observation?" - " ONLY report one object and write one sentence." - " Write `END` afterwards." - ), - "user": ( - "Observation: {observation}" - ), - "assistant": "Entity=", - }, - "entity_action": { - "system": ( - "What is `{entity}` doing in the following observation?" - " ONLY report one object and write one sentence." - " Write `END` afterwards." - ), - "user": ( - "Observation: {observation}" - ), - "assistant": "`{entity}` is ", - }, - "summarize_related_memories": { - "system": ( - "Given the following context, answer the following question in four sentences or less." - " Write `END` afterwards." - ), - "user": ( - "{q1}?" - "\nContext: {relevant_memories_simple}" - ), - "assistant": "Relevant context: ", - }, - "compute_agent_summary": { - "system": ( - "Given the following statements, how would you summarize {name}'s core characteristics?" - " Do not embellish under any circumstances." - " Write `END` afterwards." 
- ), - "user": ( - "Statements: {relevant_memories_simple}" - ), - "assistant": "Summary: ", - }, - "topic_of_reflection": { - "system": ( - "Given only the following information, what are the 3 most salient" - " high-level questions we can answer about the subjects in the statements?" - " Provide each question on a new line." - ), - "user": ( - "Information: {observations}" - ), - "assistant": "", - }, - "insights_on_topic": { - "system": ( - "Given the following statements about {topic}," - " what 5 high-level insights can you infer?" - " (example format: insight (because of 1, 5, 3))" - ), - "user": ( - "Statements: {related_statements}" - ), - "assistant": "", - }, - "memory_importance": { - "system": ( - "On the scale of 1 to 10, where 1 is purely mundane" - " (e.g., brushing teeth, making bed) and 10 is extremely poignant" - " (e.g., a break up, college acceptance)," - " rate the likely poignancy of the following piece of memory." - " Respond with only a single integer followed by 'END'." - ), - "user": ( - "Memory: {memory_content}" - ), - "assistant": "Rating: ", - }, - "generate_reaction": { - "system": ( - "It is {current_time}." - " The following is a description of {agent_name}:" - "\n{agent_summary_description}" - "\n{agent_name}'s status: {agent_status}" - "\nSummary of relevant context from {agent_name}'s memory: {relevant_memories}" - "\nMost recent observations: {most_recent_memories}" - "\n\n{suffix}" - ), - "user": ( - "Observation: {observation}" - ), - "assistant": "" - }, +# split because I can't prematurely end on the END token like I can with a local LM +if LLM_PROMPT_TUNE == "oai": + PROMPTS = { + "entity_from_observation": { + "system": ( + "What is the observed entity in the following observation?" + " ONLY report one object and write one sentence." + ), + "user": ( + "Observation: {observation}" + ), + "assistant": "Entity=", + }, + "entity_action": { + "system": ( + "What is `{entity}` doing in the following observation?" + " ONLY report one object and write one sentence." + ), + "user": ( + "Observation: {observation}" + ), + "assistant": "`{entity}` is ", + }, + "summarize_related_memories": { + "system": ( + "Given the following context, answer the following question in four sentences or less. Summarize the answer as well." + ), + "user": ( + "{q1}?" + "\nContext: {relevant_memories_simple}" + ), + "assistant": "Summary of relevant context: ", + }, + "compute_agent_summary": { + "system": ( + "Given the following statements, how would you summarize {name}'s core characteristics?" + " Do not embellish under any circumstances." + ), + "user": ( + "Statements: {relevant_memories_simple}" + ), + "assistant": "Summary: ", + }, + "topic_of_reflection": { + "system": ( + "Given only the following information, what are the 3 most salient" + " high-level questions we can answer about the subjects in the statements?" + " Provide each question on a new line." + ), + "user": ( + "Information: {observations}" + ), + "assistant": "", + }, + "insights_on_topic": { + "system": ( + "Given the following statements about {topic}," + " what 5 high-level insights can you infer?" + " (example format: insight (because of 1, 5, 3))" + ), + "user": ( + "Statements: {related_statements}" + ), + "assistant": "", + }, + "memory_importance": { + "system": ( + "On the scale of 1 to 10, where 1 is purely mundane" + " (e.g., brushing teeth, making bed) and 10 is extremely poignant" + " (e.g., a break up, college acceptance)," + " rate the likely poignancy of the following piece of memory." 
+ " Respond with only a single integer." + ), + "user": ( + "Memory: {memory_content}" + ), + "assistant": "Rating: ", + }, + "generate_reaction": { + "system": ( + "It is {current_time}." + " The following is a description of {agent_name}:" + "\n{agent_summary_description}" + "\n{agent_name}'s status: {agent_status}" + "\nSummary of relevant context from {agent_name}'s memory: {relevant_memories}" + "\nMost recent observations: {most_recent_memories}" + "\n\n{suffix}" + ), + "user": ( + "Observation: {observation}" + ), + "assistant": "" + }, - # - "context": ( - "" # insert your JB here - ), - "suffix_generate_reaction": ( - "Given the following observation, in one sentence, how would {agent_name} appropriately react?" - "\nIf the action is to engage in dialogue, write `SAY: \"what to say\"`." - "\nOtherwise, write `REACT: {agent_name}'s reaction`." - "\nWrite 'END' afterwards." - ), - "suffix_generate_dialogue_response": ( - "Given the following observation, in one sentence, what would {agent_name} say?" - "\nTo continue the conversation, write: `SAY: \"what to say\"`." - "\nOtherwise, to end the conversation, write: `GOODBYE: \"what to say\"`." - "\nWrite \"END\" afterwards." - ), -} + # + "context": ( # insert your JB here + "" + ), + "suffix_generate_reaction": ( + "Given the following observation, in one sentence, how would {agent_name} appropriately react?" + "\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition." + "\nIf the action is to engage in dialogue, write `SAY: \"what to say\"`." + "\nOtherwise, write `REACT: {agent_name}'s reaction`." + ), + "suffix_generate_dialogue_response": ( + "\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition." + "Given the following observation, in one sentence, what would {agent_name} say?" + "\nTo continue the conversation, write: `SAY: \"what to say\"`." + "\nOtherwise, to end the conversation, write: `GOODBYE: \"what to say\"`." + ), + } +else: + PROMPTS = { + "entity_from_observation": { + "system": ( + "What is the observed entity in the following observation?" + " ONLY report one object and write one sentence." + " Write `END` afterwards." + ), + "user": ( + "Observation: {observation}" + ), + "assistant": "Entity=", + }, + "entity_action": { + "system": ( + "What is `{entity}` doing in the following observation?" + " ONLY report one object and write one sentence." + " Write `END` afterwards." + ), + "user": ( + "Observation: {observation}" + ), + "assistant": "`{entity}` is ", + }, + "summarize_related_memories": { + "system": ( + "Given the following context, answer the following question in four sentences or less. Summarize the answer as well." + " Write `END` afterwards." + "\nContext: {relevant_memories_simple}" + ), + "user": ( + "{q1}?" + ), + "assistant": "Summary of relevant context: ", + }, + "compute_agent_summary": { + "system": ( + "Given the following statements, how would you summarize {name}'s core characteristics?" + " Do not embellish under any circumstances." + " Write `END` afterwards." 
+ ), + "user": ( + "Statements: {relevant_memories_simple}" + ), + "assistant": "Summary: ", + }, + "topic_of_reflection": { + "system": ( + "Given only the following information, what are the 3 most salient" + " high-level questions we can answer about the subjects in the statements?" + " Provide each question on a new line." + ), + "user": ( + "Information: {observations}" + ), + "assistant": "", + }, + "insights_on_topic": { + "system": ( + "Given the following statements about {topic}," + " what 5 high-level insights can you infer?" + " (example format: insight (because of 1, 5, 3))" + ), + "user": ( + "Statements: {related_statements}" + ), + "assistant": "", + }, + "memory_importance": { + "system": ( + "On the scale of 1 to 10, where 1 is purely mundane" + " (e.g., brushing teeth, making bed) and 10 is extremely poignant" + " (e.g., a break up, college acceptance)," + " rate the likely poignancy of the following piece of memory." + " Respond with only a single integer followed by 'END'." + ), + "user": ( + "Memory: {memory_content}" + ), + "assistant": "Rating: ", + }, + "generate_reaction": { + "system": ( + "It is {current_time}." + " The following is a description of {agent_name}:" + "\n{agent_summary_description}" + "\n{agent_name}'s status: {agent_status}" + "\nSummary of relevant context from {agent_name}'s memory: {relevant_memories}" + "\nMost recent observations: {most_recent_memories}" + "\n\n{suffix}" + ), + "user": ( + "Observation: {observation}" + ), + "assistant": "" + }, + + # + "context": ( + "" # insert your JB here + ), + "suffix_generate_reaction": ( + "Given the following observation, in one sentence, how would {agent_name} appropriately react?" + "\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition." + "\nIf the action is to engage in dialogue, write `SAY: \"what to say\"`." + "\nOtherwise, write `REACT: {agent_name}'s reaction`." + "\nWrite 'END' afterwards." + ), + "suffix_generate_dialogue_response": ( + "Given the following observation, in one sentence, what would {agent_name} say?" + "\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition." + "\nTo continue the conversation, write: `SAY: \"what to say\"`." + "\nOtherwise, to end the conversation, write: `GOODBYE: \"what to say\"`." + "\nWrite \"END\" afterwards." 
+		),
+	}

 PROMPT_TUNES = {
 	"default": "{query}",
-	"vicuna": "{ROLE}: {query}"
+	"vicuna": "{role}: {query}",
+	"supercot": "{role}:\n{query}",
+}
+PROMPT_ROLES = {
+	"vicuna": {
+		"system": "SYSTEM",
+		"user": "USER",
+		"assistant": "ASSISTANT",
+	},
+	"supercot": {
+		"system": "### Instruction",
+		"user": "### Input",
+		"assistant": "### Response",
+	}
 }
 ROLES = [ "system", "user", "assistant" ]
+
+def get_roles( tune=LLM_PROMPT_TUNE, special=True ):
+	if tune in PROMPT_ROLES:
+		return list(PROMPT_ROLES[tune].values())
+	if special:
+		return []
+	return ROLES
+
 def get_prompt( key, tune=LLM_PROMPT_TUNE ):
 	prompt = PROMPTS[key]
@@ -134,20 +272,29 @@ def get_prompt( key, tune=LLM_PROMPT_TUNE ):
 	if tune not in PROMPT_TUNES:
 		tune = "default"

+	context = PROMPTS["context"]
+	if context:
+		if "system" in prompt:
+			if context not in prompt["system"]:
+				prompt["system"] = f'{context}\n{prompt["system"]}'
+		else:
+			prompt["system"] = f'{context}'
+
 	outputs = []
-	for role in ROLES:
+	for r in ROLES:
+		role = r # strings are immutable in Python, so no copy is needed
 		if role not in prompt:
-			# implicitly add in our context as a system message
-			if role == "system" and PROMPTS["context"]:
-				query = PROMPTS["context"]
-			else:
-				continue
+			continue
 		else:
			query = prompt[role]
+
+		if tune in PROMPT_ROLES:
+			roles = PROMPT_ROLES[tune]
+			if role in roles:
+				role = roles[role]
 		output = f'{PROMPT_TUNES[tune]}'
-		output = output.replace("{role}", role.lower())
-		output = output.replace("{ROLE}", role.upper())
+		output = output.replace("{role}", role)
 		output = output.replace("{query}", query)
 		outputs.append(output)
diff --git a/src/main.py b/src/main.py
index bb05c26..a978a8f 100755
--- a/src/main.py
+++ b/src/main.py
@@ -38,6 +38,8 @@ def agent_observes_proxy( agents, observations ):
 	messages = []
 	for agent in agents:
+		if agent not in AGENTS:
+			load_agent( agent )
 		agent = AGENTS[agent]
 		observations = observations.split("\n")
 		results = agent_observes( agent, observations )
@@ -50,6 +52,8 @@ def interview_agent_proxy( agents, message ):
 	messages = []
 	for agent in agents:
+		if agent not in AGENTS:
+			load_agent( agent )
 		agent = AGENTS[agent]
 		messages.append(interview_agent( agent, message )[-1])
 	return "\n".join(messages)
@@ -60,13 +64,15 @@ def get_summary_proxy( agents ):
 	messages = []
 	for agent in agents:
+		if agent not in AGENTS:
+			load_agent( agent )
 		agent = AGENTS[agent]
 		messages.append(get_summary( agent, force_refresh = True ))
 	return "\n".join(messages)

 def run_conversation_proxy( agents, message ):
 	agents = [ AGENTS[agent] for agent in agents ]
-	messages = run_conversation( agents, message, limit=len(agents)*3 )
+	messages = run_conversation( agents, message, limit=len(agents)*2 )
 	return "\n".join(messages)

 def view_agent( agents, last_k = 50 ):
@@ -75,6 +81,8 @@ def view_agent( agents, last_k = 50 ):
 	messages = []
 	for agent in agents:
+		if agent not in AGENTS:
+			load_agent( agent )
 		agent = AGENTS[agent]
 		memories = agent.memory.memory_retriever.memory_stream[-last_k:]
 		memories = "\n".join([ document.page_content for document in memories])
diff --git a/src/utils.py b/src/utils.py
index b322a5f..937fa5a 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -21,29 +21,40 @@ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import FAISS

 # shit I can shove behind an env var
+os.environ['LLM_PROMPT_TUNE'] = "supercot"
+
 LLM_TYPE = os.environ.get('LLM_TYPE', "llamacpp") # options: llamacpp, oai
-LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL', "./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin") # "./models/llama-13b-supercot-ggml/ggml-model-q4_0.bin"
+LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL',
+	#"./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin"
+	#"./models/llama-13b-supercot-ggml/ggml-model-q4_0.bin"
+	"./models/llama-33b-supercot-ggml/ggml-model-q4_2.bin"
+)
 LLM_CONTEXT = int(os.environ.get('LLM_CONTEXT', '2048'))
 LLM_THREADS = int(os.environ.get('LLM_THREADS', '6'))
 EMBEDDING_TYPE = os.environ.get("LLM_EMBEDDING_TYPE", "hf") # options: llamacpp, oai, hf

+if LLM_TYPE=="oai":
+	os.environ['LLM_PROMPT_TUNE'] = "oai"
+LLM_PROMPT_TUNE = os.environ.get('LLM_PROMPT_TUNE', "supercot")
+
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) # unnecessary but whatever
+
+# Overrides for some fixes, like scoring memory and LLM-specific promptings
+from ext import GenerativeAgent, GenerativeAgentMemory, get_roles
+
+STOP_TOKENS = ["END"]
+for role in get_roles( tune=LLM_PROMPT_TUNE, special=True ):
+	STOP_TOKENS.append(f'{role}:')
+
 if LLM_TYPE=="llamacpp":
 	from langchain.llms import LlamaCpp
-
-	STOP_TOKENS = ["END"]
-
-	if os.environ.get('LLM_PROMPT_TUNE', "vicuna") == "vicuna":
-		STOP_TOKENS.append("SYSTEM:")
-		STOP_TOKENS.append("USER:")
-		STOP_TOKENS.append("ASSISTANT:")

 	LLM = LlamaCpp(
 		model_path=LLM_LOCAL_MODEL,
 		callback_manager=callback_manager,
 		verbose=True,
 		n_ctx=LLM_CONTEXT,
-		n_threads=LLM_THREADS,
+		#n_threads=LLM_THREADS,
 		use_mlock=True,
 		use_mmap=True,
 		stop=STOP_TOKENS
@@ -51,10 +62,6 @@ if LLM_TYPE=="llamacpp":
 elif LLM_TYPE=="oai":
 	from langchain.chat_models import ChatOpenAI

-	# os.environ["OPENAI_API_BASE"] = ""
-	# os.environ["OPENAI_API_KEY"] = ""
-	os.environ['LLM_PROMPT_TUNE'] = "vicuna"
-
 	# Override for Todd
 	if os.environ.get('LANGCHAIN_OVERRIDE_RESULT', '1') == '1':
 		from langchain.schema import Generation, ChatResult, LLMResult, ChatGeneration
@@ -104,12 +111,6 @@ elif EMBEDDING_TYPE == "llamacpp":
 else:
 	raise ValueError(f"Invalid embedding type: {EMBEDDING_TYPE}")

-# Overrides for some fixes, like scoring memory and LLM-specific promptings
-if os.environ.get('LANGCHAIN_OVERRIDE', '1') == '1':
-	from ext import GenerativeAgent, GenerativeAgentMemory
-else:
-	from langchain.experimental.generative_agents import GenerativeAgent, GenerativeAgentMemory
-
 def _relevance_score_fn(score: float) -> float:
 	if EMBEDDING_TYPE == "oai":
 		return 1.0 - score / math.sqrt(2)
@@ -140,6 +141,7 @@ def _create_new_memories():
 def create_agent(**kwargs):
 	settings = {
 		"llm": LLM,
+		"verbose": True,
 		"memory": _create_new_memories(),
 	}
 	settings.update(kwargs)
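
For reference, the core mechanism this patch adds is role-mapped prompt assembly: a tune name (`vicuna`, `supercot`) selects both a per-segment template in `PROMPT_TUNES` and a role-label table in `PROMPT_ROLES`, and `src/utils.py` derives its stop tokens from those same labels so generation halts when the model starts emitting a new role header. Below is a minimal, self-contained sketch of that flow. The `format_prompt` and `stop_tokens` helper names and the example prompt entry are hypothetical stand-ins (not names from the repo), and it assumes the rendered segments are joined with newlines, which the `get_prompt` hunk above does not show.

```python
# Minimal sketch of the role-mapped prompt assembly introduced in this patch.
# Mirrors the logic of get_prompt()/get_roles() in src/ext/prompts.py;
# the helper names and the example entry below are hypothetical.

PROMPT_TUNES = {
    "default": "{query}",
    "vicuna": "{role}: {query}",
    "supercot": "{role}:\n{query}",  # Alpaca-style "### Instruction:" headers
}

PROMPT_ROLES = {
    "vicuna": {"system": "SYSTEM", "user": "USER", "assistant": "ASSISTANT"},
    "supercot": {"system": "### Instruction", "user": "### Input", "assistant": "### Response"},
}

ROLES = ["system", "user", "assistant"]

def format_prompt(prompt: dict, tune: str) -> str:
    """Render a {system, user, assistant} dict into one prompt string for `tune`."""
    template = PROMPT_TUNES.get(tune, PROMPT_TUNES["default"])
    labels = PROMPT_ROLES.get(tune, {})
    outputs = []
    for role in ROLES:
        if role not in prompt:
            continue
        label = labels.get(role, role)  # fall back to the generic role name
        outputs.append(template.replace("{role}", label).replace("{query}", prompt[role]))
    return "\n".join(outputs)  # assumption: get_prompt() joins segments with newlines

def stop_tokens(tune: str) -> list:
    """`END` plus the tune's role labels, the way src/utils.py builds STOP_TOKENS."""
    return ["END"] + [f"{label}:" for label in PROMPT_ROLES.get(tune, {}).values()]

if __name__ == "__main__":
    entry = {  # hypothetical entry in the shape of the PROMPTS table
        "system": "Rate the poignancy of the following memory from 1 to 10.",
        "user": "Memory: I adopted a cat.",
        "assistant": "Rating: ",
    }
    print(format_prompt(entry, "supercot"))
    # ### Instruction:
    # Rate the poignancy of the following memory from 1 to 10.
    # ### Input:
    # Memory: I adopted a cat.
    # ### Response:
    # Rating:
    print(stop_tokens("supercot"))  # ['END', '### Instruction:', '### Input:', '### Response:']
```

Seeding the assistant segment with a prefix like `Rating: ` is what nudges a local model toward completing with a bare integer, and reusing the role labels as stop tokens is how the `llamacpp` path cuts generation off before the model hallucinates the next turn.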