added prompt tuning for SuperCOT (whose 33B variant seems to be the best approach for a local LLM)

mrq 2023-04-30 22:56:02 +00:00
parent 089b7043b9
commit f10ea1ec2a
7 changed files with 343 additions and 155 deletions

View File

@@ -29,7 +29,7 @@ Set your environment variables accordingly:
- `OPENAI_API_MODEL`: target model
* `LLM_MODEL`: (`./path/to/your/llama/model.bin`): path to your GGML-formatted LLaMA model, if using `llamacpp` as the LLM backend
* `LLM_EMBEDDING_TYPE`: (`oai`, `llamacpp`, `hf`): the embedding backend to use for computing similarity.
* `LLM_PROMPT_TUNE`: (`oai`, `vicuna`): prompt formatting to use, for model variants finetuned on specific instruction formats.
* `LLM_PROMPT_TUNE`: (`oai`, `vicuna`, `supercot`): prompt formatting to use, for model variants finetuned on specific instruction formats.
* `LLM_CONTEXT`: sets maximum context size
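For example, a minimal sketch of a local SuperCOT setup (paths illustrative; these variables are read with `os.environ.get` by the scripts):

```python
import os

# illustrative values; point LLM_MODEL at your own GGML file
os.environ["LLM_MODEL"] = "./models/llama-33b-supercot-ggml/ggml-model-q4_2.bin"
os.environ["LLM_EMBEDDING_TYPE"] = "hf"     # HuggingFace embeddings for similarity
os.environ["LLM_PROMPT_TUNE"] = "supercot"  # SuperCOT-style prompt formatting
os.environ["LLM_CONTEXT"] = "2048"          # maximum context size
```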
To run:
@@ -44,6 +44,10 @@ I ***do not*** plan on making this uber-user friendly like [mrq/ai-voice-cloning
## Caveats
A local LM is quite slow. Even using one that's more instruction-tuned like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` prompt structure) is still inconsistent.
A local LM is quite slow.
Even using one that's more instruction-tuned like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` prompt structure) is still inconsistent.
However, I seem to be getting consistent results with SuperCOT 33B; it's just, well, slow.
GPT-4 seems to Just Work, unfortunately.
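For reference, a rough sketch (not from the repo) of what the two local prompt styles look like, using the role mappings from `PROMPT_ROLES` below:

```python
def format_vicuna(system: str, user: str) -> str:
    # Vicuna finetunes expect role-prefixed lines, ending on the assistant's turn
    return f"SYSTEM: {system}\nUSER: {user}\nASSISTANT: "

def format_supercot(system: str, user: str) -> str:
    # SuperCOT follows Alpaca-style section headers, each on its own line
    return f"### Instruction:\n{system}\n### Input:\n{user}\n### Response:\n"
```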

View File

@@ -25,5 +25,6 @@ THE SOFTWARE.
"""Generative Agents primitives."""
from .generative_agent import GenerativeAgent
from .memory import GenerativeAgentMemory
from .prompts import get_prompt, get_roles
__all__ = ["GenerativeAgent", "GenerativeAgentMemory", "get_prompt", "get_roles"]

View File

@@ -84,17 +84,25 @@ class GenerativeAgent(BaseModel):
def _get_entity_from_observation(self, observation: str) -> str:
prompt = PromptTemplate.from_template(get_prompt('entity_from_observation'))
return self.chain(prompt).run(observation=observation).strip()
response = self.chain(prompt).run(observation=observation).strip().replace("Entity=", "").replace("Entity: ", "") # OAI tends to echo the "Entity=" primer, so strip it
if self.verbose:
print(response)
return response
def _get_entity_action(self, observation: str, entity_name: str) -> str:
prompt = PromptTemplate.from_template(get_prompt('entity_action'))
return self.chain(prompt).run(entity=entity_name, observation=observation).strip()
response = self.chain(prompt).run(entity=entity_name, observation=observation).strip()
if self.verbose:
print(response)
return response
def summarize_related_memories(self, observation: str) -> str:
"""Summarize memories that are most relevant to an observation."""
prompt = PromptTemplate.from_template(get_prompt('summarize_related_memories'))
entity_name = self._get_entity_from_observation(observation).split("\n")[0]
entity_name = self._get_entity_from_observation(observation).split("\n")[0].strip()
q1 = f"What is the relationship between {self.name} and {entity_name}"
if self.name.strip() == entity_name:
return ""
# this is unused, so ignore for now
"""
@@ -103,6 +111,8 @@ class GenerativeAgent(BaseModel):
summary = self.chain(prompt=prompt).run(q1=q1, queries=[q1, q2]).strip()
"""
summary = self.chain(prompt=prompt).run(q1=q1, queries=[q1]).strip()
if self.verbose:
print(summary)
return summary
#return self.chain(prompt=prompt).run(q1=q1, q2=q2).strip()
@@ -128,7 +138,10 @@ class GenerativeAgent(BaseModel):
consumed_tokens = self.llm.get_num_tokens(formatted_prompt)
kwargs[self.memory.most_recent_memories_token_key] = consumed_tokens
return self.chain(prompt=prompt).run(**kwargs).strip()
reaction = self.chain(prompt=prompt).run(**kwargs).strip()
if self.verbose:
print(reaction)
return reaction
def _clean_response(self, text: str) -> str:
return re.sub(f"^{self.name} ", "", text.strip()).strip()
@@ -138,18 +151,20 @@ class GenerativeAgent(BaseModel):
full_result = self._generate_reaction(observation, get_prompt('suffix_generate_reaction'))
candidates = full_result.replace(u"\u200B", "").strip().split("\n") # strip zero-width spaces before splitting into candidate lines
result = ""
response = ""
results = []
for candidate in candidates:
if "REACT:" in candidate or "SAY:" in candidate:
candidate = candidate.strip()
results.append(f'reacted by {candidate}'.replace("SAY:", "saying").replace("reacted by REACT: ", ""))
# can't be assed to iteratively replace
candidate = candidate.strip().replace("React:", "REACT:").replace("Say:", "SAY:")
results.append(f'{candidate}'.replace("SAY:", "said").replace(f"REACT: {self.name}", "").replace("REACT:", ""))
if len(results) > 0:
result = "and".join(results)
response = f"reacted by {result}"
response = "and".join(results).strip().replace(" ", " ")
valid = True
else:
response = f"did not react"
response = f"did not react in a relevant way"
valid = False
# AAA
self.memory.save_context(
@@ -158,6 +173,10 @@ class GenerativeAgent(BaseModel):
self.memory.add_memory_key: f"{self.name} observed {observation} and {response}"
},
)
return valid, f"{self.name} {response}"
"""
if "REACT:" in result:
reaction = self._clean_response(result.split("REACT:")[-1])
return True, f"{self.name} {reaction}"
@@ -166,6 +185,7 @@ class GenerativeAgent(BaseModel):
return True, f"{self.name} said {said_value}"
else:
return False, f"{self.name} did not react in a relevant way"
"""
def generate_dialogue_response(self, observation: str) -> Tuple[bool, str]:
"""React to a given observation."""
@@ -206,6 +226,8 @@ class GenerativeAgent(BaseModel):
# The agent seeks to think about their core characteristics.
prompt = PromptTemplate.from_template(get_prompt('compute_agent_summary'))
summary = self.chain(prompt).run(name=self.name, queries=[f"{self.name}'s core characteristics"]).strip()
if self.verbose:
print(summary)
return summary
def get_summary(self, force_refresh: bool = False) -> str:

View File

@@ -85,6 +85,9 @@ class GenerativeAgentMemory(BaseMemory):
observations = self.memory_retriever.memory_stream[-last_k:]
observation_str = "\n".join([o.page_content for o in observations])
result = self.chain(prompt).run(observations=observation_str)
if self.verbose:
print(result)
return self._parse_list(result)
def _get_insights_on_topic(self, topic: str) -> List[str]:
@@ -121,14 +124,15 @@ class GenerativeAgentMemory(BaseMemory):
prompt = PromptTemplate.from_template(get_prompt("memory_importance"))
score = self.chain(prompt).run(memory_content=memory_content).strip()
if self.verbose:
logger.info(f"Importance score: {score}")
print(f"Importance score: {score}")
try:
match = re.search(r"(\d+)", score)
if match:
return (float(match.group(0)) / 10) * self.importance_weight
except Exception as e:
print(colored("[Scoring Error]", "red"), score)
return 0.0
return (float(2) / 10) * self.importance_weight # no digits in the response; fall back to a low default score of 2
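Illustrative parses (hypothetical model replies):
# "7 END"             -> re.search matches "7" -> 0.7 * importance_weight
# "Rating: a solid 9" -> matches "9"           -> 0.9 * importance_weight
# "I cannot rate it." -> no digits             -> 0.2 * importance_weight (the fallback above)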
def add_memory(self, memory_content: str) -> List[str]:
"""Add an observation or memory to the agent's memory."""

View File

@@ -2,127 +2,265 @@ import os
LLM_PROMPT_TUNE = os.environ.get('LLM_PROMPT_TUNE', "vicuna") # oai, vicuna, supercot
PROMPTS = {
"entity_from_observation": {
"system": (
"What is the observed entity in the following observation?"
" ONLY report one object and write one sentence."
" Write `END` afterwards."
),
"user": (
"Observation: {observation}"
),
"assistant": "Entity=",
},
"entity_action": {
"system": (
"What is `{entity}` doing in the following observation?"
" ONLY report one object and write one sentence."
" Write `END` afterwards."
),
"user": (
"Observation: {observation}"
),
"assistant": "`{entity}` is ",
},
"summarize_related_memories": {
"system": (
"Given the following context, answer the following question in four sentences or less."
" Write `END` afterwards."
),
"user": (
"{q1}?"
"\nContext: {relevant_memories_simple}"
),
"assistant": "Relevant context: ",
},
"compute_agent_summary": {
"system": (
"Given the following statements, how would you summarize {name}'s core characteristics?"
" Do not embellish under any circumstances."
" Write `END` afterwards."
),
"user": (
"Statements: {relevant_memories_simple}"
),
"assistant": "Summary: ",
},
"topic_of_reflection": {
"system": (
"Given only the following information, what are the 3 most salient"
" high-level questions we can answer about the subjects in the statements?"
" Provide each question on a new line."
),
"user": (
"Information: {observations}"
),
"assistant": "",
},
"insights_on_topic": {
"system": (
"Given the following statements about {topic},"
" what 5 high-level insights can you infer?"
" (example format: insight (because of 1, 5, 3))"
),
"user": (
"Statements: {related_statements}"
),
"assistant": "",
},
"memory_importance": {
"system": (
"On the scale of 1 to 10, where 1 is purely mundane"
" (e.g., brushing teeth, making bed) and 10 is extremely poignant"
" (e.g., a break up, college acceptance),"
" rate the likely poignancy of the following piece of memory."
" Respond with only a single integer followed by 'END'."
),
"user": (
"Memory: {memory_content}"
),
"assistant": "Rating: ",
},
"generate_reaction": {
"system": (
"It is {current_time}."
" The following is a description of {agent_name}:"
"\n{agent_summary_description}"
"\n{agent_name}'s status: {agent_status}"
"\nSummary of relevant context from {agent_name}'s memory: {relevant_memories}"
"\nMost recent observations: {most_recent_memories}"
"\n\n{suffix}"
),
"user": (
"Observation: {observation}"
),
"assistant": ""
},
# split because I can't prematurely end on the END token like I can with a local LM
if LLM_PROMPT_TUNE == "oai":
PROMPTS = {
"entity_from_observation": {
"system": (
"What is the observed entity in the following observation?"
" ONLY report one object and write one sentence."
),
"user": (
"Observation: {observation}"
),
"assistant": "Entity=",
},
"entity_action": {
"system": (
"What is `{entity}` doing in the following observation?"
" ONLY report one object and write one sentence."
),
"user": (
"Observation: {observation}"
),
"assistant": "`{entity}` is ",
},
"summarize_related_memories": {
"system": (
"Given the following context, answer the following question in four sentences or less. Summarize the answer as well."
),
"user": (
"{q1}?"
"\nContext: {relevant_memories_simple}"
),
"assistant": "Summary of relevant context: ",
},
"compute_agent_summary": {
"system": (
"Given the following statements, how would you summarize {name}'s core characteristics?"
" Do not embellish under any circumstances."
),
"user": (
"Statements: {relevant_memories_simple}"
),
"assistant": "Summary: ",
},
"topic_of_reflection": {
"system": (
"Given only the following information, what are the 3 most salient"
" high-level questions we can answer about the subjects in the statements?"
" Provide each question on a new line."
),
"user": (
"Information: {observations}"
),
"assistant": "",
},
"insights_on_topic": {
"system": (
"Given the following statements about {topic},"
" what 5 high-level insights can you infer?"
" (example format: insight (because of 1, 5, 3))"
),
"user": (
"Statements: {related_statements}"
),
"assistant": "",
},
"memory_importance": {
"system": (
"On the scale of 1 to 10, where 1 is purely mundane"
" (e.g., brushing teeth, making bed) and 10 is extremely poignant"
" (e.g., a break up, college acceptance),"
" rate the likely poignancy of the following piece of memory."
" Respond with only a single integer."
),
"user": (
"Memory: {memory_content}"
),
"assistant": "Rating: ",
},
"generate_reaction": {
"system": (
"It is {current_time}."
" The following is a description of {agent_name}:"
"\n{agent_summary_description}"
"\n{agent_name}'s status: {agent_status}"
"\nSummary of relevant context from {agent_name}'s memory: {relevant_memories}"
"\nMost recent observations: {most_recent_memories}"
"\n\n{suffix}"
),
"user": (
"Observation: {observation}"
),
"assistant": ""
},
#
"context": (
"" # insert your JB here
),
"suffix_generate_reaction": (
"Given the following observation, in one sentence, how would {agent_name} appropriately react?"
"\nIf the action is to engage in dialogue, write `SAY: \"what to say\"`."
"\nOtherwise, write `REACT: {agent_name}'s reaction`."
"\nWrite 'END' afterwards."
),
"suffix_generate_dialogue_response": (
"Given the following observation, in one sentence, what would {agent_name} say?"
"\nTo continue the conversation, write: `SAY: \"what to say\"`."
"\nOtherwise, to end the conversation, write: `GOODBYE: \"what to say\"`."
"\nWrite \"END\" afterwards."
),
}
#
"context": ( # insert your JB here
""
),
"suffix_generate_reaction": (
"Given the following observation, in one sentence, how would {agent_name} appropriately react?"
"\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition."
"\nIf the action is to engage in dialogue, write `SAY: \"what to say\"`."
"\nOtherwise, write `REACT: {agent_name}'s reaction`."
),
"suffix_generate_dialogue_response": (
"\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition."
"Given the following observation, in one sentence, what would {agent_name} say?"
"\nTo continue the conversation, write: `SAY: \"what to say\"`."
"\nOtherwise, to end the conversation, write: `GOODBYE: \"what to say\"`."
),
}
else:
PROMPTS = {
"entity_from_observation": {
"system": (
"What is the observed entity in the following observation?"
" ONLY report one object and write one sentence."
" Write `END` afterwards."
),
"user": (
"Observation: {observation}"
),
"assistant": "Entity=",
},
"entity_action": {
"system": (
"What is `{entity}` doing in the following observation?"
" ONLY report one object and write one sentence."
" Write `END` afterwards."
),
"user": (
"Observation: {observation}"
),
"assistant": "`{entity}` is ",
},
"summarize_related_memories": {
"system": (
"Given the following context, answer the following question in four sentences or less. Summarize the answer as well."
" Write `END` afterwards."
"\nContext: {relevant_memories_simple}"
),
"user": (
"{q1}?"
),
"assistant": "Summary of relevant context: ",
},
"compute_agent_summary": {
"system": (
"Given the following statements, how would you summarize {name}'s core characteristics?"
" Do not embellish under any circumstances."
" Write `END` afterwards."
),
"user": (
"Statements: {relevant_memories_simple}"
),
"assistant": "Summary: ",
},
"topic_of_reflection": {
"system": (
"Given only the following information, what are the 3 most salient"
" high-level questions we can answer about the subjects in the statements?"
" Provide each question on a new line."
),
"user": (
"Information: {observations}"
),
"assistant": "",
},
"insights_on_topic": {
"system": (
"Given the following statements about {topic},"
" what 5 high-level insights can you infer?"
" (example format: insight (because of 1, 5, 3))"
),
"user": (
"Statements: {related_statements}"
),
"assistant": "",
},
"memory_importance": {
"system": (
"On the scale of 1 to 10, where 1 is purely mundane"
" (e.g., brushing teeth, making bed) and 10 is extremely poignant"
" (e.g., a break up, college acceptance),"
" rate the likely poignancy of the following piece of memory."
" Respond with only a single integer followed by 'END'."
),
"user": (
"Memory: {memory_content}"
),
"assistant": "Rating: ",
},
"generate_reaction": {
"system": (
"It is {current_time}."
" The following is a description of {agent_name}:"
"\n{agent_summary_description}"
"\n{agent_name}'s status: {agent_status}"
"\nSummary of relevant context from {agent_name}'s memory: {relevant_memories}"
"\nMost recent observations: {most_recent_memories}"
"\n\n{suffix}"
),
"user": (
"Observation: {observation}"
),
"assistant": ""
},
#
"context": (
"" # insert your JB here
),
"suffix_generate_reaction": (
"Given the following observation, in one sentence, how would {agent_name} appropriately react?"
"\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition."
"\nIf the action is to engage in dialogue, write `SAY: \"what to say\"`."
"\nOtherwise, write `REACT: {agent_name}'s reaction`."
"\nWrite 'END' afterwards."
),
"suffix_generate_dialogue_response": (
"Given the following observation, in one sentence, what would {agent_name} say?"
"\nWrite 1 reply only in internet RP style, italicize actions, and avoid quotation marks. Use markdown. Be proactive, creative, and drive the plot and conversation forward. Write no less than six sentences each. Always stay in character and avoid repetition."
"\nTo continue the conversation, write: `SAY: \"what to say\"`."
"\nOtherwise, to end the conversation, write: `GOODBYE: \"what to say\"`."
"\nWrite \"END\" afterwards."
),
}
PROMPT_TUNES = {
"default": "{query}",
"vicuna": "{ROLE}: {query}"
"vicuna": "{role}: {query}",
"supercot": "{role}:\n{query}",
}
PROMPT_ROLES = {
"vicuna": {
"system": "SYSTEM",
"user": "USER",
"assistant": "ASSISTANT",
},
"supercot": {
"system": "### Instruction",
"user": "### Input",
"assistant": "### Response",
}
}
ROLES = [ "system", "user", "assistant" ]
for k in PROMPTS:
if k == "context":
continue
def get_roles( tune=LLM_PROMPT_TUNE, special=True ):
if tune in PROMPT_ROLES:
return list(PROMPT_ROLES[tune].values())
if special:
return []
return ROLES
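For example (a sketch, not part of the commit):
# get_roles(tune="supercot")           -> ["### Instruction", "### Input", "### Response"]
# get_roles(tune="oai")                -> [] (OAI chat models need no role stop-strings)
# get_roles(tune="oai", special=False) -> ["system", "user", "assistant"]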
def get_prompt( key, tune=LLM_PROMPT_TUNE ):
prompt = PROMPTS[key]
@@ -134,20 +272,29 @@ def get_prompt( key, tune=LLM_PROMPT_TUNE ):
if tune not in PROMPT_TUNES:
tune = "default"
context = PROMPTS["context"]
if context:
if "system" in prompt:
if context not in prompt["system"]:
prompt["system"] = f'{context}\n{prompt["system"]}'
else:
prompt["system"] = f'{context}'
outputs = []
for role in ROLES:
for r in ROLES:
role = f'{r}' # i can't be assed to check if strings COW
if role not in prompt:
# implicitly add in our context as a system message
if role == "system" and PROMPTS["context"]:
query = PROMPTS["context"]
else:
continue
continue
else:
query = prompt[role]
if tune in PROMPT_ROLES:
roles = PROMPT_ROLES[tune]
if role in roles:
role = roles[role]
output = f'{PROMPT_TUNES[tune]}'
output = output.replace("{role}", role.lower())
output = output.replace("{ROLE}", role.upper())
output = output.replace("{role}", role)
output = output.replace("{query}", query)
outputs.append(output)
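For instance, `get_prompt("memory_importance", tune="supercot")` should yield fragments along these lines (illustrative; the tail of the function that joins `outputs` is not shown in this hunk):
# ### Instruction:
# On the scale of 1 to 10, ... rate the likely poignancy ... Respond with only a single integer followed by 'END'.
# ### Input:
# Memory: {memory_content}
# ### Response:
# Rating: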

View File

@@ -38,6 +38,8 @@ def agent_observes_proxy( agents, observations ):
messages = []
for agent in agents:
if agent not in AGENTS:
load_agent( agent )
agent = AGENTS[agent]
observations = observations.split("\n")
results = agent_observes( agent, observations )
@@ -50,6 +52,8 @@ def interview_agent_proxy( agents, message ):
messages = []
for agent in agents:
if agent not in AGENTS:
load_agent( agent )
agent = AGENTS[agent]
messages.append(interview_agent( agent, message )[-1])
return "\n".join(messages)
@@ -60,13 +64,15 @@ def get_summary_proxy( agents ):
messages = []
for agent in agents:
if agent not in AGENTS:
load_agent( agent )
agent = AGENTS[agent]
messages.append(get_summary( agent, force_refresh = True ))
return "\n".join(messages)
def run_conversation_proxy( agents, message ):
agents = [ AGENTS[agent] for agent in agents ]
messages = run_conversation( agents, message, limit=len(agents)*3 )
messages = run_conversation( agents, message, limit=len(agents)*2 )
return "\n".join(messages)
def view_agent( agents, last_k = 50 ):
@@ -75,6 +81,8 @@ def view_agent( agents, last_k = 50 ):
messages = []
for agent in agents:
if agent not in AGENTS:
load_agent( agent )
agent = AGENTS[agent]
memories = agent.memory.memory_retriever.memory_stream[-last_k:]
memories = "\n".join([ document.page_content for document in memories])

View File

@@ -21,29 +21,40 @@ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import FAISS
# shit I can shove behind an env var
os.environ['LLM_PROMPT_TUNE'] = "supercot"
LLM_TYPE = os.environ.get('LLM_TYPE', "llamacpp") # options: llamacpp, oai
LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL', "./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin") # "./models/llama-13b-supercot-ggml/ggml-model-q4_0.bin"
LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL',
#"./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin"
#"./models/llama-13b-supercot-ggml/ggml-model-q4_0.bin"
"./models/llama-33b-supercot-ggml/ggml-model-q4_2.bin"
)
LLM_CONTEXT = int(os.environ.get('LLM_CONTEXT', '2048'))
LLM_THREADS = int(os.environ.get('LLM_THREADS', '6'))
EMBEDDING_TYPE = os.environ.get("LLM_EMBEDDING_TYPE", "hf") # options: llamacpp, oai, hf
if LLM_TYPE=="oai":
os.environ['LLM_PROMPT_TUNE'] = "oai"
LLM_PROMPT_TUNE = os.environ.get('LLM_PROMPT_TUNE', "supercot")
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]) # unnecessary but whatever
# Overrides for some fixes, like scoring memory and LLM-specific promptings
from ext import GenerativeAgent, GenerativeAgentMemory, get_roles
STOP_TOKENS = ["END"]
for role in get_roles( tune=LLM_PROMPT_TUNE, special=True ):
STOP_TOKENS.append(f'{role}:')
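With the supercot tune this works out to (illustrative):
# STOP_TOKENS == ["END", "### Instruction:", "### Input:", "### Response:"]
# i.e. generation halts before the model can write the next turn's role header itself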
if LLM_TYPE=="llamacpp":
from langchain.llms import LlamaCpp
STOP_TOKENS = ["END"]
if os.environ.get('LLM_PROMPT_TUNE', "vicuna") == "vicuna":
STOP_TOKENS.append("SYSTEM:")
STOP_TOKENS.append("USER:")
STOP_TOKENS.append("ASSISTANT:")
LLM = LlamaCpp(
model_path=LLM_LOCAL_MODEL,
callback_manager=callback_manager,
verbose=True,
n_ctx=LLM_CONTEXT,
n_threads=LLM_THREADS,
#n_threads=LLM_THREADS,
use_mlock=True,
use_mmap=True,
stop=STOP_TOKENS
@@ -51,10 +62,6 @@ if LLM_TYPE=="llamacpp":
elif LLM_TYPE=="oai":
from langchain.chat_models import ChatOpenAI
# os.environ["OPENAI_API_BASE"] = ""
# os.environ["OPENAI_API_KEY"] = ""
os.environ['LLM_PROMPT_TUNE'] = "vicuna"
# Override for Todd
if os.environ.get('LANGCHAIN_OVERRIDE_RESULT', '1') == '1':
from langchain.schema import Generation, ChatResult, LLMResult, ChatGeneration
@@ -104,12 +111,6 @@ elif EMBEDDING_TYPE == "llamacpp":
else:
raise f"Invalid embedding type: {EMBEDDING_TYPE}"
# Overrides for some fixes, like scoring memory and LLM-specific promptings
if os.environ.get('LANGCHAIN_OVERRIDE', '1') == '1':
from ext import GenerativeAgent, GenerativeAgentMemory
else:
from langchain.experimental.generative_agents import GenerativeAgent, GenerativeAgentMemory
def _relevance_score_fn(score: float) -> float:
# OAI embeddings are unit-length, so L2 distance lies in [0, sqrt(2)]; rescale to a [0, 1] relevance
if EMBEDDING_TYPE == "oai":
return 1.0 - score / math.sqrt(2)
@@ -140,6 +141,7 @@ def _create_new_memories():
def create_agent(**kwargs):
settings = {
"llm": LLM,
"verbose": True,
"memory": _create_new_memories(),
}
settings.update(kwargs)
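A hypothetical invocation (field names follow langchain's GenerativeAgent; values are illustrative):
agent = create_agent(
    name="Alice",
    age=25,
    traits="curious, talkative",
    status="looking for work",
)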