From 287406e7bab8c22122a069bd09b6069451286bbe Mon Sep 17 00:00:00 2001
From: mrq
Date: Tue, 9 May 2023 23:57:54 +0000
Subject: [PATCH] tunings

---
 README.md                   |  6 +--
 src/ext/generative_agent.py | 82 ++++++++++++++++++++++++-------------
 src/ext/memory.py           | 12 ++++--
 src/ext/prompts.py          | 77 +++++++++++-----------------------
 src/main.py                 | 28 +++++++------
 src/utils.py                | 47 +++++++++++++--------
 6 files changed, 135 insertions(+), 117 deletions(-)

diff --git a/README.md b/README.md
index 60b03eb..ff4249a 100755
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Set your environment variables accordingly:
 - `OPENAI_API_MODEL`: target model
 * `LLM_MODEL`: (`./path/to/your/llama/model.bin`): path to your GGML-formatted LLaMA model, if using `llamacpp` as the LLM backend
 * `LLM_EMBEDDING_TYPE`: (`oai`, `llamacpp`, `hf`): the embedding model to use for similarity computing.
-* `LLM_PROMPT_TUNE`: (`oai`, `vicuna`, `supercot`): prompt formatting to use, for variants with specific finetunes for instructions, etc.
+* `LLM_PROMPT_TUNE`: (`oai`, `vicuna`, `supercot`, `cocktail`): prompt formatting to use, for variants with specific finetunes for instructions, etc.
 * `LLM_CONTEXT`: sets maximum context size
 
 To run:
@@ -44,11 +44,11 @@ I ***do not*** plan on making this uber-user friendly like [mrq/ai-voice-cloning
 
 ## Caveats
 
-A local LM is quite slow.
+A local LM is quite slow. Things seem to be getting faster as llama.cpp is being developed.
 
 Even using one that's more instruction-tuned like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` structure of prompts), it's still inconsistent.
 
-However, I seem to be getting consistent results with SuperCOT 33B, it's just, well, slow. SuperCOT 13B seems to be giving better answers over Vicuna-1.1 13B, so.
+However, I seem to be getting consistent results with SuperCOT 33B, it's just, well, slow. SuperCOT 13B seems to give better answers than Vicuna-1.1 13B, and Cocktail 13B seems to be the best of the 13Bs.
 
 A ***lot*** of prompt wrangling is needed, and a lot of the routines could be polished up (for example, an observation queries the LM for a rating, and each response reaction requires querying for the observed entity, then the relationship between an agent and observed entity, which ends up just summarizing relevant context/memories, and then queries for a response), and if one of these steps fails, then the overall fail rate is higher. If anything, I might as well just work from the ground up and only really salvage the use of FAISS to store embedded vectors.
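As a configuration aside before the code changes below: a minimal sketch, assuming the environment variables listed in the README hunk above plus the `LLM_TYPE` and `LLM_TEMPERATURE` variables read in `src/utils.py` below, of setting them from Python before `src/utils.py` is imported. The values are examples only, not recommendations.

```python
# Illustrative only: src/utils.py reads these at import time, so they must be
# set (here or in the shell) before it is imported.
import os

os.environ.setdefault("LLM_TYPE", "llamacpp")         # llamacpp or oai
os.environ.setdefault("LLM_MODEL", "./models/ggml-vicuna-13b-cocktail-v1-q5_0.bin")
os.environ.setdefault("LLM_EMBEDDING_TYPE", "hf")     # oai, llamacpp, or hf
os.environ.setdefault("LLM_PROMPT_TUNE", "cocktail")  # oai, vicuna, supercot, cocktail
os.environ.setdefault("LLM_CONTEXT", "2048")          # parsed with int()
os.environ.setdefault("LLM_TEMPERATURE", "0.99")      # parsed with float(); added by this patch
```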
diff --git a/src/ext/generative_agent.py b/src/ext/generative_agent.py index 9cd9e11..d506eec 100755 --- a/src/ext/generative_agent.py +++ b/src/ext/generative_agent.py @@ -86,16 +86,26 @@ class GenerativeAgent(BaseModel): llm=self.llm, prompt=prompt, verbose=self.verbose, memory=self.memory ) - def get_most_recent_memories(self, last_k: int = 4) -> str: + def get_most_recent_memories(self, last_k: int = 8) -> str: memories = self.memory.memory_retriever.memory_stream[-last_k:] - return [ document.page_content for document in memories ] + return [ document.page_content.replace(u"\u200B", "").strip() for document in memories ] - def summarize_related_memories(self, observation: str) -> str: - """Summarize memories that are most relevant to an observation.""" + def get_relevant_memories(self, observation: str, first_k : int = 8) -> str: + queries = [ observation ] + relevant_memories = [ + mem.page_content.replace(u"\u200B", "").strip() for query in queries for mem in self.memory.fetch_memories(query) + ] + relevant_memories = relevant_memories[:first_k] + relevant_memories.reverse() + return relevant_memories + + """ + def summarize_related_memories(self, observation: str, first_k : int = 4) -> str: prompt = PromptTemplate.from_template(get_prompt('summarize_related_memories')) - q1 = f"Summarize the relationship between the subjects in that interaction." - summary = self.chain(prompt=prompt).run(name=self.name, stop=get_stop_tokens(), q1=q1, observation=observation, queries=[observation]).strip() + query = f"Summarize the relationship between the subjects in that interaction in two sentences or less. Avoid repeating." + summary = self.chain(prompt=prompt).run(name=self.name, stop=get_stop_tokens(), query=query, observation=observation, queries=[observation]).strip() return f'{self.name} {summary}' + """ #return self.chain(prompt=prompt).run(stop=get_stop_tokens(), q1=q1, q2=q2).strip() @@ -104,17 +114,27 @@ class GenerativeAgent(BaseModel): prompt = PromptTemplate.from_template( get_prompt('generate_reaction').replace("{suffix}", suffix) ) - summary = self.get_summary().replace(u"\u200B", "").strip() - relevant_memories = self.summarize_related_memories(observation).replace(u"\u200B", "").strip() - recent_memories = "\n".join(self.get_most_recent_memories()) + summary = self.get_summary() + relevant_memories = self.get_relevant_memories(observation) + recent_memories = self.get_most_recent_memories() - # I think relevant_memories is suppose to only provide context for a relationship between agent and observer, as suggested with the query - # but the original implementation seems to just leverage it to further filter relevant memories, per the name + # avoid repeating + memory = "" - if relevant_memories and relevant_memories != "N/A": - memory = relevant_memories - else: - memory = "\n".join(self.get_most_recent_memories()) + for mem in relevant_memories: + if mem in summary or mem in memory: + continue + memory += f"\n{mem}" + + for mem in recent_memories: + if mem in summary: + continue + if mem is observation: + continue + # erase it, move it to bottom + if mem in memory: + memory = memory.replace(f'{mem}\n', "") + memory += f"\n{mem}" current_time_str = datetime.now().strftime("%B %d, %Y, %I:%M %p") kwargs: Dict[str, Any] = dict( @@ -127,12 +147,23 @@ class GenerativeAgent(BaseModel): #recent_memories=recent_memories if recent_memories else "N/A", observation=observation if observation else "N/A", ) - reaction = self.chain(prompt=prompt).run(stop=get_stop_tokens(), 
queries=[observation], **kwargs).strip() + reaction = self.chain(prompt=prompt).run(stop=get_stop_tokens(), **kwargs).strip() + import re + + emoji_pattern = re.compile("[" + u"\U0001F600-\U0001F64F" # emoticons + u"\U0001F300-\U0001F5FF" # symbols & pictographs + u"\U0001F680-\U0001F6FF" # transport & map symbols + u"\U0001F1E0-\U0001F1FF" # flags (iOS) + "]+", flags=re.UNICODE) + reaction = emoji_pattern.sub(r'', reaction) # cleanup reactions = reaction.replace(u"\u200B", "").strip().split("\n") for reaction in reactions: + if reaction in summary or reaction in memory: + continue if reaction: break @@ -140,20 +171,14 @@ class GenerativeAgent(BaseModel): print(reaction) return reaction - def _clean_response(self, text: str) -> str: - return re.sub(f"^{self.name} ", "", text.strip()).strip() - def generate_response(self, observation: str) -> Tuple[bool, str]: """React to a given observation.""" call_to_action_template = get_prompt('suffix_generate_response') - full_result = f"{self.name} {self._generate_reaction(observation, call_to_action_template)}" - - self.memory.save_context( - {}, - { - self.memory.add_memory_key: full_result - }, - ) + full_result = "" + while not full_result: + full_result = f"{self._generate_reaction(observation, call_to_action_template)}" + if full_result: + break return True, full_result @@ -191,7 +216,8 @@ class GenerativeAgent(BaseModel): f"Status: {self.status}" ] - return "\n".join([ value for value in values if value[-3:] != "N/A" ]) + f"\n{self.summary.strip()}" + summary = "\n".join([ value for value in values if value[-3:] != "N/A" ]) + f"\nSummary: {self.summary.strip()}" + return summary.replace(u"\u200B", "").strip() def get_full_header(self, force_refresh: bool = False) -> str: """Return a full header of the agent's status, summary, and current time.""" diff --git a/src/ext/memory.py b/src/ext/memory.py index 80cc501..82f7f50 100755 --- a/src/ext/memory.py +++ b/src/ext/memory.py @@ -71,6 +71,8 @@ class GenerativeAgentMemory(BaseMemory): relevant_memories_simple_key: str = "relevant_memories_simple" most_recent_memories_key: str = "most_recent_memories" + reflecting: bool = False + def chain(self, prompt: PromptTemplate) -> LLMChain: return LLMChain(llm=self.llm, prompt=prompt, verbose=self.verbose) @@ -133,9 +135,10 @@ class GenerativeAgentMemory(BaseMemory): return (float(2) / 10) * self.importance_weight - def add_memory(self, memory_content: str) -> List[str]: + def add_memory(self, memory_content: str, importance_score: int = 0) -> List[str]: """Add an observation or memory to the agent's memory.""" - importance_score = self._score_memory_importance(memory_content) + if not importance_score: + importance_score = self._score_memory_importance(memory_content) self.aggregate_importance += importance_score document = Document( page_content=memory_content, metadata={"importance": importance_score} ) result = self.memory_retriever.add_documents([document]) @@ -146,10 +149,13 @@ class GenerativeAgentMemory(BaseMemory): if ( self.reflection_threshold is not None and self.aggregate_importance > self.reflection_threshold + and not self.reflecting ): + self.reflecting = True self.pause_to_reflect() # Hack to clear the importance from reflection self.aggregate_importance = 0.0 + self.reflecting = False return (importance_score, result) @@ -169,7 +175,7 @@ class GenerativeAgentMemory(BaseMemory): return "\n".join([f"{mem}" for mem in content]) def format_memories_simple(self, relevant_memories: List[Document]) -> str: - return "; 
".join([f"{mem.page_content}" for mem in relevant_memories]).replace(".;", ";") + return "; ".join([f"{mem.page_content}" for mem in relevant_memories]).replace(".;", ".\n") def _get_memories_until_limit(self, consumed_tokens: int) -> str: """Reduce the number of tokens in the documents.""" diff --git a/src/ext/prompts.py b/src/ext/prompts.py index 43e521a..028681f 100755 --- a/src/ext/prompts.py +++ b/src/ext/prompts.py @@ -2,32 +2,10 @@ import os LLM_PROMPT_TUNE = os.environ.get('LLM_PROMPT_TUNE') # oai, vicuna, supercot -USE_STOP_HINT = [ "llama" ] - PROMPTS = { - "entity_from_observation": { - "system": ( - "What is the observed entity in the following observation?" - " ONLY report one object and write one sentence." - ), - "user": ( - "{observation}" - ), - "assistant": "Entity = ", - }, - "entity_action": { - "system": ( - "What is `{entity}` doing in the following observation?" - " ONLY write one sentence." - ), - "user": ( - "{observation}" - ), - "assistant": "{entity} is ", - }, "summarize_related_memories": { "system": ( - "{q1}" + "{query}" ), "user": ( "{relevant_memories_simple}" @@ -44,7 +22,7 @@ PROMPTS = { "{summary}" "\n{relevant_memories_simple}" ), - "assistant": "", + "assistant": "{name} ", }, "topic_of_reflection": { "system": ( @@ -53,7 +31,7 @@ PROMPTS = { " Provide each question on a new line." ), "user": ( - "{observations}" + "Information: {observations}" ), "assistant": "", }, @@ -77,24 +55,22 @@ PROMPTS = { "\nRespond with only a single integer." ), "user": ( - "{memory_content}" + "Event: {memory_content}" ), - "assistant": "", + "assistant": "Rating: ", }, "generate_reaction": { "system": ( - "\nIt is {current_time}." + "[Write one reply. Always stay in character. Maintain a casual tone using beige prose. Be brief. Avoid repeating anything below.]" + "\nCurrent Time: {current_time}" "\n{summary}" - "\n{relevant_memories_simple}" "\n{memory}" - #"\nRecent memories: {recent_memories}" - #"\nRelevant memories: {relevant_memories}" - "\n\n{suffix}" + "\n{observation}" ), "user": ( - "{observation}" + "{suffix}" ), - "assistant": "{name} " + "assistant": "" }, # @@ -102,24 +78,7 @@ PROMPTS = { "" ), "suffix_generate_response": ( - "Given the following observation, how would {name} respond?" - "\nWrite only one sentence." - ), - - ## - "suffix_generate_reaction": ( - "Given the following observation, how would {name} appropriately react?" - "\nIf the action is to engage in dialogue, only write `SAY: \"what to say\"`." - "\nOr otherwise, only write `REACT: how to react`." - "\nWrite ONLY one line, one sentence." - #"\nBe proactive, creative, and drive the plot and conversation forward." - ), - "suffix_generate_dialogue": ( - "Given the following observation, what would {name} say?" - "\nTo continue the conversation, only write: `SAY: \"what to say\"`." - "\nOr otherwise, to end the conversation, only write: `GOODBYE: \"what to say\"`." - "\nWrite ONLY one line, one sentence." - #"\nBe proactive, creative, and drive the plot and conversation forward." + "Given the current situation, in one sentence, what is {name}'s next response?" 
), } @@ -128,6 +87,7 @@ PROMPT_TUNES = { "vicuna": "{role}: {query}", "supercot": "{role}:\n{query}", "alpasta": "{role}# {query}", + "cocktail": "{role}: {query}", } PROMPT_ROLES = { "vicuna": { @@ -145,6 +105,11 @@ PROMPT_ROLES = { "user": "<|user|>", "assistant": "<|assistant|>", }, + "cocktail": { + "system": "", + "user": "USER", + "assistant": "ASSOCIATE", + }, } ROLES = [ "system", "user", "assistant" ] @@ -153,7 +118,8 @@ ROLES = [ "system", "user", "assistant" ] def get_stop_tokens( tokens=[], tune=LLM_PROMPT_TUNE ): STOP_TOKENS = ["###"] + tokens for role in get_roles( tune=LLM_PROMPT_TUNE, special=True ): - STOP_TOKENS.append(f'{role}') + if role: + STOP_TOKENS.append(f'{role}') return STOP_TOKENS for k in PROMPTS: @@ -204,4 +170,7 @@ def get_prompt( key, tune=LLM_PROMPT_TUNE ): output = output.replace("{query}", query) outputs.append(output) - return "\n".join(outputs) \ No newline at end of file + output = "\n".join(outputs) + #if LLM_PROMPT_TUNE == "cocktail": + output = output.strip() + return output \ No newline at end of file diff --git a/src/main.py b/src/main.py index 504a3b1..6639fe0 100755 --- a/src/main.py +++ b/src/main.py @@ -37,8 +37,7 @@ def agent_observes_proxy( agents, observations ): if agent not in AGENTS: load_agent( agent ) agent = AGENTS[agent] - observations = observations.split("\n") - results = agent_observes( agent, observations ) + results = agent_observes( agent, observations.split("\n") ) messages.append(f"[{agent.name}] Observation noted. Importance score: {[ result[0] for result in results ]}") return "\n".join(messages) @@ -51,8 +50,7 @@ def agent_reacts_proxy( agents, observations ): if agent not in AGENTS: load_agent( agent ) agent = AGENTS[agent] - observations = observations.split("\n") - response = agent_reacts( agent, observations ) + response = agent_reacts( agent, observations.split("\n") ) messages.append(f"[{agent.name}] {response}") return "\n".join(messages) @@ -80,29 +78,35 @@ def get_summary_proxy( agents ): messages.append(get_summary( agent, force_refresh = True )) return "\n".join(messages) -def run_conversation_proxy( agents, observation, limit=2 ): +def run_conversation_proxy( agents, message, limit=4 ): agents = [ AGENTS[agent] for agent in agents ] if len(agents) < 2: raise "Not enough agents" dialogue = [] - dialogue.append(f'[{agents[0].name}] {observation}') + dialogue.append(f'[{agents[0].name}] {message}') + yield "\n".join(dialogue) """Runs a conversation between agents.""" print(colored("[Conversation]", "magenta")) - yield "\n".join(dialogue) - agent_observes( agents[0], [observation] ) + importance_score = 0 + for agent in agents: + importance_score = agent_observes( agent, [ message ], importance_score=importance_score )[0][0] agents = agents[1:] + [agents[0]] dialogue = [] while True: for agent in agents: - observation = agent_reacts( agent, [ observation ] )[0] - yield observation - if limit > 0 and len(dialogue) >= limit: + message = agent_reacts( agent, [ message ] )[0] + importance_score = 0 + for a in agents: + importance_score = agent_observes( a, [ message ], importance_score=importance_score )[0][0] + + dialogue.append(f'[{agent.name}] {message}') + yield "\n".join(dialogue) + if limit > 0 and len(dialogue) >= limit * len(agents): break - return dialogue print("END") dialogue.append("END") return "\n".join(dialogue) diff --git a/src/utils.py b/src/utils.py index 8239ea0..0a3fd94 100755 --- a/src/utils.py +++ b/src/utils.py @@ -27,14 +27,20 @@ from langchain.vectorstores import FAISS LLM_TYPE = 
os.environ.get('LLM_TYPE', "llamacpp") # options: llamacpp, oai LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL', #"./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin" - "./models/llama-13b-supercot-ggml/ggml-model-q4_2.bin" + "./models/ggml-vicuna-13b-cocktail-v1-q5_0.bin" + #"./models/llama-13b-supercot-ggml/ggml-model-q4_2.bin" #"./models/llama-33b-supercot-ggml/ggml-model-q4_2.bin" #"./models/gpt4-x-alpasta-30b-ggml-q4_1.bin" ) LLM_CONTEXT = int(os.environ.get('LLM_CONTEXT', '2048')) LLM_THREADS = int(os.environ.get('LLM_THREADS', '6')) +LLM_TEMPERATURE = float(os.environ.get('LLM_TEMPERATURE', '0.99')) EMBEDDING_TYPE = os.environ.get("LLM_EMBEDDING_TYPE", "hf") # options: llamacpp, oai, hf +#LLM_TYPE="oai" +#os.environ['OPENAI_API_BASE']="https://oai.ecker.tech/proxy/openai" +#os.environ['OPENAI_API_KEY']="" + # deduce a default given a model path if LLM_TYPE=="oai": LLM_PROMPT_TUNE_DEFAULT = "oai" @@ -45,6 +51,8 @@ else: LLM_PROMPT_TUNE_DEFAULT = "vicuna" elif "alpasta" in LLM_LOCAL_MODEL.lower(): LLM_PROMPT_TUNE_DEFAULT = "alpasta" + elif "cocktail" in LLM_LOCAL_MODEL.lower(): + LLM_PROMPT_TUNE_DEFAULT = "cocktail" else: LLM_PROMPT_TUNE_DEFAULT = "llama" @@ -64,6 +72,7 @@ if LLM_TYPE=="llamacpp": callback_manager=callback_manager, verbose=True, n_ctx=LLM_CONTEXT, + temperature=LLM_TEMPERATURE, #n_threads=LLM_THREADS, #use_mlock=True, #use_mmap=True, @@ -89,6 +98,7 @@ elif LLM_TYPE=="oai": LLM = ChatOpenAI( max_tokens=LLM_CONTEXT, + temperature=LLM_TEMPERATURE, model_name=os.environ.get('OPENAI_MODEL_NAME', 'gpt-4'), ) @@ -98,7 +108,7 @@ else: if EMBEDDING_TYPE == "hf": from langchain.embeddings import HuggingFaceEmbeddings - EMBEDDINGS_MODEL = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") + EMBEDDINGS_MODEL = HuggingFaceEmbeddings() EMBEDDINGS_SIZE = 768 elif EMBEDDING_TYPE == "oai": from langchain.embeddings import OpenAIEmbeddings @@ -110,10 +120,6 @@ elif EMBEDDING_TYPE == "llamacpp": EMBEDDINGS_MODEL = LlamaCppEmbeddings( model_path=LLM_LOCAL_MODEL, - n_ctx=LLM_CONTEXT, - n_threads=LLM_THREADS, - use_mlock=True, - use_mmap=True, ) EMBEDDINGS_SIZE = 5120 else: @@ -143,7 +149,7 @@ def _create_new_memories(): memory_retriever=_create_new_memory_retriever(), reflection_threshold=8, verbose=True, - max_tokens_limit=128 # LLM_CONTEXT/4 + max_tokens_limit=LLM_CONTEXT/2 ) def create_agent(**kwargs): @@ -190,40 +196,47 @@ def get_summary(agent: GenerativeAgent, force_refresh: bool = True) -> str: print(summary) return summary -def agent_observes( agent: GenerativeAgent, observations: List[str] ): +def agent_observes( agent: GenerativeAgent, observations: List[str], importance_score=0 ): results = [] for observation in observations: observation = observation.replace("{name}", agent.name) - print(colored("[Observation]", "magenta"), observation) - results.append(agent.memory.add_memory(observation)) + print(colored("[Observation]", "magenta"), f'[{agent.name}] {observation}') + results.append(agent.memory.add_memory(observation, importance_score=importance_score)) return results def agent_reacts( agent: GenerativeAgent, observations: List[str] ): results = [] for observation in observations: observation = observation.replace("{name}", agent.name) - print(colored("[Observation]", "magenta"), observation) + print(colored("[Observation]", "magenta"), f'[{agent.name}] {observation}') _, response = agent.generate_response(observation) - print(colored("[Reaction]", "magenta"), response) + print(colored("[Reaction]", "magenta"), f'[{agent.name}] {response}') 
results.append(response) return results -def interview_agent(agent: GenerativeAgent, message: str, username: str = "Person A") -> str: +def interview_agent(agent: GenerativeAgent, message: str) -> str: message = message.replace("{name}", agent.name) - new_message = f"{username} says {message}" - print(colored("[Interview]", "magenta"), message) - return agent.generate_dialogue_response(new_message) + print(colored("[Interview]", "magenta"), f"[User] {message}") + _, response = agent.generate_response(message) + print(colored("[Interview]", "magenta"), f"[{agent.name}] {response}") + return response def run_conversation(agents: List[GenerativeAgent], observation: str, limit: int = 0, p_reaction: float = 1 ) -> None: print(colored("[Conversation]", "magenta")) - agent_observes( agents[0], [observation] ) + for agent in agents: + agent_observes( agent, [observation] ) + agents = agents[1:] + [agents[0]] dialogue = [] while True: for agent in agents: observation = agent_reacts( agent, [ observation ] )[0] + for a in agents: + if a is agent: + continue + agent_observes( a, [ observation ] ) if limit > 0 and len(dialogue) >= limit: break return dialogue \ No newline at end of file
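To make the new `importance_score` pass-through concrete, here is a minimal usage sketch of the pattern `run_conversation_proxy` adopts above: the first `agent_observes` call has the LM rate the utterance, and every other agent then reuses that rating instead of issuing its own scoring query. The `from utils import ...` path and the helper names `broadcast_observation`/`one_round` are assumptions for illustration only.

```python
# Sketch only: share one utterance with several agents while querying the LM
# for an importance rating just once (mirrors run_conversation_proxy in src/main.py).
from utils import agent_observes, agent_reacts  # import path is an assumption


def broadcast_observation(agents, message):
    importance_score = 0
    for agent in agents:
        # first pass: importance_score == 0, so add_memory asks the LM for a rating;
        # later passes: the returned score is handed back in and reused as-is
        importance_score = agent_observes(agent, [message], importance_score=importance_score)[0][0]
    return importance_score


def one_round(agents, message):
    # one conversational turn: each agent reacts in order, and every agent
    # (including the speaker) records the reply as a memory
    for agent in agents:
        message = agent_reacts(agent, [message])[0]
        broadcast_observation(agents, message)
    return message
```

With a local model, where each call can take seconds, this keeps memory scoring at one LM query per utterance rather than one per agent.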