reworked agent saving/loading to store the memory documents themselves and re-add them on load, rather than serializing the entire memory object (which broke when moving between systems); also added a wizard-vicuna prompt tune and more prompt tunings
parent 287406e7ba
commit 6fa2c18fb1
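In short, the new save/load path only pickles plain fields plus the raw memory `Document`s, and re-adds those documents to a freshly built retriever on load, so the vector store is rebuilt on the current system instead of being deserialized. A rough sketch of the idea (not the repo's exact code; `create_agent` and the `memory_retriever` layout are assumed from the diff below):

```python
# Rough sketch of the reworked save/load. create_agent() is the repo's factory
# (assumed signature), and memory_retriever is assumed to be a langchain
# TimeWeightedVectorStoreRetriever-style object holding Documents in memory_stream.
import pickle

def save_agent(agent):
    obj = {
        "name": agent.name,
        "status": agent.status,
        "summary": agent.summary,
        "summaries": agent.summaries,
        # store the raw memory Documents instead of pickling the retriever / vector store
        "memories": [memory for memory in agent.memory.memory_retriever.memory_stream],
    }
    pickle.dump(obj, open(f"./agents/{agent.name}.pth", "wb"))

def load_agent(name):
    obj = pickle.load(open(f"./agents/{name}.pth", "rb"))
    agent = create_agent(**obj)  # rebuilds a fresh memory / retriever / vector store
    for memory in obj["memories"]:
        # re-embed each saved Document into the newly built store
        agent.memory.memory_retriever.add_documents([memory])
    return agent
```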
@@ -44,7 +44,7 @@ I ***do not*** plan on making this uber-user friendly like [mrq/ai-voice-cloning
 ## Caveats
 
-A local LM is quite slow. Things seem to be getting faster as llama.cpp is being developed.
+A local LM is quite slow. Things seem to be getting faster as llama.cpp is being developed. GPU offloading (and the OpenCL PR) seems to bring some very nice hope of scrapping this Python implementation and integrating it entirely in C++.
 
 Even using one that's more instruction-tuned like Vicuna (with a `SYSTEM:\nUSER:\nASSISTANT:` structure of prompts), it's still inconsistent.
@@ -122,14 +122,12 @@ class GenerativeAgent(BaseModel):
         memory = ""
 
         for mem in relevant_memories:
-            if mem in summary or mem in memory:
+            if mem in summary or mem in memory or mem in observation:
                 continue
             memory += f"\n{mem}"
 
         for mem in recent_memories:
-            if mem in summary:
-                continue
-            if mem is observation:
+            if mem in summary or mem in observation:
                 continue
             # erase it, move it to bottom
             if mem in memory:
@@ -169,7 +167,7 @@ class GenerativeAgent(BaseModel):
         if self.verbose:
             print(reaction)
-        return reaction
+        return f'{self.name}: {reaction}'
 
     def generate_response(self, observation: str) -> Tuple[bool, str]:
         """React to a given observation."""
@@ -195,7 +193,7 @@ class GenerativeAgent(BaseModel):
         summary = self.chain(prompt).run(stop=get_stop_tokens(), name=self.name, summary=self.summaries[-1] if len(self.summaries) else self.summary, queries=[f"{self.name}'s core characteristics"]).strip()
         if self.verbose:
             print(summary)
-        return summary
+        return f'{self.name} {summary}'
 
     def get_summary(self, force_refresh: bool = False) -> str:
         """Return a descriptive summary of the agent."""
@@ -61,8 +61,7 @@ PROMPTS = {
     },
     "generate_reaction": {
         "system": (
-            "[Write one reply. Always stay in character. Maintain a casual tone using beige prose. Be brief. Avoid repeating anything below.]"
-            "\nCurrent Time: {current_time}"
+            #"\nCurrent Time: {current_time}" # commented out, not necessary if I'm not passing time anyways, and I think bigger LLMs would only take advantage of it / llama's prompt caching will get ruined with this changing
             "\n{summary}"
             "\n{memory}"
             "\n{observation}"
@@ -70,7 +69,7 @@ PROMPTS = {
         "user": (
             "{suffix}"
         ),
-        "assistant": ""
+        "assistant": "{name}: "
     },
 
     #
@@ -88,6 +87,7 @@ PROMPT_TUNES = {
     "supercot": "{role}:\n{query}",
     "alpasta": "{role}# {query}",
     "cocktail": "{role}: {query}",
+    "wizard-vicuna": "{role}: {query}",
 }
 PROMPT_ROLES = {
     "vicuna": {
@@ -100,6 +100,11 @@ PROMPT_ROLES = {
         "user": "### Input",
         "assistant": "### Response",
     },
+    "wizard-vicuna": {
+        "system": "### Instruction",
+        "user": "### Input",
+        "assistant": "### Response",
+    },
     "alpasta": {
         "system": "<|system|>",
         "user": "<|user|>",
@@ -173,4 +178,5 @@ def get_prompt( key, tune=LLM_PROMPT_TUNE ):
     output = "\n".join(outputs)
     #if LLM_PROMPT_TUNE == "cocktail":
     output = output.strip()
+    print([output[-1]])
     return output
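For illustration, a minimal sketch of how the new `wizard-vicuna` entries above would render a `generate_reaction` prompt. The template and role strings are taken from the diff; `render()` itself only approximates what the repo's `get_prompt()` does:

```python
# Minimal sketch, not the repo's get_prompt(): format each (role, query) pair with
# the "wizard-vicuna" tune template, join with newlines, and strip the result.
PROMPT_TUNE = "{role}: {query}"          # PROMPT_TUNES["wizard-vicuna"]
ROLES = {                                # PROMPT_ROLES["wizard-vicuna"]
    "system": "### Instruction",
    "user": "### Input",
    "assistant": "### Response",
}

def render(messages):
    outputs = [PROMPT_TUNE.format(role=ROLES[role], query=query.strip()) for role, query in messages]
    return "\n".join(outputs).strip()

print(render([
    ("system", "{summary}\n{memory}\n{observation}"),
    ("user", "{suffix}"),
    ("assistant", "{name}: "),
]))
# -> "### Instruction: {summary}\n{memory}\n{observation}\n### Input: {suffix}\n### Response: {name}:"
```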
@@ -78,7 +78,7 @@ def get_summary_proxy( agents ):
         messages.append(get_summary( agent, force_refresh = True ))
     return "\n".join(messages)
 
-def run_conversation_proxy( agents, message, limit=4 ):
+def run_conversation_proxy( agents, message, limit=32 ):
     agents = [ AGENTS[agent] for agent in agents ]
 
     if len(agents) < 2:
@@ -95,7 +95,6 @@ def run_conversation_proxy( agents, message, limit=4 ):
         importance_score = agent_observes( agent, [ message ], importance_score=importance_score )[0][0]
     agents = agents[1:] + [agents[0]]
 
-    dialogue = []
     while True:
         for agent in agents:
             message = agent_reacts( agent, [ message ] )[0]
src/utils.py | 25
@@ -27,28 +27,33 @@ from langchain.vectorstores import FAISS
 LLM_TYPE = os.environ.get('LLM_TYPE', "llamacpp") # options: llamacpp, oai
 LLM_LOCAL_MODEL = os.environ.get('LLM_MODEL',
     #"./models/ggml-vicuna-13b-1.1/ggml-vic13b-uncensored-q4_2.bin"
-    "./models/ggml-vicuna-13b-cocktail-v1-q5_0.bin"
+    #"./models/ggml-vicuna-13b-cocktail-v1-q5_0.bin"
+    #"./models/WizardML-Unc-13b-Q5_1.bin"
     #"./models/llama-13b-supercot-ggml/ggml-model-q4_2.bin"
     #"./models/llama-33b-supercot-ggml/ggml-model-q4_2.bin"
     #"./models/gpt4-x-alpasta-30b-ggml-q4_1.bin"
 
+    "./models/Wizard-Vicuna-13B-Uncensored.ggml.q5_1.bin"
+    #"./models/wizardlm-13b-uncensored-ggml-q5_1.bin"
 )
 LLM_CONTEXT = int(os.environ.get('LLM_CONTEXT', '2048'))
 LLM_THREADS = int(os.environ.get('LLM_THREADS', '6'))
+LLM_GPU_LAYERS = int(os.environ.get('LLM_GPU_LAYERS', '99'))
 LLM_TEMPERATURE = float(os.environ.get('LLM_TEMPERATURE', '0.99'))
 EMBEDDING_TYPE = os.environ.get("LLM_EMBEDDING_TYPE", "hf") # options: llamacpp, oai, hf
 
-#LLM_TYPE="oai"
-#os.environ['OPENAI_API_BASE']="https://oai.ecker.tech/proxy/openai"
-#os.environ['OPENAI_API_KEY']=""
 
 # deduce a default given a model path
 if LLM_TYPE=="oai":
     LLM_PROMPT_TUNE_DEFAULT = "oai"
 else:
     if "supercot" in LLM_LOCAL_MODEL.lower():
         LLM_PROMPT_TUNE_DEFAULT = "supercot"
+    if "wizard-vicuna" in LLM_LOCAL_MODEL.lower():
+        LLM_PROMPT_TUNE_DEFAULT = "wizard-vicuna"
     elif "vicuna" in LLM_LOCAL_MODEL.lower():
         LLM_PROMPT_TUNE_DEFAULT = "vicuna"
+    elif "wizard" in LLM_LOCAL_MODEL.lower():
+        LLM_PROMPT_TUNE_DEFAULT = "vicuna"
     elif "alpasta" in LLM_LOCAL_MODEL.lower():
         LLM_PROMPT_TUNE_DEFAULT = "alpasta"
     elif "cocktail" in LLM_LOCAL_MODEL.lower():
@@ -72,6 +77,7 @@ if LLM_TYPE=="llamacpp":
         callback_manager=callback_manager,
         verbose=True,
         n_ctx=LLM_CONTEXT,
+        n_gpu_layers=LLM_GPU_LAYERS,
         temperature=LLM_TEMPERATURE,
         #n_threads=LLM_THREADS,
         #use_mlock=True,
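For context, a minimal sketch of how the new `LLM_GPU_LAYERS` setting feeds llama.cpp GPU offloading through langchain's `LlamaCpp` wrapper. The kwargs mirror the hunk above; it assumes a langchain build whose `LlamaCpp` already exposes `n_gpu_layers`, and the callback handling is assumed:

```python
# Sketch only: environment variable names and kwargs mirror src/utils.py above.
import os
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path=os.environ.get("LLM_MODEL", "./models/Wizard-Vicuna-13B-Uncensored.ggml.q5_1.bin"),
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
    n_ctx=int(os.environ.get("LLM_CONTEXT", "2048")),
    n_gpu_layers=int(os.environ.get("LLM_GPU_LAYERS", "99")),  # layers offloaded to the GPU; 99 ~ "as many as fit"
    temperature=float(os.environ.get("LLM_TEMPERATURE", "0.99")),
)
```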
@@ -176,8 +182,9 @@ def save_agent( agent ):
         "status": agent.status,
         "summary": agent.summary,
         "summaries": agent.summaries,
-        "memory_retriever": agent.memory.memory_retriever,
+        "memories": [ memory for memory in agent.memory.memory_retriever.memory_stream ]
     }
 
     path = f"./agents/{agent.name}.pth"
     pickle.dump(obj, open(path, 'wb'))
     print(f"Saved agent:", path)
@@ -186,7 +193,11 @@ def load_agent( name ):
     path = f"./agents/{name}.pth"
     obj = pickle.load(open(path, 'rb'))
     agent = create_agent(**obj)
-    agent.memory.memory_retriever = obj["memory_retriever"]
+    #agent.memory.memory_retriever.memory_stream = obj["memories"]
+    for memory in obj["memories"]:
+        print("Loaded memory:", memory)
+        agent.memory.memory_retriever.add_documents([memory])
 
     print(f"Loaded agent:", path)
     return agent