Merge pull request 'feat/human-eval' (#2) from feat/human-eval into main

Reviewed-on: #2
commit 0919190fd7 by grailfinder, 2023-07-26 13:20:13 +00:00
6 changed files with 182 additions and 7 deletions

.gitignore (1 line changed)

@@ -1 +1,2 @@
 *json
+__pycache__

human_eval_example.png (new binary file, 111 KiB; binary content not shown)

human_eval_gui.py (new executable file, 140 lines)

@@ -0,0 +1,140 @@
#!/usr/bin/env python
from jane_index import presets, prompts
import tkinter as tk
from tkinter import ttk
from tkinter.font import Font
import json
import sys

FILENAME = ""


def update_host_label():
    # note: find_hosts, CURRENT_FILE and host_label are not defined in this file;
    # this helper is not called anywhere below
    hosts = find_hosts(CURRENT_FILE)
    newlabel = "found in " + ",".join(hosts)
    host_label.config(text=newlabel)


def init():
    # read the answers json passed as the first CLI argument
    if len(sys.argv) < 2 or not sys.argv[1].endswith(".json"):
        print("provide a path to a json file as an argument")
        sys.exit(1)
    global FILENAME
    FILENAME = sys.argv[1]
    with open(FILENAME) as lf:
        answers = json.load(lf)
    return answers


def load_next_answer(answers):
    global current_index
    current_index += 1
    if current_index >= len(answers["answers"]):
        # all answers scored: compute the final scores and show them in the text box
        scores = calculate_index(answers)
        answer_label.delete('1.0', tk.END)
        answer_label.insert('1.0', json.dumps(scores, indent=2))
        return
    current_answer = (answers["answers"][current_index]["added_prompt"]
                      + "\n"
                      + answers["answers"][current_index]["response"])
    current_preset = answers["answers"][current_index]["preset"]
    answer_label.delete('1.0', tk.END)
    answer_label.insert('1.0', current_answer)
    preset_label.config(text=current_preset)
    progress_label.config(text=f"progress: {current_index} / {answer_length}")


def calculate_index(answers):
    # count successes per preset, write success rates back into the json file
    preset_succ_counter = {p: 0 for p in presets}
    scorer_name = "human_eval"
    for preset in presets:
        for answer in answers["answers"]:
            if answer["preset"] == preset:
                if "success" in answer[f"{scorer_name}_score"].lower():
                    preset_succ_counter[preset] += 1
    # overall success rate across all answers
    total_successes = sum(preset_succ_counter.values())
    answers[f"{scorer_name}_success_rate_full"] = total_successes / len(answers["answers"])
    # success rate for every preset
    for p, v in preset_succ_counter.items():
        answers[f"{scorer_name}_success_rate_{p}"] = round(v / len(prompts), 2)
    with open(FILENAME, "w") as lf:
        json.dump(answers, lf, indent=4)
    # return only the eval keys
    di_to_return = {}
    for k, v in answers.items():
        if scorer_name in k:
            di_to_return[k] = v
    return di_to_return


def success_click():
    global answers
    if current_index >= len(answers["answers"]):
        return
    answers["answers"][current_index]["human_eval_score"] = "SUCCESS"
    load_next_answer(answers)


def fail_click():
    global answers
    if current_index >= len(answers["answers"]):
        return
    answers["answers"][current_index]["human_eval_score"] = "FAIL"
    load_next_answer(answers)


if __name__ == "__main__":
    answers = init()
    answer_length = len(answers["answers"])
    current_index = 0
    current_answer = (answers["answers"][current_index]["added_prompt"]
                      + "\n"
                      + answers["answers"][current_index]["response"])
    current_preset = answers["answers"][current_index]["preset"]

    root = tk.Tk()
    root.geometry("1000x800")
    root.resizable(True, True)
    root.title("human score gui")
    textFont = Font(size=16)

    quit_btn = tk.Button(root, text="Quit", command=root.destroy)
    answer_label = tk.Text(root, font=textFont, wrap=tk.WORD)
    answer_label.insert('1.0', current_answer)
    preset_label = tk.Label(root, text=f"preset: {current_preset}", font=textFont)
    progress_label = tk.Label(root, text=f"progress: {current_index} / {answer_length}", font=textFont)

    load_frame = tk.Frame(root)
    success_btn = tk.Button(load_frame, text="success", command=success_click, font=textFont)
    success_btn.grid(row=1, column=0)
    fail_btn = tk.Button(load_frame, text="fail", command=fail_click, font=textFont)
    fail_btn.grid(row=1, column=1)

    answer_label.pack(
        expand=True,
        fill=tk.BOTH,
    )
    load_frame.pack(
        expand=True,
    )
    preset_label.pack()
    progress_label.pack()
    quit_btn.pack(
        ipadx=5,
        ipady=7,
        expand=True,
    )
    root.mainloop()
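
For reference, a minimal sketch of the json structure the GUI expects; the field values here are made up, real files are produced by `jane_index.py`, and `human_eval_score` is what this script writes back:
```
{
    "answers": [
        {
            "preset": "simple-1",
            "added_prompt": "example task prompt",
            "response": "example model response",
            "human_eval_score": "SUCCESS"
        }
    ]
}
```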

@@ -68,12 +68,14 @@ MODEL_URL = "http://127.0.0.1:5000/api/v1/model"
 def make_prompt_body(p, preset, use_description=True):
     body = {
         "preset": preset,
-        "max_new_tokens": 500,
+        "max_new_tokens": 250,
         "added_prompt": p
     }
-    body["description"] = DESCRIPTION
     if use_description:
+        body["description"] = DESCRIPTION
         body["prompt"] = DESCRIPTION + p
+        return body
     body["prompt"] = p
     return body
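
Reassembled for readability, the new version of the function behaves as in this sketch; `DESCRIPTION` here is only a placeholder for the description string defined elsewhere in the script:
```
# sketch only: DESCRIPTION stands in for the real description string
DESCRIPTION = "example description text "

def make_prompt_body(p, preset, use_description=True):
    body = {
        "preset": preset,
        "max_new_tokens": 250,
        "added_prompt": p,
    }
    if use_description:
        # the description branch now returns early instead of falling through
        body["description"] = DESCRIPTION
        body["prompt"] = DESCRIPTION + p
        return body
    body["prompt"] = p
    return body

# prompt includes the description
print(make_prompt_body("sit", "simple-1")["prompt"])
# prompt is the bare added prompt
print(make_prompt_body("sit", "simple-1", use_description=False)["prompt"])
```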

@@ -5,12 +5,43 @@ Test of existing presets within ooba's webui by generating actions of the quadru
### Initial Generation (generative model)
- start ooba's text generation webui service on port 5000;
- load the model you want to use for generation;
`./jane_index.py`
will generate a json file named after that model (for example: TheBloke_Llama-2-13B-GPTQ.json);
### Scoring (judge model)
- go to ooba's webui and set the judge model (preferably 65b+);
`./jane_index.py TheBloke_Llama-2-13B-GPTQ.json`
the judge scores each generation as success or failure and adds the scores to the initial json file,
where 1 is a perfect score and 0 is a complete failure.
### Index example in json file
success-rate scores for answers from `TheBloke_Nous-Hermes-Llama2-GTPQ`,
stored inside the file `TheBloke_Nous-Hermes-Llama2-GTPQ.json`,
as scored by the judge `MetaIX_GPT4-X-Alpasta-30b-4bit`:
```
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_full": 0.6979166666666666,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Asterism": 0.83,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Big O": 0.83,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Contrastive Search": 0.67,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Debug-deterministic": 0.67,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Divine Intellect": 0.33,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Kobold-Godlike": 1.0,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_LLaMA-Precise": 0.5,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Midnight Enigma": 0.67,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Mirostat": 0.83,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Shortwave": 0.83,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_simple-1": 1.0,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Space Alien": 0.67,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_StarChat": 0.67,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_tfs-with-top-a": 0.67,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Titanic": 0.33,
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Yara": 0.67
```
### Run human eval gui
`./human_eval_gui.py TheBloke_Llama-2-13B-GPTQ.json`
a window like the one below should appear
![gui_screen](./human_eval_example.png)
after you finish scoring all answers, the final scores appear in the text box;
at this point you may quit the gui and check the scores in the json file.
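
If you prefer to pull the numbers out programmatically instead of scanning the json by hand, a small sketch (using the example file name from above):
```
#!/usr/bin/env python
# print every success-rate key from a scored json file, highest rate first
import json

with open("TheBloke_Nous-Hermes-Llama2-GTPQ.json") as f:
    data = json.load(f)

rates = {k: v for k, v in data.items() if "_success_rate_" in k}
for key, rate in sorted(rates.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{rate:.2f}  {key}")
```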

@@ -1,2 +1,3 @@
 requests
 tqdm
+tkinter