Merge pull request 'feat/human-eval' (#2) from feat/human-eval into main
Reviewed-on: #2
This commit is contained in:
commit
0919190fd7
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1 +1,2 @@
|
|||
*json
|
||||
__pycache__
|
||||
|
|
BIN
human_eval_example.png
Normal file
BIN
human_eval_example.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 111 KiB |
140
human_eval_gui.py
Executable file
140
human_eval_gui.py
Executable file
|
@ -0,0 +1,140 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from jane_index import presets, prompts
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
from tkinter.font import Font
|
||||
import json
|
||||
|
||||
FILENAME = ""
|
||||
|
||||
|
||||
def update_host_label():
    """Refresh host_label with the hosts found in the current file.

    NOTE(review): `find_hosts`, `CURRENT_FILE` and `host_label` are not
    defined anywhere in this file -- this looks like a leftover from
    another script; confirm before wiring it to the GUI.
    """
    found = find_hosts(CURRENT_FILE)
    host_label.config(text="found in " + ",".join(found))
||||
def init():
    """Parse the command line, load the answers JSON file, and return it.

    Expects the path of a .json file as the first CLI argument and exits
    with status 1 otherwise.  Stores the path in the module-level
    FILENAME so calculate_index() can later write the scores back.

    Returns:
        dict: the parsed contents of the JSON file.
    """
    import sys

    # Original used `os.sys.argv`, which only works because `os`
    # happens to re-export `sys`; use `sys` directly.
    if len(sys.argv) < 2 or not sys.argv[1].endswith(".json"):
        print("provide path json file as an argument")
        sys.exit(1)

    global FILENAME
    FILENAME = sys.argv[1]
    with open(FILENAME) as lf:
        answers = json.load(lf)
    return answers
|
||||
|
||||
def load_next_answer(answers):
    """Advance to the next answer and display it in the GUI.

    Once every answer has been scored, computes the final scores via
    calculate_index() and shows them in the text widget instead.
    Uses the module-level widgets and counters set up in __main__.
    """
    global current_index
    current_index += 1

    entries = answers["answers"]
    if current_index >= len(entries):
        # All answers scored: show the aggregated scores in place of an answer.
        scores = calculate_index(answers)
        answer_label.delete('1.0', tk.END)
        answer_label.insert('1.0', scores)
        return

    entry = entries[current_index]
    shown_text = entry["added_prompt"] + "\n" + entry["response"]

    answer_label.delete('1.0', tk.END)
    answer_label.insert('1.0', shown_text)
    preset_label.config(text=entry["preset"])
    progress_label.config(text=f"progress: {current_index} / {answer_length}")
|
||||
|
||||
def calculate_index(answers):
    """Aggregate human-eval scores, persist them, and return them.

    Counts per-preset successes (an answer counts as a success when its
    ``human_eval_score`` contains "success", case-insensitively), writes
    the success rates back into ``answers`` and into the JSON file at
    FILENAME, and returns only the eval-related keys.

    Args:
        answers: dict with an "answers" list of scored generations.

    Returns:
        dict: the ``human_eval``-prefixed score keys and their values.
    """
    scorer_name = "human_eval"
    preset_succ_counter = {p: 0 for p in presets}

    for preset in presets:
        for answer in answers["answers"]:
            if answer["preset"] == preset:
                if "success" in answer[f"{scorer_name}_score"].lower():
                    preset_succ_counter[preset] += 1

    # BUG FIX: the original referenced an undefined `success_counter`
    # (NameError).  The overall success count is the sum of the
    # per-preset counters.
    success_counter = sum(preset_succ_counter.values())
    answers[f"{scorer_name}_success_rate_full"] = success_counter / len(answers["answers"])

    # Per-preset rate: each preset is asked every prompt once,
    # so the denominator is the number of prompts.
    for p, v in preset_succ_counter.items():
        answers[f"{scorer_name}_success_rate_{p}"] = round(v / len(prompts), 2)

    with open(FILENAME, "w") as lf:
        json.dump(answers, lf, indent=4)

    # Return only the eval keys so the GUI can display a compact summary.
    di_to_return = {}
    for k, v in answers.items():
        if scorer_name in k:
            di_to_return[k] = v
    return di_to_return
|
||||
|
||||
|
||||
def success_click():
    """Mark the current answer as SUCCESS and advance to the next one."""
    global answers
    # BUG FIX: must be >= (not >).  After the last answer is scored,
    # load_next_answer() leaves current_index == len(answers["answers"]),
    # and indexing with it would raise IndexError on a further click.
    if current_index >= len(answers["answers"]):
        return
    answers["answers"][current_index]["human_eval_score"] = "SUCCESS"
    load_next_answer(answers)
|
||||
|
||||
def fail_click():
    """Mark the current answer as FAIL and advance to the next one."""
    global answers
    # BUG FIX: must be >= (not >).  After the last answer is scored,
    # load_next_answer() leaves current_index == len(answers["answers"]),
    # and indexing with it would raise IndexError on a further click.
    if current_index >= len(answers["answers"]):
        return
    answers["answers"][current_index]["human_eval_score"] = "FAIL"
    load_next_answer(answers)
|
||||
|
||||
if __name__ == "__main__":
    # Load the answers JSON named on the command line; init() also sets
    # the module-level FILENAME that calculate_index() writes back to.
    answers = init()
    answer_length = len(answers["answers"])
    current_index = 0

    # First answer shown: added_prompt and response joined by a newline.
    current_answer = answers["answers"][current_index]["added_prompt"] + "\n" + answers["answers"][current_index]["response"]
    current_preset = answers["answers"][current_index]["preset"]

    # Root window setup.
    root = tk.Tk()
    root.geometry("1000x800")
    root.resizable(True, True)
    root.title("human score gui")
    textFont = Font(size=16)

    quit_btn = tk.Button(root, text="Quit", command=root.destroy)
    # A Text widget (not a Label) so long answers wrap and scroll.
    # answer_label = tk.Label(root, text=current_answer, font=textFont)
    answer_label = tk.Text(root, font=textFont, wrap=tk.WORD)
    answer_label.insert('1.0', current_answer)

    # These labels are updated in place by load_next_answer().
    preset_label = tk.Label(root, text=f"preset: {current_preset}", font=textFont)
    progress_label = tk.Label(root, text=f"progress: {current_index} / {answer_length}", font=textFont)

    # Frame holding the two scoring buttons side by side.
    load_frame = tk.Frame(root)

    # NOTE(review): .grid() returns None, so success_btn/fail_btn are None.
    # Harmless here because the variables are never used afterwards.
    success_btn = tk.Button(load_frame, text="success",
                            command=success_click, font=textFont).grid(row=1, column=0)
    fail_btn = tk.Button(load_frame, text="fail", command=fail_click, font=textFont).grid(row=1, column=1)

    # Layout: answer text on top (takes remaining space), then buttons,
    # then the info labels and the quit button.
    answer_label.pack(
        expand=True,
        fill=tk.BOTH,
    )
    load_frame.pack(
        expand=True,
    )
    preset_label.pack()
    progress_label.pack()
    quit_btn.pack(
        ipadx=5,
        ipady=7,
        expand=True,
    )
    root.mainloop()
|
||||
|
||||
|
||||
|
|
@ -68,12 +68,14 @@ MODEL_URL = "http://127.0.0.1:5000/api/v1/model"
|
|||
def make_prompt_body(p, preset, use_description=True):
    """Build the JSON request body for a generation call.

    Args:
        p: the prompt text to send.
        preset: name of the generation preset to use.
        use_description: when True, include the module-level DESCRIPTION
            in the body and prepend it to the prompt.

    Returns:
        dict: the request body.

    NOTE(review): reconstructed from a diff that showed both old and new
    lines interleaved -- duplicate "max_new_tokens" keys (500 then 250,
    where 250 silently wins) and a duplicated `if use_description`
    check.  250 is kept as the newer value; confirm against the repo.
    """
    body = {
        "preset": preset,
        "max_new_tokens": 250,
        "added_prompt": p,
    }
    if use_description:
        body["description"] = DESCRIPTION
        body["prompt"] = DESCRIPTION + p
        return body

    body["prompt"] = p
    return body
|
||||
|
||||
|
|
31
readme.md
31
readme.md
|
@ -14,3 +14,34 @@ will generate json file with that model name (for ex: TheBloke_Llama-2-13B-GPTQ.
|
|||
`./jane_index.py TheBloke_Llama-2-13B-GPTQ.json`
|
||||
The judge scores each generation as success or failure and modifies the initial json file to add the scores,
where 1 is a perfect score and 0 is a complete failure.
|
||||
|
||||
### Index example in json file
|
||||
scores of answers `TheBloke_Nous-Hermes-Llama2-GTPQ`
|
||||
inside of file `TheBloke_Nous-Hermes-Llama2-GTPQ.json`
|
||||
by judge `MetaIX_GPT4-X-Alpasta-30b-4bit`
|
||||
```
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_full": 0.6979166666666666,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Asterism": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Big O": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Contrastive Search": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Debug-deterministic": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Divine Intellect": 0.33,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Kobold-Godlike": 1.0,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_LLaMA-Precise": 0.5,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Midnight Enigma": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Mirostat": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Shortwave": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_simple-1": 1.0,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Space Alien": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_StarChat": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_tfs-with-top-a": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Titanic": 0.33,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Yara": 0.67
|
||||
```
|
||||
|
||||
### Run human eval gui
|
||||
`./human_eval_gui.py TheBloke_Llama-2-13B-GPTQ.json`
|
||||
a screen like the one below should appear
|
||||

|
||||
after you finish all objects score will appear in the text box,
|
||||
at this point you may quit the gui and check scores in json file.
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
requests
|
||||
tqdm
|
||||
tkinter
|
||||
|
|
Loading…
Reference in New Issue
Block a user