Merge pull request 'feat/human-eval' (#2) from feat/human-eval into main
Reviewed-on: #2
This commit is contained in:
commit
0919190fd7
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1 +1,2 @@
|
|||
*json
|
||||
__pycache__
|
||||
|
|
BIN
human_eval_example.png
Normal file
BIN
human_eval_example.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 111 KiB |
140
human_eval_gui.py
Executable file
140
human_eval_gui.py
Executable file
|
@ -0,0 +1,140 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from jane_index import presets, prompts
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
from tkinter.font import Font
|
||||
import json
|
||||
|
||||
FILENAME = ""
|
||||
|
||||
|
||||
def update_host_label():
    """Refresh host_label with the hosts found in the current file.

    NOTE(review): `find_hosts`, `CURRENT_FILE` and `host_label` are not
    defined anywhere in this file -- this looks like a leftover from
    another script; confirm before wiring it to the GUI.
    """
    found = find_hosts(CURRENT_FILE)
    host_label.config(text="found in " + ",".join(found))
||||
def init():
    """Parse the command line, load the answers JSON file, and return it.

    Expects the path of a .json file as the first CLI argument and exits
    with status 1 otherwise.  Stores the path in the module-level
    FILENAME so calculate_index() can later write the scores back.

    Returns:
        dict: the parsed contents of the JSON file.
    """
    import sys

    # Original used `os.sys.argv`, which only works because `os`
    # happens to re-export `sys`; use `sys` directly.
    if len(sys.argv) < 2 or not sys.argv[1].endswith(".json"):
        print("provide path json file as an argument")
        sys.exit(1)

    global FILENAME
    FILENAME = sys.argv[1]
    with open(FILENAME) as lf:
        answers = json.load(lf)
    return answers
|
||||
|
||||
def load_next_answer(answers):
    """Advance to the next answer and display it in the GUI.

    Once every answer has been scored, computes the final scores via
    calculate_index() and shows them in the text widget instead.
    Uses the module-level widgets and counters set up in __main__.
    """
    global current_index
    current_index += 1

    entries = answers["answers"]
    if current_index >= len(entries):
        # All answers scored: show the aggregated scores in place of an answer.
        scores = calculate_index(answers)
        answer_label.delete('1.0', tk.END)
        answer_label.insert('1.0', scores)
        return

    entry = entries[current_index]
    shown_text = entry["added_prompt"] + "\n" + entry["response"]

    answer_label.delete('1.0', tk.END)
    answer_label.insert('1.0', shown_text)
    preset_label.config(text=entry["preset"])
    progress_label.config(text=f"progress: {current_index} / {answer_length}")
|
||||
|
||||
def calculate_index(answers):
    """Aggregate human-eval scores, persist them, and return them.

    Counts per-preset successes (an answer counts as a success when its
    ``human_eval_score`` contains "success", case-insensitively), writes
    the success rates back into ``answers`` and into the JSON file at
    FILENAME, and returns only the eval-related keys.

    Args:
        answers: dict with an "answers" list of scored generations.

    Returns:
        dict: the ``human_eval``-prefixed score keys and their values.
    """
    scorer_name = "human_eval"
    preset_succ_counter = {p: 0 for p in presets}

    for preset in presets:
        for answer in answers["answers"]:
            if answer["preset"] == preset:
                if "success" in answer[f"{scorer_name}_score"].lower():
                    preset_succ_counter[preset] += 1

    # BUG FIX: the original referenced an undefined `success_counter`
    # (NameError).  The overall success count is the sum of the
    # per-preset counters.
    success_counter = sum(preset_succ_counter.values())
    answers[f"{scorer_name}_success_rate_full"] = success_counter / len(answers["answers"])

    # Per-preset rate: each preset is asked every prompt once,
    # so the denominator is the number of prompts.
    for p, v in preset_succ_counter.items():
        answers[f"{scorer_name}_success_rate_{p}"] = round(v / len(prompts), 2)

    with open(FILENAME, "w") as lf:
        json.dump(answers, lf, indent=4)

    # Return only the eval keys so the GUI can display a compact summary.
    di_to_return = {}
    for k, v in answers.items():
        if scorer_name in k:
            di_to_return[k] = v
    return di_to_return
|
||||
|
||||
|
||||
def success_click():
    """Mark the current answer as SUCCESS and advance to the next one."""
    global answers
    # BUG FIX: must be >= (not >).  After the last answer is scored,
    # load_next_answer() leaves current_index == len(answers["answers"]),
    # and indexing with it would raise IndexError on a further click.
    if current_index >= len(answers["answers"]):
        return
    answers["answers"][current_index]["human_eval_score"] = "SUCCESS"
    load_next_answer(answers)
|
||||
|
||||
def fail_click():
    """Mark the current answer as FAIL and advance to the next one."""
    global answers
    # BUG FIX: must be >= (not >).  After the last answer is scored,
    # load_next_answer() leaves current_index == len(answers["answers"]),
    # and indexing with it would raise IndexError on a further click.
    if current_index >= len(answers["answers"]):
        return
    answers["answers"][current_index]["human_eval_score"] = "FAIL"
    load_next_answer(answers)
|
||||
|
||||
if __name__ == "__main__":
    # Load the answers JSON named on the command line; init() also sets
    # the module-level FILENAME that calculate_index() writes back to.
    answers = init()
    answer_length = len(answers["answers"])
    current_index = 0

    # First answer shown: added_prompt and response joined by a newline.
    current_answer = answers["answers"][current_index]["added_prompt"] + "\n" + answers["answers"][current_index]["response"]
    current_preset = answers["answers"][current_index]["preset"]

    # Root window setup.
    root = tk.Tk()
    root.geometry("1000x800")
    root.resizable(True, True)
    root.title("human score gui")
    textFont = Font(size=16)

    quit_btn = tk.Button(root, text="Quit", command=root.destroy)
    # A Text widget (not a Label) so long answers wrap and scroll.
    # answer_label = tk.Label(root, text=current_answer, font=textFont)
    answer_label = tk.Text(root, font=textFont, wrap=tk.WORD)
    answer_label.insert('1.0', current_answer)

    # These labels are updated in place by load_next_answer().
    preset_label = tk.Label(root, text=f"preset: {current_preset}", font=textFont)
    progress_label = tk.Label(root, text=f"progress: {current_index} / {answer_length}", font=textFont)

    # Frame holding the two scoring buttons side by side.
    load_frame = tk.Frame(root)

    # NOTE(review): .grid() returns None, so success_btn/fail_btn are None.
    # Harmless here because the variables are never used afterwards.
    success_btn = tk.Button(load_frame, text="success",
                            command=success_click, font=textFont).grid(row=1, column=0)
    fail_btn = tk.Button(load_frame, text="fail", command=fail_click, font=textFont).grid(row=1, column=1)

    # Layout: answer text on top (takes remaining space), then buttons,
    # then the info labels and the quit button.
    answer_label.pack(
        expand=True,
        fill=tk.BOTH,
    )
    load_frame.pack(
        expand=True,
    )
    preset_label.pack()
    progress_label.pack()
    quit_btn.pack(
        ipadx=5,
        ipady=7,
        expand=True,
    )
    root.mainloop()
|
||||
|
||||
|
||||
|
|
@ -68,12 +68,14 @@ MODEL_URL = "http://127.0.0.1:5000/api/v1/model"
|
|||
def make_prompt_body(p, preset, use_description=True):
    """Build the JSON request body for a generation call.

    Args:
        p: the prompt text to send.
        preset: name of the generation preset to use.
        use_description: when True, include the module-level DESCRIPTION
            in the body and prepend it to the prompt.

    Returns:
        dict: the request body.

    NOTE(review): reconstructed from a diff that showed both old and new
    lines interleaved -- duplicate "max_new_tokens" keys (500 then 250,
    where 250 silently wins) and a duplicated `if use_description`
    check.  250 is kept as the newer value; confirm against the repo.
    """
    body = {
        "preset": preset,
        "max_new_tokens": 250,
        "added_prompt": p,
    }
    if use_description:
        body["description"] = DESCRIPTION
        body["prompt"] = DESCRIPTION + p
        return body

    body["prompt"] = p
    return body
|
||||
|
||||
|
|
31
readme.md
31
readme.md
|
@ -14,3 +14,34 @@ will generate json file with that model name (for ex: TheBloke_Llama-2-13B-GPTQ.
|
|||
`./jane_index.py TheBloke_Llama-2-13B-GPTQ.json`
|
||||
The judge scores each generation as success or failure and modifies the initial json file to add the scores,
where 1 is a perfect score and 0 is a complete failure.
|
||||
|
||||
### Index example in json file
|
||||
scores of answers `TheBloke_Nous-Hermes-Llama2-GTPQ`
|
||||
inside of file `TheBloke_Nous-Hermes-Llama2-GTPQ.json`
|
||||
by judge `MetaIX_GPT4-X-Alpasta-30b-4bit`
|
||||
```
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_full": 0.6979166666666666,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Asterism": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Big O": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Contrastive Search": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Debug-deterministic": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Divine Intellect": 0.33,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Kobold-Godlike": 1.0,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_LLaMA-Precise": 0.5,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Midnight Enigma": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Mirostat": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Shortwave": 0.83,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_simple-1": 1.0,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Space Alien": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_StarChat": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_tfs-with-top-a": 0.67,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Titanic": 0.33,
|
||||
"MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Yara": 0.67
|
||||
```
|
||||
|
||||
### Run human eval gui
|
||||
`./human_eval_gui.py TheBloke_Llama-2-13B-GPTQ.json`
|
||||
a screen like the one below should appear
|
||||

|
||||
after you finish all objects score will appear in the text box,
|
||||
at this point you may quit the gui and check scores in json file.
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
requests
|
||||
tqdm
|
||||
tkinter
|
||||
|
|
Loading…
Reference in New Issue
Block a user