diff --git a/.gitignore b/.gitignore
index 8d080e8..b3981f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 *json
+__pycache__
diff --git a/human_eval_example.png b/human_eval_example.png
new file mode 100644
index 0000000..ce330aa
Binary files /dev/null and b/human_eval_example.png differ
diff --git a/human_eval_gui.py b/human_eval_gui.py
new file mode 100755
index 0000000..e82809f
--- /dev/null
+++ b/human_eval_gui.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+import json
+import sys
+
+import tkinter as tk
+from tkinter.font import Font
+
+from jane_index import presets, prompts
+
+FILENAME = ""
+
+
+def init():
+    # expect the path to a json file produced by jane_index.py as the only argument
+    if len(sys.argv) < 2 or not sys.argv[1].endswith(".json"):
+        print("provide a path to a json file as an argument")
+        sys.exit(1)
+
+    global FILENAME
+    FILENAME = sys.argv[1]
+    with open(FILENAME) as lf:
+        answers = json.load(lf)
+    return answers
+
+
+def load_next_answer(answers):
+    global current_index
+    current_index += 1
+    if current_index >= len(answers["answers"]):
+        # all answers scored: compute the final index and display it
+        scores = calculate_index(answers)
+        answer_label.delete('1.0', tk.END)
+        answer_label.insert('1.0', json.dumps(scores, indent=4))
+        return
+
+    current_answer = answers["answers"][current_index]["added_prompt"] + "\n" + answers["answers"][current_index]["response"]
+    current_preset = answers["answers"][current_index]["preset"]
+
+    answer_label.delete('1.0', tk.END)
+    answer_label.insert('1.0', current_answer)
+    preset_label.config(text=f"preset: {current_preset}")
+    progress_label.config(text=f"progress: {current_index} / {answer_length}")
+
+
+def calculate_index(answers):
+    preset_succ_counter = {p: 0 for p in presets}
+    scorer_name = "human_eval"
+
+    for preset in presets:
+        for answer in answers["answers"]:
+            if answer["preset"] == preset:
+                if "success" in answer[f"{scorer_name}_score"].lower():
+                    preset_succ_counter[preset] += 1
+
+    # overall success rate across all answers
+    success_counter = sum(preset_succ_counter.values())
+    answers[f"{scorer_name}_success_rate_full"] = success_counter / len(answers["answers"])
+
+    # success rate for every preset
+    for p, v in preset_succ_counter.items():
+        answers[f"{scorer_name}_success_rate_{p}"] = round(v / len(prompts), 2)
+
+    with open(FILENAME, "w") as lf:
+        json.dump(answers, lf, indent=4)
+
+    # return only the eval keys
+    di_to_return = {}
+    for k, v in answers.items():
+        if scorer_name in k:
+            di_to_return[k] = v
+    return di_to_return
+
+
+def success_click():
+    global answers
+    if current_index >= len(answers["answers"]):
+        return
+    answers["answers"][current_index]["human_eval_score"] = "SUCCESS"
+    load_next_answer(answers)
+
+
+def fail_click():
+    global answers
+    if current_index >= len(answers["answers"]):
+        return
+    answers["answers"][current_index]["human_eval_score"] = "FAIL"
+    load_next_answer(answers)
+
+
+if __name__ == "__main__":
+    answers = init()
+    answer_length = len(answers["answers"])
+    current_index = 0
+
+    current_answer = answers["answers"][current_index]["added_prompt"] + "\n" + answers["answers"][current_index]["response"]
+    current_preset = answers["answers"][current_index]["preset"]
+
+    root = tk.Tk()
+    root.geometry("1000x800")
+    root.resizable(True, True)
+    root.title("human score gui")
+    textFont = Font(size=16)
+
+    quit_btn = tk.Button(root, text="Quit", command=root.destroy)
+    answer_label = tk.Text(root, font=textFont, wrap=tk.WORD)
+    answer_label.insert('1.0', current_answer)
+
+    preset_label = tk.Label(root, text=f"preset: {current_preset}", font=textFont)
+    progress_label = tk.Label(root, text=f"progress: {current_index} / {answer_length}", font=textFont)
+
+    load_frame = tk.Frame(root)
+    success_btn = tk.Button(load_frame, text="success", command=success_click, font=textFont)
+    success_btn.grid(row=1, column=0)
+    fail_btn = tk.Button(load_frame, text="fail", command=fail_click, font=textFont)
+    fail_btn.grid(row=1, column=1)
+
+    answer_label.pack(
+        expand=True,
+        fill=tk.BOTH,
+    )
+    load_frame.pack(
+        expand=True,
+    )
+    preset_label.pack()
+    progress_label.pack()
+    quit_btn.pack(
+        ipadx=5,
+        ipady=7,
+        expand=True,
+    )
+    root.mainloop()
diff --git a/jane_index.py b/jane_index.py
index 847994d..ff6f96f 100755
--- a/jane_index.py
+++ b/jane_index.py
@@ -68,12 +68,14 @@ MODEL_URL = "http://127.0.0.1:5000/api/v1/model"
 
 def make_prompt_body(p, preset, use_description=True):
     body = {
         "preset": preset,
-        "max_new_tokens": 500,
+        "max_new_tokens": 250,
         "added_prompt": p
     }
+    body["description"] = DESCRIPTION
     if use_description:
-        body["description"] = DESCRIPTION
         body["prompt"] = DESCRIPTION + p
+        return body
+    body["prompt"] = p
     return body
 
diff --git a/readme.md b/readme.md
index ea69af7..ba76dab 100644
--- a/readme.md
+++ b/readme.md
@@ -5,12 +5,55 @@ Test of existing presets within ooba's webui by generating actions of the quadru
 
 ### Initial Generation (generative model)
 - start ooba's text generation webui service on port 5000;
-- load model you want to use for generation;
-`./jane_index.py`
+- load the model you want to use for generation;
+`./jane_index.py`
 will generate json file with that model name (for ex: TheBloke_Llama-2-13B-GPTQ.json);
 
 ### Scoring (judge model)
-- go to ooba's webui and set judge model (preferably 65b+);
-`./jane_index.py TheBloke_Llama-2-13B-GPTQ.json`
-judge scores each generation as success or failure, modifies initial json file to add scores to it,
+- go to ooba's webui and set a judge model (preferably 65b+);
+`./jane_index.py TheBloke_Llama-2-13B-GPTQ.json`
+the judge scores each generation as success or failure and adds the scores to the initial json file,
 where 1 is a perfect score and 0 is a complete failure.
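+
+Each answer entry in the json file holds the preset, the prompt and the generated
+response; scoring adds a score key to the entry (for example `human_eval_score`,
+written by the gui below). A rough sketch of one scored entry, with values
+shortened for illustration:
+```
+{
+    "preset": "simple-1",
+    "added_prompt": "...",
+    "response": "...",
+    "human_eval_score": "SUCCESS"
+}
+```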
+
+### Index example in json file
+Scores for the answers of `TheBloke_Nous-Hermes-Llama2-GPTQ`
+(stored in the file `TheBloke_Nous-Hermes-Llama2-GPTQ.json`),
+as judged by `MetaIX_GPT4-X-Alpasta-30b-4bit`:
+```
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_full": 0.6979166666666666,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Asterism": 0.83,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Big O": 0.83,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Contrastive Search": 0.67,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Debug-deterministic": 0.67,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Divine Intellect": 0.33,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Kobold-Godlike": 1.0,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_LLaMA-Precise": 0.5,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Midnight Enigma": 0.67,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Mirostat": 0.83,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Shortwave": 0.83,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_simple-1": 1.0,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Space Alien": 0.67,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_StarChat": 0.67,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_tfs-with-top-a": 0.67,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Titanic": 0.33,
+    "MetaIX_GPT4-X-Alpasta-30b-4bit_success_rate_Yara": 0.67
+```
+
+### Run human eval gui
+`./human_eval_gui.py TheBloke_Llama-2-13B-GPTQ.json`
+A window like the one below should appear:
+![gui_screen](./human_eval_example.png)
+After you have scored every answer, the final scores appear in the text box;
+at that point you may quit the gui and check the scores in the json file.
diff --git a/requirements.txt b/requirements.txt
index 5bb8c66..a5f0564 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 requests
 tqdm
+# tkinter is also needed; it ships with Python, not PyPI (python3-tk on Debian/Ubuntu)
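
Note: `tkinter` ships with the Python standard library rather than PyPI, so `pip install -r requirements.txt` cannot pull it in; on many Linux distros it is packaged separately (`python3-tk` on Debian/Ubuntu). A minimal preflight sketch, standard library only and not part of the diff above, to confirm it is importable before launching the gui:

```
#!/usr/bin/env python
# Preflight check for human_eval_gui.py: confirm tkinter is importable.
# tkinter is stdlib, but many Linux distros package it separately
# (e.g. python3-tk on Debian/Ubuntu).
try:
    import tkinter
except ImportError:
    raise SystemExit("tkinter missing: install it via your OS package manager")
print(f"tkinter OK (Tk {tkinter.TkVersion})")
```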