# show_results.py
import json
import re
import sys

import numpy as np
from prettytable import PrettyTable

# Captures the values following the first two "pass@1:" markers in a report.
# Each value runs up to the next newline, so the second one must also be
# newline-terminated for the pattern to match.
_PASS_AT_1_PAIR = re.compile(r"pass@1:(.*?)\n.*?pass@1:(.*?)\n", flags=re.DOTALL)


def _read_evalplus_scores(path):
    """Return the first two ``pass@1:`` values in the file at *path* as floats.

    Raises:
        ValueError: if the file does not contain two newline-terminated
            ``pass@1:`` entries, or a captured value is not a number.
    """
    with open(path, encoding="utf-8") as f:
        report = f.read()
    match = _PASS_AT_1_PAIR.search(report)
    if match is None:
        raise ValueError(f"Could not find two 'pass@1:' entries in {path}")
    return [float(match.group(1).strip()), float(match.group(2).strip())]


def _render_table(field_names, row):
    """Render a single-row PrettyTable with the given headers as a string."""
    table = PrettyTable()
    table.field_names = field_names
    table.add_row(row)
    return str(table)


def show_evalplus(results_path):
    """Build the EvalPlus summary table.

    Reads ``evalplus/humaneval_results.txt`` and ``evalplus/mbpp_results.txt``
    under *results_path*, takes the first two ``pass@1:`` values from each,
    and prepends their average.

    Args:
        results_path: Directory containing the ``evalplus`` sub-directory.

    Returns:
        The rendered table as a string.
    """
    # The list must exist before it is extended; the original used it before
    # assignment and failed with UnboundLocalError.
    results = []
    for benchmark in ("humaneval", "mbpp"):
        results.extend(
            _read_evalplus_scores(f"{results_path}/evalplus/{benchmark}_results.txt")
        )
    results.insert(0, np.average(results))
    return _render_table(
        [
            "EvalPlus (avg)",
            "EvalPlus (HumanEval)",
            "EvalPlus (HumanEval+)",
            "EvalPlus (MBPP)",
            "EvalPlus (MBPP+)",
        ],
        results,
    )


def show_livecodebench(results_path):
    """Build the LiveCodeBench summary table.

    Reads ``livecodebench/results.json`` under *results_path* and uses the
    ``pass@1`` and ``pass@5`` entries of its first element, prepending their
    average.

    Args:
        results_path: Directory containing the ``livecodebench`` sub-directory.

    Returns:
        The rendered table as a string.
    """
    with open(f"{results_path}/livecodebench/results.json", encoding="utf-8") as f:
        livecodebench = json.load(f)
    first_entry = livecodebench[0]
    results = [first_entry["pass@1"], first_entry["pass@5"]]
    results.insert(0, np.average(results))
    return _render_table(
        [
            "Livecodebench (avg)",
            "livecodebench (pass@1)",
            # The value shown is read from the "pass@5" key, so the header
            # says pass@5 (it was previously mislabelled pass@10).
            "livecodebench (pass@5)",
        ],
        results,
    )


def show_multiple(results_path):
    """Build the MultiPL-E summary table.

    Reads ``MultiPL-E/results_<lang>.jsonl`` under *results_path* for each
    language and uses its ``pass@1`` entry, prepending the average.

    Args:
        results_path: Directory containing the ``MultiPL-E`` sub-directory.

    Returns:
        The rendered table as a string.
    """
    lgs = ["python", "cs", "cpp", "java", "php", "ts", "sh", "js"]
    results = []
    for lg in lgs:
        # NOTE(review): despite the .jsonl extension each file is parsed as a
        # single JSON document, as in the original -- confirm the file format.
        with open(f"{results_path}/MultiPL-E/results_{lg}.jsonl", encoding="utf-8") as f:
            score = json.load(f)
        results.append(float(score["pass@1"]))
    results.insert(0, np.average(results))
    field_names = ["MultiPL-E (avg)"] + [f"MultiPL-E ({lg})" for lg in lgs]
    return _render_table(field_names, results)


def main(results_path="code-evaluation/results/"):
    """Render all benchmark tables and write them to ``all_results.txt``.

    Args:
        results_path: Directory holding the per-benchmark result folders; the
            combined report is written into this same directory.
    """
    # One table per benchmark (the original rendered EvalPlus three times).
    tabs = [
        show_evalplus(results_path),
        show_livecodebench(results_path),
        show_multiple(results_path),
    ]
    with open(f"{results_path}/all_results.txt", "w", encoding="utf-8") as w:
        w.writelines(f"{tab}\n" for tab in tabs)
    print(f"All results saving to {results_path}/all_results.txt")


if __name__ == "__main__":
    # NOTE(review): without a CLI argument the literal directory name
    # "results_path" is used, which bypasses main()'s own default -- confirm
    # this fallback is intended.
    results_path = sys.argv[1] if len(sys.argv) > 1 else "results_path"
    main(results_path=results_path)