old_results_check.py
"""Sanity-check stored LiveCodeBench results against a fresh run of codegen_metrics.

For each model's saved generations, recompute pass@1 with the current
evaluation code and compare it against the stored per-problem pass1_list.
"""

import json

import numpy as np

from lcb_runner.benchmarks import load_generation_dataset, CodeGenerationProblem
from lcb_runner.evaluation import codegen_metrics

# Sort the benchmark problems by question_id so they line up one-to-one
# with the saved results, which are sorted the same way below.
dataset = load_generation_dataset()
dataset = sorted(dataset, key=lambda x: x.question_id)


def check_model(model_key):
    path = f"/home/naman/Repos/LiveCodeBench/run_models_outputs/{model_key}/chat_0.2_checked.json"
    with open(path) as f:
        old_results = json.load(f)
    old_results = sorted(old_results, key=lambda x: x["question_id"])
    # Both lists must be aligned problem-for-problem after sorting.
    assert old_results[0]["question_id"] == dataset[0].question_id

    def debug(idx):
        # Re-evaluate the first stored generation for one problem, verbosely.
        codegen_metrics(
            [dataset[idx].get_evaluation_sample()],
            [old_results[idx]["code_list"][:1]],
            debug=True,
        )

    def run(idx):
        # Re-evaluate all stored generations for one problem.
        return codegen_metrics(
            [dataset[idx].get_evaluation_sample()],
            [old_results[idx]["code_list"]],
        )

    # Uncomment to inspect individual problems before the full comparison.
    # debug(380)
    # debug(196)
    # debug(352)

    # Recompute metrics over the full dataset.
    metrics = codegen_metrics(
        [d.get_evaluation_sample() for d in dataset],
        [r["code_list"] for r in old_results],
        num_process_evaluate=12,
    )
    old_pass1 = np.mean([np.mean(r["pass1_list"]) for r in old_results])

    print(old_pass1)
    print(metrics[0]["pass@1"])

    # Report every problem whose recomputed pass@1 disagrees with the stored one.
    for idx in range(len(old_results)):
        old_pass1 = np.mean(old_results[idx]["pass1_list"])
        new_pass1 = metrics[0]["detail"]["pass@1"][idx]
        if not abs(old_pass1 - new_pass1) < 1e-4:
            print(idx, old_pass1, new_pass1)


for model_key in [
    # "GPT-4-Turbo-1106",
    "Claude-3-Opus",
    "GPT-4-0613",
    "Mistral-Large",
    "Claude-3-Sonnet",
    "GPT-3.5-Turbo-0301",
    "Gemini-Pro",
]:
    check_model(model_key)
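
# For reference, each entry of chat_0.2_checked.json is assumed to look
# roughly like the record sketched below. Only the keys this script actually
# reads (question_id, code_list, pass1_list) are grounded in the code above;
# the rest of the file's shape is an assumption.
#
# {
#     "question_id": "...",
#     "code_list": ["<generated program 1>", "<generated program 2>", ...],
#     "pass1_list": [1.0, 0.0, ...],  # stored per-sample pass@1 values
# }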
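
# The comparison loop above assumes codegen_metrics reports pass@1 as the
# mean over problems of the per-problem pass rate. If a cross-check at a
# different k is ever needed, the standard unbiased pass@k estimator from
# Chen et al. (2021) is sketched below; this is an assumption about what the
# library computes internally, not its actual implementation.
def pass_at_k(n, c, k):
    """Unbiased pass@k given n samples, c of which pass all tests."""
    if n - c < k:
        return 1.0
    # 1 - C(n - c, k) / C(n, k), computed stably as a running product.
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))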