old_results_check.py
"""Sanity-check stored LiveCodeBench results against a fresh run of codegen_metrics.

For each model's saved generations, recompute pass@1 with the current
evaluation code and compare it against the stored per-problem pass1_list.
"""

import json

import numpy as np

from lcb_runner.benchmarks import load_generation_dataset, CodeGenerationProblem
from lcb_runner.evaluation import codegen_metrics

# Sort the benchmark problems by question_id so they line up one-to-one
# with the saved results, which are sorted the same way below.
dataset = load_generation_dataset()
dataset = sorted(dataset, key=lambda x: x.question_id)


def check_model(model_key):
    path = f"/home/naman/Repos/LiveCodeBench/run_models_outputs/{model_key}/chat_0.2_checked.json"
    with open(path) as f:
        old_results = json.load(f)
    old_results = sorted(old_results, key=lambda x: x["question_id"])
    # Both lists must be aligned problem-for-problem after sorting.
    assert old_results[0]["question_id"] == dataset[0].question_id

    def debug(idx):
        # Re-evaluate the first stored generation for one problem, verbosely.
        codegen_metrics(
            [dataset[idx].get_evaluation_sample()],
            [old_results[idx]["code_list"][:1]],
            debug=True,
        )

    def run(idx):
        # Re-evaluate all stored generations for one problem.
        return codegen_metrics(
            [dataset[idx].get_evaluation_sample()],
            [old_results[idx]["code_list"]],
        )

    # Uncomment to inspect individual problems before the full comparison.
    # debug(380)
    # debug(196)
    # debug(352)

    # Recompute metrics over the full dataset.
    metrics = codegen_metrics(
        [d.get_evaluation_sample() for d in dataset],
        [r["code_list"] for r in old_results],
        num_process_evaluate=12,
    )
    old_pass1 = np.mean([np.mean(r["pass1_list"]) for r in old_results])

    print(old_pass1)
    print(metrics[0]["pass@1"])

    # Report every problem whose recomputed pass@1 disagrees with the stored one.
    for idx in range(len(old_results)):
        old_pass1 = np.mean(old_results[idx]["pass1_list"])
        new_pass1 = metrics[0]["detail"]["pass@1"][idx]
        if not abs(old_pass1 - new_pass1) < 1e-4:
            print(idx, old_pass1, new_pass1)


for model_key in [
    # "GPT-4-Turbo-1106",
    "Claude-3-Opus",
    "GPT-4-0613",
    "Mistral-Large",
    "Claude-3-Sonnet",
    "GPT-3.5-Turbo-0301",
    "Gemini-Pro",
]:
    check_model(model_key)
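
# For reference, each entry of chat_0.2_checked.json is assumed to look
# roughly like the record sketched below. Only the keys this script actually
# reads (question_id, code_list, pass1_list) are grounded in the code above;
# the rest of the file's shape is an assumption.
#
# {
#     "question_id": "...",
#     "code_list": ["<generated program 1>", "<generated program 2>", ...],
#     "pass1_list": [1.0, 0.0, ...],  # stored per-sample pass@1 values
# }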
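
# The comparison loop above assumes codegen_metrics reports pass@1 as the
# mean over problems of the per-problem pass rate. If a cross-check at a
# different k is ever needed, the standard unbiased pass@k estimator from
# Chen et al. (2021) is sketched below; this is an assumption about what the
# library computes internally, not its actual implementation.
def pass_at_k(n, c, k):
    """Unbiased pass@k given n samples, c of which pass all tests."""
    if n - c < k:
        return 1.0
    # 1 - C(n - c, k) / C(n, k), computed stably as a running product.
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))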