Cradicle Explorer

/ qwencoder-eval / instruct / BigCodeBench / syncheck.py
syncheck.py
"""This file checks two things:
1. Is the LLM's code generation complete for each benchmark task?
2. Warn about code that is not compilable (it could be an implementation issue).
"""
  5  
  6  import ast
  7  import traceback
  8  
  9  from termcolor import colored
 10  
 11  from data import load_solutions
 12  
 13  
 14  def syntax_check(code, verbose=False):
 15      try:
 16          ast.parse(code)
 17          return True
 18      except (SyntaxError, MemoryError):
 19          if verbose:
 20              traceback.print_exc()
 21          return False
 22  
 23  
 24  def script(
 25      samples: str, nsample_check: int = None, verbose: bool = False
 26  ):
 27      # List[Dict{"task_id", "solution"}]
 28      solutions = load_solutions(samples)
 29  
 30      from bigcodebench.data import get_bigcodebench
 31  
 32      dataset = get_bigcodebench()
 33      dataset_name = "BigCodeBench"
 34  
 35      print(colored(f"Dataset: {dataset_name}", "blue"))
 36  
 37      id2solutions = {}
 38      for solution in solutions:
 39          task_id = solution["task_id"]
 40          if task_id not in id2solutions:
 41              id2solutions[task_id] = []
 42          if "solution" not in solution:
 43              assert "completion" in solution, "solution or completion must exist!"
 44              solution["solution"] = dataset[task_id]["complete_prompt"] + solution["completion"]
 45          id2solutions[task_id].append(solution)
 46  
 47      print(colored("==============================", "blue"))
 48      print(colored(" ::: Checking completeness... ", "blue"))
 49      print(colored(" ::::: All tasks complete?    ", "blue"))
 50      ndone = 0
 51  
 52      task_ids = dataset.keys()
 53      ntask = len(task_ids)
 54      for task_id in task_ids:
 55          if task_id not in id2solutions:
 56              print(colored(f" ⚠️ {task_id} is missing!", "red"))
 57              continue
 58          nfiles = len(id2solutions[task_id])
 59  
 60          if nsample_check is None or nfiles <= nsample_check:
 61              ndone += 1
 62              continue
 63  
 64          print(
 65              colored(
 66                  f" ⚠️ {task_id} only has {nfiles} samples! But {nsample_check} are expected.",
 67                  "red",
 68              )
 69          )
 70  
 71      # check if there is enough number of samples here.
 72      if nsample_check is not None:
 73          if ntask != ndone:
 74              ntbd = ntask - ndone
 75              print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
 76          else:
 77              print(colored(f" ::::: All {ntask} tasks complete!", "green"))
 78  
 79      print(colored("==============================", "blue"))
 80      print(colored(" ::: Checking compilation...  ", "blue"))
 81      print(colored(" ::::: All code compilable?   ", "blue"))
 82      ncode = 0
 83      nwrong = 0
 84      for task_id in task_ids:
 85          # task_id must exist
 86          if task_id not in id2solutions:
 87              continue
 88  
 89          for solution in id2solutions[task_id]:
 90              ncode += 1
 91              code = solution["solution"]
 92              dbg_identifier = solution["_identifier"]
 93              if code.strip() == "":
 94                  print(colored(f" ⚠️ {dbg_identifier} is empty!", "red"))
 95                  nwrong += 1
 96              elif not syntax_check(code, verbose):
 97                  print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red"))
 98                  nwrong += 1
 99      if 0 != nwrong:
100          print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red"))
101      else:
102          print(colored(f" ::::: All {ncode} code are compilable!", "green"))
103  
104  
def main():
    """Command-line entry point: hand `script` to python-fire."""
    import fire

    fire.Fire(script)
109  
110  
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()