# syncheck.py
"""Sanity checks for LLM-generated BigCodeBench solutions.

This file checks two things:
1. Is the LLM's codegen completed for each benchmark task?
2. Warn about code that is not compilable (it could be some impl issues).
"""

import ast
import traceback
from typing import Optional

from termcolor import colored

from data import load_solutions


def syntax_check(code, verbose=False):
    """Return True if ``code`` parses as Python source, False otherwise.

    Args:
        code: Source text handed to ``ast.parse``.
        verbose: When true, print the traceback of the parse failure.

    Returns:
        True when ``ast.parse`` succeeds; False when it fails with any of the
        exceptions it is documented to raise on bad or pathological input.
    """
    try:
        ast.parse(code)
    except (SyntaxError, MemoryError, ValueError, RecursionError):
        # ValueError: source containing null bytes (older interpreters).
        # RecursionError / MemoryError: extremely nested generated code.
        # None of these should abort the whole check run; the sample is
        # simply reported as not compilable.
        if verbose:
            traceback.print_exc()
        return False
    return True


def script(
    samples: str, nsample_check: Optional[int] = None, verbose: bool = False
):
    """Report missing/incomplete tasks and non-compilable solutions.

    Args:
        samples: Passed straight to ``load_solutions`` (presumably a path to
            the generated samples -- confirm against ``data.load_solutions``).
        nsample_check: Expected minimum number of samples per task. When
            None, the per-task sample count is not checked and the
            completeness summary line is not printed.
        verbose: Forwarded to ``syntax_check`` to print parse tracebacks.

    Raises:
        AssertionError: If a loaded entry has neither a ``"solution"`` nor a
            ``"completion"`` key.
    """
    # List[Dict{"task_id", "solution"}]
    solutions = load_solutions(samples)

    # Kept as a function-level import, as in the original, so importing this
    # module does not require the bigcodebench package.
    from bigcodebench.data import get_bigcodebench

    dataset = get_bigcodebench()
    dataset_name = "BigCodeBench"

    print(colored(f"Dataset: {dataset_name}", "blue"))

    # Group the loaded entries by task id, materialising the full solution
    # text for entries that only carry a completion.
    id2solutions = {}
    for solution in solutions:
        task_id = solution["task_id"]
        if "solution" not in solution:
            assert "completion" in solution, "solution or completion must exist!"
            solution["solution"] = (
                dataset[task_id]["complete_prompt"] + solution["completion"]
            )
        id2solutions.setdefault(task_id, []).append(solution)

    print(colored("==============================", "blue"))
    print(colored(" ::: Checking completeness... ", "blue"))
    print(colored(" ::::: All tasks complete? ", "blue"))
    ndone = 0

    task_ids = dataset.keys()
    ntask = len(task_ids)
    for task_id in task_ids:
        if task_id not in id2solutions:
            print(colored(f" ⚠️ {task_id} is missing!", "red"))
            continue
        nfiles = len(id2solutions[task_id])

        # A task is complete when it has at least the expected number of
        # samples. (The previous `<=` comparison was inverted: it accepted
        # tasks with too few samples and warned "only has N samples" about
        # tasks that had more than enough.)
        if nsample_check is None or nfiles >= nsample_check:
            ndone += 1
            continue

        print(
            colored(
                f" ⚠️ {task_id} only has {nfiles} samples! But {nsample_check} are expected.",
                "red",
            )
        )

    # check if there is enough number of samples here.
    if nsample_check is not None:
        if ntask != ndone:
            ntbd = ntask - ndone
            print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
        else:
            print(colored(f" ::::: All {ntask} tasks complete!", "green"))

    print(colored("==============================", "blue"))
    print(colored(" ::: Checking compilation... ", "blue"))
    print(colored(" ::::: All code compilable? ", "blue"))
    ncode = 0
    nwrong = 0
    for task_id in task_ids:
        # task_id must exist
        if task_id not in id2solutions:
            continue

        for solution in id2solutions[task_id]:
            ncode += 1
            code = solution["solution"]
            # NOTE(review): "_identifier" is presumably attached by
            # load_solutions -- confirm; a missing key raises KeyError here.
            dbg_identifier = solution["_identifier"]
            if code.strip() == "":
                print(colored(f" ⚠️ {dbg_identifier} is empty!", "red"))
                nwrong += 1
            elif not syntax_check(code, verbose):
                print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red"))
                nwrong += 1
    if nwrong:
        print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red"))
    else:
        print(colored(f" ::::: All {ncode} code are compilable!", "green"))


def main():
    """Expose ``script`` as a command-line interface via ``fire``."""
    from fire import Fire

    Fire(script)


if __name__ == "__main__":
    main()