# convert_data.py
import json
import os
from dataclasses import dataclass
from datetime import datetime
from enum import Enum

import datasets
import jsonlines
import numpy as np
import tqdm


class PromptConstants:
    """String constants used to assemble model prompts.

    The ``SYSTEM_MESSAGE_*`` attributes are system messages keyed by a model
    family name; the ``FORMATTING_*`` attributes are the instruction
    sentences placed in front of the code-delimiter block of a prompt.
    """

    SYSTEM_MESSAGE_GENERIC = (
        "You are an expert Python programmer. You will be given a question "
        "(problem specification) and will generate a correct Python program "
        "that matches the specification and passes all tests. You will NOT "
        "return anything except for the program."
    )

    SYSTEM_MESSAGE_DEEPSEEK = (
        "You are an AI programming assistant, utilizing the DeepSeek Coder "
        "model, developed by DeepSeek Company, and you answer questions "
        "related to computer science."
    )

    SYSTEM_MESSAGE_CODEQWEN = (
        "<|im_start|>system\n"
        "You are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user"
    )

    SYSTEM_MESSAGE_MAGIC = (
        "You are an exceptionally intelligent coding assistant that "
        "consistently delivers accurate and reliable responses to user "
        "instructions.\n\n@@ Instruction\n"
    )

    SYSTEM_MESSAGE_WIZARD = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request."
    )

    # NOTE(review): the listing this was recovered from had its whitespace
    # collapsed, so any trailing spaces before the newlines of this
    # multi-line literal could not be recovered -- confirm against upstream.
    SYSTEM_MESSAGE_PHIND = (
        "You are an expert Python programmer. You will be given a question "
        "(problem specification) and will generate a correct Python program "
        "that matches the specification and passes all tests. You will NOT "
        "return anything except for the program. Put your fixed program "
        "within code delimiters, for example:\n"
        "```python\n"
        "# YOUR CODE HERE\n"
        "```"
    )

    FORMATTING_MESSAGE_WITH_STARTER_CODE = (
        "You will use the following starter code to write the solution to "
        "the problem and enclose your code within delimiters."
    )

    FORMATTING_WITHOUT_STARTER_CODE = (
        "Read the inputs from stdin solve the problem and write the answer "
        "to stdout (do not directly test on the sample inputs). Enclose your "
        "code within delimiters as follows."
    )
class Platform(Enum):
    """Contest platform a problem is tagged with."""

    LEETCODE = "leetcode"
    CODEFORCES = "codeforces"
    ATCODER = "atcoder"


class Difficulty(Enum):
    """Difficulty label attached to a problem."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


class TestType(Enum):
    """How a test case is fed to a solution."""

    STDIN = "stdin"
    FUNCTIONAL = "functional"


@dataclass
class Test:
    """A single test case: an input, its expected output and the test type."""

    input: str
    output: str
    testtype: TestType

    def __post_init__(self):
        # Accept the raw string value (e.g. "stdin") as well as a TestType.
        self.testtype = TestType(self.testtype)


@dataclass
class CodeGenerationProblem:
    """One code-generation problem built from a raw dataset row.

    ``platform`` and ``difficulty`` are coerced to their enums, and
    ``public_test_cases``, ``private_test_cases`` and ``metadata`` are
    decoded from JSON strings in ``__post_init__``.
    """

    question_title: str
    question_content: str
    platform: Platform
    question_id: str
    contest_id: str
    contest_date: datetime
    starter_code: str
    difficulty: Difficulty
    public_test_cases: list[Test]
    private_test_cases: list[Test]
    metadata: dict

    def __post_init__(self):
        self.platform = Platform(self.platform)
        self.difficulty = Difficulty(self.difficulty)

        # Test cases and metadata are passed in as JSON-encoded strings.
        self.public_test_cases = [
            Test(**case) for case in json.loads(self.public_test_cases)
        ]
        self.private_test_cases = [
            Test(**case) for case in json.loads(self.private_test_cases)
        ]
        self.metadata = json.loads(self.metadata)

    def insert_output(self, output_list: list[str], code_list: list[str]) -> dict:
        """Return the problem's descriptive fields plus the given outputs.

        Args:
            output_list: Raw model outputs for this problem.
            code_list: Code extracted from those outputs.

        Returns:
            A dict of plain values with ``output_list`` and ``code_list``
            added.
        """
        # contest_date is not converted in __post_init__, so it can be
        # whatever the caller passed in. Only datetime objects are
        # serialised via isoformat(); any other value is passed through
        # unchanged instead of raising AttributeError.
        contest_date = self.contest_date
        if isinstance(contest_date, datetime):
            contest_date = contest_date.isoformat()

        return {
            "question_title": self.question_title,
            "question_content": self.question_content,
            "platform": self.platform.value,
            "question_id": self.question_id,
            "contest_id": self.contest_id,
            "contest_date": contest_date,
            "starter_code": self.starter_code,
            "difficulty": self.difficulty.value,
            "output_list": output_list,
            "code_list": code_list,
        }

    def insert_output_evaluation(
        self, output_list: list[str], code_list: list[str], graded_list: list[bool]
    ) -> dict:
        """Return ``insert_output`` extended with grading results.

        Adds ``graded_list`` and ``pass@1`` (the fraction of ``True``
        entries in ``graded_list``). An empty ``graded_list`` yields a
        ``pass@1`` of 0.0 rather than a ZeroDivisionError.
        """
        output = self.insert_output(output_list, code_list)
        output["graded_list"] = graded_list
        output["pass@1"] = (
            graded_list.count(True) / len(graded_list) if graded_list else 0.0
        )
        return output

    def get_evaluation_sample(self) -> dict:
        """Return all test cases packed as a JSON string.

        Returns:
            ``{"input_output": <json>}`` where the JSON object has
            ``inputs`` and ``outputs`` (public cases first, then private)
            and ``fn_name`` taken from ``metadata["func_name"]`` (``None``
            when absent).
        """
        all_cases = self.public_test_cases + self.private_test_cases
        return {
            "input_output": json.dumps(
                {
                    "inputs": [case.input for case in all_cases],
                    "outputs": [case.output for case in all_cases],
                    "fn_name": self.metadata.get("func_name", None),
                }
            )
        }


def _build_prompt(question: CodeGenerationProblem) -> str:
    """Build the prompt text for one problem."""
    parts = [
        "You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.\n\n",
        f"Question: {question.question_content}\n\n",
    ]
    if question.starter_code:
        parts.append(f"{PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n")
        parts.append(f"```python\n{question.starter_code}\n```\n\n")
    else:
        parts.append(f"{PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n")
        parts.append("```python\n# YOUR CODE HERE\n```\n\n")
    return "".join(parts)


def _to_record(sample: CodeGenerationProblem) -> dict:
    """Convert one problem into the output record written to the JSONL file."""
    return {
        "prompt": _build_prompt(sample),
        "_test": sample.get_evaluation_sample(),
        "entry_point": sample.starter_code,
        "tags": "coding,en,python,core",
        "task": "livecodebench",
        "source": "livecodebench",
        "eval_args": {
            "greedy": False,
            "seed": 1234,
            "out_seq_length": 1200,
            "repetition_penalty": 1.0,
            "temperature": 0.2,
            "top_k": -1,
            "top_p": 0.95,
        },
    }


def _write_records(path: str, problems: list, n_sampling: int) -> None:
    """Write every problem to ``path`` as JSONL, ``n_sampling`` copies each."""
    with jsonlines.open(path, "w") as writer:
        # Wrapping the list (not the enumerate object) lets tqdm show a total.
        for cluster, problem in enumerate(tqdm.tqdm(problems)):
            record = _to_record(problem)
            record["sampling_cluster"] = cluster
            for _ in range(n_sampling):
                writer.write(record)


def convert_file(
    source_path: str,
    target_path: str,
    *,
    n_sampling: int = 1,
    preview_size: int = 5,
) -> None:
    """Convert the ``test`` split of a dataset into prompt records.

    Writes two JSONL files: ``target_path`` with every problem, and
    ``target_path + ".sampled"`` with only the first ``preview_size``
    problems. Each record carries a ``sampling_cluster`` index equal to
    the problem's position in its file.

    Args:
        source_path: Path or name handed to ``datasets.load_dataset``.
        target_path: Destination JSONL path; its parent directory is
            created when missing.
        n_sampling: How many times each record is written.
        preview_size: Number of leading problems written to the
            ``.sampled`` file.
    """
    target_dir = os.path.dirname(target_path)
    # dirname is "" for a bare filename; os.makedirs("") would raise.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Load and parse the dataset once and reuse it for both output files.
    rows = datasets.load_dataset(source_path)["test"]
    problems = [CodeGenerationProblem(**row) for row in rows]

    _write_records(target_path, problems, n_sampling)
    _write_records(target_path + ".sampled", problems[:preview_size], n_sampling)


if __name__ == "__main__":
    convert_file("./data/livecodebench___code_generation", "./data/livecodebench.jsonl")