# utils.py
"""Utility helpers for downloading, caching, and iterating BigCodeBench data."""

import gzip
import json
import os
from os import PathLike
from typing import Dict, Iterable

import tempdir
import wget
from appdirs import user_cache_dir

# Per-user cache directory for downloaded dataset releases.
CACHE_DIR = user_cache_dir("bigcodebench")


def get_dataset_metadata(version: str, subset: str = "full"):
    """Return the (download_url, local_cache_path) pair for a dataset release.

    Args:
        version: Release tag of the bigcodebench-annotation repository.
        subset: Dataset flavor; any value other than "full" is appended
            capitalized to the file name (e.g. "hard" -> "BigCodeBench-Hard").
    """
    extra = "-" + subset.capitalize() if subset != "full" else ""
    url = f"https://github.com/bigcode-project/bigcodebench-annotation/releases/download/{version}/BigCodeBench{extra}.jsonl.gz"
    cache_path = os.path.join(CACHE_DIR, f"BigCodeBench{extra}-{version}.jsonl")
    return url, cache_path


def make_cache(gzip_url, hf_data, cache_path, gh=False):
    """Materialize the dataset at ``cache_path`` unless it is already cached.

    When ``gh`` is true, the gzipped jsonl release asset is downloaded from
    ``gzip_url`` and decompressed; otherwise ``hf_data`` (presumably a
    HuggingFace dataset — it only needs a ``to_json`` method) is serialized
    directly to ``cache_path``.
    """
    # Check if the eval file already exists in the cache.
    if os.path.exists(cache_path):
        return

    # Ensure the cache directory exists for BOTH branches. Previously only
    # the GitHub branch created CACHE_DIR, so `hf_data.to_json` could fail
    # on a machine with an empty cache.
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)

    if gh:
        # Download the gzipped release asset into a temp dir and decompress.
        print(f"Downloading dataset from {gzip_url}")
        with tempdir.TempDir() as tmpdir:
            gz_path = os.path.join(tmpdir, "data.jsonl.gz")
            wget.download(gzip_url, gz_path)

            with gzip.open(gz_path, "rb") as f:
                data = f.read().decode("utf-8")

        # Explicit utf-8 so the cached file does not depend on the
        # platform's default encoding (the payload was decoded as utf-8).
        with open(cache_path, "w", encoding="utf-8") as f:
            f.write(data)
    else:
        hf_data.to_json(cache_path)


def write_jsonl(
    filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True
):
    """Write an iterable of dictionaries to ``filename`` as jsonl.

    The output is gzip-compressed when the name ends in ".gz". With
    ``drop_builtin`` set, keys starting with "_" (internal bookkeeping
    fields such as "_identifier") are stripped from each record.
    ``append`` appends to the file instead of truncating it.
    """
    mode = "ab" if append else "wb"
    filename = os.path.expanduser(filename)

    def _encoded_lines():
        # Shared serialization for both the gzip and the plain branch.
        for record in data:
            if drop_builtin:
                record = {k: v for k, v in record.items() if not k.startswith("_")}
            yield (json.dumps(record) + "\n").encode("utf-8")

    if filename.endswith(".gz"):
        with open(filename, mode) as fp:
            # GzipFile over the raw fp so that append mode adds a new,
            # independently-decodable gzip member to the file.
            with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
                for line in _encoded_lines():
                    gzfp.write(line)
    else:
        with open(filename, mode) as fp:
            for line in _encoded_lines():
                fp.write(line)


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """Parse each non-blank jsonl line of ``filename`` and yield it as a dict.

    Transparently decompresses files whose name ends in ".gz".
    """
    if filename.endswith(".gz"):
        with open(filename, "rb") as raw:
            with gzip.open(raw, "rt") as fp:
                for line in fp:
                    if line.strip():  # skip blank / whitespace-only lines
                        yield json.loads(line)
    else:
        with open(filename, "r") as fp:
            for line in fp:
                if line.strip():
                    yield json.loads(line)


def load_solutions(sample_path: PathLike) -> Iterable[Dict]:
    """We accept two formats of inputs.
    + `sample.jsonl` which is the format from BigCodeBench, i.e., {task_id, completion or solution}.
    + A folder which contains sub-folders named after the task_id. Each sub-folder
    contains samples named in `[?].py` where `?` is the solution id starting with 0.
    Different from `sample.jsonl`, the solutions must be complete (with prompt prefix).
    """
    # If it is a jsonl file, stream and validate each record.
    if os.path.isfile(sample_path):
        for i, sample in enumerate(stream_jsonl(sample_path)):
            assert (
                "completion" in sample or "solution" in sample
            ), "No completion or solution found in sample!"
            assert "solution" not in sample or isinstance(
                sample["solution"], str
            ), "Solution must be a string! If you have multiple solutions, please repeat the task_id."
            assert "completion" not in sample or isinstance(
                sample["completion"], str
            ), "Completion must be a string! If you have multiple solutions, please repeat the task_id."

            sample["_identifier"] = (
                sample["task_id"] + f" (line {i + 1} in {sample_path})"
            )
            yield sample
    else:
        # It is a folder. Normalize accidental double slashes; os.fspath is
        # needed because pathlib.Path has no str-style .replace(old, new).
        sample_path = os.fspath(sample_path).replace("//", "/")
        for task_id in os.listdir(sample_path):
            task_path = os.path.join(sample_path, task_id)
            if not os.path.isdir(task_path):
                continue

            for solution_id in os.listdir(task_path):
                solution_path = os.path.join(task_path, solution_id)
                if os.path.isfile(solution_path) and solution_path.endswith(".py"):
                    with open(solution_path, "r") as f:
                        completion = f.read()
                    yield {
                        "_identifier": solution_path,
                        "_path": solution_path,
                        # Folder names use "_" where task_ids use "/".
                        "task_id": task_id.replace("_", "/"),
                        "solution": completion,
                    }


def write_directory(directory: PathLike, data: Iterable[Dict]):
    """Write samples to ``directory`` as ``<task_id>/<sample_id>.py`` files.

    ``sample_id`` counts up from 0 per task, so repeated task_ids produce
    0.py, 1.py, ... inside the same task folder. Inverse of the folder
    format accepted by ``load_solutions``.
    """
    os.makedirs(directory, exist_ok=True)
    counters: Dict[str, int] = {}
    for sample in data:
        assert "solution" in sample, "Samples must come with `solution` field!"
        # task_ids contain "/", which is not a valid directory-name character.
        task_id = sample["task_id"].replace("/", "_")
        task_dir = os.path.join(directory, task_id)
        os.makedirs(task_dir, exist_ok=True)
        sample_id = counters.get(task_id, 0)
        with open(os.path.join(task_dir, f"{sample_id}.py"), "w") as f:
            f.write(sample["solution"])
        counters[task_id] = sample_id + 1


def completeness_check(name, data):
    """Assert every task in ``data`` carries all required dataset fields.

    Raises:
        AssertionError: naming the first missing key and its task_id.
    """
    required_keys = (
        "complete_prompt",
        "instruct_prompt",
        "canonical_solution",
        "code_prompt",
        "test",
        "entry_point",
    )
    for task_id, task in data.items():
        for key in required_keys:
            assert key in task, f"{key} not found in {name} #{task_id}!"


def to_raw(string):
    """Return ``string`` with control characters rendered as escape sequences.

    Escapes via unicode-escape, then collapses the doubled backslashes that
    the round-trip introduces for literal backslashes.
    """
    return string.encode("unicode-escape").decode().replace("\\\\", "\\")