# utils.py
  1  import gzip
  2  import json
  3  import os
  4  from os import PathLike
  5  from typing import Dict, Iterable
  6  
  7  import tempdir
  8  import wget
  9  from appdirs import user_cache_dir
 10  
 11  CACHE_DIR = user_cache_dir("bigcodebench")
 12  
 13  
 14  def get_dataset_metadata(version: str, subset: str="full"):
 15      extra = "-" + subset.capitalize() if subset != "full" else ""
 16      url = f"https://github.com/bigcode-project/bigcodebench-annotation/releases/download/{version}/BigCodeBench{extra}.jsonl.gz"
 17      cache_path = os.path.join(CACHE_DIR, f"BigCodeBench{extra}-{version}.jsonl")
 18      return url, cache_path
 19  
 20  
 21  def make_cache(gzip_url, hf_data, cache_path, gh=False):
 22      # Check if open eval file exists in CACHE_DIR
 23      
 24      if not os.path.exists(cache_path):
 25          if gh:
 26              # Install BigCodeBench dataset and parse as jsonl
 27              print(f"Downloading dataset from {gzip_url}")
 28              with tempdir.TempDir() as tmpdir:
 29                  gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
 30                  wget.download(gzip_url, gz_path)
 31  
 32                  with gzip.open(gz_path, "rb") as f:
 33                      data = f.read().decode("utf-8")
 34  
 35              # create CACHE_DIR if not exists
 36              if not os.path.exists(CACHE_DIR):
 37                  os.makedirs(CACHE_DIR)
 38  
 39              # Write the original open eval file to CACHE_DIR
 40              with open(cache_path, "w") as f:
 41                  f.write(data)
 42          else:
 43              hf_data.to_json(cache_path)
 44  
 45  
 46  def write_jsonl(
 47      filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True
 48  ):
 49      """
 50      Writes an iterable of dictionaries to jsonl
 51      """
 52      if append:
 53          mode = "ab"
 54      else:
 55          mode = "wb"
 56      filename = os.path.expanduser(filename)
 57      if filename.endswith(".gz"):
 58          with open(filename, mode) as fp:
 59              with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
 60                  for x in data:
 61                      if drop_builtin:
 62                          x = {k: v for k, v in x.items() if not k.startswith("_")}
 63                      gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
 64      else:
 65          with open(filename, mode) as fp:
 66              for x in data:
 67                  if drop_builtin:
 68                      x = {k: v for k, v in x.items() if not k.startswith("_")}
 69                  fp.write((json.dumps(x) + "\n").encode("utf-8"))
 70  
 71  
 72  def stream_jsonl(filename: str) -> Iterable[Dict]:
 73      """
 74      Parses each jsonl line and yields it as a dictionary
 75      """
 76      if filename.endswith(".gz"):
 77          with open(filename, "rb") as gzfp:
 78              with gzip.open(gzfp, "rt") as fp:
 79                  for line in fp:
 80                      if any(not x.isspace() for x in line):
 81                          yield json.loads(line)
 82      else:
 83          with open(filename, "r") as fp:
 84              for line in fp:
 85                  if any(not x.isspace() for x in line):
 86                      yield json.loads(line)
 87  
 88  
 89  def load_solutions(sample_path: PathLike) -> Iterable[Dict]:
 90      """We accept two formats of inputs.
 91      + `sample.jsonl` which is the format from BigCodeBench, i.e., {task_id, completion or solution}.
 92      + A folder which contains sub-folders named after the task_id. Each sub-folder
 93      contains samples named in `[?].py` where `?` is the solution id starting with 0.
 94      Different from `sample.jsonl`, the solutions must be complete (with prompt prefix).
 95      """
 96  
 97      # if it is a file
 98      if os.path.isfile(sample_path):
 99          for i, sample in enumerate(stream_jsonl(sample_path)):
100              assert (
101                  "completion" in sample or "solution" in sample
102              ), "No completion or solution found in sample!"
103              assert "solution" not in sample or isinstance(
104                  sample["solution"], str
105              ), "Solution must be a string! If you have multiple solutions, please repeat the task_id."
106              assert "completion" not in sample or isinstance(
107                  sample["completion"], str
108              ), "Completion must be a string! If you have multiple solutions, please repeat the task_id."
109  
110              sample["_identifier"] = (
111                  sample["task_id"] + f" (line {i+1} in {sample_path})"
112              )
113              yield sample
114      else:
115          # if it is a folder
116          sample_path=sample_path.replace("//", "/")
117          for task_id in os.listdir(sample_path):
118              task_path = os.path.join(sample_path, task_id)
119              if not os.path.isdir(task_path):
120                  continue
121  
122              for solution_id in os.listdir(task_path):
123                  solution_path = os.path.join(task_path, solution_id)
124                  if os.path.isfile(solution_path) and solution_path.endswith(".py"):
125                      with open(solution_path, "r") as f:
126                          completion = f.read()
127                      yield {
128                          "_identifier": solution_path,
129                          "_path": solution_path,
130                          "task_id": task_id.replace("_", "/"),
131                          "solution": completion,
132                      }
133  
134  
def write_directory(directory: PathLike, data: Iterable[Dict]):
    """Dump each sample's solution to ``<directory>/<task_id>/<n>.py``.

    Task ids have "/" replaced by "_" to form directory names; ``n`` counts
    solutions per task, starting at 0, in iteration order.
    """
    os.makedirs(directory, exist_ok=True)
    next_index = {}  # per-task solution counter
    for entry in data:
        assert "solution" in entry, "Samples must come with `solution` field!"
        safe_id = entry["task_id"].replace("/", "_")
        target_dir = os.path.join(directory, safe_id)
        os.makedirs(target_dir, exist_ok=True)
        index = next_index.get(safe_id, 0)
        with open(os.path.join(target_dir, f"{index}.py"), "w") as out:
            out.write(entry["solution"])
        next_index[safe_id] = index + 1
148          counters[task_id] += 1
149  
150  
def completeness_check(name, data):
    """Assert that every task in ``data`` carries all required dataset fields."""
    required = (
        "complete_prompt",
        "instruct_prompt",
        "canonical_solution",
        "code_prompt",
        "test",
        "entry_point",
    )
    for task_id, task in data.items():
        for field in required:
            assert field in task, f"{field} not found in {name} #{task_id}!"
162  
163  
def to_raw(string):
    """Render control characters as visible escape sequences (e.g. "\\n").

    Uses unicode-escape encoding, then collapses the doubled backslashes it
    produces for literal backslashes back to single ones.
    """
    escaped = string.encode("unicode-escape").decode()
    return escaped.replace("\\\\", "\\")