data.py
 1  from typing import Iterable, Dict
 2  import gzip
 3  import json
 4  import os
 5  
 6  
 7  ROOT = os.path.dirname(os.path.abspath(__file__))
 8  HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
 9  
10  
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """
    Loads an evaluation set and indexes its records by their "task_id"

    Every record streamed from ``evalset_file`` must contain a "task_id" key.
    If the same id appears more than once, the record that appears last in
    the file is the one kept.
    """
    problems = {}
    for record in stream_jsonl(evalset_file):
        problems[record["task_id"]] = record
    return problems
13  
14  
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    Files whose name ends in ".gz" are decompressed on the fly; any other
    file is read as plain text. Both kinds are decoded as UTF-8, and blank
    or whitespace-only lines are skipped.

    This is a generator: the file is opened on the first iteration and
    closed once the generator is exhausted or closed, so I/O and parse
    errors surface while iterating, not at call time.

    Args:
        filename: Path to a jsonl file, optionally gzip-compressed. A leading
            "~" is expanded to the user's home directory, the same way
            write_jsonl treats its ``filename``.

    Yields:
        The decoded JSON value of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    filename = os.path.expanduser(filename)
    # gzip.open and the builtin open share the (path, mode, encoding=...)
    # calling convention in text mode, so one loop serves both formats.
    opener = gzip.open if filename.endswith(".gz") else open
    # The encoding is given explicitly: without it, text-mode gzip.open falls
    # back to the locale's default encoding instead of UTF-8.
    with opener(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
30  
31  
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serializes each item of ``data`` as one JSON line in ``filename``

    A leading "~" in ``filename`` is expanded to the user's home directory.
    If the expanded name ends in ".gz" the lines are gzip-compressed,
    otherwise they are written as raw UTF-8 bytes. With ``append=True`` the
    new lines are added after any existing file content (for ".gz" files,
    as an additional compressed member); otherwise the file is overwritten.
    """
    target = os.path.expanduser(filename)
    file_mode = 'ab' if append else 'wb'

    # Lazily encoded so that each record is serialized only when written.
    encoded_lines = ((json.dumps(record) + "\n").encode('utf-8') for record in data)

    with open(target, file_mode) as raw:
        if not target.endswith(".gz"):
            for chunk in encoded_lines:
                raw.write(chunk)
            return
        # The gzip stream itself is always opened in 'wb'; whether the output
        # replaces or extends the file is decided by the mode of ``raw``.
        with gzip.GzipFile(fileobj=raw, mode='wb') as compressed:
            for chunk in encoded_lines:
                compressed.write(chunk)