data.py
from typing import Iterable, Dict
import gzip
import json
import os


# Directory containing this module; the default dataset path is resolved
# relative to it.
ROOT = os.path.dirname(os.path.abspath(__file__))
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")


def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """
    Loads every record of a jsonl file into a dictionary keyed by "task_id".

    If several records share a "task_id", the last one read wins.
    Raises KeyError if a record has no "task_id" key.
    """
    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}


def _parse_lines(lines: Iterable[str]) -> Iterable[Dict]:
    """
    Yields the parsed JSON value of every line that is not blank.
    """
    for line in lines:
        # Lines that are empty or whitespace-only are skipped, not parsed.
        if line.strip():
            yield json.loads(line)


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary.

    Files whose name ends in ".gz" are decompressed on the fly. Both the
    compressed and the plain variant are decoded as UTF-8, which is the
    encoding write_jsonl produces. Blank lines are ignored.
    """
    if filename.endswith(".gz"):
        # The encoding must be explicit: gzip text mode otherwise falls back
        # to the locale default, which is not UTF-8 on every platform.
        with gzip.open(filename, "rt", encoding="utf-8") as fp:
            yield from _parse_lines(fp)
    else:
        with open(filename, "r", encoding="utf-8") as fp:
            yield from _parse_lines(fp)


def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False) -> None:
    """
    Writes an iterable of dictionaries to jsonl, one JSON document per line.

    A leading "~" in filename is expanded to the user's home directory.
    When filename ends in ".gz" the output is gzip-compressed. With
    append=True the records are added to the end of an existing file instead
    of replacing it; for ".gz" files this appends a new gzip member.
    """
    mode = "ab" if append else "wb"
    filename = os.path.expanduser(filename)
    with open(filename, mode) as fp:
        if filename.endswith(".gz"):
            # The gzip stream is always opened with 'wb'; append-vs-truncate
            # is decided by how the underlying file was opened.
            with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
                for x in data:
                    gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
        else:
            for x in data:
                fp.write((json.dumps(x) + "\n").encode("utf-8"))