12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
- from typing import Iterable, Dict
- import gzip
- import json
- import os
# Absolute path of the directory that contains this module.
ROOT = os.path.split(os.path.abspath(__file__))[0]

# Default problem set: the gzipped HumanEval JSONL file, addressed relative
# to ROOT (one directory up, then into "data").
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """
    Loads every task from a jsonl file and indexes it by its "task_id".

    The file is read through stream_jsonl. If the same "task_id" occurs more
    than once, the last occurrence is the one kept in the returned dict.
    """
    problems = {}
    for task in stream_jsonl(evalset_file):
        problems[task["task_id"]] = task
    return problems
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    A filename ending in ".gz" is read through gzip; any other filename is
    read as plain text. In both cases the content is decoded as UTF-8, and
    lines that are empty or contain only whitespace are skipped.

    This is a generator: the file is opened when iteration starts and closed
    when the generator is exhausted or closed. json.loads errors propagate
    to the caller unchanged.
    """
    # Choose the opener once so both formats share a single parsing loop.
    opener = gzip.open if filename.endswith(".gz") else open
    # Decode explicitly as UTF-8 rather than the locale default, so reading
    # does not depend on the platform's preferred encoding.
    with opener(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serializes each dictionary in ``data`` as one JSON line in ``filename``.

    A leading "~" in ``filename`` is expanded. With ``append`` false the file
    is truncated first; with ``append`` true new content is added at the end.
    When the expanded name ends in ".gz" the lines are written through a
    gzip stream (in append mode this adds a further gzip member to the file);
    otherwise they are written as raw UTF-8 bytes.
    """
    target = os.path.expanduser(filename)
    open_mode = 'ab' if append else 'wb'
    with open(target, open_mode) as raw:
        if target.endswith(".gz"):
            # The gzip layer always writes; truncate-vs-append is decided
            # by how the underlying file was opened.
            with gzip.GzipFile(fileobj=raw, mode='wb') as sink:
                for record in data:
                    sink.write((json.dumps(record) + "\n").encode('utf-8'))
        else:
            for record in data:
                raw.write((json.dumps(record) + "\n").encode('utf-8'))
|