2
0

data.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. from typing import Iterable, Dict
  2. import gzip
  3. import json
  4. import os
# Absolute path of the directory that contains this module.
ROOT = os.path.dirname(os.path.abspath(__file__))
# Default problem-set path: "data/HumanEval.jsonl.gz" located one directory
# above ROOT. Used as the default ``evalset_file`` of ``read_problems``.
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
  7. def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
  8. return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
  9. def stream_jsonl(filename: str) -> Iterable[Dict]:
  10. """
  11. Parses each jsonl line and yields it as a dictionary
  12. """
  13. if filename.endswith(".gz"):
  14. with open(filename, "rb") as gzfp:
  15. with gzip.open(gzfp, 'rt') as fp:
  16. for line in fp:
  17. if any(not x.isspace() for x in line):
  18. yield json.loads(line)
  19. else:
  20. with open(filename, "r") as fp:
  21. for line in fp:
  22. if any(not x.isspace() for x in line):
  23. yield json.loads(line)
  24. def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
  25. """
  26. Writes an iterable of dictionaries to jsonl
  27. """
  28. if append:
  29. mode = 'ab'
  30. else:
  31. mode = 'wb'
  32. filename = os.path.expanduser(filename)
  33. if filename.endswith(".gz"):
  34. with open(filename, mode) as fp:
  35. with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
  36. for x in data:
  37. gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
  38. else:
  39. with open(filename, mode) as fp:
  40. for x in data:
  41. fp.write((json.dumps(x) + "\n").encode('utf-8'))