# tokenize_pile.py — tokenize the Pile validation split with icetk.
import json
import tqdm
from icetk import icetk
from multiprocessing import Pool

# Input: the Pile validation split, one JSON object per line (JSONL).
DATA_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val.jsonl"
# Output: same records, with "text" replaced by icetk token ids.
OUTPUT_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val_tokenized.jsonl"
  7. def get_data(line):
  8. item = json.loads(line)
  9. item["text_pretokenized"] = item["text"]
  10. item["text"] = icetk.encode(item["text_pretokenized"])
  11. return json.dumps(item) + "\n"
  12. with open(DATA_PATH, "r") as file:
  13. data = file.readlines()
  14. with Pool(16) as p:
  15. result = list(tqdm.tqdm(p.imap(get_data, data), total=len(data)))
  16. with open(OUTPUT_PATH, "w") as file:
  17. file.writelines(result)