Browse Source

Add pre-tokenize scripts for pile

Sengxian 2 years ago
parent
commit
0f02a4626d
2 changed files with 24 additions and 0 deletions
  1. 0 0
      tools/__init__.py
  2. 24 0
      tools/tokenize_pile.py

+ 0 - 0
tools/__init__.py


+ 24 - 0
tools/tokenize_pile.py

@@ -0,0 +1,24 @@
+import json
+import tqdm
+from icetk import icetk
+from multiprocessing import Pool
+
+DATA_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val.jsonl"  # input: read line by line, each line parsed as one JSON object
+OUTPUT_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val_tokenized.jsonl"  # output: one re-serialized JSON object per line
+
+
+def get_data(line):
+    """Tokenize a single JSONL record.
+
+    Parses ``line`` as a JSON object, stores the original ``"text"`` string
+    under ``"text_pretokenized"``, replaces ``"text"`` with the result of
+    ``icetk.encode`` applied to that string, and returns the updated object
+    serialized as JSON followed by a newline.
+
+    Raises whatever ``json.loads`` raises on malformed input, and ``KeyError``
+    when the object has no ``"text"`` field.
+    """
+    record = json.loads(line)
+    raw_text = record["text"]
+    # Keep the untokenized string next to the encoded form.
+    record["text_pretokenized"] = raw_text
+    record["text"] = icetk.encode(raw_text)
+    serialized = json.dumps(record)
+    return f"{serialized}\n"
+
+
+def main():
+    """Tokenize every line of DATA_PATH in parallel and write them to OUTPUT_PATH.
+
+    All input lines are read up front (their count drives the progress bar),
+    each line is passed to ``get_data`` in a pool of 16 worker processes, and
+    the resulting lines are written out in the original input order.
+    """
+    # JSON text is UTF-8; be explicit rather than depending on the locale default.
+    with open(DATA_PATH, "r", encoding="utf-8") as src:
+        data = src.readlines()
+
+    # imap (not imap_unordered) yields results in submission order, so output
+    # line i corresponds to input line i.
+    with Pool(16) as p:
+        result = list(tqdm.tqdm(p.imap(get_data, data), total=len(data)))
+
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as dst:
+        dst.writelines(result)
+
+
+# Worker processes started with the "spawn" method re-import this module;
+# without this guard every worker would try to re-run the whole script.
+if __name__ == "__main__":
+    main()