# tokenize_pile.py — tokenize the Pile validation split with icetk.
import json
import tqdm
from icetk import icetk
from multiprocessing import Pool

# Input: the Pile validation split, one JSON object per line (JSONL).
DATA_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val.jsonl"
# Output: same records, with "text" replaced by icetk token ids.
OUTPUT_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val_tokenized.jsonl"
  7. def get_data(line):
  8. item = json.loads(line)
  9. item["text_pretokenized"] = item["text"]
  10. item["text"] = icetk.encode(item["text_pretokenized"])
  11. return json.dumps(item) + "\n"
  12. with open(DATA_PATH, "r") as file:
  13. data = file.readlines()
  14. with Pool(16) as p:
  15. result = list(tqdm.tqdm(p.imap(get_data, data), total=len(data)))
  16. with open(OUTPUT_PATH, "w") as file:
  17. file.writelines(result)