tokenize_huggingface.py

import os

from transformers import AutoTokenizer

os.environ['TOKENIZERS_PARALLELISM'] = "false"

# Hugging Face repos whose tokenizers are dumped for the test cases
list_repo_hf = ["databricks/dolly-v2-3b",           # dolly-v2 (3b, 7b, 12b models share the same tokenizer)
                "gpt2",                              # gpt-2 (gpt2-xl, gpt2-large share the same tokenizer)
                "uer/gpt2-chinese-cluecorpussmall",  # gpt-2-chinese
                "EleutherAI/gpt-j-6b",               # gpt-j
                "EleutherAI/gpt-neox-20b",           # gpt-neox
                "EleutherAI/polyglot-ko-1.3b",       # gpt-neox (polyglot-ko 5.8b and 12.8b share the same tokenizer)
                "rinna/japanese-gpt-neox-3.6b",      # gpt-neox
                                                     # mpt-7b (uses gpt-neox-20b tokenizer)
                "replit/replit-code-v1-3b",          # replit
                "bigcode/starcoder",                 # starcoder (huggingface-cli login required)
                "openai/whisper-tiny"                # whisper (base, large, large-v2 share the same tokenizer)
                ]

# map from Hugging Face repo to the ggml model name used for the output file
repo2ggml = {"databricks/dolly-v2-3b"           : "dolly-v2",
             "gpt2"                             : "gpt-2",
             "uer/gpt2-chinese-cluecorpussmall" : "gpt-2-chinese",
             "EleutherAI/gpt-j-6b"              : "gpt-j",
             "EleutherAI/gpt-neox-20b"          : "gpt-neox",
             "EleutherAI/polyglot-ko-1.3b"      : "polyglot-ko",
             "rinna/japanese-gpt-neox-3.6b"     : "gpt-neox-japanese",
             "replit/replit-code-v1-3b"         : "replit",
             "bigcode/starcoder"                : "starcoder",
             "openai/whisper-tiny"              : "whisper"}

# language of the test sentences each tokenizer should be run on
repo2language = {"databricks/dolly-v2-3b"           : "english",
                 "gpt2"                             : "english",
                 "uer/gpt2-chinese-cluecorpussmall" : "chinese",
                 "EleutherAI/gpt-j-6b"              : "english",
                 "EleutherAI/gpt-neox-20b"          : "english",
                 "EleutherAI/polyglot-ko-1.3b"      : "korean",
                 "rinna/japanese-gpt-neox-3.6b"     : "japanese",
                 "replit/replit-code-v1-3b"         : "english",
                 "bigcode/starcoder"                : "english",
                 "openai/whisper-tiny"              : "english"}

delimiter = ": "

# parse test-cases.txt: each line has the form "<language>: <sentence>"
test_sentences = []
with open("test-cases.txt", "r") as f:
    lines = [l.rstrip() for l in f.readlines()]
    for l in lines:
        if delimiter in l:
            language = l[:l.index(delimiter)]
            sentence = l[l.index(delimiter) + len(delimiter):]
            test_sentences.append((language.lower(), sentence))

# tokenize the matching sentences with each Hugging Face tokenizer and
# write one "<ggml name>.txt" file with "<sentence> => <token ids>" lines
for repo in list_repo_hf:
    target_language = repo2language[repo]
    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

    tokens_hf = []
    for language, sentence in test_sentences:
        if language == target_language:
            tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
            tokens_hf.append((sentence, tokens))

    save_txt = repo2ggml[repo] + ".txt"
    with open(save_txt, "w") as f:
        f.writelines([sentence + " => " + ",".join(str(t) for t in tokens) + "\n" for sentence, tokens in tokens_hf])
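The script expects test-cases.txt to contain one "<language>: <sentence>" entry per line (e.g. "english: Hello world" is an illustrative example, not a line from the repository), and it writes one "<ggml name>.txt" file per repo with "<sentence> => <comma-separated token ids>" lines. The sketch below shows one way such an output file could be read back for comparison against another tokenizer; the file name "gpt-2.txt" and the helper load_reference_tokens are assumptions for illustration, not part of the script above.

# Minimal sketch, assuming "gpt-2.txt" was produced by the script above
# and that no test sentence itself contains " => ".
def load_reference_tokens(path):
    """Parse a generated '<sentence> => <ids>' file into (sentence, ids) pairs."""
    pairs = []
    with open(path, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            if " => " not in line:
                continue
            sentence, ids = line.rsplit(" => ", 1)
            pairs.append((sentence, [int(t) for t in ids.split(",") if t]))
    return pairs

# Example usage:
# for sentence, ids in load_reference_tokens("gpt-2.txt"):
#     print(sentence, ids)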