| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 | 
import os
from collections import defaultdict

from transformers import AutoTokenizer
 
# Turn off parallelism in the Hugging Face `tokenizers` backend for this run.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

# Single source of truth for every supported tokenizer:
#   (Hugging Face repo id, ggml model name, language of the test sentences).
# The three public lookups below are derived from this table so they can never
# drift out of sync with each other.
_MODELS = [
    # dolly-v2 (3b, 7b, 12b models share the same tokenizer)
    ("databricks/dolly-v2-3b", "dolly-v2", "english"),
    # gpt-2 (gpt2-xl, gpt2-large share the same tokenizer)
    ("gpt2", "gpt-2", "english"),
    # gpt-2-chinese
    ("uer/gpt2-chinese-cluecorpussmall", "gpt-2-chinese", "chinese"),
    # gpt-j
    ("EleutherAI/gpt-j-6b", "gpt-j", "english"),
    # gpt-neox
    ("EleutherAI/gpt-neox-20b", "gpt-neox", "english"),
    # gpt-neox (polyglot-ko 5.8b and 12.8b share the same tokenizer)
    ("EleutherAI/polyglot-ko-1.3b", "polyglot-ko", "korean"),
    # gpt-neox
    ("rinna/japanese-gpt-neox-3.6b", "gpt-neox-japanese", "japanese"),
    # mpt-7b has no entry of its own: it uses the gpt-neox-20b tokenizer
    # replit
    ("replit/replit-code-v1-3b", "replit", "english"),
    # starcoder (huggingface-cli login required)
    ("bigcode/starcoder", "starcoder", "english"),
    # whisper (base, large, large-v2 share the same tokenizer)
    ("openai/whisper-tiny", "whisper", "english"),
]

# Repos to process, in table order.
list_repo_hf = [repo for repo, _, _ in _MODELS]

# Repo id -> ggml model name (used as the stem of the output file name).
repo2ggml = {repo: ggml for repo, ggml, _ in _MODELS}

# Repo id -> language tag used to select matching test sentences.
repo2language = {repo: language for repo, _, language in _MODELS}
 
# Separator between the language tag and the sentence on each test-case line,
# e.g. "English: Hello world". The (misspelled) name is kept unchanged so any
# existing reference to it keeps working.
delimeter = ": "

# (lower-cased language, sentence) pairs, in file order.
test_sentences = []

# The test cases cover Chinese, Japanese and Korean as well as English, so the
# file is decoded explicitly as UTF-8 (assumed encoding of test-cases.txt)
# rather than with the platform default, which is not UTF-8 everywhere.
with open("test-cases.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        # Strip trailing whitespace first: a line that ends right after the
        # language tag ("english: ") then no longer contains the delimiter
        # and is skipped, exactly like any other line without one.
        line = raw_line.rstrip()
        # Split on the FIRST delimiter only; the sentence may contain ": ".
        language, found, sentence = line.partition(delimeter)
        if found:
            test_sentences.append((language.lower(), sentence))
 
# Group the test sentences by language once, instead of re-scanning the whole
# list for every repo. Order within each language follows test_sentences.
sentences_by_language = defaultdict(list)
for language, sentence in test_sentences:
    sentences_by_language[language].append(sentence)

# For every repo: tokenize the sentences of its language with the reference
# Hugging Face tokenizer and write "<sentence> => <id>,<id>,..." lines to
# "<ggml model name>.txt" in the current directory.
for repo in list_repo_hf:
    target_language = repo2language[repo]

    # NOTE(review): trust_remote_code=True allows the repository to execute
    # its own Python code while the tokenizer is loaded. Only list repos you
    # trust in list_repo_hf.
    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

    # .get() avoids inserting an empty entry for a language with no cases.
    tokens_hf = [
        (sentence, tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence)))
        for sentence in sentences_by_language.get(target_language, [])
    ]

    save_txt = repo2ggml[repo] + ".txt"
    # The sentences are echoed into the output and include non-ASCII text, so
    # write UTF-8 explicitly instead of relying on the platform default.
    with open(save_txt, "w", encoding="utf-8") as f:
        f.writelines(
            f"{sentence} => {','.join(str(t) for t in tokens)}\n"
            for sentence, tokens in tokens_hf
        )
 
 
  |