Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

xu-song committed on Jan 31, 2024

Commit

c75633b

1 Parent(s): 6bdf6c6

add more tokenizers

Files changed (9) hide show

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ sentencepiece
 tiktoken
 icetk
 torch
-zhon

 tiktoken
 icetk
 torch
+zhon
+nltk

utils/compress_rate_util.py CHANGED Viewed

@@ -1,9 +1,7 @@
 """
-中文数据
-英文数据：
 """

 """
+中文数据：clue superclue
+英文数据：glue cnn_dailymail gigaword
 """

utils/zh_util.py CHANGED Viewed

@@ -72,7 +72,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
     if from_cache and name in cache:
         return cache[name]
-    f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
     zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
     # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
@@ -91,7 +91,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
         if isinstance(token, bytes):
             token = token.decode("utf-8", errors="ignore")
-        digit_count = get_digit_count(token)
         zh_count = get_zh_count(decode_str)
         space_count = get_space_count(decode_str)
@@ -99,7 +99,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
             {"id": token_id,
              "token": token,
              "token_decode": decode_str,
-             "token_len": len(token),
              "zh_count": zh_count,
              "space_count": space_count,
              "digit_count": digit_count,

     if from_cache and name in cache:
         return cache[name]
+    f_out = open(name + "_vocab.jsonl", "w", encoding="utf-8")
     zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
     # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
         if isinstance(token, bytes):
             token = token.decode("utf-8", errors="ignore")
+        digit_count = get_digit_count(decode_str)
         zh_count = get_zh_count(decode_str)
         space_count = get_space_count(decode_str)
             {"id": token_id,
              "token": token,
              "token_decode": decode_str,
+             "token_len": len(decode_str),
              "zh_count": zh_count,
              "space_count": space_count,
              "digit_count": digit_count,

vocab/__init__.py CHANGED Viewed

@@ -130,6 +130,10 @@ all_tokenizers = [
     "phi_1",
     "phi_2",
     "solar_10_7b",
 "wizardcoder_python_7b_v1",
 "wizardlm_7b_v1",
 "wizardmath_70b_v1",

     "phi_1",
     "phi_2",
     "solar_10_7b",
+    "mobilebert_uncased",
+    "mobilenet_v2",
+    "switch_c_2048",
+    "byt5_small",
 "wizardcoder_python_7b_v1",
 "wizardlm_7b_v1",
 "wizardmath_70b_v1",

vocab/byt5_small/__init__.py ADDED Viewed


1	+ from transformers import AutoTokenizer
2	+
3	+ tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

vocab/llama/demo.py CHANGED Viewed

@@ -30,4 +30,20 @@ tokens = [    1, 29961, 25580, 29962,  3532, 14816, 29903,  6778,    13,  3492,
 text = tokenizer.decode(tokens)
 print(text)
 for token_id in tokens:
-    print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))

 text = tokenizer.decode(tokens)
 print(text)
 for token_id in tokens:
+    print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))
+def byte_token():
+    """
+    为什么 \n 是 "<0x0A>"
+    8 11    145
+    :return:
+    """
+    for token_id in [8, 11, 145]:
+        token_str = tokenizer.decode([token_id])
+        print(token_str)
+byte_token()

vocab/mobilebert_uncased/__init__.py ADDED Viewed


1	+ from transformers import AutoTokenizer
2	+ tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased", trust_remote_code=True)

vocab/mobilenet_v2/__init__.py ADDED Viewed


1	+ from transformers import AutoTokenizer
2	+ tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)

vocab/switch_c_2048/__init__.py ADDED Viewed


1	+
2	+ from transformers import AutoTokenizer
3	+
4	+ tokenizer = AutoTokenizer.from_pretrained("google/switch-c-2048")