Spaces:

yhavinga
/

dutch-tokenizer-arena

Sleeping

xu-song committed on Jan 31, 2024

Commit

6bdf6c6

1 Parent(s): 9820e00

update

Files changed (2) hide show

README.md CHANGED Viewed

@@ -18,9 +18,6 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 ## TODO
-- 'MossTokenizer' object has no attribute 'encoder'
-- chatglmTokenizer

vocab/chatglm_6b/__init__.py CHANGED Viewed

@@ -6,17 +6,13 @@ import os
 import config
 from transformers import AutoTokenizer
-# if config.USE_REMOTE:
-tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-# else:
-#     os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
-#     CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-#     TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
-#     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
 # https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L153
 tokenizer.comments = f"num_image_tokens: {tokenizer.sp_tokenizer.num_image_tokens}; num_image_tokens: {tokenizer.sp_tokenizer.num_text_tokens} "

 import config
 from transformers import AutoTokenizer
+if False:  # has a bug
+    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+else:
+    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+    TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
 # https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L153
 tokenizer.comments = f"num_image_tokens: {tokenizer.sp_tokenizer.num_image_tokens}; num_image_tokens: {tokenizer.sp_tokenizer.num_text_tokens} "