Spaces:
Sleeping
Sleeping
"""
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]

Open question: which of these tokens act as bos / eos?

[MASK]  for short blank filling - 150000
[sMASK] for sentence filling -
[gMASK] for left-to-right generation. - 150001

Whitespace handling:
text.replace("\t", f"<|tab|>")
text.replace(" " * i, f"<|blank_{i}|>")
text.replace("\n", "<n>")

"bos_token": "<sop>",  (start of piece)
"eop_token": "<eop>",
"eos_token": "</s>",

## Confirmed
130005 = <eop>

## Source code:
- https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L32
"""
import os

# Select the pure-Python protobuf implementation. protobuf reads this
# variable when it is first imported, so it has to be set before any import
# that may pull protobuf in (hence before the transformers import below).
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from transformers import AutoTokenizer  # noqa: E402  (must follow the env setup)

# Alternative source for the same tokenizer:
# tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# NOTE(review): trust_remote_code=True runs the tokenizer code shipped with
# the loaded files; only point this at a source you trust.
tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
def encode_text(text):
    """Print the tokens, ids and round-tripped text for ``text``.

    Uses the module-level ``tokenizer``: the text is tokenized, encoded with
    ``add_special_tokens=False``, and the resulting ids are decoded again.
    All three results are printed on a single line; nothing is returned.
    """
    pieces = tokenizer.tokenize(text)
    ids = tokenizer.encode(text=text, add_special_tokens=False)
    round_trip = tokenizer.decode(ids)
    report = ("tokens: ", pieces, ";\tid: ", ids, ";\ttext: ", round_trip)
    print(*report)
def test_space():
    """Run ``encode_text`` over whitespace, newline and dialogue samples."""
    # Original author's observation: " " comes out empty after encoding.
    samples = [
        " ",
        "\t",
        "你是谁",
        "你是\n谁",
        "你是 谁",
        "你是 谁",
        "'[Round 0]\n问:你是谁\n答:我是一个名为 ChatGLM-6B 的人工智能助手,是基于清华大学 KEG 实验室和智谱 AI 公司于 2023 年共同训练的语言模型开发的。我的任务是针对用户的问题和要求提供适当的答复和支持。\n[Round 1]\n问:你会干什么\n答:",
    ]
    for sample in samples:
        encode_text(sample)
def test_case():
    """Run ``encode_text`` over a few case/spelling variants of a phrase."""
    phrases = (
        "Good morning",
        "good morning",
        "good morning",
        "goog morningabc",
    )
    for phrase in phrases:
        encode_text(phrase)
def export():
    """Dump the tokenizer's vocabulary pieces to ``chatglm.vocab``.

    Writes the ``piece`` string of every entry in
    ``tokenizer.sp_tokenizer.text_tokenizer.proto.pieces`` to the file, one
    per line, in index order. The file is created or truncated.
    """
    with open("chatglm.vocab", "w", encoding="utf-8") as f_out:
        entries = tokenizer.sp_tokenizer.text_tokenizer.proto.pieces
        for entry in entries:
            f_out.write(entry.piece + "\n")
| # export() | |
def test_tokens():
    """Print the vocabulary piece behind each id of a fixed token sequence.

    For every id in the hard-coded list, prints its position, the id, and
    the ``piece`` string found at ``id - start_idx`` in
    ``tokenizer.sp_tokenizer.text_tokenizer.proto.pieces``.
    """
    tokens = [
        53, 6945, 5, 8, 42, 4, 64286, 12, 74874,
        4, 67342, 12, 74874, 130328, 130247, 130233, 130227, 35,
        65806, 68241, 75890, 14132, 5388, 340, 11, 21, 222,
        6, 76693, 66877, 63852, 6, 66430, 68747, 102501, 63823,
        4, 52, 6945, 5, 9, 42, 4, 64286, 12,
        65450, 83400, 64213, 66846, 4, 67342, 12, 130001, 130004,
        74747, 83400, 66115, 90478, 70597, 63826, 68076, 6, 63873,
        68684, 64113, 120922, 73129, 63823, 65056, 63829, 63948, 64124,
        79727, 64447, 12, 4, 4, 9, 7, 5, 64716,
        93067, 95119, 64560, 12, 66524, 63827, 70682, 63944, 89160,
        63826, 71304, 6, 79553, 67155, 63826, 68668, 63843, 91351,
        96846, 63823, 4, 4, 10, 7, 5, 95472, 74107,
        66625, 64285, 12, 64442, 67201, 69609, 63824, 81548, 63824,
        70870, 63826, 66800, 6, 94824, 63959, 65195, 65515, 63824,
        64392, 69584, 63824, 81198, 63914, 63835, 63823, 4, 4,
        13, 7, 5, 66544, 69656, 12, 66533, 63891, 63948,
        66544, 69726, 6, 63906, 86089, 63824, 88419, 63824, 69765,
        63853, 64369, 102753, 64736, 63823, 4, 4, 16, 7,
        5, 65073, 63827, 72151, 64020, 67491, 66469, 63853, 68168,
        12, 65289, 95128, 63826, 68819, 6, 118679, 66115, 64174,
        66625, 63823, 4, 4, 15, 7, 5, 86790, 12,
        70666, 89266, 63878, 66544, 69656, 6, 67623, 73129, 63823,
        4, 4, 21, 7, 71210, 79856, 63912, 63831, 66625,
        69204, 64659, 12, 66312, 63922, 64984, 67427, 63824, 63959,
        65419, 63853, 64384, 63835, 63823, 4, 4, 63976, 106490,
        65921, 64542, 73129, 6, 63852, 80917, 65207, 64678, 63853,
        66625, 64427, 6, 89385, 64124, 79727, 64447, 63823, 130005,
    ]

    # Offset subtracted from each id before the lookup. Original author's
    # notes: ChatGLM token ids start at 0; with the default vocabulary the
    # offset would be 20000 because the first 20000 entries are image tokens.
    start_idx = 0

    # Resolve the piece table once instead of on every iteration.
    pieces = tokenizer.sp_tokenizer.text_tokenizer.proto.pieces
    for i, token in enumerate(tokens):
        print(i, token, pieces[token - start_idx].piece)
# Runs at import/execution time: print the piece for each hard-coded token id.
test_tokens()

# Note on id mapping kept from the original author:
# tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens

# Other experiments, disabled by default:
# test_case()
# test_space()