update

- .gitignore +2 -1
- README.md +8 -1
- app.py +140 -43
- app_v1.py +196 -0
- images/VS.svg +7 -0
- tokenizer.py +0 -0
- vocab/__init__.py +13 -2
- vocab/baichuan_7b/__init__.py +3 -0
- vocab/{chatglm → chatglm_6b}/README.md +0 -0
- vocab/{chatglm → chatglm_6b}/__init__.py +0 -0
- vocab/{chatglm → chatglm_6b}/chatglm.vocab +0 -0
- vocab/{chatglm → chatglm_6b}/test_chatglm.py +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/config.json +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json +0 -0
- vocab/gpt_35_turbo/__init__.py +4 -0
- vocab/gpt_35_turbo/test2.py +4 -0
- vocab/{bert_kplug → kplug}/README.md +0 -0
- vocab/kplug/__init__.py +0 -0
- vocab/{bert_kplug → kplug}/bpe_oov.py +0 -0
- vocab/{bert_kplug → kplug}/bpe_oov2.py +0 -0
- vocab/{bert_kplug → kplug}/jd_vocab.py +0 -0
- vocab/{bert_kplug → kplug}/langconv.py +0 -0
- vocab/{bert_kplug → kplug}/test_langconv.py +0 -0
- vocab/{bert_kplug → kplug}/vocab.jd.txt +0 -0
- vocab/{bert_kplug → kplug}/vocab.jd.txt.v2 +0 -0
- vocab/{bert_kplug → kplug}/zh_wiki.py +0 -0
    	
.gitignore CHANGED

@@ -13,4 +13,5 @@ dist/
 downloads/
 eggs/
 .eggs/
-.idea/
+.idea/
+gradio_cached_examples
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title:
+title: Tokenizer Arena
 emoji: ⚡
 colorFrom: red
 colorTo: gray
@@ -10,3 +10,10 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+## ss
+
+
+## ss
+
app.py CHANGED

@@ -9,7 +9,10 @@ plots
 table
 
 ## related demo
-http://text-processing.com/demo/tokenize/
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
 
 ## 可视化
 
@@ -28,15 +31,28 @@ css = """
 .space-show {white-space: pre-wrap;}
 .cell-wrap {white-space: pre-wrap;}
 .category-legend {display: none !important}
+.statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
+.statistics label {text-align: center !important;}
 """
 
-example_text = """
-
-空格测试:  2个空格        8个空格
-数字测试:(10086 + 98) = 100184"""
+example_text = """Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为Sound X"""
 
+# llama chatglm_6b gpt_nexo_20b baichuan  baichuan_7b
+examples = [
+    ["空格测试:  2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["符号测试:🦙", "baichuan_7b", "llama"],
+    ["中文测试:🦙", "baichuan_7b", "llama"],
+    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+]
 
-def tokenize(text, tokenizer_type):
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    """
+    TODO: cache tokenizer
+    """
     print(text, tokenizer_type)
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
@@ -46,12 +62,17 @@ def tokenize(text, tokenizer_type):
 
     for idx, token_id in enumerate(encoding):
         decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-        pos_tokens.extend([(decode_text, str(idx % 5))])
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
 
         # token  "Byte":  # 这是 utf-8编码吧?
         token = tokenizer.convert_ids_to_tokens([token_id])[0]
         if isinstance(token, bytes):
-            token_str = token.decode("utf-8")
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
             token_bytes = token
             json_dumps = json.dumps(token_str)
         elif isinstance(token, str):
@@ -61,9 +82,11 @@ def tokenize(text, tokenizer_type):
         else:
             return
 
+
+
         table.append(
            {"TokenID": token_id,
-             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "⭐Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
             "Text": decode_text,  #
             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如   b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
             "Bytes": str(token_bytes),
@@ -73,74 +96,148 @@ def tokenize(text, tokenizer_type):
 
     table_df = pd.DataFrame(table)
     print(table)
-    print(table_df)
+    # print(table_df)
+
+    return pos_tokens, table_df, len(encoding)
+
 
-    return pos_tokens, table_df
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
 
 
+def get_vocab_size(tokenizer_type):
+    tokenizer = load_tokener(tokenizer_type)
+    return tokenizer.vocab_size
+
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'
 
 
 with gr.Blocks(css=css) as demo:
-    gr.HTML("""<h1 align="center">Tokenizer Arena</h1>""")
+    gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding
+    # 功能:输入文本,进行分词
+    # 分词器:常见的分词器有集中,
+    # 背景:方便分词、看词粒度、对比
     #
+    # Byte: 表示分词
 
 
+    gr.Markdown("## Input Text")
     user_input = gr.Textbox(
         value=example_text,
-
+        label="Input Text",
+        lines=5,
+        show_label=False,
     )  # placeholder="Enter sentence here..."
 
     # submitBtn = gr.Button("生成回复", variant="primary")
 
+    gr.Markdown("## Tokenization")
+
+    with gr.Row():
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_1 = gr.Dropdown(
+                    all_tokenizers,
+                    value="llama",
+                    label="Tokenizer 1",
+                )
+                with gr.Group():
+                    """
+                    <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
+                    """
+                    with gr.Row():
+                        stats_vocab_size_1 = gr.TextArea(
+                            label="VocabSize",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_token_size_1 = gr.TextArea(
+                            label="Tokens",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_3 = gr.TextArea(
+                            label="Compress Rate",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+        # https://www.onlinewebfonts.com/icon/418591
+        gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) # height=10,
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_2 = gr.Dropdown(
+                    all_tokenizers,
+                    value="baichuan_7b",
+                    label="Tokenizer 2",
+                )
+                with gr.Group():
+                    with gr.Row():
+                        stats_vocab_size_2 = gr.TextArea(
+                            label="VocabSize",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_token_size_2 = gr.TextArea(
+                            label="Tokens",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_6 = gr.TextArea(
+                            label="Compress Rate",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+
+
+
     # TODO: 图 表 压缩率
-    # llama chatglm gpt_nexo_20b baichuan  baichuan_7b
     with gr.Row():
         with gr.Column():
-            tokenizer_type_1 = gr.Dropdown(
-                all_tokenizers, value="llama", label="tokenizer"
-            )
-            token_counter_1 = None  # 计数器
             output_text_1 = gr.Highlightedtext(
-                label="
+                label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
-
-            output_table_1 = gr.Dataframe(
-                headers=["TokenID", "Byte", "Text"],
-                datatype=["str", "str", "str"],
-                #elem_classes="space-show",   # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
-            )
-
         with gr.Column():
-            tokenizer_type_2 = gr.Dropdown(
-                all_tokenizers, value="baichuan_7b", label="tokenizer"
-            )
-            token_counter_2 = None  # 计数器
             output_text_2 = gr.Highlightedtext(
-                label="
+                label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
             )
 
-
-
-
-
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",   # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
+    tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
+
+    user_input.change(tokenize_pair,
+                      [user_input, tokenizer_type_1, tokenizer_type_2],
+                      [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
 
-
-
-                      [output_text_1, output_table_1])
-    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2])
+    tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])
 
-
-
-
-
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
 
     # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
     #                 show_progress=True)
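
Note: the new "Compress Rate" boxes (`stats_3`, `stats_6`) are laid out but not yet wired to any handler; the `# TODO: 图 表 压缩率` comment is still open. A minimal sketch of what such a statistic could look like, assuming "compress rate" means input characters per token; the `compress_rate` name and the formula are illustrative assumptions, not part of this commit:

```python
# Hypothetical helper for the "Compress Rate" statistic; the commit does not
# define the formula, so characters-per-token is assumed here.
def compress_rate(text: str, token_size: int) -> str:
    """Return input characters per token, formatted for a gr.TextArea."""
    if token_size == 0:
        return "0.00"
    return f"{len(text) / token_size:.2f}"

# It could be returned alongside len(encoding) from tokenize() and routed to
# stats_3 / stats_6 the same way stats_token_size_1 / _2 are filled today.
```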
    	
app_v1.py ADDED

@@ -0,0 +1,196 @@
+# coding=utf-8
+# author: xusong
+# time: 2022/8/23 16:06
+
+"""
+
+plots
+
+table
+
+## related demo
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
+
+## 可视化
+
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+"""
+
+import json
+import pandas as pd
+import gradio as gr
+
+from vocab import all_tokenizers, load_tokener
+
+# 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673
+# 隐藏legend:
+css = """
+.space-show {white-space: pre-wrap;}
+.cell-wrap {white-space: pre-wrap;}
+.category-legend {display: none !important}
+"""
+
+example_text = """Replace this text in the input field to see how tokenization works
+中文测试:华为智能音箱发布:华为Sound X。維基百科由非營利組織──維基媒體基金會負責維持
+数字测试:(10086 + 98) = 100184"""
+
+# llama chatglm_6b gpt_nexo_20b baichuan  baichuan_7b
+examples = [
+    # ["空格测试:  2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["标点测试:🦙", "baichuan_7b", "llama"],
+]
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    print(text, tokenizer_type)
+    pos_tokens = []
+    tokenizer = load_tokener(tokenizer_type)
+    encoding = tokenizer.encode(text)
+
+    table = []
+
+    for idx, token_id in enumerate(encoding):
+        decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
+
+        # token  "Byte":  # 这是 utf-8编码吧?
+        token = tokenizer.convert_ids_to_tokens([token_id])[0]
+        if isinstance(token, bytes):
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
+            token_bytes = token
+            json_dumps = json.dumps(token_str)
+        elif isinstance(token, str):
+            token_str = token
+            token_bytes = bytes(token_str, "utf-8")
+            json_dumps = json.dumps(token_str)
+        else:
+            return
+
+        table.append(
+            {"TokenID": token_id,
+             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "Text": decode_text,  #
+             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如   b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
+             "Bytes": str(token_bytes),
+             # "Unicode": json_dumps  # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
+             }
+        )
+
+    table_df = pd.DataFrame(table)
+    print(table)
+    # print(table_df)
+
+    return pos_tokens, table_df
+
+
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
+
+
+def test_coding():
+    bytes1 = b'\xe4\xb8\xad'
+    print(bytes1)  # b'\xe4\xb8\xad'
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML("""<h1 align="center">The Tokenizer Arena</h1>""")
+    # links: https://www.coderstool.com/utf8-encoding-decoding
+    #
+
+
+
+    gr.Markdown("## Input Text")
+    user_input = gr.Textbox(
+        value=example_text,
+        label="Input Text",
+        lines=5
+    )  # placeholder="Enter sentence here..."
+
+    # submitBtn = gr.Button("生成回复", variant="primary")
+
+    gr.Markdown("## Tokenization")
+
+    # with gr.Row():
+
+
+
+    # TODO: 图 表 压缩率
+    with gr.Row():
+        with gr.Column():
+            tokenizer_type_1 = gr.Dropdown(
+                all_tokenizers,
+                value="llama",
+                label="Tokenizer 1",
+            )
+            token_counter_1 = None  # 计数器
+            output_text_1 = gr.Highlightedtext(
+                label="Tokens 1",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+        with gr.Column():
+            tokenizer_type_2 = gr.Dropdown(
+                all_tokenizers,
+                value="baichuan_7b",
+                label="Tokenizer 2"
+            )
+            token_counter_2 = None  # 计数器
+            output_text_2 = gr.Highlightedtext(
+                label="Tokens 2",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",   # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_1],
+                      [output_text_1, output_table_1])
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_2],
+                      [output_text_2, output_table_2])
+
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2])
+
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, output_text_2, output_table_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
+
+    # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
+    #                 show_progress=True)
+
+    # examples=[
+    #     ["What a beautiful morning for a walk!"],
+    #     ["It was the best of times, it was the worst of times."],
+    #     ["多个空格    It  ss  was the best of times, it was the worst of times."],
+    # ]
+
+if __name__ == "__main__":
+    demo.launch()
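
Note: both app.py and app_v1.py pass `cache_examples=True` to `gr.Examples`, which is why `gradio_cached_examples` was added to `.gitignore` above: Gradio pre-runs the example function at startup and stores its outputs on disk. A stripped-down sketch of that mechanism, with a placeholder `echo` function standing in for `tokenize_pair`:

```python
import gradio as gr

def echo(text):
    # stand-in for tokenize_pair; cached outputs are written to
    # the gradio_cached_examples/ directory on first launch
    return text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    gr.Examples(
        examples=[["hello"], ["world"]],
        inputs=[inp],
        outputs=[out],
        fn=echo,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.launch()
```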
    	
images/VS.svg ADDED

New SVG icon (+7 lines), the "VS" badge rendered between the two tokenizer columns in app.py.
tokenizer.py ADDED

Empty placeholder file.
vocab/__init__.py CHANGED

@@ -1,7 +1,18 @@
-import transformers
 import importlib
 from enum import Enum, auto
 
+
+"""
+Interface:
+- 
+
+tokenizer.parent = ""
+tokenizer.type = TokenizerType.ByteBPE.name
+tokenizer.implementation = TokenizerImpl.SentencePiece.name   # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
+tokenizer.comments = "split all numbers into individual digits, " \
+                     "and fallback to bytes to decompose unknown UTF-8 characters"
+"""
+
 Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
 uniq_tokenizers = [
@@ -29,7 +40,7 @@ all_tokenizers = [
     #
     # ##### glm系列
     # "glm_chinese",
-    "chatglm",
+    "chatglm_6b",
     #
     # #### llama alpaca系列
     "llama",  #  '中文单字': 700, '中文多字': 0
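
Note: the new module docstring documents the metadata each `vocab.<name>` package is expected to expose (`tokenizer.type`, `tokenizer.implementation`, `tokenizer.comments`). The `load_tokener` helper that app.py imports from this package is not shown in the diff; given the `importlib` import kept here, it plausibly resolves a dropdown name such as "chatglm_6b" to that package's `tokenizer` object along these lines (a sketch under that assumption, not the actual implementation):

```python
import importlib

def load_tokener(tokenizer_name: str):
    # e.g. "chatglm_6b" -> module vocab.chatglm_6b, return its `tokenizer` object
    module = importlib.import_module("vocab." + tokenizer_name)
    return module.tokenizer
```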
    	
vocab/baichuan_7b/__init__.py CHANGED

@@ -6,3 +6,6 @@ tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remo
 
 # byte-bpe  sentencepiece
 tokenizer.type = TokenizerType.ByteBPE
+
+tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"
+
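
The added comment states that Baichuan-7B uses SentencePiece byte-level BPE. The hunk context shows how this module builds its tokenizer; a quick interactive check along the same lines (downloads the model repo and requires `trust_remote_code=True`, as in the module itself):

```python
from transformers import AutoTokenizer

# same loading call as in vocab/baichuan_7b/__init__.py
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
print(tokenizer.vocab_size)                                # SentencePiece byte-BPE vocab size
print(tokenizer.tokenize("华为智能音箱发布:华为Sound X"))   # inspect token granularity
```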
    	
vocab/{chatglm → chatglm_6b}/README.md RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/__init__.py RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/chatglm.vocab RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/test_chatglm.py RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/config.json RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json RENAMED (file unchanged)
vocab/gpt_35_turbo/__init__.py CHANGED

@@ -16,7 +16,11 @@ def decode(self, tokens, errors="replace"):
         decode_str = "null"
     return decode_str
 
+def convert_ids_to_tokens(self, tokens):
+    return tokenizer.decode_tokens_bytes(tokens)
+
 
 Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 
 
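
The added `convert_ids_to_tokens` is monkey-patched onto tiktoken's `Encoding` so the gpt_35_turbo tokenizer exposes the same method that the Hugging Face tokenizers already have, letting `tokenize()` in app.py handle both kinds uniformly. Because `decode_tokens_bytes` returns one `bytes` object per token id, these tokens fall into the `isinstance(token, bytes)` branch there. A usage sketch, assuming the module's `tokenizer` is the cl100k_base encoding used by gpt-3.5-turbo:

```python
import tiktoken

# assumption: gpt_35_turbo wraps the cl100k_base encoding
tokenizer = tiktoken.get_encoding("cl100k_base")
ids = tokenizer.encode("华为Sound X")
# one bytes object per token id, mirroring the patched convert_ids_to_tokens
print(tokenizer.decode_tokens_bytes(ids))
```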
    	
vocab/gpt_35_turbo/test2.py CHANGED

@@ -22,6 +22,10 @@ print(decoding_bytes)
 #     print(token, token_str, json.dumps(token_str))
 
 
+tokenizer.decode_tokens_bytes([10])
+tokenizer.decode_single_token_bytes(10)
+tokenizer.decode_bytes([10])
+
 f_out = open("vocab.jsonl", "w")
 # 100255
 for i in range(tokenizer.n_vocab):
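
The three calls added above exercise tiktoken's byte-level decoding APIs, whose return types differ in ways that matter for the app's "Bytes" column. A quick standalone check (assumes tiktoken is installed and that `tokenizer` here is the cl100k_base encoding):

```python
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")
assert isinstance(tokenizer.decode_tokens_bytes([10]), list)        # list of per-token bytes
assert isinstance(tokenizer.decode_single_token_bytes(10), bytes)   # bytes of a single token id
assert isinstance(tokenizer.decode_bytes([10]), bytes)              # concatenated byte string
```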
    	
vocab/{bert_kplug → kplug}/README.md RENAMED (file unchanged)
vocab/kplug/__init__.py ADDED (empty file)
vocab/{bert_kplug → kplug}/bpe_oov.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/bpe_oov2.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/jd_vocab.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/langconv.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/test_langconv.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/vocab.jd.txt RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/vocab.jd.txt.v2 RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/zh_wiki.py RENAMED (file unchanged)