update

- .gitignore +2 -1
- README.md +8 -1
- app.py +140 -43
- app_v1.py +196 -0
- images/VS.svg +7 -0
- tokenizer.py +0 -0
- vocab/__init__.py +13 -2
- vocab/baichuan_7b/__init__.py +3 -0
- vocab/{chatglm → chatglm_6b}/README.md +0 -0
- vocab/{chatglm → chatglm_6b}/__init__.py +0 -0
- vocab/{chatglm → chatglm_6b}/chatglm.vocab +0 -0
- vocab/{chatglm → chatglm_6b}/test_chatglm.py +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/config.json +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json +0 -0
- vocab/gpt_35_turbo/__init__.py +4 -0
- vocab/gpt_35_turbo/test2.py +4 -0
- vocab/{bert_kplug → kplug}/README.md +0 -0
- vocab/kplug/__init__.py +0 -0
- vocab/{bert_kplug → kplug}/bpe_oov.py +0 -0
- vocab/{bert_kplug → kplug}/bpe_oov2.py +0 -0
- vocab/{bert_kplug → kplug}/jd_vocab.py +0 -0
- vocab/{bert_kplug → kplug}/langconv.py +0 -0
- vocab/{bert_kplug → kplug}/test_langconv.py +0 -0
- vocab/{bert_kplug → kplug}/vocab.jd.txt +0 -0
- vocab/{bert_kplug → kplug}/vocab.jd.txt.v2 +0 -0
- vocab/{bert_kplug → kplug}/zh_wiki.py +0 -0
    	
.gitignore CHANGED

@@ -13,4 +13,5 @@ dist/
 downloads/
 eggs/
 .eggs/
-.idea/
+.idea/
+gradio_cached_examples
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title:
+title: Tokenizer Arena
 emoji: ⚡
 colorFrom: red
 colorTo: gray
@@ -10,3 +10,10 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+## ss
+
+
+## ss
+
app.py CHANGED

@@ -9,7 +9,10 @@ plots
 table
 
 ## related demo
-http://text-processing.com/demo/tokenize/
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
 
 ## 可视化
 
@@ -28,15 +31,28 @@ css = """
 .space-show {white-space: pre-wrap;}
 .cell-wrap {white-space: pre-wrap;}
 .category-legend {display: none !important}
+.statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
+.statistics label {text-align: center !important;}
 """
 
-example_text = """
-
-空格测试:  2个空格        8个空格
-数字测试:(10086 + 98) = 100184"""
+example_text = """Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为Sound X"""
 
+# llama chatglm_6b gpt_nexo_20b baichuan  baichuan_7b
+examples = [
+    ["空格测试:  2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["符号测试:🦙", "baichuan_7b", "llama"],
+    ["中文测试:🦙", "baichuan_7b", "llama"],
+    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+]
 
-def tokenize(text, tokenizer_type):
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    """
+    TODO: cache tokenizer
+    """
     print(text, tokenizer_type)
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
@@ -46,12 +62,17 @@ def tokenize(text, tokenizer_type):
 
     for idx, token_id in enumerate(encoding):
         decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-        pos_tokens.extend([(decode_text, str(idx % 5))])
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
 
         # token  "Byte":  # 这是 utf-8编码吧?
         token = tokenizer.convert_ids_to_tokens([token_id])[0]
         if isinstance(token, bytes):
-            token_str = token.decode("utf-8")
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
             token_bytes = token
             json_dumps = json.dumps(token_str)
         elif isinstance(token, str):
@@ -61,9 +82,11 @@ def tokenize(text, tokenizer_type):
         else:
             return
 
+
+
         table.append(
            {"TokenID": token_id,
-             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "⭐Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
             "Text": decode_text,  #
             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如   b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
             "Bytes": str(token_bytes),
@@ -73,74 +96,148 @@ def tokenize(text, tokenizer_type):
 
     table_df = pd.DataFrame(table)
     print(table)
-    print(table_df)
+    # print(table_df)
+
+    return pos_tokens, table_df, len(encoding)
+
 
-    return pos_tokens, table_df
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
 
 
+def get_vocab_size(tokenizer_type):
+    tokenizer = load_tokener(tokenizer_type)
+    return tokenizer.vocab_size
+
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'
 
 
 with gr.Blocks(css=css) as demo:
-    gr.HTML("""<h1 align="center">Tokenizer Arena</h1>""")
+    gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding
+    # 功能:输入文本,进行分词
+    # 分词器:常见的分词器有集中,
+    # 背景:方便分词、看词粒度、对比
     #
+    # Byte: 表示分词
 
 
+    gr.Markdown("## Input Text")
     user_input = gr.Textbox(
         value=example_text,
-
+        label="Input Text",
+        lines=5,
+        show_label=False,
     )  # placeholder="Enter sentence here..."
 
     # submitBtn = gr.Button("生成回复", variant="primary")
 
+    gr.Markdown("## Tokenization")
+
+    with gr.Row():
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_1 = gr.Dropdown(
+                    all_tokenizers,
+                    value="llama",
+                    label="Tokenizer 1",
+                )
+                with gr.Group():
+                    """
+                    <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
+                    """
+                    with gr.Row():
+                        stats_vocab_size_1 = gr.TextArea(
+                            label="VocabSize",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_token_size_1 = gr.TextArea(
+                            label="Tokens",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_3 = gr.TextArea(
+                            label="Compress Rate",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+        # https://www.onlinewebfonts.com/icon/418591
+        gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) # height=10,
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_2 = gr.Dropdown(
+                    all_tokenizers,
+                    value="baichuan_7b",
+                    label="Tokenizer 2",
+                )
+                with gr.Group():
+                    with gr.Row():
+                        stats_vocab_size_2 = gr.TextArea(
+                            label="VocabSize",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_token_size_2 = gr.TextArea(
+                            label="Tokens",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+                        stats_6 = gr.TextArea(
+                            label="Compress Rate",
+                            lines=1,
+                            elem_classes="statistics"
+                        )
+
+
+
     # TODO: 图 表 压缩率
-    # llama chatglm gpt_nexo_20b baichuan  baichuan_7b
     with gr.Row():
         with gr.Column():
-            tokenizer_type_1 = gr.Dropdown(
-                all_tokenizers, value="llama", label="tokenizer"
-            )
-            token_counter_1 = None  # 计数器
             output_text_1 = gr.Highlightedtext(
-                label="
+                label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
-
-            output_table_1 = gr.Dataframe(
-                headers=["TokenID", "Byte", "Text"],
-                datatype=["str", "str", "str"],
-                #elem_classes="space-show",   # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
-            )
-
         with gr.Column():
-            tokenizer_type_2 = gr.Dropdown(
-                all_tokenizers, value="baichuan_7b", label="tokenizer"
-            )
-            token_counter_2 = None  # 计数器
             output_text_2 = gr.Highlightedtext(
-                label="
+                label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
             )
 
-
-
-
-
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",   # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
+    tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
+
+    user_input.change(tokenize_pair,
+                      [user_input, tokenizer_type_1, tokenizer_type_2],
+                      [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
 
-
-
-                      [output_text_1, output_table_1])
-    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2])
+    tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])
 
-
-
-
-
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
 
     # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
     #                 show_progress=True)
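
Note: the new "Compress Rate" boxes (`stats_3`, `stats_6`) are laid out but not yet wired to any handler; the `# TODO: 图 表 压缩率` comment is still open. A minimal sketch of what such a statistic could look like, assuming "compress rate" means input characters per token; the `compress_rate` name and the formula are illustrative assumptions, not part of this commit:

```python
# Hypothetical helper for the "Compress Rate" statistic; the commit does not
# define the formula, so characters-per-token is assumed here.
def compress_rate(text: str, token_size: int) -> str:
    """Return input characters per token, formatted for a gr.TextArea."""
    if token_size == 0:
        return "0.00"
    return f"{len(text) / token_size:.2f}"

# It could be returned alongside len(encoding) from tokenize() and routed to
# stats_3 / stats_6 the same way stats_token_size_1 / _2 are filled today.
```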
    	
app_v1.py ADDED

@@ -0,0 +1,196 @@
+# coding=utf-8
+# author: xusong
+# time: 2022/8/23 16:06
+
+"""
+
+plots
+
+table
+
+## related demo
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
+
+## 可视化
+
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+"""
+
+import json
+import pandas as pd
+import gradio as gr
+
+from vocab import all_tokenizers, load_tokener
+
+# 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673
+# 隐藏legend:
+css = """
+.space-show {white-space: pre-wrap;}
+.cell-wrap {white-space: pre-wrap;}
+.category-legend {display: none !important}
+"""
+
+example_text = """Replace this text in the input field to see how tokenization works
+中文测试:华为智能音箱发布:华为Sound X。維基百科由非營利組織──維基媒體基金會負責維持
+数字测试:(10086 + 98) = 100184"""
+
+# llama chatglm_6b gpt_nexo_20b baichuan  baichuan_7b
+examples = [
+    # ["空格测试:  2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["标点测试:🦙", "baichuan_7b", "llama"],
+]
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    print(text, tokenizer_type)
+    pos_tokens = []
+    tokenizer = load_tokener(tokenizer_type)
+    encoding = tokenizer.encode(text)
+
+    table = []
+
+    for idx, token_id in enumerate(encoding):
+        decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
+
+        # token  "Byte":  # 这是 utf-8编码吧?
+        token = tokenizer.convert_ids_to_tokens([token_id])[0]
+        if isinstance(token, bytes):
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
+            token_bytes = token
+            json_dumps = json.dumps(token_str)
+        elif isinstance(token, str):
+            token_str = token
+            token_bytes = bytes(token_str, "utf-8")
+            json_dumps = json.dumps(token_str)
+        else:
+            return
+
+        table.append(
+            {"TokenID": token_id,
+             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "Text": decode_text,  #
+             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如   b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
+             "Bytes": str(token_bytes),
+             # "Unicode": json_dumps  # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
+             }
+        )
+
+    table_df = pd.DataFrame(table)
+    print(table)
+    # print(table_df)
+
+    return pos_tokens, table_df
+
+
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
+
+
+def test_coding():
+    bytes1 = b'\xe4\xb8\xad'
+    print(bytes1)  # b'\xe4\xb8\xad'
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML("""<h1 align="center">The Tokenizer Arena</h1>""")
+    # links: https://www.coderstool.com/utf8-encoding-decoding
+    #
+
+
+
+    gr.Markdown("## Input Text")
+    user_input = gr.Textbox(
+        value=example_text,
+        label="Input Text",
+        lines=5
+    )  # placeholder="Enter sentence here..."
+
+    # submitBtn = gr.Button("生成回复", variant="primary")
+
+    gr.Markdown("## Tokenization")
+
+    # with gr.Row():
+
+
+
+    # TODO: 图 表 压缩率
+    with gr.Row():
+        with gr.Column():
+            tokenizer_type_1 = gr.Dropdown(
+                all_tokenizers,
+                value="llama",
+                label="Tokenizer 1",
+            )
+            token_counter_1 = None  # 计数器
+            output_text_1 = gr.Highlightedtext(
+                label="Tokens 1",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+        with gr.Column():
+            tokenizer_type_2 = gr.Dropdown(
+                all_tokenizers,
+                value="baichuan_7b",
+                label="Tokenizer 2"
+            )
+            token_counter_2 = None  # 计数器
+            output_text_2 = gr.Highlightedtext(
+                label="Tokens 2",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",   # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_1],
+                      [output_text_1, output_table_1])
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_2],
+                      [output_text_2, output_table_2])
+
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2])
+
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, output_text_2, output_table_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
+
+    # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
+    #                 show_progress=True)
+
+    # examples=[
+    #     ["What a beautiful morning for a walk!"],
+    #     ["It was the best of times, it was the worst of times."],
+    #     ["多个空格    It  ss  was the best of times, it was the worst of times."],
+    # ]
+
+if __name__ == "__main__":
+    demo.launch()
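
Note: both app.py and app_v1.py pass `cache_examples=True` to `gr.Examples`, which is why `gradio_cached_examples` was added to `.gitignore` above: Gradio pre-runs the example function at startup and stores its outputs on disk. A stripped-down sketch of that mechanism, with a placeholder `echo` function standing in for `tokenize_pair`:

```python
import gradio as gr

def echo(text):
    # stand-in for tokenize_pair; cached outputs are written to
    # the gradio_cached_examples/ directory on first launch
    return text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    gr.Examples(
        examples=[["hello"], ["world"]],
        inputs=[inp],
        outputs=[out],
        fn=echo,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.launch()
```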
    	
images/VS.svg ADDED

New SVG icon (+7 lines), the "VS" badge rendered between the two tokenizer columns in app.py.
tokenizer.py ADDED

Empty placeholder file.
vocab/__init__.py CHANGED

@@ -1,7 +1,18 @@
-import transformers
 import importlib
 from enum import Enum, auto
 
+
+"""
+Interface:
+- 
+
+tokenizer.parent = ""
+tokenizer.type = TokenizerType.ByteBPE.name
+tokenizer.implementation = TokenizerImpl.SentencePiece.name   # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
+tokenizer.comments = "split all numbers into individual digits, " \
+                     "and fallback to bytes to decompose unknown UTF-8 characters"
+"""
+
 Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
 uniq_tokenizers = [
@@ -29,7 +40,7 @@ all_tokenizers = [
     #
     # ##### glm系列
     # "glm_chinese",
-    "chatglm",
+    "chatglm_6b",
     #
     # #### llama alpaca系列
     "llama",  #  '中文单字': 700, '中文多字': 0
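
Note: the new module docstring documents the metadata each `vocab.<name>` package is expected to expose (`tokenizer.type`, `tokenizer.implementation`, `tokenizer.comments`). The `load_tokener` helper that app.py imports from this package is not shown in the diff; given the `importlib` import kept here, it plausibly resolves a dropdown name such as "chatglm_6b" to that package's `tokenizer` object along these lines (a sketch under that assumption, not the actual implementation):

```python
import importlib

def load_tokener(tokenizer_name: str):
    # e.g. "chatglm_6b" -> module vocab.chatglm_6b, return its `tokenizer` object
    module = importlib.import_module("vocab." + tokenizer_name)
    return module.tokenizer
```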
    	
vocab/baichuan_7b/__init__.py CHANGED

@@ -6,3 +6,6 @@ tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remo
 
 # byte-bpe  sentencepiece
 tokenizer.type = TokenizerType.ByteBPE
+
+tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"
+
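
The added comment states that Baichuan-7B uses SentencePiece byte-level BPE. The hunk context shows how this module builds its tokenizer; a quick interactive check along the same lines (downloads the model repo and requires `trust_remote_code=True`, as in the module itself):

```python
from transformers import AutoTokenizer

# same loading call as in vocab/baichuan_7b/__init__.py
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
print(tokenizer.vocab_size)                                # SentencePiece byte-BPE vocab size
print(tokenizer.tokenize("华为智能音箱发布:华为Sound X"))   # inspect token granularity
```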
    	
vocab/{chatglm → chatglm_6b}/README.md RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/__init__.py RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/chatglm.vocab RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/test_chatglm.py RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/config.json RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py RENAMED (file unchanged)
vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json RENAMED (file unchanged)
vocab/gpt_35_turbo/__init__.py CHANGED

@@ -16,7 +16,11 @@ def decode(self, tokens, errors="replace"):
         decode_str = "null"
     return decode_str
 
+def convert_ids_to_tokens(self, tokens):
+    return tokenizer.decode_tokens_bytes(tokens)
+
 
 Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 
 
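
The added `convert_ids_to_tokens` is monkey-patched onto tiktoken's `Encoding` so the gpt_35_turbo tokenizer exposes the same method that the Hugging Face tokenizers already have, letting `tokenize()` in app.py handle both kinds uniformly. Because `decode_tokens_bytes` returns one `bytes` object per token id, these tokens fall into the `isinstance(token, bytes)` branch there. A usage sketch, assuming the module's `tokenizer` is the cl100k_base encoding used by gpt-3.5-turbo:

```python
import tiktoken

# assumption: gpt_35_turbo wraps the cl100k_base encoding
tokenizer = tiktoken.get_encoding("cl100k_base")
ids = tokenizer.encode("华为Sound X")
# one bytes object per token id, mirroring the patched convert_ids_to_tokens
print(tokenizer.decode_tokens_bytes(ids))
```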
    	
vocab/gpt_35_turbo/test2.py CHANGED

@@ -22,6 +22,10 @@ print(decoding_bytes)
 #     print(token, token_str, json.dumps(token_str))
 
 
+tokenizer.decode_tokens_bytes([10])
+tokenizer.decode_single_token_bytes(10)
+tokenizer.decode_bytes([10])
+
 f_out = open("vocab.jsonl", "w")
 # 100255
 for i in range(tokenizer.n_vocab):
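
The three calls added above exercise tiktoken's byte-level decoding APIs, whose return types differ in ways that matter for the app's "Bytes" column. A quick standalone check (assumes tiktoken is installed and that `tokenizer` here is the cl100k_base encoding):

```python
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")
assert isinstance(tokenizer.decode_tokens_bytes([10]), list)        # list of per-token bytes
assert isinstance(tokenizer.decode_single_token_bytes(10), bytes)   # bytes of a single token id
assert isinstance(tokenizer.decode_bytes([10]), bytes)              # concatenated byte string
```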
    	
vocab/{bert_kplug → kplug}/README.md RENAMED (file unchanged)
vocab/kplug/__init__.py ADDED (empty file)
vocab/{bert_kplug → kplug}/bpe_oov.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/bpe_oov2.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/jd_vocab.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/langconv.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/test_langconv.py RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/vocab.jd.txt RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/vocab.jd.txt.v2 RENAMED (file unchanged)
vocab/{bert_kplug → kplug}/zh_wiki.py RENAMED (file unchanged)