fix tiktoken
- examples.py +22 -5
- tokenizer/tiktoken_patch.py +69 -0
- util.py +1 -0
- vocab/__init__.py +1 -1
- vocab/gpt_35_turbo/__init__.py +2 -69
- vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json +0 -0
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json +0 -0
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json +0 -0
- vocab/gpt_neox_chinese_v1/README.md +0 -64
- vocab/gpt_neox_chinese_v1/__init__.py +0 -14
- vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py +0 -61
- vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py +0 -50
- vocab/gpt_neox_chinese_v1/mock.py +0 -32
- vocab/gpt_neox_chinese_v1/test_tokenizer.py +0 -43
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/README.md +0 -3
- vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +0 -185
- vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py +0 -205
- vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/oov.txt +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/sort_test.py +0 -18
- vocab/gpt_neox_chinese_v1/to_v2/test2.py +0 -42
- vocab/gpt_neox_chinese_v1/to_v2/test_oov.py +0 -69
- vocab/gpt_neox_chinese_v1/to_v2/test_queue.py +0 -20
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt +0 -0
- vocab/gpt_neox_chinese_v1/tokenizer/__init__.py +0 -16
- vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py +0 -368
- vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py +0 -402
- vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py +0 -126
- vocab/gpt_neox_chinese_v1/trouble-shooting.md +0 -22
- vocab/moss/__init__.py +1 -1
- vocab/text_davinci_003/__init__.py +14 -59
examples.py
CHANGED
@@ -1,12 +1,30 @@
+"""
+
+## characters
+
+- alphanumeric characters
+- numeric characters
+- special characters: A special character is a character that is not an alphabetic or numeric character.
+- ASCII control characters
+- punctuation marks
+- accent marks
+- math symbols
+- whitespace:
+  - https://en.wikipedia.org/wiki/Whitespace_character
+  - https://emptycharacter.com/
+
+
+https://www.computerhope.com/jargon/s/specchar.htm
+"""
+
 examples = {
     "en": [
-        ["
+        ["number: (10086 + 98) = 100184", "llama", "bloom"],
+        ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
-
-    ]
-    ,
+    ],
     "zh": [
         ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n
         ["标点测试:,。!?;", "baichuan_7b", "llama"],
@@ -14,7 +32,6 @@ examples = {
         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
     ]
-
 }
 
 more_examples = [

tokenizer/tiktoken_patch.py
ADDED
@@ -0,0 +1,69 @@
+
+from tiktoken import Encoding
+from utils.log_util import logger
+
+def decode(self, tokens, errors="replace", skip_special_tokens=False):
+    """
+    The default decode may raise errors; see decode_test.py for details.
+    skip_special_tokens is accepted for compatibility with hf_tokenizer.
+    """
+    try:
+        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
+    except:
+        decode_str = "null"
+    return decode_str
+
+
+def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
+    """
+    Why doesn't tiktoken provide this method?
+    """
+    try:
+        return self.decode_tokens_bytes(tokens)
+    except Exception as e:
+        # Why return None? See zh_util.py.
+        # 16 unused ids: 100256, 100261-100275
+        logger.error(e)
+        return [None for _ in tokens]
+
+
+def get_vocab(self, token_type="str"):
+    """Returns vocab as a dict
+    :param token_type: ["str", "byte"]
+    :return:
+    """
+    vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
+    for i in range(self.vocab_size):
+        try:
+            token_byte = self.convert_ids_to_tokens([i])[0]
+            if token_byte is None:
+                continue
+            # token_str = token_byte.decode("utf-8")
+            vocab[token_byte] = i
+
+        except UnicodeDecodeError:  # 773 UnicodeDecodeError
+            unicode_decode_error_list.append((i, str(token_byte)))
+            vocab[token_byte] = i
+
+    # vocab.update(self.added_tokens_encoder)
+    logger.info(f"{self.name} {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"{self.name} {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
+    return vocab
+
+
+def encode(self, *args, **kwargs):
+    """
+    add_special_tokens is accepted for compatibility with hf_tokenizer.
+    """
+    kwargs.pop("add_special_tokens", None)
+    return self._encode(*args, **kwargs)
+
+
+# tiktoken patch
+Encoding._encode = Encoding.encode
+Encoding.encode = encode
+Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
+Encoding.get_vocab = get_vocab

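The module above monkey-patches tiktoken's `Encoding` class at import time. A minimal usage sketch (not part of the commit; the input text and variable names are illustrative) showing how the HuggingFace-style helpers become available once the module is imported:

```python
# Minimal sketch: apply the patch by importing the module, then use the added helpers.
import tiktoken
import tokenizer.tiktoken_patch  # noqa: F401 -- importing applies the patch to tiktoken.Encoding

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
enc.vocab_size = enc.n_vocab  # mirrors vocab/gpt_35_turbo/__init__.py; get_vocab() reads this attribute

ids = enc.encode("hello 世界", add_special_tokens=False)  # extra kwarg is dropped by the patched encode
print(enc.convert_ids_to_tokens(ids))  # token bytes, or a list of None if the ids cannot be decoded
print(enc.decode(ids))                 # returns "null" instead of raising on bad ids
```
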
util.py
CHANGED
@@ -52,6 +52,7 @@ def tokenize(text, tokenizer_type, color_num=5):
             # continue
 
         # ⭐
+        # TODO: gpt3.5_turbo bug: only the id and text columns are correct; token and utf8 are both wrong, so convert_ids_to_tokens is failing.
         table.append(
             {"TokenID": token_id,
              "Token": token_str,  # the utf-8-decoded string; why do some show as <0xE7>, and what does that mean? e.g. llama

vocab/__init__.py
CHANGED
@@ -85,7 +85,7 @@ all_tokenizers = [
     # "gpt_neox_chinese_v1",
     #
     # ##### glm series
-    "glm_chinese",
+    # "glm_chinese",
     "chatglm_6b",
     "chatglm2_6b",
     "chatglm3_6b",

vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -1,10 +1,9 @@
 """
-
+
 """
 
 import tiktoken
-
-from utils.log_util import logger
+import tokenizer.tiktoken_patch
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 tokenizer.vocab_size = tokenizer.n_vocab
@@ -12,69 +11,3 @@ tokenizer.vocab_size = tokenizer.n_vocab
 tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
 tokenizer.reversible = True  # It's reversible and lossless, so you can convert tokens back into the original text
 
-
-def decode(self, tokens, errors="replace", skip_special_tokens=False):
-    """
-    The default decode may raise errors; see decode_test.py for details.
-    skip_special_tokens is accepted for compatibility with hf_tokenizer.
-    """
-    try:
-        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-    except:
-        decode_str = "null"
-    return decode_str
-
-
-def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
-    """
-    Why doesn't tiktoken provide this method?
-    """
-    try:
-        return self.decode_tokens_bytes(tokens)
-    except Exception as e:
-        # Why return None? See zh_util.py.
-        # 16 unused ids: 100256, 100261-100275
-        logger.error(e)
-        return [None for _ in tokens]
-
-
-def get_vocab(self, token_type="str"):
-    """Returns vocab as a dict
-    :param token_type: ["str", "byte"]
-    :return:
-    """
-    vocab = {}
-    key_error_list = []
-    unicode_decode_error_list = []
-    for i in range(self.vocab_size):
-        try:
-            token_byte = self.convert_ids_to_tokens([i])[0]
-            if token_byte is None:
-                continue
-            # token_str = token_byte.decode("utf-8")
-            vocab[token_byte] = i
-
-        except UnicodeDecodeError:  # 773 UnicodeDecodeError
-            unicode_decode_error_list.append((i, str(token_byte)))
-            vocab[token_byte] = i
-
-    # vocab.update(self.added_tokens_encoder)
-    logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
-    logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
-    return vocab
-
-
-def encode(self, *args, **kwargs):
-    """
-    add_special_tokens is accepted for compatibility with hf_tokenizer.
-    """
-    kwargs.pop("add_special_tokens", None)
-    return self._encode(*args, **kwargs)
-
-
-# tiktoken patch
-Encoding._encode = Encoding.encode
-Encoding.encode = encode
-Encoding.decode = decode
-Encoding.convert_ids_to_tokens = convert_ids_to_tokens
-Encoding.get_vocab = get_vocab

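With the shared patch pulled in via `import tokenizer.tiktoken_patch`, the module-level `tokenizer` object exposes the HF-style helpers. A hedged sketch of how it might be inspected (names are as defined in this module; the size comparison is only indicative, since ids that fail to decode are skipped):

```python
# Illustrative check of the patched gpt-3.5-turbo tokenizer exposed by this module.
from vocab.gpt_35_turbo import tokenizer

vocab = tokenizer.get_vocab()            # dict: token bytes -> id, skipping undecodable ids
print(tokenizer.vocab_size, len(vocab))  # len(vocab) may be smaller than n_vocab
print(tokenizer.convert_ids_to_tokens([100, 200, 300]))
```
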
vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/README.md
DELETED
@@ -1,64 +0,0 @@
-
-
-```
-added vocab (size: 54634) with 22 dummy tokens (new size: 54656)
-Vocab size: 54634
-
-training data
-```
-
-
-https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
-
-
-## 20B
-
-[configs/20B.yml](https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml#L7)
-```
-"vocab-file": "./20B_checkpoints/20B_tokenizer.json",
-```
-
-Vocab size: 50277
-self.padded_vocab_size = 50304
-
-
-padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
-
-## Vocabulary
-
-See convert_vocab_to_txt.py
-
-```
-{"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"}  中
-
-# multiple symbols concatenated into one token
-{"id": 13663, "token": ".*]{}", "token_decode": ".*]{}"}  .*]{}
-
-# ss
-
-```
-
-
-## Chinese support
-
-Essentially no OOV.
-
-gpt-neox was trained on an 800G English dataset, so why does its vocabulary cover Chinese? Because it uses byte-level BPE.
-
-```
-丁 [3218, 212]
-七 [3218, 214]
-万 [3218, 218]
-诀 [11894, 211]
-证 [11894, 212]
-```
-
-
-Encoding-length distribution: Counter({2: 4190, 3: 1295, 1: 285})
-Average encoding length: 2.1750433275563257
-
-
-## ss
-
-
-

vocab/gpt_neox_chinese_v1/__init__.py
DELETED
@@ -1,14 +0,0 @@
-
-import os
-from tokenizers import Tokenizer
-
-
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "20B_tokenizer_chinese.json")
-
-tokenizer = Tokenizer.from_file(TOKENIZER_DIR)
-
-tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
-
-# vocab_size = len(tokenizer.get_vocab())
-# vocab_size = tokenizer.vocab_size

vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py
DELETED
@@ -1,61 +0,0 @@
-"""
-What is merge for?
-
-## Result
-
-4357 tokens merged in total
-"""
-
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(vocab_path):
-    data = json.load(open(vocab_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(vocab_path)
-    print("vocab_size with added_tokens:", )
-    return data, tokenizer
-
-data, base_tokenizer = load_base_tokenizer("../gpt_nexo_20b/20B_tokenizer.json")
-vocab = data["model"]["vocab"]
-merges = data["model"]["merges"]
-vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
-
-
-"""
-Option 1: the existing added_tokens keep their ids unchanged. Option 2: the existing added_tokens get shifted ids.
-Option 1 is used below.
-"""
-new_added_tokens = {}
-for word in jd_vocab_tokens + list(zh_punc):
-    if len(word) > 1 or word in new_added_tokens:
-        continue
-    encoding = base_tokenizer.encode(word)
-    # if len(encoding.ids) > 1:
-    if len(encoding.ids) == 2:  # what about words that encode to 3 tokens?
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        # print("merging", vocab_size, word, json.dumps(tokens))
-        vocab["".join(tokens)] = vocab_size
-        new_added_tokens[word] = vocab_size
-        vocab_size += 1
-        merges.append(" ".join(tokens))
-
-
-
-print("共merge %d 个 token" % (len(new_added_tokens)))
-
-with open("20B_tokenizer_chinese.json", "w", encoding="utf-8") as f_out:
-    json.dump(data, f_out, indent=2)
-
-## check
-tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
-all_error_ids = []
-for word, idx in new_added_tokens.items():
-    decode_str = tokenizer.decode([idx])
-    if word != decode_str:
-        all_error_ids.append(idx)
-        print(idx, word, decode_str)
-
-print(all_error_ids)
-

vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py
DELETED
@@ -1,50 +0,0 @@
-"""
-What is merge for?
-
-## Result
-
-4357 tokens merged in total
-"""
-
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer():
-    old_vocab_path = "../gpt_nexo_20b/20B_tokenizer.json"
-    data = json.load(open(old_vocab_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(old_vocab_path)
-    print("vocab_size with added_tokens:", )
-    return data, tokenizer
-
-data, base_tokenizer = load_base_tokenizer()
-vocab = data["model"]["vocab"]
-merges = data["model"]["merges"]
-vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
-
-
-"""
-Option 1: the existing added_tokens keep their ids unchanged. Option 2: the existing added_tokens get shifted ids.
-Option 1 is used below.
-"""
-new_added_tokens = set()
-for word in jd_vocab_tokens + list(zh_punc):
-    if len(word) > 1 or word in new_added_tokens:
-        continue
-    encoding = base_tokenizer.encode(word)
-    # if len(encoding.ids) > 1:
-    if len(encoding.ids) == 2:  # what about words that encode to 3 tokens?
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        print("merging", vocab_size, word, json.dumps(tokens))
-        vocab["".join(tokens)] = vocab_size
-        vocab_size += 1
-        merges.append(" ".join(tokens))
-        new_added_tokens.add(word)
-
-
-print("共merge %d 个 token" % (len(new_added_tokens)))
-
-f_out = open("20B_tokenizer_chinese_2.json", "w", encoding="utf-8")
-
-json.dump(data, f_out, indent=2)

vocab/gpt_neox_chinese_v1/mock.py
DELETED
@@ -1,32 +0,0 @@
-import copy
-import json
-from tokenizers import Tokenizer
-
-def export_mock_tokenizer():
-    input_path = "20B_tokenizer_chinese.json"
-
-    tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
-
-    vocab = tokenizer["model"]["vocab"]
-    added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
-
-    for k, v in copy.deepcopy(vocab).items():
-        if v not in added_tokens:
-            vocab[str(v)] = v
-            vocab.pop(k)
-
-    out_path = input_path.replace(".json", ".mock.json")
-    with open(out_path, "w", encoding="utf-8") as f_out:
-        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
-
-
-def mock2():
-    pass
-
-
-def load_mock_tokenizer():
-    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
-    print('')
-
-export_mock_tokenizer()
-load_mock_tokenizer()

vocab/gpt_neox_chinese_v1/test_tokenizer.py
DELETED
@@ -1,43 +0,0 @@
-import json
-from tokenizers import Tokenizer
-
-tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
-print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))
-
-def test_token():
-    """
-    :return:
-    """
-    text = " \t\n中国解决方法黑白侗鸩玥,。!"
-    # text = open("../../data_sample/EBKE20150806001_epub_30198917_30198917.txt", "r", encoding="utf-8").readline()
-    encoding = tokenizer.encode(text)
-    decoding = tokenizer.decode(encoding.ids)
-    print(decoding)
-    for word in text:
-        encoding = tokenizer.encode(word)
-        for token_id in encoding.ids:
-            decode_str = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
-            token = tokenizer.id_to_token(token_id)
-            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-
-def test_encode():
-    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗<|endoftext|>一 个刹车卉"
-    encoding = tokenizer.encode(text)
-    print(tokenizer.decode(encoding.ids))
-    for token_id in encoding.ids:
-        decode_str = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
-        token = tokenizer.id_to_token(token_id)
-        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-
-def test_decode():
-    encoding = [30903, 20287, 20005, 52300, 25949, 30329, 50039, 31949, 25538,
-                34698, 18764, 5225, 53915, 163, 223]
-
-    decode_str = tokenizer.decode(encoding, skip_special_tokens=False)
-    print(decode_str)
-
-# test_token()
-test_encode()
-# test_decode()
-

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/README.md
DELETED
@@ -1,3 +0,0 @@
-
-Expand the vocabulary to v2
-

vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py
DELETED
@@ -1,185 +0,0 @@
-
-
-
-
-import shutil
-import json
-from queue import Queue
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(tokenizer_path):
-    print("loading", tokenizer_path)
-    data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(tokenizer_path)
-    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-    return data, tokenizer
-
-
-def insert_token(word, index):
-    pass
-
-# Tokens that must not be deleted: e.g. ones that look low-frequency in the initial counts (hence deletable) but appear in the newly added vocabulary.
-
-
-def load_reserve_tokens(word_list, base_tokenizer):
-    data, base_tokenizer = base_tokenizer
-    reserved_token = set()
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        for i in range(0, len(encoding.ids)):
-            reserved_token.add("".join(tokens[:i+1]))
-    return reserved_token
-
-
-reserved_token = set()
-
-
-def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=None):
-    """
-    append token to the end of vocab
-    """
-    new_vocab = set()
-    new_merges = set()
-
-    data, base_tokenizer = base_tokenizer
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
-
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        if len(encoding.ids) == 1:
-            continue
-
-        if len(encoding.ids) >= 4:
-            print("[ERROR]: encoding不能超过4", word, encoding)
-
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        # print("merging", word, json.dumps(tokens))
-        for i in range(1, len(encoding.ids)):
-            new_vocab.add("".join(tokens[:i+1]))
-            new_merges.add("".join(tokens[:i]) + " " + tokens[i])
-
-    # append to the end of vocab
-    # print("new_vocab size", len(new_vocab))
-    # print("new_merges size", len(new_merges))
-    if unused_ids == None:
-        for token in new_vocab:
-            vocab[token] = vocab_size
-            vocab_size += 1
-        merges += new_merges
-    else:
-        for iddx, token in enumerate(new_vocab):
-            # print(unused_ids.qsize())
-            unused_token_id, unused_token_str, unused_merges = unused_ids.get()
-            if unused_token_id == 39468:
-                print("catch")
-            if unused_token_str in reserved_token:
-                print("skip unused token", unused_token_id, unused_token_str)
-                unused_token_id, unused_token_str, unused_merges = unused_ids.get()
-
-            print("[%d]merging %s to unused %s %s" % (unused_ids.qsize(), json.dumps(token), unused_token_id, json.dumps(unused_token_str)) )
-            vocab[token] = unused_token_id
-            if unused_token_id != vocab.pop(unused_token_str):
-                print("ERROR")
-            # assert unused_token_id == vocab.pop(unused_token_str)
-            merges.remove(unused_merges)
-            # print(new_merges)
-            merges += new_merges
-
-    # print("merged %d tokens in total" % (len(new_vocab)))
-    # print(json.dumps(list(new_vocab)))
-
-
-    with open(output_tokenizer_path, "w", encoding="utf-8") as f_out:
-        json.dump(data, f_out, indent=2)
-
-    return data, base_tokenizer
-
-
-
-
-# data, base_tokenizer = load_base_tokenizer(output_tokenizer_path)
-# encoding = base_tokenizer.encode(word)
-# print(encoding.ids)
-
-
-def load_unused_id():
-    unused_ids = Queue(maxsize=0)
-    for line in open("word_count.corpus.remove.jsonl", "r", encoding="utf-8"):
-        line_data = json.loads(line)
-        token_id = line_data["id"]
-        token_str = line_data["token"]
-        merges = line_data["merges"]
-        unused_ids.put((token_id, token_str, merges))
-    # for i in range(2000):
-    #     unused_ids.get()
-    return unused_ids
-
-
-def check_tokenize(base_tokenizer, word):
-    data, base_tokenizer = base_tokenizer
-    encodings = base_tokenizer.encode(word)
-    assert len(encodings.ids) == 1
-    assert base_tokenizer.decode(encodings.ids) == word
-
-
-def add_tokens():
-
-
-    unused_ids = load_unused_id()
-    add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-    add_chars = [char for token in add_tokens for char in token]
-    add_chars = list(set(add_chars))
-    add_words = [token for token in add_tokens if len(token) > 1]
-
-
-    tokenizer_path = "../20B_tokenizer_chinese.json"
-    # tokenizer_path = "../../gpt_nexo_20b/20B_tokenizer.json"
-    base_tokenizer = load_base_tokenizer(tokenizer_path)
-    reserved_token.update(load_reserve_tokens(add_chars, base_tokenizer))
-
-    ## add chars
-    append_token(add_chars, base_tokenizer, "20B_tokenizer.1.json", unused_ids=unused_ids)
-    print(unused_ids.qsize())  # 22320
-    new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
-
-    append_token(add_words,
-                 new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
-    new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-
-    #
-    # ## add words
-    # while unused_ids.qsize() != 22320:
-    #     unused_ids.get()
-    # assert unused_ids.qsize() == 22320
-    #
-    # shutil.copyfile("20B_tokenizer.1.json", "20B_tokenizer.2.json")
-    # while len(add_words) > 0:
-    #     new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-    #     append_token([add_words.pop()],
-    #                  new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
-    #     # new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-
-
-def check_all_tokens():
-    add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-    add_chars = [char for token in add_tokens for char in token]
-    add_chars = list(set(add_chars))
-    add_words = [token for token in add_tokens if len(token) > 1]
-    # add_chars = ['吳']
-    base_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-    for k in add_chars:
-        check_tokenize(base_tokenizer, k)
-    for word in add_words:
-        # print(word)
-        check_tokenize(base_tokenizer, word)
-
-add_tokens()
-check_all_tokens()
-
-
-

vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py
DELETED
@@ -1,205 +0,0 @@
-"""
-Collect extremely low-frequency tokens, for pruning the vocabulary.
-"""
-
-import copy
-import glob
-import json
-from collections import defaultdict
-
-
-def word_count():
-    from collections import Counter
-    from megatron.data.indexed_dataset import MMapIndexedDataset
-    counter = Counter()
-    for file_name in glob.glob("data/jd/*.bin"):
-        print(file_name)
-        file_name = file_name[:-4]
-        dataset = MMapIndexedDataset(file_name, skip_warmup=True)
-        for doc in dataset:
-            counter.update(doc)
-
-    f_out = open("word_count.txt", "w", encoding="utf-8")
-    for token_id, count in counter.most_common():
-        f_out.write("%d\t%d\n" % (token_id, count))
-
-
-def get_unused_id():
-    pass
-
-
-def print_word_count():
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-    data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
-
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    merge_dict = {}
-
-    sorted_parts = []
-    for merge in merges:
-        idx = merge.find(" ")
-        token_str = merge[:idx] + merge[idx + 1:]
-        merge_dict[token_str] = (merge[:idx], merge[idx + 1:])
-        sorted_parts += [token_str, merge[:idx], merge[idx + 1:]]
-    id2vocab = {idx: token for token, idx in vocab.items()}
-
-    # fill in sorted_parts and sort
-    all_tokens = [line.strip().split("\t") for line in open("word_count.corpus.txt", "r", encoding="utf-8")]
-    raw_token_count = {int(token_id): int(count) for token_id, count in all_tokens}
-    sorted_parts = set(sorted_parts)
-    for token_id in raw_token_count:
-        if token_id in [35448, 40519]:
-            print("ddd")
-        token_str = id2vocab[token_id]
-        if token_str not in sorted_parts:
-            sorted_parts.add(token_str)
-            # print(token_id, token_str, json.dumps(token_str), raw_token_count[token_id], " not in parts")
-    sorted_parts = sorted(set(sorted_parts), key=lambda k: len(k), reverse=True)
-
-    # recompute merge frequencies
-    # token_count = copy.deepcopy(raw_token_count)
-    token_count = defaultdict(int)
-    for token_str in sorted_parts:  # iterate from longest to shortest (otherwise a deep traversal would be needed)
-        token_id = vocab[token_str]
-        if token_id in [35448, 40519]:
-            print("ddd")
-
-        count = raw_token_count.get(token_id, 0)
-        token_count[token_id] += count  # frequency of the original token
-        if token_str in merge_dict:
-            if vocab[merge_dict[token_str][0]] in [35448, 40519] or vocab[merge_dict[token_str][1]] in [35448, 40519]:
-                print("ddd")
-            token_count[vocab[merge_dict[token_str][0]]] += token_count[token_id]
-            token_count[vocab[merge_dict[token_str][1]]] += token_count[token_id]
-        else:
-            print(token_id, json.dumps(token_str))
-
-
-    # re-sort (ascending by frequency; ties broken by descending length)
-    sorted_token_count = sorted(token_count.items(), key=lambda kv: (kv[1], -len(id2vocab[kv[0]])))
-    f_out = open("word_count.corpus.sort_by_count.jsonl", "w", encoding="utf-8")
-    for token_id, count in sorted_token_count:
-        # for token_str, count in token_count.items():
-        token_str = id2vocab[token_id]
-        # token_id = vocab[token_str]
-        decode_str = tokenizer.decode([token_id])  # decoding is lossy
-        if token_str in merge_dict:
-            merges = " ".join(merge_dict[token_str])
-        else:
-            merges = "NULL"
-        f_out.write(json.dumps(
-            {"id": token_id, "token": token_str, "merges": merges, "raw_count": raw_token_count.get(token_id, 0),
-             "count": count, "decode_str": decode_str}) + "\n")
-
-
-def get_remove_words():
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-
-    data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
-    added_tokens = [token["id"] for token in data["added_tokens"]]
-
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    id2vocab = {idx: token for token, idx in vocab.items()}
-
-    merge_dict = {k.replace(" ", "", 1): k for k in merges}
-
-    token_count = {}
-    for line in open("word_count.corpus.sort_by_count.jsonl", "r", encoding="utf-8"):
-        line_data = json.loads(line)
-        token_id = int(line_data["id"])
-        count = int(line_data["count"])
-        token_count[token_id] = count
-
-    f_out = open("word_count.corpus.remove.jsonl", "w", encoding="utf-8")
-    remove_vocab_set = set()
-
-    # # 1. drop erroneous tokens
-    # error_tokens = [54611, 54612, 54613, 54614, 54615, 54616, 54617, 54618, 54619, 54620, 54621, 54622,
-    #                 54623, 54624, 54625, 54626, 54627, 54628, 54629, 54630, 54631, 54632, 54633]
-    # for token_id in error_tokens:
-    #     token_str = id2vocab[token_id]
-    #     # token_str = tokenizer.id_to_token(token_id)  # lossy
-    #     remove_vocab_set.add(token_id)
-    #     f_out.write(json.dumps(
-    #         {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": 0,
-    #          "type": "error-char"}) + "\n")
-
-
-    # 2. drop overly long tokens
-    # for token_id in range(tokenizer.get_vocab_size()):
-    #     if token_id in added_tokens:
-    #         continue
-    #     token_str = id2vocab[token_id]
-    #     # token_str = tokenizer.id_to_token(token_id)  # also lossy, e.g. token 54611
-    #     decode_str = tokenizer.decode([token_id])  # decode is lossy, e.g. Ġ becomes a space
-    #     if len(decode_str) > 8 and len(set(decode_str)) < 3:
-    #         if token_id in remove_vocab_set:
-    #             continue
-    #         remove_vocab_set.add(token_id)
-    #         f_out.write(
-    #             json.dumps({"id": token_id, "token": token_str,
-    #                         "merges": merge_dict.get(token_str), "count": token_count.get(token_id, 0),
-    #                         "type": "filtered by length"}, ensure_ascii=False) + "\n")
-    #
-    #     # remove dependencies (otherwise merges would contain OOV tokens)
-    #     #
-    #     for merge in merges:
-    #         if token_str in merge:
-    #         # if token_str + " " in merge or " " + token_str in merge:
-    #             parent_token_str = merge.replace(" ", "", 1)
-    #             parent_token_id = vocab[parent_token_str]
-    #             if parent_token_id in remove_vocab_set:
-    #                 continue
-    #             remove_vocab_set.add(parent_token_id)
-    #             f_out.write(
-    #                 json.dumps({"id": parent_token_id, "token": parent_token_str,
-    #                             "merges": merge, "count": token_count.get(parent_token_id, 0),
-    #                             "type": "filtered by length - dependency removal"}, ensure_ascii=False) + "\n")
-
-    # 3. drop low-frequency tokens
-    for token_id, count in list(token_count.items())[:25000]:
-        # token_id = 6460
-        if token_id in added_tokens:
-            continue
-        if token_id in remove_vocab_set:
-            continue
-
-        token_str = tokenizer.id_to_token(token_id)
-        # token_str = tokenizer.decode([int(token_id)])
-        if len(token_str.strip()) > 1:
-            remove_vocab_set.add(token_id)
-            f_out.write(json.dumps(
-                {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": count,
-                 "type": "remove by frequency"}) + "\n")
-
-    ######## already sorted by frequency, so no dependency removal is needed
-    # # remove dependencies (otherwise merges would contain OOV tokens)
-    # for merge in merges:
-    #     # if token_str + " " in merge or " " + token_str in merge:
-    #     if token_str in merge:
-    #         parent_token_str = merge.replace(" ", "", 1)
-    #         parent_token_id = vocab[parent_token_str]
-    #         if parent_token_id in remove_vocab_set:
-    #             continue
-    #         remove_vocab_set.add(parent_token_id)
-    #         f_out.write(
-    #             json.dumps({"id": parent_token_id, "token": parent_token_str,
-    #                         "merges": merge, "count": token_count.get(parent_token_id, 0),
-    #                         "type": "filtered by frequency - dependency removal"}, ensure_ascii=False) + "\n")
-
-    # remove 24969 tokens
-    print("remove %d tokens" % (len(remove_vocab_set)))
-
-
-def ss():
-    pass
-
-
-# word_count()
-# print_word_count()
-get_remove_words()
-

vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/oov.txt
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/sort_test.py
DELETED
@@ -1,18 +0,0 @@
-
-
-
-a = {
-    "aa", 1,
-    "aaa", 1,
-    "aaaa", 1,
-    "aaaaaa", 1,
-    "aaaaaaa", 1,
-
-    "baa", 3,
-    "baaa", 2,
-    "baaaa", 2,
-    "baaaaaa", 2,
-    "baaaaaaa", 2,
-}
-
-sorted(a.items(), key=lambda kv:(kv[1], ))

vocab/gpt_neox_chinese_v1/to_v2/test2.py
DELETED
@@ -1,42 +0,0 @@
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(tokenizer_path):
-    print("loading", tokenizer_path)
-    data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(tokenizer_path)
-    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-    return data, tokenizer
-
-
-def append_token(word_list, base_tokenizer, unused_ids=None):
-    """
-    append token to the end of vocab
-    """
-    new_vocab = set()
-    new_merges = set()
-
-    data, base_tokenizer = base_tokenizer
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
-
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        if len(encoding.ids) == 1:
-            continue
-
-        if len(encoding.ids) >= 4:
-            print("[ERROR]: encoding不能超过4", word, encoding)
-
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        if "\u00e6\u00a5\u0143" in tokens:
-            print(word)
-
-add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-add_words = [token for token in add_tokens if len(token) > 1]
-new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
-
-append_token(add_words, new_tokenizer)

vocab/gpt_neox_chinese_v1/to_v2/test_oov.py
DELETED
@@ -1,69 +0,0 @@
-from tokenizers import Tokenizer
-
-tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-
-def get_oov():
-
-    f_out = open("oov.txt", "w", encoding="utf-8")
-    all_words = open("../../vocab.freq.zh.txt", "r", encoding="utf-8")
-    for line in all_words:
-        word, count = line.strip().split("\t")
-        if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
-            continue
-
-        encoding = tokenizer.encode(word)
-        if len(encoding.ids) > 1:
-            f_out.write(line)
-
-
-def build_vocab():
-    pass
-
-
-
-def convert_oov_to_merges():
-    """Split each word into merge groups; every merge must have exactly two parts,
-    for example
-    承担 -> 承 担
-    天津市 -> 天津 市
-    社会保障 -> 社会 保障
-    的一部分 -> 的 一部分 -> 一 部分
-    """
-    all_tokens_and_counts = [line.strip().split("\t") for line in open("oov.txt", "r", encoding="utf-8")]
-    all_tokens = [token for token,count in all_tokens_and_counts if int(count) > 2]  # appears in at least 3 dictionaries
-    len1 = [token for token in all_tokens if len(token) == 1]
-    len2 = [token for token in all_tokens if len(token) == 2]
-    len3 = [token for token in all_tokens if len(token) == 3]
-    len4 = [token for token in all_tokens if len(token) == 4]
-    print(len(len1), len(len2), len(len3), len(len4))
-
-    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
-    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
-
-
-    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
-        for token in len1:
-            f_out.write(token + "\n")
-        for token in len2[:20000]:
-            f_out.write(token + "\n")
-            # f_out.write(token[0] + " " + token[1] + "\n")
-
-    # for token in len3:
-    #     idx = -1
-    #     for part in len2:
-    #         if part in token:
-    #             idx = token.find(part)
-    #             break
-    #     if idx == -1:
-    #         print("not found", token)
-    #     elif idx == 0:
-    #         f_out.write(token[0] + " " + token[1:] + "\n")
-    #     else:
-    #         f_out.write(token[:2] + " " + token[2] + "\n")
-
-
-
-
-
-get_oov()
-convert_oov_to_merges()

vocab/gpt_neox_chinese_v1/to_v2/test_queue.py
DELETED
@@ -1,20 +0,0 @@
-
-from queue import Queue
-
-q = Queue(maxsize=0)
-
-# put items into the queue
-q.put(0)
-q.put(1)
-q.put(2)
-
-# print all items currently in the queue
-print(q.queue)
-# remove an item from the queue and return it
-q.get()
-# print all items in the queue again
-print(q.queue)
-
-for i in range(10):
-    print(q.get(), q.qsize())
-

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt
DELETED
The diff for this file is too large to render.

vocab/gpt_neox_chinese_v1/tokenizer/__init__.py
DELETED
@@ -1,16 -0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from .tokenizer import build_tokenizer

vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py
DELETED
@@ -1,368 +0,0 @@
-# Copyright (c) 2021, EleutherAI
-# This file is based on code by the authors denoted below and has been modified from its original version.
-#
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tokenization classes for OpenAI GPT."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import sys
-import json
-import logging
-import os
-import regex as re
-from io import open
-
-from functools import lru_cache
-
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-}
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    "gpt2": 1024,
-}
-
-VOCAB_NAME = "vocab.json"
-MERGES_NAME = "merges.txt"
-SPECIAL_TOKENS_NAME = "special_tokens.txt"
-
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    _chr = unichr if sys.version_info[0] == 2 else chr
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [_chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class GPT2Tokenizer(object):
-    """
-    GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level BPE
-    """
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
-    ):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(
-                pretrained_model_name_or_path, SPECIAL_TOKENS_NAME
-            )
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info(
-                    "loading special tokens file {}".format(special_tokens_file)
-                )
-        # redirect to the cache, if necessary
-        try:
-            from .file_utils import cached_path
-
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            logger.error(
-                "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                "at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    pretrained_model_name_or_path,
-                    vocab_file,
-                    merges_file,
-                )
-            )
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info(
-                "loading vocabulary file {} from cache at {}".format(
-                    vocab_file, resolved_vocab_file
-                )
-            )
-            logger.info(
-                "loading merges file {} from cache at {}".format(
-                    merges_file, resolved_merges_file
-                )
-            )
-        if (
-            pretrained_model_name_or_path
-            in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
-        ):
-            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
-                pretrained_model_name_or_path
-            ]
-            kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and "special_tokens" not in kwargs:
-            special_tokens = (
-                open(special_tokens_file, encoding="utf-8").read().split("\n")[:-1]
-            )
-        else:
-            special_tokens = kwargs.pop("special_tokens", [])
-        tokenizer = cls(
-            resolved_vocab_file,
-            resolved_merges_file,
-            special_tokens=special_tokens,
-            *inputs,
-            **kwargs
-        )
-        return tokenizer
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        special_tokens=None,
-        max_len=None,
-    ):
-        self.max_len = max_len if max_len is not None else int(1e12)
-        self.encoder = json.load(open(vocab_file))
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding="utf-8").read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for
-        # capitalized versions of contractions
-        self.pat = re.compile(
-            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-        )
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """Add a list of additional tokens to the encoder.
-        The additional tokens are indexed starting from the last index of the
-        current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict(
-            (tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)
|
| 221 |
-
)
|
| 222 |
-
self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
|
| 223 |
-
logger.info("Special tokens {}".format(self.special_tokens))
|
| 224 |
-
|
| 225 |
-
@lru_cache(maxsize=131072)
|
| 226 |
-
def bpe(self, token):
|
| 227 |
-
word = tuple(token)
|
| 228 |
-
pairs = get_pairs(word)
|
| 229 |
-
|
| 230 |
-
if not pairs:
|
| 231 |
-
return token
|
| 232 |
-
|
| 233 |
-
while True:
|
| 234 |
-
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
|
| 235 |
-
if bigram not in self.bpe_ranks:
|
| 236 |
-
break
|
| 237 |
-
first, second = bigram
|
| 238 |
-
new_word = []
|
| 239 |
-
i = 0
|
| 240 |
-
while i < len(word):
|
| 241 |
-
try:
|
| 242 |
-
j = word.index(first, i)
|
| 243 |
-
new_word.extend(word[i:j])
|
| 244 |
-
i = j
|
| 245 |
-
except BaseException:
|
| 246 |
-
new_word.extend(word[i:])
|
| 247 |
-
break
|
| 248 |
-
|
| 249 |
-
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
|
| 250 |
-
new_word.append(first + second)
|
| 251 |
-
i += 2
|
| 252 |
-
else:
|
| 253 |
-
new_word.append(word[i])
|
| 254 |
-
i += 1
|
| 255 |
-
new_word = tuple(new_word)
|
| 256 |
-
word = new_word
|
| 257 |
-
if len(word) == 1:
|
| 258 |
-
break
|
| 259 |
-
else:
|
| 260 |
-
pairs = get_pairs(word)
|
| 261 |
-
word = " ".join(word)
|
| 262 |
-
return word
|
| 263 |
-
|
| 264 |
-
def tokenize(self, text):
|
| 265 |
-
"""Tokenize a string."""
|
| 266 |
-
bpe_tokens = []
|
| 267 |
-
for token in re.findall(self.pat, text):
|
| 268 |
-
if sys.version_info[0] == 2:
|
| 269 |
-
token = "".join(self.byte_encoder[ord(b)] for b in token)
|
| 270 |
-
else:
|
| 271 |
-
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
|
| 272 |
-
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
|
| 273 |
-
return bpe_tokens
|
| 274 |
-
|
| 275 |
-
def convert_tokens_to_ids(self, tokens):
|
| 276 |
-
"""Converts a sequence of tokens into ids using the vocab."""
|
| 277 |
-
ids = []
|
| 278 |
-
if isinstance(tokens, str) or (
|
| 279 |
-
sys.version_info[0] == 2 and isinstance(tokens, unicode)
|
| 280 |
-
):
|
| 281 |
-
if tokens in self.special_tokens:
|
| 282 |
-
return self.special_tokens[tokens]
|
| 283 |
-
else:
|
| 284 |
-
return self.encoder.get(tokens, 0)
|
| 285 |
-
for token in tokens:
|
| 286 |
-
if token in self.special_tokens:
|
| 287 |
-
ids.append(self.special_tokens[token])
|
| 288 |
-
else:
|
| 289 |
-
ids.append(self.encoder.get(token, 0))
|
| 290 |
-
if len(ids) > self.max_len:
|
| 291 |
-
logger.warning(
|
| 292 |
-
"Token indices sequence length is longer than the specified maximum "
|
| 293 |
-
" sequence length for this OpenAI GPT model ({} > {}). Running this"
|
| 294 |
-
" sequence through the model will result in indexing errors".format(
|
| 295 |
-
len(ids), self.max_len
|
| 296 |
-
)
|
| 297 |
-
)
|
| 298 |
-
return ids
|
| 299 |
-
|
| 300 |
-
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
| 301 |
-
"""Converts a sequence of ids in BPE tokens using the vocab."""
|
| 302 |
-
tokens = []
|
| 303 |
-
for i in ids:
|
| 304 |
-
if i in self.special_tokens_decoder:
|
| 305 |
-
if not skip_special_tokens:
|
| 306 |
-
tokens.append(self.special_tokens_decoder[i])
|
| 307 |
-
else:
|
| 308 |
-
tokens.append(self.decoder[i])
|
| 309 |
-
return tokens
|
| 310 |
-
|
| 311 |
-
def encode(self, text):
|
| 312 |
-
return self.convert_tokens_to_ids(self.tokenize(text))
|
| 313 |
-
|
| 314 |
-
def decode(self, tokens):
|
| 315 |
-
text = "".join([self.decoder[token] for token in tokens])
|
| 316 |
-
text = bytearray([self.byte_decoder[c] for c in text]).decode(
|
| 317 |
-
"utf-8", errors=self.errors
|
| 318 |
-
)
|
| 319 |
-
return text
|
| 320 |
-
|
| 321 |
-
def save_vocabulary(self, vocab_path):
|
| 322 |
-
"""Save the tokenizer vocabulary and merge files to a directory."""
|
| 323 |
-
if not os.path.isdir(vocab_path):
|
| 324 |
-
logger.error(
|
| 325 |
-
"Vocabulary path ({}) should be a directory".format(vocab_path)
|
| 326 |
-
)
|
| 327 |
-
return
|
| 328 |
-
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
| 329 |
-
merge_file = os.path.join(vocab_path, MERGES_NAME)
|
| 330 |
-
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
|
| 331 |
-
|
| 332 |
-
with open(vocab_file, "w", encoding="utf-8") as f:
|
| 333 |
-
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
| 334 |
-
|
| 335 |
-
index = 0
|
| 336 |
-
with open(merge_file, "w", encoding="utf-8") as writer:
|
| 337 |
-
writer.write("#version: 0.2\n")
|
| 338 |
-
for bpe_tokens, token_index in sorted(
|
| 339 |
-
self.bpe_ranks.items(), key=lambda kv: kv[1]
|
| 340 |
-
):
|
| 341 |
-
if index != token_index:
|
| 342 |
-
logger.warning(
|
| 343 |
-
"Saving vocabulary to {}: BPE merge indices are not consecutive."
|
| 344 |
-
" Please check that the tokenizer is not corrupted!".format(
|
| 345 |
-
merge_file
|
| 346 |
-
)
|
| 347 |
-
)
|
| 348 |
-
index = token_index
|
| 349 |
-
writer.write(" ".join(bpe_tokens) + "\n")
|
| 350 |
-
index += 1
|
| 351 |
-
|
| 352 |
-
index = len(self.encoder)
|
| 353 |
-
with open(special_tokens_file, "w", encoding="utf-8") as writer:
|
| 354 |
-
for token, token_index in sorted(
|
| 355 |
-
self.special_tokens.items(), key=lambda kv: kv[1]
|
| 356 |
-
):
|
| 357 |
-
if index != token_index:
|
| 358 |
-
logger.warning(
|
| 359 |
-
"Saving special tokens vocabulary to {}: BPE indices are not consecutive."
|
| 360 |
-
" Please check that the tokenizer is not corrupted!".format(
|
| 361 |
-
special_tokens_file
|
| 362 |
-
)
|
| 363 |
-
)
|
| 364 |
-
index = token_index
|
| 365 |
-
writer.write(token + "\n")
|
| 366 |
-
index += 1
|
| 367 |
-
|
| 368 |
-
return vocab_file, merge_file, special_tokens_file
|
|
|
|
|
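The byte-level trick above is the crux of `bytes_to_unicode`: each of the 256 byte values is given a printable unicode alias, so the BPE merge table never has to contain raw whitespace or control bytes and decoding stays lossless. A self-contained sketch of that property (the helper mirrors the deleted function; the sample string is only illustrative):

```python
# Sketch of the byte <-> unicode aliasing used by the GPT2Tokenizer above.
def byte_alias_table():
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:          # give the "unprintable" bytes fresh aliases above U+0100
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

byte_encoder = byte_alias_table()
byte_decoder = {v: k for k, v in byte_encoder.items()}

text = "hello  world\t宽带"      # spaces, tabs and CJK all become printable aliases
aliased = "".join(byte_encoder[b] for b in text.encode("utf-8"))
restored = bytearray(byte_decoder[c] for c in aliased).decode("utf-8")

assert len(byte_encoder) == 256   # every byte value has exactly one alias
assert restored == text           # the mapping is lossless
```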
vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py
DELETED
@@ -1,402 +0,0 @@

```python
# Copyright (c) 2021, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron tokenizers."""

from abc import ABC
from abc import abstractmethod

from tokenizers import Tokenizer
from transformers import GPT2Tokenizer, GPT2TokenizerFast
import numpy as np
import sentencepiece as spm
from typing import List, Union
from .gpt2_tokenization import GPT2Tokenizer


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():
        assert args.vocab_file is not None
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type.lower() == "SPMTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = SentencePieceTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "HFTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = HFTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower():
        if args.vocab_file is None:
            print(
                "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer"
            )
        tokenizer = HFGPT2Tokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower():
        tokenizer = CharLevelTokenizer(vocab_size=512)
    elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = TiktokenTokenizer(args.vocab_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
        )

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)

    return tokenizer


def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad vocab size so it is divisible by model parallel size and
    still having GPU friendly size."""

    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(
            " > padded vocab (size: {}) with {} dummy tokens "
            "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
            flush=True,
        )
    return after


class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError(
            "detokenizer is not implemented for {} " "tokenizer".format(self.name)
        )

    @property
    def cls(self):
        raise NotImplementedError(
            "CLS is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def sep(self):
        raise NotImplementedError(
            "SEP is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def pad(self):
        raise NotImplementedError(
            "PAD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def eod(self):
        raise NotImplementedError(
            "EOD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def mask(self):
        raise NotImplementedError(
            "MASK is not provided for {} " "tokenizer".format(self.name)
        )


class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        name = "GPT2 BPE"
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(
            vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
        )
        self.eod_id = self.tokenizer.encoder["<|endoftext|>"]

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class SentencePieceTokenizer(AbstractTokenizer):
    """Designed to Integrate SP's Tokenizer."""

    def __init__(self, vocab_file):
        name = "SPM"
        super().__init__(name)

        self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file)
        self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_piece_size()

    @property
    def vocab(self):
        return {
            self.tokenizer.id_to_piece(idx): idx
            for idx in range(self.tokenizer.get_piece_size())
        }

    @property
    def inv_vocab(self):
        return {
            idx: self.tokenizer.id_to_piece(idx)
            for idx in range(self.tokenizer.get_piece_size())
        }

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HFTokenizer(AbstractTokenizer):
    """Designed to Integrate HF's Tokenizer library."""

    def __init__(self, vocab_file):
        name = "HFTokenizer"
        super().__init__(name)

        self.tokenizer = Tokenizer.from_file(vocab_file)
        self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text).ids

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HFGPT2Tokenizer(AbstractTokenizer):
    """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF"""

    def __init__(self, vocab_file=None, fast=True):
        name = "HFGPT2Tokenizer"
        if fast:
            name += "Fast"
        super().__init__(name)
        if vocab_file is None:
            vocab_file = "gpt2"
        if fast:
            self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
        else:
            self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file)

        self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
        self.eod_id = self.tokenizer.eos_token_id
        self.pad_id = self.tokenizer.pad_token_id

    @property
    def vocab_size(self):
        return len(self.tokenizer)

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer._tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        if isinstance(text_batch, str):
            text_batch = [text_batch]
        return [self.tokenize(t) for t in text_batch]

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class CharLevelTokenizer(AbstractTokenizer):
    """Character Level Tokenizer"""

    def __init__(self, vocab_size):
        name = "CharLevelTokenizer"
        super().__init__(name)
        self._vocab_size = vocab_size
        self.eod_id = 0
        self.pad_id = 1

    def clamp(self, n):
        return max(32, min(n, self.vocab_size))

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def vocab(self):
        raise NotImplementedError

    @property
    def inv_vocab(self):
        raise NotImplementedError

    def decode_token(self, token: int):
        return str(chr(self.clamp(token)))

    def tokenize(self, text: str):
        return list(np.fromstring(text, dtype=np.uint8))

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        if isinstance(text_batch, list):
            return [self.tokenize(s) for s in text_batch]
        else:
            return self.tokenize(text_batch)

    def detokenize(self, token_ids):
        return "".join(list(map(self.decode_token, token_ids)))

    @property
    def eod(self):
        return self.eod_id


class TiktokenTokenizer(AbstractTokenizer):
    """Tokenizer from OpenAI's tiktoken implementation"""

    def __init__(self, vocab_file):
        try:
            import tiktoken
        except ModuleNotFoundError:
            print("Please install tiktoken: (https://github.com/openai/tiktoken)")
            raise Exception

        name = "TiktokenTokenizer"
        super().__init__(name)

        self.tokenizer = tiktoken.get_encoding(vocab_file)
        self.eod_id = self.tokenizer.eot_token
        self.pad_id = None

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    @property
    def vocab(self):
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access."
        )

    @property
    def inv_vocab(self):
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access. \
                To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ."
        )

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)  # , allowed_special="all")

    def tokenize_batch(self, text_batch: List[str]):
        return self.tokenizer.encode_batch(text_batch, allowed_special="all")

    def detokenize(self, token_ids):
        return self.tokenizer.decode(tokens=token_ids, errors="strict")

    @property
    def eod(self):
        return self.eod_id

    @property
    def pad(self):
        raise NotImplementedError
```
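The padding rule in `_vocab_size_with_padding` above is worth spelling out: the vocabulary is grown to the next multiple of `make_vocab_size_divisible_by * model_parallel_size` so the embedding table splits evenly across model-parallel ranks. A standalone sketch of that rounding (the concrete numbers below are examples only, not values taken from this commit):

```python
# Sketch of the rounding performed by _vocab_size_with_padding.
def padded_vocab_size(orig_vocab_size: int, divisible_by: int, model_parallel_size: int) -> int:
    multiple = divisible_by * model_parallel_size
    after = orig_vocab_size
    while after % multiple != 0:      # grow until divisible (GPU-friendly size)
        after += 1
    return after

# Example numbers only: a 50,277-token vocab padded for divisible_by=128 and 2-way model parallelism.
assert padded_vocab_size(50277, 128, 2) == 50432   # 50432 = 197 * 256
assert padded_vocab_size(52000, 128, 1) == 52096   # 52096 = 407 * 128
```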
vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py
DELETED
@@ -1,126 +0,0 @@

```python
# Copyright (c) 2021, EleutherAI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Assumes a dataset of jsonl files in the same format as the neox training set.
"""

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
from tokenizers.normalizers import NFKC

from glob import glob
import os
import json
import argparse


def load_jsonl(input_path, quiet=True) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line.rstrip("\n|\r")))
    if not quiet:
        print("Loaded {} records from {}".format(len(data), input_path))
    return data


def json_iterator(input_dir, text_key="text"):
    all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
    for j in all_jsonls:
        data = load_jsonl(j)
        for doc in data:
            yield doc[text_key]


def train_tokenizer(
    input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
):
    """
    Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train.
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """

    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
    )
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And Save it
    tokenizer.save(save_path, pretty=True)
    print(f"Tokenizer saved at {save_path}")


def parse_args():
    parser = argparse.ArgumentParser(
        description="script for training a multilingual "
        "HF tokenizer on CC dumps with upweighting for low resource languages"
    )
    parser.add_argument(
        "--json_input_dir",
        type=str,
        help="Path to folder containing tokenizer training data in jsonl format",
    )
    parser.add_argument(
        "--tokenizer_output_path",
        type=str,
        help="Path to which your trained tokenizer will be saved (should end in .json)",
    )
    parser.add_argument(
        "--tokenizer_type",
        type=str,
        help="type of tokenizer to train, currently only BPE is supported",
        choices=["BPE"],
        default=["BPE"],
    )
    parser.add_argument(
        "-v",
        "--vocab_size",
        help="vocabulary size of tokenizer, default=52k",
        type=int,
        default=52000,
    )
    return parser.parse_args()


if __name__ == "__main__":

    args = parse_args()

    train_tokenizer(
        args.json_input_dir,
        save_path=args.tokenizer_output_path,
        tokenizer_type=args.tokenizer_type,
        vocab_size=args.vocab_size,
    )
```
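For reference, a hypothetical call to the deleted training helper (the paths and the 52k vocab size are placeholders, not values from this commit; it assumes the script is importable from the working directory):

```python
# Hypothetical usage of the deleted train_tokenizer helper above.
# The input directory is expected to contain *.jsonl files whose records
# carry the training text under a "text" key.
from train_tokenizer import train_tokenizer

train_tokenizer(
    input_dir="./data/corpus_jsonl",        # placeholder path
    save_path="./tokenizer_custom.json",    # placeholder path
    tokenizer_type="BPE",                   # the only supported type
    vocab_size=52000,
)
```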
vocab/gpt_neox_chinese_v1/trouble-shooting.md
DELETED
@@ -1,22 +0,0 @@

## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3


## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !

```
The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
```

Cause: tokens such as 50254 are not defined in the vocab; they are only defined in `added_tokens`.

## ss
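A quick way to see which ids trigger that warning is to diff the base vocab against `added_tokens` in the tokenizer JSON. A sketch (the file name is illustrative; it assumes the Hugging Face `tokenizers` JSON layout with `model.vocab` and a top-level `added_tokens` list):

```python
# Sketch: report ids that exist only in added_tokens (the "holes" described above).
import json

with open("20B_tokenizer_chinese.json", encoding="utf-8") as f:   # illustrative path
    tok = json.load(f)

vocab_ids = set(tok["model"]["vocab"].values())
added_ids = {t["id"] for t in tok.get("added_tokens", [])}

only_in_added = sorted(added_ids - vocab_ids)        # e.g. 50254-50256 in the report above
gaps = [i for i in range(max(vocab_ids | added_ids) + 1)
        if i not in vocab_ids and i not in added_ids]

print("defined only in added_tokens:", only_in_added)
print("ids missing everywhere:", gaps)
```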
vocab/moss/__init__.py
CHANGED
@@ -1,6 +1,6 @@

```diff
 
 import os
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
```
vocab/text_davinci_003/__init__.py
CHANGED
@@ -1,70 +1,25 @@

```diff
 """
-
-"""
-
-import tiktoken
-from tiktoken import Encoding
-from utils.log_util import logger
-
-tokenizer = tiktoken.encoding_for_model('text-davinci-003')
-tokenizer.vocab_size = tokenizer.n_vocab
-
-tokenizer.comments = ""
-tokenizer.reversible = True
-
-    """
-    The default decode may raise errors; see decode_test.py.
-    skip_special_tokens is there for compatibility with hf_tokenizer.
-    """
-    try:
-        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-    except:
-        decode_str = "null"
-    return decode_str
-
-""
-
-"""
-    try:
-        return tokenizer.decode_tokens_bytes(tokens)
-    except:
-        # Why return None? See zh_util.py
-        # 16 unused ids: 100256, 100261-100275
-        return [None for token in tokens]
-
-def get_vocab(self, token_type="str"):
-    """Returns vocab as a dict
-    :param token_type: ["str", "byte"]
-    :return:
-    """
-    vocab = {}
-    key_error_list = []
-    unicode_decode_error_list = []
-    for i in range(self.vocab_size):
-        try:
-            token_byte = self.convert_ids_to_tokens([i])[0]
-            if token_byte is None:
-                continue
-            # token_str = token_byte.decode("utf-8")
-            vocab[token_byte] = i
-
-
-            vocab[token_byte] = i
-
-
-    logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
-    return vocab
-
-# tiktoken patch
-Encoding.decode = decode
-Encoding.convert_ids_to_tokens = convert_ids_to_tokens
-Encoding.get_vocab = get_vocab
+,请
+
+## tiktoken API
+
+tokens = enc.encode("hello world")
+assert enc.decode(tokens) == "hello world"
+assert enc.decode_bytes(tokens) == b"hello world"
+assert enc.decode_tokens_bytes(tokens) == [b"hello", b" world"]
+
+decode_single_token_bytes
+"""
+
+import tiktoken
+import tokenizer.tiktoken_patch
+
+tokenizer = tiktoken.encoding_for_model('text-davinci-003')
+tokenizer.vocab_size = tokenizer.n_vocab
+
+tokenizer.comments = ""
+tokenizer.reversible = True
```
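With the patch module imported, the tiktoken `Encoding` for text-davinci-003 can be queried much like a Hugging Face tokenizer. A sketch of the intended usage: only `encode`, `decode` and `n_vocab` are stock tiktoken API; `convert_ids_to_tokens` and `get_vocab` are presumably the same monkey-patched helpers that previously lived in this file and now sit in tokenizer/tiktoken_patch.py.

```python
# Sketch of how the patched encoding is meant to be used (helpers assumed from tiktoken_patch).
import tiktoken
import tokenizer.tiktoken_patch  # noqa: F401  -- patches Encoding with decode/convert_ids_to_tokens/get_vocab

enc = tiktoken.encoding_for_model("text-davinci-003")

ids = enc.encode("hello world")
print(enc.n_vocab)                      # vocabulary size reported by tiktoken
print(enc.decode(ids))                  # patched decode: returns a fallback string instead of raising
print(enc.convert_ids_to_tokens(ids))   # token bytes; None for the handful of unused ids
```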