| """ | |
| special_symbols: https://github.com/google/sentencepiece/blob/master/doc/special_symbols.md | |
| emoji: | |
| """ | |
import sys

# From https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L55
# What does this do?
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr  # Python 2/3 compatibility
    # Bytes that are already printable, non-space characters keep their own code point:
    # '!'..'~', '¡'..'¬', '®'..'ÿ'.
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
        list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    # Every remaining byte (control characters, space, etc.) is shifted to an
    # unused code point above 255, so its stand-in character is always printable.
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    # Mapping: raw byte value (0-255) -> single printable unicode character.
    return dict(zip(bs, cs))
aa = bytes_to_unicode()
print(aa)
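# Round-trip sketch answering the question above: the mapping lets arbitrary
# UTF-8 bytes be spelled with 256 printable unicode characters, so byte-level
# BPE never needs an UNK token. byte_encoder / byte_decoder are local names
# for illustration only.
byte_encoder = bytes_to_unicode()
byte_decoder = {c: b for b, c in byte_encoder.items()}

text = "hello 你好 🤗"
# Encode: raw UTF-8 bytes -> printable stand-in characters (what BPE actually sees).
visible = "".join(byte_encoder[b] for b in text.encode("utf-8"))
print(visible)

# Decode: stand-in characters -> bytes -> original string.
restored = bytes(byte_decoder[c] for c in visible).decode("utf-8")
assert restored == text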