| """ | |
| """ | |
from vocab.gpt2 import tokenizer
# from transformers import GPT2Tokenizer
# # tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

print(tokenizer.bpe('中国'))
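
# For comparison, a sketch assuming the vocab.gpt2 wrapper mirrors the
# transformers GPT2Tokenizer API: tokenize() first maps the UTF-8 bytes of
# '中国' through the byte-to-unicode table and only then applies the BPE
# merges, whereas bpe() above receives the raw string directly.
print(tokenizer.tokenize('中国'))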
#
print(tokenizer.encode("Hello world"))  # default: add_prefix_space=False
print(tokenizer.encode("Hello world", add_prefix_space=True))
print(tokenizer.encode(" Hello world"))
print(tokenizer.encode("Hello world", add_special_tokens=True))  # add_special_tokens has no effect for GPT-2
print(tokenizer.encode(text='中国\n', add_special_tokens=False))
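
# Round-trip sketch, assuming decode() and convert_ids_to_tokens() are exposed
# as in the transformers GPT2Tokenizer API.
ids = tokenizer.encode(" Hello world")
print(tokenizer.convert_ids_to_tokens(ids))  # leading space shows up as the 'Ġ' byte marker
print(tokenizer.decode(ids))                 # byte-level BPE decodes back to the original text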
#
# print(tokenizer.encode(text='中国', add_special_tokens=False))
#
# print(tokenizer.tokenize('I love Salah and salad'))