Upload tokenizer

Files changed (4) hide show

special_tokens_map.json ADDED Viewed

+{
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer_utils.CryptGPTTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "max_length": 1024,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "stride": 0,
+  "tokenizer_class": "CryptGPTTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first"
+}

tokenizer_utils.py ADDED Viewed

+from transformers import PreTrainedTokenizerFast
+class CryptGPTTokenizer(PreTrainedTokenizerFast):
+    @staticmethod
+    def clean_up_tokenization(out_string):
+        return out_string.replace(' ', "")
+CryptGPTTokenizer.register_for_auto_class("AutoTokenizer")