AhmedGaver committed (verified)
Commit fa11683 · 1 parent: 49a7f8d

Upload v2 of URL classifier model

Metrics:
- Test Accuracy: 0.9959
- Test F1: 0.9959
- Test FPR: 0.0061
- Test FNR: 0.0024
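
For context on how numbers like these are typically derived: a minimal sketch, assuming a binary malicious-vs-benign labeling (1 = malicious) and scikit-learn. The arrays below are toy stand-ins, not the real test split.

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Toy stand-ins for the held-out test labels and predictions.
y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 0, 1, 1, 0, 0])

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred))
print("FPR:", fp / (fp + tn))  # benign URLs flagged as malicious
print("FNR:", fn / (fn + tp))  # malicious URLs missed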

Files changed (3)
  1. special_tokens_map.json +6 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +42 -10
special_tokens_map.json CHANGED
@@ -1,4 +1,10 @@
 {
+  "additional_special_tokens": [
+    "[DOMAIN]",
+    "[PATH]",
+    "[IP]",
+    "[IPv6]"
+  ],
   "cls_token": {
     "content": "[CLS]",
     "lstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "[PAD]",
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -9,7 +9,7 @@
       "special": true
     },
     "1": {
-      "content": "[UNK]",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -39,27 +39,59 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "5": {
+      "content": "[DOMAIN]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "[PATH]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "[IP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "[IPv6]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
+  "additional_special_tokens": [
+    "[DOMAIN]",
+    "[PATH]",
+    "[IP]",
+    "[IPv6]"
+  ],
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
-  "max_len": 64,
   "max_length": 64,
-  "model_max_length": 64,
-  "never_split": null,
+  "model_max_length": 1024,
   "pad_to_multiple_of": null,
   "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
   "sep_token": "[SEP]",
   "stride": 0,
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
+  "tokenizer_class": "PreTrainedTokenizerFast",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"