jahyungu commited on
Commit
31c6b33
·
verified ·
1 Parent(s): 45488cf

Training in progress, epoch 1

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '
2
+ ' + message['content'] + '<|end|>
3
+ ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
4
+ ' }}{% else %}{{ eos_token }}{% endif %}
cl100k_base.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Phi3SmallForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout_prob": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
9
+ "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
10
+ "AutoModelForSequenceClassification": "modeling_phi3_small.Phi3SmallForSequenceClassification",
11
+ "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
12
+ },
13
+ "blocksparse_block_size": 64,
14
+ "blocksparse_homo_head_pattern": false,
15
+ "blocksparse_num_local_blocks": 16,
16
+ "blocksparse_triton_kernel_block_size": 64,
17
+ "blocksparse_vert_stride": 8,
18
+ "bos_token_id": 100257,
19
+ "dense_attention_every_n_layers": 2,
20
+ "dummy_token_indices": [
21
+ 100256,
22
+ 100258,
23
+ 100259,
24
+ 100260,
25
+ 100264,
26
+ 100265,
27
+ 100267,
28
+ 100268,
29
+ 100269,
30
+ 100270,
31
+ 100271,
32
+ 100272,
33
+ 100273,
34
+ 100274,
35
+ 100275,
36
+ 100276,
37
+ 100277,
38
+ 100278,
39
+ 100279,
40
+ 100280,
41
+ 100281,
42
+ 100282,
43
+ 100283,
44
+ 100284,
45
+ 100285,
46
+ 100286,
47
+ 100287,
48
+ 100288,
49
+ 100289,
50
+ 100290,
51
+ 100291,
52
+ 100292,
53
+ 100293,
54
+ 100294,
55
+ 100295,
56
+ 100296,
57
+ 100297,
58
+ 100298,
59
+ 100299,
60
+ 100300,
61
+ 100301,
62
+ 100302,
63
+ 100303,
64
+ 100304,
65
+ 100305,
66
+ 100306,
67
+ 100307,
68
+ 100308,
69
+ 100309,
70
+ 100310,
71
+ 100311,
72
+ 100312,
73
+ 100313,
74
+ 100314,
75
+ 100315,
76
+ 100316,
77
+ 100317,
78
+ 100318,
79
+ 100319,
80
+ 100320,
81
+ 100321,
82
+ 100322,
83
+ 100323,
84
+ 100324,
85
+ 100325,
86
+ 100326,
87
+ 100327,
88
+ 100328,
89
+ 100329,
90
+ 100330,
91
+ 100331,
92
+ 100332,
93
+ 100333,
94
+ 100334,
95
+ 100335,
96
+ 100336,
97
+ 100337,
98
+ 100338,
99
+ 100339,
100
+ 100340,
101
+ 100341,
102
+ 100342,
103
+ 100343,
104
+ 100344,
105
+ 100345,
106
+ 100346,
107
+ 100347,
108
+ 100348,
109
+ 100349,
110
+ 100350,
111
+ 100351
112
+ ],
113
+ "embedding_dropout_prob": 0.1,
114
+ "eos_token_id": 100257,
115
+ "ff_dim_multiplier": null,
116
+ "ff_intermediate_size": 14336,
117
+ "ffn_dropout_prob": 0.1,
118
+ "gegelu_limit": 20.0,
119
+ "gegelu_pad_to_256": true,
120
+ "hidden_act": "gegelu",
121
+ "hidden_size": 4096,
122
+ "initializer_range": 0.02,
123
+ "layer_norm_epsilon": 1e-05,
124
+ "max_position_embeddings": 8192,
125
+ "model_type": "phi3small",
126
+ "mup_attn_multiplier": 1.0,
127
+ "mup_embedding_multiplier": 10.0,
128
+ "mup_use_scaling": true,
129
+ "mup_width_multiplier": 8.0,
130
+ "num_attention_heads": 32,
131
+ "num_hidden_layers": 32,
132
+ "num_key_value_heads": 8,
133
+ "pad_sequence_to_multiple_of_64": true,
134
+ "pad_token_id": 100257,
135
+ "reorder_and_upcast_attn": false,
136
+ "rope_embedding_base": 1000000,
137
+ "rope_position_scale": 1.0,
138
+ "rope_scaling": null,
139
+ "torch_dtype": "bfloat16",
140
+ "transformers_version": "4.55.0",
141
+ "use_cache": true,
142
+ "vocab_size": 100352
143
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce0e92488c26f51c73f8b834630aad17cc7ee43ea1b20110366a43afb66554d
3
+ size 4832944248
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:854b1b634f813523a0ffd99234cdc948cd96843aff8c4b7e8ac6d989506ae2d2
3
+ size 4799609488
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aecef4660fffd2dd0de236c479f311d6a8093092d0b44337aa6235093273e17
3
+ size 4799609504
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e8d683a438b46cc25275619434acab732027a068e376424cf3b78002c2eda1
3
+ size 352437304
model.safetensors.index.json ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 7392272384,
4
+ "total_size": 14784552960
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.final_layernorm.bias": "model-00004-of-00004.safetensors",
9
+ "model.final_layernorm.weight": "model-00004-of-00004.safetensors",
10
+ "model.layers.0.input_layernorm.bias": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.dense.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.dense.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
21
+ "model.layers.0.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.input_layernorm.bias": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.dense.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.dense.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.1.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
34
+ "model.layers.1.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
35
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
36
+ "model.layers.10.input_layernorm.bias": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.dense.bias": "model-00002-of-00004.safetensors",
45
+ "model.layers.10.self_attn.dense.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.10.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
47
+ "model.layers.10.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.input_layernorm.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.11.self_attn.dense.bias": "model-00002-of-00004.safetensors",
58
+ "model.layers.11.self_attn.dense.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.11.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
60
+ "model.layers.11.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.input_layernorm.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
69
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.12.self_attn.dense.bias": "model-00002-of-00004.safetensors",
71
+ "model.layers.12.self_attn.dense.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.12.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
73
+ "model.layers.12.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.input_layernorm.bias": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.13.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
82
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.13.self_attn.dense.bias": "model-00002-of-00004.safetensors",
84
+ "model.layers.13.self_attn.dense.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.13.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.13.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.input_layernorm.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
93
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.14.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
95
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.14.self_attn.dense.bias": "model-00002-of-00004.safetensors",
97
+ "model.layers.14.self_attn.dense.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.14.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.14.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.input_layernorm.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.15.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
106
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.15.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
108
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.15.self_attn.dense.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.15.self_attn.dense.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.15.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
112
+ "model.layers.15.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.input_layernorm.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
117
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.16.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
119
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.16.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
121
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.16.self_attn.dense.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.16.self_attn.dense.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.16.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
125
+ "model.layers.16.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.input_layernorm.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.17.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
130
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.17.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
132
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.17.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
134
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.17.self_attn.dense.bias": "model-00002-of-00004.safetensors",
136
+ "model.layers.17.self_attn.dense.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.17.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.17.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.input_layernorm.bias": "model-00002-of-00004.safetensors",
141
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.18.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
143
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.18.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
145
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
146
+ "model.layers.18.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
147
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
148
+ "model.layers.18.self_attn.dense.bias": "model-00002-of-00004.safetensors",
149
+ "model.layers.18.self_attn.dense.weight": "model-00002-of-00004.safetensors",
150
+ "model.layers.18.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
151
+ "model.layers.18.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
153
+ "model.layers.19.input_layernorm.bias": "model-00002-of-00004.safetensors",
154
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
155
+ "model.layers.19.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
156
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
157
+ "model.layers.19.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
158
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.19.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
160
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.19.self_attn.dense.bias": "model-00002-of-00004.safetensors",
162
+ "model.layers.19.self_attn.dense.weight": "model-00002-of-00004.safetensors",
163
+ "model.layers.19.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
164
+ "model.layers.19.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
165
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
166
+ "model.layers.2.input_layernorm.bias": "model-00001-of-00004.safetensors",
167
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
168
+ "model.layers.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
169
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
170
+ "model.layers.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
171
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
172
+ "model.layers.2.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
173
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
174
+ "model.layers.2.self_attn.dense.bias": "model-00001-of-00004.safetensors",
175
+ "model.layers.2.self_attn.dense.weight": "model-00001-of-00004.safetensors",
176
+ "model.layers.2.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
177
+ "model.layers.2.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
178
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
179
+ "model.layers.20.input_layernorm.bias": "model-00003-of-00004.safetensors",
180
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.20.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.20.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
184
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.20.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.20.self_attn.dense.bias": "model-00002-of-00004.safetensors",
188
+ "model.layers.20.self_attn.dense.weight": "model-00002-of-00004.safetensors",
189
+ "model.layers.20.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
190
+ "model.layers.20.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
191
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
192
+ "model.layers.21.input_layernorm.bias": "model-00003-of-00004.safetensors",
193
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.21.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.21.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.21.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.21.self_attn.dense.bias": "model-00003-of-00004.safetensors",
201
+ "model.layers.21.self_attn.dense.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.21.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
203
+ "model.layers.21.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
205
+ "model.layers.22.input_layernorm.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.22.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
208
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.22.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.22.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.22.self_attn.dense.bias": "model-00003-of-00004.safetensors",
214
+ "model.layers.22.self_attn.dense.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.22.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
216
+ "model.layers.22.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
218
+ "model.layers.23.input_layernorm.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.23.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.23.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.23.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
225
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.23.self_attn.dense.bias": "model-00003-of-00004.safetensors",
227
+ "model.layers.23.self_attn.dense.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.23.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
229
+ "model.layers.23.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
231
+ "model.layers.24.input_layernorm.bias": "model-00003-of-00004.safetensors",
232
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.24.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.24.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.24.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
238
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.24.self_attn.dense.bias": "model-00003-of-00004.safetensors",
240
+ "model.layers.24.self_attn.dense.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.24.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.24.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
244
+ "model.layers.25.input_layernorm.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.25.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.25.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
249
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.25.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
251
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.25.self_attn.dense.bias": "model-00003-of-00004.safetensors",
253
+ "model.layers.25.self_attn.dense.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.25.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.25.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
257
+ "model.layers.26.input_layernorm.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.26.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.26.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
262
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.layers.26.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
264
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
265
+ "model.layers.26.self_attn.dense.bias": "model-00003-of-00004.safetensors",
266
+ "model.layers.26.self_attn.dense.weight": "model-00003-of-00004.safetensors",
267
+ "model.layers.26.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
268
+ "model.layers.26.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
270
+ "model.layers.27.input_layernorm.bias": "model-00003-of-00004.safetensors",
271
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
272
+ "model.layers.27.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
273
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.27.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
275
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.27.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
277
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.27.self_attn.dense.bias": "model-00003-of-00004.safetensors",
279
+ "model.layers.27.self_attn.dense.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.27.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
281
+ "model.layers.27.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
283
+ "model.layers.28.input_layernorm.bias": "model-00003-of-00004.safetensors",
284
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.28.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
286
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.28.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
288
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.28.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
290
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.28.self_attn.dense.bias": "model-00003-of-00004.safetensors",
292
+ "model.layers.28.self_attn.dense.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.28.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
294
+ "model.layers.28.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
296
+ "model.layers.29.input_layernorm.bias": "model-00003-of-00004.safetensors",
297
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.29.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
299
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.29.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
301
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.29.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
303
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.29.self_attn.dense.bias": "model-00003-of-00004.safetensors",
305
+ "model.layers.29.self_attn.dense.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.29.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
307
+ "model.layers.29.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
309
+ "model.layers.3.input_layernorm.bias": "model-00001-of-00004.safetensors",
310
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
312
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.3.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
316
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
317
+ "model.layers.3.self_attn.dense.bias": "model-00001-of-00004.safetensors",
318
+ "model.layers.3.self_attn.dense.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.3.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
320
+ "model.layers.3.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
322
+ "model.layers.30.input_layernorm.bias": "model-00003-of-00004.safetensors",
323
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.30.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
325
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.30.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
327
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.30.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
329
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
330
+ "model.layers.30.self_attn.dense.bias": "model-00003-of-00004.safetensors",
331
+ "model.layers.30.self_attn.dense.weight": "model-00003-of-00004.safetensors",
332
+ "model.layers.30.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
333
+ "model.layers.30.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
335
+ "model.layers.31.input_layernorm.bias": "model-00004-of-00004.safetensors",
336
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
337
+ "model.layers.31.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
338
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
339
+ "model.layers.31.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
340
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
341
+ "model.layers.31.post_attention_layernorm.bias": "model-00004-of-00004.safetensors",
342
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
343
+ "model.layers.31.self_attn.dense.bias": "model-00003-of-00004.safetensors",
344
+ "model.layers.31.self_attn.dense.weight": "model-00003-of-00004.safetensors",
345
+ "model.layers.31.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
346
+ "model.layers.31.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
347
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
348
+ "model.layers.4.input_layernorm.bias": "model-00001-of-00004.safetensors",
349
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
350
+ "model.layers.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
351
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.layers.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
353
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
354
+ "model.layers.4.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
355
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
356
+ "model.layers.4.self_attn.dense.bias": "model-00001-of-00004.safetensors",
357
+ "model.layers.4.self_attn.dense.weight": "model-00001-of-00004.safetensors",
358
+ "model.layers.4.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
359
+ "model.layers.4.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
360
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
361
+ "model.layers.5.input_layernorm.bias": "model-00001-of-00004.safetensors",
362
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
363
+ "model.layers.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
364
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
365
+ "model.layers.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
366
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
367
+ "model.layers.5.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
368
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
369
+ "model.layers.5.self_attn.dense.bias": "model-00001-of-00004.safetensors",
370
+ "model.layers.5.self_attn.dense.weight": "model-00001-of-00004.safetensors",
371
+ "model.layers.5.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
372
+ "model.layers.5.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
373
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
374
+ "model.layers.6.input_layernorm.bias": "model-00001-of-00004.safetensors",
375
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
376
+ "model.layers.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
377
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
378
+ "model.layers.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
379
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
380
+ "model.layers.6.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
381
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
382
+ "model.layers.6.self_attn.dense.bias": "model-00001-of-00004.safetensors",
383
+ "model.layers.6.self_attn.dense.weight": "model-00001-of-00004.safetensors",
384
+ "model.layers.6.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
385
+ "model.layers.6.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
386
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
387
+ "model.layers.7.input_layernorm.bias": "model-00001-of-00004.safetensors",
388
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
389
+ "model.layers.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
390
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
391
+ "model.layers.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
392
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
393
+ "model.layers.7.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
394
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
395
+ "model.layers.7.self_attn.dense.bias": "model-00001-of-00004.safetensors",
396
+ "model.layers.7.self_attn.dense.weight": "model-00001-of-00004.safetensors",
397
+ "model.layers.7.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
398
+ "model.layers.7.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
399
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
400
+ "model.layers.8.input_layernorm.bias": "model-00001-of-00004.safetensors",
401
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
402
+ "model.layers.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
403
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
404
+ "model.layers.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
405
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
406
+ "model.layers.8.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
407
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
408
+ "model.layers.8.self_attn.dense.bias": "model-00001-of-00004.safetensors",
409
+ "model.layers.8.self_attn.dense.weight": "model-00001-of-00004.safetensors",
410
+ "model.layers.8.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
411
+ "model.layers.8.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
412
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
413
+ "model.layers.9.input_layernorm.bias": "model-00002-of-00004.safetensors",
414
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
415
+ "model.layers.9.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
416
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
417
+ "model.layers.9.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
418
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
419
+ "model.layers.9.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
420
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
421
+ "model.layers.9.self_attn.dense.bias": "model-00001-of-00004.safetensors",
422
+ "model.layers.9.self_attn.dense.weight": "model-00001-of-00004.safetensors",
423
+ "model.layers.9.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
424
+ "model.layers.9.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
425
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors"
426
+ }
427
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>"
5
+ }
tokenization_phi3_small.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
2
+ import os
3
+ from typing import Collection, List, Optional, Dict, Set, Tuple, Union
4
+
5
+ from functools import cached_property
6
+
7
+ import base64
8
+ import requests
9
+
10
+ from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
11
+ from transformers.models.auto.tokenization_auto import get_tokenizer_config
12
+ import tiktoken
13
+
14
+
15
+ """
16
+ This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
17
+ with a few additional special tokens to support the ChatML format.
18
+
19
+ TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
20
+ Maybe in the future, that would be useful? Can add that support later.
21
+
22
+ """
23
+
24
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
25
+ with open(tiktoken_bpe_file, "rb") as f:
26
+ contents = f.read()
27
+ return {
28
+ base64.b64decode(token): int(rank)
29
+ for token, rank in (line.split() for line in contents.splitlines() if line)
30
+ }
31
+
32
# In the Megatron codebase, vocabularies are padded so that the vocabulary
# matrix multiplication is fast. This leaves some indices without a token.
# We account for these empty indices by adding dummy tokens to the tokenizer.

EFFECTIVE_PADDED_VOCAB_SIZE = 100352
# NOTE: despite its name, this value is used below as the highest id that is
# explicitly assigned (`<|endofprompt|>`); padding ids start right after it.
ACTUAL_VOCAB_SIZE = 100276

# Index of the last explicitly named `<|dummy_id_N|>` entry in SPECIAL_TOKENS;
# the generated padding tokens continue the numbering from here.
_LAST_NAMED_DUMMY_INDEX = 11

# Padding tokens: `<|dummy_id_12|>` -> ACTUAL_VOCAB_SIZE + 1, and so on up to
# (but not including) EFFECTIVE_PADDED_VOCAB_SIZE.
DUMMY_TOKENS = {
    f"<|dummy_id_{_LAST_NAMED_DUMMY_INDEX + offset}|>": ACTUAL_VOCAB_SIZE + offset
    for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
}

SPECIAL_TOKENS = {
    # tiktoken.get_encoding("cl100k_base")._special_tokens
    '<|endoftext|>': 100257,
    '<|fim_prefix|>': 100258,
    '<|fim_middle|>': 100259,
    '<|fim_suffix|>': 100260,
    # Special tokens for post-training
    "<|system|>": 100261,
    "<|user|>": 100262,
    "<|assistant|>": 100263,
    # Dummy unused tokens
    "<|dummy_id_0|>": 100264,
    "<|dummy_id_1|>": 100265,
    # Special tokens for post-training continued
    "<|end|>": 100266,
    # Some dummy tokens, so that tokenization is contiguous and does not cause issues.
    # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
    # actually map to anything, so we use a dummy token here.
    "<|dummy_id_2|>": 100256,
    # Likewise, tokens from 100267 to 100275 are also unused
    "<|dummy_id_3|>": 100267,
    "<|dummy_id_4|>": 100268,
    "<|dummy_id_5|>": 100269,
    "<|dummy_id_6|>": 100270,
    "<|dummy_id_7|>": 100271,
    "<|dummy_id_8|>": 100272,
    "<|dummy_id_9|>": 100273,
    "<|dummy_id_10|>": 100274,
    "<|dummy_id_11|>": 100275,
    # The final end of prompt token
    # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
    '<|endofprompt|>': ACTUAL_VOCAB_SIZE,
    # Dummy tokens to account for padding of the tokenizer.
    # We pad to ensure tensor cores are used for vocab multiplication.
    **DUMMY_TOKENS
}
81
+
82
class Phi3SmallTokenizer(PreTrainedTokenizer):
    """tiktoken-backed tokenizer for Phi-3-small.

    The tokenizer wraps a ``tiktoken.Encoding`` built from the ``cl100k_base``
    split pattern, a table of mergeable ranks (either taken from
    ``cl100k_base`` or loaded from ``vocab_file``) and the module-level
    ``SPECIAL_TOKENS``. Ordinary tokens are represented as ``bytes`` and
    special tokens as ``str``.
    """

    vocab_files_names = {
        "vocab_file": "cl100k_base.tiktoken"
    }

    model_input_names: List[str] = ["input_ids", "attention_mask"]
    padding_side = "left"

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        errors: str = "replace",
        **kwargs
    ) -> None:
        """Build the tokenizer.

        Args:
            vocab_file: Optional path to a ``.tiktoken`` rank file. When
                ``None``, the ranks of ``cl100k_base`` are used.
            errors: Error handling scheme passed to ``bytes.decode`` when
                turning tokens back into text.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
        # if the token is present in `self.special_tokens`. Hence instantiating it here.
        # The way Qwen gets around this is by checking against SPECIAL_TOKENS
        # But I think it's better to check against the object's own `special_tokens`
        # in case we eventually want to allow the tokenizer to have special tokens.
        # A copy is taken so that instances never share (or mutate) the module-level table.
        self.special_tokens = dict(SPECIAL_TOKENS)

        super().__init__(**kwargs)
        self.errors = errors

        try:
            base = tiktoken.get_encoding("cl100k_base")
        # This deals with the scenario where user has restricted internet access
        # and thus fails to download the tokenizer file from https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
        # It is assumed that user should be able to access files on huggingface hub.
        except requests.RequestException:
            import hashlib
            from transformers.utils import cached_file
            cached_tokenizer_path = cached_file(
                "microsoft/Phi-3-small-8k-instruct",
                "cl100k_base.tiktoken",
                _raise_exceptions_for_gated_repo=False,
                _raise_exceptions_for_missing_entries=False,
                _raise_exceptions_for_connection_errors=False
            )
            if cached_tokenizer_path is None:
                # The hub fallback did not yield a file either (exceptions are
                # suppressed above), so surface the original network error
                # instead of failing later with an unrelated TypeError.
                raise
            tiktoken_cache_dir = os.path.dirname(cached_tokenizer_path)
            # The cache entry is named after the SHA-1 of the download URL;
            # presumably this matches tiktoken's own cache-key scheme -- verify
            # against the installed tiktoken version.
            tiktoken_cache_path = os.path.join(
                tiktoken_cache_dir,
                hashlib.sha1("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()).hexdigest()
            )
            if not os.path.exists(tiktoken_cache_path):
                os.rename(cached_tokenizer_path, tiktoken_cache_path)
            os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
            base = tiktoken.get_encoding("cl100k_base")

        if vocab_file is None:
            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
        else:
            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)

        self.pat_str = base._pat_str

        self.tokenizer = self._build_encoding()

        # id -> token lookup covering both ordinary (bytes) and special (str) tokens.
        self.decoder: Dict[int, Union[bytes, str]] = {
            rank: token for token, rank in self.mergeable_ranks.items()
        }
        self.decoder.update({index: token for token, index in self.special_tokens.items()})

        self.eod_id = self.tokenizer.eot_token
        self._eos_token = self._convert_id_to_token(self.eod_id)

        # Setting the bos_token to be the same as the eos_token
        # Note that this is **not** the correct thing to do, and is done
        # just so that some of the downstream libraries do not break.
        self._bos_token = self._eos_token

        # Assign the special tokens to class variables
        self.system_id = self.special_tokens["<|system|>"]
        self.user_id = self.special_tokens["<|user|>"]
        self.assistant_id = self.special_tokens["<|assistant|>"]
        self.end_id = self.special_tokens["<|end|>"]

    def _build_encoding(self) -> "tiktoken.Encoding":
        """Create the tiktoken encoding from this instance's pattern, ranks and special tokens."""
        return tiktoken.Encoding(
            name="phi3small",
            pat_str=self.pat_str,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )

    @cached_property
    def dummy_token_indices(self) -> List[int]:
        """Sorted ids of all tokens treated as dummies.

        This covers every special token whose name contains ``dummy_id`` plus
        the ``cl100k_base`` special tokens that this tokenizer does not use.
        """
        # There are some additional special tokens in the cl100k_base tokenizer
        # that we do not use. Hence, we also consider them to be dummy tokens.
        additional_tokens = [
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
            "<|endofprompt|>"
        ]
        dummy_token_indices = [
            index for token, index in self.special_tokens.items() if "dummy_id" in token
        ]
        dummy_token_indices.extend(self.special_tokens[token] for token in additional_tokens)
        return sorted(dummy_token_indices)

    def __getstate__(self):
        """Return the instance state without the tiktoken encoding object."""
        state = self.__dict__.copy()
        # The encoding object is dropped here and rebuilt in __setstate__.
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        """Restore the instance state and rebuild the tiktoken encoding."""
        self.__dict__ = state
        # Rebuilt through the same helper as __init__ so both paths produce
        # an identically named and configured encoding.
        self.tokenizer = self._build_encoding()

    def __len__(self):
        """Return the vocabulary size reported by the underlying encoding."""
        return self.tokenizer.n_vocab

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        **kwargs,
    ):
        """Instantiate the tokenizer for a model name or path.

        The tokenizer config is used when one is found (explicit ``kwargs``
        take precedence over its entries). Otherwise ``model_max_length`` is
        taken from the model config's ``max_position_embeddings``.
        ``init_inputs`` is accepted for signature compatibility and ignored.
        """
        # First try to load from the tokenization config if it exists
        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if tokenization_config:
            cls_kwargs = {
                **tokenization_config,
                **kwargs
            }
        else:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
            cls_kwargs = {**kwargs, "model_max_length": config.max_position_embeddings}
        return cls(**cls_kwargs)

    def get_vocab(self) -> Dict[Union[str, bytes], int]:
        """Return the combined token -> id mapping (ordinary and special tokens)."""
        return {**self.mergeable_ranks, **self.special_tokens}

    def convert_tokens_to_ids(
        self,
        tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        """Map a token, or a list of tokens, to id(s).

        A single token that is neither a special token nor present in the
        mergeable ranks yields ``None``.
        """
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            return self.mergeable_ranks.get(tokens)
        return [self.convert_tokens_to_ids(token) for token in tokens]

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        """Validate a request to add tokens; the vocabulary itself is never extended.

        Returns:
            Always ``0``, since no token is actually added.

        Raises:
            ValueError: If non-special tokens are requested, or if a requested
                special token is not already in ``self.special_tokens``.
        """
        if not special_tokens and new_tokens:
            raise ValueError("Only special tokens can be added to this tokenizer")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in self.special_tokens:
                raise ValueError(
                    "For now, we do not support unknown special tokens\n"
                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
                    "And finally, we can re-construct the enc object back\n"
                )
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """Write the mergeable ranks to ``cl100k_base.tiktoken`` in ``save_directory``.

        Each line holds the base64-encoded token and its rank. Special tokens
        are not written.

        Returns:
            A one-element tuple with the path of the written file.
        """
        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
        # The content is pure ASCII (base64 + digits); the encoding is pinned
        # so the output does not depend on the platform default.
        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(
                base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
                for token, rank in self.mergeable_ranks.items()
            )
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs
    ) -> List[Union[bytes, str]]:
        """Encode ``text`` with tiktoken and return the matching tokens.

        Args:
            text: The text to tokenize.
            allowed_special: Forwarded to ``tiktoken.Encoding.encode``.
            disallowed_special: Forwarded to ``tiktoken.Encoding.encode``.

        Returns:
            Tokens looked up in ``self.decoder``: ``bytes`` for ordinary
            tokens, ``str`` for special tokens.
        """
        token_ids = self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        )
        return [self.decoder[token_id] for token_id in token_ids]

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens in a single string.

        Consecutive ``bytes`` tokens are buffered and decoded together as
        UTF-8 (using ``self.errors``); ``str`` tokens are appended verbatim.

        Raises:
            TypeError: If a token is neither ``bytes`` nor ``str``.
        """
        pieces: List[str] = []
        pending = bytearray()
        for t in tokens:
            if isinstance(t, str):
                if pending:
                    # Flush the buffered bytes before the string token so
                    # multi-byte characters split across tokens decode intact.
                    pieces.append(pending.decode("utf-8", errors=self.errors))
                    pending = bytearray()
                pieces.append(t)
            elif isinstance(t, bytes):
                pending += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if pending:
            pieces.append(pending.decode("utf-8", errors=self.errors))
        return "".join(pieces)

    @property
    def vocab_size(self):
        """Vocabulary size reported by the underlying encoding."""
        return self.tokenizer.n_vocab

    @property
    def eos_token_id(self) -> int:
        """Id of the end-of-text token taken from the underlying encoding."""
        return self.eod_id

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
        Do NOT take care of added tokens.

        Not implemented for this tokenizer; ``tokenize`` is overridden directly.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: Optional[str] = None,
        **kwargs,
    ) -> str:
        """Decode token id(s) into text with the underlying encoding.

        Args:
            token_ids: A single id or a list of ids.
            skip_special_tokens: When true, special-token ids are dropped
                before decoding.
            errors: Decode error handling; falls back to ``self.errors``.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            # `i < eod_id` alone lets through special tokens whose id is below
            # the end-of-text id (e.g. `<|dummy_id_2|>`), so also exclude
            # every id registered as a special token.
            special_ids = set(self.special_tokens.values())
            token_ids = [
                i for i in token_ids if i < self.eod_id and i not in special_ids
            ]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
337
+
338
+
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "1535ae26fb4faada95c6950e8bc6e867cdad6b00",
3
+ "_from_auto": true,
4
+ "added_tokens_decoder": {},
5
+ "auto_map": {
6
+ "AutoTokenizer": [
7
+ "tokenization_phi3_small.Phi3SmallTokenizer",
8
+ null
9
+ ]
10
+ },
11
+ "bos_token": "<|endoftext|>",
12
+ "clean_up_tokenization_spaces": true,
13
+ "eos_token": "<|endoftext|>",
14
+ "extra_special_tokens": {},
15
+ "model_max_length": 8192,
16
+ "pad_token": "<|endoftext|>",
17
+ "tokenizer_class": "Phi3SmallTokenizer",
18
+ "trust_remote_code": true
19
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ec7a03b6bc5a8d0661db4abd52ae7cc082e6e9051ac763e4371845127ab27b
3
+ size 5496