Nidum.AI committed on
Commit
5f2075e
·
1 Parent(s): 45bcbad

Add pruned MiniMax M2 THRIFT shards and configs

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -34
  2. added_tokens.json +56 -0
  3. chat_template.jinja +159 -0
  4. config.json +120 -0
  5. configuration_minimax_m2.py +131 -0
  6. generation_config.json +6 -0
  7. merges.txt +0 -0
  8. model-00001-of-00070.safetensors +3 -0
  9. model-00002-of-00070.safetensors +3 -0
  10. model-00003-of-00070.safetensors +3 -0
  11. model-00004-of-00070.safetensors +3 -0
  12. model-00005-of-00070.safetensors +3 -0
  13. model-00006-of-00070.safetensors +3 -0
  14. model-00007-of-00070.safetensors +3 -0
  15. model-00008-of-00070.safetensors +3 -0
  16. model-00009-of-00070.safetensors +3 -0
  17. model-00010-of-00070.safetensors +3 -0
  18. model-00011-of-00070.safetensors +3 -0
  19. model-00012-of-00070.safetensors +3 -0
  20. model-00013-of-00070.safetensors +3 -0
  21. model-00014-of-00070.safetensors +3 -0
  22. model-00015-of-00070.safetensors +3 -0
  23. model-00016-of-00070.safetensors +3 -0
  24. model-00017-of-00070.safetensors +3 -0
  25. model-00018-of-00070.safetensors +3 -0
  26. model-00019-of-00070.safetensors +3 -0
  27. model-00020-of-00070.safetensors +3 -0
  28. model-00021-of-00070.safetensors +3 -0
  29. model-00022-of-00070.safetensors +3 -0
  30. model-00023-of-00070.safetensors +3 -0
  31. model-00024-of-00070.safetensors +3 -0
  32. model-00025-of-00070.safetensors +3 -0
  33. model-00026-of-00070.safetensors +3 -0
  34. model-00027-of-00070.safetensors +3 -0
  35. model-00028-of-00070.safetensors +3 -0
  36. model-00029-of-00070.safetensors +3 -0
  37. model-00030-of-00070.safetensors +3 -0
  38. model-00031-of-00070.safetensors +3 -0
  39. model-00032-of-00070.safetensors +3 -0
  40. model-00033-of-00070.safetensors +3 -0
  41. model-00034-of-00070.safetensors +3 -0
  42. model-00035-of-00070.safetensors +3 -0
  43. model-00036-of-00070.safetensors +3 -0
  44. model-00037-of-00070.safetensors +3 -0
  45. model-00038-of-00070.safetensors +3 -0
  46. model-00039-of-00070.safetensors +3 -0
  47. model-00040-of-00070.safetensors +3 -0
  48. model-00041-of-00070.safetensors +3 -0
  49. model-00042-of-00070.safetensors +3 -0
  50. model-00043-of-00070.safetensors +3 -0
.gitattributes CHANGED
@@ -1,36 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.onnx filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
5
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</minimax:tool_call>": 200053,
3
+ "</think>": 200051,
4
+ "<add_file>": 200036,
5
+ "<code_context>": 200043,
6
+ "<code_interpreter>": 200023,
7
+ "<commit_after>": 200018,
8
+ "<commit_before>": 200016,
9
+ "<commit_message>": 200040,
10
+ "<commit_msg>": 200017,
11
+ "<delete_file>": 200037,
12
+ "<edit_file>": 200039,
13
+ "<empty_output>": 200015,
14
+ "<empty_source_file>": 200041,
15
+ "<file_content>": 200044,
16
+ "<file_sep>": 200049,
17
+ "<filename>": 200006,
18
+ "<filepath>": 200048,
19
+ "<fim_middle>": 200002,
20
+ "<fim_pad>": 200004,
21
+ "<fim_prefix>": 200001,
22
+ "<fim_suffix>": 200003,
23
+ "<function_call>": 200022,
24
+ "<gh_stars>": 200007,
25
+ "<issue_closed>": 200010,
26
+ "<issue_comment>": 200009,
27
+ "<issue_start>": 200008,
28
+ "<jupyter_code>": 200013,
29
+ "<jupyter_error>": 200035,
30
+ "<jupyter_output>": 200014,
31
+ "<jupyter_start>": 200011,
32
+ "<jupyter_text>": 200012,
33
+ "<minimax:tool_call>": 200052,
34
+ "<pr_start>": 200046,
35
+ "<rename_file>": 200038,
36
+ "<repo_struct>": 200042,
37
+ "<reponame>": 200005,
38
+ "<review_comment>": 200047,
39
+ "<source_files>": 200045,
40
+ "<think>": 200050,
41
+ "[e~[": 200020,
42
+ "]!d~[": 200021,
43
+ "]!p~[": 200000,
44
+ "]<]end of image[>[": 200030,
45
+ "]<]end of speech[>[": 200028,
46
+ "]<]end of video[>[": 200032,
47
+ "]<]image[>[": 200025,
48
+ "]<]speech[>[": 200024,
49
+ "]<]start of image[>[": 200029,
50
+ "]<]start of speech[>[": 200027,
51
+ "]<]start of video[>[": 200031,
52
+ "]<]video[>[": 200026,
53
+ "]<]vision pad[>[": 200033,
54
+ "]~!b[": 200034,
55
+ "]~b]": 200019
56
+ }
chat_template.jinja ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {# ------------- special token variables ------------- #}
2
+ {%- set toolcall_begin_token = '<minimax:tool_call>' -%}
3
+ {%- set toolcall_end_token = '</minimax:tool_call>' -%}
4
+ {#- Tool Rendering Functions ============================================== -#}
5
+ {%- macro render_tool_namespace(namespace_name, tool_list) -%}
6
+ {%- for tool in tool_list -%}
7
+ <tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
8
+ {% endfor -%}
9
+ {%- endmacro -%}
10
+ {%- macro visible_text(content) -%}
11
+ {%- if content is string -%}
12
+ {{ content }}
13
+ {%- elif content is iterable and content is not mapping -%}
14
+ {%- for item in content -%}
15
+ {%- if item is mapping and item.type == 'text' -%}
16
+ {{- item.text }}
17
+ {%- elif item is string -%}
18
+ {{- item }}
19
+ {%- endif -%}
20
+ {%- endfor -%}
21
+ {%- else -%}
22
+ {{- content }}
23
+ {%- endif -%}
24
+ {%- endmacro -%}
25
+ {#- System Message Construction ============================================ -#}
26
+ {%- macro build_system_message(system_message) -%}
27
+ {%- if system_message and system_message.content -%}
28
+ {{- visible_text(system_message.content) }}
29
+ {%- else -%}
30
+ {%- if model_identity is not defined -%}
31
+ {%- set model_identity = "You are a helpful assistant." -%}
32
+ {%- endif -%}
33
+ {{- model_identity }}
34
+ {%- endif -%}
35
+
36
+ {#- Handle current_date -#}
37
+ {%- if system_message and system_message.current_date -%}
38
+ {{- '\n' ~ 'Current date: ' + system_message.current_date }}
39
+ {%- endif -%}
40
+ {#- Handle current_location -#}
41
+ {%- if system_message and system_message.current_location -%}
42
+ {{- '\n' ~ 'Current location: ' + system_message.current_location }}
43
+ {%- endif -%}
44
+ {%- endmacro -%}
45
+ {#- Main Template Logic ================================================= -#}
46
+ {#- Extract system message (only first message if it's system) -#}
47
+ {%- set system_message = none -%}
48
+ {%- set conversation_messages = messages -%}
49
+ {%- if messages and messages[0].role == "system" -%}
50
+ {%- set system_message = messages[0] -%}
51
+ {%- set conversation_messages = messages[1:] -%}
52
+ {%- endif -%}
53
+ {#- Get the last user message turn, for interleaved thinking -#}
54
+ {%- set ns = namespace(last_user_index=-1) %}
55
+ {% for m in conversation_messages %}
56
+ {%- if m.role == 'user' %}
57
+ {% set ns.last_user_index = loop.index0 -%}
58
+ {%- endif %}
59
+ {%- endfor %}
60
+ {#- Render system message -#}
61
+ {{- ']~!b[' ~ ']~b]system' ~ '\n' }}
62
+ {{- build_system_message(system_message) }}
63
+ {#- Render tools if available -#}
64
+ {%- if tools -%}
65
+ {{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
66
+ {{- '\n' ~ '<tools>' ~ '\n' }}
67
+ {{- render_tool_namespace("functions", tools) }}
68
+ {{- '</tools>' ~ '\n\n' }}
69
+ {{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
70
+ {{- '\n' ~ toolcall_begin_token }}
71
+ <invoke name="tool-name-1">
72
+ <parameter name="param-key-1">param-value-1</parameter>
73
+ <parameter name="param-key-2">param-value-2</parameter>
74
+ ...
75
+ </invoke>
76
+ {{- '\n' ~ toolcall_end_token }}
77
+ {%- endif -%}
78
+ {{- '[e~[\n' }}
79
+
80
+ {#- Render messages -#}
81
+ {%- set last_tool_call = namespace(name=none) -%}
82
+ {%- for message in conversation_messages -%}
83
+ {%- if message.role == 'assistant' -%}
84
+ {#- Only render reasoning_content if no user message follows -#}
85
+ {{- ']~b]ai' ~ '\n' }}
86
+
87
+ {%- set reasoning_content = '' %}
88
+ {%- set content = visible_text(message.content) %}
89
+ {%- if message.reasoning_content is string %}
90
+ {%- set reasoning_content = message.reasoning_content %}
91
+ {%- else %}
92
+ {%- if '</think>' in content %}
93
+ {%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
94
+ {%- set content = content.split('</think>')[-1].strip('\n') %}
95
+ {%- endif %}
96
+ {%- endif %}
97
+ {%- if reasoning_content and loop.index0 > ns.last_user_index -%}
98
+ {{- '<think>' ~ '\n' ~ reasoning_content ~ '\n' ~ '</think>' ~ '\n\n' }}
99
+ {%- endif -%}
100
+ {%- if content -%}
101
+ {{- content }}
102
+ {%- endif -%}
103
+ {%- if message.tool_calls -%}
104
+ {{- '\n' ~ toolcall_begin_token ~ '\n' }}
105
+
106
+ {%- for tool_call in message.tool_calls -%}
107
+ {%- if tool_call.function %}
108
+ {%- set tool_call = tool_call.function %}
109
+ {%- endif %}
110
+ {{- '<invoke name="' + tool_call.name + '">' }}
111
+ {% set _args = tool_call.arguments %}
112
+ {%- for k, v in _args.items() %}
113
+ {{- '<parameter name="' + k + '">' }}
114
+ {{- v | tojson(ensure_ascii=False) if v is not string else v }}
115
+ {{- '</parameter>' }}
116
+ {% endfor %}
117
+ {{- '</invoke>' ~ '\n' }}
118
+ {%- endfor -%}
119
+
120
+ {{- toolcall_end_token}}
121
+ {%- set last_tool_call.name = message.tool_calls[-1].name -%}
122
+ {%- else -%}
123
+ {%- set last_tool_call.name = none -%}
124
+ {%- endif -%}
125
+ {{- '[e~[' ~ '\n' }}
126
+
127
+ {%- elif message.role == 'tool' -%}
128
+ {%- if last_tool_call.name is none -%}
129
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
130
+ {%- endif -%}
131
+ {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
132
+ {{- ']~b]tool' }}
133
+ {%- endif -%}
134
+ {%- if message.content is string -%}
135
+ {{- '\n<response>' }}
136
+ {{- message.content }}
137
+ {{- '</response>' }}
138
+ {%- else -%}
139
+ {%- for tr in message.content -%}
140
+ {{- '\n<response>' }}
141
+ {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
142
+ {{- '\n</response>' }}
143
+ {%- endfor -%}
144
+ {%- endif -%}
145
+ {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
146
+ {{- '[e~[\n' -}}
147
+ {%- endif -%}
148
+
149
+ {%- elif message.role == 'user' -%}
150
+ {{- ']~b]user' ~ '\n' }}
151
+ {{- visible_text(message.content) }}
152
+ {{- '[e~[' ~ '\n' }}
153
+ {%- endif -%}
154
+ {%- endfor -%}
155
+
156
+ {#- Generation prompt -#}
157
+ {%- if add_generation_prompt -%}
158
+ {{- ']~b]ai' ~ '\n' ~ '<think>' ~ '\n' }}
159
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiniMaxM2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_type_list": [
7
+ 1,
8
+ 1,
9
+ 1,
10
+ 1,
11
+ 1,
12
+ 1,
13
+ 1,
14
+ 1,
15
+ 1,
16
+ 1,
17
+ 1,
18
+ 1,
19
+ 1,
20
+ 1,
21
+ 1,
22
+ 1,
23
+ 1,
24
+ 1,
25
+ 1,
26
+ 1,
27
+ 1,
28
+ 1,
29
+ 1,
30
+ 1,
31
+ 1,
32
+ 1,
33
+ 1,
34
+ 1,
35
+ 1,
36
+ 1,
37
+ 1,
38
+ 1,
39
+ 1,
40
+ 1,
41
+ 1,
42
+ 1,
43
+ 1,
44
+ 1,
45
+ 1,
46
+ 1,
47
+ 1,
48
+ 1,
49
+ 1,
50
+ 1,
51
+ 1,
52
+ 1,
53
+ 1,
54
+ 1,
55
+ 1,
56
+ 1,
57
+ 1,
58
+ 1,
59
+ 1,
60
+ 1,
61
+ 1,
62
+ 1,
63
+ 1,
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1
69
+ ],
70
+ "attn_window_size": null,
71
+ "auto_map": {
72
+ "AutoConfig": "configuration_minimax_m2.MiniMaxM2Config",
73
+ "AutoModelForCausalLM": "modeling_minimax_m2.MiniMaxM2ForCausalLM"
74
+ },
75
+ "head_dim": 128,
76
+ "hidden_act": "silu",
77
+ "hidden_size": 3072,
78
+ "initializer_range": 0.02,
79
+ "intermediate_size": 1536,
80
+ "layernorm_full_attention_beta": 1.0,
81
+ "layernorm_linear_attention_beta": 1.0,
82
+ "layernorm_mlp_beta": 1.0,
83
+ "max_model_len": null,
84
+ "max_position_embeddings": 196608,
85
+ "mlp_intermediate_size": 8192,
86
+ "model_type": "minimax",
87
+ "mtp_transformer_layers": 1,
88
+ "num_attention_heads": 48,
89
+ "num_expert_group": null,
90
+ "num_experts_per_tok": 8,
91
+ "num_hidden_layers": 62,
92
+ "num_key_value_heads": 8,
93
+ "num_local_experts": 192,
94
+ "num_mtp_modules": 3,
95
+ "output_router_logits": false,
96
+ "partial_rotary_factor": 0.5,
97
+ "qk_norm_type": "per_layer",
98
+ "rms_norm_eps": 1e-06,
99
+ "rope_scaling": null,
100
+ "rope_theta": 5000000,
101
+ "rotary_dim": 64,
102
+ "routed_scaling_factor": 1.0,
103
+ "router_aux_loss_coef": 0.001,
104
+ "router_jitter_noise": 0.0,
105
+ "scoring_func": "sigmoid",
106
+ "shared_intermediate_size": 0,
107
+ "shared_moe_mode": "sigmoid",
108
+ "sliding_window": null,
109
+ "swa_rope_theta": -1.0,
110
+ "tie_word_embeddings": false,
111
+ "topk_group": null,
112
+ "torch_dtype": "bfloat16",
113
+ "transformers_version": "4.55.0",
114
+ "use_cache": false,
115
+ "use_grouped_topk": true,
116
+ "use_mtp": true,
117
+ "use_qk_norm": true,
118
+ "use_routing_bias": true,
119
+ "vocab_size": 200064
120
+ }
configuration_minimax_m2.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2
+ # SPDX-FileCopyrightText: 2024-2025 [email protected]
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Contact: [email protected], x.com/qubitium
5
+
6
+ """Configuration for the MiniMax M2 architecture."""
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import List, Optional, Union
11
+
12
+ from transformers.configuration_utils import PretrainedConfig
13
+
14
+
15
+ class MiniMaxM2Config(PretrainedConfig):
16
+ model_type = "minimax"
17
+
18
+ def __init__(
19
+ self,
20
+ vocab_size: int = 200_064,
21
+ hidden_size: int = 3_072,
22
+ intermediate_size: int = 1_536,
23
+ mlp_intermediate_size: int = 8_192,
24
+ num_hidden_layers: int = 62,
25
+ num_attention_heads: int = 48,
26
+ num_key_value_heads: int = 8,
27
+ head_dim: Optional[int] = 128,
28
+ num_local_experts: int = 256,
29
+ num_experts_per_tok: int = 8,
30
+ attn_type_list: Optional[List[int]] = None,
31
+ attention_dropout: float = 0.0,
32
+ hidden_act: str = "silu",
33
+ rms_norm_eps: float = 1e-6,
34
+ max_position_embeddings: int = 196_608,
35
+ rope_theta: float = 5_000_000.0,
36
+ rotary_dim: int = 64,
37
+ rope_scaling: Optional[dict] = None,
38
+ use_qk_norm: bool = True,
39
+ qk_norm_type: str = "per_layer",
40
+ use_routing_bias: bool = True,
41
+ scoring_func: str = "sigmoid",
42
+ router_aux_loss_coef: float = 0.001,
43
+ router_jitter_noise: float = 0.0,
44
+ output_router_logits: bool = False,
45
+ use_grouped_topk: bool = True,
46
+ num_expert_group: Optional[int] = None,
47
+ topk_group: Optional[int] = None,
48
+ routed_scaling_factor: float = 1.0,
49
+ layernorm_full_attention_beta: float = 1.0,
50
+ layernorm_linear_attention_beta: float = 1.0,
51
+ layernorm_mlp_beta: float = 1.0,
52
+ shared_intermediate_size: int = 0,
53
+ shared_moe_mode: str = "sigmoid",
54
+ use_mtp: bool = True,
55
+ num_mtp_modules: int = 3,
56
+ mtp_transformer_layers: int = 1,
57
+ attn_window_size: Optional[Union[int, List[int]]] = None,
58
+ swa_rope_theta: float = -1.0,
59
+ sliding_window: Optional[int] = None,
60
+ initializer_range: float = 0.02,
61
+ tie_word_embeddings: bool = False,
62
+ max_model_len: Optional[int] = None,
63
+ bos_token_id: Optional[int] = None,
64
+ eos_token_id: Optional[int] = None,
65
+ pad_token_id: Optional[int] = None,
66
+ use_cache: bool = True,
67
+ **kwargs,
68
+ ) -> None:
69
+ quantization_config = kwargs.pop("quantization_config", None)
70
+ transformers_version = kwargs.pop("transformers_version", None)
71
+
72
+ super().__init__(
73
+ bos_token_id=bos_token_id,
74
+ eos_token_id=eos_token_id,
75
+ tie_word_embeddings=tie_word_embeddings,
76
+ pad_token_id=pad_token_id,
77
+ **kwargs,
78
+ )
79
+
80
+ self.vocab_size = vocab_size
81
+ self.hidden_size = hidden_size
82
+ self.intermediate_size = intermediate_size
83
+ self.mlp_intermediate_size = mlp_intermediate_size
84
+ self.num_hidden_layers = num_hidden_layers
85
+ self.num_attention_heads = num_attention_heads
86
+ self.num_key_value_heads = num_key_value_heads
87
+ self.head_dim = head_dim or hidden_size // num_attention_heads
88
+ self.num_local_experts = num_local_experts
89
+ self.num_experts_per_tok = num_experts_per_tok
90
+ self.attn_type_list = attn_type_list or [1] * num_hidden_layers
91
+ self.attention_dropout = attention_dropout
92
+ self.hidden_act = hidden_act
93
+ self.rms_norm_eps = rms_norm_eps
94
+ self.max_position_embeddings = max_position_embeddings
95
+ self.rope_theta = rope_theta
96
+ self.rotary_dim = rotary_dim
97
+ self.rope_scaling = rope_scaling
98
+ self.use_qk_norm = use_qk_norm
99
+ self.qk_norm_type = qk_norm_type
100
+ self.use_routing_bias = use_routing_bias
101
+ self.scoring_func = scoring_func
102
+ self.router_aux_loss_coef = router_aux_loss_coef
103
+ self.router_jitter_noise = router_jitter_noise
104
+ self.output_router_logits = output_router_logits
105
+ self.use_grouped_topk = use_grouped_topk
106
+ self.num_expert_group = num_expert_group
107
+ self.topk_group = topk_group
108
+ self.routed_scaling_factor = routed_scaling_factor
109
+ self.layernorm_full_attention_beta = layernorm_full_attention_beta
110
+ self.layernorm_linear_attention_beta = layernorm_linear_attention_beta
111
+ self.layernorm_mlp_beta = layernorm_mlp_beta
112
+ self.shared_intermediate_size = shared_intermediate_size
113
+ self.shared_moe_mode = shared_moe_mode
114
+ self.use_mtp = use_mtp
115
+ self.num_mtp_modules = num_mtp_modules
116
+ self.mtp_transformer_layers = mtp_transformer_layers
117
+ self.attn_window_size = attn_window_size
118
+ self.swa_rope_theta = swa_rope_theta
119
+ self.sliding_window = sliding_window
120
+ self.initializer_range = initializer_range
121
+ self.max_model_len = max_model_len
122
+ self.use_cache = use_cache
123
+
124
+ # Convenient accessor used by rotary embedding helper
125
+ self.partial_rotary_factor = float(self.rotary_dim) / float(self.head_dim)
126
+ if quantization_config is not None:
127
+ self.quantization_config = quantization_config
128
+ self.transformers_version = transformers_version
129
+
130
+
131
+ __all__ = ["MiniMaxM2Config"]
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "top_k": 40,
4
+ "top_p": 0.95,
5
+ "transformers_version": "4.55.0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ce9d4dac7d33263ec980a594d86593272d3ab1907ae152775f355ee17602a1
3
+ size 4997842560
model-00002-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22f4f1bc412143547a5cc5a9492cc31add89c9c26f88df33ace8a71c43b9e028
3
+ size 4996692376
model-00003-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:969ec2fe7af75411b511a8dab9dc380c201a0fd579808e43babf105b5edb8578
3
+ size 4996692392
model-00004-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cbb2d048f6f177917de1a1ade7b000bb88d4ac03283b15faf7b157572f09f88
3
+ size 4996692432
model-00005-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48d0da24d5a8f2486a2686c44f896e8d30c6c42888d7bdc629d7bdb24e9dc5cf
3
+ size 4996692432
model-00006-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adb79dba9bb640bf26a39fcadee3c625a958e8b311dca95a4e429643001bb3a0
3
+ size 4996692432
model-00007-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14198f3595051135c45de69c2e6be6658db6a6e8b70f1aa804cfd37c080f471a
3
+ size 4996692432
model-00008-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d036567118e18a6796312aef53a8787b3f8e1c4f77aeedc50fc11c1b772ebedc
3
+ size 4977802792
model-00009-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76d004002f3bcdc3f1ed018624712e9f49178fe7e849583722a85345ad514bb5
3
+ size 4992354664
model-00010-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:037abcefe796e54d0d811ae5e1f8896aa81207dfc9533e6370fc034fd4a0b3fe
3
+ size 4996692368
model-00011-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eb8f27d65720a4d7e2502aedeb17ac34add567e278eb12c7193cd06d60768b7
3
+ size 4996692376
model-00012-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca3734e67fdb4fdaa9eab3bce7b3f58d81b5185a8687275702e1f2c312444bcc
3
+ size 4996692736
model-00013-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5fa2066ba143889e76f267a4483f097388b4422220589b4623af592988fda4f
3
+ size 4996692904
model-00014-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f9ff75bf7e987a72e1457ed1480b220c0edd354b3775d7ee13979174226bec3
3
+ size 4996692960
model-00015-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fca8287025bb9bc461dae02ba76a0fb6920ce756ff4cc0eaab02ada55164135d
3
+ size 4996692960
model-00016-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72fb56372bd4f2862bfe102d14eaf5ad2e378d4374b74e96fdf144e68cc4bc0b
3
+ size 4996692960
model-00017-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5427acfa913f92ae7c7ea986d96dbe729806ce4c731cfb9deaf1be2167a1718f
3
+ size 4996692960
model-00018-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99dd2779ddb568eec957580653923ec05879e05027a8d30a56070c1cfa4fa8f9
3
+ size 4996692968
model-00019-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3d12ad435507ea42c97c63220acdb5471027e4646940145705fa0f7bbcb231
3
+ size 4992340208
model-00020-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb4c4f4b9a7739d48d521126da70f36d1411e683acf6ecec2a131bc208b9cbad
3
+ size 4996692896
model-00021-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:774792c0526a5315515556087ec8ee87c8861a41c1d28d175343e46bacecbed1
3
+ size 4996692896
model-00022-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:703b9887e755a866e68737620f67eb668eae3456bb1fe1439c5865a27a7ae988
3
+ size 4996692904
model-00023-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6c70264fd3a71f44f9adb646470018c6f5b1a996947067310e084116e749c52
3
+ size 4996692904
model-00024-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf94a3627fb32d9505c95fcac7ff112ba65d10d4ff65479b802b6457b05974d1
3
+ size 4996692936
model-00025-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95ef12862c60786dce34383a4ba0db2ab706a3c5117d70dd56f4eb2f4c65e6e6
3
+ size 4996692960
model-00026-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:740b8a48b514c4855d74c5262d11a8e8f890a3c0c26038ad7579a5b09dd13144
3
+ size 4996692960
model-00027-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91f105473920bc03907183f076b729fbdc2eb378a1bb23dfea23b62815545f97
3
+ size 4996692960
model-00028-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bc19c9e7cb7c01625f2a8fca20b216900ee75582708e46e84d8aedb249143cf
3
+ size 4996692960
model-00029-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:251dfaca5c3b81362e54d4e4e5ace14d2837dd7e2b3958a7168246244e2279d9
3
+ size 4992340240
model-00030-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d98a6dc2c1ddd6eef7f24af38f2a2068c47a0b385fa23c5219ab4c8e062d1d34
3
+ size 4996692888
model-00031-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:671334330b78709a02a1b9dcc20be5ce575a0177cf46bc538539941c1c26f335
3
+ size 4996692896
model-00032-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fc96ad8210ca4941622e7f9313d4d98683b4f1e66165629727025ffe7e36b1b
3
+ size 4996692904
model-00033-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28c0e3c7a66485517126362209eaabc261128c24fcc26fd8c33e62aec4998b66
3
+ size 4996692904
model-00034-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bdd65a4af8e04fa9ee17476523fa5e7eec493439cbbd8e2922b11eaf2a8c8a1
3
+ size 4996692912
model-00035-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c38159eb713042e382f6e7b39a96074a400d360d01926ef8a1fba9a567f9c63
3
+ size 4996692960
model-00036-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b1470e6b21485c800b5ca78ac2eb234a4fe68a48ad81a927061e8cdc267d55c
3
+ size 4996692960
model-00037-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec6ee695578b68ab3e44a8e0b156ca30dbf9295b2a31843965b47c1b92d265b
3
+ size 4996692960
model-00038-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92504af729ef44ad1fcca9b1b62e4094fbce9959c5446bd2a0200664ec22f312
3
+ size 4996692960
model-00039-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51a4886bdf970605273d786eda087a7910b7dd14b1c24c7cbafcfad00121799b
3
+ size 4996692976
model-00040-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecb4a10b1524c9a21f393057804a5a67f8fd2512a64d77802abf7af50396892a
3
+ size 4992340192
model-00041-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de755d6b1ae96b3f1b23a77f78ff3f4f6176914c8f195cde28f2a3f5866df7f
3
+ size 4996692896
model-00042-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:769d88f546c70cef3c1c502e9c420ee9adcd5a2775be3f3f08c0d11a7f140404
3
+ size 4996692896
model-00043-of-00070.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb733d5f36328d4cd24f7f1d42813c4539ebeedf2ce6f71c2c4552e7355aa269
3
+ size 4996692904