reaperdoesntknow committed
Commit 7a75b28 · verified · 1 Parent(s): a61b9ff

Upload MoAMetricLM

Files changed (3)
  1. README.md +9 -9
  2. config.json +7 -1
  3. pytorch_model.bin +2 -2
README.md CHANGED
@@ -1,19 +1,19 @@
  ---
  language:
- - en
+ - en
  license: apache-2.0
  library_name: transformers
  pipeline_tag: text-generation
  tags:
- - mixture-of-attentions
- - distance-attention
- - metric-attention
- - mqa
- - hyperffn
- - router-gating
+ - mixture-of-attentions
+ - distance-attention
+ - metric-attention
+ - mqa
+ - hyperffn
+ - router-gating
  datasets:
- - nvidia/Nemotron-Math-HumanReasoning
- - WeMake/Intelligent-Content-Understanding
+ - nvidia/Nemotron-Math-HumanReasoning
+ - WeMake/Intelligent-Content-Understanding
  ---

  # MoAMetricLM‑100M — Mixture of Attentions (MoA)
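The card declares `library_name: transformers` and `pipeline_tag: text-generation`, while `model_type` is the custom `moa_metric` (see config.json below), so loading should need `trust_remote_code`. A minimal loading sketch; the repo id is an assumption inferred from the committer and model name, not stated in this commit:

```python
# Hypothetical usage sketch. repo_id is an assumption; "moa_metric" is not
# a built-in transformers architecture, so the model code must come from
# the repo itself via trust_remote_code=True.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "reaperdoesntknow/MoAMetricLM-100M"  # assumption

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

out = model.generate(**tokenizer("2 + 2 =", return_tensors="pt"), max_new_tokens=16)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```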
config.json CHANGED
@@ -8,11 +8,13 @@
  "bos_token_id": 101,
  "conv_kernel": 5,
  "conv_mult": 2,
- "dim": 512,
+ "dim": 1024,
+ "discrepancy_modulation": true,
  "drop_path": 0.0,
  "dtype": "float32",
  "enable_feature_gates": true,
  "enable_router_gates": true,
+ "energy_amplification": 0.1,
  "eos_token_id": 102,
  "ff_mult": 4,
  "ffn_hidden": 2048,
@@ -23,16 +25,19 @@
  "lr_rank": 32,
  "maha_init": 1.0,
  "max_position_embeddings": 1024,
+ "max_seq_len_cached": 8192,
  "metric": "l2",
  "mixer_hidden": 2048,
  "model_type": "moa_metric",
  "mqa_q_heads": 64,
+ "n_branches": 3,
  "n_token_router_heads": 4,
  "num_hidden_layers": 6,
  "num_layers": 6,
  "origin_init_scale": 0.0,
  "pad_token_id": 0,
  "proj_drop": 0.1,
+ "r_basis": 16,
  "radius_init": 3.0,
  "router_bias_heads": 4,
  "router_dropout": 0.1,
@@ -41,6 +46,7 @@
  "router_temperature": 1.0,
  "router_topk": 2,
  "shared_kv_ratio": 0.5,
+ "theta_base": 10000.0,
  "ti_reg_samples": 0,
  "ti_reg_weight": 0.0,
  "transformers_version": "4.56.1",
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5dfda2c9a5c219e99a7073701d0e0a34d787412ee03163dde932630a05c5e849
- size 300651083
+ oid sha256:c402df542bb46e012d50547bbe8b8d80f580abfce30e6462196f238b9ceeb3c4
+ size 1047209715
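The checkpoint grows from ~301 MB to ~1.05 GB. With `"dtype": "float32"` in config.json (4 bytes per weight, assuming the file is essentially all weight tensors), that is roughly 75M parameters before and 262M after, in line with doubling `dim`: the dense weight matrices grow about 4x while the embeddings only double. A quick back-of-the-envelope check:

```python
# Rough parameter counts from the LFS sizes above, assuming the checkpoint
# is dominated by float32 tensors (4 bytes each).
sizes = {"old (a61b9ff)": 300_651_083, "new (7a75b28)": 1_047_209_715}
for label, nbytes in sizes.items():
    print(f"{label}: ~{nbytes / 4 / 1e6:.0f}M params")
# old (a61b9ff): ~75M params
# new (7a75b28): ~262M params
```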