Upload MoAMetricLM

Files changed:
- README.md +9 -9
- config.json +7 -1
- pytorch_model.bin +2 -2
README.md
CHANGED
@@ -1,19 +1,19 @@
 ---
 language:
-
+- en
 license: apache-2.0
 library_name: transformers
 pipeline_tag: text-generation
 tags:
-
-
-
-
-
-
+- mixture-of-attentions
+- distance-attention
+- metric-attention
+- mqa
+- hyperffn
+- router-gating
 datasets:
-
-
+- nvidia/Nemotron-Math-HumanReasoning
+- WeMake/Intelligent-Content-Understanding
 ---
 
 # MoAMetricLM‑100M — Mixture of Attentions (MoA)
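The new metadata declares `pipeline_tag: text-generation` and `library_name: transformers`, so the card advertises loading through the standard `pipeline` API. A minimal sketch consistent with that metadata, assuming the repo ships custom modeling code for the `moa_metric` architecture; the repo id below is a placeholder, not confirmed by this commit:

```python
from transformers import pipeline

# "moa_metric" is not a built-in transformers architecture, so the
# repo's own modeling code must be loaded with trust_remote_code.
generator = pipeline(
    "text-generation",
    model="your-namespace/MoAMetricLM-100M",  # placeholder repo id
    trust_remote_code=True,
)

print(generator("Attention mechanisms", max_new_tokens=32)[0]["generated_text"])
```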
config.json
CHANGED
@@ -8,11 +8,13 @@
   "bos_token_id": 101,
   "conv_kernel": 5,
   "conv_mult": 2,
-  "dim":
+  "dim": 1024,
+  "discrepancy_modulation": true,
   "drop_path": 0.0,
   "dtype": "float32",
   "enable_feature_gates": true,
   "enable_router_gates": true,
+  "energy_amplification": 0.1,
   "eos_token_id": 102,
   "ff_mult": 4,
   "ffn_hidden": 2048,
@@ -23,16 +25,19 @@
   "lr_rank": 32,
   "maha_init": 1.0,
   "max_position_embeddings": 1024,
+  "max_seq_len_cached": 8192,
   "metric": "l2",
   "mixer_hidden": 2048,
   "model_type": "moa_metric",
   "mqa_q_heads": 64,
+  "n_branches": 3,
   "n_token_router_heads": 4,
   "num_hidden_layers": 6,
   "num_layers": 6,
   "origin_init_scale": 0.0,
   "pad_token_id": 0,
   "proj_drop": 0.1,
+  "r_basis": 16,
   "radius_init": 3.0,
   "router_bias_heads": 4,
   "router_dropout": 0.1,
@@ -41,6 +46,7 @@
   "router_temperature": 1.0,
   "router_topk": 2,
   "shared_kv_ratio": 0.5,
+  "theta_base": 10000.0,
   "ti_reg_samples": 0,
   "ti_reg_weight": 0.0,
   "transformers_version": "4.56.1",
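Among the added keys, `theta_base: 10000.0` and `max_seq_len_cached: 8192` read like the frequency base and precomputed cache length of a rotary position embedding. The commit does not include the modeling code, so the following is only a generic sketch of what such keys typically parameterize, using `dim: 1024` from the same config:

```python
import torch

def build_rope_cache(dim: int, max_seq_len: int, theta_base: float = 10000.0):
    """Precompute cos/sin tables for rotary position embeddings.

    Generic sketch of what "theta_base" and "max_seq_len_cached" usually
    control; MoAMetricLM's actual implementation may differ.
    """
    # Per-channel-pair inverse frequencies: theta_base ** (-2i / dim).
    inv_freq = 1.0 / (theta_base ** (torch.arange(0, dim, 2).float() / dim))
    positions = torch.arange(max_seq_len).float()
    angles = torch.outer(positions, inv_freq)  # (max_seq_len, dim // 2)
    return torch.cos(angles), torch.sin(angles)

# Values taken from the updated config.json.
cos, sin = build_rope_cache(dim=1024, max_seq_len=8192, theta_base=10000.0)
print(cos.shape)  # torch.Size([8192, 512])
```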
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c402df542bb46e012d50547bbe8b8d80f580abfce30e6462196f238b9ceeb3c4
+size 1047209715