{
  "alpha_init": 1.0,
  "architectures": [
    "MoAMetricLM"
  ],
  "attn_drop": 0.1,
  "attn_heads": 32,
  "bos_token_id": 101,
  "conv_kernel": 5,
  "conv_mult": 2,
  "dim": 1024,
  "discrepancy_modulation": true,
  "drop_path": 0.0,
  "dtype": "float32",
  "enable_feature_gates": true,
  "enable_router_gates": true,
  "energy_amplification": 0.1,
  "eos_token_id": 102,
  "ff_mult": 4,
  "ffn_hidden": 2048,
  "head_feature_heads": 8,
  "layer_scale_init_value": 0.0001,
  "learn_alpha": true,
  "learn_radius": true,
  "lr_rank": 32,
  "maha_init": 1.0,
  "max_position_embeddings": 1024,
  "max_seq_len_cached": 8192,
  "metric": "l2",
  "mixer_hidden": 2048,
  "model_type": "moa_metric",
  "mqa_q_heads": 64,
  "n_branches": 3,
  "n_token_router_heads": 4,
  "num_hidden_layers": 6,
  "num_layers": 6,
  "origin_init_scale": 0.0,
  "pad_token_id": 0,
  "proj_drop": 0.1,
  "r_basis": 16,
  "radius_init": 3.0,
  "router_bias_heads": 4,
  "router_dropout": 0.1,
  "router_hidden": 2048,
  "router_init_temperature": 2.0,
  "router_temperature": 1.0,
  "router_topk": 2,
  "shared_kv_ratio": 0.5,
  "theta_base": 10000.0,
  "ti_reg_samples": 0,
  "ti_reg_weight": 0.0,
  "transformers_version": "4.56.1",
  "use_balls": true,
  "vocab_size": 50257
}