{
"alpha_init": 1.0,
"architectures": [
"MoAMetricLM"
],
"attn_drop": 0.1,
"attn_heads": 32,
"bos_token_id": 101,
"conv_kernel": 5,
"conv_mult": 2,
"dim": 1024,
"discrepancy_modulation": true,
"drop_path": 0.0,
"dtype": "float32",
"enable_feature_gates": true,
"enable_router_gates": true,
"energy_amplification": 0.1,
"eos_token_id": 102,
"ff_mult": 4,
"ffn_hidden": 2048,
"head_feature_heads": 8,
"layer_scale_init_value": 0.0001,
"learn_alpha": true,
"learn_radius": true,
"lr_rank": 32,
"maha_init": 1.0,
"max_position_embeddings": 1024,
"max_seq_len_cached": 8192,
"metric": "l2",
"mixer_hidden": 2048,
"model_type": "moa_metric",
"mqa_q_heads": 64,
"n_branches": 3,
"n_token_router_heads": 4,
"num_hidden_layers": 6,
"num_layers": 6,
"origin_init_scale": 0.0,
"pad_token_id": 0,
"proj_drop": 0.1,
"r_basis": 16,
"radius_init": 3.0,
"router_bias_heads": 4,
"router_dropout": 0.1,
"router_hidden": 2048,
"router_init_temperature": 2.0,
"router_temperature": 1.0,
"router_topk": 2,
"shared_kv_ratio": 0.5,
"theta_base": 10000.0,
"ti_reg_samples": 0,
"ti_reg_weight": 0.0,
"transformers_version": "4.56.1",
"use_balls": true,
"vocab_size": 50257
}