{ "alpha_init": 1.0, "architectures": [ "MoAMetricLM" ], "attn_drop": 0.1, "attn_heads": 32, "bos_token_id": 101, "conv_kernel": 5, "conv_mult": 2, "dim": 1024, "discrepancy_modulation": true, "drop_path": 0.0, "dtype": "float32", "enable_feature_gates": true, "enable_router_gates": true, "energy_amplification": 0.1, "eos_token_id": 102, "ff_mult": 4, "ffn_hidden": 2048, "head_feature_heads": 8, "layer_scale_init_value": 0.0001, "learn_alpha": true, "learn_radius": true, "lr_rank": 32, "maha_init": 1.0, "max_position_embeddings": 1024, "max_seq_len_cached": 8192, "metric": "l2", "mixer_hidden": 2048, "model_type": "moa_metric", "mqa_q_heads": 64, "n_branches": 3, "n_token_router_heads": 4, "num_hidden_layers": 6, "num_layers": 6, "origin_init_scale": 0.0, "pad_token_id": 0, "proj_drop": 0.1, "r_basis": 16, "radius_init": 3.0, "router_bias_heads": 4, "router_dropout": 0.1, "router_hidden": 2048, "router_init_temperature": 2.0, "router_temperature": 1.0, "router_topk": 2, "shared_kv_ratio": 0.5, "theta_base": 10000.0, "ti_reg_samples": 0, "ti_reg_weight": 0.0, "transformers_version": "4.56.1", "use_balls": true, "vocab_size": 50257 }