encoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 6
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: true
  key_query_norm_per_head: true
  is_neox_style: true
  cross_attention_config:
    hidden_size_q: 4096
    hidden_size_kv: 1024
    hidden_size: 4096
    num_attention_heads: 32
    attention_num_kv_heads: 32
    word_window_size: 1
    key_query_norm: true
    key_query_norm_per_head: true
backbone_config:
  vocab_size: 0
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 14336
  max_position_embeddings: 32900
  rope_scaling:
    rope_type: default
  rope_theta: 500000
  mlp_bias: false
  use_cache: true
  sliding_window: null
  transformers_version: null
  key_query_norm: true
  key_query_norm_per_head: true
  is_neox_style: true
decoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 4
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: true
  key_query_norm_per_head: true
  is_neox_style: true
  cross_attn_every_layer: true
  cross_attention_config:
    hidden_size_q: 1024
    hidden_size_kv: 4096
    hidden_size: 1024
    num_attention_heads: 8
    attention_num_kv_heads: 8
    word_window_size: 1
    key_query_norm: true
    key_query_norm_per_head: true
model_type: hierarchical_autoregressive_transformer
transformers_version: 4.46.3
auto_map:
  AutoConfig: config.HATArchitectureConfig
  AutoModelForCausalLM: model.HATForCausalLM
special_token_dict: {}
max_word_size: 100
sliding_window: 768
max_position_embeddings: 262144
torch_dtype: bfloat16
architectures:
  - HATDecoderForCausalLM