encoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 6
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: true
  key_query_norm_per_head: true
  is_neox_style: true
  cross_attention_config:
    hidden_size_q: 4096
    hidden_size_kv: 1024
    hidden_size: 4096
    num_attention_heads: 32
    attention_num_kv_heads: 32
    word_window_size: 1
    key_query_norm: true
    key_query_norm_per_head: true
backbone_config:
  vocab_size: 0
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 14336
  max_position_embeddings: 32900
  rope_scaling:
    rope_type: default
  rope_theta: 500000
  mlp_bias: false
  use_cache: true
  sliding_window: null
  transformers_version: null
  key_query_norm: true
  key_query_norm_per_head: true
  is_neox_style: true
decoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 4
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: true
  key_query_norm_per_head: true
  is_neox_style: true
  cross_attn_every_layer: true
  cross_attention_config:
    hidden_size_q: 1024
    hidden_size_kv: 4096
    hidden_size: 1024
    num_attention_heads: 8
    attention_num_kv_heads: 8
    word_window_size: 1
    key_query_norm: true
    key_query_norm_per_head: true
model_type: hierarchical_autoregressive_transformer
transformers_version: 4.46.3
auto_map:
  AutoConfig: config.HATArchitectureConfig
  AutoModelForCausalLM: model.HATForCausalLM
special_token_dict: {}
max_word_size: 100
sliding_window: 768
max_position_embeddings: 262144
torch_dtype: bfloat16
architectures:
  - HATDecoderForCausalLM
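
Because `auto_map` points at custom classes (`config.HATArchitectureConfig` and `model.HATForCausalLM`) rather than built-in `transformers` classes, loading a checkpoint that ships this config requires `trust_remote_code=True`. Below is a minimal sketch of how such a model might be loaded; the repository path `path/to/hat-model` is a hypothetical placeholder, and the snippet assumes the repo contains the `config.py` and `model.py` files referenced by `auto_map` alongside the weights.

```python
# Minimal loading sketch, assuming a hypothetical repo path "path/to/hat-model"
# that ships config.py, model.py, and the weights referenced by auto_map.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Resolves to config.HATArchitectureConfig via the auto_map entry above.
config = AutoConfig.from_pretrained("path/to/hat-model", trust_remote_code=True)

# Resolves to model.HATForCausalLM; torch_dtype matches the config's bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    "path/to/hat-model",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
```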