| { | |
| "attn_dropout_p": 0.0, | |
| "d_model": 832, | |
| "ff_dim": 2048, | |
| "ffn_dropout_p": 0.2, | |
| "learn_te": true, | |
| "n_heads": 16, | |
| "n_layers": 12, | |
| "resid_dropout_p": 0.2, | |
| "s1_bits": 10, | |
| "s2_bits": 10, | |
| "token_dropout_p": 0.0 | |
| } |
| { | |
| "attn_dropout_p": 0.0, | |
| "d_model": 832, | |
| "ff_dim": 2048, | |
| "ffn_dropout_p": 0.2, | |
| "learn_te": true, | |
| "n_heads": 16, | |
| "n_layers": 12, | |
| "resid_dropout_p": 0.2, | |
| "s1_bits": 10, | |
| "s2_bits": 10, | |
| "token_dropout_p": 0.0 | |
| } |