{
  "n_layers": 1,
  "d_model": 256,
  "d_mlp": 1024,
  "d_head": 64,
  "n_heads": 4,
  "lr_hidden": 0.002,
  "lr_vector": 0.001,
  "batch_size_per_device": 32,
  "batches_per_step": 1,
  "seed": 1297,
  "save_checkpoints": true,
  "debug": false,
  "debug_batch": false,
  "normalization": "LN",
  "max_tokens": 10000000000,
  "version": 427,
  "use_bfloat16_matmul": true,
  "n_ctx": 1024,
  "d_vocab": 48262,
  "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits",
  "betas": [
    0.9,
    0.99
  ],
  "weight_decay": 0.05,
  "dataset_name": "c4",
  "grad_norm_clip": 1.0,
  "n_devices": 8,
  "act_fn": "solu_ln",
  "shortformer_pos": false,
  "attn_only": false,
  "ln_eps": 1e-05,
  "lr_schedule": "cosine_warmup",
  "warmup_tokens": 300000000,
  "train_loss_ewma_beta": 0.99,
  "truncate_tokens": 1000000000000,
  "log_interval": 50,
  "initializer_scale_global": 1.0,
  "initializer_scale_hidden": 0.02,
  "initializer_scale_embed": 0.1,
  "initializer_scale_unembed": 0.02,
  "neuron_scale": 1.0,
  "neuron_temp": 1.0,
  "use_acc": false,
  "weight_init_scheme": "gpt2",
  "fixed_init": "",
  "store_init": false,
  "control": 1.0,
  "tokens_per_step": 262144,
  "batch_size": 256,
  "max_steps": 38146,
  "warmup_steps": 1144,
  "n_params": 786432
}