{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.019536019536019536, "eval_steps": 500, "global_step": 32, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.13485050201416, "epoch": 0.0006105006105006105, "grad_norm": 6.779195785522461, "learning_rate": 0.0003, "loss": 6.2653, "mean_token_accuracy": 0.16929133236408234, "num_tokens": 1024.0, "step": 1 }, { "entropy": 5.793717861175537, "epoch": 0.001221001221001221, "grad_norm": 16.547292709350586, "learning_rate": 0.00029927770900082954, "loss": 7.4425, "mean_token_accuracy": 0.07185039669275284, "num_tokens": 2048.0, "step": 2 }, { "entropy": 5.173164367675781, "epoch": 0.0018315018315018315, "grad_norm": 8.281397819519043, "learning_rate": 0.00029711779206048454, "loss": 7.0805, "mean_token_accuracy": 0.04595404490828514, "num_tokens": 3072.0, "step": 3 }, { "entropy": 5.790804862976074, "epoch": 0.002442002442002442, "grad_norm": 7.232938289642334, "learning_rate": 0.0002935410503598313, "loss": 5.9113, "mean_token_accuracy": 0.09690721333026886, "num_tokens": 4096.0, "step": 4 }, { "entropy": 5.514730453491211, "epoch": 0.0030525030525030525, "grad_norm": 5.750503063201904, "learning_rate": 0.000288581929876693, "loss": 5.9462, "mean_token_accuracy": 0.10728346556425095, "num_tokens": 5120.0, "step": 5 }, { "entropy": 4.5655198097229, "epoch": 0.003663003663003663, "grad_norm": 9.491901397705078, "learning_rate": 0.0002822881896522532, "loss": 5.7517, "mean_token_accuracy": 0.18307086825370789, "num_tokens": 6144.0, "step": 6 }, { "entropy": 4.92832088470459, "epoch": 0.004273504273504274, "grad_norm": 4.776255130767822, "learning_rate": 0.0002747204418453818, "loss": 5.6682, "mean_token_accuracy": 0.17814961075782776, "num_tokens": 7168.0, "step": 7 }, { "entropy": 6.124536991119385, "epoch": 0.004884004884004884, "grad_norm": 11.058452606201172, "learning_rate": 0.0002659515680044105, "loss": 4.9755, "mean_token_accuracy": 0.18397626280784607, "num_tokens": 8192.0, "step": 8 }, { "entropy": 4.757119655609131, "epoch": 0.005494505494505495, "grad_norm": 30.98914909362793, "learning_rate": 0.00025606601717798207, "loss": 5.5204, "mean_token_accuracy": 0.1530286967754364, "num_tokens": 9216.0, "step": 9 }, { "entropy": 3.9111785888671875, "epoch": 0.006105006105006105, "grad_norm": 6.036701202392578, "learning_rate": 0.0002451589926245468, "loss": 5.7534, "mean_token_accuracy": 0.15548455715179443, "num_tokens": 10240.0, "step": 10 }, { "entropy": 5.500742435455322, "epoch": 0.006715506715506716, "grad_norm": 3.4777474403381348, "learning_rate": 0.0002333355349529403, "loss": 5.7081, "mean_token_accuracy": 0.15589743852615356, "num_tokens": 11264.0, "step": 11 }, { "entropy": 5.70326566696167, "epoch": 0.007326007326007326, "grad_norm": 3.508657217025757, "learning_rate": 0.00022070951052389966, "loss": 5.0422, "mean_token_accuracy": 0.18095238506793976, "num_tokens": 12288.0, "step": 12 }, { "entropy": 4.923552513122559, "epoch": 0.007936507936507936, "grad_norm": 3.8143184185028076, "learning_rate": 0.00020740251485476345, "loss": 5.1931, "mean_token_accuracy": 0.169199600815773, "num_tokens": 13312.0, "step": 13 }, { "entropy": 4.681103706359863, "epoch": 0.008547008547008548, "grad_norm": 3.4933412075042725, "learning_rate": 0.0001935427015881693, "loss": 5.573, "mean_token_accuracy": 0.17606329917907715, "num_tokens": 14336.0, "step": 14 }, { "entropy": 5.161688327789307, "epoch": 0.009157509157509158, "grad_norm": 6.412088394165039, "learning_rate": 0.00017926354830241924, "loss": 5.2034, "mean_token_accuracy": 0.20263424515724182, "num_tokens": 15360.0, "step": 15 }, { "entropy": 5.262051582336426, "epoch": 0.009768009768009768, "grad_norm": 3.836217164993286, "learning_rate": 0.0001647025710494341, "loss": 4.6478, "mean_token_accuracy": 0.22834645211696625, "num_tokens": 16384.0, "step": 16 }, { "entropy": 5.14174747467041, "epoch": 0.010378510378510378, "grad_norm": 6.356348037719727, "learning_rate": 0.00015, "loss": 5.7193, "mean_token_accuracy": 0.174869105219841, "num_tokens": 17408.0, "step": 17 }, { "entropy": 4.554754734039307, "epoch": 0.01098901098901099, "grad_norm": 3.5281262397766113, "learning_rate": 0.0001352974289505659, "loss": 4.8491, "mean_token_accuracy": 0.22342519462108612, "num_tokens": 18432.0, "step": 18 }, { "entropy": 5.017117500305176, "epoch": 0.0115995115995116, "grad_norm": 3.1419517993927, "learning_rate": 0.00012073645169758076, "loss": 5.3589, "mean_token_accuracy": 0.20088790357112885, "num_tokens": 19456.0, "step": 19 }, { "entropy": 4.860474586486816, "epoch": 0.01221001221001221, "grad_norm": 3.614912986755371, "learning_rate": 0.00010645729841183066, "loss": 5.1, "mean_token_accuracy": 0.19789473712444305, "num_tokens": 20480.0, "step": 20 }, { "entropy": 5.405769348144531, "epoch": 0.01282051282051282, "grad_norm": 17.46617317199707, "learning_rate": 9.259748514523653e-05, "loss": 5.4736, "mean_token_accuracy": 0.17610710859298706, "num_tokens": 21504.0, "step": 21 }, { "entropy": 5.062948703765869, "epoch": 0.013431013431013432, "grad_norm": 3.2527050971984863, "learning_rate": 7.929048947610034e-05, "loss": 4.5604, "mean_token_accuracy": 0.21810699999332428, "num_tokens": 22528.0, "step": 22 }, { "entropy": 4.796494483947754, "epoch": 0.014041514041514042, "grad_norm": 3.152696132659912, "learning_rate": 6.66644650470597e-05, "loss": 4.1272, "mean_token_accuracy": 0.28236493468284607, "num_tokens": 23552.0, "step": 23 }, { "entropy": 4.756124496459961, "epoch": 0.014652014652014652, "grad_norm": 2.406891345977783, "learning_rate": 5.4841007375453186e-05, "loss": 4.5433, "mean_token_accuracy": 0.24015748500823975, "num_tokens": 24576.0, "step": 24 }, { "entropy": 4.696354389190674, "epoch": 0.015262515262515262, "grad_norm": 3.4845199584960938, "learning_rate": 4.3933982822017876e-05, "loss": 4.6104, "mean_token_accuracy": 0.24015748500823975, "num_tokens": 25600.0, "step": 25 }, { "entropy": 4.748116493225098, "epoch": 0.015873015873015872, "grad_norm": 63.001136779785156, "learning_rate": 3.404843199558945e-05, "loss": 4.5619, "mean_token_accuracy": 0.24924011528491974, "num_tokens": 26624.0, "step": 26 }, { "entropy": 4.614930152893066, "epoch": 0.016483516483516484, "grad_norm": 2.197350025177002, "learning_rate": 2.5279558154618197e-05, "loss": 4.7666, "mean_token_accuracy": 0.24020101130008698, "num_tokens": 27648.0, "step": 27 }, { "entropy": 4.691383361816406, "epoch": 0.017094017094017096, "grad_norm": 2.5826752185821533, "learning_rate": 1.7711810347746757e-05, "loss": 4.6907, "mean_token_accuracy": 0.2278876155614853, "num_tokens": 28672.0, "step": 28 }, { "entropy": 4.772076606750488, "epoch": 0.017704517704517704, "grad_norm": 3.7122533321380615, "learning_rate": 1.1418070123306989e-05, "loss": 4.6573, "mean_token_accuracy": 0.20523138344287872, "num_tokens": 29696.0, "step": 29 }, { "entropy": 4.471816062927246, "epoch": 0.018315018315018316, "grad_norm": 2.352668285369873, "learning_rate": 6.458949640168675e-06, "loss": 4.3323, "mean_token_accuracy": 0.2381889820098877, "num_tokens": 30720.0, "step": 30 }, { "entropy": 4.810123443603516, "epoch": 0.018925518925518924, "grad_norm": 2.075646162033081, "learning_rate": 2.882207939515435e-06, "loss": 4.9162, "mean_token_accuracy": 0.2311507910490036, "num_tokens": 31744.0, "step": 31 }, { "entropy": 4.847671031951904, "epoch": 0.019536019536019536, "grad_norm": 2.874021291732788, "learning_rate": 7.222909991704773e-07, "loss": 4.9275, "mean_token_accuracy": 0.20472441613674164, "num_tokens": 32768.0, "step": 32 } ], "logging_steps": 1, "max_steps": 32, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 137211949350912.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }