| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.019536019536019536, | |
| "eval_steps": 500, | |
| "global_step": 32, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 4.13485050201416, | |
| "epoch": 0.0006105006105006105, | |
| "grad_norm": 6.779195785522461, | |
| "learning_rate": 0.0003, | |
| "loss": 6.2653, | |
| "mean_token_accuracy": 0.16929133236408234, | |
| "num_tokens": 1024.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 5.793717861175537, | |
| "epoch": 0.001221001221001221, | |
| "grad_norm": 16.547292709350586, | |
| "learning_rate": 0.00029927770900082954, | |
| "loss": 7.4425, | |
| "mean_token_accuracy": 0.07185039669275284, | |
| "num_tokens": 2048.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 5.173164367675781, | |
| "epoch": 0.0018315018315018315, | |
| "grad_norm": 8.281397819519043, | |
| "learning_rate": 0.00029711779206048454, | |
| "loss": 7.0805, | |
| "mean_token_accuracy": 0.04595404490828514, | |
| "num_tokens": 3072.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 5.790804862976074, | |
| "epoch": 0.002442002442002442, | |
| "grad_norm": 7.232938289642334, | |
| "learning_rate": 0.0002935410503598313, | |
| "loss": 5.9113, | |
| "mean_token_accuracy": 0.09690721333026886, | |
| "num_tokens": 4096.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 5.514730453491211, | |
| "epoch": 0.0030525030525030525, | |
| "grad_norm": 5.750503063201904, | |
| "learning_rate": 0.000288581929876693, | |
| "loss": 5.9462, | |
| "mean_token_accuracy": 0.10728346556425095, | |
| "num_tokens": 5120.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 4.5655198097229, | |
| "epoch": 0.003663003663003663, | |
| "grad_norm": 9.491901397705078, | |
| "learning_rate": 0.0002822881896522532, | |
| "loss": 5.7517, | |
| "mean_token_accuracy": 0.18307086825370789, | |
| "num_tokens": 6144.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 4.92832088470459, | |
| "epoch": 0.004273504273504274, | |
| "grad_norm": 4.776255130767822, | |
| "learning_rate": 0.0002747204418453818, | |
| "loss": 5.6682, | |
| "mean_token_accuracy": 0.17814961075782776, | |
| "num_tokens": 7168.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 6.124536991119385, | |
| "epoch": 0.004884004884004884, | |
| "grad_norm": 11.058452606201172, | |
| "learning_rate": 0.0002659515680044105, | |
| "loss": 4.9755, | |
| "mean_token_accuracy": 0.18397626280784607, | |
| "num_tokens": 8192.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 4.757119655609131, | |
| "epoch": 0.005494505494505495, | |
| "grad_norm": 30.98914909362793, | |
| "learning_rate": 0.00025606601717798207, | |
| "loss": 5.5204, | |
| "mean_token_accuracy": 0.1530286967754364, | |
| "num_tokens": 9216.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 3.9111785888671875, | |
| "epoch": 0.006105006105006105, | |
| "grad_norm": 6.036701202392578, | |
| "learning_rate": 0.0002451589926245468, | |
| "loss": 5.7534, | |
| "mean_token_accuracy": 0.15548455715179443, | |
| "num_tokens": 10240.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 5.500742435455322, | |
| "epoch": 0.006715506715506716, | |
| "grad_norm": 3.4777474403381348, | |
| "learning_rate": 0.0002333355349529403, | |
| "loss": 5.7081, | |
| "mean_token_accuracy": 0.15589743852615356, | |
| "num_tokens": 11264.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 5.70326566696167, | |
| "epoch": 0.007326007326007326, | |
| "grad_norm": 3.508657217025757, | |
| "learning_rate": 0.00022070951052389966, | |
| "loss": 5.0422, | |
| "mean_token_accuracy": 0.18095238506793976, | |
| "num_tokens": 12288.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 4.923552513122559, | |
| "epoch": 0.007936507936507936, | |
| "grad_norm": 3.8143184185028076, | |
| "learning_rate": 0.00020740251485476345, | |
| "loss": 5.1931, | |
| "mean_token_accuracy": 0.169199600815773, | |
| "num_tokens": 13312.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 4.681103706359863, | |
| "epoch": 0.008547008547008548, | |
| "grad_norm": 3.4933412075042725, | |
| "learning_rate": 0.0001935427015881693, | |
| "loss": 5.573, | |
| "mean_token_accuracy": 0.17606329917907715, | |
| "num_tokens": 14336.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 5.161688327789307, | |
| "epoch": 0.009157509157509158, | |
| "grad_norm": 6.412088394165039, | |
| "learning_rate": 0.00017926354830241924, | |
| "loss": 5.2034, | |
| "mean_token_accuracy": 0.20263424515724182, | |
| "num_tokens": 15360.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 5.262051582336426, | |
| "epoch": 0.009768009768009768, | |
| "grad_norm": 3.836217164993286, | |
| "learning_rate": 0.0001647025710494341, | |
| "loss": 4.6478, | |
| "mean_token_accuracy": 0.22834645211696625, | |
| "num_tokens": 16384.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 5.14174747467041, | |
| "epoch": 0.010378510378510378, | |
| "grad_norm": 6.356348037719727, | |
| "learning_rate": 0.00015, | |
| "loss": 5.7193, | |
| "mean_token_accuracy": 0.174869105219841, | |
| "num_tokens": 17408.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 4.554754734039307, | |
| "epoch": 0.01098901098901099, | |
| "grad_norm": 3.5281262397766113, | |
| "learning_rate": 0.0001352974289505659, | |
| "loss": 4.8491, | |
| "mean_token_accuracy": 0.22342519462108612, | |
| "num_tokens": 18432.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 5.017117500305176, | |
| "epoch": 0.0115995115995116, | |
| "grad_norm": 3.1419517993927, | |
| "learning_rate": 0.00012073645169758076, | |
| "loss": 5.3589, | |
| "mean_token_accuracy": 0.20088790357112885, | |
| "num_tokens": 19456.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 4.860474586486816, | |
| "epoch": 0.01221001221001221, | |
| "grad_norm": 3.614912986755371, | |
| "learning_rate": 0.00010645729841183066, | |
| "loss": 5.1, | |
| "mean_token_accuracy": 0.19789473712444305, | |
| "num_tokens": 20480.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 5.405769348144531, | |
| "epoch": 0.01282051282051282, | |
| "grad_norm": 17.46617317199707, | |
| "learning_rate": 9.259748514523653e-05, | |
| "loss": 5.4736, | |
| "mean_token_accuracy": 0.17610710859298706, | |
| "num_tokens": 21504.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 5.062948703765869, | |
| "epoch": 0.013431013431013432, | |
| "grad_norm": 3.2527050971984863, | |
| "learning_rate": 7.929048947610034e-05, | |
| "loss": 4.5604, | |
| "mean_token_accuracy": 0.21810699999332428, | |
| "num_tokens": 22528.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 4.796494483947754, | |
| "epoch": 0.014041514041514042, | |
| "grad_norm": 3.152696132659912, | |
| "learning_rate": 6.66644650470597e-05, | |
| "loss": 4.1272, | |
| "mean_token_accuracy": 0.28236493468284607, | |
| "num_tokens": 23552.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 4.756124496459961, | |
| "epoch": 0.014652014652014652, | |
| "grad_norm": 2.406891345977783, | |
| "learning_rate": 5.4841007375453186e-05, | |
| "loss": 4.5433, | |
| "mean_token_accuracy": 0.24015748500823975, | |
| "num_tokens": 24576.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 4.696354389190674, | |
| "epoch": 0.015262515262515262, | |
| "grad_norm": 3.4845199584960938, | |
| "learning_rate": 4.3933982822017876e-05, | |
| "loss": 4.6104, | |
| "mean_token_accuracy": 0.24015748500823975, | |
| "num_tokens": 25600.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 4.748116493225098, | |
| "epoch": 0.015873015873015872, | |
| "grad_norm": 63.001136779785156, | |
| "learning_rate": 3.404843199558945e-05, | |
| "loss": 4.5619, | |
| "mean_token_accuracy": 0.24924011528491974, | |
| "num_tokens": 26624.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 4.614930152893066, | |
| "epoch": 0.016483516483516484, | |
| "grad_norm": 2.197350025177002, | |
| "learning_rate": 2.5279558154618197e-05, | |
| "loss": 4.7666, | |
| "mean_token_accuracy": 0.24020101130008698, | |
| "num_tokens": 27648.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 4.691383361816406, | |
| "epoch": 0.017094017094017096, | |
| "grad_norm": 2.5826752185821533, | |
| "learning_rate": 1.7711810347746757e-05, | |
| "loss": 4.6907, | |
| "mean_token_accuracy": 0.2278876155614853, | |
| "num_tokens": 28672.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 4.772076606750488, | |
| "epoch": 0.017704517704517704, | |
| "grad_norm": 3.7122533321380615, | |
| "learning_rate": 1.1418070123306989e-05, | |
| "loss": 4.6573, | |
| "mean_token_accuracy": 0.20523138344287872, | |
| "num_tokens": 29696.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 4.471816062927246, | |
| "epoch": 0.018315018315018316, | |
| "grad_norm": 2.352668285369873, | |
| "learning_rate": 6.458949640168675e-06, | |
| "loss": 4.3323, | |
| "mean_token_accuracy": 0.2381889820098877, | |
| "num_tokens": 30720.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 4.810123443603516, | |
| "epoch": 0.018925518925518924, | |
| "grad_norm": 2.075646162033081, | |
| "learning_rate": 2.882207939515435e-06, | |
| "loss": 4.9162, | |
| "mean_token_accuracy": 0.2311507910490036, | |
| "num_tokens": 31744.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 4.847671031951904, | |
| "epoch": 0.019536019536019536, | |
| "grad_norm": 2.874021291732788, | |
| "learning_rate": 7.222909991704773e-07, | |
| "loss": 4.9275, | |
| "mean_token_accuracy": 0.20472441613674164, | |
| "num_tokens": 32768.0, | |
| "step": 32 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 32, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 16, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 137211949350912.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |