{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3262875378131866, "epoch": 0.19047619047619047, "grad_norm": 0.4341161847114563, "learning_rate": 0.0, "loss": 2.5209, "mean_token_accuracy": 0.5272354185581207, "num_tokens": 3907.0, "step": 1 }, { "entropy": 1.4145832359790802, "epoch": 0.38095238095238093, "grad_norm": 0.39086446166038513, "learning_rate": 0.0001, "loss": 2.5619, "mean_token_accuracy": 0.5107117593288422, "num_tokens": 7798.0, "step": 2 }, { "entropy": 1.3431878983974457, "epoch": 0.5714285714285714, "grad_norm": 0.4263755679130554, "learning_rate": 0.0002, "loss": 2.476, "mean_token_accuracy": 0.5177250653505325, "num_tokens": 11669.0, "step": 3 }, { "entropy": 1.5617998540401459, "epoch": 0.7619047619047619, "grad_norm": 0.3882932662963867, "learning_rate": 0.00019986800724660115, "loss": 2.4395, "mean_token_accuracy": 0.509832575917244, "num_tokens": 15400.0, "step": 4 }, { "entropy": 1.5141464471817017, "epoch": 0.9523809523809523, "grad_norm": 0.37078166007995605, "learning_rate": 0.0001994724161438924, "loss": 2.0727, "mean_token_accuracy": 0.5462771654129028, "num_tokens": 19278.0, "step": 5 }, { "entropy": 1.545729160308838, "epoch": 1.0, "grad_norm": 0.4754142165184021, "learning_rate": 0.00019881438702873738, "loss": 1.8424, "mean_token_accuracy": 0.5563093423843384, "num_tokens": 20018.0, "step": 6 }, { "entropy": 1.6577311158180237, "epoch": 1.1904761904761905, "grad_norm": 0.8796502947807312, "learning_rate": 0.0001978958500139078, "loss": 1.7406, "mean_token_accuracy": 0.6021987497806549, "num_tokens": 23850.0, "step": 7 }, { "entropy": 1.6507906019687653, "epoch": 1.380952380952381, "grad_norm": 0.4536665081977844, "learning_rate": 0.00019671949932673007, "loss": 1.6202, "mean_token_accuracy": 0.6040006428956985, "num_tokens": 27559.0, "step": 8 }, { "entropy": 1.4640164971351624, "epoch": 1.5714285714285714, "grad_norm": 0.5235773324966431, "learning_rate": 0.00019528878540645223, "loss": 1.438, "mean_token_accuracy": 0.6390907764434814, "num_tokens": 31555.0, "step": 9 }, { "entropy": 1.375329077243805, "epoch": 1.7619047619047619, "grad_norm": 0.5303475260734558, "learning_rate": 0.00019360790478351126, "loss": 1.3535, "mean_token_accuracy": 0.6558270305395126, "num_tokens": 35542.0, "step": 10 }, { "entropy": 1.260793536901474, "epoch": 1.9523809523809523, "grad_norm": 0.24268986284732819, "learning_rate": 0.00019168178777038614, "loss": 1.3216, "mean_token_accuracy": 0.6668620556592941, "num_tokens": 39306.0, "step": 11 }, { "entropy": 1.1681206226348877, "epoch": 2.0, "grad_norm": 0.4024941325187683, "learning_rate": 0.00018951608400014208, "loss": 1.223, "mean_token_accuracy": 0.6863824129104614, "num_tokens": 40036.0, "step": 12 }, { "entropy": 1.1119624972343445, "epoch": 2.1904761904761907, "grad_norm": 0.23261502385139465, "learning_rate": 0.00018711714585508305, "loss": 1.1513, "mean_token_accuracy": 0.7163149267435074, "num_tokens": 43934.0, "step": 13 }, { "entropy": 1.0272830575704575, "epoch": 2.380952380952381, "grad_norm": 0.2817513048648834, "learning_rate": 0.00018449200983412017, "loss": 1.0321, "mean_token_accuracy": 0.7490329742431641, "num_tokens": 47807.0, "step": 14 }, { "entropy": 1.0256501138210297, "epoch": 2.571428571428571, "grad_norm": 0.42125505208969116, "learning_rate": 0.00018164837591350794, "loss": 1.0104, "mean_token_accuracy": 0.7377028465270996, "num_tokens": 51668.0, "step": 15 }, { "entropy": 0.9243793785572052, "epoch": 2.761904761904762, "grad_norm": 0.2722397446632385, "learning_rate": 0.00017859458496148728, "loss": 0.9228, "mean_token_accuracy": 0.7580588161945343, "num_tokens": 55552.0, "step": 16 }, { "entropy": 0.8618854880332947, "epoch": 2.9523809523809526, "grad_norm": 0.2660452723503113, "learning_rate": 0.00017533959427308178, "loss": 0.9357, "mean_token_accuracy": 0.7539641112089157, "num_tokens": 59348.0, "step": 17 }, { "entropy": 0.8472050428390503, "epoch": 3.0, "grad_norm": 0.507022500038147, "learning_rate": 0.0001718929512968081, "loss": 0.973, "mean_token_accuracy": 0.7524893283843994, "num_tokens": 60054.0, "step": 18 }, { "entropy": 0.6867058873176575, "epoch": 3.1904761904761907, "grad_norm": 0.22445128858089447, "learning_rate": 0.0001682647656303645, "loss": 0.6976, "mean_token_accuracy": 0.8361103683710098, "num_tokens": 63922.0, "step": 19 }, { "entropy": 0.657625138759613, "epoch": 3.380952380952381, "grad_norm": 0.21392887830734253, "learning_rate": 0.00016446567936743888, "loss": 0.6439, "mean_token_accuracy": 0.8438704013824463, "num_tokens": 67884.0, "step": 20 }, { "entropy": 0.645979255437851, "epoch": 3.571428571428571, "grad_norm": 0.2413715124130249, "learning_rate": 0.00016050683588261444, "loss": 0.6449, "mean_token_accuracy": 0.8318868428468704, "num_tokens": 71798.0, "step": 21 }, { "entropy": 0.6242164075374603, "epoch": 3.761904761904762, "grad_norm": 0.2577076852321625, "learning_rate": 0.00015639984714593195, "loss": 0.6241, "mean_token_accuracy": 0.8422446697950363, "num_tokens": 75561.0, "step": 22 }, { "entropy": 0.5975085943937302, "epoch": 3.9523809523809526, "grad_norm": 0.28406909108161926, "learning_rate": 0.00015215675966298112, "loss": 0.5852, "mean_token_accuracy": 0.8513532727956772, "num_tokens": 79338.0, "step": 23 }, { "entropy": 0.5571812391281128, "epoch": 4.0, "grad_norm": 0.4610949456691742, "learning_rate": 0.00014779001914042383, "loss": 0.4958, "mean_token_accuracy": 0.8864569067955017, "num_tokens": 80072.0, "step": 24 }, { "entropy": 0.4985816180706024, "epoch": 4.190476190476191, "grad_norm": 0.1998254358768463, "learning_rate": 0.0001433124339805923, "loss": 0.4005, "mean_token_accuracy": 0.9142640978097916, "num_tokens": 83945.0, "step": 25 }, { "entropy": 0.45187360793352127, "epoch": 4.380952380952381, "grad_norm": 0.21039558947086334, "learning_rate": 0.0001387371377122382, "loss": 0.3665, "mean_token_accuracy": 0.9175333380699158, "num_tokens": 87881.0, "step": 26 }, { "entropy": 0.4067419543862343, "epoch": 4.571428571428571, "grad_norm": 0.18988683819770813, "learning_rate": 0.00013407755046762992, "loss": 0.3503, "mean_token_accuracy": 0.9209371358156204, "num_tokens": 91719.0, "step": 27 }, { "entropy": 0.3633280023932457, "epoch": 4.761904761904762, "grad_norm": 0.19534839689731598, "learning_rate": 0.0001293473396189922, "loss": 0.3161, "mean_token_accuracy": 0.9321369230747223, "num_tokens": 95652.0, "step": 28 }, { "entropy": 0.33942626416683197, "epoch": 4.9523809523809526, "grad_norm": 0.19549953937530518, "learning_rate": 0.00012456037968974884, "loss": 0.313, "mean_token_accuracy": 0.9316133558750153, "num_tokens": 99430.0, "step": 29 }, { "entropy": 0.3430580794811249, "epoch": 5.0, "grad_norm": 0.5042015910148621, "learning_rate": 0.00011973071165815478, "loss": 0.349, "mean_token_accuracy": 0.922374427318573, "num_tokens": 100090.0, "step": 30 }, { "entropy": 0.2580493614077568, "epoch": 5.190476190476191, "grad_norm": 0.16134287416934967, "learning_rate": 0.0001148725017726876, "loss": 0.2413, "mean_token_accuracy": 0.9544335603713989, "num_tokens": 103915.0, "step": 31 }, { "entropy": 0.23024286702275276, "epoch": 5.380952380952381, "grad_norm": 0.16694019734859467, "learning_rate": 0.00011000000000000002, "loss": 0.2109, "mean_token_accuracy": 0.9610596597194672, "num_tokens": 107713.0, "step": 32 }, { "entropy": 0.21569984406232834, "epoch": 5.571428571428571, "grad_norm": 0.17621855437755585, "learning_rate": 0.00010512749822731243, "loss": 0.2094, "mean_token_accuracy": 0.9631908386945724, "num_tokens": 111585.0, "step": 33 }, { "entropy": 0.21424956247210503, "epoch": 5.761904761904762, "grad_norm": 0.19082747399806976, "learning_rate": 0.00010026928834184526, "loss": 0.2124, "mean_token_accuracy": 0.9592168480157852, "num_tokens": 115386.0, "step": 34 }, { "entropy": 0.1874239295721054, "epoch": 5.9523809523809526, "grad_norm": 0.16567201912403107, "learning_rate": 9.543962031025118e-05, "loss": 0.1744, "mean_token_accuracy": 0.9665667712688446, "num_tokens": 119385.0, "step": 35 }, { "entropy": 0.22158779203891754, "epoch": 6.0, "grad_norm": 0.3941231369972229, "learning_rate": 9.065266038100784e-05, "loss": 0.2016, "mean_token_accuracy": 0.9597222208976746, "num_tokens": 120108.0, "step": 36 }, { "entropy": 0.1528039053082466, "epoch": 6.190476190476191, "grad_norm": 0.11817783862352371, "learning_rate": 8.592244953237014e-05, "loss": 0.1497, "mean_token_accuracy": 0.9792557656764984, "num_tokens": 124031.0, "step": 37 }, { "entropy": 0.1606922298669815, "epoch": 6.380952380952381, "grad_norm": 0.1441129893064499, "learning_rate": 8.126286228776183e-05, "loss": 0.1605, "mean_token_accuracy": 0.9716966897249222, "num_tokens": 127794.0, "step": 38 }, { "entropy": 0.13986552879214287, "epoch": 6.571428571428571, "grad_norm": 0.11448748409748077, "learning_rate": 7.66875660194077e-05, "loss": 0.1379, "mean_token_accuracy": 0.9764093905687332, "num_tokens": 131650.0, "step": 39 }, { "entropy": 0.13382179662585258, "epoch": 6.761904761904762, "grad_norm": 0.1236938089132309, "learning_rate": 7.22099808595762e-05, "loss": 0.1341, "mean_token_accuracy": 0.9794343560934067, "num_tokens": 135652.0, "step": 40 }, { "entropy": 0.15034474432468414, "epoch": 6.9523809523809526, "grad_norm": 0.19176296889781952, "learning_rate": 6.78432403370189e-05, "loss": 0.152, "mean_token_accuracy": 0.9750390499830246, "num_tokens": 139360.0, "step": 41 }, { "entropy": 0.1320204734802246, "epoch": 7.0, "grad_norm": 0.2667448818683624, "learning_rate": 6.360015285406804e-05, "loss": 0.1336, "mean_token_accuracy": 0.9764088988304138, "num_tokens": 140126.0, "step": 42 }, { "entropy": 0.1247200220823288, "epoch": 7.190476190476191, "grad_norm": 0.11455179005861282, "learning_rate": 5.94931641173856e-05, "loss": 0.1251, "mean_token_accuracy": 0.9828500151634216, "num_tokens": 144047.0, "step": 43 }, { "entropy": 0.12862810119986534, "epoch": 7.380952380952381, "grad_norm": 0.0989384576678276, "learning_rate": 5.5534320632561165e-05, "loss": 0.1256, "mean_token_accuracy": 0.9814009815454483, "num_tokens": 147708.0, "step": 44 }, { "entropy": 0.11727739311754704, "epoch": 7.571428571428571, "grad_norm": 0.12695352733135223, "learning_rate": 5.173523436963552e-05, "loss": 0.1143, "mean_token_accuracy": 0.9824336171150208, "num_tokens": 151651.0, "step": 45 }, { "entropy": 0.11429525911808014, "epoch": 7.761904761904762, "grad_norm": 0.12735556066036224, "learning_rate": 4.8107048703191896e-05, "loss": 0.1092, "mean_token_accuracy": 0.9818378686904907, "num_tokens": 155633.0, "step": 46 }, { "entropy": 0.12080883048474789, "epoch": 7.9523809523809526, "grad_norm": 0.11843378096818924, "learning_rate": 4.4660405726918245e-05, "loss": 0.1111, "mean_token_accuracy": 0.9824976027011871, "num_tokens": 159459.0, "step": 47 }, { "entropy": 0.13137711584568024, "epoch": 8.0, "grad_norm": 0.29663848876953125, "learning_rate": 4.140541503851273e-05, "loss": 0.1371, "mean_token_accuracy": 0.9780058860778809, "num_tokens": 160144.0, "step": 48 }, { "entropy": 0.11721896752715111, "epoch": 8.19047619047619, "grad_norm": 0.11241921037435532, "learning_rate": 3.8351624086492065e-05, "loss": 0.1074, "mean_token_accuracy": 0.9836096614599228, "num_tokens": 164003.0, "step": 49 }, { "entropy": 0.11027799919247627, "epoch": 8.380952380952381, "grad_norm": 0.11503297835588455, "learning_rate": 3.550799016587986e-05, "loss": 0.1007, "mean_token_accuracy": 0.9842301160097122, "num_tokens": 167950.0, "step": 50 }, { "entropy": 0.11789926327764988, "epoch": 8.571428571428571, "grad_norm": 0.0783538818359375, "learning_rate": 3.288285414491699e-05, "loss": 0.1085, "mean_token_accuracy": 0.9823660999536514, "num_tokens": 171709.0, "step": 51 }, { "entropy": 0.11202635429799557, "epoch": 8.761904761904763, "grad_norm": 0.07245591282844543, "learning_rate": 3.0483915999857948e-05, "loss": 0.1007, "mean_token_accuracy": 0.9845083355903625, "num_tokens": 175602.0, "step": 52 }, { "entropy": 0.1177611444145441, "epoch": 8.952380952380953, "grad_norm": 0.08537604659795761, "learning_rate": 2.8318212229613883e-05, "loss": 0.1035, "mean_token_accuracy": 0.9821241199970245, "num_tokens": 179385.0, "step": 53 }, { "entropy": 0.10294436663389206, "epoch": 9.0, "grad_norm": 0.13060350716114044, "learning_rate": 2.6392095216488778e-05, "loss": 0.0921, "mean_token_accuracy": 0.9844961166381836, "num_tokens": 180162.0, "step": 54 }, { "entropy": 0.11118071712553501, "epoch": 9.19047619047619, "grad_norm": 0.09073005616664886, "learning_rate": 2.471121459354779e-05, "loss": 0.1011, "mean_token_accuracy": 0.9844587743282318, "num_tokens": 183907.0, "step": 55 }, { "entropy": 0.10471968166530132, "epoch": 9.380952380952381, "grad_norm": 0.0956854298710823, "learning_rate": 2.328050067326994e-05, "loss": 0.0964, "mean_token_accuracy": 0.9840410053730011, "num_tokens": 187815.0, "step": 56 }, { "entropy": 0.10486606322228909, "epoch": 9.571428571428571, "grad_norm": 0.06581596285104752, "learning_rate": 2.2104149986092204e-05, "loss": 0.0968, "mean_token_accuracy": 0.9840281754732132, "num_tokens": 191715.0, "step": 57 }, { "entropy": 0.10280009731650352, "epoch": 9.761904761904763, "grad_norm": 0.06510630995035172, "learning_rate": 2.118561297126265e-05, "loss": 0.091, "mean_token_accuracy": 0.9846882820129395, "num_tokens": 195649.0, "step": 58 }, { "entropy": 0.10552813299000263, "epoch": 9.952380952380953, "grad_norm": 0.07446446269750595, "learning_rate": 2.0527583856107638e-05, "loss": 0.0983, "mean_token_accuracy": 0.9825079441070557, "num_tokens": 199450.0, "step": 59 }, { "entropy": 0.10964296013116837, "epoch": 10.0, "grad_norm": 0.09576558321714401, "learning_rate": 2.013199275339886e-05, "loss": 0.0918, "mean_token_accuracy": 0.9834938049316406, "num_tokens": 200180.0, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.44086860888277e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }