{ "best_metric": 0.9445667862892151, "best_model_checkpoint": "./fine_tuned_models/competition/Llama-3.1-8B-Instruct/checkpoint-2600", "epoch": 2.5669135802469136, "eval_steps": 100, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019753086419753086, "grad_norm": 1.2435259819030762, "learning_rate": 1.9736842105263157e-06, "loss": 2.1366, "step": 20 }, { "epoch": 0.03950617283950617, "grad_norm": 1.1957170963287354, "learning_rate": 3.9473684210526315e-06, "loss": 2.0728, "step": 40 }, { "epoch": 0.05925925925925926, "grad_norm": 1.0101041793823242, "learning_rate": 5.921052631578948e-06, "loss": 1.9727, "step": 60 }, { "epoch": 0.07901234567901234, "grad_norm": 1.057698130607605, "learning_rate": 7.894736842105263e-06, "loss": 1.6864, "step": 80 }, { "epoch": 0.09876543209876543, "grad_norm": 0.8825891017913818, "learning_rate": 9.868421052631579e-06, "loss": 1.3623, "step": 100 }, { "epoch": 0.09876543209876543, "eval_loss": 1.2630479335784912, "eval_runtime": 402.8127, "eval_samples_per_second": 4.469, "eval_steps_per_second": 1.117, "step": 100 }, { "epoch": 0.11851851851851852, "grad_norm": 0.728771448135376, "learning_rate": 1.1842105263157895e-05, "loss": 1.2157, "step": 120 }, { "epoch": 0.1382716049382716, "grad_norm": 0.7429773211479187, "learning_rate": 1.3815789473684211e-05, "loss": 1.1921, "step": 140 }, { "epoch": 0.1580246913580247, "grad_norm": 0.731153666973114, "learning_rate": 1.5789473684210526e-05, "loss": 1.1441, "step": 160 }, { "epoch": 0.17777777777777778, "grad_norm": 0.7741252779960632, "learning_rate": 1.7763157894736842e-05, "loss": 1.1457, "step": 180 }, { "epoch": 0.19753086419753085, "grad_norm": 0.7633986473083496, "learning_rate": 1.9736842105263158e-05, "loss": 1.132, "step": 200 }, { "epoch": 0.19753086419753085, "eval_loss": 1.1129120588302612, "eval_runtime": 402.8555, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.117, "step": 200 }, { "epoch": 0.21728395061728395, "grad_norm": 0.8183904886245728, "learning_rate": 2.1710526315789474e-05, "loss": 1.0909, "step": 220 }, { "epoch": 0.23703703703703705, "grad_norm": 0.8751975297927856, "learning_rate": 2.368421052631579e-05, "loss": 1.0847, "step": 240 }, { "epoch": 0.25679012345679014, "grad_norm": 0.8953943848609924, "learning_rate": 2.5657894736842107e-05, "loss": 1.0808, "step": 260 }, { "epoch": 0.2765432098765432, "grad_norm": 0.9229008555412292, "learning_rate": 2.7631578947368423e-05, "loss": 1.0868, "step": 280 }, { "epoch": 0.2962962962962963, "grad_norm": 1.023574709892273, "learning_rate": 2.9605263157894735e-05, "loss": 1.0812, "step": 300 }, { "epoch": 0.2962962962962963, "eval_loss": 1.0787839889526367, "eval_runtime": 403.1582, "eval_samples_per_second": 4.465, "eval_steps_per_second": 1.116, "step": 300 }, { "epoch": 0.3160493827160494, "grad_norm": 1.026982069015503, "learning_rate": 2.9997461206762896e-05, "loss": 1.0907, "step": 320 }, { "epoch": 0.3358024691358025, "grad_norm": 1.0427346229553223, "learning_rate": 2.998714883211034e-05, "loss": 1.0387, "step": 340 }, { "epoch": 0.35555555555555557, "grad_norm": 1.0824358463287354, "learning_rate": 2.9968909651554313e-05, "loss": 1.0598, "step": 360 }, { "epoch": 0.37530864197530867, "grad_norm": 1.0790863037109375, "learning_rate": 2.994275331192262e-05, "loss": 1.0785, "step": 380 }, { "epoch": 0.3950617283950617, "grad_norm": 1.0331053733825684, "learning_rate": 2.9908693647482852e-05, "loss": 1.0708, "step": 400 }, { "epoch": 0.3950617283950617, "eval_loss": 1.052521824836731, "eval_runtime": 402.8901, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.117, "step": 400 }, { "epoch": 0.4148148148148148, "grad_norm": 1.1587470769882202, "learning_rate": 2.9866748672625354e-05, "loss": 1.0592, "step": 420 }, { "epoch": 0.4345679012345679, "grad_norm": 1.072559118270874, "learning_rate": 2.9816940572335275e-05, "loss": 1.0602, "step": 440 }, { "epoch": 0.454320987654321, "grad_norm": 1.171120047569275, "learning_rate": 2.9759295690458793e-05, "loss": 1.0729, "step": 460 }, { "epoch": 0.4740740740740741, "grad_norm": 1.1216728687286377, "learning_rate": 2.9693844515769658e-05, "loss": 1.0471, "step": 480 }, { "epoch": 0.49382716049382713, "grad_norm": 1.2060222625732422, "learning_rate": 2.9620621665843482e-05, "loss": 1.048, "step": 500 }, { "epoch": 0.49382716049382713, "eval_loss": 1.0372475385665894, "eval_runtime": 402.9091, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.117, "step": 500 }, { "epoch": 0.5135802469135803, "grad_norm": 1.1379283666610718, "learning_rate": 2.953966586874827e-05, "loss": 1.0483, "step": 520 }, { "epoch": 0.5333333333333333, "grad_norm": 1.1166441440582275, "learning_rate": 2.9451019942560854e-05, "loss": 1.0466, "step": 540 }, { "epoch": 0.5530864197530864, "grad_norm": 1.1171513795852661, "learning_rate": 2.9354730772720152e-05, "loss": 1.0123, "step": 560 }, { "epoch": 0.5728395061728395, "grad_norm": 1.2504868507385254, "learning_rate": 2.9250849287229115e-05, "loss": 1.0566, "step": 580 }, { "epoch": 0.5925925925925926, "grad_norm": 1.2809154987335205, "learning_rate": 2.9139430429718557e-05, "loss": 1.0678, "step": 600 }, { "epoch": 0.5925925925925926, "eval_loss": 1.0224571228027344, "eval_runtime": 403.0073, "eval_samples_per_second": 4.466, "eval_steps_per_second": 1.117, "step": 600 }, { "epoch": 0.6123456790123457, "grad_norm": 1.1451176404953003, "learning_rate": 2.90205331303871e-05, "loss": 1.0123, "step": 620 }, { "epoch": 0.6320987654320988, "grad_norm": 1.2070196866989136, "learning_rate": 2.889422027483259e-05, "loss": 1.0113, "step": 640 }, { "epoch": 0.6518518518518519, "grad_norm": 1.1931513547897339, "learning_rate": 2.876055867079146e-05, "loss": 0.9918, "step": 660 }, { "epoch": 0.671604938271605, "grad_norm": 1.2010815143585205, "learning_rate": 2.8619619012803674e-05, "loss": 1.0223, "step": 680 }, { "epoch": 0.691358024691358, "grad_norm": 1.2849438190460205, "learning_rate": 2.847147584482188e-05, "loss": 1.0262, "step": 700 }, { "epoch": 0.691358024691358, "eval_loss": 1.0122270584106445, "eval_runtime": 402.9377, "eval_samples_per_second": 4.467, "eval_steps_per_second": 1.117, "step": 700 }, { "epoch": 0.7111111111111111, "grad_norm": 1.1782609224319458, "learning_rate": 2.8316207520784622e-05, "loss": 1.01, "step": 720 }, { "epoch": 0.7308641975308642, "grad_norm": 1.1860291957855225, "learning_rate": 2.81538961631744e-05, "loss": 1.0329, "step": 740 }, { "epoch": 0.7506172839506173, "grad_norm": 1.1462602615356445, "learning_rate": 2.7984627619582507e-05, "loss": 1.0208, "step": 760 }, { "epoch": 0.7703703703703704, "grad_norm": 1.2748075723648071, "learning_rate": 2.7808491417303672e-05, "loss": 1.0237, "step": 780 }, { "epoch": 0.7901234567901234, "grad_norm": 1.2756584882736206, "learning_rate": 2.7625580715984452e-05, "loss": 1.016, "step": 800 }, { "epoch": 0.7901234567901234, "eval_loss": 1.0034656524658203, "eval_runtime": 403.3086, "eval_samples_per_second": 4.463, "eval_steps_per_second": 1.116, "step": 800 }, { "epoch": 0.8098765432098766, "grad_norm": 1.1677286624908447, "learning_rate": 2.7435992258350444e-05, "loss": 1.0283, "step": 820 }, { "epoch": 0.8296296296296296, "grad_norm": 1.1957719326019287, "learning_rate": 2.7239826319038413e-05, "loss": 1.0048, "step": 840 }, { "epoch": 0.8493827160493828, "grad_norm": 1.3127700090408325, "learning_rate": 2.703718665156033e-05, "loss": 0.9976, "step": 860 }, { "epoch": 0.8691358024691358, "grad_norm": 1.1668589115142822, "learning_rate": 2.6828180433427437e-05, "loss": 0.9979, "step": 880 }, { "epoch": 0.8888888888888888, "grad_norm": 1.3369258642196655, "learning_rate": 2.6612918209463323e-05, "loss": 0.9872, "step": 900 }, { "epoch": 0.8888888888888888, "eval_loss": 0.9982550740242004, "eval_runtime": 403.0757, "eval_samples_per_second": 4.466, "eval_steps_per_second": 1.116, "step": 900 }, { "epoch": 0.908641975308642, "grad_norm": 1.3509835004806519, "learning_rate": 2.6391513833336013e-05, "loss": 1.0059, "step": 920 }, { "epoch": 0.928395061728395, "grad_norm": 1.333506464958191, "learning_rate": 2.616408440733997e-05, "loss": 1.0067, "step": 940 }, { "epoch": 0.9481481481481482, "grad_norm": 1.1939771175384521, "learning_rate": 2.5930750220459893e-05, "loss": 1.0001, "step": 960 }, { "epoch": 0.9679012345679012, "grad_norm": 1.27733314037323, "learning_rate": 2.569163468474906e-05, "loss": 0.9953, "step": 980 }, { "epoch": 0.9876543209876543, "grad_norm": 1.2627174854278564, "learning_rate": 2.5446864270055825e-05, "loss": 0.9927, "step": 1000 }, { "epoch": 0.9876543209876543, "eval_loss": 0.9864739775657654, "eval_runtime": 402.9032, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.117, "step": 1000 }, { "epoch": 1.0069135802469136, "grad_norm": 1.3273580074310303, "learning_rate": 2.5196568437132855e-05, "loss": 0.9744, "step": 1020 }, { "epoch": 1.0266666666666666, "grad_norm": 1.3258188962936401, "learning_rate": 2.4940879569164453e-05, "loss": 0.9355, "step": 1040 }, { "epoch": 1.0464197530864197, "grad_norm": 1.3573037385940552, "learning_rate": 2.467993290174819e-05, "loss": 0.961, "step": 1060 }, { "epoch": 1.066172839506173, "grad_norm": 1.4085516929626465, "learning_rate": 2.4413866451367836e-05, "loss": 0.9424, "step": 1080 }, { "epoch": 1.085925925925926, "grad_norm": 1.309260368347168, "learning_rate": 2.414282094239555e-05, "loss": 0.9625, "step": 1100 }, { "epoch": 1.085925925925926, "eval_loss": 0.9823839068412781, "eval_runtime": 402.9497, "eval_samples_per_second": 4.467, "eval_steps_per_second": 1.117, "step": 1100 }, { "epoch": 1.105679012345679, "grad_norm": 1.4734690189361572, "learning_rate": 2.386693973266173e-05, "loss": 0.9332, "step": 1120 }, { "epoch": 1.125432098765432, "grad_norm": 1.3914655447006226, "learning_rate": 2.3586368737632124e-05, "loss": 0.9868, "step": 1140 }, { "epoch": 1.145185185185185, "grad_norm": 1.3848929405212402, "learning_rate": 2.3301256353232112e-05, "loss": 0.9575, "step": 1160 }, { "epoch": 1.1649382716049383, "grad_norm": 1.3042232990264893, "learning_rate": 2.3011753377359084e-05, "loss": 0.9641, "step": 1180 }, { "epoch": 1.1846913580246914, "grad_norm": 1.361521601676941, "learning_rate": 2.2718012930124407e-05, "loss": 0.9966, "step": 1200 }, { "epoch": 1.1846913580246914, "eval_loss": 0.9790483117103577, "eval_runtime": 402.9963, "eval_samples_per_second": 4.467, "eval_steps_per_second": 1.117, "step": 1200 }, { "epoch": 1.2044444444444444, "grad_norm": 1.4234545230865479, "learning_rate": 2.2420190372867144e-05, "loss": 0.9478, "step": 1220 }, { "epoch": 1.2241975308641975, "grad_norm": 1.3092103004455566, "learning_rate": 2.2118443225982365e-05, "loss": 0.9564, "step": 1240 }, { "epoch": 1.2439506172839505, "grad_norm": 1.3332173824310303, "learning_rate": 2.1812931085607537e-05, "loss": 0.9605, "step": 1260 }, { "epoch": 1.2637037037037038, "grad_norm": 1.3987936973571777, "learning_rate": 2.1503815539211034e-05, "loss": 0.9513, "step": 1280 }, { "epoch": 1.2834567901234568, "grad_norm": 1.383507251739502, "learning_rate": 2.119126008012741e-05, "loss": 0.9631, "step": 1300 }, { "epoch": 1.2834567901234568, "eval_loss": 0.9741268754005432, "eval_runtime": 403.0002, "eval_samples_per_second": 4.466, "eval_steps_per_second": 1.117, "step": 1300 }, { "epoch": 1.3032098765432099, "grad_norm": 1.348244309425354, "learning_rate": 2.0875430021084665e-05, "loss": 0.9461, "step": 1320 }, { "epoch": 1.322962962962963, "grad_norm": 1.3115851879119873, "learning_rate": 2.0556492406769176e-05, "loss": 0.9219, "step": 1340 }, { "epoch": 1.342716049382716, "grad_norm": 1.4104254245758057, "learning_rate": 2.0234615925474648e-05, "loss": 0.9673, "step": 1360 }, { "epoch": 1.3624691358024692, "grad_norm": 1.4491636753082275, "learning_rate": 1.9909970819881708e-05, "loss": 0.9631, "step": 1380 }, { "epoch": 1.3822222222222222, "grad_norm": 1.4846336841583252, "learning_rate": 1.9582728797015385e-05, "loss": 0.9322, "step": 1400 }, { "epoch": 1.3822222222222222, "eval_loss": 0.9711065292358398, "eval_runtime": 403.2529, "eval_samples_per_second": 4.464, "eval_steps_per_second": 1.116, "step": 1400 }, { "epoch": 1.4019753086419753, "grad_norm": 1.5250033140182495, "learning_rate": 1.9253062937428073e-05, "loss": 0.9544, "step": 1420 }, { "epoch": 1.4217283950617283, "grad_norm": 1.3995600938796997, "learning_rate": 1.892114760365605e-05, "loss": 0.913, "step": 1440 }, { "epoch": 1.4414814814814814, "grad_norm": 1.4669514894485474, "learning_rate": 1.8587158347997932e-05, "loss": 0.9457, "step": 1460 }, { "epoch": 1.4612345679012346, "grad_norm": 1.8105261325836182, "learning_rate": 1.8251271819663863e-05, "loss": 0.9388, "step": 1480 }, { "epoch": 1.4809876543209877, "grad_norm": 1.3226635456085205, "learning_rate": 1.7913665671344526e-05, "loss": 0.959, "step": 1500 }, { "epoch": 1.4809876543209877, "eval_loss": 0.9656322598457336, "eval_runtime": 402.9036, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.117, "step": 1500 }, { "epoch": 1.5007407407407407, "grad_norm": 1.3588342666625977, "learning_rate": 1.7574518465249413e-05, "loss": 0.9523, "step": 1520 }, { "epoch": 1.520493827160494, "grad_norm": 1.3227828741073608, "learning_rate": 1.7234009578664066e-05, "loss": 0.9243, "step": 1540 }, { "epoch": 1.5402469135802468, "grad_norm": 1.3938301801681519, "learning_rate": 1.6892319109076183e-05, "loss": 0.9502, "step": 1560 }, { "epoch": 1.56, "grad_norm": 1.410733938217163, "learning_rate": 1.654962777892081e-05, "loss": 0.9601, "step": 1580 }, { "epoch": 1.579753086419753, "grad_norm": 1.5061428546905518, "learning_rate": 1.6206116839995033e-05, "loss": 0.9697, "step": 1600 }, { "epoch": 1.579753086419753, "eval_loss": 0.9622066020965576, "eval_runtime": 402.8036, "eval_samples_per_second": 4.469, "eval_steps_per_second": 1.117, "step": 1600 }, { "epoch": 1.5995061728395061, "grad_norm": 1.4251749515533447, "learning_rate": 1.5861967977592642e-05, "loss": 0.9366, "step": 1620 }, { "epoch": 1.6192592592592594, "grad_norm": 1.4466283321380615, "learning_rate": 1.5517363214409528e-05, "loss": 0.9042, "step": 1640 }, { "epoch": 1.6390123456790122, "grad_norm": 1.3744322061538696, "learning_rate": 1.5172484814270635e-05, "loss": 0.9618, "step": 1660 }, { "epoch": 1.6587654320987655, "grad_norm": 1.360540509223938, "learning_rate": 1.4827515185729368e-05, "loss": 0.9492, "step": 1680 }, { "epoch": 1.6785185185185185, "grad_norm": 1.4454694986343384, "learning_rate": 1.4482636785590474e-05, "loss": 0.9238, "step": 1700 }, { "epoch": 1.6785185185185185, "eval_loss": 0.9586638808250427, "eval_runtime": 402.9204, "eval_samples_per_second": 4.467, "eval_steps_per_second": 1.117, "step": 1700 }, { "epoch": 1.6982716049382716, "grad_norm": 1.4837932586669922, "learning_rate": 1.413803202240736e-05, "loss": 0.939, "step": 1720 }, { "epoch": 1.7180246913580248, "grad_norm": 1.4821239709854126, "learning_rate": 1.3793883160004971e-05, "loss": 0.9278, "step": 1740 }, { "epoch": 1.7377777777777776, "grad_norm": 1.5507407188415527, "learning_rate": 1.3450372221079193e-05, "loss": 0.9361, "step": 1760 }, { "epoch": 1.757530864197531, "grad_norm": 1.5079405307769775, "learning_rate": 1.3107680890923821e-05, "loss": 0.8979, "step": 1780 }, { "epoch": 1.777283950617284, "grad_norm": 1.5070364475250244, "learning_rate": 1.2765990421335937e-05, "loss": 0.9364, "step": 1800 }, { "epoch": 1.777283950617284, "eval_loss": 0.9560103416442871, "eval_runtime": 402.9582, "eval_samples_per_second": 4.467, "eval_steps_per_second": 1.117, "step": 1800 }, { "epoch": 1.797037037037037, "grad_norm": 1.4538744688034058, "learning_rate": 1.2425481534750591e-05, "loss": 0.9266, "step": 1820 }, { "epoch": 1.8167901234567903, "grad_norm": 1.4080045223236084, "learning_rate": 1.2086334328655478e-05, "loss": 0.9155, "step": 1840 }, { "epoch": 1.836543209876543, "grad_norm": 1.4733538627624512, "learning_rate": 1.1748728180336137e-05, "loss": 0.9393, "step": 1860 }, { "epoch": 1.8562962962962963, "grad_norm": 1.4632246494293213, "learning_rate": 1.141284165200207e-05, "loss": 0.9501, "step": 1880 }, { "epoch": 1.8760493827160494, "grad_norm": 1.5355905294418335, "learning_rate": 1.1078852396343955e-05, "loss": 0.9259, "step": 1900 }, { "epoch": 1.8760493827160494, "eval_loss": 0.9529610276222229, "eval_runtime": 403.067, "eval_samples_per_second": 4.466, "eval_steps_per_second": 1.116, "step": 1900 }, { "epoch": 1.8958024691358024, "grad_norm": 1.4178493022918701, "learning_rate": 1.0746937062571928e-05, "loss": 0.9466, "step": 1920 }, { "epoch": 1.9155555555555557, "grad_norm": 1.423301339149475, "learning_rate": 1.0417271202984614e-05, "loss": 0.9093, "step": 1940 }, { "epoch": 1.9353086419753085, "grad_norm": 1.4882200956344604, "learning_rate": 1.0090029180118293e-05, "loss": 0.9352, "step": 1960 }, { "epoch": 1.9550617283950618, "grad_norm": 1.653830647468567, "learning_rate": 9.765384074525358e-06, "loss": 0.9205, "step": 1980 }, { "epoch": 1.9748148148148148, "grad_norm": 1.5376299619674683, "learning_rate": 9.443507593230829e-06, "loss": 0.9013, "step": 2000 }, { "epoch": 1.9748148148148148, "eval_loss": 0.9503313302993774, "eval_runtime": 403.203, "eval_samples_per_second": 4.464, "eval_steps_per_second": 1.116, "step": 2000 }, { "epoch": 1.9945679012345678, "grad_norm": 1.443206787109375, "learning_rate": 9.124569978915336e-06, "loss": 0.9295, "step": 2020 }, { "epoch": 2.013827160493827, "grad_norm": 1.4827468395233154, "learning_rate": 8.808739919872588e-06, "loss": 0.9044, "step": 2040 }, { "epoch": 2.0335802469135804, "grad_norm": 1.534474492073059, "learning_rate": 8.496184460788969e-06, "loss": 0.8971, "step": 2060 }, { "epoch": 2.0533333333333332, "grad_norm": 1.4375228881835938, "learning_rate": 8.187068914392464e-06, "loss": 0.9005, "step": 2080 }, { "epoch": 2.0730864197530865, "grad_norm": 1.5929490327835083, "learning_rate": 7.881556774017635e-06, "loss": 0.8931, "step": 2100 }, { "epoch": 2.0730864197530865, "eval_loss": 0.9495877027511597, "eval_runtime": 402.8934, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.117, "step": 2100 }, { "epoch": 2.0928395061728393, "grad_norm": 1.5017647743225098, "learning_rate": 7.579809627132857e-06, "loss": 0.9019, "step": 2120 }, { "epoch": 2.1125925925925926, "grad_norm": 1.4881539344787598, "learning_rate": 7.281987069875591e-06, "loss": 0.891, "step": 2140 }, { "epoch": 2.132345679012346, "grad_norm": 1.5712181329727173, "learning_rate": 6.988246622640921e-06, "loss": 0.8658, "step": 2160 }, { "epoch": 2.1520987654320987, "grad_norm": 1.5390993356704712, "learning_rate": 6.698743646767891e-06, "loss": 0.889, "step": 2180 }, { "epoch": 2.171851851851852, "grad_norm": 1.8116004467010498, "learning_rate": 6.413631262367882e-06, "loss": 0.8653, "step": 2200 }, { "epoch": 2.171851851851852, "eval_loss": 0.9501029253005981, "eval_runtime": 403.1544, "eval_samples_per_second": 4.465, "eval_steps_per_second": 1.116, "step": 2200 }, { "epoch": 2.1916049382716047, "grad_norm": 1.7029999494552612, "learning_rate": 6.1330602673382725e-06, "loss": 0.8842, "step": 2220 }, { "epoch": 2.211358024691358, "grad_norm": 1.5634303092956543, "learning_rate": 5.857179057604451e-06, "loss": 0.8758, "step": 2240 }, { "epoch": 2.2311111111111113, "grad_norm": 1.5488015413284302, "learning_rate": 5.586133548632161e-06, "loss": 0.8721, "step": 2260 }, { "epoch": 2.250864197530864, "grad_norm": 1.6451934576034546, "learning_rate": 5.320067098251815e-06, "loss": 0.8661, "step": 2280 }, { "epoch": 2.2706172839506173, "grad_norm": 1.4476590156555176, "learning_rate": 5.0591204308355465e-06, "loss": 0.8667, "step": 2300 }, { "epoch": 2.2706172839506173, "eval_loss": 0.947265088558197, "eval_runtime": 403.553, "eval_samples_per_second": 4.46, "eval_steps_per_second": 1.115, "step": 2300 }, { "epoch": 2.29037037037037, "grad_norm": 1.5609101057052612, "learning_rate": 4.803431562867145e-06, "loss": 0.8949, "step": 2320 }, { "epoch": 2.3101234567901234, "grad_norm": 1.7399330139160156, "learning_rate": 4.5531357299441774e-06, "loss": 0.9033, "step": 2340 }, { "epoch": 2.3298765432098767, "grad_norm": 1.735573410987854, "learning_rate": 4.3083653152509434e-06, "loss": 0.8689, "step": 2360 }, { "epoch": 2.3496296296296295, "grad_norm": 1.5185174942016602, "learning_rate": 4.069249779540108e-06, "loss": 0.9064, "step": 2380 }, { "epoch": 2.3693827160493828, "grad_norm": 1.6329830884933472, "learning_rate": 3.835915592660032e-06, "loss": 0.9104, "step": 2400 }, { "epoch": 2.3693827160493828, "eval_loss": 0.9463061690330505, "eval_runtime": 403.0072, "eval_samples_per_second": 4.466, "eval_steps_per_second": 1.117, "step": 2400 }, { "epoch": 2.389135802469136, "grad_norm": 1.6031969785690308, "learning_rate": 3.6084861666639856e-06, "loss": 0.8826, "step": 2420 }, { "epoch": 2.408888888888889, "grad_norm": 1.660013198852539, "learning_rate": 3.387081790536677e-06, "loss": 0.8911, "step": 2440 }, { "epoch": 2.428641975308642, "grad_norm": 1.5472162961959839, "learning_rate": 3.171819566572563e-06, "loss": 0.9027, "step": 2460 }, { "epoch": 2.448395061728395, "grad_norm": 1.729270100593567, "learning_rate": 2.962813348439669e-06, "loss": 0.8675, "step": 2480 }, { "epoch": 2.468148148148148, "grad_norm": 1.6325998306274414, "learning_rate": 2.7601736809615883e-06, "loss": 0.9258, "step": 2500 }, { "epoch": 2.468148148148148, "eval_loss": 0.9453881978988647, "eval_runtime": 402.8185, "eval_samples_per_second": 4.469, "eval_steps_per_second": 1.117, "step": 2500 }, { "epoch": 2.487901234567901, "grad_norm": 1.644284963607788, "learning_rate": 2.564007741649559e-06, "loss": 0.8935, "step": 2520 }, { "epoch": 2.5076543209876543, "grad_norm": 1.6974362134933472, "learning_rate": 2.374419284015552e-06, "loss": 0.9047, "step": 2540 }, { "epoch": 2.5274074074074075, "grad_norm": 1.5513502359390259, "learning_rate": 2.191508582696329e-06, "loss": 0.8889, "step": 2560 }, { "epoch": 2.5471604938271604, "grad_norm": 1.683895230293274, "learning_rate": 2.0153723804174945e-06, "loss": 0.8882, "step": 2580 }, { "epoch": 2.5669135802469136, "grad_norm": 1.5900335311889648, "learning_rate": 1.8461038368256027e-06, "loss": 0.8868, "step": 2600 }, { "epoch": 2.5669135802469136, "eval_loss": 0.9445667862892151, "eval_runtime": 403.0265, "eval_samples_per_second": 4.466, "eval_steps_per_second": 1.117, "step": 2600 } ], "logging_steps": 20, "max_steps": 3036, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.8563300999472087e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }