lemexp-task1-v3-template_full_nodefs-Llama-3.2-1B-8lr-12epochs-no-eos
/
checkpoint-77845
/trainer_state.json
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 3114, | |
| "global_step": 77845, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03211510052026463, | |
| "grad_norm": 1.2270241975784302, | |
| "learning_rate": 0.0007978718393388571, | |
| "loss": 0.4499, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06423020104052926, | |
| "grad_norm": 1.1903154850006104, | |
| "learning_rate": 0.0007957308326375062, | |
| "loss": 0.3541, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.09634530156079389, | |
| "grad_norm": 1.1337592601776123, | |
| "learning_rate": 0.0007935898259361552, | |
| "loss": 0.3336, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.12846040208105852, | |
| "grad_norm": 0.9932805895805359, | |
| "learning_rate": 0.0007914488192348043, | |
| "loss": 0.3187, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.16057550260132314, | |
| "grad_norm": 1.4273182153701782, | |
| "learning_rate": 0.0007893120945468559, | |
| "loss": 0.3094, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.19269060312158778, | |
| "grad_norm": 1.296076774597168, | |
| "learning_rate": 0.000787171087845505, | |
| "loss": 0.306, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.2000128460402081, | |
| "eval_loss": 0.3016555607318878, | |
| "eval_runtime": 4.134, | |
| "eval_samples_per_second": 120.948, | |
| "eval_steps_per_second": 7.741, | |
| "step": 3114 | |
| }, | |
| { | |
| "epoch": 0.2248057036418524, | |
| "grad_norm": 1.1079399585723877, | |
| "learning_rate": 0.0007850343631575566, | |
| "loss": 0.2993, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.25692080416211704, | |
| "grad_norm": 1.5939998626708984, | |
| "learning_rate": 0.0007828933564562057, | |
| "loss": 0.2985, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.28903590468238166, | |
| "grad_norm": 1.4033524990081787, | |
| "learning_rate": 0.0007807523497548548, | |
| "loss": 0.2959, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.3211510052026463, | |
| "grad_norm": 1.9331912994384766, | |
| "learning_rate": 0.0007786113430535038, | |
| "loss": 0.2881, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.3532661057229109, | |
| "grad_norm": 1.112199068069458, | |
| "learning_rate": 0.0007764703363521528, | |
| "loss": 0.2874, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.38538120624317557, | |
| "grad_norm": 0.9604991674423218, | |
| "learning_rate": 0.0007743293296508018, | |
| "loss": 0.2867, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.4000256920804162, | |
| "eval_loss": 0.27945849299430847, | |
| "eval_runtime": 4.1149, | |
| "eval_samples_per_second": 121.509, | |
| "eval_steps_per_second": 7.777, | |
| "step": 6228 | |
| }, | |
| { | |
| "epoch": 0.4174963067634402, | |
| "grad_norm": 1.542017936706543, | |
| "learning_rate": 0.0007721883229494509, | |
| "loss": 0.2842, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.4496114072837048, | |
| "grad_norm": 0.9913608431816101, | |
| "learning_rate": 0.0007700473162480999, | |
| "loss": 0.2805, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.4817265078039694, | |
| "grad_norm": 1.0861561298370361, | |
| "learning_rate": 0.000767906309546749, | |
| "loss": 0.2795, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5138416083242341, | |
| "grad_norm": 1.408018946647644, | |
| "learning_rate": 0.0007657695848588007, | |
| "loss": 0.2791, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5459567088444987, | |
| "grad_norm": 0.935492992401123, | |
| "learning_rate": 0.0007636285781574497, | |
| "loss": 0.2768, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.5780718093647633, | |
| "grad_norm": 0.974107027053833, | |
| "learning_rate": 0.0007614875714560987, | |
| "loss": 0.2775, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6000385381206244, | |
| "eval_loss": 0.26882508397102356, | |
| "eval_runtime": 4.0584, | |
| "eval_samples_per_second": 123.2, | |
| "eval_steps_per_second": 7.885, | |
| "step": 9342 | |
| }, | |
| { | |
| "epoch": 0.610186909885028, | |
| "grad_norm": 1.00677490234375, | |
| "learning_rate": 0.0007593465647547478, | |
| "loss": 0.2719, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.6423020104052926, | |
| "grad_norm": 1.3076094388961792, | |
| "learning_rate": 0.0007572098400667994, | |
| "loss": 0.2733, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.6744171109255572, | |
| "grad_norm": 1.332555890083313, | |
| "learning_rate": 0.0007550688333654485, | |
| "loss": 0.2738, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.7065322114458218, | |
| "grad_norm": 1.065308928489685, | |
| "learning_rate": 0.0007529278266640976, | |
| "loss": 0.2697, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.7386473119660865, | |
| "grad_norm": 1.1714705228805542, | |
| "learning_rate": 0.0007507868199627465, | |
| "loss": 0.2718, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.7707624124863511, | |
| "grad_norm": 1.545327067375183, | |
| "learning_rate": 0.0007486500952747983, | |
| "loss": 0.2701, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8000513841608324, | |
| "eval_loss": 0.25833237171173096, | |
| "eval_runtime": 3.9621, | |
| "eval_samples_per_second": 126.196, | |
| "eval_steps_per_second": 8.077, | |
| "step": 12456 | |
| }, | |
| { | |
| "epoch": 0.8028775130066157, | |
| "grad_norm": 0.8509078621864319, | |
| "learning_rate": 0.0007465090885734473, | |
| "loss": 0.2702, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.8349926135268804, | |
| "grad_norm": 1.0517570972442627, | |
| "learning_rate": 0.0007443680818720963, | |
| "loss": 0.2684, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.8671077140471449, | |
| "grad_norm": 1.0709669589996338, | |
| "learning_rate": 0.000742231357184148, | |
| "loss": 0.2674, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.8992228145674096, | |
| "grad_norm": 1.1354570388793945, | |
| "learning_rate": 0.0007400903504827971, | |
| "loss": 0.269, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.9313379150876743, | |
| "grad_norm": 1.5820938348770142, | |
| "learning_rate": 0.000737949343781446, | |
| "loss": 0.2665, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.9634530156079388, | |
| "grad_norm": 1.3303571939468384, | |
| "learning_rate": 0.0007358083370800951, | |
| "loss": 0.2644, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.9955681161282035, | |
| "grad_norm": 1.0390913486480713, | |
| "learning_rate": 0.0007336716123921468, | |
| "loss": 0.268, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.0000642302010405, | |
| "eval_loss": 0.25299617648124695, | |
| "eval_runtime": 3.8751, | |
| "eval_samples_per_second": 129.03, | |
| "eval_steps_per_second": 8.258, | |
| "step": 15570 | |
| }, | |
| { | |
| "epoch": 1.0276832166484682, | |
| "grad_norm": 1.2327704429626465, | |
| "learning_rate": 0.0007315306056907958, | |
| "loss": 0.263, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.0597983171687329, | |
| "grad_norm": 0.9403806924819946, | |
| "learning_rate": 0.0007293895989894448, | |
| "loss": 0.2633, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.0919134176889973, | |
| "grad_norm": 1.1138664484024048, | |
| "learning_rate": 0.0007272485922880939, | |
| "loss": 0.2608, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.124028518209262, | |
| "grad_norm": 1.1546539068222046, | |
| "learning_rate": 0.000725107585586743, | |
| "loss": 0.2569, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.1561436187295266, | |
| "grad_norm": 1.0123635530471802, | |
| "learning_rate": 0.0007229665788853919, | |
| "loss": 0.2596, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.1882587192497913, | |
| "grad_norm": 1.1647980213165283, | |
| "learning_rate": 0.000720825572184041, | |
| "loss": 0.2609, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.2000770762412487, | |
| "eval_loss": 0.2469949871301651, | |
| "eval_runtime": 3.796, | |
| "eval_samples_per_second": 131.719, | |
| "eval_steps_per_second": 8.43, | |
| "step": 18684 | |
| }, | |
| { | |
| "epoch": 1.2203738197700558, | |
| "grad_norm": 1.2368906736373901, | |
| "learning_rate": 0.00071868456548269, | |
| "loss": 0.2597, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.2524889202903204, | |
| "grad_norm": 0.9881177544593811, | |
| "learning_rate": 0.000716543558781339, | |
| "loss": 0.2563, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.2846040208105851, | |
| "grad_norm": 0.9961882829666138, | |
| "learning_rate": 0.0007144068340933907, | |
| "loss": 0.2563, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.3167191213308498, | |
| "grad_norm": 1.6545355319976807, | |
| "learning_rate": 0.0007122658273920398, | |
| "loss": 0.2566, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.3488342218511145, | |
| "grad_norm": 1.2175770998001099, | |
| "learning_rate": 0.0007101248206906887, | |
| "loss": 0.251, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.3809493223713791, | |
| "grad_norm": 1.2942149639129639, | |
| "learning_rate": 0.0007079838139893379, | |
| "loss": 0.2549, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.4000899222814567, | |
| "eval_loss": 0.24247248470783234, | |
| "eval_runtime": 4.0127, | |
| "eval_samples_per_second": 124.605, | |
| "eval_steps_per_second": 7.975, | |
| "step": 21798 | |
| }, | |
| { | |
| "epoch": 1.4130644228916436, | |
| "grad_norm": 0.9972023963928223, | |
| "learning_rate": 0.0007058470893013895, | |
| "loss": 0.2532, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.4451795234119083, | |
| "grad_norm": 2.422755479812622, | |
| "learning_rate": 0.0007037060826000386, | |
| "loss": 0.2525, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.477294623932173, | |
| "grad_norm": 1.0350821018218994, | |
| "learning_rate": 0.0007015650758986876, | |
| "loss": 0.2528, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.5094097244524374, | |
| "grad_norm": 0.9712342023849487, | |
| "learning_rate": 0.0006994240691973367, | |
| "loss": 0.2498, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.541524824972702, | |
| "grad_norm": 1.0698814392089844, | |
| "learning_rate": 0.0006972830624959856, | |
| "loss": 0.2511, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.5736399254929667, | |
| "grad_norm": 1.0637270212173462, | |
| "learning_rate": 0.0006951463378080374, | |
| "loss": 0.2542, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.6001027683216649, | |
| "eval_loss": 0.23835672438144684, | |
| "eval_runtime": 3.8011, | |
| "eval_samples_per_second": 131.543, | |
| "eval_steps_per_second": 8.419, | |
| "step": 24912 | |
| }, | |
| { | |
| "epoch": 1.6057550260132314, | |
| "grad_norm": 1.08571195602417, | |
| "learning_rate": 0.0006930053311066865, | |
| "loss": 0.2531, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.637870126533496, | |
| "grad_norm": 0.9403467774391174, | |
| "learning_rate": 0.0006908643244053354, | |
| "loss": 0.2522, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.6699852270537607, | |
| "grad_norm": 0.8324321508407593, | |
| "learning_rate": 0.0006887233177039845, | |
| "loss": 0.2493, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.7021003275740254, | |
| "grad_norm": 0.9599499702453613, | |
| "learning_rate": 0.0006865908750294389, | |
| "loss": 0.2485, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.73421542809429, | |
| "grad_norm": 1.369850754737854, | |
| "learning_rate": 0.0006844498683280879, | |
| "loss": 0.25, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.7663305286145545, | |
| "grad_norm": 1.042289137840271, | |
| "learning_rate": 0.0006823088616267369, | |
| "loss": 0.2449, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.7984456291348192, | |
| "grad_norm": 1.2191327810287476, | |
| "learning_rate": 0.000680167854925386, | |
| "loss": 0.2489, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.8001156143618728, | |
| "eval_loss": 0.23773300647735596, | |
| "eval_runtime": 4.0318, | |
| "eval_samples_per_second": 124.015, | |
| "eval_steps_per_second": 7.937, | |
| "step": 28026 | |
| }, | |
| { | |
| "epoch": 1.8305607296550839, | |
| "grad_norm": 0.9970433712005615, | |
| "learning_rate": 0.0006780268482240349, | |
| "loss": 0.2517, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.8626758301753483, | |
| "grad_norm": 1.0307445526123047, | |
| "learning_rate": 0.000675885841522684, | |
| "loss": 0.2462, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.894790930695613, | |
| "grad_norm": 1.1497652530670166, | |
| "learning_rate": 0.000673744834821333, | |
| "loss": 0.2494, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.9269060312158777, | |
| "grad_norm": 0.8870740532875061, | |
| "learning_rate": 0.000671603828119982, | |
| "loss": 0.2459, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.9590211317361423, | |
| "grad_norm": 1.0110082626342773, | |
| "learning_rate": 0.0006694671034320337, | |
| "loss": 0.2466, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.991136232256407, | |
| "grad_norm": 0.9974693655967712, | |
| "learning_rate": 0.0006673303787440855, | |
| "loss": 0.2469, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.000128460402081, | |
| "eval_loss": 0.23335064947605133, | |
| "eval_runtime": 3.9065, | |
| "eval_samples_per_second": 127.993, | |
| "eval_steps_per_second": 8.192, | |
| "step": 31140 | |
| }, | |
| { | |
| "epoch": 2.0232513327766717, | |
| "grad_norm": 0.8978867530822754, | |
| "learning_rate": 0.0006651893720427345, | |
| "loss": 0.2447, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.0553664332969364, | |
| "grad_norm": 0.9614811539649963, | |
| "learning_rate": 0.0006630483653413835, | |
| "loss": 0.2434, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.087481533817201, | |
| "grad_norm": 0.9430557489395142, | |
| "learning_rate": 0.0006609073586400326, | |
| "loss": 0.2404, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.1195966343374657, | |
| "grad_norm": 1.088191270828247, | |
| "learning_rate": 0.0006587663519386816, | |
| "loss": 0.2403, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.15171173485773, | |
| "grad_norm": 1.05572509765625, | |
| "learning_rate": 0.0006566253452373306, | |
| "loss": 0.2417, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.1838268353779946, | |
| "grad_norm": 1.0838186740875244, | |
| "learning_rate": 0.0006544843385359796, | |
| "loss": 0.2416, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.2001413064422892, | |
| "eval_loss": 0.24185791611671448, | |
| "eval_runtime": 3.8292, | |
| "eval_samples_per_second": 130.575, | |
| "eval_steps_per_second": 8.357, | |
| "step": 34254 | |
| }, | |
| { | |
| "epoch": 2.2159419358982593, | |
| "grad_norm": 0.9424343705177307, | |
| "learning_rate": 0.0006523433318346287, | |
| "loss": 0.2448, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.248057036418524, | |
| "grad_norm": 1.4872556924819946, | |
| "learning_rate": 0.0006502023251332776, | |
| "loss": 0.2443, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.2801721369387886, | |
| "grad_norm": 1.3446072340011597, | |
| "learning_rate": 0.0006480613184319268, | |
| "loss": 0.2412, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.3122872374590533, | |
| "grad_norm": 0.9808722138404846, | |
| "learning_rate": 0.0006459245937439784, | |
| "loss": 0.2386, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.344402337979318, | |
| "grad_norm": 1.144508719444275, | |
| "learning_rate": 0.0006437835870426275, | |
| "loss": 0.2387, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.3765174384995826, | |
| "grad_norm": 0.9765673279762268, | |
| "learning_rate": 0.0006416425803412765, | |
| "loss": 0.2402, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.4001541524824974, | |
| "eval_loss": 0.2269136607646942, | |
| "eval_runtime": 4.0495, | |
| "eval_samples_per_second": 123.471, | |
| "eval_steps_per_second": 7.902, | |
| "step": 37368 | |
| }, | |
| { | |
| "epoch": 2.4086325390198473, | |
| "grad_norm": 1.0742825269699097, | |
| "learning_rate": 0.0006395015736399256, | |
| "loss": 0.2407, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.4407476395401115, | |
| "grad_norm": 1.157128930091858, | |
| "learning_rate": 0.0006373648489519773, | |
| "loss": 0.2376, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.472862740060376, | |
| "grad_norm": 0.9436767101287842, | |
| "learning_rate": 0.0006352238422506263, | |
| "loss": 0.2355, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.504977840580641, | |
| "grad_norm": 1.5304116010665894, | |
| "learning_rate": 0.0006330828355492754, | |
| "loss": 0.2388, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.5370929411009056, | |
| "grad_norm": 0.9645853638648987, | |
| "learning_rate": 0.000630946110861327, | |
| "loss": 0.236, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.5692080416211702, | |
| "grad_norm": 0.77340167760849, | |
| "learning_rate": 0.0006288051041599761, | |
| "loss": 0.2401, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.600166998522705, | |
| "eval_loss": 0.22546811401844025, | |
| "eval_runtime": 3.7972, | |
| "eval_samples_per_second": 131.676, | |
| "eval_steps_per_second": 8.427, | |
| "step": 40482 | |
| }, | |
| { | |
| "epoch": 2.601323142141435, | |
| "grad_norm": 1.2897224426269531, | |
| "learning_rate": 0.0006266640974586251, | |
| "loss": 0.2372, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.6334382426616996, | |
| "grad_norm": 0.92377108335495, | |
| "learning_rate": 0.0006245230907572741, | |
| "loss": 0.2378, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.6655533431819642, | |
| "grad_norm": 1.231541395187378, | |
| "learning_rate": 0.0006223820840559231, | |
| "loss": 0.236, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.697668443702229, | |
| "grad_norm": 1.1643468141555786, | |
| "learning_rate": 0.0006202410773545722, | |
| "loss": 0.2343, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.729783544222493, | |
| "grad_norm": 0.9667991399765015, | |
| "learning_rate": 0.0006181043526666238, | |
| "loss": 0.2336, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.7618986447427583, | |
| "grad_norm": 0.9757621884346008, | |
| "learning_rate": 0.0006159633459652729, | |
| "loss": 0.2333, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.7940137452630225, | |
| "grad_norm": 1.1959055662155151, | |
| "learning_rate": 0.000613822339263922, | |
| "loss": 0.2368, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.8001798445629134, | |
| "eval_loss": 0.23977364599704742, | |
| "eval_runtime": 4.0372, | |
| "eval_samples_per_second": 123.848, | |
| "eval_steps_per_second": 7.926, | |
| "step": 43596 | |
| }, | |
| { | |
| "epoch": 2.826128845783287, | |
| "grad_norm": 0.9506617784500122, | |
| "learning_rate": 0.0006116813325625709, | |
| "loss": 0.2315, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.858243946303552, | |
| "grad_norm": 1.273672103881836, | |
| "learning_rate": 0.0006095446078746227, | |
| "loss": 0.2347, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.8903590468238165, | |
| "grad_norm": 1.50209641456604, | |
| "learning_rate": 0.0006074036011732717, | |
| "loss": 0.2361, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.922474147344081, | |
| "grad_norm": 0.9982122182846069, | |
| "learning_rate": 0.0006052625944719207, | |
| "loss": 0.2338, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.954589247864346, | |
| "grad_norm": 1.061805009841919, | |
| "learning_rate": 0.0006031215877705697, | |
| "loss": 0.2325, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.9867043483846105, | |
| "grad_norm": 1.1958117485046387, | |
| "learning_rate": 0.0006009805810692188, | |
| "loss": 0.2309, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 3.0001926906031215, | |
| "eval_loss": 0.22259989380836487, | |
| "eval_runtime": 3.8294, | |
| "eval_samples_per_second": 130.568, | |
| "eval_steps_per_second": 8.356, | |
| "step": 46710 | |
| }, | |
| { | |
| "epoch": 3.018819448904875, | |
| "grad_norm": 0.9146483540534973, | |
| "learning_rate": 0.0005988438563812704, | |
| "loss": 0.23, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 3.05093454942514, | |
| "grad_norm": 0.9622049927711487, | |
| "learning_rate": 0.0005967028496799195, | |
| "loss": 0.2294, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 3.0830496499454045, | |
| "grad_norm": 0.9770357608795166, | |
| "learning_rate": 0.0005945618429785685, | |
| "loss": 0.2293, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 3.1151647504656688, | |
| "grad_norm": 1.1833593845367432, | |
| "learning_rate": 0.0005924208362772176, | |
| "loss": 0.2266, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 3.1472798509859334, | |
| "grad_norm": 0.7183510065078735, | |
| "learning_rate": 0.0005902798295758665, | |
| "loss": 0.2278, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 3.179394951506198, | |
| "grad_norm": 0.8913053870201111, | |
| "learning_rate": 0.0005881388228745156, | |
| "loss": 0.2289, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 3.2002055366433297, | |
| "eval_loss": 0.22066444158554077, | |
| "eval_runtime": 3.8492, | |
| "eval_samples_per_second": 129.896, | |
| "eval_steps_per_second": 8.313, | |
| "step": 49824 | |
| }, | |
| { | |
| "epoch": 3.211510052026463, | |
| "grad_norm": 0.800855815410614, | |
| "learning_rate": 0.0005859978161731647, | |
| "loss": 0.2271, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 3.2436251525467275, | |
| "grad_norm": 0.8037746548652649, | |
| "learning_rate": 0.0005838568094718137, | |
| "loss": 0.2295, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 3.275740253066992, | |
| "grad_norm": 0.9885351657867432, | |
| "learning_rate": 0.0005817158027704627, | |
| "loss": 0.224, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 3.307855353587257, | |
| "grad_norm": 0.8889601826667786, | |
| "learning_rate": 0.0005795790780825145, | |
| "loss": 0.2274, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 3.3399704541075215, | |
| "grad_norm": 1.0997310876846313, | |
| "learning_rate": 0.0005774423533945662, | |
| "loss": 0.2225, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 3.372085554627786, | |
| "grad_norm": 0.7647742629051208, | |
| "learning_rate": 0.0005753013466932152, | |
| "loss": 0.226, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 3.400218382683538, | |
| "eval_loss": 0.21940012276172638, | |
| "eval_runtime": 4.117, | |
| "eval_samples_per_second": 121.447, | |
| "eval_steps_per_second": 7.773, | |
| "step": 52938 | |
| }, | |
| { | |
| "epoch": 3.404200655148051, | |
| "grad_norm": 1.265453815460205, | |
| "learning_rate": 0.0005731603399918643, | |
| "loss": 0.2285, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 3.436315755668315, | |
| "grad_norm": 0.9455955028533936, | |
| "learning_rate": 0.0005710193332905132, | |
| "loss": 0.2283, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 3.4684308561885797, | |
| "grad_norm": 1.289652943611145, | |
| "learning_rate": 0.0005688783265891623, | |
| "loss": 0.2268, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 3.5005459567088444, | |
| "grad_norm": 1.1715284585952759, | |
| "learning_rate": 0.0005667373198878113, | |
| "loss": 0.2282, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 3.532661057229109, | |
| "grad_norm": 0.9027577042579651, | |
| "learning_rate": 0.0005645963131864603, | |
| "loss": 0.2257, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.5647761577493737, | |
| "grad_norm": 0.9521860480308533, | |
| "learning_rate": 0.0005624553064851093, | |
| "loss": 0.2258, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 3.5968912582696384, | |
| "grad_norm": 1.1611847877502441, | |
| "learning_rate": 0.0005603185817971611, | |
| "loss": 0.2249, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.6002312287237457, | |
| "eval_loss": 0.21782347559928894, | |
| "eval_runtime": 3.787, | |
| "eval_samples_per_second": 132.029, | |
| "eval_steps_per_second": 8.45, | |
| "step": 56052 | |
| }, | |
| { | |
| "epoch": 3.629006358789903, | |
| "grad_norm": 1.0853767395019531, | |
| "learning_rate": 0.00055817757509581, | |
| "loss": 0.2231, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 3.6611214593101677, | |
| "grad_norm": 1.7563400268554688, | |
| "learning_rate": 0.0005560365683944591, | |
| "loss": 0.2235, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.6932365598304324, | |
| "grad_norm": 1.2496379613876343, | |
| "learning_rate": 0.0005538955616931081, | |
| "loss": 0.2206, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 3.7253516603506966, | |
| "grad_norm": 0.9466719031333923, | |
| "learning_rate": 0.0005517588370051598, | |
| "loss": 0.2241, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.7574667608709618, | |
| "grad_norm": 0.9584017992019653, | |
| "learning_rate": 0.0005496178303038089, | |
| "loss": 0.2164, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 3.789581861391226, | |
| "grad_norm": 0.9684711694717407, | |
| "learning_rate": 0.0005474768236024579, | |
| "loss": 0.2214, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.800244074763954, | |
| "eval_loss": 0.21725060045719147, | |
| "eval_runtime": 4.1337, | |
| "eval_samples_per_second": 120.958, | |
| "eval_steps_per_second": 7.741, | |
| "step": 59166 | |
| }, | |
| { | |
| "epoch": 3.8216969619114907, | |
| "grad_norm": 2.9653055667877197, | |
| "learning_rate": 0.000545335816901107, | |
| "loss": 0.2248, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 3.8538120624317553, | |
| "grad_norm": 1.0390269756317139, | |
| "learning_rate": 0.0005431990922131586, | |
| "loss": 0.2192, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.88592716295202, | |
| "grad_norm": 1.2799882888793945, | |
| "learning_rate": 0.0005410580855118077, | |
| "loss": 0.2228, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.9180422634722847, | |
| "grad_norm": 0.9130102396011353, | |
| "learning_rate": 0.0005389213608238593, | |
| "loss": 0.2228, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.9501573639925494, | |
| "grad_norm": 1.771164894104004, | |
| "learning_rate": 0.0005367803541225084, | |
| "loss": 0.2228, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.982272464512814, | |
| "grad_norm": 0.9048191905021667, | |
| "learning_rate": 0.0005346393474211574, | |
| "loss": 0.2207, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 4.000256920804162, | |
| "eval_loss": 0.21284444630146027, | |
| "eval_runtime": 4.0636, | |
| "eval_samples_per_second": 123.045, | |
| "eval_steps_per_second": 7.875, | |
| "step": 62280 | |
| }, | |
| { | |
| "epoch": 4.014387565033078, | |
| "grad_norm": 1.1616911888122559, | |
| "learning_rate": 0.0005324983407198065, | |
| "loss": 0.2192, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 4.046502665553343, | |
| "grad_norm": 1.1269534826278687, | |
| "learning_rate": 0.0005303573340184554, | |
| "loss": 0.2164, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 4.078617766073608, | |
| "grad_norm": 0.9181855320930481, | |
| "learning_rate": 0.0005282206093305072, | |
| "loss": 0.2185, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 4.110732866593873, | |
| "grad_norm": 1.3248172998428345, | |
| "learning_rate": 0.0005260796026291563, | |
| "loss": 0.218, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 4.142847967114137, | |
| "grad_norm": 1.0312740802764893, | |
| "learning_rate": 0.0005239385959278053, | |
| "loss": 0.217, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 4.174963067634402, | |
| "grad_norm": 1.00308096408844, | |
| "learning_rate": 0.0005217975892264544, | |
| "loss": 0.2158, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 4.20026976684437, | |
| "eval_loss": 0.21044744551181793, | |
| "eval_runtime": 3.8176, | |
| "eval_samples_per_second": 130.973, | |
| "eval_steps_per_second": 8.382, | |
| "step": 65394 | |
| }, | |
| { | |
| "epoch": 4.207078168154666, | |
| "grad_norm": 0.6961658000946045, | |
| "learning_rate": 0.0005196565825251034, | |
| "loss": 0.2168, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 4.239193268674931, | |
| "grad_norm": 0.9448217749595642, | |
| "learning_rate": 0.0005175155758237524, | |
| "loss": 0.2163, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 4.271308369195196, | |
| "grad_norm": 0.9778387546539307, | |
| "learning_rate": 0.0005153745691224014, | |
| "loss": 0.217, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 4.30342346971546, | |
| "grad_norm": 1.1238789558410645, | |
| "learning_rate": 0.0005132335624210505, | |
| "loss": 0.2174, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 4.335538570235725, | |
| "grad_norm": 0.926021933555603, | |
| "learning_rate": 0.0005110968377331021, | |
| "loss": 0.2119, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 4.367653670755989, | |
| "grad_norm": 1.2473511695861816, | |
| "learning_rate": 0.0005089558310317512, | |
| "loss": 0.2187, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 4.399768771276254, | |
| "grad_norm": 1.4314179420471191, | |
| "learning_rate": 0.0005068148243304002, | |
| "loss": 0.2147, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 4.4002826128845784, | |
| "eval_loss": 0.2070987969636917, | |
| "eval_runtime": 3.7911, | |
| "eval_samples_per_second": 131.887, | |
| "eval_steps_per_second": 8.441, | |
| "step": 68508 | |
| }, | |
| { | |
| "epoch": 4.4318838717965185, | |
| "grad_norm": 1.2765947580337524, | |
| "learning_rate": 0.0005046738176290492, | |
| "loss": 0.2123, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 4.463998972316784, | |
| "grad_norm": 0.9501237273216248, | |
| "learning_rate": 0.0005025370929411009, | |
| "loss": 0.2142, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 4.496114072837048, | |
| "grad_norm": 1.161289930343628, | |
| "learning_rate": 0.00050039608623975, | |
| "loss": 0.2125, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 4.528229173357312, | |
| "grad_norm": 0.9597361087799072, | |
| "learning_rate": 0.0004982550795383989, | |
| "loss": 0.213, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 4.560344273877577, | |
| "grad_norm": 1.041593074798584, | |
| "learning_rate": 0.000496114072837048, | |
| "loss": 0.2147, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 4.592459374397842, | |
| "grad_norm": 0.8988145589828491, | |
| "learning_rate": 0.0004939773481490998, | |
| "loss": 0.2139, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 4.600295458924786, | |
| "eval_loss": 0.20830930769443512, | |
| "eval_runtime": 3.7998, | |
| "eval_samples_per_second": 131.585, | |
| "eval_steps_per_second": 8.421, | |
| "step": 71622 | |
| }, | |
| { | |
| "epoch": 4.624574474918107, | |
| "grad_norm": 0.9001346230506897, | |
| "learning_rate": 0.0004918363414477487, | |
| "loss": 0.2132, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 4.656689575438371, | |
| "grad_norm": 0.8040429949760437, | |
| "learning_rate": 0.0004896953347463978, | |
| "loss": 0.2113, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 4.688804675958636, | |
| "grad_norm": 0.9886132478713989, | |
| "learning_rate": 0.0004875543280450468, | |
| "loss": 0.2085, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 4.7209197764789, | |
| "grad_norm": 1.1031527519226074, | |
| "learning_rate": 0.0004854133213436958, | |
| "loss": 0.2117, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 4.753034876999165, | |
| "grad_norm": 0.9805654287338257, | |
| "learning_rate": 0.0004832723146423448, | |
| "loss": 0.2087, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 4.7851499775194295, | |
| "grad_norm": 1.1665472984313965, | |
| "learning_rate": 0.00048113130794099387, | |
| "loss": 0.2094, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 4.800308304964995, | |
| "eval_loss": 0.20766516029834747, | |
| "eval_runtime": 3.8162, | |
| "eval_samples_per_second": 131.019, | |
| "eval_steps_per_second": 8.385, | |
| "step": 74736 | |
| }, | |
| { | |
| "epoch": 4.817265078039695, | |
| "grad_norm": 1.1881592273712158, | |
| "learning_rate": 0.00047899030123964287, | |
| "loss": 0.2123, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.849380178559959, | |
| "grad_norm": 1.1299117803573608, | |
| "learning_rate": 0.0004768535765516946, | |
| "loss": 0.2048, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 4.881495279080223, | |
| "grad_norm": 0.9001392722129822, | |
| "learning_rate": 0.00047471256985034363, | |
| "loss": 0.2067, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 4.913610379600488, | |
| "grad_norm": 0.7669143676757812, | |
| "learning_rate": 0.00047257156314899263, | |
| "loss": 0.2133, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 4.945725480120752, | |
| "grad_norm": 1.0141063928604126, | |
| "learning_rate": 0.0004704305564476417, | |
| "loss": 0.208, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 4.9778405806410175, | |
| "grad_norm": 0.9920214414596558, | |
| "learning_rate": 0.0004682895497462907, | |
| "loss": 0.2072, | |
| "step": 77500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 186828, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 12, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.7295700226799043e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |