{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 3114, "global_step": 108983, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03211510052026463, "grad_norm": 1.2270241975784302, "learning_rate": 0.0007978718393388571, "loss": 0.4499, "step": 500 }, { "epoch": 0.06423020104052926, "grad_norm": 1.1903154850006104, "learning_rate": 0.0007957308326375062, "loss": 0.3541, "step": 1000 }, { "epoch": 0.09634530156079389, "grad_norm": 1.1337592601776123, "learning_rate": 0.0007935898259361552, "loss": 0.3336, "step": 1500 }, { "epoch": 0.12846040208105852, "grad_norm": 0.9932805895805359, "learning_rate": 0.0007914488192348043, "loss": 0.3187, "step": 2000 }, { "epoch": 0.16057550260132314, "grad_norm": 1.4273182153701782, "learning_rate": 0.0007893120945468559, "loss": 0.3094, "step": 2500 }, { "epoch": 0.19269060312158778, "grad_norm": 1.296076774597168, "learning_rate": 0.000787171087845505, "loss": 0.306, "step": 3000 }, { "epoch": 0.2000128460402081, "eval_loss": 0.3016555607318878, "eval_runtime": 4.134, "eval_samples_per_second": 120.948, "eval_steps_per_second": 7.741, "step": 3114 }, { "epoch": 0.2248057036418524, "grad_norm": 1.1079399585723877, "learning_rate": 0.0007850343631575566, "loss": 0.2993, "step": 3500 }, { "epoch": 0.25692080416211704, "grad_norm": 1.5939998626708984, "learning_rate": 0.0007828933564562057, "loss": 0.2985, "step": 4000 }, { "epoch": 0.28903590468238166, "grad_norm": 1.4033524990081787, "learning_rate": 0.0007807523497548548, "loss": 0.2959, "step": 4500 }, { "epoch": 0.3211510052026463, "grad_norm": 1.9331912994384766, "learning_rate": 0.0007786113430535038, "loss": 0.2881, "step": 5000 }, { "epoch": 0.3532661057229109, "grad_norm": 1.112199068069458, "learning_rate": 0.0007764703363521528, "loss": 0.2874, "step": 5500 }, { "epoch": 0.38538120624317557, "grad_norm": 0.9604991674423218, "learning_rate": 0.0007743293296508018, "loss": 0.2867, "step": 6000 }, { "epoch": 0.4000256920804162, "eval_loss": 0.27945849299430847, "eval_runtime": 4.1149, "eval_samples_per_second": 121.509, "eval_steps_per_second": 7.777, "step": 6228 }, { "epoch": 0.4174963067634402, "grad_norm": 1.542017936706543, "learning_rate": 0.0007721883229494509, "loss": 0.2842, "step": 6500 }, { "epoch": 0.4496114072837048, "grad_norm": 0.9913608431816101, "learning_rate": 0.0007700473162480999, "loss": 0.2805, "step": 7000 }, { "epoch": 0.4817265078039694, "grad_norm": 1.0861561298370361, "learning_rate": 0.000767906309546749, "loss": 0.2795, "step": 7500 }, { "epoch": 0.5138416083242341, "grad_norm": 1.408018946647644, "learning_rate": 0.0007657695848588007, "loss": 0.2791, "step": 8000 }, { "epoch": 0.5459567088444987, "grad_norm": 0.935492992401123, "learning_rate": 0.0007636285781574497, "loss": 0.2768, "step": 8500 }, { "epoch": 0.5780718093647633, "grad_norm": 0.974107027053833, "learning_rate": 0.0007614875714560987, "loss": 0.2775, "step": 9000 }, { "epoch": 0.6000385381206244, "eval_loss": 0.26882508397102356, "eval_runtime": 4.0584, "eval_samples_per_second": 123.2, "eval_steps_per_second": 7.885, "step": 9342 }, { "epoch": 0.610186909885028, "grad_norm": 1.00677490234375, "learning_rate": 0.0007593465647547478, "loss": 0.2719, "step": 9500 }, { "epoch": 0.6423020104052926, "grad_norm": 1.3076094388961792, "learning_rate": 0.0007572098400667994, "loss": 0.2733, "step": 10000 }, { "epoch": 0.6744171109255572, "grad_norm": 1.332555890083313, "learning_rate": 0.0007550688333654485, "loss": 0.2738, "step": 10500 }, { "epoch": 0.7065322114458218, "grad_norm": 1.065308928489685, "learning_rate": 0.0007529278266640976, "loss": 0.2697, "step": 11000 }, { "epoch": 0.7386473119660865, "grad_norm": 1.1714705228805542, "learning_rate": 0.0007507868199627465, "loss": 0.2718, "step": 11500 }, { "epoch": 0.7707624124863511, "grad_norm": 1.545327067375183, "learning_rate": 0.0007486500952747983, "loss": 0.2701, "step": 12000 }, { "epoch": 0.8000513841608324, "eval_loss": 0.25833237171173096, "eval_runtime": 3.9621, "eval_samples_per_second": 126.196, "eval_steps_per_second": 8.077, "step": 12456 }, { "epoch": 0.8028775130066157, "grad_norm": 0.8509078621864319, "learning_rate": 0.0007465090885734473, "loss": 0.2702, "step": 12500 }, { "epoch": 0.8349926135268804, "grad_norm": 1.0517570972442627, "learning_rate": 0.0007443680818720963, "loss": 0.2684, "step": 13000 }, { "epoch": 0.8671077140471449, "grad_norm": 1.0709669589996338, "learning_rate": 0.000742231357184148, "loss": 0.2674, "step": 13500 }, { "epoch": 0.8992228145674096, "grad_norm": 1.1354570388793945, "learning_rate": 0.0007400903504827971, "loss": 0.269, "step": 14000 }, { "epoch": 0.9313379150876743, "grad_norm": 1.5820938348770142, "learning_rate": 0.000737949343781446, "loss": 0.2665, "step": 14500 }, { "epoch": 0.9634530156079388, "grad_norm": 1.3303571939468384, "learning_rate": 0.0007358083370800951, "loss": 0.2644, "step": 15000 }, { "epoch": 0.9955681161282035, "grad_norm": 1.0390913486480713, "learning_rate": 0.0007336716123921468, "loss": 0.268, "step": 15500 }, { "epoch": 1.0000642302010405, "eval_loss": 0.25299617648124695, "eval_runtime": 3.8751, "eval_samples_per_second": 129.03, "eval_steps_per_second": 8.258, "step": 15570 }, { "epoch": 1.0276832166484682, "grad_norm": 1.2327704429626465, "learning_rate": 0.0007315306056907958, "loss": 0.263, "step": 16000 }, { "epoch": 1.0597983171687329, "grad_norm": 0.9403806924819946, "learning_rate": 0.0007293895989894448, "loss": 0.2633, "step": 16500 }, { "epoch": 1.0919134176889973, "grad_norm": 1.1138664484024048, "learning_rate": 0.0007272485922880939, "loss": 0.2608, "step": 17000 }, { "epoch": 1.124028518209262, "grad_norm": 1.1546539068222046, "learning_rate": 0.000725107585586743, "loss": 0.2569, "step": 17500 }, { "epoch": 1.1561436187295266, "grad_norm": 1.0123635530471802, "learning_rate": 0.0007229665788853919, "loss": 0.2596, "step": 18000 }, { "epoch": 1.1882587192497913, "grad_norm": 1.1647980213165283, "learning_rate": 0.000720825572184041, "loss": 0.2609, "step": 18500 }, { "epoch": 1.2000770762412487, "eval_loss": 0.2469949871301651, "eval_runtime": 3.796, "eval_samples_per_second": 131.719, "eval_steps_per_second": 8.43, "step": 18684 }, { "epoch": 1.2203738197700558, "grad_norm": 1.2368906736373901, "learning_rate": 0.00071868456548269, "loss": 0.2597, "step": 19000 }, { "epoch": 1.2524889202903204, "grad_norm": 0.9881177544593811, "learning_rate": 0.000716543558781339, "loss": 0.2563, "step": 19500 }, { "epoch": 1.2846040208105851, "grad_norm": 0.9961882829666138, "learning_rate": 0.0007144068340933907, "loss": 0.2563, "step": 20000 }, { "epoch": 1.3167191213308498, "grad_norm": 1.6545355319976807, "learning_rate": 0.0007122658273920398, "loss": 0.2566, "step": 20500 }, { "epoch": 1.3488342218511145, "grad_norm": 1.2175770998001099, "learning_rate": 0.0007101248206906887, "loss": 0.251, "step": 21000 }, { "epoch": 1.3809493223713791, "grad_norm": 1.2942149639129639, "learning_rate": 0.0007079838139893379, "loss": 0.2549, "step": 21500 }, { "epoch": 1.4000899222814567, "eval_loss": 0.24247248470783234, "eval_runtime": 4.0127, "eval_samples_per_second": 124.605, "eval_steps_per_second": 7.975, "step": 21798 }, { "epoch": 1.4130644228916436, "grad_norm": 0.9972023963928223, "learning_rate": 0.0007058470893013895, "loss": 0.2532, "step": 22000 }, { "epoch": 1.4451795234119083, "grad_norm": 2.422755479812622, "learning_rate": 0.0007037060826000386, "loss": 0.2525, "step": 22500 }, { "epoch": 1.477294623932173, "grad_norm": 1.0350821018218994, "learning_rate": 0.0007015650758986876, "loss": 0.2528, "step": 23000 }, { "epoch": 1.5094097244524374, "grad_norm": 0.9712342023849487, "learning_rate": 0.0006994240691973367, "loss": 0.2498, "step": 23500 }, { "epoch": 1.541524824972702, "grad_norm": 1.0698814392089844, "learning_rate": 0.0006972830624959856, "loss": 0.2511, "step": 24000 }, { "epoch": 1.5736399254929667, "grad_norm": 1.0637270212173462, "learning_rate": 0.0006951463378080374, "loss": 0.2542, "step": 24500 }, { "epoch": 1.6001027683216649, "eval_loss": 0.23835672438144684, "eval_runtime": 3.8011, "eval_samples_per_second": 131.543, "eval_steps_per_second": 8.419, "step": 24912 }, { "epoch": 1.6057550260132314, "grad_norm": 1.08571195602417, "learning_rate": 0.0006930053311066865, "loss": 0.2531, "step": 25000 }, { "epoch": 1.637870126533496, "grad_norm": 0.9403467774391174, "learning_rate": 0.0006908643244053354, "loss": 0.2522, "step": 25500 }, { "epoch": 1.6699852270537607, "grad_norm": 0.8324321508407593, "learning_rate": 0.0006887233177039845, "loss": 0.2493, "step": 26000 }, { "epoch": 1.7021003275740254, "grad_norm": 0.9599499702453613, "learning_rate": 0.0006865908750294389, "loss": 0.2485, "step": 26500 }, { "epoch": 1.73421542809429, "grad_norm": 1.369850754737854, "learning_rate": 0.0006844498683280879, "loss": 0.25, "step": 27000 }, { "epoch": 1.7663305286145545, "grad_norm": 1.042289137840271, "learning_rate": 0.0006823088616267369, "loss": 0.2449, "step": 27500 }, { "epoch": 1.7984456291348192, "grad_norm": 1.2191327810287476, "learning_rate": 0.000680167854925386, "loss": 0.2489, "step": 28000 }, { "epoch": 1.8001156143618728, "eval_loss": 0.23773300647735596, "eval_runtime": 4.0318, "eval_samples_per_second": 124.015, "eval_steps_per_second": 7.937, "step": 28026 }, { "epoch": 1.8305607296550839, "grad_norm": 0.9970433712005615, "learning_rate": 0.0006780268482240349, "loss": 0.2517, "step": 28500 }, { "epoch": 1.8626758301753483, "grad_norm": 1.0307445526123047, "learning_rate": 0.000675885841522684, "loss": 0.2462, "step": 29000 }, { "epoch": 1.894790930695613, "grad_norm": 1.1497652530670166, "learning_rate": 0.000673744834821333, "loss": 0.2494, "step": 29500 }, { "epoch": 1.9269060312158777, "grad_norm": 0.8870740532875061, "learning_rate": 0.000671603828119982, "loss": 0.2459, "step": 30000 }, { "epoch": 1.9590211317361423, "grad_norm": 1.0110082626342773, "learning_rate": 0.0006694671034320337, "loss": 0.2466, "step": 30500 }, { "epoch": 1.991136232256407, "grad_norm": 0.9974693655967712, "learning_rate": 0.0006673303787440855, "loss": 0.2469, "step": 31000 }, { "epoch": 2.000128460402081, "eval_loss": 0.23335064947605133, "eval_runtime": 3.9065, "eval_samples_per_second": 127.993, "eval_steps_per_second": 8.192, "step": 31140 }, { "epoch": 2.0232513327766717, "grad_norm": 0.8978867530822754, "learning_rate": 0.0006651893720427345, "loss": 0.2447, "step": 31500 }, { "epoch": 2.0553664332969364, "grad_norm": 0.9614811539649963, "learning_rate": 0.0006630483653413835, "loss": 0.2434, "step": 32000 }, { "epoch": 2.087481533817201, "grad_norm": 0.9430557489395142, "learning_rate": 0.0006609073586400326, "loss": 0.2404, "step": 32500 }, { "epoch": 2.1195966343374657, "grad_norm": 1.088191270828247, "learning_rate": 0.0006587663519386816, "loss": 0.2403, "step": 33000 }, { "epoch": 2.15171173485773, "grad_norm": 1.05572509765625, "learning_rate": 0.0006566253452373306, "loss": 0.2417, "step": 33500 }, { "epoch": 2.1838268353779946, "grad_norm": 1.0838186740875244, "learning_rate": 0.0006544843385359796, "loss": 0.2416, "step": 34000 }, { "epoch": 2.2001413064422892, "eval_loss": 0.24185791611671448, "eval_runtime": 3.8292, "eval_samples_per_second": 130.575, "eval_steps_per_second": 8.357, "step": 34254 }, { "epoch": 2.2159419358982593, "grad_norm": 0.9424343705177307, "learning_rate": 0.0006523433318346287, "loss": 0.2448, "step": 34500 }, { "epoch": 2.248057036418524, "grad_norm": 1.4872556924819946, "learning_rate": 0.0006502023251332776, "loss": 0.2443, "step": 35000 }, { "epoch": 2.2801721369387886, "grad_norm": 1.3446072340011597, "learning_rate": 0.0006480613184319268, "loss": 0.2412, "step": 35500 }, { "epoch": 2.3122872374590533, "grad_norm": 0.9808722138404846, "learning_rate": 0.0006459245937439784, "loss": 0.2386, "step": 36000 }, { "epoch": 2.344402337979318, "grad_norm": 1.144508719444275, "learning_rate": 0.0006437835870426275, "loss": 0.2387, "step": 36500 }, { "epoch": 2.3765174384995826, "grad_norm": 0.9765673279762268, "learning_rate": 0.0006416425803412765, "loss": 0.2402, "step": 37000 }, { "epoch": 2.4001541524824974, "eval_loss": 0.2269136607646942, "eval_runtime": 4.0495, "eval_samples_per_second": 123.471, "eval_steps_per_second": 7.902, "step": 37368 }, { "epoch": 2.4086325390198473, "grad_norm": 1.0742825269699097, "learning_rate": 0.0006395015736399256, "loss": 0.2407, "step": 37500 }, { "epoch": 2.4407476395401115, "grad_norm": 1.157128930091858, "learning_rate": 0.0006373648489519773, "loss": 0.2376, "step": 38000 }, { "epoch": 2.472862740060376, "grad_norm": 0.9436767101287842, "learning_rate": 0.0006352238422506263, "loss": 0.2355, "step": 38500 }, { "epoch": 2.504977840580641, "grad_norm": 1.5304116010665894, "learning_rate": 0.0006330828355492754, "loss": 0.2388, "step": 39000 }, { "epoch": 2.5370929411009056, "grad_norm": 0.9645853638648987, "learning_rate": 0.000630946110861327, "loss": 0.236, "step": 39500 }, { "epoch": 2.5692080416211702, "grad_norm": 0.77340167760849, "learning_rate": 0.0006288051041599761, "loss": 0.2401, "step": 40000 }, { "epoch": 2.600166998522705, "eval_loss": 0.22546811401844025, "eval_runtime": 3.7972, "eval_samples_per_second": 131.676, "eval_steps_per_second": 8.427, "step": 40482 }, { "epoch": 2.601323142141435, "grad_norm": 1.2897224426269531, "learning_rate": 0.0006266640974586251, "loss": 0.2372, "step": 40500 }, { "epoch": 2.6334382426616996, "grad_norm": 0.92377108335495, "learning_rate": 0.0006245230907572741, "loss": 0.2378, "step": 41000 }, { "epoch": 2.6655533431819642, "grad_norm": 1.231541395187378, "learning_rate": 0.0006223820840559231, "loss": 0.236, "step": 41500 }, { "epoch": 2.697668443702229, "grad_norm": 1.1643468141555786, "learning_rate": 0.0006202410773545722, "loss": 0.2343, "step": 42000 }, { "epoch": 2.729783544222493, "grad_norm": 0.9667991399765015, "learning_rate": 0.0006181043526666238, "loss": 0.2336, "step": 42500 }, { "epoch": 2.7618986447427583, "grad_norm": 0.9757621884346008, "learning_rate": 0.0006159633459652729, "loss": 0.2333, "step": 43000 }, { "epoch": 2.7940137452630225, "grad_norm": 1.1959055662155151, "learning_rate": 0.000613822339263922, "loss": 0.2368, "step": 43500 }, { "epoch": 2.8001798445629134, "eval_loss": 0.23977364599704742, "eval_runtime": 4.0372, "eval_samples_per_second": 123.848, "eval_steps_per_second": 7.926, "step": 43596 }, { "epoch": 2.826128845783287, "grad_norm": 0.9506617784500122, "learning_rate": 0.0006116813325625709, "loss": 0.2315, "step": 44000 }, { "epoch": 2.858243946303552, "grad_norm": 1.273672103881836, "learning_rate": 0.0006095446078746227, "loss": 0.2347, "step": 44500 }, { "epoch": 2.8903590468238165, "grad_norm": 1.50209641456604, "learning_rate": 0.0006074036011732717, "loss": 0.2361, "step": 45000 }, { "epoch": 2.922474147344081, "grad_norm": 0.9982122182846069, "learning_rate": 0.0006052625944719207, "loss": 0.2338, "step": 45500 }, { "epoch": 2.954589247864346, "grad_norm": 1.061805009841919, "learning_rate": 0.0006031215877705697, "loss": 0.2325, "step": 46000 }, { "epoch": 2.9867043483846105, "grad_norm": 1.1958117485046387, "learning_rate": 0.0006009805810692188, "loss": 0.2309, "step": 46500 }, { "epoch": 3.0001926906031215, "eval_loss": 0.22259989380836487, "eval_runtime": 3.8294, "eval_samples_per_second": 130.568, "eval_steps_per_second": 8.356, "step": 46710 }, { "epoch": 3.018819448904875, "grad_norm": 0.9146483540534973, "learning_rate": 0.0005988438563812704, "loss": 0.23, "step": 47000 }, { "epoch": 3.05093454942514, "grad_norm": 0.9622049927711487, "learning_rate": 0.0005967028496799195, "loss": 0.2294, "step": 47500 }, { "epoch": 3.0830496499454045, "grad_norm": 0.9770357608795166, "learning_rate": 0.0005945618429785685, "loss": 0.2293, "step": 48000 }, { "epoch": 3.1151647504656688, "grad_norm": 1.1833593845367432, "learning_rate": 0.0005924208362772176, "loss": 0.2266, "step": 48500 }, { "epoch": 3.1472798509859334, "grad_norm": 0.7183510065078735, "learning_rate": 0.0005902798295758665, "loss": 0.2278, "step": 49000 }, { "epoch": 3.179394951506198, "grad_norm": 0.8913053870201111, "learning_rate": 0.0005881388228745156, "loss": 0.2289, "step": 49500 }, { "epoch": 3.2002055366433297, "eval_loss": 0.22066444158554077, "eval_runtime": 3.8492, "eval_samples_per_second": 129.896, "eval_steps_per_second": 8.313, "step": 49824 }, { "epoch": 3.211510052026463, "grad_norm": 0.800855815410614, "learning_rate": 0.0005859978161731647, "loss": 0.2271, "step": 50000 }, { "epoch": 3.2436251525467275, "grad_norm": 0.8037746548652649, "learning_rate": 0.0005838568094718137, "loss": 0.2295, "step": 50500 }, { "epoch": 3.275740253066992, "grad_norm": 0.9885351657867432, "learning_rate": 0.0005817158027704627, "loss": 0.224, "step": 51000 }, { "epoch": 3.307855353587257, "grad_norm": 0.8889601826667786, "learning_rate": 0.0005795790780825145, "loss": 0.2274, "step": 51500 }, { "epoch": 3.3399704541075215, "grad_norm": 1.0997310876846313, "learning_rate": 0.0005774423533945662, "loss": 0.2225, "step": 52000 }, { "epoch": 3.372085554627786, "grad_norm": 0.7647742629051208, "learning_rate": 0.0005753013466932152, "loss": 0.226, "step": 52500 }, { "epoch": 3.400218382683538, "eval_loss": 0.21940012276172638, "eval_runtime": 4.117, "eval_samples_per_second": 121.447, "eval_steps_per_second": 7.773, "step": 52938 }, { "epoch": 3.404200655148051, "grad_norm": 1.265453815460205, "learning_rate": 0.0005731603399918643, "loss": 0.2285, "step": 53000 }, { "epoch": 3.436315755668315, "grad_norm": 0.9455955028533936, "learning_rate": 0.0005710193332905132, "loss": 0.2283, "step": 53500 }, { "epoch": 3.4684308561885797, "grad_norm": 1.289652943611145, "learning_rate": 0.0005688783265891623, "loss": 0.2268, "step": 54000 }, { "epoch": 3.5005459567088444, "grad_norm": 1.1715284585952759, "learning_rate": 0.0005667373198878113, "loss": 0.2282, "step": 54500 }, { "epoch": 3.532661057229109, "grad_norm": 0.9027577042579651, "learning_rate": 0.0005645963131864603, "loss": 0.2257, "step": 55000 }, { "epoch": 3.5647761577493737, "grad_norm": 0.9521860480308533, "learning_rate": 0.0005624553064851093, "loss": 0.2258, "step": 55500 }, { "epoch": 3.5968912582696384, "grad_norm": 1.1611847877502441, "learning_rate": 0.0005603185817971611, "loss": 0.2249, "step": 56000 }, { "epoch": 3.6002312287237457, "eval_loss": 0.21782347559928894, "eval_runtime": 3.787, "eval_samples_per_second": 132.029, "eval_steps_per_second": 8.45, "step": 56052 }, { "epoch": 3.629006358789903, "grad_norm": 1.0853767395019531, "learning_rate": 0.00055817757509581, "loss": 0.2231, "step": 56500 }, { "epoch": 3.6611214593101677, "grad_norm": 1.7563400268554688, "learning_rate": 0.0005560365683944591, "loss": 0.2235, "step": 57000 }, { "epoch": 3.6932365598304324, "grad_norm": 1.2496379613876343, "learning_rate": 0.0005538955616931081, "loss": 0.2206, "step": 57500 }, { "epoch": 3.7253516603506966, "grad_norm": 0.9466719031333923, "learning_rate": 0.0005517588370051598, "loss": 0.2241, "step": 58000 }, { "epoch": 3.7574667608709618, "grad_norm": 0.9584017992019653, "learning_rate": 0.0005496178303038089, "loss": 0.2164, "step": 58500 }, { "epoch": 3.789581861391226, "grad_norm": 0.9684711694717407, "learning_rate": 0.0005474768236024579, "loss": 0.2214, "step": 59000 }, { "epoch": 3.800244074763954, "eval_loss": 0.21725060045719147, "eval_runtime": 4.1337, "eval_samples_per_second": 120.958, "eval_steps_per_second": 7.741, "step": 59166 }, { "epoch": 3.8216969619114907, "grad_norm": 2.9653055667877197, "learning_rate": 0.000545335816901107, "loss": 0.2248, "step": 59500 }, { "epoch": 3.8538120624317553, "grad_norm": 1.0390269756317139, "learning_rate": 0.0005431990922131586, "loss": 0.2192, "step": 60000 }, { "epoch": 3.88592716295202, "grad_norm": 1.2799882888793945, "learning_rate": 0.0005410580855118077, "loss": 0.2228, "step": 60500 }, { "epoch": 3.9180422634722847, "grad_norm": 0.9130102396011353, "learning_rate": 0.0005389213608238593, "loss": 0.2228, "step": 61000 }, { "epoch": 3.9501573639925494, "grad_norm": 1.771164894104004, "learning_rate": 0.0005367803541225084, "loss": 0.2228, "step": 61500 }, { "epoch": 3.982272464512814, "grad_norm": 0.9048191905021667, "learning_rate": 0.0005346393474211574, "loss": 0.2207, "step": 62000 }, { "epoch": 4.000256920804162, "eval_loss": 0.21284444630146027, "eval_runtime": 4.0636, "eval_samples_per_second": 123.045, "eval_steps_per_second": 7.875, "step": 62280 }, { "epoch": 4.014387565033078, "grad_norm": 1.1616911888122559, "learning_rate": 0.0005324983407198065, "loss": 0.2192, "step": 62500 }, { "epoch": 4.046502665553343, "grad_norm": 1.1269534826278687, "learning_rate": 0.0005303573340184554, "loss": 0.2164, "step": 63000 }, { "epoch": 4.078617766073608, "grad_norm": 0.9181855320930481, "learning_rate": 0.0005282206093305072, "loss": 0.2185, "step": 63500 }, { "epoch": 4.110732866593873, "grad_norm": 1.3248172998428345, "learning_rate": 0.0005260796026291563, "loss": 0.218, "step": 64000 }, { "epoch": 4.142847967114137, "grad_norm": 1.0312740802764893, "learning_rate": 0.0005239385959278053, "loss": 0.217, "step": 64500 }, { "epoch": 4.174963067634402, "grad_norm": 1.00308096408844, "learning_rate": 0.0005217975892264544, "loss": 0.2158, "step": 65000 }, { "epoch": 4.20026976684437, "eval_loss": 0.21044744551181793, "eval_runtime": 3.8176, "eval_samples_per_second": 130.973, "eval_steps_per_second": 8.382, "step": 65394 }, { "epoch": 4.207078168154666, "grad_norm": 0.6961658000946045, "learning_rate": 0.0005196565825251034, "loss": 0.2168, "step": 65500 }, { "epoch": 4.239193268674931, "grad_norm": 0.9448217749595642, "learning_rate": 0.0005175155758237524, "loss": 0.2163, "step": 66000 }, { "epoch": 4.271308369195196, "grad_norm": 0.9778387546539307, "learning_rate": 0.0005153745691224014, "loss": 0.217, "step": 66500 }, { "epoch": 4.30342346971546, "grad_norm": 1.1238789558410645, "learning_rate": 0.0005132335624210505, "loss": 0.2174, "step": 67000 }, { "epoch": 4.335538570235725, "grad_norm": 0.926021933555603, "learning_rate": 0.0005110968377331021, "loss": 0.2119, "step": 67500 }, { "epoch": 4.367653670755989, "grad_norm": 1.2473511695861816, "learning_rate": 0.0005089558310317512, "loss": 0.2187, "step": 68000 }, { "epoch": 4.399768771276254, "grad_norm": 1.4314179420471191, "learning_rate": 0.0005068148243304002, "loss": 0.2147, "step": 68500 }, { "epoch": 4.4002826128845784, "eval_loss": 0.2070987969636917, "eval_runtime": 3.7911, "eval_samples_per_second": 131.887, "eval_steps_per_second": 8.441, "step": 68508 }, { "epoch": 4.4318838717965185, "grad_norm": 1.2765947580337524, "learning_rate": 0.0005046738176290492, "loss": 0.2123, "step": 69000 }, { "epoch": 4.463998972316784, "grad_norm": 0.9501237273216248, "learning_rate": 0.0005025370929411009, "loss": 0.2142, "step": 69500 }, { "epoch": 4.496114072837048, "grad_norm": 1.161289930343628, "learning_rate": 0.00050039608623975, "loss": 0.2125, "step": 70000 }, { "epoch": 4.528229173357312, "grad_norm": 0.9597361087799072, "learning_rate": 0.0004982550795383989, "loss": 0.213, "step": 70500 }, { "epoch": 4.560344273877577, "grad_norm": 1.041593074798584, "learning_rate": 0.000496114072837048, "loss": 0.2147, "step": 71000 }, { "epoch": 4.592459374397842, "grad_norm": 0.8988145589828491, "learning_rate": 0.0004939773481490998, "loss": 0.2139, "step": 71500 }, { "epoch": 4.600295458924786, "eval_loss": 0.20830930769443512, "eval_runtime": 3.7998, "eval_samples_per_second": 131.585, "eval_steps_per_second": 8.421, "step": 71622 }, { "epoch": 4.624574474918107, "grad_norm": 0.9001346230506897, "learning_rate": 0.0004918363414477487, "loss": 0.2132, "step": 72000 }, { "epoch": 4.656689575438371, "grad_norm": 0.8040429949760437, "learning_rate": 0.0004896953347463978, "loss": 0.2113, "step": 72500 }, { "epoch": 4.688804675958636, "grad_norm": 0.9886132478713989, "learning_rate": 0.0004875543280450468, "loss": 0.2085, "step": 73000 }, { "epoch": 4.7209197764789, "grad_norm": 1.1031527519226074, "learning_rate": 0.0004854133213436958, "loss": 0.2117, "step": 73500 }, { "epoch": 4.753034876999165, "grad_norm": 0.9805654287338257, "learning_rate": 0.0004832723146423448, "loss": 0.2087, "step": 74000 }, { "epoch": 4.7851499775194295, "grad_norm": 1.1665472984313965, "learning_rate": 0.00048113130794099387, "loss": 0.2094, "step": 74500 }, { "epoch": 4.800308304964995, "eval_loss": 0.20766516029834747, "eval_runtime": 3.8162, "eval_samples_per_second": 131.019, "eval_steps_per_second": 8.385, "step": 74736 }, { "epoch": 4.817265078039695, "grad_norm": 1.1881592273712158, "learning_rate": 0.00047899030123964287, "loss": 0.2123, "step": 75000 }, { "epoch": 4.849380178559959, "grad_norm": 1.1299117803573608, "learning_rate": 0.0004768535765516946, "loss": 0.2048, "step": 75500 }, { "epoch": 4.881495279080223, "grad_norm": 0.9001392722129822, "learning_rate": 0.00047471256985034363, "loss": 0.2067, "step": 76000 }, { "epoch": 4.913610379600488, "grad_norm": 0.7669143676757812, "learning_rate": 0.00047257156314899263, "loss": 0.2133, "step": 76500 }, { "epoch": 4.945725480120752, "grad_norm": 1.0141063928604126, "learning_rate": 0.0004704305564476417, "loss": 0.208, "step": 77000 }, { "epoch": 4.9778405806410175, "grad_norm": 0.9920214414596558, "learning_rate": 0.0004682895497462907, "loss": 0.2072, "step": 77500 }, { "epoch": 5.000321151005203, "eval_loss": 0.197199285030365, "eval_runtime": 3.8359, "eval_samples_per_second": 130.346, "eval_steps_per_second": 8.342, "step": 77850 }, { "epoch": 5.009955681161282, "grad_norm": 1.1408747434616089, "learning_rate": 0.0004661485430449397, "loss": 0.212, "step": 78000 }, { "epoch": 5.042070781681547, "grad_norm": 1.742897629737854, "learning_rate": 0.00046400753634358874, "loss": 0.2056, "step": 78500 }, { "epoch": 5.074185882201811, "grad_norm": 1.0371334552764893, "learning_rate": 0.00046186652964223785, "loss": 0.2052, "step": 79000 }, { "epoch": 5.106300982722076, "grad_norm": 1.1192116737365723, "learning_rate": 0.00045972552294088685, "loss": 0.2032, "step": 79500 }, { "epoch": 5.1384160832423404, "grad_norm": 1.0541894435882568, "learning_rate": 0.0004575845162395359, "loss": 0.2093, "step": 80000 }, { "epoch": 5.170531183762606, "grad_norm": 0.9582578539848328, "learning_rate": 0.0004554477915515876, "loss": 0.2039, "step": 80500 }, { "epoch": 5.20033399704541, "eval_loss": 0.19644007086753845, "eval_runtime": 3.8681, "eval_samples_per_second": 129.262, "eval_steps_per_second": 8.273, "step": 80964 }, { "epoch": 5.20264628428287, "grad_norm": 0.9570828080177307, "learning_rate": 0.0004533067848502366, "loss": 0.2046, "step": 81000 }, { "epoch": 5.234761384803134, "grad_norm": 1.1668864488601685, "learning_rate": 0.00045116577814888567, "loss": 0.2035, "step": 81500 }, { "epoch": 5.266876485323399, "grad_norm": 0.7868309617042542, "learning_rate": 0.00044902477144753467, "loss": 0.2058, "step": 82000 }, { "epoch": 5.298991585843663, "grad_norm": 0.9544404149055481, "learning_rate": 0.0004468880467595864, "loss": 0.2049, "step": 82500 }, { "epoch": 5.3311066863639285, "grad_norm": 1.0466246604919434, "learning_rate": 0.00044474704005823543, "loss": 0.1982, "step": 83000 }, { "epoch": 5.363221786884193, "grad_norm": 0.9287202954292297, "learning_rate": 0.00044260603335688443, "loss": 0.2047, "step": 83500 }, { "epoch": 5.395336887404458, "grad_norm": 0.6822737455368042, "learning_rate": 0.0004404650266555335, "loss": 0.2036, "step": 84000 }, { "epoch": 5.400346843085619, "eval_loss": 0.1947784423828125, "eval_runtime": 3.7994, "eval_samples_per_second": 131.6, "eval_steps_per_second": 8.422, "step": 84078 }, { "epoch": 5.427451987924722, "grad_norm": 0.7541738748550415, "learning_rate": 0.0004383240199541825, "loss": 0.2037, "step": 84500 }, { "epoch": 5.459567088444987, "grad_norm": 1.0783097743988037, "learning_rate": 0.0004361830132528315, "loss": 0.2032, "step": 85000 }, { "epoch": 5.491682188965251, "grad_norm": 0.9074442982673645, "learning_rate": 0.00043404200655148054, "loss": 0.2022, "step": 85500 }, { "epoch": 5.5237972894855165, "grad_norm": 0.9655330777168274, "learning_rate": 0.00043190099985012954, "loss": 0.2023, "step": 86000 }, { "epoch": 5.555912390005781, "grad_norm": 0.8551456332206726, "learning_rate": 0.0004297599931487786, "loss": 0.1979, "step": 86500 }, { "epoch": 5.588027490526045, "grad_norm": 1.0667117834091187, "learning_rate": 0.0004276232684608303, "loss": 0.2031, "step": 87000 }, { "epoch": 5.600359689125827, "eval_loss": 0.19503723084926605, "eval_runtime": 3.8204, "eval_samples_per_second": 130.877, "eval_steps_per_second": 8.376, "step": 87192 }, { "epoch": 5.62014259104631, "grad_norm": 1.0949949026107788, "learning_rate": 0.0004254822617594793, "loss": 0.1989, "step": 87500 }, { "epoch": 5.652257691566574, "grad_norm": 0.9818968176841736, "learning_rate": 0.00042334125505812836, "loss": 0.1988, "step": 88000 }, { "epoch": 5.684372792086839, "grad_norm": 0.7903144359588623, "learning_rate": 0.00042120024835677736, "loss": 0.2005, "step": 88500 }, { "epoch": 5.716487892607104, "grad_norm": 0.7773236036300659, "learning_rate": 0.00041905924165542636, "loss": 0.1978, "step": 89000 }, { "epoch": 5.748602993127369, "grad_norm": 0.7405574917793274, "learning_rate": 0.0004169225169674781, "loss": 0.2021, "step": 89500 }, { "epoch": 5.780718093647633, "grad_norm": 0.8121611475944519, "learning_rate": 0.0004147815102661271, "loss": 0.1964, "step": 90000 }, { "epoch": 5.800372535166035, "eval_loss": 0.1933530569076538, "eval_runtime": 4.0989, "eval_samples_per_second": 121.983, "eval_steps_per_second": 7.807, "step": 90306 }, { "epoch": 5.812833194167898, "grad_norm": 0.9987449049949646, "learning_rate": 0.00041264478557817883, "loss": 0.1965, "step": 90500 }, { "epoch": 5.844948294688162, "grad_norm": 0.9291335940361023, "learning_rate": 0.0004105037788768279, "loss": 0.1971, "step": 91000 }, { "epoch": 5.8770633952084275, "grad_norm": 1.1075915098190308, "learning_rate": 0.0004083627721754769, "loss": 0.2019, "step": 91500 }, { "epoch": 5.909178495728692, "grad_norm": 0.9010976552963257, "learning_rate": 0.00040622176547412594, "loss": 0.1988, "step": 92000 }, { "epoch": 5.941293596248956, "grad_norm": 0.8586589097976685, "learning_rate": 0.00040408075877277494, "loss": 0.1998, "step": 92500 }, { "epoch": 5.973408696769221, "grad_norm": 0.9236857891082764, "learning_rate": 0.00040194403408482676, "loss": 0.1982, "step": 93000 }, { "epoch": 6.000385381206243, "eval_loss": 0.18391552567481995, "eval_runtime": 3.945, "eval_samples_per_second": 126.744, "eval_steps_per_second": 8.112, "step": 93420 }, { "epoch": 6.005523797289485, "grad_norm": 0.7874048948287964, "learning_rate": 0.0003998030273834757, "loss": 0.1963, "step": 93500 }, { "epoch": 6.03763889780975, "grad_norm": 1.1348239183425903, "learning_rate": 0.00039766630269552747, "loss": 0.194, "step": 94000 }, { "epoch": 6.069753998330015, "grad_norm": 1.102102279663086, "learning_rate": 0.00039552529599417647, "loss": 0.196, "step": 94500 }, { "epoch": 6.10186909885028, "grad_norm": 0.9097464084625244, "learning_rate": 0.00039338428929282547, "loss": 0.1932, "step": 95000 }, { "epoch": 6.133984199370544, "grad_norm": 0.8406012654304504, "learning_rate": 0.0003912432825914745, "loss": 0.1948, "step": 95500 }, { "epoch": 6.166099299890809, "grad_norm": 0.9816380739212036, "learning_rate": 0.0003891022758901235, "loss": 0.1917, "step": 96000 }, { "epoch": 6.198214400411073, "grad_norm": 0.9839223623275757, "learning_rate": 0.00038696126918877263, "loss": 0.1929, "step": 96500 }, { "epoch": 6.200398227246451, "eval_loss": 0.18816132843494415, "eval_runtime": 3.7768, "eval_samples_per_second": 132.386, "eval_steps_per_second": 8.473, "step": 96534 }, { "epoch": 6.2303295009313375, "grad_norm": 1.0447883605957031, "learning_rate": 0.00038482026248742163, "loss": 0.1937, "step": 97000 }, { "epoch": 6.262444601451603, "grad_norm": 0.8144561052322388, "learning_rate": 0.00038267925578607063, "loss": 0.1904, "step": 97500 }, { "epoch": 6.294559701971867, "grad_norm": 0.7006298303604126, "learning_rate": 0.0003805382490847197, "loss": 0.1876, "step": 98000 }, { "epoch": 6.326674802492132, "grad_norm": 1.2990646362304688, "learning_rate": 0.0003783972423833687, "loss": 0.1957, "step": 98500 }, { "epoch": 6.358789903012396, "grad_norm": 0.9144965410232544, "learning_rate": 0.00037625623568201774, "loss": 0.1921, "step": 99000 }, { "epoch": 6.390905003532661, "grad_norm": 2.249871253967285, "learning_rate": 0.00037411522898066674, "loss": 0.1917, "step": 99500 }, { "epoch": 6.4004110732866595, "eval_loss": 0.1845063716173172, "eval_runtime": 3.865, "eval_samples_per_second": 129.365, "eval_steps_per_second": 8.279, "step": 99648 }, { "epoch": 6.423020104052926, "grad_norm": 1.4406355619430542, "learning_rate": 0.00037197850429271845, "loss": 0.1907, "step": 100000 }, { "epoch": 6.455135204573191, "grad_norm": 0.8511375188827515, "learning_rate": 0.0003698374975913675, "loss": 0.1908, "step": 100500 }, { "epoch": 6.487250305093455, "grad_norm": 0.8903339505195618, "learning_rate": 0.0003676964908900165, "loss": 0.1914, "step": 101000 }, { "epoch": 6.519365405613719, "grad_norm": 0.7510945796966553, "learning_rate": 0.0003655554841886655, "loss": 0.1901, "step": 101500 }, { "epoch": 6.551480506133984, "grad_norm": 1.0843825340270996, "learning_rate": 0.00036341875950071726, "loss": 0.1884, "step": 102000 }, { "epoch": 6.5835956066542485, "grad_norm": 1.074331283569336, "learning_rate": 0.000361282034812769, "loss": 0.1917, "step": 102500 }, { "epoch": 6.600423919326867, "eval_loss": 0.1811291128396988, "eval_runtime": 3.849, "eval_samples_per_second": 129.904, "eval_steps_per_second": 8.314, "step": 102762 }, { "epoch": 6.615710707174514, "grad_norm": 0.7904005646705627, "learning_rate": 0.000359141028111418, "loss": 0.1889, "step": 103000 }, { "epoch": 6.647825807694778, "grad_norm": 0.9564907550811768, "learning_rate": 0.00035700002141006703, "loss": 0.1903, "step": 103500 }, { "epoch": 6.679940908215043, "grad_norm": 0.8276146054267883, "learning_rate": 0.0003548590147087161, "loss": 0.1887, "step": 104000 }, { "epoch": 6.712056008735307, "grad_norm": 0.9399134516716003, "learning_rate": 0.0003527180080073651, "loss": 0.1888, "step": 104500 }, { "epoch": 6.744171109255572, "grad_norm": 0.8027148246765137, "learning_rate": 0.00035057700130601414, "loss": 0.1865, "step": 105000 }, { "epoch": 6.7762862097758365, "grad_norm": 0.9904204607009888, "learning_rate": 0.00034843599460466314, "loss": 0.1866, "step": 105500 }, { "epoch": 6.800436765367076, "eval_loss": 0.18000289797782898, "eval_runtime": 4.1057, "eval_samples_per_second": 121.782, "eval_steps_per_second": 7.794, "step": 105876 }, { "epoch": 6.808401310296102, "grad_norm": 0.7759542465209961, "learning_rate": 0.00034629498790331214, "loss": 0.1906, "step": 106000 }, { "epoch": 6.840516410816366, "grad_norm": 1.0268630981445312, "learning_rate": 0.0003441539812019612, "loss": 0.1861, "step": 106500 }, { "epoch": 6.87263151133663, "grad_norm": 1.0219001770019531, "learning_rate": 0.0003420172565140129, "loss": 0.1873, "step": 107000 }, { "epoch": 6.904746611856895, "grad_norm": 1.0753690004348755, "learning_rate": 0.00033987624981266195, "loss": 0.1866, "step": 107500 }, { "epoch": 6.936861712377159, "grad_norm": 0.9647945165634155, "learning_rate": 0.00033773952512471366, "loss": 0.1846, "step": 108000 }, { "epoch": 6.9689768128974245, "grad_norm": 0.9410663843154907, "learning_rate": 0.00033559851842336266, "loss": 0.1885, "step": 108500 } ], "logging_steps": 500, "max_steps": 186828, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.421226484476674e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }