yalhessi's picture
Training in progress, epoch 5, checkpoint
3cce775 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 3114,
"global_step": 77845,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03211510052026463,
"grad_norm": 1.2270241975784302,
"learning_rate": 0.0007978718393388571,
"loss": 0.4499,
"step": 500
},
{
"epoch": 0.06423020104052926,
"grad_norm": 1.1903154850006104,
"learning_rate": 0.0007957308326375062,
"loss": 0.3541,
"step": 1000
},
{
"epoch": 0.09634530156079389,
"grad_norm": 1.1337592601776123,
"learning_rate": 0.0007935898259361552,
"loss": 0.3336,
"step": 1500
},
{
"epoch": 0.12846040208105852,
"grad_norm": 0.9932805895805359,
"learning_rate": 0.0007914488192348043,
"loss": 0.3187,
"step": 2000
},
{
"epoch": 0.16057550260132314,
"grad_norm": 1.4273182153701782,
"learning_rate": 0.0007893120945468559,
"loss": 0.3094,
"step": 2500
},
{
"epoch": 0.19269060312158778,
"grad_norm": 1.296076774597168,
"learning_rate": 0.000787171087845505,
"loss": 0.306,
"step": 3000
},
{
"epoch": 0.2000128460402081,
"eval_loss": 0.3016555607318878,
"eval_runtime": 4.134,
"eval_samples_per_second": 120.948,
"eval_steps_per_second": 7.741,
"step": 3114
},
{
"epoch": 0.2248057036418524,
"grad_norm": 1.1079399585723877,
"learning_rate": 0.0007850343631575566,
"loss": 0.2993,
"step": 3500
},
{
"epoch": 0.25692080416211704,
"grad_norm": 1.5939998626708984,
"learning_rate": 0.0007828933564562057,
"loss": 0.2985,
"step": 4000
},
{
"epoch": 0.28903590468238166,
"grad_norm": 1.4033524990081787,
"learning_rate": 0.0007807523497548548,
"loss": 0.2959,
"step": 4500
},
{
"epoch": 0.3211510052026463,
"grad_norm": 1.9331912994384766,
"learning_rate": 0.0007786113430535038,
"loss": 0.2881,
"step": 5000
},
{
"epoch": 0.3532661057229109,
"grad_norm": 1.112199068069458,
"learning_rate": 0.0007764703363521528,
"loss": 0.2874,
"step": 5500
},
{
"epoch": 0.38538120624317557,
"grad_norm": 0.9604991674423218,
"learning_rate": 0.0007743293296508018,
"loss": 0.2867,
"step": 6000
},
{
"epoch": 0.4000256920804162,
"eval_loss": 0.27945849299430847,
"eval_runtime": 4.1149,
"eval_samples_per_second": 121.509,
"eval_steps_per_second": 7.777,
"step": 6228
},
{
"epoch": 0.4174963067634402,
"grad_norm": 1.542017936706543,
"learning_rate": 0.0007721883229494509,
"loss": 0.2842,
"step": 6500
},
{
"epoch": 0.4496114072837048,
"grad_norm": 0.9913608431816101,
"learning_rate": 0.0007700473162480999,
"loss": 0.2805,
"step": 7000
},
{
"epoch": 0.4817265078039694,
"grad_norm": 1.0861561298370361,
"learning_rate": 0.000767906309546749,
"loss": 0.2795,
"step": 7500
},
{
"epoch": 0.5138416083242341,
"grad_norm": 1.408018946647644,
"learning_rate": 0.0007657695848588007,
"loss": 0.2791,
"step": 8000
},
{
"epoch": 0.5459567088444987,
"grad_norm": 0.935492992401123,
"learning_rate": 0.0007636285781574497,
"loss": 0.2768,
"step": 8500
},
{
"epoch": 0.5780718093647633,
"grad_norm": 0.974107027053833,
"learning_rate": 0.0007614875714560987,
"loss": 0.2775,
"step": 9000
},
{
"epoch": 0.6000385381206244,
"eval_loss": 0.26882508397102356,
"eval_runtime": 4.0584,
"eval_samples_per_second": 123.2,
"eval_steps_per_second": 7.885,
"step": 9342
},
{
"epoch": 0.610186909885028,
"grad_norm": 1.00677490234375,
"learning_rate": 0.0007593465647547478,
"loss": 0.2719,
"step": 9500
},
{
"epoch": 0.6423020104052926,
"grad_norm": 1.3076094388961792,
"learning_rate": 0.0007572098400667994,
"loss": 0.2733,
"step": 10000
},
{
"epoch": 0.6744171109255572,
"grad_norm": 1.332555890083313,
"learning_rate": 0.0007550688333654485,
"loss": 0.2738,
"step": 10500
},
{
"epoch": 0.7065322114458218,
"grad_norm": 1.065308928489685,
"learning_rate": 0.0007529278266640976,
"loss": 0.2697,
"step": 11000
},
{
"epoch": 0.7386473119660865,
"grad_norm": 1.1714705228805542,
"learning_rate": 0.0007507868199627465,
"loss": 0.2718,
"step": 11500
},
{
"epoch": 0.7707624124863511,
"grad_norm": 1.545327067375183,
"learning_rate": 0.0007486500952747983,
"loss": 0.2701,
"step": 12000
},
{
"epoch": 0.8000513841608324,
"eval_loss": 0.25833237171173096,
"eval_runtime": 3.9621,
"eval_samples_per_second": 126.196,
"eval_steps_per_second": 8.077,
"step": 12456
},
{
"epoch": 0.8028775130066157,
"grad_norm": 0.8509078621864319,
"learning_rate": 0.0007465090885734473,
"loss": 0.2702,
"step": 12500
},
{
"epoch": 0.8349926135268804,
"grad_norm": 1.0517570972442627,
"learning_rate": 0.0007443680818720963,
"loss": 0.2684,
"step": 13000
},
{
"epoch": 0.8671077140471449,
"grad_norm": 1.0709669589996338,
"learning_rate": 0.000742231357184148,
"loss": 0.2674,
"step": 13500
},
{
"epoch": 0.8992228145674096,
"grad_norm": 1.1354570388793945,
"learning_rate": 0.0007400903504827971,
"loss": 0.269,
"step": 14000
},
{
"epoch": 0.9313379150876743,
"grad_norm": 1.5820938348770142,
"learning_rate": 0.000737949343781446,
"loss": 0.2665,
"step": 14500
},
{
"epoch": 0.9634530156079388,
"grad_norm": 1.3303571939468384,
"learning_rate": 0.0007358083370800951,
"loss": 0.2644,
"step": 15000
},
{
"epoch": 0.9955681161282035,
"grad_norm": 1.0390913486480713,
"learning_rate": 0.0007336716123921468,
"loss": 0.268,
"step": 15500
},
{
"epoch": 1.0000642302010405,
"eval_loss": 0.25299617648124695,
"eval_runtime": 3.8751,
"eval_samples_per_second": 129.03,
"eval_steps_per_second": 8.258,
"step": 15570
},
{
"epoch": 1.0276832166484682,
"grad_norm": 1.2327704429626465,
"learning_rate": 0.0007315306056907958,
"loss": 0.263,
"step": 16000
},
{
"epoch": 1.0597983171687329,
"grad_norm": 0.9403806924819946,
"learning_rate": 0.0007293895989894448,
"loss": 0.2633,
"step": 16500
},
{
"epoch": 1.0919134176889973,
"grad_norm": 1.1138664484024048,
"learning_rate": 0.0007272485922880939,
"loss": 0.2608,
"step": 17000
},
{
"epoch": 1.124028518209262,
"grad_norm": 1.1546539068222046,
"learning_rate": 0.000725107585586743,
"loss": 0.2569,
"step": 17500
},
{
"epoch": 1.1561436187295266,
"grad_norm": 1.0123635530471802,
"learning_rate": 0.0007229665788853919,
"loss": 0.2596,
"step": 18000
},
{
"epoch": 1.1882587192497913,
"grad_norm": 1.1647980213165283,
"learning_rate": 0.000720825572184041,
"loss": 0.2609,
"step": 18500
},
{
"epoch": 1.2000770762412487,
"eval_loss": 0.2469949871301651,
"eval_runtime": 3.796,
"eval_samples_per_second": 131.719,
"eval_steps_per_second": 8.43,
"step": 18684
},
{
"epoch": 1.2203738197700558,
"grad_norm": 1.2368906736373901,
"learning_rate": 0.00071868456548269,
"loss": 0.2597,
"step": 19000
},
{
"epoch": 1.2524889202903204,
"grad_norm": 0.9881177544593811,
"learning_rate": 0.000716543558781339,
"loss": 0.2563,
"step": 19500
},
{
"epoch": 1.2846040208105851,
"grad_norm": 0.9961882829666138,
"learning_rate": 0.0007144068340933907,
"loss": 0.2563,
"step": 20000
},
{
"epoch": 1.3167191213308498,
"grad_norm": 1.6545355319976807,
"learning_rate": 0.0007122658273920398,
"loss": 0.2566,
"step": 20500
},
{
"epoch": 1.3488342218511145,
"grad_norm": 1.2175770998001099,
"learning_rate": 0.0007101248206906887,
"loss": 0.251,
"step": 21000
},
{
"epoch": 1.3809493223713791,
"grad_norm": 1.2942149639129639,
"learning_rate": 0.0007079838139893379,
"loss": 0.2549,
"step": 21500
},
{
"epoch": 1.4000899222814567,
"eval_loss": 0.24247248470783234,
"eval_runtime": 4.0127,
"eval_samples_per_second": 124.605,
"eval_steps_per_second": 7.975,
"step": 21798
},
{
"epoch": 1.4130644228916436,
"grad_norm": 0.9972023963928223,
"learning_rate": 0.0007058470893013895,
"loss": 0.2532,
"step": 22000
},
{
"epoch": 1.4451795234119083,
"grad_norm": 2.422755479812622,
"learning_rate": 0.0007037060826000386,
"loss": 0.2525,
"step": 22500
},
{
"epoch": 1.477294623932173,
"grad_norm": 1.0350821018218994,
"learning_rate": 0.0007015650758986876,
"loss": 0.2528,
"step": 23000
},
{
"epoch": 1.5094097244524374,
"grad_norm": 0.9712342023849487,
"learning_rate": 0.0006994240691973367,
"loss": 0.2498,
"step": 23500
},
{
"epoch": 1.541524824972702,
"grad_norm": 1.0698814392089844,
"learning_rate": 0.0006972830624959856,
"loss": 0.2511,
"step": 24000
},
{
"epoch": 1.5736399254929667,
"grad_norm": 1.0637270212173462,
"learning_rate": 0.0006951463378080374,
"loss": 0.2542,
"step": 24500
},
{
"epoch": 1.6001027683216649,
"eval_loss": 0.23835672438144684,
"eval_runtime": 3.8011,
"eval_samples_per_second": 131.543,
"eval_steps_per_second": 8.419,
"step": 24912
},
{
"epoch": 1.6057550260132314,
"grad_norm": 1.08571195602417,
"learning_rate": 0.0006930053311066865,
"loss": 0.2531,
"step": 25000
},
{
"epoch": 1.637870126533496,
"grad_norm": 0.9403467774391174,
"learning_rate": 0.0006908643244053354,
"loss": 0.2522,
"step": 25500
},
{
"epoch": 1.6699852270537607,
"grad_norm": 0.8324321508407593,
"learning_rate": 0.0006887233177039845,
"loss": 0.2493,
"step": 26000
},
{
"epoch": 1.7021003275740254,
"grad_norm": 0.9599499702453613,
"learning_rate": 0.0006865908750294389,
"loss": 0.2485,
"step": 26500
},
{
"epoch": 1.73421542809429,
"grad_norm": 1.369850754737854,
"learning_rate": 0.0006844498683280879,
"loss": 0.25,
"step": 27000
},
{
"epoch": 1.7663305286145545,
"grad_norm": 1.042289137840271,
"learning_rate": 0.0006823088616267369,
"loss": 0.2449,
"step": 27500
},
{
"epoch": 1.7984456291348192,
"grad_norm": 1.2191327810287476,
"learning_rate": 0.000680167854925386,
"loss": 0.2489,
"step": 28000
},
{
"epoch": 1.8001156143618728,
"eval_loss": 0.23773300647735596,
"eval_runtime": 4.0318,
"eval_samples_per_second": 124.015,
"eval_steps_per_second": 7.937,
"step": 28026
},
{
"epoch": 1.8305607296550839,
"grad_norm": 0.9970433712005615,
"learning_rate": 0.0006780268482240349,
"loss": 0.2517,
"step": 28500
},
{
"epoch": 1.8626758301753483,
"grad_norm": 1.0307445526123047,
"learning_rate": 0.000675885841522684,
"loss": 0.2462,
"step": 29000
},
{
"epoch": 1.894790930695613,
"grad_norm": 1.1497652530670166,
"learning_rate": 0.000673744834821333,
"loss": 0.2494,
"step": 29500
},
{
"epoch": 1.9269060312158777,
"grad_norm": 0.8870740532875061,
"learning_rate": 0.000671603828119982,
"loss": 0.2459,
"step": 30000
},
{
"epoch": 1.9590211317361423,
"grad_norm": 1.0110082626342773,
"learning_rate": 0.0006694671034320337,
"loss": 0.2466,
"step": 30500
},
{
"epoch": 1.991136232256407,
"grad_norm": 0.9974693655967712,
"learning_rate": 0.0006673303787440855,
"loss": 0.2469,
"step": 31000
},
{
"epoch": 2.000128460402081,
"eval_loss": 0.23335064947605133,
"eval_runtime": 3.9065,
"eval_samples_per_second": 127.993,
"eval_steps_per_second": 8.192,
"step": 31140
},
{
"epoch": 2.0232513327766717,
"grad_norm": 0.8978867530822754,
"learning_rate": 0.0006651893720427345,
"loss": 0.2447,
"step": 31500
},
{
"epoch": 2.0553664332969364,
"grad_norm": 0.9614811539649963,
"learning_rate": 0.0006630483653413835,
"loss": 0.2434,
"step": 32000
},
{
"epoch": 2.087481533817201,
"grad_norm": 0.9430557489395142,
"learning_rate": 0.0006609073586400326,
"loss": 0.2404,
"step": 32500
},
{
"epoch": 2.1195966343374657,
"grad_norm": 1.088191270828247,
"learning_rate": 0.0006587663519386816,
"loss": 0.2403,
"step": 33000
},
{
"epoch": 2.15171173485773,
"grad_norm": 1.05572509765625,
"learning_rate": 0.0006566253452373306,
"loss": 0.2417,
"step": 33500
},
{
"epoch": 2.1838268353779946,
"grad_norm": 1.0838186740875244,
"learning_rate": 0.0006544843385359796,
"loss": 0.2416,
"step": 34000
},
{
"epoch": 2.2001413064422892,
"eval_loss": 0.24185791611671448,
"eval_runtime": 3.8292,
"eval_samples_per_second": 130.575,
"eval_steps_per_second": 8.357,
"step": 34254
},
{
"epoch": 2.2159419358982593,
"grad_norm": 0.9424343705177307,
"learning_rate": 0.0006523433318346287,
"loss": 0.2448,
"step": 34500
},
{
"epoch": 2.248057036418524,
"grad_norm": 1.4872556924819946,
"learning_rate": 0.0006502023251332776,
"loss": 0.2443,
"step": 35000
},
{
"epoch": 2.2801721369387886,
"grad_norm": 1.3446072340011597,
"learning_rate": 0.0006480613184319268,
"loss": 0.2412,
"step": 35500
},
{
"epoch": 2.3122872374590533,
"grad_norm": 0.9808722138404846,
"learning_rate": 0.0006459245937439784,
"loss": 0.2386,
"step": 36000
},
{
"epoch": 2.344402337979318,
"grad_norm": 1.144508719444275,
"learning_rate": 0.0006437835870426275,
"loss": 0.2387,
"step": 36500
},
{
"epoch": 2.3765174384995826,
"grad_norm": 0.9765673279762268,
"learning_rate": 0.0006416425803412765,
"loss": 0.2402,
"step": 37000
},
{
"epoch": 2.4001541524824974,
"eval_loss": 0.2269136607646942,
"eval_runtime": 4.0495,
"eval_samples_per_second": 123.471,
"eval_steps_per_second": 7.902,
"step": 37368
},
{
"epoch": 2.4086325390198473,
"grad_norm": 1.0742825269699097,
"learning_rate": 0.0006395015736399256,
"loss": 0.2407,
"step": 37500
},
{
"epoch": 2.4407476395401115,
"grad_norm": 1.157128930091858,
"learning_rate": 0.0006373648489519773,
"loss": 0.2376,
"step": 38000
},
{
"epoch": 2.472862740060376,
"grad_norm": 0.9436767101287842,
"learning_rate": 0.0006352238422506263,
"loss": 0.2355,
"step": 38500
},
{
"epoch": 2.504977840580641,
"grad_norm": 1.5304116010665894,
"learning_rate": 0.0006330828355492754,
"loss": 0.2388,
"step": 39000
},
{
"epoch": 2.5370929411009056,
"grad_norm": 0.9645853638648987,
"learning_rate": 0.000630946110861327,
"loss": 0.236,
"step": 39500
},
{
"epoch": 2.5692080416211702,
"grad_norm": 0.77340167760849,
"learning_rate": 0.0006288051041599761,
"loss": 0.2401,
"step": 40000
},
{
"epoch": 2.600166998522705,
"eval_loss": 0.22546811401844025,
"eval_runtime": 3.7972,
"eval_samples_per_second": 131.676,
"eval_steps_per_second": 8.427,
"step": 40482
},
{
"epoch": 2.601323142141435,
"grad_norm": 1.2897224426269531,
"learning_rate": 0.0006266640974586251,
"loss": 0.2372,
"step": 40500
},
{
"epoch": 2.6334382426616996,
"grad_norm": 0.92377108335495,
"learning_rate": 0.0006245230907572741,
"loss": 0.2378,
"step": 41000
},
{
"epoch": 2.6655533431819642,
"grad_norm": 1.231541395187378,
"learning_rate": 0.0006223820840559231,
"loss": 0.236,
"step": 41500
},
{
"epoch": 2.697668443702229,
"grad_norm": 1.1643468141555786,
"learning_rate": 0.0006202410773545722,
"loss": 0.2343,
"step": 42000
},
{
"epoch": 2.729783544222493,
"grad_norm": 0.9667991399765015,
"learning_rate": 0.0006181043526666238,
"loss": 0.2336,
"step": 42500
},
{
"epoch": 2.7618986447427583,
"grad_norm": 0.9757621884346008,
"learning_rate": 0.0006159633459652729,
"loss": 0.2333,
"step": 43000
},
{
"epoch": 2.7940137452630225,
"grad_norm": 1.1959055662155151,
"learning_rate": 0.000613822339263922,
"loss": 0.2368,
"step": 43500
},
{
"epoch": 2.8001798445629134,
"eval_loss": 0.23977364599704742,
"eval_runtime": 4.0372,
"eval_samples_per_second": 123.848,
"eval_steps_per_second": 7.926,
"step": 43596
},
{
"epoch": 2.826128845783287,
"grad_norm": 0.9506617784500122,
"learning_rate": 0.0006116813325625709,
"loss": 0.2315,
"step": 44000
},
{
"epoch": 2.858243946303552,
"grad_norm": 1.273672103881836,
"learning_rate": 0.0006095446078746227,
"loss": 0.2347,
"step": 44500
},
{
"epoch": 2.8903590468238165,
"grad_norm": 1.50209641456604,
"learning_rate": 0.0006074036011732717,
"loss": 0.2361,
"step": 45000
},
{
"epoch": 2.922474147344081,
"grad_norm": 0.9982122182846069,
"learning_rate": 0.0006052625944719207,
"loss": 0.2338,
"step": 45500
},
{
"epoch": 2.954589247864346,
"grad_norm": 1.061805009841919,
"learning_rate": 0.0006031215877705697,
"loss": 0.2325,
"step": 46000
},
{
"epoch": 2.9867043483846105,
"grad_norm": 1.1958117485046387,
"learning_rate": 0.0006009805810692188,
"loss": 0.2309,
"step": 46500
},
{
"epoch": 3.0001926906031215,
"eval_loss": 0.22259989380836487,
"eval_runtime": 3.8294,
"eval_samples_per_second": 130.568,
"eval_steps_per_second": 8.356,
"step": 46710
},
{
"epoch": 3.018819448904875,
"grad_norm": 0.9146483540534973,
"learning_rate": 0.0005988438563812704,
"loss": 0.23,
"step": 47000
},
{
"epoch": 3.05093454942514,
"grad_norm": 0.9622049927711487,
"learning_rate": 0.0005967028496799195,
"loss": 0.2294,
"step": 47500
},
{
"epoch": 3.0830496499454045,
"grad_norm": 0.9770357608795166,
"learning_rate": 0.0005945618429785685,
"loss": 0.2293,
"step": 48000
},
{
"epoch": 3.1151647504656688,
"grad_norm": 1.1833593845367432,
"learning_rate": 0.0005924208362772176,
"loss": 0.2266,
"step": 48500
},
{
"epoch": 3.1472798509859334,
"grad_norm": 0.7183510065078735,
"learning_rate": 0.0005902798295758665,
"loss": 0.2278,
"step": 49000
},
{
"epoch": 3.179394951506198,
"grad_norm": 0.8913053870201111,
"learning_rate": 0.0005881388228745156,
"loss": 0.2289,
"step": 49500
},
{
"epoch": 3.2002055366433297,
"eval_loss": 0.22066444158554077,
"eval_runtime": 3.8492,
"eval_samples_per_second": 129.896,
"eval_steps_per_second": 8.313,
"step": 49824
},
{
"epoch": 3.211510052026463,
"grad_norm": 0.800855815410614,
"learning_rate": 0.0005859978161731647,
"loss": 0.2271,
"step": 50000
},
{
"epoch": 3.2436251525467275,
"grad_norm": 0.8037746548652649,
"learning_rate": 0.0005838568094718137,
"loss": 0.2295,
"step": 50500
},
{
"epoch": 3.275740253066992,
"grad_norm": 0.9885351657867432,
"learning_rate": 0.0005817158027704627,
"loss": 0.224,
"step": 51000
},
{
"epoch": 3.307855353587257,
"grad_norm": 0.8889601826667786,
"learning_rate": 0.0005795790780825145,
"loss": 0.2274,
"step": 51500
},
{
"epoch": 3.3399704541075215,
"grad_norm": 1.0997310876846313,
"learning_rate": 0.0005774423533945662,
"loss": 0.2225,
"step": 52000
},
{
"epoch": 3.372085554627786,
"grad_norm": 0.7647742629051208,
"learning_rate": 0.0005753013466932152,
"loss": 0.226,
"step": 52500
},
{
"epoch": 3.400218382683538,
"eval_loss": 0.21940012276172638,
"eval_runtime": 4.117,
"eval_samples_per_second": 121.447,
"eval_steps_per_second": 7.773,
"step": 52938
},
{
"epoch": 3.404200655148051,
"grad_norm": 1.265453815460205,
"learning_rate": 0.0005731603399918643,
"loss": 0.2285,
"step": 53000
},
{
"epoch": 3.436315755668315,
"grad_norm": 0.9455955028533936,
"learning_rate": 0.0005710193332905132,
"loss": 0.2283,
"step": 53500
},
{
"epoch": 3.4684308561885797,
"grad_norm": 1.289652943611145,
"learning_rate": 0.0005688783265891623,
"loss": 0.2268,
"step": 54000
},
{
"epoch": 3.5005459567088444,
"grad_norm": 1.1715284585952759,
"learning_rate": 0.0005667373198878113,
"loss": 0.2282,
"step": 54500
},
{
"epoch": 3.532661057229109,
"grad_norm": 0.9027577042579651,
"learning_rate": 0.0005645963131864603,
"loss": 0.2257,
"step": 55000
},
{
"epoch": 3.5647761577493737,
"grad_norm": 0.9521860480308533,
"learning_rate": 0.0005624553064851093,
"loss": 0.2258,
"step": 55500
},
{
"epoch": 3.5968912582696384,
"grad_norm": 1.1611847877502441,
"learning_rate": 0.0005603185817971611,
"loss": 0.2249,
"step": 56000
},
{
"epoch": 3.6002312287237457,
"eval_loss": 0.21782347559928894,
"eval_runtime": 3.787,
"eval_samples_per_second": 132.029,
"eval_steps_per_second": 8.45,
"step": 56052
},
{
"epoch": 3.629006358789903,
"grad_norm": 1.0853767395019531,
"learning_rate": 0.00055817757509581,
"loss": 0.2231,
"step": 56500
},
{
"epoch": 3.6611214593101677,
"grad_norm": 1.7563400268554688,
"learning_rate": 0.0005560365683944591,
"loss": 0.2235,
"step": 57000
},
{
"epoch": 3.6932365598304324,
"grad_norm": 1.2496379613876343,
"learning_rate": 0.0005538955616931081,
"loss": 0.2206,
"step": 57500
},
{
"epoch": 3.7253516603506966,
"grad_norm": 0.9466719031333923,
"learning_rate": 0.0005517588370051598,
"loss": 0.2241,
"step": 58000
},
{
"epoch": 3.7574667608709618,
"grad_norm": 0.9584017992019653,
"learning_rate": 0.0005496178303038089,
"loss": 0.2164,
"step": 58500
},
{
"epoch": 3.789581861391226,
"grad_norm": 0.9684711694717407,
"learning_rate": 0.0005474768236024579,
"loss": 0.2214,
"step": 59000
},
{
"epoch": 3.800244074763954,
"eval_loss": 0.21725060045719147,
"eval_runtime": 4.1337,
"eval_samples_per_second": 120.958,
"eval_steps_per_second": 7.741,
"step": 59166
},
{
"epoch": 3.8216969619114907,
"grad_norm": 2.9653055667877197,
"learning_rate": 0.000545335816901107,
"loss": 0.2248,
"step": 59500
},
{
"epoch": 3.8538120624317553,
"grad_norm": 1.0390269756317139,
"learning_rate": 0.0005431990922131586,
"loss": 0.2192,
"step": 60000
},
{
"epoch": 3.88592716295202,
"grad_norm": 1.2799882888793945,
"learning_rate": 0.0005410580855118077,
"loss": 0.2228,
"step": 60500
},
{
"epoch": 3.9180422634722847,
"grad_norm": 0.9130102396011353,
"learning_rate": 0.0005389213608238593,
"loss": 0.2228,
"step": 61000
},
{
"epoch": 3.9501573639925494,
"grad_norm": 1.771164894104004,
"learning_rate": 0.0005367803541225084,
"loss": 0.2228,
"step": 61500
},
{
"epoch": 3.982272464512814,
"grad_norm": 0.9048191905021667,
"learning_rate": 0.0005346393474211574,
"loss": 0.2207,
"step": 62000
},
{
"epoch": 4.000256920804162,
"eval_loss": 0.21284444630146027,
"eval_runtime": 4.0636,
"eval_samples_per_second": 123.045,
"eval_steps_per_second": 7.875,
"step": 62280
},
{
"epoch": 4.014387565033078,
"grad_norm": 1.1616911888122559,
"learning_rate": 0.0005324983407198065,
"loss": 0.2192,
"step": 62500
},
{
"epoch": 4.046502665553343,
"grad_norm": 1.1269534826278687,
"learning_rate": 0.0005303573340184554,
"loss": 0.2164,
"step": 63000
},
{
"epoch": 4.078617766073608,
"grad_norm": 0.9181855320930481,
"learning_rate": 0.0005282206093305072,
"loss": 0.2185,
"step": 63500
},
{
"epoch": 4.110732866593873,
"grad_norm": 1.3248172998428345,
"learning_rate": 0.0005260796026291563,
"loss": 0.218,
"step": 64000
},
{
"epoch": 4.142847967114137,
"grad_norm": 1.0312740802764893,
"learning_rate": 0.0005239385959278053,
"loss": 0.217,
"step": 64500
},
{
"epoch": 4.174963067634402,
"grad_norm": 1.00308096408844,
"learning_rate": 0.0005217975892264544,
"loss": 0.2158,
"step": 65000
},
{
"epoch": 4.20026976684437,
"eval_loss": 0.21044744551181793,
"eval_runtime": 3.8176,
"eval_samples_per_second": 130.973,
"eval_steps_per_second": 8.382,
"step": 65394
},
{
"epoch": 4.207078168154666,
"grad_norm": 0.6961658000946045,
"learning_rate": 0.0005196565825251034,
"loss": 0.2168,
"step": 65500
},
{
"epoch": 4.239193268674931,
"grad_norm": 0.9448217749595642,
"learning_rate": 0.0005175155758237524,
"loss": 0.2163,
"step": 66000
},
{
"epoch": 4.271308369195196,
"grad_norm": 0.9778387546539307,
"learning_rate": 0.0005153745691224014,
"loss": 0.217,
"step": 66500
},
{
"epoch": 4.30342346971546,
"grad_norm": 1.1238789558410645,
"learning_rate": 0.0005132335624210505,
"loss": 0.2174,
"step": 67000
},
{
"epoch": 4.335538570235725,
"grad_norm": 0.926021933555603,
"learning_rate": 0.0005110968377331021,
"loss": 0.2119,
"step": 67500
},
{
"epoch": 4.367653670755989,
"grad_norm": 1.2473511695861816,
"learning_rate": 0.0005089558310317512,
"loss": 0.2187,
"step": 68000
},
{
"epoch": 4.399768771276254,
"grad_norm": 1.4314179420471191,
"learning_rate": 0.0005068148243304002,
"loss": 0.2147,
"step": 68500
},
{
"epoch": 4.4002826128845784,
"eval_loss": 0.2070987969636917,
"eval_runtime": 3.7911,
"eval_samples_per_second": 131.887,
"eval_steps_per_second": 8.441,
"step": 68508
},
{
"epoch": 4.4318838717965185,
"grad_norm": 1.2765947580337524,
"learning_rate": 0.0005046738176290492,
"loss": 0.2123,
"step": 69000
},
{
"epoch": 4.463998972316784,
"grad_norm": 0.9501237273216248,
"learning_rate": 0.0005025370929411009,
"loss": 0.2142,
"step": 69500
},
{
"epoch": 4.496114072837048,
"grad_norm": 1.161289930343628,
"learning_rate": 0.00050039608623975,
"loss": 0.2125,
"step": 70000
},
{
"epoch": 4.528229173357312,
"grad_norm": 0.9597361087799072,
"learning_rate": 0.0004982550795383989,
"loss": 0.213,
"step": 70500
},
{
"epoch": 4.560344273877577,
"grad_norm": 1.041593074798584,
"learning_rate": 0.000496114072837048,
"loss": 0.2147,
"step": 71000
},
{
"epoch": 4.592459374397842,
"grad_norm": 0.8988145589828491,
"learning_rate": 0.0004939773481490998,
"loss": 0.2139,
"step": 71500
},
{
"epoch": 4.600295458924786,
"eval_loss": 0.20830930769443512,
"eval_runtime": 3.7998,
"eval_samples_per_second": 131.585,
"eval_steps_per_second": 8.421,
"step": 71622
},
{
"epoch": 4.624574474918107,
"grad_norm": 0.9001346230506897,
"learning_rate": 0.0004918363414477487,
"loss": 0.2132,
"step": 72000
},
{
"epoch": 4.656689575438371,
"grad_norm": 0.8040429949760437,
"learning_rate": 0.0004896953347463978,
"loss": 0.2113,
"step": 72500
},
{
"epoch": 4.688804675958636,
"grad_norm": 0.9886132478713989,
"learning_rate": 0.0004875543280450468,
"loss": 0.2085,
"step": 73000
},
{
"epoch": 4.7209197764789,
"grad_norm": 1.1031527519226074,
"learning_rate": 0.0004854133213436958,
"loss": 0.2117,
"step": 73500
},
{
"epoch": 4.753034876999165,
"grad_norm": 0.9805654287338257,
"learning_rate": 0.0004832723146423448,
"loss": 0.2087,
"step": 74000
},
{
"epoch": 4.7851499775194295,
"grad_norm": 1.1665472984313965,
"learning_rate": 0.00048113130794099387,
"loss": 0.2094,
"step": 74500
},
{
"epoch": 4.800308304964995,
"eval_loss": 0.20766516029834747,
"eval_runtime": 3.8162,
"eval_samples_per_second": 131.019,
"eval_steps_per_second": 8.385,
"step": 74736
},
{
"epoch": 4.817265078039695,
"grad_norm": 1.1881592273712158,
"learning_rate": 0.00047899030123964287,
"loss": 0.2123,
"step": 75000
},
{
"epoch": 4.849380178559959,
"grad_norm": 1.1299117803573608,
"learning_rate": 0.0004768535765516946,
"loss": 0.2048,
"step": 75500
},
{
"epoch": 4.881495279080223,
"grad_norm": 0.9001392722129822,
"learning_rate": 0.00047471256985034363,
"loss": 0.2067,
"step": 76000
},
{
"epoch": 4.913610379600488,
"grad_norm": 0.7669143676757812,
"learning_rate": 0.00047257156314899263,
"loss": 0.2133,
"step": 76500
},
{
"epoch": 4.945725480120752,
"grad_norm": 1.0141063928604126,
"learning_rate": 0.0004704305564476417,
"loss": 0.208,
"step": 77000
},
{
"epoch": 4.9778405806410175,
"grad_norm": 0.9920214414596558,
"learning_rate": 0.0004682895497462907,
"loss": 0.2072,
"step": 77500
}
],
"logging_steps": 500,
"max_steps": 186828,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7295700226799043e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}