{
"best_global_step": 19958,
"best_metric": 0.2926097810268402,
"best_model_checkpoint": "/media/user/Expansion1/bge-small-en-v1.5-ultrafineweb-vs-pile-classifier/checkpoint-19958",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 99790,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025052610482012225,
"grad_norm": 1.178984522819519,
"learning_rate": 4.9749974947389525e-05,
"loss": 0.424,
"num_input_tokens_seen": 512000,
"step": 500
},
{
"epoch": 0.05010522096402445,
"grad_norm": 16.981231689453125,
"learning_rate": 4.9499448842569396e-05,
"loss": 0.3671,
"num_input_tokens_seen": 1024000,
"step": 1000
},
{
"epoch": 0.07515783144603667,
"grad_norm": 1.642918586730957,
"learning_rate": 4.9248922737749274e-05,
"loss": 0.3593,
"num_input_tokens_seen": 1536000,
"step": 1500
},
{
"epoch": 0.1002104419280489,
"grad_norm": 1.7461471557617188,
"learning_rate": 4.899839663292916e-05,
"loss": 0.3482,
"num_input_tokens_seen": 2048000,
"step": 2000
},
{
"epoch": 0.12526305241006114,
"grad_norm": 1.169636845588684,
"learning_rate": 4.874787052810903e-05,
"loss": 0.3251,
"num_input_tokens_seen": 2560000,
"step": 2500
},
{
"epoch": 0.15031566289207335,
"grad_norm": 2.5293004512786865,
"learning_rate": 4.849734442328891e-05,
"loss": 0.3267,
"num_input_tokens_seen": 3072000,
"step": 3000
},
{
"epoch": 0.1753682733740856,
"grad_norm": 2.039461135864258,
"learning_rate": 4.824681831846879e-05,
"loss": 0.3278,
"num_input_tokens_seen": 3584000,
"step": 3500
},
{
"epoch": 0.2004208838560978,
"grad_norm": 0.4097174406051636,
"learning_rate": 4.799629221364866e-05,
"loss": 0.3077,
"num_input_tokens_seen": 4096000,
"step": 4000
},
{
"epoch": 0.22547349433811004,
"grad_norm": 9.578287124633789,
"learning_rate": 4.774576610882854e-05,
"loss": 0.3076,
"num_input_tokens_seen": 4608000,
"step": 4500
},
{
"epoch": 0.2505261048201223,
"grad_norm": 2.4667727947235107,
"learning_rate": 4.749524000400842e-05,
"loss": 0.3063,
"num_input_tokens_seen": 5120000,
"step": 5000
},
{
"epoch": 0.27557871530213446,
"grad_norm": 0.20313717424869537,
"learning_rate": 4.72447138991883e-05,
"loss": 0.3087,
"num_input_tokens_seen": 5632000,
"step": 5500
},
{
"epoch": 0.3006313257841467,
"grad_norm": 12.131691932678223,
"learning_rate": 4.6994187794368175e-05,
"loss": 0.3134,
"num_input_tokens_seen": 6144000,
"step": 6000
},
{
"epoch": 0.32568393626615894,
"grad_norm": 9.874564170837402,
"learning_rate": 4.674366168954805e-05,
"loss": 0.3176,
"num_input_tokens_seen": 6656000,
"step": 6500
},
{
"epoch": 0.3507365467481712,
"grad_norm": 0.43632322549819946,
"learning_rate": 4.649313558472793e-05,
"loss": 0.3124,
"num_input_tokens_seen": 7168000,
"step": 7000
},
{
"epoch": 0.3757891572301834,
"grad_norm": 6.8587141036987305,
"learning_rate": 4.624260947990781e-05,
"loss": 0.3025,
"num_input_tokens_seen": 7680000,
"step": 7500
},
{
"epoch": 0.4008417677121956,
"grad_norm": 0.6035759449005127,
"learning_rate": 4.5992083375087687e-05,
"loss": 0.2948,
"num_input_tokens_seen": 8192000,
"step": 8000
},
{
"epoch": 0.42589437819420783,
"grad_norm": 9.4423246383667,
"learning_rate": 4.5741557270267564e-05,
"loss": 0.3029,
"num_input_tokens_seen": 8704000,
"step": 8500
},
{
"epoch": 0.4509469886762201,
"grad_norm": 0.47421976923942566,
"learning_rate": 4.549103116544744e-05,
"loss": 0.2954,
"num_input_tokens_seen": 9216000,
"step": 9000
},
{
"epoch": 0.4759995991582323,
"grad_norm": 2.0256924629211426,
"learning_rate": 4.524050506062732e-05,
"loss": 0.308,
"num_input_tokens_seen": 9728000,
"step": 9500
},
{
"epoch": 0.5010522096402446,
"grad_norm": 2.3783328533172607,
"learning_rate": 4.49899789558072e-05,
"loss": 0.308,
"num_input_tokens_seen": 10240000,
"step": 10000
},
{
"epoch": 0.5261048201222568,
"grad_norm": 11.536542892456055,
"learning_rate": 4.4739452850987076e-05,
"loss": 0.287,
"num_input_tokens_seen": 10752000,
"step": 10500
},
{
"epoch": 0.5511574306042689,
"grad_norm": 7.235984802246094,
"learning_rate": 4.4488926746166954e-05,
"loss": 0.3005,
"num_input_tokens_seen": 11264000,
"step": 11000
},
{
"epoch": 0.5762100410862812,
"grad_norm": 11.705055236816406,
"learning_rate": 4.4238400641346825e-05,
"loss": 0.3083,
"num_input_tokens_seen": 11776000,
"step": 11500
},
{
"epoch": 0.6012626515682934,
"grad_norm": 1.2158238887786865,
"learning_rate": 4.398787453652671e-05,
"loss": 0.2983,
"num_input_tokens_seen": 12288000,
"step": 12000
},
{
"epoch": 0.6263152620503056,
"grad_norm": 13.371932029724121,
"learning_rate": 4.373734843170659e-05,
"loss": 0.3055,
"num_input_tokens_seen": 12800000,
"step": 12500
},
{
"epoch": 0.6513678725323179,
"grad_norm": 1.052199363708496,
"learning_rate": 4.348682232688646e-05,
"loss": 0.3096,
"num_input_tokens_seen": 13312000,
"step": 13000
},
{
"epoch": 0.6764204830143301,
"grad_norm": 1.528619408607483,
"learning_rate": 4.3236296222066344e-05,
"loss": 0.3024,
"num_input_tokens_seen": 13824000,
"step": 13500
},
{
"epoch": 0.7014730934963423,
"grad_norm": 7.829930305480957,
"learning_rate": 4.298577011724622e-05,
"loss": 0.296,
"num_input_tokens_seen": 14336000,
"step": 14000
},
{
"epoch": 0.7265257039783546,
"grad_norm": 13.035155296325684,
"learning_rate": 4.273524401242609e-05,
"loss": 0.3154,
"num_input_tokens_seen": 14848000,
"step": 14500
},
{
"epoch": 0.7515783144603668,
"grad_norm": 12.151269912719727,
"learning_rate": 4.248471790760598e-05,
"loss": 0.2988,
"num_input_tokens_seen": 15360000,
"step": 15000
},
{
"epoch": 0.776630924942379,
"grad_norm": 2.0840141773223877,
"learning_rate": 4.223419180278585e-05,
"loss": 0.2945,
"num_input_tokens_seen": 15872000,
"step": 15500
},
{
"epoch": 0.8016835354243912,
"grad_norm": 14.681863784790039,
"learning_rate": 4.1983665697965726e-05,
"loss": 0.3228,
"num_input_tokens_seen": 16384000,
"step": 16000
},
{
"epoch": 0.8267361459064034,
"grad_norm": 0.5756533741950989,
"learning_rate": 4.173313959314561e-05,
"loss": 0.3042,
"num_input_tokens_seen": 16896000,
"step": 16500
},
{
"epoch": 0.8517887563884157,
"grad_norm": 5.992170810699463,
"learning_rate": 4.148261348832548e-05,
"loss": 0.3052,
"num_input_tokens_seen": 17408000,
"step": 17000
},
{
"epoch": 0.8768413668704279,
"grad_norm": 0.5731572508811951,
"learning_rate": 4.123208738350536e-05,
"loss": 0.3027,
"num_input_tokens_seen": 17920000,
"step": 17500
},
{
"epoch": 0.9018939773524401,
"grad_norm": 4.941533088684082,
"learning_rate": 4.0981561278685245e-05,
"loss": 0.2897,
"num_input_tokens_seen": 18432000,
"step": 18000
},
{
"epoch": 0.9269465878344524,
"grad_norm": 2.07985520362854,
"learning_rate": 4.0731035173865116e-05,
"loss": 0.3088,
"num_input_tokens_seen": 18944000,
"step": 18500
},
{
"epoch": 0.9519991983164646,
"grad_norm": 0.6285837292671204,
"learning_rate": 4.0480509069044994e-05,
"loss": 0.3138,
"num_input_tokens_seen": 19456000,
"step": 19000
},
{
"epoch": 0.9770518087984768,
"grad_norm": 1.9885900020599365,
"learning_rate": 4.022998296422487e-05,
"loss": 0.2893,
"num_input_tokens_seen": 19968000,
"step": 19500
},
{
"epoch": 1.0,
"eval_accuracy": 0.9061278685239001,
"eval_combined_score": 2.144811219173038,
"eval_loss": 0.2926097810268402,
"eval_runtime": 20.2505,
"eval_samples_per_second": 1971.108,
"eval_steps_per_second": 246.413,
"num_input_tokens_seen": 20436992,
"step": 19958
},
{
"epoch": 1.002104419280489,
"grad_norm": 0.5625438690185547,
"learning_rate": 3.997945685940475e-05,
"loss": 0.2873,
"num_input_tokens_seen": 20480000,
"step": 20000
},
{
"epoch": 1.0271570297625012,
"grad_norm": 19.315837860107422,
"learning_rate": 3.9728930754584634e-05,
"loss": 0.2627,
"num_input_tokens_seen": 20992000,
"step": 20500
},
{
"epoch": 1.0522096402445136,
"grad_norm": 0.16634128987789154,
"learning_rate": 3.9478404649764506e-05,
"loss": 0.2664,
"num_input_tokens_seen": 21504000,
"step": 21000
},
{
"epoch": 1.0772622507265257,
"grad_norm": 139.21690368652344,
"learning_rate": 3.9227878544944383e-05,
"loss": 0.2469,
"num_input_tokens_seen": 22016000,
"step": 21500
},
{
"epoch": 1.1023148612085378,
"grad_norm": 0.19860202074050903,
"learning_rate": 3.897735244012427e-05,
"loss": 0.2786,
"num_input_tokens_seen": 22528000,
"step": 22000
},
{
"epoch": 1.1273674716905502,
"grad_norm": 0.420663058757782,
"learning_rate": 3.872682633530414e-05,
"loss": 0.2355,
"num_input_tokens_seen": 23040000,
"step": 22500
},
{
"epoch": 1.1524200821725623,
"grad_norm": 7.762341022491455,
"learning_rate": 3.847630023048402e-05,
"loss": 0.266,
"num_input_tokens_seen": 23552000,
"step": 23000
},
{
"epoch": 1.1774726926545747,
"grad_norm": 8.320157051086426,
"learning_rate": 3.8225774125663895e-05,
"loss": 0.2599,
"num_input_tokens_seen": 24064000,
"step": 23500
},
{
"epoch": 1.2025253031365868,
"grad_norm": 6.601953506469727,
"learning_rate": 3.797524802084377e-05,
"loss": 0.2692,
"num_input_tokens_seen": 24576000,
"step": 24000
},
{
"epoch": 1.2275779136185991,
"grad_norm": 2.0535728931427,
"learning_rate": 3.772472191602365e-05,
"loss": 0.2487,
"num_input_tokens_seen": 25088000,
"step": 24500
},
{
"epoch": 1.2526305241006113,
"grad_norm": 0.2633844316005707,
"learning_rate": 3.747419581120353e-05,
"loss": 0.2659,
"num_input_tokens_seen": 25600000,
"step": 25000
},
{
"epoch": 1.2776831345826234,
"grad_norm": 2.607544183731079,
"learning_rate": 3.722366970638341e-05,
"loss": 0.2809,
"num_input_tokens_seen": 26112000,
"step": 25500
},
{
"epoch": 1.3027357450646357,
"grad_norm": 0.6017013788223267,
"learning_rate": 3.6973143601563285e-05,
"loss": 0.2651,
"num_input_tokens_seen": 26624000,
"step": 26000
},
{
"epoch": 1.327788355546648,
"grad_norm": 0.30723240971565247,
"learning_rate": 3.672261749674316e-05,
"loss": 0.2565,
"num_input_tokens_seen": 27136000,
"step": 26500
},
{
"epoch": 1.3528409660286602,
"grad_norm": 0.3843832015991211,
"learning_rate": 3.647209139192304e-05,
"loss": 0.2736,
"num_input_tokens_seen": 27648000,
"step": 27000
},
{
"epoch": 1.3778935765106723,
"grad_norm": 12.198064804077148,
"learning_rate": 3.622156528710292e-05,
"loss": 0.2591,
"num_input_tokens_seen": 28160000,
"step": 27500
},
{
"epoch": 1.4029461869926847,
"grad_norm": 1.8191192150115967,
"learning_rate": 3.5971039182282796e-05,
"loss": 0.2667,
"num_input_tokens_seen": 28672000,
"step": 28000
},
{
"epoch": 1.4279987974746968,
"grad_norm": 4.33354377746582,
"learning_rate": 3.5720513077462674e-05,
"loss": 0.2812,
"num_input_tokens_seen": 29184000,
"step": 28500
},
{
"epoch": 1.4530514079567092,
"grad_norm": 6.340269088745117,
"learning_rate": 3.546998697264255e-05,
"loss": 0.2691,
"num_input_tokens_seen": 29696000,
"step": 29000
},
{
"epoch": 1.4781040184387213,
"grad_norm": 12.591937065124512,
"learning_rate": 3.521946086782243e-05,
"loss": 0.2591,
"num_input_tokens_seen": 30208000,
"step": 29500
},
{
"epoch": 1.5031566289207334,
"grad_norm": 0.40031296014785767,
"learning_rate": 3.496893476300231e-05,
"loss": 0.2735,
"num_input_tokens_seen": 30720000,
"step": 30000
},
{
"epoch": 1.5282092394027458,
"grad_norm": 3.4099674224853516,
"learning_rate": 3.4718408658182186e-05,
"loss": 0.2737,
"num_input_tokens_seen": 31232000,
"step": 30500
},
{
"epoch": 1.5532618498847581,
"grad_norm": 1.1597915887832642,
"learning_rate": 3.4467882553362064e-05,
"loss": 0.2587,
"num_input_tokens_seen": 31744000,
"step": 31000
},
{
"epoch": 1.5783144603667703,
"grad_norm": 68.36583709716797,
"learning_rate": 3.4217356448541935e-05,
"loss": 0.2533,
"num_input_tokens_seen": 32256000,
"step": 31500
},
{
"epoch": 1.6033670708487824,
"grad_norm": 2.799591302871704,
"learning_rate": 3.396683034372182e-05,
"loss": 0.2779,
"num_input_tokens_seen": 32768000,
"step": 32000
},
{
"epoch": 1.6284196813307945,
"grad_norm": 0.5911589860916138,
"learning_rate": 3.37163042389017e-05,
"loss": 0.2427,
"num_input_tokens_seen": 33280000,
"step": 32500
},
{
"epoch": 1.6534722918128069,
"grad_norm": 11.752649307250977,
"learning_rate": 3.346577813408157e-05,
"loss": 0.2387,
"num_input_tokens_seen": 33792000,
"step": 33000
},
{
"epoch": 1.6785249022948192,
"grad_norm": 0.4207652807235718,
"learning_rate": 3.3215252029261453e-05,
"loss": 0.2415,
"num_input_tokens_seen": 34304000,
"step": 33500
},
{
"epoch": 1.7035775127768313,
"grad_norm": 7.099503993988037,
"learning_rate": 3.296472592444133e-05,
"loss": 0.2649,
"num_input_tokens_seen": 34816000,
"step": 34000
},
{
"epoch": 1.7286301232588435,
"grad_norm": 0.41005975008010864,
"learning_rate": 3.27141998196212e-05,
"loss": 0.2618,
"num_input_tokens_seen": 35328000,
"step": 34500
},
{
"epoch": 1.7536827337408558,
"grad_norm": 7.169194221496582,
"learning_rate": 3.246367371480109e-05,
"loss": 0.2497,
"num_input_tokens_seen": 35840000,
"step": 35000
},
{
"epoch": 1.7787353442228682,
"grad_norm": 2.8473153114318848,
"learning_rate": 3.221314760998096e-05,
"loss": 0.2706,
"num_input_tokens_seen": 36352000,
"step": 35500
},
{
"epoch": 1.8037879547048803,
"grad_norm": 37.23502731323242,
"learning_rate": 3.1962621505160836e-05,
"loss": 0.2643,
"num_input_tokens_seen": 36864000,
"step": 36000
},
{
"epoch": 1.8288405651868924,
"grad_norm": 0.3462938666343689,
"learning_rate": 3.171209540034072e-05,
"loss": 0.258,
"num_input_tokens_seen": 37376000,
"step": 36500
},
{
"epoch": 1.8538931756689045,
"grad_norm": 4.687111854553223,
"learning_rate": 3.146156929552059e-05,
"loss": 0.2555,
"num_input_tokens_seen": 37888000,
"step": 37000
},
{
"epoch": 1.878945786150917,
"grad_norm": 0.3295840919017792,
"learning_rate": 3.121104319070047e-05,
"loss": 0.2475,
"num_input_tokens_seen": 38400000,
"step": 37500
},
{
"epoch": 1.9039983966329292,
"grad_norm": 5.046384334564209,
"learning_rate": 3.0960517085880355e-05,
"loss": 0.2483,
"num_input_tokens_seen": 38912000,
"step": 38000
},
{
"epoch": 1.9290510071149414,
"grad_norm": 4.56272029876709,
"learning_rate": 3.0709990981060226e-05,
"loss": 0.2521,
"num_input_tokens_seen": 39424000,
"step": 38500
},
{
"epoch": 1.9541036175969535,
"grad_norm": 3.8051841259002686,
"learning_rate": 3.0459464876240107e-05,
"loss": 0.2805,
"num_input_tokens_seen": 39936000,
"step": 39000
},
{
"epoch": 1.9791562280789659,
"grad_norm": 0.6326732039451599,
"learning_rate": 3.020893877141998e-05,
"loss": 0.2397,
"num_input_tokens_seen": 40448000,
"step": 39500
},
{
"epoch": 2.0,
"eval_accuracy": 0.9075809199318569,
"eval_combined_score": 2.1193511042245365,
"eval_loss": 0.31268036365509033,
"eval_runtime": 20.2588,
"eval_samples_per_second": 1970.3,
"eval_steps_per_second": 246.312,
"num_input_tokens_seen": 40873984,
"step": 39916
},
{
"epoch": 2.004208838560978,
"grad_norm": 6.647907733917236,
"learning_rate": 2.995841266659986e-05,
"loss": 0.2299,
"num_input_tokens_seen": 40960000,
"step": 40000
},
{
"epoch": 2.0292614490429903,
"grad_norm": 0.32763534784317017,
"learning_rate": 2.970788656177974e-05,
"loss": 0.1992,
"num_input_tokens_seen": 41472000,
"step": 40500
},
{
"epoch": 2.0543140595250025,
"grad_norm": 14.943070411682129,
"learning_rate": 2.9457360456959615e-05,
"loss": 0.2179,
"num_input_tokens_seen": 41984000,
"step": 41000
},
{
"epoch": 2.0793666700070146,
"grad_norm": 4.630057334899902,
"learning_rate": 2.9206834352139493e-05,
"loss": 0.2151,
"num_input_tokens_seen": 42496000,
"step": 41500
},
{
"epoch": 2.104419280489027,
"grad_norm": 15.34054183959961,
"learning_rate": 2.8956308247319375e-05,
"loss": 0.197,
"num_input_tokens_seen": 43008000,
"step": 42000
},
{
"epoch": 2.1294718909710393,
"grad_norm": 0.16923962533473969,
"learning_rate": 2.870578214249925e-05,
"loss": 0.1865,
"num_input_tokens_seen": 43520000,
"step": 42500
},
{
"epoch": 2.1545245014530514,
"grad_norm": 34.0042839050293,
"learning_rate": 2.845525603767913e-05,
"loss": 0.2122,
"num_input_tokens_seen": 44032000,
"step": 43000
},
{
"epoch": 2.1795771119350635,
"grad_norm": 5.353533744812012,
"learning_rate": 2.8204729932859e-05,
"loss": 0.2289,
"num_input_tokens_seen": 44544000,
"step": 43500
},
{
"epoch": 2.2046297224170757,
"grad_norm": 0.5980260372161865,
"learning_rate": 2.7954203828038883e-05,
"loss": 0.2175,
"num_input_tokens_seen": 45056000,
"step": 44000
},
{
"epoch": 2.2296823328990882,
"grad_norm": 12.995455741882324,
"learning_rate": 2.7703677723218764e-05,
"loss": 0.2337,
"num_input_tokens_seen": 45568000,
"step": 44500
},
{
"epoch": 2.2547349433811004,
"grad_norm": 0.11803791671991348,
"learning_rate": 2.7453151618398635e-05,
"loss": 0.2114,
"num_input_tokens_seen": 46080000,
"step": 45000
},
{
"epoch": 2.2797875538631125,
"grad_norm": 0.21874956786632538,
"learning_rate": 2.7202625513578517e-05,
"loss": 0.2215,
"num_input_tokens_seen": 46592000,
"step": 45500
},
{
"epoch": 2.3048401643451246,
"grad_norm": 42.951351165771484,
"learning_rate": 2.6952099408758398e-05,
"loss": 0.2296,
"num_input_tokens_seen": 47104000,
"step": 46000
},
{
"epoch": 2.3298927748271367,
"grad_norm": 3.324039936065674,
"learning_rate": 2.6701573303938272e-05,
"loss": 0.2118,
"num_input_tokens_seen": 47616000,
"step": 46500
},
{
"epoch": 2.3549453853091493,
"grad_norm": 0.8097792863845825,
"learning_rate": 2.645104719911815e-05,
"loss": 0.198,
"num_input_tokens_seen": 48128000,
"step": 47000
},
{
"epoch": 2.3799979957911614,
"grad_norm": 2.3140671253204346,
"learning_rate": 2.6200521094298025e-05,
"loss": 0.2251,
"num_input_tokens_seen": 48640000,
"step": 47500
},
{
"epoch": 2.4050506062731736,
"grad_norm": 5.793896675109863,
"learning_rate": 2.5949994989477906e-05,
"loss": 0.1942,
"num_input_tokens_seen": 49152000,
"step": 48000
},
{
"epoch": 2.4301032167551857,
"grad_norm": 0.08759485185146332,
"learning_rate": 2.5699468884657784e-05,
"loss": 0.2105,
"num_input_tokens_seen": 49664000,
"step": 48500
},
{
"epoch": 2.4551558272371983,
"grad_norm": 13.725948333740234,
"learning_rate": 2.544894277983766e-05,
"loss": 0.2174,
"num_input_tokens_seen": 50176000,
"step": 49000
},
{
"epoch": 2.4802084377192104,
"grad_norm": 0.24512171745300293,
"learning_rate": 2.519841667501754e-05,
"loss": 0.2213,
"num_input_tokens_seen": 50688000,
"step": 49500
},
{
"epoch": 2.5052610482012225,
"grad_norm": 24.33919906616211,
"learning_rate": 2.4947890570197415e-05,
"loss": 0.208,
"num_input_tokens_seen": 51200000,
"step": 50000
},
{
"epoch": 2.5303136586832347,
"grad_norm": 20.30912971496582,
"learning_rate": 2.4697364465377292e-05,
"loss": 0.2344,
"num_input_tokens_seen": 51712000,
"step": 50500
},
{
"epoch": 2.555366269165247,
"grad_norm": 0.27970781922340393,
"learning_rate": 2.444683836055717e-05,
"loss": 0.217,
"num_input_tokens_seen": 52224000,
"step": 51000
},
{
"epoch": 2.5804188796472594,
"grad_norm": 0.18607856333255768,
"learning_rate": 2.4196312255737048e-05,
"loss": 0.1942,
"num_input_tokens_seen": 52736000,
"step": 51500
},
{
"epoch": 2.6054714901292715,
"grad_norm": 3.2024385929107666,
"learning_rate": 2.3945786150916926e-05,
"loss": 0.2009,
"num_input_tokens_seen": 53248000,
"step": 52000
},
{
"epoch": 2.6305241006112836,
"grad_norm": 9.92158317565918,
"learning_rate": 2.3695260046096804e-05,
"loss": 0.1888,
"num_input_tokens_seen": 53760000,
"step": 52500
},
{
"epoch": 2.655576711093296,
"grad_norm": 9.307025909423828,
"learning_rate": 2.3444733941276682e-05,
"loss": 0.1927,
"num_input_tokens_seen": 54272000,
"step": 53000
},
{
"epoch": 2.6806293215753083,
"grad_norm": 32.159671783447266,
"learning_rate": 2.319420783645656e-05,
"loss": 0.213,
"num_input_tokens_seen": 54784000,
"step": 53500
},
{
"epoch": 2.7056819320573204,
"grad_norm": 11.267858505249023,
"learning_rate": 2.2943681731636438e-05,
"loss": 0.2264,
"num_input_tokens_seen": 55296000,
"step": 54000
},
{
"epoch": 2.7307345425393326,
"grad_norm": 0.18241587281227112,
"learning_rate": 2.2693155626816316e-05,
"loss": 0.2253,
"num_input_tokens_seen": 55808000,
"step": 54500
},
{
"epoch": 2.7557871530213447,
"grad_norm": 1.898651123046875,
"learning_rate": 2.244262952199619e-05,
"loss": 0.2112,
"num_input_tokens_seen": 56320000,
"step": 55000
},
{
"epoch": 2.780839763503357,
"grad_norm": 10.556557655334473,
"learning_rate": 2.219210341717607e-05,
"loss": 0.2157,
"num_input_tokens_seen": 56832000,
"step": 55500
},
{
"epoch": 2.8058923739853694,
"grad_norm": 1.2913810014724731,
"learning_rate": 2.194157731235595e-05,
"loss": 0.2218,
"num_input_tokens_seen": 57344000,
"step": 56000
},
{
"epoch": 2.8309449844673815,
"grad_norm": 20.129615783691406,
"learning_rate": 2.1691051207535827e-05,
"loss": 0.2205,
"num_input_tokens_seen": 57856000,
"step": 56500
},
{
"epoch": 2.8559975949493936,
"grad_norm": 0.3709011971950531,
"learning_rate": 2.1440525102715702e-05,
"loss": 0.2288,
"num_input_tokens_seen": 58368000,
"step": 57000
},
{
"epoch": 2.881050205431406,
"grad_norm": 24.663593292236328,
"learning_rate": 2.1189998997895583e-05,
"loss": 0.2225,
"num_input_tokens_seen": 58880000,
"step": 57500
},
{
"epoch": 2.9061028159134183,
"grad_norm": 8.534331321716309,
"learning_rate": 2.093947289307546e-05,
"loss": 0.2236,
"num_input_tokens_seen": 59392000,
"step": 58000
},
{
"epoch": 2.9311554263954305,
"grad_norm": 23.226032257080078,
"learning_rate": 2.0688946788255336e-05,
"loss": 0.2111,
"num_input_tokens_seen": 59904000,
"step": 58500
},
{
"epoch": 2.9562080368774426,
"grad_norm": 16.948610305786133,
"learning_rate": 2.0438420683435214e-05,
"loss": 0.2123,
"num_input_tokens_seen": 60416000,
"step": 59000
},
{
"epoch": 2.9812606473594547,
"grad_norm": 3.7574212551116943,
"learning_rate": 2.0187894578615095e-05,
"loss": 0.2,
"num_input_tokens_seen": 60928000,
"step": 59500
},
{
"epoch": 3.0,
"eval_accuracy": 0.9109379697364466,
"eval_combined_score": 2.0605294593435146,
"eval_loss": 0.32792404294013977,
"eval_runtime": 20.2718,
"eval_samples_per_second": 1969.037,
"eval_steps_per_second": 246.154,
"num_input_tokens_seen": 61310976,
"step": 59874
},
{
"epoch": 3.0063132578414673,
"grad_norm": 2.2473177909851074,
"learning_rate": 1.993736847379497e-05,
"loss": 0.1933,
"num_input_tokens_seen": 61440000,
"step": 60000
},
{
"epoch": 3.0313658683234794,
"grad_norm": 0.11083228886127472,
"learning_rate": 1.9686842368974847e-05,
"loss": 0.1592,
"num_input_tokens_seen": 61952000,
"step": 60500
},
{
"epoch": 3.0564184788054916,
"grad_norm": 0.06714469939470291,
"learning_rate": 1.9436316264154725e-05,
"loss": 0.158,
"num_input_tokens_seen": 62464000,
"step": 61000
},
{
"epoch": 3.0814710892875037,
"grad_norm": 0.41380876302719116,
"learning_rate": 1.9185790159334603e-05,
"loss": 0.1591,
"num_input_tokens_seen": 62976000,
"step": 61500
},
{
"epoch": 3.106523699769516,
"grad_norm": 11.745950698852539,
"learning_rate": 1.893526405451448e-05,
"loss": 0.1687,
"num_input_tokens_seen": 63488000,
"step": 62000
},
{
"epoch": 3.1315763102515284,
"grad_norm": 0.10439453274011612,
"learning_rate": 1.868473794969436e-05,
"loss": 0.1773,
"num_input_tokens_seen": 64000000,
"step": 62500
},
{
"epoch": 3.1566289207335405,
"grad_norm": 0.05308441445231438,
"learning_rate": 1.8434211844874237e-05,
"loss": 0.1729,
"num_input_tokens_seen": 64512000,
"step": 63000
},
{
"epoch": 3.1816815312155526,
"grad_norm": 2.2679662704467773,
"learning_rate": 1.8183685740054115e-05,
"loss": 0.1428,
"num_input_tokens_seen": 65024000,
"step": 63500
},
{
"epoch": 3.2067341416975648,
"grad_norm": 0.18617786467075348,
"learning_rate": 1.7933159635233993e-05,
"loss": 0.162,
"num_input_tokens_seen": 65536000,
"step": 64000
},
{
"epoch": 3.231786752179577,
"grad_norm": 0.09589721262454987,
"learning_rate": 1.768263353041387e-05,
"loss": 0.1425,
"num_input_tokens_seen": 66048000,
"step": 64500
},
{
"epoch": 3.2568393626615895,
"grad_norm": 0.03517961502075195,
"learning_rate": 1.7432107425593745e-05,
"loss": 0.1709,
"num_input_tokens_seen": 66560000,
"step": 65000
},
{
"epoch": 3.2818919731436016,
"grad_norm": 0.16013863682746887,
"learning_rate": 1.7181581320773626e-05,
"loss": 0.1671,
"num_input_tokens_seen": 67072000,
"step": 65500
},
{
"epoch": 3.3069445836256137,
"grad_norm": 0.9810895323753357,
"learning_rate": 1.6931055215953504e-05,
"loss": 0.1552,
"num_input_tokens_seen": 67584000,
"step": 66000
},
{
"epoch": 3.331997194107626,
"grad_norm": 0.12127078324556351,
"learning_rate": 1.668052911113338e-05,
"loss": 0.1752,
"num_input_tokens_seen": 68096000,
"step": 66500
},
{
"epoch": 3.3570498045896384,
"grad_norm": 0.22286617755889893,
"learning_rate": 1.6430003006313257e-05,
"loss": 0.1761,
"num_input_tokens_seen": 68608000,
"step": 67000
},
{
"epoch": 3.3821024150716505,
"grad_norm": 35.76771545410156,
"learning_rate": 1.6179476901493138e-05,
"loss": 0.1687,
"num_input_tokens_seen": 69120000,
"step": 67500
},
{
"epoch": 3.4071550255536627,
"grad_norm": 0.16311609745025635,
"learning_rate": 1.5928950796673016e-05,
"loss": 0.1448,
"num_input_tokens_seen": 69632000,
"step": 68000
},
{
"epoch": 3.432207636035675,
"grad_norm": 0.10213588923215866,
"learning_rate": 1.567842469185289e-05,
"loss": 0.1668,
"num_input_tokens_seen": 70144000,
"step": 68500
},
{
"epoch": 3.457260246517687,
"grad_norm": 0.04066482558846474,
"learning_rate": 1.542789858703277e-05,
"loss": 0.1556,
"num_input_tokens_seen": 70656000,
"step": 69000
},
{
"epoch": 3.4823128569996995,
"grad_norm": 2.4263927936553955,
"learning_rate": 1.5177372482212648e-05,
"loss": 0.1692,
"num_input_tokens_seen": 71168000,
"step": 69500
},
{
"epoch": 3.5073654674817116,
"grad_norm": 0.2218380570411682,
"learning_rate": 1.4926846377392526e-05,
"loss": 0.1552,
"num_input_tokens_seen": 71680000,
"step": 70000
},
{
"epoch": 3.5324180779637238,
"grad_norm": 0.08358863741159439,
"learning_rate": 1.4676320272572402e-05,
"loss": 0.1602,
"num_input_tokens_seen": 72192000,
"step": 70500
},
{
"epoch": 3.557470688445736,
"grad_norm": 0.16968253254890442,
"learning_rate": 1.442579416775228e-05,
"loss": 0.1651,
"num_input_tokens_seen": 72704000,
"step": 71000
},
{
"epoch": 3.5825232989277485,
"grad_norm": 20.67737579345703,
"learning_rate": 1.417526806293216e-05,
"loss": 0.173,
"num_input_tokens_seen": 73216000,
"step": 71500
},
{
"epoch": 3.6075759094097606,
"grad_norm": 15.341917991638184,
"learning_rate": 1.3924741958112036e-05,
"loss": 0.1566,
"num_input_tokens_seen": 73728000,
"step": 72000
},
{
"epoch": 3.6326285198917727,
"grad_norm": 0.07673631608486176,
"learning_rate": 1.3674215853291914e-05,
"loss": 0.1765,
"num_input_tokens_seen": 74240000,
"step": 72500
},
{
"epoch": 3.657681130373785,
"grad_norm": 0.21996235847473145,
"learning_rate": 1.342368974847179e-05,
"loss": 0.1461,
"num_input_tokens_seen": 74752000,
"step": 73000
},
{
"epoch": 3.682733740855797,
"grad_norm": 183.6659393310547,
"learning_rate": 1.317316364365167e-05,
"loss": 0.1706,
"num_input_tokens_seen": 75264000,
"step": 73500
},
{
"epoch": 3.7077863513378095,
"grad_norm": 0.1564781218767166,
"learning_rate": 1.2922637538831548e-05,
"loss": 0.159,
"num_input_tokens_seen": 75776000,
"step": 74000
},
{
"epoch": 3.7328389618198217,
"grad_norm": 8.662553787231445,
"learning_rate": 1.2672111434011424e-05,
"loss": 0.1737,
"num_input_tokens_seen": 76288000,
"step": 74500
},
{
"epoch": 3.757891572301834,
"grad_norm": 331.7611999511719,
"learning_rate": 1.2421585329191303e-05,
"loss": 0.1603,
"num_input_tokens_seen": 76800000,
"step": 75000
},
{
"epoch": 3.782944182783846,
"grad_norm": 0.09944739192724228,
"learning_rate": 1.217105922437118e-05,
"loss": 0.1508,
"num_input_tokens_seen": 77312000,
"step": 75500
},
{
"epoch": 3.8079967932658585,
"grad_norm": 0.24497084319591522,
"learning_rate": 1.1920533119551058e-05,
"loss": 0.1595,
"num_input_tokens_seen": 77824000,
"step": 76000
},
{
"epoch": 3.8330494037478706,
"grad_norm": 3.5547239780426025,
"learning_rate": 1.1670007014730936e-05,
"loss": 0.1595,
"num_input_tokens_seen": 78336000,
"step": 76500
},
{
"epoch": 3.8581020142298827,
"grad_norm": 0.3509676456451416,
"learning_rate": 1.1419480909910813e-05,
"loss": 0.1606,
"num_input_tokens_seen": 78848000,
"step": 77000
},
{
"epoch": 3.883154624711895,
"grad_norm": 19.29859161376953,
"learning_rate": 1.1168954805090691e-05,
"loss": 0.1722,
"num_input_tokens_seen": 79360000,
"step": 77500
},
{
"epoch": 3.908207235193907,
"grad_norm": 0.9797153472900391,
"learning_rate": 1.091842870027057e-05,
"loss": 0.159,
"num_input_tokens_seen": 79872000,
"step": 78000
},
{
"epoch": 3.9332598456759196,
"grad_norm": 24.612085342407227,
"learning_rate": 1.0667902595450446e-05,
"loss": 0.1656,
"num_input_tokens_seen": 80384000,
"step": 78500
},
{
"epoch": 3.9583124561579317,
"grad_norm": 42.16061782836914,
"learning_rate": 1.0417376490630325e-05,
"loss": 0.1494,
"num_input_tokens_seen": 80896000,
"step": 79000
},
{
"epoch": 3.983365066639944,
"grad_norm": 0.244501531124115,
"learning_rate": 1.0166850385810201e-05,
"loss": 0.1576,
"num_input_tokens_seen": 81408000,
"step": 79500
},
{
"epoch": 4.0,
"eval_accuracy": 0.9080068143100511,
"eval_combined_score": 2.11188865673963,
"eval_loss": 0.3887489140033722,
"eval_runtime": 20.3698,
"eval_samples_per_second": 1959.563,
"eval_steps_per_second": 244.97,
"num_input_tokens_seen": 81747968,
"step": 79832
},
{
"epoch": 4.008417677121956,
"grad_norm": 0.1654539704322815,
"learning_rate": 9.916324280990081e-06,
"loss": 0.1449,
"num_input_tokens_seen": 81920000,
"step": 80000
},
{
"epoch": 4.0334702876039685,
"grad_norm": 0.1446412056684494,
"learning_rate": 9.665798176169957e-06,
"loss": 0.1065,
"num_input_tokens_seen": 82432000,
"step": 80500
},
{
"epoch": 4.058522898085981,
"grad_norm": 0.15385593473911285,
"learning_rate": 9.415272071349835e-06,
"loss": 0.1173,
"num_input_tokens_seen": 82944000,
"step": 81000
},
{
"epoch": 4.083575508567993,
"grad_norm": 0.1097198873758316,
"learning_rate": 9.164745966529713e-06,
"loss": 0.122,
"num_input_tokens_seen": 83456000,
"step": 81500
},
{
"epoch": 4.108628119050005,
"grad_norm": 140.7417449951172,
"learning_rate": 8.914219861709591e-06,
"loss": 0.1334,
"num_input_tokens_seen": 83968000,
"step": 82000
},
{
"epoch": 4.133680729532017,
"grad_norm": 0.07995349913835526,
"learning_rate": 8.663693756889469e-06,
"loss": 0.1099,
"num_input_tokens_seen": 84480000,
"step": 82500
},
{
"epoch": 4.158733340014029,
"grad_norm": 23.00501823425293,
"learning_rate": 8.413167652069347e-06,
"loss": 0.1362,
"num_input_tokens_seen": 84992000,
"step": 83000
},
{
"epoch": 4.183785950496041,
"grad_norm": 0.17540641129016876,
"learning_rate": 8.162641547249223e-06,
"loss": 0.1119,
"num_input_tokens_seen": 85504000,
"step": 83500
},
{
"epoch": 4.208838560978054,
"grad_norm": 10.598029136657715,
"learning_rate": 7.912115442429101e-06,
"loss": 0.1174,
"num_input_tokens_seen": 86016000,
"step": 84000
},
{
"epoch": 4.233891171460066,
"grad_norm": 0.06076182797551155,
"learning_rate": 7.661589337608979e-06,
"loss": 0.104,
"num_input_tokens_seen": 86528000,
"step": 84500
},
{
"epoch": 4.258943781942079,
"grad_norm": 0.2133261263370514,
"learning_rate": 7.411063232788856e-06,
"loss": 0.1229,
"num_input_tokens_seen": 87040000,
"step": 85000
},
{
"epoch": 4.283996392424091,
"grad_norm": 8.342382431030273,
"learning_rate": 7.160537127968735e-06,
"loss": 0.1254,
"num_input_tokens_seen": 87552000,
"step": 85500
},
{
"epoch": 4.309049002906103,
"grad_norm": 0.015887776389718056,
"learning_rate": 6.910011023148612e-06,
"loss": 0.1308,
"num_input_tokens_seen": 88064000,
"step": 86000
},
{
"epoch": 4.334101613388115,
"grad_norm": 0.24978305399417877,
"learning_rate": 6.6594849183284905e-06,
"loss": 0.1106,
"num_input_tokens_seen": 88576000,
"step": 86500
},
{
"epoch": 4.359154223870127,
"grad_norm": 0.19210007786750793,
"learning_rate": 6.4089588135083675e-06,
"loss": 0.1257,
"num_input_tokens_seen": 89088000,
"step": 87000
},
{
"epoch": 4.384206834352139,
"grad_norm": 10.589780807495117,
"learning_rate": 6.1584327086882454e-06,
"loss": 0.1238,
"num_input_tokens_seen": 89600000,
"step": 87500
},
{
"epoch": 4.409259444834151,
"grad_norm": 38.77216720581055,
"learning_rate": 5.907906603868123e-06,
"loss": 0.1213,
"num_input_tokens_seen": 90112000,
"step": 88000
},
{
"epoch": 4.434312055316164,
"grad_norm": 0.1937304437160492,
"learning_rate": 5.657380499048001e-06,
"loss": 0.1346,
"num_input_tokens_seen": 90624000,
"step": 88500
},
{
"epoch": 4.4593646657981765,
"grad_norm": 0.023678578436374664,
"learning_rate": 5.406854394227879e-06,
"loss": 0.1013,
"num_input_tokens_seen": 91136000,
"step": 89000
},
{
"epoch": 4.484417276280189,
"grad_norm": 7.357041358947754,
"learning_rate": 5.156328289407756e-06,
"loss": 0.1353,
"num_input_tokens_seen": 91648000,
"step": 89500
},
{
"epoch": 4.509469886762201,
"grad_norm": 0.0861930251121521,
"learning_rate": 4.905802184587634e-06,
"loss": 0.1188,
"num_input_tokens_seen": 92160000,
"step": 90000
},
{
"epoch": 4.534522497244213,
"grad_norm": 0.05523020401597023,
"learning_rate": 4.655276079767512e-06,
"loss": 0.0998,
"num_input_tokens_seen": 92672000,
"step": 90500
},
{
"epoch": 4.559575107726225,
"grad_norm": 35.0329704284668,
"learning_rate": 4.40474997494739e-06,
"loss": 0.1286,
"num_input_tokens_seen": 93184000,
"step": 91000
},
{
"epoch": 4.584627718208237,
"grad_norm": 0.04070122167468071,
"learning_rate": 4.154223870127268e-06,
"loss": 0.116,
"num_input_tokens_seen": 93696000,
"step": 91500
},
{
"epoch": 4.609680328690249,
"grad_norm": 6.631749153137207,
"learning_rate": 3.903697765307145e-06,
"loss": 0.1079,
"num_input_tokens_seen": 94208000,
"step": 92000
},
{
"epoch": 4.634732939172261,
"grad_norm": 38.14702606201172,
"learning_rate": 3.653171660487023e-06,
"loss": 0.1338,
"num_input_tokens_seen": 94720000,
"step": 92500
},
{
"epoch": 4.6597855496542735,
"grad_norm": 0.04164925217628479,
"learning_rate": 3.402645555666901e-06,
"loss": 0.0951,
"num_input_tokens_seen": 95232000,
"step": 93000
},
{
"epoch": 4.6848381601362865,
"grad_norm": 0.12932783365249634,
"learning_rate": 3.1521194508467787e-06,
"loss": 0.123,
"num_input_tokens_seen": 95744000,
"step": 93500
},
{
"epoch": 4.709890770618299,
"grad_norm": 0.41988006234169006,
"learning_rate": 2.901593346026656e-06,
"loss": 0.1316,
"num_input_tokens_seen": 96256000,
"step": 94000
},
{
"epoch": 4.734943381100311,
"grad_norm": 0.2050684094429016,
"learning_rate": 2.6510672412065337e-06,
"loss": 0.1193,
"num_input_tokens_seen": 96768000,
"step": 94500
},
{
"epoch": 4.759995991582323,
"grad_norm": 0.08065121620893478,
"learning_rate": 2.4005411363864116e-06,
"loss": 0.1069,
"num_input_tokens_seen": 97280000,
"step": 95000
},
{
"epoch": 4.785048602064335,
"grad_norm": 0.057149503380060196,
"learning_rate": 2.150015031566289e-06,
"loss": 0.1089,
"num_input_tokens_seen": 97792000,
"step": 95500
},
{
"epoch": 4.810101212546347,
"grad_norm": 0.12982851266860962,
"learning_rate": 1.8994889267461668e-06,
"loss": 0.1025,
"num_input_tokens_seen": 98304000,
"step": 96000
},
{
"epoch": 4.835153823028359,
"grad_norm": 0.09585094451904297,
"learning_rate": 1.6489628219260447e-06,
"loss": 0.1276,
"num_input_tokens_seen": 98816000,
"step": 96500
},
{
"epoch": 4.860206433510371,
"grad_norm": 0.06750782579183578,
"learning_rate": 1.3984367171059227e-06,
"loss": 0.13,
"num_input_tokens_seen": 99328000,
"step": 97000
},
{
"epoch": 4.885259043992384,
"grad_norm": 0.16248978674411774,
"learning_rate": 1.1479106122858004e-06,
"loss": 0.125,
"num_input_tokens_seen": 99840000,
"step": 97500
},
{
"epoch": 4.9103116544743965,
"grad_norm": 0.10622742027044296,
"learning_rate": 8.973845074656779e-07,
"loss": 0.1212,
"num_input_tokens_seen": 100352000,
"step": 98000
},
{
"epoch": 4.935364264956409,
"grad_norm": 0.0945580005645752,
"learning_rate": 6.468584026455557e-07,
"loss": 0.1094,
"num_input_tokens_seen": 100864000,
"step": 98500
},
{
"epoch": 4.960416875438421,
"grad_norm": 0.2841167449951172,
"learning_rate": 3.9633229782543347e-07,
"loss": 0.1056,
"num_input_tokens_seen": 101376000,
"step": 99000
},
{
"epoch": 4.985469485920433,
"grad_norm": 0.04773109778761864,
"learning_rate": 1.4580619300531115e-07,
"loss": 0.1127,
"num_input_tokens_seen": 101888000,
"step": 99500
},
{
"epoch": 5.0,
"eval_accuracy": 0.9069295520593246,
"eval_combined_score": 2.1307642592014506,
"eval_loss": 0.4687708020210266,
"eval_runtime": 20.4238,
"eval_samples_per_second": 1954.382,
"eval_steps_per_second": 244.322,
"num_input_tokens_seen": 102184960,
"step": 99790
},
{
"epoch": 5.0,
"num_input_tokens_seen": 102184960,
"step": 99790,
"total_flos": 1.314687947575296e+16,
"train_loss": 0.2132515576335182,
"train_runtime": 1882.7905,
"train_samples_per_second": 424.009,
"train_steps_per_second": 53.001,
"train_tokens_per_second": 54273.145
}
],
"logging_steps": 500,
"max_steps": 99790,
"num_input_tokens_seen": 102184960,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.314687947575296e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}