{ "best_global_step": 19958, "best_metric": 0.2926097810268402, "best_model_checkpoint": "/media/user/Expansion1/bge-small-en-v1.5-ultrafineweb-vs-pile-classifier/checkpoint-19958", "epoch": 5.0, "eval_steps": 500, "global_step": 99790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025052610482012225, "grad_norm": 1.178984522819519, "learning_rate": 4.9749974947389525e-05, "loss": 0.424, "num_input_tokens_seen": 512000, "step": 500 }, { "epoch": 0.05010522096402445, "grad_norm": 16.981231689453125, "learning_rate": 4.9499448842569396e-05, "loss": 0.3671, "num_input_tokens_seen": 1024000, "step": 1000 }, { "epoch": 0.07515783144603667, "grad_norm": 1.642918586730957, "learning_rate": 4.9248922737749274e-05, "loss": 0.3593, "num_input_tokens_seen": 1536000, "step": 1500 }, { "epoch": 0.1002104419280489, "grad_norm": 1.7461471557617188, "learning_rate": 4.899839663292916e-05, "loss": 0.3482, "num_input_tokens_seen": 2048000, "step": 2000 }, { "epoch": 0.12526305241006114, "grad_norm": 1.169636845588684, "learning_rate": 4.874787052810903e-05, "loss": 0.3251, "num_input_tokens_seen": 2560000, "step": 2500 }, { "epoch": 0.15031566289207335, "grad_norm": 2.5293004512786865, "learning_rate": 4.849734442328891e-05, "loss": 0.3267, "num_input_tokens_seen": 3072000, "step": 3000 }, { "epoch": 0.1753682733740856, "grad_norm": 2.039461135864258, "learning_rate": 4.824681831846879e-05, "loss": 0.3278, "num_input_tokens_seen": 3584000, "step": 3500 }, { "epoch": 0.2004208838560978, "grad_norm": 0.4097174406051636, "learning_rate": 4.799629221364866e-05, "loss": 0.3077, "num_input_tokens_seen": 4096000, "step": 4000 }, { "epoch": 0.22547349433811004, "grad_norm": 9.578287124633789, "learning_rate": 4.774576610882854e-05, "loss": 0.3076, "num_input_tokens_seen": 4608000, "step": 4500 }, { "epoch": 0.2505261048201223, "grad_norm": 2.4667727947235107, "learning_rate": 4.749524000400842e-05, "loss": 0.3063, "num_input_tokens_seen": 5120000, "step": 5000 }, { "epoch": 0.27557871530213446, "grad_norm": 0.20313717424869537, "learning_rate": 4.72447138991883e-05, "loss": 0.3087, "num_input_tokens_seen": 5632000, "step": 5500 }, { "epoch": 0.3006313257841467, "grad_norm": 12.131691932678223, "learning_rate": 4.6994187794368175e-05, "loss": 0.3134, "num_input_tokens_seen": 6144000, "step": 6000 }, { "epoch": 0.32568393626615894, "grad_norm": 9.874564170837402, "learning_rate": 4.674366168954805e-05, "loss": 0.3176, "num_input_tokens_seen": 6656000, "step": 6500 }, { "epoch": 0.3507365467481712, "grad_norm": 0.43632322549819946, "learning_rate": 4.649313558472793e-05, "loss": 0.3124, "num_input_tokens_seen": 7168000, "step": 7000 }, { "epoch": 0.3757891572301834, "grad_norm": 6.8587141036987305, "learning_rate": 4.624260947990781e-05, "loss": 0.3025, "num_input_tokens_seen": 7680000, "step": 7500 }, { "epoch": 0.4008417677121956, "grad_norm": 0.6035759449005127, "learning_rate": 4.5992083375087687e-05, "loss": 0.2948, "num_input_tokens_seen": 8192000, "step": 8000 }, { "epoch": 0.42589437819420783, "grad_norm": 9.4423246383667, "learning_rate": 4.5741557270267564e-05, "loss": 0.3029, "num_input_tokens_seen": 8704000, "step": 8500 }, { "epoch": 0.4509469886762201, "grad_norm": 0.47421976923942566, "learning_rate": 4.549103116544744e-05, "loss": 0.2954, "num_input_tokens_seen": 9216000, "step": 9000 }, { "epoch": 0.4759995991582323, "grad_norm": 2.0256924629211426, "learning_rate": 4.524050506062732e-05, "loss": 0.308, "num_input_tokens_seen": 9728000, "step": 9500 }, { "epoch": 0.5010522096402446, "grad_norm": 2.3783328533172607, "learning_rate": 4.49899789558072e-05, "loss": 0.308, "num_input_tokens_seen": 10240000, "step": 10000 }, { "epoch": 0.5261048201222568, "grad_norm": 11.536542892456055, "learning_rate": 4.4739452850987076e-05, "loss": 0.287, "num_input_tokens_seen": 10752000, "step": 10500 }, { "epoch": 0.5511574306042689, "grad_norm": 7.235984802246094, "learning_rate": 4.4488926746166954e-05, "loss": 0.3005, "num_input_tokens_seen": 11264000, "step": 11000 }, { "epoch": 0.5762100410862812, "grad_norm": 11.705055236816406, "learning_rate": 4.4238400641346825e-05, "loss": 0.3083, "num_input_tokens_seen": 11776000, "step": 11500 }, { "epoch": 0.6012626515682934, "grad_norm": 1.2158238887786865, "learning_rate": 4.398787453652671e-05, "loss": 0.2983, "num_input_tokens_seen": 12288000, "step": 12000 }, { "epoch": 0.6263152620503056, "grad_norm": 13.371932029724121, "learning_rate": 4.373734843170659e-05, "loss": 0.3055, "num_input_tokens_seen": 12800000, "step": 12500 }, { "epoch": 0.6513678725323179, "grad_norm": 1.052199363708496, "learning_rate": 4.348682232688646e-05, "loss": 0.3096, "num_input_tokens_seen": 13312000, "step": 13000 }, { "epoch": 0.6764204830143301, "grad_norm": 1.528619408607483, "learning_rate": 4.3236296222066344e-05, "loss": 0.3024, "num_input_tokens_seen": 13824000, "step": 13500 }, { "epoch": 0.7014730934963423, "grad_norm": 7.829930305480957, "learning_rate": 4.298577011724622e-05, "loss": 0.296, "num_input_tokens_seen": 14336000, "step": 14000 }, { "epoch": 0.7265257039783546, "grad_norm": 13.035155296325684, "learning_rate": 4.273524401242609e-05, "loss": 0.3154, "num_input_tokens_seen": 14848000, "step": 14500 }, { "epoch": 0.7515783144603668, "grad_norm": 12.151269912719727, "learning_rate": 4.248471790760598e-05, "loss": 0.2988, "num_input_tokens_seen": 15360000, "step": 15000 }, { "epoch": 0.776630924942379, "grad_norm": 2.0840141773223877, "learning_rate": 4.223419180278585e-05, "loss": 0.2945, "num_input_tokens_seen": 15872000, "step": 15500 }, { "epoch": 0.8016835354243912, "grad_norm": 14.681863784790039, "learning_rate": 4.1983665697965726e-05, "loss": 0.3228, "num_input_tokens_seen": 16384000, "step": 16000 }, { "epoch": 0.8267361459064034, "grad_norm": 0.5756533741950989, "learning_rate": 4.173313959314561e-05, "loss": 0.3042, "num_input_tokens_seen": 16896000, "step": 16500 }, { "epoch": 0.8517887563884157, "grad_norm": 5.992170810699463, "learning_rate": 4.148261348832548e-05, "loss": 0.3052, "num_input_tokens_seen": 17408000, "step": 17000 }, { "epoch": 0.8768413668704279, "grad_norm": 0.5731572508811951, "learning_rate": 4.123208738350536e-05, "loss": 0.3027, "num_input_tokens_seen": 17920000, "step": 17500 }, { "epoch": 0.9018939773524401, "grad_norm": 4.941533088684082, "learning_rate": 4.0981561278685245e-05, "loss": 0.2897, "num_input_tokens_seen": 18432000, "step": 18000 }, { "epoch": 0.9269465878344524, "grad_norm": 2.07985520362854, "learning_rate": 4.0731035173865116e-05, "loss": 0.3088, "num_input_tokens_seen": 18944000, "step": 18500 }, { "epoch": 0.9519991983164646, "grad_norm": 0.6285837292671204, "learning_rate": 4.0480509069044994e-05, "loss": 0.3138, "num_input_tokens_seen": 19456000, "step": 19000 }, { "epoch": 0.9770518087984768, "grad_norm": 1.9885900020599365, "learning_rate": 4.022998296422487e-05, "loss": 0.2893, "num_input_tokens_seen": 19968000, "step": 19500 }, { "epoch": 1.0, "eval_accuracy": 0.9061278685239001, "eval_combined_score": 2.144811219173038, "eval_loss": 0.2926097810268402, "eval_runtime": 20.2505, "eval_samples_per_second": 1971.108, "eval_steps_per_second": 246.413, "num_input_tokens_seen": 20436992, "step": 19958 }, { "epoch": 1.002104419280489, "grad_norm": 0.5625438690185547, "learning_rate": 3.997945685940475e-05, "loss": 0.2873, "num_input_tokens_seen": 20480000, "step": 20000 }, { "epoch": 1.0271570297625012, "grad_norm": 19.315837860107422, "learning_rate": 3.9728930754584634e-05, "loss": 0.2627, "num_input_tokens_seen": 20992000, "step": 20500 }, { "epoch": 1.0522096402445136, "grad_norm": 0.16634128987789154, "learning_rate": 3.9478404649764506e-05, "loss": 0.2664, "num_input_tokens_seen": 21504000, "step": 21000 }, { "epoch": 1.0772622507265257, "grad_norm": 139.21690368652344, "learning_rate": 3.9227878544944383e-05, "loss": 0.2469, "num_input_tokens_seen": 22016000, "step": 21500 }, { "epoch": 1.1023148612085378, "grad_norm": 0.19860202074050903, "learning_rate": 3.897735244012427e-05, "loss": 0.2786, "num_input_tokens_seen": 22528000, "step": 22000 }, { "epoch": 1.1273674716905502, "grad_norm": 0.420663058757782, "learning_rate": 3.872682633530414e-05, "loss": 0.2355, "num_input_tokens_seen": 23040000, "step": 22500 }, { "epoch": 1.1524200821725623, "grad_norm": 7.762341022491455, "learning_rate": 3.847630023048402e-05, "loss": 0.266, "num_input_tokens_seen": 23552000, "step": 23000 }, { "epoch": 1.1774726926545747, "grad_norm": 8.320157051086426, "learning_rate": 3.8225774125663895e-05, "loss": 0.2599, "num_input_tokens_seen": 24064000, "step": 23500 }, { "epoch": 1.2025253031365868, "grad_norm": 6.601953506469727, "learning_rate": 3.797524802084377e-05, "loss": 0.2692, "num_input_tokens_seen": 24576000, "step": 24000 }, { "epoch": 1.2275779136185991, "grad_norm": 2.0535728931427, "learning_rate": 3.772472191602365e-05, "loss": 0.2487, "num_input_tokens_seen": 25088000, "step": 24500 }, { "epoch": 1.2526305241006113, "grad_norm": 0.2633844316005707, "learning_rate": 3.747419581120353e-05, "loss": 0.2659, "num_input_tokens_seen": 25600000, "step": 25000 }, { "epoch": 1.2776831345826234, "grad_norm": 2.607544183731079, "learning_rate": 3.722366970638341e-05, "loss": 0.2809, "num_input_tokens_seen": 26112000, "step": 25500 }, { "epoch": 1.3027357450646357, "grad_norm": 0.6017013788223267, "learning_rate": 3.6973143601563285e-05, "loss": 0.2651, "num_input_tokens_seen": 26624000, "step": 26000 }, { "epoch": 1.327788355546648, "grad_norm": 0.30723240971565247, "learning_rate": 3.672261749674316e-05, "loss": 0.2565, "num_input_tokens_seen": 27136000, "step": 26500 }, { "epoch": 1.3528409660286602, "grad_norm": 0.3843832015991211, "learning_rate": 3.647209139192304e-05, "loss": 0.2736, "num_input_tokens_seen": 27648000, "step": 27000 }, { "epoch": 1.3778935765106723, "grad_norm": 12.198064804077148, "learning_rate": 3.622156528710292e-05, "loss": 0.2591, "num_input_tokens_seen": 28160000, "step": 27500 }, { "epoch": 1.4029461869926847, "grad_norm": 1.8191192150115967, "learning_rate": 3.5971039182282796e-05, "loss": 0.2667, "num_input_tokens_seen": 28672000, "step": 28000 }, { "epoch": 1.4279987974746968, "grad_norm": 4.33354377746582, "learning_rate": 3.5720513077462674e-05, "loss": 0.2812, "num_input_tokens_seen": 29184000, "step": 28500 }, { "epoch": 1.4530514079567092, "grad_norm": 6.340269088745117, "learning_rate": 3.546998697264255e-05, "loss": 0.2691, "num_input_tokens_seen": 29696000, "step": 29000 }, { "epoch": 1.4781040184387213, "grad_norm": 12.591937065124512, "learning_rate": 3.521946086782243e-05, "loss": 0.2591, "num_input_tokens_seen": 30208000, "step": 29500 }, { "epoch": 1.5031566289207334, "grad_norm": 0.40031296014785767, "learning_rate": 3.496893476300231e-05, "loss": 0.2735, "num_input_tokens_seen": 30720000, "step": 30000 }, { "epoch": 1.5282092394027458, "grad_norm": 3.4099674224853516, "learning_rate": 3.4718408658182186e-05, "loss": 0.2737, "num_input_tokens_seen": 31232000, "step": 30500 }, { "epoch": 1.5532618498847581, "grad_norm": 1.1597915887832642, "learning_rate": 3.4467882553362064e-05, "loss": 0.2587, "num_input_tokens_seen": 31744000, "step": 31000 }, { "epoch": 1.5783144603667703, "grad_norm": 68.36583709716797, "learning_rate": 3.4217356448541935e-05, "loss": 0.2533, "num_input_tokens_seen": 32256000, "step": 31500 }, { "epoch": 1.6033670708487824, "grad_norm": 2.799591302871704, "learning_rate": 3.396683034372182e-05, "loss": 0.2779, "num_input_tokens_seen": 32768000, "step": 32000 }, { "epoch": 1.6284196813307945, "grad_norm": 0.5911589860916138, "learning_rate": 3.37163042389017e-05, "loss": 0.2427, "num_input_tokens_seen": 33280000, "step": 32500 }, { "epoch": 1.6534722918128069, "grad_norm": 11.752649307250977, "learning_rate": 3.346577813408157e-05, "loss": 0.2387, "num_input_tokens_seen": 33792000, "step": 33000 }, { "epoch": 1.6785249022948192, "grad_norm": 0.4207652807235718, "learning_rate": 3.3215252029261453e-05, "loss": 0.2415, "num_input_tokens_seen": 34304000, "step": 33500 }, { "epoch": 1.7035775127768313, "grad_norm": 7.099503993988037, "learning_rate": 3.296472592444133e-05, "loss": 0.2649, "num_input_tokens_seen": 34816000, "step": 34000 }, { "epoch": 1.7286301232588435, "grad_norm": 0.41005975008010864, "learning_rate": 3.27141998196212e-05, "loss": 0.2618, "num_input_tokens_seen": 35328000, "step": 34500 }, { "epoch": 1.7536827337408558, "grad_norm": 7.169194221496582, "learning_rate": 3.246367371480109e-05, "loss": 0.2497, "num_input_tokens_seen": 35840000, "step": 35000 }, { "epoch": 1.7787353442228682, "grad_norm": 2.8473153114318848, "learning_rate": 3.221314760998096e-05, "loss": 0.2706, "num_input_tokens_seen": 36352000, "step": 35500 }, { "epoch": 1.8037879547048803, "grad_norm": 37.23502731323242, "learning_rate": 3.1962621505160836e-05, "loss": 0.2643, "num_input_tokens_seen": 36864000, "step": 36000 }, { "epoch": 1.8288405651868924, "grad_norm": 0.3462938666343689, "learning_rate": 3.171209540034072e-05, "loss": 0.258, "num_input_tokens_seen": 37376000, "step": 36500 }, { "epoch": 1.8538931756689045, "grad_norm": 4.687111854553223, "learning_rate": 3.146156929552059e-05, "loss": 0.2555, "num_input_tokens_seen": 37888000, "step": 37000 }, { "epoch": 1.878945786150917, "grad_norm": 0.3295840919017792, "learning_rate": 3.121104319070047e-05, "loss": 0.2475, "num_input_tokens_seen": 38400000, "step": 37500 }, { "epoch": 1.9039983966329292, "grad_norm": 5.046384334564209, "learning_rate": 3.0960517085880355e-05, "loss": 0.2483, "num_input_tokens_seen": 38912000, "step": 38000 }, { "epoch": 1.9290510071149414, "grad_norm": 4.56272029876709, "learning_rate": 3.0709990981060226e-05, "loss": 0.2521, "num_input_tokens_seen": 39424000, "step": 38500 }, { "epoch": 1.9541036175969535, "grad_norm": 3.8051841259002686, "learning_rate": 3.0459464876240107e-05, "loss": 0.2805, "num_input_tokens_seen": 39936000, "step": 39000 }, { "epoch": 1.9791562280789659, "grad_norm": 0.6326732039451599, "learning_rate": 3.020893877141998e-05, "loss": 0.2397, "num_input_tokens_seen": 40448000, "step": 39500 }, { "epoch": 2.0, "eval_accuracy": 0.9075809199318569, "eval_combined_score": 2.1193511042245365, "eval_loss": 0.31268036365509033, "eval_runtime": 20.2588, "eval_samples_per_second": 1970.3, "eval_steps_per_second": 246.312, "num_input_tokens_seen": 40873984, "step": 39916 }, { "epoch": 2.004208838560978, "grad_norm": 6.647907733917236, "learning_rate": 2.995841266659986e-05, "loss": 0.2299, "num_input_tokens_seen": 40960000, "step": 40000 }, { "epoch": 2.0292614490429903, "grad_norm": 0.32763534784317017, "learning_rate": 2.970788656177974e-05, "loss": 0.1992, "num_input_tokens_seen": 41472000, "step": 40500 }, { "epoch": 2.0543140595250025, "grad_norm": 14.943070411682129, "learning_rate": 2.9457360456959615e-05, "loss": 0.2179, "num_input_tokens_seen": 41984000, "step": 41000 }, { "epoch": 2.0793666700070146, "grad_norm": 4.630057334899902, "learning_rate": 2.9206834352139493e-05, "loss": 0.2151, "num_input_tokens_seen": 42496000, "step": 41500 }, { "epoch": 2.104419280489027, "grad_norm": 15.34054183959961, "learning_rate": 2.8956308247319375e-05, "loss": 0.197, "num_input_tokens_seen": 43008000, "step": 42000 }, { "epoch": 2.1294718909710393, "grad_norm": 0.16923962533473969, "learning_rate": 2.870578214249925e-05, "loss": 0.1865, "num_input_tokens_seen": 43520000, "step": 42500 }, { "epoch": 2.1545245014530514, "grad_norm": 34.0042839050293, "learning_rate": 2.845525603767913e-05, "loss": 0.2122, "num_input_tokens_seen": 44032000, "step": 43000 }, { "epoch": 2.1795771119350635, "grad_norm": 5.353533744812012, "learning_rate": 2.8204729932859e-05, "loss": 0.2289, "num_input_tokens_seen": 44544000, "step": 43500 }, { "epoch": 2.2046297224170757, "grad_norm": 0.5980260372161865, "learning_rate": 2.7954203828038883e-05, "loss": 0.2175, "num_input_tokens_seen": 45056000, "step": 44000 }, { "epoch": 2.2296823328990882, "grad_norm": 12.995455741882324, "learning_rate": 2.7703677723218764e-05, "loss": 0.2337, "num_input_tokens_seen": 45568000, "step": 44500 }, { "epoch": 2.2547349433811004, "grad_norm": 0.11803791671991348, "learning_rate": 2.7453151618398635e-05, "loss": 0.2114, "num_input_tokens_seen": 46080000, "step": 45000 }, { "epoch": 2.2797875538631125, "grad_norm": 0.21874956786632538, "learning_rate": 2.7202625513578517e-05, "loss": 0.2215, "num_input_tokens_seen": 46592000, "step": 45500 }, { "epoch": 2.3048401643451246, "grad_norm": 42.951351165771484, "learning_rate": 2.6952099408758398e-05, "loss": 0.2296, "num_input_tokens_seen": 47104000, "step": 46000 }, { "epoch": 2.3298927748271367, "grad_norm": 3.324039936065674, "learning_rate": 2.6701573303938272e-05, "loss": 0.2118, "num_input_tokens_seen": 47616000, "step": 46500 }, { "epoch": 2.3549453853091493, "grad_norm": 0.8097792863845825, "learning_rate": 2.645104719911815e-05, "loss": 0.198, "num_input_tokens_seen": 48128000, "step": 47000 }, { "epoch": 2.3799979957911614, "grad_norm": 2.3140671253204346, "learning_rate": 2.6200521094298025e-05, "loss": 0.2251, "num_input_tokens_seen": 48640000, "step": 47500 }, { "epoch": 2.4050506062731736, "grad_norm": 5.793896675109863, "learning_rate": 2.5949994989477906e-05, "loss": 0.1942, "num_input_tokens_seen": 49152000, "step": 48000 }, { "epoch": 2.4301032167551857, "grad_norm": 0.08759485185146332, "learning_rate": 2.5699468884657784e-05, "loss": 0.2105, "num_input_tokens_seen": 49664000, "step": 48500 }, { "epoch": 2.4551558272371983, "grad_norm": 13.725948333740234, "learning_rate": 2.544894277983766e-05, "loss": 0.2174, "num_input_tokens_seen": 50176000, "step": 49000 }, { "epoch": 2.4802084377192104, "grad_norm": 0.24512171745300293, "learning_rate": 2.519841667501754e-05, "loss": 0.2213, "num_input_tokens_seen": 50688000, "step": 49500 }, { "epoch": 2.5052610482012225, "grad_norm": 24.33919906616211, "learning_rate": 2.4947890570197415e-05, "loss": 0.208, "num_input_tokens_seen": 51200000, "step": 50000 }, { "epoch": 2.5303136586832347, "grad_norm": 20.30912971496582, "learning_rate": 2.4697364465377292e-05, "loss": 0.2344, "num_input_tokens_seen": 51712000, "step": 50500 }, { "epoch": 2.555366269165247, "grad_norm": 0.27970781922340393, "learning_rate": 2.444683836055717e-05, "loss": 0.217, "num_input_tokens_seen": 52224000, "step": 51000 }, { "epoch": 2.5804188796472594, "grad_norm": 0.18607856333255768, "learning_rate": 2.4196312255737048e-05, "loss": 0.1942, "num_input_tokens_seen": 52736000, "step": 51500 }, { "epoch": 2.6054714901292715, "grad_norm": 3.2024385929107666, "learning_rate": 2.3945786150916926e-05, "loss": 0.2009, "num_input_tokens_seen": 53248000, "step": 52000 }, { "epoch": 2.6305241006112836, "grad_norm": 9.92158317565918, "learning_rate": 2.3695260046096804e-05, "loss": 0.1888, "num_input_tokens_seen": 53760000, "step": 52500 }, { "epoch": 2.655576711093296, "grad_norm": 9.307025909423828, "learning_rate": 2.3444733941276682e-05, "loss": 0.1927, "num_input_tokens_seen": 54272000, "step": 53000 }, { "epoch": 2.6806293215753083, "grad_norm": 32.159671783447266, "learning_rate": 2.319420783645656e-05, "loss": 0.213, "num_input_tokens_seen": 54784000, "step": 53500 }, { "epoch": 2.7056819320573204, "grad_norm": 11.267858505249023, "learning_rate": 2.2943681731636438e-05, "loss": 0.2264, "num_input_tokens_seen": 55296000, "step": 54000 }, { "epoch": 2.7307345425393326, "grad_norm": 0.18241587281227112, "learning_rate": 2.2693155626816316e-05, "loss": 0.2253, "num_input_tokens_seen": 55808000, "step": 54500 }, { "epoch": 2.7557871530213447, "grad_norm": 1.898651123046875, "learning_rate": 2.244262952199619e-05, "loss": 0.2112, "num_input_tokens_seen": 56320000, "step": 55000 }, { "epoch": 2.780839763503357, "grad_norm": 10.556557655334473, "learning_rate": 2.219210341717607e-05, "loss": 0.2157, "num_input_tokens_seen": 56832000, "step": 55500 }, { "epoch": 2.8058923739853694, "grad_norm": 1.2913810014724731, "learning_rate": 2.194157731235595e-05, "loss": 0.2218, "num_input_tokens_seen": 57344000, "step": 56000 }, { "epoch": 2.8309449844673815, "grad_norm": 20.129615783691406, "learning_rate": 2.1691051207535827e-05, "loss": 0.2205, "num_input_tokens_seen": 57856000, "step": 56500 }, { "epoch": 2.8559975949493936, "grad_norm": 0.3709011971950531, "learning_rate": 2.1440525102715702e-05, "loss": 0.2288, "num_input_tokens_seen": 58368000, "step": 57000 }, { "epoch": 2.881050205431406, "grad_norm": 24.663593292236328, "learning_rate": 2.1189998997895583e-05, "loss": 0.2225, "num_input_tokens_seen": 58880000, "step": 57500 }, { "epoch": 2.9061028159134183, "grad_norm": 8.534331321716309, "learning_rate": 2.093947289307546e-05, "loss": 0.2236, "num_input_tokens_seen": 59392000, "step": 58000 }, { "epoch": 2.9311554263954305, "grad_norm": 23.226032257080078, "learning_rate": 2.0688946788255336e-05, "loss": 0.2111, "num_input_tokens_seen": 59904000, "step": 58500 }, { "epoch": 2.9562080368774426, "grad_norm": 16.948610305786133, "learning_rate": 2.0438420683435214e-05, "loss": 0.2123, "num_input_tokens_seen": 60416000, "step": 59000 }, { "epoch": 2.9812606473594547, "grad_norm": 3.7574212551116943, "learning_rate": 2.0187894578615095e-05, "loss": 0.2, "num_input_tokens_seen": 60928000, "step": 59500 }, { "epoch": 3.0, "eval_accuracy": 0.9109379697364466, "eval_combined_score": 2.0605294593435146, "eval_loss": 0.32792404294013977, "eval_runtime": 20.2718, "eval_samples_per_second": 1969.037, "eval_steps_per_second": 246.154, "num_input_tokens_seen": 61310976, "step": 59874 }, { "epoch": 3.0063132578414673, "grad_norm": 2.2473177909851074, "learning_rate": 1.993736847379497e-05, "loss": 0.1933, "num_input_tokens_seen": 61440000, "step": 60000 }, { "epoch": 3.0313658683234794, "grad_norm": 0.11083228886127472, "learning_rate": 1.9686842368974847e-05, "loss": 0.1592, "num_input_tokens_seen": 61952000, "step": 60500 }, { "epoch": 3.0564184788054916, "grad_norm": 0.06714469939470291, "learning_rate": 1.9436316264154725e-05, "loss": 0.158, "num_input_tokens_seen": 62464000, "step": 61000 }, { "epoch": 3.0814710892875037, "grad_norm": 0.41380876302719116, "learning_rate": 1.9185790159334603e-05, "loss": 0.1591, "num_input_tokens_seen": 62976000, "step": 61500 }, { "epoch": 3.106523699769516, "grad_norm": 11.745950698852539, "learning_rate": 1.893526405451448e-05, "loss": 0.1687, "num_input_tokens_seen": 63488000, "step": 62000 }, { "epoch": 3.1315763102515284, "grad_norm": 0.10439453274011612, "learning_rate": 1.868473794969436e-05, "loss": 0.1773, "num_input_tokens_seen": 64000000, "step": 62500 }, { "epoch": 3.1566289207335405, "grad_norm": 0.05308441445231438, "learning_rate": 1.8434211844874237e-05, "loss": 0.1729, "num_input_tokens_seen": 64512000, "step": 63000 }, { "epoch": 3.1816815312155526, "grad_norm": 2.2679662704467773, "learning_rate": 1.8183685740054115e-05, "loss": 0.1428, "num_input_tokens_seen": 65024000, "step": 63500 }, { "epoch": 3.2067341416975648, "grad_norm": 0.18617786467075348, "learning_rate": 1.7933159635233993e-05, "loss": 0.162, "num_input_tokens_seen": 65536000, "step": 64000 }, { "epoch": 3.231786752179577, "grad_norm": 0.09589721262454987, "learning_rate": 1.768263353041387e-05, "loss": 0.1425, "num_input_tokens_seen": 66048000, "step": 64500 }, { "epoch": 3.2568393626615895, "grad_norm": 0.03517961502075195, "learning_rate": 1.7432107425593745e-05, "loss": 0.1709, "num_input_tokens_seen": 66560000, "step": 65000 }, { "epoch": 3.2818919731436016, "grad_norm": 0.16013863682746887, "learning_rate": 1.7181581320773626e-05, "loss": 0.1671, "num_input_tokens_seen": 67072000, "step": 65500 }, { "epoch": 3.3069445836256137, "grad_norm": 0.9810895323753357, "learning_rate": 1.6931055215953504e-05, "loss": 0.1552, "num_input_tokens_seen": 67584000, "step": 66000 }, { "epoch": 3.331997194107626, "grad_norm": 0.12127078324556351, "learning_rate": 1.668052911113338e-05, "loss": 0.1752, "num_input_tokens_seen": 68096000, "step": 66500 }, { "epoch": 3.3570498045896384, "grad_norm": 0.22286617755889893, "learning_rate": 1.6430003006313257e-05, "loss": 0.1761, "num_input_tokens_seen": 68608000, "step": 67000 }, { "epoch": 3.3821024150716505, "grad_norm": 35.76771545410156, "learning_rate": 1.6179476901493138e-05, "loss": 0.1687, "num_input_tokens_seen": 69120000, "step": 67500 }, { "epoch": 3.4071550255536627, "grad_norm": 0.16311609745025635, "learning_rate": 1.5928950796673016e-05, "loss": 0.1448, "num_input_tokens_seen": 69632000, "step": 68000 }, { "epoch": 3.432207636035675, "grad_norm": 0.10213588923215866, "learning_rate": 1.567842469185289e-05, "loss": 0.1668, "num_input_tokens_seen": 70144000, "step": 68500 }, { "epoch": 3.457260246517687, "grad_norm": 0.04066482558846474, "learning_rate": 1.542789858703277e-05, "loss": 0.1556, "num_input_tokens_seen": 70656000, "step": 69000 }, { "epoch": 3.4823128569996995, "grad_norm": 2.4263927936553955, "learning_rate": 1.5177372482212648e-05, "loss": 0.1692, "num_input_tokens_seen": 71168000, "step": 69500 }, { "epoch": 3.5073654674817116, "grad_norm": 0.2218380570411682, "learning_rate": 1.4926846377392526e-05, "loss": 0.1552, "num_input_tokens_seen": 71680000, "step": 70000 }, { "epoch": 3.5324180779637238, "grad_norm": 0.08358863741159439, "learning_rate": 1.4676320272572402e-05, "loss": 0.1602, "num_input_tokens_seen": 72192000, "step": 70500 }, { "epoch": 3.557470688445736, "grad_norm": 0.16968253254890442, "learning_rate": 1.442579416775228e-05, "loss": 0.1651, "num_input_tokens_seen": 72704000, "step": 71000 }, { "epoch": 3.5825232989277485, "grad_norm": 20.67737579345703, "learning_rate": 1.417526806293216e-05, "loss": 0.173, "num_input_tokens_seen": 73216000, "step": 71500 }, { "epoch": 3.6075759094097606, "grad_norm": 15.341917991638184, "learning_rate": 1.3924741958112036e-05, "loss": 0.1566, "num_input_tokens_seen": 73728000, "step": 72000 }, { "epoch": 3.6326285198917727, "grad_norm": 0.07673631608486176, "learning_rate": 1.3674215853291914e-05, "loss": 0.1765, "num_input_tokens_seen": 74240000, "step": 72500 }, { "epoch": 3.657681130373785, "grad_norm": 0.21996235847473145, "learning_rate": 1.342368974847179e-05, "loss": 0.1461, "num_input_tokens_seen": 74752000, "step": 73000 }, { "epoch": 3.682733740855797, "grad_norm": 183.6659393310547, "learning_rate": 1.317316364365167e-05, "loss": 0.1706, "num_input_tokens_seen": 75264000, "step": 73500 }, { "epoch": 3.7077863513378095, "grad_norm": 0.1564781218767166, "learning_rate": 1.2922637538831548e-05, "loss": 0.159, "num_input_tokens_seen": 75776000, "step": 74000 }, { "epoch": 3.7328389618198217, "grad_norm": 8.662553787231445, "learning_rate": 1.2672111434011424e-05, "loss": 0.1737, "num_input_tokens_seen": 76288000, "step": 74500 }, { "epoch": 3.757891572301834, "grad_norm": 331.7611999511719, "learning_rate": 1.2421585329191303e-05, "loss": 0.1603, "num_input_tokens_seen": 76800000, "step": 75000 }, { "epoch": 3.782944182783846, "grad_norm": 0.09944739192724228, "learning_rate": 1.217105922437118e-05, "loss": 0.1508, "num_input_tokens_seen": 77312000, "step": 75500 }, { "epoch": 3.8079967932658585, "grad_norm": 0.24497084319591522, "learning_rate": 1.1920533119551058e-05, "loss": 0.1595, "num_input_tokens_seen": 77824000, "step": 76000 }, { "epoch": 3.8330494037478706, "grad_norm": 3.5547239780426025, "learning_rate": 1.1670007014730936e-05, "loss": 0.1595, "num_input_tokens_seen": 78336000, "step": 76500 }, { "epoch": 3.8581020142298827, "grad_norm": 0.3509676456451416, "learning_rate": 1.1419480909910813e-05, "loss": 0.1606, "num_input_tokens_seen": 78848000, "step": 77000 }, { "epoch": 3.883154624711895, "grad_norm": 19.29859161376953, "learning_rate": 1.1168954805090691e-05, "loss": 0.1722, "num_input_tokens_seen": 79360000, "step": 77500 }, { "epoch": 3.908207235193907, "grad_norm": 0.9797153472900391, "learning_rate": 1.091842870027057e-05, "loss": 0.159, "num_input_tokens_seen": 79872000, "step": 78000 }, { "epoch": 3.9332598456759196, "grad_norm": 24.612085342407227, "learning_rate": 1.0667902595450446e-05, "loss": 0.1656, "num_input_tokens_seen": 80384000, "step": 78500 }, { "epoch": 3.9583124561579317, "grad_norm": 42.16061782836914, "learning_rate": 1.0417376490630325e-05, "loss": 0.1494, "num_input_tokens_seen": 80896000, "step": 79000 }, { "epoch": 3.983365066639944, "grad_norm": 0.244501531124115, "learning_rate": 1.0166850385810201e-05, "loss": 0.1576, "num_input_tokens_seen": 81408000, "step": 79500 }, { "epoch": 4.0, "eval_accuracy": 0.9080068143100511, "eval_combined_score": 2.11188865673963, "eval_loss": 0.3887489140033722, "eval_runtime": 20.3698, "eval_samples_per_second": 1959.563, "eval_steps_per_second": 244.97, "num_input_tokens_seen": 81747968, "step": 79832 }, { "epoch": 4.008417677121956, "grad_norm": 0.1654539704322815, "learning_rate": 9.916324280990081e-06, "loss": 0.1449, "num_input_tokens_seen": 81920000, "step": 80000 }, { "epoch": 4.0334702876039685, "grad_norm": 0.1446412056684494, "learning_rate": 9.665798176169957e-06, "loss": 0.1065, "num_input_tokens_seen": 82432000, "step": 80500 }, { "epoch": 4.058522898085981, "grad_norm": 0.15385593473911285, "learning_rate": 9.415272071349835e-06, "loss": 0.1173, "num_input_tokens_seen": 82944000, "step": 81000 }, { "epoch": 4.083575508567993, "grad_norm": 0.1097198873758316, "learning_rate": 9.164745966529713e-06, "loss": 0.122, "num_input_tokens_seen": 83456000, "step": 81500 }, { "epoch": 4.108628119050005, "grad_norm": 140.7417449951172, "learning_rate": 8.914219861709591e-06, "loss": 0.1334, "num_input_tokens_seen": 83968000, "step": 82000 }, { "epoch": 4.133680729532017, "grad_norm": 0.07995349913835526, "learning_rate": 8.663693756889469e-06, "loss": 0.1099, "num_input_tokens_seen": 84480000, "step": 82500 }, { "epoch": 4.158733340014029, "grad_norm": 23.00501823425293, "learning_rate": 8.413167652069347e-06, "loss": 0.1362, "num_input_tokens_seen": 84992000, "step": 83000 }, { "epoch": 4.183785950496041, "grad_norm": 0.17540641129016876, "learning_rate": 8.162641547249223e-06, "loss": 0.1119, "num_input_tokens_seen": 85504000, "step": 83500 }, { "epoch": 4.208838560978054, "grad_norm": 10.598029136657715, "learning_rate": 7.912115442429101e-06, "loss": 0.1174, "num_input_tokens_seen": 86016000, "step": 84000 }, { "epoch": 4.233891171460066, "grad_norm": 0.06076182797551155, "learning_rate": 7.661589337608979e-06, "loss": 0.104, "num_input_tokens_seen": 86528000, "step": 84500 }, { "epoch": 4.258943781942079, "grad_norm": 0.2133261263370514, "learning_rate": 7.411063232788856e-06, "loss": 0.1229, "num_input_tokens_seen": 87040000, "step": 85000 }, { "epoch": 4.283996392424091, "grad_norm": 8.342382431030273, "learning_rate": 7.160537127968735e-06, "loss": 0.1254, "num_input_tokens_seen": 87552000, "step": 85500 }, { "epoch": 4.309049002906103, "grad_norm": 0.015887776389718056, "learning_rate": 6.910011023148612e-06, "loss": 0.1308, "num_input_tokens_seen": 88064000, "step": 86000 }, { "epoch": 4.334101613388115, "grad_norm": 0.24978305399417877, "learning_rate": 6.6594849183284905e-06, "loss": 0.1106, "num_input_tokens_seen": 88576000, "step": 86500 }, { "epoch": 4.359154223870127, "grad_norm": 0.19210007786750793, "learning_rate": 6.4089588135083675e-06, "loss": 0.1257, "num_input_tokens_seen": 89088000, "step": 87000 }, { "epoch": 4.384206834352139, "grad_norm": 10.589780807495117, "learning_rate": 6.1584327086882454e-06, "loss": 0.1238, "num_input_tokens_seen": 89600000, "step": 87500 }, { "epoch": 4.409259444834151, "grad_norm": 38.77216720581055, "learning_rate": 5.907906603868123e-06, "loss": 0.1213, "num_input_tokens_seen": 90112000, "step": 88000 }, { "epoch": 4.434312055316164, "grad_norm": 0.1937304437160492, "learning_rate": 5.657380499048001e-06, "loss": 0.1346, "num_input_tokens_seen": 90624000, "step": 88500 }, { "epoch": 4.4593646657981765, "grad_norm": 0.023678578436374664, "learning_rate": 5.406854394227879e-06, "loss": 0.1013, "num_input_tokens_seen": 91136000, "step": 89000 }, { "epoch": 4.484417276280189, "grad_norm": 7.357041358947754, "learning_rate": 5.156328289407756e-06, "loss": 0.1353, "num_input_tokens_seen": 91648000, "step": 89500 }, { "epoch": 4.509469886762201, "grad_norm": 0.0861930251121521, "learning_rate": 4.905802184587634e-06, "loss": 0.1188, "num_input_tokens_seen": 92160000, "step": 90000 }, { "epoch": 4.534522497244213, "grad_norm": 0.05523020401597023, "learning_rate": 4.655276079767512e-06, "loss": 0.0998, "num_input_tokens_seen": 92672000, "step": 90500 }, { "epoch": 4.559575107726225, "grad_norm": 35.0329704284668, "learning_rate": 4.40474997494739e-06, "loss": 0.1286, "num_input_tokens_seen": 93184000, "step": 91000 }, { "epoch": 4.584627718208237, "grad_norm": 0.04070122167468071, "learning_rate": 4.154223870127268e-06, "loss": 0.116, "num_input_tokens_seen": 93696000, "step": 91500 }, { "epoch": 4.609680328690249, "grad_norm": 6.631749153137207, "learning_rate": 3.903697765307145e-06, "loss": 0.1079, "num_input_tokens_seen": 94208000, "step": 92000 }, { "epoch": 4.634732939172261, "grad_norm": 38.14702606201172, "learning_rate": 3.653171660487023e-06, "loss": 0.1338, "num_input_tokens_seen": 94720000, "step": 92500 }, { "epoch": 4.6597855496542735, "grad_norm": 0.04164925217628479, "learning_rate": 3.402645555666901e-06, "loss": 0.0951, "num_input_tokens_seen": 95232000, "step": 93000 }, { "epoch": 4.6848381601362865, "grad_norm": 0.12932783365249634, "learning_rate": 3.1521194508467787e-06, "loss": 0.123, "num_input_tokens_seen": 95744000, "step": 93500 }, { "epoch": 4.709890770618299, "grad_norm": 0.41988006234169006, "learning_rate": 2.901593346026656e-06, "loss": 0.1316, "num_input_tokens_seen": 96256000, "step": 94000 }, { "epoch": 4.734943381100311, "grad_norm": 0.2050684094429016, "learning_rate": 2.6510672412065337e-06, "loss": 0.1193, "num_input_tokens_seen": 96768000, "step": 94500 }, { "epoch": 4.759995991582323, "grad_norm": 0.08065121620893478, "learning_rate": 2.4005411363864116e-06, "loss": 0.1069, "num_input_tokens_seen": 97280000, "step": 95000 }, { "epoch": 4.785048602064335, "grad_norm": 0.057149503380060196, "learning_rate": 2.150015031566289e-06, "loss": 0.1089, "num_input_tokens_seen": 97792000, "step": 95500 }, { "epoch": 4.810101212546347, "grad_norm": 0.12982851266860962, "learning_rate": 1.8994889267461668e-06, "loss": 0.1025, "num_input_tokens_seen": 98304000, "step": 96000 }, { "epoch": 4.835153823028359, "grad_norm": 0.09585094451904297, "learning_rate": 1.6489628219260447e-06, "loss": 0.1276, "num_input_tokens_seen": 98816000, "step": 96500 }, { "epoch": 4.860206433510371, "grad_norm": 0.06750782579183578, "learning_rate": 1.3984367171059227e-06, "loss": 0.13, "num_input_tokens_seen": 99328000, "step": 97000 }, { "epoch": 4.885259043992384, "grad_norm": 0.16248978674411774, "learning_rate": 1.1479106122858004e-06, "loss": 0.125, "num_input_tokens_seen": 99840000, "step": 97500 }, { "epoch": 4.9103116544743965, "grad_norm": 0.10622742027044296, "learning_rate": 8.973845074656779e-07, "loss": 0.1212, "num_input_tokens_seen": 100352000, "step": 98000 }, { "epoch": 4.935364264956409, "grad_norm": 0.0945580005645752, "learning_rate": 6.468584026455557e-07, "loss": 0.1094, "num_input_tokens_seen": 100864000, "step": 98500 }, { "epoch": 4.960416875438421, "grad_norm": 0.2841167449951172, "learning_rate": 3.9633229782543347e-07, "loss": 0.1056, "num_input_tokens_seen": 101376000, "step": 99000 }, { "epoch": 4.985469485920433, "grad_norm": 0.04773109778761864, "learning_rate": 1.4580619300531115e-07, "loss": 0.1127, "num_input_tokens_seen": 101888000, "step": 99500 }, { "epoch": 5.0, "eval_accuracy": 0.9069295520593246, "eval_combined_score": 2.1307642592014506, "eval_loss": 0.4687708020210266, "eval_runtime": 20.4238, "eval_samples_per_second": 1954.382, "eval_steps_per_second": 244.322, "num_input_tokens_seen": 102184960, "step": 99790 }, { "epoch": 5.0, "num_input_tokens_seen": 102184960, "step": 99790, "total_flos": 1.314687947575296e+16, "train_loss": 0.2132515576335182, "train_runtime": 1882.7905, "train_samples_per_second": 424.009, "train_steps_per_second": 53.001, "train_tokens_per_second": 54273.145 } ], "logging_steps": 500, "max_steps": 99790, "num_input_tokens_seen": 102184960, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.314687947575296e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }