{
  "best_global_step": 19958,
  "best_metric": 0.2926097810268402,
  "best_model_checkpoint": "/media/user/Expansion1/bge-small-en-v1.5-ultrafineweb-vs-pile-classifier/checkpoint-19958",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 99790,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025052610482012225,
      "grad_norm": 1.178984522819519,
      "learning_rate": 4.9749974947389525e-05,
      "loss": 0.424,
      "num_input_tokens_seen": 512000,
      "step": 500
    },
    {
      "epoch": 0.05010522096402445,
      "grad_norm": 16.981231689453125,
      "learning_rate": 4.9499448842569396e-05,
      "loss": 0.3671,
      "num_input_tokens_seen": 1024000,
      "step": 1000
    },
    {
      "epoch": 0.07515783144603667,
      "grad_norm": 1.642918586730957,
      "learning_rate": 4.9248922737749274e-05,
      "loss": 0.3593,
      "num_input_tokens_seen": 1536000,
      "step": 1500
    },
    {
      "epoch": 0.1002104419280489,
      "grad_norm": 1.7461471557617188,
      "learning_rate": 4.899839663292916e-05,
      "loss": 0.3482,
      "num_input_tokens_seen": 2048000,
      "step": 2000
    },
    {
      "epoch": 0.12526305241006114,
      "grad_norm": 1.169636845588684,
      "learning_rate": 4.874787052810903e-05,
      "loss": 0.3251,
      "num_input_tokens_seen": 2560000,
      "step": 2500
    },
    {
      "epoch": 0.15031566289207335,
      "grad_norm": 2.5293004512786865,
      "learning_rate": 4.849734442328891e-05,
      "loss": 0.3267,
      "num_input_tokens_seen": 3072000,
      "step": 3000
    },
    {
      "epoch": 0.1753682733740856,
      "grad_norm": 2.039461135864258,
      "learning_rate": 4.824681831846879e-05,
      "loss": 0.3278,
      "num_input_tokens_seen": 3584000,
      "step": 3500
    },
    {
      "epoch": 0.2004208838560978,
      "grad_norm": 0.4097174406051636,
      "learning_rate": 4.799629221364866e-05,
      "loss": 0.3077,
      "num_input_tokens_seen": 4096000,
      "step": 4000
    },
    {
      "epoch": 0.22547349433811004,
      "grad_norm": 9.578287124633789,
      "learning_rate": 4.774576610882854e-05,
      "loss": 0.3076,
      "num_input_tokens_seen": 4608000,
      "step": 4500
    },
    {
      "epoch": 0.2505261048201223,
      "grad_norm": 2.4667727947235107,
      "learning_rate": 4.749524000400842e-05,
      "loss": 0.3063,
      "num_input_tokens_seen": 5120000,
      "step": 5000
    },
    {
      "epoch": 0.27557871530213446,
      "grad_norm": 0.20313717424869537,
      "learning_rate": 4.72447138991883e-05,
      "loss": 0.3087,
      "num_input_tokens_seen": 5632000,
      "step": 5500
    },
    {
      "epoch": 0.3006313257841467,
      "grad_norm": 12.131691932678223,
      "learning_rate": 4.6994187794368175e-05,
      "loss": 0.3134,
      "num_input_tokens_seen": 6144000,
      "step": 6000
    },
    {
      "epoch": 0.32568393626615894,
      "grad_norm": 9.874564170837402,
      "learning_rate": 4.674366168954805e-05,
      "loss": 0.3176,
      "num_input_tokens_seen": 6656000,
      "step": 6500
    },
    {
      "epoch": 0.3507365467481712,
      "grad_norm": 0.43632322549819946,
      "learning_rate": 4.649313558472793e-05,
      "loss": 0.3124,
      "num_input_tokens_seen": 7168000,
      "step": 7000
    },
    {
      "epoch": 0.3757891572301834,
      "grad_norm": 6.8587141036987305,
      "learning_rate": 4.624260947990781e-05,
      "loss": 0.3025,
      "num_input_tokens_seen": 7680000,
      "step": 7500
    },
    {
      "epoch": 0.4008417677121956,
      "grad_norm": 0.6035759449005127,
      "learning_rate": 4.5992083375087687e-05,
      "loss": 0.2948,
      "num_input_tokens_seen": 8192000,
      "step": 8000
    },
    {
      "epoch": 0.42589437819420783,
      "grad_norm": 9.4423246383667,
      "learning_rate": 4.5741557270267564e-05,
      "loss": 0.3029,
      "num_input_tokens_seen": 8704000,
      "step": 8500
    },
    {
      "epoch": 0.4509469886762201,
      "grad_norm": 0.47421976923942566,
      "learning_rate": 4.549103116544744e-05,
      "loss": 0.2954,
      "num_input_tokens_seen": 9216000,
      "step": 9000
    },
    {
      "epoch": 0.4759995991582323,
      "grad_norm": 2.0256924629211426,
      "learning_rate": 4.524050506062732e-05,
      "loss": 0.308,
      "num_input_tokens_seen": 9728000,
      "step": 9500
    },
    {
      "epoch": 0.5010522096402446,
      "grad_norm": 2.3783328533172607,
      "learning_rate": 4.49899789558072e-05,
      "loss": 0.308,
      "num_input_tokens_seen": 10240000,
      "step": 10000
    },
    {
      "epoch": 0.5261048201222568,
      "grad_norm": 11.536542892456055,
      "learning_rate": 4.4739452850987076e-05,
      "loss": 0.287,
      "num_input_tokens_seen": 10752000,
      "step": 10500
    },
    {
      "epoch": 0.5511574306042689,
      "grad_norm": 7.235984802246094,
      "learning_rate": 4.4488926746166954e-05,
      "loss": 0.3005,
      "num_input_tokens_seen": 11264000,
      "step": 11000
    },
    {
      "epoch": 0.5762100410862812,
      "grad_norm": 11.705055236816406,
      "learning_rate": 4.4238400641346825e-05,
      "loss": 0.3083,
      "num_input_tokens_seen": 11776000,
      "step": 11500
    },
    {
      "epoch": 0.6012626515682934,
      "grad_norm": 1.2158238887786865,
      "learning_rate": 4.398787453652671e-05,
      "loss": 0.2983,
      "num_input_tokens_seen": 12288000,
      "step": 12000
    },
    {
      "epoch": 0.6263152620503056,
      "grad_norm": 13.371932029724121,
      "learning_rate": 4.373734843170659e-05,
      "loss": 0.3055,
      "num_input_tokens_seen": 12800000,
      "step": 12500
    },
    {
      "epoch": 0.6513678725323179,
      "grad_norm": 1.052199363708496,
      "learning_rate": 4.348682232688646e-05,
      "loss": 0.3096,
      "num_input_tokens_seen": 13312000,
      "step": 13000
    },
    {
      "epoch": 0.6764204830143301,
      "grad_norm": 1.528619408607483,
      "learning_rate": 4.3236296222066344e-05,
      "loss": 0.3024,
      "num_input_tokens_seen": 13824000,
      "step": 13500
    },
    {
      "epoch": 0.7014730934963423,
      "grad_norm": 7.829930305480957,
      "learning_rate": 4.298577011724622e-05,
      "loss": 0.296,
      "num_input_tokens_seen": 14336000,
      "step": 14000
    },
    {
      "epoch": 0.7265257039783546,
      "grad_norm": 13.035155296325684,
      "learning_rate": 4.273524401242609e-05,
      "loss": 0.3154,
      "num_input_tokens_seen": 14848000,
      "step": 14500
    },
    {
      "epoch": 0.7515783144603668,
      "grad_norm": 12.151269912719727,
      "learning_rate": 4.248471790760598e-05,
      "loss": 0.2988,
      "num_input_tokens_seen": 15360000,
      "step": 15000
    },
    {
      "epoch": 0.776630924942379,
      "grad_norm": 2.0840141773223877,
      "learning_rate": 4.223419180278585e-05,
      "loss": 0.2945,
      "num_input_tokens_seen": 15872000,
      "step": 15500
    },
    {
      "epoch": 0.8016835354243912,
      "grad_norm": 14.681863784790039,
      "learning_rate": 4.1983665697965726e-05,
      "loss": 0.3228,
      "num_input_tokens_seen": 16384000,
      "step": 16000
    },
    {
      "epoch": 0.8267361459064034,
      "grad_norm": 0.5756533741950989,
      "learning_rate": 4.173313959314561e-05,
      "loss": 0.3042,
      "num_input_tokens_seen": 16896000,
      "step": 16500
    },
    {
      "epoch": 0.8517887563884157,
      "grad_norm": 5.992170810699463,
      "learning_rate": 4.148261348832548e-05,
      "loss": 0.3052,
      "num_input_tokens_seen": 17408000,
      "step": 17000
    },
    {
      "epoch": 0.8768413668704279,
      "grad_norm": 0.5731572508811951,
      "learning_rate": 4.123208738350536e-05,
      "loss": 0.3027,
      "num_input_tokens_seen": 17920000,
      "step": 17500
    },
    {
      "epoch": 0.9018939773524401,
      "grad_norm": 4.941533088684082,
      "learning_rate": 4.0981561278685245e-05,
      "loss": 0.2897,
      "num_input_tokens_seen": 18432000,
      "step": 18000
    },
    {
      "epoch": 0.9269465878344524,
      "grad_norm": 2.07985520362854,
      "learning_rate": 4.0731035173865116e-05,
      "loss": 0.3088,
      "num_input_tokens_seen": 18944000,
      "step": 18500
    },
    {
      "epoch": 0.9519991983164646,
      "grad_norm": 0.6285837292671204,
      "learning_rate": 4.0480509069044994e-05,
      "loss": 0.3138,
      "num_input_tokens_seen": 19456000,
      "step": 19000
    },
    {
      "epoch": 0.9770518087984768,
      "grad_norm": 1.9885900020599365,
      "learning_rate": 4.022998296422487e-05,
      "loss": 0.2893,
      "num_input_tokens_seen": 19968000,
      "step": 19500
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9061278685239001,
      "eval_combined_score": 2.144811219173038,
      "eval_loss": 0.2926097810268402,
      "eval_runtime": 20.2505,
      "eval_samples_per_second": 1971.108,
      "eval_steps_per_second": 246.413,
      "num_input_tokens_seen": 20436992,
      "step": 19958
    },
    {
      "epoch": 1.002104419280489,
      "grad_norm": 0.5625438690185547,
      "learning_rate": 3.997945685940475e-05,
      "loss": 0.2873,
      "num_input_tokens_seen": 20480000,
      "step": 20000
    },
    {
      "epoch": 1.0271570297625012,
      "grad_norm": 19.315837860107422,
      "learning_rate": 3.9728930754584634e-05,
      "loss": 0.2627,
      "num_input_tokens_seen": 20992000,
      "step": 20500
    },
    {
      "epoch": 1.0522096402445136,
      "grad_norm": 0.16634128987789154,
      "learning_rate": 3.9478404649764506e-05,
      "loss": 0.2664,
      "num_input_tokens_seen": 21504000,
      "step": 21000
    },
    {
      "epoch": 1.0772622507265257,
      "grad_norm": 139.21690368652344,
      "learning_rate": 3.9227878544944383e-05,
      "loss": 0.2469,
      "num_input_tokens_seen": 22016000,
      "step": 21500
    },
    {
      "epoch": 1.1023148612085378,
      "grad_norm": 0.19860202074050903,
      "learning_rate": 3.897735244012427e-05,
      "loss": 0.2786,
      "num_input_tokens_seen": 22528000,
      "step": 22000
    },
    {
      "epoch": 1.1273674716905502,
      "grad_norm": 0.420663058757782,
      "learning_rate": 3.872682633530414e-05,
      "loss": 0.2355,
      "num_input_tokens_seen": 23040000,
      "step": 22500
    },
    {
      "epoch": 1.1524200821725623,
      "grad_norm": 7.762341022491455,
      "learning_rate": 3.847630023048402e-05,
      "loss": 0.266,
      "num_input_tokens_seen": 23552000,
      "step": 23000
    },
    {
      "epoch": 1.1774726926545747,
      "grad_norm": 8.320157051086426,
      "learning_rate": 3.8225774125663895e-05,
      "loss": 0.2599,
      "num_input_tokens_seen": 24064000,
      "step": 23500
    },
    {
      "epoch": 1.2025253031365868,
      "grad_norm": 6.601953506469727,
      "learning_rate": 3.797524802084377e-05,
      "loss": 0.2692,
      "num_input_tokens_seen": 24576000,
      "step": 24000
    },
    {
      "epoch": 1.2275779136185991,
      "grad_norm": 2.0535728931427,
      "learning_rate": 3.772472191602365e-05,
      "loss": 0.2487,
      "num_input_tokens_seen": 25088000,
      "step": 24500
    },
    {
      "epoch": 1.2526305241006113,
      "grad_norm": 0.2633844316005707,
      "learning_rate": 3.747419581120353e-05,
      "loss": 0.2659,
      "num_input_tokens_seen": 25600000,
      "step": 25000
    },
    {
      "epoch": 1.2776831345826234,
      "grad_norm": 2.607544183731079,
      "learning_rate": 3.722366970638341e-05,
      "loss": 0.2809,
      "num_input_tokens_seen": 26112000,
      "step": 25500
    },
    {
      "epoch": 1.3027357450646357,
      "grad_norm": 0.6017013788223267,
      "learning_rate": 3.6973143601563285e-05,
      "loss": 0.2651,
      "num_input_tokens_seen": 26624000,
      "step": 26000
    },
    {
      "epoch": 1.327788355546648,
      "grad_norm": 0.30723240971565247,
      "learning_rate": 3.672261749674316e-05,
      "loss": 0.2565,
      "num_input_tokens_seen": 27136000,
      "step": 26500
    },
    {
      "epoch": 1.3528409660286602,
      "grad_norm": 0.3843832015991211,
      "learning_rate": 3.647209139192304e-05,
      "loss": 0.2736,
      "num_input_tokens_seen": 27648000,
      "step": 27000
    },
    {
      "epoch": 1.3778935765106723,
      "grad_norm": 12.198064804077148,
      "learning_rate": 3.622156528710292e-05,
      "loss": 0.2591,
      "num_input_tokens_seen": 28160000,
      "step": 27500
    },
    {
      "epoch": 1.4029461869926847,
      "grad_norm": 1.8191192150115967,
      "learning_rate": 3.5971039182282796e-05,
      "loss": 0.2667,
      "num_input_tokens_seen": 28672000,
      "step": 28000
    },
    {
      "epoch": 1.4279987974746968,
      "grad_norm": 4.33354377746582,
      "learning_rate": 3.5720513077462674e-05,
      "loss": 0.2812,
      "num_input_tokens_seen": 29184000,
      "step": 28500
    },
    {
      "epoch": 1.4530514079567092,
      "grad_norm": 6.340269088745117,
      "learning_rate": 3.546998697264255e-05,
      "loss": 0.2691,
      "num_input_tokens_seen": 29696000,
      "step": 29000
    },
    {
      "epoch": 1.4781040184387213,
      "grad_norm": 12.591937065124512,
      "learning_rate": 3.521946086782243e-05,
      "loss": 0.2591,
      "num_input_tokens_seen": 30208000,
      "step": 29500
    },
    {
      "epoch": 1.5031566289207334,
      "grad_norm": 0.40031296014785767,
      "learning_rate": 3.496893476300231e-05,
      "loss": 0.2735,
      "num_input_tokens_seen": 30720000,
      "step": 30000
    },
    {
      "epoch": 1.5282092394027458,
      "grad_norm": 3.4099674224853516,
      "learning_rate": 3.4718408658182186e-05,
      "loss": 0.2737,
      "num_input_tokens_seen": 31232000,
      "step": 30500
    },
    {
      "epoch": 1.5532618498847581,
      "grad_norm": 1.1597915887832642,
      "learning_rate": 3.4467882553362064e-05,
      "loss": 0.2587,
      "num_input_tokens_seen": 31744000,
      "step": 31000
    },
    {
      "epoch": 1.5783144603667703,
      "grad_norm": 68.36583709716797,
      "learning_rate": 3.4217356448541935e-05,
      "loss": 0.2533,
      "num_input_tokens_seen": 32256000,
      "step": 31500
    },
    {
      "epoch": 1.6033670708487824,
      "grad_norm": 2.799591302871704,
      "learning_rate": 3.396683034372182e-05,
      "loss": 0.2779,
      "num_input_tokens_seen": 32768000,
      "step": 32000
    },
    {
      "epoch": 1.6284196813307945,
      "grad_norm": 0.5911589860916138,
      "learning_rate": 3.37163042389017e-05,
      "loss": 0.2427,
      "num_input_tokens_seen": 33280000,
      "step": 32500
    },
    {
      "epoch": 1.6534722918128069,
      "grad_norm": 11.752649307250977,
      "learning_rate": 3.346577813408157e-05,
      "loss": 0.2387,
      "num_input_tokens_seen": 33792000,
      "step": 33000
    },
    {
      "epoch": 1.6785249022948192,
      "grad_norm": 0.4207652807235718,
      "learning_rate": 3.3215252029261453e-05,
      "loss": 0.2415,
      "num_input_tokens_seen": 34304000,
      "step": 33500
    },
    {
      "epoch": 1.7035775127768313,
      "grad_norm": 7.099503993988037,
      "learning_rate": 3.296472592444133e-05,
      "loss": 0.2649,
      "num_input_tokens_seen": 34816000,
      "step": 34000
    },
    {
      "epoch": 1.7286301232588435,
      "grad_norm": 0.41005975008010864,
      "learning_rate": 3.27141998196212e-05,
      "loss": 0.2618,
      "num_input_tokens_seen": 35328000,
      "step": 34500
    },
    {
      "epoch": 1.7536827337408558,
      "grad_norm": 7.169194221496582,
      "learning_rate": 3.246367371480109e-05,
      "loss": 0.2497,
      "num_input_tokens_seen": 35840000,
      "step": 35000
    },
    {
      "epoch": 1.7787353442228682,
      "grad_norm": 2.8473153114318848,
      "learning_rate": 3.221314760998096e-05,
      "loss": 0.2706,
      "num_input_tokens_seen": 36352000,
      "step": 35500
    },
    {
      "epoch": 1.8037879547048803,
      "grad_norm": 37.23502731323242,
      "learning_rate": 3.1962621505160836e-05,
      "loss": 0.2643,
      "num_input_tokens_seen": 36864000,
      "step": 36000
    },
    {
      "epoch": 1.8288405651868924,
      "grad_norm": 0.3462938666343689,
      "learning_rate": 3.171209540034072e-05,
      "loss": 0.258,
      "num_input_tokens_seen": 37376000,
      "step": 36500
    },
    {
      "epoch": 1.8538931756689045,
      "grad_norm": 4.687111854553223,
      "learning_rate": 3.146156929552059e-05,
      "loss": 0.2555,
      "num_input_tokens_seen": 37888000,
      "step": 37000
    },
    {
      "epoch": 1.878945786150917,
      "grad_norm": 0.3295840919017792,
      "learning_rate": 3.121104319070047e-05,
      "loss": 0.2475,
      "num_input_tokens_seen": 38400000,
      "step": 37500
    },
    {
      "epoch": 1.9039983966329292,
      "grad_norm": 5.046384334564209,
      "learning_rate": 3.0960517085880355e-05,
      "loss": 0.2483,
      "num_input_tokens_seen": 38912000,
      "step": 38000
    },
    {
      "epoch": 1.9290510071149414,
      "grad_norm": 4.56272029876709,
      "learning_rate": 3.0709990981060226e-05,
      "loss": 0.2521,
      "num_input_tokens_seen": 39424000,
      "step": 38500
    },
    {
      "epoch": 1.9541036175969535,
      "grad_norm": 3.8051841259002686,
      "learning_rate": 3.0459464876240107e-05,
      "loss": 0.2805,
      "num_input_tokens_seen": 39936000,
      "step": 39000
    },
    {
      "epoch": 1.9791562280789659,
      "grad_norm": 0.6326732039451599,
      "learning_rate": 3.020893877141998e-05,
      "loss": 0.2397,
      "num_input_tokens_seen": 40448000,
      "step": 39500
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9075809199318569,
      "eval_combined_score": 2.1193511042245365,
      "eval_loss": 0.31268036365509033,
      "eval_runtime": 20.2588,
      "eval_samples_per_second": 1970.3,
      "eval_steps_per_second": 246.312,
      "num_input_tokens_seen": 40873984,
      "step": 39916
    },
    {
      "epoch": 2.004208838560978,
      "grad_norm": 6.647907733917236,
      "learning_rate": 2.995841266659986e-05,
      "loss": 0.2299,
      "num_input_tokens_seen": 40960000,
      "step": 40000
    },
    {
      "epoch": 2.0292614490429903,
      "grad_norm": 0.32763534784317017,
      "learning_rate": 2.970788656177974e-05,
      "loss": 0.1992,
      "num_input_tokens_seen": 41472000,
      "step": 40500
    },
    {
      "epoch": 2.0543140595250025,
      "grad_norm": 14.943070411682129,
      "learning_rate": 2.9457360456959615e-05,
      "loss": 0.2179,
      "num_input_tokens_seen": 41984000,
      "step": 41000
    },
    {
      "epoch": 2.0793666700070146,
      "grad_norm": 4.630057334899902,
      "learning_rate": 2.9206834352139493e-05,
      "loss": 0.2151,
      "num_input_tokens_seen": 42496000,
      "step": 41500
    },
    {
      "epoch": 2.104419280489027,
      "grad_norm": 15.34054183959961,
      "learning_rate": 2.8956308247319375e-05,
      "loss": 0.197,
      "num_input_tokens_seen": 43008000,
      "step": 42000
    },
    {
      "epoch": 2.1294718909710393,
      "grad_norm": 0.16923962533473969,
      "learning_rate": 2.870578214249925e-05,
      "loss": 0.1865,
      "num_input_tokens_seen": 43520000,
      "step": 42500
    },
    {
      "epoch": 2.1545245014530514,
      "grad_norm": 34.0042839050293,
      "learning_rate": 2.845525603767913e-05,
      "loss": 0.2122,
      "num_input_tokens_seen": 44032000,
      "step": 43000
    },
    {
      "epoch": 2.1795771119350635,
      "grad_norm": 5.353533744812012,
      "learning_rate": 2.8204729932859e-05,
      "loss": 0.2289,
      "num_input_tokens_seen": 44544000,
      "step": 43500
    },
    {
      "epoch": 2.2046297224170757,
      "grad_norm": 0.5980260372161865,
      "learning_rate": 2.7954203828038883e-05,
      "loss": 0.2175,
      "num_input_tokens_seen": 45056000,
      "step": 44000
    },
    {
      "epoch": 2.2296823328990882,
      "grad_norm": 12.995455741882324,
      "learning_rate": 2.7703677723218764e-05,
      "loss": 0.2337,
      "num_input_tokens_seen": 45568000,
      "step": 44500
    },
    {
      "epoch": 2.2547349433811004,
      "grad_norm": 0.11803791671991348,
      "learning_rate": 2.7453151618398635e-05,
      "loss": 0.2114,
      "num_input_tokens_seen": 46080000,
      "step": 45000
    },
    {
      "epoch": 2.2797875538631125,
      "grad_norm": 0.21874956786632538,
      "learning_rate": 2.7202625513578517e-05,
      "loss": 0.2215,
      "num_input_tokens_seen": 46592000,
      "step": 45500
    },
    {
      "epoch": 2.3048401643451246,
      "grad_norm": 42.951351165771484,
      "learning_rate": 2.6952099408758398e-05,
      "loss": 0.2296,
      "num_input_tokens_seen": 47104000,
      "step": 46000
    },
    {
      "epoch": 2.3298927748271367,
      "grad_norm": 3.324039936065674,
      "learning_rate": 2.6701573303938272e-05,
      "loss": 0.2118,
      "num_input_tokens_seen": 47616000,
      "step": 46500
    },
    {
      "epoch": 2.3549453853091493,
      "grad_norm": 0.8097792863845825,
      "learning_rate": 2.645104719911815e-05,
      "loss": 0.198,
      "num_input_tokens_seen": 48128000,
      "step": 47000
    },
    {
      "epoch": 2.3799979957911614,
      "grad_norm": 2.3140671253204346,
      "learning_rate": 2.6200521094298025e-05,
      "loss": 0.2251,
      "num_input_tokens_seen": 48640000,
      "step": 47500
    },
    {
      "epoch": 2.4050506062731736,
      "grad_norm": 5.793896675109863,
      "learning_rate": 2.5949994989477906e-05,
      "loss": 0.1942,
      "num_input_tokens_seen": 49152000,
      "step": 48000
    },
    {
      "epoch": 2.4301032167551857,
      "grad_norm": 0.08759485185146332,
      "learning_rate": 2.5699468884657784e-05,
      "loss": 0.2105,
      "num_input_tokens_seen": 49664000,
      "step": 48500
    },
    {
      "epoch": 2.4551558272371983,
      "grad_norm": 13.725948333740234,
      "learning_rate": 2.544894277983766e-05,
      "loss": 0.2174,
      "num_input_tokens_seen": 50176000,
      "step": 49000
    },
    {
      "epoch": 2.4802084377192104,
      "grad_norm": 0.24512171745300293,
      "learning_rate": 2.519841667501754e-05,
      "loss": 0.2213,
      "num_input_tokens_seen": 50688000,
      "step": 49500
    },
    {
      "epoch": 2.5052610482012225,
      "grad_norm": 24.33919906616211,
      "learning_rate": 2.4947890570197415e-05,
      "loss": 0.208,
      "num_input_tokens_seen": 51200000,
      "step": 50000
    },
    {
      "epoch": 2.5303136586832347,
      "grad_norm": 20.30912971496582,
      "learning_rate": 2.4697364465377292e-05,
      "loss": 0.2344,
      "num_input_tokens_seen": 51712000,
      "step": 50500
    },
    {
      "epoch": 2.555366269165247,
      "grad_norm": 0.27970781922340393,
      "learning_rate": 2.444683836055717e-05,
      "loss": 0.217,
      "num_input_tokens_seen": 52224000,
      "step": 51000
    },
    {
      "epoch": 2.5804188796472594,
      "grad_norm": 0.18607856333255768,
      "learning_rate": 2.4196312255737048e-05,
      "loss": 0.1942,
      "num_input_tokens_seen": 52736000,
      "step": 51500
    },
    {
      "epoch": 2.6054714901292715,
      "grad_norm": 3.2024385929107666,
      "learning_rate": 2.3945786150916926e-05,
      "loss": 0.2009,
      "num_input_tokens_seen": 53248000,
      "step": 52000
    },
    {
      "epoch": 2.6305241006112836,
      "grad_norm": 9.92158317565918,
      "learning_rate": 2.3695260046096804e-05,
      "loss": 0.1888,
      "num_input_tokens_seen": 53760000,
      "step": 52500
    },
    {
      "epoch": 2.655576711093296,
      "grad_norm": 9.307025909423828,
      "learning_rate": 2.3444733941276682e-05,
      "loss": 0.1927,
      "num_input_tokens_seen": 54272000,
      "step": 53000
    },
    {
      "epoch": 2.6806293215753083,
      "grad_norm": 32.159671783447266,
      "learning_rate": 2.319420783645656e-05,
      "loss": 0.213,
      "num_input_tokens_seen": 54784000,
      "step": 53500
    },
    {
      "epoch": 2.7056819320573204,
      "grad_norm": 11.267858505249023,
      "learning_rate": 2.2943681731636438e-05,
      "loss": 0.2264,
      "num_input_tokens_seen": 55296000,
      "step": 54000
    },
    {
      "epoch": 2.7307345425393326,
      "grad_norm": 0.18241587281227112,
      "learning_rate": 2.2693155626816316e-05,
      "loss": 0.2253,
      "num_input_tokens_seen": 55808000,
      "step": 54500
    },
    {
      "epoch": 2.7557871530213447,
      "grad_norm": 1.898651123046875,
      "learning_rate": 2.244262952199619e-05,
      "loss": 0.2112,
      "num_input_tokens_seen": 56320000,
      "step": 55000
    },
    {
      "epoch": 2.780839763503357,
      "grad_norm": 10.556557655334473,
      "learning_rate": 2.219210341717607e-05,
      "loss": 0.2157,
      "num_input_tokens_seen": 56832000,
      "step": 55500
    },
    {
      "epoch": 2.8058923739853694,
      "grad_norm": 1.2913810014724731,
      "learning_rate": 2.194157731235595e-05,
      "loss": 0.2218,
      "num_input_tokens_seen": 57344000,
      "step": 56000
    },
    {
      "epoch": 2.8309449844673815,
      "grad_norm": 20.129615783691406,
      "learning_rate": 2.1691051207535827e-05,
      "loss": 0.2205,
      "num_input_tokens_seen": 57856000,
      "step": 56500
    },
    {
      "epoch": 2.8559975949493936,
      "grad_norm": 0.3709011971950531,
      "learning_rate": 2.1440525102715702e-05,
      "loss": 0.2288,
      "num_input_tokens_seen": 58368000,
      "step": 57000
    },
    {
      "epoch": 2.881050205431406,
      "grad_norm": 24.663593292236328,
      "learning_rate": 2.1189998997895583e-05,
      "loss": 0.2225,
      "num_input_tokens_seen": 58880000,
      "step": 57500
    },
    {
      "epoch": 2.9061028159134183,
      "grad_norm": 8.534331321716309,
      "learning_rate": 2.093947289307546e-05,
      "loss": 0.2236,
      "num_input_tokens_seen": 59392000,
      "step": 58000
    },
    {
      "epoch": 2.9311554263954305,
      "grad_norm": 23.226032257080078,
      "learning_rate": 2.0688946788255336e-05,
      "loss": 0.2111,
      "num_input_tokens_seen": 59904000,
      "step": 58500
    },
    {
      "epoch": 2.9562080368774426,
      "grad_norm": 16.948610305786133,
      "learning_rate": 2.0438420683435214e-05,
      "loss": 0.2123,
      "num_input_tokens_seen": 60416000,
      "step": 59000
    },
    {
      "epoch": 2.9812606473594547,
      "grad_norm": 3.7574212551116943,
      "learning_rate": 2.0187894578615095e-05,
      "loss": 0.2,
      "num_input_tokens_seen": 60928000,
      "step": 59500
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9109379697364466,
      "eval_combined_score": 2.0605294593435146,
      "eval_loss": 0.32792404294013977,
      "eval_runtime": 20.2718,
      "eval_samples_per_second": 1969.037,
      "eval_steps_per_second": 246.154,
      "num_input_tokens_seen": 61310976,
      "step": 59874
    },
    {
      "epoch": 3.0063132578414673,
      "grad_norm": 2.2473177909851074,
      "learning_rate": 1.993736847379497e-05,
      "loss": 0.1933,
      "num_input_tokens_seen": 61440000,
      "step": 60000
    },
    {
      "epoch": 3.0313658683234794,
      "grad_norm": 0.11083228886127472,
      "learning_rate": 1.9686842368974847e-05,
      "loss": 0.1592,
      "num_input_tokens_seen": 61952000,
      "step": 60500
    },
    {
      "epoch": 3.0564184788054916,
      "grad_norm": 0.06714469939470291,
      "learning_rate": 1.9436316264154725e-05,
      "loss": 0.158,
      "num_input_tokens_seen": 62464000,
      "step": 61000
    },
    {
      "epoch": 3.0814710892875037,
      "grad_norm": 0.41380876302719116,
      "learning_rate": 1.9185790159334603e-05,
      "loss": 0.1591,
      "num_input_tokens_seen": 62976000,
      "step": 61500
    },
    {
      "epoch": 3.106523699769516,
      "grad_norm": 11.745950698852539,
      "learning_rate": 1.893526405451448e-05,
      "loss": 0.1687,
      "num_input_tokens_seen": 63488000,
      "step": 62000
    },
    {
      "epoch": 3.1315763102515284,
      "grad_norm": 0.10439453274011612,
      "learning_rate": 1.868473794969436e-05,
      "loss": 0.1773,
      "num_input_tokens_seen": 64000000,
      "step": 62500
    },
    {
      "epoch": 3.1566289207335405,
      "grad_norm": 0.05308441445231438,
      "learning_rate": 1.8434211844874237e-05,
      "loss": 0.1729,
      "num_input_tokens_seen": 64512000,
      "step": 63000
    },
    {
      "epoch": 3.1816815312155526,
      "grad_norm": 2.2679662704467773,
      "learning_rate": 1.8183685740054115e-05,
      "loss": 0.1428,
      "num_input_tokens_seen": 65024000,
      "step": 63500
    },
    {
      "epoch": 3.2067341416975648,
      "grad_norm": 0.18617786467075348,
      "learning_rate": 1.7933159635233993e-05,
      "loss": 0.162,
      "num_input_tokens_seen": 65536000,
      "step": 64000
    },
    {
      "epoch": 3.231786752179577,
      "grad_norm": 0.09589721262454987,
      "learning_rate": 1.768263353041387e-05,
      "loss": 0.1425,
      "num_input_tokens_seen": 66048000,
      "step": 64500
    },
    {
      "epoch": 3.2568393626615895,
      "grad_norm": 0.03517961502075195,
      "learning_rate": 1.7432107425593745e-05,
      "loss": 0.1709,
      "num_input_tokens_seen": 66560000,
      "step": 65000
    },
    {
      "epoch": 3.2818919731436016,
      "grad_norm": 0.16013863682746887,
      "learning_rate": 1.7181581320773626e-05,
      "loss": 0.1671,
      "num_input_tokens_seen": 67072000,
      "step": 65500
    },
    {
      "epoch": 3.3069445836256137,
      "grad_norm": 0.9810895323753357,
      "learning_rate": 1.6931055215953504e-05,
      "loss": 0.1552,
      "num_input_tokens_seen": 67584000,
      "step": 66000
    },
    {
      "epoch": 3.331997194107626,
      "grad_norm": 0.12127078324556351,
      "learning_rate": 1.668052911113338e-05,
      "loss": 0.1752,
      "num_input_tokens_seen": 68096000,
      "step": 66500
    },
    {
      "epoch": 3.3570498045896384,
      "grad_norm": 0.22286617755889893,
      "learning_rate": 1.6430003006313257e-05,
      "loss": 0.1761,
      "num_input_tokens_seen": 68608000,
      "step": 67000
    },
    {
      "epoch": 3.3821024150716505,
      "grad_norm": 35.76771545410156,
      "learning_rate": 1.6179476901493138e-05,
      "loss": 0.1687,
      "num_input_tokens_seen": 69120000,
      "step": 67500
    },
    {
      "epoch": 3.4071550255536627,
      "grad_norm": 0.16311609745025635,
      "learning_rate": 1.5928950796673016e-05,
      "loss": 0.1448,
      "num_input_tokens_seen": 69632000,
      "step": 68000
    },
    {
      "epoch": 3.432207636035675,
      "grad_norm": 0.10213588923215866,
      "learning_rate": 1.567842469185289e-05,
      "loss": 0.1668,
      "num_input_tokens_seen": 70144000,
      "step": 68500
    },
    {
      "epoch": 3.457260246517687,
      "grad_norm": 0.04066482558846474,
      "learning_rate": 1.542789858703277e-05,
      "loss": 0.1556,
      "num_input_tokens_seen": 70656000,
      "step": 69000
    },
    {
      "epoch": 3.4823128569996995,
      "grad_norm": 2.4263927936553955,
      "learning_rate": 1.5177372482212648e-05,
      "loss": 0.1692,
      "num_input_tokens_seen": 71168000,
      "step": 69500
    },
    {
      "epoch": 3.5073654674817116,
      "grad_norm": 0.2218380570411682,
      "learning_rate": 1.4926846377392526e-05,
      "loss": 0.1552,
      "num_input_tokens_seen": 71680000,
      "step": 70000
    },
    {
      "epoch": 3.5324180779637238,
      "grad_norm": 0.08358863741159439,
      "learning_rate": 1.4676320272572402e-05,
      "loss": 0.1602,
      "num_input_tokens_seen": 72192000,
      "step": 70500
    },
    {
      "epoch": 3.557470688445736,
      "grad_norm": 0.16968253254890442,
      "learning_rate": 1.442579416775228e-05,
      "loss": 0.1651,
      "num_input_tokens_seen": 72704000,
      "step": 71000
    },
    {
      "epoch": 3.5825232989277485,
      "grad_norm": 20.67737579345703,
      "learning_rate": 1.417526806293216e-05,
      "loss": 0.173,
      "num_input_tokens_seen": 73216000,
      "step": 71500
    },
    {
      "epoch": 3.6075759094097606,
      "grad_norm": 15.341917991638184,
      "learning_rate": 1.3924741958112036e-05,
      "loss": 0.1566,
      "num_input_tokens_seen": 73728000,
      "step": 72000
    },
    {
      "epoch": 3.6326285198917727,
      "grad_norm": 0.07673631608486176,
      "learning_rate": 1.3674215853291914e-05,
      "loss": 0.1765,
      "num_input_tokens_seen": 74240000,
      "step": 72500
    },
    {
      "epoch": 3.657681130373785,
      "grad_norm": 0.21996235847473145,
      "learning_rate": 1.342368974847179e-05,
      "loss": 0.1461,
      "num_input_tokens_seen": 74752000,
      "step": 73000
    },
    {
      "epoch": 3.682733740855797,
      "grad_norm": 183.6659393310547,
      "learning_rate": 1.317316364365167e-05,
      "loss": 0.1706,
      "num_input_tokens_seen": 75264000,
      "step": 73500
    },
    {
      "epoch": 3.7077863513378095,
      "grad_norm": 0.1564781218767166,
      "learning_rate": 1.2922637538831548e-05,
      "loss": 0.159,
      "num_input_tokens_seen": 75776000,
      "step": 74000
    },
    {
      "epoch": 3.7328389618198217,
      "grad_norm": 8.662553787231445,
      "learning_rate": 1.2672111434011424e-05,
      "loss": 0.1737,
      "num_input_tokens_seen": 76288000,
      "step": 74500
    },
    {
      "epoch": 3.757891572301834,
      "grad_norm": 331.7611999511719,
      "learning_rate": 1.2421585329191303e-05,
      "loss": 0.1603,
      "num_input_tokens_seen": 76800000,
      "step": 75000
    },
    {
      "epoch": 3.782944182783846,
      "grad_norm": 0.09944739192724228,
      "learning_rate": 1.217105922437118e-05,
      "loss": 0.1508,
      "num_input_tokens_seen": 77312000,
      "step": 75500
    },
    {
      "epoch": 3.8079967932658585,
      "grad_norm": 0.24497084319591522,
      "learning_rate": 1.1920533119551058e-05,
      "loss": 0.1595,
      "num_input_tokens_seen": 77824000,
      "step": 76000
    },
    {
      "epoch": 3.8330494037478706,
      "grad_norm": 3.5547239780426025,
      "learning_rate": 1.1670007014730936e-05,
      "loss": 0.1595,
      "num_input_tokens_seen": 78336000,
      "step": 76500
    },
    {
      "epoch": 3.8581020142298827,
      "grad_norm": 0.3509676456451416,
      "learning_rate": 1.1419480909910813e-05,
      "loss": 0.1606,
      "num_input_tokens_seen": 78848000,
      "step": 77000
    },
    {
      "epoch": 3.883154624711895,
      "grad_norm": 19.29859161376953,
      "learning_rate": 1.1168954805090691e-05,
      "loss": 0.1722,
      "num_input_tokens_seen": 79360000,
      "step": 77500
    },
    {
      "epoch": 3.908207235193907,
      "grad_norm": 0.9797153472900391,
      "learning_rate": 1.091842870027057e-05,
      "loss": 0.159,
      "num_input_tokens_seen": 79872000,
      "step": 78000
    },
    {
      "epoch": 3.9332598456759196,
      "grad_norm": 24.612085342407227,
      "learning_rate": 1.0667902595450446e-05,
      "loss": 0.1656,
      "num_input_tokens_seen": 80384000,
      "step": 78500
    },
    {
      "epoch": 3.9583124561579317,
      "grad_norm": 42.16061782836914,
      "learning_rate": 1.0417376490630325e-05,
      "loss": 0.1494,
      "num_input_tokens_seen": 80896000,
      "step": 79000
    },
    {
      "epoch": 3.983365066639944,
      "grad_norm": 0.244501531124115,
      "learning_rate": 1.0166850385810201e-05,
      "loss": 0.1576,
      "num_input_tokens_seen": 81408000,
      "step": 79500
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9080068143100511,
      "eval_combined_score": 2.11188865673963,
      "eval_loss": 0.3887489140033722,
      "eval_runtime": 20.3698,
      "eval_samples_per_second": 1959.563,
      "eval_steps_per_second": 244.97,
      "num_input_tokens_seen": 81747968,
      "step": 79832
    },
    {
      "epoch": 4.008417677121956,
      "grad_norm": 0.1654539704322815,
      "learning_rate": 9.916324280990081e-06,
      "loss": 0.1449,
      "num_input_tokens_seen": 81920000,
      "step": 80000
    },
    {
      "epoch": 4.0334702876039685,
      "grad_norm": 0.1446412056684494,
      "learning_rate": 9.665798176169957e-06,
      "loss": 0.1065,
      "num_input_tokens_seen": 82432000,
      "step": 80500
    },
    {
      "epoch": 4.058522898085981,
      "grad_norm": 0.15385593473911285,
      "learning_rate": 9.415272071349835e-06,
      "loss": 0.1173,
      "num_input_tokens_seen": 82944000,
      "step": 81000
    },
    {
      "epoch": 4.083575508567993,
      "grad_norm": 0.1097198873758316,
      "learning_rate": 9.164745966529713e-06,
      "loss": 0.122,
      "num_input_tokens_seen": 83456000,
      "step": 81500
    },
    {
      "epoch": 4.108628119050005,
      "grad_norm": 140.7417449951172,
      "learning_rate": 8.914219861709591e-06,
      "loss": 0.1334,
      "num_input_tokens_seen": 83968000,
      "step": 82000
    },
    {
      "epoch": 4.133680729532017,
      "grad_norm": 0.07995349913835526,
      "learning_rate": 8.663693756889469e-06,
      "loss": 0.1099,
      "num_input_tokens_seen": 84480000,
      "step": 82500
    },
    {
      "epoch": 4.158733340014029,
      "grad_norm": 23.00501823425293,
      "learning_rate": 8.413167652069347e-06,
      "loss": 0.1362,
      "num_input_tokens_seen": 84992000,
      "step": 83000
    },
    {
      "epoch": 4.183785950496041,
      "grad_norm": 0.17540641129016876,
      "learning_rate": 8.162641547249223e-06,
      "loss": 0.1119,
      "num_input_tokens_seen": 85504000,
      "step": 83500
    },
    {
      "epoch": 4.208838560978054,
      "grad_norm": 10.598029136657715,
      "learning_rate": 7.912115442429101e-06,
      "loss": 0.1174,
      "num_input_tokens_seen": 86016000,
      "step": 84000
    },
    {
      "epoch": 4.233891171460066,
      "grad_norm": 0.06076182797551155,
      "learning_rate": 7.661589337608979e-06,
      "loss": 0.104,
      "num_input_tokens_seen": 86528000,
      "step": 84500
    },
    {
      "epoch": 4.258943781942079,
      "grad_norm": 0.2133261263370514,
      "learning_rate": 7.411063232788856e-06,
      "loss": 0.1229,
      "num_input_tokens_seen": 87040000,
      "step": 85000
    },
    {
      "epoch": 4.283996392424091,
      "grad_norm": 8.342382431030273,
      "learning_rate": 7.160537127968735e-06,
      "loss": 0.1254,
      "num_input_tokens_seen": 87552000,
      "step": 85500
    },
    {
      "epoch": 4.309049002906103,
      "grad_norm": 0.015887776389718056,
      "learning_rate": 6.910011023148612e-06,
      "loss": 0.1308,
      "num_input_tokens_seen": 88064000,
      "step": 86000
    },
    {
      "epoch": 4.334101613388115,
      "grad_norm": 0.24978305399417877,
      "learning_rate": 6.6594849183284905e-06,
      "loss": 0.1106,
      "num_input_tokens_seen": 88576000,
      "step": 86500
    },
    {
      "epoch": 4.359154223870127,
      "grad_norm": 0.19210007786750793,
      "learning_rate": 6.4089588135083675e-06,
      "loss": 0.1257,
      "num_input_tokens_seen": 89088000,
      "step": 87000
    },
    {
      "epoch": 4.384206834352139,
      "grad_norm": 10.589780807495117,
      "learning_rate": 6.1584327086882454e-06,
      "loss": 0.1238,
      "num_input_tokens_seen": 89600000,
      "step": 87500
    },
    {
      "epoch": 4.409259444834151,
      "grad_norm": 38.77216720581055,
      "learning_rate": 5.907906603868123e-06,
      "loss": 0.1213,
      "num_input_tokens_seen": 90112000,
      "step": 88000
    },
    {
      "epoch": 4.434312055316164,
      "grad_norm": 0.1937304437160492,
      "learning_rate": 5.657380499048001e-06,
      "loss": 0.1346,
      "num_input_tokens_seen": 90624000,
      "step": 88500
    },
    {
      "epoch": 4.4593646657981765,
      "grad_norm": 0.023678578436374664,
      "learning_rate": 5.406854394227879e-06,
      "loss": 0.1013,
      "num_input_tokens_seen": 91136000,
      "step": 89000
    },
    {
      "epoch": 4.484417276280189,
      "grad_norm": 7.357041358947754,
      "learning_rate": 5.156328289407756e-06,
      "loss": 0.1353,
      "num_input_tokens_seen": 91648000,
      "step": 89500
    },
    {
      "epoch": 4.509469886762201,
      "grad_norm": 0.0861930251121521,
      "learning_rate": 4.905802184587634e-06,
      "loss": 0.1188,
      "num_input_tokens_seen": 92160000,
      "step": 90000
    },
    {
      "epoch": 4.534522497244213,
      "grad_norm": 0.05523020401597023,
      "learning_rate": 4.655276079767512e-06,
      "loss": 0.0998,
      "num_input_tokens_seen": 92672000,
      "step": 90500
    },
    {
      "epoch": 4.559575107726225,
      "grad_norm": 35.0329704284668,
      "learning_rate": 4.40474997494739e-06,
      "loss": 0.1286,
      "num_input_tokens_seen": 93184000,
      "step": 91000
    },
    {
      "epoch": 4.584627718208237,
      "grad_norm": 0.04070122167468071,
      "learning_rate": 4.154223870127268e-06,
      "loss": 0.116,
      "num_input_tokens_seen": 93696000,
      "step": 91500
    },
    {
      "epoch": 4.609680328690249,
      "grad_norm": 6.631749153137207,
      "learning_rate": 3.903697765307145e-06,
      "loss": 0.1079,
      "num_input_tokens_seen": 94208000,
      "step": 92000
    },
    {
      "epoch": 4.634732939172261,
      "grad_norm": 38.14702606201172,
      "learning_rate": 3.653171660487023e-06,
      "loss": 0.1338,
      "num_input_tokens_seen": 94720000,
      "step": 92500
    },
    {
      "epoch": 4.6597855496542735,
      "grad_norm": 0.04164925217628479,
      "learning_rate": 3.402645555666901e-06,
      "loss": 0.0951,
      "num_input_tokens_seen": 95232000,
      "step": 93000
    },
    {
      "epoch": 4.6848381601362865,
      "grad_norm": 0.12932783365249634,
      "learning_rate": 3.1521194508467787e-06,
      "loss": 0.123,
      "num_input_tokens_seen": 95744000,
      "step": 93500
    },
    {
      "epoch": 4.709890770618299,
      "grad_norm": 0.41988006234169006,
      "learning_rate": 2.901593346026656e-06,
      "loss": 0.1316,
      "num_input_tokens_seen": 96256000,
      "step": 94000
    },
    {
      "epoch": 4.734943381100311,
      "grad_norm": 0.2050684094429016,
      "learning_rate": 2.6510672412065337e-06,
      "loss": 0.1193,
      "num_input_tokens_seen": 96768000,
      "step": 94500
    },
    {
      "epoch": 4.759995991582323,
      "grad_norm": 0.08065121620893478,
      "learning_rate": 2.4005411363864116e-06,
      "loss": 0.1069,
      "num_input_tokens_seen": 97280000,
      "step": 95000
    },
    {
      "epoch": 4.785048602064335,
      "grad_norm": 0.057149503380060196,
      "learning_rate": 2.150015031566289e-06,
      "loss": 0.1089,
      "num_input_tokens_seen": 97792000,
      "step": 95500
    },
    {
      "epoch": 4.810101212546347,
      "grad_norm": 0.12982851266860962,
      "learning_rate": 1.8994889267461668e-06,
      "loss": 0.1025,
      "num_input_tokens_seen": 98304000,
      "step": 96000
    },
    {
      "epoch": 4.835153823028359,
      "grad_norm": 0.09585094451904297,
      "learning_rate": 1.6489628219260447e-06,
      "loss": 0.1276,
      "num_input_tokens_seen": 98816000,
      "step": 96500
    },
    {
      "epoch": 4.860206433510371,
      "grad_norm": 0.06750782579183578,
      "learning_rate": 1.3984367171059227e-06,
      "loss": 0.13,
      "num_input_tokens_seen": 99328000,
      "step": 97000
    },
    {
      "epoch": 4.885259043992384,
      "grad_norm": 0.16248978674411774,
      "learning_rate": 1.1479106122858004e-06,
      "loss": 0.125,
      "num_input_tokens_seen": 99840000,
      "step": 97500
    },
    {
      "epoch": 4.9103116544743965,
      "grad_norm": 0.10622742027044296,
      "learning_rate": 8.973845074656779e-07,
      "loss": 0.1212,
      "num_input_tokens_seen": 100352000,
      "step": 98000
    },
    {
      "epoch": 4.935364264956409,
      "grad_norm": 0.0945580005645752,
      "learning_rate": 6.468584026455557e-07,
      "loss": 0.1094,
      "num_input_tokens_seen": 100864000,
      "step": 98500
    },
    {
      "epoch": 4.960416875438421,
      "grad_norm": 0.2841167449951172,
      "learning_rate": 3.9633229782543347e-07,
      "loss": 0.1056,
      "num_input_tokens_seen": 101376000,
      "step": 99000
    },
    {
      "epoch": 4.985469485920433,
      "grad_norm": 0.04773109778761864,
      "learning_rate": 1.4580619300531115e-07,
      "loss": 0.1127,
      "num_input_tokens_seen": 101888000,
      "step": 99500
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9069295520593246,
      "eval_combined_score": 2.1307642592014506,
      "eval_loss": 0.4687708020210266,
      "eval_runtime": 20.4238,
      "eval_samples_per_second": 1954.382,
      "eval_steps_per_second": 244.322,
      "num_input_tokens_seen": 102184960,
      "step": 99790
    },
    {
      "epoch": 5.0,
      "num_input_tokens_seen": 102184960,
      "step": 99790,
      "total_flos": 1.314687947575296e+16,
      "train_loss": 0.2132515576335182,
      "train_runtime": 1882.7905,
      "train_samples_per_second": 424.009,
      "train_steps_per_second": 53.001,
      "train_tokens_per_second": 54273.145
    }
  ],
  "logging_steps": 500,
  "max_steps": 99790,
  "num_input_tokens_seen": 102184960,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.314687947575296e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}