{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 1960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 0.0004999678864499828, "loss": 1.9845, "step": 10 }, { "epoch": 0.08, "learning_rate": 0.000499871554050172, "loss": 1.8444, "step": 20 }, { "epoch": 0.12, "learning_rate": 0.0004997110275491702, "loss": 1.721, "step": 30 }, { "epoch": 0.16, "learning_rate": 0.0004994863481875841, "loss": 1.6755, "step": 40 }, { "epoch": 0.2, "learning_rate": 0.0004991975736874289, "loss": 1.7248, "step": 50 }, { "epoch": 0.24, "learning_rate": 0.0004988447782372996, "loss": 1.6803, "step": 60 }, { "epoch": 0.29, "learning_rate": 0.0004984280524733107, "loss": 1.7063, "step": 70 }, { "epoch": 0.33, "learning_rate": 0.0004979475034558115, "loss": 1.6724, "step": 80 }, { "epoch": 0.37, "learning_rate": 0.0004974032546418815, "loss": 1.6933, "step": 90 }, { "epoch": 0.41, "learning_rate": 0.0004967954458536126, "loss": 1.6243, "step": 100 }, { "epoch": 0.45, "learning_rate": 0.0004961242332421882, "loss": 1.639, "step": 110 }, { "epoch": 0.49, "learning_rate": 0.0004953897892477664, "loss": 1.6545, "step": 120 }, { "epoch": 0.53, "learning_rate": 0.0004945923025551788, "loss": 1.6017, "step": 130 }, { "epoch": 0.57, "learning_rate": 0.0004937319780454559, "loss": 1.6522, "step": 140 }, { "epoch": 0.61, "learning_rate": 0.000492809036743191, "loss": 1.5948, "step": 150 }, { "epoch": 0.65, "learning_rate": 0.0004918237157597574, "loss": 1.5385, "step": 160 }, { "epoch": 0.69, "learning_rate": 0.0004907762682323926, "loss": 1.6132, "step": 170 }, { "epoch": 0.73, "learning_rate": 0.0004896669632591652, "loss": 1.5396, "step": 180 }, { "epoch": 0.78, "learning_rate": 0.000488496085829841, "loss": 1.5023, "step": 190 }, { "epoch": 0.82, "learning_rate": 0.0004872639367526672, "loss": 1.4937, "step": 200 }, { "epoch": 0.86, "learning_rate": 0.0004859708325770919, "loss": 1.4824, "step": 210 }, { "epoch": 0.9, "learning_rate": 0.0004846171055124401, "loss": 1.5506, "step": 220 }, { "epoch": 0.94, "learning_rate": 0.00048320310334256625, "loss": 1.5709, "step": 230 }, { "epoch": 0.98, "learning_rate": 0.0004817291893365054, "loss": 1.5531, "step": 240 }, { "epoch": 1.02, "learning_rate": 0.00048019574215514706, "loss": 1.4546, "step": 250 }, { "epoch": 1.06, "learning_rate": 0.0004786031557539531, "loss": 1.4281, "step": 260 }, { "epoch": 1.1, "learning_rate": 0.00047695183928174803, "loss": 1.4339, "step": 270 }, { "epoch": 1.14, "learning_rate": 0.00047524221697560476, "loss": 1.3993, "step": 280 }, { "epoch": 1.18, "learning_rate": 0.0004734747280518549, "loss": 1.3786, "step": 290 }, { "epoch": 1.22, "learning_rate": 0.00047164982659325005, "loss": 1.458, "step": 300 }, { "epoch": 1.27, "learning_rate": 0.0004697679814323043, "loss": 1.3692, "step": 310 }, { "epoch": 1.31, "learning_rate": 0.00046782967603084736, "loss": 1.491, "step": 320 }, { "epoch": 1.35, "learning_rate": 0.00046583540835581883, "loss": 1.3637, "step": 330 }, { "epoch": 1.39, "learning_rate": 0.0004637856907513366, "loss": 1.4276, "step": 340 }, { "epoch": 1.43, "learning_rate": 0.00046168104980707104, "loss": 1.3943, "step": 350 }, { "epoch": 1.47, "learning_rate": 0.00045952202622296013, "loss": 1.4274, "step": 360 }, { "epoch": 1.51, "learning_rate": 0.00045730917467029877, "loss": 1.4829, "step": 370 }, { "epoch": 1.55, "learning_rate": 0.00045504306364923896, "loss": 1.3585, "step": 380 }, { "epoch": 1.59, "learning_rate": 0.00045272427534273776, "loss": 1.4194, "step": 390 }, { "epoch": 1.63, "learning_rate": 0.00045035340546698916, "loss": 1.3924, "step": 400 }, { "epoch": 1.67, "learning_rate": 0.0004479310631183799, "loss": 1.4117, "step": 410 }, { "epoch": 1.71, "learning_rate": 0.0004454578706170075, "loss": 1.3849, "step": 420 }, { "epoch": 1.76, "learning_rate": 0.0004429344633468004, "loss": 1.3857, "step": 430 }, { "epoch": 1.8, "learning_rate": 0.0004403614895922836, "loss": 1.4772, "step": 440 }, { "epoch": 1.84, "learning_rate": 0.0004377396103720278, "loss": 1.4021, "step": 450 }, { "epoch": 1.88, "learning_rate": 0.00043506949926882887, "loss": 1.3589, "step": 460 }, { "epoch": 1.92, "learning_rate": 0.0004323518422566586, "loss": 1.45, "step": 470 }, { "epoch": 1.96, "learning_rate": 0.0004295873375244319, "loss": 1.2784, "step": 480 }, { "epoch": 2.0, "learning_rate": 0.00042677669529663686, "loss": 1.4127, "step": 490 }, { "epoch": 2.04, "learning_rate": 0.0004239206376508716, "loss": 1.2777, "step": 500 }, { "epoch": 2.08, "learning_rate": 0.00042101989833233654, "loss": 1.1873, "step": 510 }, { "epoch": 2.12, "learning_rate": 0.0004180752225653292, "loss": 1.2746, "step": 520 }, { "epoch": 2.16, "learning_rate": 0.0004150873668617898, "loss": 1.308, "step": 530 }, { "epoch": 2.2, "learning_rate": 0.00041205709882694713, "loss": 1.2431, "step": 540 }, { "epoch": 2.24, "learning_rate": 0.0004089851969621138, "loss": 1.3411, "step": 550 }, { "epoch": 2.29, "learning_rate": 0.0004058724504646834, "loss": 1.2369, "step": 560 }, { "epoch": 2.33, "learning_rate": 0.0004027196590253786, "loss": 1.303, "step": 570 }, { "epoch": 2.37, "learning_rate": 0.000399527632622804, "loss": 1.2011, "step": 580 }, { "epoch": 2.41, "learning_rate": 0.0003962971913153559, "loss": 1.2239, "step": 590 }, { "epoch": 2.45, "learning_rate": 0.00039302916503054243, "loss": 1.2133, "step": 600 }, { "epoch": 2.49, "learning_rate": 0.0003897243933517679, "loss": 1.2108, "step": 610 }, { "epoch": 2.53, "learning_rate": 0.00038638372530263714, "loss": 1.3095, "step": 620 }, { "epoch": 2.57, "learning_rate": 0.00038300801912883415, "loss": 1.2392, "step": 630 }, { "epoch": 2.61, "learning_rate": 0.0003795981420776313, "loss": 1.1789, "step": 640 }, { "epoch": 2.65, "learning_rate": 0.0003761549701750865, "loss": 1.2285, "step": 650 }, { "epoch": 2.69, "learning_rate": 0.0003726793880009845, "loss": 1.2101, "step": 660 }, { "epoch": 2.73, "learning_rate": 0.00036917228846158136, "loss": 1.1973, "step": 670 }, { "epoch": 2.78, "learning_rate": 0.00036563457256020887, "loss": 1.2238, "step": 680 }, { "epoch": 2.82, "learning_rate": 0.0003620671491657992, "loss": 1.2249, "step": 690 }, { "epoch": 2.86, "learning_rate": 0.00035847093477938953, "loss": 1.2314, "step": 700 }, { "epoch": 2.9, "learning_rate": 0.00035484685329866423, "loss": 1.2987, "step": 710 }, { "epoch": 2.94, "learning_rate": 0.00035119583578059843, "loss": 1.2416, "step": 720 }, { "epoch": 2.98, "learning_rate": 0.00034751882020226174, "loss": 1.175, "step": 730 }, { "epoch": 3.02, "learning_rate": 0.0003438167512198436, "loss": 1.2123, "step": 740 }, { "epoch": 3.06, "learning_rate": 0.00034009057992596335, "loss": 1.0257, "step": 750 }, { "epoch": 3.1, "learning_rate": 0.0003363412636053269, "loss": 1.118, "step": 760 }, { "epoch": 3.14, "learning_rate": 0.00033256976548879183, "loss": 1.1586, "step": 770 }, { "epoch": 3.18, "learning_rate": 0.0003287770545059052, "loss": 1.0559, "step": 780 }, { "epoch": 3.22, "learning_rate": 0.0003249641050359779, "loss": 1.0775, "step": 790 }, { "epoch": 3.27, "learning_rate": 0.0003211318966577581, "loss": 1.0905, "step": 800 }, { "epoch": 3.31, "learning_rate": 0.00031728141389776923, "loss": 1.088, "step": 810 }, { "epoch": 3.35, "learning_rate": 0.0003134136459773768, "loss": 1.0395, "step": 820 }, { "epoch": 3.39, "learning_rate": 0.00030952958655864954, "loss": 1.1064, "step": 830 }, { "epoch": 3.43, "learning_rate": 0.0003056302334890786, "loss": 1.0662, "step": 840 }, { "epoch": 3.47, "learning_rate": 0.0003017165885452227, "loss": 1.0627, "step": 850 }, { "epoch": 3.51, "learning_rate": 0.00029778965717534313, "loss": 0.9968, "step": 860 }, { "epoch": 3.55, "learning_rate": 0.0002938504482410954, "loss": 1.1237, "step": 870 }, { "epoch": 3.59, "learning_rate": 0.00028989997375834483, "loss": 1.0432, "step": 880 }, { "epoch": 3.63, "learning_rate": 0.00028593924863717045, "loss": 1.1197, "step": 890 }, { "epoch": 3.67, "learning_rate": 0.0002819692904211265, "loss": 1.0016, "step": 900 }, { "epoch": 3.71, "learning_rate": 0.00027799111902582696, "loss": 1.0956, "step": 910 }, { "epoch": 3.76, "learning_rate": 0.00027400575647692046, "loss": 1.0878, "step": 920 }, { "epoch": 3.8, "learning_rate": 0.00027001422664752335, "loss": 1.0838, "step": 930 }, { "epoch": 3.84, "learning_rate": 0.00026601755499517824, "loss": 1.0587, "step": 940 }, { "epoch": 3.88, "learning_rate": 0.0002620167682984052, "loss": 1.0857, "step": 950 }, { "epoch": 3.92, "learning_rate": 0.00025801289439291385, "loss": 1.0107, "step": 960 }, { "epoch": 3.96, "learning_rate": 0.00025400696190754345, "loss": 1.1491, "step": 970 }, { "epoch": 4.0, "learning_rate": 0.00025, "loss": 1.0726, "step": 980 }, { "epoch": 4.04, "learning_rate": 0.0002459930380924566, "loss": 0.9856, "step": 990 }, { "epoch": 4.08, "learning_rate": 0.0002419871056070862, "loss": 0.9726, "step": 1000 }, { "epoch": 4.12, "learning_rate": 0.00023798323170159486, "loss": 0.9675, "step": 1010 }, { "epoch": 4.16, "learning_rate": 0.0002339824450048218, "loss": 0.9831, "step": 1020 }, { "epoch": 4.2, "learning_rate": 0.0002299857733524767, "loss": 0.9082, "step": 1030 }, { "epoch": 4.24, "learning_rate": 0.00022599424352307955, "loss": 1.04, "step": 1040 }, { "epoch": 4.29, "learning_rate": 0.00022200888097417305, "loss": 0.9608, "step": 1050 }, { "epoch": 4.33, "learning_rate": 0.00021803070957887347, "loss": 1.0405, "step": 1060 }, { "epoch": 4.37, "learning_rate": 0.0002140607513628296, "loss": 0.9635, "step": 1070 }, { "epoch": 4.41, "learning_rate": 0.00021010002624165526, "loss": 0.8787, "step": 1080 }, { "epoch": 4.45, "learning_rate": 0.00020614955175890463, "loss": 0.9367, "step": 1090 }, { "epoch": 4.49, "learning_rate": 0.00020221034282465699, "loss": 1.0206, "step": 1100 }, { "epoch": 4.53, "learning_rate": 0.00019828341145477728, "loss": 0.962, "step": 1110 }, { "epoch": 4.57, "learning_rate": 0.00019436976651092142, "loss": 0.8907, "step": 1120 }, { "epoch": 4.61, "learning_rate": 0.00019047041344135045, "loss": 0.932, "step": 1130 }, { "epoch": 4.65, "learning_rate": 0.0001865863540226232, "loss": 0.957, "step": 1140 }, { "epoch": 4.69, "learning_rate": 0.0001827185861022308, "loss": 0.9121, "step": 1150 }, { "epoch": 4.73, "learning_rate": 0.0001788681033422419, "loss": 0.9817, "step": 1160 }, { "epoch": 4.78, "learning_rate": 0.0001750358949640221, "loss": 0.9116, "step": 1170 }, { "epoch": 4.82, "learning_rate": 0.00017122294549409484, "loss": 0.8627, "step": 1180 }, { "epoch": 4.86, "learning_rate": 0.00016743023451120832, "loss": 0.8922, "step": 1190 }, { "epoch": 4.9, "learning_rate": 0.00016365873639467314, "loss": 0.9926, "step": 1200 }, { "epoch": 4.94, "learning_rate": 0.0001599094200740367, "loss": 1.0046, "step": 1210 }, { "epoch": 4.98, "learning_rate": 0.0001561832487801565, "loss": 0.927, "step": 1220 }, { "epoch": 5.02, "learning_rate": 0.0001524811797977383, "loss": 0.8946, "step": 1230 }, { "epoch": 5.06, "learning_rate": 0.00014880416421940155, "loss": 0.9484, "step": 1240 }, { "epoch": 5.1, "learning_rate": 0.0001451531467013358, "loss": 0.7806, "step": 1250 }, { "epoch": 5.14, "learning_rate": 0.00014152906522061048, "loss": 0.8487, "step": 1260 }, { "epoch": 5.18, "learning_rate": 0.00013793285083420076, "loss": 0.7939, "step": 1270 }, { "epoch": 5.22, "learning_rate": 0.00013436542743979125, "loss": 0.9054, "step": 1280 }, { "epoch": 5.27, "learning_rate": 0.0001308277115384187, "loss": 0.8808, "step": 1290 }, { "epoch": 5.31, "learning_rate": 0.00012732061199901561, "loss": 0.8748, "step": 1300 }, { "epoch": 5.35, "learning_rate": 0.00012384502982491357, "loss": 0.9054, "step": 1310 }, { "epoch": 5.39, "learning_rate": 0.00012040185792236874, "loss": 0.8023, "step": 1320 }, { "epoch": 5.43, "learning_rate": 0.00011699198087116588, "loss": 0.9195, "step": 1330 }, { "epoch": 5.47, "learning_rate": 0.00011361627469736286, "loss": 0.8406, "step": 1340 }, { "epoch": 5.51, "learning_rate": 0.00011027560664823208, "loss": 0.8873, "step": 1350 }, { "epoch": 5.55, "learning_rate": 0.00010697083496945764, "loss": 0.8811, "step": 1360 }, { "epoch": 5.59, "learning_rate": 0.00010370280868464405, "loss": 0.9164, "step": 1370 }, { "epoch": 5.63, "learning_rate": 0.000100472367377196, "loss": 0.9604, "step": 1380 }, { "epoch": 5.67, "learning_rate": 9.728034097462144e-05, "loss": 0.9027, "step": 1390 }, { "epoch": 5.71, "learning_rate": 9.412754953531663e-05, "loss": 0.857, "step": 1400 }, { "epoch": 5.76, "learning_rate": 9.101480303788623e-05, "loss": 0.8198, "step": 1410 }, { "epoch": 5.8, "learning_rate": 8.794290117305295e-05, "loss": 0.866, "step": 1420 }, { "epoch": 5.84, "learning_rate": 8.491263313821021e-05, "loss": 0.8422, "step": 1430 }, { "epoch": 5.88, "learning_rate": 8.192477743467078e-05, "loss": 0.8921, "step": 1440 }, { "epoch": 5.92, "learning_rate": 7.898010166766348e-05, "loss": 0.8429, "step": 1450 }, { "epoch": 5.96, "learning_rate": 7.60793623491284e-05, "loss": 0.8728, "step": 1460 }, { "epoch": 6.0, "learning_rate": 7.322330470336314e-05, "loss": 0.8335, "step": 1470 }, { "epoch": 6.04, "learning_rate": 7.041266247556813e-05, "loss": 0.8213, "step": 1480 }, { "epoch": 6.08, "learning_rate": 6.764815774334149e-05, "loss": 0.8509, "step": 1490 }, { "epoch": 6.12, "learning_rate": 6.493050073117116e-05, "loss": 0.7348, "step": 1500 }, { "epoch": 6.16, "learning_rate": 6.226038962797217e-05, "loss": 0.8422, "step": 1510 }, { "epoch": 6.2, "learning_rate": 5.96385104077164e-05, "loss": 0.896, "step": 1520 }, { "epoch": 6.24, "learning_rate": 5.706553665319955e-05, "loss": 0.7791, "step": 1530 }, { "epoch": 6.29, "learning_rate": 5.454212938299255e-05, "loss": 0.8045, "step": 1540 }, { "epoch": 6.33, "learning_rate": 5.206893688162009e-05, "loss": 0.8531, "step": 1550 }, { "epoch": 6.37, "learning_rate": 4.9646594533010875e-05, "loss": 0.8444, "step": 1560 }, { "epoch": 6.41, "learning_rate": 4.7275724657262294e-05, "loss": 0.7966, "step": 1570 }, { "epoch": 6.45, "learning_rate": 4.495693635076101e-05, "loss": 0.7895, "step": 1580 }, { "epoch": 6.49, "learning_rate": 4.269082532970131e-05, "loss": 0.7409, "step": 1590 }, { "epoch": 6.53, "learning_rate": 4.047797377703985e-05, "loss": 0.875, "step": 1600 }, { "epoch": 6.57, "learning_rate": 3.831895019292897e-05, "loss": 0.8869, "step": 1610 }, { "epoch": 6.61, "learning_rate": 3.621430924866348e-05, "loss": 0.8159, "step": 1620 }, { "epoch": 6.65, "learning_rate": 3.416459164418123e-05, "loss": 0.7707, "step": 1630 }, { "epoch": 6.69, "learning_rate": 3.217032396915265e-05, "loss": 0.8724, "step": 1640 }, { "epoch": 6.73, "learning_rate": 3.0232018567695695e-05, "loss": 0.7733, "step": 1650 }, { "epoch": 6.78, "learning_rate": 2.8350173406749973e-05, "loss": 0.8311, "step": 1660 }, { "epoch": 6.82, "learning_rate": 2.652527194814511e-05, "loss": 0.8356, "step": 1670 }, { "epoch": 6.86, "learning_rate": 2.4757783024395242e-05, "loss": 0.8144, "step": 1680 }, { "epoch": 6.9, "learning_rate": 2.3048160718251997e-05, "loss": 0.7767, "step": 1690 }, { "epoch": 6.94, "learning_rate": 2.1396844246046905e-05, "loss": 0.8402, "step": 1700 }, { "epoch": 6.98, "learning_rate": 1.980425784485293e-05, "loss": 0.8576, "step": 1710 }, { "epoch": 7.02, "learning_rate": 1.827081066349459e-05, "loss": 0.7797, "step": 1720 }, { "epoch": 7.06, "learning_rate": 1.6796896657433808e-05, "loss": 0.8409, "step": 1730 }, { "epoch": 7.1, "learning_rate": 1.538289448755989e-05, "loss": 0.7642, "step": 1740 }, { "epoch": 7.14, "learning_rate": 1.4029167422908107e-05, "loss": 0.8464, "step": 1750 }, { "epoch": 7.18, "learning_rate": 1.273606324733284e-05, "loss": 0.7975, "step": 1760 }, { "epoch": 7.22, "learning_rate": 1.1503914170159058e-05, "loss": 0.7577, "step": 1770 }, { "epoch": 7.27, "learning_rate": 1.0333036740834856e-05, "loss": 0.8085, "step": 1780 }, { "epoch": 7.31, "learning_rate": 9.223731767607434e-06, "loss": 0.7511, "step": 1790 }, { "epoch": 7.35, "learning_rate": 8.176284240242638e-06, "loss": 0.845, "step": 1800 }, { "epoch": 7.39, "learning_rate": 7.190963256809069e-06, "loss": 0.865, "step": 1810 }, { "epoch": 7.43, "learning_rate": 6.268021954544096e-06, "loss": 0.8486, "step": 1820 }, { "epoch": 7.47, "learning_rate": 5.407697444821169e-06, "loss": 0.7099, "step": 1830 }, { "epoch": 7.51, "learning_rate": 4.61021075223364e-06, "loss": 0.8223, "step": 1840 }, { "epoch": 7.55, "learning_rate": 3.8757667578118995e-06, "loss": 0.8753, "step": 1850 }, { "epoch": 7.59, "learning_rate": 3.2045541463874563e-06, "loss": 0.8835, "step": 1860 }, { "epoch": 7.63, "learning_rate": 2.5967453581185186e-06, "loss": 0.7877, "step": 1870 }, { "epoch": 7.67, "learning_rate": 2.052496544188487e-06, "loss": 0.8828, "step": 1880 }, { "epoch": 7.71, "learning_rate": 1.571947526689349e-06, "loss": 0.7658, "step": 1890 }, { "epoch": 7.76, "learning_rate": 1.1552217627004424e-06, "loss": 0.8952, "step": 1900 }, { "epoch": 7.8, "learning_rate": 8.024263125710751e-07, "loss": 0.8118, "step": 1910 }, { "epoch": 7.84, "learning_rate": 5.136518124159162e-07, "loss": 0.8503, "step": 1920 }, { "epoch": 7.88, "learning_rate": 2.8897245082978865e-07, "loss": 0.8248, "step": 1930 }, { "epoch": 7.92, "learning_rate": 1.284459498280266e-07, "loss": 0.8515, "step": 1940 }, { "epoch": 7.96, "learning_rate": 3.2113550017198734e-08, "loss": 0.8274, "step": 1950 }, { "epoch": 8.0, "learning_rate": 0.0, "loss": 0.7359, "step": 1960 }, { "epoch": 8.0, "step": 1960, "total_flos": 7.457847416862474e+17, "train_loss": 1.1027992168251348, "train_runtime": 31447.7257, "train_samples_per_second": 0.997, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 1960, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 100, "total_flos": 7.457847416862474e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }