diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65844 @@ +{ + "best_global_step": 1000, + "best_metric": 0.1526937186717987, + "best_model_checkpoint": "saves/prompt-tuning/gemma-3-1b-it/train_mrpc_1744902643/checkpoint-1000", + "epoch": 194.17433414043583, + "eval_steps": 200, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.024213075060532687, + "grad_norm": 5.071005821228027, + "learning_rate": 0.29999999259779675, + "loss": 9.1493, + "num_input_tokens_seen": 8032, + "step": 5 + }, + { + "epoch": 0.048426150121065374, + "grad_norm": 2.0499110221862793, + "learning_rate": 0.29999996252634736, + "loss": 3.9374, + "num_input_tokens_seen": 16672, + "step": 10 + }, + { + "epoch": 0.07263922518159806, + "grad_norm": 0.968630313873291, + "learning_rate": 0.2999999093230187, + "loss": 1.673, + "num_input_tokens_seen": 25184, + "step": 15 + }, + { + "epoch": 0.09685230024213075, + "grad_norm": 0.6263306736946106, + "learning_rate": 0.299999832987819, + "loss": 0.8702, + "num_input_tokens_seen": 34272, + "step": 20 + }, + { + "epoch": 0.12106537530266344, + "grad_norm": 0.1749742180109024, + "learning_rate": 0.29999973352076004, + "loss": 0.6104, + "num_input_tokens_seen": 42464, + "step": 25 + }, + { + "epoch": 0.14527845036319612, + "grad_norm": 0.10219960659742355, + "learning_rate": 0.2999996109218572, + "loss": 0.4351, + "num_input_tokens_seen": 50976, + "step": 30 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 0.10777541995048523, + "learning_rate": 0.2999994651911293, + "loss": 0.2699, + "num_input_tokens_seen": 59552, + "step": 35 + }, + { + "epoch": 0.1937046004842615, + "grad_norm": 0.40226423740386963, + "learning_rate": 0.2999992963285989, + "loss": 0.2699, + "num_input_tokens_seen": 68416, + "step": 40 + }, + { + "epoch": 0.2179176755447942, + "grad_norm": 0.06184570863842964, + "learning_rate": 0.29999910433429194, + "loss": 0.2844, + "num_input_tokens_seen": 76992, + "step": 45 + }, + { + "epoch": 0.24213075060532688, + "grad_norm": 0.06225006282329559, + "learning_rate": 0.29999888920823814, + "loss": 0.1694, + "num_input_tokens_seen": 85792, + "step": 50 + }, + { + "epoch": 0.26634382566585957, + "grad_norm": 0.07618734240531921, + "learning_rate": 0.29999865095047057, + "loss": 0.2117, + "num_input_tokens_seen": 94112, + "step": 55 + }, + { + "epoch": 0.29055690072639223, + "grad_norm": 0.060120780020952225, + "learning_rate": 0.29999838956102604, + "loss": 0.1858, + "num_input_tokens_seen": 102848, + "step": 60 + }, + { + "epoch": 0.31476997578692495, + "grad_norm": 0.06279272586107254, + "learning_rate": 0.29999810503994484, + "loss": 0.2456, + "num_input_tokens_seen": 111488, + "step": 65 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 0.04608820378780365, + "learning_rate": 0.29999779738727084, + "loss": 0.2, + "num_input_tokens_seen": 119968, + "step": 70 + }, + { + "epoch": 0.36319612590799033, + "grad_norm": 0.020889610052108765, + "learning_rate": 0.29999746660305154, + "loss": 0.1579, + "num_input_tokens_seen": 128896, + "step": 75 + }, + { + "epoch": 0.387409200968523, + "grad_norm": 0.044906292110681534, + "learning_rate": 0.2999971126873379, + "loss": 0.2211, + "num_input_tokens_seen": 137408, + "step": 80 + }, + { + "epoch": 0.4116222760290557, + "grad_norm": 0.03616130352020264, + "learning_rate": 0.2999967356401845, + "loss": 0.1993, + "num_input_tokens_seen": 146112, + "step": 85 + }, + { + "epoch": 0.4358353510895884, + "grad_norm": 0.0262387003749609, + "learning_rate": 0.29999633546164944, + "loss": 0.1887, + "num_input_tokens_seen": 154592, + "step": 90 + }, + { + "epoch": 0.4600484261501211, + "grad_norm": 0.017507828772068024, + "learning_rate": 0.29999591215179444, + "loss": 0.1806, + "num_input_tokens_seen": 163040, + "step": 95 + }, + { + "epoch": 0.48426150121065376, + "grad_norm": 0.039545938372612, + "learning_rate": 0.2999954657106849, + "loss": 0.1803, + "num_input_tokens_seen": 171616, + "step": 100 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 0.05395766720175743, + "learning_rate": 0.2999949961383896, + "loss": 0.2159, + "num_input_tokens_seen": 180608, + "step": 105 + }, + { + "epoch": 0.5326876513317191, + "grad_norm": 0.03013993240892887, + "learning_rate": 0.2999945034349809, + "loss": 0.197, + "num_input_tokens_seen": 189088, + "step": 110 + }, + { + "epoch": 0.5569007263922519, + "grad_norm": 0.010992400348186493, + "learning_rate": 0.2999939876005348, + "loss": 0.2267, + "num_input_tokens_seen": 197792, + "step": 115 + }, + { + "epoch": 0.5811138014527845, + "grad_norm": 0.039533477276563644, + "learning_rate": 0.29999344863513094, + "loss": 0.2161, + "num_input_tokens_seen": 206368, + "step": 120 + }, + { + "epoch": 0.6053268765133172, + "grad_norm": 0.011832542717456818, + "learning_rate": 0.2999928865388523, + "loss": 0.1718, + "num_input_tokens_seen": 214784, + "step": 125 + }, + { + "epoch": 0.6295399515738499, + "grad_norm": 0.04620729014277458, + "learning_rate": 0.29999230131178567, + "loss": 0.1906, + "num_input_tokens_seen": 223680, + "step": 130 + }, + { + "epoch": 0.6537530266343826, + "grad_norm": 0.024787720292806625, + "learning_rate": 0.2999916929540212, + "loss": 0.1869, + "num_input_tokens_seen": 232160, + "step": 135 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 0.02642720565199852, + "learning_rate": 0.29999106146565285, + "loss": 0.1952, + "num_input_tokens_seen": 240480, + "step": 140 + }, + { + "epoch": 0.7021791767554479, + "grad_norm": 0.0347641222178936, + "learning_rate": 0.29999040684677786, + "loss": 0.1758, + "num_input_tokens_seen": 248896, + "step": 145 + }, + { + "epoch": 0.7263922518159807, + "grad_norm": 0.014818494208157063, + "learning_rate": 0.2999897290974972, + "loss": 0.1947, + "num_input_tokens_seen": 257472, + "step": 150 + }, + { + "epoch": 0.7506053268765133, + "grad_norm": 0.026584504172205925, + "learning_rate": 0.2999890282179155, + "loss": 0.1838, + "num_input_tokens_seen": 265696, + "step": 155 + }, + { + "epoch": 0.774818401937046, + "grad_norm": 0.01314197201281786, + "learning_rate": 0.29998830420814077, + "loss": 0.1891, + "num_input_tokens_seen": 274080, + "step": 160 + }, + { + "epoch": 0.7990314769975787, + "grad_norm": 0.024450266733765602, + "learning_rate": 0.2999875570682846, + "loss": 0.2118, + "num_input_tokens_seen": 282496, + "step": 165 + }, + { + "epoch": 0.8232445520581114, + "grad_norm": 0.014593984000384808, + "learning_rate": 0.2999867867984623, + "loss": 0.1865, + "num_input_tokens_seen": 290944, + "step": 170 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 0.014560905285179615, + "learning_rate": 0.29998599339879267, + "loss": 0.1724, + "num_input_tokens_seen": 299872, + "step": 175 + }, + { + "epoch": 0.8716707021791767, + "grad_norm": 0.017347628250718117, + "learning_rate": 0.29998517686939796, + "loss": 0.1813, + "num_input_tokens_seen": 308512, + "step": 180 + }, + { + "epoch": 0.8958837772397095, + "grad_norm": 0.025490086525678635, + "learning_rate": 0.29998433721040413, + "loss": 0.1593, + "num_input_tokens_seen": 317312, + "step": 185 + }, + { + "epoch": 0.9200968523002422, + "grad_norm": 0.027983617037534714, + "learning_rate": 0.29998347442194073, + "loss": 0.1487, + "num_input_tokens_seen": 325760, + "step": 190 + }, + { + "epoch": 0.9443099273607748, + "grad_norm": 0.012684508226811886, + "learning_rate": 0.2999825885041407, + "loss": 0.1437, + "num_input_tokens_seen": 334432, + "step": 195 + }, + { + "epoch": 0.9685230024213075, + "grad_norm": 0.010962334461510181, + "learning_rate": 0.29998167945714077, + "loss": 0.161, + "num_input_tokens_seen": 342592, + "step": 200 + }, + { + "epoch": 0.9685230024213075, + "eval_loss": 0.1907040923833847, + "eval_runtime": 4.6189, + "eval_samples_per_second": 79.456, + "eval_steps_per_second": 19.918, + "num_input_tokens_seen": 342592, + "step": 200 + }, + { + "epoch": 0.9927360774818402, + "grad_norm": 0.011584723368287086, + "learning_rate": 0.2999807472810811, + "loss": 0.1786, + "num_input_tokens_seen": 351136, + "step": 205 + }, + { + "epoch": 1.0193704600484261, + "grad_norm": 0.020712362602353096, + "learning_rate": 0.29997979197610536, + "loss": 0.2003, + "num_input_tokens_seen": 360064, + "step": 210 + }, + { + "epoch": 1.0435835351089588, + "grad_norm": 0.028761588037014008, + "learning_rate": 0.299978813542361, + "loss": 0.1617, + "num_input_tokens_seen": 368544, + "step": 215 + }, + { + "epoch": 1.0677966101694916, + "grad_norm": 0.01963549107313156, + "learning_rate": 0.2999778119799988, + "loss": 0.1745, + "num_input_tokens_seen": 377120, + "step": 220 + }, + { + "epoch": 1.0920096852300243, + "grad_norm": 0.020693229511380196, + "learning_rate": 0.29997678728917326, + "loss": 0.1645, + "num_input_tokens_seen": 385536, + "step": 225 + }, + { + "epoch": 1.116222760290557, + "grad_norm": 0.01137514691799879, + "learning_rate": 0.2999757394700424, + "loss": 0.1693, + "num_input_tokens_seen": 394240, + "step": 230 + }, + { + "epoch": 1.1404358353510895, + "grad_norm": 0.015184774994850159, + "learning_rate": 0.29997466852276783, + "loss": 0.1721, + "num_input_tokens_seen": 402944, + "step": 235 + }, + { + "epoch": 1.1646489104116222, + "grad_norm": 0.0440896712243557, + "learning_rate": 0.29997357444751466, + "loss": 0.184, + "num_input_tokens_seen": 411808, + "step": 240 + }, + { + "epoch": 1.188861985472155, + "grad_norm": 0.03371434286236763, + "learning_rate": 0.2999724572444516, + "loss": 0.1866, + "num_input_tokens_seen": 420160, + "step": 245 + }, + { + "epoch": 1.2130750605326877, + "grad_norm": 0.029387593269348145, + "learning_rate": 0.29997131691375095, + "loss": 0.189, + "num_input_tokens_seen": 428768, + "step": 250 + }, + { + "epoch": 1.2372881355932204, + "grad_norm": 0.00850792694836855, + "learning_rate": 0.2999701534555886, + "loss": 0.1593, + "num_input_tokens_seen": 437280, + "step": 255 + }, + { + "epoch": 1.261501210653753, + "grad_norm": 0.01935967616736889, + "learning_rate": 0.2999689668701439, + "loss": 0.1841, + "num_input_tokens_seen": 445952, + "step": 260 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.014468884095549583, + "learning_rate": 0.29996775715759993, + "loss": 0.1736, + "num_input_tokens_seen": 454464, + "step": 265 + }, + { + "epoch": 1.3099273607748185, + "grad_norm": 0.018423881381750107, + "learning_rate": 0.2999665243181432, + "loss": 0.1568, + "num_input_tokens_seen": 463008, + "step": 270 + }, + { + "epoch": 1.334140435835351, + "grad_norm": 0.015267972834408283, + "learning_rate": 0.2999652683519638, + "loss": 0.1985, + "num_input_tokens_seen": 471648, + "step": 275 + }, + { + "epoch": 1.3583535108958837, + "grad_norm": 0.008920734748244286, + "learning_rate": 0.29996398925925544, + "loss": 0.1776, + "num_input_tokens_seen": 480416, + "step": 280 + }, + { + "epoch": 1.3825665859564165, + "grad_norm": 0.009594307281076908, + "learning_rate": 0.2999626870402154, + "loss": 0.1827, + "num_input_tokens_seen": 488928, + "step": 285 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 0.008735272102057934, + "learning_rate": 0.29996136169504445, + "loss": 0.1981, + "num_input_tokens_seen": 497312, + "step": 290 + }, + { + "epoch": 1.430992736077482, + "grad_norm": 0.020749423652887344, + "learning_rate": 0.29996001322394694, + "loss": 0.1699, + "num_input_tokens_seen": 505888, + "step": 295 + }, + { + "epoch": 1.4552058111380144, + "grad_norm": 0.0322580523788929, + "learning_rate": 0.29995864162713093, + "loss": 0.1941, + "num_input_tokens_seen": 514304, + "step": 300 + }, + { + "epoch": 1.4794188861985473, + "grad_norm": 0.009766981936991215, + "learning_rate": 0.2999572469048079, + "loss": 0.1636, + "num_input_tokens_seen": 522624, + "step": 305 + }, + { + "epoch": 1.5036319612590798, + "grad_norm": 0.00974379200488329, + "learning_rate": 0.29995582905719287, + "loss": 0.1707, + "num_input_tokens_seen": 531040, + "step": 310 + }, + { + "epoch": 1.5278450363196125, + "grad_norm": 0.02028411440551281, + "learning_rate": 0.2999543880845046, + "loss": 0.1551, + "num_input_tokens_seen": 539072, + "step": 315 + }, + { + "epoch": 1.5520581113801453, + "grad_norm": 0.010228387080132961, + "learning_rate": 0.2999529239869652, + "loss": 0.1495, + "num_input_tokens_seen": 548288, + "step": 320 + }, + { + "epoch": 1.576271186440678, + "grad_norm": 0.011940037831664085, + "learning_rate": 0.2999514367648005, + "loss": 0.1924, + "num_input_tokens_seen": 556832, + "step": 325 + }, + { + "epoch": 1.6004842615012107, + "grad_norm": 0.012123778462409973, + "learning_rate": 0.29994992641823987, + "loss": 0.1711, + "num_input_tokens_seen": 565632, + "step": 330 + }, + { + "epoch": 1.6246973365617432, + "grad_norm": 0.011955016292631626, + "learning_rate": 0.29994839294751613, + "loss": 0.2266, + "num_input_tokens_seen": 574080, + "step": 335 + }, + { + "epoch": 1.6489104116222761, + "grad_norm": 0.010912937112152576, + "learning_rate": 0.29994683635286584, + "loss": 0.169, + "num_input_tokens_seen": 582720, + "step": 340 + }, + { + "epoch": 1.6731234866828086, + "grad_norm": 0.021453550085425377, + "learning_rate": 0.2999452566345291, + "loss": 0.1466, + "num_input_tokens_seen": 591520, + "step": 345 + }, + { + "epoch": 1.6973365617433414, + "grad_norm": 0.010866965167224407, + "learning_rate": 0.2999436537927494, + "loss": 0.1518, + "num_input_tokens_seen": 600096, + "step": 350 + }, + { + "epoch": 1.721549636803874, + "grad_norm": 0.02985861897468567, + "learning_rate": 0.299942027827774, + "loss": 0.1833, + "num_input_tokens_seen": 608992, + "step": 355 + }, + { + "epoch": 1.7457627118644068, + "grad_norm": 0.01891891472041607, + "learning_rate": 0.29994037873985363, + "loss": 0.1833, + "num_input_tokens_seen": 617728, + "step": 360 + }, + { + "epoch": 1.7699757869249395, + "grad_norm": 0.009983880445361137, + "learning_rate": 0.29993870652924254, + "loss": 0.1558, + "num_input_tokens_seen": 625984, + "step": 365 + }, + { + "epoch": 1.794188861985472, + "grad_norm": 0.012361986562609673, + "learning_rate": 0.29993701119619876, + "loss": 0.1603, + "num_input_tokens_seen": 634528, + "step": 370 + }, + { + "epoch": 1.818401937046005, + "grad_norm": 0.013831421732902527, + "learning_rate": 0.2999352927409835, + "loss": 0.1856, + "num_input_tokens_seen": 642944, + "step": 375 + }, + { + "epoch": 1.8426150121065374, + "grad_norm": 0.015830736607313156, + "learning_rate": 0.29993355116386194, + "loss": 0.1778, + "num_input_tokens_seen": 651488, + "step": 380 + }, + { + "epoch": 1.8668280871670704, + "grad_norm": 0.015386034734547138, + "learning_rate": 0.29993178646510266, + "loss": 0.173, + "num_input_tokens_seen": 660096, + "step": 385 + }, + { + "epoch": 1.8910411622276029, + "grad_norm": 0.00972446147352457, + "learning_rate": 0.2999299986449777, + "loss": 0.1817, + "num_input_tokens_seen": 668384, + "step": 390 + }, + { + "epoch": 1.9152542372881356, + "grad_norm": 0.01328370999544859, + "learning_rate": 0.29992818770376284, + "loss": 0.1782, + "num_input_tokens_seen": 677152, + "step": 395 + }, + { + "epoch": 1.9394673123486683, + "grad_norm": 0.010454659350216389, + "learning_rate": 0.29992635364173725, + "loss": 0.1822, + "num_input_tokens_seen": 685504, + "step": 400 + }, + { + "epoch": 1.9394673123486683, + "eval_loss": 0.15939690172672272, + "eval_runtime": 4.6415, + "eval_samples_per_second": 79.069, + "eval_steps_per_second": 19.821, + "num_input_tokens_seen": 685504, + "step": 400 + }, + { + "epoch": 1.9636803874092008, + "grad_norm": 0.014762120321393013, + "learning_rate": 0.2999244964591839, + "loss": 0.1566, + "num_input_tokens_seen": 694048, + "step": 405 + }, + { + "epoch": 1.9878934624697338, + "grad_norm": 0.010674744844436646, + "learning_rate": 0.2999226161563891, + "loss": 0.1643, + "num_input_tokens_seen": 702368, + "step": 410 + }, + { + "epoch": 2.0145278450363198, + "grad_norm": 0.010488958097994328, + "learning_rate": 0.2999207127336429, + "loss": 0.2139, + "num_input_tokens_seen": 711520, + "step": 415 + }, + { + "epoch": 2.0387409200968523, + "grad_norm": 0.009743495844304562, + "learning_rate": 0.2999187861912387, + "loss": 0.1466, + "num_input_tokens_seen": 719680, + "step": 420 + }, + { + "epoch": 2.062953995157385, + "grad_norm": 0.014719569124281406, + "learning_rate": 0.2999168365294737, + "loss": 0.1782, + "num_input_tokens_seen": 727904, + "step": 425 + }, + { + "epoch": 2.0871670702179177, + "grad_norm": 0.017000338062644005, + "learning_rate": 0.29991486374864856, + "loss": 0.1602, + "num_input_tokens_seen": 736288, + "step": 430 + }, + { + "epoch": 2.11138014527845, + "grad_norm": 0.010440610349178314, + "learning_rate": 0.29991286784906745, + "loss": 0.1872, + "num_input_tokens_seen": 744992, + "step": 435 + }, + { + "epoch": 2.135593220338983, + "grad_norm": 0.0222209133207798, + "learning_rate": 0.2999108488310382, + "loss": 0.1595, + "num_input_tokens_seen": 753472, + "step": 440 + }, + { + "epoch": 2.1598062953995156, + "grad_norm": 0.011414933949708939, + "learning_rate": 0.29990880669487213, + "loss": 0.1542, + "num_input_tokens_seen": 762240, + "step": 445 + }, + { + "epoch": 2.1840193704600486, + "grad_norm": 0.016092399135231972, + "learning_rate": 0.29990674144088425, + "loss": 0.1764, + "num_input_tokens_seen": 770784, + "step": 450 + }, + { + "epoch": 2.208232445520581, + "grad_norm": 0.017623426392674446, + "learning_rate": 0.299904653069393, + "loss": 0.172, + "num_input_tokens_seen": 779360, + "step": 455 + }, + { + "epoch": 2.232445520581114, + "grad_norm": 0.01020096056163311, + "learning_rate": 0.29990254158072044, + "loss": 0.1549, + "num_input_tokens_seen": 787744, + "step": 460 + }, + { + "epoch": 2.2566585956416465, + "grad_norm": 0.011899656616151333, + "learning_rate": 0.2999004069751921, + "loss": 0.1722, + "num_input_tokens_seen": 796128, + "step": 465 + }, + { + "epoch": 2.280871670702179, + "grad_norm": 0.015132845379412174, + "learning_rate": 0.2998982492531373, + "loss": 0.1698, + "num_input_tokens_seen": 804704, + "step": 470 + }, + { + "epoch": 2.305084745762712, + "grad_norm": 0.023833123967051506, + "learning_rate": 0.2998960684148887, + "loss": 0.1682, + "num_input_tokens_seen": 813056, + "step": 475 + }, + { + "epoch": 2.3292978208232444, + "grad_norm": 0.012906037271022797, + "learning_rate": 0.29989386446078264, + "loss": 0.1593, + "num_input_tokens_seen": 821632, + "step": 480 + }, + { + "epoch": 2.3535108958837774, + "grad_norm": 0.013239835388958454, + "learning_rate": 0.299891637391159, + "loss": 0.1718, + "num_input_tokens_seen": 830176, + "step": 485 + }, + { + "epoch": 2.37772397094431, + "grad_norm": 0.01942196488380432, + "learning_rate": 0.2998893872063612, + "loss": 0.182, + "num_input_tokens_seen": 838272, + "step": 490 + }, + { + "epoch": 2.401937046004843, + "grad_norm": 0.01873689517378807, + "learning_rate": 0.2998871139067363, + "loss": 0.1663, + "num_input_tokens_seen": 846560, + "step": 495 + }, + { + "epoch": 2.4261501210653753, + "grad_norm": 0.01746363379061222, + "learning_rate": 0.2998848174926348, + "loss": 0.1562, + "num_input_tokens_seen": 855392, + "step": 500 + }, + { + "epoch": 2.450363196125908, + "grad_norm": 0.03828323259949684, + "learning_rate": 0.2998824979644109, + "loss": 0.1882, + "num_input_tokens_seen": 863584, + "step": 505 + }, + { + "epoch": 2.4745762711864407, + "grad_norm": 0.008617733605206013, + "learning_rate": 0.29988015532242224, + "loss": 0.1792, + "num_input_tokens_seen": 871936, + "step": 510 + }, + { + "epoch": 2.4987893462469732, + "grad_norm": 0.011088581755757332, + "learning_rate": 0.29987778956703015, + "loss": 0.1542, + "num_input_tokens_seen": 880544, + "step": 515 + }, + { + "epoch": 2.523002421307506, + "grad_norm": 0.014293737709522247, + "learning_rate": 0.2998754006985994, + "loss": 0.1614, + "num_input_tokens_seen": 889184, + "step": 520 + }, + { + "epoch": 2.5472154963680387, + "grad_norm": 0.042401332408189774, + "learning_rate": 0.29987298871749846, + "loss": 0.1946, + "num_input_tokens_seen": 897792, + "step": 525 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.014036297798156738, + "learning_rate": 0.2998705536240992, + "loss": 0.1756, + "num_input_tokens_seen": 906272, + "step": 530 + }, + { + "epoch": 2.595641646489104, + "grad_norm": 0.02388450689613819, + "learning_rate": 0.2998680954187772, + "loss": 0.155, + "num_input_tokens_seen": 915328, + "step": 535 + }, + { + "epoch": 2.619854721549637, + "grad_norm": 0.034376442432403564, + "learning_rate": 0.2998656141019115, + "loss": 0.1724, + "num_input_tokens_seen": 924256, + "step": 540 + }, + { + "epoch": 2.6440677966101696, + "grad_norm": 0.012539933435618877, + "learning_rate": 0.2998631096738848, + "loss": 0.1651, + "num_input_tokens_seen": 933056, + "step": 545 + }, + { + "epoch": 2.668280871670702, + "grad_norm": 0.02096189185976982, + "learning_rate": 0.29986058213508326, + "loss": 0.183, + "num_input_tokens_seen": 941568, + "step": 550 + }, + { + "epoch": 2.692493946731235, + "grad_norm": 0.010658941231667995, + "learning_rate": 0.29985803148589674, + "loss": 0.17, + "num_input_tokens_seen": 949984, + "step": 555 + }, + { + "epoch": 2.7167070217917675, + "grad_norm": 0.023811815306544304, + "learning_rate": 0.2998554577267185, + "loss": 0.1778, + "num_input_tokens_seen": 958560, + "step": 560 + }, + { + "epoch": 2.7409200968523004, + "grad_norm": 0.010588140226900578, + "learning_rate": 0.2998528608579455, + "loss": 0.1626, + "num_input_tokens_seen": 966880, + "step": 565 + }, + { + "epoch": 2.765133171912833, + "grad_norm": 0.009137889370322227, + "learning_rate": 0.2998502408799781, + "loss": 0.1561, + "num_input_tokens_seen": 975744, + "step": 570 + }, + { + "epoch": 2.7893462469733654, + "grad_norm": 0.018747486174106598, + "learning_rate": 0.2998475977932205, + "loss": 0.1664, + "num_input_tokens_seen": 984352, + "step": 575 + }, + { + "epoch": 2.8135593220338984, + "grad_norm": 0.014297720044851303, + "learning_rate": 0.29984493159808023, + "loss": 0.1703, + "num_input_tokens_seen": 993184, + "step": 580 + }, + { + "epoch": 2.837772397094431, + "grad_norm": 0.014942971058189869, + "learning_rate": 0.29984224229496836, + "loss": 0.1756, + "num_input_tokens_seen": 1001824, + "step": 585 + }, + { + "epoch": 2.861985472154964, + "grad_norm": 0.008899236097931862, + "learning_rate": 0.2998395298842998, + "loss": 0.1631, + "num_input_tokens_seen": 1010816, + "step": 590 + }, + { + "epoch": 2.8861985472154963, + "grad_norm": 0.017474183812737465, + "learning_rate": 0.29983679436649263, + "loss": 0.1618, + "num_input_tokens_seen": 1018976, + "step": 595 + }, + { + "epoch": 2.910411622276029, + "grad_norm": 0.009983228519558907, + "learning_rate": 0.2998340357419689, + "loss": 0.1611, + "num_input_tokens_seen": 1027680, + "step": 600 + }, + { + "epoch": 2.910411622276029, + "eval_loss": 0.16621483862400055, + "eval_runtime": 4.6372, + "eval_samples_per_second": 79.143, + "eval_steps_per_second": 19.84, + "num_input_tokens_seen": 1027680, + "step": 600 + }, + { + "epoch": 2.9346246973365617, + "grad_norm": 0.012684737332165241, + "learning_rate": 0.29983125401115385, + "loss": 0.172, + "num_input_tokens_seen": 1036224, + "step": 605 + }, + { + "epoch": 2.9588377723970947, + "grad_norm": 0.03180256485939026, + "learning_rate": 0.29982844917447654, + "loss": 0.1924, + "num_input_tokens_seen": 1045184, + "step": 610 + }, + { + "epoch": 2.983050847457627, + "grad_norm": 0.011137894354760647, + "learning_rate": 0.2998256212323695, + "loss": 0.1848, + "num_input_tokens_seen": 1053344, + "step": 615 + }, + { + "epoch": 3.009685230024213, + "grad_norm": 0.01448863185942173, + "learning_rate": 0.29982277018526887, + "loss": 0.2032, + "num_input_tokens_seen": 1062336, + "step": 620 + }, + { + "epoch": 3.0338983050847457, + "grad_norm": 0.012388558126986027, + "learning_rate": 0.2998198960336143, + "loss": 0.1593, + "num_input_tokens_seen": 1070752, + "step": 625 + }, + { + "epoch": 3.0581113801452786, + "grad_norm": 0.01198655553162098, + "learning_rate": 0.299816998777849, + "loss": 0.1487, + "num_input_tokens_seen": 1079840, + "step": 630 + }, + { + "epoch": 3.082324455205811, + "grad_norm": 0.009623045101761818, + "learning_rate": 0.2998140784184197, + "loss": 0.1543, + "num_input_tokens_seen": 1088416, + "step": 635 + }, + { + "epoch": 3.106537530266344, + "grad_norm": 0.022741099819540977, + "learning_rate": 0.2998111349557769, + "loss": 0.1859, + "num_input_tokens_seen": 1096992, + "step": 640 + }, + { + "epoch": 3.1307506053268765, + "grad_norm": 0.012226049788296223, + "learning_rate": 0.29980816839037444, + "loss": 0.1351, + "num_input_tokens_seen": 1105920, + "step": 645 + }, + { + "epoch": 3.154963680387409, + "grad_norm": 0.018815601244568825, + "learning_rate": 0.2998051787226698, + "loss": 0.1642, + "num_input_tokens_seen": 1114208, + "step": 650 + }, + { + "epoch": 3.179176755447942, + "grad_norm": 0.010203109122812748, + "learning_rate": 0.29980216595312403, + "loss": 0.1623, + "num_input_tokens_seen": 1122848, + "step": 655 + }, + { + "epoch": 3.2033898305084745, + "grad_norm": 0.022526711225509644, + "learning_rate": 0.29979913008220177, + "loss": 0.1946, + "num_input_tokens_seen": 1131488, + "step": 660 + }, + { + "epoch": 3.2276029055690074, + "grad_norm": 0.01602816954255104, + "learning_rate": 0.2997960711103711, + "loss": 0.1562, + "num_input_tokens_seen": 1139840, + "step": 665 + }, + { + "epoch": 3.25181598062954, + "grad_norm": 0.015256829559803009, + "learning_rate": 0.29979298903810386, + "loss": 0.1507, + "num_input_tokens_seen": 1149088, + "step": 670 + }, + { + "epoch": 3.2760290556900724, + "grad_norm": 0.0144090186804533, + "learning_rate": 0.29978988386587524, + "loss": 0.1491, + "num_input_tokens_seen": 1157728, + "step": 675 + }, + { + "epoch": 3.3002421307506054, + "grad_norm": 0.014401441439986229, + "learning_rate": 0.2997867555941642, + "loss": 0.1529, + "num_input_tokens_seen": 1165728, + "step": 680 + }, + { + "epoch": 3.324455205811138, + "grad_norm": 0.016209829598665237, + "learning_rate": 0.299783604223453, + "loss": 0.1583, + "num_input_tokens_seen": 1173952, + "step": 685 + }, + { + "epoch": 3.348668280871671, + "grad_norm": 0.017491448670625687, + "learning_rate": 0.29978042975422786, + "loss": 0.157, + "num_input_tokens_seen": 1182720, + "step": 690 + }, + { + "epoch": 3.3728813559322033, + "grad_norm": 0.023096319288015366, + "learning_rate": 0.29977723218697816, + "loss": 0.1686, + "num_input_tokens_seen": 1191520, + "step": 695 + }, + { + "epoch": 3.3970944309927362, + "grad_norm": 0.017921505495905876, + "learning_rate": 0.299774011522197, + "loss": 0.1763, + "num_input_tokens_seen": 1200256, + "step": 700 + }, + { + "epoch": 3.4213075060532687, + "grad_norm": 0.013801966793835163, + "learning_rate": 0.29977076776038114, + "loss": 0.1723, + "num_input_tokens_seen": 1208928, + "step": 705 + }, + { + "epoch": 3.4455205811138017, + "grad_norm": 0.01897471770644188, + "learning_rate": 0.2997675009020307, + "loss": 0.18, + "num_input_tokens_seen": 1217600, + "step": 710 + }, + { + "epoch": 3.469733656174334, + "grad_norm": 0.023410454392433167, + "learning_rate": 0.2997642109476496, + "loss": 0.1865, + "num_input_tokens_seen": 1226272, + "step": 715 + }, + { + "epoch": 3.4939467312348667, + "grad_norm": 0.013495941646397114, + "learning_rate": 0.299760897897745, + "loss": 0.1523, + "num_input_tokens_seen": 1234400, + "step": 720 + }, + { + "epoch": 3.5181598062953996, + "grad_norm": 0.010956584475934505, + "learning_rate": 0.29975756175282803, + "loss": 0.1809, + "num_input_tokens_seen": 1242912, + "step": 725 + }, + { + "epoch": 3.542372881355932, + "grad_norm": 0.008936137892305851, + "learning_rate": 0.29975420251341306, + "loss": 0.1673, + "num_input_tokens_seen": 1251712, + "step": 730 + }, + { + "epoch": 3.566585956416465, + "grad_norm": 0.025514597073197365, + "learning_rate": 0.29975082018001814, + "loss": 0.1887, + "num_input_tokens_seen": 1260320, + "step": 735 + }, + { + "epoch": 3.5907990314769975, + "grad_norm": 0.010167508386075497, + "learning_rate": 0.2997474147531648, + "loss": 0.175, + "num_input_tokens_seen": 1268736, + "step": 740 + }, + { + "epoch": 3.61501210653753, + "grad_norm": 0.01305976789444685, + "learning_rate": 0.29974398623337833, + "loss": 0.167, + "num_input_tokens_seen": 1277568, + "step": 745 + }, + { + "epoch": 3.639225181598063, + "grad_norm": 0.012718023732304573, + "learning_rate": 0.2997405346211873, + "loss": 0.1608, + "num_input_tokens_seen": 1285792, + "step": 750 + }, + { + "epoch": 3.663438256658596, + "grad_norm": 0.014878565445542336, + "learning_rate": 0.2997370599171241, + "loss": 0.1645, + "num_input_tokens_seen": 1294432, + "step": 755 + }, + { + "epoch": 3.6876513317191284, + "grad_norm": 0.011907026171684265, + "learning_rate": 0.2997335621217246, + "loss": 0.1726, + "num_input_tokens_seen": 1303232, + "step": 760 + }, + { + "epoch": 3.711864406779661, + "grad_norm": 0.010987116023898125, + "learning_rate": 0.29973004123552816, + "loss": 0.1707, + "num_input_tokens_seen": 1311840, + "step": 765 + }, + { + "epoch": 3.736077481840194, + "grad_norm": 0.009355372749269009, + "learning_rate": 0.2997264972590777, + "loss": 0.1561, + "num_input_tokens_seen": 1320192, + "step": 770 + }, + { + "epoch": 3.7602905569007263, + "grad_norm": 0.021087199449539185, + "learning_rate": 0.29972293019291973, + "loss": 0.1513, + "num_input_tokens_seen": 1328672, + "step": 775 + }, + { + "epoch": 3.7845036319612593, + "grad_norm": 0.01526679191738367, + "learning_rate": 0.2997193400376045, + "loss": 0.1655, + "num_input_tokens_seen": 1337184, + "step": 780 + }, + { + "epoch": 3.8087167070217918, + "grad_norm": 0.011955141089856625, + "learning_rate": 0.2997157267936854, + "loss": 0.1687, + "num_input_tokens_seen": 1345760, + "step": 785 + }, + { + "epoch": 3.8329297820823243, + "grad_norm": 0.012527767568826675, + "learning_rate": 0.2997120904617199, + "loss": 0.1875, + "num_input_tokens_seen": 1354240, + "step": 790 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.023824680596590042, + "learning_rate": 0.29970843104226863, + "loss": 0.1763, + "num_input_tokens_seen": 1362528, + "step": 795 + }, + { + "epoch": 3.8813559322033897, + "grad_norm": 0.019424816593527794, + "learning_rate": 0.2997047485358959, + "loss": 0.1791, + "num_input_tokens_seen": 1371040, + "step": 800 + }, + { + "epoch": 3.8813559322033897, + "eval_loss": 0.16065216064453125, + "eval_runtime": 4.6164, + "eval_samples_per_second": 79.499, + "eval_steps_per_second": 19.929, + "num_input_tokens_seen": 1371040, + "step": 800 + }, + { + "epoch": 3.9055690072639226, + "grad_norm": 0.017343295738101006, + "learning_rate": 0.2997010429431697, + "loss": 0.158, + "num_input_tokens_seen": 1379904, + "step": 805 + }, + { + "epoch": 3.929782082324455, + "grad_norm": 0.007367840968072414, + "learning_rate": 0.29969731426466134, + "loss": 0.1537, + "num_input_tokens_seen": 1388416, + "step": 810 + }, + { + "epoch": 3.9539951573849876, + "grad_norm": 0.010721693746745586, + "learning_rate": 0.299693562500946, + "loss": 0.1692, + "num_input_tokens_seen": 1396832, + "step": 815 + }, + { + "epoch": 3.9782082324455206, + "grad_norm": 0.008323339745402336, + "learning_rate": 0.29968978765260207, + "loss": 0.1786, + "num_input_tokens_seen": 1405632, + "step": 820 + }, + { + "epoch": 4.004842615012106, + "grad_norm": 0.04076782613992691, + "learning_rate": 0.2996859897202118, + "loss": 0.1927, + "num_input_tokens_seen": 1414304, + "step": 825 + }, + { + "epoch": 4.0290556900726395, + "grad_norm": 0.016315795481204987, + "learning_rate": 0.2996821687043609, + "loss": 0.1942, + "num_input_tokens_seen": 1423040, + "step": 830 + }, + { + "epoch": 4.053268765133172, + "grad_norm": 0.011546843685209751, + "learning_rate": 0.2996783246056384, + "loss": 0.1551, + "num_input_tokens_seen": 1431520, + "step": 835 + }, + { + "epoch": 4.0774818401937045, + "grad_norm": 0.01259304117411375, + "learning_rate": 0.29967445742463744, + "loss": 0.164, + "num_input_tokens_seen": 1440128, + "step": 840 + }, + { + "epoch": 4.101694915254237, + "grad_norm": 0.018328124657273293, + "learning_rate": 0.29967056716195417, + "loss": 0.1527, + "num_input_tokens_seen": 1448544, + "step": 845 + }, + { + "epoch": 4.12590799031477, + "grad_norm": 0.014159264042973518, + "learning_rate": 0.2996666538181885, + "loss": 0.14, + "num_input_tokens_seen": 1457216, + "step": 850 + }, + { + "epoch": 4.150121065375303, + "grad_norm": 0.011884521692991257, + "learning_rate": 0.29966271739394407, + "loss": 0.1646, + "num_input_tokens_seen": 1466016, + "step": 855 + }, + { + "epoch": 4.174334140435835, + "grad_norm": 0.011800392530858517, + "learning_rate": 0.29965875788982776, + "loss": 0.1333, + "num_input_tokens_seen": 1474016, + "step": 860 + }, + { + "epoch": 4.198547215496368, + "grad_norm": 0.011116903275251389, + "learning_rate": 0.2996547753064503, + "loss": 0.1354, + "num_input_tokens_seen": 1482816, + "step": 865 + }, + { + "epoch": 4.2227602905569, + "grad_norm": 0.021837567910552025, + "learning_rate": 0.29965076964442583, + "loss": 0.1597, + "num_input_tokens_seen": 1491488, + "step": 870 + }, + { + "epoch": 4.246973365617434, + "grad_norm": 0.022325383499264717, + "learning_rate": 0.299646740904372, + "loss": 0.1561, + "num_input_tokens_seen": 1500256, + "step": 875 + }, + { + "epoch": 4.271186440677966, + "grad_norm": 0.02439846657216549, + "learning_rate": 0.29964268908691016, + "loss": 0.1593, + "num_input_tokens_seen": 1508672, + "step": 880 + }, + { + "epoch": 4.295399515738499, + "grad_norm": 0.016201738268136978, + "learning_rate": 0.29963861419266513, + "loss": 0.1543, + "num_input_tokens_seen": 1517216, + "step": 885 + }, + { + "epoch": 4.319612590799031, + "grad_norm": 0.019082296639680862, + "learning_rate": 0.29963451622226533, + "loss": 0.1887, + "num_input_tokens_seen": 1525632, + "step": 890 + }, + { + "epoch": 4.343825665859564, + "grad_norm": 0.01020568422973156, + "learning_rate": 0.29963039517634277, + "loss": 0.1536, + "num_input_tokens_seen": 1534528, + "step": 895 + }, + { + "epoch": 4.368038740920097, + "grad_norm": 0.028411483392119408, + "learning_rate": 0.2996262510555328, + "loss": 0.1919, + "num_input_tokens_seen": 1543072, + "step": 900 + }, + { + "epoch": 4.39225181598063, + "grad_norm": 0.013592520728707314, + "learning_rate": 0.2996220838604746, + "loss": 0.1531, + "num_input_tokens_seen": 1551680, + "step": 905 + }, + { + "epoch": 4.416464891041162, + "grad_norm": 0.03139405697584152, + "learning_rate": 0.29961789359181085, + "loss": 0.2153, + "num_input_tokens_seen": 1559680, + "step": 910 + }, + { + "epoch": 4.440677966101695, + "grad_norm": 0.015007052570581436, + "learning_rate": 0.29961368025018764, + "loss": 0.1649, + "num_input_tokens_seen": 1568192, + "step": 915 + }, + { + "epoch": 4.464891041162228, + "grad_norm": 0.016937602311372757, + "learning_rate": 0.2996094438362548, + "loss": 0.182, + "num_input_tokens_seen": 1576768, + "step": 920 + }, + { + "epoch": 4.4891041162227605, + "grad_norm": 0.008449510671198368, + "learning_rate": 0.2996051843506657, + "loss": 0.1546, + "num_input_tokens_seen": 1585376, + "step": 925 + }, + { + "epoch": 4.513317191283293, + "grad_norm": 0.014277836307883263, + "learning_rate": 0.299600901794077, + "loss": 0.1489, + "num_input_tokens_seen": 1593792, + "step": 930 + }, + { + "epoch": 4.5375302663438255, + "grad_norm": 0.022901875898241997, + "learning_rate": 0.29959659616714923, + "loss": 0.1721, + "num_input_tokens_seen": 1602336, + "step": 935 + }, + { + "epoch": 4.561743341404358, + "grad_norm": 0.019917843863368034, + "learning_rate": 0.2995922674705464, + "loss": 0.1804, + "num_input_tokens_seen": 1610592, + "step": 940 + }, + { + "epoch": 4.585956416464891, + "grad_norm": 0.01246443297713995, + "learning_rate": 0.2995879157049361, + "loss": 0.152, + "num_input_tokens_seen": 1619200, + "step": 945 + }, + { + "epoch": 4.610169491525424, + "grad_norm": 0.015667054802179337, + "learning_rate": 0.2995835408709893, + "loss": 0.1726, + "num_input_tokens_seen": 1627552, + "step": 950 + }, + { + "epoch": 4.634382566585956, + "grad_norm": 0.011403476819396019, + "learning_rate": 0.29957914296938076, + "loss": 0.1562, + "num_input_tokens_seen": 1636128, + "step": 955 + }, + { + "epoch": 4.658595641646489, + "grad_norm": 0.01440424844622612, + "learning_rate": 0.2995747220007886, + "loss": 0.1666, + "num_input_tokens_seen": 1644448, + "step": 960 + }, + { + "epoch": 4.682808716707022, + "grad_norm": 0.009565543383359909, + "learning_rate": 0.2995702779658947, + "loss": 0.1636, + "num_input_tokens_seen": 1652896, + "step": 965 + }, + { + "epoch": 4.707021791767555, + "grad_norm": 0.013080611824989319, + "learning_rate": 0.29956581086538425, + "loss": 0.1724, + "num_input_tokens_seen": 1661888, + "step": 970 + }, + { + "epoch": 4.731234866828087, + "grad_norm": 0.014663955196738243, + "learning_rate": 0.2995613206999462, + "loss": 0.1396, + "num_input_tokens_seen": 1670752, + "step": 975 + }, + { + "epoch": 4.75544794188862, + "grad_norm": 0.016609029844403267, + "learning_rate": 0.29955680747027297, + "loss": 0.1768, + "num_input_tokens_seen": 1679200, + "step": 980 + }, + { + "epoch": 4.779661016949152, + "grad_norm": 0.010175849311053753, + "learning_rate": 0.2995522711770607, + "loss": 0.1407, + "num_input_tokens_seen": 1687712, + "step": 985 + }, + { + "epoch": 4.803874092009686, + "grad_norm": 0.012522423639893532, + "learning_rate": 0.2995477118210087, + "loss": 0.1687, + "num_input_tokens_seen": 1696544, + "step": 990 + }, + { + "epoch": 4.828087167070218, + "grad_norm": 0.012908313423395157, + "learning_rate": 0.29954312940282024, + "loss": 0.1685, + "num_input_tokens_seen": 1705088, + "step": 995 + }, + { + "epoch": 4.852300242130751, + "grad_norm": 0.01197136752307415, + "learning_rate": 0.29953852392320196, + "loss": 0.1444, + "num_input_tokens_seen": 1713440, + "step": 1000 + }, + { + "epoch": 4.852300242130751, + "eval_loss": 0.1526937186717987, + "eval_runtime": 4.6133, + "eval_samples_per_second": 79.552, + "eval_steps_per_second": 19.942, + "num_input_tokens_seen": 1713440, + "step": 1000 + }, + { + "epoch": 4.876513317191283, + "grad_norm": 0.021112551912665367, + "learning_rate": 0.2995338953828641, + "loss": 0.1615, + "num_input_tokens_seen": 1721728, + "step": 1005 + }, + { + "epoch": 4.900726392251816, + "grad_norm": 0.016798578202724457, + "learning_rate": 0.2995292437825204, + "loss": 0.1535, + "num_input_tokens_seen": 1730560, + "step": 1010 + }, + { + "epoch": 4.924939467312349, + "grad_norm": 0.015116257593035698, + "learning_rate": 0.29952456912288816, + "loss": 0.1409, + "num_input_tokens_seen": 1739456, + "step": 1015 + }, + { + "epoch": 4.9491525423728815, + "grad_norm": 0.015726398676633835, + "learning_rate": 0.2995198714046884, + "loss": 0.1714, + "num_input_tokens_seen": 1747936, + "step": 1020 + }, + { + "epoch": 4.973365617433414, + "grad_norm": 0.020122092217206955, + "learning_rate": 0.2995151506286454, + "loss": 0.1775, + "num_input_tokens_seen": 1756032, + "step": 1025 + }, + { + "epoch": 4.9975786924939465, + "grad_norm": 0.022168820723891258, + "learning_rate": 0.2995104067954873, + "loss": 0.1707, + "num_input_tokens_seen": 1764640, + "step": 1030 + }, + { + "epoch": 5.0242130750605325, + "grad_norm": 0.042415957897901535, + "learning_rate": 0.2995056399059456, + "loss": 0.2931, + "num_input_tokens_seen": 1773632, + "step": 1035 + }, + { + "epoch": 5.048426150121065, + "grad_norm": 0.019460385665297508, + "learning_rate": 0.2995008499607554, + "loss": 0.1785, + "num_input_tokens_seen": 1781888, + "step": 1040 + }, + { + "epoch": 5.072639225181598, + "grad_norm": 0.009444390423595905, + "learning_rate": 0.2994960369606554, + "loss": 0.164, + "num_input_tokens_seen": 1790048, + "step": 1045 + }, + { + "epoch": 5.096852300242131, + "grad_norm": 0.014936824329197407, + "learning_rate": 0.2994912009063878, + "loss": 0.1508, + "num_input_tokens_seen": 1798624, + "step": 1050 + }, + { + "epoch": 5.121065375302663, + "grad_norm": 0.011253643780946732, + "learning_rate": 0.29948634179869843, + "loss": 0.1701, + "num_input_tokens_seen": 1807200, + "step": 1055 + }, + { + "epoch": 5.145278450363196, + "grad_norm": 0.013002830557525158, + "learning_rate": 0.29948145963833656, + "loss": 0.1683, + "num_input_tokens_seen": 1815456, + "step": 1060 + }, + { + "epoch": 5.169491525423728, + "grad_norm": 0.018141621723771095, + "learning_rate": 0.29947655442605514, + "loss": 0.1387, + "num_input_tokens_seen": 1824224, + "step": 1065 + }, + { + "epoch": 5.193704600484262, + "grad_norm": 0.011396532878279686, + "learning_rate": 0.2994716261626106, + "loss": 0.1739, + "num_input_tokens_seen": 1832992, + "step": 1070 + }, + { + "epoch": 5.217917675544794, + "grad_norm": 0.010456476360559464, + "learning_rate": 0.2994666748487629, + "loss": 0.1419, + "num_input_tokens_seen": 1841760, + "step": 1075 + }, + { + "epoch": 5.242130750605327, + "grad_norm": 0.01158500649034977, + "learning_rate": 0.2994617004852756, + "loss": 0.1404, + "num_input_tokens_seen": 1850688, + "step": 1080 + }, + { + "epoch": 5.266343825665859, + "grad_norm": 0.009113197214901447, + "learning_rate": 0.2994567030729159, + "loss": 0.1261, + "num_input_tokens_seen": 1858816, + "step": 1085 + }, + { + "epoch": 5.290556900726393, + "grad_norm": 0.01002470776438713, + "learning_rate": 0.29945168261245436, + "loss": 0.1547, + "num_input_tokens_seen": 1867584, + "step": 1090 + }, + { + "epoch": 5.314769975786925, + "grad_norm": 0.013098735362291336, + "learning_rate": 0.29944663910466524, + "loss": 0.1485, + "num_input_tokens_seen": 1875840, + "step": 1095 + }, + { + "epoch": 5.338983050847458, + "grad_norm": 0.013616028241813183, + "learning_rate": 0.2994415725503263, + "loss": 0.155, + "num_input_tokens_seen": 1884000, + "step": 1100 + }, + { + "epoch": 5.36319612590799, + "grad_norm": 0.015490202233195305, + "learning_rate": 0.29943648295021885, + "loss": 0.1588, + "num_input_tokens_seen": 1892640, + "step": 1105 + }, + { + "epoch": 5.387409200968523, + "grad_norm": 0.012093933299183846, + "learning_rate": 0.2994313703051278, + "loss": 0.1529, + "num_input_tokens_seen": 1901152, + "step": 1110 + }, + { + "epoch": 5.411622276029056, + "grad_norm": 0.01955879107117653, + "learning_rate": 0.29942623461584156, + "loss": 0.1781, + "num_input_tokens_seen": 1909792, + "step": 1115 + }, + { + "epoch": 5.4358353510895885, + "grad_norm": 0.016249684616923332, + "learning_rate": 0.29942107588315214, + "loss": 0.1502, + "num_input_tokens_seen": 1918688, + "step": 1120 + }, + { + "epoch": 5.460048426150121, + "grad_norm": 0.014873219653964043, + "learning_rate": 0.29941589410785513, + "loss": 0.1813, + "num_input_tokens_seen": 1927264, + "step": 1125 + }, + { + "epoch": 5.4842615012106535, + "grad_norm": 0.013536624610424042, + "learning_rate": 0.29941068929074954, + "loss": 0.1529, + "num_input_tokens_seen": 1935744, + "step": 1130 + }, + { + "epoch": 5.508474576271187, + "grad_norm": 0.011873905546963215, + "learning_rate": 0.2994054614326381, + "loss": 0.161, + "num_input_tokens_seen": 1944064, + "step": 1135 + }, + { + "epoch": 5.532687651331719, + "grad_norm": 0.012496971525251865, + "learning_rate": 0.29940021053432686, + "loss": 0.1888, + "num_input_tokens_seen": 1952768, + "step": 1140 + }, + { + "epoch": 5.556900726392252, + "grad_norm": 0.020763888955116272, + "learning_rate": 0.29939493659662575, + "loss": 0.1806, + "num_input_tokens_seen": 1961504, + "step": 1145 + }, + { + "epoch": 5.581113801452784, + "grad_norm": 0.009508232586085796, + "learning_rate": 0.299389639620348, + "loss": 0.162, + "num_input_tokens_seen": 1969920, + "step": 1150 + }, + { + "epoch": 5.605326876513317, + "grad_norm": 0.02023990824818611, + "learning_rate": 0.29938431960631046, + "loss": 0.1876, + "num_input_tokens_seen": 1978432, + "step": 1155 + }, + { + "epoch": 5.62953995157385, + "grad_norm": 0.009783950634300709, + "learning_rate": 0.2993789765553335, + "loss": 0.1707, + "num_input_tokens_seen": 1986464, + "step": 1160 + }, + { + "epoch": 5.653753026634383, + "grad_norm": 0.009784811176359653, + "learning_rate": 0.2993736104682412, + "loss": 0.1832, + "num_input_tokens_seen": 1995520, + "step": 1165 + }, + { + "epoch": 5.677966101694915, + "grad_norm": 0.009572578594088554, + "learning_rate": 0.299368221345861, + "loss": 0.1534, + "num_input_tokens_seen": 2004032, + "step": 1170 + }, + { + "epoch": 5.702179176755448, + "grad_norm": 0.010823666118085384, + "learning_rate": 0.29936280918902397, + "loss": 0.1642, + "num_input_tokens_seen": 2013120, + "step": 1175 + }, + { + "epoch": 5.72639225181598, + "grad_norm": 0.013940390199422836, + "learning_rate": 0.2993573739985648, + "loss": 0.1531, + "num_input_tokens_seen": 2021888, + "step": 1180 + }, + { + "epoch": 5.750605326876514, + "grad_norm": 0.011700445786118507, + "learning_rate": 0.2993519157753216, + "loss": 0.1515, + "num_input_tokens_seen": 2030208, + "step": 1185 + }, + { + "epoch": 5.774818401937046, + "grad_norm": 0.013676813803613186, + "learning_rate": 0.2993464345201361, + "loss": 0.1501, + "num_input_tokens_seen": 2039072, + "step": 1190 + }, + { + "epoch": 5.799031476997579, + "grad_norm": 0.02336663194000721, + "learning_rate": 0.2993409302338536, + "loss": 0.1584, + "num_input_tokens_seen": 2047840, + "step": 1195 + }, + { + "epoch": 5.823244552058111, + "grad_norm": 0.013788881711661816, + "learning_rate": 0.2993354029173229, + "loss": 0.1477, + "num_input_tokens_seen": 2056384, + "step": 1200 + }, + { + "epoch": 5.823244552058111, + "eval_loss": 0.15967370569705963, + "eval_runtime": 4.6364, + "eval_samples_per_second": 79.156, + "eval_steps_per_second": 19.843, + "num_input_tokens_seen": 2056384, + "step": 1200 + }, + { + "epoch": 5.847457627118644, + "grad_norm": 0.013135431334376335, + "learning_rate": 0.2993298525713965, + "loss": 0.1367, + "num_input_tokens_seen": 2065280, + "step": 1205 + }, + { + "epoch": 5.871670702179177, + "grad_norm": 0.015313195995986462, + "learning_rate": 0.29932427919693017, + "loss": 0.1602, + "num_input_tokens_seen": 2074112, + "step": 1210 + }, + { + "epoch": 5.8958837772397095, + "grad_norm": 0.015994783490896225, + "learning_rate": 0.2993186827947834, + "loss": 0.1768, + "num_input_tokens_seen": 2082720, + "step": 1215 + }, + { + "epoch": 5.920096852300242, + "grad_norm": 0.014032437466084957, + "learning_rate": 0.2993130633658194, + "loss": 0.1716, + "num_input_tokens_seen": 2091232, + "step": 1220 + }, + { + "epoch": 5.9443099273607745, + "grad_norm": 0.02114204131066799, + "learning_rate": 0.29930742091090456, + "loss": 0.1865, + "num_input_tokens_seen": 2100064, + "step": 1225 + }, + { + "epoch": 5.968523002421308, + "grad_norm": 0.01258323434740305, + "learning_rate": 0.29930175543090914, + "loss": 0.1886, + "num_input_tokens_seen": 2108320, + "step": 1230 + }, + { + "epoch": 5.99273607748184, + "grad_norm": 0.011027333326637745, + "learning_rate": 0.2992960669267068, + "loss": 0.1821, + "num_input_tokens_seen": 2116608, + "step": 1235 + }, + { + "epoch": 6.019370460048426, + "grad_norm": 0.01147460751235485, + "learning_rate": 0.29929035539917476, + "loss": 0.1886, + "num_input_tokens_seen": 2126016, + "step": 1240 + }, + { + "epoch": 6.043583535108959, + "grad_norm": 0.010342691093683243, + "learning_rate": 0.2992846208491938, + "loss": 0.1382, + "num_input_tokens_seen": 2134240, + "step": 1245 + }, + { + "epoch": 6.067796610169491, + "grad_norm": 0.014109786599874496, + "learning_rate": 0.2992788632776483, + "loss": 0.1417, + "num_input_tokens_seen": 2142976, + "step": 1250 + }, + { + "epoch": 6.092009685230024, + "grad_norm": 0.020979657769203186, + "learning_rate": 0.29927308268542613, + "loss": 0.1306, + "num_input_tokens_seen": 2151392, + "step": 1255 + }, + { + "epoch": 6.116222760290557, + "grad_norm": 0.01883854903280735, + "learning_rate": 0.2992672790734187, + "loss": 0.1548, + "num_input_tokens_seen": 2160288, + "step": 1260 + }, + { + "epoch": 6.14043583535109, + "grad_norm": 0.018721817061305046, + "learning_rate": 0.299261452442521, + "loss": 0.1346, + "num_input_tokens_seen": 2168896, + "step": 1265 + }, + { + "epoch": 6.164648910411622, + "grad_norm": 0.02052600122988224, + "learning_rate": 0.29925560279363167, + "loss": 0.1574, + "num_input_tokens_seen": 2177312, + "step": 1270 + }, + { + "epoch": 6.188861985472155, + "grad_norm": 0.014110010117292404, + "learning_rate": 0.29924973012765266, + "loss": 0.1292, + "num_input_tokens_seen": 2185920, + "step": 1275 + }, + { + "epoch": 6.213075060532688, + "grad_norm": 0.03836653381586075, + "learning_rate": 0.29924383444548974, + "loss": 0.1401, + "num_input_tokens_seen": 2194336, + "step": 1280 + }, + { + "epoch": 6.237288135593221, + "grad_norm": 0.016516393050551414, + "learning_rate": 0.299237915748052, + "loss": 0.1682, + "num_input_tokens_seen": 2203040, + "step": 1285 + }, + { + "epoch": 6.261501210653753, + "grad_norm": 0.030387261882424355, + "learning_rate": 0.2992319740362522, + "loss": 0.1628, + "num_input_tokens_seen": 2211744, + "step": 1290 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.021053893491625786, + "learning_rate": 0.2992260093110066, + "loss": 0.1818, + "num_input_tokens_seen": 2219936, + "step": 1295 + }, + { + "epoch": 6.309927360774818, + "grad_norm": 0.020579230040311813, + "learning_rate": 0.2992200215732352, + "loss": 0.1644, + "num_input_tokens_seen": 2228800, + "step": 1300 + }, + { + "epoch": 6.3341404358353515, + "grad_norm": 0.026494912803173065, + "learning_rate": 0.2992140108238611, + "loss": 0.1814, + "num_input_tokens_seen": 2237344, + "step": 1305 + }, + { + "epoch": 6.358353510895884, + "grad_norm": 0.012523140758275986, + "learning_rate": 0.2992079770638115, + "loss": 0.1801, + "num_input_tokens_seen": 2245920, + "step": 1310 + }, + { + "epoch": 6.3825665859564165, + "grad_norm": 0.009657776914536953, + "learning_rate": 0.29920192029401677, + "loss": 0.1642, + "num_input_tokens_seen": 2254816, + "step": 1315 + }, + { + "epoch": 6.406779661016949, + "grad_norm": 0.015766702592372894, + "learning_rate": 0.2991958405154109, + "loss": 0.161, + "num_input_tokens_seen": 2262976, + "step": 1320 + }, + { + "epoch": 6.4309927360774815, + "grad_norm": 0.012859159149229527, + "learning_rate": 0.29918973772893154, + "loss": 0.1758, + "num_input_tokens_seen": 2271264, + "step": 1325 + }, + { + "epoch": 6.455205811138015, + "grad_norm": 0.014796551316976547, + "learning_rate": 0.29918361193551973, + "loss": 0.1563, + "num_input_tokens_seen": 2279392, + "step": 1330 + }, + { + "epoch": 6.479418886198547, + "grad_norm": 0.008321371860802174, + "learning_rate": 0.29917746313612026, + "loss": 0.1692, + "num_input_tokens_seen": 2287392, + "step": 1335 + }, + { + "epoch": 6.50363196125908, + "grad_norm": 0.01179565954953432, + "learning_rate": 0.29917129133168124, + "loss": 0.1542, + "num_input_tokens_seen": 2296032, + "step": 1340 + }, + { + "epoch": 6.527845036319612, + "grad_norm": 0.013314268551766872, + "learning_rate": 0.2991650965231546, + "loss": 0.1392, + "num_input_tokens_seen": 2304640, + "step": 1345 + }, + { + "epoch": 6.552058111380145, + "grad_norm": 0.01162253599613905, + "learning_rate": 0.29915887871149544, + "loss": 0.1554, + "num_input_tokens_seen": 2313024, + "step": 1350 + }, + { + "epoch": 6.576271186440678, + "grad_norm": 0.012460592202842236, + "learning_rate": 0.2991526378976628, + "loss": 0.1387, + "num_input_tokens_seen": 2321408, + "step": 1355 + }, + { + "epoch": 6.600484261501211, + "grad_norm": 0.016798684373497963, + "learning_rate": 0.29914637408261896, + "loss": 0.1635, + "num_input_tokens_seen": 2329824, + "step": 1360 + }, + { + "epoch": 6.624697336561743, + "grad_norm": 0.017247390002012253, + "learning_rate": 0.29914008726733, + "loss": 0.1591, + "num_input_tokens_seen": 2338624, + "step": 1365 + }, + { + "epoch": 6.648910411622276, + "grad_norm": 0.021171437576413155, + "learning_rate": 0.2991337774527653, + "loss": 0.1395, + "num_input_tokens_seen": 2347424, + "step": 1370 + }, + { + "epoch": 6.673123486682809, + "grad_norm": 0.01715819537639618, + "learning_rate": 0.2991274446398981, + "loss": 0.1774, + "num_input_tokens_seen": 2356224, + "step": 1375 + }, + { + "epoch": 6.697336561743342, + "grad_norm": 0.020877547562122345, + "learning_rate": 0.29912108882970484, + "loss": 0.1607, + "num_input_tokens_seen": 2365216, + "step": 1380 + }, + { + "epoch": 6.721549636803874, + "grad_norm": 0.020483125001192093, + "learning_rate": 0.2991147100231657, + "loss": 0.1535, + "num_input_tokens_seen": 2373984, + "step": 1385 + }, + { + "epoch": 6.745762711864407, + "grad_norm": 0.02383432909846306, + "learning_rate": 0.2991083082212644, + "loss": 0.1722, + "num_input_tokens_seen": 2382720, + "step": 1390 + }, + { + "epoch": 6.76997578692494, + "grad_norm": 0.013079582713544369, + "learning_rate": 0.2991018834249881, + "loss": 0.1739, + "num_input_tokens_seen": 2391392, + "step": 1395 + }, + { + "epoch": 6.7941888619854724, + "grad_norm": 0.012568505480885506, + "learning_rate": 0.29909543563532764, + "loss": 0.1451, + "num_input_tokens_seen": 2400544, + "step": 1400 + }, + { + "epoch": 6.7941888619854724, + "eval_loss": 0.15691396594047546, + "eval_runtime": 4.6165, + "eval_samples_per_second": 79.498, + "eval_steps_per_second": 19.929, + "num_input_tokens_seen": 2400544, + "step": 1400 + }, + { + "epoch": 6.818401937046005, + "grad_norm": 0.013082708232104778, + "learning_rate": 0.29908896485327746, + "loss": 0.1469, + "num_input_tokens_seen": 2408928, + "step": 1405 + }, + { + "epoch": 6.842615012106537, + "grad_norm": 0.011572941206395626, + "learning_rate": 0.29908247107983527, + "loss": 0.1672, + "num_input_tokens_seen": 2417472, + "step": 1410 + }, + { + "epoch": 6.86682808716707, + "grad_norm": 0.010674682445824146, + "learning_rate": 0.29907595431600253, + "loss": 0.1828, + "num_input_tokens_seen": 2425984, + "step": 1415 + }, + { + "epoch": 6.891041162227603, + "grad_norm": 0.00917674321681261, + "learning_rate": 0.29906941456278424, + "loss": 0.1391, + "num_input_tokens_seen": 2434656, + "step": 1420 + }, + { + "epoch": 6.915254237288136, + "grad_norm": 0.012031454592943192, + "learning_rate": 0.2990628518211889, + "loss": 0.1756, + "num_input_tokens_seen": 2443072, + "step": 1425 + }, + { + "epoch": 6.939467312348668, + "grad_norm": 0.012429754249751568, + "learning_rate": 0.2990562660922286, + "loss": 0.1654, + "num_input_tokens_seen": 2451168, + "step": 1430 + }, + { + "epoch": 6.963680387409201, + "grad_norm": 0.02001038007438183, + "learning_rate": 0.2990496573769189, + "loss": 0.1821, + "num_input_tokens_seen": 2459488, + "step": 1435 + }, + { + "epoch": 6.987893462469733, + "grad_norm": 0.009715056978166103, + "learning_rate": 0.29904302567627894, + "loss": 0.1414, + "num_input_tokens_seen": 2467776, + "step": 1440 + }, + { + "epoch": 7.014527845036319, + "grad_norm": 0.012346748262643814, + "learning_rate": 0.2990363709913314, + "loss": 0.1542, + "num_input_tokens_seen": 2476544, + "step": 1445 + }, + { + "epoch": 7.038740920096853, + "grad_norm": 0.021277397871017456, + "learning_rate": 0.29902969332310264, + "loss": 0.1409, + "num_input_tokens_seen": 2484864, + "step": 1450 + }, + { + "epoch": 7.062953995157385, + "grad_norm": 0.013476619496941566, + "learning_rate": 0.2990229926726223, + "loss": 0.1446, + "num_input_tokens_seen": 2493696, + "step": 1455 + }, + { + "epoch": 7.087167070217918, + "grad_norm": 0.013873101212084293, + "learning_rate": 0.29901626904092365, + "loss": 0.1311, + "num_input_tokens_seen": 2502112, + "step": 1460 + }, + { + "epoch": 7.11138014527845, + "grad_norm": 0.017765752971172333, + "learning_rate": 0.2990095224290438, + "loss": 0.1316, + "num_input_tokens_seen": 2510592, + "step": 1465 + }, + { + "epoch": 7.135593220338983, + "grad_norm": 0.011004822328686714, + "learning_rate": 0.29900275283802297, + "loss": 0.136, + "num_input_tokens_seen": 2518944, + "step": 1470 + }, + { + "epoch": 7.159806295399516, + "grad_norm": 0.024762079119682312, + "learning_rate": 0.2989959602689051, + "loss": 0.1539, + "num_input_tokens_seen": 2527328, + "step": 1475 + }, + { + "epoch": 7.184019370460049, + "grad_norm": 0.015447047539055347, + "learning_rate": 0.2989891447227379, + "loss": 0.1397, + "num_input_tokens_seen": 2535776, + "step": 1480 + }, + { + "epoch": 7.208232445520581, + "grad_norm": 0.018114956095814705, + "learning_rate": 0.29898230620057215, + "loss": 0.1433, + "num_input_tokens_seen": 2544288, + "step": 1485 + }, + { + "epoch": 7.232445520581114, + "grad_norm": 0.019417274743318558, + "learning_rate": 0.2989754447034626, + "loss": 0.1597, + "num_input_tokens_seen": 2552864, + "step": 1490 + }, + { + "epoch": 7.256658595641646, + "grad_norm": 0.013782854191958904, + "learning_rate": 0.2989685602324673, + "loss": 0.1413, + "num_input_tokens_seen": 2561408, + "step": 1495 + }, + { + "epoch": 7.280871670702179, + "grad_norm": 0.018303433433175087, + "learning_rate": 0.298961652788648, + "loss": 0.1485, + "num_input_tokens_seen": 2570048, + "step": 1500 + }, + { + "epoch": 7.305084745762712, + "grad_norm": 0.014770775102078915, + "learning_rate": 0.29895472237306986, + "loss": 0.1224, + "num_input_tokens_seen": 2578624, + "step": 1505 + }, + { + "epoch": 7.329297820823244, + "grad_norm": 0.021745482459664345, + "learning_rate": 0.29894776898680164, + "loss": 0.1534, + "num_input_tokens_seen": 2587136, + "step": 1510 + }, + { + "epoch": 7.353510895883777, + "grad_norm": 0.019123204052448273, + "learning_rate": 0.29894079263091566, + "loss": 0.1738, + "num_input_tokens_seen": 2595936, + "step": 1515 + }, + { + "epoch": 7.37772397094431, + "grad_norm": 0.012657473795115948, + "learning_rate": 0.2989337933064877, + "loss": 0.1726, + "num_input_tokens_seen": 2604096, + "step": 1520 + }, + { + "epoch": 7.401937046004843, + "grad_norm": 0.009013009257614613, + "learning_rate": 0.29892677101459725, + "loss": 0.1483, + "num_input_tokens_seen": 2612416, + "step": 1525 + }, + { + "epoch": 7.426150121065375, + "grad_norm": 0.011755028739571571, + "learning_rate": 0.2989197257563272, + "loss": 0.1422, + "num_input_tokens_seen": 2621280, + "step": 1530 + }, + { + "epoch": 7.450363196125908, + "grad_norm": 0.02171889692544937, + "learning_rate": 0.2989126575327639, + "loss": 0.1574, + "num_input_tokens_seen": 2629728, + "step": 1535 + }, + { + "epoch": 7.47457627118644, + "grad_norm": 0.012835530564188957, + "learning_rate": 0.29890556634499754, + "loss": 0.1494, + "num_input_tokens_seen": 2638464, + "step": 1540 + }, + { + "epoch": 7.498789346246974, + "grad_norm": 0.009123477153480053, + "learning_rate": 0.2988984521941216, + "loss": 0.1465, + "num_input_tokens_seen": 2646688, + "step": 1545 + }, + { + "epoch": 7.523002421307506, + "grad_norm": 0.011296091601252556, + "learning_rate": 0.29889131508123307, + "loss": 0.1573, + "num_input_tokens_seen": 2655200, + "step": 1550 + }, + { + "epoch": 7.547215496368039, + "grad_norm": 0.015012518502771854, + "learning_rate": 0.2988841550074327, + "loss": 0.1701, + "num_input_tokens_seen": 2663808, + "step": 1555 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 0.01414725836366415, + "learning_rate": 0.2988769719738246, + "loss": 0.158, + "num_input_tokens_seen": 2672096, + "step": 1560 + }, + { + "epoch": 7.595641646489105, + "grad_norm": 0.012410610914230347, + "learning_rate": 0.29886976598151666, + "loss": 0.1555, + "num_input_tokens_seen": 2680928, + "step": 1565 + }, + { + "epoch": 7.619854721549637, + "grad_norm": 0.01596769131720066, + "learning_rate": 0.29886253703161986, + "loss": 0.1438, + "num_input_tokens_seen": 2690016, + "step": 1570 + }, + { + "epoch": 7.6440677966101696, + "grad_norm": 0.01540317852050066, + "learning_rate": 0.29885528512524917, + "loss": 0.1458, + "num_input_tokens_seen": 2698880, + "step": 1575 + }, + { + "epoch": 7.668280871670702, + "grad_norm": 0.01642509177327156, + "learning_rate": 0.29884801026352287, + "loss": 0.1532, + "num_input_tokens_seen": 2707136, + "step": 1580 + }, + { + "epoch": 7.6924939467312345, + "grad_norm": 0.010152550414204597, + "learning_rate": 0.2988407124475629, + "loss": 0.1399, + "num_input_tokens_seen": 2716160, + "step": 1585 + }, + { + "epoch": 7.716707021791768, + "grad_norm": 0.01443334762006998, + "learning_rate": 0.2988333916784945, + "loss": 0.1553, + "num_input_tokens_seen": 2724736, + "step": 1590 + }, + { + "epoch": 7.7409200968523, + "grad_norm": 0.01359404157847166, + "learning_rate": 0.2988260479574468, + "loss": 0.1566, + "num_input_tokens_seen": 2732704, + "step": 1595 + }, + { + "epoch": 7.765133171912833, + "grad_norm": 0.014635490253567696, + "learning_rate": 0.2988186812855523, + "loss": 0.1515, + "num_input_tokens_seen": 2741344, + "step": 1600 + }, + { + "epoch": 7.765133171912833, + "eval_loss": 0.17347173392772675, + "eval_runtime": 4.6188, + "eval_samples_per_second": 79.457, + "eval_steps_per_second": 19.918, + "num_input_tokens_seen": 2741344, + "step": 1600 + }, + { + "epoch": 7.789346246973365, + "grad_norm": 0.018409578129649162, + "learning_rate": 0.29881129166394693, + "loss": 0.1583, + "num_input_tokens_seen": 2750176, + "step": 1605 + }, + { + "epoch": 7.813559322033898, + "grad_norm": 0.015572583302855492, + "learning_rate": 0.29880387909377026, + "loss": 0.1727, + "num_input_tokens_seen": 2758624, + "step": 1610 + }, + { + "epoch": 7.837772397094431, + "grad_norm": 0.032998353242874146, + "learning_rate": 0.2987964435761655, + "loss": 0.1695, + "num_input_tokens_seen": 2767040, + "step": 1615 + }, + { + "epoch": 7.861985472154964, + "grad_norm": 0.01389442477375269, + "learning_rate": 0.29878898511227925, + "loss": 0.1847, + "num_input_tokens_seen": 2775872, + "step": 1620 + }, + { + "epoch": 7.886198547215496, + "grad_norm": 0.01210673525929451, + "learning_rate": 0.2987815037032617, + "loss": 0.1705, + "num_input_tokens_seen": 2784256, + "step": 1625 + }, + { + "epoch": 7.910411622276029, + "grad_norm": 0.01074386015534401, + "learning_rate": 0.29877399935026655, + "loss": 0.1954, + "num_input_tokens_seen": 2792704, + "step": 1630 + }, + { + "epoch": 7.934624697336561, + "grad_norm": 0.010396549478173256, + "learning_rate": 0.2987664720544511, + "loss": 0.171, + "num_input_tokens_seen": 2801280, + "step": 1635 + }, + { + "epoch": 7.958837772397095, + "grad_norm": 0.010461622849106789, + "learning_rate": 0.2987589218169761, + "loss": 0.1382, + "num_input_tokens_seen": 2809984, + "step": 1640 + }, + { + "epoch": 7.983050847457627, + "grad_norm": 0.009640945121645927, + "learning_rate": 0.29875134863900604, + "loss": 0.1374, + "num_input_tokens_seen": 2818496, + "step": 1645 + }, + { + "epoch": 8.009685230024212, + "grad_norm": 0.009369813837110996, + "learning_rate": 0.29874375252170865, + "loss": 0.166, + "num_input_tokens_seen": 2827584, + "step": 1650 + }, + { + "epoch": 8.033898305084746, + "grad_norm": 0.008336028084158897, + "learning_rate": 0.2987361334662553, + "loss": 0.1681, + "num_input_tokens_seen": 2836064, + "step": 1655 + }, + { + "epoch": 8.058111380145279, + "grad_norm": 0.010196635499596596, + "learning_rate": 0.29872849147382113, + "loss": 0.1922, + "num_input_tokens_seen": 2844608, + "step": 1660 + }, + { + "epoch": 8.08232445520581, + "grad_norm": 0.010667513124644756, + "learning_rate": 0.2987208265455845, + "loss": 0.1547, + "num_input_tokens_seen": 2853024, + "step": 1665 + }, + { + "epoch": 8.106537530266344, + "grad_norm": 0.010443334467709064, + "learning_rate": 0.29871313868272753, + "loss": 0.1464, + "num_input_tokens_seen": 2861504, + "step": 1670 + }, + { + "epoch": 8.130750605326876, + "grad_norm": 0.010370592586696148, + "learning_rate": 0.29870542788643567, + "loss": 0.1529, + "num_input_tokens_seen": 2869952, + "step": 1675 + }, + { + "epoch": 8.154963680387409, + "grad_norm": 0.013643348589539528, + "learning_rate": 0.2986976941578981, + "loss": 0.1599, + "num_input_tokens_seen": 2878592, + "step": 1680 + }, + { + "epoch": 8.179176755447942, + "grad_norm": 0.008508515544235706, + "learning_rate": 0.29868993749830747, + "loss": 0.1174, + "num_input_tokens_seen": 2887168, + "step": 1685 + }, + { + "epoch": 8.203389830508474, + "grad_norm": 0.017143074423074722, + "learning_rate": 0.2986821579088598, + "loss": 0.131, + "num_input_tokens_seen": 2896064, + "step": 1690 + }, + { + "epoch": 8.227602905569007, + "grad_norm": 0.01622709445655346, + "learning_rate": 0.29867435539075504, + "loss": 0.138, + "num_input_tokens_seen": 2904448, + "step": 1695 + }, + { + "epoch": 8.25181598062954, + "grad_norm": 0.017753146588802338, + "learning_rate": 0.2986665299451963, + "loss": 0.1293, + "num_input_tokens_seen": 2912832, + "step": 1700 + }, + { + "epoch": 8.276029055690072, + "grad_norm": 0.02689751423895359, + "learning_rate": 0.29865868157339037, + "loss": 0.1464, + "num_input_tokens_seen": 2921280, + "step": 1705 + }, + { + "epoch": 8.300242130750606, + "grad_norm": 0.017727667465806007, + "learning_rate": 0.2986508102765476, + "loss": 0.1476, + "num_input_tokens_seen": 2929568, + "step": 1710 + }, + { + "epoch": 8.324455205811137, + "grad_norm": 0.016247961670160294, + "learning_rate": 0.2986429160558818, + "loss": 0.1513, + "num_input_tokens_seen": 2937760, + "step": 1715 + }, + { + "epoch": 8.34866828087167, + "grad_norm": 0.013560589402914047, + "learning_rate": 0.2986349989126104, + "loss": 0.1329, + "num_input_tokens_seen": 2946656, + "step": 1720 + }, + { + "epoch": 8.372881355932204, + "grad_norm": 0.020893597975373268, + "learning_rate": 0.29862705884795426, + "loss": 0.1719, + "num_input_tokens_seen": 2955200, + "step": 1725 + }, + { + "epoch": 8.397094430992736, + "grad_norm": 0.027956776320934296, + "learning_rate": 0.2986190958631379, + "loss": 0.1665, + "num_input_tokens_seen": 2963776, + "step": 1730 + }, + { + "epoch": 8.42130750605327, + "grad_norm": 0.01724022626876831, + "learning_rate": 0.29861110995938933, + "loss": 0.1507, + "num_input_tokens_seen": 2972224, + "step": 1735 + }, + { + "epoch": 8.4455205811138, + "grad_norm": 0.01795484498143196, + "learning_rate": 0.29860310113794, + "loss": 0.1308, + "num_input_tokens_seen": 2980896, + "step": 1740 + }, + { + "epoch": 8.469733656174334, + "grad_norm": 0.011120525188744068, + "learning_rate": 0.29859506940002506, + "loss": 0.1298, + "num_input_tokens_seen": 2989728, + "step": 1745 + }, + { + "epoch": 8.493946731234868, + "grad_norm": 0.011389891617000103, + "learning_rate": 0.298587014746883, + "loss": 0.1412, + "num_input_tokens_seen": 2998176, + "step": 1750 + }, + { + "epoch": 8.5181598062954, + "grad_norm": 0.01709909364581108, + "learning_rate": 0.298578937179756, + "loss": 0.145, + "num_input_tokens_seen": 3006656, + "step": 1755 + }, + { + "epoch": 8.542372881355933, + "grad_norm": 0.018022816628217697, + "learning_rate": 0.29857083669988976, + "loss": 0.1586, + "num_input_tokens_seen": 3015072, + "step": 1760 + }, + { + "epoch": 8.566585956416464, + "grad_norm": 0.028391076251864433, + "learning_rate": 0.29856271330853346, + "loss": 0.1627, + "num_input_tokens_seen": 3024000, + "step": 1765 + }, + { + "epoch": 8.590799031476998, + "grad_norm": 0.01699059084057808, + "learning_rate": 0.2985545670069398, + "loss": 0.1488, + "num_input_tokens_seen": 3032448, + "step": 1770 + }, + { + "epoch": 8.615012106537531, + "grad_norm": 0.027171460911631584, + "learning_rate": 0.29854639779636505, + "loss": 0.1389, + "num_input_tokens_seen": 3041024, + "step": 1775 + }, + { + "epoch": 8.639225181598063, + "grad_norm": 0.020306358113884926, + "learning_rate": 0.298538205678069, + "loss": 0.1852, + "num_input_tokens_seen": 3049504, + "step": 1780 + }, + { + "epoch": 8.663438256658596, + "grad_norm": 0.010174267925322056, + "learning_rate": 0.298529990653315, + "loss": 0.1693, + "num_input_tokens_seen": 3057824, + "step": 1785 + }, + { + "epoch": 8.687651331719128, + "grad_norm": 0.008912432007491589, + "learning_rate": 0.29852175272336984, + "loss": 0.1969, + "num_input_tokens_seen": 3066368, + "step": 1790 + }, + { + "epoch": 8.711864406779661, + "grad_norm": 0.013427253812551498, + "learning_rate": 0.29851349188950405, + "loss": 0.1766, + "num_input_tokens_seen": 3074880, + "step": 1795 + }, + { + "epoch": 8.736077481840194, + "grad_norm": 0.014528544619679451, + "learning_rate": 0.2985052081529914, + "loss": 0.1643, + "num_input_tokens_seen": 3083872, + "step": 1800 + }, + { + "epoch": 8.736077481840194, + "eval_loss": 0.16357342898845673, + "eval_runtime": 4.6157, + "eval_samples_per_second": 79.511, + "eval_steps_per_second": 19.932, + "num_input_tokens_seen": 3083872, + "step": 1800 + }, + { + "epoch": 8.760290556900726, + "grad_norm": 0.009951440617442131, + "learning_rate": 0.29849690151510944, + "loss": 0.1841, + "num_input_tokens_seen": 3092384, + "step": 1805 + }, + { + "epoch": 8.78450363196126, + "grad_norm": 0.012479634024202824, + "learning_rate": 0.2984885719771392, + "loss": 0.1745, + "num_input_tokens_seen": 3100704, + "step": 1810 + }, + { + "epoch": 8.80871670702179, + "grad_norm": 0.009403201751410961, + "learning_rate": 0.2984802195403651, + "loss": 0.1641, + "num_input_tokens_seen": 3109248, + "step": 1815 + }, + { + "epoch": 8.832929782082324, + "grad_norm": 0.0098939323797822, + "learning_rate": 0.2984718442060752, + "loss": 0.1571, + "num_input_tokens_seen": 3117952, + "step": 1820 + }, + { + "epoch": 8.857142857142858, + "grad_norm": 0.00656508281826973, + "learning_rate": 0.2984634459755611, + "loss": 0.1461, + "num_input_tokens_seen": 3126624, + "step": 1825 + }, + { + "epoch": 8.88135593220339, + "grad_norm": 0.010143778286874294, + "learning_rate": 0.29845502485011793, + "loss": 0.1434, + "num_input_tokens_seen": 3135360, + "step": 1830 + }, + { + "epoch": 8.905569007263923, + "grad_norm": 0.009224778041243553, + "learning_rate": 0.2984465808310444, + "loss": 0.1518, + "num_input_tokens_seen": 3143872, + "step": 1835 + }, + { + "epoch": 8.929782082324456, + "grad_norm": 0.009734573774039745, + "learning_rate": 0.29843811391964253, + "loss": 0.1559, + "num_input_tokens_seen": 3152096, + "step": 1840 + }, + { + "epoch": 8.953995157384988, + "grad_norm": 0.014505796134471893, + "learning_rate": 0.2984296241172182, + "loss": 0.1805, + "num_input_tokens_seen": 3160896, + "step": 1845 + }, + { + "epoch": 8.978208232445521, + "grad_norm": 0.01087959948927164, + "learning_rate": 0.29842111142508043, + "loss": 0.1463, + "num_input_tokens_seen": 3169248, + "step": 1850 + }, + { + "epoch": 9.004842615012107, + "grad_norm": 0.01967608742415905, + "learning_rate": 0.29841257584454217, + "loss": 0.1623, + "num_input_tokens_seen": 3178112, + "step": 1855 + }, + { + "epoch": 9.029055690072639, + "grad_norm": 0.012836488895118237, + "learning_rate": 0.29840401737691963, + "loss": 0.1372, + "num_input_tokens_seen": 3186304, + "step": 1860 + }, + { + "epoch": 9.053268765133172, + "grad_norm": 0.015291298739612103, + "learning_rate": 0.29839543602353263, + "loss": 0.1415, + "num_input_tokens_seen": 3194592, + "step": 1865 + }, + { + "epoch": 9.077481840193705, + "grad_norm": 0.012577862478792667, + "learning_rate": 0.2983868317857046, + "loss": 0.143, + "num_input_tokens_seen": 3203328, + "step": 1870 + }, + { + "epoch": 9.101694915254237, + "grad_norm": 0.019796311855316162, + "learning_rate": 0.2983782046647623, + "loss": 0.1424, + "num_input_tokens_seen": 3212384, + "step": 1875 + }, + { + "epoch": 9.12590799031477, + "grad_norm": 0.015606844797730446, + "learning_rate": 0.2983695546620362, + "loss": 0.1243, + "num_input_tokens_seen": 3221088, + "step": 1880 + }, + { + "epoch": 9.150121065375302, + "grad_norm": 0.01235766801983118, + "learning_rate": 0.2983608817788603, + "loss": 0.146, + "num_input_tokens_seen": 3229664, + "step": 1885 + }, + { + "epoch": 9.174334140435835, + "grad_norm": 0.014688318595290184, + "learning_rate": 0.29835218601657193, + "loss": 0.1486, + "num_input_tokens_seen": 3238144, + "step": 1890 + }, + { + "epoch": 9.198547215496369, + "grad_norm": 0.019921740517020226, + "learning_rate": 0.29834346737651224, + "loss": 0.145, + "num_input_tokens_seen": 3246752, + "step": 1895 + }, + { + "epoch": 9.2227602905569, + "grad_norm": 0.015278465114533901, + "learning_rate": 0.29833472586002563, + "loss": 0.1643, + "num_input_tokens_seen": 3255232, + "step": 1900 + }, + { + "epoch": 9.246973365617434, + "grad_norm": 0.014131027273833752, + "learning_rate": 0.29832596146846024, + "loss": 0.1352, + "num_input_tokens_seen": 3263968, + "step": 1905 + }, + { + "epoch": 9.271186440677965, + "grad_norm": 0.015516439452767372, + "learning_rate": 0.2983171742031676, + "loss": 0.1505, + "num_input_tokens_seen": 3272640, + "step": 1910 + }, + { + "epoch": 9.295399515738499, + "grad_norm": 0.0181057620793581, + "learning_rate": 0.2983083640655028, + "loss": 0.1362, + "num_input_tokens_seen": 3281088, + "step": 1915 + }, + { + "epoch": 9.319612590799032, + "grad_norm": 0.03412938117980957, + "learning_rate": 0.29829953105682455, + "loss": 0.1341, + "num_input_tokens_seen": 3289696, + "step": 1920 + }, + { + "epoch": 9.343825665859564, + "grad_norm": 0.018575144931674004, + "learning_rate": 0.29829067517849495, + "loss": 0.1445, + "num_input_tokens_seen": 3297920, + "step": 1925 + }, + { + "epoch": 9.368038740920097, + "grad_norm": 0.029220806434750557, + "learning_rate": 0.2982817964318797, + "loss": 0.1732, + "num_input_tokens_seen": 3306400, + "step": 1930 + }, + { + "epoch": 9.392251815980629, + "grad_norm": 0.03952506557106972, + "learning_rate": 0.298272894818348, + "loss": 0.1664, + "num_input_tokens_seen": 3314560, + "step": 1935 + }, + { + "epoch": 9.416464891041162, + "grad_norm": 0.030460838228464127, + "learning_rate": 0.2982639703392726, + "loss": 0.1673, + "num_input_tokens_seen": 3323328, + "step": 1940 + }, + { + "epoch": 9.440677966101696, + "grad_norm": 0.012853202410042286, + "learning_rate": 0.29825502299602974, + "loss": 0.1956, + "num_input_tokens_seen": 3331840, + "step": 1945 + }, + { + "epoch": 9.464891041162227, + "grad_norm": 0.028823163360357285, + "learning_rate": 0.2982460527899993, + "loss": 0.1857, + "num_input_tokens_seen": 3340480, + "step": 1950 + }, + { + "epoch": 9.48910411622276, + "grad_norm": 0.01192950177937746, + "learning_rate": 0.29823705972256453, + "loss": 0.1637, + "num_input_tokens_seen": 3348864, + "step": 1955 + }, + { + "epoch": 9.513317191283292, + "grad_norm": 0.012151962146162987, + "learning_rate": 0.2982280437951123, + "loss": 0.1598, + "num_input_tokens_seen": 3357312, + "step": 1960 + }, + { + "epoch": 9.537530266343826, + "grad_norm": 0.010103327222168446, + "learning_rate": 0.298219005009033, + "loss": 0.1717, + "num_input_tokens_seen": 3365824, + "step": 1965 + }, + { + "epoch": 9.561743341404359, + "grad_norm": 0.014545989222824574, + "learning_rate": 0.29820994336572043, + "loss": 0.164, + "num_input_tokens_seen": 3374080, + "step": 1970 + }, + { + "epoch": 9.58595641646489, + "grad_norm": 0.011677796021103859, + "learning_rate": 0.2982008588665721, + "loss": 0.1653, + "num_input_tokens_seen": 3382880, + "step": 1975 + }, + { + "epoch": 9.610169491525424, + "grad_norm": 0.01001259870827198, + "learning_rate": 0.2981917515129889, + "loss": 0.1729, + "num_input_tokens_seen": 3391680, + "step": 1980 + }, + { + "epoch": 9.634382566585955, + "grad_norm": 0.00837306771427393, + "learning_rate": 0.2981826213063753, + "loss": 0.1703, + "num_input_tokens_seen": 3400256, + "step": 1985 + }, + { + "epoch": 9.658595641646489, + "grad_norm": 0.011346152983605862, + "learning_rate": 0.2981734682481394, + "loss": 0.1576, + "num_input_tokens_seen": 3408960, + "step": 1990 + }, + { + "epoch": 9.682808716707022, + "grad_norm": 0.010291368700563908, + "learning_rate": 0.29816429233969255, + "loss": 0.1529, + "num_input_tokens_seen": 3417280, + "step": 1995 + }, + { + "epoch": 9.707021791767554, + "grad_norm": 0.009341096505522728, + "learning_rate": 0.2981550935824499, + "loss": 0.1533, + "num_input_tokens_seen": 3425696, + "step": 2000 + }, + { + "epoch": 9.707021791767554, + "eval_loss": 0.16580797731876373, + "eval_runtime": 4.6293, + "eval_samples_per_second": 79.278, + "eval_steps_per_second": 19.873, + "num_input_tokens_seen": 3425696, + "step": 2000 + }, + { + "epoch": 9.731234866828087, + "grad_norm": 0.015815837308764458, + "learning_rate": 0.29814587197783, + "loss": 0.1586, + "num_input_tokens_seen": 3434208, + "step": 2005 + }, + { + "epoch": 9.75544794188862, + "grad_norm": 0.015718743205070496, + "learning_rate": 0.29813662752725495, + "loss": 0.1611, + "num_input_tokens_seen": 3442720, + "step": 2010 + }, + { + "epoch": 9.779661016949152, + "grad_norm": 0.007133147679269314, + "learning_rate": 0.29812736023215025, + "loss": 0.1861, + "num_input_tokens_seen": 3451264, + "step": 2015 + }, + { + "epoch": 9.803874092009686, + "grad_norm": 0.011138089932501316, + "learning_rate": 0.29811807009394514, + "loss": 0.145, + "num_input_tokens_seen": 3460096, + "step": 2020 + }, + { + "epoch": 9.828087167070217, + "grad_norm": 0.007948190905153751, + "learning_rate": 0.2981087571140723, + "loss": 0.1676, + "num_input_tokens_seen": 3468800, + "step": 2025 + }, + { + "epoch": 9.85230024213075, + "grad_norm": 0.007801585830748081, + "learning_rate": 0.2980994212939678, + "loss": 0.1482, + "num_input_tokens_seen": 3477472, + "step": 2030 + }, + { + "epoch": 9.876513317191284, + "grad_norm": 0.006480956915766001, + "learning_rate": 0.2980900626350715, + "loss": 0.168, + "num_input_tokens_seen": 3486240, + "step": 2035 + }, + { + "epoch": 9.900726392251816, + "grad_norm": 0.009083558805286884, + "learning_rate": 0.29808068113882646, + "loss": 0.1639, + "num_input_tokens_seen": 3494944, + "step": 2040 + }, + { + "epoch": 9.924939467312349, + "grad_norm": 0.008352433331310749, + "learning_rate": 0.2980712768066795, + "loss": 0.1739, + "num_input_tokens_seen": 3503008, + "step": 2045 + }, + { + "epoch": 9.94915254237288, + "grad_norm": 0.007065822370350361, + "learning_rate": 0.2980618496400809, + "loss": 0.1463, + "num_input_tokens_seen": 3511680, + "step": 2050 + }, + { + "epoch": 9.973365617433414, + "grad_norm": 0.007208114955574274, + "learning_rate": 0.2980523996404844, + "loss": 0.1712, + "num_input_tokens_seen": 3520192, + "step": 2055 + }, + { + "epoch": 9.997578692493947, + "grad_norm": 0.007089495658874512, + "learning_rate": 0.2980429268093473, + "loss": 0.152, + "num_input_tokens_seen": 3528800, + "step": 2060 + }, + { + "epoch": 10.024213075060533, + "grad_norm": 0.009294702671468258, + "learning_rate": 0.29803343114813047, + "loss": 0.1941, + "num_input_tokens_seen": 3537952, + "step": 2065 + }, + { + "epoch": 10.048426150121065, + "grad_norm": 0.013804367743432522, + "learning_rate": 0.2980239126582983, + "loss": 0.1473, + "num_input_tokens_seen": 3546464, + "step": 2070 + }, + { + "epoch": 10.072639225181598, + "grad_norm": 0.01364931371062994, + "learning_rate": 0.2980143713413186, + "loss": 0.1594, + "num_input_tokens_seen": 3554912, + "step": 2075 + }, + { + "epoch": 10.09685230024213, + "grad_norm": 0.010529780760407448, + "learning_rate": 0.29800480719866274, + "loss": 0.1448, + "num_input_tokens_seen": 3563136, + "step": 2080 + }, + { + "epoch": 10.121065375302663, + "grad_norm": 0.015633579343557358, + "learning_rate": 0.2979952202318057, + "loss": 0.147, + "num_input_tokens_seen": 3571328, + "step": 2085 + }, + { + "epoch": 10.145278450363197, + "grad_norm": 0.015429631806910038, + "learning_rate": 0.2979856104422259, + "loss": 0.128, + "num_input_tokens_seen": 3580128, + "step": 2090 + }, + { + "epoch": 10.169491525423728, + "grad_norm": 0.012230188585817814, + "learning_rate": 0.2979759778314052, + "loss": 0.1302, + "num_input_tokens_seen": 3588704, + "step": 2095 + }, + { + "epoch": 10.193704600484262, + "grad_norm": 0.018613863736391068, + "learning_rate": 0.2979663224008292, + "loss": 0.1257, + "num_input_tokens_seen": 3597408, + "step": 2100 + }, + { + "epoch": 10.217917675544793, + "grad_norm": 0.016331210732460022, + "learning_rate": 0.2979566441519868, + "loss": 0.1517, + "num_input_tokens_seen": 3605632, + "step": 2105 + }, + { + "epoch": 10.242130750605327, + "grad_norm": 0.013659182004630566, + "learning_rate": 0.29794694308637054, + "loss": 0.1297, + "num_input_tokens_seen": 3613920, + "step": 2110 + }, + { + "epoch": 10.26634382566586, + "grad_norm": 0.012223025783896446, + "learning_rate": 0.2979372192054764, + "loss": 0.1249, + "num_input_tokens_seen": 3622304, + "step": 2115 + }, + { + "epoch": 10.290556900726392, + "grad_norm": 0.013187659904360771, + "learning_rate": 0.297927472510804, + "loss": 0.1431, + "num_input_tokens_seen": 3631072, + "step": 2120 + }, + { + "epoch": 10.314769975786925, + "grad_norm": 0.01690581813454628, + "learning_rate": 0.29791770300385634, + "loss": 0.1597, + "num_input_tokens_seen": 3639552, + "step": 2125 + }, + { + "epoch": 10.338983050847457, + "grad_norm": 0.025121798738837242, + "learning_rate": 0.29790791068614003, + "loss": 0.1382, + "num_input_tokens_seen": 3648192, + "step": 2130 + }, + { + "epoch": 10.36319612590799, + "grad_norm": 0.014668646268546581, + "learning_rate": 0.2978980955591652, + "loss": 0.1658, + "num_input_tokens_seen": 3657152, + "step": 2135 + }, + { + "epoch": 10.387409200968523, + "grad_norm": 0.02765909768640995, + "learning_rate": 0.2978882576244454, + "loss": 0.1597, + "num_input_tokens_seen": 3665632, + "step": 2140 + }, + { + "epoch": 10.411622276029055, + "grad_norm": 0.013048161752521992, + "learning_rate": 0.2978783968834978, + "loss": 0.1785, + "num_input_tokens_seen": 3674560, + "step": 2145 + }, + { + "epoch": 10.435835351089588, + "grad_norm": 0.010947328060865402, + "learning_rate": 0.29786851333784303, + "loss": 0.1569, + "num_input_tokens_seen": 3683360, + "step": 2150 + }, + { + "epoch": 10.460048426150122, + "grad_norm": 0.01320174802094698, + "learning_rate": 0.2978586069890053, + "loss": 0.1746, + "num_input_tokens_seen": 3692096, + "step": 2155 + }, + { + "epoch": 10.484261501210653, + "grad_norm": 0.011226643808186054, + "learning_rate": 0.29784867783851227, + "loss": 0.1561, + "num_input_tokens_seen": 3700768, + "step": 2160 + }, + { + "epoch": 10.508474576271187, + "grad_norm": 0.010036150924861431, + "learning_rate": 0.2978387258878951, + "loss": 0.1414, + "num_input_tokens_seen": 3709216, + "step": 2165 + }, + { + "epoch": 10.532687651331718, + "grad_norm": 0.008052741177380085, + "learning_rate": 0.29782875113868856, + "loss": 0.1175, + "num_input_tokens_seen": 3718016, + "step": 2170 + }, + { + "epoch": 10.556900726392252, + "grad_norm": 0.013398339040577412, + "learning_rate": 0.2978187535924309, + "loss": 0.1656, + "num_input_tokens_seen": 3726368, + "step": 2175 + }, + { + "epoch": 10.581113801452785, + "grad_norm": 0.010107472538948059, + "learning_rate": 0.29780873325066376, + "loss": 0.1607, + "num_input_tokens_seen": 3735264, + "step": 2180 + }, + { + "epoch": 10.605326876513317, + "grad_norm": 0.008387213572859764, + "learning_rate": 0.2977986901149325, + "loss": 0.1457, + "num_input_tokens_seen": 3744000, + "step": 2185 + }, + { + "epoch": 10.62953995157385, + "grad_norm": 0.011580671183764935, + "learning_rate": 0.29778862418678587, + "loss": 0.1374, + "num_input_tokens_seen": 3752672, + "step": 2190 + }, + { + "epoch": 10.653753026634382, + "grad_norm": 0.012327899225056171, + "learning_rate": 0.29777853546777616, + "loss": 0.1344, + "num_input_tokens_seen": 3761184, + "step": 2195 + }, + { + "epoch": 10.677966101694915, + "grad_norm": 0.008466425351798534, + "learning_rate": 0.2977684239594592, + "loss": 0.1429, + "num_input_tokens_seen": 3769888, + "step": 2200 + }, + { + "epoch": 10.677966101694915, + "eval_loss": 0.1588224321603775, + "eval_runtime": 4.6177, + "eval_samples_per_second": 79.476, + "eval_steps_per_second": 19.923, + "num_input_tokens_seen": 3769888, + "step": 2200 + }, + { + "epoch": 10.702179176755449, + "grad_norm": 0.009228895418345928, + "learning_rate": 0.29775828966339424, + "loss": 0.1426, + "num_input_tokens_seen": 3778464, + "step": 2205 + }, + { + "epoch": 10.72639225181598, + "grad_norm": 0.012579170987010002, + "learning_rate": 0.29774813258114424, + "loss": 0.1598, + "num_input_tokens_seen": 3787008, + "step": 2210 + }, + { + "epoch": 10.750605326876514, + "grad_norm": 0.011784552596509457, + "learning_rate": 0.29773795271427544, + "loss": 0.1634, + "num_input_tokens_seen": 3795328, + "step": 2215 + }, + { + "epoch": 10.774818401937045, + "grad_norm": 0.01478433795273304, + "learning_rate": 0.2977277500643577, + "loss": 0.1451, + "num_input_tokens_seen": 3803808, + "step": 2220 + }, + { + "epoch": 10.799031476997579, + "grad_norm": 0.026190677657723427, + "learning_rate": 0.29771752463296447, + "loss": 0.164, + "num_input_tokens_seen": 3812480, + "step": 2225 + }, + { + "epoch": 10.823244552058112, + "grad_norm": 0.014682246372103691, + "learning_rate": 0.29770727642167266, + "loss": 0.1542, + "num_input_tokens_seen": 3820864, + "step": 2230 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 0.008913833647966385, + "learning_rate": 0.29769700543206257, + "loss": 0.1469, + "num_input_tokens_seen": 3829472, + "step": 2235 + }, + { + "epoch": 10.871670702179177, + "grad_norm": 0.014638429507613182, + "learning_rate": 0.2976867116657182, + "loss": 0.1658, + "num_input_tokens_seen": 3838016, + "step": 2240 + }, + { + "epoch": 10.89588377723971, + "grad_norm": 0.01280896831303835, + "learning_rate": 0.2976763951242269, + "loss": 0.1352, + "num_input_tokens_seen": 3846464, + "step": 2245 + }, + { + "epoch": 10.920096852300242, + "grad_norm": 0.012967362999916077, + "learning_rate": 0.29766605580917965, + "loss": 0.1515, + "num_input_tokens_seen": 3855232, + "step": 2250 + }, + { + "epoch": 10.944309927360775, + "grad_norm": 0.012711426243185997, + "learning_rate": 0.29765569372217093, + "loss": 0.1372, + "num_input_tokens_seen": 3864128, + "step": 2255 + }, + { + "epoch": 10.968523002421307, + "grad_norm": 0.013414343819022179, + "learning_rate": 0.2976453088647987, + "loss": 0.1589, + "num_input_tokens_seen": 3872512, + "step": 2260 + }, + { + "epoch": 10.99273607748184, + "grad_norm": 0.012259240262210369, + "learning_rate": 0.2976349012386644, + "loss": 0.1484, + "num_input_tokens_seen": 3880768, + "step": 2265 + }, + { + "epoch": 11.019370460048426, + "grad_norm": 0.009916359558701515, + "learning_rate": 0.29762447084537297, + "loss": 0.1574, + "num_input_tokens_seen": 3889728, + "step": 2270 + }, + { + "epoch": 11.043583535108958, + "grad_norm": 0.013547291047871113, + "learning_rate": 0.29761401768653306, + "loss": 0.1138, + "num_input_tokens_seen": 3898048, + "step": 2275 + }, + { + "epoch": 11.067796610169491, + "grad_norm": 0.02063489519059658, + "learning_rate": 0.29760354176375653, + "loss": 0.1377, + "num_input_tokens_seen": 3906144, + "step": 2280 + }, + { + "epoch": 11.092009685230025, + "grad_norm": 0.011468524113297462, + "learning_rate": 0.29759304307865897, + "loss": 0.114, + "num_input_tokens_seen": 3914336, + "step": 2285 + }, + { + "epoch": 11.116222760290556, + "grad_norm": 0.02023732103407383, + "learning_rate": 0.2975825216328594, + "loss": 0.1314, + "num_input_tokens_seen": 3922784, + "step": 2290 + }, + { + "epoch": 11.14043583535109, + "grad_norm": 0.018662717193365097, + "learning_rate": 0.2975719774279804, + "loss": 0.1242, + "num_input_tokens_seen": 3931136, + "step": 2295 + }, + { + "epoch": 11.164648910411623, + "grad_norm": 0.0171133354306221, + "learning_rate": 0.29756141046564794, + "loss": 0.1042, + "num_input_tokens_seen": 3939680, + "step": 2300 + }, + { + "epoch": 11.188861985472155, + "grad_norm": 0.027547169476747513, + "learning_rate": 0.2975508207474916, + "loss": 0.1262, + "num_input_tokens_seen": 3948192, + "step": 2305 + }, + { + "epoch": 11.213075060532688, + "grad_norm": 0.014536193571984768, + "learning_rate": 0.2975402082751445, + "loss": 0.1348, + "num_input_tokens_seen": 3956448, + "step": 2310 + }, + { + "epoch": 11.23728813559322, + "grad_norm": 0.01636543683707714, + "learning_rate": 0.29752957305024313, + "loss": 0.1513, + "num_input_tokens_seen": 3964800, + "step": 2315 + }, + { + "epoch": 11.261501210653753, + "grad_norm": 0.012281771749258041, + "learning_rate": 0.2975189150744277, + "loss": 0.161, + "num_input_tokens_seen": 3973760, + "step": 2320 + }, + { + "epoch": 11.285714285714286, + "grad_norm": 0.01743309386074543, + "learning_rate": 0.29750823434934165, + "loss": 0.1505, + "num_input_tokens_seen": 3982368, + "step": 2325 + }, + { + "epoch": 11.309927360774818, + "grad_norm": 0.021788321435451508, + "learning_rate": 0.29749753087663217, + "loss": 0.1439, + "num_input_tokens_seen": 3991232, + "step": 2330 + }, + { + "epoch": 11.334140435835351, + "grad_norm": 0.014379581436514854, + "learning_rate": 0.29748680465794985, + "loss": 0.1376, + "num_input_tokens_seen": 3999872, + "step": 2335 + }, + { + "epoch": 11.358353510895883, + "grad_norm": 0.013596038334071636, + "learning_rate": 0.29747605569494884, + "loss": 0.1312, + "num_input_tokens_seen": 4008608, + "step": 2340 + }, + { + "epoch": 11.382566585956416, + "grad_norm": 0.02469591051340103, + "learning_rate": 0.29746528398928673, + "loss": 0.1615, + "num_input_tokens_seen": 4017184, + "step": 2345 + }, + { + "epoch": 11.40677966101695, + "grad_norm": 0.01593739353120327, + "learning_rate": 0.2974544895426247, + "loss": 0.144, + "num_input_tokens_seen": 4025920, + "step": 2350 + }, + { + "epoch": 11.430992736077481, + "grad_norm": 0.008775530382990837, + "learning_rate": 0.29744367235662733, + "loss": 0.1565, + "num_input_tokens_seen": 4034272, + "step": 2355 + }, + { + "epoch": 11.455205811138015, + "grad_norm": 0.014709966257214546, + "learning_rate": 0.29743283243296276, + "loss": 0.1378, + "num_input_tokens_seen": 4042592, + "step": 2360 + }, + { + "epoch": 11.479418886198546, + "grad_norm": 0.00894935429096222, + "learning_rate": 0.29742196977330276, + "loss": 0.0979, + "num_input_tokens_seen": 4050976, + "step": 2365 + }, + { + "epoch": 11.50363196125908, + "grad_norm": 0.014842092990875244, + "learning_rate": 0.2974110843793223, + "loss": 0.1336, + "num_input_tokens_seen": 4059392, + "step": 2370 + }, + { + "epoch": 11.527845036319613, + "grad_norm": 0.01360783725976944, + "learning_rate": 0.2974001762527002, + "loss": 0.1555, + "num_input_tokens_seen": 4067680, + "step": 2375 + }, + { + "epoch": 11.552058111380145, + "grad_norm": 0.014516441151499748, + "learning_rate": 0.2973892453951186, + "loss": 0.1627, + "num_input_tokens_seen": 4076448, + "step": 2380 + }, + { + "epoch": 11.576271186440678, + "grad_norm": 0.013504903763532639, + "learning_rate": 0.2973782918082631, + "loss": 0.1499, + "num_input_tokens_seen": 4084864, + "step": 2385 + }, + { + "epoch": 11.600484261501212, + "grad_norm": 0.016433285549283028, + "learning_rate": 0.29736731549382295, + "loss": 0.1451, + "num_input_tokens_seen": 4093344, + "step": 2390 + }, + { + "epoch": 11.624697336561743, + "grad_norm": 0.01587015762925148, + "learning_rate": 0.2973563164534908, + "loss": 0.1343, + "num_input_tokens_seen": 4102144, + "step": 2395 + }, + { + "epoch": 11.648910411622277, + "grad_norm": 0.015948524698615074, + "learning_rate": 0.29734529468896287, + "loss": 0.1556, + "num_input_tokens_seen": 4110336, + "step": 2400 + }, + { + "epoch": 11.648910411622277, + "eval_loss": 0.182367205619812, + "eval_runtime": 4.6243, + "eval_samples_per_second": 79.363, + "eval_steps_per_second": 19.895, + "num_input_tokens_seen": 4110336, + "step": 2400 + }, + { + "epoch": 11.673123486682808, + "grad_norm": 0.012360447086393833, + "learning_rate": 0.2973342502019388, + "loss": 0.1538, + "num_input_tokens_seen": 4118912, + "step": 2405 + }, + { + "epoch": 11.697336561743342, + "grad_norm": 0.010386185720562935, + "learning_rate": 0.2973231829941219, + "loss": 0.1423, + "num_input_tokens_seen": 4127104, + "step": 2410 + }, + { + "epoch": 11.721549636803875, + "grad_norm": 0.010580603964626789, + "learning_rate": 0.2973120930672188, + "loss": 0.1347, + "num_input_tokens_seen": 4135552, + "step": 2415 + }, + { + "epoch": 11.745762711864407, + "grad_norm": 0.012725355103611946, + "learning_rate": 0.2973009804229397, + "loss": 0.1462, + "num_input_tokens_seen": 4144448, + "step": 2420 + }, + { + "epoch": 11.76997578692494, + "grad_norm": 0.013460000045597553, + "learning_rate": 0.29728984506299827, + "loss": 0.1282, + "num_input_tokens_seen": 4153536, + "step": 2425 + }, + { + "epoch": 11.794188861985472, + "grad_norm": 0.013387774117290974, + "learning_rate": 0.2972786869891118, + "loss": 0.1679, + "num_input_tokens_seen": 4162304, + "step": 2430 + }, + { + "epoch": 11.818401937046005, + "grad_norm": 0.014200201258063316, + "learning_rate": 0.29726750620300096, + "loss": 0.1391, + "num_input_tokens_seen": 4170720, + "step": 2435 + }, + { + "epoch": 11.842615012106538, + "grad_norm": 0.011025333777070045, + "learning_rate": 0.29725630270639003, + "loss": 0.1246, + "num_input_tokens_seen": 4179200, + "step": 2440 + }, + { + "epoch": 11.86682808716707, + "grad_norm": 0.01721113547682762, + "learning_rate": 0.2972450765010067, + "loss": 0.1638, + "num_input_tokens_seen": 4187840, + "step": 2445 + }, + { + "epoch": 11.891041162227603, + "grad_norm": 0.016050660982728004, + "learning_rate": 0.29723382758858213, + "loss": 0.1818, + "num_input_tokens_seen": 4196416, + "step": 2450 + }, + { + "epoch": 11.915254237288135, + "grad_norm": 0.010584020987153053, + "learning_rate": 0.29722255597085107, + "loss": 0.1413, + "num_input_tokens_seen": 4204896, + "step": 2455 + }, + { + "epoch": 11.939467312348668, + "grad_norm": 0.011135122738778591, + "learning_rate": 0.2972112616495518, + "loss": 0.1344, + "num_input_tokens_seen": 4213440, + "step": 2460 + }, + { + "epoch": 11.963680387409202, + "grad_norm": 0.008332955650985241, + "learning_rate": 0.297199944626426, + "loss": 0.1671, + "num_input_tokens_seen": 4222400, + "step": 2465 + }, + { + "epoch": 11.987893462469733, + "grad_norm": 0.009986016899347305, + "learning_rate": 0.2971886049032189, + "loss": 0.1441, + "num_input_tokens_seen": 4231008, + "step": 2470 + }, + { + "epoch": 12.01452784503632, + "grad_norm": 0.01083291508257389, + "learning_rate": 0.29717724248167926, + "loss": 0.1924, + "num_input_tokens_seen": 4239680, + "step": 2475 + }, + { + "epoch": 12.038740920096853, + "grad_norm": 0.01028387900441885, + "learning_rate": 0.29716585736355927, + "loss": 0.1364, + "num_input_tokens_seen": 4248704, + "step": 2480 + }, + { + "epoch": 12.062953995157384, + "grad_norm": 0.013027569279074669, + "learning_rate": 0.2971544495506147, + "loss": 0.1236, + "num_input_tokens_seen": 4257472, + "step": 2485 + }, + { + "epoch": 12.087167070217918, + "grad_norm": 0.015057161450386047, + "learning_rate": 0.2971430190446048, + "loss": 0.1064, + "num_input_tokens_seen": 4265984, + "step": 2490 + }, + { + "epoch": 12.111380145278451, + "grad_norm": 0.012227839790284634, + "learning_rate": 0.2971315658472921, + "loss": 0.132, + "num_input_tokens_seen": 4274368, + "step": 2495 + }, + { + "epoch": 12.135593220338983, + "grad_norm": 0.009448564611375332, + "learning_rate": 0.2971200899604431, + "loss": 0.1116, + "num_input_tokens_seen": 4283136, + "step": 2500 + }, + { + "epoch": 12.159806295399516, + "grad_norm": 0.006791207008063793, + "learning_rate": 0.29710859138582735, + "loss": 0.0894, + "num_input_tokens_seen": 4291744, + "step": 2505 + }, + { + "epoch": 12.184019370460048, + "grad_norm": 0.012509898282587528, + "learning_rate": 0.29709707012521813, + "loss": 0.1301, + "num_input_tokens_seen": 4300416, + "step": 2510 + }, + { + "epoch": 12.208232445520581, + "grad_norm": 0.018222665414214134, + "learning_rate": 0.29708552618039213, + "loss": 0.1408, + "num_input_tokens_seen": 4308928, + "step": 2515 + }, + { + "epoch": 12.232445520581114, + "grad_norm": 0.012536331079900265, + "learning_rate": 0.2970739595531296, + "loss": 0.1506, + "num_input_tokens_seen": 4317664, + "step": 2520 + }, + { + "epoch": 12.256658595641646, + "grad_norm": 0.021543707698583603, + "learning_rate": 0.2970623702452143, + "loss": 0.1789, + "num_input_tokens_seen": 4326304, + "step": 2525 + }, + { + "epoch": 12.28087167070218, + "grad_norm": 0.013077815063297749, + "learning_rate": 0.2970507582584334, + "loss": 0.1278, + "num_input_tokens_seen": 4335136, + "step": 2530 + }, + { + "epoch": 12.305084745762711, + "grad_norm": 0.020097706466913223, + "learning_rate": 0.2970391235945776, + "loss": 0.1455, + "num_input_tokens_seen": 4343584, + "step": 2535 + }, + { + "epoch": 12.329297820823244, + "grad_norm": 0.013645220547914505, + "learning_rate": 0.2970274662554412, + "loss": 0.1392, + "num_input_tokens_seen": 4351840, + "step": 2540 + }, + { + "epoch": 12.353510895883778, + "grad_norm": 0.012403641827404499, + "learning_rate": 0.2970157862428218, + "loss": 0.1494, + "num_input_tokens_seen": 4360320, + "step": 2545 + }, + { + "epoch": 12.37772397094431, + "grad_norm": 0.011417338624596596, + "learning_rate": 0.2970040835585206, + "loss": 0.1389, + "num_input_tokens_seen": 4369056, + "step": 2550 + }, + { + "epoch": 12.401937046004843, + "grad_norm": 0.011475899256765842, + "learning_rate": 0.2969923582043424, + "loss": 0.1481, + "num_input_tokens_seen": 4377792, + "step": 2555 + }, + { + "epoch": 12.426150121065376, + "grad_norm": 0.013924461789429188, + "learning_rate": 0.2969806101820953, + "loss": 0.1478, + "num_input_tokens_seen": 4386208, + "step": 2560 + }, + { + "epoch": 12.450363196125908, + "grad_norm": 0.014392987824976444, + "learning_rate": 0.2969688394935911, + "loss": 0.1358, + "num_input_tokens_seen": 4394592, + "step": 2565 + }, + { + "epoch": 12.474576271186441, + "grad_norm": 0.015184837393462658, + "learning_rate": 0.2969570461406449, + "loss": 0.1547, + "num_input_tokens_seen": 4403200, + "step": 2570 + }, + { + "epoch": 12.498789346246973, + "grad_norm": 0.014337876811623573, + "learning_rate": 0.29694523012507534, + "loss": 0.1307, + "num_input_tokens_seen": 4411360, + "step": 2575 + }, + { + "epoch": 12.523002421307506, + "grad_norm": 0.015293248929083347, + "learning_rate": 0.2969333914487048, + "loss": 0.1408, + "num_input_tokens_seen": 4419840, + "step": 2580 + }, + { + "epoch": 12.54721549636804, + "grad_norm": 0.012337548658251762, + "learning_rate": 0.2969215301133587, + "loss": 0.143, + "num_input_tokens_seen": 4428480, + "step": 2585 + }, + { + "epoch": 12.571428571428571, + "grad_norm": 0.013208792544901371, + "learning_rate": 0.29690964612086634, + "loss": 0.139, + "num_input_tokens_seen": 4436704, + "step": 2590 + }, + { + "epoch": 12.595641646489105, + "grad_norm": 0.008205856196582317, + "learning_rate": 0.2968977394730604, + "loss": 0.1225, + "num_input_tokens_seen": 4445216, + "step": 2595 + }, + { + "epoch": 12.619854721549636, + "grad_norm": 0.01250548753887415, + "learning_rate": 0.296885810171777, + "loss": 0.1218, + "num_input_tokens_seen": 4453600, + "step": 2600 + }, + { + "epoch": 12.619854721549636, + "eval_loss": 0.16858550906181335, + "eval_runtime": 4.6177, + "eval_samples_per_second": 79.476, + "eval_steps_per_second": 19.923, + "num_input_tokens_seen": 4453600, + "step": 2600 + }, + { + "epoch": 12.64406779661017, + "grad_norm": 0.014495743438601494, + "learning_rate": 0.2968738582188558, + "loss": 0.12, + "num_input_tokens_seen": 4461856, + "step": 2605 + }, + { + "epoch": 12.668280871670703, + "grad_norm": 0.015647489577531815, + "learning_rate": 0.2968618836161399, + "loss": 0.1295, + "num_input_tokens_seen": 4470176, + "step": 2610 + }, + { + "epoch": 12.692493946731235, + "grad_norm": 0.02043408900499344, + "learning_rate": 0.296849886365476, + "loss": 0.1755, + "num_input_tokens_seen": 4478848, + "step": 2615 + }, + { + "epoch": 12.716707021791768, + "grad_norm": 0.01204620860517025, + "learning_rate": 0.2968378664687142, + "loss": 0.1382, + "num_input_tokens_seen": 4488096, + "step": 2620 + }, + { + "epoch": 12.7409200968523, + "grad_norm": 0.0140983946621418, + "learning_rate": 0.296825823927708, + "loss": 0.1455, + "num_input_tokens_seen": 4496544, + "step": 2625 + }, + { + "epoch": 12.765133171912833, + "grad_norm": 0.011414079926908016, + "learning_rate": 0.29681375874431476, + "loss": 0.1415, + "num_input_tokens_seen": 4504960, + "step": 2630 + }, + { + "epoch": 12.789346246973366, + "grad_norm": 0.014809144660830498, + "learning_rate": 0.29680167092039483, + "loss": 0.1197, + "num_input_tokens_seen": 4513664, + "step": 2635 + }, + { + "epoch": 12.813559322033898, + "grad_norm": 0.011350237764418125, + "learning_rate": 0.2967895604578125, + "loss": 0.1134, + "num_input_tokens_seen": 4522816, + "step": 2640 + }, + { + "epoch": 12.837772397094431, + "grad_norm": 0.009708325378596783, + "learning_rate": 0.2967774273584352, + "loss": 0.1338, + "num_input_tokens_seen": 4531648, + "step": 2645 + }, + { + "epoch": 12.861985472154963, + "grad_norm": 0.016878439113497734, + "learning_rate": 0.2967652716241342, + "loss": 0.1484, + "num_input_tokens_seen": 4540192, + "step": 2650 + }, + { + "epoch": 12.886198547215496, + "grad_norm": 0.015518235974013805, + "learning_rate": 0.29675309325678384, + "loss": 0.1228, + "num_input_tokens_seen": 4548608, + "step": 2655 + }, + { + "epoch": 12.91041162227603, + "grad_norm": 0.016884885728359222, + "learning_rate": 0.29674089225826233, + "loss": 0.1669, + "num_input_tokens_seen": 4556832, + "step": 2660 + }, + { + "epoch": 12.934624697336561, + "grad_norm": 0.01882031373679638, + "learning_rate": 0.29672866863045116, + "loss": 0.1745, + "num_input_tokens_seen": 4565312, + "step": 2665 + }, + { + "epoch": 12.958837772397095, + "grad_norm": 0.013269204646348953, + "learning_rate": 0.2967164223752354, + "loss": 0.1502, + "num_input_tokens_seen": 4574016, + "step": 2670 + }, + { + "epoch": 12.983050847457626, + "grad_norm": 0.009360434487462044, + "learning_rate": 0.2967041534945035, + "loss": 0.1427, + "num_input_tokens_seen": 4582560, + "step": 2675 + }, + { + "epoch": 13.009685230024212, + "grad_norm": 0.009981958195567131, + "learning_rate": 0.2966918619901476, + "loss": 0.1516, + "num_input_tokens_seen": 4591648, + "step": 2680 + }, + { + "epoch": 13.033898305084746, + "grad_norm": 0.02180599234998226, + "learning_rate": 0.2966795478640631, + "loss": 0.1554, + "num_input_tokens_seen": 4600192, + "step": 2685 + }, + { + "epoch": 13.058111380145279, + "grad_norm": 0.009442998096346855, + "learning_rate": 0.29666721111814903, + "loss": 0.1542, + "num_input_tokens_seen": 4608864, + "step": 2690 + }, + { + "epoch": 13.08232445520581, + "grad_norm": 0.014703603461384773, + "learning_rate": 0.2966548517543079, + "loss": 0.1214, + "num_input_tokens_seen": 4617632, + "step": 2695 + }, + { + "epoch": 13.106537530266344, + "grad_norm": 0.015071050263941288, + "learning_rate": 0.29664246977444564, + "loss": 0.102, + "num_input_tokens_seen": 4626336, + "step": 2700 + }, + { + "epoch": 13.130750605326876, + "grad_norm": 0.01461946964263916, + "learning_rate": 0.2966300651804717, + "loss": 0.1341, + "num_input_tokens_seen": 4634976, + "step": 2705 + }, + { + "epoch": 13.154963680387409, + "grad_norm": 0.005975825246423483, + "learning_rate": 0.296617637974299, + "loss": 0.0884, + "num_input_tokens_seen": 4643360, + "step": 2710 + }, + { + "epoch": 13.179176755447942, + "grad_norm": 0.010068121366202831, + "learning_rate": 0.2966051881578441, + "loss": 0.0987, + "num_input_tokens_seen": 4651808, + "step": 2715 + }, + { + "epoch": 13.203389830508474, + "grad_norm": 0.015472343191504478, + "learning_rate": 0.29659271573302676, + "loss": 0.1173, + "num_input_tokens_seen": 4660352, + "step": 2720 + }, + { + "epoch": 13.227602905569007, + "grad_norm": 0.017771540209650993, + "learning_rate": 0.2965802207017705, + "loss": 0.1304, + "num_input_tokens_seen": 4668832, + "step": 2725 + }, + { + "epoch": 13.25181598062954, + "grad_norm": 0.015645330771803856, + "learning_rate": 0.2965677030660021, + "loss": 0.1392, + "num_input_tokens_seen": 4677408, + "step": 2730 + }, + { + "epoch": 13.276029055690072, + "grad_norm": 0.012089382857084274, + "learning_rate": 0.2965551628276521, + "loss": 0.1131, + "num_input_tokens_seen": 4686048, + "step": 2735 + }, + { + "epoch": 13.300242130750606, + "grad_norm": 0.015288025140762329, + "learning_rate": 0.29654259998865423, + "loss": 0.1387, + "num_input_tokens_seen": 4694528, + "step": 2740 + }, + { + "epoch": 13.324455205811137, + "grad_norm": 0.017931178212165833, + "learning_rate": 0.2965300145509458, + "loss": 0.1398, + "num_input_tokens_seen": 4703040, + "step": 2745 + }, + { + "epoch": 13.34866828087167, + "grad_norm": 0.017409740015864372, + "learning_rate": 0.2965174065164678, + "loss": 0.1235, + "num_input_tokens_seen": 4711648, + "step": 2750 + }, + { + "epoch": 13.372881355932204, + "grad_norm": 0.011363574303686619, + "learning_rate": 0.2965047758871644, + "loss": 0.1336, + "num_input_tokens_seen": 4719904, + "step": 2755 + }, + { + "epoch": 13.397094430992736, + "grad_norm": 0.01375485211610794, + "learning_rate": 0.2964921226649835, + "loss": 0.14, + "num_input_tokens_seen": 4728512, + "step": 2760 + }, + { + "epoch": 13.42130750605327, + "grad_norm": 0.007642189972102642, + "learning_rate": 0.2964794468518763, + "loss": 0.1177, + "num_input_tokens_seen": 4737376, + "step": 2765 + }, + { + "epoch": 13.4455205811138, + "grad_norm": 0.013453743420541286, + "learning_rate": 0.2964667484497977, + "loss": 0.0905, + "num_input_tokens_seen": 4745664, + "step": 2770 + }, + { + "epoch": 13.469733656174334, + "grad_norm": 0.012418149039149284, + "learning_rate": 0.29645402746070587, + "loss": 0.1242, + "num_input_tokens_seen": 4753792, + "step": 2775 + }, + { + "epoch": 13.493946731234868, + "grad_norm": 0.013720093294978142, + "learning_rate": 0.2964412838865625, + "loss": 0.1715, + "num_input_tokens_seen": 4762400, + "step": 2780 + }, + { + "epoch": 13.5181598062954, + "grad_norm": 0.0093028349801898, + "learning_rate": 0.29642851772933293, + "loss": 0.1439, + "num_input_tokens_seen": 4770816, + "step": 2785 + }, + { + "epoch": 13.542372881355933, + "grad_norm": 0.008187117986381054, + "learning_rate": 0.29641572899098567, + "loss": 0.1295, + "num_input_tokens_seen": 4779488, + "step": 2790 + }, + { + "epoch": 13.566585956416464, + "grad_norm": 0.010337627492845058, + "learning_rate": 0.29640291767349314, + "loss": 0.1322, + "num_input_tokens_seen": 4787808, + "step": 2795 + }, + { + "epoch": 13.590799031476998, + "grad_norm": 0.012394286692142487, + "learning_rate": 0.2963900837788308, + "loss": 0.1527, + "num_input_tokens_seen": 4796192, + "step": 2800 + }, + { + "epoch": 13.590799031476998, + "eval_loss": 0.19470778107643127, + "eval_runtime": 4.6316, + "eval_samples_per_second": 79.239, + "eval_steps_per_second": 19.864, + "num_input_tokens_seen": 4796192, + "step": 2800 + }, + { + "epoch": 13.615012106537531, + "grad_norm": 0.013360095210373402, + "learning_rate": 0.2963772273089779, + "loss": 0.1231, + "num_input_tokens_seen": 4804704, + "step": 2805 + }, + { + "epoch": 13.639225181598063, + "grad_norm": 0.01431864034384489, + "learning_rate": 0.2963643482659171, + "loss": 0.1436, + "num_input_tokens_seen": 4813504, + "step": 2810 + }, + { + "epoch": 13.663438256658596, + "grad_norm": 0.010154633782804012, + "learning_rate": 0.2963514466516345, + "loss": 0.1404, + "num_input_tokens_seen": 4822304, + "step": 2815 + }, + { + "epoch": 13.687651331719128, + "grad_norm": 0.010921495966613293, + "learning_rate": 0.2963385224681196, + "loss": 0.1252, + "num_input_tokens_seen": 4831008, + "step": 2820 + }, + { + "epoch": 13.711864406779661, + "grad_norm": 0.011696339584887028, + "learning_rate": 0.29632557571736556, + "loss": 0.1262, + "num_input_tokens_seen": 4839296, + "step": 2825 + }, + { + "epoch": 13.736077481840194, + "grad_norm": 0.01296605821698904, + "learning_rate": 0.2963126064013689, + "loss": 0.1243, + "num_input_tokens_seen": 4847840, + "step": 2830 + }, + { + "epoch": 13.760290556900726, + "grad_norm": 0.015626100823283195, + "learning_rate": 0.29629961452212966, + "loss": 0.1577, + "num_input_tokens_seen": 4856608, + "step": 2835 + }, + { + "epoch": 13.78450363196126, + "grad_norm": 0.011631995439529419, + "learning_rate": 0.2962866000816513, + "loss": 0.1088, + "num_input_tokens_seen": 4865056, + "step": 2840 + }, + { + "epoch": 13.80871670702179, + "grad_norm": 0.01220032013952732, + "learning_rate": 0.2962735630819409, + "loss": 0.128, + "num_input_tokens_seen": 4873664, + "step": 2845 + }, + { + "epoch": 13.832929782082324, + "grad_norm": 0.010479170829057693, + "learning_rate": 0.2962605035250089, + "loss": 0.1285, + "num_input_tokens_seen": 4882144, + "step": 2850 + }, + { + "epoch": 13.857142857142858, + "grad_norm": 0.008999623358249664, + "learning_rate": 0.29624742141286914, + "loss": 0.0969, + "num_input_tokens_seen": 4890272, + "step": 2855 + }, + { + "epoch": 13.88135593220339, + "grad_norm": 0.01717103086411953, + "learning_rate": 0.29623431674753925, + "loss": 0.1514, + "num_input_tokens_seen": 4899200, + "step": 2860 + }, + { + "epoch": 13.905569007263923, + "grad_norm": 0.01234872080385685, + "learning_rate": 0.29622118953103993, + "loss": 0.1151, + "num_input_tokens_seen": 4907872, + "step": 2865 + }, + { + "epoch": 13.929782082324456, + "grad_norm": 0.018845394253730774, + "learning_rate": 0.2962080397653957, + "loss": 0.1452, + "num_input_tokens_seen": 4916832, + "step": 2870 + }, + { + "epoch": 13.953995157384988, + "grad_norm": 0.012085996568202972, + "learning_rate": 0.29619486745263435, + "loss": 0.1463, + "num_input_tokens_seen": 4924896, + "step": 2875 + }, + { + "epoch": 13.978208232445521, + "grad_norm": 0.016405681148171425, + "learning_rate": 0.2961816725947873, + "loss": 0.1525, + "num_input_tokens_seen": 4933312, + "step": 2880 + }, + { + "epoch": 14.004842615012107, + "grad_norm": 0.030664730817079544, + "learning_rate": 0.29616845519388924, + "loss": 0.1604, + "num_input_tokens_seen": 4942240, + "step": 2885 + }, + { + "epoch": 14.029055690072639, + "grad_norm": 0.012023496441543102, + "learning_rate": 0.2961552152519785, + "loss": 0.0927, + "num_input_tokens_seen": 4950688, + "step": 2890 + }, + { + "epoch": 14.053268765133172, + "grad_norm": 0.012118026614189148, + "learning_rate": 0.29614195277109695, + "loss": 0.1122, + "num_input_tokens_seen": 4959168, + "step": 2895 + }, + { + "epoch": 14.077481840193705, + "grad_norm": 0.014744431711733341, + "learning_rate": 0.2961286677532897, + "loss": 0.124, + "num_input_tokens_seen": 4967904, + "step": 2900 + }, + { + "epoch": 14.101694915254237, + "grad_norm": 0.01148089300841093, + "learning_rate": 0.2961153602006055, + "loss": 0.1308, + "num_input_tokens_seen": 4976512, + "step": 2905 + }, + { + "epoch": 14.12590799031477, + "grad_norm": 0.013524714857339859, + "learning_rate": 0.29610203011509656, + "loss": 0.1088, + "num_input_tokens_seen": 4985312, + "step": 2910 + }, + { + "epoch": 14.150121065375302, + "grad_norm": 0.012978832237422466, + "learning_rate": 0.29608867749881856, + "loss": 0.1187, + "num_input_tokens_seen": 4993728, + "step": 2915 + }, + { + "epoch": 14.174334140435835, + "grad_norm": 0.027687111869454384, + "learning_rate": 0.29607530235383067, + "loss": 0.1383, + "num_input_tokens_seen": 5001824, + "step": 2920 + }, + { + "epoch": 14.198547215496369, + "grad_norm": 0.016148077324032784, + "learning_rate": 0.2960619046821954, + "loss": 0.1185, + "num_input_tokens_seen": 5010784, + "step": 2925 + }, + { + "epoch": 14.2227602905569, + "grad_norm": 0.007757062092423439, + "learning_rate": 0.2960484844859789, + "loss": 0.1285, + "num_input_tokens_seen": 5019488, + "step": 2930 + }, + { + "epoch": 14.246973365617434, + "grad_norm": 0.010282677598297596, + "learning_rate": 0.29603504176725076, + "loss": 0.1247, + "num_input_tokens_seen": 5028064, + "step": 2935 + }, + { + "epoch": 14.271186440677965, + "grad_norm": 0.01250401884317398, + "learning_rate": 0.296021576528084, + "loss": 0.1023, + "num_input_tokens_seen": 5036992, + "step": 2940 + }, + { + "epoch": 14.295399515738499, + "grad_norm": 0.011794215068221092, + "learning_rate": 0.29600808877055507, + "loss": 0.1071, + "num_input_tokens_seen": 5045856, + "step": 2945 + }, + { + "epoch": 14.319612590799032, + "grad_norm": 0.0168713741004467, + "learning_rate": 0.29599457849674404, + "loss": 0.1284, + "num_input_tokens_seen": 5054304, + "step": 2950 + }, + { + "epoch": 14.343825665859564, + "grad_norm": 0.017568713054060936, + "learning_rate": 0.2959810457087343, + "loss": 0.1279, + "num_input_tokens_seen": 5062560, + "step": 2955 + }, + { + "epoch": 14.368038740920097, + "grad_norm": 0.013005728833377361, + "learning_rate": 0.2959674904086128, + "loss": 0.0777, + "num_input_tokens_seen": 5071264, + "step": 2960 + }, + { + "epoch": 14.392251815980629, + "grad_norm": 0.012556981295347214, + "learning_rate": 0.2959539125984699, + "loss": 0.146, + "num_input_tokens_seen": 5079488, + "step": 2965 + }, + { + "epoch": 14.416464891041162, + "grad_norm": 0.011061318218708038, + "learning_rate": 0.2959403122803996, + "loss": 0.144, + "num_input_tokens_seen": 5087584, + "step": 2970 + }, + { + "epoch": 14.440677966101696, + "grad_norm": 0.0102925980463624, + "learning_rate": 0.2959266894564991, + "loss": 0.1311, + "num_input_tokens_seen": 5096256, + "step": 2975 + }, + { + "epoch": 14.464891041162227, + "grad_norm": 0.009280265308916569, + "learning_rate": 0.2959130441288692, + "loss": 0.1205, + "num_input_tokens_seen": 5105120, + "step": 2980 + }, + { + "epoch": 14.48910411622276, + "grad_norm": 0.014796472154557705, + "learning_rate": 0.2958993762996143, + "loss": 0.1253, + "num_input_tokens_seen": 5113536, + "step": 2985 + }, + { + "epoch": 14.513317191283292, + "grad_norm": 0.012482735328376293, + "learning_rate": 0.2958856859708421, + "loss": 0.1167, + "num_input_tokens_seen": 5121888, + "step": 2990 + }, + { + "epoch": 14.537530266343826, + "grad_norm": 0.017815688624978065, + "learning_rate": 0.2958719731446638, + "loss": 0.1473, + "num_input_tokens_seen": 5130336, + "step": 2995 + }, + { + "epoch": 14.561743341404359, + "grad_norm": 0.014011642895638943, + "learning_rate": 0.29585823782319404, + "loss": 0.0872, + "num_input_tokens_seen": 5138720, + "step": 3000 + }, + { + "epoch": 14.561743341404359, + "eval_loss": 0.1859712451696396, + "eval_runtime": 4.6109, + "eval_samples_per_second": 79.595, + "eval_steps_per_second": 19.953, + "num_input_tokens_seen": 5138720, + "step": 3000 + }, + { + "epoch": 14.58595641646489, + "grad_norm": 0.010357539169490337, + "learning_rate": 0.2958444800085511, + "loss": 0.0991, + "num_input_tokens_seen": 5147584, + "step": 3005 + }, + { + "epoch": 14.610169491525424, + "grad_norm": 0.013112259097397327, + "learning_rate": 0.2958306997028565, + "loss": 0.0882, + "num_input_tokens_seen": 5156000, + "step": 3010 + }, + { + "epoch": 14.634382566585955, + "grad_norm": 0.0176407378166914, + "learning_rate": 0.2958168969082354, + "loss": 0.1468, + "num_input_tokens_seen": 5164736, + "step": 3015 + }, + { + "epoch": 14.658595641646489, + "grad_norm": 0.017539937049150467, + "learning_rate": 0.2958030716268164, + "loss": 0.1185, + "num_input_tokens_seen": 5173248, + "step": 3020 + }, + { + "epoch": 14.682808716707022, + "grad_norm": 0.01557779498398304, + "learning_rate": 0.2957892238607314, + "loss": 0.1582, + "num_input_tokens_seen": 5181408, + "step": 3025 + }, + { + "epoch": 14.707021791767554, + "grad_norm": 0.021295061334967613, + "learning_rate": 0.2957753536121161, + "loss": 0.1761, + "num_input_tokens_seen": 5189856, + "step": 3030 + }, + { + "epoch": 14.731234866828087, + "grad_norm": 0.012641767039895058, + "learning_rate": 0.29576146088310923, + "loss": 0.1242, + "num_input_tokens_seen": 5198304, + "step": 3035 + }, + { + "epoch": 14.75544794188862, + "grad_norm": 0.015193860046565533, + "learning_rate": 0.2957475456758533, + "loss": 0.1577, + "num_input_tokens_seen": 5206880, + "step": 3040 + }, + { + "epoch": 14.779661016949152, + "grad_norm": 0.012984483502805233, + "learning_rate": 0.2957336079924944, + "loss": 0.14, + "num_input_tokens_seen": 5215456, + "step": 3045 + }, + { + "epoch": 14.803874092009686, + "grad_norm": 0.011403502896428108, + "learning_rate": 0.2957196478351816, + "loss": 0.1115, + "num_input_tokens_seen": 5224320, + "step": 3050 + }, + { + "epoch": 14.828087167070217, + "grad_norm": 0.00975309032946825, + "learning_rate": 0.295705665206068, + "loss": 0.1168, + "num_input_tokens_seen": 5233120, + "step": 3055 + }, + { + "epoch": 14.85230024213075, + "grad_norm": 0.0055193183943629265, + "learning_rate": 0.2956916601073097, + "loss": 0.1505, + "num_input_tokens_seen": 5241152, + "step": 3060 + }, + { + "epoch": 14.876513317191284, + "grad_norm": 0.011295417323708534, + "learning_rate": 0.29567763254106655, + "loss": 0.1744, + "num_input_tokens_seen": 5249696, + "step": 3065 + }, + { + "epoch": 14.900726392251816, + "grad_norm": 0.01481185108423233, + "learning_rate": 0.29566358250950175, + "loss": 0.1448, + "num_input_tokens_seen": 5258400, + "step": 3070 + }, + { + "epoch": 14.924939467312349, + "grad_norm": 0.010032817721366882, + "learning_rate": 0.295649510014782, + "loss": 0.1419, + "num_input_tokens_seen": 5266784, + "step": 3075 + }, + { + "epoch": 14.94915254237288, + "grad_norm": 0.008769607171416283, + "learning_rate": 0.2956354150590775, + "loss": 0.139, + "num_input_tokens_seen": 5275456, + "step": 3080 + }, + { + "epoch": 14.973365617433414, + "grad_norm": 0.007984300144016743, + "learning_rate": 0.2956212976445618, + "loss": 0.122, + "num_input_tokens_seen": 5283744, + "step": 3085 + }, + { + "epoch": 14.997578692493947, + "grad_norm": 0.0064214435406029224, + "learning_rate": 0.295607157773412, + "loss": 0.1298, + "num_input_tokens_seen": 5292576, + "step": 3090 + }, + { + "epoch": 15.024213075060533, + "grad_norm": 0.007531640585511923, + "learning_rate": 0.2955929954478087, + "loss": 0.0906, + "num_input_tokens_seen": 5301632, + "step": 3095 + }, + { + "epoch": 15.048426150121065, + "grad_norm": 0.01177139300853014, + "learning_rate": 0.29557881066993585, + "loss": 0.0974, + "num_input_tokens_seen": 5310016, + "step": 3100 + }, + { + "epoch": 15.072639225181598, + "grad_norm": 0.009540482424199581, + "learning_rate": 0.29556460344198093, + "loss": 0.0734, + "num_input_tokens_seen": 5318784, + "step": 3105 + }, + { + "epoch": 15.09685230024213, + "grad_norm": 0.011741126887500286, + "learning_rate": 0.29555037376613486, + "loss": 0.0913, + "num_input_tokens_seen": 5327072, + "step": 3110 + }, + { + "epoch": 15.121065375302663, + "grad_norm": 0.0078929103910923, + "learning_rate": 0.29553612164459203, + "loss": 0.0719, + "num_input_tokens_seen": 5335744, + "step": 3115 + }, + { + "epoch": 15.145278450363197, + "grad_norm": 0.010534374043345451, + "learning_rate": 0.29552184707955037, + "loss": 0.1009, + "num_input_tokens_seen": 5344544, + "step": 3120 + }, + { + "epoch": 15.169491525423728, + "grad_norm": 0.017737820744514465, + "learning_rate": 0.29550755007321117, + "loss": 0.0953, + "num_input_tokens_seen": 5353056, + "step": 3125 + }, + { + "epoch": 15.193704600484262, + "grad_norm": 0.015537609346210957, + "learning_rate": 0.29549323062777916, + "loss": 0.0872, + "num_input_tokens_seen": 5361600, + "step": 3130 + }, + { + "epoch": 15.217917675544793, + "grad_norm": 0.013185004703700542, + "learning_rate": 0.29547888874546263, + "loss": 0.1167, + "num_input_tokens_seen": 5369952, + "step": 3135 + }, + { + "epoch": 15.242130750605327, + "grad_norm": 0.013041634112596512, + "learning_rate": 0.2954645244284732, + "loss": 0.1135, + "num_input_tokens_seen": 5378144, + "step": 3140 + }, + { + "epoch": 15.26634382566586, + "grad_norm": 0.010987133719027042, + "learning_rate": 0.2954501376790261, + "loss": 0.0918, + "num_input_tokens_seen": 5386432, + "step": 3145 + }, + { + "epoch": 15.290556900726392, + "grad_norm": 0.013591373339295387, + "learning_rate": 0.29543572849933997, + "loss": 0.096, + "num_input_tokens_seen": 5394784, + "step": 3150 + }, + { + "epoch": 15.314769975786925, + "grad_norm": 0.018982941284775734, + "learning_rate": 0.2954212968916368, + "loss": 0.1151, + "num_input_tokens_seen": 5403232, + "step": 3155 + }, + { + "epoch": 15.338983050847457, + "grad_norm": 0.01216250378638506, + "learning_rate": 0.29540684285814217, + "loss": 0.1211, + "num_input_tokens_seen": 5412256, + "step": 3160 + }, + { + "epoch": 15.36319612590799, + "grad_norm": 0.013632924295961857, + "learning_rate": 0.2953923664010851, + "loss": 0.1129, + "num_input_tokens_seen": 5421024, + "step": 3165 + }, + { + "epoch": 15.387409200968523, + "grad_norm": 0.01417049765586853, + "learning_rate": 0.295377867522698, + "loss": 0.1187, + "num_input_tokens_seen": 5429632, + "step": 3170 + }, + { + "epoch": 15.411622276029055, + "grad_norm": 0.015546616166830063, + "learning_rate": 0.2953633462252168, + "loss": 0.1417, + "num_input_tokens_seen": 5438112, + "step": 3175 + }, + { + "epoch": 15.435835351089588, + "grad_norm": 0.023353496566414833, + "learning_rate": 0.2953488025108809, + "loss": 0.1208, + "num_input_tokens_seen": 5446464, + "step": 3180 + }, + { + "epoch": 15.460048426150122, + "grad_norm": 0.015099822543561459, + "learning_rate": 0.295334236381933, + "loss": 0.1335, + "num_input_tokens_seen": 5455040, + "step": 3185 + }, + { + "epoch": 15.484261501210653, + "grad_norm": 0.020804159343242645, + "learning_rate": 0.29531964784061954, + "loss": 0.1318, + "num_input_tokens_seen": 5463456, + "step": 3190 + }, + { + "epoch": 15.508474576271187, + "grad_norm": 0.018768493086099625, + "learning_rate": 0.2953050368891902, + "loss": 0.1329, + "num_input_tokens_seen": 5472256, + "step": 3195 + }, + { + "epoch": 15.532687651331718, + "grad_norm": 0.018365364521741867, + "learning_rate": 0.29529040352989805, + "loss": 0.1445, + "num_input_tokens_seen": 5480512, + "step": 3200 + }, + { + "epoch": 15.532687651331718, + "eval_loss": 0.20996984839439392, + "eval_runtime": 4.6226, + "eval_samples_per_second": 79.393, + "eval_steps_per_second": 19.902, + "num_input_tokens_seen": 5480512, + "step": 3200 + }, + { + "epoch": 15.556900726392252, + "grad_norm": 0.012136989273130894, + "learning_rate": 0.29527574776499993, + "loss": 0.1398, + "num_input_tokens_seen": 5489088, + "step": 3205 + }, + { + "epoch": 15.581113801452785, + "grad_norm": 0.019440749660134315, + "learning_rate": 0.2952610695967558, + "loss": 0.151, + "num_input_tokens_seen": 5497568, + "step": 3210 + }, + { + "epoch": 15.605326876513317, + "grad_norm": 0.011852381750941277, + "learning_rate": 0.29524636902742935, + "loss": 0.1481, + "num_input_tokens_seen": 5506368, + "step": 3215 + }, + { + "epoch": 15.62953995157385, + "grad_norm": 0.018219228833913803, + "learning_rate": 0.2952316460592875, + "loss": 0.1834, + "num_input_tokens_seen": 5514912, + "step": 3220 + }, + { + "epoch": 15.653753026634382, + "grad_norm": 0.010144869796931744, + "learning_rate": 0.29521690069460066, + "loss": 0.1717, + "num_input_tokens_seen": 5523520, + "step": 3225 + }, + { + "epoch": 15.677966101694915, + "grad_norm": 0.007609181106090546, + "learning_rate": 0.29520213293564285, + "loss": 0.1842, + "num_input_tokens_seen": 5532224, + "step": 3230 + }, + { + "epoch": 15.702179176755449, + "grad_norm": 0.008561758324503899, + "learning_rate": 0.29518734278469144, + "loss": 0.1787, + "num_input_tokens_seen": 5541056, + "step": 3235 + }, + { + "epoch": 15.72639225181598, + "grad_norm": 0.011923378333449364, + "learning_rate": 0.29517253024402723, + "loss": 0.1642, + "num_input_tokens_seen": 5549376, + "step": 3240 + }, + { + "epoch": 15.750605326876514, + "grad_norm": 0.009497306309640408, + "learning_rate": 0.2951576953159345, + "loss": 0.1488, + "num_input_tokens_seen": 5557952, + "step": 3245 + }, + { + "epoch": 15.774818401937045, + "grad_norm": 0.009900815784931183, + "learning_rate": 0.29514283800270097, + "loss": 0.1048, + "num_input_tokens_seen": 5566720, + "step": 3250 + }, + { + "epoch": 15.799031476997579, + "grad_norm": 0.012577936984598637, + "learning_rate": 0.2951279583066179, + "loss": 0.1538, + "num_input_tokens_seen": 5575168, + "step": 3255 + }, + { + "epoch": 15.823244552058112, + "grad_norm": 0.00897025316953659, + "learning_rate": 0.2951130562299798, + "loss": 0.1301, + "num_input_tokens_seen": 5583936, + "step": 3260 + }, + { + "epoch": 15.847457627118644, + "grad_norm": 0.008265340700745583, + "learning_rate": 0.29509813177508487, + "loss": 0.1353, + "num_input_tokens_seen": 5592256, + "step": 3265 + }, + { + "epoch": 15.871670702179177, + "grad_norm": 0.006922699511051178, + "learning_rate": 0.2950831849442346, + "loss": 0.1344, + "num_input_tokens_seen": 5600896, + "step": 3270 + }, + { + "epoch": 15.89588377723971, + "grad_norm": 0.012416384182870388, + "learning_rate": 0.2950682157397339, + "loss": 0.1317, + "num_input_tokens_seen": 5609312, + "step": 3275 + }, + { + "epoch": 15.920096852300242, + "grad_norm": 0.009429411962628365, + "learning_rate": 0.2950532241638914, + "loss": 0.113, + "num_input_tokens_seen": 5617632, + "step": 3280 + }, + { + "epoch": 15.944309927360775, + "grad_norm": 0.00930047407746315, + "learning_rate": 0.2950382102190188, + "loss": 0.1224, + "num_input_tokens_seen": 5626080, + "step": 3285 + }, + { + "epoch": 15.968523002421307, + "grad_norm": 0.012953980825841427, + "learning_rate": 0.2950231739074316, + "loss": 0.1358, + "num_input_tokens_seen": 5634624, + "step": 3290 + }, + { + "epoch": 15.99273607748184, + "grad_norm": 0.008190843276679516, + "learning_rate": 0.29500811523144843, + "loss": 0.1221, + "num_input_tokens_seen": 5643264, + "step": 3295 + }, + { + "epoch": 16.019370460048425, + "grad_norm": 0.014819824136793613, + "learning_rate": 0.2949930341933917, + "loss": 0.1351, + "num_input_tokens_seen": 5652160, + "step": 3300 + }, + { + "epoch": 16.043583535108958, + "grad_norm": 0.009277250617742538, + "learning_rate": 0.29497793079558693, + "loss": 0.122, + "num_input_tokens_seen": 5660544, + "step": 3305 + }, + { + "epoch": 16.06779661016949, + "grad_norm": 0.008962602354586124, + "learning_rate": 0.2949628050403633, + "loss": 0.0798, + "num_input_tokens_seen": 5668800, + "step": 3310 + }, + { + "epoch": 16.092009685230025, + "grad_norm": 0.014605077914893627, + "learning_rate": 0.2949476569300535, + "loss": 0.1032, + "num_input_tokens_seen": 5677120, + "step": 3315 + }, + { + "epoch": 16.116222760290558, + "grad_norm": 0.011486216448247433, + "learning_rate": 0.29493248646699344, + "loss": 0.082, + "num_input_tokens_seen": 5685920, + "step": 3320 + }, + { + "epoch": 16.140435835351088, + "grad_norm": 0.01914600282907486, + "learning_rate": 0.29491729365352265, + "loss": 0.0878, + "num_input_tokens_seen": 5694016, + "step": 3325 + }, + { + "epoch": 16.16464891041162, + "grad_norm": 0.0074091418646276, + "learning_rate": 0.29490207849198397, + "loss": 0.0852, + "num_input_tokens_seen": 5702816, + "step": 3330 + }, + { + "epoch": 16.188861985472155, + "grad_norm": 0.011557982303202152, + "learning_rate": 0.29488684098472384, + "loss": 0.1002, + "num_input_tokens_seen": 5711360, + "step": 3335 + }, + { + "epoch": 16.213075060532688, + "grad_norm": 0.02321610040962696, + "learning_rate": 0.2948715811340921, + "loss": 0.081, + "num_input_tokens_seen": 5719552, + "step": 3340 + }, + { + "epoch": 16.23728813559322, + "grad_norm": 0.01713506318628788, + "learning_rate": 0.294856298942442, + "loss": 0.1314, + "num_input_tokens_seen": 5728448, + "step": 3345 + }, + { + "epoch": 16.26150121065375, + "grad_norm": 0.009307526983320713, + "learning_rate": 0.2948409944121302, + "loss": 0.0867, + "num_input_tokens_seen": 5737248, + "step": 3350 + }, + { + "epoch": 16.285714285714285, + "grad_norm": 0.00672230776399374, + "learning_rate": 0.29482566754551687, + "loss": 0.0772, + "num_input_tokens_seen": 5745632, + "step": 3355 + }, + { + "epoch": 16.309927360774818, + "grad_norm": 0.01371297612786293, + "learning_rate": 0.2948103183449656, + "loss": 0.1347, + "num_input_tokens_seen": 5754016, + "step": 3360 + }, + { + "epoch": 16.33414043583535, + "grad_norm": 0.0141986683011055, + "learning_rate": 0.2947949468128435, + "loss": 0.1039, + "num_input_tokens_seen": 5762400, + "step": 3365 + }, + { + "epoch": 16.358353510895885, + "grad_norm": 0.012636539526283741, + "learning_rate": 0.2947795529515209, + "loss": 0.0996, + "num_input_tokens_seen": 5771328, + "step": 3370 + }, + { + "epoch": 16.38256658595642, + "grad_norm": 0.02092861756682396, + "learning_rate": 0.29476413676337193, + "loss": 0.0904, + "num_input_tokens_seen": 5779584, + "step": 3375 + }, + { + "epoch": 16.406779661016948, + "grad_norm": 0.011557938531041145, + "learning_rate": 0.2947486982507738, + "loss": 0.0861, + "num_input_tokens_seen": 5788256, + "step": 3380 + }, + { + "epoch": 16.43099273607748, + "grad_norm": 0.009938221424818039, + "learning_rate": 0.29473323741610735, + "loss": 0.086, + "num_input_tokens_seen": 5796608, + "step": 3385 + }, + { + "epoch": 16.455205811138015, + "grad_norm": 0.01508268527686596, + "learning_rate": 0.2947177542617569, + "loss": 0.1266, + "num_input_tokens_seen": 5805184, + "step": 3390 + }, + { + "epoch": 16.479418886198548, + "grad_norm": 0.01632162369787693, + "learning_rate": 0.2947022487901101, + "loss": 0.1123, + "num_input_tokens_seen": 5813696, + "step": 3395 + }, + { + "epoch": 16.50363196125908, + "grad_norm": 0.017126794904470444, + "learning_rate": 0.2946867210035581, + "loss": 0.1433, + "num_input_tokens_seen": 5822816, + "step": 3400 + }, + { + "epoch": 16.50363196125908, + "eval_loss": 0.18770189583301544, + "eval_runtime": 4.6207, + "eval_samples_per_second": 79.425, + "eval_steps_per_second": 19.91, + "num_input_tokens_seen": 5822816, + "step": 3400 + }, + { + "epoch": 16.52784503631961, + "grad_norm": 0.012776474468410015, + "learning_rate": 0.2946711709044954, + "loss": 0.1079, + "num_input_tokens_seen": 5831520, + "step": 3405 + }, + { + "epoch": 16.552058111380145, + "grad_norm": 0.008290711790323257, + "learning_rate": 0.2946555984953202, + "loss": 0.1312, + "num_input_tokens_seen": 5840416, + "step": 3410 + }, + { + "epoch": 16.576271186440678, + "grad_norm": 0.011036786250770092, + "learning_rate": 0.2946400037784338, + "loss": 0.1204, + "num_input_tokens_seen": 5848960, + "step": 3415 + }, + { + "epoch": 16.60048426150121, + "grad_norm": 0.008906779810786247, + "learning_rate": 0.29462438675624114, + "loss": 0.1181, + "num_input_tokens_seen": 5857536, + "step": 3420 + }, + { + "epoch": 16.624697336561745, + "grad_norm": 0.020741473883390427, + "learning_rate": 0.2946087474311506, + "loss": 0.1401, + "num_input_tokens_seen": 5865952, + "step": 3425 + }, + { + "epoch": 16.648910411622275, + "grad_norm": 0.011480086483061314, + "learning_rate": 0.294593085805574, + "loss": 0.1683, + "num_input_tokens_seen": 5874144, + "step": 3430 + }, + { + "epoch": 16.673123486682808, + "grad_norm": 0.011766710318624973, + "learning_rate": 0.2945774018819264, + "loss": 0.123, + "num_input_tokens_seen": 5882688, + "step": 3435 + }, + { + "epoch": 16.69733656174334, + "grad_norm": 0.011161262169480324, + "learning_rate": 0.2945616956626266, + "loss": 0.1303, + "num_input_tokens_seen": 5891328, + "step": 3440 + }, + { + "epoch": 16.721549636803875, + "grad_norm": 0.007787701208144426, + "learning_rate": 0.2945459671500966, + "loss": 0.1286, + "num_input_tokens_seen": 5899648, + "step": 3445 + }, + { + "epoch": 16.74576271186441, + "grad_norm": 0.01261927466839552, + "learning_rate": 0.2945302163467621, + "loss": 0.1204, + "num_input_tokens_seen": 5908224, + "step": 3450 + }, + { + "epoch": 16.769975786924938, + "grad_norm": 0.014145327731966972, + "learning_rate": 0.2945144432550519, + "loss": 0.1499, + "num_input_tokens_seen": 5917344, + "step": 3455 + }, + { + "epoch": 16.79418886198547, + "grad_norm": 0.012529561296105385, + "learning_rate": 0.29449864787739843, + "loss": 0.1271, + "num_input_tokens_seen": 5925728, + "step": 3460 + }, + { + "epoch": 16.818401937046005, + "grad_norm": 0.01656254567205906, + "learning_rate": 0.2944828302162376, + "loss": 0.1589, + "num_input_tokens_seen": 5934656, + "step": 3465 + }, + { + "epoch": 16.84261501210654, + "grad_norm": 0.010193984024226665, + "learning_rate": 0.2944669902740087, + "loss": 0.1318, + "num_input_tokens_seen": 5942976, + "step": 3470 + }, + { + "epoch": 16.86682808716707, + "grad_norm": 0.010315504856407642, + "learning_rate": 0.2944511280531544, + "loss": 0.1186, + "num_input_tokens_seen": 5951360, + "step": 3475 + }, + { + "epoch": 16.8910411622276, + "grad_norm": 0.010301398113369942, + "learning_rate": 0.29443524355612083, + "loss": 0.1153, + "num_input_tokens_seen": 5960032, + "step": 3480 + }, + { + "epoch": 16.915254237288135, + "grad_norm": 0.011368426494300365, + "learning_rate": 0.29441933678535764, + "loss": 0.1225, + "num_input_tokens_seen": 5968352, + "step": 3485 + }, + { + "epoch": 16.93946731234867, + "grad_norm": 0.020202303305268288, + "learning_rate": 0.29440340774331786, + "loss": 0.139, + "num_input_tokens_seen": 5977056, + "step": 3490 + }, + { + "epoch": 16.9636803874092, + "grad_norm": 0.010164718143641949, + "learning_rate": 0.2943874564324579, + "loss": 0.0969, + "num_input_tokens_seen": 5985472, + "step": 3495 + }, + { + "epoch": 16.987893462469735, + "grad_norm": 0.00819008145481348, + "learning_rate": 0.2943714828552376, + "loss": 0.1022, + "num_input_tokens_seen": 5994112, + "step": 3500 + }, + { + "epoch": 17.01452784503632, + "grad_norm": 0.005511843133717775, + "learning_rate": 0.29435548701412045, + "loss": 0.1074, + "num_input_tokens_seen": 6003392, + "step": 3505 + }, + { + "epoch": 17.038740920096853, + "grad_norm": 0.005678016226738691, + "learning_rate": 0.2943394689115731, + "loss": 0.0746, + "num_input_tokens_seen": 6011584, + "step": 3510 + }, + { + "epoch": 17.062953995157386, + "grad_norm": 0.010749926790595055, + "learning_rate": 0.29432342855006577, + "loss": 0.0617, + "num_input_tokens_seen": 6020032, + "step": 3515 + }, + { + "epoch": 17.087167070217916, + "grad_norm": 0.010034185834228992, + "learning_rate": 0.294307365932072, + "loss": 0.0701, + "num_input_tokens_seen": 6028576, + "step": 3520 + }, + { + "epoch": 17.11138014527845, + "grad_norm": 0.011064323596656322, + "learning_rate": 0.294291281060069, + "loss": 0.0628, + "num_input_tokens_seen": 6037024, + "step": 3525 + }, + { + "epoch": 17.135593220338983, + "grad_norm": 0.017009932547807693, + "learning_rate": 0.29427517393653724, + "loss": 0.1088, + "num_input_tokens_seen": 6045344, + "step": 3530 + }, + { + "epoch": 17.159806295399516, + "grad_norm": 0.012858324684202671, + "learning_rate": 0.29425904456396046, + "loss": 0.0848, + "num_input_tokens_seen": 6053728, + "step": 3535 + }, + { + "epoch": 17.18401937046005, + "grad_norm": 0.016334209591150284, + "learning_rate": 0.2942428929448262, + "loss": 0.1016, + "num_input_tokens_seen": 6062656, + "step": 3540 + }, + { + "epoch": 17.208232445520583, + "grad_norm": 0.012315157800912857, + "learning_rate": 0.2942267190816252, + "loss": 0.0919, + "num_input_tokens_seen": 6071104, + "step": 3545 + }, + { + "epoch": 17.232445520581113, + "grad_norm": 0.006759049370884895, + "learning_rate": 0.2942105229768516, + "loss": 0.0703, + "num_input_tokens_seen": 6079840, + "step": 3550 + }, + { + "epoch": 17.256658595641646, + "grad_norm": 0.007663401775062084, + "learning_rate": 0.29419430463300306, + "loss": 0.0556, + "num_input_tokens_seen": 6088672, + "step": 3555 + }, + { + "epoch": 17.28087167070218, + "grad_norm": 0.014961505308747292, + "learning_rate": 0.2941780640525808, + "loss": 0.086, + "num_input_tokens_seen": 6097056, + "step": 3560 + }, + { + "epoch": 17.305084745762713, + "grad_norm": 0.014259983785450459, + "learning_rate": 0.2941618012380891, + "loss": 0.0915, + "num_input_tokens_seen": 6105376, + "step": 3565 + }, + { + "epoch": 17.329297820823246, + "grad_norm": 0.012175391428172588, + "learning_rate": 0.29414551619203605, + "loss": 0.0954, + "num_input_tokens_seen": 6114080, + "step": 3570 + }, + { + "epoch": 17.353510895883776, + "grad_norm": 0.012599071487784386, + "learning_rate": 0.29412920891693295, + "loss": 0.0916, + "num_input_tokens_seen": 6122656, + "step": 3575 + }, + { + "epoch": 17.37772397094431, + "grad_norm": 0.011397581547498703, + "learning_rate": 0.2941128794152946, + "loss": 0.082, + "num_input_tokens_seen": 6131456, + "step": 3580 + }, + { + "epoch": 17.401937046004843, + "grad_norm": 0.013944916427135468, + "learning_rate": 0.2940965276896392, + "loss": 0.0645, + "num_input_tokens_seen": 6139872, + "step": 3585 + }, + { + "epoch": 17.426150121065376, + "grad_norm": 0.03121156245470047, + "learning_rate": 0.2940801537424884, + "loss": 0.1038, + "num_input_tokens_seen": 6148128, + "step": 3590 + }, + { + "epoch": 17.45036319612591, + "grad_norm": 0.01379704661667347, + "learning_rate": 0.2940637575763673, + "loss": 0.1271, + "num_input_tokens_seen": 6156512, + "step": 3595 + }, + { + "epoch": 17.47457627118644, + "grad_norm": 0.0148248840123415, + "learning_rate": 0.2940473391938043, + "loss": 0.1014, + "num_input_tokens_seen": 6165056, + "step": 3600 + }, + { + "epoch": 17.47457627118644, + "eval_loss": 0.2470071017742157, + "eval_runtime": 4.6122, + "eval_samples_per_second": 79.572, + "eval_steps_per_second": 19.947, + "num_input_tokens_seen": 6165056, + "step": 3600 + }, + { + "epoch": 17.498789346246973, + "grad_norm": 0.006890692748129368, + "learning_rate": 0.29403089859733145, + "loss": 0.1078, + "num_input_tokens_seen": 6173856, + "step": 3605 + }, + { + "epoch": 17.523002421307506, + "grad_norm": 0.00907644722610712, + "learning_rate": 0.294014435789484, + "loss": 0.1092, + "num_input_tokens_seen": 6182496, + "step": 3610 + }, + { + "epoch": 17.54721549636804, + "grad_norm": 0.013525772839784622, + "learning_rate": 0.2939979507728007, + "loss": 0.1013, + "num_input_tokens_seen": 6191200, + "step": 3615 + }, + { + "epoch": 17.571428571428573, + "grad_norm": 0.009627921506762505, + "learning_rate": 0.2939814435498239, + "loss": 0.1048, + "num_input_tokens_seen": 6199968, + "step": 3620 + }, + { + "epoch": 17.595641646489103, + "grad_norm": 0.01230956893414259, + "learning_rate": 0.29396491412309905, + "loss": 0.0991, + "num_input_tokens_seen": 6208608, + "step": 3625 + }, + { + "epoch": 17.619854721549636, + "grad_norm": 0.016701167449355125, + "learning_rate": 0.2939483624951753, + "loss": 0.1277, + "num_input_tokens_seen": 6217408, + "step": 3630 + }, + { + "epoch": 17.64406779661017, + "grad_norm": 0.011697192676365376, + "learning_rate": 0.2939317886686051, + "loss": 0.1169, + "num_input_tokens_seen": 6226240, + "step": 3635 + }, + { + "epoch": 17.668280871670703, + "grad_norm": 0.01275196298956871, + "learning_rate": 0.2939151926459443, + "loss": 0.111, + "num_input_tokens_seen": 6234880, + "step": 3640 + }, + { + "epoch": 17.692493946731236, + "grad_norm": 0.009507134556770325, + "learning_rate": 0.2938985744297522, + "loss": 0.0677, + "num_input_tokens_seen": 6243008, + "step": 3645 + }, + { + "epoch": 17.716707021791766, + "grad_norm": 0.01599535159766674, + "learning_rate": 0.29388193402259166, + "loss": 0.0949, + "num_input_tokens_seen": 6251584, + "step": 3650 + }, + { + "epoch": 17.7409200968523, + "grad_norm": 0.015867726877331734, + "learning_rate": 0.29386527142702873, + "loss": 0.1326, + "num_input_tokens_seen": 6260192, + "step": 3655 + }, + { + "epoch": 17.765133171912833, + "grad_norm": 0.01095310878008604, + "learning_rate": 0.293848586645633, + "loss": 0.1481, + "num_input_tokens_seen": 6268704, + "step": 3660 + }, + { + "epoch": 17.789346246973366, + "grad_norm": 0.012851768173277378, + "learning_rate": 0.2938318796809775, + "loss": 0.0991, + "num_input_tokens_seen": 6277120, + "step": 3665 + }, + { + "epoch": 17.8135593220339, + "grad_norm": 0.01639559678733349, + "learning_rate": 0.29381515053563867, + "loss": 0.1074, + "num_input_tokens_seen": 6285472, + "step": 3670 + }, + { + "epoch": 17.83777239709443, + "grad_norm": 0.010194025933742523, + "learning_rate": 0.29379839921219636, + "loss": 0.0868, + "num_input_tokens_seen": 6294016, + "step": 3675 + }, + { + "epoch": 17.861985472154963, + "grad_norm": 0.01520596444606781, + "learning_rate": 0.2937816257132338, + "loss": 0.1178, + "num_input_tokens_seen": 6302464, + "step": 3680 + }, + { + "epoch": 17.886198547215496, + "grad_norm": 0.014867817983031273, + "learning_rate": 0.2937648300413376, + "loss": 0.1263, + "num_input_tokens_seen": 6310976, + "step": 3685 + }, + { + "epoch": 17.91041162227603, + "grad_norm": 0.006102949846535921, + "learning_rate": 0.293748012199098, + "loss": 0.1407, + "num_input_tokens_seen": 6319392, + "step": 3690 + }, + { + "epoch": 17.934624697336563, + "grad_norm": 0.008295104838907719, + "learning_rate": 0.29373117218910844, + "loss": 0.1056, + "num_input_tokens_seen": 6327936, + "step": 3695 + }, + { + "epoch": 17.958837772397093, + "grad_norm": 0.013421069830656052, + "learning_rate": 0.2937143100139659, + "loss": 0.1107, + "num_input_tokens_seen": 6336704, + "step": 3700 + }, + { + "epoch": 17.983050847457626, + "grad_norm": 0.013998658396303654, + "learning_rate": 0.29369742567627083, + "loss": 0.1178, + "num_input_tokens_seen": 6345280, + "step": 3705 + }, + { + "epoch": 18.009685230024214, + "grad_norm": 0.012872147373855114, + "learning_rate": 0.29368051917862675, + "loss": 0.1268, + "num_input_tokens_seen": 6354016, + "step": 3710 + }, + { + "epoch": 18.033898305084747, + "grad_norm": 0.011284693144261837, + "learning_rate": 0.2936635905236411, + "loss": 0.1131, + "num_input_tokens_seen": 6362592, + "step": 3715 + }, + { + "epoch": 18.058111380145277, + "grad_norm": 0.01060469076037407, + "learning_rate": 0.2936466397139244, + "loss": 0.0769, + "num_input_tokens_seen": 6371072, + "step": 3720 + }, + { + "epoch": 18.08232445520581, + "grad_norm": 0.008602946996688843, + "learning_rate": 0.2936296667520907, + "loss": 0.0927, + "num_input_tokens_seen": 6379840, + "step": 3725 + }, + { + "epoch": 18.106537530266344, + "grad_norm": 0.010528785176575184, + "learning_rate": 0.2936126716407574, + "loss": 0.0822, + "num_input_tokens_seen": 6388416, + "step": 3730 + }, + { + "epoch": 18.130750605326877, + "grad_norm": 0.010453174822032452, + "learning_rate": 0.29359565438254537, + "loss": 0.0655, + "num_input_tokens_seen": 6396800, + "step": 3735 + }, + { + "epoch": 18.15496368038741, + "grad_norm": 0.012616252526640892, + "learning_rate": 0.29357861498007887, + "loss": 0.0728, + "num_input_tokens_seen": 6405088, + "step": 3740 + }, + { + "epoch": 18.17917675544794, + "grad_norm": 0.01565117947757244, + "learning_rate": 0.29356155343598567, + "loss": 0.0681, + "num_input_tokens_seen": 6413984, + "step": 3745 + }, + { + "epoch": 18.203389830508474, + "grad_norm": 0.01340776402503252, + "learning_rate": 0.2935444697528968, + "loss": 0.0584, + "num_input_tokens_seen": 6422880, + "step": 3750 + }, + { + "epoch": 18.227602905569007, + "grad_norm": 0.006106976419687271, + "learning_rate": 0.2935273639334468, + "loss": 0.0421, + "num_input_tokens_seen": 6431296, + "step": 3755 + }, + { + "epoch": 18.25181598062954, + "grad_norm": 0.017731891945004463, + "learning_rate": 0.29351023598027365, + "loss": 0.0678, + "num_input_tokens_seen": 6439776, + "step": 3760 + }, + { + "epoch": 18.276029055690074, + "grad_norm": 0.013278374448418617, + "learning_rate": 0.2934930858960186, + "loss": 0.051, + "num_input_tokens_seen": 6448128, + "step": 3765 + }, + { + "epoch": 18.300242130750604, + "grad_norm": 0.014810338616371155, + "learning_rate": 0.29347591368332643, + "loss": 0.0845, + "num_input_tokens_seen": 6456672, + "step": 3770 + }, + { + "epoch": 18.324455205811137, + "grad_norm": 0.010408693924546242, + "learning_rate": 0.2934587193448454, + "loss": 0.088, + "num_input_tokens_seen": 6465216, + "step": 3775 + }, + { + "epoch": 18.34866828087167, + "grad_norm": 0.012235943228006363, + "learning_rate": 0.29344150288322696, + "loss": 0.0674, + "num_input_tokens_seen": 6473280, + "step": 3780 + }, + { + "epoch": 18.372881355932204, + "grad_norm": 0.014035950414836407, + "learning_rate": 0.2934242643011263, + "loss": 0.0651, + "num_input_tokens_seen": 6482048, + "step": 3785 + }, + { + "epoch": 18.397094430992738, + "grad_norm": 0.013662065379321575, + "learning_rate": 0.2934070036012016, + "loss": 0.0813, + "num_input_tokens_seen": 6490432, + "step": 3790 + }, + { + "epoch": 18.421307506053267, + "grad_norm": 0.015790646895766258, + "learning_rate": 0.29338972078611475, + "loss": 0.0788, + "num_input_tokens_seen": 6498720, + "step": 3795 + }, + { + "epoch": 18.4455205811138, + "grad_norm": 0.02461019717156887, + "learning_rate": 0.2933724158585311, + "loss": 0.1101, + "num_input_tokens_seen": 6507264, + "step": 3800 + }, + { + "epoch": 18.4455205811138, + "eval_loss": 0.2416277825832367, + "eval_runtime": 4.6238, + "eval_samples_per_second": 79.372, + "eval_steps_per_second": 19.897, + "num_input_tokens_seen": 6507264, + "step": 3800 + }, + { + "epoch": 18.469733656174334, + "grad_norm": 0.011180555447936058, + "learning_rate": 0.29335508882111916, + "loss": 0.0712, + "num_input_tokens_seen": 6515648, + "step": 3805 + }, + { + "epoch": 18.493946731234868, + "grad_norm": 0.009392514824867249, + "learning_rate": 0.29333773967655097, + "loss": 0.0621, + "num_input_tokens_seen": 6524064, + "step": 3810 + }, + { + "epoch": 18.5181598062954, + "grad_norm": 0.011687814258038998, + "learning_rate": 0.2933203684275021, + "loss": 0.1034, + "num_input_tokens_seen": 6533184, + "step": 3815 + }, + { + "epoch": 18.54237288135593, + "grad_norm": 0.013718221336603165, + "learning_rate": 0.2933029750766513, + "loss": 0.1175, + "num_input_tokens_seen": 6542016, + "step": 3820 + }, + { + "epoch": 18.566585956416464, + "grad_norm": 0.008042431436479092, + "learning_rate": 0.2932855596266809, + "loss": 0.081, + "num_input_tokens_seen": 6550528, + "step": 3825 + }, + { + "epoch": 18.590799031476998, + "grad_norm": 0.02205514907836914, + "learning_rate": 0.2932681220802765, + "loss": 0.1132, + "num_input_tokens_seen": 6558752, + "step": 3830 + }, + { + "epoch": 18.61501210653753, + "grad_norm": 0.009436814114451408, + "learning_rate": 0.2932506624401274, + "loss": 0.1003, + "num_input_tokens_seen": 6567200, + "step": 3835 + }, + { + "epoch": 18.639225181598064, + "grad_norm": 0.016103224828839302, + "learning_rate": 0.29323318070892584, + "loss": 0.1403, + "num_input_tokens_seen": 6576128, + "step": 3840 + }, + { + "epoch": 18.663438256658594, + "grad_norm": 0.01510174572467804, + "learning_rate": 0.29321567688936784, + "loss": 0.1137, + "num_input_tokens_seen": 6584320, + "step": 3845 + }, + { + "epoch": 18.687651331719128, + "grad_norm": 0.009877965785562992, + "learning_rate": 0.29319815098415275, + "loss": 0.1076, + "num_input_tokens_seen": 6592864, + "step": 3850 + }, + { + "epoch": 18.71186440677966, + "grad_norm": 0.0094504589214921, + "learning_rate": 0.2931806029959832, + "loss": 0.1221, + "num_input_tokens_seen": 6601184, + "step": 3855 + }, + { + "epoch": 18.736077481840194, + "grad_norm": 0.011632287874817848, + "learning_rate": 0.29316303292756535, + "loss": 0.1129, + "num_input_tokens_seen": 6609664, + "step": 3860 + }, + { + "epoch": 18.760290556900728, + "grad_norm": 0.023679295554757118, + "learning_rate": 0.29314544078160876, + "loss": 0.1143, + "num_input_tokens_seen": 6618112, + "step": 3865 + }, + { + "epoch": 18.784503631961257, + "grad_norm": 0.011044162325561047, + "learning_rate": 0.2931278265608263, + "loss": 0.0834, + "num_input_tokens_seen": 6626880, + "step": 3870 + }, + { + "epoch": 18.80871670702179, + "grad_norm": 0.017740242183208466, + "learning_rate": 0.29311019026793433, + "loss": 0.1213, + "num_input_tokens_seen": 6635392, + "step": 3875 + }, + { + "epoch": 18.832929782082324, + "grad_norm": 0.009513427503407001, + "learning_rate": 0.29309253190565254, + "loss": 0.0902, + "num_input_tokens_seen": 6644000, + "step": 3880 + }, + { + "epoch": 18.857142857142858, + "grad_norm": 0.01333858072757721, + "learning_rate": 0.2930748514767042, + "loss": 0.1116, + "num_input_tokens_seen": 6653024, + "step": 3885 + }, + { + "epoch": 18.88135593220339, + "grad_norm": 0.014179621823132038, + "learning_rate": 0.29305714898381574, + "loss": 0.1071, + "num_input_tokens_seen": 6661696, + "step": 3890 + }, + { + "epoch": 18.90556900726392, + "grad_norm": 0.01111089251935482, + "learning_rate": 0.29303942442971714, + "loss": 0.1035, + "num_input_tokens_seen": 6670112, + "step": 3895 + }, + { + "epoch": 18.929782082324454, + "grad_norm": 0.00995503831654787, + "learning_rate": 0.2930216778171417, + "loss": 0.0896, + "num_input_tokens_seen": 6678560, + "step": 3900 + }, + { + "epoch": 18.953995157384988, + "grad_norm": 0.0119496313855052, + "learning_rate": 0.2930039091488263, + "loss": 0.1024, + "num_input_tokens_seen": 6687136, + "step": 3905 + }, + { + "epoch": 18.97820823244552, + "grad_norm": 0.009387069381773472, + "learning_rate": 0.29298611842751093, + "loss": 0.0895, + "num_input_tokens_seen": 6695520, + "step": 3910 + }, + { + "epoch": 19.004842615012105, + "grad_norm": 0.039142999798059464, + "learning_rate": 0.29296830565593923, + "loss": 0.1335, + "num_input_tokens_seen": 6704736, + "step": 3915 + }, + { + "epoch": 19.02905569007264, + "grad_norm": 0.009539668448269367, + "learning_rate": 0.2929504708368582, + "loss": 0.1132, + "num_input_tokens_seen": 6713152, + "step": 3920 + }, + { + "epoch": 19.053268765133172, + "grad_norm": 0.007094311527907848, + "learning_rate": 0.29293261397301806, + "loss": 0.0668, + "num_input_tokens_seen": 6722176, + "step": 3925 + }, + { + "epoch": 19.077481840193705, + "grad_norm": 0.008564012125134468, + "learning_rate": 0.29291473506717275, + "loss": 0.0843, + "num_input_tokens_seen": 6730688, + "step": 3930 + }, + { + "epoch": 19.10169491525424, + "grad_norm": 0.005402495618909597, + "learning_rate": 0.29289683412207923, + "loss": 0.063, + "num_input_tokens_seen": 6738944, + "step": 3935 + }, + { + "epoch": 19.12590799031477, + "grad_norm": 0.01041052769869566, + "learning_rate": 0.29287891114049813, + "loss": 0.0529, + "num_input_tokens_seen": 6747296, + "step": 3940 + }, + { + "epoch": 19.150121065375302, + "grad_norm": 0.011899386532604694, + "learning_rate": 0.29286096612519347, + "loss": 0.0624, + "num_input_tokens_seen": 6755968, + "step": 3945 + }, + { + "epoch": 19.174334140435835, + "grad_norm": 0.014535704627633095, + "learning_rate": 0.2928429990789325, + "loss": 0.0806, + "num_input_tokens_seen": 6764640, + "step": 3950 + }, + { + "epoch": 19.19854721549637, + "grad_norm": 0.010395407676696777, + "learning_rate": 0.29282501000448596, + "loss": 0.0509, + "num_input_tokens_seen": 6773152, + "step": 3955 + }, + { + "epoch": 19.222760290556902, + "grad_norm": 0.008461576886475086, + "learning_rate": 0.2928069989046281, + "loss": 0.073, + "num_input_tokens_seen": 6781664, + "step": 3960 + }, + { + "epoch": 19.246973365617432, + "grad_norm": 0.010998588055372238, + "learning_rate": 0.2927889657821363, + "loss": 0.0835, + "num_input_tokens_seen": 6789792, + "step": 3965 + }, + { + "epoch": 19.271186440677965, + "grad_norm": 0.00817631185054779, + "learning_rate": 0.2927709106397916, + "loss": 0.0551, + "num_input_tokens_seen": 6798368, + "step": 3970 + }, + { + "epoch": 19.2953995157385, + "grad_norm": 0.00823062565177679, + "learning_rate": 0.29275283348037834, + "loss": 0.0488, + "num_input_tokens_seen": 6806720, + "step": 3975 + }, + { + "epoch": 19.319612590799032, + "grad_norm": 0.008261079899966717, + "learning_rate": 0.29273473430668423, + "loss": 0.0671, + "num_input_tokens_seen": 6815232, + "step": 3980 + }, + { + "epoch": 19.343825665859566, + "grad_norm": 0.010987048968672752, + "learning_rate": 0.2927166131215003, + "loss": 0.0639, + "num_input_tokens_seen": 6823904, + "step": 3985 + }, + { + "epoch": 19.368038740920095, + "grad_norm": 0.020785389468073845, + "learning_rate": 0.2926984699276212, + "loss": 0.0701, + "num_input_tokens_seen": 6832256, + "step": 3990 + }, + { + "epoch": 19.39225181598063, + "grad_norm": 0.016736313700675964, + "learning_rate": 0.29268030472784473, + "loss": 0.0771, + "num_input_tokens_seen": 6841440, + "step": 3995 + }, + { + "epoch": 19.416464891041162, + "grad_norm": 0.013324973173439503, + "learning_rate": 0.2926621175249723, + "loss": 0.0521, + "num_input_tokens_seen": 6849792, + "step": 4000 + }, + { + "epoch": 19.416464891041162, + "eval_loss": 0.28131264448165894, + "eval_runtime": 4.622, + "eval_samples_per_second": 79.404, + "eval_steps_per_second": 19.905, + "num_input_tokens_seen": 6849792, + "step": 4000 + }, + { + "epoch": 19.440677966101696, + "grad_norm": 0.012729051522910595, + "learning_rate": 0.29264390832180853, + "loss": 0.0746, + "num_input_tokens_seen": 6858656, + "step": 4005 + }, + { + "epoch": 19.46489104116223, + "grad_norm": 0.007837725803256035, + "learning_rate": 0.29262567712116144, + "loss": 0.072, + "num_input_tokens_seen": 6867040, + "step": 4010 + }, + { + "epoch": 19.48910411622276, + "grad_norm": 0.016148049384355545, + "learning_rate": 0.29260742392584266, + "loss": 0.078, + "num_input_tokens_seen": 6875584, + "step": 4015 + }, + { + "epoch": 19.513317191283292, + "grad_norm": 0.014140038751065731, + "learning_rate": 0.292589148738667, + "loss": 0.1261, + "num_input_tokens_seen": 6884096, + "step": 4020 + }, + { + "epoch": 19.537530266343826, + "grad_norm": 0.015417484566569328, + "learning_rate": 0.2925708515624527, + "loss": 0.103, + "num_input_tokens_seen": 6892384, + "step": 4025 + }, + { + "epoch": 19.56174334140436, + "grad_norm": 0.006517657078802586, + "learning_rate": 0.29255253240002144, + "loss": 0.0887, + "num_input_tokens_seen": 6901184, + "step": 4030 + }, + { + "epoch": 19.585956416464892, + "grad_norm": 0.01188060361891985, + "learning_rate": 0.2925341912541983, + "loss": 0.0807, + "num_input_tokens_seen": 6909856, + "step": 4035 + }, + { + "epoch": 19.610169491525422, + "grad_norm": 0.005873067770153284, + "learning_rate": 0.2925158281278116, + "loss": 0.084, + "num_input_tokens_seen": 6918880, + "step": 4040 + }, + { + "epoch": 19.634382566585955, + "grad_norm": 0.013509012758731842, + "learning_rate": 0.29249744302369324, + "loss": 0.0898, + "num_input_tokens_seen": 6927488, + "step": 4045 + }, + { + "epoch": 19.65859564164649, + "grad_norm": 0.010316449217498302, + "learning_rate": 0.29247903594467844, + "loss": 0.0723, + "num_input_tokens_seen": 6936224, + "step": 4050 + }, + { + "epoch": 19.682808716707022, + "grad_norm": 0.013251434080302715, + "learning_rate": 0.2924606068936058, + "loss": 0.0867, + "num_input_tokens_seen": 6944608, + "step": 4055 + }, + { + "epoch": 19.707021791767556, + "grad_norm": 0.00840473361313343, + "learning_rate": 0.2924421558733173, + "loss": 0.0758, + "num_input_tokens_seen": 6953408, + "step": 4060 + }, + { + "epoch": 19.731234866828085, + "grad_norm": 0.011409467086195946, + "learning_rate": 0.2924236828866583, + "loss": 0.0975, + "num_input_tokens_seen": 6961760, + "step": 4065 + }, + { + "epoch": 19.75544794188862, + "grad_norm": 0.00978404562920332, + "learning_rate": 0.29240518793647763, + "loss": 0.0878, + "num_input_tokens_seen": 6970368, + "step": 4070 + }, + { + "epoch": 19.779661016949152, + "grad_norm": 0.010824495926499367, + "learning_rate": 0.29238667102562743, + "loss": 0.08, + "num_input_tokens_seen": 6979232, + "step": 4075 + }, + { + "epoch": 19.803874092009686, + "grad_norm": 0.009513840079307556, + "learning_rate": 0.29236813215696317, + "loss": 0.1019, + "num_input_tokens_seen": 6987712, + "step": 4080 + }, + { + "epoch": 19.82808716707022, + "grad_norm": 0.014390144497156143, + "learning_rate": 0.2923495713333439, + "loss": 0.1127, + "num_input_tokens_seen": 6996096, + "step": 4085 + }, + { + "epoch": 19.852300242130752, + "grad_norm": 0.006313161924481392, + "learning_rate": 0.29233098855763173, + "loss": 0.1066, + "num_input_tokens_seen": 7004576, + "step": 4090 + }, + { + "epoch": 19.876513317191282, + "grad_norm": 0.011677771806716919, + "learning_rate": 0.29231238383269254, + "loss": 0.1482, + "num_input_tokens_seen": 7012768, + "step": 4095 + }, + { + "epoch": 19.900726392251816, + "grad_norm": 0.014514470472931862, + "learning_rate": 0.2922937571613954, + "loss": 0.0854, + "num_input_tokens_seen": 7021440, + "step": 4100 + }, + { + "epoch": 19.92493946731235, + "grad_norm": 0.012987284921109676, + "learning_rate": 0.29227510854661265, + "loss": 0.1127, + "num_input_tokens_seen": 7029856, + "step": 4105 + }, + { + "epoch": 19.949152542372882, + "grad_norm": 0.006623938214033842, + "learning_rate": 0.29225643799122025, + "loss": 0.0721, + "num_input_tokens_seen": 7038176, + "step": 4110 + }, + { + "epoch": 19.973365617433416, + "grad_norm": 0.01644420064985752, + "learning_rate": 0.2922377454980974, + "loss": 0.0957, + "num_input_tokens_seen": 7046848, + "step": 4115 + }, + { + "epoch": 19.997578692493946, + "grad_norm": 0.018230123445391655, + "learning_rate": 0.29221903107012676, + "loss": 0.0881, + "num_input_tokens_seen": 7055520, + "step": 4120 + }, + { + "epoch": 20.024213075060533, + "grad_norm": 0.006553490646183491, + "learning_rate": 0.29220029471019426, + "loss": 0.0663, + "num_input_tokens_seen": 7064672, + "step": 4125 + }, + { + "epoch": 20.048426150121067, + "grad_norm": 0.00549925584346056, + "learning_rate": 0.2921815364211893, + "loss": 0.0381, + "num_input_tokens_seen": 7072832, + "step": 4130 + }, + { + "epoch": 20.072639225181597, + "grad_norm": 0.009730352088809013, + "learning_rate": 0.29216275620600474, + "loss": 0.0529, + "num_input_tokens_seen": 7081728, + "step": 4135 + }, + { + "epoch": 20.09685230024213, + "grad_norm": 0.006192285567522049, + "learning_rate": 0.29214395406753657, + "loss": 0.0488, + "num_input_tokens_seen": 7089952, + "step": 4140 + }, + { + "epoch": 20.121065375302663, + "grad_norm": 0.008225451223552227, + "learning_rate": 0.2921251300086844, + "loss": 0.0311, + "num_input_tokens_seen": 7098432, + "step": 4145 + }, + { + "epoch": 20.145278450363197, + "grad_norm": 0.006020956207066774, + "learning_rate": 0.2921062840323511, + "loss": 0.0315, + "num_input_tokens_seen": 7106944, + "step": 4150 + }, + { + "epoch": 20.16949152542373, + "grad_norm": 0.013746820390224457, + "learning_rate": 0.29208741614144307, + "loss": 0.0958, + "num_input_tokens_seen": 7115648, + "step": 4155 + }, + { + "epoch": 20.19370460048426, + "grad_norm": 0.009016139432787895, + "learning_rate": 0.2920685263388698, + "loss": 0.0549, + "num_input_tokens_seen": 7124224, + "step": 4160 + }, + { + "epoch": 20.217917675544793, + "grad_norm": 0.008706462569534779, + "learning_rate": 0.2920496146275445, + "loss": 0.0449, + "num_input_tokens_seen": 7132640, + "step": 4165 + }, + { + "epoch": 20.242130750605327, + "grad_norm": 0.015175719745457172, + "learning_rate": 0.29203068101038343, + "loss": 0.0463, + "num_input_tokens_seen": 7141088, + "step": 4170 + }, + { + "epoch": 20.26634382566586, + "grad_norm": 0.009347951039671898, + "learning_rate": 0.2920117254903065, + "loss": 0.0626, + "num_input_tokens_seen": 7149632, + "step": 4175 + }, + { + "epoch": 20.290556900726394, + "grad_norm": 0.010068162344396114, + "learning_rate": 0.29199274807023695, + "loss": 0.0896, + "num_input_tokens_seen": 7158016, + "step": 4180 + }, + { + "epoch": 20.314769975786923, + "grad_norm": 0.010923965834081173, + "learning_rate": 0.29197374875310117, + "loss": 0.0629, + "num_input_tokens_seen": 7166848, + "step": 4185 + }, + { + "epoch": 20.338983050847457, + "grad_norm": 0.00988994725048542, + "learning_rate": 0.2919547275418292, + "loss": 0.0888, + "num_input_tokens_seen": 7175520, + "step": 4190 + }, + { + "epoch": 20.36319612590799, + "grad_norm": 0.010361588560044765, + "learning_rate": 0.29193568443935436, + "loss": 0.0969, + "num_input_tokens_seen": 7184320, + "step": 4195 + }, + { + "epoch": 20.387409200968523, + "grad_norm": 0.009408905170857906, + "learning_rate": 0.2919166194486133, + "loss": 0.0438, + "num_input_tokens_seen": 7192864, + "step": 4200 + }, + { + "epoch": 20.387409200968523, + "eval_loss": 0.2733117938041687, + "eval_runtime": 4.633, + "eval_samples_per_second": 79.214, + "eval_steps_per_second": 19.858, + "num_input_tokens_seen": 7192864, + "step": 4200 + }, + { + "epoch": 20.411622276029057, + "grad_norm": 0.010685781948268414, + "learning_rate": 0.2918975325725461, + "loss": 0.0895, + "num_input_tokens_seen": 7201504, + "step": 4205 + }, + { + "epoch": 20.435835351089587, + "grad_norm": 0.011095595546066761, + "learning_rate": 0.29187842381409607, + "loss": 0.0674, + "num_input_tokens_seen": 7210080, + "step": 4210 + }, + { + "epoch": 20.46004842615012, + "grad_norm": 0.014500923454761505, + "learning_rate": 0.29185929317621023, + "loss": 0.082, + "num_input_tokens_seen": 7218784, + "step": 4215 + }, + { + "epoch": 20.484261501210653, + "grad_norm": 0.0065336995758116245, + "learning_rate": 0.29184014066183867, + "loss": 0.0457, + "num_input_tokens_seen": 7227296, + "step": 4220 + }, + { + "epoch": 20.508474576271187, + "grad_norm": 0.005971224512904882, + "learning_rate": 0.2918209662739349, + "loss": 0.0699, + "num_input_tokens_seen": 7235424, + "step": 4225 + }, + { + "epoch": 20.53268765133172, + "grad_norm": 0.013835481368005276, + "learning_rate": 0.29180177001545593, + "loss": 0.0802, + "num_input_tokens_seen": 7244192, + "step": 4230 + }, + { + "epoch": 20.55690072639225, + "grad_norm": 0.008880465291440487, + "learning_rate": 0.29178255188936203, + "loss": 0.0456, + "num_input_tokens_seen": 7253152, + "step": 4235 + }, + { + "epoch": 20.581113801452783, + "grad_norm": 0.004869069904088974, + "learning_rate": 0.2917633118986169, + "loss": 0.0458, + "num_input_tokens_seen": 7261504, + "step": 4240 + }, + { + "epoch": 20.605326876513317, + "grad_norm": 0.02023349143564701, + "learning_rate": 0.2917440500461875, + "loss": 0.0461, + "num_input_tokens_seen": 7269568, + "step": 4245 + }, + { + "epoch": 20.62953995157385, + "grad_norm": 0.016820665448904037, + "learning_rate": 0.29172476633504435, + "loss": 0.0839, + "num_input_tokens_seen": 7278048, + "step": 4250 + }, + { + "epoch": 20.653753026634384, + "grad_norm": 0.012066825293004513, + "learning_rate": 0.2917054607681612, + "loss": 0.0567, + "num_input_tokens_seen": 7286560, + "step": 4255 + }, + { + "epoch": 20.677966101694913, + "grad_norm": 0.01455223374068737, + "learning_rate": 0.29168613334851523, + "loss": 0.0836, + "num_input_tokens_seen": 7294496, + "step": 4260 + }, + { + "epoch": 20.702179176755447, + "grad_norm": 0.013497594743967056, + "learning_rate": 0.2916667840790869, + "loss": 0.1019, + "num_input_tokens_seen": 7303040, + "step": 4265 + }, + { + "epoch": 20.72639225181598, + "grad_norm": 0.01409521047025919, + "learning_rate": 0.2916474129628603, + "loss": 0.134, + "num_input_tokens_seen": 7311232, + "step": 4270 + }, + { + "epoch": 20.750605326876514, + "grad_norm": 0.007784151006489992, + "learning_rate": 0.29162802000282245, + "loss": 0.0981, + "num_input_tokens_seen": 7319520, + "step": 4275 + }, + { + "epoch": 20.774818401937047, + "grad_norm": 0.006814433727413416, + "learning_rate": 0.2916086052019642, + "loss": 0.088, + "num_input_tokens_seen": 7328032, + "step": 4280 + }, + { + "epoch": 20.79903147699758, + "grad_norm": 0.003285493701696396, + "learning_rate": 0.2915891685632794, + "loss": 0.0613, + "num_input_tokens_seen": 7337472, + "step": 4285 + }, + { + "epoch": 20.82324455205811, + "grad_norm": 0.013611174188554287, + "learning_rate": 0.29156971008976545, + "loss": 0.0914, + "num_input_tokens_seen": 7346016, + "step": 4290 + }, + { + "epoch": 20.847457627118644, + "grad_norm": 0.010699892416596413, + "learning_rate": 0.2915502297844232, + "loss": 0.0714, + "num_input_tokens_seen": 7354496, + "step": 4295 + }, + { + "epoch": 20.871670702179177, + "grad_norm": 0.013157747685909271, + "learning_rate": 0.2915307276502566, + "loss": 0.0806, + "num_input_tokens_seen": 7363264, + "step": 4300 + }, + { + "epoch": 20.89588377723971, + "grad_norm": 0.020143434405326843, + "learning_rate": 0.29151120369027334, + "loss": 0.0783, + "num_input_tokens_seen": 7371712, + "step": 4305 + }, + { + "epoch": 20.920096852300244, + "grad_norm": 0.018156005069613457, + "learning_rate": 0.29149165790748405, + "loss": 0.0813, + "num_input_tokens_seen": 7380160, + "step": 4310 + }, + { + "epoch": 20.944309927360774, + "grad_norm": 0.017335183918476105, + "learning_rate": 0.291472090304903, + "loss": 0.0774, + "num_input_tokens_seen": 7388352, + "step": 4315 + }, + { + "epoch": 20.968523002421307, + "grad_norm": 0.008848442696034908, + "learning_rate": 0.2914525008855478, + "loss": 0.0677, + "num_input_tokens_seen": 7397696, + "step": 4320 + }, + { + "epoch": 20.99273607748184, + "grad_norm": 0.014505148865282536, + "learning_rate": 0.2914328896524394, + "loss": 0.0808, + "num_input_tokens_seen": 7405824, + "step": 4325 + }, + { + "epoch": 21.019370460048425, + "grad_norm": 0.009806646034121513, + "learning_rate": 0.291413256608602, + "loss": 0.0853, + "num_input_tokens_seen": 7414464, + "step": 4330 + }, + { + "epoch": 21.043583535108958, + "grad_norm": 0.01046704314649105, + "learning_rate": 0.29139360175706336, + "loss": 0.0502, + "num_input_tokens_seen": 7423136, + "step": 4335 + }, + { + "epoch": 21.06779661016949, + "grad_norm": 0.008120108395814896, + "learning_rate": 0.2913739251008544, + "loss": 0.0549, + "num_input_tokens_seen": 7431744, + "step": 4340 + }, + { + "epoch": 21.092009685230025, + "grad_norm": 0.005119819659739733, + "learning_rate": 0.29135422664300964, + "loss": 0.0508, + "num_input_tokens_seen": 7439968, + "step": 4345 + }, + { + "epoch": 21.116222760290558, + "grad_norm": 0.0010450785048305988, + "learning_rate": 0.29133450638656677, + "loss": 0.0367, + "num_input_tokens_seen": 7448352, + "step": 4350 + }, + { + "epoch": 21.140435835351088, + "grad_norm": 0.004997045733034611, + "learning_rate": 0.2913147643345669, + "loss": 0.0298, + "num_input_tokens_seen": 7457184, + "step": 4355 + }, + { + "epoch": 21.16464891041162, + "grad_norm": 0.008857759647071362, + "learning_rate": 0.29129500049005447, + "loss": 0.0474, + "num_input_tokens_seen": 7465792, + "step": 4360 + }, + { + "epoch": 21.188861985472155, + "grad_norm": 0.0059000542387366295, + "learning_rate": 0.2912752148560773, + "loss": 0.0478, + "num_input_tokens_seen": 7474688, + "step": 4365 + }, + { + "epoch": 21.213075060532688, + "grad_norm": 0.006436925381422043, + "learning_rate": 0.2912554074356866, + "loss": 0.0438, + "num_input_tokens_seen": 7483296, + "step": 4370 + }, + { + "epoch": 21.23728813559322, + "grad_norm": 0.007617922965437174, + "learning_rate": 0.2912355782319371, + "loss": 0.0267, + "num_input_tokens_seen": 7491712, + "step": 4375 + }, + { + "epoch": 21.26150121065375, + "grad_norm": 0.003598397132009268, + "learning_rate": 0.2912157272478864, + "loss": 0.05, + "num_input_tokens_seen": 7500256, + "step": 4380 + }, + { + "epoch": 21.285714285714285, + "grad_norm": 0.009911957196891308, + "learning_rate": 0.291195854486596, + "loss": 0.0468, + "num_input_tokens_seen": 7508992, + "step": 4385 + }, + { + "epoch": 21.309927360774818, + "grad_norm": 0.008557649329304695, + "learning_rate": 0.2911759599511305, + "loss": 0.0325, + "num_input_tokens_seen": 7517184, + "step": 4390 + }, + { + "epoch": 21.33414043583535, + "grad_norm": 0.02344810776412487, + "learning_rate": 0.29115604364455777, + "loss": 0.0519, + "num_input_tokens_seen": 7525664, + "step": 4395 + }, + { + "epoch": 21.358353510895885, + "grad_norm": 0.02554458938539028, + "learning_rate": 0.2911361055699493, + "loss": 0.1356, + "num_input_tokens_seen": 7534272, + "step": 4400 + }, + { + "epoch": 21.358353510895885, + "eval_loss": 0.3190828561782837, + "eval_runtime": 4.6183, + "eval_samples_per_second": 79.467, + "eval_steps_per_second": 19.921, + "num_input_tokens_seen": 7534272, + "step": 4400 + }, + { + "epoch": 21.38256658595642, + "grad_norm": 0.009758868254721165, + "learning_rate": 0.2911161457303797, + "loss": 0.0699, + "num_input_tokens_seen": 7542688, + "step": 4405 + }, + { + "epoch": 21.406779661016948, + "grad_norm": 0.00784946046769619, + "learning_rate": 0.291096164128927, + "loss": 0.0568, + "num_input_tokens_seen": 7551712, + "step": 4410 + }, + { + "epoch": 21.43099273607748, + "grad_norm": 0.00840779673308134, + "learning_rate": 0.2910761607686727, + "loss": 0.0689, + "num_input_tokens_seen": 7560352, + "step": 4415 + }, + { + "epoch": 21.455205811138015, + "grad_norm": 0.009074106812477112, + "learning_rate": 0.2910561356527016, + "loss": 0.0471, + "num_input_tokens_seen": 7568576, + "step": 4420 + }, + { + "epoch": 21.479418886198548, + "grad_norm": 0.009618768468499184, + "learning_rate": 0.2910360887841017, + "loss": 0.0651, + "num_input_tokens_seen": 7577376, + "step": 4425 + }, + { + "epoch": 21.50363196125908, + "grad_norm": 0.006445304490625858, + "learning_rate": 0.2910160201659645, + "loss": 0.0548, + "num_input_tokens_seen": 7586208, + "step": 4430 + }, + { + "epoch": 21.52784503631961, + "grad_norm": 0.010776783339679241, + "learning_rate": 0.29099592980138494, + "loss": 0.092, + "num_input_tokens_seen": 7594880, + "step": 4435 + }, + { + "epoch": 21.552058111380145, + "grad_norm": 0.007953236810863018, + "learning_rate": 0.29097581769346115, + "loss": 0.0652, + "num_input_tokens_seen": 7603360, + "step": 4440 + }, + { + "epoch": 21.576271186440678, + "grad_norm": 0.01678667590022087, + "learning_rate": 0.29095568384529463, + "loss": 0.0905, + "num_input_tokens_seen": 7611904, + "step": 4445 + }, + { + "epoch": 21.60048426150121, + "grad_norm": 0.01630890555679798, + "learning_rate": 0.2909355282599903, + "loss": 0.0678, + "num_input_tokens_seen": 7620576, + "step": 4450 + }, + { + "epoch": 21.624697336561745, + "grad_norm": 0.00995719525963068, + "learning_rate": 0.29091535094065635, + "loss": 0.0736, + "num_input_tokens_seen": 7629120, + "step": 4455 + }, + { + "epoch": 21.648910411622275, + "grad_norm": 0.01599689945578575, + "learning_rate": 0.2908951518904045, + "loss": 0.0867, + "num_input_tokens_seen": 7637888, + "step": 4460 + }, + { + "epoch": 21.673123486682808, + "grad_norm": 0.008602357469499111, + "learning_rate": 0.29087493111234963, + "loss": 0.0761, + "num_input_tokens_seen": 7646336, + "step": 4465 + }, + { + "epoch": 21.69733656174334, + "grad_norm": 0.010727209970355034, + "learning_rate": 0.29085468860961, + "loss": 0.0923, + "num_input_tokens_seen": 7655136, + "step": 4470 + }, + { + "epoch": 21.721549636803875, + "grad_norm": 0.01415548101067543, + "learning_rate": 0.2908344243853073, + "loss": 0.0614, + "num_input_tokens_seen": 7664224, + "step": 4475 + }, + { + "epoch": 21.74576271186441, + "grad_norm": 0.013741208240389824, + "learning_rate": 0.2908141384425666, + "loss": 0.0872, + "num_input_tokens_seen": 7672384, + "step": 4480 + }, + { + "epoch": 21.769975786924938, + "grad_norm": 0.01196425873786211, + "learning_rate": 0.2907938307845161, + "loss": 0.0586, + "num_input_tokens_seen": 7680992, + "step": 4485 + }, + { + "epoch": 21.79418886198547, + "grad_norm": 0.01370865199714899, + "learning_rate": 0.2907735014142876, + "loss": 0.0608, + "num_input_tokens_seen": 7689792, + "step": 4490 + }, + { + "epoch": 21.818401937046005, + "grad_norm": 0.00966983288526535, + "learning_rate": 0.2907531503350161, + "loss": 0.0775, + "num_input_tokens_seen": 7698336, + "step": 4495 + }, + { + "epoch": 21.84261501210654, + "grad_norm": 0.013273878023028374, + "learning_rate": 0.29073277754983995, + "loss": 0.0786, + "num_input_tokens_seen": 7706880, + "step": 4500 + }, + { + "epoch": 21.86682808716707, + "grad_norm": 0.011317849159240723, + "learning_rate": 0.290712383061901, + "loss": 0.0914, + "num_input_tokens_seen": 7715328, + "step": 4505 + }, + { + "epoch": 21.8910411622276, + "grad_norm": 0.007597822230309248, + "learning_rate": 0.2906919668743443, + "loss": 0.0684, + "num_input_tokens_seen": 7723840, + "step": 4510 + }, + { + "epoch": 21.915254237288135, + "grad_norm": 0.013863557949662209, + "learning_rate": 0.29067152899031823, + "loss": 0.0782, + "num_input_tokens_seen": 7732288, + "step": 4515 + }, + { + "epoch": 21.93946731234867, + "grad_norm": 0.0166613832116127, + "learning_rate": 0.2906510694129746, + "loss": 0.0942, + "num_input_tokens_seen": 7740352, + "step": 4520 + }, + { + "epoch": 21.9636803874092, + "grad_norm": 0.018365781754255295, + "learning_rate": 0.2906305881454685, + "loss": 0.0843, + "num_input_tokens_seen": 7748736, + "step": 4525 + }, + { + "epoch": 21.987893462469735, + "grad_norm": 0.007654623128473759, + "learning_rate": 0.2906100851909585, + "loss": 0.0859, + "num_input_tokens_seen": 7757024, + "step": 4530 + }, + { + "epoch": 22.01452784503632, + "grad_norm": 0.0031790335197001696, + "learning_rate": 0.29058956055260626, + "loss": 0.0612, + "num_input_tokens_seen": 7766144, + "step": 4535 + }, + { + "epoch": 22.038740920096853, + "grad_norm": 0.007658373564481735, + "learning_rate": 0.2905690142335771, + "loss": 0.0296, + "num_input_tokens_seen": 7774624, + "step": 4540 + }, + { + "epoch": 22.062953995157386, + "grad_norm": 0.009080914780497551, + "learning_rate": 0.29054844623703946, + "loss": 0.0404, + "num_input_tokens_seen": 7783488, + "step": 4545 + }, + { + "epoch": 22.087167070217916, + "grad_norm": 0.005521758925169706, + "learning_rate": 0.2905278565661651, + "loss": 0.0415, + "num_input_tokens_seen": 7791840, + "step": 4550 + }, + { + "epoch": 22.11138014527845, + "grad_norm": 0.002015097066760063, + "learning_rate": 0.2905072452241293, + "loss": 0.0495, + "num_input_tokens_seen": 7800064, + "step": 4555 + }, + { + "epoch": 22.135593220338983, + "grad_norm": 0.004736931063234806, + "learning_rate": 0.2904866122141106, + "loss": 0.0459, + "num_input_tokens_seen": 7808864, + "step": 4560 + }, + { + "epoch": 22.159806295399516, + "grad_norm": 0.011345493607223034, + "learning_rate": 0.2904659575392908, + "loss": 0.0488, + "num_input_tokens_seen": 7817728, + "step": 4565 + }, + { + "epoch": 22.18401937046005, + "grad_norm": 0.010535103268921375, + "learning_rate": 0.2904452812028551, + "loss": 0.0266, + "num_input_tokens_seen": 7825984, + "step": 4570 + }, + { + "epoch": 22.208232445520583, + "grad_norm": 0.01672489196062088, + "learning_rate": 0.2904245832079922, + "loss": 0.0574, + "num_input_tokens_seen": 7834432, + "step": 4575 + }, + { + "epoch": 22.232445520581113, + "grad_norm": 0.00614935951307416, + "learning_rate": 0.29040386355789377, + "loss": 0.0378, + "num_input_tokens_seen": 7843424, + "step": 4580 + }, + { + "epoch": 22.256658595641646, + "grad_norm": 0.012359379790723324, + "learning_rate": 0.29038312225575524, + "loss": 0.0349, + "num_input_tokens_seen": 7852384, + "step": 4585 + }, + { + "epoch": 22.28087167070218, + "grad_norm": 0.012141426093876362, + "learning_rate": 0.29036235930477505, + "loss": 0.0419, + "num_input_tokens_seen": 7860672, + "step": 4590 + }, + { + "epoch": 22.305084745762713, + "grad_norm": 0.004493392072618008, + "learning_rate": 0.29034157470815514, + "loss": 0.0297, + "num_input_tokens_seen": 7869056, + "step": 4595 + }, + { + "epoch": 22.329297820823246, + "grad_norm": 0.010902565903961658, + "learning_rate": 0.2903207684691008, + "loss": 0.0441, + "num_input_tokens_seen": 7877248, + "step": 4600 + }, + { + "epoch": 22.329297820823246, + "eval_loss": 0.30341872572898865, + "eval_runtime": 4.6089, + "eval_samples_per_second": 79.629, + "eval_steps_per_second": 19.961, + "num_input_tokens_seen": 7877248, + "step": 4600 + }, + { + "epoch": 22.353510895883776, + "grad_norm": 0.003560848068445921, + "learning_rate": 0.29029994059082054, + "loss": 0.0331, + "num_input_tokens_seen": 7885696, + "step": 4605 + }, + { + "epoch": 22.37772397094431, + "grad_norm": 0.008412528783082962, + "learning_rate": 0.2902790910765264, + "loss": 0.06, + "num_input_tokens_seen": 7894464, + "step": 4610 + }, + { + "epoch": 22.401937046004843, + "grad_norm": 0.0054008495062589645, + "learning_rate": 0.29025821992943346, + "loss": 0.0511, + "num_input_tokens_seen": 7902848, + "step": 4615 + }, + { + "epoch": 22.426150121065376, + "grad_norm": 0.011562882922589779, + "learning_rate": 0.29023732715276046, + "loss": 0.0417, + "num_input_tokens_seen": 7911424, + "step": 4620 + }, + { + "epoch": 22.45036319612591, + "grad_norm": 0.008615410886704922, + "learning_rate": 0.2902164127497293, + "loss": 0.0514, + "num_input_tokens_seen": 7919840, + "step": 4625 + }, + { + "epoch": 22.47457627118644, + "grad_norm": 0.0028440915048122406, + "learning_rate": 0.2901954767235652, + "loss": 0.0757, + "num_input_tokens_seen": 7928128, + "step": 4630 + }, + { + "epoch": 22.498789346246973, + "grad_norm": 0.01078416034579277, + "learning_rate": 0.2901745190774968, + "loss": 0.0549, + "num_input_tokens_seen": 7936672, + "step": 4635 + }, + { + "epoch": 22.523002421307506, + "grad_norm": 0.008647510781884193, + "learning_rate": 0.290153539814756, + "loss": 0.073, + "num_input_tokens_seen": 7945632, + "step": 4640 + }, + { + "epoch": 22.54721549636804, + "grad_norm": 0.005087712313979864, + "learning_rate": 0.2901325389385781, + "loss": 0.0747, + "num_input_tokens_seen": 7954624, + "step": 4645 + }, + { + "epoch": 22.571428571428573, + "grad_norm": 0.008847849443554878, + "learning_rate": 0.2901115164522016, + "loss": 0.0592, + "num_input_tokens_seen": 7963392, + "step": 4650 + }, + { + "epoch": 22.595641646489103, + "grad_norm": 0.006823436357080936, + "learning_rate": 0.29009047235886865, + "loss": 0.0473, + "num_input_tokens_seen": 7971968, + "step": 4655 + }, + { + "epoch": 22.619854721549636, + "grad_norm": 0.011647647246718407, + "learning_rate": 0.2900694066618243, + "loss": 0.0542, + "num_input_tokens_seen": 7980704, + "step": 4660 + }, + { + "epoch": 22.64406779661017, + "grad_norm": 0.015706907957792282, + "learning_rate": 0.2900483193643172, + "loss": 0.0798, + "num_input_tokens_seen": 7989728, + "step": 4665 + }, + { + "epoch": 22.668280871670703, + "grad_norm": 0.008279160596430302, + "learning_rate": 0.29002721046959934, + "loss": 0.0322, + "num_input_tokens_seen": 7998208, + "step": 4670 + }, + { + "epoch": 22.692493946731236, + "grad_norm": 0.011857392266392708, + "learning_rate": 0.29000607998092587, + "loss": 0.0463, + "num_input_tokens_seen": 8007104, + "step": 4675 + }, + { + "epoch": 22.716707021791766, + "grad_norm": 0.012087996117770672, + "learning_rate": 0.2899849279015555, + "loss": 0.0619, + "num_input_tokens_seen": 8015584, + "step": 4680 + }, + { + "epoch": 22.7409200968523, + "grad_norm": 0.010351885110139847, + "learning_rate": 0.28996375423475007, + "loss": 0.0668, + "num_input_tokens_seen": 8024192, + "step": 4685 + }, + { + "epoch": 22.765133171912833, + "grad_norm": 0.008662221021950245, + "learning_rate": 0.28994255898377486, + "loss": 0.0431, + "num_input_tokens_seen": 8032480, + "step": 4690 + }, + { + "epoch": 22.789346246973366, + "grad_norm": 0.009205535985529423, + "learning_rate": 0.2899213421518984, + "loss": 0.0468, + "num_input_tokens_seen": 8041088, + "step": 4695 + }, + { + "epoch": 22.8135593220339, + "grad_norm": 0.0181270744651556, + "learning_rate": 0.2899001037423926, + "loss": 0.0735, + "num_input_tokens_seen": 8049280, + "step": 4700 + }, + { + "epoch": 22.83777239709443, + "grad_norm": 0.006210879422724247, + "learning_rate": 0.28987884375853273, + "loss": 0.0949, + "num_input_tokens_seen": 8057408, + "step": 4705 + }, + { + "epoch": 22.861985472154963, + "grad_norm": 0.00816432572901249, + "learning_rate": 0.2898575622035974, + "loss": 0.0823, + "num_input_tokens_seen": 8065792, + "step": 4710 + }, + { + "epoch": 22.886198547215496, + "grad_norm": 0.009825835935771465, + "learning_rate": 0.2898362590808683, + "loss": 0.0456, + "num_input_tokens_seen": 8074368, + "step": 4715 + }, + { + "epoch": 22.91041162227603, + "grad_norm": 0.01261716615408659, + "learning_rate": 0.2898149343936308, + "loss": 0.0805, + "num_input_tokens_seen": 8083008, + "step": 4720 + }, + { + "epoch": 22.934624697336563, + "grad_norm": 0.008728956803679466, + "learning_rate": 0.2897935881451734, + "loss": 0.0696, + "num_input_tokens_seen": 8091712, + "step": 4725 + }, + { + "epoch": 22.958837772397093, + "grad_norm": 0.014719651080667973, + "learning_rate": 0.28977222033878797, + "loss": 0.1245, + "num_input_tokens_seen": 8100224, + "step": 4730 + }, + { + "epoch": 22.983050847457626, + "grad_norm": 0.00892999954521656, + "learning_rate": 0.28975083097776966, + "loss": 0.0992, + "num_input_tokens_seen": 8108832, + "step": 4735 + }, + { + "epoch": 23.009685230024214, + "grad_norm": 0.0064992900006473064, + "learning_rate": 0.28972942006541696, + "loss": 0.0752, + "num_input_tokens_seen": 8118112, + "step": 4740 + }, + { + "epoch": 23.033898305084747, + "grad_norm": 0.0132670933380723, + "learning_rate": 0.2897079876050318, + "loss": 0.0433, + "num_input_tokens_seen": 8126656, + "step": 4745 + }, + { + "epoch": 23.058111380145277, + "grad_norm": 0.01271195150911808, + "learning_rate": 0.2896865335999192, + "loss": 0.0748, + "num_input_tokens_seen": 8135040, + "step": 4750 + }, + { + "epoch": 23.08232445520581, + "grad_norm": 0.01422757375985384, + "learning_rate": 0.28966505805338777, + "loss": 0.0534, + "num_input_tokens_seen": 8143648, + "step": 4755 + }, + { + "epoch": 23.106537530266344, + "grad_norm": 0.011422793380916119, + "learning_rate": 0.2896435609687492, + "loss": 0.0682, + "num_input_tokens_seen": 8151872, + "step": 4760 + }, + { + "epoch": 23.130750605326877, + "grad_norm": 0.010456803254783154, + "learning_rate": 0.2896220423493187, + "loss": 0.0651, + "num_input_tokens_seen": 8160544, + "step": 4765 + }, + { + "epoch": 23.15496368038741, + "grad_norm": 0.00879751518368721, + "learning_rate": 0.28960050219841466, + "loss": 0.0436, + "num_input_tokens_seen": 8169216, + "step": 4770 + }, + { + "epoch": 23.17917675544794, + "grad_norm": 0.007181358989328146, + "learning_rate": 0.28957894051935884, + "loss": 0.0632, + "num_input_tokens_seen": 8177568, + "step": 4775 + }, + { + "epoch": 23.203389830508474, + "grad_norm": 0.009652458131313324, + "learning_rate": 0.2895573573154764, + "loss": 0.055, + "num_input_tokens_seen": 8186048, + "step": 4780 + }, + { + "epoch": 23.227602905569007, + "grad_norm": 0.010002998635172844, + "learning_rate": 0.28953575259009556, + "loss": 0.043, + "num_input_tokens_seen": 8194752, + "step": 4785 + }, + { + "epoch": 23.25181598062954, + "grad_norm": 0.018968669697642326, + "learning_rate": 0.2895141263465482, + "loss": 0.0813, + "num_input_tokens_seen": 8203584, + "step": 4790 + }, + { + "epoch": 23.276029055690074, + "grad_norm": 0.016597652807831764, + "learning_rate": 0.28949247858816934, + "loss": 0.0525, + "num_input_tokens_seen": 8211872, + "step": 4795 + }, + { + "epoch": 23.300242130750604, + "grad_norm": 0.01031019352376461, + "learning_rate": 0.2894708093182973, + "loss": 0.041, + "num_input_tokens_seen": 8220544, + "step": 4800 + }, + { + "epoch": 23.300242130750604, + "eval_loss": 0.34597745537757874, + "eval_runtime": 4.6473, + "eval_samples_per_second": 78.97, + "eval_steps_per_second": 19.796, + "num_input_tokens_seen": 8220544, + "step": 4800 + }, + { + "epoch": 23.324455205811137, + "grad_norm": 0.015491259284317493, + "learning_rate": 0.2894491185402737, + "loss": 0.1052, + "num_input_tokens_seen": 8229088, + "step": 4805 + }, + { + "epoch": 23.34866828087167, + "grad_norm": 0.009800028055906296, + "learning_rate": 0.2894274062574437, + "loss": 0.052, + "num_input_tokens_seen": 8237408, + "step": 4810 + }, + { + "epoch": 23.372881355932204, + "grad_norm": 0.012369765900075436, + "learning_rate": 0.2894056724731554, + "loss": 0.0644, + "num_input_tokens_seen": 8246272, + "step": 4815 + }, + { + "epoch": 23.397094430992738, + "grad_norm": 0.015452837571501732, + "learning_rate": 0.28938391719076056, + "loss": 0.0553, + "num_input_tokens_seen": 8255072, + "step": 4820 + }, + { + "epoch": 23.421307506053267, + "grad_norm": 0.00439245393499732, + "learning_rate": 0.28936214041361413, + "loss": 0.0489, + "num_input_tokens_seen": 8263680, + "step": 4825 + }, + { + "epoch": 23.4455205811138, + "grad_norm": 0.017391560599207878, + "learning_rate": 0.2893403421450743, + "loss": 0.0506, + "num_input_tokens_seen": 8272352, + "step": 4830 + }, + { + "epoch": 23.469733656174334, + "grad_norm": 0.009217319078743458, + "learning_rate": 0.2893185223885026, + "loss": 0.0606, + "num_input_tokens_seen": 8281248, + "step": 4835 + }, + { + "epoch": 23.493946731234868, + "grad_norm": 0.01030106283724308, + "learning_rate": 0.289296681147264, + "loss": 0.0802, + "num_input_tokens_seen": 8289984, + "step": 4840 + }, + { + "epoch": 23.5181598062954, + "grad_norm": 0.0040209428407251835, + "learning_rate": 0.28927481842472663, + "loss": 0.0376, + "num_input_tokens_seen": 8298752, + "step": 4845 + }, + { + "epoch": 23.54237288135593, + "grad_norm": 0.0180116705596447, + "learning_rate": 0.28925293422426207, + "loss": 0.0457, + "num_input_tokens_seen": 8307424, + "step": 4850 + }, + { + "epoch": 23.566585956416464, + "grad_norm": 0.010274866595864296, + "learning_rate": 0.28923102854924504, + "loss": 0.0592, + "num_input_tokens_seen": 8315776, + "step": 4855 + }, + { + "epoch": 23.590799031476998, + "grad_norm": 0.010143953375518322, + "learning_rate": 0.2892091014030537, + "loss": 0.0462, + "num_input_tokens_seen": 8324416, + "step": 4860 + }, + { + "epoch": 23.61501210653753, + "grad_norm": 0.013059153221547604, + "learning_rate": 0.2891871527890696, + "loss": 0.0647, + "num_input_tokens_seen": 8332960, + "step": 4865 + }, + { + "epoch": 23.639225181598064, + "grad_norm": 0.012448777444660664, + "learning_rate": 0.2891651827106773, + "loss": 0.0665, + "num_input_tokens_seen": 8341632, + "step": 4870 + }, + { + "epoch": 23.663438256658594, + "grad_norm": 0.006634972523897886, + "learning_rate": 0.2891431911712651, + "loss": 0.0551, + "num_input_tokens_seen": 8350336, + "step": 4875 + }, + { + "epoch": 23.687651331719128, + "grad_norm": 0.02107115276157856, + "learning_rate": 0.2891211781742241, + "loss": 0.1046, + "num_input_tokens_seen": 8358656, + "step": 4880 + }, + { + "epoch": 23.71186440677966, + "grad_norm": 0.009525422006845474, + "learning_rate": 0.2890991437229492, + "loss": 0.0554, + "num_input_tokens_seen": 8367072, + "step": 4885 + }, + { + "epoch": 23.736077481840194, + "grad_norm": 0.00698609696701169, + "learning_rate": 0.2890770878208383, + "loss": 0.0546, + "num_input_tokens_seen": 8375424, + "step": 4890 + }, + { + "epoch": 23.760290556900728, + "grad_norm": 0.00930626131594181, + "learning_rate": 0.28905501047129273, + "loss": 0.0586, + "num_input_tokens_seen": 8383968, + "step": 4895 + }, + { + "epoch": 23.784503631961257, + "grad_norm": 0.005893534980714321, + "learning_rate": 0.289032911677717, + "loss": 0.0614, + "num_input_tokens_seen": 8392512, + "step": 4900 + }, + { + "epoch": 23.80871670702179, + "grad_norm": 0.015668049454689026, + "learning_rate": 0.28901079144351915, + "loss": 0.0571, + "num_input_tokens_seen": 8400800, + "step": 4905 + }, + { + "epoch": 23.832929782082324, + "grad_norm": 0.007289852946996689, + "learning_rate": 0.2889886497721103, + "loss": 0.0524, + "num_input_tokens_seen": 8409440, + "step": 4910 + }, + { + "epoch": 23.857142857142858, + "grad_norm": 0.016272908076643944, + "learning_rate": 0.28896648666690505, + "loss": 0.0759, + "num_input_tokens_seen": 8417792, + "step": 4915 + }, + { + "epoch": 23.88135593220339, + "grad_norm": 0.01349421963095665, + "learning_rate": 0.2889443021313212, + "loss": 0.0724, + "num_input_tokens_seen": 8426080, + "step": 4920 + }, + { + "epoch": 23.90556900726392, + "grad_norm": 0.014511422254145145, + "learning_rate": 0.28892209616877984, + "loss": 0.0485, + "num_input_tokens_seen": 8434560, + "step": 4925 + }, + { + "epoch": 23.929782082324454, + "grad_norm": 0.018564099445939064, + "learning_rate": 0.28889986878270546, + "loss": 0.0663, + "num_input_tokens_seen": 8443424, + "step": 4930 + }, + { + "epoch": 23.953995157384988, + "grad_norm": 0.01379811204969883, + "learning_rate": 0.28887761997652583, + "loss": 0.0703, + "num_input_tokens_seen": 8451680, + "step": 4935 + }, + { + "epoch": 23.97820823244552, + "grad_norm": 0.007686655502766371, + "learning_rate": 0.2888553497536719, + "loss": 0.0648, + "num_input_tokens_seen": 8460224, + "step": 4940 + }, + { + "epoch": 24.004842615012105, + "grad_norm": 0.030504731461405754, + "learning_rate": 0.2888330581175781, + "loss": 0.1588, + "num_input_tokens_seen": 8469056, + "step": 4945 + }, + { + "epoch": 24.02905569007264, + "grad_norm": 0.00587989017367363, + "learning_rate": 0.28881074507168203, + "loss": 0.0726, + "num_input_tokens_seen": 8477568, + "step": 4950 + }, + { + "epoch": 24.053268765133172, + "grad_norm": 0.008041176944971085, + "learning_rate": 0.2887884106194247, + "loss": 0.0378, + "num_input_tokens_seen": 8486048, + "step": 4955 + }, + { + "epoch": 24.077481840193705, + "grad_norm": 0.005158673506230116, + "learning_rate": 0.28876605476425027, + "loss": 0.0419, + "num_input_tokens_seen": 8494784, + "step": 4960 + }, + { + "epoch": 24.10169491525424, + "grad_norm": 0.00799319427460432, + "learning_rate": 0.2887436775096064, + "loss": 0.037, + "num_input_tokens_seen": 8503072, + "step": 4965 + }, + { + "epoch": 24.12590799031477, + "grad_norm": 0.00591488229110837, + "learning_rate": 0.2887212788589439, + "loss": 0.0484, + "num_input_tokens_seen": 8511264, + "step": 4970 + }, + { + "epoch": 24.150121065375302, + "grad_norm": 0.007922365330159664, + "learning_rate": 0.2886988588157169, + "loss": 0.0261, + "num_input_tokens_seen": 8519616, + "step": 4975 + }, + { + "epoch": 24.174334140435835, + "grad_norm": 0.0061256168410182, + "learning_rate": 0.28867641738338284, + "loss": 0.0245, + "num_input_tokens_seen": 8528192, + "step": 4980 + }, + { + "epoch": 24.19854721549637, + "grad_norm": 0.00228314520791173, + "learning_rate": 0.2886539545654026, + "loss": 0.0241, + "num_input_tokens_seen": 8536768, + "step": 4985 + }, + { + "epoch": 24.222760290556902, + "grad_norm": 0.0057846298441290855, + "learning_rate": 0.28863147036524006, + "loss": 0.0383, + "num_input_tokens_seen": 8545120, + "step": 4990 + }, + { + "epoch": 24.246973365617432, + "grad_norm": 0.017875080928206444, + "learning_rate": 0.2886089647863626, + "loss": 0.0364, + "num_input_tokens_seen": 8553472, + "step": 4995 + }, + { + "epoch": 24.271186440677965, + "grad_norm": 0.006887033581733704, + "learning_rate": 0.288586437832241, + "loss": 0.024, + "num_input_tokens_seen": 8562144, + "step": 5000 + }, + { + "epoch": 24.271186440677965, + "eval_loss": 0.34834131598472595, + "eval_runtime": 4.6242, + "eval_samples_per_second": 79.365, + "eval_steps_per_second": 19.895, + "num_input_tokens_seen": 8562144, + "step": 5000 + }, + { + "epoch": 24.2953995157385, + "grad_norm": 0.004862347152084112, + "learning_rate": 0.28856388950634904, + "loss": 0.0215, + "num_input_tokens_seen": 8570560, + "step": 5005 + }, + { + "epoch": 24.319612590799032, + "grad_norm": 0.010632318444550037, + "learning_rate": 0.288541319812164, + "loss": 0.0561, + "num_input_tokens_seen": 8579168, + "step": 5010 + }, + { + "epoch": 24.343825665859566, + "grad_norm": 0.010886844247579575, + "learning_rate": 0.2885187287531665, + "loss": 0.0929, + "num_input_tokens_seen": 8587552, + "step": 5015 + }, + { + "epoch": 24.368038740920095, + "grad_norm": 0.007784074638038874, + "learning_rate": 0.2884961163328402, + "loss": 0.0221, + "num_input_tokens_seen": 8596096, + "step": 5020 + }, + { + "epoch": 24.39225181598063, + "grad_norm": 0.007052563596516848, + "learning_rate": 0.28847348255467237, + "loss": 0.0528, + "num_input_tokens_seen": 8604672, + "step": 5025 + }, + { + "epoch": 24.416464891041162, + "grad_norm": 0.010835700668394566, + "learning_rate": 0.28845082742215333, + "loss": 0.0531, + "num_input_tokens_seen": 8613024, + "step": 5030 + }, + { + "epoch": 24.440677966101696, + "grad_norm": 0.006443202029913664, + "learning_rate": 0.2884281509387769, + "loss": 0.0536, + "num_input_tokens_seen": 8621856, + "step": 5035 + }, + { + "epoch": 24.46489104116223, + "grad_norm": 0.006567500066012144, + "learning_rate": 0.2884054531080399, + "loss": 0.0424, + "num_input_tokens_seen": 8630336, + "step": 5040 + }, + { + "epoch": 24.48910411622276, + "grad_norm": 0.004950159694999456, + "learning_rate": 0.28838273393344277, + "loss": 0.0533, + "num_input_tokens_seen": 8638720, + "step": 5045 + }, + { + "epoch": 24.513317191283292, + "grad_norm": 0.007667267229408026, + "learning_rate": 0.288359993418489, + "loss": 0.048, + "num_input_tokens_seen": 8647360, + "step": 5050 + }, + { + "epoch": 24.537530266343826, + "grad_norm": 0.00804623868316412, + "learning_rate": 0.28833723156668556, + "loss": 0.0506, + "num_input_tokens_seen": 8655936, + "step": 5055 + }, + { + "epoch": 24.56174334140436, + "grad_norm": 0.0017491914331912994, + "learning_rate": 0.2883144483815425, + "loss": 0.0537, + "num_input_tokens_seen": 8664448, + "step": 5060 + }, + { + "epoch": 24.585956416464892, + "grad_norm": 0.007258000783622265, + "learning_rate": 0.28829164386657335, + "loss": 0.0418, + "num_input_tokens_seen": 8673152, + "step": 5065 + }, + { + "epoch": 24.610169491525422, + "grad_norm": 0.004867732524871826, + "learning_rate": 0.28826881802529486, + "loss": 0.0165, + "num_input_tokens_seen": 8682016, + "step": 5070 + }, + { + "epoch": 24.634382566585955, + "grad_norm": 0.005723764654248953, + "learning_rate": 0.28824597086122705, + "loss": 0.0196, + "num_input_tokens_seen": 8690464, + "step": 5075 + }, + { + "epoch": 24.65859564164649, + "grad_norm": 0.007091791369020939, + "learning_rate": 0.28822310237789317, + "loss": 0.0288, + "num_input_tokens_seen": 8699136, + "step": 5080 + }, + { + "epoch": 24.682808716707022, + "grad_norm": 0.014312445186078548, + "learning_rate": 0.2882002125788199, + "loss": 0.0559, + "num_input_tokens_seen": 8707744, + "step": 5085 + }, + { + "epoch": 24.707021791767556, + "grad_norm": 0.006637599319219589, + "learning_rate": 0.2881773014675371, + "loss": 0.0418, + "num_input_tokens_seen": 8716544, + "step": 5090 + }, + { + "epoch": 24.731234866828085, + "grad_norm": 0.0011365012032911181, + "learning_rate": 0.288154369047578, + "loss": 0.0293, + "num_input_tokens_seen": 8724704, + "step": 5095 + }, + { + "epoch": 24.75544794188862, + "grad_norm": 0.012974552810192108, + "learning_rate": 0.28813141532247905, + "loss": 0.0452, + "num_input_tokens_seen": 8733376, + "step": 5100 + }, + { + "epoch": 24.779661016949152, + "grad_norm": 0.007594721857458353, + "learning_rate": 0.28810844029578, + "loss": 0.0395, + "num_input_tokens_seen": 8741888, + "step": 5105 + }, + { + "epoch": 24.803874092009686, + "grad_norm": 0.01104890275746584, + "learning_rate": 0.2880854439710238, + "loss": 0.074, + "num_input_tokens_seen": 8750784, + "step": 5110 + }, + { + "epoch": 24.82808716707022, + "grad_norm": 0.006550648249685764, + "learning_rate": 0.28806242635175694, + "loss": 0.0271, + "num_input_tokens_seen": 8759200, + "step": 5115 + }, + { + "epoch": 24.852300242130752, + "grad_norm": 0.004642006009817123, + "learning_rate": 0.2880393874415289, + "loss": 0.0525, + "num_input_tokens_seen": 8767648, + "step": 5120 + }, + { + "epoch": 24.876513317191282, + "grad_norm": 0.016631172969937325, + "learning_rate": 0.2880163272438926, + "loss": 0.0496, + "num_input_tokens_seen": 8776416, + "step": 5125 + }, + { + "epoch": 24.900726392251816, + "grad_norm": 0.0034542165230959654, + "learning_rate": 0.2879932457624042, + "loss": 0.0339, + "num_input_tokens_seen": 8784960, + "step": 5130 + }, + { + "epoch": 24.92493946731235, + "grad_norm": 0.014089958742260933, + "learning_rate": 0.2879701430006232, + "loss": 0.0927, + "num_input_tokens_seen": 8793280, + "step": 5135 + }, + { + "epoch": 24.949152542372882, + "grad_norm": 0.006299767643213272, + "learning_rate": 0.28794701896211233, + "loss": 0.0395, + "num_input_tokens_seen": 8801952, + "step": 5140 + }, + { + "epoch": 24.973365617433416, + "grad_norm": 0.0024708127602934837, + "learning_rate": 0.28792387365043753, + "loss": 0.026, + "num_input_tokens_seen": 8810560, + "step": 5145 + }, + { + "epoch": 24.997578692493946, + "grad_norm": 0.0062315515242516994, + "learning_rate": 0.28790070706916815, + "loss": 0.0416, + "num_input_tokens_seen": 8819072, + "step": 5150 + }, + { + "epoch": 25.024213075060533, + "grad_norm": 0.011142670176923275, + "learning_rate": 0.2878775192218768, + "loss": 0.0647, + "num_input_tokens_seen": 8827808, + "step": 5155 + }, + { + "epoch": 25.048426150121067, + "grad_norm": 0.008840291760861874, + "learning_rate": 0.2878543101121393, + "loss": 0.0361, + "num_input_tokens_seen": 8836480, + "step": 5160 + }, + { + "epoch": 25.072639225181597, + "grad_norm": 0.0007825137581676245, + "learning_rate": 0.28783107974353483, + "loss": 0.0263, + "num_input_tokens_seen": 8844768, + "step": 5165 + }, + { + "epoch": 25.09685230024213, + "grad_norm": 0.014335334300994873, + "learning_rate": 0.2878078281196457, + "loss": 0.0426, + "num_input_tokens_seen": 8853760, + "step": 5170 + }, + { + "epoch": 25.121065375302663, + "grad_norm": 0.00365812866948545, + "learning_rate": 0.28778455524405777, + "loss": 0.0128, + "num_input_tokens_seen": 8862560, + "step": 5175 + }, + { + "epoch": 25.145278450363197, + "grad_norm": 0.0032549009192734957, + "learning_rate": 0.2877612611203598, + "loss": 0.0217, + "num_input_tokens_seen": 8871168, + "step": 5180 + }, + { + "epoch": 25.16949152542373, + "grad_norm": 0.004032242111861706, + "learning_rate": 0.28773794575214423, + "loss": 0.0407, + "num_input_tokens_seen": 8880160, + "step": 5185 + }, + { + "epoch": 25.19370460048426, + "grad_norm": 0.009966525249183178, + "learning_rate": 0.28771460914300645, + "loss": 0.0331, + "num_input_tokens_seen": 8888640, + "step": 5190 + }, + { + "epoch": 25.217917675544793, + "grad_norm": 0.0023499103263020515, + "learning_rate": 0.2876912512965454, + "loss": 0.0158, + "num_input_tokens_seen": 8896992, + "step": 5195 + }, + { + "epoch": 25.242130750605327, + "grad_norm": 0.0024700076319277287, + "learning_rate": 0.287667872216363, + "loss": 0.0235, + "num_input_tokens_seen": 8905568, + "step": 5200 + }, + { + "epoch": 25.242130750605327, + "eval_loss": 0.39248618483543396, + "eval_runtime": 4.6107, + "eval_samples_per_second": 79.597, + "eval_steps_per_second": 19.953, + "num_input_tokens_seen": 8905568, + "step": 5200 + }, + { + "epoch": 25.26634382566586, + "grad_norm": 0.006362801417708397, + "learning_rate": 0.2876444719060647, + "loss": 0.0277, + "num_input_tokens_seen": 8913984, + "step": 5205 + }, + { + "epoch": 25.290556900726394, + "grad_norm": 0.0037005003541707993, + "learning_rate": 0.287621050369259, + "loss": 0.0282, + "num_input_tokens_seen": 8922624, + "step": 5210 + }, + { + "epoch": 25.314769975786923, + "grad_norm": 0.001916595152579248, + "learning_rate": 0.28759760760955794, + "loss": 0.0401, + "num_input_tokens_seen": 8931168, + "step": 5215 + }, + { + "epoch": 25.338983050847457, + "grad_norm": 0.00602490920573473, + "learning_rate": 0.2875741436305766, + "loss": 0.0095, + "num_input_tokens_seen": 8940128, + "step": 5220 + }, + { + "epoch": 25.36319612590799, + "grad_norm": 0.00523360213264823, + "learning_rate": 0.28755065843593347, + "loss": 0.0326, + "num_input_tokens_seen": 8948704, + "step": 5225 + }, + { + "epoch": 25.387409200968523, + "grad_norm": 0.005486863199621439, + "learning_rate": 0.2875271520292502, + "loss": 0.0169, + "num_input_tokens_seen": 8957248, + "step": 5230 + }, + { + "epoch": 25.411622276029057, + "grad_norm": 0.010963312350213528, + "learning_rate": 0.28750362441415184, + "loss": 0.0297, + "num_input_tokens_seen": 8965344, + "step": 5235 + }, + { + "epoch": 25.435835351089587, + "grad_norm": 0.009679618291556835, + "learning_rate": 0.28748007559426664, + "loss": 0.0447, + "num_input_tokens_seen": 8973664, + "step": 5240 + }, + { + "epoch": 25.46004842615012, + "grad_norm": 0.007327042520046234, + "learning_rate": 0.2874565055732261, + "loss": 0.0298, + "num_input_tokens_seen": 8982144, + "step": 5245 + }, + { + "epoch": 25.484261501210653, + "grad_norm": 0.005041094496846199, + "learning_rate": 0.28743291435466495, + "loss": 0.0587, + "num_input_tokens_seen": 8990720, + "step": 5250 + }, + { + "epoch": 25.508474576271187, + "grad_norm": 0.014025293290615082, + "learning_rate": 0.2874093019422214, + "loss": 0.0355, + "num_input_tokens_seen": 8999808, + "step": 5255 + }, + { + "epoch": 25.53268765133172, + "grad_norm": 0.008638406172394753, + "learning_rate": 0.28738566833953666, + "loss": 0.0425, + "num_input_tokens_seen": 9008352, + "step": 5260 + }, + { + "epoch": 25.55690072639225, + "grad_norm": 0.001648566103540361, + "learning_rate": 0.28736201355025537, + "loss": 0.0287, + "num_input_tokens_seen": 9016992, + "step": 5265 + }, + { + "epoch": 25.581113801452783, + "grad_norm": 0.008271683938801289, + "learning_rate": 0.28733833757802535, + "loss": 0.0443, + "num_input_tokens_seen": 9024960, + "step": 5270 + }, + { + "epoch": 25.605326876513317, + "grad_norm": 0.010512091219425201, + "learning_rate": 0.28731464042649785, + "loss": 0.0458, + "num_input_tokens_seen": 9033792, + "step": 5275 + }, + { + "epoch": 25.62953995157385, + "grad_norm": 0.0034207559656351805, + "learning_rate": 0.2872909220993271, + "loss": 0.0682, + "num_input_tokens_seen": 9042720, + "step": 5280 + }, + { + "epoch": 25.653753026634384, + "grad_norm": 0.010098911821842194, + "learning_rate": 0.287267182600171, + "loss": 0.037, + "num_input_tokens_seen": 9051264, + "step": 5285 + }, + { + "epoch": 25.677966101694913, + "grad_norm": 0.013029354624450207, + "learning_rate": 0.2872434219326902, + "loss": 0.0499, + "num_input_tokens_seen": 9060064, + "step": 5290 + }, + { + "epoch": 25.702179176755447, + "grad_norm": 0.003845794824883342, + "learning_rate": 0.28721964010054907, + "loss": 0.0212, + "num_input_tokens_seen": 9068512, + "step": 5295 + }, + { + "epoch": 25.72639225181598, + "grad_norm": 0.006173205096274614, + "learning_rate": 0.28719583710741503, + "loss": 0.0366, + "num_input_tokens_seen": 9077024, + "step": 5300 + }, + { + "epoch": 25.750605326876514, + "grad_norm": 0.007971569895744324, + "learning_rate": 0.28717201295695877, + "loss": 0.0568, + "num_input_tokens_seen": 9085792, + "step": 5305 + }, + { + "epoch": 25.774818401937047, + "grad_norm": 0.012967247515916824, + "learning_rate": 0.28714816765285434, + "loss": 0.0646, + "num_input_tokens_seen": 9094368, + "step": 5310 + }, + { + "epoch": 25.79903147699758, + "grad_norm": 0.007453206926584244, + "learning_rate": 0.28712430119877896, + "loss": 0.0306, + "num_input_tokens_seen": 9102592, + "step": 5315 + }, + { + "epoch": 25.82324455205811, + "grad_norm": 0.0048935930244624615, + "learning_rate": 0.28710041359841304, + "loss": 0.0374, + "num_input_tokens_seen": 9111232, + "step": 5320 + }, + { + "epoch": 25.847457627118644, + "grad_norm": 0.003304366022348404, + "learning_rate": 0.28707650485544056, + "loss": 0.0273, + "num_input_tokens_seen": 9119776, + "step": 5325 + }, + { + "epoch": 25.871670702179177, + "grad_norm": 0.0023037109058350325, + "learning_rate": 0.28705257497354836, + "loss": 0.0444, + "num_input_tokens_seen": 9127936, + "step": 5330 + }, + { + "epoch": 25.89588377723971, + "grad_norm": 0.01097120065242052, + "learning_rate": 0.28702862395642675, + "loss": 0.0343, + "num_input_tokens_seen": 9136416, + "step": 5335 + }, + { + "epoch": 25.920096852300244, + "grad_norm": 0.008020082488656044, + "learning_rate": 0.28700465180776935, + "loss": 0.0413, + "num_input_tokens_seen": 9145088, + "step": 5340 + }, + { + "epoch": 25.944309927360774, + "grad_norm": 0.0012465919135138392, + "learning_rate": 0.2869806585312729, + "loss": 0.0357, + "num_input_tokens_seen": 9153536, + "step": 5345 + }, + { + "epoch": 25.968523002421307, + "grad_norm": 0.01342796441167593, + "learning_rate": 0.28695664413063754, + "loss": 0.0263, + "num_input_tokens_seen": 9162304, + "step": 5350 + }, + { + "epoch": 25.99273607748184, + "grad_norm": 0.004078370984643698, + "learning_rate": 0.28693260860956654, + "loss": 0.0288, + "num_input_tokens_seen": 9170976, + "step": 5355 + }, + { + "epoch": 26.019370460048425, + "grad_norm": 0.0014292318373918533, + "learning_rate": 0.2869085519717665, + "loss": 0.0603, + "num_input_tokens_seen": 9179712, + "step": 5360 + }, + { + "epoch": 26.043583535108958, + "grad_norm": 0.002013850724324584, + "learning_rate": 0.28688447422094726, + "loss": 0.0236, + "num_input_tokens_seen": 9188000, + "step": 5365 + }, + { + "epoch": 26.06779661016949, + "grad_norm": 0.0010149130830541253, + "learning_rate": 0.2868603753608219, + "loss": 0.0344, + "num_input_tokens_seen": 9196448, + "step": 5370 + }, + { + "epoch": 26.092009685230025, + "grad_norm": 0.004034500569105148, + "learning_rate": 0.28683625539510665, + "loss": 0.0343, + "num_input_tokens_seen": 9205376, + "step": 5375 + }, + { + "epoch": 26.116222760290558, + "grad_norm": 0.0024016203824430704, + "learning_rate": 0.28681211432752135, + "loss": 0.0141, + "num_input_tokens_seen": 9213920, + "step": 5380 + }, + { + "epoch": 26.140435835351088, + "grad_norm": 0.00333636743016541, + "learning_rate": 0.2867879521617887, + "loss": 0.012, + "num_input_tokens_seen": 9222496, + "step": 5385 + }, + { + "epoch": 26.16464891041162, + "grad_norm": 0.002383190207183361, + "learning_rate": 0.28676376890163485, + "loss": 0.0064, + "num_input_tokens_seen": 9230912, + "step": 5390 + }, + { + "epoch": 26.188861985472155, + "grad_norm": 0.0072426809929311275, + "learning_rate": 0.2867395645507891, + "loss": 0.0255, + "num_input_tokens_seen": 9240000, + "step": 5395 + }, + { + "epoch": 26.213075060532688, + "grad_norm": 0.0026888421270996332, + "learning_rate": 0.2867153391129842, + "loss": 0.0111, + "num_input_tokens_seen": 9248640, + "step": 5400 + }, + { + "epoch": 26.213075060532688, + "eval_loss": 0.39592254161834717, + "eval_runtime": 4.6154, + "eval_samples_per_second": 79.516, + "eval_steps_per_second": 19.933, + "num_input_tokens_seen": 9248640, + "step": 5400 + }, + { + "epoch": 26.23728813559322, + "grad_norm": 0.007286573760211468, + "learning_rate": 0.28669109259195585, + "loss": 0.0116, + "num_input_tokens_seen": 9257312, + "step": 5405 + }, + { + "epoch": 26.26150121065375, + "grad_norm": 0.0016550137661397457, + "learning_rate": 0.2866668249914433, + "loss": 0.0162, + "num_input_tokens_seen": 9265824, + "step": 5410 + }, + { + "epoch": 26.285714285714285, + "grad_norm": 0.010058215819299221, + "learning_rate": 0.2866425363151889, + "loss": 0.0087, + "num_input_tokens_seen": 9274816, + "step": 5415 + }, + { + "epoch": 26.309927360774818, + "grad_norm": 0.004615162964910269, + "learning_rate": 0.2866182265669382, + "loss": 0.0126, + "num_input_tokens_seen": 9283424, + "step": 5420 + }, + { + "epoch": 26.33414043583535, + "grad_norm": 0.0008917021914385259, + "learning_rate": 0.28659389575044014, + "loss": 0.0374, + "num_input_tokens_seen": 9291744, + "step": 5425 + }, + { + "epoch": 26.358353510895885, + "grad_norm": 0.003966794814914465, + "learning_rate": 0.28656954386944683, + "loss": 0.0119, + "num_input_tokens_seen": 9299968, + "step": 5430 + }, + { + "epoch": 26.38256658595642, + "grad_norm": 0.005998417269438505, + "learning_rate": 0.28654517092771353, + "loss": 0.0346, + "num_input_tokens_seen": 9308384, + "step": 5435 + }, + { + "epoch": 26.406779661016948, + "grad_norm": 0.01015126146376133, + "learning_rate": 0.286520776928999, + "loss": 0.0595, + "num_input_tokens_seen": 9317312, + "step": 5440 + }, + { + "epoch": 26.43099273607748, + "grad_norm": 0.01754389889538288, + "learning_rate": 0.286496361877065, + "loss": 0.0528, + "num_input_tokens_seen": 9326016, + "step": 5445 + }, + { + "epoch": 26.455205811138015, + "grad_norm": 0.009231338277459145, + "learning_rate": 0.28647192577567676, + "loss": 0.0414, + "num_input_tokens_seen": 9334560, + "step": 5450 + }, + { + "epoch": 26.479418886198548, + "grad_norm": 0.01053446251899004, + "learning_rate": 0.28644746862860254, + "loss": 0.0311, + "num_input_tokens_seen": 9343168, + "step": 5455 + }, + { + "epoch": 26.50363196125908, + "grad_norm": 0.007093291264027357, + "learning_rate": 0.2864229904396139, + "loss": 0.0196, + "num_input_tokens_seen": 9351808, + "step": 5460 + }, + { + "epoch": 26.52784503631961, + "grad_norm": 0.008492815308272839, + "learning_rate": 0.28639849121248573, + "loss": 0.0296, + "num_input_tokens_seen": 9360480, + "step": 5465 + }, + { + "epoch": 26.552058111380145, + "grad_norm": 0.002098996425047517, + "learning_rate": 0.28637397095099615, + "loss": 0.012, + "num_input_tokens_seen": 9369120, + "step": 5470 + }, + { + "epoch": 26.576271186440678, + "grad_norm": 0.003845006227493286, + "learning_rate": 0.28634942965892646, + "loss": 0.0455, + "num_input_tokens_seen": 9378016, + "step": 5475 + }, + { + "epoch": 26.60048426150121, + "grad_norm": 0.004848518408834934, + "learning_rate": 0.28632486734006124, + "loss": 0.0693, + "num_input_tokens_seen": 9386368, + "step": 5480 + }, + { + "epoch": 26.624697336561745, + "grad_norm": 0.004706223960965872, + "learning_rate": 0.28630028399818835, + "loss": 0.0202, + "num_input_tokens_seen": 9394688, + "step": 5485 + }, + { + "epoch": 26.648910411622275, + "grad_norm": 0.0051443749107420444, + "learning_rate": 0.2862756796370987, + "loss": 0.0141, + "num_input_tokens_seen": 9403488, + "step": 5490 + }, + { + "epoch": 26.673123486682808, + "grad_norm": 0.009690443985164165, + "learning_rate": 0.2862510542605868, + "loss": 0.0329, + "num_input_tokens_seen": 9412320, + "step": 5495 + }, + { + "epoch": 26.69733656174334, + "grad_norm": 0.01928745023906231, + "learning_rate": 0.2862264078724501, + "loss": 0.0581, + "num_input_tokens_seen": 9420896, + "step": 5500 + }, + { + "epoch": 26.721549636803875, + "grad_norm": 0.004278009757399559, + "learning_rate": 0.28620174047648933, + "loss": 0.0189, + "num_input_tokens_seen": 9429792, + "step": 5505 + }, + { + "epoch": 26.74576271186441, + "grad_norm": 0.010976087301969528, + "learning_rate": 0.2861770520765086, + "loss": 0.0731, + "num_input_tokens_seen": 9438400, + "step": 5510 + }, + { + "epoch": 26.769975786924938, + "grad_norm": 0.006313483230769634, + "learning_rate": 0.2861523426763151, + "loss": 0.0306, + "num_input_tokens_seen": 9446880, + "step": 5515 + }, + { + "epoch": 26.79418886198547, + "grad_norm": 0.007223348133265972, + "learning_rate": 0.2861276122797194, + "loss": 0.0483, + "num_input_tokens_seen": 9455584, + "step": 5520 + }, + { + "epoch": 26.818401937046005, + "grad_norm": 0.005968168377876282, + "learning_rate": 0.28610286089053516, + "loss": 0.0493, + "num_input_tokens_seen": 9464608, + "step": 5525 + }, + { + "epoch": 26.84261501210654, + "grad_norm": 0.006216951180249453, + "learning_rate": 0.28607808851257943, + "loss": 0.064, + "num_input_tokens_seen": 9472992, + "step": 5530 + }, + { + "epoch": 26.86682808716707, + "grad_norm": 0.006728218402713537, + "learning_rate": 0.28605329514967237, + "loss": 0.0311, + "num_input_tokens_seen": 9481568, + "step": 5535 + }, + { + "epoch": 26.8910411622276, + "grad_norm": 0.014853104017674923, + "learning_rate": 0.2860284808056374, + "loss": 0.0371, + "num_input_tokens_seen": 9489888, + "step": 5540 + }, + { + "epoch": 26.915254237288135, + "grad_norm": 0.001895784749649465, + "learning_rate": 0.28600364548430135, + "loss": 0.0438, + "num_input_tokens_seen": 9498048, + "step": 5545 + }, + { + "epoch": 26.93946731234867, + "grad_norm": 0.011082019656896591, + "learning_rate": 0.28597878918949393, + "loss": 0.0267, + "num_input_tokens_seen": 9506272, + "step": 5550 + }, + { + "epoch": 26.9636803874092, + "grad_norm": 0.014022605493664742, + "learning_rate": 0.2859539119250485, + "loss": 0.0449, + "num_input_tokens_seen": 9514848, + "step": 5555 + }, + { + "epoch": 26.987893462469735, + "grad_norm": 0.002696659881621599, + "learning_rate": 0.2859290136948013, + "loss": 0.0165, + "num_input_tokens_seen": 9523168, + "step": 5560 + }, + { + "epoch": 27.01452784503632, + "grad_norm": 0.0017985086888074875, + "learning_rate": 0.28590409450259197, + "loss": 0.0299, + "num_input_tokens_seen": 9532000, + "step": 5565 + }, + { + "epoch": 27.038740920096853, + "grad_norm": 0.00047486749826930463, + "learning_rate": 0.28587915435226346, + "loss": 0.0155, + "num_input_tokens_seen": 9540480, + "step": 5570 + }, + { + "epoch": 27.062953995157386, + "grad_norm": 0.0012797955423593521, + "learning_rate": 0.2858541932476617, + "loss": 0.0131, + "num_input_tokens_seen": 9549216, + "step": 5575 + }, + { + "epoch": 27.087167070217916, + "grad_norm": 0.005189008545130491, + "learning_rate": 0.2858292111926361, + "loss": 0.0196, + "num_input_tokens_seen": 9558144, + "step": 5580 + }, + { + "epoch": 27.11138014527845, + "grad_norm": 0.0025340213906019926, + "learning_rate": 0.28580420819103924, + "loss": 0.0325, + "num_input_tokens_seen": 9567072, + "step": 5585 + }, + { + "epoch": 27.135593220338983, + "grad_norm": 0.005022048484534025, + "learning_rate": 0.2857791842467269, + "loss": 0.0089, + "num_input_tokens_seen": 9575424, + "step": 5590 + }, + { + "epoch": 27.159806295399516, + "grad_norm": 0.004396447911858559, + "learning_rate": 0.2857541393635579, + "loss": 0.0425, + "num_input_tokens_seen": 9583776, + "step": 5595 + }, + { + "epoch": 27.18401937046005, + "grad_norm": 0.010448693297803402, + "learning_rate": 0.2857290735453948, + "loss": 0.0355, + "num_input_tokens_seen": 9592608, + "step": 5600 + }, + { + "epoch": 27.18401937046005, + "eval_loss": 0.35928356647491455, + "eval_runtime": 4.6292, + "eval_samples_per_second": 79.279, + "eval_steps_per_second": 19.874, + "num_input_tokens_seen": 9592608, + "step": 5600 + }, + { + "epoch": 27.208232445520583, + "grad_norm": 0.002378769451752305, + "learning_rate": 0.28570398679610276, + "loss": 0.0348, + "num_input_tokens_seen": 9600992, + "step": 5605 + }, + { + "epoch": 27.232445520581113, + "grad_norm": 0.01184111274778843, + "learning_rate": 0.2856788791195506, + "loss": 0.0533, + "num_input_tokens_seen": 9609728, + "step": 5610 + }, + { + "epoch": 27.256658595641646, + "grad_norm": 0.004369072150439024, + "learning_rate": 0.28565375051961023, + "loss": 0.0358, + "num_input_tokens_seen": 9618016, + "step": 5615 + }, + { + "epoch": 27.28087167070218, + "grad_norm": 0.0014507090672850609, + "learning_rate": 0.28562860100015686, + "loss": 0.0441, + "num_input_tokens_seen": 9626816, + "step": 5620 + }, + { + "epoch": 27.305084745762713, + "grad_norm": 0.010246449150145054, + "learning_rate": 0.2856034305650687, + "loss": 0.0837, + "num_input_tokens_seen": 9635392, + "step": 5625 + }, + { + "epoch": 27.329297820823246, + "grad_norm": 0.011137311346828938, + "learning_rate": 0.28557823921822756, + "loss": 0.0497, + "num_input_tokens_seen": 9644160, + "step": 5630 + }, + { + "epoch": 27.353510895883776, + "grad_norm": 0.0031912673730403185, + "learning_rate": 0.2855530269635181, + "loss": 0.0268, + "num_input_tokens_seen": 9652544, + "step": 5635 + }, + { + "epoch": 27.37772397094431, + "grad_norm": 0.005235560704022646, + "learning_rate": 0.2855277938048284, + "loss": 0.0193, + "num_input_tokens_seen": 9661024, + "step": 5640 + }, + { + "epoch": 27.401937046004843, + "grad_norm": 0.004980601370334625, + "learning_rate": 0.2855025397460498, + "loss": 0.0243, + "num_input_tokens_seen": 9669728, + "step": 5645 + }, + { + "epoch": 27.426150121065376, + "grad_norm": 0.0026312065310776234, + "learning_rate": 0.28547726479107666, + "loss": 0.0145, + "num_input_tokens_seen": 9678336, + "step": 5650 + }, + { + "epoch": 27.45036319612591, + "grad_norm": 0.011697628535330296, + "learning_rate": 0.2854519689438068, + "loss": 0.0231, + "num_input_tokens_seen": 9686752, + "step": 5655 + }, + { + "epoch": 27.47457627118644, + "grad_norm": 0.0054552131332457066, + "learning_rate": 0.2854266522081412, + "loss": 0.0127, + "num_input_tokens_seen": 9695264, + "step": 5660 + }, + { + "epoch": 27.498789346246973, + "grad_norm": 0.003525267820805311, + "learning_rate": 0.28540131458798385, + "loss": 0.0854, + "num_input_tokens_seen": 9704000, + "step": 5665 + }, + { + "epoch": 27.523002421307506, + "grad_norm": 0.003907197620719671, + "learning_rate": 0.28537595608724226, + "loss": 0.0571, + "num_input_tokens_seen": 9712384, + "step": 5670 + }, + { + "epoch": 27.54721549636804, + "grad_norm": 0.008533839136362076, + "learning_rate": 0.28535057670982705, + "loss": 0.0301, + "num_input_tokens_seen": 9720864, + "step": 5675 + }, + { + "epoch": 27.571428571428573, + "grad_norm": 0.006391732953488827, + "learning_rate": 0.285325176459652, + "loss": 0.0692, + "num_input_tokens_seen": 9729280, + "step": 5680 + }, + { + "epoch": 27.595641646489103, + "grad_norm": 0.009245307184755802, + "learning_rate": 0.28529975534063406, + "loss": 0.0614, + "num_input_tokens_seen": 9737760, + "step": 5685 + }, + { + "epoch": 27.619854721549636, + "grad_norm": 0.007822115905582905, + "learning_rate": 0.2852743133566936, + "loss": 0.0627, + "num_input_tokens_seen": 9746016, + "step": 5690 + }, + { + "epoch": 27.64406779661017, + "grad_norm": 0.016376368701457977, + "learning_rate": 0.2852488505117541, + "loss": 0.0585, + "num_input_tokens_seen": 9754688, + "step": 5695 + }, + { + "epoch": 27.668280871670703, + "grad_norm": 0.006502298638224602, + "learning_rate": 0.28522336680974214, + "loss": 0.0564, + "num_input_tokens_seen": 9763040, + "step": 5700 + }, + { + "epoch": 27.692493946731236, + "grad_norm": 0.006689038127660751, + "learning_rate": 0.2851978622545877, + "loss": 0.0428, + "num_input_tokens_seen": 9770944, + "step": 5705 + }, + { + "epoch": 27.716707021791766, + "grad_norm": 0.007037417963147163, + "learning_rate": 0.285172336850224, + "loss": 0.0263, + "num_input_tokens_seen": 9779200, + "step": 5710 + }, + { + "epoch": 27.7409200968523, + "grad_norm": 0.006465295795351267, + "learning_rate": 0.2851467906005871, + "loss": 0.039, + "num_input_tokens_seen": 9787744, + "step": 5715 + }, + { + "epoch": 27.765133171912833, + "grad_norm": 0.002489279955625534, + "learning_rate": 0.28512122350961683, + "loss": 0.0263, + "num_input_tokens_seen": 9796480, + "step": 5720 + }, + { + "epoch": 27.789346246973366, + "grad_norm": 0.014387407340109348, + "learning_rate": 0.2850956355812559, + "loss": 0.0483, + "num_input_tokens_seen": 9804992, + "step": 5725 + }, + { + "epoch": 27.8135593220339, + "grad_norm": 0.009332387708127499, + "learning_rate": 0.28507002681945015, + "loss": 0.032, + "num_input_tokens_seen": 9814016, + "step": 5730 + }, + { + "epoch": 27.83777239709443, + "grad_norm": 0.01578746922314167, + "learning_rate": 0.28504439722814895, + "loss": 0.0964, + "num_input_tokens_seen": 9822400, + "step": 5735 + }, + { + "epoch": 27.861985472154963, + "grad_norm": 0.004101573955267668, + "learning_rate": 0.28501874681130457, + "loss": 0.0558, + "num_input_tokens_seen": 9830944, + "step": 5740 + }, + { + "epoch": 27.886198547215496, + "grad_norm": 0.007614610251039267, + "learning_rate": 0.2849930755728727, + "loss": 0.0388, + "num_input_tokens_seen": 9839616, + "step": 5745 + }, + { + "epoch": 27.91041162227603, + "grad_norm": 0.013322088867425919, + "learning_rate": 0.28496738351681217, + "loss": 0.0493, + "num_input_tokens_seen": 9848192, + "step": 5750 + }, + { + "epoch": 27.934624697336563, + "grad_norm": 0.0037293678615242243, + "learning_rate": 0.284941670647085, + "loss": 0.0201, + "num_input_tokens_seen": 9857056, + "step": 5755 + }, + { + "epoch": 27.958837772397093, + "grad_norm": 0.008224154822528362, + "learning_rate": 0.2849159369676563, + "loss": 0.0298, + "num_input_tokens_seen": 9865888, + "step": 5760 + }, + { + "epoch": 27.983050847457626, + "grad_norm": 0.011563452892005444, + "learning_rate": 0.2848901824824948, + "loss": 0.0459, + "num_input_tokens_seen": 9874560, + "step": 5765 + }, + { + "epoch": 28.009685230024214, + "grad_norm": 0.007254316471517086, + "learning_rate": 0.284864407195572, + "loss": 0.0419, + "num_input_tokens_seen": 9883200, + "step": 5770 + }, + { + "epoch": 28.033898305084747, + "grad_norm": 0.011461423709988594, + "learning_rate": 0.28483861111086284, + "loss": 0.0341, + "num_input_tokens_seen": 9891488, + "step": 5775 + }, + { + "epoch": 28.058111380145277, + "grad_norm": 0.0037456154823303223, + "learning_rate": 0.2848127942323453, + "loss": 0.0145, + "num_input_tokens_seen": 9900320, + "step": 5780 + }, + { + "epoch": 28.08232445520581, + "grad_norm": 0.01312308106571436, + "learning_rate": 0.2847869565640007, + "loss": 0.0292, + "num_input_tokens_seen": 9908800, + "step": 5785 + }, + { + "epoch": 28.106537530266344, + "grad_norm": 0.0031054120045155287, + "learning_rate": 0.2847610981098136, + "loss": 0.0096, + "num_input_tokens_seen": 9916960, + "step": 5790 + }, + { + "epoch": 28.130750605326877, + "grad_norm": 0.009710662998259068, + "learning_rate": 0.2847352188737716, + "loss": 0.0467, + "num_input_tokens_seen": 9925344, + "step": 5795 + }, + { + "epoch": 28.15496368038741, + "grad_norm": 0.0009681852534413338, + "learning_rate": 0.2847093188598658, + "loss": 0.0166, + "num_input_tokens_seen": 9933568, + "step": 5800 + }, + { + "epoch": 28.15496368038741, + "eval_loss": 0.347304105758667, + "eval_runtime": 4.6131, + "eval_samples_per_second": 79.556, + "eval_steps_per_second": 19.943, + "num_input_tokens_seen": 9933568, + "step": 5800 + }, + { + "epoch": 28.17917675544794, + "grad_norm": 0.001612599822692573, + "learning_rate": 0.28468339807209003, + "loss": 0.0137, + "num_input_tokens_seen": 9942144, + "step": 5805 + }, + { + "epoch": 28.203389830508474, + "grad_norm": 0.013285033404827118, + "learning_rate": 0.2846574565144418, + "loss": 0.0268, + "num_input_tokens_seen": 9950784, + "step": 5810 + }, + { + "epoch": 28.227602905569007, + "grad_norm": 0.00469855684787035, + "learning_rate": 0.28463149419092154, + "loss": 0.0234, + "num_input_tokens_seen": 9959232, + "step": 5815 + }, + { + "epoch": 28.25181598062954, + "grad_norm": 0.004609542433172464, + "learning_rate": 0.284605511105533, + "loss": 0.0223, + "num_input_tokens_seen": 9967968, + "step": 5820 + }, + { + "epoch": 28.276029055690074, + "grad_norm": 0.004775868728756905, + "learning_rate": 0.28457950726228315, + "loss": 0.0163, + "num_input_tokens_seen": 9976288, + "step": 5825 + }, + { + "epoch": 28.300242130750604, + "grad_norm": 0.011360222473740578, + "learning_rate": 0.28455348266518193, + "loss": 0.025, + "num_input_tokens_seen": 9984928, + "step": 5830 + }, + { + "epoch": 28.324455205811137, + "grad_norm": 0.0009111149702221155, + "learning_rate": 0.28452743731824287, + "loss": 0.0137, + "num_input_tokens_seen": 9993216, + "step": 5835 + }, + { + "epoch": 28.34866828087167, + "grad_norm": 0.012649932876229286, + "learning_rate": 0.28450137122548236, + "loss": 0.0467, + "num_input_tokens_seen": 10001632, + "step": 5840 + }, + { + "epoch": 28.372881355932204, + "grad_norm": 0.006874158512800932, + "learning_rate": 0.2844752843909201, + "loss": 0.0133, + "num_input_tokens_seen": 10010304, + "step": 5845 + }, + { + "epoch": 28.397094430992738, + "grad_norm": 0.011582540348172188, + "learning_rate": 0.28444917681857923, + "loss": 0.0226, + "num_input_tokens_seen": 10019072, + "step": 5850 + }, + { + "epoch": 28.421307506053267, + "grad_norm": 0.0034014503471553326, + "learning_rate": 0.28442304851248557, + "loss": 0.0155, + "num_input_tokens_seen": 10027712, + "step": 5855 + }, + { + "epoch": 28.4455205811138, + "grad_norm": 0.009582020342350006, + "learning_rate": 0.2843968994766686, + "loss": 0.0136, + "num_input_tokens_seen": 10036608, + "step": 5860 + }, + { + "epoch": 28.469733656174334, + "grad_norm": 0.00364207336679101, + "learning_rate": 0.28437072971516075, + "loss": 0.0116, + "num_input_tokens_seen": 10044960, + "step": 5865 + }, + { + "epoch": 28.493946731234868, + "grad_norm": 0.00895712897181511, + "learning_rate": 0.2843445392319979, + "loss": 0.0205, + "num_input_tokens_seen": 10053568, + "step": 5870 + }, + { + "epoch": 28.5181598062954, + "grad_norm": 0.008032158948481083, + "learning_rate": 0.28431832803121865, + "loss": 0.0224, + "num_input_tokens_seen": 10062144, + "step": 5875 + }, + { + "epoch": 28.54237288135593, + "grad_norm": 0.0026575580704957247, + "learning_rate": 0.28429209611686534, + "loss": 0.0156, + "num_input_tokens_seen": 10070624, + "step": 5880 + }, + { + "epoch": 28.566585956416464, + "grad_norm": 0.005437444429844618, + "learning_rate": 0.28426584349298323, + "loss": 0.0166, + "num_input_tokens_seen": 10079328, + "step": 5885 + }, + { + "epoch": 28.590799031476998, + "grad_norm": 0.01664937101304531, + "learning_rate": 0.2842395701636207, + "loss": 0.0571, + "num_input_tokens_seen": 10087808, + "step": 5890 + }, + { + "epoch": 28.61501210653753, + "grad_norm": 0.014506528154015541, + "learning_rate": 0.28421327613282954, + "loss": 0.0651, + "num_input_tokens_seen": 10096192, + "step": 5895 + }, + { + "epoch": 28.639225181598064, + "grad_norm": 0.0009191579883918166, + "learning_rate": 0.28418696140466454, + "loss": 0.0273, + "num_input_tokens_seen": 10104704, + "step": 5900 + }, + { + "epoch": 28.663438256658594, + "grad_norm": 0.008955471217632294, + "learning_rate": 0.2841606259831838, + "loss": 0.0283, + "num_input_tokens_seen": 10113408, + "step": 5905 + }, + { + "epoch": 28.687651331719128, + "grad_norm": 0.0015941913006827235, + "learning_rate": 0.2841342698724486, + "loss": 0.017, + "num_input_tokens_seen": 10122016, + "step": 5910 + }, + { + "epoch": 28.71186440677966, + "grad_norm": 0.00606264965608716, + "learning_rate": 0.28410789307652334, + "loss": 0.0394, + "num_input_tokens_seen": 10130816, + "step": 5915 + }, + { + "epoch": 28.736077481840194, + "grad_norm": 0.006097684148699045, + "learning_rate": 0.2840814955994756, + "loss": 0.0418, + "num_input_tokens_seen": 10139456, + "step": 5920 + }, + { + "epoch": 28.760290556900728, + "grad_norm": 0.002304822439327836, + "learning_rate": 0.2840550774453763, + "loss": 0.0243, + "num_input_tokens_seen": 10148704, + "step": 5925 + }, + { + "epoch": 28.784503631961257, + "grad_norm": 0.00637489790096879, + "learning_rate": 0.28402863861829947, + "loss": 0.0408, + "num_input_tokens_seen": 10157056, + "step": 5930 + }, + { + "epoch": 28.80871670702179, + "grad_norm": 0.01166750118136406, + "learning_rate": 0.2840021791223222, + "loss": 0.065, + "num_input_tokens_seen": 10165504, + "step": 5935 + }, + { + "epoch": 28.832929782082324, + "grad_norm": 0.001513806520961225, + "learning_rate": 0.2839756989615249, + "loss": 0.0252, + "num_input_tokens_seen": 10174176, + "step": 5940 + }, + { + "epoch": 28.857142857142858, + "grad_norm": 0.004257069900631905, + "learning_rate": 0.28394919813999125, + "loss": 0.0344, + "num_input_tokens_seen": 10182880, + "step": 5945 + }, + { + "epoch": 28.88135593220339, + "grad_norm": 0.011887911707162857, + "learning_rate": 0.28392267666180787, + "loss": 0.0435, + "num_input_tokens_seen": 10191488, + "step": 5950 + }, + { + "epoch": 28.90556900726392, + "grad_norm": 0.0006612995057366788, + "learning_rate": 0.2838961345310648, + "loss": 0.0134, + "num_input_tokens_seen": 10200416, + "step": 5955 + }, + { + "epoch": 28.929782082324454, + "grad_norm": 0.01594662107527256, + "learning_rate": 0.2838695717518552, + "loss": 0.0517, + "num_input_tokens_seen": 10208832, + "step": 5960 + }, + { + "epoch": 28.953995157384988, + "grad_norm": 0.010013467632234097, + "learning_rate": 0.28384298832827526, + "loss": 0.0258, + "num_input_tokens_seen": 10217056, + "step": 5965 + }, + { + "epoch": 28.97820823244552, + "grad_norm": 0.004083605948835611, + "learning_rate": 0.28381638426442457, + "loss": 0.0304, + "num_input_tokens_seen": 10225632, + "step": 5970 + }, + { + "epoch": 29.004842615012105, + "grad_norm": 0.051845796406269073, + "learning_rate": 0.2837897595644057, + "loss": 0.0832, + "num_input_tokens_seen": 10234464, + "step": 5975 + }, + { + "epoch": 29.02905569007264, + "grad_norm": 0.0034325055312365294, + "learning_rate": 0.28376311423232475, + "loss": 0.0365, + "num_input_tokens_seen": 10243424, + "step": 5980 + }, + { + "epoch": 29.053268765133172, + "grad_norm": 0.0033665213268250227, + "learning_rate": 0.2837364482722905, + "loss": 0.0223, + "num_input_tokens_seen": 10251968, + "step": 5985 + }, + { + "epoch": 29.077481840193705, + "grad_norm": 0.006648968439549208, + "learning_rate": 0.28370976168841533, + "loss": 0.0276, + "num_input_tokens_seen": 10260448, + "step": 5990 + }, + { + "epoch": 29.10169491525424, + "grad_norm": 0.001443632529117167, + "learning_rate": 0.2836830544848146, + "loss": 0.0238, + "num_input_tokens_seen": 10268544, + "step": 5995 + }, + { + "epoch": 29.12590799031477, + "grad_norm": 0.007746626622974873, + "learning_rate": 0.2836563266656069, + "loss": 0.0216, + "num_input_tokens_seen": 10277088, + "step": 6000 + }, + { + "epoch": 29.12590799031477, + "eval_loss": 0.38236203789711, + "eval_runtime": 4.6273, + "eval_samples_per_second": 79.313, + "eval_steps_per_second": 19.882, + "num_input_tokens_seen": 10277088, + "step": 6000 + }, + { + "epoch": 29.150121065375302, + "grad_norm": 0.0010310927173122764, + "learning_rate": 0.283629578234914, + "loss": 0.091, + "num_input_tokens_seen": 10285760, + "step": 6005 + }, + { + "epoch": 29.174334140435835, + "grad_norm": 0.0006331848562695086, + "learning_rate": 0.2836028091968608, + "loss": 0.0425, + "num_input_tokens_seen": 10294784, + "step": 6010 + }, + { + "epoch": 29.19854721549637, + "grad_norm": 0.0034342652652412653, + "learning_rate": 0.28357601955557554, + "loss": 0.0096, + "num_input_tokens_seen": 10303360, + "step": 6015 + }, + { + "epoch": 29.222760290556902, + "grad_norm": 0.008264169096946716, + "learning_rate": 0.2835492093151894, + "loss": 0.0622, + "num_input_tokens_seen": 10312064, + "step": 6020 + }, + { + "epoch": 29.246973365617432, + "grad_norm": 0.015095417387783527, + "learning_rate": 0.2835223784798369, + "loss": 0.0892, + "num_input_tokens_seen": 10320448, + "step": 6025 + }, + { + "epoch": 29.271186440677965, + "grad_norm": 0.007208686787635088, + "learning_rate": 0.2834955270536557, + "loss": 0.0473, + "num_input_tokens_seen": 10329088, + "step": 6030 + }, + { + "epoch": 29.2953995157385, + "grad_norm": 0.010191328823566437, + "learning_rate": 0.2834686550407866, + "loss": 0.0935, + "num_input_tokens_seen": 10337664, + "step": 6035 + }, + { + "epoch": 29.319612590799032, + "grad_norm": 0.009477955289185047, + "learning_rate": 0.28344176244537367, + "loss": 0.1007, + "num_input_tokens_seen": 10346112, + "step": 6040 + }, + { + "epoch": 29.343825665859566, + "grad_norm": 0.010098096914589405, + "learning_rate": 0.28341484927156396, + "loss": 0.0527, + "num_input_tokens_seen": 10354912, + "step": 6045 + }, + { + "epoch": 29.368038740920095, + "grad_norm": 0.004085563123226166, + "learning_rate": 0.28338791552350795, + "loss": 0.0475, + "num_input_tokens_seen": 10363264, + "step": 6050 + }, + { + "epoch": 29.39225181598063, + "grad_norm": 0.01164751686155796, + "learning_rate": 0.28336096120535914, + "loss": 0.0438, + "num_input_tokens_seen": 10371552, + "step": 6055 + }, + { + "epoch": 29.416464891041162, + "grad_norm": 0.00784617755562067, + "learning_rate": 0.2833339863212741, + "loss": 0.0352, + "num_input_tokens_seen": 10380224, + "step": 6060 + }, + { + "epoch": 29.440677966101696, + "grad_norm": 0.006575527135282755, + "learning_rate": 0.28330699087541283, + "loss": 0.0377, + "num_input_tokens_seen": 10388352, + "step": 6065 + }, + { + "epoch": 29.46489104116223, + "grad_norm": 0.008188319392502308, + "learning_rate": 0.2832799748719384, + "loss": 0.0457, + "num_input_tokens_seen": 10397120, + "step": 6070 + }, + { + "epoch": 29.48910411622276, + "grad_norm": 0.007746563293039799, + "learning_rate": 0.28325293831501686, + "loss": 0.0353, + "num_input_tokens_seen": 10405664, + "step": 6075 + }, + { + "epoch": 29.513317191283292, + "grad_norm": 0.007749489508569241, + "learning_rate": 0.2832258812088177, + "loss": 0.0273, + "num_input_tokens_seen": 10414080, + "step": 6080 + }, + { + "epoch": 29.537530266343826, + "grad_norm": 0.010835062712430954, + "learning_rate": 0.2831988035575134, + "loss": 0.065, + "num_input_tokens_seen": 10422912, + "step": 6085 + }, + { + "epoch": 29.56174334140436, + "grad_norm": 0.001654289080761373, + "learning_rate": 0.28317170536527975, + "loss": 0.0306, + "num_input_tokens_seen": 10431168, + "step": 6090 + }, + { + "epoch": 29.585956416464892, + "grad_norm": 0.0022483731154352427, + "learning_rate": 0.2831445866362956, + "loss": 0.0467, + "num_input_tokens_seen": 10439648, + "step": 6095 + }, + { + "epoch": 29.610169491525422, + "grad_norm": 0.007163942791521549, + "learning_rate": 0.2831174473747429, + "loss": 0.0525, + "num_input_tokens_seen": 10448384, + "step": 6100 + }, + { + "epoch": 29.634382566585955, + "grad_norm": 0.0030901413410902023, + "learning_rate": 0.2830902875848071, + "loss": 0.0254, + "num_input_tokens_seen": 10456672, + "step": 6105 + }, + { + "epoch": 29.65859564164649, + "grad_norm": 0.0019474996952340007, + "learning_rate": 0.28306310727067635, + "loss": 0.0276, + "num_input_tokens_seen": 10465056, + "step": 6110 + }, + { + "epoch": 29.682808716707022, + "grad_norm": 0.005840312223881483, + "learning_rate": 0.2830359064365423, + "loss": 0.0256, + "num_input_tokens_seen": 10473088, + "step": 6115 + }, + { + "epoch": 29.707021791767556, + "grad_norm": 0.009356793947517872, + "learning_rate": 0.28300868508659965, + "loss": 0.0583, + "num_input_tokens_seen": 10482016, + "step": 6120 + }, + { + "epoch": 29.731234866828085, + "grad_norm": 0.009378666989505291, + "learning_rate": 0.28298144322504626, + "loss": 0.0217, + "num_input_tokens_seen": 10490912, + "step": 6125 + }, + { + "epoch": 29.75544794188862, + "grad_norm": 0.0035288427025079727, + "learning_rate": 0.2829541808560832, + "loss": 0.0378, + "num_input_tokens_seen": 10499392, + "step": 6130 + }, + { + "epoch": 29.779661016949152, + "grad_norm": 0.008073392324149609, + "learning_rate": 0.2829268979839146, + "loss": 0.0357, + "num_input_tokens_seen": 10507648, + "step": 6135 + }, + { + "epoch": 29.803874092009686, + "grad_norm": 0.020287415012717247, + "learning_rate": 0.2828995946127479, + "loss": 0.0514, + "num_input_tokens_seen": 10516608, + "step": 6140 + }, + { + "epoch": 29.82808716707022, + "grad_norm": 0.009940443560481071, + "learning_rate": 0.2828722707467936, + "loss": 0.0469, + "num_input_tokens_seen": 10525248, + "step": 6145 + }, + { + "epoch": 29.852300242130752, + "grad_norm": 0.00507322745397687, + "learning_rate": 0.2828449263902653, + "loss": 0.0296, + "num_input_tokens_seen": 10534016, + "step": 6150 + }, + { + "epoch": 29.876513317191282, + "grad_norm": 0.005047328304499388, + "learning_rate": 0.28281756154738, + "loss": 0.0269, + "num_input_tokens_seen": 10542592, + "step": 6155 + }, + { + "epoch": 29.900726392251816, + "grad_norm": 0.0019504317315295339, + "learning_rate": 0.28279017622235764, + "loss": 0.052, + "num_input_tokens_seen": 10551232, + "step": 6160 + }, + { + "epoch": 29.92493946731235, + "grad_norm": 0.005392779130488634, + "learning_rate": 0.28276277041942127, + "loss": 0.0224, + "num_input_tokens_seen": 10559648, + "step": 6165 + }, + { + "epoch": 29.949152542372882, + "grad_norm": 0.003869615960866213, + "learning_rate": 0.2827353441427974, + "loss": 0.0521, + "num_input_tokens_seen": 10568128, + "step": 6170 + }, + { + "epoch": 29.973365617433416, + "grad_norm": 0.007091515231877565, + "learning_rate": 0.2827078973967153, + "loss": 0.0549, + "num_input_tokens_seen": 10576672, + "step": 6175 + }, + { + "epoch": 29.997578692493946, + "grad_norm": 0.006147994659841061, + "learning_rate": 0.2826804301854078, + "loss": 0.0281, + "num_input_tokens_seen": 10585088, + "step": 6180 + }, + { + "epoch": 30.024213075060533, + "grad_norm": 0.005718730390071869, + "learning_rate": 0.2826529425131105, + "loss": 0.0105, + "num_input_tokens_seen": 10594176, + "step": 6185 + }, + { + "epoch": 30.048426150121067, + "grad_norm": 0.007616083137691021, + "learning_rate": 0.2826254343840625, + "loss": 0.0208, + "num_input_tokens_seen": 10602688, + "step": 6190 + }, + { + "epoch": 30.072639225181597, + "grad_norm": 0.0016875944565981627, + "learning_rate": 0.2825979058025059, + "loss": 0.0106, + "num_input_tokens_seen": 10610944, + "step": 6195 + }, + { + "epoch": 30.09685230024213, + "grad_norm": 0.004152728710323572, + "learning_rate": 0.2825703567726858, + "loss": 0.0187, + "num_input_tokens_seen": 10619488, + "step": 6200 + }, + { + "epoch": 30.09685230024213, + "eval_loss": 0.40932339429855347, + "eval_runtime": 4.6142, + "eval_samples_per_second": 79.538, + "eval_steps_per_second": 19.939, + "num_input_tokens_seen": 10619488, + "step": 6200 + }, + { + "epoch": 30.121065375302663, + "grad_norm": 0.0004888582625426352, + "learning_rate": 0.2825427872988508, + "loss": 0.024, + "num_input_tokens_seen": 10627968, + "step": 6205 + }, + { + "epoch": 30.145278450363197, + "grad_norm": 0.014699950814247131, + "learning_rate": 0.28251519738525227, + "loss": 0.0165, + "num_input_tokens_seen": 10636608, + "step": 6210 + }, + { + "epoch": 30.16949152542373, + "grad_norm": 0.002133154310286045, + "learning_rate": 0.28248758703614507, + "loss": 0.0155, + "num_input_tokens_seen": 10645152, + "step": 6215 + }, + { + "epoch": 30.19370460048426, + "grad_norm": 0.01412939466536045, + "learning_rate": 0.28245995625578696, + "loss": 0.0296, + "num_input_tokens_seen": 10653856, + "step": 6220 + }, + { + "epoch": 30.217917675544793, + "grad_norm": 0.009586004540324211, + "learning_rate": 0.282432305048439, + "loss": 0.0251, + "num_input_tokens_seen": 10662400, + "step": 6225 + }, + { + "epoch": 30.242130750605327, + "grad_norm": 0.002459130249917507, + "learning_rate": 0.28240463341836536, + "loss": 0.034, + "num_input_tokens_seen": 10670976, + "step": 6230 + }, + { + "epoch": 30.26634382566586, + "grad_norm": 0.00838039442896843, + "learning_rate": 0.2823769413698334, + "loss": 0.0336, + "num_input_tokens_seen": 10679424, + "step": 6235 + }, + { + "epoch": 30.290556900726394, + "grad_norm": 0.007183686830103397, + "learning_rate": 0.2823492289071135, + "loss": 0.028, + "num_input_tokens_seen": 10688128, + "step": 6240 + }, + { + "epoch": 30.314769975786923, + "grad_norm": 0.0014300947077572346, + "learning_rate": 0.2823214960344793, + "loss": 0.0403, + "num_input_tokens_seen": 10696864, + "step": 6245 + }, + { + "epoch": 30.338983050847457, + "grad_norm": 0.0013255664380267262, + "learning_rate": 0.28229374275620756, + "loss": 0.0205, + "num_input_tokens_seen": 10705312, + "step": 6250 + }, + { + "epoch": 30.36319612590799, + "grad_norm": 0.0006239761714823544, + "learning_rate": 0.28226596907657814, + "loss": 0.0289, + "num_input_tokens_seen": 10714048, + "step": 6255 + }, + { + "epoch": 30.387409200968523, + "grad_norm": 0.0007189657771959901, + "learning_rate": 0.28223817499987414, + "loss": 0.0128, + "num_input_tokens_seen": 10722816, + "step": 6260 + }, + { + "epoch": 30.411622276029057, + "grad_norm": 0.004801915492862463, + "learning_rate": 0.2822103605303818, + "loss": 0.0463, + "num_input_tokens_seen": 10731232, + "step": 6265 + }, + { + "epoch": 30.435835351089587, + "grad_norm": 0.003390589961782098, + "learning_rate": 0.2821825256723903, + "loss": 0.028, + "num_input_tokens_seen": 10739840, + "step": 6270 + }, + { + "epoch": 30.46004842615012, + "grad_norm": 0.0056129139848053455, + "learning_rate": 0.2821546704301923, + "loss": 0.0209, + "num_input_tokens_seen": 10748352, + "step": 6275 + }, + { + "epoch": 30.484261501210653, + "grad_norm": 0.007951987907290459, + "learning_rate": 0.2821267948080834, + "loss": 0.0384, + "num_input_tokens_seen": 10756608, + "step": 6280 + }, + { + "epoch": 30.508474576271187, + "grad_norm": 0.008708737790584564, + "learning_rate": 0.28209889881036226, + "loss": 0.0292, + "num_input_tokens_seen": 10765152, + "step": 6285 + }, + { + "epoch": 30.53268765133172, + "grad_norm": 0.00568590871989727, + "learning_rate": 0.28207098244133094, + "loss": 0.0245, + "num_input_tokens_seen": 10774304, + "step": 6290 + }, + { + "epoch": 30.55690072639225, + "grad_norm": 0.0005429614102467895, + "learning_rate": 0.2820430457052943, + "loss": 0.0295, + "num_input_tokens_seen": 10782688, + "step": 6295 + }, + { + "epoch": 30.581113801452783, + "grad_norm": 0.002156594768166542, + "learning_rate": 0.28201508860656077, + "loss": 0.0166, + "num_input_tokens_seen": 10790944, + "step": 6300 + }, + { + "epoch": 30.605326876513317, + "grad_norm": 0.0031992888543754816, + "learning_rate": 0.2819871111494415, + "loss": 0.0216, + "num_input_tokens_seen": 10799488, + "step": 6305 + }, + { + "epoch": 30.62953995157385, + "grad_norm": 0.004360993858426809, + "learning_rate": 0.28195911333825113, + "loss": 0.0101, + "num_input_tokens_seen": 10808320, + "step": 6310 + }, + { + "epoch": 30.653753026634384, + "grad_norm": 0.012063895352184772, + "learning_rate": 0.28193109517730713, + "loss": 0.0202, + "num_input_tokens_seen": 10816608, + "step": 6315 + }, + { + "epoch": 30.677966101694913, + "grad_norm": 0.005196544341742992, + "learning_rate": 0.2819030566709303, + "loss": 0.0146, + "num_input_tokens_seen": 10825120, + "step": 6320 + }, + { + "epoch": 30.702179176755447, + "grad_norm": 0.006044864188879728, + "learning_rate": 0.2818749978234445, + "loss": 0.0241, + "num_input_tokens_seen": 10833696, + "step": 6325 + }, + { + "epoch": 30.72639225181598, + "grad_norm": 0.012050513178110123, + "learning_rate": 0.2818469186391768, + "loss": 0.0273, + "num_input_tokens_seen": 10842048, + "step": 6330 + }, + { + "epoch": 30.750605326876514, + "grad_norm": 0.009396987035870552, + "learning_rate": 0.28181881912245743, + "loss": 0.0183, + "num_input_tokens_seen": 10850656, + "step": 6335 + }, + { + "epoch": 30.774818401937047, + "grad_norm": 0.007490444928407669, + "learning_rate": 0.2817906992776195, + "loss": 0.0205, + "num_input_tokens_seen": 10859200, + "step": 6340 + }, + { + "epoch": 30.79903147699758, + "grad_norm": 0.0036698312032967806, + "learning_rate": 0.28176255910899967, + "loss": 0.0123, + "num_input_tokens_seen": 10867680, + "step": 6345 + }, + { + "epoch": 30.82324455205811, + "grad_norm": 0.01129594724625349, + "learning_rate": 0.2817343986209373, + "loss": 0.0307, + "num_input_tokens_seen": 10876352, + "step": 6350 + }, + { + "epoch": 30.847457627118644, + "grad_norm": 0.011349881067872047, + "learning_rate": 0.2817062178177753, + "loss": 0.0297, + "num_input_tokens_seen": 10884992, + "step": 6355 + }, + { + "epoch": 30.871670702179177, + "grad_norm": 0.0034491100814193487, + "learning_rate": 0.2816780167038593, + "loss": 0.0452, + "num_input_tokens_seen": 10893472, + "step": 6360 + }, + { + "epoch": 30.89588377723971, + "grad_norm": 0.005815032869577408, + "learning_rate": 0.28164979528353834, + "loss": 0.0299, + "num_input_tokens_seen": 10901728, + "step": 6365 + }, + { + "epoch": 30.920096852300244, + "grad_norm": 0.015379645861685276, + "learning_rate": 0.28162155356116453, + "loss": 0.0559, + "num_input_tokens_seen": 10910272, + "step": 6370 + }, + { + "epoch": 30.944309927360774, + "grad_norm": 0.010143215768039227, + "learning_rate": 0.28159329154109314, + "loss": 0.043, + "num_input_tokens_seen": 10918592, + "step": 6375 + }, + { + "epoch": 30.968523002421307, + "grad_norm": 0.0025285209994763136, + "learning_rate": 0.28156500922768246, + "loss": 0.0361, + "num_input_tokens_seen": 10927424, + "step": 6380 + }, + { + "epoch": 30.99273607748184, + "grad_norm": 0.005930402781814337, + "learning_rate": 0.28153670662529406, + "loss": 0.0353, + "num_input_tokens_seen": 10936384, + "step": 6385 + }, + { + "epoch": 31.019370460048425, + "grad_norm": 0.0019075904274359345, + "learning_rate": 0.28150838373829246, + "loss": 0.0281, + "num_input_tokens_seen": 10945216, + "step": 6390 + }, + { + "epoch": 31.043583535108958, + "grad_norm": 0.001058183261193335, + "learning_rate": 0.2814800405710455, + "loss": 0.0304, + "num_input_tokens_seen": 10953568, + "step": 6395 + }, + { + "epoch": 31.06779661016949, + "grad_norm": 0.009708947502076626, + "learning_rate": 0.2814516771279239, + "loss": 0.0332, + "num_input_tokens_seen": 10962112, + "step": 6400 + }, + { + "epoch": 31.06779661016949, + "eval_loss": 0.38957688212394714, + "eval_runtime": 4.6231, + "eval_samples_per_second": 79.383, + "eval_steps_per_second": 19.9, + "num_input_tokens_seen": 10962112, + "step": 6400 + }, + { + "epoch": 31.092009685230025, + "grad_norm": 0.0018324910197407007, + "learning_rate": 0.28142329341330186, + "loss": 0.0109, + "num_input_tokens_seen": 10971200, + "step": 6405 + }, + { + "epoch": 31.116222760290558, + "grad_norm": 0.0007369147497229278, + "learning_rate": 0.2813948894315564, + "loss": 0.0174, + "num_input_tokens_seen": 10979936, + "step": 6410 + }, + { + "epoch": 31.140435835351088, + "grad_norm": 0.0002355070464545861, + "learning_rate": 0.2813664651870677, + "loss": 0.0055, + "num_input_tokens_seen": 10988512, + "step": 6415 + }, + { + "epoch": 31.16464891041162, + "grad_norm": 0.0018119249725714326, + "learning_rate": 0.28133802068421926, + "loss": 0.0031, + "num_input_tokens_seen": 10997056, + "step": 6420 + }, + { + "epoch": 31.188861985472155, + "grad_norm": 0.0052183400839567184, + "learning_rate": 0.28130955592739754, + "loss": 0.0257, + "num_input_tokens_seen": 11005408, + "step": 6425 + }, + { + "epoch": 31.213075060532688, + "grad_norm": 0.002561873523518443, + "learning_rate": 0.2812810709209922, + "loss": 0.0177, + "num_input_tokens_seen": 11013824, + "step": 6430 + }, + { + "epoch": 31.23728813559322, + "grad_norm": 0.002841181354597211, + "learning_rate": 0.2812525656693959, + "loss": 0.0105, + "num_input_tokens_seen": 11022464, + "step": 6435 + }, + { + "epoch": 31.26150121065375, + "grad_norm": 0.003280766075477004, + "learning_rate": 0.28122404017700453, + "loss": 0.0045, + "num_input_tokens_seen": 11031552, + "step": 6440 + }, + { + "epoch": 31.285714285714285, + "grad_norm": 0.008911705575883389, + "learning_rate": 0.2811954944482171, + "loss": 0.0176, + "num_input_tokens_seen": 11039840, + "step": 6445 + }, + { + "epoch": 31.309927360774818, + "grad_norm": 0.0017046238062903285, + "learning_rate": 0.2811669284874358, + "loss": 0.0104, + "num_input_tokens_seen": 11048320, + "step": 6450 + }, + { + "epoch": 31.33414043583535, + "grad_norm": 0.004114898853003979, + "learning_rate": 0.2811383422990657, + "loss": 0.0172, + "num_input_tokens_seen": 11056928, + "step": 6455 + }, + { + "epoch": 31.358353510895885, + "grad_norm": 0.00923193246126175, + "learning_rate": 0.2811097358875152, + "loss": 0.0188, + "num_input_tokens_seen": 11065568, + "step": 6460 + }, + { + "epoch": 31.38256658595642, + "grad_norm": 0.0010740322759374976, + "learning_rate": 0.2810811092571959, + "loss": 0.0091, + "num_input_tokens_seen": 11074368, + "step": 6465 + }, + { + "epoch": 31.406779661016948, + "grad_norm": 0.007040260825306177, + "learning_rate": 0.28105246241252224, + "loss": 0.0142, + "num_input_tokens_seen": 11082528, + "step": 6470 + }, + { + "epoch": 31.43099273607748, + "grad_norm": 0.01073891669511795, + "learning_rate": 0.28102379535791194, + "loss": 0.0315, + "num_input_tokens_seen": 11091104, + "step": 6475 + }, + { + "epoch": 31.455205811138015, + "grad_norm": 0.009309533052146435, + "learning_rate": 0.2809951080977859, + "loss": 0.0182, + "num_input_tokens_seen": 11099680, + "step": 6480 + }, + { + "epoch": 31.479418886198548, + "grad_norm": 0.0031392527744174004, + "learning_rate": 0.28096640063656797, + "loss": 0.0168, + "num_input_tokens_seen": 11108192, + "step": 6485 + }, + { + "epoch": 31.50363196125908, + "grad_norm": 0.005916336551308632, + "learning_rate": 0.2809376729786852, + "loss": 0.018, + "num_input_tokens_seen": 11116640, + "step": 6490 + }, + { + "epoch": 31.52784503631961, + "grad_norm": 0.0069991727359592915, + "learning_rate": 0.28090892512856785, + "loss": 0.0116, + "num_input_tokens_seen": 11124992, + "step": 6495 + }, + { + "epoch": 31.552058111380145, + "grad_norm": 0.0011779326014220715, + "learning_rate": 0.2808801570906491, + "loss": 0.0175, + "num_input_tokens_seen": 11133600, + "step": 6500 + }, + { + "epoch": 31.576271186440678, + "grad_norm": 0.015682745724916458, + "learning_rate": 0.2808513688693654, + "loss": 0.0521, + "num_input_tokens_seen": 11142272, + "step": 6505 + }, + { + "epoch": 31.60048426150121, + "grad_norm": 0.006287550553679466, + "learning_rate": 0.28082256046915627, + "loss": 0.0107, + "num_input_tokens_seen": 11150560, + "step": 6510 + }, + { + "epoch": 31.624697336561745, + "grad_norm": 0.002363968873396516, + "learning_rate": 0.28079373189446427, + "loss": 0.0373, + "num_input_tokens_seen": 11159008, + "step": 6515 + }, + { + "epoch": 31.648910411622275, + "grad_norm": 0.010363094508647919, + "learning_rate": 0.28076488314973513, + "loss": 0.0241, + "num_input_tokens_seen": 11167712, + "step": 6520 + }, + { + "epoch": 31.673123486682808, + "grad_norm": 0.007117273285984993, + "learning_rate": 0.28073601423941774, + "loss": 0.0279, + "num_input_tokens_seen": 11176608, + "step": 6525 + }, + { + "epoch": 31.69733656174334, + "grad_norm": 0.010571841150522232, + "learning_rate": 0.28070712516796403, + "loss": 0.0462, + "num_input_tokens_seen": 11185472, + "step": 6530 + }, + { + "epoch": 31.721549636803875, + "grad_norm": 0.010187139734625816, + "learning_rate": 0.28067821593982906, + "loss": 0.0368, + "num_input_tokens_seen": 11194048, + "step": 6535 + }, + { + "epoch": 31.74576271186441, + "grad_norm": 0.010741823352873325, + "learning_rate": 0.28064928655947097, + "loss": 0.0397, + "num_input_tokens_seen": 11202880, + "step": 6540 + }, + { + "epoch": 31.769975786924938, + "grad_norm": 0.004384764935821295, + "learning_rate": 0.28062033703135103, + "loss": 0.0249, + "num_input_tokens_seen": 11211680, + "step": 6545 + }, + { + "epoch": 31.79418886198547, + "grad_norm": 0.006513781379908323, + "learning_rate": 0.2805913673599337, + "loss": 0.0126, + "num_input_tokens_seen": 11219872, + "step": 6550 + }, + { + "epoch": 31.818401937046005, + "grad_norm": 0.012785282917320728, + "learning_rate": 0.2805623775496864, + "loss": 0.0614, + "num_input_tokens_seen": 11228288, + "step": 6555 + }, + { + "epoch": 31.84261501210654, + "grad_norm": 0.008441115729510784, + "learning_rate": 0.2805333676050797, + "loss": 0.0319, + "num_input_tokens_seen": 11236864, + "step": 6560 + }, + { + "epoch": 31.86682808716707, + "grad_norm": 0.009852394461631775, + "learning_rate": 0.2805043375305873, + "loss": 0.0403, + "num_input_tokens_seen": 11245344, + "step": 6565 + }, + { + "epoch": 31.8910411622276, + "grad_norm": 0.00236009550280869, + "learning_rate": 0.2804752873306861, + "loss": 0.0122, + "num_input_tokens_seen": 11254016, + "step": 6570 + }, + { + "epoch": 31.915254237288135, + "grad_norm": 0.003650831524282694, + "learning_rate": 0.2804462170098559, + "loss": 0.046, + "num_input_tokens_seen": 11262720, + "step": 6575 + }, + { + "epoch": 31.93946731234867, + "grad_norm": 0.0016207348089665174, + "learning_rate": 0.2804171265725797, + "loss": 0.0162, + "num_input_tokens_seen": 11271168, + "step": 6580 + }, + { + "epoch": 31.9636803874092, + "grad_norm": 0.0012548647355288267, + "learning_rate": 0.28038801602334373, + "loss": 0.0273, + "num_input_tokens_seen": 11279360, + "step": 6585 + }, + { + "epoch": 31.987893462469735, + "grad_norm": 0.004016688093543053, + "learning_rate": 0.28035888536663717, + "loss": 0.0409, + "num_input_tokens_seen": 11287840, + "step": 6590 + }, + { + "epoch": 32.01452784503632, + "grad_norm": 0.0017827354604378343, + "learning_rate": 0.2803297346069522, + "loss": 0.0261, + "num_input_tokens_seen": 11297248, + "step": 6595 + }, + { + "epoch": 32.03874092009685, + "grad_norm": 0.00200119917280972, + "learning_rate": 0.28030056374878437, + "loss": 0.0287, + "num_input_tokens_seen": 11306080, + "step": 6600 + }, + { + "epoch": 32.03874092009685, + "eval_loss": 0.3758592903614044, + "eval_runtime": 4.6119, + "eval_samples_per_second": 79.577, + "eval_steps_per_second": 19.948, + "num_input_tokens_seen": 11306080, + "step": 6600 + }, + { + "epoch": 32.062953995157386, + "grad_norm": 0.010961097665131092, + "learning_rate": 0.2802713727966321, + "loss": 0.0206, + "num_input_tokens_seen": 11314880, + "step": 6605 + }, + { + "epoch": 32.087167070217916, + "grad_norm": 0.0019352895906195045, + "learning_rate": 0.28024216175499717, + "loss": 0.0066, + "num_input_tokens_seen": 11323488, + "step": 6610 + }, + { + "epoch": 32.11138014527845, + "grad_norm": 0.010365807451307774, + "learning_rate": 0.2802129306283841, + "loss": 0.0238, + "num_input_tokens_seen": 11331968, + "step": 6615 + }, + { + "epoch": 32.13559322033898, + "grad_norm": 0.002487853402271867, + "learning_rate": 0.28018367942130074, + "loss": 0.0076, + "num_input_tokens_seen": 11340480, + "step": 6620 + }, + { + "epoch": 32.15980629539951, + "grad_norm": 0.007942247204482555, + "learning_rate": 0.28015440813825804, + "loss": 0.0205, + "num_input_tokens_seen": 11349184, + "step": 6625 + }, + { + "epoch": 32.18401937046005, + "grad_norm": 0.0010227295570075512, + "learning_rate": 0.28012511678377006, + "loss": 0.0199, + "num_input_tokens_seen": 11357664, + "step": 6630 + }, + { + "epoch": 32.20823244552058, + "grad_norm": 0.0013582519022747874, + "learning_rate": 0.28009580536235373, + "loss": 0.005, + "num_input_tokens_seen": 11366080, + "step": 6635 + }, + { + "epoch": 32.232445520581116, + "grad_norm": 0.0010042107896879315, + "learning_rate": 0.28006647387852934, + "loss": 0.0293, + "num_input_tokens_seen": 11374496, + "step": 6640 + }, + { + "epoch": 32.256658595641646, + "grad_norm": 0.007315501570701599, + "learning_rate": 0.28003712233682015, + "loss": 0.0085, + "num_input_tokens_seen": 11382976, + "step": 6645 + }, + { + "epoch": 32.280871670702176, + "grad_norm": 0.0036110791843384504, + "learning_rate": 0.2800077507417526, + "loss": 0.016, + "num_input_tokens_seen": 11391488, + "step": 6650 + }, + { + "epoch": 32.30508474576271, + "grad_norm": 0.004272564314305782, + "learning_rate": 0.2799783590978561, + "loss": 0.0106, + "num_input_tokens_seen": 11399616, + "step": 6655 + }, + { + "epoch": 32.32929782082324, + "grad_norm": 0.00033628640812821686, + "learning_rate": 0.2799489474096632, + "loss": 0.0132, + "num_input_tokens_seen": 11408288, + "step": 6660 + }, + { + "epoch": 32.35351089588378, + "grad_norm": 0.012678660452365875, + "learning_rate": 0.27991951568170953, + "loss": 0.0221, + "num_input_tokens_seen": 11417024, + "step": 6665 + }, + { + "epoch": 32.37772397094431, + "grad_norm": 0.0023948538582772017, + "learning_rate": 0.2798900639185339, + "loss": 0.0269, + "num_input_tokens_seen": 11425920, + "step": 6670 + }, + { + "epoch": 32.40193704600484, + "grad_norm": 0.004526571370661259, + "learning_rate": 0.2798605921246781, + "loss": 0.056, + "num_input_tokens_seen": 11434240, + "step": 6675 + }, + { + "epoch": 32.426150121065376, + "grad_norm": 0.02274603210389614, + "learning_rate": 0.2798311003046871, + "loss": 0.054, + "num_input_tokens_seen": 11442784, + "step": 6680 + }, + { + "epoch": 32.450363196125906, + "grad_norm": 0.0020454248879104853, + "learning_rate": 0.2798015884631089, + "loss": 0.0406, + "num_input_tokens_seen": 11451456, + "step": 6685 + }, + { + "epoch": 32.47457627118644, + "grad_norm": 0.0034039891324937344, + "learning_rate": 0.27977205660449445, + "loss": 0.0507, + "num_input_tokens_seen": 11460064, + "step": 6690 + }, + { + "epoch": 32.49878934624697, + "grad_norm": 0.0052105216309428215, + "learning_rate": 0.2797425047333981, + "loss": 0.0146, + "num_input_tokens_seen": 11468512, + "step": 6695 + }, + { + "epoch": 32.5230024213075, + "grad_norm": 0.01816197857260704, + "learning_rate": 0.27971293285437715, + "loss": 0.0679, + "num_input_tokens_seen": 11477024, + "step": 6700 + }, + { + "epoch": 32.54721549636804, + "grad_norm": 0.007343772798776627, + "learning_rate": 0.2796833409719918, + "loss": 0.0456, + "num_input_tokens_seen": 11485952, + "step": 6705 + }, + { + "epoch": 32.57142857142857, + "grad_norm": 0.006980357691645622, + "learning_rate": 0.27965372909080566, + "loss": 0.0323, + "num_input_tokens_seen": 11494368, + "step": 6710 + }, + { + "epoch": 32.595641646489106, + "grad_norm": 0.006549867335706949, + "learning_rate": 0.27962409721538506, + "loss": 0.0266, + "num_input_tokens_seen": 11503072, + "step": 6715 + }, + { + "epoch": 32.619854721549636, + "grad_norm": 0.0032377506140619516, + "learning_rate": 0.27959444535029976, + "loss": 0.009, + "num_input_tokens_seen": 11511712, + "step": 6720 + }, + { + "epoch": 32.644067796610166, + "grad_norm": 0.012464367784559727, + "learning_rate": 0.27956477350012243, + "loss": 0.0211, + "num_input_tokens_seen": 11520608, + "step": 6725 + }, + { + "epoch": 32.6682808716707, + "grad_norm": 0.009873084723949432, + "learning_rate": 0.27953508166942875, + "loss": 0.0231, + "num_input_tokens_seen": 11529184, + "step": 6730 + }, + { + "epoch": 32.69249394673123, + "grad_norm": 0.004180551040917635, + "learning_rate": 0.27950536986279767, + "loss": 0.0305, + "num_input_tokens_seen": 11537696, + "step": 6735 + }, + { + "epoch": 32.71670702179177, + "grad_norm": 0.0060309297405183315, + "learning_rate": 0.2794756380848111, + "loss": 0.032, + "num_input_tokens_seen": 11546176, + "step": 6740 + }, + { + "epoch": 32.7409200968523, + "grad_norm": 0.0035676860716193914, + "learning_rate": 0.279445886340054, + "loss": 0.0067, + "num_input_tokens_seen": 11554688, + "step": 6745 + }, + { + "epoch": 32.76513317191284, + "grad_norm": 0.010883619077503681, + "learning_rate": 0.27941611463311455, + "loss": 0.0707, + "num_input_tokens_seen": 11563072, + "step": 6750 + }, + { + "epoch": 32.789346246973366, + "grad_norm": 0.008509734645485878, + "learning_rate": 0.2793863229685839, + "loss": 0.0497, + "num_input_tokens_seen": 11571520, + "step": 6755 + }, + { + "epoch": 32.813559322033896, + "grad_norm": 0.005979436449706554, + "learning_rate": 0.27935651135105627, + "loss": 0.0259, + "num_input_tokens_seen": 11580480, + "step": 6760 + }, + { + "epoch": 32.83777239709443, + "grad_norm": 0.007930135354399681, + "learning_rate": 0.279326679785129, + "loss": 0.0487, + "num_input_tokens_seen": 11588736, + "step": 6765 + }, + { + "epoch": 32.86198547215496, + "grad_norm": 0.0014408512506633997, + "learning_rate": 0.2792968282754024, + "loss": 0.0535, + "num_input_tokens_seen": 11597280, + "step": 6770 + }, + { + "epoch": 32.8861985472155, + "grad_norm": 0.005627661943435669, + "learning_rate": 0.2792669568264801, + "loss": 0.0357, + "num_input_tokens_seen": 11606016, + "step": 6775 + }, + { + "epoch": 32.91041162227603, + "grad_norm": 0.007340212352573872, + "learning_rate": 0.27923706544296856, + "loss": 0.0889, + "num_input_tokens_seen": 11614464, + "step": 6780 + }, + { + "epoch": 32.93462469733656, + "grad_norm": 0.012119791470468044, + "learning_rate": 0.2792071541294775, + "loss": 0.0485, + "num_input_tokens_seen": 11623040, + "step": 6785 + }, + { + "epoch": 32.958837772397096, + "grad_norm": 0.0051692212000489235, + "learning_rate": 0.27917722289061947, + "loss": 0.0407, + "num_input_tokens_seen": 11631584, + "step": 6790 + }, + { + "epoch": 32.983050847457626, + "grad_norm": 0.010908465832471848, + "learning_rate": 0.27914727173101034, + "loss": 0.0331, + "num_input_tokens_seen": 11639904, + "step": 6795 + }, + { + "epoch": 33.00968523002421, + "grad_norm": 0.007426546886563301, + "learning_rate": 0.279117300655269, + "loss": 0.0515, + "num_input_tokens_seen": 11649024, + "step": 6800 + }, + { + "epoch": 33.00968523002421, + "eval_loss": 0.3480857312679291, + "eval_runtime": 4.6208, + "eval_samples_per_second": 79.423, + "eval_steps_per_second": 19.91, + "num_input_tokens_seen": 11649024, + "step": 6800 + }, + { + "epoch": 33.03389830508475, + "grad_norm": 0.008043481968343258, + "learning_rate": 0.2790873096680173, + "loss": 0.0362, + "num_input_tokens_seen": 11657760, + "step": 6805 + }, + { + "epoch": 33.05811138014528, + "grad_norm": 0.002948666224256158, + "learning_rate": 0.2790572987738802, + "loss": 0.0263, + "num_input_tokens_seen": 11666304, + "step": 6810 + }, + { + "epoch": 33.082324455205814, + "grad_norm": 0.000923606101423502, + "learning_rate": 0.27902726797748584, + "loss": 0.0208, + "num_input_tokens_seen": 11675072, + "step": 6815 + }, + { + "epoch": 33.106537530266344, + "grad_norm": 0.0016540224896743894, + "learning_rate": 0.2789972172834652, + "loss": 0.0099, + "num_input_tokens_seen": 11683840, + "step": 6820 + }, + { + "epoch": 33.130750605326874, + "grad_norm": 0.001307376311160624, + "learning_rate": 0.2789671466964527, + "loss": 0.0064, + "num_input_tokens_seen": 11693056, + "step": 6825 + }, + { + "epoch": 33.15496368038741, + "grad_norm": 0.007065641228109598, + "learning_rate": 0.2789370562210854, + "loss": 0.0217, + "num_input_tokens_seen": 11701280, + "step": 6830 + }, + { + "epoch": 33.17917675544794, + "grad_norm": 0.0016106164548546076, + "learning_rate": 0.27890694586200376, + "loss": 0.0043, + "num_input_tokens_seen": 11709888, + "step": 6835 + }, + { + "epoch": 33.20338983050848, + "grad_norm": 0.007746633142232895, + "learning_rate": 0.2788768156238511, + "loss": 0.038, + "num_input_tokens_seen": 11718592, + "step": 6840 + }, + { + "epoch": 33.22760290556901, + "grad_norm": 0.012635056860744953, + "learning_rate": 0.27884666551127385, + "loss": 0.0167, + "num_input_tokens_seen": 11726976, + "step": 6845 + }, + { + "epoch": 33.25181598062954, + "grad_norm": 0.002255229977890849, + "learning_rate": 0.2788164955289217, + "loss": 0.0276, + "num_input_tokens_seen": 11735488, + "step": 6850 + }, + { + "epoch": 33.276029055690074, + "grad_norm": 0.003482096828520298, + "learning_rate": 0.27878630568144697, + "loss": 0.0339, + "num_input_tokens_seen": 11744224, + "step": 6855 + }, + { + "epoch": 33.300242130750604, + "grad_norm": 0.0003029554500244558, + "learning_rate": 0.2787560959735056, + "loss": 0.0113, + "num_input_tokens_seen": 11752896, + "step": 6860 + }, + { + "epoch": 33.32445520581114, + "grad_norm": 0.0018309784354642034, + "learning_rate": 0.27872586640975616, + "loss": 0.018, + "num_input_tokens_seen": 11761248, + "step": 6865 + }, + { + "epoch": 33.34866828087167, + "grad_norm": 0.0010292233200743794, + "learning_rate": 0.27869561699486045, + "loss": 0.0152, + "num_input_tokens_seen": 11769728, + "step": 6870 + }, + { + "epoch": 33.3728813559322, + "grad_norm": 0.0033901166170835495, + "learning_rate": 0.2786653477334833, + "loss": 0.0338, + "num_input_tokens_seen": 11778432, + "step": 6875 + }, + { + "epoch": 33.39709443099274, + "grad_norm": 0.0003219728241674602, + "learning_rate": 0.2786350586302926, + "loss": 0.0117, + "num_input_tokens_seen": 11787040, + "step": 6880 + }, + { + "epoch": 33.42130750605327, + "grad_norm": 0.00915338285267353, + "learning_rate": 0.27860474968995935, + "loss": 0.0194, + "num_input_tokens_seen": 11795424, + "step": 6885 + }, + { + "epoch": 33.445520581113804, + "grad_norm": 0.005368479527533054, + "learning_rate": 0.27857442091715756, + "loss": 0.0159, + "num_input_tokens_seen": 11804064, + "step": 6890 + }, + { + "epoch": 33.469733656174334, + "grad_norm": 0.00047489651478827, + "learning_rate": 0.27854407231656425, + "loss": 0.0297, + "num_input_tokens_seen": 11812384, + "step": 6895 + }, + { + "epoch": 33.493946731234864, + "grad_norm": 0.008412981405854225, + "learning_rate": 0.2785137038928596, + "loss": 0.0187, + "num_input_tokens_seen": 11821344, + "step": 6900 + }, + { + "epoch": 33.5181598062954, + "grad_norm": 0.005653939675539732, + "learning_rate": 0.27848331565072687, + "loss": 0.0277, + "num_input_tokens_seen": 11829696, + "step": 6905 + }, + { + "epoch": 33.54237288135593, + "grad_norm": 0.016949940472841263, + "learning_rate": 0.27845290759485225, + "loss": 0.0239, + "num_input_tokens_seen": 11837792, + "step": 6910 + }, + { + "epoch": 33.56658595641647, + "grad_norm": 0.011399650946259499, + "learning_rate": 0.278422479729925, + "loss": 0.021, + "num_input_tokens_seen": 11846304, + "step": 6915 + }, + { + "epoch": 33.590799031477, + "grad_norm": 0.016283871605992317, + "learning_rate": 0.2783920320606375, + "loss": 0.0382, + "num_input_tokens_seen": 11855136, + "step": 6920 + }, + { + "epoch": 33.61501210653753, + "grad_norm": 0.00742152938619256, + "learning_rate": 0.2783615645916852, + "loss": 0.0199, + "num_input_tokens_seen": 11863680, + "step": 6925 + }, + { + "epoch": 33.639225181598064, + "grad_norm": 0.004940335173159838, + "learning_rate": 0.2783310773277666, + "loss": 0.0234, + "num_input_tokens_seen": 11872096, + "step": 6930 + }, + { + "epoch": 33.663438256658594, + "grad_norm": 0.004314760211855173, + "learning_rate": 0.2783005702735831, + "loss": 0.0195, + "num_input_tokens_seen": 11880480, + "step": 6935 + }, + { + "epoch": 33.68765133171913, + "grad_norm": 0.011629710905253887, + "learning_rate": 0.2782700434338394, + "loss": 0.0287, + "num_input_tokens_seen": 11889056, + "step": 6940 + }, + { + "epoch": 33.71186440677966, + "grad_norm": 0.0067303176037967205, + "learning_rate": 0.278239496813243, + "loss": 0.0492, + "num_input_tokens_seen": 11897600, + "step": 6945 + }, + { + "epoch": 33.73607748184019, + "grad_norm": 0.0012489552609622478, + "learning_rate": 0.27820893041650463, + "loss": 0.0153, + "num_input_tokens_seen": 11906144, + "step": 6950 + }, + { + "epoch": 33.76029055690073, + "grad_norm": 0.013817784376442432, + "learning_rate": 0.27817834424833804, + "loss": 0.0288, + "num_input_tokens_seen": 11914880, + "step": 6955 + }, + { + "epoch": 33.78450363196126, + "grad_norm": 0.0025416200514882803, + "learning_rate": 0.27814773831345996, + "loss": 0.0198, + "num_input_tokens_seen": 11923424, + "step": 6960 + }, + { + "epoch": 33.808716707021794, + "grad_norm": 0.0022307387553155422, + "learning_rate": 0.2781171126165902, + "loss": 0.0189, + "num_input_tokens_seen": 11931776, + "step": 6965 + }, + { + "epoch": 33.832929782082324, + "grad_norm": 0.005138874985277653, + "learning_rate": 0.2780864671624517, + "loss": 0.026, + "num_input_tokens_seen": 11940736, + "step": 6970 + }, + { + "epoch": 33.857142857142854, + "grad_norm": 0.015625404193997383, + "learning_rate": 0.27805580195577034, + "loss": 0.0205, + "num_input_tokens_seen": 11949088, + "step": 6975 + }, + { + "epoch": 33.88135593220339, + "grad_norm": 0.005405431613326073, + "learning_rate": 0.2780251170012751, + "loss": 0.0357, + "num_input_tokens_seen": 11957536, + "step": 6980 + }, + { + "epoch": 33.90556900726392, + "grad_norm": 0.011693950742483139, + "learning_rate": 0.27799441230369787, + "loss": 0.0391, + "num_input_tokens_seen": 11966048, + "step": 6985 + }, + { + "epoch": 33.92978208232446, + "grad_norm": 0.012930922210216522, + "learning_rate": 0.27796368786777387, + "loss": 0.0596, + "num_input_tokens_seen": 11974816, + "step": 6990 + }, + { + "epoch": 33.95399515738499, + "grad_norm": 0.006200103089213371, + "learning_rate": 0.277932943698241, + "loss": 0.0116, + "num_input_tokens_seen": 11983424, + "step": 6995 + }, + { + "epoch": 33.97820823244552, + "grad_norm": 0.012861770577728748, + "learning_rate": 0.2779021797998406, + "loss": 0.0241, + "num_input_tokens_seen": 11992032, + "step": 7000 + }, + { + "epoch": 33.97820823244552, + "eval_loss": 0.4164133369922638, + "eval_runtime": 4.6282, + "eval_samples_per_second": 79.296, + "eval_steps_per_second": 19.878, + "num_input_tokens_seen": 11992032, + "step": 7000 + }, + { + "epoch": 34.00484261501211, + "grad_norm": 0.0012920749140903354, + "learning_rate": 0.2778713961773167, + "loss": 0.0093, + "num_input_tokens_seen": 12001024, + "step": 7005 + }, + { + "epoch": 34.02905569007264, + "grad_norm": 0.00118659483268857, + "learning_rate": 0.2778405928354166, + "loss": 0.034, + "num_input_tokens_seen": 12009728, + "step": 7010 + }, + { + "epoch": 34.05326876513317, + "grad_norm": 0.0013759395806118846, + "learning_rate": 0.27780976977889055, + "loss": 0.0109, + "num_input_tokens_seen": 12018144, + "step": 7015 + }, + { + "epoch": 34.077481840193705, + "grad_norm": 0.0004460505733732134, + "learning_rate": 0.27777892701249185, + "loss": 0.0104, + "num_input_tokens_seen": 12026816, + "step": 7020 + }, + { + "epoch": 34.101694915254235, + "grad_norm": 0.00291324220597744, + "learning_rate": 0.2777480645409768, + "loss": 0.0272, + "num_input_tokens_seen": 12035360, + "step": 7025 + }, + { + "epoch": 34.12590799031477, + "grad_norm": 0.0007353037944994867, + "learning_rate": 0.27771718236910486, + "loss": 0.0191, + "num_input_tokens_seen": 12043808, + "step": 7030 + }, + { + "epoch": 34.1501210653753, + "grad_norm": 0.0014788014814257622, + "learning_rate": 0.27768628050163835, + "loss": 0.0061, + "num_input_tokens_seen": 12051968, + "step": 7035 + }, + { + "epoch": 34.17433414043583, + "grad_norm": 0.0009302247199229896, + "learning_rate": 0.2776553589433428, + "loss": 0.0139, + "num_input_tokens_seen": 12060256, + "step": 7040 + }, + { + "epoch": 34.19854721549637, + "grad_norm": 0.0020551036577671766, + "learning_rate": 0.27762441769898666, + "loss": 0.0058, + "num_input_tokens_seen": 12068768, + "step": 7045 + }, + { + "epoch": 34.2227602905569, + "grad_norm": 0.001494893105700612, + "learning_rate": 0.2775934567733415, + "loss": 0.0148, + "num_input_tokens_seen": 12077376, + "step": 7050 + }, + { + "epoch": 34.246973365617436, + "grad_norm": 0.007001136429607868, + "learning_rate": 0.2775624761711819, + "loss": 0.0223, + "num_input_tokens_seen": 12086240, + "step": 7055 + }, + { + "epoch": 34.271186440677965, + "grad_norm": 0.0005588053609244525, + "learning_rate": 0.2775314758972854, + "loss": 0.0118, + "num_input_tokens_seen": 12095008, + "step": 7060 + }, + { + "epoch": 34.295399515738495, + "grad_norm": 0.006005032453685999, + "learning_rate": 0.2775004559564327, + "loss": 0.0091, + "num_input_tokens_seen": 12103552, + "step": 7065 + }, + { + "epoch": 34.31961259079903, + "grad_norm": 0.0003263730031903833, + "learning_rate": 0.2774694163534073, + "loss": 0.0054, + "num_input_tokens_seen": 12111936, + "step": 7070 + }, + { + "epoch": 34.34382566585956, + "grad_norm": 0.007144091185182333, + "learning_rate": 0.27743835709299614, + "loss": 0.0405, + "num_input_tokens_seen": 12120608, + "step": 7075 + }, + { + "epoch": 34.3680387409201, + "grad_norm": 0.01457324530929327, + "learning_rate": 0.2774072781799888, + "loss": 0.0285, + "num_input_tokens_seen": 12129344, + "step": 7080 + }, + { + "epoch": 34.39225181598063, + "grad_norm": 0.0024973175022751093, + "learning_rate": 0.27737617961917804, + "loss": 0.0154, + "num_input_tokens_seen": 12137664, + "step": 7085 + }, + { + "epoch": 34.416464891041166, + "grad_norm": 0.0005977308028377593, + "learning_rate": 0.27734506141535964, + "loss": 0.0162, + "num_input_tokens_seen": 12146080, + "step": 7090 + }, + { + "epoch": 34.440677966101696, + "grad_norm": 0.007875269278883934, + "learning_rate": 0.2773139235733325, + "loss": 0.0182, + "num_input_tokens_seen": 12154432, + "step": 7095 + }, + { + "epoch": 34.464891041162225, + "grad_norm": 0.010828551836311817, + "learning_rate": 0.2772827660978984, + "loss": 0.0559, + "num_input_tokens_seen": 12162848, + "step": 7100 + }, + { + "epoch": 34.48910411622276, + "grad_norm": 0.00898829661309719, + "learning_rate": 0.27725158899386226, + "loss": 0.0197, + "num_input_tokens_seen": 12171296, + "step": 7105 + }, + { + "epoch": 34.51331719128329, + "grad_norm": 0.0015658024931326509, + "learning_rate": 0.27722039226603196, + "loss": 0.008, + "num_input_tokens_seen": 12179904, + "step": 7110 + }, + { + "epoch": 34.53753026634383, + "grad_norm": 0.002098899334669113, + "learning_rate": 0.2771891759192184, + "loss": 0.0274, + "num_input_tokens_seen": 12188768, + "step": 7115 + }, + { + "epoch": 34.56174334140436, + "grad_norm": 0.0026039539370685816, + "learning_rate": 0.2771579399582355, + "loss": 0.0374, + "num_input_tokens_seen": 12197344, + "step": 7120 + }, + { + "epoch": 34.58595641646489, + "grad_norm": 0.010788487270474434, + "learning_rate": 0.2771266843879004, + "loss": 0.0167, + "num_input_tokens_seen": 12205824, + "step": 7125 + }, + { + "epoch": 34.610169491525426, + "grad_norm": 0.005104257259517908, + "learning_rate": 0.2770954092130329, + "loss": 0.0145, + "num_input_tokens_seen": 12214368, + "step": 7130 + }, + { + "epoch": 34.634382566585955, + "grad_norm": 0.007483882363885641, + "learning_rate": 0.27706411443845613, + "loss": 0.0345, + "num_input_tokens_seen": 12223072, + "step": 7135 + }, + { + "epoch": 34.65859564164649, + "grad_norm": 0.004953610245138407, + "learning_rate": 0.27703280006899617, + "loss": 0.0262, + "num_input_tokens_seen": 12231520, + "step": 7140 + }, + { + "epoch": 34.68280871670702, + "grad_norm": 0.008286009542644024, + "learning_rate": 0.277001466109482, + "loss": 0.026, + "num_input_tokens_seen": 12240320, + "step": 7145 + }, + { + "epoch": 34.70702179176755, + "grad_norm": 0.003099602647125721, + "learning_rate": 0.2769701125647458, + "loss": 0.0125, + "num_input_tokens_seen": 12249152, + "step": 7150 + }, + { + "epoch": 34.73123486682809, + "grad_norm": 0.004026666283607483, + "learning_rate": 0.27693873943962266, + "loss": 0.0114, + "num_input_tokens_seen": 12257664, + "step": 7155 + }, + { + "epoch": 34.75544794188862, + "grad_norm": 0.011009610258042812, + "learning_rate": 0.2769073467389506, + "loss": 0.0274, + "num_input_tokens_seen": 12266496, + "step": 7160 + }, + { + "epoch": 34.779661016949156, + "grad_norm": 0.0038768472149968147, + "learning_rate": 0.2768759344675709, + "loss": 0.0352, + "num_input_tokens_seen": 12274976, + "step": 7165 + }, + { + "epoch": 34.803874092009686, + "grad_norm": 0.01257918868213892, + "learning_rate": 0.27684450263032767, + "loss": 0.0712, + "num_input_tokens_seen": 12283520, + "step": 7170 + }, + { + "epoch": 34.828087167070215, + "grad_norm": 0.0050577446818351746, + "learning_rate": 0.2768130512320682, + "loss": 0.0317, + "num_input_tokens_seen": 12291904, + "step": 7175 + }, + { + "epoch": 34.85230024213075, + "grad_norm": 0.006926960777491331, + "learning_rate": 0.27678158027764244, + "loss": 0.0489, + "num_input_tokens_seen": 12300960, + "step": 7180 + }, + { + "epoch": 34.87651331719128, + "grad_norm": 0.014857172966003418, + "learning_rate": 0.27675008977190385, + "loss": 0.047, + "num_input_tokens_seen": 12309504, + "step": 7185 + }, + { + "epoch": 34.90072639225182, + "grad_norm": 0.006916518323123455, + "learning_rate": 0.2767185797197086, + "loss": 0.0383, + "num_input_tokens_seen": 12318016, + "step": 7190 + }, + { + "epoch": 34.92493946731235, + "grad_norm": 0.0029140878468751907, + "learning_rate": 0.2766870501259159, + "loss": 0.0295, + "num_input_tokens_seen": 12326176, + "step": 7195 + }, + { + "epoch": 34.94915254237288, + "grad_norm": 0.002536057261750102, + "learning_rate": 0.276655500995388, + "loss": 0.023, + "num_input_tokens_seen": 12334784, + "step": 7200 + }, + { + "epoch": 34.94915254237288, + "eval_loss": 0.3967973589897156, + "eval_runtime": 4.6325, + "eval_samples_per_second": 79.224, + "eval_steps_per_second": 19.86, + "num_input_tokens_seen": 12334784, + "step": 7200 + }, + { + "epoch": 34.973365617433416, + "grad_norm": 0.008086508139967918, + "learning_rate": 0.27662393233299015, + "loss": 0.052, + "num_input_tokens_seen": 12343360, + "step": 7205 + }, + { + "epoch": 34.997578692493946, + "grad_norm": 0.00855178851634264, + "learning_rate": 0.27659234414359074, + "loss": 0.0276, + "num_input_tokens_seen": 12351968, + "step": 7210 + }, + { + "epoch": 35.02421307506053, + "grad_norm": 0.0049479990266263485, + "learning_rate": 0.27656073643206097, + "loss": 0.0138, + "num_input_tokens_seen": 12360864, + "step": 7215 + }, + { + "epoch": 35.04842615012107, + "grad_norm": 0.0019293311052024364, + "learning_rate": 0.27652910920327517, + "loss": 0.0082, + "num_input_tokens_seen": 12369248, + "step": 7220 + }, + { + "epoch": 35.0726392251816, + "grad_norm": 0.005958365276455879, + "learning_rate": 0.2764974624621107, + "loss": 0.0238, + "num_input_tokens_seen": 12377824, + "step": 7225 + }, + { + "epoch": 35.09685230024213, + "grad_norm": 0.0019172881729900837, + "learning_rate": 0.2764657962134479, + "loss": 0.0294, + "num_input_tokens_seen": 12386432, + "step": 7230 + }, + { + "epoch": 35.12106537530266, + "grad_norm": 0.0038172975182533264, + "learning_rate": 0.27643411046217, + "loss": 0.0184, + "num_input_tokens_seen": 12395072, + "step": 7235 + }, + { + "epoch": 35.14527845036319, + "grad_norm": 0.008554153144359589, + "learning_rate": 0.27640240521316334, + "loss": 0.016, + "num_input_tokens_seen": 12403424, + "step": 7240 + }, + { + "epoch": 35.16949152542373, + "grad_norm": 0.004251268692314625, + "learning_rate": 0.2763706804713174, + "loss": 0.0171, + "num_input_tokens_seen": 12412032, + "step": 7245 + }, + { + "epoch": 35.19370460048426, + "grad_norm": 0.001027761842124164, + "learning_rate": 0.2763389362415245, + "loss": 0.0158, + "num_input_tokens_seen": 12420640, + "step": 7250 + }, + { + "epoch": 35.2179176755448, + "grad_norm": 0.0033850492909550667, + "learning_rate": 0.27630717252867987, + "loss": 0.0162, + "num_input_tokens_seen": 12429088, + "step": 7255 + }, + { + "epoch": 35.24213075060533, + "grad_norm": 0.0010146595304831862, + "learning_rate": 0.276275389337682, + "loss": 0.0294, + "num_input_tokens_seen": 12437344, + "step": 7260 + }, + { + "epoch": 35.26634382566586, + "grad_norm": 0.00843824166804552, + "learning_rate": 0.2762435866734322, + "loss": 0.0305, + "num_input_tokens_seen": 12445376, + "step": 7265 + }, + { + "epoch": 35.29055690072639, + "grad_norm": 0.0010103249223902822, + "learning_rate": 0.27621176454083485, + "loss": 0.0144, + "num_input_tokens_seen": 12453760, + "step": 7270 + }, + { + "epoch": 35.31476997578692, + "grad_norm": 0.0005178165738470852, + "learning_rate": 0.2761799229447973, + "loss": 0.0098, + "num_input_tokens_seen": 12462720, + "step": 7275 + }, + { + "epoch": 35.33898305084746, + "grad_norm": 0.0008738053729757667, + "learning_rate": 0.27614806189023006, + "loss": 0.024, + "num_input_tokens_seen": 12471136, + "step": 7280 + }, + { + "epoch": 35.36319612590799, + "grad_norm": 0.007358910981565714, + "learning_rate": 0.27611618138204636, + "loss": 0.0202, + "num_input_tokens_seen": 12479424, + "step": 7285 + }, + { + "epoch": 35.38740920096852, + "grad_norm": 0.0010342866880819201, + "learning_rate": 0.2760842814251626, + "loss": 0.0147, + "num_input_tokens_seen": 12487968, + "step": 7290 + }, + { + "epoch": 35.41162227602906, + "grad_norm": 0.0067621138878166676, + "learning_rate": 0.2760523620244982, + "loss": 0.0127, + "num_input_tokens_seen": 12496544, + "step": 7295 + }, + { + "epoch": 35.43583535108959, + "grad_norm": 0.0015151489060372114, + "learning_rate": 0.27602042318497544, + "loss": 0.0236, + "num_input_tokens_seen": 12505184, + "step": 7300 + }, + { + "epoch": 35.460048426150124, + "grad_norm": 0.005006266292184591, + "learning_rate": 0.2759884649115198, + "loss": 0.0105, + "num_input_tokens_seen": 12513792, + "step": 7305 + }, + { + "epoch": 35.48426150121065, + "grad_norm": 0.007570985238999128, + "learning_rate": 0.2759564872090596, + "loss": 0.0339, + "num_input_tokens_seen": 12522400, + "step": 7310 + }, + { + "epoch": 35.50847457627118, + "grad_norm": 0.006613556295633316, + "learning_rate": 0.2759244900825262, + "loss": 0.0317, + "num_input_tokens_seen": 12531136, + "step": 7315 + }, + { + "epoch": 35.53268765133172, + "grad_norm": 0.004600088577717543, + "learning_rate": 0.2758924735368539, + "loss": 0.0254, + "num_input_tokens_seen": 12539488, + "step": 7320 + }, + { + "epoch": 35.55690072639225, + "grad_norm": 0.0009460410219617188, + "learning_rate": 0.27586043757698014, + "loss": 0.0142, + "num_input_tokens_seen": 12548160, + "step": 7325 + }, + { + "epoch": 35.58111380145279, + "grad_norm": 0.006980703212320805, + "learning_rate": 0.27582838220784534, + "loss": 0.0351, + "num_input_tokens_seen": 12556928, + "step": 7330 + }, + { + "epoch": 35.60532687651332, + "grad_norm": 0.004067316185683012, + "learning_rate": 0.27579630743439265, + "loss": 0.0637, + "num_input_tokens_seen": 12565696, + "step": 7335 + }, + { + "epoch": 35.62953995157385, + "grad_norm": 0.00933065451681614, + "learning_rate": 0.2757642132615686, + "loss": 0.0266, + "num_input_tokens_seen": 12574176, + "step": 7340 + }, + { + "epoch": 35.653753026634384, + "grad_norm": 0.002594403689727187, + "learning_rate": 0.2757320996943223, + "loss": 0.0232, + "num_input_tokens_seen": 12582848, + "step": 7345 + }, + { + "epoch": 35.67796610169491, + "grad_norm": 0.00046032803948037326, + "learning_rate": 0.2756999667376062, + "loss": 0.023, + "num_input_tokens_seen": 12591808, + "step": 7350 + }, + { + "epoch": 35.70217917675545, + "grad_norm": 0.019596461206674576, + "learning_rate": 0.2756678143963756, + "loss": 0.0109, + "num_input_tokens_seen": 12600384, + "step": 7355 + }, + { + "epoch": 35.72639225181598, + "grad_norm": 0.002989195752888918, + "learning_rate": 0.2756356426755888, + "loss": 0.0202, + "num_input_tokens_seen": 12609024, + "step": 7360 + }, + { + "epoch": 35.75060532687651, + "grad_norm": 0.008946023881435394, + "learning_rate": 0.27560345158020705, + "loss": 0.015, + "num_input_tokens_seen": 12617888, + "step": 7365 + }, + { + "epoch": 35.77481840193705, + "grad_norm": 0.0071942745707929134, + "learning_rate": 0.27557124111519465, + "loss": 0.0389, + "num_input_tokens_seen": 12626688, + "step": 7370 + }, + { + "epoch": 35.79903147699758, + "grad_norm": 0.001646455959416926, + "learning_rate": 0.27553901128551883, + "loss": 0.039, + "num_input_tokens_seen": 12635200, + "step": 7375 + }, + { + "epoch": 35.823244552058114, + "grad_norm": 0.015560675412416458, + "learning_rate": 0.2755067620961498, + "loss": 0.0419, + "num_input_tokens_seen": 12643744, + "step": 7380 + }, + { + "epoch": 35.847457627118644, + "grad_norm": 0.012105708010494709, + "learning_rate": 0.27547449355206094, + "loss": 0.0514, + "num_input_tokens_seen": 12652384, + "step": 7385 + }, + { + "epoch": 35.87167070217917, + "grad_norm": 0.01528442557901144, + "learning_rate": 0.2754422056582283, + "loss": 0.032, + "num_input_tokens_seen": 12660832, + "step": 7390 + }, + { + "epoch": 35.89588377723971, + "grad_norm": 0.001710566459223628, + "learning_rate": 0.27540989841963115, + "loss": 0.0542, + "num_input_tokens_seen": 12669440, + "step": 7395 + }, + { + "epoch": 35.92009685230024, + "grad_norm": 0.005525865126401186, + "learning_rate": 0.27537757184125167, + "loss": 0.0699, + "num_input_tokens_seen": 12677888, + "step": 7400 + }, + { + "epoch": 35.92009685230024, + "eval_loss": 0.3399965465068817, + "eval_runtime": 4.6191, + "eval_samples_per_second": 79.452, + "eval_steps_per_second": 19.917, + "num_input_tokens_seen": 12677888, + "step": 7400 + }, + { + "epoch": 35.94430992736078, + "grad_norm": 0.01309080608189106, + "learning_rate": 0.275345225928075, + "loss": 0.0456, + "num_input_tokens_seen": 12686432, + "step": 7405 + }, + { + "epoch": 35.96852300242131, + "grad_norm": 0.0007145829731598496, + "learning_rate": 0.2753128606850893, + "loss": 0.0557, + "num_input_tokens_seen": 12694720, + "step": 7410 + }, + { + "epoch": 35.99273607748184, + "grad_norm": 0.006868042983114719, + "learning_rate": 0.2752804761172858, + "loss": 0.0386, + "num_input_tokens_seen": 12703232, + "step": 7415 + }, + { + "epoch": 36.01937046004843, + "grad_norm": 0.0005993112572468817, + "learning_rate": 0.27524807222965836, + "loss": 0.0332, + "num_input_tokens_seen": 12712320, + "step": 7420 + }, + { + "epoch": 36.04358353510896, + "grad_norm": 0.0008556530810892582, + "learning_rate": 0.27521564902720436, + "loss": 0.0198, + "num_input_tokens_seen": 12720832, + "step": 7425 + }, + { + "epoch": 36.067796610169495, + "grad_norm": 0.005906420294195414, + "learning_rate": 0.2751832065149236, + "loss": 0.0379, + "num_input_tokens_seen": 12729408, + "step": 7430 + }, + { + "epoch": 36.092009685230025, + "grad_norm": 0.009183371439576149, + "learning_rate": 0.2751507446978193, + "loss": 0.033, + "num_input_tokens_seen": 12737792, + "step": 7435 + }, + { + "epoch": 36.116222760290555, + "grad_norm": 0.006289254408329725, + "learning_rate": 0.2751182635808974, + "loss": 0.0084, + "num_input_tokens_seen": 12746464, + "step": 7440 + }, + { + "epoch": 36.14043583535109, + "grad_norm": 0.0035440856590867043, + "learning_rate": 0.27508576316916694, + "loss": 0.0375, + "num_input_tokens_seen": 12755232, + "step": 7445 + }, + { + "epoch": 36.16464891041162, + "grad_norm": 0.00010838423622772098, + "learning_rate": 0.2750532434676399, + "loss": 0.0143, + "num_input_tokens_seen": 12763840, + "step": 7450 + }, + { + "epoch": 36.18886198547216, + "grad_norm": 0.00553701538592577, + "learning_rate": 0.27502070448133115, + "loss": 0.0161, + "num_input_tokens_seen": 12772768, + "step": 7455 + }, + { + "epoch": 36.21307506053269, + "grad_norm": 0.004667915403842926, + "learning_rate": 0.2749881462152587, + "loss": 0.017, + "num_input_tokens_seen": 12781152, + "step": 7460 + }, + { + "epoch": 36.23728813559322, + "grad_norm": 0.0017437082715332508, + "learning_rate": 0.2749555686744434, + "loss": 0.019, + "num_input_tokens_seen": 12789472, + "step": 7465 + }, + { + "epoch": 36.261501210653755, + "grad_norm": 0.008005854673683643, + "learning_rate": 0.2749229718639091, + "loss": 0.0139, + "num_input_tokens_seen": 12798176, + "step": 7470 + }, + { + "epoch": 36.285714285714285, + "grad_norm": 0.0036078274715691805, + "learning_rate": 0.27489035578868265, + "loss": 0.008, + "num_input_tokens_seen": 12806560, + "step": 7475 + }, + { + "epoch": 36.30992736077482, + "grad_norm": 0.010773731395602226, + "learning_rate": 0.2748577204537939, + "loss": 0.0257, + "num_input_tokens_seen": 12815264, + "step": 7480 + }, + { + "epoch": 36.33414043583535, + "grad_norm": 0.006848309654742479, + "learning_rate": 0.2748250658642756, + "loss": 0.0081, + "num_input_tokens_seen": 12823808, + "step": 7485 + }, + { + "epoch": 36.35835351089588, + "grad_norm": 0.002475538058206439, + "learning_rate": 0.2747923920251634, + "loss": 0.005, + "num_input_tokens_seen": 12832288, + "step": 7490 + }, + { + "epoch": 36.38256658595642, + "grad_norm": 0.0043183318339288235, + "learning_rate": 0.27475969894149627, + "loss": 0.0142, + "num_input_tokens_seen": 12840960, + "step": 7495 + }, + { + "epoch": 36.40677966101695, + "grad_norm": 0.00039308948908001184, + "learning_rate": 0.2747269866183156, + "loss": 0.0159, + "num_input_tokens_seen": 12849408, + "step": 7500 + }, + { + "epoch": 36.430992736077485, + "grad_norm": 0.0029250504449009895, + "learning_rate": 0.27469425506066625, + "loss": 0.0217, + "num_input_tokens_seen": 12857696, + "step": 7505 + }, + { + "epoch": 36.455205811138015, + "grad_norm": 0.0015936467098072171, + "learning_rate": 0.27466150427359576, + "loss": 0.0081, + "num_input_tokens_seen": 12866592, + "step": 7510 + }, + { + "epoch": 36.479418886198545, + "grad_norm": 0.004634034354239702, + "learning_rate": 0.2746287342621547, + "loss": 0.0176, + "num_input_tokens_seen": 12875232, + "step": 7515 + }, + { + "epoch": 36.50363196125908, + "grad_norm": 0.0012507039355114102, + "learning_rate": 0.2745959450313966, + "loss": 0.0071, + "num_input_tokens_seen": 12883616, + "step": 7520 + }, + { + "epoch": 36.52784503631961, + "grad_norm": 0.0011314130388200283, + "learning_rate": 0.27456313658637804, + "loss": 0.0108, + "num_input_tokens_seen": 12892224, + "step": 7525 + }, + { + "epoch": 36.55205811138015, + "grad_norm": 0.0024544657208025455, + "learning_rate": 0.27453030893215846, + "loss": 0.0071, + "num_input_tokens_seen": 12900448, + "step": 7530 + }, + { + "epoch": 36.57627118644068, + "grad_norm": 0.00750658567994833, + "learning_rate": 0.2744974620738003, + "loss": 0.0109, + "num_input_tokens_seen": 12909312, + "step": 7535 + }, + { + "epoch": 36.60048426150121, + "grad_norm": 0.007340850308537483, + "learning_rate": 0.27446459601636897, + "loss": 0.0131, + "num_input_tokens_seen": 12917824, + "step": 7540 + }, + { + "epoch": 36.624697336561745, + "grad_norm": 0.0010847916128113866, + "learning_rate": 0.2744317107649328, + "loss": 0.0324, + "num_input_tokens_seen": 12926208, + "step": 7545 + }, + { + "epoch": 36.648910411622275, + "grad_norm": 0.0008046952425502241, + "learning_rate": 0.2743988063245631, + "loss": 0.0218, + "num_input_tokens_seen": 12934944, + "step": 7550 + }, + { + "epoch": 36.67312348668281, + "grad_norm": 0.0006884734029881656, + "learning_rate": 0.2743658827003342, + "loss": 0.0296, + "num_input_tokens_seen": 12943616, + "step": 7555 + }, + { + "epoch": 36.69733656174334, + "grad_norm": 0.003780453698709607, + "learning_rate": 0.27433293989732327, + "loss": 0.0185, + "num_input_tokens_seen": 12952128, + "step": 7560 + }, + { + "epoch": 36.72154963680387, + "grad_norm": 0.001256827381439507, + "learning_rate": 0.27429997792061056, + "loss": 0.0293, + "num_input_tokens_seen": 12960512, + "step": 7565 + }, + { + "epoch": 36.74576271186441, + "grad_norm": 0.010509653948247433, + "learning_rate": 0.27426699677527927, + "loss": 0.0263, + "num_input_tokens_seen": 12969120, + "step": 7570 + }, + { + "epoch": 36.76997578692494, + "grad_norm": 0.0024534377735108137, + "learning_rate": 0.2742339964664154, + "loss": 0.0608, + "num_input_tokens_seen": 12977696, + "step": 7575 + }, + { + "epoch": 36.794188861985475, + "grad_norm": 0.0037732133641839027, + "learning_rate": 0.274200976999108, + "loss": 0.0275, + "num_input_tokens_seen": 12986144, + "step": 7580 + }, + { + "epoch": 36.818401937046005, + "grad_norm": 0.00334400893189013, + "learning_rate": 0.27416793837844916, + "loss": 0.02, + "num_input_tokens_seen": 12994560, + "step": 7585 + }, + { + "epoch": 36.842615012106535, + "grad_norm": 0.0116573516279459, + "learning_rate": 0.27413488060953384, + "loss": 0.0322, + "num_input_tokens_seen": 13003200, + "step": 7590 + }, + { + "epoch": 36.86682808716707, + "grad_norm": 0.004680284298956394, + "learning_rate": 0.27410180369745996, + "loss": 0.0342, + "num_input_tokens_seen": 13012224, + "step": 7595 + }, + { + "epoch": 36.8910411622276, + "grad_norm": 0.0026611285284161568, + "learning_rate": 0.27406870764732844, + "loss": 0.0311, + "num_input_tokens_seen": 13020640, + "step": 7600 + }, + { + "epoch": 36.8910411622276, + "eval_loss": 0.3644829988479614, + "eval_runtime": 4.6148, + "eval_samples_per_second": 79.526, + "eval_steps_per_second": 19.936, + "num_input_tokens_seen": 13020640, + "step": 7600 + }, + { + "epoch": 36.91525423728814, + "grad_norm": 0.0031257886439561844, + "learning_rate": 0.27403559246424297, + "loss": 0.0342, + "num_input_tokens_seen": 13029472, + "step": 7605 + }, + { + "epoch": 36.93946731234867, + "grad_norm": 0.003241767408326268, + "learning_rate": 0.2740024581533105, + "loss": 0.0372, + "num_input_tokens_seen": 13038208, + "step": 7610 + }, + { + "epoch": 36.9636803874092, + "grad_norm": 0.0025535982567816973, + "learning_rate": 0.2739693047196406, + "loss": 0.0171, + "num_input_tokens_seen": 13047232, + "step": 7615 + }, + { + "epoch": 36.987893462469735, + "grad_norm": 0.008665882982313633, + "learning_rate": 0.27393613216834606, + "loss": 0.0748, + "num_input_tokens_seen": 13055904, + "step": 7620 + }, + { + "epoch": 37.01452784503632, + "grad_norm": 0.002838360145688057, + "learning_rate": 0.2739029405045424, + "loss": 0.0198, + "num_input_tokens_seen": 13065088, + "step": 7625 + }, + { + "epoch": 37.03874092009685, + "grad_norm": 0.007940150797367096, + "learning_rate": 0.2738697297333483, + "loss": 0.0333, + "num_input_tokens_seen": 13073568, + "step": 7630 + }, + { + "epoch": 37.062953995157386, + "grad_norm": 0.0040554688312113285, + "learning_rate": 0.2738364998598852, + "loss": 0.0101, + "num_input_tokens_seen": 13082048, + "step": 7635 + }, + { + "epoch": 37.087167070217916, + "grad_norm": 0.005932907573878765, + "learning_rate": 0.27380325088927765, + "loss": 0.0199, + "num_input_tokens_seen": 13090752, + "step": 7640 + }, + { + "epoch": 37.11138014527845, + "grad_norm": 0.0006148771499283612, + "learning_rate": 0.27376998282665294, + "loss": 0.0053, + "num_input_tokens_seen": 13099488, + "step": 7645 + }, + { + "epoch": 37.13559322033898, + "grad_norm": 0.007317365612834692, + "learning_rate": 0.27373669567714154, + "loss": 0.0141, + "num_input_tokens_seen": 13107776, + "step": 7650 + }, + { + "epoch": 37.15980629539951, + "grad_norm": 0.0007680591661483049, + "learning_rate": 0.27370338944587663, + "loss": 0.0045, + "num_input_tokens_seen": 13116416, + "step": 7655 + }, + { + "epoch": 37.18401937046005, + "grad_norm": 0.0002696675655897707, + "learning_rate": 0.27367006413799455, + "loss": 0.0339, + "num_input_tokens_seen": 13124928, + "step": 7660 + }, + { + "epoch": 37.20823244552058, + "grad_norm": 0.00046539687900803983, + "learning_rate": 0.2736367197586345, + "loss": 0.0042, + "num_input_tokens_seen": 13133824, + "step": 7665 + }, + { + "epoch": 37.232445520581116, + "grad_norm": 0.0039034485816955566, + "learning_rate": 0.2736033563129385, + "loss": 0.0115, + "num_input_tokens_seen": 13142208, + "step": 7670 + }, + { + "epoch": 37.256658595641646, + "grad_norm": 0.0007160262321121991, + "learning_rate": 0.27356997380605164, + "loss": 0.0147, + "num_input_tokens_seen": 13150944, + "step": 7675 + }, + { + "epoch": 37.280871670702176, + "grad_norm": 0.0017772256396710873, + "learning_rate": 0.27353657224312194, + "loss": 0.0106, + "num_input_tokens_seen": 13159424, + "step": 7680 + }, + { + "epoch": 37.30508474576271, + "grad_norm": 6.45149702904746e-05, + "learning_rate": 0.2735031516293004, + "loss": 0.0072, + "num_input_tokens_seen": 13168160, + "step": 7685 + }, + { + "epoch": 37.32929782082324, + "grad_norm": 0.0027531853411346674, + "learning_rate": 0.2734697119697408, + "loss": 0.0035, + "num_input_tokens_seen": 13176224, + "step": 7690 + }, + { + "epoch": 37.35351089588378, + "grad_norm": 0.0044101751409471035, + "learning_rate": 0.27343625326959997, + "loss": 0.0061, + "num_input_tokens_seen": 13184736, + "step": 7695 + }, + { + "epoch": 37.37772397094431, + "grad_norm": 0.0019159673247486353, + "learning_rate": 0.27340277553403775, + "loss": 0.0087, + "num_input_tokens_seen": 13193248, + "step": 7700 + }, + { + "epoch": 37.40193704600484, + "grad_norm": 0.0007560692611150444, + "learning_rate": 0.2733692787682167, + "loss": 0.002, + "num_input_tokens_seen": 13201856, + "step": 7705 + }, + { + "epoch": 37.426150121065376, + "grad_norm": 0.0005141404690220952, + "learning_rate": 0.27333576297730255, + "loss": 0.0069, + "num_input_tokens_seen": 13210656, + "step": 7710 + }, + { + "epoch": 37.450363196125906, + "grad_norm": 3.257780190324411e-05, + "learning_rate": 0.2733022281664638, + "loss": 0.0028, + "num_input_tokens_seen": 13219136, + "step": 7715 + }, + { + "epoch": 37.47457627118644, + "grad_norm": 9.636359027354047e-05, + "learning_rate": 0.273268674340872, + "loss": 0.0083, + "num_input_tokens_seen": 13227200, + "step": 7720 + }, + { + "epoch": 37.49878934624697, + "grad_norm": 0.00014656812709290534, + "learning_rate": 0.27323510150570146, + "loss": 0.0108, + "num_input_tokens_seen": 13235744, + "step": 7725 + }, + { + "epoch": 37.5230024213075, + "grad_norm": 0.0022798471618443727, + "learning_rate": 0.27320150966612966, + "loss": 0.0078, + "num_input_tokens_seen": 13243968, + "step": 7730 + }, + { + "epoch": 37.54721549636804, + "grad_norm": 0.008884389884769917, + "learning_rate": 0.2731678988273368, + "loss": 0.0088, + "num_input_tokens_seen": 13252576, + "step": 7735 + }, + { + "epoch": 37.57142857142857, + "grad_norm": 0.0031912773847579956, + "learning_rate": 0.27313426899450605, + "loss": 0.0123, + "num_input_tokens_seen": 13261120, + "step": 7740 + }, + { + "epoch": 37.595641646489106, + "grad_norm": 0.0021312234457582235, + "learning_rate": 0.27310062017282366, + "loss": 0.0078, + "num_input_tokens_seen": 13269568, + "step": 7745 + }, + { + "epoch": 37.619854721549636, + "grad_norm": 2.864149610104505e-05, + "learning_rate": 0.2730669523674787, + "loss": 0.0133, + "num_input_tokens_seen": 13278208, + "step": 7750 + }, + { + "epoch": 37.644067796610166, + "grad_norm": 0.002685694256797433, + "learning_rate": 0.2730332655836631, + "loss": 0.0083, + "num_input_tokens_seen": 13286688, + "step": 7755 + }, + { + "epoch": 37.6682808716707, + "grad_norm": 0.00014247478975448757, + "learning_rate": 0.2729995598265718, + "loss": 0.0098, + "num_input_tokens_seen": 13295392, + "step": 7760 + }, + { + "epoch": 37.69249394673123, + "grad_norm": 0.00342165632173419, + "learning_rate": 0.2729658351014027, + "loss": 0.0074, + "num_input_tokens_seen": 13303904, + "step": 7765 + }, + { + "epoch": 37.71670702179177, + "grad_norm": 0.0030923387967050076, + "learning_rate": 0.27293209141335656, + "loss": 0.0129, + "num_input_tokens_seen": 13312608, + "step": 7770 + }, + { + "epoch": 37.7409200968523, + "grad_norm": 0.0008679302991367877, + "learning_rate": 0.27289832876763703, + "loss": 0.0112, + "num_input_tokens_seen": 13321056, + "step": 7775 + }, + { + "epoch": 37.76513317191284, + "grad_norm": 0.0030651309061795473, + "learning_rate": 0.27286454716945074, + "loss": 0.0077, + "num_input_tokens_seen": 13329504, + "step": 7780 + }, + { + "epoch": 37.789346246973366, + "grad_norm": 0.00264027900993824, + "learning_rate": 0.27283074662400725, + "loss": 0.0026, + "num_input_tokens_seen": 13338144, + "step": 7785 + }, + { + "epoch": 37.813559322033896, + "grad_norm": 0.0019714355003088713, + "learning_rate": 0.2727969271365191, + "loss": 0.0042, + "num_input_tokens_seen": 13346656, + "step": 7790 + }, + { + "epoch": 37.83777239709443, + "grad_norm": 0.0019998138304799795, + "learning_rate": 0.2727630887122016, + "loss": 0.0243, + "num_input_tokens_seen": 13355072, + "step": 7795 + }, + { + "epoch": 37.86198547215496, + "grad_norm": 0.00464823329821229, + "learning_rate": 0.27272923135627314, + "loss": 0.0095, + "num_input_tokens_seen": 13363648, + "step": 7800 + }, + { + "epoch": 37.86198547215496, + "eval_loss": 0.4563427269458771, + "eval_runtime": 4.6302, + "eval_samples_per_second": 79.263, + "eval_steps_per_second": 19.87, + "num_input_tokens_seen": 13363648, + "step": 7800 + }, + { + "epoch": 37.8861985472155, + "grad_norm": 0.0012450923677533865, + "learning_rate": 0.2726953550739548, + "loss": 0.0092, + "num_input_tokens_seen": 13372416, + "step": 7805 + }, + { + "epoch": 37.91041162227603, + "grad_norm": 0.0023699202574789524, + "learning_rate": 0.27266145987047086, + "loss": 0.0089, + "num_input_tokens_seen": 13381376, + "step": 7810 + }, + { + "epoch": 37.93462469733656, + "grad_norm": 0.00010363931505708024, + "learning_rate": 0.27262754575104836, + "loss": 0.0073, + "num_input_tokens_seen": 13389792, + "step": 7815 + }, + { + "epoch": 37.958837772397096, + "grad_norm": 0.006974071729928255, + "learning_rate": 0.27259361272091726, + "loss": 0.0207, + "num_input_tokens_seen": 13397984, + "step": 7820 + }, + { + "epoch": 37.983050847457626, + "grad_norm": 0.00030344820697791874, + "learning_rate": 0.27255966078531046, + "loss": 0.0078, + "num_input_tokens_seen": 13406848, + "step": 7825 + }, + { + "epoch": 38.00968523002421, + "grad_norm": 0.00016283727018162608, + "learning_rate": 0.2725256899494638, + "loss": 0.0035, + "num_input_tokens_seen": 13415616, + "step": 7830 + }, + { + "epoch": 38.03389830508475, + "grad_norm": 0.0003317389346193522, + "learning_rate": 0.272491700218616, + "loss": 0.003, + "num_input_tokens_seen": 13424192, + "step": 7835 + }, + { + "epoch": 38.05811138014528, + "grad_norm": 0.009904574602842331, + "learning_rate": 0.27245769159800876, + "loss": 0.0344, + "num_input_tokens_seen": 13432768, + "step": 7840 + }, + { + "epoch": 38.082324455205814, + "grad_norm": 0.00849754549562931, + "learning_rate": 0.2724236640928865, + "loss": 0.03, + "num_input_tokens_seen": 13441568, + "step": 7845 + }, + { + "epoch": 38.106537530266344, + "grad_norm": 0.0002489784674253315, + "learning_rate": 0.27238961770849673, + "loss": 0.0035, + "num_input_tokens_seen": 13449984, + "step": 7850 + }, + { + "epoch": 38.130750605326874, + "grad_norm": 0.004688160959631205, + "learning_rate": 0.27235555245008997, + "loss": 0.0107, + "num_input_tokens_seen": 13458368, + "step": 7855 + }, + { + "epoch": 38.15496368038741, + "grad_norm": 0.00016273840446956456, + "learning_rate": 0.2723214683229193, + "loss": 0.0032, + "num_input_tokens_seen": 13466464, + "step": 7860 + }, + { + "epoch": 38.17917675544794, + "grad_norm": 0.0007720162393525243, + "learning_rate": 0.27228736533224107, + "loss": 0.0088, + "num_input_tokens_seen": 13475104, + "step": 7865 + }, + { + "epoch": 38.20338983050848, + "grad_norm": 0.0003214380703866482, + "learning_rate": 0.27225324348331437, + "loss": 0.0066, + "num_input_tokens_seen": 13483680, + "step": 7870 + }, + { + "epoch": 38.22760290556901, + "grad_norm": 0.012900297529995441, + "learning_rate": 0.27221910278140116, + "loss": 0.0189, + "num_input_tokens_seen": 13492384, + "step": 7875 + }, + { + "epoch": 38.25181598062954, + "grad_norm": 0.00029766993247903883, + "learning_rate": 0.2721849432317664, + "loss": 0.0115, + "num_input_tokens_seen": 13501056, + "step": 7880 + }, + { + "epoch": 38.276029055690074, + "grad_norm": 0.000555931415874511, + "learning_rate": 0.2721507648396779, + "loss": 0.0023, + "num_input_tokens_seen": 13509376, + "step": 7885 + }, + { + "epoch": 38.300242130750604, + "grad_norm": 0.0021169669926166534, + "learning_rate": 0.27211656761040653, + "loss": 0.0021, + "num_input_tokens_seen": 13517728, + "step": 7890 + }, + { + "epoch": 38.32445520581114, + "grad_norm": 0.006732437759637833, + "learning_rate": 0.2720823515492257, + "loss": 0.0112, + "num_input_tokens_seen": 13526112, + "step": 7895 + }, + { + "epoch": 38.34866828087167, + "grad_norm": 7.467430987162516e-05, + "learning_rate": 0.27204811666141215, + "loss": 0.0078, + "num_input_tokens_seen": 13534624, + "step": 7900 + }, + { + "epoch": 38.3728813559322, + "grad_norm": 0.0006743100821040571, + "learning_rate": 0.2720138629522452, + "loss": 0.0056, + "num_input_tokens_seen": 13543584, + "step": 7905 + }, + { + "epoch": 38.39709443099274, + "grad_norm": 0.00011220723536098376, + "learning_rate": 0.2719795904270073, + "loss": 0.0014, + "num_input_tokens_seen": 13552128, + "step": 7910 + }, + { + "epoch": 38.42130750605327, + "grad_norm": 0.008503460325300694, + "learning_rate": 0.2719452990909837, + "loss": 0.0238, + "num_input_tokens_seen": 13560800, + "step": 7915 + }, + { + "epoch": 38.445520581113804, + "grad_norm": 0.0006852109218016267, + "learning_rate": 0.2719109889494625, + "loss": 0.0102, + "num_input_tokens_seen": 13569184, + "step": 7920 + }, + { + "epoch": 38.469733656174334, + "grad_norm": 0.005665584001690149, + "learning_rate": 0.27187666000773475, + "loss": 0.0252, + "num_input_tokens_seen": 13577664, + "step": 7925 + }, + { + "epoch": 38.493946731234864, + "grad_norm": 0.002790414495393634, + "learning_rate": 0.2718423122710944, + "loss": 0.0092, + "num_input_tokens_seen": 13586560, + "step": 7930 + }, + { + "epoch": 38.5181598062954, + "grad_norm": 0.003995677921921015, + "learning_rate": 0.2718079457448384, + "loss": 0.0054, + "num_input_tokens_seen": 13595392, + "step": 7935 + }, + { + "epoch": 38.54237288135593, + "grad_norm": 0.005079672206193209, + "learning_rate": 0.27177356043426637, + "loss": 0.0103, + "num_input_tokens_seen": 13603840, + "step": 7940 + }, + { + "epoch": 38.56658595641647, + "grad_norm": 0.0017063479172065854, + "learning_rate": 0.27173915634468104, + "loss": 0.0136, + "num_input_tokens_seen": 13612384, + "step": 7945 + }, + { + "epoch": 38.590799031477, + "grad_norm": 0.0027299344073981047, + "learning_rate": 0.27170473348138796, + "loss": 0.0084, + "num_input_tokens_seen": 13620736, + "step": 7950 + }, + { + "epoch": 38.61501210653753, + "grad_norm": 0.0011734231375157833, + "learning_rate": 0.27167029184969554, + "loss": 0.0092, + "num_input_tokens_seen": 13629152, + "step": 7955 + }, + { + "epoch": 38.639225181598064, + "grad_norm": 0.004888922907412052, + "learning_rate": 0.27163583145491504, + "loss": 0.0141, + "num_input_tokens_seen": 13637632, + "step": 7960 + }, + { + "epoch": 38.663438256658594, + "grad_norm": 0.006230051163583994, + "learning_rate": 0.2716013523023608, + "loss": 0.052, + "num_input_tokens_seen": 13646048, + "step": 7965 + }, + { + "epoch": 38.68765133171913, + "grad_norm": 0.006526857148855925, + "learning_rate": 0.27156685439734995, + "loss": 0.0245, + "num_input_tokens_seen": 13654880, + "step": 7970 + }, + { + "epoch": 38.71186440677966, + "grad_norm": 0.0008585036266595125, + "learning_rate": 0.2715323377452024, + "loss": 0.022, + "num_input_tokens_seen": 13663520, + "step": 7975 + }, + { + "epoch": 38.73607748184019, + "grad_norm": 0.001948303310200572, + "learning_rate": 0.2714978023512411, + "loss": 0.0236, + "num_input_tokens_seen": 13672320, + "step": 7980 + }, + { + "epoch": 38.76029055690073, + "grad_norm": 0.008924534544348717, + "learning_rate": 0.2714632482207918, + "loss": 0.0325, + "num_input_tokens_seen": 13680544, + "step": 7985 + }, + { + "epoch": 38.78450363196126, + "grad_norm": 0.004723210819065571, + "learning_rate": 0.2714286753591833, + "loss": 0.0137, + "num_input_tokens_seen": 13689248, + "step": 7990 + }, + { + "epoch": 38.808716707021794, + "grad_norm": 0.0013683628058061004, + "learning_rate": 0.27139408377174706, + "loss": 0.0146, + "num_input_tokens_seen": 13697888, + "step": 7995 + }, + { + "epoch": 38.832929782082324, + "grad_norm": 0.002312249504029751, + "learning_rate": 0.27135947346381756, + "loss": 0.0138, + "num_input_tokens_seen": 13706752, + "step": 8000 + }, + { + "epoch": 38.832929782082324, + "eval_loss": 0.457261860370636, + "eval_runtime": 4.62, + "eval_samples_per_second": 79.437, + "eval_steps_per_second": 19.913, + "num_input_tokens_seen": 13706752, + "step": 8000 + }, + { + "epoch": 38.857142857142854, + "grad_norm": 0.011671515181660652, + "learning_rate": 0.2713248444407322, + "loss": 0.0312, + "num_input_tokens_seen": 13715296, + "step": 8005 + }, + { + "epoch": 38.88135593220339, + "grad_norm": 0.0002082908176817, + "learning_rate": 0.27129019670783106, + "loss": 0.0126, + "num_input_tokens_seen": 13724000, + "step": 8010 + }, + { + "epoch": 38.90556900726392, + "grad_norm": 0.005742449313402176, + "learning_rate": 0.27125553027045746, + "loss": 0.0083, + "num_input_tokens_seen": 13732512, + "step": 8015 + }, + { + "epoch": 38.92978208232446, + "grad_norm": 0.0003380756243132055, + "learning_rate": 0.2712208451339572, + "loss": 0.0464, + "num_input_tokens_seen": 13741024, + "step": 8020 + }, + { + "epoch": 38.95399515738499, + "grad_norm": 0.0007092381129041314, + "learning_rate": 0.27118614130367935, + "loss": 0.0241, + "num_input_tokens_seen": 13749664, + "step": 8025 + }, + { + "epoch": 38.97820823244552, + "grad_norm": 0.005512033589184284, + "learning_rate": 0.2711514187849756, + "loss": 0.0303, + "num_input_tokens_seen": 13757984, + "step": 8030 + }, + { + "epoch": 39.00484261501211, + "grad_norm": 0.011026188731193542, + "learning_rate": 0.27111667758320057, + "loss": 0.0176, + "num_input_tokens_seen": 13767040, + "step": 8035 + }, + { + "epoch": 39.02905569007264, + "grad_norm": 0.0007421321352012455, + "learning_rate": 0.27108191770371176, + "loss": 0.0317, + "num_input_tokens_seen": 13775616, + "step": 8040 + }, + { + "epoch": 39.05326876513317, + "grad_norm": 0.0009809716138988733, + "learning_rate": 0.2710471391518697, + "loss": 0.0033, + "num_input_tokens_seen": 13784320, + "step": 8045 + }, + { + "epoch": 39.077481840193705, + "grad_norm": 0.00023665433400310576, + "learning_rate": 0.2710123419330375, + "loss": 0.0031, + "num_input_tokens_seen": 13792384, + "step": 8050 + }, + { + "epoch": 39.101694915254235, + "grad_norm": 0.06441008299589157, + "learning_rate": 0.2709775260525816, + "loss": 0.1275, + "num_input_tokens_seen": 13800704, + "step": 8055 + }, + { + "epoch": 39.12590799031477, + "grad_norm": 0.01863381452858448, + "learning_rate": 0.27094269151587075, + "loss": 0.0867, + "num_input_tokens_seen": 13809440, + "step": 8060 + }, + { + "epoch": 39.1501210653753, + "grad_norm": 0.0065587833523750305, + "learning_rate": 0.27090783832827703, + "loss": 0.0746, + "num_input_tokens_seen": 13818272, + "step": 8065 + }, + { + "epoch": 39.17433414043583, + "grad_norm": 0.016976185142993927, + "learning_rate": 0.2708729664951753, + "loss": 0.0973, + "num_input_tokens_seen": 13826592, + "step": 8070 + }, + { + "epoch": 39.19854721549637, + "grad_norm": 0.015047082677483559, + "learning_rate": 0.27083807602194304, + "loss": 0.1224, + "num_input_tokens_seen": 13835424, + "step": 8075 + }, + { + "epoch": 39.2227602905569, + "grad_norm": 0.010311217047274113, + "learning_rate": 0.270803166913961, + "loss": 0.133, + "num_input_tokens_seen": 13844064, + "step": 8080 + }, + { + "epoch": 39.246973365617436, + "grad_norm": 0.005547080188989639, + "learning_rate": 0.27076823917661247, + "loss": 0.091, + "num_input_tokens_seen": 13853056, + "step": 8085 + }, + { + "epoch": 39.271186440677965, + "grad_norm": 0.007512770127505064, + "learning_rate": 0.2707332928152838, + "loss": 0.0488, + "num_input_tokens_seen": 13861696, + "step": 8090 + }, + { + "epoch": 39.295399515738495, + "grad_norm": 0.007394031155854464, + "learning_rate": 0.2706983278353641, + "loss": 0.0839, + "num_input_tokens_seen": 13870432, + "step": 8095 + }, + { + "epoch": 39.31961259079903, + "grad_norm": 0.007667731959372759, + "learning_rate": 0.27066334424224553, + "loss": 0.0905, + "num_input_tokens_seen": 13879296, + "step": 8100 + }, + { + "epoch": 39.34382566585956, + "grad_norm": 0.006006748881191015, + "learning_rate": 0.27062834204132297, + "loss": 0.0458, + "num_input_tokens_seen": 13887488, + "step": 8105 + }, + { + "epoch": 39.3680387409201, + "grad_norm": 0.0016069068806245923, + "learning_rate": 0.27059332123799407, + "loss": 0.0508, + "num_input_tokens_seen": 13895776, + "step": 8110 + }, + { + "epoch": 39.39225181598063, + "grad_norm": 0.004273891448974609, + "learning_rate": 0.27055828183765956, + "loss": 0.0403, + "num_input_tokens_seen": 13904160, + "step": 8115 + }, + { + "epoch": 39.416464891041166, + "grad_norm": 0.0042550587095320225, + "learning_rate": 0.270523223845723, + "loss": 0.0498, + "num_input_tokens_seen": 13912672, + "step": 8120 + }, + { + "epoch": 39.440677966101696, + "grad_norm": 0.005416970234364271, + "learning_rate": 0.2704881472675907, + "loss": 0.031, + "num_input_tokens_seen": 13921312, + "step": 8125 + }, + { + "epoch": 39.464891041162225, + "grad_norm": 0.008596121333539486, + "learning_rate": 0.270453052108672, + "loss": 0.067, + "num_input_tokens_seen": 13929984, + "step": 8130 + }, + { + "epoch": 39.48910411622276, + "grad_norm": 0.0015923178289085627, + "learning_rate": 0.2704179383743789, + "loss": 0.0149, + "num_input_tokens_seen": 13937952, + "step": 8135 + }, + { + "epoch": 39.51331719128329, + "grad_norm": 0.00867521297186613, + "learning_rate": 0.27038280607012644, + "loss": 0.0596, + "num_input_tokens_seen": 13946144, + "step": 8140 + }, + { + "epoch": 39.53753026634383, + "grad_norm": 0.01537265069782734, + "learning_rate": 0.27034765520133247, + "loss": 0.0739, + "num_input_tokens_seen": 13954272, + "step": 8145 + }, + { + "epoch": 39.56174334140436, + "grad_norm": 0.013296553865075111, + "learning_rate": 0.2703124857734177, + "loss": 0.0583, + "num_input_tokens_seen": 13962976, + "step": 8150 + }, + { + "epoch": 39.58595641646489, + "grad_norm": 0.002946484135463834, + "learning_rate": 0.27027729779180565, + "loss": 0.0708, + "num_input_tokens_seen": 13971392, + "step": 8155 + }, + { + "epoch": 39.610169491525426, + "grad_norm": 0.007981255650520325, + "learning_rate": 0.27024209126192283, + "loss": 0.062, + "num_input_tokens_seen": 13979776, + "step": 8160 + }, + { + "epoch": 39.634382566585955, + "grad_norm": 0.0062227253802120686, + "learning_rate": 0.2702068661891984, + "loss": 0.0541, + "num_input_tokens_seen": 13988256, + "step": 8165 + }, + { + "epoch": 39.65859564164649, + "grad_norm": 0.0032665100879967213, + "learning_rate": 0.2701716225790647, + "loss": 0.0621, + "num_input_tokens_seen": 13996768, + "step": 8170 + }, + { + "epoch": 39.68280871670702, + "grad_norm": 0.0012327709700912237, + "learning_rate": 0.27013636043695655, + "loss": 0.0383, + "num_input_tokens_seen": 14005344, + "step": 8175 + }, + { + "epoch": 39.70702179176755, + "grad_norm": 0.008603109046816826, + "learning_rate": 0.27010107976831194, + "loss": 0.0516, + "num_input_tokens_seen": 14014080, + "step": 8180 + }, + { + "epoch": 39.73123486682809, + "grad_norm": 0.0032469676807522774, + "learning_rate": 0.2700657805785715, + "loss": 0.0225, + "num_input_tokens_seen": 14022496, + "step": 8185 + }, + { + "epoch": 39.75544794188862, + "grad_norm": 0.007325473241508007, + "learning_rate": 0.2700304628731789, + "loss": 0.0355, + "num_input_tokens_seen": 14031104, + "step": 8190 + }, + { + "epoch": 39.779661016949156, + "grad_norm": 0.0063154674135148525, + "learning_rate": 0.26999512665758046, + "loss": 0.0275, + "num_input_tokens_seen": 14039680, + "step": 8195 + }, + { + "epoch": 39.803874092009686, + "grad_norm": 0.005193155258893967, + "learning_rate": 0.2699597719372256, + "loss": 0.0358, + "num_input_tokens_seen": 14048256, + "step": 8200 + }, + { + "epoch": 39.803874092009686, + "eval_loss": 0.38340991735458374, + "eval_runtime": 4.6224, + "eval_samples_per_second": 79.396, + "eval_steps_per_second": 19.903, + "num_input_tokens_seen": 14048256, + "step": 8200 + }, + { + "epoch": 39.828087167070215, + "grad_norm": 0.005483638495206833, + "learning_rate": 0.26992439871756635, + "loss": 0.0366, + "num_input_tokens_seen": 14056928, + "step": 8205 + }, + { + "epoch": 39.85230024213075, + "grad_norm": 0.0052724494598805904, + "learning_rate": 0.2698890070040578, + "loss": 0.0388, + "num_input_tokens_seen": 14065696, + "step": 8210 + }, + { + "epoch": 39.87651331719128, + "grad_norm": 0.009708352386951447, + "learning_rate": 0.2698535968021577, + "loss": 0.0606, + "num_input_tokens_seen": 14074080, + "step": 8215 + }, + { + "epoch": 39.90072639225182, + "grad_norm": 0.006542219780385494, + "learning_rate": 0.26981816811732684, + "loss": 0.0292, + "num_input_tokens_seen": 14082688, + "step": 8220 + }, + { + "epoch": 39.92493946731235, + "grad_norm": 0.005829886067658663, + "learning_rate": 0.26978272095502875, + "loss": 0.0297, + "num_input_tokens_seen": 14091200, + "step": 8225 + }, + { + "epoch": 39.94915254237288, + "grad_norm": 0.005303608253598213, + "learning_rate": 0.26974725532072974, + "loss": 0.0452, + "num_input_tokens_seen": 14099936, + "step": 8230 + }, + { + "epoch": 39.973365617433416, + "grad_norm": 0.005822218023240566, + "learning_rate": 0.26971177121989914, + "loss": 0.0281, + "num_input_tokens_seen": 14108480, + "step": 8235 + }, + { + "epoch": 39.997578692493946, + "grad_norm": 0.007460458669811487, + "learning_rate": 0.2696762686580091, + "loss": 0.0251, + "num_input_tokens_seen": 14117216, + "step": 8240 + }, + { + "epoch": 40.02421307506053, + "grad_norm": 0.005667224060744047, + "learning_rate": 0.26964074764053436, + "loss": 0.0096, + "num_input_tokens_seen": 14126208, + "step": 8245 + }, + { + "epoch": 40.04842615012107, + "grad_norm": 0.005869144108146429, + "learning_rate": 0.2696052081729529, + "loss": 0.013, + "num_input_tokens_seen": 14134688, + "step": 8250 + }, + { + "epoch": 40.0726392251816, + "grad_norm": 0.0035762684419751167, + "learning_rate": 0.2695696502607453, + "loss": 0.0068, + "num_input_tokens_seen": 14143200, + "step": 8255 + }, + { + "epoch": 40.09685230024213, + "grad_norm": 0.0012090334203094244, + "learning_rate": 0.26953407390939504, + "loss": 0.0089, + "num_input_tokens_seen": 14152128, + "step": 8260 + }, + { + "epoch": 40.12106537530266, + "grad_norm": 0.0008448883309029043, + "learning_rate": 0.26949847912438835, + "loss": 0.0134, + "num_input_tokens_seen": 14160704, + "step": 8265 + }, + { + "epoch": 40.14527845036319, + "grad_norm": 0.001999086234718561, + "learning_rate": 0.26946286591121454, + "loss": 0.0082, + "num_input_tokens_seen": 14169024, + "step": 8270 + }, + { + "epoch": 40.16949152542373, + "grad_norm": 0.005386147182434797, + "learning_rate": 0.2694272342753655, + "loss": 0.0105, + "num_input_tokens_seen": 14177600, + "step": 8275 + }, + { + "epoch": 40.19370460048426, + "grad_norm": 0.004037580452859402, + "learning_rate": 0.26939158422233617, + "loss": 0.015, + "num_input_tokens_seen": 14185696, + "step": 8280 + }, + { + "epoch": 40.2179176755448, + "grad_norm": 0.000800173613242805, + "learning_rate": 0.26935591575762413, + "loss": 0.0085, + "num_input_tokens_seen": 14194400, + "step": 8285 + }, + { + "epoch": 40.24213075060533, + "grad_norm": 0.0035757895093411207, + "learning_rate": 0.26932022888672996, + "loss": 0.011, + "num_input_tokens_seen": 14203072, + "step": 8290 + }, + { + "epoch": 40.26634382566586, + "grad_norm": 0.0012092432007193565, + "learning_rate": 0.26928452361515703, + "loss": 0.0089, + "num_input_tokens_seen": 14211968, + "step": 8295 + }, + { + "epoch": 40.29055690072639, + "grad_norm": 0.009662676602602005, + "learning_rate": 0.26924879994841155, + "loss": 0.0549, + "num_input_tokens_seen": 14220672, + "step": 8300 + }, + { + "epoch": 40.31476997578692, + "grad_norm": 0.0003405738389119506, + "learning_rate": 0.2692130578920025, + "loss": 0.0283, + "num_input_tokens_seen": 14229312, + "step": 8305 + }, + { + "epoch": 40.33898305084746, + "grad_norm": 0.0049034650437533855, + "learning_rate": 0.26917729745144187, + "loss": 0.0124, + "num_input_tokens_seen": 14237920, + "step": 8310 + }, + { + "epoch": 40.36319612590799, + "grad_norm": 0.001158976461738348, + "learning_rate": 0.2691415186322443, + "loss": 0.0133, + "num_input_tokens_seen": 14246176, + "step": 8315 + }, + { + "epoch": 40.38740920096852, + "grad_norm": 0.0014724992215633392, + "learning_rate": 0.2691057214399273, + "loss": 0.0082, + "num_input_tokens_seen": 14254976, + "step": 8320 + }, + { + "epoch": 40.41162227602906, + "grad_norm": 0.00021399934485089034, + "learning_rate": 0.2690699058800113, + "loss": 0.0151, + "num_input_tokens_seen": 14263776, + "step": 8325 + }, + { + "epoch": 40.43583535108959, + "grad_norm": 0.006372722797095776, + "learning_rate": 0.2690340719580194, + "loss": 0.0196, + "num_input_tokens_seen": 14272960, + "step": 8330 + }, + { + "epoch": 40.460048426150124, + "grad_norm": 0.00030938698910176754, + "learning_rate": 0.2689982196794778, + "loss": 0.0075, + "num_input_tokens_seen": 14281344, + "step": 8335 + }, + { + "epoch": 40.48426150121065, + "grad_norm": 0.0004338110447861254, + "learning_rate": 0.2689623490499153, + "loss": 0.0084, + "num_input_tokens_seen": 14289760, + "step": 8340 + }, + { + "epoch": 40.50847457627118, + "grad_norm": 0.007861214689910412, + "learning_rate": 0.2689264600748636, + "loss": 0.0136, + "num_input_tokens_seen": 14298528, + "step": 8345 + }, + { + "epoch": 40.53268765133172, + "grad_norm": 0.00036348760477267206, + "learning_rate": 0.26889055275985724, + "loss": 0.0105, + "num_input_tokens_seen": 14307168, + "step": 8350 + }, + { + "epoch": 40.55690072639225, + "grad_norm": 0.0033556607086211443, + "learning_rate": 0.2688546271104335, + "loss": 0.0115, + "num_input_tokens_seen": 14315264, + "step": 8355 + }, + { + "epoch": 40.58111380145279, + "grad_norm": 0.0011042419355362654, + "learning_rate": 0.26881868313213275, + "loss": 0.0174, + "num_input_tokens_seen": 14323808, + "step": 8360 + }, + { + "epoch": 40.60532687651332, + "grad_norm": 0.0069442857056856155, + "learning_rate": 0.2687827208304978, + "loss": 0.0125, + "num_input_tokens_seen": 14331808, + "step": 8365 + }, + { + "epoch": 40.62953995157385, + "grad_norm": 0.011836163699626923, + "learning_rate": 0.26874674021107464, + "loss": 0.0233, + "num_input_tokens_seen": 14340384, + "step": 8370 + }, + { + "epoch": 40.653753026634384, + "grad_norm": 9.164316725218669e-05, + "learning_rate": 0.2687107412794118, + "loss": 0.007, + "num_input_tokens_seen": 14348896, + "step": 8375 + }, + { + "epoch": 40.67796610169491, + "grad_norm": 0.0028332890942692757, + "learning_rate": 0.26867472404106096, + "loss": 0.0299, + "num_input_tokens_seen": 14357536, + "step": 8380 + }, + { + "epoch": 40.70217917675545, + "grad_norm": 0.0018223021179437637, + "learning_rate": 0.26863868850157624, + "loss": 0.0133, + "num_input_tokens_seen": 14366048, + "step": 8385 + }, + { + "epoch": 40.72639225181598, + "grad_norm": 0.0053795198909938335, + "learning_rate": 0.26860263466651485, + "loss": 0.0183, + "num_input_tokens_seen": 14374560, + "step": 8390 + }, + { + "epoch": 40.75060532687651, + "grad_norm": 0.0021763830445706844, + "learning_rate": 0.26856656254143674, + "loss": 0.021, + "num_input_tokens_seen": 14383232, + "step": 8395 + }, + { + "epoch": 40.77481840193705, + "grad_norm": 0.006429789587855339, + "learning_rate": 0.2685304721319047, + "loss": 0.0293, + "num_input_tokens_seen": 14392064, + "step": 8400 + }, + { + "epoch": 40.77481840193705, + "eval_loss": 0.38330236077308655, + "eval_runtime": 4.6394, + "eval_samples_per_second": 79.104, + "eval_steps_per_second": 19.83, + "num_input_tokens_seen": 14392064, + "step": 8400 + }, + { + "epoch": 40.79903147699758, + "grad_norm": 0.006193754728883505, + "learning_rate": 0.2684943634434843, + "loss": 0.0417, + "num_input_tokens_seen": 14400640, + "step": 8405 + }, + { + "epoch": 40.823244552058114, + "grad_norm": 0.0018129948293790221, + "learning_rate": 0.268458236481744, + "loss": 0.014, + "num_input_tokens_seen": 14408928, + "step": 8410 + }, + { + "epoch": 40.847457627118644, + "grad_norm": 0.006777280941605568, + "learning_rate": 0.2684220912522549, + "loss": 0.0325, + "num_input_tokens_seen": 14417344, + "step": 8415 + }, + { + "epoch": 40.87167070217917, + "grad_norm": 0.00939716026186943, + "learning_rate": 0.2683859277605913, + "loss": 0.0225, + "num_input_tokens_seen": 14425504, + "step": 8420 + }, + { + "epoch": 40.89588377723971, + "grad_norm": 0.004435001406818628, + "learning_rate": 0.2683497460123298, + "loss": 0.0142, + "num_input_tokens_seen": 14434240, + "step": 8425 + }, + { + "epoch": 40.92009685230024, + "grad_norm": 0.001838209107518196, + "learning_rate": 0.26831354601305013, + "loss": 0.0216, + "num_input_tokens_seen": 14442880, + "step": 8430 + }, + { + "epoch": 40.94430992736078, + "grad_norm": 0.003060736460611224, + "learning_rate": 0.26827732776833496, + "loss": 0.0125, + "num_input_tokens_seen": 14451776, + "step": 8435 + }, + { + "epoch": 40.96852300242131, + "grad_norm": 0.007717016618698835, + "learning_rate": 0.26824109128376944, + "loss": 0.0308, + "num_input_tokens_seen": 14460384, + "step": 8440 + }, + { + "epoch": 40.99273607748184, + "grad_norm": 0.0033654109574854374, + "learning_rate": 0.2682048365649417, + "loss": 0.0176, + "num_input_tokens_seen": 14469120, + "step": 8445 + }, + { + "epoch": 41.01937046004843, + "grad_norm": 0.0011501964181661606, + "learning_rate": 0.2681685636174428, + "loss": 0.027, + "num_input_tokens_seen": 14478080, + "step": 8450 + }, + { + "epoch": 41.04358353510896, + "grad_norm": 0.0014325217343866825, + "learning_rate": 0.2681322724468663, + "loss": 0.004, + "num_input_tokens_seen": 14486880, + "step": 8455 + }, + { + "epoch": 41.067796610169495, + "grad_norm": 0.01033877208828926, + "learning_rate": 0.2680959630588089, + "loss": 0.0115, + "num_input_tokens_seen": 14495520, + "step": 8460 + }, + { + "epoch": 41.092009685230025, + "grad_norm": 0.0027040215209126472, + "learning_rate": 0.26805963545886985, + "loss": 0.0106, + "num_input_tokens_seen": 14503616, + "step": 8465 + }, + { + "epoch": 41.116222760290555, + "grad_norm": 9.713251347420737e-05, + "learning_rate": 0.26802328965265143, + "loss": 0.0058, + "num_input_tokens_seen": 14512128, + "step": 8470 + }, + { + "epoch": 41.14043583535109, + "grad_norm": 0.0012281544040888548, + "learning_rate": 0.26798692564575854, + "loss": 0.0033, + "num_input_tokens_seen": 14520992, + "step": 8475 + }, + { + "epoch": 41.16464891041162, + "grad_norm": 0.004396478179842234, + "learning_rate": 0.26795054344379904, + "loss": 0.0079, + "num_input_tokens_seen": 14529600, + "step": 8480 + }, + { + "epoch": 41.18886198547216, + "grad_norm": 0.005843395832926035, + "learning_rate": 0.2679141430523835, + "loss": 0.0076, + "num_input_tokens_seen": 14538816, + "step": 8485 + }, + { + "epoch": 41.21307506053269, + "grad_norm": 0.00135545595549047, + "learning_rate": 0.2678777244771252, + "loss": 0.0073, + "num_input_tokens_seen": 14547296, + "step": 8490 + }, + { + "epoch": 41.23728813559322, + "grad_norm": 0.000960705045145005, + "learning_rate": 0.2678412877236405, + "loss": 0.0089, + "num_input_tokens_seen": 14555776, + "step": 8495 + }, + { + "epoch": 41.261501210653755, + "grad_norm": 0.0017531297635287046, + "learning_rate": 0.2678048327975484, + "loss": 0.0026, + "num_input_tokens_seen": 14564320, + "step": 8500 + }, + { + "epoch": 41.285714285714285, + "grad_norm": 0.00043308353633619845, + "learning_rate": 0.2677683597044706, + "loss": 0.0028, + "num_input_tokens_seen": 14572896, + "step": 8505 + }, + { + "epoch": 41.30992736077482, + "grad_norm": 0.0008826818084344268, + "learning_rate": 0.2677318684500318, + "loss": 0.0058, + "num_input_tokens_seen": 14581600, + "step": 8510 + }, + { + "epoch": 41.33414043583535, + "grad_norm": 0.004627978429198265, + "learning_rate": 0.2676953590398593, + "loss": 0.0143, + "num_input_tokens_seen": 14590048, + "step": 8515 + }, + { + "epoch": 41.35835351089588, + "grad_norm": 0.007866734638810158, + "learning_rate": 0.2676588314795834, + "loss": 0.0103, + "num_input_tokens_seen": 14598528, + "step": 8520 + }, + { + "epoch": 41.38256658595642, + "grad_norm": 0.0004518120549619198, + "learning_rate": 0.26762228577483715, + "loss": 0.0087, + "num_input_tokens_seen": 14606912, + "step": 8525 + }, + { + "epoch": 41.40677966101695, + "grad_norm": 0.00281900423578918, + "learning_rate": 0.2675857219312563, + "loss": 0.003, + "num_input_tokens_seen": 14615296, + "step": 8530 + }, + { + "epoch": 41.430992736077485, + "grad_norm": 0.00673436326906085, + "learning_rate": 0.2675491399544794, + "loss": 0.0044, + "num_input_tokens_seen": 14623552, + "step": 8535 + }, + { + "epoch": 41.455205811138015, + "grad_norm": 0.00023285408678930253, + "learning_rate": 0.2675125398501479, + "loss": 0.0022, + "num_input_tokens_seen": 14631872, + "step": 8540 + }, + { + "epoch": 41.479418886198545, + "grad_norm": 0.00021113043476361781, + "learning_rate": 0.26747592162390604, + "loss": 0.0105, + "num_input_tokens_seen": 14640512, + "step": 8545 + }, + { + "epoch": 41.50363196125908, + "grad_norm": 0.012193230912089348, + "learning_rate": 0.26743928528140076, + "loss": 0.0111, + "num_input_tokens_seen": 14649184, + "step": 8550 + }, + { + "epoch": 41.52784503631961, + "grad_norm": 0.0008122065337374806, + "learning_rate": 0.26740263082828186, + "loss": 0.0163, + "num_input_tokens_seen": 14657632, + "step": 8555 + }, + { + "epoch": 41.55205811138015, + "grad_norm": 0.0028879002202302217, + "learning_rate": 0.2673659582702019, + "loss": 0.0108, + "num_input_tokens_seen": 14666208, + "step": 8560 + }, + { + "epoch": 41.57627118644068, + "grad_norm": 0.0006408838089555502, + "learning_rate": 0.2673292676128163, + "loss": 0.0154, + "num_input_tokens_seen": 14674560, + "step": 8565 + }, + { + "epoch": 41.60048426150121, + "grad_norm": 0.0006222509546205401, + "learning_rate": 0.2672925588617831, + "loss": 0.0106, + "num_input_tokens_seen": 14683168, + "step": 8570 + }, + { + "epoch": 41.624697336561745, + "grad_norm": 0.00163977628108114, + "learning_rate": 0.2672558320227634, + "loss": 0.0134, + "num_input_tokens_seen": 14691840, + "step": 8575 + }, + { + "epoch": 41.648910411622275, + "grad_norm": 0.012170759029686451, + "learning_rate": 0.2672190871014209, + "loss": 0.0237, + "num_input_tokens_seen": 14700288, + "step": 8580 + }, + { + "epoch": 41.67312348668281, + "grad_norm": 0.0003271872701589018, + "learning_rate": 0.267182324103422, + "loss": 0.0089, + "num_input_tokens_seen": 14708768, + "step": 8585 + }, + { + "epoch": 41.69733656174334, + "grad_norm": 0.003225240157917142, + "learning_rate": 0.2671455430344362, + "loss": 0.0155, + "num_input_tokens_seen": 14717120, + "step": 8590 + }, + { + "epoch": 41.72154963680387, + "grad_norm": 0.0042495750822126865, + "learning_rate": 0.2671087439001355, + "loss": 0.0063, + "num_input_tokens_seen": 14725120, + "step": 8595 + }, + { + "epoch": 41.74576271186441, + "grad_norm": 0.005819555837661028, + "learning_rate": 0.2670719267061948, + "loss": 0.0119, + "num_input_tokens_seen": 14733504, + "step": 8600 + }, + { + "epoch": 41.74576271186441, + "eval_loss": 0.4785551130771637, + "eval_runtime": 4.6175, + "eval_samples_per_second": 79.481, + "eval_steps_per_second": 19.924, + "num_input_tokens_seen": 14733504, + "step": 8600 + }, + { + "epoch": 41.76997578692494, + "grad_norm": 0.005521407816559076, + "learning_rate": 0.2670350914582918, + "loss": 0.0285, + "num_input_tokens_seen": 14741824, + "step": 8605 + }, + { + "epoch": 41.794188861985475, + "grad_norm": 0.0014932435005903244, + "learning_rate": 0.26699823816210694, + "loss": 0.0136, + "num_input_tokens_seen": 14750688, + "step": 8610 + }, + { + "epoch": 41.818401937046005, + "grad_norm": 0.001942291622981429, + "learning_rate": 0.26696136682332344, + "loss": 0.0089, + "num_input_tokens_seen": 14759136, + "step": 8615 + }, + { + "epoch": 41.842615012106535, + "grad_norm": 0.0014701620675623417, + "learning_rate": 0.2669244774476274, + "loss": 0.0162, + "num_input_tokens_seen": 14767840, + "step": 8620 + }, + { + "epoch": 41.86682808716707, + "grad_norm": 0.0031147256959229708, + "learning_rate": 0.2668875700407075, + "loss": 0.0079, + "num_input_tokens_seen": 14776448, + "step": 8625 + }, + { + "epoch": 41.8910411622276, + "grad_norm": 0.00414907094091177, + "learning_rate": 0.26685064460825547, + "loss": 0.0301, + "num_input_tokens_seen": 14785312, + "step": 8630 + }, + { + "epoch": 41.91525423728814, + "grad_norm": 0.002752625150606036, + "learning_rate": 0.26681370115596553, + "loss": 0.0128, + "num_input_tokens_seen": 14793696, + "step": 8635 + }, + { + "epoch": 41.93946731234867, + "grad_norm": 0.004503365606069565, + "learning_rate": 0.26677673968953497, + "loss": 0.0076, + "num_input_tokens_seen": 14802592, + "step": 8640 + }, + { + "epoch": 41.9636803874092, + "grad_norm": 0.0009641069336794317, + "learning_rate": 0.2667397602146636, + "loss": 0.0048, + "num_input_tokens_seen": 14811296, + "step": 8645 + }, + { + "epoch": 41.987893462469735, + "grad_norm": 0.00926984939724207, + "learning_rate": 0.2667027627370542, + "loss": 0.0096, + "num_input_tokens_seen": 14819712, + "step": 8650 + }, + { + "epoch": 42.01452784503632, + "grad_norm": 0.002182846888899803, + "learning_rate": 0.26666574726241216, + "loss": 0.0107, + "num_input_tokens_seen": 14828896, + "step": 8655 + }, + { + "epoch": 42.03874092009685, + "grad_norm": 0.00016339476860594004, + "learning_rate": 0.2666287137964458, + "loss": 0.002, + "num_input_tokens_seen": 14837792, + "step": 8660 + }, + { + "epoch": 42.062953995157386, + "grad_norm": 0.0001939149369718507, + "learning_rate": 0.26659166234486614, + "loss": 0.0014, + "num_input_tokens_seen": 14846144, + "step": 8665 + }, + { + "epoch": 42.087167070217916, + "grad_norm": 0.00021221645874902606, + "learning_rate": 0.2665545929133869, + "loss": 0.0048, + "num_input_tokens_seen": 14854848, + "step": 8670 + }, + { + "epoch": 42.11138014527845, + "grad_norm": 0.00395962642505765, + "learning_rate": 0.2665175055077248, + "loss": 0.0054, + "num_input_tokens_seen": 14863456, + "step": 8675 + }, + { + "epoch": 42.13559322033898, + "grad_norm": 0.0003282784018665552, + "learning_rate": 0.2664804001335991, + "loss": 0.0026, + "num_input_tokens_seen": 14871808, + "step": 8680 + }, + { + "epoch": 42.15980629539951, + "grad_norm": 0.0002455076901242137, + "learning_rate": 0.26644327679673185, + "loss": 0.0035, + "num_input_tokens_seen": 14880384, + "step": 8685 + }, + { + "epoch": 42.18401937046005, + "grad_norm": 0.0035732684191316366, + "learning_rate": 0.26640613550284803, + "loss": 0.0069, + "num_input_tokens_seen": 14888832, + "step": 8690 + }, + { + "epoch": 42.20823244552058, + "grad_norm": 0.0075311362743377686, + "learning_rate": 0.26636897625767525, + "loss": 0.0101, + "num_input_tokens_seen": 14897376, + "step": 8695 + }, + { + "epoch": 42.232445520581116, + "grad_norm": 0.0004970013978891075, + "learning_rate": 0.266331799066944, + "loss": 0.0107, + "num_input_tokens_seen": 14906048, + "step": 8700 + }, + { + "epoch": 42.256658595641646, + "grad_norm": 0.0007628099410794675, + "learning_rate": 0.2662946039363874, + "loss": 0.0072, + "num_input_tokens_seen": 14914656, + "step": 8705 + }, + { + "epoch": 42.280871670702176, + "grad_norm": 0.0001989218289963901, + "learning_rate": 0.2662573908717414, + "loss": 0.0059, + "num_input_tokens_seen": 14923040, + "step": 8710 + }, + { + "epoch": 42.30508474576271, + "grad_norm": 0.0007869219407439232, + "learning_rate": 0.2662201598787447, + "loss": 0.0054, + "num_input_tokens_seen": 14931488, + "step": 8715 + }, + { + "epoch": 42.32929782082324, + "grad_norm": 0.00024069330538623035, + "learning_rate": 0.2661829109631389, + "loss": 0.0025, + "num_input_tokens_seen": 14939648, + "step": 8720 + }, + { + "epoch": 42.35351089588378, + "grad_norm": 0.00025293961516581476, + "learning_rate": 0.26614564413066816, + "loss": 0.0084, + "num_input_tokens_seen": 14948128, + "step": 8725 + }, + { + "epoch": 42.37772397094431, + "grad_norm": 0.0019806723576039076, + "learning_rate": 0.2661083593870795, + "loss": 0.0015, + "num_input_tokens_seen": 14956832, + "step": 8730 + }, + { + "epoch": 42.40193704600484, + "grad_norm": 0.00010160019883187488, + "learning_rate": 0.26607105673812276, + "loss": 0.0061, + "num_input_tokens_seen": 14965728, + "step": 8735 + }, + { + "epoch": 42.426150121065376, + "grad_norm": 0.0005256315344013274, + "learning_rate": 0.2660337361895504, + "loss": 0.0007, + "num_input_tokens_seen": 14974432, + "step": 8740 + }, + { + "epoch": 42.450363196125906, + "grad_norm": 0.0013655887451022863, + "learning_rate": 0.26599639774711775, + "loss": 0.0056, + "num_input_tokens_seen": 14983104, + "step": 8745 + }, + { + "epoch": 42.47457627118644, + "grad_norm": 0.000781711598392576, + "learning_rate": 0.2659590414165829, + "loss": 0.0031, + "num_input_tokens_seen": 14991776, + "step": 8750 + }, + { + "epoch": 42.49878934624697, + "grad_norm": 9.083319309866056e-05, + "learning_rate": 0.2659216672037066, + "loss": 0.0012, + "num_input_tokens_seen": 15000160, + "step": 8755 + }, + { + "epoch": 42.5230024213075, + "grad_norm": 0.000298446073429659, + "learning_rate": 0.26588427511425244, + "loss": 0.0009, + "num_input_tokens_seen": 15008544, + "step": 8760 + }, + { + "epoch": 42.54721549636804, + "grad_norm": 0.00021195918088778853, + "learning_rate": 0.26584686515398676, + "loss": 0.0018, + "num_input_tokens_seen": 15017088, + "step": 8765 + }, + { + "epoch": 42.57142857142857, + "grad_norm": 0.004412441980093718, + "learning_rate": 0.2658094373286787, + "loss": 0.0023, + "num_input_tokens_seen": 15025504, + "step": 8770 + }, + { + "epoch": 42.595641646489106, + "grad_norm": 0.0015929766232147813, + "learning_rate": 0.2657719916441, + "loss": 0.0021, + "num_input_tokens_seen": 15034048, + "step": 8775 + }, + { + "epoch": 42.619854721549636, + "grad_norm": 3.5556298826122656e-05, + "learning_rate": 0.2657345281060253, + "loss": 0.0016, + "num_input_tokens_seen": 15042528, + "step": 8780 + }, + { + "epoch": 42.644067796610166, + "grad_norm": 6.062145621399395e-05, + "learning_rate": 0.26569704672023203, + "loss": 0.0071, + "num_input_tokens_seen": 15050784, + "step": 8785 + }, + { + "epoch": 42.6682808716707, + "grad_norm": 0.00015647518739569932, + "learning_rate": 0.26565954749250015, + "loss": 0.0023, + "num_input_tokens_seen": 15059296, + "step": 8790 + }, + { + "epoch": 42.69249394673123, + "grad_norm": 0.0005034139030613005, + "learning_rate": 0.2656220304286126, + "loss": 0.01, + "num_input_tokens_seen": 15068000, + "step": 8795 + }, + { + "epoch": 42.71670702179177, + "grad_norm": 0.001863463781774044, + "learning_rate": 0.265584495534355, + "loss": 0.001, + "num_input_tokens_seen": 15076736, + "step": 8800 + }, + { + "epoch": 42.71670702179177, + "eval_loss": 0.5092389583587646, + "eval_runtime": 4.612, + "eval_samples_per_second": 79.574, + "eval_steps_per_second": 19.948, + "num_input_tokens_seen": 15076736, + "step": 8800 + }, + { + "epoch": 42.7409200968523, + "grad_norm": 0.0003398284607101232, + "learning_rate": 0.2655469428155156, + "loss": 0.0007, + "num_input_tokens_seen": 15085024, + "step": 8805 + }, + { + "epoch": 42.76513317191284, + "grad_norm": 0.01240223553031683, + "learning_rate": 0.2655093722778856, + "loss": 0.0105, + "num_input_tokens_seen": 15093600, + "step": 8810 + }, + { + "epoch": 42.789346246973366, + "grad_norm": 0.004623549524694681, + "learning_rate": 0.2654717839272588, + "loss": 0.0038, + "num_input_tokens_seen": 15102496, + "step": 8815 + }, + { + "epoch": 42.813559322033896, + "grad_norm": 0.0004214807995595038, + "learning_rate": 0.2654341777694318, + "loss": 0.0071, + "num_input_tokens_seen": 15111392, + "step": 8820 + }, + { + "epoch": 42.83777239709443, + "grad_norm": 0.002715643495321274, + "learning_rate": 0.265396553810204, + "loss": 0.0067, + "num_input_tokens_seen": 15119904, + "step": 8825 + }, + { + "epoch": 42.86198547215496, + "grad_norm": 0.005521420389413834, + "learning_rate": 0.26535891205537737, + "loss": 0.013, + "num_input_tokens_seen": 15127904, + "step": 8830 + }, + { + "epoch": 42.8861985472155, + "grad_norm": 0.0005130738136358559, + "learning_rate": 0.26532125251075683, + "loss": 0.0168, + "num_input_tokens_seen": 15136448, + "step": 8835 + }, + { + "epoch": 42.91041162227603, + "grad_norm": 0.001880268449895084, + "learning_rate": 0.26528357518214996, + "loss": 0.0041, + "num_input_tokens_seen": 15144704, + "step": 8840 + }, + { + "epoch": 42.93462469733656, + "grad_norm": 0.00010590036981739104, + "learning_rate": 0.26524588007536704, + "loss": 0.0133, + "num_input_tokens_seen": 15153472, + "step": 8845 + }, + { + "epoch": 42.958837772397096, + "grad_norm": 0.0014799748314544559, + "learning_rate": 0.26520816719622115, + "loss": 0.0027, + "num_input_tokens_seen": 15161888, + "step": 8850 + }, + { + "epoch": 42.983050847457626, + "grad_norm": 0.002094567520543933, + "learning_rate": 0.2651704365505281, + "loss": 0.008, + "num_input_tokens_seen": 15170208, + "step": 8855 + }, + { + "epoch": 43.00968523002421, + "grad_norm": 0.001783597283065319, + "learning_rate": 0.26513268814410634, + "loss": 0.0066, + "num_input_tokens_seen": 15179808, + "step": 8860 + }, + { + "epoch": 43.03389830508475, + "grad_norm": 0.00019669435278046876, + "learning_rate": 0.2650949219827773, + "loss": 0.0022, + "num_input_tokens_seen": 15188192, + "step": 8865 + }, + { + "epoch": 43.05811138014528, + "grad_norm": 0.00031370826764032245, + "learning_rate": 0.26505713807236486, + "loss": 0.0018, + "num_input_tokens_seen": 15196384, + "step": 8870 + }, + { + "epoch": 43.082324455205814, + "grad_norm": 0.0010798900621011853, + "learning_rate": 0.26501933641869585, + "loss": 0.0047, + "num_input_tokens_seen": 15204992, + "step": 8875 + }, + { + "epoch": 43.106537530266344, + "grad_norm": 0.0009261481463909149, + "learning_rate": 0.26498151702759976, + "loss": 0.0018, + "num_input_tokens_seen": 15213824, + "step": 8880 + }, + { + "epoch": 43.130750605326874, + "grad_norm": 0.0040235016494989395, + "learning_rate": 0.2649436799049088, + "loss": 0.0035, + "num_input_tokens_seen": 15222400, + "step": 8885 + }, + { + "epoch": 43.15496368038741, + "grad_norm": 0.00031082166242413223, + "learning_rate": 0.2649058250564579, + "loss": 0.0038, + "num_input_tokens_seen": 15231328, + "step": 8890 + }, + { + "epoch": 43.17917675544794, + "grad_norm": 0.0006198540795594454, + "learning_rate": 0.26486795248808476, + "loss": 0.0027, + "num_input_tokens_seen": 15239936, + "step": 8895 + }, + { + "epoch": 43.20338983050848, + "grad_norm": 0.004920692183077335, + "learning_rate": 0.2648300622056298, + "loss": 0.0032, + "num_input_tokens_seen": 15248672, + "step": 8900 + }, + { + "epoch": 43.22760290556901, + "grad_norm": 0.0008592500817030668, + "learning_rate": 0.2647921542149363, + "loss": 0.0103, + "num_input_tokens_seen": 15257088, + "step": 8905 + }, + { + "epoch": 43.25181598062954, + "grad_norm": 5.1546729082474485e-05, + "learning_rate": 0.26475422852185, + "loss": 0.0097, + "num_input_tokens_seen": 15265664, + "step": 8910 + }, + { + "epoch": 43.276029055690074, + "grad_norm": 0.008981681428849697, + "learning_rate": 0.2647162851322196, + "loss": 0.0281, + "num_input_tokens_seen": 15274144, + "step": 8915 + }, + { + "epoch": 43.300242130750604, + "grad_norm": 0.0007225500885397196, + "learning_rate": 0.2646783240518964, + "loss": 0.0018, + "num_input_tokens_seen": 15282560, + "step": 8920 + }, + { + "epoch": 43.32445520581114, + "grad_norm": 6.842490984126925e-05, + "learning_rate": 0.26464034528673447, + "loss": 0.0079, + "num_input_tokens_seen": 15290848, + "step": 8925 + }, + { + "epoch": 43.34866828087167, + "grad_norm": 0.000614279939327389, + "learning_rate": 0.26460234884259065, + "loss": 0.0123, + "num_input_tokens_seen": 15299520, + "step": 8930 + }, + { + "epoch": 43.3728813559322, + "grad_norm": 0.0009266952401958406, + "learning_rate": 0.2645643347253245, + "loss": 0.0033, + "num_input_tokens_seen": 15307936, + "step": 8935 + }, + { + "epoch": 43.39709443099274, + "grad_norm": 0.001146232825703919, + "learning_rate": 0.2645263029407982, + "loss": 0.0064, + "num_input_tokens_seen": 15316576, + "step": 8940 + }, + { + "epoch": 43.42130750605327, + "grad_norm": 0.0021249488927423954, + "learning_rate": 0.2644882534948767, + "loss": 0.0083, + "num_input_tokens_seen": 15324864, + "step": 8945 + }, + { + "epoch": 43.445520581113804, + "grad_norm": 0.0036599477753043175, + "learning_rate": 0.2644501863934278, + "loss": 0.003, + "num_input_tokens_seen": 15333632, + "step": 8950 + }, + { + "epoch": 43.469733656174334, + "grad_norm": 0.006517655681818724, + "learning_rate": 0.26441210164232193, + "loss": 0.0072, + "num_input_tokens_seen": 15341760, + "step": 8955 + }, + { + "epoch": 43.493946731234864, + "grad_norm": 0.0013988050632178783, + "learning_rate": 0.26437399924743216, + "loss": 0.007, + "num_input_tokens_seen": 15350368, + "step": 8960 + }, + { + "epoch": 43.5181598062954, + "grad_norm": 0.010009031742811203, + "learning_rate": 0.26433587921463436, + "loss": 0.0178, + "num_input_tokens_seen": 15358656, + "step": 8965 + }, + { + "epoch": 43.54237288135593, + "grad_norm": 0.0007642245618626475, + "learning_rate": 0.2642977415498072, + "loss": 0.0029, + "num_input_tokens_seen": 15367296, + "step": 8970 + }, + { + "epoch": 43.56658595641647, + "grad_norm": 0.00042224806384183466, + "learning_rate": 0.26425958625883195, + "loss": 0.0065, + "num_input_tokens_seen": 15375712, + "step": 8975 + }, + { + "epoch": 43.590799031477, + "grad_norm": 0.002382000209763646, + "learning_rate": 0.2642214133475926, + "loss": 0.015, + "num_input_tokens_seen": 15384064, + "step": 8980 + }, + { + "epoch": 43.61501210653753, + "grad_norm": 0.0007160938694141805, + "learning_rate": 0.26418322282197587, + "loss": 0.0048, + "num_input_tokens_seen": 15392448, + "step": 8985 + }, + { + "epoch": 43.639225181598064, + "grad_norm": 0.0011403086828067899, + "learning_rate": 0.2641450146878714, + "loss": 0.0088, + "num_input_tokens_seen": 15400800, + "step": 8990 + }, + { + "epoch": 43.663438256658594, + "grad_norm": 0.003497854806482792, + "learning_rate": 0.26410678895117107, + "loss": 0.0034, + "num_input_tokens_seen": 15409408, + "step": 8995 + }, + { + "epoch": 43.68765133171913, + "grad_norm": 0.0005289507680572569, + "learning_rate": 0.26406854561777, + "loss": 0.0036, + "num_input_tokens_seen": 15418176, + "step": 9000 + }, + { + "epoch": 43.68765133171913, + "eval_loss": 0.4884087145328522, + "eval_runtime": 4.6123, + "eval_samples_per_second": 79.57, + "eval_steps_per_second": 19.947, + "num_input_tokens_seen": 15418176, + "step": 9000 + }, + { + "epoch": 43.71186440677966, + "grad_norm": 0.0003283753467258066, + "learning_rate": 0.26403028469356576, + "loss": 0.0121, + "num_input_tokens_seen": 15426784, + "step": 9005 + }, + { + "epoch": 43.73607748184019, + "grad_norm": 0.00852760300040245, + "learning_rate": 0.2639920061844585, + "loss": 0.0135, + "num_input_tokens_seen": 15435744, + "step": 9010 + }, + { + "epoch": 43.76029055690073, + "grad_norm": 0.004267589189112186, + "learning_rate": 0.2639537100963515, + "loss": 0.0117, + "num_input_tokens_seen": 15444480, + "step": 9015 + }, + { + "epoch": 43.78450363196126, + "grad_norm": 0.007300990168005228, + "learning_rate": 0.26391539643515033, + "loss": 0.0044, + "num_input_tokens_seen": 15452864, + "step": 9020 + }, + { + "epoch": 43.808716707021794, + "grad_norm": 4.6275647036964074e-05, + "learning_rate": 0.26387706520676346, + "loss": 0.001, + "num_input_tokens_seen": 15461440, + "step": 9025 + }, + { + "epoch": 43.832929782082324, + "grad_norm": 0.0034188448917120695, + "learning_rate": 0.26383871641710205, + "loss": 0.0028, + "num_input_tokens_seen": 15469920, + "step": 9030 + }, + { + "epoch": 43.857142857142854, + "grad_norm": 0.0025623685214668512, + "learning_rate": 0.26380035007208, + "loss": 0.0013, + "num_input_tokens_seen": 15478368, + "step": 9035 + }, + { + "epoch": 43.88135593220339, + "grad_norm": 0.00013112953456584364, + "learning_rate": 0.26376196617761394, + "loss": 0.0008, + "num_input_tokens_seen": 15486880, + "step": 9040 + }, + { + "epoch": 43.90556900726392, + "grad_norm": 1.7153617591247894e-05, + "learning_rate": 0.263723564739623, + "loss": 0.0202, + "num_input_tokens_seen": 15495488, + "step": 9045 + }, + { + "epoch": 43.92978208232446, + "grad_norm": 0.009673204272985458, + "learning_rate": 0.2636851457640293, + "loss": 0.0173, + "num_input_tokens_seen": 15504384, + "step": 9050 + }, + { + "epoch": 43.95399515738499, + "grad_norm": 0.0014878656947985291, + "learning_rate": 0.26364670925675737, + "loss": 0.038, + "num_input_tokens_seen": 15513120, + "step": 9055 + }, + { + "epoch": 43.97820823244552, + "grad_norm": 0.002607679693028331, + "learning_rate": 0.2636082552237347, + "loss": 0.0254, + "num_input_tokens_seen": 15521760, + "step": 9060 + }, + { + "epoch": 44.00484261501211, + "grad_norm": 0.0004075664619449526, + "learning_rate": 0.26356978367089146, + "loss": 0.0077, + "num_input_tokens_seen": 15530496, + "step": 9065 + }, + { + "epoch": 44.02905569007264, + "grad_norm": 0.008485096506774426, + "learning_rate": 0.26353129460416036, + "loss": 0.0109, + "num_input_tokens_seen": 15539328, + "step": 9070 + }, + { + "epoch": 44.05326876513317, + "grad_norm": 0.00461251474916935, + "learning_rate": 0.2634927880294769, + "loss": 0.0273, + "num_input_tokens_seen": 15547904, + "step": 9075 + }, + { + "epoch": 44.077481840193705, + "grad_norm": 0.0024951142258942127, + "learning_rate": 0.26345426395277927, + "loss": 0.0163, + "num_input_tokens_seen": 15556224, + "step": 9080 + }, + { + "epoch": 44.101694915254235, + "grad_norm": 7.534482574556023e-05, + "learning_rate": 0.2634157223800084, + "loss": 0.0058, + "num_input_tokens_seen": 15564736, + "step": 9085 + }, + { + "epoch": 44.12590799031477, + "grad_norm": 0.004137636628001928, + "learning_rate": 0.26337716331710787, + "loss": 0.0228, + "num_input_tokens_seen": 15573952, + "step": 9090 + }, + { + "epoch": 44.1501210653753, + "grad_norm": 0.003604338737204671, + "learning_rate": 0.2633385867700239, + "loss": 0.0088, + "num_input_tokens_seen": 15582560, + "step": 9095 + }, + { + "epoch": 44.17433414043583, + "grad_norm": 0.00164245895575732, + "learning_rate": 0.2632999927447056, + "loss": 0.0129, + "num_input_tokens_seen": 15591104, + "step": 9100 + }, + { + "epoch": 44.19854721549637, + "grad_norm": 0.00048363450332544744, + "learning_rate": 0.2632613812471046, + "loss": 0.0154, + "num_input_tokens_seen": 15599552, + "step": 9105 + }, + { + "epoch": 44.2227602905569, + "grad_norm": 0.000501013535540551, + "learning_rate": 0.2632227522831753, + "loss": 0.0017, + "num_input_tokens_seen": 15608608, + "step": 9110 + }, + { + "epoch": 44.246973365617436, + "grad_norm": 0.0036238934844732285, + "learning_rate": 0.26318410585887475, + "loss": 0.0074, + "num_input_tokens_seen": 15616800, + "step": 9115 + }, + { + "epoch": 44.271186440677965, + "grad_norm": 0.00010397305595688522, + "learning_rate": 0.2631454419801627, + "loss": 0.0026, + "num_input_tokens_seen": 15625504, + "step": 9120 + }, + { + "epoch": 44.295399515738495, + "grad_norm": 0.005143571645021439, + "learning_rate": 0.2631067606530016, + "loss": 0.016, + "num_input_tokens_seen": 15634144, + "step": 9125 + }, + { + "epoch": 44.31961259079903, + "grad_norm": 0.00480489619076252, + "learning_rate": 0.2630680618833567, + "loss": 0.0147, + "num_input_tokens_seen": 15642912, + "step": 9130 + }, + { + "epoch": 44.34382566585956, + "grad_norm": 0.00035806652158498764, + "learning_rate": 0.26302934567719566, + "loss": 0.0037, + "num_input_tokens_seen": 15651584, + "step": 9135 + }, + { + "epoch": 44.3680387409201, + "grad_norm": 0.0008176146657206118, + "learning_rate": 0.2629906120404892, + "loss": 0.0037, + "num_input_tokens_seen": 15660512, + "step": 9140 + }, + { + "epoch": 44.39225181598063, + "grad_norm": 0.0040468089282512665, + "learning_rate": 0.26295186097921036, + "loss": 0.0148, + "num_input_tokens_seen": 15669216, + "step": 9145 + }, + { + "epoch": 44.416464891041166, + "grad_norm": 0.008107698522508144, + "learning_rate": 0.2629130924993351, + "loss": 0.0133, + "num_input_tokens_seen": 15677664, + "step": 9150 + }, + { + "epoch": 44.440677966101696, + "grad_norm": 0.008652489632368088, + "learning_rate": 0.2628743066068421, + "loss": 0.0102, + "num_input_tokens_seen": 15686272, + "step": 9155 + }, + { + "epoch": 44.464891041162225, + "grad_norm": 0.002938736928626895, + "learning_rate": 0.26283550330771244, + "loss": 0.0082, + "num_input_tokens_seen": 15694880, + "step": 9160 + }, + { + "epoch": 44.48910411622276, + "grad_norm": 0.00021845127048436552, + "learning_rate": 0.2627966826079303, + "loss": 0.0104, + "num_input_tokens_seen": 15703584, + "step": 9165 + }, + { + "epoch": 44.51331719128329, + "grad_norm": 0.005228742025792599, + "learning_rate": 0.26275784451348216, + "loss": 0.0191, + "num_input_tokens_seen": 15711968, + "step": 9170 + }, + { + "epoch": 44.53753026634383, + "grad_norm": 0.0007439465261995792, + "learning_rate": 0.2627189890303574, + "loss": 0.0016, + "num_input_tokens_seen": 15720832, + "step": 9175 + }, + { + "epoch": 44.56174334140436, + "grad_norm": 0.0007456416497007012, + "learning_rate": 0.262680116164548, + "loss": 0.0101, + "num_input_tokens_seen": 15728864, + "step": 9180 + }, + { + "epoch": 44.58595641646489, + "grad_norm": 0.0007671202183701098, + "learning_rate": 0.2626412259220487, + "loss": 0.0181, + "num_input_tokens_seen": 15737728, + "step": 9185 + }, + { + "epoch": 44.610169491525426, + "grad_norm": 0.0007578240474686027, + "learning_rate": 0.2626023183088568, + "loss": 0.0113, + "num_input_tokens_seen": 15746176, + "step": 9190 + }, + { + "epoch": 44.634382566585955, + "grad_norm": 0.005933685228228569, + "learning_rate": 0.26256339333097234, + "loss": 0.0112, + "num_input_tokens_seen": 15754656, + "step": 9195 + }, + { + "epoch": 44.65859564164649, + "grad_norm": 0.000278741616057232, + "learning_rate": 0.2625244509943981, + "loss": 0.0258, + "num_input_tokens_seen": 15762912, + "step": 9200 + }, + { + "epoch": 44.65859564164649, + "eval_loss": 0.5241928696632385, + "eval_runtime": 4.6346, + "eval_samples_per_second": 79.188, + "eval_steps_per_second": 19.851, + "num_input_tokens_seen": 15762912, + "step": 9200 + }, + { + "epoch": 44.68280871670702, + "grad_norm": 0.0026850763242691755, + "learning_rate": 0.2624854913051395, + "loss": 0.0144, + "num_input_tokens_seen": 15771456, + "step": 9205 + }, + { + "epoch": 44.70702179176755, + "grad_norm": 0.007305624894797802, + "learning_rate": 0.26244651426920446, + "loss": 0.0285, + "num_input_tokens_seen": 15780192, + "step": 9210 + }, + { + "epoch": 44.73123486682809, + "grad_norm": 0.01051029097288847, + "learning_rate": 0.26240751989260386, + "loss": 0.0647, + "num_input_tokens_seen": 15788704, + "step": 9215 + }, + { + "epoch": 44.75544794188862, + "grad_norm": 0.006184750236570835, + "learning_rate": 0.2623685081813511, + "loss": 0.0641, + "num_input_tokens_seen": 15797120, + "step": 9220 + }, + { + "epoch": 44.779661016949156, + "grad_norm": 0.004658713936805725, + "learning_rate": 0.2623294791414623, + "loss": 0.0254, + "num_input_tokens_seen": 15805504, + "step": 9225 + }, + { + "epoch": 44.803874092009686, + "grad_norm": 0.002697985852137208, + "learning_rate": 0.26229043277895614, + "loss": 0.0222, + "num_input_tokens_seen": 15813600, + "step": 9230 + }, + { + "epoch": 44.828087167070215, + "grad_norm": 0.0014484993880614638, + "learning_rate": 0.2622513690998542, + "loss": 0.0332, + "num_input_tokens_seen": 15822336, + "step": 9235 + }, + { + "epoch": 44.85230024213075, + "grad_norm": 0.0010297225089743733, + "learning_rate": 0.26221228811018044, + "loss": 0.0143, + "num_input_tokens_seen": 15830528, + "step": 9240 + }, + { + "epoch": 44.87651331719128, + "grad_norm": 0.002584264613687992, + "learning_rate": 0.2621731898159617, + "loss": 0.023, + "num_input_tokens_seen": 15838912, + "step": 9245 + }, + { + "epoch": 44.90072639225182, + "grad_norm": 0.0014964918373152614, + "learning_rate": 0.26213407422322743, + "loss": 0.0243, + "num_input_tokens_seen": 15847136, + "step": 9250 + }, + { + "epoch": 44.92493946731235, + "grad_norm": 0.008775824680924416, + "learning_rate": 0.2620949413380098, + "loss": 0.0137, + "num_input_tokens_seen": 15856192, + "step": 9255 + }, + { + "epoch": 44.94915254237288, + "grad_norm": 0.0004765403282362968, + "learning_rate": 0.26205579116634353, + "loss": 0.0155, + "num_input_tokens_seen": 15865024, + "step": 9260 + }, + { + "epoch": 44.973365617433416, + "grad_norm": 0.004927963484078646, + "learning_rate": 0.26201662371426604, + "loss": 0.0223, + "num_input_tokens_seen": 15873568, + "step": 9265 + }, + { + "epoch": 44.997578692493946, + "grad_norm": 0.00018563741468824446, + "learning_rate": 0.2619774389878175, + "loss": 0.0107, + "num_input_tokens_seen": 15881696, + "step": 9270 + }, + { + "epoch": 45.02421307506053, + "grad_norm": 0.004126290790736675, + "learning_rate": 0.2619382369930407, + "loss": 0.0033, + "num_input_tokens_seen": 15890848, + "step": 9275 + }, + { + "epoch": 45.04842615012107, + "grad_norm": 0.005374103784561157, + "learning_rate": 0.261899017735981, + "loss": 0.0101, + "num_input_tokens_seen": 15899776, + "step": 9280 + }, + { + "epoch": 45.0726392251816, + "grad_norm": 0.007244367152452469, + "learning_rate": 0.2618597812226866, + "loss": 0.0104, + "num_input_tokens_seen": 15908288, + "step": 9285 + }, + { + "epoch": 45.09685230024213, + "grad_norm": 0.001787935383617878, + "learning_rate": 0.2618205274592082, + "loss": 0.0028, + "num_input_tokens_seen": 15917152, + "step": 9290 + }, + { + "epoch": 45.12106537530266, + "grad_norm": 0.01054916437715292, + "learning_rate": 0.2617812564515992, + "loss": 0.0086, + "num_input_tokens_seen": 15925408, + "step": 9295 + }, + { + "epoch": 45.14527845036319, + "grad_norm": 0.00010270014172419906, + "learning_rate": 0.2617419682059158, + "loss": 0.0031, + "num_input_tokens_seen": 15934016, + "step": 9300 + }, + { + "epoch": 45.16949152542373, + "grad_norm": 0.0015085522318258882, + "learning_rate": 0.26170266272821663, + "loss": 0.0056, + "num_input_tokens_seen": 15943072, + "step": 9305 + }, + { + "epoch": 45.19370460048426, + "grad_norm": 0.0015533772530034184, + "learning_rate": 0.26166334002456315, + "loss": 0.006, + "num_input_tokens_seen": 15951584, + "step": 9310 + }, + { + "epoch": 45.2179176755448, + "grad_norm": 0.00024463157751597464, + "learning_rate": 0.2616240001010194, + "loss": 0.0046, + "num_input_tokens_seen": 15960512, + "step": 9315 + }, + { + "epoch": 45.24213075060533, + "grad_norm": 0.015005193650722504, + "learning_rate": 0.26158464296365197, + "loss": 0.0264, + "num_input_tokens_seen": 15969536, + "step": 9320 + }, + { + "epoch": 45.26634382566586, + "grad_norm": 0.00019550645083654672, + "learning_rate": 0.2615452686185304, + "loss": 0.0099, + "num_input_tokens_seen": 15978176, + "step": 9325 + }, + { + "epoch": 45.29055690072639, + "grad_norm": 0.001565192942507565, + "learning_rate": 0.26150587707172673, + "loss": 0.0083, + "num_input_tokens_seen": 15986720, + "step": 9330 + }, + { + "epoch": 45.31476997578692, + "grad_norm": 0.00017152732471004128, + "learning_rate": 0.2614664683293154, + "loss": 0.0024, + "num_input_tokens_seen": 15995136, + "step": 9335 + }, + { + "epoch": 45.33898305084746, + "grad_norm": 0.0060538966208696365, + "learning_rate": 0.26142704239737397, + "loss": 0.0058, + "num_input_tokens_seen": 16003648, + "step": 9340 + }, + { + "epoch": 45.36319612590799, + "grad_norm": 0.0025277372915297747, + "learning_rate": 0.26138759928198235, + "loss": 0.0052, + "num_input_tokens_seen": 16012128, + "step": 9345 + }, + { + "epoch": 45.38740920096852, + "grad_norm": 0.0003232191957067698, + "learning_rate": 0.26134813898922304, + "loss": 0.0018, + "num_input_tokens_seen": 16020672, + "step": 9350 + }, + { + "epoch": 45.41162227602906, + "grad_norm": 0.007617026101797819, + "learning_rate": 0.26130866152518145, + "loss": 0.0225, + "num_input_tokens_seen": 16029440, + "step": 9355 + }, + { + "epoch": 45.43583535108959, + "grad_norm": 0.0016912169521674514, + "learning_rate": 0.2612691668959455, + "loss": 0.0096, + "num_input_tokens_seen": 16037888, + "step": 9360 + }, + { + "epoch": 45.460048426150124, + "grad_norm": 0.011366824619472027, + "learning_rate": 0.2612296551076057, + "loss": 0.0306, + "num_input_tokens_seen": 16046272, + "step": 9365 + }, + { + "epoch": 45.48426150121065, + "grad_norm": 0.0024063887540251017, + "learning_rate": 0.26119012616625525, + "loss": 0.0424, + "num_input_tokens_seen": 16054688, + "step": 9370 + }, + { + "epoch": 45.50847457627118, + "grad_norm": 0.0002505441661924124, + "learning_rate": 0.26115058007799, + "loss": 0.0164, + "num_input_tokens_seen": 16063200, + "step": 9375 + }, + { + "epoch": 45.53268765133172, + "grad_norm": 0.005725615192204714, + "learning_rate": 0.26111101684890864, + "loss": 0.033, + "num_input_tokens_seen": 16071520, + "step": 9380 + }, + { + "epoch": 45.55690072639225, + "grad_norm": 0.0007801649626344442, + "learning_rate": 0.26107143648511205, + "loss": 0.0371, + "num_input_tokens_seen": 16079936, + "step": 9385 + }, + { + "epoch": 45.58111380145279, + "grad_norm": 0.0013481519417837262, + "learning_rate": 0.2610318389927042, + "loss": 0.0301, + "num_input_tokens_seen": 16088704, + "step": 9390 + }, + { + "epoch": 45.60532687651332, + "grad_norm": 0.0013883349020034075, + "learning_rate": 0.26099222437779146, + "loss": 0.0171, + "num_input_tokens_seen": 16096896, + "step": 9395 + }, + { + "epoch": 45.62953995157385, + "grad_norm": 0.004324445966631174, + "learning_rate": 0.26095259264648285, + "loss": 0.0188, + "num_input_tokens_seen": 16105760, + "step": 9400 + }, + { + "epoch": 45.62953995157385, + "eval_loss": 0.4023173749446869, + "eval_runtime": 4.623, + "eval_samples_per_second": 79.385, + "eval_steps_per_second": 19.9, + "num_input_tokens_seen": 16105760, + "step": 9400 + }, + { + "epoch": 45.653753026634384, + "grad_norm": 0.0007557587814517319, + "learning_rate": 0.2609129438048902, + "loss": 0.0048, + "num_input_tokens_seen": 16113920, + "step": 9405 + }, + { + "epoch": 45.67796610169491, + "grad_norm": 0.004602430853992701, + "learning_rate": 0.2608732778591278, + "loss": 0.0261, + "num_input_tokens_seen": 16122720, + "step": 9410 + }, + { + "epoch": 45.70217917675545, + "grad_norm": 0.008368156850337982, + "learning_rate": 0.2608335948153126, + "loss": 0.0149, + "num_input_tokens_seen": 16131360, + "step": 9415 + }, + { + "epoch": 45.72639225181598, + "grad_norm": 0.00011811752483481541, + "learning_rate": 0.26079389467956426, + "loss": 0.0232, + "num_input_tokens_seen": 16139840, + "step": 9420 + }, + { + "epoch": 45.75060532687651, + "grad_norm": 0.00257821730338037, + "learning_rate": 0.26075417745800505, + "loss": 0.0086, + "num_input_tokens_seen": 16148448, + "step": 9425 + }, + { + "epoch": 45.77481840193705, + "grad_norm": 0.00203947932459414, + "learning_rate": 0.26071444315675985, + "loss": 0.0204, + "num_input_tokens_seen": 16157088, + "step": 9430 + }, + { + "epoch": 45.79903147699758, + "grad_norm": 0.0005688505480065942, + "learning_rate": 0.2606746917819562, + "loss": 0.0164, + "num_input_tokens_seen": 16165792, + "step": 9435 + }, + { + "epoch": 45.823244552058114, + "grad_norm": 0.0011968129547312856, + "learning_rate": 0.2606349233397242, + "loss": 0.024, + "num_input_tokens_seen": 16174656, + "step": 9440 + }, + { + "epoch": 45.847457627118644, + "grad_norm": 0.0002711242123041302, + "learning_rate": 0.26059513783619676, + "loss": 0.0055, + "num_input_tokens_seen": 16182848, + "step": 9445 + }, + { + "epoch": 45.87167070217917, + "grad_norm": 0.0012912789825350046, + "learning_rate": 0.26055533527750924, + "loss": 0.0156, + "num_input_tokens_seen": 16191552, + "step": 9450 + }, + { + "epoch": 45.89588377723971, + "grad_norm": 0.014430168084800243, + "learning_rate": 0.26051551566979964, + "loss": 0.0355, + "num_input_tokens_seen": 16200128, + "step": 9455 + }, + { + "epoch": 45.92009685230024, + "grad_norm": 0.0007770861266180873, + "learning_rate": 0.26047567901920876, + "loss": 0.0429, + "num_input_tokens_seen": 16208800, + "step": 9460 + }, + { + "epoch": 45.94430992736078, + "grad_norm": 0.00974811427295208, + "learning_rate": 0.2604358253318798, + "loss": 0.0437, + "num_input_tokens_seen": 16217184, + "step": 9465 + }, + { + "epoch": 45.96852300242131, + "grad_norm": 0.003547095227986574, + "learning_rate": 0.26039595461395876, + "loss": 0.0309, + "num_input_tokens_seen": 16225760, + "step": 9470 + }, + { + "epoch": 45.99273607748184, + "grad_norm": 0.0027205816004425287, + "learning_rate": 0.26035606687159424, + "loss": 0.0533, + "num_input_tokens_seen": 16234336, + "step": 9475 + }, + { + "epoch": 46.01937046004843, + "grad_norm": 0.004148026462644339, + "learning_rate": 0.26031616211093733, + "loss": 0.0123, + "num_input_tokens_seen": 16243072, + "step": 9480 + }, + { + "epoch": 46.04358353510896, + "grad_norm": 0.0001072903469321318, + "learning_rate": 0.26027624033814195, + "loss": 0.0113, + "num_input_tokens_seen": 16251360, + "step": 9485 + }, + { + "epoch": 46.067796610169495, + "grad_norm": 0.002195350592955947, + "learning_rate": 0.2602363015593645, + "loss": 0.0055, + "num_input_tokens_seen": 16260064, + "step": 9490 + }, + { + "epoch": 46.092009685230025, + "grad_norm": 0.004592906218022108, + "learning_rate": 0.26019634578076395, + "loss": 0.011, + "num_input_tokens_seen": 16268800, + "step": 9495 + }, + { + "epoch": 46.116222760290555, + "grad_norm": 0.007324582897126675, + "learning_rate": 0.26015637300850214, + "loss": 0.0168, + "num_input_tokens_seen": 16277376, + "step": 9500 + }, + { + "epoch": 46.14043583535109, + "grad_norm": 0.002864883979782462, + "learning_rate": 0.26011638324874325, + "loss": 0.0088, + "num_input_tokens_seen": 16286144, + "step": 9505 + }, + { + "epoch": 46.16464891041162, + "grad_norm": 0.008649916388094425, + "learning_rate": 0.2600763765076543, + "loss": 0.0133, + "num_input_tokens_seen": 16294304, + "step": 9510 + }, + { + "epoch": 46.18886198547216, + "grad_norm": 0.01306819822639227, + "learning_rate": 0.2600363527914048, + "loss": 0.0154, + "num_input_tokens_seen": 16302752, + "step": 9515 + }, + { + "epoch": 46.21307506053269, + "grad_norm": 0.000886746624018997, + "learning_rate": 0.25999631210616686, + "loss": 0.0122, + "num_input_tokens_seen": 16310848, + "step": 9520 + }, + { + "epoch": 46.23728813559322, + "grad_norm": 0.0020341111812740564, + "learning_rate": 0.25995625445811527, + "loss": 0.0059, + "num_input_tokens_seen": 16319488, + "step": 9525 + }, + { + "epoch": 46.261501210653755, + "grad_norm": 0.0003234792675357312, + "learning_rate": 0.2599161798534275, + "loss": 0.0226, + "num_input_tokens_seen": 16327904, + "step": 9530 + }, + { + "epoch": 46.285714285714285, + "grad_norm": 0.01668827049434185, + "learning_rate": 0.25987608829828346, + "loss": 0.0342, + "num_input_tokens_seen": 16336576, + "step": 9535 + }, + { + "epoch": 46.30992736077482, + "grad_norm": 0.001778596080839634, + "learning_rate": 0.25983597979886586, + "loss": 0.0159, + "num_input_tokens_seen": 16345088, + "step": 9540 + }, + { + "epoch": 46.33414043583535, + "grad_norm": 0.009621615521609783, + "learning_rate": 0.2597958543613599, + "loss": 0.036, + "num_input_tokens_seen": 16353824, + "step": 9545 + }, + { + "epoch": 46.35835351089588, + "grad_norm": 0.013110878877341747, + "learning_rate": 0.25975571199195335, + "loss": 0.0205, + "num_input_tokens_seen": 16362112, + "step": 9550 + }, + { + "epoch": 46.38256658595642, + "grad_norm": 0.0043246448040008545, + "learning_rate": 0.25971555269683677, + "loss": 0.0202, + "num_input_tokens_seen": 16370944, + "step": 9555 + }, + { + "epoch": 46.40677966101695, + "grad_norm": 0.0034426439087837934, + "learning_rate": 0.25967537648220324, + "loss": 0.0142, + "num_input_tokens_seen": 16379328, + "step": 9560 + }, + { + "epoch": 46.430992736077485, + "grad_norm": 0.006378095597028732, + "learning_rate": 0.2596351833542483, + "loss": 0.0372, + "num_input_tokens_seen": 16387904, + "step": 9565 + }, + { + "epoch": 46.455205811138015, + "grad_norm": 0.005538501776754856, + "learning_rate": 0.25959497331917036, + "loss": 0.0252, + "num_input_tokens_seen": 16396192, + "step": 9570 + }, + { + "epoch": 46.479418886198545, + "grad_norm": 0.0008844386320561171, + "learning_rate": 0.2595547463831703, + "loss": 0.039, + "num_input_tokens_seen": 16404576, + "step": 9575 + }, + { + "epoch": 46.50363196125908, + "grad_norm": 0.00069190509384498, + "learning_rate": 0.25951450255245156, + "loss": 0.0359, + "num_input_tokens_seen": 16413824, + "step": 9580 + }, + { + "epoch": 46.52784503631961, + "grad_norm": 0.009246356785297394, + "learning_rate": 0.2594742418332203, + "loss": 0.0255, + "num_input_tokens_seen": 16422464, + "step": 9585 + }, + { + "epoch": 46.55205811138015, + "grad_norm": 0.005170043557882309, + "learning_rate": 0.2594339642316852, + "loss": 0.0164, + "num_input_tokens_seen": 16431040, + "step": 9590 + }, + { + "epoch": 46.57627118644068, + "grad_norm": 0.0024902760051190853, + "learning_rate": 0.2593936697540576, + "loss": 0.0243, + "num_input_tokens_seen": 16439680, + "step": 9595 + }, + { + "epoch": 46.60048426150121, + "grad_norm": 0.005000267177820206, + "learning_rate": 0.2593533584065514, + "loss": 0.0254, + "num_input_tokens_seen": 16448096, + "step": 9600 + }, + { + "epoch": 46.60048426150121, + "eval_loss": 0.4033198058605194, + "eval_runtime": 4.6142, + "eval_samples_per_second": 79.537, + "eval_steps_per_second": 19.939, + "num_input_tokens_seen": 16448096, + "step": 9600 + }, + { + "epoch": 46.624697336561745, + "grad_norm": 0.0042776986956596375, + "learning_rate": 0.2593130301953831, + "loss": 0.0363, + "num_input_tokens_seen": 16456544, + "step": 9605 + }, + { + "epoch": 46.648910411622275, + "grad_norm": 0.0009380117407999933, + "learning_rate": 0.2592726851267718, + "loss": 0.0244, + "num_input_tokens_seen": 16464640, + "step": 9610 + }, + { + "epoch": 46.67312348668281, + "grad_norm": 0.0037351297214627266, + "learning_rate": 0.2592323232069393, + "loss": 0.0445, + "num_input_tokens_seen": 16473408, + "step": 9615 + }, + { + "epoch": 46.69733656174334, + "grad_norm": 0.0009564004139974713, + "learning_rate": 0.25919194444210986, + "loss": 0.011, + "num_input_tokens_seen": 16481728, + "step": 9620 + }, + { + "epoch": 46.72154963680387, + "grad_norm": 0.002822602167725563, + "learning_rate": 0.2591515488385103, + "loss": 0.0245, + "num_input_tokens_seen": 16490144, + "step": 9625 + }, + { + "epoch": 46.74576271186441, + "grad_norm": 0.0055653415620327, + "learning_rate": 0.2591111364023704, + "loss": 0.0252, + "num_input_tokens_seen": 16498656, + "step": 9630 + }, + { + "epoch": 46.76997578692494, + "grad_norm": 0.007214772514998913, + "learning_rate": 0.259070707139922, + "loss": 0.0267, + "num_input_tokens_seen": 16507040, + "step": 9635 + }, + { + "epoch": 46.794188861985475, + "grad_norm": 0.008964958600699902, + "learning_rate": 0.25903026105739985, + "loss": 0.0199, + "num_input_tokens_seen": 16515424, + "step": 9640 + }, + { + "epoch": 46.818401937046005, + "grad_norm": 0.000311787414830178, + "learning_rate": 0.2589897981610413, + "loss": 0.0158, + "num_input_tokens_seen": 16524160, + "step": 9645 + }, + { + "epoch": 46.842615012106535, + "grad_norm": 0.008951524272561073, + "learning_rate": 0.2589493184570863, + "loss": 0.0483, + "num_input_tokens_seen": 16532736, + "step": 9650 + }, + { + "epoch": 46.86682808716707, + "grad_norm": 0.0006592901772819459, + "learning_rate": 0.25890882195177717, + "loss": 0.0133, + "num_input_tokens_seen": 16541216, + "step": 9655 + }, + { + "epoch": 46.8910411622276, + "grad_norm": 0.00029741323669441044, + "learning_rate": 0.25886830865135907, + "loss": 0.023, + "num_input_tokens_seen": 16550112, + "step": 9660 + }, + { + "epoch": 46.91525423728814, + "grad_norm": 6.915661651873961e-05, + "learning_rate": 0.25882777856207967, + "loss": 0.0047, + "num_input_tokens_seen": 16558528, + "step": 9665 + }, + { + "epoch": 46.93946731234867, + "grad_norm": 0.001473527285270393, + "learning_rate": 0.2587872316901892, + "loss": 0.0155, + "num_input_tokens_seen": 16567072, + "step": 9670 + }, + { + "epoch": 46.9636803874092, + "grad_norm": 0.001100907800719142, + "learning_rate": 0.25874666804194046, + "loss": 0.0104, + "num_input_tokens_seen": 16575968, + "step": 9675 + }, + { + "epoch": 46.987893462469735, + "grad_norm": 0.0034573518205434084, + "learning_rate": 0.258706087623589, + "loss": 0.0054, + "num_input_tokens_seen": 16584768, + "step": 9680 + }, + { + "epoch": 47.01452784503632, + "grad_norm": 0.00037197352503426373, + "learning_rate": 0.25866549044139264, + "loss": 0.0054, + "num_input_tokens_seen": 16593440, + "step": 9685 + }, + { + "epoch": 47.03874092009685, + "grad_norm": 0.0031259097158908844, + "learning_rate": 0.25862487650161214, + "loss": 0.0177, + "num_input_tokens_seen": 16601792, + "step": 9690 + }, + { + "epoch": 47.062953995157386, + "grad_norm": 0.00014030461898073554, + "learning_rate": 0.2585842458105106, + "loss": 0.0009, + "num_input_tokens_seen": 16610464, + "step": 9695 + }, + { + "epoch": 47.087167070217916, + "grad_norm": 0.0027199634350836277, + "learning_rate": 0.2585435983743538, + "loss": 0.0042, + "num_input_tokens_seen": 16618976, + "step": 9700 + }, + { + "epoch": 47.11138014527845, + "grad_norm": 0.001732176635414362, + "learning_rate": 0.2585029341994101, + "loss": 0.0078, + "num_input_tokens_seen": 16627200, + "step": 9705 + }, + { + "epoch": 47.13559322033898, + "grad_norm": 0.0005859534721821547, + "learning_rate": 0.2584622532919504, + "loss": 0.0035, + "num_input_tokens_seen": 16635648, + "step": 9710 + }, + { + "epoch": 47.15980629539951, + "grad_norm": 0.00014966845628805459, + "learning_rate": 0.2584215556582482, + "loss": 0.0011, + "num_input_tokens_seen": 16644032, + "step": 9715 + }, + { + "epoch": 47.18401937046005, + "grad_norm": 0.00041493913158774376, + "learning_rate": 0.25838084130457967, + "loss": 0.0019, + "num_input_tokens_seen": 16652768, + "step": 9720 + }, + { + "epoch": 47.20823244552058, + "grad_norm": 0.0004093059978913516, + "learning_rate": 0.2583401102372234, + "loss": 0.0031, + "num_input_tokens_seen": 16661504, + "step": 9725 + }, + { + "epoch": 47.232445520581116, + "grad_norm": 4.982122845831327e-05, + "learning_rate": 0.2582993624624606, + "loss": 0.0067, + "num_input_tokens_seen": 16670048, + "step": 9730 + }, + { + "epoch": 47.256658595641646, + "grad_norm": 0.0009135808795690536, + "learning_rate": 0.25825859798657513, + "loss": 0.0121, + "num_input_tokens_seen": 16678624, + "step": 9735 + }, + { + "epoch": 47.280871670702176, + "grad_norm": 0.004037138074636459, + "learning_rate": 0.25821781681585343, + "loss": 0.0151, + "num_input_tokens_seen": 16687328, + "step": 9740 + }, + { + "epoch": 47.30508474576271, + "grad_norm": 0.0004140683449804783, + "learning_rate": 0.2581770189565844, + "loss": 0.0083, + "num_input_tokens_seen": 16695488, + "step": 9745 + }, + { + "epoch": 47.32929782082324, + "grad_norm": 0.00018516683485358953, + "learning_rate": 0.25813620441505963, + "loss": 0.0034, + "num_input_tokens_seen": 16703968, + "step": 9750 + }, + { + "epoch": 47.35351089588378, + "grad_norm": 0.00013679151015821844, + "learning_rate": 0.2580953731975732, + "loss": 0.0067, + "num_input_tokens_seen": 16712320, + "step": 9755 + }, + { + "epoch": 47.37772397094431, + "grad_norm": 0.0021518217399716377, + "learning_rate": 0.2580545253104218, + "loss": 0.0198, + "num_input_tokens_seen": 16721216, + "step": 9760 + }, + { + "epoch": 47.40193704600484, + "grad_norm": 0.0005401180824264884, + "learning_rate": 0.2580136607599047, + "loss": 0.0019, + "num_input_tokens_seen": 16729568, + "step": 9765 + }, + { + "epoch": 47.426150121065376, + "grad_norm": 0.003607676364481449, + "learning_rate": 0.2579727795523238, + "loss": 0.0054, + "num_input_tokens_seen": 16738080, + "step": 9770 + }, + { + "epoch": 47.450363196125906, + "grad_norm": 0.001598522998392582, + "learning_rate": 0.25793188169398334, + "loss": 0.0059, + "num_input_tokens_seen": 16746432, + "step": 9775 + }, + { + "epoch": 47.47457627118644, + "grad_norm": 0.0016076606698334217, + "learning_rate": 0.25789096719119037, + "loss": 0.0067, + "num_input_tokens_seen": 16755104, + "step": 9780 + }, + { + "epoch": 47.49878934624697, + "grad_norm": 3.2810334232635796e-05, + "learning_rate": 0.2578500360502544, + "loss": 0.0065, + "num_input_tokens_seen": 16764160, + "step": 9785 + }, + { + "epoch": 47.5230024213075, + "grad_norm": 0.006970714777708054, + "learning_rate": 0.2578090882774876, + "loss": 0.006, + "num_input_tokens_seen": 16772768, + "step": 9790 + }, + { + "epoch": 47.54721549636804, + "grad_norm": 0.0015380722470581532, + "learning_rate": 0.25776812387920456, + "loss": 0.0033, + "num_input_tokens_seen": 16781600, + "step": 9795 + }, + { + "epoch": 47.57142857142857, + "grad_norm": 0.0002537644759286195, + "learning_rate": 0.2577271428617225, + "loss": 0.0109, + "num_input_tokens_seen": 16790336, + "step": 9800 + }, + { + "epoch": 47.57142857142857, + "eval_loss": 0.49081873893737793, + "eval_runtime": 4.6282, + "eval_samples_per_second": 79.296, + "eval_steps_per_second": 19.878, + "num_input_tokens_seen": 16790336, + "step": 9800 + }, + { + "epoch": 47.595641646489106, + "grad_norm": 0.004460226744413376, + "learning_rate": 0.25768614523136124, + "loss": 0.0075, + "num_input_tokens_seen": 16798784, + "step": 9805 + }, + { + "epoch": 47.619854721549636, + "grad_norm": 0.0009707870194688439, + "learning_rate": 0.25764513099444314, + "loss": 0.0018, + "num_input_tokens_seen": 16807616, + "step": 9810 + }, + { + "epoch": 47.644067796610166, + "grad_norm": 0.00490099610760808, + "learning_rate": 0.25760410015729307, + "loss": 0.0192, + "num_input_tokens_seen": 16816000, + "step": 9815 + }, + { + "epoch": 47.6682808716707, + "grad_norm": 0.0009017122210934758, + "learning_rate": 0.2575630527262385, + "loss": 0.0172, + "num_input_tokens_seen": 16824736, + "step": 9820 + }, + { + "epoch": 47.69249394673123, + "grad_norm": 0.0002519864065106958, + "learning_rate": 0.25752198870760945, + "loss": 0.0085, + "num_input_tokens_seen": 16833376, + "step": 9825 + }, + { + "epoch": 47.71670702179177, + "grad_norm": 0.0026380298659205437, + "learning_rate": 0.2574809081077386, + "loss": 0.0023, + "num_input_tokens_seen": 16841984, + "step": 9830 + }, + { + "epoch": 47.7409200968523, + "grad_norm": 0.005795728415250778, + "learning_rate": 0.257439810932961, + "loss": 0.0158, + "num_input_tokens_seen": 16850368, + "step": 9835 + }, + { + "epoch": 47.76513317191284, + "grad_norm": 0.0023059051018208265, + "learning_rate": 0.2573986971896144, + "loss": 0.0231, + "num_input_tokens_seen": 16859040, + "step": 9840 + }, + { + "epoch": 47.789346246973366, + "grad_norm": 0.00395488366484642, + "learning_rate": 0.257357566884039, + "loss": 0.0378, + "num_input_tokens_seen": 16867328, + "step": 9845 + }, + { + "epoch": 47.813559322033896, + "grad_norm": 0.006626340560615063, + "learning_rate": 0.25731642002257765, + "loss": 0.0206, + "num_input_tokens_seen": 16876096, + "step": 9850 + }, + { + "epoch": 47.83777239709443, + "grad_norm": 0.003193723037838936, + "learning_rate": 0.25727525661157574, + "loss": 0.0333, + "num_input_tokens_seen": 16884640, + "step": 9855 + }, + { + "epoch": 47.86198547215496, + "grad_norm": 0.012062108144164085, + "learning_rate": 0.2572340766573811, + "loss": 0.0252, + "num_input_tokens_seen": 16892800, + "step": 9860 + }, + { + "epoch": 47.8861985472155, + "grad_norm": 0.0005270461551845074, + "learning_rate": 0.25719288016634434, + "loss": 0.016, + "num_input_tokens_seen": 16901280, + "step": 9865 + }, + { + "epoch": 47.91041162227603, + "grad_norm": 0.003954360727220774, + "learning_rate": 0.25715166714481835, + "loss": 0.0097, + "num_input_tokens_seen": 16910112, + "step": 9870 + }, + { + "epoch": 47.93462469733656, + "grad_norm": 0.005514764692634344, + "learning_rate": 0.2571104375991587, + "loss": 0.0183, + "num_input_tokens_seen": 16918496, + "step": 9875 + }, + { + "epoch": 47.958837772397096, + "grad_norm": 0.009650176391005516, + "learning_rate": 0.2570691915357236, + "loss": 0.0262, + "num_input_tokens_seen": 16927392, + "step": 9880 + }, + { + "epoch": 47.983050847457626, + "grad_norm": 0.0023683845065534115, + "learning_rate": 0.2570279289608736, + "loss": 0.0209, + "num_input_tokens_seen": 16935936, + "step": 9885 + }, + { + "epoch": 48.00968523002421, + "grad_norm": 0.0006263833492994308, + "learning_rate": 0.256986649880972, + "loss": 0.0094, + "num_input_tokens_seen": 16945216, + "step": 9890 + }, + { + "epoch": 48.03389830508475, + "grad_norm": 0.002294032135978341, + "learning_rate": 0.25694535430238447, + "loss": 0.0055, + "num_input_tokens_seen": 16953632, + "step": 9895 + }, + { + "epoch": 48.05811138014528, + "grad_norm": 0.0006058073486201465, + "learning_rate": 0.25690404223147933, + "loss": 0.0059, + "num_input_tokens_seen": 16962144, + "step": 9900 + }, + { + "epoch": 48.082324455205814, + "grad_norm": 0.0003484050394035876, + "learning_rate": 0.2568627136746275, + "loss": 0.0124, + "num_input_tokens_seen": 16970560, + "step": 9905 + }, + { + "epoch": 48.106537530266344, + "grad_norm": 0.004171490203589201, + "learning_rate": 0.25682136863820226, + "loss": 0.0213, + "num_input_tokens_seen": 16979040, + "step": 9910 + }, + { + "epoch": 48.130750605326874, + "grad_norm": 0.0027880959678441286, + "learning_rate": 0.25678000712857957, + "loss": 0.0078, + "num_input_tokens_seen": 16987712, + "step": 9915 + }, + { + "epoch": 48.15496368038741, + "grad_norm": 0.001046926248818636, + "learning_rate": 0.2567386291521379, + "loss": 0.0031, + "num_input_tokens_seen": 16996192, + "step": 9920 + }, + { + "epoch": 48.17917675544794, + "grad_norm": 0.0005521501298062503, + "learning_rate": 0.2566972347152583, + "loss": 0.0121, + "num_input_tokens_seen": 17004576, + "step": 9925 + }, + { + "epoch": 48.20338983050848, + "grad_norm": 0.0017058058874681592, + "learning_rate": 0.2566558238243242, + "loss": 0.0052, + "num_input_tokens_seen": 17013248, + "step": 9930 + }, + { + "epoch": 48.22760290556901, + "grad_norm": 0.0007137099164538085, + "learning_rate": 0.25661439648572176, + "loss": 0.0068, + "num_input_tokens_seen": 17021792, + "step": 9935 + }, + { + "epoch": 48.25181598062954, + "grad_norm": 0.0005070074112154543, + "learning_rate": 0.25657295270583963, + "loss": 0.0151, + "num_input_tokens_seen": 17030144, + "step": 9940 + }, + { + "epoch": 48.276029055690074, + "grad_norm": 0.006045724265277386, + "learning_rate": 0.25653149249106894, + "loss": 0.0157, + "num_input_tokens_seen": 17038656, + "step": 9945 + }, + { + "epoch": 48.300242130750604, + "grad_norm": 4.2707524698926136e-05, + "learning_rate": 0.25649001584780323, + "loss": 0.0238, + "num_input_tokens_seen": 17047168, + "step": 9950 + }, + { + "epoch": 48.32445520581114, + "grad_norm": 0.00029784563230350614, + "learning_rate": 0.2564485227824389, + "loss": 0.0108, + "num_input_tokens_seen": 17056384, + "step": 9955 + }, + { + "epoch": 48.34866828087167, + "grad_norm": 0.006650278810411692, + "learning_rate": 0.25640701330137466, + "loss": 0.0124, + "num_input_tokens_seen": 17064896, + "step": 9960 + }, + { + "epoch": 48.3728813559322, + "grad_norm": 0.002862063469365239, + "learning_rate": 0.2563654874110117, + "loss": 0.0151, + "num_input_tokens_seen": 17073280, + "step": 9965 + }, + { + "epoch": 48.39709443099274, + "grad_norm": 0.0025958626065403223, + "learning_rate": 0.256323945117754, + "loss": 0.005, + "num_input_tokens_seen": 17081856, + "step": 9970 + }, + { + "epoch": 48.42130750605327, + "grad_norm": 0.007550583221018314, + "learning_rate": 0.2562823864280078, + "loss": 0.019, + "num_input_tokens_seen": 17090400, + "step": 9975 + }, + { + "epoch": 48.445520581113804, + "grad_norm": 9.983385098166764e-05, + "learning_rate": 0.25624081134818194, + "loss": 0.0173, + "num_input_tokens_seen": 17098848, + "step": 9980 + }, + { + "epoch": 48.469733656174334, + "grad_norm": 0.008753548376262188, + "learning_rate": 0.2561992198846879, + "loss": 0.0263, + "num_input_tokens_seen": 17107360, + "step": 9985 + }, + { + "epoch": 48.493946731234864, + "grad_norm": 0.001494420925155282, + "learning_rate": 0.25615761204393955, + "loss": 0.0071, + "num_input_tokens_seen": 17115808, + "step": 9990 + }, + { + "epoch": 48.5181598062954, + "grad_norm": 0.0070576658472418785, + "learning_rate": 0.2561159878323534, + "loss": 0.0331, + "num_input_tokens_seen": 17124224, + "step": 9995 + }, + { + "epoch": 48.54237288135593, + "grad_norm": 0.0007957213674671948, + "learning_rate": 0.2560743472563483, + "loss": 0.0044, + "num_input_tokens_seen": 17132896, + "step": 10000 + }, + { + "epoch": 48.54237288135593, + "eval_loss": 0.43655478954315186, + "eval_runtime": 4.6166, + "eval_samples_per_second": 79.496, + "eval_steps_per_second": 19.928, + "num_input_tokens_seen": 17132896, + "step": 10000 + }, + { + "epoch": 48.56658595641647, + "grad_norm": 0.001939740963280201, + "learning_rate": 0.25603269032234593, + "loss": 0.0063, + "num_input_tokens_seen": 17141408, + "step": 10005 + }, + { + "epoch": 48.590799031477, + "grad_norm": 0.0004981565289199352, + "learning_rate": 0.2559910170367702, + "loss": 0.0106, + "num_input_tokens_seen": 17150080, + "step": 10010 + }, + { + "epoch": 48.61501210653753, + "grad_norm": 0.007795684039592743, + "learning_rate": 0.2559493274060477, + "loss": 0.015, + "num_input_tokens_seen": 17158560, + "step": 10015 + }, + { + "epoch": 48.639225181598064, + "grad_norm": 0.0021578373853117228, + "learning_rate": 0.2559076214366074, + "loss": 0.0133, + "num_input_tokens_seen": 17166752, + "step": 10020 + }, + { + "epoch": 48.663438256658594, + "grad_norm": 0.0039971014484763145, + "learning_rate": 0.25586589913488106, + "loss": 0.016, + "num_input_tokens_seen": 17175584, + "step": 10025 + }, + { + "epoch": 48.68765133171913, + "grad_norm": 0.0015109240775927901, + "learning_rate": 0.2558241605073026, + "loss": 0.0046, + "num_input_tokens_seen": 17184352, + "step": 10030 + }, + { + "epoch": 48.71186440677966, + "grad_norm": 0.00012140044418629259, + "learning_rate": 0.25578240556030873, + "loss": 0.0059, + "num_input_tokens_seen": 17193152, + "step": 10035 + }, + { + "epoch": 48.73607748184019, + "grad_norm": 0.001038125017657876, + "learning_rate": 0.2557406343003386, + "loss": 0.0078, + "num_input_tokens_seen": 17201888, + "step": 10040 + }, + { + "epoch": 48.76029055690073, + "grad_norm": 0.0020188840571790934, + "learning_rate": 0.25569884673383375, + "loss": 0.0086, + "num_input_tokens_seen": 17210688, + "step": 10045 + }, + { + "epoch": 48.78450363196126, + "grad_norm": 0.0010799281299114227, + "learning_rate": 0.25565704286723856, + "loss": 0.0012, + "num_input_tokens_seen": 17219328, + "step": 10050 + }, + { + "epoch": 48.808716707021794, + "grad_norm": 0.0014693833654746413, + "learning_rate": 0.25561522270699955, + "loss": 0.0056, + "num_input_tokens_seen": 17227936, + "step": 10055 + }, + { + "epoch": 48.832929782082324, + "grad_norm": 0.0017318455502390862, + "learning_rate": 0.25557338625956594, + "loss": 0.005, + "num_input_tokens_seen": 17236288, + "step": 10060 + }, + { + "epoch": 48.857142857142854, + "grad_norm": 0.0002813795581459999, + "learning_rate": 0.25553153353138947, + "loss": 0.0069, + "num_input_tokens_seen": 17244896, + "step": 10065 + }, + { + "epoch": 48.88135593220339, + "grad_norm": 0.007563974242657423, + "learning_rate": 0.2554896645289243, + "loss": 0.0154, + "num_input_tokens_seen": 17252928, + "step": 10070 + }, + { + "epoch": 48.90556900726392, + "grad_norm": 0.0027502276934683323, + "learning_rate": 0.2554477792586272, + "loss": 0.0127, + "num_input_tokens_seen": 17261504, + "step": 10075 + }, + { + "epoch": 48.92978208232446, + "grad_norm": 0.000867670401930809, + "learning_rate": 0.25540587772695744, + "loss": 0.0033, + "num_input_tokens_seen": 17269664, + "step": 10080 + }, + { + "epoch": 48.95399515738499, + "grad_norm": 0.00018295733025297523, + "learning_rate": 0.2553639599403767, + "loss": 0.009, + "num_input_tokens_seen": 17278368, + "step": 10085 + }, + { + "epoch": 48.97820823244552, + "grad_norm": 0.003356169443577528, + "learning_rate": 0.2553220259053493, + "loss": 0.0221, + "num_input_tokens_seen": 17286816, + "step": 10090 + }, + { + "epoch": 49.00484261501211, + "grad_norm": 0.006229174789041281, + "learning_rate": 0.2552800756283419, + "loss": 0.0199, + "num_input_tokens_seen": 17295968, + "step": 10095 + }, + { + "epoch": 49.02905569007264, + "grad_norm": 0.004259457811713219, + "learning_rate": 0.25523810911582373, + "loss": 0.0129, + "num_input_tokens_seen": 17304608, + "step": 10100 + }, + { + "epoch": 49.05326876513317, + "grad_norm": 0.00024856344680301845, + "learning_rate": 0.25519612637426675, + "loss": 0.0258, + "num_input_tokens_seen": 17313216, + "step": 10105 + }, + { + "epoch": 49.077481840193705, + "grad_norm": 3.970496982219629e-05, + "learning_rate": 0.25515412741014504, + "loss": 0.0162, + "num_input_tokens_seen": 17321472, + "step": 10110 + }, + { + "epoch": 49.101694915254235, + "grad_norm": 0.005003532860428095, + "learning_rate": 0.2551121122299355, + "loss": 0.0288, + "num_input_tokens_seen": 17329888, + "step": 10115 + }, + { + "epoch": 49.12590799031477, + "grad_norm": 0.006293368525803089, + "learning_rate": 0.2550700808401173, + "loss": 0.0142, + "num_input_tokens_seen": 17338624, + "step": 10120 + }, + { + "epoch": 49.1501210653753, + "grad_norm": 0.00731846084818244, + "learning_rate": 0.2550280332471722, + "loss": 0.0172, + "num_input_tokens_seen": 17347168, + "step": 10125 + }, + { + "epoch": 49.17433414043583, + "grad_norm": 0.0025432342663407326, + "learning_rate": 0.2549859694575845, + "loss": 0.0052, + "num_input_tokens_seen": 17355456, + "step": 10130 + }, + { + "epoch": 49.19854721549637, + "grad_norm": 0.001443275366909802, + "learning_rate": 0.254943889477841, + "loss": 0.009, + "num_input_tokens_seen": 17363840, + "step": 10135 + }, + { + "epoch": 49.2227602905569, + "grad_norm": 0.001263641519472003, + "learning_rate": 0.25490179331443097, + "loss": 0.0019, + "num_input_tokens_seen": 17373376, + "step": 10140 + }, + { + "epoch": 49.246973365617436, + "grad_norm": 0.0005523196887224913, + "learning_rate": 0.25485968097384615, + "loss": 0.0051, + "num_input_tokens_seen": 17382304, + "step": 10145 + }, + { + "epoch": 49.271186440677965, + "grad_norm": 5.6999091611942276e-05, + "learning_rate": 0.25481755246258075, + "loss": 0.0035, + "num_input_tokens_seen": 17390752, + "step": 10150 + }, + { + "epoch": 49.295399515738495, + "grad_norm": 6.569478864548728e-05, + "learning_rate": 0.2547754077871315, + "loss": 0.0146, + "num_input_tokens_seen": 17399360, + "step": 10155 + }, + { + "epoch": 49.31961259079903, + "grad_norm": 0.001237761927768588, + "learning_rate": 0.25473324695399774, + "loss": 0.0112, + "num_input_tokens_seen": 17408160, + "step": 10160 + }, + { + "epoch": 49.34382566585956, + "grad_norm": 0.0009985461365431547, + "learning_rate": 0.25469106996968105, + "loss": 0.0078, + "num_input_tokens_seen": 17416928, + "step": 10165 + }, + { + "epoch": 49.3680387409201, + "grad_norm": 0.0004992940812371671, + "learning_rate": 0.2546488768406858, + "loss": 0.0095, + "num_input_tokens_seen": 17425504, + "step": 10170 + }, + { + "epoch": 49.39225181598063, + "grad_norm": 0.00033896949025802314, + "learning_rate": 0.25460666757351863, + "loss": 0.016, + "num_input_tokens_seen": 17433760, + "step": 10175 + }, + { + "epoch": 49.416464891041166, + "grad_norm": 0.0010425643995404243, + "learning_rate": 0.25456444217468877, + "loss": 0.0082, + "num_input_tokens_seen": 17442976, + "step": 10180 + }, + { + "epoch": 49.440677966101696, + "grad_norm": 0.00035225824103690684, + "learning_rate": 0.25452220065070785, + "loss": 0.015, + "num_input_tokens_seen": 17451712, + "step": 10185 + }, + { + "epoch": 49.464891041162225, + "grad_norm": 0.00158646609634161, + "learning_rate": 0.2544799430080901, + "loss": 0.0047, + "num_input_tokens_seen": 17460480, + "step": 10190 + }, + { + "epoch": 49.48910411622276, + "grad_norm": 0.004719399381428957, + "learning_rate": 0.2544376692533522, + "loss": 0.0109, + "num_input_tokens_seen": 17468928, + "step": 10195 + }, + { + "epoch": 49.51331719128329, + "grad_norm": 0.00017216063861269504, + "learning_rate": 0.2543953793930132, + "loss": 0.0034, + "num_input_tokens_seen": 17477376, + "step": 10200 + }, + { + "epoch": 49.51331719128329, + "eval_loss": 0.46030524373054504, + "eval_runtime": 4.6216, + "eval_samples_per_second": 79.411, + "eval_steps_per_second": 19.907, + "num_input_tokens_seen": 17477376, + "step": 10200 + }, + { + "epoch": 49.53753026634383, + "grad_norm": 0.002181842690333724, + "learning_rate": 0.2543530734335948, + "loss": 0.0027, + "num_input_tokens_seen": 17485760, + "step": 10205 + }, + { + "epoch": 49.56174334140436, + "grad_norm": 0.006491517648100853, + "learning_rate": 0.2543107513816211, + "loss": 0.0155, + "num_input_tokens_seen": 17494080, + "step": 10210 + }, + { + "epoch": 49.58595641646489, + "grad_norm": 0.006013989914208651, + "learning_rate": 0.25426841324361865, + "loss": 0.0075, + "num_input_tokens_seen": 17502560, + "step": 10215 + }, + { + "epoch": 49.610169491525426, + "grad_norm": 0.0008187603671103716, + "learning_rate": 0.2542260590261166, + "loss": 0.0073, + "num_input_tokens_seen": 17511040, + "step": 10220 + }, + { + "epoch": 49.634382566585955, + "grad_norm": 0.00030784099362790585, + "learning_rate": 0.2541836887356465, + "loss": 0.0054, + "num_input_tokens_seen": 17519520, + "step": 10225 + }, + { + "epoch": 49.65859564164649, + "grad_norm": 0.0001541833480587229, + "learning_rate": 0.2541413023787423, + "loss": 0.002, + "num_input_tokens_seen": 17528064, + "step": 10230 + }, + { + "epoch": 49.68280871670702, + "grad_norm": 0.005457856226712465, + "learning_rate": 0.2540988999619405, + "loss": 0.0122, + "num_input_tokens_seen": 17536352, + "step": 10235 + }, + { + "epoch": 49.70702179176755, + "grad_norm": 0.00019510202400851995, + "learning_rate": 0.25405648149178023, + "loss": 0.012, + "num_input_tokens_seen": 17544960, + "step": 10240 + }, + { + "epoch": 49.73123486682809, + "grad_norm": 0.002623815555125475, + "learning_rate": 0.2540140469748028, + "loss": 0.0067, + "num_input_tokens_seen": 17553408, + "step": 10245 + }, + { + "epoch": 49.75544794188862, + "grad_norm": 0.007402515970170498, + "learning_rate": 0.25397159641755224, + "loss": 0.018, + "num_input_tokens_seen": 17561536, + "step": 10250 + }, + { + "epoch": 49.779661016949156, + "grad_norm": 1.1165188880113419e-05, + "learning_rate": 0.2539291298265749, + "loss": 0.0007, + "num_input_tokens_seen": 17569856, + "step": 10255 + }, + { + "epoch": 49.803874092009686, + "grad_norm": 6.612352444790304e-05, + "learning_rate": 0.2538866472084197, + "loss": 0.0019, + "num_input_tokens_seen": 17578272, + "step": 10260 + }, + { + "epoch": 49.828087167070215, + "grad_norm": 0.002547122770920396, + "learning_rate": 0.25384414856963794, + "loss": 0.0064, + "num_input_tokens_seen": 17586720, + "step": 10265 + }, + { + "epoch": 49.85230024213075, + "grad_norm": 0.00014601348084397614, + "learning_rate": 0.25380163391678356, + "loss": 0.0163, + "num_input_tokens_seen": 17595168, + "step": 10270 + }, + { + "epoch": 49.87651331719128, + "grad_norm": 0.011155623011291027, + "learning_rate": 0.2537591032564127, + "loss": 0.0163, + "num_input_tokens_seen": 17604096, + "step": 10275 + }, + { + "epoch": 49.90072639225182, + "grad_norm": 0.011615765281021595, + "learning_rate": 0.25371655659508424, + "loss": 0.0216, + "num_input_tokens_seen": 17612768, + "step": 10280 + }, + { + "epoch": 49.92493946731235, + "grad_norm": 0.002343317959457636, + "learning_rate": 0.25367399393935935, + "loss": 0.0234, + "num_input_tokens_seen": 17621312, + "step": 10285 + }, + { + "epoch": 49.94915254237288, + "grad_norm": 0.009173535741865635, + "learning_rate": 0.25363141529580174, + "loss": 0.0166, + "num_input_tokens_seen": 17629600, + "step": 10290 + }, + { + "epoch": 49.973365617433416, + "grad_norm": 0.00835066381841898, + "learning_rate": 0.2535888206709776, + "loss": 0.01, + "num_input_tokens_seen": 17638144, + "step": 10295 + }, + { + "epoch": 49.997578692493946, + "grad_norm": 0.0047483197413384914, + "learning_rate": 0.2535462100714555, + "loss": 0.0201, + "num_input_tokens_seen": 17646592, + "step": 10300 + }, + { + "epoch": 50.02421307506053, + "grad_norm": 0.002035312820225954, + "learning_rate": 0.2535035835038066, + "loss": 0.0128, + "num_input_tokens_seen": 17655552, + "step": 10305 + }, + { + "epoch": 50.04842615012107, + "grad_norm": 0.0006905422196723521, + "learning_rate": 0.2534609409746044, + "loss": 0.0017, + "num_input_tokens_seen": 17664160, + "step": 10310 + }, + { + "epoch": 50.0726392251816, + "grad_norm": 0.00013134854089003056, + "learning_rate": 0.253418282490425, + "loss": 0.0011, + "num_input_tokens_seen": 17673120, + "step": 10315 + }, + { + "epoch": 50.09685230024213, + "grad_norm": 0.0014154051896184683, + "learning_rate": 0.2533756080578467, + "loss": 0.0201, + "num_input_tokens_seen": 17681696, + "step": 10320 + }, + { + "epoch": 50.12106537530266, + "grad_norm": 0.008338646963238716, + "learning_rate": 0.25333291768345056, + "loss": 0.0162, + "num_input_tokens_seen": 17690208, + "step": 10325 + }, + { + "epoch": 50.14527845036319, + "grad_norm": 0.0011659220326691866, + "learning_rate": 0.25329021137381996, + "loss": 0.017, + "num_input_tokens_seen": 17698848, + "step": 10330 + }, + { + "epoch": 50.16949152542373, + "grad_norm": 0.004782774019986391, + "learning_rate": 0.25324748913554074, + "loss": 0.0089, + "num_input_tokens_seen": 17707232, + "step": 10335 + }, + { + "epoch": 50.19370460048426, + "grad_norm": 0.006369943730533123, + "learning_rate": 0.2532047509752013, + "loss": 0.0127, + "num_input_tokens_seen": 17715520, + "step": 10340 + }, + { + "epoch": 50.2179176755448, + "grad_norm": 0.00020878504437860101, + "learning_rate": 0.25316199689939217, + "loss": 0.0013, + "num_input_tokens_seen": 17724032, + "step": 10345 + }, + { + "epoch": 50.24213075060533, + "grad_norm": 0.0006198026239871979, + "learning_rate": 0.2531192269147068, + "loss": 0.0023, + "num_input_tokens_seen": 17732704, + "step": 10350 + }, + { + "epoch": 50.26634382566586, + "grad_norm": 0.0030836043879389763, + "learning_rate": 0.2530764410277407, + "loss": 0.0079, + "num_input_tokens_seen": 17741248, + "step": 10355 + }, + { + "epoch": 50.29055690072639, + "grad_norm": 0.002243240363895893, + "learning_rate": 0.25303363924509203, + "loss": 0.0065, + "num_input_tokens_seen": 17750144, + "step": 10360 + }, + { + "epoch": 50.31476997578692, + "grad_norm": 0.0009751206962391734, + "learning_rate": 0.25299082157336145, + "loss": 0.0058, + "num_input_tokens_seen": 17758816, + "step": 10365 + }, + { + "epoch": 50.33898305084746, + "grad_norm": 0.0007158254156820476, + "learning_rate": 0.2529479880191519, + "loss": 0.0116, + "num_input_tokens_seen": 17767168, + "step": 10370 + }, + { + "epoch": 50.36319612590799, + "grad_norm": 0.0012653378071263433, + "learning_rate": 0.2529051385890689, + "loss": 0.0022, + "num_input_tokens_seen": 17775104, + "step": 10375 + }, + { + "epoch": 50.38740920096852, + "grad_norm": 0.00042368550202809274, + "learning_rate": 0.2528622732897203, + "loss": 0.0077, + "num_input_tokens_seen": 17783680, + "step": 10380 + }, + { + "epoch": 50.41162227602906, + "grad_norm": 0.0006253225728869438, + "learning_rate": 0.25281939212771654, + "loss": 0.0153, + "num_input_tokens_seen": 17792352, + "step": 10385 + }, + { + "epoch": 50.43583535108959, + "grad_norm": 0.0015921865124255419, + "learning_rate": 0.2527764951096704, + "loss": 0.0123, + "num_input_tokens_seen": 17800768, + "step": 10390 + }, + { + "epoch": 50.460048426150124, + "grad_norm": 0.00025266240118071437, + "learning_rate": 0.2527335822421971, + "loss": 0.0084, + "num_input_tokens_seen": 17809504, + "step": 10395 + }, + { + "epoch": 50.48426150121065, + "grad_norm": 0.00020129565382376313, + "learning_rate": 0.25269065353191444, + "loss": 0.007, + "num_input_tokens_seen": 17817792, + "step": 10400 + }, + { + "epoch": 50.48426150121065, + "eval_loss": 0.5221193432807922, + "eval_runtime": 4.6125, + "eval_samples_per_second": 79.567, + "eval_steps_per_second": 19.946, + "num_input_tokens_seen": 17817792, + "step": 10400 + }, + { + "epoch": 50.50847457627118, + "grad_norm": 0.00015490497753489763, + "learning_rate": 0.2526477089854425, + "loss": 0.0016, + "num_input_tokens_seen": 17826720, + "step": 10405 + }, + { + "epoch": 50.53268765133172, + "grad_norm": 0.0011538665276020765, + "learning_rate": 0.25260474860940385, + "loss": 0.0017, + "num_input_tokens_seen": 17835328, + "step": 10410 + }, + { + "epoch": 50.55690072639225, + "grad_norm": 0.0017677554860711098, + "learning_rate": 0.2525617724104236, + "loss": 0.0131, + "num_input_tokens_seen": 17843968, + "step": 10415 + }, + { + "epoch": 50.58111380145279, + "grad_norm": 0.005017333198338747, + "learning_rate": 0.25251878039512915, + "loss": 0.0054, + "num_input_tokens_seen": 17852512, + "step": 10420 + }, + { + "epoch": 50.60532687651332, + "grad_norm": 0.0005721343914046884, + "learning_rate": 0.25247577257015047, + "loss": 0.0125, + "num_input_tokens_seen": 17860576, + "step": 10425 + }, + { + "epoch": 50.62953995157385, + "grad_norm": 0.0002568172349128872, + "learning_rate": 0.2524327489421198, + "loss": 0.0109, + "num_input_tokens_seen": 17869056, + "step": 10430 + }, + { + "epoch": 50.653753026634384, + "grad_norm": 0.011420482769608498, + "learning_rate": 0.25238970951767203, + "loss": 0.0145, + "num_input_tokens_seen": 17877408, + "step": 10435 + }, + { + "epoch": 50.67796610169491, + "grad_norm": 0.0030467903707176447, + "learning_rate": 0.25234665430344433, + "loss": 0.0201, + "num_input_tokens_seen": 17886368, + "step": 10440 + }, + { + "epoch": 50.70217917675545, + "grad_norm": 0.009681008756160736, + "learning_rate": 0.2523035833060764, + "loss": 0.0286, + "num_input_tokens_seen": 17894976, + "step": 10445 + }, + { + "epoch": 50.72639225181598, + "grad_norm": 0.0006334695499390364, + "learning_rate": 0.2522604965322103, + "loss": 0.0043, + "num_input_tokens_seen": 17903392, + "step": 10450 + }, + { + "epoch": 50.75060532687651, + "grad_norm": 0.0029529016464948654, + "learning_rate": 0.25221739398849047, + "loss": 0.0034, + "num_input_tokens_seen": 17912064, + "step": 10455 + }, + { + "epoch": 50.77481840193705, + "grad_norm": 0.0007456925814040005, + "learning_rate": 0.252174275681564, + "loss": 0.0029, + "num_input_tokens_seen": 17920800, + "step": 10460 + }, + { + "epoch": 50.79903147699758, + "grad_norm": 2.232446422567591e-05, + "learning_rate": 0.2521311416180802, + "loss": 0.0132, + "num_input_tokens_seen": 17929344, + "step": 10465 + }, + { + "epoch": 50.823244552058114, + "grad_norm": 0.009707272984087467, + "learning_rate": 0.25208799180469094, + "loss": 0.0354, + "num_input_tokens_seen": 17938016, + "step": 10470 + }, + { + "epoch": 50.847457627118644, + "grad_norm": 0.002811228157952428, + "learning_rate": 0.2520448262480504, + "loss": 0.0078, + "num_input_tokens_seen": 17946304, + "step": 10475 + }, + { + "epoch": 50.87167070217917, + "grad_norm": 0.008087489753961563, + "learning_rate": 0.25200164495481525, + "loss": 0.0169, + "num_input_tokens_seen": 17954656, + "step": 10480 + }, + { + "epoch": 50.89588377723971, + "grad_norm": 0.0001614352222532034, + "learning_rate": 0.25195844793164474, + "loss": 0.0136, + "num_input_tokens_seen": 17962976, + "step": 10485 + }, + { + "epoch": 50.92009685230024, + "grad_norm": 0.00020859578216914088, + "learning_rate": 0.2519152351852001, + "loss": 0.0396, + "num_input_tokens_seen": 17971648, + "step": 10490 + }, + { + "epoch": 50.94430992736078, + "grad_norm": 0.0006014065002091229, + "learning_rate": 0.25187200672214555, + "loss": 0.0067, + "num_input_tokens_seen": 17980256, + "step": 10495 + }, + { + "epoch": 50.96852300242131, + "grad_norm": 0.0015817421954125166, + "learning_rate": 0.2518287625491473, + "loss": 0.006, + "num_input_tokens_seen": 17988832, + "step": 10500 + }, + { + "epoch": 50.99273607748184, + "grad_norm": 0.00015358990640379488, + "learning_rate": 0.25178550267287425, + "loss": 0.0025, + "num_input_tokens_seen": 17997312, + "step": 10505 + }, + { + "epoch": 51.01937046004843, + "grad_norm": 0.0012545068748295307, + "learning_rate": 0.2517422270999976, + "loss": 0.004, + "num_input_tokens_seen": 18006240, + "step": 10510 + }, + { + "epoch": 51.04358353510896, + "grad_norm": 0.00046276478678919375, + "learning_rate": 0.2516989358371909, + "loss": 0.0109, + "num_input_tokens_seen": 18015040, + "step": 10515 + }, + { + "epoch": 51.067796610169495, + "grad_norm": 0.0008055217331275344, + "learning_rate": 0.25165562889113025, + "loss": 0.0021, + "num_input_tokens_seen": 18023776, + "step": 10520 + }, + { + "epoch": 51.092009685230025, + "grad_norm": 0.00038193134241737425, + "learning_rate": 0.2516123062684942, + "loss": 0.0022, + "num_input_tokens_seen": 18032128, + "step": 10525 + }, + { + "epoch": 51.116222760290555, + "grad_norm": 9.850619971984997e-05, + "learning_rate": 0.25156896797596356, + "loss": 0.0081, + "num_input_tokens_seen": 18040608, + "step": 10530 + }, + { + "epoch": 51.14043583535109, + "grad_norm": 0.003119743661954999, + "learning_rate": 0.2515256140202216, + "loss": 0.0032, + "num_input_tokens_seen": 18049024, + "step": 10535 + }, + { + "epoch": 51.16464891041162, + "grad_norm": 0.006540943402796984, + "learning_rate": 0.25148224440795425, + "loss": 0.0086, + "num_input_tokens_seen": 18057664, + "step": 10540 + }, + { + "epoch": 51.18886198547216, + "grad_norm": 8.674411219544709e-05, + "learning_rate": 0.2514388591458494, + "loss": 0.0014, + "num_input_tokens_seen": 18065920, + "step": 10545 + }, + { + "epoch": 51.21307506053269, + "grad_norm": 0.00031933028367348015, + "learning_rate": 0.2513954582405977, + "loss": 0.009, + "num_input_tokens_seen": 18074272, + "step": 10550 + }, + { + "epoch": 51.23728813559322, + "grad_norm": 0.0007797247963026166, + "learning_rate": 0.2513520416988922, + "loss": 0.0058, + "num_input_tokens_seen": 18082752, + "step": 10555 + }, + { + "epoch": 51.261501210653755, + "grad_norm": 0.009408914484083652, + "learning_rate": 0.2513086095274281, + "loss": 0.033, + "num_input_tokens_seen": 18091648, + "step": 10560 + }, + { + "epoch": 51.285714285714285, + "grad_norm": 0.0007879480253905058, + "learning_rate": 0.25126516173290336, + "loss": 0.0017, + "num_input_tokens_seen": 18100352, + "step": 10565 + }, + { + "epoch": 51.30992736077482, + "grad_norm": 0.0022428191732615232, + "learning_rate": 0.2512216983220181, + "loss": 0.0048, + "num_input_tokens_seen": 18109120, + "step": 10570 + }, + { + "epoch": 51.33414043583535, + "grad_norm": 0.005245597567409277, + "learning_rate": 0.25117821930147494, + "loss": 0.0057, + "num_input_tokens_seen": 18117792, + "step": 10575 + }, + { + "epoch": 51.35835351089588, + "grad_norm": 0.0004926823894493282, + "learning_rate": 0.2511347246779788, + "loss": 0.0088, + "num_input_tokens_seen": 18126336, + "step": 10580 + }, + { + "epoch": 51.38256658595642, + "grad_norm": 8.652334508951753e-05, + "learning_rate": 0.25109121445823723, + "loss": 0.0032, + "num_input_tokens_seen": 18135072, + "step": 10585 + }, + { + "epoch": 51.40677966101695, + "grad_norm": 0.006240383256226778, + "learning_rate": 0.25104768864896004, + "loss": 0.0103, + "num_input_tokens_seen": 18143488, + "step": 10590 + }, + { + "epoch": 51.430992736077485, + "grad_norm": 0.0006370506598614156, + "learning_rate": 0.2510041472568594, + "loss": 0.0008, + "num_input_tokens_seen": 18152000, + "step": 10595 + }, + { + "epoch": 51.455205811138015, + "grad_norm": 9.376856178278103e-05, + "learning_rate": 0.25096059028864987, + "loss": 0.003, + "num_input_tokens_seen": 18160384, + "step": 10600 + }, + { + "epoch": 51.455205811138015, + "eval_loss": 0.5242970585823059, + "eval_runtime": 4.635, + "eval_samples_per_second": 79.18, + "eval_steps_per_second": 19.849, + "num_input_tokens_seen": 18160384, + "step": 10600 + }, + { + "epoch": 51.479418886198545, + "grad_norm": 0.0012617920292541385, + "learning_rate": 0.25091701775104863, + "loss": 0.0045, + "num_input_tokens_seen": 18169024, + "step": 10605 + }, + { + "epoch": 51.50363196125908, + "grad_norm": 9.265727567253634e-05, + "learning_rate": 0.250873429650775, + "loss": 0.0015, + "num_input_tokens_seen": 18177632, + "step": 10610 + }, + { + "epoch": 51.52784503631961, + "grad_norm": 1.2988499292987399e-05, + "learning_rate": 0.25082982599455095, + "loss": 0.0019, + "num_input_tokens_seen": 18186400, + "step": 10615 + }, + { + "epoch": 51.55205811138015, + "grad_norm": 0.00016765222244430333, + "learning_rate": 0.2507862067891006, + "loss": 0.001, + "num_input_tokens_seen": 18194848, + "step": 10620 + }, + { + "epoch": 51.57627118644068, + "grad_norm": 0.00028886503423564136, + "learning_rate": 0.25074257204115064, + "loss": 0.0018, + "num_input_tokens_seen": 18202912, + "step": 10625 + }, + { + "epoch": 51.60048426150121, + "grad_norm": 0.0009164898656308651, + "learning_rate": 0.25069892175742997, + "loss": 0.0146, + "num_input_tokens_seen": 18211552, + "step": 10630 + }, + { + "epoch": 51.624697336561745, + "grad_norm": 0.006943365093320608, + "learning_rate": 0.25065525594467014, + "loss": 0.0078, + "num_input_tokens_seen": 18220032, + "step": 10635 + }, + { + "epoch": 51.648910411622275, + "grad_norm": 0.000623910513240844, + "learning_rate": 0.2506115746096049, + "loss": 0.002, + "num_input_tokens_seen": 18228512, + "step": 10640 + }, + { + "epoch": 51.67312348668281, + "grad_norm": 0.00016433214477729052, + "learning_rate": 0.25056787775897055, + "loss": 0.0116, + "num_input_tokens_seen": 18237600, + "step": 10645 + }, + { + "epoch": 51.69733656174334, + "grad_norm": 0.0002869238087441772, + "learning_rate": 0.2505241653995056, + "loss": 0.0018, + "num_input_tokens_seen": 18246336, + "step": 10650 + }, + { + "epoch": 51.72154963680387, + "grad_norm": 0.0013547822600230575, + "learning_rate": 0.25048043753795113, + "loss": 0.0077, + "num_input_tokens_seen": 18254848, + "step": 10655 + }, + { + "epoch": 51.74576271186441, + "grad_norm": 0.002461525145918131, + "learning_rate": 0.2504366941810504, + "loss": 0.0029, + "num_input_tokens_seen": 18263936, + "step": 10660 + }, + { + "epoch": 51.76997578692494, + "grad_norm": 4.490307765081525e-05, + "learning_rate": 0.2503929353355493, + "loss": 0.0058, + "num_input_tokens_seen": 18272672, + "step": 10665 + }, + { + "epoch": 51.794188861985475, + "grad_norm": 0.005062413867563009, + "learning_rate": 0.250349161008196, + "loss": 0.0041, + "num_input_tokens_seen": 18281088, + "step": 10670 + }, + { + "epoch": 51.818401937046005, + "grad_norm": 0.0002884066489059478, + "learning_rate": 0.2503053712057409, + "loss": 0.0049, + "num_input_tokens_seen": 18289632, + "step": 10675 + }, + { + "epoch": 51.842615012106535, + "grad_norm": 0.000456471461802721, + "learning_rate": 0.25026156593493715, + "loss": 0.0071, + "num_input_tokens_seen": 18298272, + "step": 10680 + }, + { + "epoch": 51.86682808716707, + "grad_norm": 0.004112924914807081, + "learning_rate": 0.2502177452025399, + "loss": 0.0051, + "num_input_tokens_seen": 18306368, + "step": 10685 + }, + { + "epoch": 51.8910411622276, + "grad_norm": 0.0007676348323002458, + "learning_rate": 0.25017390901530695, + "loss": 0.0028, + "num_input_tokens_seen": 18315264, + "step": 10690 + }, + { + "epoch": 51.91525423728814, + "grad_norm": 0.0008402905659750104, + "learning_rate": 0.2501300573799984, + "loss": 0.0132, + "num_input_tokens_seen": 18323424, + "step": 10695 + }, + { + "epoch": 51.93946731234867, + "grad_norm": 0.0002527609176468104, + "learning_rate": 0.2500861903033766, + "loss": 0.0006, + "num_input_tokens_seen": 18332320, + "step": 10700 + }, + { + "epoch": 51.9636803874092, + "grad_norm": 0.00038924950058571994, + "learning_rate": 0.25004230779220654, + "loss": 0.002, + "num_input_tokens_seen": 18340608, + "step": 10705 + }, + { + "epoch": 51.987893462469735, + "grad_norm": 0.0013950967695564032, + "learning_rate": 0.24999840985325542, + "loss": 0.0019, + "num_input_tokens_seen": 18349120, + "step": 10710 + }, + { + "epoch": 52.01452784503632, + "grad_norm": 0.0001085635885829106, + "learning_rate": 0.24995449649329285, + "loss": 0.0006, + "num_input_tokens_seen": 18358240, + "step": 10715 + }, + { + "epoch": 52.03874092009685, + "grad_norm": 0.000175031105754897, + "learning_rate": 0.2499105677190908, + "loss": 0.0009, + "num_input_tokens_seen": 18366720, + "step": 10720 + }, + { + "epoch": 52.062953995157386, + "grad_norm": 5.7293131249025464e-05, + "learning_rate": 0.24986662353742364, + "loss": 0.001, + "num_input_tokens_seen": 18375040, + "step": 10725 + }, + { + "epoch": 52.087167070217916, + "grad_norm": 7.708832708885893e-05, + "learning_rate": 0.24982266395506814, + "loss": 0.0025, + "num_input_tokens_seen": 18383520, + "step": 10730 + }, + { + "epoch": 52.11138014527845, + "grad_norm": 8.886801515473053e-05, + "learning_rate": 0.2497786889788034, + "loss": 0.0005, + "num_input_tokens_seen": 18391904, + "step": 10735 + }, + { + "epoch": 52.13559322033898, + "grad_norm": 0.00010590317833703011, + "learning_rate": 0.24973469861541095, + "loss": 0.0005, + "num_input_tokens_seen": 18400256, + "step": 10740 + }, + { + "epoch": 52.15980629539951, + "grad_norm": 5.17821463290602e-05, + "learning_rate": 0.24969069287167456, + "loss": 0.0011, + "num_input_tokens_seen": 18408704, + "step": 10745 + }, + { + "epoch": 52.18401937046005, + "grad_norm": 2.837755710061174e-05, + "learning_rate": 0.2496466717543806, + "loss": 0.0004, + "num_input_tokens_seen": 18417664, + "step": 10750 + }, + { + "epoch": 52.20823244552058, + "grad_norm": 1.7581465726834722e-05, + "learning_rate": 0.24960263527031762, + "loss": 0.0008, + "num_input_tokens_seen": 18426080, + "step": 10755 + }, + { + "epoch": 52.232445520581116, + "grad_norm": 0.00027818052330985665, + "learning_rate": 0.24955858342627657, + "loss": 0.0004, + "num_input_tokens_seen": 18434496, + "step": 10760 + }, + { + "epoch": 52.256658595641646, + "grad_norm": 0.002366701839491725, + "learning_rate": 0.24951451622905083, + "loss": 0.0019, + "num_input_tokens_seen": 18443232, + "step": 10765 + }, + { + "epoch": 52.280871670702176, + "grad_norm": 8.316624735016376e-05, + "learning_rate": 0.24947043368543612, + "loss": 0.0014, + "num_input_tokens_seen": 18451680, + "step": 10770 + }, + { + "epoch": 52.30508474576271, + "grad_norm": 4.761137461173348e-05, + "learning_rate": 0.2494263358022305, + "loss": 0.0009, + "num_input_tokens_seen": 18460352, + "step": 10775 + }, + { + "epoch": 52.32929782082324, + "grad_norm": 0.0003073733823839575, + "learning_rate": 0.24938222258623444, + "loss": 0.0005, + "num_input_tokens_seen": 18468768, + "step": 10780 + }, + { + "epoch": 52.35351089588378, + "grad_norm": 4.615030775312334e-05, + "learning_rate": 0.24933809404425075, + "loss": 0.0003, + "num_input_tokens_seen": 18477504, + "step": 10785 + }, + { + "epoch": 52.37772397094431, + "grad_norm": 0.000139174735522829, + "learning_rate": 0.24929395018308453, + "loss": 0.0009, + "num_input_tokens_seen": 18486144, + "step": 10790 + }, + { + "epoch": 52.40193704600484, + "grad_norm": 4.102551974938251e-05, + "learning_rate": 0.24924979100954348, + "loss": 0.0002, + "num_input_tokens_seen": 18494528, + "step": 10795 + }, + { + "epoch": 52.426150121065376, + "grad_norm": 0.0009084509219974279, + "learning_rate": 0.24920561653043735, + "loss": 0.0013, + "num_input_tokens_seen": 18502784, + "step": 10800 + }, + { + "epoch": 52.426150121065376, + "eval_loss": 0.6114208102226257, + "eval_runtime": 4.6186, + "eval_samples_per_second": 79.462, + "eval_steps_per_second": 19.92, + "num_input_tokens_seen": 18502784, + "step": 10800 + }, + { + "epoch": 52.450363196125906, + "grad_norm": 3.212683441233821e-05, + "learning_rate": 0.24916142675257846, + "loss": 0.0009, + "num_input_tokens_seen": 18511040, + "step": 10805 + }, + { + "epoch": 52.47457627118644, + "grad_norm": 0.00011183920287294313, + "learning_rate": 0.24911722168278144, + "loss": 0.0006, + "num_input_tokens_seen": 18519424, + "step": 10810 + }, + { + "epoch": 52.49878934624697, + "grad_norm": 4.723203346657101e-06, + "learning_rate": 0.24907300132786328, + "loss": 0.0004, + "num_input_tokens_seen": 18528192, + "step": 10815 + }, + { + "epoch": 52.5230024213075, + "grad_norm": 0.0001470038841944188, + "learning_rate": 0.24902876569464322, + "loss": 0.0003, + "num_input_tokens_seen": 18537120, + "step": 10820 + }, + { + "epoch": 52.54721549636804, + "grad_norm": 0.00013562339881900698, + "learning_rate": 0.24898451478994305, + "loss": 0.0079, + "num_input_tokens_seen": 18545760, + "step": 10825 + }, + { + "epoch": 52.57142857142857, + "grad_norm": 0.0001835747534641996, + "learning_rate": 0.2489402486205868, + "loss": 0.0018, + "num_input_tokens_seen": 18554400, + "step": 10830 + }, + { + "epoch": 52.595641646489106, + "grad_norm": 0.0001383983326377347, + "learning_rate": 0.24889596719340085, + "loss": 0.0004, + "num_input_tokens_seen": 18562496, + "step": 10835 + }, + { + "epoch": 52.619854721549636, + "grad_norm": 0.00010940637002931908, + "learning_rate": 0.24885167051521392, + "loss": 0.0012, + "num_input_tokens_seen": 18571040, + "step": 10840 + }, + { + "epoch": 52.644067796610166, + "grad_norm": 4.907606489723548e-05, + "learning_rate": 0.24880735859285716, + "loss": 0.0004, + "num_input_tokens_seen": 18579584, + "step": 10845 + }, + { + "epoch": 52.6682808716707, + "grad_norm": 8.815645560389385e-05, + "learning_rate": 0.24876303143316406, + "loss": 0.0014, + "num_input_tokens_seen": 18588256, + "step": 10850 + }, + { + "epoch": 52.69249394673123, + "grad_norm": 3.1999552447814494e-05, + "learning_rate": 0.24871868904297031, + "loss": 0.0005, + "num_input_tokens_seen": 18596512, + "step": 10855 + }, + { + "epoch": 52.71670702179177, + "grad_norm": 0.002697814954444766, + "learning_rate": 0.24867433142911416, + "loss": 0.0056, + "num_input_tokens_seen": 18604992, + "step": 10860 + }, + { + "epoch": 52.7409200968523, + "grad_norm": 8.526186866220087e-05, + "learning_rate": 0.24862995859843612, + "loss": 0.0027, + "num_input_tokens_seen": 18613728, + "step": 10865 + }, + { + "epoch": 52.76513317191284, + "grad_norm": 0.00046154510346241295, + "learning_rate": 0.24858557055777897, + "loss": 0.0016, + "num_input_tokens_seen": 18622368, + "step": 10870 + }, + { + "epoch": 52.789346246973366, + "grad_norm": 2.8779097192455083e-05, + "learning_rate": 0.24854116731398793, + "loss": 0.001, + "num_input_tokens_seen": 18630880, + "step": 10875 + }, + { + "epoch": 52.813559322033896, + "grad_norm": 0.001747315051034093, + "learning_rate": 0.24849674887391052, + "loss": 0.0101, + "num_input_tokens_seen": 18639008, + "step": 10880 + }, + { + "epoch": 52.83777239709443, + "grad_norm": 0.001865105121396482, + "learning_rate": 0.2484523152443967, + "loss": 0.0022, + "num_input_tokens_seen": 18647744, + "step": 10885 + }, + { + "epoch": 52.86198547215496, + "grad_norm": 0.004462799057364464, + "learning_rate": 0.24840786643229862, + "loss": 0.0041, + "num_input_tokens_seen": 18656576, + "step": 10890 + }, + { + "epoch": 52.8861985472155, + "grad_norm": 0.0005324830417521298, + "learning_rate": 0.2483634024444709, + "loss": 0.0006, + "num_input_tokens_seen": 18664928, + "step": 10895 + }, + { + "epoch": 52.91041162227603, + "grad_norm": 0.005912259686738253, + "learning_rate": 0.24831892328777033, + "loss": 0.0106, + "num_input_tokens_seen": 18673600, + "step": 10900 + }, + { + "epoch": 52.93462469733656, + "grad_norm": 7.819951133569703e-05, + "learning_rate": 0.2482744289690563, + "loss": 0.0051, + "num_input_tokens_seen": 18682432, + "step": 10905 + }, + { + "epoch": 52.958837772397096, + "grad_norm": 0.0008900463581085205, + "learning_rate": 0.2482299194951903, + "loss": 0.001, + "num_input_tokens_seen": 18690976, + "step": 10910 + }, + { + "epoch": 52.983050847457626, + "grad_norm": 0.005599700380116701, + "learning_rate": 0.2481853948730363, + "loss": 0.0101, + "num_input_tokens_seen": 18699744, + "step": 10915 + }, + { + "epoch": 53.00968523002421, + "grad_norm": 1.153649645857513e-05, + "learning_rate": 0.24814085510946052, + "loss": 0.0022, + "num_input_tokens_seen": 18708832, + "step": 10920 + }, + { + "epoch": 53.03389830508475, + "grad_norm": 4.8774556489661336e-05, + "learning_rate": 0.24809630021133158, + "loss": 0.0004, + "num_input_tokens_seen": 18717216, + "step": 10925 + }, + { + "epoch": 53.05811138014528, + "grad_norm": 4.982155587640591e-05, + "learning_rate": 0.24805173018552037, + "loss": 0.0146, + "num_input_tokens_seen": 18726208, + "step": 10930 + }, + { + "epoch": 53.082324455205814, + "grad_norm": 0.0017331239068880677, + "learning_rate": 0.2480071450389002, + "loss": 0.0008, + "num_input_tokens_seen": 18735104, + "step": 10935 + }, + { + "epoch": 53.106537530266344, + "grad_norm": 0.0005618988070636988, + "learning_rate": 0.24796254477834662, + "loss": 0.0088, + "num_input_tokens_seen": 18743424, + "step": 10940 + }, + { + "epoch": 53.130750605326874, + "grad_norm": 0.00026520577375777066, + "learning_rate": 0.24791792941073754, + "loss": 0.0125, + "num_input_tokens_seen": 18751872, + "step": 10945 + }, + { + "epoch": 53.15496368038741, + "grad_norm": 4.3907151848543435e-05, + "learning_rate": 0.2478732989429533, + "loss": 0.0008, + "num_input_tokens_seen": 18760288, + "step": 10950 + }, + { + "epoch": 53.17917675544794, + "grad_norm": 3.895996997016482e-05, + "learning_rate": 0.24782865338187632, + "loss": 0.0069, + "num_input_tokens_seen": 18768608, + "step": 10955 + }, + { + "epoch": 53.20338983050848, + "grad_norm": 0.002181831980124116, + "learning_rate": 0.2477839927343916, + "loss": 0.002, + "num_input_tokens_seen": 18777088, + "step": 10960 + }, + { + "epoch": 53.22760290556901, + "grad_norm": 8.514938235748559e-05, + "learning_rate": 0.2477393170073864, + "loss": 0.0018, + "num_input_tokens_seen": 18786144, + "step": 10965 + }, + { + "epoch": 53.25181598062954, + "grad_norm": 0.0009749658638611436, + "learning_rate": 0.2476946262077503, + "loss": 0.0011, + "num_input_tokens_seen": 18794400, + "step": 10970 + }, + { + "epoch": 53.276029055690074, + "grad_norm": 0.0009869947098195553, + "learning_rate": 0.24764992034237507, + "loss": 0.0009, + "num_input_tokens_seen": 18803232, + "step": 10975 + }, + { + "epoch": 53.300242130750604, + "grad_norm": 0.007893186993896961, + "learning_rate": 0.24760519941815498, + "loss": 0.0118, + "num_input_tokens_seen": 18811520, + "step": 10980 + }, + { + "epoch": 53.32445520581114, + "grad_norm": 0.00011337234172970057, + "learning_rate": 0.2475604634419866, + "loss": 0.0031, + "num_input_tokens_seen": 18820032, + "step": 10985 + }, + { + "epoch": 53.34866828087167, + "grad_norm": 0.00407156627625227, + "learning_rate": 0.24751571242076872, + "loss": 0.0046, + "num_input_tokens_seen": 18828448, + "step": 10990 + }, + { + "epoch": 53.3728813559322, + "grad_norm": 0.0008399271173402667, + "learning_rate": 0.2474709463614025, + "loss": 0.0038, + "num_input_tokens_seen": 18837024, + "step": 10995 + }, + { + "epoch": 53.39709443099274, + "grad_norm": 0.01031828299164772, + "learning_rate": 0.24742616527079145, + "loss": 0.0128, + "num_input_tokens_seen": 18845184, + "step": 11000 + }, + { + "epoch": 53.39709443099274, + "eval_loss": 0.5866853594779968, + "eval_runtime": 4.6214, + "eval_samples_per_second": 79.414, + "eval_steps_per_second": 19.908, + "num_input_tokens_seen": 18845184, + "step": 11000 + }, + { + "epoch": 53.42130750605327, + "grad_norm": 0.00021165535144973546, + "learning_rate": 0.24738136915584139, + "loss": 0.0165, + "num_input_tokens_seen": 18853760, + "step": 11005 + }, + { + "epoch": 53.445520581113804, + "grad_norm": 0.0016210319008678198, + "learning_rate": 0.24733655802346047, + "loss": 0.0029, + "num_input_tokens_seen": 18862112, + "step": 11010 + }, + { + "epoch": 53.469733656174334, + "grad_norm": 0.0014063094276934862, + "learning_rate": 0.24729173188055906, + "loss": 0.0148, + "num_input_tokens_seen": 18870816, + "step": 11015 + }, + { + "epoch": 53.493946731234864, + "grad_norm": 0.00019366282504051924, + "learning_rate": 0.24724689073404996, + "loss": 0.025, + "num_input_tokens_seen": 18879136, + "step": 11020 + }, + { + "epoch": 53.5181598062954, + "grad_norm": 0.007873749360442162, + "learning_rate": 0.24720203459084822, + "loss": 0.0292, + "num_input_tokens_seen": 18887520, + "step": 11025 + }, + { + "epoch": 53.54237288135593, + "grad_norm": 0.00019237362721469253, + "learning_rate": 0.24715716345787123, + "loss": 0.0117, + "num_input_tokens_seen": 18896160, + "step": 11030 + }, + { + "epoch": 53.56658595641647, + "grad_norm": 0.0012481308076530695, + "learning_rate": 0.2471122773420387, + "loss": 0.0044, + "num_input_tokens_seen": 18904544, + "step": 11035 + }, + { + "epoch": 53.590799031477, + "grad_norm": 0.006371548864990473, + "learning_rate": 0.24706737625027259, + "loss": 0.0465, + "num_input_tokens_seen": 18913376, + "step": 11040 + }, + { + "epoch": 53.61501210653753, + "grad_norm": 5.0950773584190756e-05, + "learning_rate": 0.24702246018949725, + "loss": 0.0076, + "num_input_tokens_seen": 18922048, + "step": 11045 + }, + { + "epoch": 53.639225181598064, + "grad_norm": 0.0003420463763177395, + "learning_rate": 0.2469775291666393, + "loss": 0.0047, + "num_input_tokens_seen": 18930304, + "step": 11050 + }, + { + "epoch": 53.663438256658594, + "grad_norm": 0.0001923701784107834, + "learning_rate": 0.24693258318862765, + "loss": 0.018, + "num_input_tokens_seen": 18939104, + "step": 11055 + }, + { + "epoch": 53.68765133171913, + "grad_norm": 0.005120066925883293, + "learning_rate": 0.2468876222623935, + "loss": 0.0245, + "num_input_tokens_seen": 18947616, + "step": 11060 + }, + { + "epoch": 53.71186440677966, + "grad_norm": 0.00013749173376709223, + "learning_rate": 0.2468426463948705, + "loss": 0.0057, + "num_input_tokens_seen": 18956288, + "step": 11065 + }, + { + "epoch": 53.73607748184019, + "grad_norm": 0.0011261394247412682, + "learning_rate": 0.24679765559299438, + "loss": 0.0073, + "num_input_tokens_seen": 18965280, + "step": 11070 + }, + { + "epoch": 53.76029055690073, + "grad_norm": 0.0010826472425833344, + "learning_rate": 0.24675264986370332, + "loss": 0.0171, + "num_input_tokens_seen": 18973920, + "step": 11075 + }, + { + "epoch": 53.78450363196126, + "grad_norm": 0.006997179705649614, + "learning_rate": 0.2467076292139378, + "loss": 0.0179, + "num_input_tokens_seen": 18982624, + "step": 11080 + }, + { + "epoch": 53.808716707021794, + "grad_norm": 0.0002294790028827265, + "learning_rate": 0.24666259365064055, + "loss": 0.017, + "num_input_tokens_seen": 18990528, + "step": 11085 + }, + { + "epoch": 53.832929782082324, + "grad_norm": 0.00012618937762454152, + "learning_rate": 0.24661754318075663, + "loss": 0.0075, + "num_input_tokens_seen": 18999232, + "step": 11090 + }, + { + "epoch": 53.857142857142854, + "grad_norm": 0.00045866434811614454, + "learning_rate": 0.2465724778112334, + "loss": 0.0084, + "num_input_tokens_seen": 19008160, + "step": 11095 + }, + { + "epoch": 53.88135593220339, + "grad_norm": 0.0008551671053282917, + "learning_rate": 0.24652739754902042, + "loss": 0.0059, + "num_input_tokens_seen": 19016736, + "step": 11100 + }, + { + "epoch": 53.90556900726392, + "grad_norm": 0.0003001989971380681, + "learning_rate": 0.24648230240106975, + "loss": 0.0055, + "num_input_tokens_seen": 19025504, + "step": 11105 + }, + { + "epoch": 53.92978208232446, + "grad_norm": 0.003181398380547762, + "learning_rate": 0.2464371923743356, + "loss": 0.003, + "num_input_tokens_seen": 19033728, + "step": 11110 + }, + { + "epoch": 53.95399515738499, + "grad_norm": 0.00044855804299004376, + "learning_rate": 0.24639206747577444, + "loss": 0.0087, + "num_input_tokens_seen": 19042176, + "step": 11115 + }, + { + "epoch": 53.97820823244552, + "grad_norm": 0.0024233011063188314, + "learning_rate": 0.24634692771234515, + "loss": 0.0075, + "num_input_tokens_seen": 19050784, + "step": 11120 + }, + { + "epoch": 54.00484261501211, + "grad_norm": 0.0010556979104876518, + "learning_rate": 0.2463017730910088, + "loss": 0.0023, + "num_input_tokens_seen": 19059616, + "step": 11125 + }, + { + "epoch": 54.02905569007264, + "grad_norm": 0.002765527693554759, + "learning_rate": 0.2462566036187289, + "loss": 0.0144, + "num_input_tokens_seen": 19067936, + "step": 11130 + }, + { + "epoch": 54.05326876513317, + "grad_norm": 0.0004756396228913218, + "learning_rate": 0.24621141930247106, + "loss": 0.0053, + "num_input_tokens_seen": 19076640, + "step": 11135 + }, + { + "epoch": 54.077481840193705, + "grad_norm": 0.0011734082363545895, + "learning_rate": 0.2461662201492033, + "loss": 0.0109, + "num_input_tokens_seen": 19085248, + "step": 11140 + }, + { + "epoch": 54.101694915254235, + "grad_norm": 0.00011545914458110929, + "learning_rate": 0.24612100616589586, + "loss": 0.0011, + "num_input_tokens_seen": 19094080, + "step": 11145 + }, + { + "epoch": 54.12590799031477, + "grad_norm": 0.0002800012589432299, + "learning_rate": 0.24607577735952135, + "loss": 0.0005, + "num_input_tokens_seen": 19102688, + "step": 11150 + }, + { + "epoch": 54.1501210653753, + "grad_norm": 0.002934742486104369, + "learning_rate": 0.24603053373705464, + "loss": 0.0074, + "num_input_tokens_seen": 19110912, + "step": 11155 + }, + { + "epoch": 54.17433414043583, + "grad_norm": 0.00024156291328836232, + "learning_rate": 0.2459852753054728, + "loss": 0.0018, + "num_input_tokens_seen": 19119264, + "step": 11160 + }, + { + "epoch": 54.19854721549637, + "grad_norm": 0.00010542735253693536, + "learning_rate": 0.24594000207175526, + "loss": 0.0039, + "num_input_tokens_seen": 19127648, + "step": 11165 + }, + { + "epoch": 54.2227602905569, + "grad_norm": 0.0005427181604318321, + "learning_rate": 0.2458947140428838, + "loss": 0.0075, + "num_input_tokens_seen": 19136032, + "step": 11170 + }, + { + "epoch": 54.246973365617436, + "grad_norm": 0.0019010024843737483, + "learning_rate": 0.24584941122584233, + "loss": 0.004, + "num_input_tokens_seen": 19144544, + "step": 11175 + }, + { + "epoch": 54.271186440677965, + "grad_norm": 0.00025571740115992725, + "learning_rate": 0.24580409362761713, + "loss": 0.0047, + "num_input_tokens_seen": 19153024, + "step": 11180 + }, + { + "epoch": 54.295399515738495, + "grad_norm": 0.0005023619160056114, + "learning_rate": 0.2457587612551967, + "loss": 0.0103, + "num_input_tokens_seen": 19161856, + "step": 11185 + }, + { + "epoch": 54.31961259079903, + "grad_norm": 0.0004891595453955233, + "learning_rate": 0.24571341411557193, + "loss": 0.0204, + "num_input_tokens_seen": 19170528, + "step": 11190 + }, + { + "epoch": 54.34382566585956, + "grad_norm": 0.00323123368434608, + "learning_rate": 0.2456680522157359, + "loss": 0.0029, + "num_input_tokens_seen": 19178720, + "step": 11195 + }, + { + "epoch": 54.3680387409201, + "grad_norm": 0.0049800206907093525, + "learning_rate": 0.245622675562684, + "loss": 0.0076, + "num_input_tokens_seen": 19187296, + "step": 11200 + }, + { + "epoch": 54.3680387409201, + "eval_loss": 0.5882262587547302, + "eval_runtime": 4.6143, + "eval_samples_per_second": 79.535, + "eval_steps_per_second": 19.938, + "num_input_tokens_seen": 19187296, + "step": 11200 + }, + { + "epoch": 54.39225181598063, + "grad_norm": 0.0006670102011412382, + "learning_rate": 0.24557728416341384, + "loss": 0.0055, + "num_input_tokens_seen": 19195680, + "step": 11205 + }, + { + "epoch": 54.416464891041166, + "grad_norm": 0.0039854953065514565, + "learning_rate": 0.24553187802492538, + "loss": 0.0093, + "num_input_tokens_seen": 19204224, + "step": 11210 + }, + { + "epoch": 54.440677966101696, + "grad_norm": 0.0005203376640565693, + "learning_rate": 0.24548645715422074, + "loss": 0.0049, + "num_input_tokens_seen": 19213056, + "step": 11215 + }, + { + "epoch": 54.464891041162225, + "grad_norm": 0.00017724555800668895, + "learning_rate": 0.2454410215583045, + "loss": 0.0101, + "num_input_tokens_seen": 19221664, + "step": 11220 + }, + { + "epoch": 54.48910411622276, + "grad_norm": 0.0003020094591192901, + "learning_rate": 0.24539557124418332, + "loss": 0.0214, + "num_input_tokens_seen": 19230112, + "step": 11225 + }, + { + "epoch": 54.51331719128329, + "grad_norm": 0.0017393050948157907, + "learning_rate": 0.24535010621886624, + "loss": 0.0035, + "num_input_tokens_seen": 19238240, + "step": 11230 + }, + { + "epoch": 54.53753026634383, + "grad_norm": 0.002485118107870221, + "learning_rate": 0.2453046264893646, + "loss": 0.0048, + "num_input_tokens_seen": 19246720, + "step": 11235 + }, + { + "epoch": 54.56174334140436, + "grad_norm": 0.0018875550013035536, + "learning_rate": 0.24525913206269184, + "loss": 0.0151, + "num_input_tokens_seen": 19255712, + "step": 11240 + }, + { + "epoch": 54.58595641646489, + "grad_norm": 0.0007968196296133101, + "learning_rate": 0.2452136229458638, + "loss": 0.0031, + "num_input_tokens_seen": 19264160, + "step": 11245 + }, + { + "epoch": 54.610169491525426, + "grad_norm": 0.005771851632744074, + "learning_rate": 0.24516809914589857, + "loss": 0.008, + "num_input_tokens_seen": 19272736, + "step": 11250 + }, + { + "epoch": 54.634382566585955, + "grad_norm": 0.0016780185978859663, + "learning_rate": 0.2451225606698165, + "loss": 0.0116, + "num_input_tokens_seen": 19281632, + "step": 11255 + }, + { + "epoch": 54.65859564164649, + "grad_norm": 0.00019448799139354378, + "learning_rate": 0.2450770075246402, + "loss": 0.0024, + "num_input_tokens_seen": 19290016, + "step": 11260 + }, + { + "epoch": 54.68280871670702, + "grad_norm": 0.002047543879598379, + "learning_rate": 0.24503143971739455, + "loss": 0.0018, + "num_input_tokens_seen": 19298688, + "step": 11265 + }, + { + "epoch": 54.70702179176755, + "grad_norm": 0.004637548699975014, + "learning_rate": 0.24498585725510663, + "loss": 0.0367, + "num_input_tokens_seen": 19307264, + "step": 11270 + }, + { + "epoch": 54.73123486682809, + "grad_norm": 0.002926443936303258, + "learning_rate": 0.24494026014480583, + "loss": 0.0188, + "num_input_tokens_seen": 19315616, + "step": 11275 + }, + { + "epoch": 54.75544794188862, + "grad_norm": 0.0066160219721496105, + "learning_rate": 0.24489464839352387, + "loss": 0.0134, + "num_input_tokens_seen": 19324448, + "step": 11280 + }, + { + "epoch": 54.779661016949156, + "grad_norm": 0.0003696212370414287, + "learning_rate": 0.2448490220082946, + "loss": 0.0103, + "num_input_tokens_seen": 19333056, + "step": 11285 + }, + { + "epoch": 54.803874092009686, + "grad_norm": 0.006579764652997255, + "learning_rate": 0.24480338099615415, + "loss": 0.0162, + "num_input_tokens_seen": 19341920, + "step": 11290 + }, + { + "epoch": 54.828087167070215, + "grad_norm": 0.009472090750932693, + "learning_rate": 0.244757725364141, + "loss": 0.0125, + "num_input_tokens_seen": 19350624, + "step": 11295 + }, + { + "epoch": 54.85230024213075, + "grad_norm": 0.0001849872642196715, + "learning_rate": 0.24471205511929583, + "loss": 0.021, + "num_input_tokens_seen": 19359360, + "step": 11300 + }, + { + "epoch": 54.87651331719128, + "grad_norm": 0.0003135320730507374, + "learning_rate": 0.24466637026866145, + "loss": 0.0055, + "num_input_tokens_seen": 19368064, + "step": 11305 + }, + { + "epoch": 54.90072639225182, + "grad_norm": 0.0020500430837273598, + "learning_rate": 0.2446206708192832, + "loss": 0.0112, + "num_input_tokens_seen": 19376288, + "step": 11310 + }, + { + "epoch": 54.92493946731235, + "grad_norm": 0.0012529100058600307, + "learning_rate": 0.2445749567782084, + "loss": 0.0093, + "num_input_tokens_seen": 19384576, + "step": 11315 + }, + { + "epoch": 54.94915254237288, + "grad_norm": 0.009300045669078827, + "learning_rate": 0.2445292281524868, + "loss": 0.0274, + "num_input_tokens_seen": 19393248, + "step": 11320 + }, + { + "epoch": 54.973365617433416, + "grad_norm": 0.0003936871071346104, + "learning_rate": 0.24448348494917022, + "loss": 0.014, + "num_input_tokens_seen": 19402176, + "step": 11325 + }, + { + "epoch": 54.997578692493946, + "grad_norm": 5.6206808949355036e-05, + "learning_rate": 0.24443772717531295, + "loss": 0.0012, + "num_input_tokens_seen": 19410464, + "step": 11330 + }, + { + "epoch": 55.02421307506053, + "grad_norm": 0.0011533062206581235, + "learning_rate": 0.24439195483797138, + "loss": 0.0024, + "num_input_tokens_seen": 19419776, + "step": 11335 + }, + { + "epoch": 55.04842615012107, + "grad_norm": 0.00294741103425622, + "learning_rate": 0.24434616794420416, + "loss": 0.0067, + "num_input_tokens_seen": 19428512, + "step": 11340 + }, + { + "epoch": 55.0726392251816, + "grad_norm": 8.277581946458668e-05, + "learning_rate": 0.24430036650107223, + "loss": 0.0018, + "num_input_tokens_seen": 19436736, + "step": 11345 + }, + { + "epoch": 55.09685230024213, + "grad_norm": 0.007638063747435808, + "learning_rate": 0.2442545505156387, + "loss": 0.0419, + "num_input_tokens_seen": 19445280, + "step": 11350 + }, + { + "epoch": 55.12106537530266, + "grad_norm": 0.00014210610243026167, + "learning_rate": 0.24420871999496904, + "loss": 0.0007, + "num_input_tokens_seen": 19453792, + "step": 11355 + }, + { + "epoch": 55.14527845036319, + "grad_norm": 0.0018019105773419142, + "learning_rate": 0.24416287494613084, + "loss": 0.0064, + "num_input_tokens_seen": 19462592, + "step": 11360 + }, + { + "epoch": 55.16949152542373, + "grad_norm": 0.001212629838846624, + "learning_rate": 0.24411701537619399, + "loss": 0.0027, + "num_input_tokens_seen": 19470752, + "step": 11365 + }, + { + "epoch": 55.19370460048426, + "grad_norm": 0.00017831842706073076, + "learning_rate": 0.24407114129223062, + "loss": 0.0043, + "num_input_tokens_seen": 19479424, + "step": 11370 + }, + { + "epoch": 55.2179176755448, + "grad_norm": 0.0038111202884465456, + "learning_rate": 0.2440252527013151, + "loss": 0.0098, + "num_input_tokens_seen": 19487744, + "step": 11375 + }, + { + "epoch": 55.24213075060533, + "grad_norm": 0.007913678884506226, + "learning_rate": 0.24397934961052403, + "loss": 0.0114, + "num_input_tokens_seen": 19496288, + "step": 11380 + }, + { + "epoch": 55.26634382566586, + "grad_norm": 0.0008859282825142145, + "learning_rate": 0.24393343202693618, + "loss": 0.0163, + "num_input_tokens_seen": 19505184, + "step": 11385 + }, + { + "epoch": 55.29055690072639, + "grad_norm": 5.2850031352136284e-05, + "learning_rate": 0.2438874999576327, + "loss": 0.0129, + "num_input_tokens_seen": 19513440, + "step": 11390 + }, + { + "epoch": 55.31476997578692, + "grad_norm": 0.01403718814253807, + "learning_rate": 0.24384155340969688, + "loss": 0.0197, + "num_input_tokens_seen": 19521632, + "step": 11395 + }, + { + "epoch": 55.33898305084746, + "grad_norm": 0.0009096984867937863, + "learning_rate": 0.24379559239021423, + "loss": 0.042, + "num_input_tokens_seen": 19529792, + "step": 11400 + }, + { + "epoch": 55.33898305084746, + "eval_loss": 0.4477413296699524, + "eval_runtime": 4.6269, + "eval_samples_per_second": 79.319, + "eval_steps_per_second": 19.884, + "num_input_tokens_seen": 19529792, + "step": 11400 + }, + { + "epoch": 55.36319612590799, + "grad_norm": 0.0008335176389664412, + "learning_rate": 0.2437496169062725, + "loss": 0.0043, + "num_input_tokens_seen": 19538112, + "step": 11405 + }, + { + "epoch": 55.38740920096852, + "grad_norm": 0.000466277968371287, + "learning_rate": 0.24370362696496176, + "loss": 0.011, + "num_input_tokens_seen": 19546720, + "step": 11410 + }, + { + "epoch": 55.41162227602906, + "grad_norm": 0.0006170221604406834, + "learning_rate": 0.24365762257337417, + "loss": 0.0079, + "num_input_tokens_seen": 19555296, + "step": 11415 + }, + { + "epoch": 55.43583535108959, + "grad_norm": 0.000859297055285424, + "learning_rate": 0.2436116037386042, + "loss": 0.0067, + "num_input_tokens_seen": 19564128, + "step": 11420 + }, + { + "epoch": 55.460048426150124, + "grad_norm": 0.0007472372381016612, + "learning_rate": 0.24356557046774852, + "loss": 0.0039, + "num_input_tokens_seen": 19572512, + "step": 11425 + }, + { + "epoch": 55.48426150121065, + "grad_norm": 0.002696782350540161, + "learning_rate": 0.24351952276790606, + "loss": 0.0106, + "num_input_tokens_seen": 19580768, + "step": 11430 + }, + { + "epoch": 55.50847457627118, + "grad_norm": 0.00035268356441520154, + "learning_rate": 0.24347346064617797, + "loss": 0.0055, + "num_input_tokens_seen": 19589344, + "step": 11435 + }, + { + "epoch": 55.53268765133172, + "grad_norm": 0.0007959622889757156, + "learning_rate": 0.24342738410966758, + "loss": 0.0054, + "num_input_tokens_seen": 19597792, + "step": 11440 + }, + { + "epoch": 55.55690072639225, + "grad_norm": 0.0005117912660352886, + "learning_rate": 0.24338129316548046, + "loss": 0.0148, + "num_input_tokens_seen": 19606496, + "step": 11445 + }, + { + "epoch": 55.58111380145279, + "grad_norm": 0.0003300872049294412, + "learning_rate": 0.24333518782072444, + "loss": 0.0274, + "num_input_tokens_seen": 19615296, + "step": 11450 + }, + { + "epoch": 55.60532687651332, + "grad_norm": 0.005475268699228764, + "learning_rate": 0.24328906808250952, + "loss": 0.0292, + "num_input_tokens_seen": 19623744, + "step": 11455 + }, + { + "epoch": 55.62953995157385, + "grad_norm": 0.000916951394174248, + "learning_rate": 0.243242933957948, + "loss": 0.0121, + "num_input_tokens_seen": 19632160, + "step": 11460 + }, + { + "epoch": 55.653753026634384, + "grad_norm": 0.001667837961576879, + "learning_rate": 0.24319678545415427, + "loss": 0.0058, + "num_input_tokens_seen": 19640576, + "step": 11465 + }, + { + "epoch": 55.67796610169491, + "grad_norm": 0.0010166538413614035, + "learning_rate": 0.24315062257824507, + "loss": 0.0037, + "num_input_tokens_seen": 19649344, + "step": 11470 + }, + { + "epoch": 55.70217917675545, + "grad_norm": 0.0009599123150110245, + "learning_rate": 0.24310444533733921, + "loss": 0.0038, + "num_input_tokens_seen": 19657824, + "step": 11475 + }, + { + "epoch": 55.72639225181598, + "grad_norm": 1.684766357357148e-05, + "learning_rate": 0.2430582537385579, + "loss": 0.0124, + "num_input_tokens_seen": 19666592, + "step": 11480 + }, + { + "epoch": 55.75060532687651, + "grad_norm": 0.0038883674424141645, + "learning_rate": 0.2430120477890244, + "loss": 0.0085, + "num_input_tokens_seen": 19675488, + "step": 11485 + }, + { + "epoch": 55.77481840193705, + "grad_norm": 0.0006615977035835385, + "learning_rate": 0.24296582749586426, + "loss": 0.0059, + "num_input_tokens_seen": 19684160, + "step": 11490 + }, + { + "epoch": 55.79903147699758, + "grad_norm": 0.00277089630253613, + "learning_rate": 0.24291959286620526, + "loss": 0.0032, + "num_input_tokens_seen": 19693024, + "step": 11495 + }, + { + "epoch": 55.823244552058114, + "grad_norm": 0.00878493208438158, + "learning_rate": 0.24287334390717738, + "loss": 0.0206, + "num_input_tokens_seen": 19701760, + "step": 11500 + }, + { + "epoch": 55.847457627118644, + "grad_norm": 0.008451533503830433, + "learning_rate": 0.24282708062591268, + "loss": 0.0084, + "num_input_tokens_seen": 19710400, + "step": 11505 + }, + { + "epoch": 55.87167070217917, + "grad_norm": 0.00015932579117361456, + "learning_rate": 0.24278080302954563, + "loss": 0.0126, + "num_input_tokens_seen": 19718848, + "step": 11510 + }, + { + "epoch": 55.89588377723971, + "grad_norm": 0.00010309196659363806, + "learning_rate": 0.24273451112521283, + "loss": 0.022, + "num_input_tokens_seen": 19726976, + "step": 11515 + }, + { + "epoch": 55.92009685230024, + "grad_norm": 5.226223584031686e-05, + "learning_rate": 0.242688204920053, + "loss": 0.0062, + "num_input_tokens_seen": 19735488, + "step": 11520 + }, + { + "epoch": 55.94430992736078, + "grad_norm": 0.0007036825409159064, + "learning_rate": 0.24264188442120715, + "loss": 0.0114, + "num_input_tokens_seen": 19744000, + "step": 11525 + }, + { + "epoch": 55.96852300242131, + "grad_norm": 0.009082849137485027, + "learning_rate": 0.24259554963581853, + "loss": 0.0216, + "num_input_tokens_seen": 19752992, + "step": 11530 + }, + { + "epoch": 55.99273607748184, + "grad_norm": 0.004037275444716215, + "learning_rate": 0.24254920057103257, + "loss": 0.0191, + "num_input_tokens_seen": 19761440, + "step": 11535 + }, + { + "epoch": 56.01937046004843, + "grad_norm": 0.004109177738428116, + "learning_rate": 0.24250283723399685, + "loss": 0.0124, + "num_input_tokens_seen": 19770560, + "step": 11540 + }, + { + "epoch": 56.04358353510896, + "grad_norm": 0.00010242993448628113, + "learning_rate": 0.24245645963186108, + "loss": 0.004, + "num_input_tokens_seen": 19778976, + "step": 11545 + }, + { + "epoch": 56.067796610169495, + "grad_norm": 0.004380055237561464, + "learning_rate": 0.2424100677717774, + "loss": 0.0039, + "num_input_tokens_seen": 19787616, + "step": 11550 + }, + { + "epoch": 56.092009685230025, + "grad_norm": 0.013755116611719131, + "learning_rate": 0.24236366166090004, + "loss": 0.0174, + "num_input_tokens_seen": 19796384, + "step": 11555 + }, + { + "epoch": 56.116222760290555, + "grad_norm": 0.002494401764124632, + "learning_rate": 0.24231724130638527, + "loss": 0.004, + "num_input_tokens_seen": 19804832, + "step": 11560 + }, + { + "epoch": 56.14043583535109, + "grad_norm": 0.0032536047510802746, + "learning_rate": 0.2422708067153917, + "loss": 0.0132, + "num_input_tokens_seen": 19813792, + "step": 11565 + }, + { + "epoch": 56.16464891041162, + "grad_norm": 0.011018183082342148, + "learning_rate": 0.24222435789508026, + "loss": 0.029, + "num_input_tokens_seen": 19822016, + "step": 11570 + }, + { + "epoch": 56.18886198547216, + "grad_norm": 3.711076351464726e-05, + "learning_rate": 0.24217789485261387, + "loss": 0.002, + "num_input_tokens_seen": 19830432, + "step": 11575 + }, + { + "epoch": 56.21307506053269, + "grad_norm": 0.00023924620472826064, + "learning_rate": 0.2421314175951577, + "loss": 0.0032, + "num_input_tokens_seen": 19839168, + "step": 11580 + }, + { + "epoch": 56.23728813559322, + "grad_norm": 0.0034829715732485056, + "learning_rate": 0.2420849261298791, + "loss": 0.0128, + "num_input_tokens_seen": 19848064, + "step": 11585 + }, + { + "epoch": 56.261501210653755, + "grad_norm": 0.011074673384428024, + "learning_rate": 0.24203842046394775, + "loss": 0.0151, + "num_input_tokens_seen": 19856704, + "step": 11590 + }, + { + "epoch": 56.285714285714285, + "grad_norm": 0.00033696426544338465, + "learning_rate": 0.24199190060453535, + "loss": 0.0048, + "num_input_tokens_seen": 19865120, + "step": 11595 + }, + { + "epoch": 56.30992736077482, + "grad_norm": 0.006888486910611391, + "learning_rate": 0.2419453665588158, + "loss": 0.0147, + "num_input_tokens_seen": 19873728, + "step": 11600 + }, + { + "epoch": 56.30992736077482, + "eval_loss": 0.5451940298080444, + "eval_runtime": 4.6247, + "eval_samples_per_second": 79.357, + "eval_steps_per_second": 19.893, + "num_input_tokens_seen": 19873728, + "step": 11600 + }, + { + "epoch": 56.33414043583535, + "grad_norm": 0.0008300399640575051, + "learning_rate": 0.24189881833396523, + "loss": 0.0092, + "num_input_tokens_seen": 19882336, + "step": 11605 + }, + { + "epoch": 56.35835351089588, + "grad_norm": 0.008790943771600723, + "learning_rate": 0.24185225593716203, + "loss": 0.0387, + "num_input_tokens_seen": 19890688, + "step": 11610 + }, + { + "epoch": 56.38256658595642, + "grad_norm": 0.00019405438797548413, + "learning_rate": 0.2418056793755867, + "loss": 0.0009, + "num_input_tokens_seen": 19899296, + "step": 11615 + }, + { + "epoch": 56.40677966101695, + "grad_norm": 0.0005692241829819977, + "learning_rate": 0.24175908865642187, + "loss": 0.024, + "num_input_tokens_seen": 19907520, + "step": 11620 + }, + { + "epoch": 56.430992736077485, + "grad_norm": 0.0015663034282624722, + "learning_rate": 0.24171248378685248, + "loss": 0.0151, + "num_input_tokens_seen": 19916128, + "step": 11625 + }, + { + "epoch": 56.455205811138015, + "grad_norm": 0.00010815326095325872, + "learning_rate": 0.24166586477406554, + "loss": 0.0506, + "num_input_tokens_seen": 19924768, + "step": 11630 + }, + { + "epoch": 56.479418886198545, + "grad_norm": 0.002545187482610345, + "learning_rate": 0.24161923162525034, + "loss": 0.0071, + "num_input_tokens_seen": 19933632, + "step": 11635 + }, + { + "epoch": 56.50363196125908, + "grad_norm": 0.005468389019370079, + "learning_rate": 0.2415725843475982, + "loss": 0.0219, + "num_input_tokens_seen": 19941952, + "step": 11640 + }, + { + "epoch": 56.52784503631961, + "grad_norm": 0.008520971052348614, + "learning_rate": 0.24152592294830286, + "loss": 0.0218, + "num_input_tokens_seen": 19950656, + "step": 11645 + }, + { + "epoch": 56.55205811138015, + "grad_norm": 0.0007136841304600239, + "learning_rate": 0.24147924743455995, + "loss": 0.0136, + "num_input_tokens_seen": 19959136, + "step": 11650 + }, + { + "epoch": 56.57627118644068, + "grad_norm": 0.00017117623065132648, + "learning_rate": 0.24143255781356754, + "loss": 0.0094, + "num_input_tokens_seen": 19967776, + "step": 11655 + }, + { + "epoch": 56.60048426150121, + "grad_norm": 0.00010242087591905147, + "learning_rate": 0.24138585409252566, + "loss": 0.0099, + "num_input_tokens_seen": 19976192, + "step": 11660 + }, + { + "epoch": 56.624697336561745, + "grad_norm": 0.0005252289702184498, + "learning_rate": 0.24133913627863662, + "loss": 0.004, + "num_input_tokens_seen": 19984736, + "step": 11665 + }, + { + "epoch": 56.648910411622275, + "grad_norm": 0.00015069724759086967, + "learning_rate": 0.241292404379105, + "loss": 0.0123, + "num_input_tokens_seen": 19993088, + "step": 11670 + }, + { + "epoch": 56.67312348668281, + "grad_norm": 0.006676867604255676, + "learning_rate": 0.24124565840113735, + "loss": 0.0117, + "num_input_tokens_seen": 20001536, + "step": 11675 + }, + { + "epoch": 56.69733656174334, + "grad_norm": 0.00882598478347063, + "learning_rate": 0.2411988983519425, + "loss": 0.0118, + "num_input_tokens_seen": 20010208, + "step": 11680 + }, + { + "epoch": 56.72154963680387, + "grad_norm": 0.005572310648858547, + "learning_rate": 0.24115212423873145, + "loss": 0.0269, + "num_input_tokens_seen": 20018752, + "step": 11685 + }, + { + "epoch": 56.74576271186441, + "grad_norm": 0.00018547041690908372, + "learning_rate": 0.24110533606871737, + "loss": 0.0066, + "num_input_tokens_seen": 20027488, + "step": 11690 + }, + { + "epoch": 56.76997578692494, + "grad_norm": 0.0003932613763026893, + "learning_rate": 0.24105853384911552, + "loss": 0.0116, + "num_input_tokens_seen": 20035968, + "step": 11695 + }, + { + "epoch": 56.794188861985475, + "grad_norm": 0.0012564575299620628, + "learning_rate": 0.24101171758714346, + "loss": 0.0372, + "num_input_tokens_seen": 20044896, + "step": 11700 + }, + { + "epoch": 56.818401937046005, + "grad_norm": 0.0010491001885384321, + "learning_rate": 0.24096488729002086, + "loss": 0.0119, + "num_input_tokens_seen": 20053504, + "step": 11705 + }, + { + "epoch": 56.842615012106535, + "grad_norm": 0.005455024540424347, + "learning_rate": 0.24091804296496946, + "loss": 0.0299, + "num_input_tokens_seen": 20062144, + "step": 11710 + }, + { + "epoch": 56.86682808716707, + "grad_norm": 0.0030407668091356754, + "learning_rate": 0.2408711846192133, + "loss": 0.0146, + "num_input_tokens_seen": 20070656, + "step": 11715 + }, + { + "epoch": 56.8910411622276, + "grad_norm": 0.0001760929444571957, + "learning_rate": 0.24082431225997855, + "loss": 0.0073, + "num_input_tokens_seen": 20079168, + "step": 11720 + }, + { + "epoch": 56.91525423728814, + "grad_norm": 0.0008517725509591401, + "learning_rate": 0.24077742589449344, + "loss": 0.0154, + "num_input_tokens_seen": 20087200, + "step": 11725 + }, + { + "epoch": 56.93946731234867, + "grad_norm": 0.0007828353554941714, + "learning_rate": 0.24073052552998844, + "loss": 0.0089, + "num_input_tokens_seen": 20095808, + "step": 11730 + }, + { + "epoch": 56.9636803874092, + "grad_norm": 0.007630815729498863, + "learning_rate": 0.2406836111736963, + "loss": 0.0198, + "num_input_tokens_seen": 20104576, + "step": 11735 + }, + { + "epoch": 56.987893462469735, + "grad_norm": 0.0021728752180933952, + "learning_rate": 0.2406366828328517, + "loss": 0.0193, + "num_input_tokens_seen": 20113056, + "step": 11740 + }, + { + "epoch": 57.01452784503632, + "grad_norm": 0.0004419867764227092, + "learning_rate": 0.2405897405146915, + "loss": 0.0016, + "num_input_tokens_seen": 20122016, + "step": 11745 + }, + { + "epoch": 57.03874092009685, + "grad_norm": 0.007449570577591658, + "learning_rate": 0.240542784226455, + "loss": 0.011, + "num_input_tokens_seen": 20130208, + "step": 11750 + }, + { + "epoch": 57.062953995157386, + "grad_norm": 0.00015219645865727216, + "learning_rate": 0.24049581397538328, + "loss": 0.0225, + "num_input_tokens_seen": 20138720, + "step": 11755 + }, + { + "epoch": 57.087167070217916, + "grad_norm": 0.0010872062994167209, + "learning_rate": 0.24044882976871984, + "loss": 0.0143, + "num_input_tokens_seen": 20147648, + "step": 11760 + }, + { + "epoch": 57.11138014527845, + "grad_norm": 5.994912135065533e-05, + "learning_rate": 0.2404018316137102, + "loss": 0.0144, + "num_input_tokens_seen": 20155936, + "step": 11765 + }, + { + "epoch": 57.13559322033898, + "grad_norm": 0.0027321898378431797, + "learning_rate": 0.24035481951760204, + "loss": 0.0049, + "num_input_tokens_seen": 20164448, + "step": 11770 + }, + { + "epoch": 57.15980629539951, + "grad_norm": 0.0003096248547080904, + "learning_rate": 0.2403077934876452, + "loss": 0.0376, + "num_input_tokens_seen": 20173152, + "step": 11775 + }, + { + "epoch": 57.18401937046005, + "grad_norm": 0.008939704857766628, + "learning_rate": 0.2402607535310918, + "loss": 0.0208, + "num_input_tokens_seen": 20181664, + "step": 11780 + }, + { + "epoch": 57.20823244552058, + "grad_norm": 0.0002128948544850573, + "learning_rate": 0.2402136996551959, + "loss": 0.025, + "num_input_tokens_seen": 20190080, + "step": 11785 + }, + { + "epoch": 57.232445520581116, + "grad_norm": 0.0018594851717352867, + "learning_rate": 0.24016663186721376, + "loss": 0.0026, + "num_input_tokens_seen": 20198816, + "step": 11790 + }, + { + "epoch": 57.256658595641646, + "grad_norm": 0.01261682529002428, + "learning_rate": 0.24011955017440395, + "loss": 0.0199, + "num_input_tokens_seen": 20207008, + "step": 11795 + }, + { + "epoch": 57.280871670702176, + "grad_norm": 4.9354712245985866e-05, + "learning_rate": 0.24007245458402696, + "loss": 0.0166, + "num_input_tokens_seen": 20215680, + "step": 11800 + }, + { + "epoch": 57.280871670702176, + "eval_loss": 0.5036935210227966, + "eval_runtime": 4.6207, + "eval_samples_per_second": 79.426, + "eval_steps_per_second": 19.91, + "num_input_tokens_seen": 20215680, + "step": 11800 + }, + { + "epoch": 57.30508474576271, + "grad_norm": 0.008431616239249706, + "learning_rate": 0.2400253451033456, + "loss": 0.0262, + "num_input_tokens_seen": 20224224, + "step": 11805 + }, + { + "epoch": 57.32929782082324, + "grad_norm": 0.002079994650557637, + "learning_rate": 0.23997822173962463, + "loss": 0.0444, + "num_input_tokens_seen": 20232736, + "step": 11810 + }, + { + "epoch": 57.35351089588378, + "grad_norm": 0.0014943407149985433, + "learning_rate": 0.23993108450013118, + "loss": 0.0163, + "num_input_tokens_seen": 20241152, + "step": 11815 + }, + { + "epoch": 57.37772397094431, + "grad_norm": 0.00873255543410778, + "learning_rate": 0.2398839333921343, + "loss": 0.0265, + "num_input_tokens_seen": 20249632, + "step": 11820 + }, + { + "epoch": 57.40193704600484, + "grad_norm": 0.003573615336790681, + "learning_rate": 0.23983676842290536, + "loss": 0.0134, + "num_input_tokens_seen": 20258432, + "step": 11825 + }, + { + "epoch": 57.426150121065376, + "grad_norm": 0.00034312077332288027, + "learning_rate": 0.2397895895997178, + "loss": 0.0095, + "num_input_tokens_seen": 20266880, + "step": 11830 + }, + { + "epoch": 57.450363196125906, + "grad_norm": 0.0060109300538897514, + "learning_rate": 0.23974239692984714, + "loss": 0.0084, + "num_input_tokens_seen": 20275168, + "step": 11835 + }, + { + "epoch": 57.47457627118644, + "grad_norm": 0.0049270158633589745, + "learning_rate": 0.2396951904205711, + "loss": 0.0078, + "num_input_tokens_seen": 20283776, + "step": 11840 + }, + { + "epoch": 57.49878934624697, + "grad_norm": 0.002385346684604883, + "learning_rate": 0.23964797007916952, + "loss": 0.0121, + "num_input_tokens_seen": 20292160, + "step": 11845 + }, + { + "epoch": 57.5230024213075, + "grad_norm": 0.0009424402378499508, + "learning_rate": 0.23960073591292436, + "loss": 0.0229, + "num_input_tokens_seen": 20301152, + "step": 11850 + }, + { + "epoch": 57.54721549636804, + "grad_norm": 0.0033035229425877333, + "learning_rate": 0.2395534879291197, + "loss": 0.0061, + "num_input_tokens_seen": 20309856, + "step": 11855 + }, + { + "epoch": 57.57142857142857, + "grad_norm": 0.0005990634672343731, + "learning_rate": 0.23950622613504186, + "loss": 0.0126, + "num_input_tokens_seen": 20318464, + "step": 11860 + }, + { + "epoch": 57.595641646489106, + "grad_norm": 0.0001540368830319494, + "learning_rate": 0.2394589505379791, + "loss": 0.0032, + "num_input_tokens_seen": 20327072, + "step": 11865 + }, + { + "epoch": 57.619854721549636, + "grad_norm": 0.002219670219346881, + "learning_rate": 0.23941166114522197, + "loss": 0.0149, + "num_input_tokens_seen": 20335552, + "step": 11870 + }, + { + "epoch": 57.644067796610166, + "grad_norm": 0.00047693151282146573, + "learning_rate": 0.23936435796406308, + "loss": 0.0053, + "num_input_tokens_seen": 20344416, + "step": 11875 + }, + { + "epoch": 57.6682808716707, + "grad_norm": 0.0011338329641148448, + "learning_rate": 0.23931704100179715, + "loss": 0.0192, + "num_input_tokens_seen": 20353152, + "step": 11880 + }, + { + "epoch": 57.69249394673123, + "grad_norm": 0.003637917572632432, + "learning_rate": 0.2392697102657211, + "loss": 0.0105, + "num_input_tokens_seen": 20361824, + "step": 11885 + }, + { + "epoch": 57.71670702179177, + "grad_norm": 0.00025375845143571496, + "learning_rate": 0.23922236576313388, + "loss": 0.0038, + "num_input_tokens_seen": 20370528, + "step": 11890 + }, + { + "epoch": 57.7409200968523, + "grad_norm": 0.0005535701056942344, + "learning_rate": 0.2391750075013366, + "loss": 0.0044, + "num_input_tokens_seen": 20379168, + "step": 11895 + }, + { + "epoch": 57.76513317191284, + "grad_norm": 0.004067595116794109, + "learning_rate": 0.2391276354876326, + "loss": 0.0046, + "num_input_tokens_seen": 20387520, + "step": 11900 + }, + { + "epoch": 57.789346246973366, + "grad_norm": 0.0005607693456113338, + "learning_rate": 0.23908024972932707, + "loss": 0.0016, + "num_input_tokens_seen": 20396512, + "step": 11905 + }, + { + "epoch": 57.813559322033896, + "grad_norm": 0.007944297976791859, + "learning_rate": 0.2390328502337276, + "loss": 0.0087, + "num_input_tokens_seen": 20405024, + "step": 11910 + }, + { + "epoch": 57.83777239709443, + "grad_norm": 0.0006996900192461908, + "learning_rate": 0.23898543700814376, + "loss": 0.012, + "num_input_tokens_seen": 20413248, + "step": 11915 + }, + { + "epoch": 57.86198547215496, + "grad_norm": 0.0005103507428430021, + "learning_rate": 0.2389380100598873, + "loss": 0.0035, + "num_input_tokens_seen": 20421504, + "step": 11920 + }, + { + "epoch": 57.8861985472155, + "grad_norm": 0.0022842849139124155, + "learning_rate": 0.23889056939627207, + "loss": 0.0038, + "num_input_tokens_seen": 20430144, + "step": 11925 + }, + { + "epoch": 57.91041162227603, + "grad_norm": 0.0002008612354984507, + "learning_rate": 0.23884311502461386, + "loss": 0.0113, + "num_input_tokens_seen": 20438880, + "step": 11930 + }, + { + "epoch": 57.93462469733656, + "grad_norm": 0.00616917759180069, + "learning_rate": 0.23879564695223088, + "loss": 0.0199, + "num_input_tokens_seen": 20447168, + "step": 11935 + }, + { + "epoch": 57.958837772397096, + "grad_norm": 0.000884979497641325, + "learning_rate": 0.23874816518644332, + "loss": 0.0063, + "num_input_tokens_seen": 20455712, + "step": 11940 + }, + { + "epoch": 57.983050847457626, + "grad_norm": 0.0002042231644736603, + "learning_rate": 0.23870066973457335, + "loss": 0.0232, + "num_input_tokens_seen": 20463936, + "step": 11945 + }, + { + "epoch": 58.00968523002421, + "grad_norm": 8.360170613741502e-05, + "learning_rate": 0.23865316060394545, + "loss": 0.0101, + "num_input_tokens_seen": 20473120, + "step": 11950 + }, + { + "epoch": 58.03389830508475, + "grad_norm": 6.980962643865496e-05, + "learning_rate": 0.2386056378018861, + "loss": 0.0038, + "num_input_tokens_seen": 20481600, + "step": 11955 + }, + { + "epoch": 58.05811138014528, + "grad_norm": 0.0005908331950195134, + "learning_rate": 0.2385581013357239, + "loss": 0.0046, + "num_input_tokens_seen": 20490400, + "step": 11960 + }, + { + "epoch": 58.082324455205814, + "grad_norm": 0.0013283099979162216, + "learning_rate": 0.23851055121278958, + "loss": 0.0066, + "num_input_tokens_seen": 20499136, + "step": 11965 + }, + { + "epoch": 58.106537530266344, + "grad_norm": 0.007148314733058214, + "learning_rate": 0.23846298744041594, + "loss": 0.0087, + "num_input_tokens_seen": 20507680, + "step": 11970 + }, + { + "epoch": 58.130750605326874, + "grad_norm": 0.0005068094469606876, + "learning_rate": 0.23841541002593802, + "loss": 0.0008, + "num_input_tokens_seen": 20516096, + "step": 11975 + }, + { + "epoch": 58.15496368038741, + "grad_norm": 0.00010303925955668092, + "learning_rate": 0.23836781897669276, + "loss": 0.0019, + "num_input_tokens_seen": 20524480, + "step": 11980 + }, + { + "epoch": 58.17917675544794, + "grad_norm": 0.0077032907865941525, + "learning_rate": 0.23832021430001926, + "loss": 0.0082, + "num_input_tokens_seen": 20532832, + "step": 11985 + }, + { + "epoch": 58.20338983050848, + "grad_norm": 0.00021432161156553775, + "learning_rate": 0.2382725960032588, + "loss": 0.0009, + "num_input_tokens_seen": 20541568, + "step": 11990 + }, + { + "epoch": 58.22760290556901, + "grad_norm": 0.000823044974822551, + "learning_rate": 0.23822496409375482, + "loss": 0.0043, + "num_input_tokens_seen": 20549952, + "step": 11995 + }, + { + "epoch": 58.25181598062954, + "grad_norm": 0.00034212053287774324, + "learning_rate": 0.2381773185788526, + "loss": 0.0022, + "num_input_tokens_seen": 20558624, + "step": 12000 + }, + { + "epoch": 58.25181598062954, + "eval_loss": 0.5719043016433716, + "eval_runtime": 4.6339, + "eval_samples_per_second": 79.199, + "eval_steps_per_second": 19.854, + "num_input_tokens_seen": 20558624, + "step": 12000 + }, + { + "epoch": 58.276029055690074, + "grad_norm": 0.0004907196271233261, + "learning_rate": 0.2381296594658998, + "loss": 0.0009, + "num_input_tokens_seen": 20567360, + "step": 12005 + }, + { + "epoch": 58.300242130750604, + "grad_norm": 0.00042665403452701867, + "learning_rate": 0.238081986762246, + "loss": 0.0031, + "num_input_tokens_seen": 20575904, + "step": 12010 + }, + { + "epoch": 58.32445520581114, + "grad_norm": 0.004032065626233816, + "learning_rate": 0.23803430047524293, + "loss": 0.0072, + "num_input_tokens_seen": 20584416, + "step": 12015 + }, + { + "epoch": 58.34866828087167, + "grad_norm": 0.00011732536950148642, + "learning_rate": 0.23798660061224441, + "loss": 0.0039, + "num_input_tokens_seen": 20593184, + "step": 12020 + }, + { + "epoch": 58.3728813559322, + "grad_norm": 0.00012200529454275966, + "learning_rate": 0.23793888718060632, + "loss": 0.0006, + "num_input_tokens_seen": 20601568, + "step": 12025 + }, + { + "epoch": 58.39709443099274, + "grad_norm": 0.0002543975133448839, + "learning_rate": 0.23789116018768675, + "loss": 0.0074, + "num_input_tokens_seen": 20610240, + "step": 12030 + }, + { + "epoch": 58.42130750605327, + "grad_norm": 0.0005467755254358053, + "learning_rate": 0.2378434196408458, + "loss": 0.0024, + "num_input_tokens_seen": 20619136, + "step": 12035 + }, + { + "epoch": 58.445520581113804, + "grad_norm": 0.0003100768371950835, + "learning_rate": 0.23779566554744563, + "loss": 0.002, + "num_input_tokens_seen": 20627584, + "step": 12040 + }, + { + "epoch": 58.469733656174334, + "grad_norm": 0.0039864820428192616, + "learning_rate": 0.23774789791485051, + "loss": 0.0043, + "num_input_tokens_seen": 20636192, + "step": 12045 + }, + { + "epoch": 58.493946731234864, + "grad_norm": 0.0013781226007267833, + "learning_rate": 0.2377001167504268, + "loss": 0.0054, + "num_input_tokens_seen": 20644896, + "step": 12050 + }, + { + "epoch": 58.5181598062954, + "grad_norm": 2.6943984266836196e-05, + "learning_rate": 0.23765232206154302, + "loss": 0.0019, + "num_input_tokens_seen": 20653696, + "step": 12055 + }, + { + "epoch": 58.54237288135593, + "grad_norm": 2.0770259652636014e-05, + "learning_rate": 0.23760451385556966, + "loss": 0.0035, + "num_input_tokens_seen": 20662304, + "step": 12060 + }, + { + "epoch": 58.56658595641647, + "grad_norm": 0.0001502202358096838, + "learning_rate": 0.23755669213987932, + "loss": 0.0151, + "num_input_tokens_seen": 20670976, + "step": 12065 + }, + { + "epoch": 58.590799031477, + "grad_norm": 0.0003401805297471583, + "learning_rate": 0.23750885692184676, + "loss": 0.009, + "num_input_tokens_seen": 20679552, + "step": 12070 + }, + { + "epoch": 58.61501210653753, + "grad_norm": 0.00011124116281280294, + "learning_rate": 0.23746100820884875, + "loss": 0.0123, + "num_input_tokens_seen": 20688352, + "step": 12075 + }, + { + "epoch": 58.639225181598064, + "grad_norm": 0.0026645485777407885, + "learning_rate": 0.23741314600826421, + "loss": 0.0132, + "num_input_tokens_seen": 20697152, + "step": 12080 + }, + { + "epoch": 58.663438256658594, + "grad_norm": 0.0016229443717747927, + "learning_rate": 0.23736527032747406, + "loss": 0.0016, + "num_input_tokens_seen": 20705760, + "step": 12085 + }, + { + "epoch": 58.68765133171913, + "grad_norm": 0.0007580490200780332, + "learning_rate": 0.23731738117386128, + "loss": 0.0082, + "num_input_tokens_seen": 20714080, + "step": 12090 + }, + { + "epoch": 58.71186440677966, + "grad_norm": 0.0032864308450371027, + "learning_rate": 0.237269478554811, + "loss": 0.0025, + "num_input_tokens_seen": 20722592, + "step": 12095 + }, + { + "epoch": 58.73607748184019, + "grad_norm": 0.0005697034066542983, + "learning_rate": 0.23722156247771053, + "loss": 0.0133, + "num_input_tokens_seen": 20730752, + "step": 12100 + }, + { + "epoch": 58.76029055690073, + "grad_norm": 0.0001645233714953065, + "learning_rate": 0.23717363294994895, + "loss": 0.0044, + "num_input_tokens_seen": 20739200, + "step": 12105 + }, + { + "epoch": 58.78450363196126, + "grad_norm": 0.00464207399636507, + "learning_rate": 0.2371256899789177, + "loss": 0.0062, + "num_input_tokens_seen": 20747776, + "step": 12110 + }, + { + "epoch": 58.808716707021794, + "grad_norm": 0.002578640589490533, + "learning_rate": 0.23707773357201017, + "loss": 0.0124, + "num_input_tokens_seen": 20756000, + "step": 12115 + }, + { + "epoch": 58.832929782082324, + "grad_norm": 0.00045351343578659, + "learning_rate": 0.2370297637366218, + "loss": 0.0031, + "num_input_tokens_seen": 20764672, + "step": 12120 + }, + { + "epoch": 58.857142857142854, + "grad_norm": 0.00017691790708340704, + "learning_rate": 0.23698178048015026, + "loss": 0.0019, + "num_input_tokens_seen": 20773280, + "step": 12125 + }, + { + "epoch": 58.88135593220339, + "grad_norm": 0.000722875352948904, + "learning_rate": 0.236933783809995, + "loss": 0.0073, + "num_input_tokens_seen": 20781952, + "step": 12130 + }, + { + "epoch": 58.90556900726392, + "grad_norm": 0.0006928906659595668, + "learning_rate": 0.23688577373355785, + "loss": 0.0015, + "num_input_tokens_seen": 20790592, + "step": 12135 + }, + { + "epoch": 58.92978208232446, + "grad_norm": 0.0030676887836307287, + "learning_rate": 0.23683775025824247, + "loss": 0.0059, + "num_input_tokens_seen": 20799264, + "step": 12140 + }, + { + "epoch": 58.95399515738499, + "grad_norm": 0.006642740219831467, + "learning_rate": 0.2367897133914548, + "loss": 0.0059, + "num_input_tokens_seen": 20807488, + "step": 12145 + }, + { + "epoch": 58.97820823244552, + "grad_norm": 6.273800681810826e-05, + "learning_rate": 0.2367416631406026, + "loss": 0.0018, + "num_input_tokens_seen": 20816352, + "step": 12150 + }, + { + "epoch": 59.00484261501211, + "grad_norm": 0.002689128741621971, + "learning_rate": 0.23669359951309588, + "loss": 0.0019, + "num_input_tokens_seen": 20825120, + "step": 12155 + }, + { + "epoch": 59.02905569007264, + "grad_norm": 5.996465188218281e-05, + "learning_rate": 0.23664552251634666, + "loss": 0.0005, + "num_input_tokens_seen": 20833408, + "step": 12160 + }, + { + "epoch": 59.05326876513317, + "grad_norm": 9.996088192565367e-05, + "learning_rate": 0.23659743215776907, + "loss": 0.0006, + "num_input_tokens_seen": 20841792, + "step": 12165 + }, + { + "epoch": 59.077481840193705, + "grad_norm": 0.0013946618419140577, + "learning_rate": 0.23654932844477908, + "loss": 0.0015, + "num_input_tokens_seen": 20850272, + "step": 12170 + }, + { + "epoch": 59.101694915254235, + "grad_norm": 4.2162922909483314e-05, + "learning_rate": 0.23650121138479507, + "loss": 0.0007, + "num_input_tokens_seen": 20859040, + "step": 12175 + }, + { + "epoch": 59.12590799031477, + "grad_norm": 8.316188177559525e-05, + "learning_rate": 0.23645308098523724, + "loss": 0.0012, + "num_input_tokens_seen": 20867616, + "step": 12180 + }, + { + "epoch": 59.1501210653753, + "grad_norm": 7.237161480588838e-05, + "learning_rate": 0.23640493725352785, + "loss": 0.0008, + "num_input_tokens_seen": 20876288, + "step": 12185 + }, + { + "epoch": 59.17433414043583, + "grad_norm": 0.000301659427350387, + "learning_rate": 0.2363567801970913, + "loss": 0.0014, + "num_input_tokens_seen": 20884864, + "step": 12190 + }, + { + "epoch": 59.19854721549637, + "grad_norm": 0.00032340676989406347, + "learning_rate": 0.236308609823354, + "loss": 0.0005, + "num_input_tokens_seen": 20893248, + "step": 12195 + }, + { + "epoch": 59.2227602905569, + "grad_norm": 2.243436756543815e-05, + "learning_rate": 0.23626042613974452, + "loss": 0.0004, + "num_input_tokens_seen": 20901984, + "step": 12200 + }, + { + "epoch": 59.2227602905569, + "eval_loss": 0.5728352665901184, + "eval_runtime": 4.614, + "eval_samples_per_second": 79.54, + "eval_steps_per_second": 19.939, + "num_input_tokens_seen": 20901984, + "step": 12200 + }, + { + "epoch": 59.246973365617436, + "grad_norm": 3.071926403208636e-05, + "learning_rate": 0.23621222915369325, + "loss": 0.0002, + "num_input_tokens_seen": 20910496, + "step": 12205 + }, + { + "epoch": 59.271186440677965, + "grad_norm": 0.00011008849833160639, + "learning_rate": 0.23616401887263283, + "loss": 0.0019, + "num_input_tokens_seen": 20919104, + "step": 12210 + }, + { + "epoch": 59.295399515738495, + "grad_norm": 0.000100723969808314, + "learning_rate": 0.23611579530399793, + "loss": 0.0012, + "num_input_tokens_seen": 20927424, + "step": 12215 + }, + { + "epoch": 59.31961259079903, + "grad_norm": 5.193666947889142e-05, + "learning_rate": 0.23606755845522517, + "loss": 0.0003, + "num_input_tokens_seen": 20936096, + "step": 12220 + }, + { + "epoch": 59.34382566585956, + "grad_norm": 0.0013037186581641436, + "learning_rate": 0.23601930833375329, + "loss": 0.0012, + "num_input_tokens_seen": 20944608, + "step": 12225 + }, + { + "epoch": 59.3680387409201, + "grad_norm": 0.00011060604447266087, + "learning_rate": 0.23597104494702312, + "loss": 0.0007, + "num_input_tokens_seen": 20952992, + "step": 12230 + }, + { + "epoch": 59.39225181598063, + "grad_norm": 7.285639003384858e-05, + "learning_rate": 0.23592276830247744, + "loss": 0.0054, + "num_input_tokens_seen": 20961408, + "step": 12235 + }, + { + "epoch": 59.416464891041166, + "grad_norm": 0.00014544502482749522, + "learning_rate": 0.2358744784075611, + "loss": 0.0004, + "num_input_tokens_seen": 20969888, + "step": 12240 + }, + { + "epoch": 59.440677966101696, + "grad_norm": 0.00015554246783722192, + "learning_rate": 0.235826175269721, + "loss": 0.0018, + "num_input_tokens_seen": 20978688, + "step": 12245 + }, + { + "epoch": 59.464891041162225, + "grad_norm": 0.0011516556842252612, + "learning_rate": 0.23577785889640612, + "loss": 0.0009, + "num_input_tokens_seen": 20987136, + "step": 12250 + }, + { + "epoch": 59.48910411622276, + "grad_norm": 0.004207438789308071, + "learning_rate": 0.23572952929506744, + "loss": 0.004, + "num_input_tokens_seen": 20995488, + "step": 12255 + }, + { + "epoch": 59.51331719128329, + "grad_norm": 0.00017323074280284345, + "learning_rate": 0.23568118647315803, + "loss": 0.0005, + "num_input_tokens_seen": 21003968, + "step": 12260 + }, + { + "epoch": 59.53753026634383, + "grad_norm": 8.53405799716711e-06, + "learning_rate": 0.23563283043813296, + "loss": 0.0002, + "num_input_tokens_seen": 21012448, + "step": 12265 + }, + { + "epoch": 59.56174334140436, + "grad_norm": 5.4622105380985886e-05, + "learning_rate": 0.23558446119744922, + "loss": 0.0007, + "num_input_tokens_seen": 21021376, + "step": 12270 + }, + { + "epoch": 59.58595641646489, + "grad_norm": 3.6655055737355724e-05, + "learning_rate": 0.23553607875856608, + "loss": 0.0008, + "num_input_tokens_seen": 21029888, + "step": 12275 + }, + { + "epoch": 59.610169491525426, + "grad_norm": 7.030014967313036e-05, + "learning_rate": 0.2354876831289447, + "loss": 0.0011, + "num_input_tokens_seen": 21038144, + "step": 12280 + }, + { + "epoch": 59.634382566585955, + "grad_norm": 0.00011769371485570446, + "learning_rate": 0.23543927431604827, + "loss": 0.0002, + "num_input_tokens_seen": 21046560, + "step": 12285 + }, + { + "epoch": 59.65859564164649, + "grad_norm": 0.0004253068764228374, + "learning_rate": 0.23539085232734203, + "loss": 0.0003, + "num_input_tokens_seen": 21055296, + "step": 12290 + }, + { + "epoch": 59.68280871670702, + "grad_norm": 4.0830200305208564e-05, + "learning_rate": 0.2353424171702933, + "loss": 0.0004, + "num_input_tokens_seen": 21064192, + "step": 12295 + }, + { + "epoch": 59.70702179176755, + "grad_norm": 4.5045861043035984e-05, + "learning_rate": 0.23529396885237133, + "loss": 0.0014, + "num_input_tokens_seen": 21072960, + "step": 12300 + }, + { + "epoch": 59.73123486682809, + "grad_norm": 1.3414199202088639e-05, + "learning_rate": 0.2352455073810475, + "loss": 0.001, + "num_input_tokens_seen": 21081632, + "step": 12305 + }, + { + "epoch": 59.75544794188862, + "grad_norm": 4.224810982123017e-05, + "learning_rate": 0.23519703276379517, + "loss": 0.0004, + "num_input_tokens_seen": 21090016, + "step": 12310 + }, + { + "epoch": 59.779661016949156, + "grad_norm": 2.715899063332472e-05, + "learning_rate": 0.2351485450080897, + "loss": 0.0005, + "num_input_tokens_seen": 21098336, + "step": 12315 + }, + { + "epoch": 59.803874092009686, + "grad_norm": 0.004300935659557581, + "learning_rate": 0.2351000441214086, + "loss": 0.0052, + "num_input_tokens_seen": 21106688, + "step": 12320 + }, + { + "epoch": 59.828087167070215, + "grad_norm": 0.0002716537856031209, + "learning_rate": 0.23505153011123125, + "loss": 0.0019, + "num_input_tokens_seen": 21115424, + "step": 12325 + }, + { + "epoch": 59.85230024213075, + "grad_norm": 2.994581518578343e-05, + "learning_rate": 0.23500300298503912, + "loss": 0.0003, + "num_input_tokens_seen": 21124128, + "step": 12330 + }, + { + "epoch": 59.87651331719128, + "grad_norm": 8.793469532974996e-06, + "learning_rate": 0.23495446275031576, + "loss": 0.0055, + "num_input_tokens_seen": 21132672, + "step": 12335 + }, + { + "epoch": 59.90072639225182, + "grad_norm": 0.0019110614666715264, + "learning_rate": 0.2349059094145466, + "loss": 0.0016, + "num_input_tokens_seen": 21141472, + "step": 12340 + }, + { + "epoch": 59.92493946731235, + "grad_norm": 0.00019160962256137282, + "learning_rate": 0.2348573429852192, + "loss": 0.004, + "num_input_tokens_seen": 21150048, + "step": 12345 + }, + { + "epoch": 59.94915254237288, + "grad_norm": 2.3897879145806655e-05, + "learning_rate": 0.23480876346982313, + "loss": 0.0004, + "num_input_tokens_seen": 21158304, + "step": 12350 + }, + { + "epoch": 59.973365617433416, + "grad_norm": 5.479711944644805e-06, + "learning_rate": 0.23476017087585, + "loss": 0.0016, + "num_input_tokens_seen": 21166848, + "step": 12355 + }, + { + "epoch": 59.997578692493946, + "grad_norm": 0.001379597233608365, + "learning_rate": 0.23471156521079334, + "loss": 0.0016, + "num_input_tokens_seen": 21175872, + "step": 12360 + }, + { + "epoch": 60.02421307506053, + "grad_norm": 4.0680202801013365e-05, + "learning_rate": 0.23466294648214875, + "loss": 0.0002, + "num_input_tokens_seen": 21184480, + "step": 12365 + }, + { + "epoch": 60.04842615012107, + "grad_norm": 1.9332446754560806e-05, + "learning_rate": 0.2346143146974139, + "loss": 0.0004, + "num_input_tokens_seen": 21193088, + "step": 12370 + }, + { + "epoch": 60.0726392251816, + "grad_norm": 3.877439667121507e-05, + "learning_rate": 0.23456566986408836, + "loss": 0.0002, + "num_input_tokens_seen": 21201920, + "step": 12375 + }, + { + "epoch": 60.09685230024213, + "grad_norm": 0.00018275604816153646, + "learning_rate": 0.23451701198967384, + "loss": 0.0004, + "num_input_tokens_seen": 21210368, + "step": 12380 + }, + { + "epoch": 60.12106537530266, + "grad_norm": 1.1641201126622036e-05, + "learning_rate": 0.23446834108167397, + "loss": 0.0004, + "num_input_tokens_seen": 21219328, + "step": 12385 + }, + { + "epoch": 60.14527845036319, + "grad_norm": 0.0002372722519794479, + "learning_rate": 0.23441965714759438, + "loss": 0.0003, + "num_input_tokens_seen": 21227776, + "step": 12390 + }, + { + "epoch": 60.16949152542373, + "grad_norm": 2.854558260878548e-05, + "learning_rate": 0.23437096019494277, + "loss": 0.0003, + "num_input_tokens_seen": 21236448, + "step": 12395 + }, + { + "epoch": 60.19370460048426, + "grad_norm": 0.00015611790877301246, + "learning_rate": 0.23432225023122885, + "loss": 0.0018, + "num_input_tokens_seen": 21244800, + "step": 12400 + }, + { + "epoch": 60.19370460048426, + "eval_loss": 0.6162539124488831, + "eval_runtime": 4.6087, + "eval_samples_per_second": 79.631, + "eval_steps_per_second": 19.962, + "num_input_tokens_seen": 21244800, + "step": 12400 + }, + { + "epoch": 60.2179176755448, + "grad_norm": 3.677781569422223e-05, + "learning_rate": 0.23427352726396428, + "loss": 0.0006, + "num_input_tokens_seen": 21253344, + "step": 12405 + }, + { + "epoch": 60.24213075060533, + "grad_norm": 0.005949400365352631, + "learning_rate": 0.2342247913006628, + "loss": 0.0023, + "num_input_tokens_seen": 21261984, + "step": 12410 + }, + { + "epoch": 60.26634382566586, + "grad_norm": 0.000196978566236794, + "learning_rate": 0.23417604234883999, + "loss": 0.0005, + "num_input_tokens_seen": 21270816, + "step": 12415 + }, + { + "epoch": 60.29055690072639, + "grad_norm": 9.981634320865851e-06, + "learning_rate": 0.23412728041601363, + "loss": 0.0003, + "num_input_tokens_seen": 21279392, + "step": 12420 + }, + { + "epoch": 60.31476997578692, + "grad_norm": 0.0037164161913096905, + "learning_rate": 0.23407850550970347, + "loss": 0.0025, + "num_input_tokens_seen": 21287968, + "step": 12425 + }, + { + "epoch": 60.33898305084746, + "grad_norm": 0.001447078655473888, + "learning_rate": 0.23402971763743116, + "loss": 0.0011, + "num_input_tokens_seen": 21296736, + "step": 12430 + }, + { + "epoch": 60.36319612590799, + "grad_norm": 7.03733239788562e-05, + "learning_rate": 0.23398091680672037, + "loss": 0.001, + "num_input_tokens_seen": 21305088, + "step": 12435 + }, + { + "epoch": 60.38740920096852, + "grad_norm": 0.00030293146846815944, + "learning_rate": 0.23393210302509687, + "loss": 0.0016, + "num_input_tokens_seen": 21313952, + "step": 12440 + }, + { + "epoch": 60.41162227602906, + "grad_norm": 0.00047467873082496226, + "learning_rate": 0.23388327630008832, + "loss": 0.0005, + "num_input_tokens_seen": 21322784, + "step": 12445 + }, + { + "epoch": 60.43583535108959, + "grad_norm": 3.041282980120741e-05, + "learning_rate": 0.23383443663922443, + "loss": 0.0002, + "num_input_tokens_seen": 21331264, + "step": 12450 + }, + { + "epoch": 60.460048426150124, + "grad_norm": 6.606844544876367e-05, + "learning_rate": 0.23378558405003685, + "loss": 0.0004, + "num_input_tokens_seen": 21340096, + "step": 12455 + }, + { + "epoch": 60.48426150121065, + "grad_norm": 4.591818651533686e-06, + "learning_rate": 0.2337367185400593, + "loss": 0.0003, + "num_input_tokens_seen": 21348736, + "step": 12460 + }, + { + "epoch": 60.50847457627118, + "grad_norm": 3.0091074222582392e-05, + "learning_rate": 0.23368784011682747, + "loss": 0.0004, + "num_input_tokens_seen": 21357664, + "step": 12465 + }, + { + "epoch": 60.53268765133172, + "grad_norm": 0.0006089616799727082, + "learning_rate": 0.23363894878787902, + "loss": 0.0015, + "num_input_tokens_seen": 21365952, + "step": 12470 + }, + { + "epoch": 60.55690072639225, + "grad_norm": 0.00022738336701877415, + "learning_rate": 0.23359004456075352, + "loss": 0.0008, + "num_input_tokens_seen": 21374432, + "step": 12475 + }, + { + "epoch": 60.58111380145279, + "grad_norm": 2.1523559553315863e-05, + "learning_rate": 0.23354112744299277, + "loss": 0.0002, + "num_input_tokens_seen": 21383072, + "step": 12480 + }, + { + "epoch": 60.60532687651332, + "grad_norm": 3.540857505868189e-05, + "learning_rate": 0.2334921974421403, + "loss": 0.0007, + "num_input_tokens_seen": 21391520, + "step": 12485 + }, + { + "epoch": 60.62953995157385, + "grad_norm": 7.143727998482063e-06, + "learning_rate": 0.23344325456574178, + "loss": 0.0002, + "num_input_tokens_seen": 21399648, + "step": 12490 + }, + { + "epoch": 60.653753026634384, + "grad_norm": 0.0001710614887997508, + "learning_rate": 0.23339429882134477, + "loss": 0.0006, + "num_input_tokens_seen": 21408224, + "step": 12495 + }, + { + "epoch": 60.67796610169491, + "grad_norm": 4.611174881574698e-05, + "learning_rate": 0.23334533021649884, + "loss": 0.0009, + "num_input_tokens_seen": 21416960, + "step": 12500 + }, + { + "epoch": 60.70217917675545, + "grad_norm": 8.912568773666862e-06, + "learning_rate": 0.23329634875875566, + "loss": 0.0001, + "num_input_tokens_seen": 21425312, + "step": 12505 + }, + { + "epoch": 60.72639225181598, + "grad_norm": 0.0001814026036299765, + "learning_rate": 0.23324735445566874, + "loss": 0.0003, + "num_input_tokens_seen": 21433856, + "step": 12510 + }, + { + "epoch": 60.75060532687651, + "grad_norm": 0.00021023696172051132, + "learning_rate": 0.2331983473147936, + "loss": 0.0002, + "num_input_tokens_seen": 21442048, + "step": 12515 + }, + { + "epoch": 60.77481840193705, + "grad_norm": 2.8761927751475014e-05, + "learning_rate": 0.23314932734368776, + "loss": 0.0013, + "num_input_tokens_seen": 21450688, + "step": 12520 + }, + { + "epoch": 60.79903147699758, + "grad_norm": 0.0018856561509892344, + "learning_rate": 0.2331002945499107, + "loss": 0.0014, + "num_input_tokens_seen": 21459456, + "step": 12525 + }, + { + "epoch": 60.823244552058114, + "grad_norm": 4.2768144339788705e-05, + "learning_rate": 0.23305124894102397, + "loss": 0.0002, + "num_input_tokens_seen": 21467968, + "step": 12530 + }, + { + "epoch": 60.847457627118644, + "grad_norm": 3.269484659540467e-05, + "learning_rate": 0.23300219052459092, + "loss": 0.0001, + "num_input_tokens_seen": 21476288, + "step": 12535 + }, + { + "epoch": 60.87167070217917, + "grad_norm": 0.0002484095748513937, + "learning_rate": 0.23295311930817708, + "loss": 0.0003, + "num_input_tokens_seen": 21484640, + "step": 12540 + }, + { + "epoch": 60.89588377723971, + "grad_norm": 9.466702977078967e-06, + "learning_rate": 0.23290403529934972, + "loss": 0.0003, + "num_input_tokens_seen": 21493312, + "step": 12545 + }, + { + "epoch": 60.92009685230024, + "grad_norm": 5.4941046983003616e-05, + "learning_rate": 0.23285493850567832, + "loss": 0.0002, + "num_input_tokens_seen": 21501600, + "step": 12550 + }, + { + "epoch": 60.94430992736078, + "grad_norm": 0.00013246115122456104, + "learning_rate": 0.23280582893473414, + "loss": 0.0006, + "num_input_tokens_seen": 21510592, + "step": 12555 + }, + { + "epoch": 60.96852300242131, + "grad_norm": 0.0009314062772318721, + "learning_rate": 0.2327567065940906, + "loss": 0.0005, + "num_input_tokens_seen": 21519168, + "step": 12560 + }, + { + "epoch": 60.99273607748184, + "grad_norm": 5.497151505551301e-05, + "learning_rate": 0.23270757149132285, + "loss": 0.0003, + "num_input_tokens_seen": 21528096, + "step": 12565 + }, + { + "epoch": 61.01937046004843, + "grad_norm": 1.493647232564399e-05, + "learning_rate": 0.23265842363400827, + "loss": 0.0001, + "num_input_tokens_seen": 21536992, + "step": 12570 + }, + { + "epoch": 61.04358353510896, + "grad_norm": 9.604925435269251e-05, + "learning_rate": 0.23260926302972595, + "loss": 0.0002, + "num_input_tokens_seen": 21545728, + "step": 12575 + }, + { + "epoch": 61.067796610169495, + "grad_norm": 1.4707727132190485e-05, + "learning_rate": 0.2325600896860572, + "loss": 0.0001, + "num_input_tokens_seen": 21554272, + "step": 12580 + }, + { + "epoch": 61.092009685230025, + "grad_norm": 1.0389824637968559e-05, + "learning_rate": 0.23251090361058505, + "loss": 0.0001, + "num_input_tokens_seen": 21562848, + "step": 12585 + }, + { + "epoch": 61.116222760290555, + "grad_norm": 8.920450454752427e-06, + "learning_rate": 0.23246170481089476, + "loss": 0.0001, + "num_input_tokens_seen": 21571424, + "step": 12590 + }, + { + "epoch": 61.14043583535109, + "grad_norm": 0.00011160673602716997, + "learning_rate": 0.23241249329457317, + "loss": 0.0002, + "num_input_tokens_seen": 21580288, + "step": 12595 + }, + { + "epoch": 61.16464891041162, + "grad_norm": 4.650319169741124e-06, + "learning_rate": 0.23236326906920957, + "loss": 0.0004, + "num_input_tokens_seen": 21588704, + "step": 12600 + }, + { + "epoch": 61.16464891041162, + "eval_loss": 0.6409426331520081, + "eval_runtime": 4.6182, + "eval_samples_per_second": 79.468, + "eval_steps_per_second": 19.921, + "num_input_tokens_seen": 21588704, + "step": 12600 + }, + { + "epoch": 61.18886198547216, + "grad_norm": 2.2294365408015437e-05, + "learning_rate": 0.2323140321423948, + "loss": 0.0002, + "num_input_tokens_seen": 21597664, + "step": 12605 + }, + { + "epoch": 61.21307506053269, + "grad_norm": 0.00015133175475057214, + "learning_rate": 0.23226478252172184, + "loss": 0.0002, + "num_input_tokens_seen": 21606016, + "step": 12610 + }, + { + "epoch": 61.23728813559322, + "grad_norm": 2.9120072213117965e-05, + "learning_rate": 0.23221552021478561, + "loss": 0.0001, + "num_input_tokens_seen": 21614336, + "step": 12615 + }, + { + "epoch": 61.261501210653755, + "grad_norm": 1.532363603473641e-05, + "learning_rate": 0.232166245229183, + "loss": 0.0001, + "num_input_tokens_seen": 21622944, + "step": 12620 + }, + { + "epoch": 61.285714285714285, + "grad_norm": 1.8026075849775225e-05, + "learning_rate": 0.2321169575725128, + "loss": 0.0001, + "num_input_tokens_seen": 21631776, + "step": 12625 + }, + { + "epoch": 61.30992736077482, + "grad_norm": 2.4836739612510428e-05, + "learning_rate": 0.23206765725237577, + "loss": 0.0001, + "num_input_tokens_seen": 21640704, + "step": 12630 + }, + { + "epoch": 61.33414043583535, + "grad_norm": 6.493135151686147e-05, + "learning_rate": 0.2320183442763747, + "loss": 0.0002, + "num_input_tokens_seen": 21649184, + "step": 12635 + }, + { + "epoch": 61.35835351089588, + "grad_norm": 2.6827226975001395e-05, + "learning_rate": 0.23196901865211422, + "loss": 0.0001, + "num_input_tokens_seen": 21657856, + "step": 12640 + }, + { + "epoch": 61.38256658595642, + "grad_norm": 1.2149003850936424e-05, + "learning_rate": 0.231919680387201, + "loss": 0.0001, + "num_input_tokens_seen": 21666528, + "step": 12645 + }, + { + "epoch": 61.40677966101695, + "grad_norm": 2.8188491342007183e-05, + "learning_rate": 0.23187032948924358, + "loss": 0.0001, + "num_input_tokens_seen": 21675168, + "step": 12650 + }, + { + "epoch": 61.430992736077485, + "grad_norm": 7.0090286499180365e-06, + "learning_rate": 0.23182096596585247, + "loss": 0.0001, + "num_input_tokens_seen": 21683360, + "step": 12655 + }, + { + "epoch": 61.455205811138015, + "grad_norm": 1.3493714504875243e-05, + "learning_rate": 0.23177158982464025, + "loss": 0.0001, + "num_input_tokens_seen": 21691872, + "step": 12660 + }, + { + "epoch": 61.479418886198545, + "grad_norm": 2.028809831244871e-05, + "learning_rate": 0.23172220107322122, + "loss": 0.0001, + "num_input_tokens_seen": 21700384, + "step": 12665 + }, + { + "epoch": 61.50363196125908, + "grad_norm": 8.085624358500354e-06, + "learning_rate": 0.23167279971921184, + "loss": 0.0001, + "num_input_tokens_seen": 21708928, + "step": 12670 + }, + { + "epoch": 61.52784503631961, + "grad_norm": 2.8375570764183067e-05, + "learning_rate": 0.23162338577023034, + "loss": 0.0001, + "num_input_tokens_seen": 21717280, + "step": 12675 + }, + { + "epoch": 61.55205811138015, + "grad_norm": 8.588443961343728e-06, + "learning_rate": 0.23157395923389704, + "loss": 0.0001, + "num_input_tokens_seen": 21725696, + "step": 12680 + }, + { + "epoch": 61.57627118644068, + "grad_norm": 1.6341124137397856e-05, + "learning_rate": 0.2315245201178341, + "loss": 0.0001, + "num_input_tokens_seen": 21734560, + "step": 12685 + }, + { + "epoch": 61.60048426150121, + "grad_norm": 1.563852492836304e-05, + "learning_rate": 0.23147506842966564, + "loss": 0.0001, + "num_input_tokens_seen": 21743232, + "step": 12690 + }, + { + "epoch": 61.624697336561745, + "grad_norm": 3.120078326901421e-05, + "learning_rate": 0.23142560417701774, + "loss": 0.0001, + "num_input_tokens_seen": 21751808, + "step": 12695 + }, + { + "epoch": 61.648910411622275, + "grad_norm": 4.2190004023723304e-05, + "learning_rate": 0.23137612736751845, + "loss": 0.0003, + "num_input_tokens_seen": 21760064, + "step": 12700 + }, + { + "epoch": 61.67312348668281, + "grad_norm": 2.435919850540813e-05, + "learning_rate": 0.23132663800879766, + "loss": 0.0001, + "num_input_tokens_seen": 21768544, + "step": 12705 + }, + { + "epoch": 61.69733656174334, + "grad_norm": 8.94858476385707e-06, + "learning_rate": 0.2312771361084873, + "loss": 0.0001, + "num_input_tokens_seen": 21777248, + "step": 12710 + }, + { + "epoch": 61.72154963680387, + "grad_norm": 1.2610980775207281e-05, + "learning_rate": 0.23122762167422112, + "loss": 0.0001, + "num_input_tokens_seen": 21785632, + "step": 12715 + }, + { + "epoch": 61.74576271186441, + "grad_norm": 2.236623186036013e-05, + "learning_rate": 0.23117809471363493, + "loss": 0.0001, + "num_input_tokens_seen": 21794112, + "step": 12720 + }, + { + "epoch": 61.76997578692494, + "grad_norm": 8.381457882933319e-05, + "learning_rate": 0.23112855523436637, + "loss": 0.0001, + "num_input_tokens_seen": 21802720, + "step": 12725 + }, + { + "epoch": 61.794188861985475, + "grad_norm": 1.5749050362501293e-05, + "learning_rate": 0.23107900324405511, + "loss": 0.0001, + "num_input_tokens_seen": 21811424, + "step": 12730 + }, + { + "epoch": 61.818401937046005, + "grad_norm": 2.7110002065455774e-06, + "learning_rate": 0.2310294387503426, + "loss": 0.0001, + "num_input_tokens_seen": 21820064, + "step": 12735 + }, + { + "epoch": 61.842615012106535, + "grad_norm": 1.963354407052975e-05, + "learning_rate": 0.23097986176087237, + "loss": 0.0002, + "num_input_tokens_seen": 21828640, + "step": 12740 + }, + { + "epoch": 61.86682808716707, + "grad_norm": 1.592692206031643e-05, + "learning_rate": 0.23093027228328986, + "loss": 0.0, + "num_input_tokens_seen": 21837184, + "step": 12745 + }, + { + "epoch": 61.8910411622276, + "grad_norm": 2.289748954353854e-05, + "learning_rate": 0.23088067032524226, + "loss": 0.0001, + "num_input_tokens_seen": 21845600, + "step": 12750 + }, + { + "epoch": 61.91525423728814, + "grad_norm": 2.234337807749398e-05, + "learning_rate": 0.23083105589437888, + "loss": 0.0001, + "num_input_tokens_seen": 21853952, + "step": 12755 + }, + { + "epoch": 61.93946731234867, + "grad_norm": 1.7133766050392296e-06, + "learning_rate": 0.23078142899835094, + "loss": 0.0001, + "num_input_tokens_seen": 21862592, + "step": 12760 + }, + { + "epoch": 61.9636803874092, + "grad_norm": 1.3604741070594173e-05, + "learning_rate": 0.23073178964481147, + "loss": 0.0001, + "num_input_tokens_seen": 21871168, + "step": 12765 + }, + { + "epoch": 61.987893462469735, + "grad_norm": 3.701376044773497e-05, + "learning_rate": 0.2306821378414155, + "loss": 0.0001, + "num_input_tokens_seen": 21879520, + "step": 12770 + }, + { + "epoch": 62.01452784503632, + "grad_norm": 2.1534891857299954e-05, + "learning_rate": 0.2306324735958199, + "loss": 0.0002, + "num_input_tokens_seen": 21888320, + "step": 12775 + }, + { + "epoch": 62.03874092009685, + "grad_norm": 9.131727892963681e-06, + "learning_rate": 0.23058279691568362, + "loss": 0.0001, + "num_input_tokens_seen": 21897120, + "step": 12780 + }, + { + "epoch": 62.062953995157386, + "grad_norm": 1.2752843758789822e-05, + "learning_rate": 0.23053310780866745, + "loss": 0.0001, + "num_input_tokens_seen": 21905952, + "step": 12785 + }, + { + "epoch": 62.087167070217916, + "grad_norm": 1.5919127690722235e-05, + "learning_rate": 0.23048340628243397, + "loss": 0.0001, + "num_input_tokens_seen": 21914720, + "step": 12790 + }, + { + "epoch": 62.11138014527845, + "grad_norm": 3.7426520975714084e-06, + "learning_rate": 0.23043369234464783, + "loss": 0.0001, + "num_input_tokens_seen": 21923104, + "step": 12795 + }, + { + "epoch": 62.13559322033898, + "grad_norm": 2.3130123736336827e-05, + "learning_rate": 0.2303839660029755, + "loss": 0.0001, + "num_input_tokens_seen": 21931872, + "step": 12800 + }, + { + "epoch": 62.13559322033898, + "eval_loss": 0.655301034450531, + "eval_runtime": 4.6279, + "eval_samples_per_second": 79.301, + "eval_steps_per_second": 19.879, + "num_input_tokens_seen": 21931872, + "step": 12800 + }, + { + "epoch": 62.15980629539951, + "grad_norm": 1.0581546121102292e-05, + "learning_rate": 0.23033422726508548, + "loss": 0.0001, + "num_input_tokens_seen": 21940768, + "step": 12805 + }, + { + "epoch": 62.18401937046005, + "grad_norm": 6.710809429932851e-06, + "learning_rate": 0.23028447613864808, + "loss": 0.0, + "num_input_tokens_seen": 21949152, + "step": 12810 + }, + { + "epoch": 62.20823244552058, + "grad_norm": 2.0876885173493065e-05, + "learning_rate": 0.2302347126313355, + "loss": 0.0001, + "num_input_tokens_seen": 21958144, + "step": 12815 + }, + { + "epoch": 62.232445520581116, + "grad_norm": 3.194853707100265e-05, + "learning_rate": 0.23018493675082197, + "loss": 0.0001, + "num_input_tokens_seen": 21966848, + "step": 12820 + }, + { + "epoch": 62.256658595641646, + "grad_norm": 1.021724983729655e-05, + "learning_rate": 0.2301351485047835, + "loss": 0.0001, + "num_input_tokens_seen": 21975264, + "step": 12825 + }, + { + "epoch": 62.280871670702176, + "grad_norm": 1.1820766303571872e-05, + "learning_rate": 0.23008534790089813, + "loss": 0.0001, + "num_input_tokens_seen": 21983904, + "step": 12830 + }, + { + "epoch": 62.30508474576271, + "grad_norm": 2.661968392203562e-05, + "learning_rate": 0.2300355349468457, + "loss": 0.0001, + "num_input_tokens_seen": 21992640, + "step": 12835 + }, + { + "epoch": 62.32929782082324, + "grad_norm": 6.508781552838627e-06, + "learning_rate": 0.22998570965030793, + "loss": 0.0001, + "num_input_tokens_seen": 22001152, + "step": 12840 + }, + { + "epoch": 62.35351089588378, + "grad_norm": 3.195069803041406e-05, + "learning_rate": 0.22993587201896862, + "loss": 0.0001, + "num_input_tokens_seen": 22009248, + "step": 12845 + }, + { + "epoch": 62.37772397094431, + "grad_norm": 1.960061490535736e-05, + "learning_rate": 0.2298860220605133, + "loss": 0.0001, + "num_input_tokens_seen": 22017952, + "step": 12850 + }, + { + "epoch": 62.40193704600484, + "grad_norm": 1.5915968106128275e-05, + "learning_rate": 0.22983615978262942, + "loss": 0.0001, + "num_input_tokens_seen": 22026272, + "step": 12855 + }, + { + "epoch": 62.426150121065376, + "grad_norm": 4.1352326661581174e-05, + "learning_rate": 0.22978628519300648, + "loss": 0.0001, + "num_input_tokens_seen": 22034752, + "step": 12860 + }, + { + "epoch": 62.450363196125906, + "grad_norm": 1.125891776609933e-05, + "learning_rate": 0.22973639829933568, + "loss": 0.0001, + "num_input_tokens_seen": 22043232, + "step": 12865 + }, + { + "epoch": 62.47457627118644, + "grad_norm": 1.2340291505097412e-05, + "learning_rate": 0.22968649910931027, + "loss": 0.0, + "num_input_tokens_seen": 22052160, + "step": 12870 + }, + { + "epoch": 62.49878934624697, + "grad_norm": 6.528182893816847e-06, + "learning_rate": 0.22963658763062528, + "loss": 0.0001, + "num_input_tokens_seen": 22060608, + "step": 12875 + }, + { + "epoch": 62.5230024213075, + "grad_norm": 3.334255234221928e-05, + "learning_rate": 0.22958666387097765, + "loss": 0.0001, + "num_input_tokens_seen": 22069664, + "step": 12880 + }, + { + "epoch": 62.54721549636804, + "grad_norm": 6.791029136365978e-06, + "learning_rate": 0.22953672783806633, + "loss": 0.0001, + "num_input_tokens_seen": 22077888, + "step": 12885 + }, + { + "epoch": 62.57142857142857, + "grad_norm": 1.1662553333735559e-05, + "learning_rate": 0.22948677953959207, + "loss": 0.0001, + "num_input_tokens_seen": 22086176, + "step": 12890 + }, + { + "epoch": 62.595641646489106, + "grad_norm": 3.8343255255313125e-06, + "learning_rate": 0.2294368189832575, + "loss": 0.0, + "num_input_tokens_seen": 22094752, + "step": 12895 + }, + { + "epoch": 62.619854721549636, + "grad_norm": 3.309015710328822e-06, + "learning_rate": 0.2293868461767672, + "loss": 0.0001, + "num_input_tokens_seen": 22103680, + "step": 12900 + }, + { + "epoch": 62.644067796610166, + "grad_norm": 4.226026703690877e-06, + "learning_rate": 0.22933686112782758, + "loss": 0.0001, + "num_input_tokens_seen": 22111968, + "step": 12905 + }, + { + "epoch": 62.6682808716707, + "grad_norm": 2.9229031497379765e-05, + "learning_rate": 0.22928686384414698, + "loss": 0.0001, + "num_input_tokens_seen": 22120800, + "step": 12910 + }, + { + "epoch": 62.69249394673123, + "grad_norm": 8.771027751208749e-06, + "learning_rate": 0.22923685433343552, + "loss": 0.0, + "num_input_tokens_seen": 22129088, + "step": 12915 + }, + { + "epoch": 62.71670702179177, + "grad_norm": 7.368478691205382e-06, + "learning_rate": 0.22918683260340542, + "loss": 0.0001, + "num_input_tokens_seen": 22137600, + "step": 12920 + }, + { + "epoch": 62.7409200968523, + "grad_norm": 4.010656084574293e-06, + "learning_rate": 0.2291367986617706, + "loss": 0.0001, + "num_input_tokens_seen": 22146048, + "step": 12925 + }, + { + "epoch": 62.76513317191284, + "grad_norm": 1.2086450624337886e-05, + "learning_rate": 0.22908675251624697, + "loss": 0.0001, + "num_input_tokens_seen": 22154816, + "step": 12930 + }, + { + "epoch": 62.789346246973366, + "grad_norm": 1.932636905621621e-06, + "learning_rate": 0.22903669417455216, + "loss": 0.0001, + "num_input_tokens_seen": 22163360, + "step": 12935 + }, + { + "epoch": 62.813559322033896, + "grad_norm": 1.0249294064124115e-05, + "learning_rate": 0.22898662364440592, + "loss": 0.0001, + "num_input_tokens_seen": 22171712, + "step": 12940 + }, + { + "epoch": 62.83777239709443, + "grad_norm": 5.174794750928413e-06, + "learning_rate": 0.2289365409335297, + "loss": 0.0, + "num_input_tokens_seen": 22180224, + "step": 12945 + }, + { + "epoch": 62.86198547215496, + "grad_norm": 7.2668349275772925e-06, + "learning_rate": 0.2288864460496469, + "loss": 0.0001, + "num_input_tokens_seen": 22188960, + "step": 12950 + }, + { + "epoch": 62.8861985472155, + "grad_norm": 7.170625394792296e-06, + "learning_rate": 0.22883633900048272, + "loss": 0.0001, + "num_input_tokens_seen": 22197504, + "step": 12955 + }, + { + "epoch": 62.91041162227603, + "grad_norm": 1.5846975657041185e-05, + "learning_rate": 0.2287862197937644, + "loss": 0.0001, + "num_input_tokens_seen": 22205696, + "step": 12960 + }, + { + "epoch": 62.93462469733656, + "grad_norm": 7.932217158668209e-06, + "learning_rate": 0.2287360884372209, + "loss": 0.0001, + "num_input_tokens_seen": 22214336, + "step": 12965 + }, + { + "epoch": 62.958837772397096, + "grad_norm": 1.5685096514062025e-05, + "learning_rate": 0.22868594493858307, + "loss": 0.0001, + "num_input_tokens_seen": 22222720, + "step": 12970 + }, + { + "epoch": 62.983050847457626, + "grad_norm": 1.0694378943298943e-05, + "learning_rate": 0.2286357893055837, + "loss": 0.0, + "num_input_tokens_seen": 22231360, + "step": 12975 + }, + { + "epoch": 63.00968523002421, + "grad_norm": 1.4924169590813108e-05, + "learning_rate": 0.22858562154595746, + "loss": 0.0001, + "num_input_tokens_seen": 22240480, + "step": 12980 + }, + { + "epoch": 63.03389830508475, + "grad_norm": 2.2987956981523894e-05, + "learning_rate": 0.22853544166744078, + "loss": 0.0001, + "num_input_tokens_seen": 22249216, + "step": 12985 + }, + { + "epoch": 63.05811138014528, + "grad_norm": 1.9342984160175547e-05, + "learning_rate": 0.22848524967777206, + "loss": 0.0001, + "num_input_tokens_seen": 22257600, + "step": 12990 + }, + { + "epoch": 63.082324455205814, + "grad_norm": 2.109215893142391e-05, + "learning_rate": 0.22843504558469152, + "loss": 0.0001, + "num_input_tokens_seen": 22266112, + "step": 12995 + }, + { + "epoch": 63.106537530266344, + "grad_norm": 1.0526359801588114e-05, + "learning_rate": 0.2283848293959413, + "loss": 0.0001, + "num_input_tokens_seen": 22274560, + "step": 13000 + }, + { + "epoch": 63.106537530266344, + "eval_loss": 0.6691649556159973, + "eval_runtime": 4.6195, + "eval_samples_per_second": 79.445, + "eval_steps_per_second": 19.915, + "num_input_tokens_seen": 22274560, + "step": 13000 + }, + { + "epoch": 63.130750605326874, + "grad_norm": 3.590885171433911e-05, + "learning_rate": 0.22833460111926532, + "loss": 0.0001, + "num_input_tokens_seen": 22283072, + "step": 13005 + }, + { + "epoch": 63.15496368038741, + "grad_norm": 6.6936822804564144e-06, + "learning_rate": 0.22828436076240946, + "loss": 0.0001, + "num_input_tokens_seen": 22291520, + "step": 13010 + }, + { + "epoch": 63.17917675544794, + "grad_norm": 2.759490826065303e-06, + "learning_rate": 0.22823410833312135, + "loss": 0.0, + "num_input_tokens_seen": 22300032, + "step": 13015 + }, + { + "epoch": 63.20338983050848, + "grad_norm": 9.925291124091018e-06, + "learning_rate": 0.2281838438391506, + "loss": 0.0, + "num_input_tokens_seen": 22308512, + "step": 13020 + }, + { + "epoch": 63.22760290556901, + "grad_norm": 3.964945335610537e-06, + "learning_rate": 0.22813356728824863, + "loss": 0.0, + "num_input_tokens_seen": 22316928, + "step": 13025 + }, + { + "epoch": 63.25181598062954, + "grad_norm": 1.884938683360815e-05, + "learning_rate": 0.2280832786881687, + "loss": 0.0001, + "num_input_tokens_seen": 22325376, + "step": 13030 + }, + { + "epoch": 63.276029055690074, + "grad_norm": 8.765420716372319e-06, + "learning_rate": 0.22803297804666592, + "loss": 0.0, + "num_input_tokens_seen": 22334176, + "step": 13035 + }, + { + "epoch": 63.300242130750604, + "grad_norm": 1.4528832252835855e-05, + "learning_rate": 0.22798266537149728, + "loss": 0.0001, + "num_input_tokens_seen": 22342848, + "step": 13040 + }, + { + "epoch": 63.32445520581114, + "grad_norm": 6.93734000378754e-06, + "learning_rate": 0.22793234067042167, + "loss": 0.0001, + "num_input_tokens_seen": 22351488, + "step": 13045 + }, + { + "epoch": 63.34866828087167, + "grad_norm": 1.681276444287505e-05, + "learning_rate": 0.22788200395119979, + "loss": 0.0001, + "num_input_tokens_seen": 22360416, + "step": 13050 + }, + { + "epoch": 63.3728813559322, + "grad_norm": 1.6074014638434164e-05, + "learning_rate": 0.2278316552215942, + "loss": 0.0001, + "num_input_tokens_seen": 22368832, + "step": 13055 + }, + { + "epoch": 63.39709443099274, + "grad_norm": 9.782369488675613e-06, + "learning_rate": 0.22778129448936918, + "loss": 0.0, + "num_input_tokens_seen": 22378144, + "step": 13060 + }, + { + "epoch": 63.42130750605327, + "grad_norm": 2.1849642507731915e-05, + "learning_rate": 0.22773092176229118, + "loss": 0.0001, + "num_input_tokens_seen": 22386304, + "step": 13065 + }, + { + "epoch": 63.445520581113804, + "grad_norm": 9.660887371865101e-06, + "learning_rate": 0.22768053704812816, + "loss": 0.0001, + "num_input_tokens_seen": 22395072, + "step": 13070 + }, + { + "epoch": 63.469733656174334, + "grad_norm": 2.239523564639967e-05, + "learning_rate": 0.22763014035465018, + "loss": 0.0001, + "num_input_tokens_seen": 22403552, + "step": 13075 + }, + { + "epoch": 63.493946731234864, + "grad_norm": 2.517898246878758e-05, + "learning_rate": 0.22757973168962892, + "loss": 0.0001, + "num_input_tokens_seen": 22412128, + "step": 13080 + }, + { + "epoch": 63.5181598062954, + "grad_norm": 8.855633495841175e-06, + "learning_rate": 0.22752931106083818, + "loss": 0.0001, + "num_input_tokens_seen": 22421056, + "step": 13085 + }, + { + "epoch": 63.54237288135593, + "grad_norm": 2.7288831915939227e-05, + "learning_rate": 0.22747887847605341, + "loss": 0.0001, + "num_input_tokens_seen": 22429568, + "step": 13090 + }, + { + "epoch": 63.56658595641647, + "grad_norm": 3.798475881922059e-05, + "learning_rate": 0.22742843394305184, + "loss": 0.0001, + "num_input_tokens_seen": 22437888, + "step": 13095 + }, + { + "epoch": 63.590799031477, + "grad_norm": 1.7929483874468133e-05, + "learning_rate": 0.22737797746961272, + "loss": 0.0001, + "num_input_tokens_seen": 22446272, + "step": 13100 + }, + { + "epoch": 63.61501210653753, + "grad_norm": 3.377621396793984e-06, + "learning_rate": 0.22732750906351712, + "loss": 0.0, + "num_input_tokens_seen": 22454816, + "step": 13105 + }, + { + "epoch": 63.639225181598064, + "grad_norm": 2.5031022232724354e-05, + "learning_rate": 0.22727702873254785, + "loss": 0.0, + "num_input_tokens_seen": 22463360, + "step": 13110 + }, + { + "epoch": 63.663438256658594, + "grad_norm": 1.1250133866269607e-05, + "learning_rate": 0.22722653648448968, + "loss": 0.0001, + "num_input_tokens_seen": 22472032, + "step": 13115 + }, + { + "epoch": 63.68765133171913, + "grad_norm": 8.065100701060146e-06, + "learning_rate": 0.22717603232712902, + "loss": 0.0, + "num_input_tokens_seen": 22480704, + "step": 13120 + }, + { + "epoch": 63.71186440677966, + "grad_norm": 8.958490980148781e-06, + "learning_rate": 0.22712551626825436, + "loss": 0.0, + "num_input_tokens_seen": 22489216, + "step": 13125 + }, + { + "epoch": 63.73607748184019, + "grad_norm": 8.675374374433886e-06, + "learning_rate": 0.2270749883156559, + "loss": 0.0, + "num_input_tokens_seen": 22497920, + "step": 13130 + }, + { + "epoch": 63.76029055690073, + "grad_norm": 1.2048523785779253e-05, + "learning_rate": 0.22702444847712563, + "loss": 0.0, + "num_input_tokens_seen": 22506432, + "step": 13135 + }, + { + "epoch": 63.78450363196126, + "grad_norm": 1.607337617315352e-05, + "learning_rate": 0.22697389676045743, + "loss": 0.0, + "num_input_tokens_seen": 22514560, + "step": 13140 + }, + { + "epoch": 63.808716707021794, + "grad_norm": 1.514247924205847e-05, + "learning_rate": 0.22692333317344704, + "loss": 0.0001, + "num_input_tokens_seen": 22523008, + "step": 13145 + }, + { + "epoch": 63.832929782082324, + "grad_norm": 1.6976237020571716e-05, + "learning_rate": 0.22687275772389198, + "loss": 0.0001, + "num_input_tokens_seen": 22531552, + "step": 13150 + }, + { + "epoch": 63.857142857142854, + "grad_norm": 1.1904512575711124e-05, + "learning_rate": 0.22682217041959168, + "loss": 0.0001, + "num_input_tokens_seen": 22540416, + "step": 13155 + }, + { + "epoch": 63.88135593220339, + "grad_norm": 1.868039726105053e-05, + "learning_rate": 0.2267715712683473, + "loss": 0.0001, + "num_input_tokens_seen": 22548864, + "step": 13160 + }, + { + "epoch": 63.90556900726392, + "grad_norm": 1.3276557183417026e-05, + "learning_rate": 0.22672096027796182, + "loss": 0.0, + "num_input_tokens_seen": 22557472, + "step": 13165 + }, + { + "epoch": 63.92978208232446, + "grad_norm": 1.7622012819629163e-05, + "learning_rate": 0.22667033745624016, + "loss": 0.0001, + "num_input_tokens_seen": 22565952, + "step": 13170 + }, + { + "epoch": 63.95399515738499, + "grad_norm": 3.791710696532391e-06, + "learning_rate": 0.22661970281098895, + "loss": 0.0, + "num_input_tokens_seen": 22574592, + "step": 13175 + }, + { + "epoch": 63.97820823244552, + "grad_norm": 1.9350458387634717e-05, + "learning_rate": 0.22656905635001667, + "loss": 0.0001, + "num_input_tokens_seen": 22583296, + "step": 13180 + }, + { + "epoch": 64.00484261501211, + "grad_norm": 5.436665014713071e-05, + "learning_rate": 0.2265183980811337, + "loss": 0.0001, + "num_input_tokens_seen": 22592352, + "step": 13185 + }, + { + "epoch": 64.02905569007264, + "grad_norm": 5.099434474686859e-06, + "learning_rate": 0.22646772801215218, + "loss": 0.0, + "num_input_tokens_seen": 22601216, + "step": 13190 + }, + { + "epoch": 64.05326876513317, + "grad_norm": 2.506213786546141e-05, + "learning_rate": 0.22641704615088598, + "loss": 0.0001, + "num_input_tokens_seen": 22609824, + "step": 13195 + }, + { + "epoch": 64.0774818401937, + "grad_norm": 8.951867130235769e-06, + "learning_rate": 0.22636635250515103, + "loss": 0.0, + "num_input_tokens_seen": 22618432, + "step": 13200 + }, + { + "epoch": 64.0774818401937, + "eval_loss": 0.6788890361785889, + "eval_runtime": 4.6182, + "eval_samples_per_second": 79.469, + "eval_steps_per_second": 19.921, + "num_input_tokens_seen": 22618432, + "step": 13200 + }, + { + "epoch": 64.10169491525424, + "grad_norm": 8.007153155631386e-06, + "learning_rate": 0.2263156470827648, + "loss": 0.0, + "num_input_tokens_seen": 22626880, + "step": 13205 + }, + { + "epoch": 64.12590799031477, + "grad_norm": 3.605205847634352e-06, + "learning_rate": 0.22626492989154678, + "loss": 0.0001, + "num_input_tokens_seen": 22635264, + "step": 13210 + }, + { + "epoch": 64.1501210653753, + "grad_norm": 6.854404546174919e-06, + "learning_rate": 0.22621420093931813, + "loss": 0.0001, + "num_input_tokens_seen": 22644256, + "step": 13215 + }, + { + "epoch": 64.17433414043583, + "grad_norm": 8.349389645445626e-06, + "learning_rate": 0.22616346023390194, + "loss": 0.0001, + "num_input_tokens_seen": 22652800, + "step": 13220 + }, + { + "epoch": 64.19854721549636, + "grad_norm": 1.1175518920936156e-05, + "learning_rate": 0.22611270778312306, + "loss": 0.0, + "num_input_tokens_seen": 22661792, + "step": 13225 + }, + { + "epoch": 64.2227602905569, + "grad_norm": 1.2496188901423011e-05, + "learning_rate": 0.2260619435948081, + "loss": 0.0, + "num_input_tokens_seen": 22670464, + "step": 13230 + }, + { + "epoch": 64.24697336561744, + "grad_norm": 2.4401198970736004e-05, + "learning_rate": 0.22601116767678567, + "loss": 0.0001, + "num_input_tokens_seen": 22679008, + "step": 13235 + }, + { + "epoch": 64.27118644067797, + "grad_norm": 1.8905393517343327e-05, + "learning_rate": 0.2259603800368859, + "loss": 0.0001, + "num_input_tokens_seen": 22687904, + "step": 13240 + }, + { + "epoch": 64.2953995157385, + "grad_norm": 6.003891940054018e-06, + "learning_rate": 0.22590958068294098, + "loss": 0.0, + "num_input_tokens_seen": 22696736, + "step": 13245 + }, + { + "epoch": 64.31961259079903, + "grad_norm": 1.4467056644207332e-05, + "learning_rate": 0.22585876962278478, + "loss": 0.0001, + "num_input_tokens_seen": 22705152, + "step": 13250 + }, + { + "epoch": 64.34382566585957, + "grad_norm": 6.051106083759805e-06, + "learning_rate": 0.22580794686425298, + "loss": 0.0, + "num_input_tokens_seen": 22713440, + "step": 13255 + }, + { + "epoch": 64.3680387409201, + "grad_norm": 1.337474532192573e-05, + "learning_rate": 0.22575711241518312, + "loss": 0.0, + "num_input_tokens_seen": 22722208, + "step": 13260 + }, + { + "epoch": 64.39225181598063, + "grad_norm": 7.578477379865944e-06, + "learning_rate": 0.22570626628341453, + "loss": 0.0001, + "num_input_tokens_seen": 22730656, + "step": 13265 + }, + { + "epoch": 64.41646489104116, + "grad_norm": 8.810190593067091e-06, + "learning_rate": 0.22565540847678828, + "loss": 0.0, + "num_input_tokens_seen": 22739616, + "step": 13270 + }, + { + "epoch": 64.44067796610169, + "grad_norm": 1.89249749382725e-05, + "learning_rate": 0.2256045390031473, + "loss": 0.0, + "num_input_tokens_seen": 22748320, + "step": 13275 + }, + { + "epoch": 64.46489104116223, + "grad_norm": 1.646409509703517e-05, + "learning_rate": 0.22555365787033627, + "loss": 0.0001, + "num_input_tokens_seen": 22757088, + "step": 13280 + }, + { + "epoch": 64.48910411622276, + "grad_norm": 1.0479324373591226e-05, + "learning_rate": 0.22550276508620173, + "loss": 0.0001, + "num_input_tokens_seen": 22765632, + "step": 13285 + }, + { + "epoch": 64.51331719128329, + "grad_norm": 7.4911931733367965e-06, + "learning_rate": 0.22545186065859202, + "loss": 0.0001, + "num_input_tokens_seen": 22773920, + "step": 13290 + }, + { + "epoch": 64.53753026634382, + "grad_norm": 6.855058927612845e-06, + "learning_rate": 0.2254009445953572, + "loss": 0.0, + "num_input_tokens_seen": 22782528, + "step": 13295 + }, + { + "epoch": 64.56174334140435, + "grad_norm": 1.3225037946540397e-05, + "learning_rate": 0.22535001690434917, + "loss": 0.0, + "num_input_tokens_seen": 22791200, + "step": 13300 + }, + { + "epoch": 64.5859564164649, + "grad_norm": 5.420035904535325e-06, + "learning_rate": 0.22529907759342163, + "loss": 0.0, + "num_input_tokens_seen": 22799680, + "step": 13305 + }, + { + "epoch": 64.61016949152543, + "grad_norm": 1.3675022273673676e-05, + "learning_rate": 0.22524812667043007, + "loss": 0.0001, + "num_input_tokens_seen": 22808128, + "step": 13310 + }, + { + "epoch": 64.63438256658596, + "grad_norm": 6.023608875693753e-06, + "learning_rate": 0.22519716414323177, + "loss": 0.0, + "num_input_tokens_seen": 22816640, + "step": 13315 + }, + { + "epoch": 64.65859564164649, + "grad_norm": 6.2379531300393865e-06, + "learning_rate": 0.22514619001968567, + "loss": 0.0, + "num_input_tokens_seen": 22825088, + "step": 13320 + }, + { + "epoch": 64.68280871670702, + "grad_norm": 9.613540896680206e-06, + "learning_rate": 0.2250952043076528, + "loss": 0.0, + "num_input_tokens_seen": 22833504, + "step": 13325 + }, + { + "epoch": 64.70702179176756, + "grad_norm": 1.3279439372126944e-05, + "learning_rate": 0.2250442070149957, + "loss": 0.0, + "num_input_tokens_seen": 22842112, + "step": 13330 + }, + { + "epoch": 64.73123486682809, + "grad_norm": 1.210975824506022e-05, + "learning_rate": 0.22499319814957885, + "loss": 0.0, + "num_input_tokens_seen": 22850464, + "step": 13335 + }, + { + "epoch": 64.75544794188862, + "grad_norm": 7.4493896136118565e-06, + "learning_rate": 0.2249421777192684, + "loss": 0.0, + "num_input_tokens_seen": 22858880, + "step": 13340 + }, + { + "epoch": 64.77966101694915, + "grad_norm": 1.767601315805223e-05, + "learning_rate": 0.22489114573193236, + "loss": 0.0, + "num_input_tokens_seen": 22867488, + "step": 13345 + }, + { + "epoch": 64.80387409200968, + "grad_norm": 8.616498234914616e-06, + "learning_rate": 0.2248401021954405, + "loss": 0.0, + "num_input_tokens_seen": 22876000, + "step": 13350 + }, + { + "epoch": 64.82808716707022, + "grad_norm": 5.673330633726437e-06, + "learning_rate": 0.22478904711766443, + "loss": 0.0, + "num_input_tokens_seen": 22884192, + "step": 13355 + }, + { + "epoch": 64.85230024213075, + "grad_norm": 6.3015691011969466e-06, + "learning_rate": 0.22473798050647734, + "loss": 0.0, + "num_input_tokens_seen": 22892832, + "step": 13360 + }, + { + "epoch": 64.87651331719128, + "grad_norm": 2.1400896002887748e-05, + "learning_rate": 0.22468690236975453, + "loss": 0.0001, + "num_input_tokens_seen": 22901312, + "step": 13365 + }, + { + "epoch": 64.90072639225181, + "grad_norm": 1.0489977285033092e-05, + "learning_rate": 0.22463581271537272, + "loss": 0.0, + "num_input_tokens_seen": 22909568, + "step": 13370 + }, + { + "epoch": 64.92493946731234, + "grad_norm": 1.249909746547928e-05, + "learning_rate": 0.22458471155121076, + "loss": 0.0, + "num_input_tokens_seen": 22917696, + "step": 13375 + }, + { + "epoch": 64.94915254237289, + "grad_norm": 6.593854777747765e-06, + "learning_rate": 0.2245335988851489, + "loss": 0.0, + "num_input_tokens_seen": 22926912, + "step": 13380 + }, + { + "epoch": 64.97336561743342, + "grad_norm": 2.529119456085027e-06, + "learning_rate": 0.2244824747250695, + "loss": 0.0, + "num_input_tokens_seen": 22935136, + "step": 13385 + }, + { + "epoch": 64.99757869249395, + "grad_norm": 2.3462350782210706e-06, + "learning_rate": 0.22443133907885646, + "loss": 0.0, + "num_input_tokens_seen": 22943616, + "step": 13390 + }, + { + "epoch": 65.02421307506053, + "grad_norm": 5.491408501256956e-06, + "learning_rate": 0.22438019195439557, + "loss": 0.0001, + "num_input_tokens_seen": 22952608, + "step": 13395 + }, + { + "epoch": 65.04842615012106, + "grad_norm": 6.920449777680915e-06, + "learning_rate": 0.22432903335957435, + "loss": 0.0, + "num_input_tokens_seen": 22961216, + "step": 13400 + }, + { + "epoch": 65.04842615012106, + "eval_loss": 0.6877094507217407, + "eval_runtime": 4.6327, + "eval_samples_per_second": 79.219, + "eval_steps_per_second": 19.859, + "num_input_tokens_seen": 22961216, + "step": 13400 + }, + { + "epoch": 65.0726392251816, + "grad_norm": 1.2449910173017997e-05, + "learning_rate": 0.22427786330228214, + "loss": 0.0, + "num_input_tokens_seen": 22969856, + "step": 13405 + }, + { + "epoch": 65.09685230024213, + "grad_norm": 7.982067472767085e-06, + "learning_rate": 0.22422668179040997, + "loss": 0.0, + "num_input_tokens_seen": 22978496, + "step": 13410 + }, + { + "epoch": 65.12106537530266, + "grad_norm": 3.9672936509305146e-06, + "learning_rate": 0.2241754888318507, + "loss": 0.0, + "num_input_tokens_seen": 22987008, + "step": 13415 + }, + { + "epoch": 65.1452784503632, + "grad_norm": 2.8769475193257676e-06, + "learning_rate": 0.22412428443449886, + "loss": 0.0, + "num_input_tokens_seen": 22995904, + "step": 13420 + }, + { + "epoch": 65.16949152542372, + "grad_norm": 8.331090612045955e-06, + "learning_rate": 0.22407306860625087, + "loss": 0.0, + "num_input_tokens_seen": 23004384, + "step": 13425 + }, + { + "epoch": 65.19370460048427, + "grad_norm": 1.539205368317198e-05, + "learning_rate": 0.22402184135500483, + "loss": 0.0, + "num_input_tokens_seen": 23013376, + "step": 13430 + }, + { + "epoch": 65.2179176755448, + "grad_norm": 6.47067281533964e-06, + "learning_rate": 0.22397060268866067, + "loss": 0.0, + "num_input_tokens_seen": 23022144, + "step": 13435 + }, + { + "epoch": 65.24213075060533, + "grad_norm": 6.74251168675255e-06, + "learning_rate": 0.22391935261511994, + "loss": 0.0, + "num_input_tokens_seen": 23030560, + "step": 13440 + }, + { + "epoch": 65.26634382566586, + "grad_norm": 3.3061169233405963e-06, + "learning_rate": 0.22386809114228615, + "loss": 0.0, + "num_input_tokens_seen": 23039008, + "step": 13445 + }, + { + "epoch": 65.29055690072639, + "grad_norm": 2.054183369182283e-06, + "learning_rate": 0.22381681827806446, + "loss": 0.0, + "num_input_tokens_seen": 23047328, + "step": 13450 + }, + { + "epoch": 65.31476997578693, + "grad_norm": 4.432408786669839e-06, + "learning_rate": 0.22376553403036173, + "loss": 0.0, + "num_input_tokens_seen": 23056000, + "step": 13455 + }, + { + "epoch": 65.33898305084746, + "grad_norm": 1.4281115909398068e-05, + "learning_rate": 0.22371423840708662, + "loss": 0.0, + "num_input_tokens_seen": 23064544, + "step": 13460 + }, + { + "epoch": 65.36319612590799, + "grad_norm": 3.733649009518558e-06, + "learning_rate": 0.22366293141614962, + "loss": 0.0, + "num_input_tokens_seen": 23073280, + "step": 13465 + }, + { + "epoch": 65.38740920096852, + "grad_norm": 3.567768544598948e-06, + "learning_rate": 0.22361161306546287, + "loss": 0.0, + "num_input_tokens_seen": 23081728, + "step": 13470 + }, + { + "epoch": 65.41162227602905, + "grad_norm": 8.856653948896565e-06, + "learning_rate": 0.22356028336294037, + "loss": 0.0, + "num_input_tokens_seen": 23090432, + "step": 13475 + }, + { + "epoch": 65.4358353510896, + "grad_norm": 6.610477612412069e-06, + "learning_rate": 0.2235089423164977, + "loss": 0.0, + "num_input_tokens_seen": 23099232, + "step": 13480 + }, + { + "epoch": 65.46004842615012, + "grad_norm": 7.30142983229598e-06, + "learning_rate": 0.22345758993405243, + "loss": 0.0, + "num_input_tokens_seen": 23107936, + "step": 13485 + }, + { + "epoch": 65.48426150121065, + "grad_norm": 1.0523073797230609e-05, + "learning_rate": 0.2234062262235236, + "loss": 0.0001, + "num_input_tokens_seen": 23116672, + "step": 13490 + }, + { + "epoch": 65.50847457627118, + "grad_norm": 8.02195518190274e-06, + "learning_rate": 0.22335485119283222, + "loss": 0.0, + "num_input_tokens_seen": 23125152, + "step": 13495 + }, + { + "epoch": 65.53268765133171, + "grad_norm": 1.9636720480775693e-06, + "learning_rate": 0.22330346484990093, + "loss": 0.0, + "num_input_tokens_seen": 23133600, + "step": 13500 + }, + { + "epoch": 65.55690072639226, + "grad_norm": 6.844009021733655e-06, + "learning_rate": 0.22325206720265425, + "loss": 0.0, + "num_input_tokens_seen": 23142208, + "step": 13505 + }, + { + "epoch": 65.58111380145279, + "grad_norm": 1.0546096746111289e-05, + "learning_rate": 0.2232006582590182, + "loss": 0.0, + "num_input_tokens_seen": 23150944, + "step": 13510 + }, + { + "epoch": 65.60532687651332, + "grad_norm": 7.974327672854997e-06, + "learning_rate": 0.22314923802692077, + "loss": 0.0, + "num_input_tokens_seen": 23159392, + "step": 13515 + }, + { + "epoch": 65.62953995157385, + "grad_norm": 1.3146130186214577e-05, + "learning_rate": 0.22309780651429156, + "loss": 0.0001, + "num_input_tokens_seen": 23167776, + "step": 13520 + }, + { + "epoch": 65.65375302663438, + "grad_norm": 6.913888682902325e-06, + "learning_rate": 0.22304636372906203, + "loss": 0.0, + "num_input_tokens_seen": 23176448, + "step": 13525 + }, + { + "epoch": 65.67796610169492, + "grad_norm": 9.742003385326825e-06, + "learning_rate": 0.22299490967916522, + "loss": 0.0001, + "num_input_tokens_seen": 23184576, + "step": 13530 + }, + { + "epoch": 65.70217917675545, + "grad_norm": 1.8502452803659253e-05, + "learning_rate": 0.22294344437253602, + "loss": 0.0, + "num_input_tokens_seen": 23193120, + "step": 13535 + }, + { + "epoch": 65.72639225181598, + "grad_norm": 1.4800451936025638e-05, + "learning_rate": 0.22289196781711101, + "loss": 0.0001, + "num_input_tokens_seen": 23201504, + "step": 13540 + }, + { + "epoch": 65.75060532687651, + "grad_norm": 8.553942279831972e-06, + "learning_rate": 0.2228404800208286, + "loss": 0.0001, + "num_input_tokens_seen": 23210208, + "step": 13545 + }, + { + "epoch": 65.77481840193704, + "grad_norm": 1.1717180314008147e-05, + "learning_rate": 0.22278898099162875, + "loss": 0.0001, + "num_input_tokens_seen": 23218976, + "step": 13550 + }, + { + "epoch": 65.79903147699758, + "grad_norm": 4.0749891923042014e-06, + "learning_rate": 0.22273747073745337, + "loss": 0.0, + "num_input_tokens_seen": 23227232, + "step": 13555 + }, + { + "epoch": 65.82324455205811, + "grad_norm": 6.732009751431178e-06, + "learning_rate": 0.22268594926624588, + "loss": 0.0, + "num_input_tokens_seen": 23236064, + "step": 13560 + }, + { + "epoch": 65.84745762711864, + "grad_norm": 1.3621983271150384e-05, + "learning_rate": 0.22263441658595162, + "loss": 0.0, + "num_input_tokens_seen": 23244544, + "step": 13565 + }, + { + "epoch": 65.87167070217917, + "grad_norm": 1.1434467523940839e-05, + "learning_rate": 0.2225828727045175, + "loss": 0.0, + "num_input_tokens_seen": 23252960, + "step": 13570 + }, + { + "epoch": 65.8958837772397, + "grad_norm": 8.510600309818983e-06, + "learning_rate": 0.22253131762989228, + "loss": 0.0, + "num_input_tokens_seen": 23261440, + "step": 13575 + }, + { + "epoch": 65.92009685230025, + "grad_norm": 3.313671641080873e-06, + "learning_rate": 0.2224797513700264, + "loss": 0.0, + "num_input_tokens_seen": 23270400, + "step": 13580 + }, + { + "epoch": 65.94430992736078, + "grad_norm": 9.942027645593043e-06, + "learning_rate": 0.22242817393287204, + "loss": 0.0, + "num_input_tokens_seen": 23278656, + "step": 13585 + }, + { + "epoch": 65.9685230024213, + "grad_norm": 9.631587090552785e-06, + "learning_rate": 0.22237658532638305, + "loss": 0.0, + "num_input_tokens_seen": 23286816, + "step": 13590 + }, + { + "epoch": 65.99273607748184, + "grad_norm": 5.299463737173937e-06, + "learning_rate": 0.22232498555851513, + "loss": 0.0, + "num_input_tokens_seen": 23295296, + "step": 13595 + }, + { + "epoch": 66.01937046004842, + "grad_norm": 9.754885468282737e-06, + "learning_rate": 0.22227337463722546, + "loss": 0.0, + "num_input_tokens_seen": 23304288, + "step": 13600 + }, + { + "epoch": 66.01937046004842, + "eval_loss": 0.696119487285614, + "eval_runtime": 4.6315, + "eval_samples_per_second": 79.24, + "eval_steps_per_second": 19.864, + "num_input_tokens_seen": 23304288, + "step": 13600 + }, + { + "epoch": 66.04358353510897, + "grad_norm": 8.711696864338592e-06, + "learning_rate": 0.2222217525704732, + "loss": 0.0, + "num_input_tokens_seen": 23312704, + "step": 13605 + }, + { + "epoch": 66.0677966101695, + "grad_norm": 1.3805297385260928e-05, + "learning_rate": 0.22217011936621908, + "loss": 0.0, + "num_input_tokens_seen": 23320896, + "step": 13610 + }, + { + "epoch": 66.09200968523002, + "grad_norm": 8.67503786139423e-06, + "learning_rate": 0.22211847503242566, + "loss": 0.0, + "num_input_tokens_seen": 23328992, + "step": 13615 + }, + { + "epoch": 66.11622276029055, + "grad_norm": 2.2898482257005526e-06, + "learning_rate": 0.22206681957705704, + "loss": 0.0, + "num_input_tokens_seen": 23337536, + "step": 13620 + }, + { + "epoch": 66.14043583535108, + "grad_norm": 1.5004498891357798e-05, + "learning_rate": 0.2220151530080792, + "loss": 0.0, + "num_input_tokens_seen": 23346560, + "step": 13625 + }, + { + "epoch": 66.16464891041163, + "grad_norm": 7.409056706819683e-06, + "learning_rate": 0.2219634753334598, + "loss": 0.0, + "num_input_tokens_seen": 23354752, + "step": 13630 + }, + { + "epoch": 66.18886198547216, + "grad_norm": 4.0774521039566025e-06, + "learning_rate": 0.22191178656116817, + "loss": 0.0, + "num_input_tokens_seen": 23363168, + "step": 13635 + }, + { + "epoch": 66.21307506053269, + "grad_norm": 4.666391760110855e-06, + "learning_rate": 0.2218600866991753, + "loss": 0.0, + "num_input_tokens_seen": 23371648, + "step": 13640 + }, + { + "epoch": 66.23728813559322, + "grad_norm": 7.563046438008314e-06, + "learning_rate": 0.221808375755454, + "loss": 0.0, + "num_input_tokens_seen": 23380544, + "step": 13645 + }, + { + "epoch": 66.26150121065375, + "grad_norm": 1.618567466721288e-06, + "learning_rate": 0.22175665373797881, + "loss": 0.0, + "num_input_tokens_seen": 23389120, + "step": 13650 + }, + { + "epoch": 66.28571428571429, + "grad_norm": 1.618506394152064e-05, + "learning_rate": 0.22170492065472583, + "loss": 0.0, + "num_input_tokens_seen": 23397536, + "step": 13655 + }, + { + "epoch": 66.30992736077482, + "grad_norm": 7.387754521914758e-06, + "learning_rate": 0.221653176513673, + "loss": 0.0, + "num_input_tokens_seen": 23405952, + "step": 13660 + }, + { + "epoch": 66.33414043583535, + "grad_norm": 8.065419024205767e-06, + "learning_rate": 0.2216014213227999, + "loss": 0.0, + "num_input_tokens_seen": 23414368, + "step": 13665 + }, + { + "epoch": 66.35835351089588, + "grad_norm": 3.83940277970396e-06, + "learning_rate": 0.22154965509008784, + "loss": 0.0, + "num_input_tokens_seen": 23423040, + "step": 13670 + }, + { + "epoch": 66.38256658595641, + "grad_norm": 6.445816325140186e-07, + "learning_rate": 0.2214978778235198, + "loss": 0.0, + "num_input_tokens_seen": 23431232, + "step": 13675 + }, + { + "epoch": 66.40677966101696, + "grad_norm": 1.4095315236772876e-05, + "learning_rate": 0.2214460895310805, + "loss": 0.0, + "num_input_tokens_seen": 23439744, + "step": 13680 + }, + { + "epoch": 66.43099273607749, + "grad_norm": 1.3729418242292013e-06, + "learning_rate": 0.22139429022075635, + "loss": 0.0, + "num_input_tokens_seen": 23448640, + "step": 13685 + }, + { + "epoch": 66.45520581113801, + "grad_norm": 1.131487988459412e-05, + "learning_rate": 0.22134247990053546, + "loss": 0.0, + "num_input_tokens_seen": 23456928, + "step": 13690 + }, + { + "epoch": 66.47941888619854, + "grad_norm": 8.500082913087681e-06, + "learning_rate": 0.2212906585784076, + "loss": 0.0, + "num_input_tokens_seen": 23465312, + "step": 13695 + }, + { + "epoch": 66.50363196125907, + "grad_norm": 4.213838565192418e-06, + "learning_rate": 0.22123882626236432, + "loss": 0.0, + "num_input_tokens_seen": 23474272, + "step": 13700 + }, + { + "epoch": 66.52784503631962, + "grad_norm": 3.5517234664439457e-06, + "learning_rate": 0.2211869829603988, + "loss": 0.0, + "num_input_tokens_seen": 23482400, + "step": 13705 + }, + { + "epoch": 66.55205811138015, + "grad_norm": 6.224649041541852e-06, + "learning_rate": 0.22113512868050592, + "loss": 0.0, + "num_input_tokens_seen": 23491136, + "step": 13710 + }, + { + "epoch": 66.57627118644068, + "grad_norm": 1.051233721227618e-05, + "learning_rate": 0.2210832634306822, + "loss": 0.0, + "num_input_tokens_seen": 23499712, + "step": 13715 + }, + { + "epoch": 66.60048426150121, + "grad_norm": 7.824385647836607e-06, + "learning_rate": 0.22103138721892598, + "loss": 0.0, + "num_input_tokens_seen": 23508352, + "step": 13720 + }, + { + "epoch": 66.62469733656174, + "grad_norm": 5.442684141598875e-06, + "learning_rate": 0.22097950005323724, + "loss": 0.0, + "num_input_tokens_seen": 23517056, + "step": 13725 + }, + { + "epoch": 66.64891041162228, + "grad_norm": 8.519321454514284e-06, + "learning_rate": 0.22092760194161762, + "loss": 0.0, + "num_input_tokens_seen": 23526080, + "step": 13730 + }, + { + "epoch": 66.67312348668281, + "grad_norm": 5.06283322465606e-06, + "learning_rate": 0.2208756928920704, + "loss": 0.0, + "num_input_tokens_seen": 23534528, + "step": 13735 + }, + { + "epoch": 66.69733656174334, + "grad_norm": 1.9407118088565767e-05, + "learning_rate": 0.22082377291260072, + "loss": 0.0, + "num_input_tokens_seen": 23542976, + "step": 13740 + }, + { + "epoch": 66.72154963680387, + "grad_norm": 6.832921826571692e-06, + "learning_rate": 0.2207718420112152, + "loss": 0.0, + "num_input_tokens_seen": 23552160, + "step": 13745 + }, + { + "epoch": 66.7457627118644, + "grad_norm": 4.2879755710600875e-06, + "learning_rate": 0.22071990019592228, + "loss": 0.0, + "num_input_tokens_seen": 23561184, + "step": 13750 + }, + { + "epoch": 66.76997578692495, + "grad_norm": 6.817879693699069e-06, + "learning_rate": 0.22066794747473198, + "loss": 0.0, + "num_input_tokens_seen": 23569696, + "step": 13755 + }, + { + "epoch": 66.79418886198548, + "grad_norm": 6.496049081761157e-06, + "learning_rate": 0.2206159838556562, + "loss": 0.0, + "num_input_tokens_seen": 23578336, + "step": 13760 + }, + { + "epoch": 66.818401937046, + "grad_norm": 8.887383955880068e-06, + "learning_rate": 0.2205640093467082, + "loss": 0.0, + "num_input_tokens_seen": 23587232, + "step": 13765 + }, + { + "epoch": 66.84261501210653, + "grad_norm": 3.132069423372741e-06, + "learning_rate": 0.22051202395590322, + "loss": 0.0, + "num_input_tokens_seen": 23595520, + "step": 13770 + }, + { + "epoch": 66.86682808716706, + "grad_norm": 9.28678400669014e-06, + "learning_rate": 0.22046002769125808, + "loss": 0.0, + "num_input_tokens_seen": 23604224, + "step": 13775 + }, + { + "epoch": 66.89104116222761, + "grad_norm": 5.194187906454317e-06, + "learning_rate": 0.2204080205607912, + "loss": 0.0, + "num_input_tokens_seen": 23612992, + "step": 13780 + }, + { + "epoch": 66.91525423728814, + "grad_norm": 1.1053780326619744e-05, + "learning_rate": 0.22035600257252272, + "loss": 0.0, + "num_input_tokens_seen": 23621472, + "step": 13785 + }, + { + "epoch": 66.93946731234867, + "grad_norm": 6.50177025818266e-06, + "learning_rate": 0.2203039737344745, + "loss": 0.0, + "num_input_tokens_seen": 23629984, + "step": 13790 + }, + { + "epoch": 66.9636803874092, + "grad_norm": 2.940330205092323e-06, + "learning_rate": 0.22025193405467003, + "loss": 0.0, + "num_input_tokens_seen": 23638336, + "step": 13795 + }, + { + "epoch": 66.98789346246973, + "grad_norm": 1.0178112461289857e-05, + "learning_rate": 0.2201998835411345, + "loss": 0.0, + "num_input_tokens_seen": 23646592, + "step": 13800 + }, + { + "epoch": 66.98789346246973, + "eval_loss": 0.7060869336128235, + "eval_runtime": 4.6085, + "eval_samples_per_second": 79.636, + "eval_steps_per_second": 19.963, + "num_input_tokens_seen": 23646592, + "step": 13800 + }, + { + "epoch": 67.01452784503633, + "grad_norm": 9.445935575058684e-06, + "learning_rate": 0.22014782220189474, + "loss": 0.0001, + "num_input_tokens_seen": 23655648, + "step": 13805 + }, + { + "epoch": 67.03874092009686, + "grad_norm": 5.297511052049231e-06, + "learning_rate": 0.2200957500449793, + "loss": 0.0, + "num_input_tokens_seen": 23664096, + "step": 13810 + }, + { + "epoch": 67.06295399515739, + "grad_norm": 3.2106825074151857e-06, + "learning_rate": 0.22004366707841827, + "loss": 0.0, + "num_input_tokens_seen": 23672608, + "step": 13815 + }, + { + "epoch": 67.08716707021792, + "grad_norm": 5.825221705890726e-06, + "learning_rate": 0.21999157331024358, + "loss": 0.0, + "num_input_tokens_seen": 23680864, + "step": 13820 + }, + { + "epoch": 67.11138014527845, + "grad_norm": 8.797274858807214e-06, + "learning_rate": 0.21993946874848871, + "loss": 0.0, + "num_input_tokens_seen": 23689408, + "step": 13825 + }, + { + "epoch": 67.13559322033899, + "grad_norm": 4.77911862617475e-06, + "learning_rate": 0.2198873534011888, + "loss": 0.0, + "num_input_tokens_seen": 23698144, + "step": 13830 + }, + { + "epoch": 67.15980629539952, + "grad_norm": 2.7884175324288663e-06, + "learning_rate": 0.2198352272763808, + "loss": 0.0, + "num_input_tokens_seen": 23706624, + "step": 13835 + }, + { + "epoch": 67.18401937046005, + "grad_norm": 4.087565230292967e-06, + "learning_rate": 0.2197830903821031, + "loss": 0.0, + "num_input_tokens_seen": 23715392, + "step": 13840 + }, + { + "epoch": 67.20823244552058, + "grad_norm": 3.7213553696346935e-06, + "learning_rate": 0.21973094272639598, + "loss": 0.0, + "num_input_tokens_seen": 23723936, + "step": 13845 + }, + { + "epoch": 67.23244552058111, + "grad_norm": 4.626612280844711e-06, + "learning_rate": 0.21967878431730117, + "loss": 0.0, + "num_input_tokens_seen": 23732544, + "step": 13850 + }, + { + "epoch": 67.25665859564165, + "grad_norm": 1.100535155273974e-05, + "learning_rate": 0.21962661516286217, + "loss": 0.0, + "num_input_tokens_seen": 23741056, + "step": 13855 + }, + { + "epoch": 67.28087167070218, + "grad_norm": 4.229372279951349e-06, + "learning_rate": 0.21957443527112414, + "loss": 0.0, + "num_input_tokens_seen": 23749504, + "step": 13860 + }, + { + "epoch": 67.30508474576271, + "grad_norm": 8.574807907280046e-06, + "learning_rate": 0.21952224465013384, + "loss": 0.0, + "num_input_tokens_seen": 23758208, + "step": 13865 + }, + { + "epoch": 67.32929782082324, + "grad_norm": 9.940144991560373e-06, + "learning_rate": 0.21947004330793976, + "loss": 0.0, + "num_input_tokens_seen": 23766784, + "step": 13870 + }, + { + "epoch": 67.35351089588377, + "grad_norm": 9.481838105784846e-07, + "learning_rate": 0.21941783125259198, + "loss": 0.0, + "num_input_tokens_seen": 23775136, + "step": 13875 + }, + { + "epoch": 67.37772397094432, + "grad_norm": 1.763680120347999e-05, + "learning_rate": 0.21936560849214226, + "loss": 0.0, + "num_input_tokens_seen": 23784000, + "step": 13880 + }, + { + "epoch": 67.40193704600485, + "grad_norm": 5.275309376884252e-06, + "learning_rate": 0.21931337503464404, + "loss": 0.0, + "num_input_tokens_seen": 23792608, + "step": 13885 + }, + { + "epoch": 67.42615012106538, + "grad_norm": 4.81705956190126e-06, + "learning_rate": 0.21926113088815233, + "loss": 0.0, + "num_input_tokens_seen": 23801280, + "step": 13890 + }, + { + "epoch": 67.4503631961259, + "grad_norm": 5.212139967625262e-06, + "learning_rate": 0.2192088760607238, + "loss": 0.0, + "num_input_tokens_seen": 23810016, + "step": 13895 + }, + { + "epoch": 67.47457627118644, + "grad_norm": 4.56682801086572e-06, + "learning_rate": 0.2191566105604169, + "loss": 0.0, + "num_input_tokens_seen": 23818432, + "step": 13900 + }, + { + "epoch": 67.49878934624698, + "grad_norm": 7.403093150060158e-06, + "learning_rate": 0.21910433439529153, + "loss": 0.0, + "num_input_tokens_seen": 23827008, + "step": 13905 + }, + { + "epoch": 67.52300242130751, + "grad_norm": 9.388378202856984e-06, + "learning_rate": 0.2190520475734094, + "loss": 0.0, + "num_input_tokens_seen": 23835744, + "step": 13910 + }, + { + "epoch": 67.54721549636804, + "grad_norm": 4.237809662299696e-06, + "learning_rate": 0.2189997501028338, + "loss": 0.0, + "num_input_tokens_seen": 23844352, + "step": 13915 + }, + { + "epoch": 67.57142857142857, + "grad_norm": 2.051441924777464e-06, + "learning_rate": 0.2189474419916296, + "loss": 0.0, + "num_input_tokens_seen": 23852672, + "step": 13920 + }, + { + "epoch": 67.5956416464891, + "grad_norm": 9.982405572372954e-06, + "learning_rate": 0.21889512324786342, + "loss": 0.0, + "num_input_tokens_seen": 23861088, + "step": 13925 + }, + { + "epoch": 67.61985472154964, + "grad_norm": 1.1629186701611616e-05, + "learning_rate": 0.21884279387960345, + "loss": 0.0, + "num_input_tokens_seen": 23869664, + "step": 13930 + }, + { + "epoch": 67.64406779661017, + "grad_norm": 3.7032505133538507e-06, + "learning_rate": 0.2187904538949195, + "loss": 0.0, + "num_input_tokens_seen": 23878016, + "step": 13935 + }, + { + "epoch": 67.6682808716707, + "grad_norm": 9.97778170130914e-06, + "learning_rate": 0.2187381033018831, + "loss": 0.0, + "num_input_tokens_seen": 23886848, + "step": 13940 + }, + { + "epoch": 67.69249394673123, + "grad_norm": 4.8150313887163065e-06, + "learning_rate": 0.2186857421085673, + "loss": 0.0, + "num_input_tokens_seen": 23895328, + "step": 13945 + }, + { + "epoch": 67.71670702179176, + "grad_norm": 4.7228213588823564e-06, + "learning_rate": 0.21863337032304697, + "loss": 0.0, + "num_input_tokens_seen": 23903840, + "step": 13950 + }, + { + "epoch": 67.7409200968523, + "grad_norm": 5.9161898207094055e-06, + "learning_rate": 0.21858098795339845, + "loss": 0.0, + "num_input_tokens_seen": 23912320, + "step": 13955 + }, + { + "epoch": 67.76513317191284, + "grad_norm": 6.836526608822169e-06, + "learning_rate": 0.21852859500769975, + "loss": 0.0, + "num_input_tokens_seen": 23921024, + "step": 13960 + }, + { + "epoch": 67.78934624697337, + "grad_norm": 1.6995856640278362e-05, + "learning_rate": 0.21847619149403044, + "loss": 0.0, + "num_input_tokens_seen": 23929920, + "step": 13965 + }, + { + "epoch": 67.8135593220339, + "grad_norm": 7.0002129177737515e-06, + "learning_rate": 0.21842377742047195, + "loss": 0.0, + "num_input_tokens_seen": 23938304, + "step": 13970 + }, + { + "epoch": 67.83777239709443, + "grad_norm": 5.550267360376893e-06, + "learning_rate": 0.21837135279510705, + "loss": 0.0, + "num_input_tokens_seen": 23946592, + "step": 13975 + }, + { + "epoch": 67.86198547215497, + "grad_norm": 6.659209702775115e-06, + "learning_rate": 0.21831891762602038, + "loss": 0.0, + "num_input_tokens_seen": 23955264, + "step": 13980 + }, + { + "epoch": 67.8861985472155, + "grad_norm": 5.9058361330244225e-06, + "learning_rate": 0.21826647192129806, + "loss": 0.0, + "num_input_tokens_seen": 23964384, + "step": 13985 + }, + { + "epoch": 67.91041162227603, + "grad_norm": 3.4883030366472667e-06, + "learning_rate": 0.21821401568902787, + "loss": 0.0, + "num_input_tokens_seen": 23972576, + "step": 13990 + }, + { + "epoch": 67.93462469733656, + "grad_norm": 1.111199253500672e-05, + "learning_rate": 0.21816154893729925, + "loss": 0.0, + "num_input_tokens_seen": 23981056, + "step": 13995 + }, + { + "epoch": 67.95883777239709, + "grad_norm": 2.2545489173353417e-06, + "learning_rate": 0.2181090716742032, + "loss": 0.0, + "num_input_tokens_seen": 23989408, + "step": 14000 + }, + { + "epoch": 67.95883777239709, + "eval_loss": 0.7121654748916626, + "eval_runtime": 4.621, + "eval_samples_per_second": 79.42, + "eval_steps_per_second": 19.909, + "num_input_tokens_seen": 23989408, + "step": 14000 + }, + { + "epoch": 67.98305084745763, + "grad_norm": 4.681936843553558e-06, + "learning_rate": 0.21805658390783236, + "loss": 0.0, + "num_input_tokens_seen": 23998144, + "step": 14005 + }, + { + "epoch": 68.00968523002422, + "grad_norm": 1.8157859358325368e-06, + "learning_rate": 0.21800408564628107, + "loss": 0.0, + "num_input_tokens_seen": 24007264, + "step": 14010 + }, + { + "epoch": 68.03389830508475, + "grad_norm": 6.567765467480058e-06, + "learning_rate": 0.21795157689764516, + "loss": 0.0, + "num_input_tokens_seen": 24015904, + "step": 14015 + }, + { + "epoch": 68.05811138014528, + "grad_norm": 3.97023586629075e-06, + "learning_rate": 0.21789905767002216, + "loss": 0.0, + "num_input_tokens_seen": 24024096, + "step": 14020 + }, + { + "epoch": 68.08232445520581, + "grad_norm": 9.475580554862972e-06, + "learning_rate": 0.2178465279715112, + "loss": 0.0, + "num_input_tokens_seen": 24032768, + "step": 14025 + }, + { + "epoch": 68.10653753026634, + "grad_norm": 6.08948539593257e-06, + "learning_rate": 0.21779398781021303, + "loss": 0.0, + "num_input_tokens_seen": 24041408, + "step": 14030 + }, + { + "epoch": 68.13075060532688, + "grad_norm": 1.44763253047131e-05, + "learning_rate": 0.21774143719422998, + "loss": 0.0, + "num_input_tokens_seen": 24049664, + "step": 14035 + }, + { + "epoch": 68.15496368038741, + "grad_norm": 1.0121534614881966e-05, + "learning_rate": 0.21768887613166601, + "loss": 0.0, + "num_input_tokens_seen": 24058560, + "step": 14040 + }, + { + "epoch": 68.17917675544794, + "grad_norm": 3.3781495858420385e-06, + "learning_rate": 0.2176363046306267, + "loss": 0.0, + "num_input_tokens_seen": 24067136, + "step": 14045 + }, + { + "epoch": 68.20338983050847, + "grad_norm": 4.696848463936476e-06, + "learning_rate": 0.21758372269921925, + "loss": 0.0, + "num_input_tokens_seen": 24075936, + "step": 14050 + }, + { + "epoch": 68.227602905569, + "grad_norm": 7.313691185117932e-06, + "learning_rate": 0.21753113034555244, + "loss": 0.0, + "num_input_tokens_seen": 24084928, + "step": 14055 + }, + { + "epoch": 68.25181598062954, + "grad_norm": 3.49187507708848e-06, + "learning_rate": 0.2174785275777367, + "loss": 0.0, + "num_input_tokens_seen": 24093696, + "step": 14060 + }, + { + "epoch": 68.27602905569007, + "grad_norm": 1.9433844045124715e-06, + "learning_rate": 0.21742591440388404, + "loss": 0.0, + "num_input_tokens_seen": 24102368, + "step": 14065 + }, + { + "epoch": 68.3002421307506, + "grad_norm": 4.00757653551409e-06, + "learning_rate": 0.21737329083210802, + "loss": 0.0, + "num_input_tokens_seen": 24110976, + "step": 14070 + }, + { + "epoch": 68.32445520581113, + "grad_norm": 8.592336598667316e-06, + "learning_rate": 0.2173206568705239, + "loss": 0.0, + "num_input_tokens_seen": 24119456, + "step": 14075 + }, + { + "epoch": 68.34866828087166, + "grad_norm": 5.4087558964965865e-06, + "learning_rate": 0.2172680125272485, + "loss": 0.0, + "num_input_tokens_seen": 24127808, + "step": 14080 + }, + { + "epoch": 68.37288135593221, + "grad_norm": 2.8954357276234077e-06, + "learning_rate": 0.2172153578104002, + "loss": 0.0, + "num_input_tokens_seen": 24136192, + "step": 14085 + }, + { + "epoch": 68.39709443099274, + "grad_norm": 1.7926773807630525e-06, + "learning_rate": 0.21716269272809902, + "loss": 0.0, + "num_input_tokens_seen": 24144896, + "step": 14090 + }, + { + "epoch": 68.42130750605327, + "grad_norm": 1.3333651622815523e-05, + "learning_rate": 0.21711001728846666, + "loss": 0.0, + "num_input_tokens_seen": 24153280, + "step": 14095 + }, + { + "epoch": 68.4455205811138, + "grad_norm": 1.5149900036703912e-06, + "learning_rate": 0.21705733149962628, + "loss": 0.0, + "num_input_tokens_seen": 24161888, + "step": 14100 + }, + { + "epoch": 68.46973365617433, + "grad_norm": 7.000043751759222e-06, + "learning_rate": 0.21700463536970263, + "loss": 0.0, + "num_input_tokens_seen": 24170208, + "step": 14105 + }, + { + "epoch": 68.49394673123487, + "grad_norm": 4.802072453458095e-06, + "learning_rate": 0.21695192890682222, + "loss": 0.0, + "num_input_tokens_seen": 24178496, + "step": 14110 + }, + { + "epoch": 68.5181598062954, + "grad_norm": 5.617511305899825e-06, + "learning_rate": 0.21689921211911298, + "loss": 0.0, + "num_input_tokens_seen": 24187328, + "step": 14115 + }, + { + "epoch": 68.54237288135593, + "grad_norm": 7.910864042059984e-06, + "learning_rate": 0.21684648501470452, + "loss": 0.0, + "num_input_tokens_seen": 24195904, + "step": 14120 + }, + { + "epoch": 68.56658595641646, + "grad_norm": 8.641353815619368e-06, + "learning_rate": 0.216793747601728, + "loss": 0.0, + "num_input_tokens_seen": 24204224, + "step": 14125 + }, + { + "epoch": 68.59079903147699, + "grad_norm": 2.8409381229721475e-06, + "learning_rate": 0.21674099988831627, + "loss": 0.0, + "num_input_tokens_seen": 24212864, + "step": 14130 + }, + { + "epoch": 68.61501210653753, + "grad_norm": 1.089091438188916e-05, + "learning_rate": 0.21668824188260363, + "loss": 0.0, + "num_input_tokens_seen": 24221216, + "step": 14135 + }, + { + "epoch": 68.63922518159806, + "grad_norm": 8.112468094623182e-06, + "learning_rate": 0.21663547359272606, + "loss": 0.0, + "num_input_tokens_seen": 24229504, + "step": 14140 + }, + { + "epoch": 68.6634382566586, + "grad_norm": 9.313052942161448e-06, + "learning_rate": 0.216582695026821, + "loss": 0.0, + "num_input_tokens_seen": 24238240, + "step": 14145 + }, + { + "epoch": 68.68765133171912, + "grad_norm": 1.0316389307263307e-05, + "learning_rate": 0.21652990619302767, + "loss": 0.0, + "num_input_tokens_seen": 24247008, + "step": 14150 + }, + { + "epoch": 68.71186440677967, + "grad_norm": 2.9572095172625268e-06, + "learning_rate": 0.21647710709948673, + "loss": 0.0, + "num_input_tokens_seen": 24255776, + "step": 14155 + }, + { + "epoch": 68.7360774818402, + "grad_norm": 5.288602551445365e-06, + "learning_rate": 0.2164242977543405, + "loss": 0.0, + "num_input_tokens_seen": 24264160, + "step": 14160 + }, + { + "epoch": 68.76029055690073, + "grad_norm": 1.326646361121675e-05, + "learning_rate": 0.21637147816573277, + "loss": 0.0, + "num_input_tokens_seen": 24272768, + "step": 14165 + }, + { + "epoch": 68.78450363196126, + "grad_norm": 6.511041647172533e-06, + "learning_rate": 0.21631864834180908, + "loss": 0.0, + "num_input_tokens_seen": 24281600, + "step": 14170 + }, + { + "epoch": 68.80871670702179, + "grad_norm": 4.5509409574151505e-06, + "learning_rate": 0.21626580829071637, + "loss": 0.0, + "num_input_tokens_seen": 24290240, + "step": 14175 + }, + { + "epoch": 68.83292978208233, + "grad_norm": 4.352874839241849e-06, + "learning_rate": 0.21621295802060328, + "loss": 0.0, + "num_input_tokens_seen": 24298816, + "step": 14180 + }, + { + "epoch": 68.85714285714286, + "grad_norm": 7.238736088766018e-06, + "learning_rate": 0.21616009753961996, + "loss": 0.0, + "num_input_tokens_seen": 24307200, + "step": 14185 + }, + { + "epoch": 68.88135593220339, + "grad_norm": 2.34105868912593e-06, + "learning_rate": 0.2161072268559182, + "loss": 0.0, + "num_input_tokens_seen": 24315776, + "step": 14190 + }, + { + "epoch": 68.90556900726392, + "grad_norm": 7.494660167139955e-06, + "learning_rate": 0.21605434597765133, + "loss": 0.0, + "num_input_tokens_seen": 24324160, + "step": 14195 + }, + { + "epoch": 68.92978208232445, + "grad_norm": 2.0842110188823426e-06, + "learning_rate": 0.21600145491297418, + "loss": 0.0, + "num_input_tokens_seen": 24332544, + "step": 14200 + }, + { + "epoch": 68.92978208232445, + "eval_loss": 0.7210355997085571, + "eval_runtime": 4.6201, + "eval_samples_per_second": 79.435, + "eval_steps_per_second": 19.913, + "num_input_tokens_seen": 24332544, + "step": 14200 + }, + { + "epoch": 68.953995157385, + "grad_norm": 2.451803538860986e-06, + "learning_rate": 0.21594855367004326, + "loss": 0.0, + "num_input_tokens_seen": 24341152, + "step": 14205 + }, + { + "epoch": 68.97820823244552, + "grad_norm": 6.910451247676974e-06, + "learning_rate": 0.21589564225701663, + "loss": 0.0, + "num_input_tokens_seen": 24349472, + "step": 14210 + }, + { + "epoch": 69.00484261501211, + "grad_norm": 5.3917010518489406e-06, + "learning_rate": 0.21584272068205385, + "loss": 0.0, + "num_input_tokens_seen": 24358432, + "step": 14215 + }, + { + "epoch": 69.02905569007264, + "grad_norm": 4.639612143364502e-06, + "learning_rate": 0.2157897889533161, + "loss": 0.0, + "num_input_tokens_seen": 24367040, + "step": 14220 + }, + { + "epoch": 69.05326876513317, + "grad_norm": 3.7296540540410206e-06, + "learning_rate": 0.21573684707896612, + "loss": 0.0, + "num_input_tokens_seen": 24375616, + "step": 14225 + }, + { + "epoch": 69.0774818401937, + "grad_norm": 4.931417606712785e-06, + "learning_rate": 0.21568389506716826, + "loss": 0.0, + "num_input_tokens_seen": 24383840, + "step": 14230 + }, + { + "epoch": 69.10169491525424, + "grad_norm": 6.235172804736067e-06, + "learning_rate": 0.21563093292608831, + "loss": 0.0, + "num_input_tokens_seen": 24392736, + "step": 14235 + }, + { + "epoch": 69.12590799031477, + "grad_norm": 9.166265613202995e-07, + "learning_rate": 0.21557796066389376, + "loss": 0.0, + "num_input_tokens_seen": 24401056, + "step": 14240 + }, + { + "epoch": 69.1501210653753, + "grad_norm": 4.043148692289833e-06, + "learning_rate": 0.21552497828875353, + "loss": 0.0, + "num_input_tokens_seen": 24409664, + "step": 14245 + }, + { + "epoch": 69.17433414043583, + "grad_norm": 3.2384289170295233e-06, + "learning_rate": 0.21547198580883828, + "loss": 0.0, + "num_input_tokens_seen": 24418080, + "step": 14250 + }, + { + "epoch": 69.19854721549636, + "grad_norm": 7.815346179995686e-06, + "learning_rate": 0.21541898323232, + "loss": 0.0, + "num_input_tokens_seen": 24426624, + "step": 14255 + }, + { + "epoch": 69.2227602905569, + "grad_norm": 6.224007847777102e-06, + "learning_rate": 0.2153659705673724, + "loss": 0.0, + "num_input_tokens_seen": 24434912, + "step": 14260 + }, + { + "epoch": 69.24697336561744, + "grad_norm": 1.249558522431471e-06, + "learning_rate": 0.2153129478221707, + "loss": 0.0, + "num_input_tokens_seen": 24443488, + "step": 14265 + }, + { + "epoch": 69.27118644067797, + "grad_norm": 3.94716653318028e-06, + "learning_rate": 0.21525991500489164, + "loss": 0.0, + "num_input_tokens_seen": 24452640, + "step": 14270 + }, + { + "epoch": 69.2953995157385, + "grad_norm": 5.2542104640451726e-06, + "learning_rate": 0.21520687212371362, + "loss": 0.0, + "num_input_tokens_seen": 24461248, + "step": 14275 + }, + { + "epoch": 69.31961259079903, + "grad_norm": 5.845005489391042e-06, + "learning_rate": 0.21515381918681648, + "loss": 0.0, + "num_input_tokens_seen": 24469984, + "step": 14280 + }, + { + "epoch": 69.34382566585957, + "grad_norm": 1.1513915524119511e-05, + "learning_rate": 0.21510075620238167, + "loss": 0.0, + "num_input_tokens_seen": 24478240, + "step": 14285 + }, + { + "epoch": 69.3680387409201, + "grad_norm": 6.541861694131512e-06, + "learning_rate": 0.21504768317859208, + "loss": 0.0, + "num_input_tokens_seen": 24486624, + "step": 14290 + }, + { + "epoch": 69.39225181598063, + "grad_norm": 8.087914466159418e-06, + "learning_rate": 0.2149946001236323, + "loss": 0.0, + "num_input_tokens_seen": 24495008, + "step": 14295 + }, + { + "epoch": 69.41646489104116, + "grad_norm": 3.647468020062661e-06, + "learning_rate": 0.21494150704568848, + "loss": 0.0, + "num_input_tokens_seen": 24503392, + "step": 14300 + }, + { + "epoch": 69.44067796610169, + "grad_norm": 5.8408659242559224e-06, + "learning_rate": 0.21488840395294811, + "loss": 0.0, + "num_input_tokens_seen": 24511872, + "step": 14305 + }, + { + "epoch": 69.46489104116223, + "grad_norm": 5.993641480017686e-06, + "learning_rate": 0.21483529085360042, + "loss": 0.0, + "num_input_tokens_seen": 24520224, + "step": 14310 + }, + { + "epoch": 69.48910411622276, + "grad_norm": 5.431258159660501e-06, + "learning_rate": 0.2147821677558361, + "loss": 0.0, + "num_input_tokens_seen": 24528736, + "step": 14315 + }, + { + "epoch": 69.51331719128329, + "grad_norm": 7.33085471438244e-06, + "learning_rate": 0.2147290346678475, + "loss": 0.0, + "num_input_tokens_seen": 24537024, + "step": 14320 + }, + { + "epoch": 69.53753026634382, + "grad_norm": 7.1031295192369726e-06, + "learning_rate": 0.21467589159782827, + "loss": 0.0, + "num_input_tokens_seen": 24545856, + "step": 14325 + }, + { + "epoch": 69.56174334140435, + "grad_norm": 5.370317467168206e-06, + "learning_rate": 0.21462273855397374, + "loss": 0.0, + "num_input_tokens_seen": 24554624, + "step": 14330 + }, + { + "epoch": 69.5859564164649, + "grad_norm": 3.4275110465387115e-06, + "learning_rate": 0.21456957554448083, + "loss": 0.0, + "num_input_tokens_seen": 24563552, + "step": 14335 + }, + { + "epoch": 69.61016949152543, + "grad_norm": 1.9038150185224367e-06, + "learning_rate": 0.21451640257754795, + "loss": 0.0, + "num_input_tokens_seen": 24572288, + "step": 14340 + }, + { + "epoch": 69.63438256658596, + "grad_norm": 1.0349912372475956e-05, + "learning_rate": 0.21446321966137508, + "loss": 0.0, + "num_input_tokens_seen": 24580928, + "step": 14345 + }, + { + "epoch": 69.65859564164649, + "grad_norm": 3.7406252886285074e-06, + "learning_rate": 0.21441002680416354, + "loss": 0.0, + "num_input_tokens_seen": 24589696, + "step": 14350 + }, + { + "epoch": 69.68280871670702, + "grad_norm": 2.535851763241226e-06, + "learning_rate": 0.21435682401411654, + "loss": 0.0, + "num_input_tokens_seen": 24598144, + "step": 14355 + }, + { + "epoch": 69.70702179176756, + "grad_norm": 4.735127731692046e-06, + "learning_rate": 0.2143036112994385, + "loss": 0.0, + "num_input_tokens_seen": 24607040, + "step": 14360 + }, + { + "epoch": 69.73123486682809, + "grad_norm": 5.605635578831425e-06, + "learning_rate": 0.21425038866833548, + "loss": 0.0, + "num_input_tokens_seen": 24615840, + "step": 14365 + }, + { + "epoch": 69.75544794188862, + "grad_norm": 4.752309450850589e-06, + "learning_rate": 0.21419715612901508, + "loss": 0.0, + "num_input_tokens_seen": 24624384, + "step": 14370 + }, + { + "epoch": 69.77966101694915, + "grad_norm": 3.5681821373145794e-06, + "learning_rate": 0.21414391368968652, + "loss": 0.0, + "num_input_tokens_seen": 24633408, + "step": 14375 + }, + { + "epoch": 69.80387409200968, + "grad_norm": 4.659721980715403e-06, + "learning_rate": 0.21409066135856034, + "loss": 0.0, + "num_input_tokens_seen": 24641632, + "step": 14380 + }, + { + "epoch": 69.82808716707022, + "grad_norm": 2.250860234198626e-06, + "learning_rate": 0.21403739914384878, + "loss": 0.0, + "num_input_tokens_seen": 24649984, + "step": 14385 + }, + { + "epoch": 69.85230024213075, + "grad_norm": 1.3878762956665014e-06, + "learning_rate": 0.21398412705376554, + "loss": 0.0, + "num_input_tokens_seen": 24658496, + "step": 14390 + }, + { + "epoch": 69.87651331719128, + "grad_norm": 3.252630904171383e-06, + "learning_rate": 0.2139308450965258, + "loss": 0.0, + "num_input_tokens_seen": 24667072, + "step": 14395 + }, + { + "epoch": 69.90072639225181, + "grad_norm": 5.937481546425261e-06, + "learning_rate": 0.21387755328034638, + "loss": 0.0, + "num_input_tokens_seen": 24675424, + "step": 14400 + }, + { + "epoch": 69.90072639225181, + "eval_loss": 0.7237737774848938, + "eval_runtime": 4.6196, + "eval_samples_per_second": 79.444, + "eval_steps_per_second": 19.915, + "num_input_tokens_seen": 24675424, + "step": 14400 + }, + { + "epoch": 69.92493946731234, + "grad_norm": 2.0387512904562755e-06, + "learning_rate": 0.2138242516134455, + "loss": 0.0, + "num_input_tokens_seen": 24683872, + "step": 14405 + }, + { + "epoch": 69.94915254237289, + "grad_norm": 6.952817784622312e-06, + "learning_rate": 0.2137709401040429, + "loss": 0.0, + "num_input_tokens_seen": 24692064, + "step": 14410 + }, + { + "epoch": 69.97336561743342, + "grad_norm": 5.439031610876555e-06, + "learning_rate": 0.21371761876036, + "loss": 0.0, + "num_input_tokens_seen": 24700640, + "step": 14415 + }, + { + "epoch": 69.99757869249395, + "grad_norm": 2.180835963372374e-06, + "learning_rate": 0.21366428759061956, + "loss": 0.0, + "num_input_tokens_seen": 24709408, + "step": 14420 + }, + { + "epoch": 70.02421307506053, + "grad_norm": 4.697490567195928e-06, + "learning_rate": 0.2136109466030459, + "loss": 0.0, + "num_input_tokens_seen": 24718304, + "step": 14425 + }, + { + "epoch": 70.04842615012106, + "grad_norm": 5.475110356201185e-07, + "learning_rate": 0.2135575958058649, + "loss": 0.0, + "num_input_tokens_seen": 24726624, + "step": 14430 + }, + { + "epoch": 70.0726392251816, + "grad_norm": 6.747668521711603e-06, + "learning_rate": 0.2135042352073039, + "loss": 0.0, + "num_input_tokens_seen": 24734848, + "step": 14435 + }, + { + "epoch": 70.09685230024213, + "grad_norm": 2.5325714432256063e-06, + "learning_rate": 0.2134508648155918, + "loss": 0.0, + "num_input_tokens_seen": 24743104, + "step": 14440 + }, + { + "epoch": 70.12106537530266, + "grad_norm": 2.3080558548826957e-06, + "learning_rate": 0.213397484638959, + "loss": 0.0, + "num_input_tokens_seen": 24751776, + "step": 14445 + }, + { + "epoch": 70.1452784503632, + "grad_norm": 4.630208422895521e-06, + "learning_rate": 0.21334409468563728, + "loss": 0.0, + "num_input_tokens_seen": 24760256, + "step": 14450 + }, + { + "epoch": 70.16949152542372, + "grad_norm": 4.7505536713288166e-06, + "learning_rate": 0.2132906949638602, + "loss": 0.0, + "num_input_tokens_seen": 24768992, + "step": 14455 + }, + { + "epoch": 70.19370460048427, + "grad_norm": 2.0633408439607592e-06, + "learning_rate": 0.21323728548186255, + "loss": 0.0, + "num_input_tokens_seen": 24777504, + "step": 14460 + }, + { + "epoch": 70.2179176755448, + "grad_norm": 4.761607215186814e-06, + "learning_rate": 0.21318386624788088, + "loss": 0.0, + "num_input_tokens_seen": 24785792, + "step": 14465 + }, + { + "epoch": 70.24213075060533, + "grad_norm": 3.0875028187438147e-06, + "learning_rate": 0.21313043727015288, + "loss": 0.0, + "num_input_tokens_seen": 24794336, + "step": 14470 + }, + { + "epoch": 70.26634382566586, + "grad_norm": 6.566215233760886e-06, + "learning_rate": 0.2130769985569182, + "loss": 0.0, + "num_input_tokens_seen": 24803136, + "step": 14475 + }, + { + "epoch": 70.29055690072639, + "grad_norm": 4.559372882795287e-06, + "learning_rate": 0.21302355011641766, + "loss": 0.0, + "num_input_tokens_seen": 24811904, + "step": 14480 + }, + { + "epoch": 70.31476997578693, + "grad_norm": 2.45583260038984e-06, + "learning_rate": 0.21297009195689365, + "loss": 0.0, + "num_input_tokens_seen": 24820800, + "step": 14485 + }, + { + "epoch": 70.33898305084746, + "grad_norm": 5.7342399486515205e-06, + "learning_rate": 0.21291662408659015, + "loss": 0.0, + "num_input_tokens_seen": 24829088, + "step": 14490 + }, + { + "epoch": 70.36319612590799, + "grad_norm": 9.70578366832342e-06, + "learning_rate": 0.21286314651375254, + "loss": 0.0, + "num_input_tokens_seen": 24837920, + "step": 14495 + }, + { + "epoch": 70.38740920096852, + "grad_norm": 6.1799601098755375e-06, + "learning_rate": 0.2128096592466278, + "loss": 0.0, + "num_input_tokens_seen": 24846528, + "step": 14500 + }, + { + "epoch": 70.41162227602905, + "grad_norm": 2.4332559860340552e-06, + "learning_rate": 0.21275616229346428, + "loss": 0.0, + "num_input_tokens_seen": 24854912, + "step": 14505 + }, + { + "epoch": 70.4358353510896, + "grad_norm": 5.071214673080249e-06, + "learning_rate": 0.21270265566251184, + "loss": 0.0, + "num_input_tokens_seen": 24863520, + "step": 14510 + }, + { + "epoch": 70.46004842615012, + "grad_norm": 5.940078608546173e-06, + "learning_rate": 0.21264913936202193, + "loss": 0.0, + "num_input_tokens_seen": 24871936, + "step": 14515 + }, + { + "epoch": 70.48426150121065, + "grad_norm": 4.364813321444672e-06, + "learning_rate": 0.2125956134002475, + "loss": 0.0, + "num_input_tokens_seen": 24880896, + "step": 14520 + }, + { + "epoch": 70.50847457627118, + "grad_norm": 4.4609332690015435e-06, + "learning_rate": 0.2125420777854428, + "loss": 0.0, + "num_input_tokens_seen": 24889344, + "step": 14525 + }, + { + "epoch": 70.53268765133171, + "grad_norm": 1.505543764324102e-06, + "learning_rate": 0.21248853252586372, + "loss": 0.0, + "num_input_tokens_seen": 24897792, + "step": 14530 + }, + { + "epoch": 70.55690072639226, + "grad_norm": 2.717634970395011e-06, + "learning_rate": 0.21243497762976774, + "loss": 0.0, + "num_input_tokens_seen": 24906368, + "step": 14535 + }, + { + "epoch": 70.58111380145279, + "grad_norm": 3.852481768262805e-06, + "learning_rate": 0.21238141310541356, + "loss": 0.0, + "num_input_tokens_seen": 24915040, + "step": 14540 + }, + { + "epoch": 70.60532687651332, + "grad_norm": 4.386998170957668e-06, + "learning_rate": 0.21232783896106153, + "loss": 0.0, + "num_input_tokens_seen": 24923808, + "step": 14545 + }, + { + "epoch": 70.62953995157385, + "grad_norm": 6.365026820276398e-06, + "learning_rate": 0.21227425520497345, + "loss": 0.0, + "num_input_tokens_seen": 24932128, + "step": 14550 + }, + { + "epoch": 70.65375302663438, + "grad_norm": 4.244815954734804e-06, + "learning_rate": 0.2122206618454127, + "loss": 0.0, + "num_input_tokens_seen": 24940960, + "step": 14555 + }, + { + "epoch": 70.67796610169492, + "grad_norm": 5.688802048098296e-06, + "learning_rate": 0.2121670588906439, + "loss": 0.0, + "num_input_tokens_seen": 24949664, + "step": 14560 + }, + { + "epoch": 70.70217917675545, + "grad_norm": 2.6900306693278253e-06, + "learning_rate": 0.21211344634893345, + "loss": 0.0, + "num_input_tokens_seen": 24958016, + "step": 14565 + }, + { + "epoch": 70.72639225181598, + "grad_norm": 1.1367541446816176e-05, + "learning_rate": 0.21205982422854897, + "loss": 0.0, + "num_input_tokens_seen": 24966240, + "step": 14570 + }, + { + "epoch": 70.75060532687651, + "grad_norm": 4.776866262545809e-06, + "learning_rate": 0.21200619253775974, + "loss": 0.0, + "num_input_tokens_seen": 24974976, + "step": 14575 + }, + { + "epoch": 70.77481840193704, + "grad_norm": 7.481365855710465e-07, + "learning_rate": 0.21195255128483637, + "loss": 0.0, + "num_input_tokens_seen": 24983456, + "step": 14580 + }, + { + "epoch": 70.79903147699758, + "grad_norm": 2.672436494322028e-06, + "learning_rate": 0.21189890047805102, + "loss": 0.0, + "num_input_tokens_seen": 24992032, + "step": 14585 + }, + { + "epoch": 70.82324455205811, + "grad_norm": 1.3263166920296499e-06, + "learning_rate": 0.21184524012567735, + "loss": 0.0, + "num_input_tokens_seen": 25000448, + "step": 14590 + }, + { + "epoch": 70.84745762711864, + "grad_norm": 3.568344482118846e-06, + "learning_rate": 0.2117915702359905, + "loss": 0.0, + "num_input_tokens_seen": 25009248, + "step": 14595 + }, + { + "epoch": 70.87167070217917, + "grad_norm": 2.8246927286090795e-06, + "learning_rate": 0.211737890817267, + "loss": 0.0, + "num_input_tokens_seen": 25017632, + "step": 14600 + }, + { + "epoch": 70.87167070217917, + "eval_loss": 0.7327545881271362, + "eval_runtime": 4.6186, + "eval_samples_per_second": 79.461, + "eval_steps_per_second": 19.919, + "num_input_tokens_seen": 25017632, + "step": 14600 + }, + { + "epoch": 70.8958837772397, + "grad_norm": 6.071125881135231e-06, + "learning_rate": 0.21168420187778483, + "loss": 0.0, + "num_input_tokens_seen": 25026208, + "step": 14605 + }, + { + "epoch": 70.92009685230025, + "grad_norm": 2.2870442535349866e-06, + "learning_rate": 0.21163050342582362, + "loss": 0.0, + "num_input_tokens_seen": 25034976, + "step": 14610 + }, + { + "epoch": 70.94430992736078, + "grad_norm": 1.242360326614289e-06, + "learning_rate": 0.21157679546966426, + "loss": 0.0, + "num_input_tokens_seen": 25043680, + "step": 14615 + }, + { + "epoch": 70.9685230024213, + "grad_norm": 3.7498612073250115e-06, + "learning_rate": 0.2115230780175892, + "loss": 0.0, + "num_input_tokens_seen": 25052480, + "step": 14620 + }, + { + "epoch": 70.99273607748184, + "grad_norm": 6.509853392344667e-06, + "learning_rate": 0.21146935107788237, + "loss": 0.0, + "num_input_tokens_seen": 25060736, + "step": 14625 + }, + { + "epoch": 71.01937046004842, + "grad_norm": 2.493075726306415e-06, + "learning_rate": 0.21141561465882916, + "loss": 0.0, + "num_input_tokens_seen": 25069824, + "step": 14630 + }, + { + "epoch": 71.04358353510897, + "grad_norm": 6.352835953293834e-06, + "learning_rate": 0.21136186876871635, + "loss": 0.0, + "num_input_tokens_seen": 25078624, + "step": 14635 + }, + { + "epoch": 71.0677966101695, + "grad_norm": 5.053944278188283e-06, + "learning_rate": 0.21130811341583225, + "loss": 0.0, + "num_input_tokens_seen": 25087360, + "step": 14640 + }, + { + "epoch": 71.09200968523002, + "grad_norm": 5.32328840563423e-07, + "learning_rate": 0.21125434860846667, + "loss": 0.0, + "num_input_tokens_seen": 25095776, + "step": 14645 + }, + { + "epoch": 71.11622276029055, + "grad_norm": 3.5237101201346377e-06, + "learning_rate": 0.2112005743549107, + "loss": 0.0, + "num_input_tokens_seen": 25104416, + "step": 14650 + }, + { + "epoch": 71.14043583535108, + "grad_norm": 2.446813596179709e-06, + "learning_rate": 0.21114679066345707, + "loss": 0.0, + "num_input_tokens_seen": 25113024, + "step": 14655 + }, + { + "epoch": 71.16464891041163, + "grad_norm": 4.483181783143664e-06, + "learning_rate": 0.21109299754239993, + "loss": 0.0, + "num_input_tokens_seen": 25121696, + "step": 14660 + }, + { + "epoch": 71.18886198547216, + "grad_norm": 2.4362477688555373e-06, + "learning_rate": 0.21103919500003482, + "loss": 0.0, + "num_input_tokens_seen": 25130016, + "step": 14665 + }, + { + "epoch": 71.21307506053269, + "grad_norm": 4.02672640120727e-06, + "learning_rate": 0.21098538304465872, + "loss": 0.0, + "num_input_tokens_seen": 25138528, + "step": 14670 + }, + { + "epoch": 71.23728813559322, + "grad_norm": 3.345276354593807e-06, + "learning_rate": 0.2109315616845702, + "loss": 0.0, + "num_input_tokens_seen": 25146816, + "step": 14675 + }, + { + "epoch": 71.26150121065375, + "grad_norm": 4.232868377584964e-06, + "learning_rate": 0.21087773092806925, + "loss": 0.0, + "num_input_tokens_seen": 25155424, + "step": 14680 + }, + { + "epoch": 71.28571428571429, + "grad_norm": 3.4294855595362606e-06, + "learning_rate": 0.21082389078345704, + "loss": 0.0, + "num_input_tokens_seen": 25164256, + "step": 14685 + }, + { + "epoch": 71.30992736077482, + "grad_norm": 4.884195732302032e-06, + "learning_rate": 0.2107700412590365, + "loss": 0.0, + "num_input_tokens_seen": 25172672, + "step": 14690 + }, + { + "epoch": 71.33414043583535, + "grad_norm": 5.41193321623723e-06, + "learning_rate": 0.210716182363112, + "loss": 0.0, + "num_input_tokens_seen": 25181216, + "step": 14695 + }, + { + "epoch": 71.35835351089588, + "grad_norm": 6.6239376792509574e-06, + "learning_rate": 0.2106623141039891, + "loss": 0.0, + "num_input_tokens_seen": 25189824, + "step": 14700 + }, + { + "epoch": 71.38256658595641, + "grad_norm": 9.540021892462391e-07, + "learning_rate": 0.21060843648997507, + "loss": 0.0, + "num_input_tokens_seen": 25198272, + "step": 14705 + }, + { + "epoch": 71.40677966101696, + "grad_norm": 3.7680513287341455e-06, + "learning_rate": 0.21055454952937844, + "loss": 0.0, + "num_input_tokens_seen": 25206752, + "step": 14710 + }, + { + "epoch": 71.43099273607749, + "grad_norm": 4.593310222844593e-06, + "learning_rate": 0.21050065323050937, + "loss": 0.0, + "num_input_tokens_seen": 25215040, + "step": 14715 + }, + { + "epoch": 71.45520581113801, + "grad_norm": 5.454597157950047e-06, + "learning_rate": 0.21044674760167928, + "loss": 0.0, + "num_input_tokens_seen": 25223680, + "step": 14720 + }, + { + "epoch": 71.47941888619854, + "grad_norm": 6.53955203233636e-06, + "learning_rate": 0.210392832651201, + "loss": 0.0, + "num_input_tokens_seen": 25232096, + "step": 14725 + }, + { + "epoch": 71.50363196125907, + "grad_norm": 2.5854617433651583e-06, + "learning_rate": 0.210338908387389, + "loss": 0.0, + "num_input_tokens_seen": 25240416, + "step": 14730 + }, + { + "epoch": 71.52784503631962, + "grad_norm": 4.199012892058818e-06, + "learning_rate": 0.21028497481855912, + "loss": 0.0, + "num_input_tokens_seen": 25248768, + "step": 14735 + }, + { + "epoch": 71.55205811138015, + "grad_norm": 1.513569714006735e-06, + "learning_rate": 0.21023103195302847, + "loss": 0.0, + "num_input_tokens_seen": 25257248, + "step": 14740 + }, + { + "epoch": 71.57627118644068, + "grad_norm": 6.806436431361362e-06, + "learning_rate": 0.21017707979911582, + "loss": 0.0, + "num_input_tokens_seen": 25265568, + "step": 14745 + }, + { + "epoch": 71.60048426150121, + "grad_norm": 4.530211299424991e-06, + "learning_rate": 0.21012311836514122, + "loss": 0.0, + "num_input_tokens_seen": 25274144, + "step": 14750 + }, + { + "epoch": 71.62469733656174, + "grad_norm": 2.3297184270631988e-06, + "learning_rate": 0.21006914765942622, + "loss": 0.0, + "num_input_tokens_seen": 25282816, + "step": 14755 + }, + { + "epoch": 71.64891041162228, + "grad_norm": 2.503594942027121e-06, + "learning_rate": 0.2100151676902938, + "loss": 0.0, + "num_input_tokens_seen": 25291520, + "step": 14760 + }, + { + "epoch": 71.67312348668281, + "grad_norm": 3.940167971450137e-06, + "learning_rate": 0.2099611784660683, + "loss": 0.0, + "num_input_tokens_seen": 25300224, + "step": 14765 + }, + { + "epoch": 71.69733656174334, + "grad_norm": 4.603918114298722e-06, + "learning_rate": 0.20990717999507552, + "loss": 0.0, + "num_input_tokens_seen": 25308640, + "step": 14770 + }, + { + "epoch": 71.72154963680387, + "grad_norm": 1.3687857745026122e-06, + "learning_rate": 0.20985317228564276, + "loss": 0.0, + "num_input_tokens_seen": 25317408, + "step": 14775 + }, + { + "epoch": 71.7457627118644, + "grad_norm": 7.037820068944711e-07, + "learning_rate": 0.20979915534609872, + "loss": 0.0, + "num_input_tokens_seen": 25325760, + "step": 14780 + }, + { + "epoch": 71.76997578692495, + "grad_norm": 2.1594385088974377e-06, + "learning_rate": 0.20974512918477342, + "loss": 0.0, + "num_input_tokens_seen": 25334080, + "step": 14785 + }, + { + "epoch": 71.79418886198548, + "grad_norm": 5.888786290597636e-06, + "learning_rate": 0.2096910938099984, + "loss": 0.0, + "num_input_tokens_seen": 25343072, + "step": 14790 + }, + { + "epoch": 71.818401937046, + "grad_norm": 2.3791114927007584e-06, + "learning_rate": 0.2096370492301066, + "loss": 0.0, + "num_input_tokens_seen": 25351968, + "step": 14795 + }, + { + "epoch": 71.84261501210653, + "grad_norm": 3.303080347905052e-06, + "learning_rate": 0.2095829954534323, + "loss": 0.0, + "num_input_tokens_seen": 25360352, + "step": 14800 + }, + { + "epoch": 71.84261501210653, + "eval_loss": 0.7408377528190613, + "eval_runtime": 4.6115, + "eval_samples_per_second": 79.583, + "eval_steps_per_second": 19.95, + "num_input_tokens_seen": 25360352, + "step": 14800 + }, + { + "epoch": 71.86682808716706, + "grad_norm": 3.5441460113361245e-06, + "learning_rate": 0.2095289324883114, + "loss": 0.0, + "num_input_tokens_seen": 25369152, + "step": 14805 + }, + { + "epoch": 71.89104116222761, + "grad_norm": 3.84112126994296e-06, + "learning_rate": 0.20947486034308097, + "loss": 0.0, + "num_input_tokens_seen": 25377760, + "step": 14810 + }, + { + "epoch": 71.91525423728814, + "grad_norm": 3.5272460081614554e-06, + "learning_rate": 0.2094207790260797, + "loss": 0.0, + "num_input_tokens_seen": 25386144, + "step": 14815 + }, + { + "epoch": 71.93946731234867, + "grad_norm": 3.9292795008805115e-06, + "learning_rate": 0.20936668854564758, + "loss": 0.0, + "num_input_tokens_seen": 25394528, + "step": 14820 + }, + { + "epoch": 71.9636803874092, + "grad_norm": 2.91099331661826e-06, + "learning_rate": 0.20931258891012602, + "loss": 0.0, + "num_input_tokens_seen": 25403104, + "step": 14825 + }, + { + "epoch": 71.98789346246973, + "grad_norm": 1.6827468698465964e-06, + "learning_rate": 0.20925848012785792, + "loss": 0.0, + "num_input_tokens_seen": 25411328, + "step": 14830 + }, + { + "epoch": 72.01452784503633, + "grad_norm": 2.7449868866824545e-06, + "learning_rate": 0.20920436220718747, + "loss": 0.0, + "num_input_tokens_seen": 25420480, + "step": 14835 + }, + { + "epoch": 72.03874092009686, + "grad_norm": 1.3083439398542396e-06, + "learning_rate": 0.20915023515646033, + "loss": 0.0, + "num_input_tokens_seen": 25429088, + "step": 14840 + }, + { + "epoch": 72.06295399515739, + "grad_norm": 1.758980033628177e-06, + "learning_rate": 0.20909609898402368, + "loss": 0.0, + "num_input_tokens_seen": 25437792, + "step": 14845 + }, + { + "epoch": 72.08716707021792, + "grad_norm": 1.7043360003299313e-06, + "learning_rate": 0.2090419536982258, + "loss": 0.0, + "num_input_tokens_seen": 25446304, + "step": 14850 + }, + { + "epoch": 72.11138014527845, + "grad_norm": 7.070880201354157e-06, + "learning_rate": 0.2089877993074168, + "loss": 0.0, + "num_input_tokens_seen": 25454688, + "step": 14855 + }, + { + "epoch": 72.13559322033899, + "grad_norm": 4.714990154752741e-06, + "learning_rate": 0.20893363581994784, + "loss": 0.0, + "num_input_tokens_seen": 25463264, + "step": 14860 + }, + { + "epoch": 72.15980629539952, + "grad_norm": 5.4102019930724055e-06, + "learning_rate": 0.2088794632441716, + "loss": 0.0, + "num_input_tokens_seen": 25471552, + "step": 14865 + }, + { + "epoch": 72.18401937046005, + "grad_norm": 3.2250388812826714e-06, + "learning_rate": 0.20882528158844219, + "loss": 0.0, + "num_input_tokens_seen": 25480096, + "step": 14870 + }, + { + "epoch": 72.20823244552058, + "grad_norm": 3.017977860508836e-06, + "learning_rate": 0.20877109086111514, + "loss": 0.0, + "num_input_tokens_seen": 25488832, + "step": 14875 + }, + { + "epoch": 72.23244552058111, + "grad_norm": 5.0834278226830065e-06, + "learning_rate": 0.2087168910705473, + "loss": 0.0, + "num_input_tokens_seen": 25496928, + "step": 14880 + }, + { + "epoch": 72.25665859564165, + "grad_norm": 1.4236203469408792e-06, + "learning_rate": 0.208662682225097, + "loss": 0.0, + "num_input_tokens_seen": 25505952, + "step": 14885 + }, + { + "epoch": 72.28087167070218, + "grad_norm": 3.3904614156199386e-06, + "learning_rate": 0.2086084643331239, + "loss": 0.0, + "num_input_tokens_seen": 25514560, + "step": 14890 + }, + { + "epoch": 72.30508474576271, + "grad_norm": 2.4447911073366413e-06, + "learning_rate": 0.20855423740298906, + "loss": 0.0, + "num_input_tokens_seen": 25523040, + "step": 14895 + }, + { + "epoch": 72.32929782082324, + "grad_norm": 3.732456661964534e-06, + "learning_rate": 0.208500001443055, + "loss": 0.0, + "num_input_tokens_seen": 25531200, + "step": 14900 + }, + { + "epoch": 72.35351089588377, + "grad_norm": 2.4969192509161076e-06, + "learning_rate": 0.20844575646168553, + "loss": 0.0, + "num_input_tokens_seen": 25539456, + "step": 14905 + }, + { + "epoch": 72.37772397094432, + "grad_norm": 2.7041409111916437e-07, + "learning_rate": 0.20839150246724594, + "loss": 0.0, + "num_input_tokens_seen": 25548160, + "step": 14910 + }, + { + "epoch": 72.40193704600485, + "grad_norm": 2.0940137801517267e-06, + "learning_rate": 0.20833723946810287, + "loss": 0.0, + "num_input_tokens_seen": 25556672, + "step": 14915 + }, + { + "epoch": 72.42615012106538, + "grad_norm": 1.9637900550151244e-06, + "learning_rate": 0.20828296747262437, + "loss": 0.0, + "num_input_tokens_seen": 25564928, + "step": 14920 + }, + { + "epoch": 72.4503631961259, + "grad_norm": 2.071855305985082e-06, + "learning_rate": 0.20822868648917986, + "loss": 0.0, + "num_input_tokens_seen": 25573568, + "step": 14925 + }, + { + "epoch": 72.47457627118644, + "grad_norm": 1.940440597536508e-06, + "learning_rate": 0.20817439652614017, + "loss": 0.0, + "num_input_tokens_seen": 25581792, + "step": 14930 + }, + { + "epoch": 72.49878934624698, + "grad_norm": 8.171522836164513e-07, + "learning_rate": 0.20812009759187744, + "loss": 0.0, + "num_input_tokens_seen": 25590432, + "step": 14935 + }, + { + "epoch": 72.52300242130751, + "grad_norm": 3.5880107134289574e-06, + "learning_rate": 0.2080657896947653, + "loss": 0.0, + "num_input_tokens_seen": 25599168, + "step": 14940 + }, + { + "epoch": 72.54721549636804, + "grad_norm": 2.788937081277254e-06, + "learning_rate": 0.2080114728431787, + "loss": 0.0, + "num_input_tokens_seen": 25607936, + "step": 14945 + }, + { + "epoch": 72.57142857142857, + "grad_norm": 9.405314926880237e-07, + "learning_rate": 0.20795714704549392, + "loss": 0.0, + "num_input_tokens_seen": 25616672, + "step": 14950 + }, + { + "epoch": 72.5956416464891, + "grad_norm": 3.5573452805692796e-06, + "learning_rate": 0.20790281231008875, + "loss": 0.0, + "num_input_tokens_seen": 25625248, + "step": 14955 + }, + { + "epoch": 72.61985472154964, + "grad_norm": 1.4067304618947674e-06, + "learning_rate": 0.20784846864534226, + "loss": 0.0, + "num_input_tokens_seen": 25633568, + "step": 14960 + }, + { + "epoch": 72.64406779661017, + "grad_norm": 1.7436949519833433e-06, + "learning_rate": 0.20779411605963496, + "loss": 0.0, + "num_input_tokens_seen": 25642112, + "step": 14965 + }, + { + "epoch": 72.6682808716707, + "grad_norm": 3.1879251309874235e-06, + "learning_rate": 0.2077397545613487, + "loss": 0.0, + "num_input_tokens_seen": 25650496, + "step": 14970 + }, + { + "epoch": 72.69249394673123, + "grad_norm": 4.077821813552873e-06, + "learning_rate": 0.20768538415886661, + "loss": 0.0, + "num_input_tokens_seen": 25658912, + "step": 14975 + }, + { + "epoch": 72.71670702179176, + "grad_norm": 2.0927298010064987e-06, + "learning_rate": 0.20763100486057343, + "loss": 0.0, + "num_input_tokens_seen": 25667424, + "step": 14980 + }, + { + "epoch": 72.7409200968523, + "grad_norm": 1.9136130049446365e-06, + "learning_rate": 0.20757661667485502, + "loss": 0.0, + "num_input_tokens_seen": 25675872, + "step": 14985 + }, + { + "epoch": 72.76513317191284, + "grad_norm": 1.4420234037970658e-06, + "learning_rate": 0.2075222196100988, + "loss": 0.0, + "num_input_tokens_seen": 25684512, + "step": 14990 + }, + { + "epoch": 72.78934624697337, + "grad_norm": 4.467447524802992e-06, + "learning_rate": 0.20746781367469344, + "loss": 0.0, + "num_input_tokens_seen": 25692896, + "step": 14995 + }, + { + "epoch": 72.8135593220339, + "grad_norm": 8.590302968514152e-06, + "learning_rate": 0.207413398877029, + "loss": 0.0, + "num_input_tokens_seen": 25701344, + "step": 15000 + }, + { + "epoch": 72.8135593220339, + "eval_loss": 0.7472565770149231, + "eval_runtime": 4.6282, + "eval_samples_per_second": 79.296, + "eval_steps_per_second": 19.878, + "num_input_tokens_seen": 25701344, + "step": 15000 + }, + { + "epoch": 72.83777239709443, + "grad_norm": 5.464864443638362e-06, + "learning_rate": 0.20735897522549698, + "loss": 0.0, + "num_input_tokens_seen": 25710304, + "step": 15005 + }, + { + "epoch": 72.86198547215497, + "grad_norm": 1.8623096593728405e-06, + "learning_rate": 0.2073045427284902, + "loss": 0.0, + "num_input_tokens_seen": 25719392, + "step": 15010 + }, + { + "epoch": 72.8861985472155, + "grad_norm": 3.18052093462029e-06, + "learning_rate": 0.2072501013944027, + "loss": 0.0, + "num_input_tokens_seen": 25728032, + "step": 15015 + }, + { + "epoch": 72.91041162227603, + "grad_norm": 1.4966379922043416e-06, + "learning_rate": 0.20719565123163017, + "loss": 0.0, + "num_input_tokens_seen": 25736448, + "step": 15020 + }, + { + "epoch": 72.93462469733656, + "grad_norm": 4.019475909444736e-06, + "learning_rate": 0.20714119224856944, + "loss": 0.0, + "num_input_tokens_seen": 25744832, + "step": 15025 + }, + { + "epoch": 72.95883777239709, + "grad_norm": 5.094246716907946e-06, + "learning_rate": 0.2070867244536188, + "loss": 0.0, + "num_input_tokens_seen": 25753280, + "step": 15030 + }, + { + "epoch": 72.98305084745763, + "grad_norm": 6.899902928125812e-06, + "learning_rate": 0.20703224785517785, + "loss": 0.0, + "num_input_tokens_seen": 25762336, + "step": 15035 + }, + { + "epoch": 73.00968523002422, + "grad_norm": 2.8010715595883084e-06, + "learning_rate": 0.20697776246164754, + "loss": 0.0, + "num_input_tokens_seen": 25771520, + "step": 15040 + }, + { + "epoch": 73.03389830508475, + "grad_norm": 1.1854082231366192e-06, + "learning_rate": 0.2069232682814303, + "loss": 0.0, + "num_input_tokens_seen": 25780288, + "step": 15045 + }, + { + "epoch": 73.05811138014528, + "grad_norm": 4.440621069079498e-06, + "learning_rate": 0.20686876532292972, + "loss": 0.0, + "num_input_tokens_seen": 25788896, + "step": 15050 + }, + { + "epoch": 73.08232445520581, + "grad_norm": 1.3499029591912404e-06, + "learning_rate": 0.20681425359455083, + "loss": 0.0, + "num_input_tokens_seen": 25797472, + "step": 15055 + }, + { + "epoch": 73.10653753026634, + "grad_norm": 4.3557338358368725e-06, + "learning_rate": 0.20675973310470008, + "loss": 0.0, + "num_input_tokens_seen": 25806016, + "step": 15060 + }, + { + "epoch": 73.13075060532688, + "grad_norm": 3.0483492992061656e-06, + "learning_rate": 0.2067052038617852, + "loss": 0.0, + "num_input_tokens_seen": 25814848, + "step": 15065 + }, + { + "epoch": 73.15496368038741, + "grad_norm": 3.1282090731110657e-06, + "learning_rate": 0.2066506658742153, + "loss": 0.0, + "num_input_tokens_seen": 25822944, + "step": 15070 + }, + { + "epoch": 73.17917675544794, + "grad_norm": 1.577197963342769e-06, + "learning_rate": 0.20659611915040077, + "loss": 0.0, + "num_input_tokens_seen": 25831296, + "step": 15075 + }, + { + "epoch": 73.20338983050847, + "grad_norm": 4.45284149463987e-06, + "learning_rate": 0.20654156369875348, + "loss": 0.0, + "num_input_tokens_seen": 25840064, + "step": 15080 + }, + { + "epoch": 73.227602905569, + "grad_norm": 1.606260070730059e-06, + "learning_rate": 0.20648699952768648, + "loss": 0.0, + "num_input_tokens_seen": 25848768, + "step": 15085 + }, + { + "epoch": 73.25181598062954, + "grad_norm": 3.0593591873184778e-06, + "learning_rate": 0.20643242664561437, + "loss": 0.0, + "num_input_tokens_seen": 25857248, + "step": 15090 + }, + { + "epoch": 73.27602905569007, + "grad_norm": 2.3366758341580862e-06, + "learning_rate": 0.20637784506095277, + "loss": 0.0, + "num_input_tokens_seen": 25865216, + "step": 15095 + }, + { + "epoch": 73.3002421307506, + "grad_norm": 8.545654054614715e-06, + "learning_rate": 0.20632325478211908, + "loss": 0.0, + "num_input_tokens_seen": 25873664, + "step": 15100 + }, + { + "epoch": 73.32445520581113, + "grad_norm": 1.1485949471534695e-06, + "learning_rate": 0.20626865581753165, + "loss": 0.0, + "num_input_tokens_seen": 25882464, + "step": 15105 + }, + { + "epoch": 73.34866828087166, + "grad_norm": 2.49288586928742e-06, + "learning_rate": 0.2062140481756104, + "loss": 0.0, + "num_input_tokens_seen": 25891168, + "step": 15110 + }, + { + "epoch": 73.37288135593221, + "grad_norm": 3.3951710065593943e-06, + "learning_rate": 0.20615943186477648, + "loss": 0.0, + "num_input_tokens_seen": 25900096, + "step": 15115 + }, + { + "epoch": 73.39709443099274, + "grad_norm": 1.0939328376480262e-06, + "learning_rate": 0.20610480689345242, + "loss": 0.0, + "num_input_tokens_seen": 25908800, + "step": 15120 + }, + { + "epoch": 73.42130750605327, + "grad_norm": 1.5341054222517414e-06, + "learning_rate": 0.2060501732700621, + "loss": 0.0, + "num_input_tokens_seen": 25917280, + "step": 15125 + }, + { + "epoch": 73.4455205811138, + "grad_norm": 4.500921477301745e-06, + "learning_rate": 0.20599553100303067, + "loss": 0.0, + "num_input_tokens_seen": 25925664, + "step": 15130 + }, + { + "epoch": 73.46973365617433, + "grad_norm": 2.9845255085092504e-06, + "learning_rate": 0.20594088010078465, + "loss": 0.0, + "num_input_tokens_seen": 25934240, + "step": 15135 + }, + { + "epoch": 73.49394673123487, + "grad_norm": 7.202811502793338e-06, + "learning_rate": 0.20588622057175196, + "loss": 0.0, + "num_input_tokens_seen": 25942816, + "step": 15140 + }, + { + "epoch": 73.5181598062954, + "grad_norm": 3.114542096227524e-06, + "learning_rate": 0.20583155242436177, + "loss": 0.0, + "num_input_tokens_seen": 25951200, + "step": 15145 + }, + { + "epoch": 73.54237288135593, + "grad_norm": 4.739897576655494e-06, + "learning_rate": 0.20577687566704453, + "loss": 0.0, + "num_input_tokens_seen": 25959552, + "step": 15150 + }, + { + "epoch": 73.56658595641646, + "grad_norm": 4.962979005540546e-07, + "learning_rate": 0.20572219030823213, + "loss": 0.0, + "num_input_tokens_seen": 25967872, + "step": 15155 + }, + { + "epoch": 73.59079903147699, + "grad_norm": 2.3253367089637322e-06, + "learning_rate": 0.20566749635635775, + "loss": 0.0, + "num_input_tokens_seen": 25976224, + "step": 15160 + }, + { + "epoch": 73.61501210653753, + "grad_norm": 2.1060434391984018e-06, + "learning_rate": 0.20561279381985587, + "loss": 0.0, + "num_input_tokens_seen": 25985248, + "step": 15165 + }, + { + "epoch": 73.63922518159806, + "grad_norm": 3.4704951303865528e-06, + "learning_rate": 0.2055580827071623, + "loss": 0.0, + "num_input_tokens_seen": 25994240, + "step": 15170 + }, + { + "epoch": 73.6634382566586, + "grad_norm": 1.2041541594953742e-06, + "learning_rate": 0.20550336302671418, + "loss": 0.0, + "num_input_tokens_seen": 26003200, + "step": 15175 + }, + { + "epoch": 73.68765133171912, + "grad_norm": 3.3955404887819896e-06, + "learning_rate": 0.20544863478695, + "loss": 0.0, + "num_input_tokens_seen": 26012000, + "step": 15180 + }, + { + "epoch": 73.71186440677967, + "grad_norm": 1.3987086049382924e-06, + "learning_rate": 0.20539389799630953, + "loss": 0.0, + "num_input_tokens_seen": 26020416, + "step": 15185 + }, + { + "epoch": 73.7360774818402, + "grad_norm": 2.3709233119006967e-06, + "learning_rate": 0.20533915266323388, + "loss": 0.0, + "num_input_tokens_seen": 26029024, + "step": 15190 + }, + { + "epoch": 73.76029055690073, + "grad_norm": 2.0872159893770004e-06, + "learning_rate": 0.20528439879616542, + "loss": 0.0, + "num_input_tokens_seen": 26037344, + "step": 15195 + }, + { + "epoch": 73.78450363196126, + "grad_norm": 3.6753654057974927e-06, + "learning_rate": 0.20522963640354794, + "loss": 0.0, + "num_input_tokens_seen": 26046016, + "step": 15200 + }, + { + "epoch": 73.78450363196126, + "eval_loss": 0.7552339434623718, + "eval_runtime": 4.6299, + "eval_samples_per_second": 79.267, + "eval_steps_per_second": 19.871, + "num_input_tokens_seen": 26046016, + "step": 15200 + }, + { + "epoch": 73.80871670702179, + "grad_norm": 3.037433543795487e-06, + "learning_rate": 0.20517486549382644, + "loss": 0.0, + "num_input_tokens_seen": 26054656, + "step": 15205 + }, + { + "epoch": 73.83292978208233, + "grad_norm": 3.922946689272067e-06, + "learning_rate": 0.20512008607544735, + "loss": 0.0, + "num_input_tokens_seen": 26063104, + "step": 15210 + }, + { + "epoch": 73.85714285714286, + "grad_norm": 3.2212001315201633e-06, + "learning_rate": 0.20506529815685826, + "loss": 0.0, + "num_input_tokens_seen": 26071424, + "step": 15215 + }, + { + "epoch": 73.88135593220339, + "grad_norm": 4.342701231507817e-06, + "learning_rate": 0.2050105017465082, + "loss": 0.0, + "num_input_tokens_seen": 26079584, + "step": 15220 + }, + { + "epoch": 73.90556900726392, + "grad_norm": 4.682328381022671e-06, + "learning_rate": 0.20495569685284754, + "loss": 0.0, + "num_input_tokens_seen": 26087744, + "step": 15225 + }, + { + "epoch": 73.92978208232445, + "grad_norm": 1.6519626342414995e-06, + "learning_rate": 0.20490088348432778, + "loss": 0.0, + "num_input_tokens_seen": 26096736, + "step": 15230 + }, + { + "epoch": 73.953995157385, + "grad_norm": 1.7914442196342861e-06, + "learning_rate": 0.2048460616494018, + "loss": 0.0, + "num_input_tokens_seen": 26105184, + "step": 15235 + }, + { + "epoch": 73.97820823244552, + "grad_norm": 2.1835944608028512e-06, + "learning_rate": 0.2047912313565239, + "loss": 0.0, + "num_input_tokens_seen": 26113440, + "step": 15240 + }, + { + "epoch": 74.00484261501211, + "grad_norm": 1.0400934115750715e-05, + "learning_rate": 0.20473639261414958, + "loss": 0.0, + "num_input_tokens_seen": 26122272, + "step": 15245 + }, + { + "epoch": 74.02905569007264, + "grad_norm": 3.118836957582971e-06, + "learning_rate": 0.2046815454307357, + "loss": 0.0, + "num_input_tokens_seen": 26131168, + "step": 15250 + }, + { + "epoch": 74.05326876513317, + "grad_norm": 5.497107849805616e-06, + "learning_rate": 0.20462668981474028, + "loss": 0.0, + "num_input_tokens_seen": 26139648, + "step": 15255 + }, + { + "epoch": 74.0774818401937, + "grad_norm": 1.4454718666456756e-06, + "learning_rate": 0.20457182577462288, + "loss": 0.0, + "num_input_tokens_seen": 26148384, + "step": 15260 + }, + { + "epoch": 74.10169491525424, + "grad_norm": 2.692977204787894e-06, + "learning_rate": 0.2045169533188441, + "loss": 0.0, + "num_input_tokens_seen": 26156832, + "step": 15265 + }, + { + "epoch": 74.12590799031477, + "grad_norm": 1.3176800166547764e-06, + "learning_rate": 0.20446207245586603, + "loss": 0.0, + "num_input_tokens_seen": 26165632, + "step": 15270 + }, + { + "epoch": 74.1501210653753, + "grad_norm": 3.19651076097216e-06, + "learning_rate": 0.20440718319415196, + "loss": 0.0, + "num_input_tokens_seen": 26174496, + "step": 15275 + }, + { + "epoch": 74.17433414043583, + "grad_norm": 5.262594413579791e-07, + "learning_rate": 0.20435228554216653, + "loss": 0.0, + "num_input_tokens_seen": 26182560, + "step": 15280 + }, + { + "epoch": 74.19854721549636, + "grad_norm": 1.5885345874266932e-06, + "learning_rate": 0.20429737950837565, + "loss": 0.0, + "num_input_tokens_seen": 26190912, + "step": 15285 + }, + { + "epoch": 74.2227602905569, + "grad_norm": 3.0264548058767105e-06, + "learning_rate": 0.20424246510124647, + "loss": 0.0, + "num_input_tokens_seen": 26199584, + "step": 15290 + }, + { + "epoch": 74.24697336561744, + "grad_norm": 2.7285143460176187e-06, + "learning_rate": 0.20418754232924755, + "loss": 0.0, + "num_input_tokens_seen": 26208096, + "step": 15295 + }, + { + "epoch": 74.27118644067797, + "grad_norm": 3.7162187709327554e-06, + "learning_rate": 0.20413261120084863, + "loss": 0.0, + "num_input_tokens_seen": 26216832, + "step": 15300 + }, + { + "epoch": 74.2953995157385, + "grad_norm": 3.883023964590393e-06, + "learning_rate": 0.2040776717245208, + "loss": 0.0, + "num_input_tokens_seen": 26225440, + "step": 15305 + }, + { + "epoch": 74.31961259079903, + "grad_norm": 6.759215921192663e-06, + "learning_rate": 0.2040227239087364, + "loss": 0.0, + "num_input_tokens_seen": 26234112, + "step": 15310 + }, + { + "epoch": 74.34382566585957, + "grad_norm": 2.3514269287261413e-06, + "learning_rate": 0.20396776776196904, + "loss": 0.0, + "num_input_tokens_seen": 26242240, + "step": 15315 + }, + { + "epoch": 74.3680387409201, + "grad_norm": 2.232528913737042e-06, + "learning_rate": 0.20391280329269373, + "loss": 0.0, + "num_input_tokens_seen": 26250816, + "step": 15320 + }, + { + "epoch": 74.39225181598063, + "grad_norm": 1.6836040686030174e-06, + "learning_rate": 0.20385783050938663, + "loss": 0.0, + "num_input_tokens_seen": 26259424, + "step": 15325 + }, + { + "epoch": 74.41646489104116, + "grad_norm": 4.897215603705263e-06, + "learning_rate": 0.20380284942052526, + "loss": 0.0, + "num_input_tokens_seen": 26268224, + "step": 15330 + }, + { + "epoch": 74.44067796610169, + "grad_norm": 1.977934971364448e-06, + "learning_rate": 0.2037478600345884, + "loss": 0.0, + "num_input_tokens_seen": 26276736, + "step": 15335 + }, + { + "epoch": 74.46489104116223, + "grad_norm": 5.789742317574564e-06, + "learning_rate": 0.20369286236005604, + "loss": 0.0, + "num_input_tokens_seen": 26285312, + "step": 15340 + }, + { + "epoch": 74.48910411622276, + "grad_norm": 1.7212051943715778e-06, + "learning_rate": 0.20363785640540957, + "loss": 0.0, + "num_input_tokens_seen": 26293984, + "step": 15345 + }, + { + "epoch": 74.51331719128329, + "grad_norm": 2.629741857163026e-06, + "learning_rate": 0.2035828421791316, + "loss": 0.0, + "num_input_tokens_seen": 26302688, + "step": 15350 + }, + { + "epoch": 74.53753026634382, + "grad_norm": 1.1780608701883466e-06, + "learning_rate": 0.20352781968970599, + "loss": 0.0, + "num_input_tokens_seen": 26310944, + "step": 15355 + }, + { + "epoch": 74.56174334140435, + "grad_norm": 1.8922934259535396e-06, + "learning_rate": 0.2034727889456179, + "loss": 0.0, + "num_input_tokens_seen": 26319776, + "step": 15360 + }, + { + "epoch": 74.5859564164649, + "grad_norm": 1.4299113217930426e-06, + "learning_rate": 0.2034177499553538, + "loss": 0.0, + "num_input_tokens_seen": 26328192, + "step": 15365 + }, + { + "epoch": 74.61016949152543, + "grad_norm": 6.278281148297538e-07, + "learning_rate": 0.2033627027274014, + "loss": 0.0, + "num_input_tokens_seen": 26336896, + "step": 15370 + }, + { + "epoch": 74.63438256658596, + "grad_norm": 1.1977833764831303e-06, + "learning_rate": 0.20330764727024955, + "loss": 0.0, + "num_input_tokens_seen": 26345120, + "step": 15375 + }, + { + "epoch": 74.65859564164649, + "grad_norm": 2.757403535724734e-06, + "learning_rate": 0.20325258359238868, + "loss": 0.0, + "num_input_tokens_seen": 26354176, + "step": 15380 + }, + { + "epoch": 74.68280871670702, + "grad_norm": 3.2625066523905843e-06, + "learning_rate": 0.20319751170231018, + "loss": 0.0, + "num_input_tokens_seen": 26362976, + "step": 15385 + }, + { + "epoch": 74.70702179176756, + "grad_norm": 2.270495315315202e-06, + "learning_rate": 0.2031424316085068, + "loss": 0.0, + "num_input_tokens_seen": 26371264, + "step": 15390 + }, + { + "epoch": 74.73123486682809, + "grad_norm": 1.0648830084392102e-06, + "learning_rate": 0.20308734331947265, + "loss": 0.0, + "num_input_tokens_seen": 26380000, + "step": 15395 + }, + { + "epoch": 74.75544794188862, + "grad_norm": 1.8621642539073946e-06, + "learning_rate": 0.20303224684370305, + "loss": 0.0, + "num_input_tokens_seen": 26388448, + "step": 15400 + }, + { + "epoch": 74.75544794188862, + "eval_loss": 0.7630203366279602, + "eval_runtime": 4.6218, + "eval_samples_per_second": 79.406, + "eval_steps_per_second": 19.906, + "num_input_tokens_seen": 26388448, + "step": 15400 + }, + { + "epoch": 74.77966101694915, + "grad_norm": 3.855132490571123e-06, + "learning_rate": 0.20297714218969456, + "loss": 0.0, + "num_input_tokens_seen": 26397088, + "step": 15405 + }, + { + "epoch": 74.80387409200968, + "grad_norm": 2.200105427618837e-06, + "learning_rate": 0.20292202936594497, + "loss": 0.0, + "num_input_tokens_seen": 26405344, + "step": 15410 + }, + { + "epoch": 74.82808716707022, + "grad_norm": 2.4012065296119545e-06, + "learning_rate": 0.2028669083809534, + "loss": 0.0, + "num_input_tokens_seen": 26413792, + "step": 15415 + }, + { + "epoch": 74.85230024213075, + "grad_norm": 8.704237757228839e-07, + "learning_rate": 0.20281177924322016, + "loss": 0.0, + "num_input_tokens_seen": 26422272, + "step": 15420 + }, + { + "epoch": 74.87651331719128, + "grad_norm": 2.0284637685108464e-06, + "learning_rate": 0.2027566419612469, + "loss": 0.0, + "num_input_tokens_seen": 26430752, + "step": 15425 + }, + { + "epoch": 74.90072639225181, + "grad_norm": 3.873760306305485e-06, + "learning_rate": 0.20270149654353647, + "loss": 0.0, + "num_input_tokens_seen": 26439296, + "step": 15430 + }, + { + "epoch": 74.92493946731234, + "grad_norm": 1.4879177570037427e-06, + "learning_rate": 0.202646342998593, + "loss": 0.0, + "num_input_tokens_seen": 26447744, + "step": 15435 + }, + { + "epoch": 74.94915254237289, + "grad_norm": 7.795035230628855e-07, + "learning_rate": 0.20259118133492185, + "loss": 0.0, + "num_input_tokens_seen": 26456192, + "step": 15440 + }, + { + "epoch": 74.97336561743342, + "grad_norm": 6.088260761316633e-06, + "learning_rate": 0.20253601156102966, + "loss": 0.0, + "num_input_tokens_seen": 26464416, + "step": 15445 + }, + { + "epoch": 74.99757869249395, + "grad_norm": 3.5858065530192107e-06, + "learning_rate": 0.20248083368542422, + "loss": 0.0, + "num_input_tokens_seen": 26473120, + "step": 15450 + }, + { + "epoch": 75.02421307506053, + "grad_norm": 1.3457333807309624e-06, + "learning_rate": 0.2024256477166147, + "loss": 0.0, + "num_input_tokens_seen": 26481984, + "step": 15455 + }, + { + "epoch": 75.04842615012106, + "grad_norm": 1.165846015283023e-06, + "learning_rate": 0.2023704536631115, + "loss": 0.0, + "num_input_tokens_seen": 26490368, + "step": 15460 + }, + { + "epoch": 75.0726392251816, + "grad_norm": 3.893666416843189e-06, + "learning_rate": 0.20231525153342625, + "loss": 0.0, + "num_input_tokens_seen": 26498656, + "step": 15465 + }, + { + "epoch": 75.09685230024213, + "grad_norm": 2.845962171704741e-06, + "learning_rate": 0.20226004133607173, + "loss": 0.0, + "num_input_tokens_seen": 26507616, + "step": 15470 + }, + { + "epoch": 75.12106537530266, + "grad_norm": 2.47867342295649e-06, + "learning_rate": 0.20220482307956214, + "loss": 0.0, + "num_input_tokens_seen": 26516160, + "step": 15475 + }, + { + "epoch": 75.1452784503632, + "grad_norm": 2.241762331323116e-06, + "learning_rate": 0.20214959677241276, + "loss": 0.0, + "num_input_tokens_seen": 26524832, + "step": 15480 + }, + { + "epoch": 75.16949152542372, + "grad_norm": 8.613656063971575e-07, + "learning_rate": 0.20209436242314022, + "loss": 0.0, + "num_input_tokens_seen": 26533216, + "step": 15485 + }, + { + "epoch": 75.19370460048427, + "grad_norm": 3.4758706988213817e-06, + "learning_rate": 0.2020391200402623, + "loss": 0.0, + "num_input_tokens_seen": 26541952, + "step": 15490 + }, + { + "epoch": 75.2179176755448, + "grad_norm": 2.2051642645237735e-06, + "learning_rate": 0.2019838696322981, + "loss": 0.0, + "num_input_tokens_seen": 26550592, + "step": 15495 + }, + { + "epoch": 75.24213075060533, + "grad_norm": 1.4252462960939738e-06, + "learning_rate": 0.20192861120776798, + "loss": 0.0, + "num_input_tokens_seen": 26559296, + "step": 15500 + }, + { + "epoch": 75.26634382566586, + "grad_norm": 2.8575339001690736e-06, + "learning_rate": 0.20187334477519345, + "loss": 0.0, + "num_input_tokens_seen": 26567520, + "step": 15505 + }, + { + "epoch": 75.29055690072639, + "grad_norm": 1.8425341750116786e-06, + "learning_rate": 0.20181807034309726, + "loss": 0.0, + "num_input_tokens_seen": 26576192, + "step": 15510 + }, + { + "epoch": 75.31476997578693, + "grad_norm": 1.9582764707593014e-06, + "learning_rate": 0.2017627879200034, + "loss": 0.0, + "num_input_tokens_seen": 26584576, + "step": 15515 + }, + { + "epoch": 75.33898305084746, + "grad_norm": 8.553524821763858e-07, + "learning_rate": 0.2017074975144372, + "loss": 0.0, + "num_input_tokens_seen": 26592864, + "step": 15520 + }, + { + "epoch": 75.36319612590799, + "grad_norm": 1.6712268688934273e-06, + "learning_rate": 0.20165219913492508, + "loss": 0.0, + "num_input_tokens_seen": 26601376, + "step": 15525 + }, + { + "epoch": 75.38740920096852, + "grad_norm": 3.2697546430426883e-06, + "learning_rate": 0.20159689278999468, + "loss": 0.0, + "num_input_tokens_seen": 26609824, + "step": 15530 + }, + { + "epoch": 75.41162227602905, + "grad_norm": 1.011664039651805e-06, + "learning_rate": 0.20154157848817508, + "loss": 0.0, + "num_input_tokens_seen": 26618272, + "step": 15535 + }, + { + "epoch": 75.4358353510896, + "grad_norm": 1.6862933307493222e-06, + "learning_rate": 0.20148625623799632, + "loss": 0.0, + "num_input_tokens_seen": 26627072, + "step": 15540 + }, + { + "epoch": 75.46004842615012, + "grad_norm": 2.3151687855715863e-06, + "learning_rate": 0.20143092604798984, + "loss": 0.0, + "num_input_tokens_seen": 26635232, + "step": 15545 + }, + { + "epoch": 75.48426150121065, + "grad_norm": 2.420490545773646e-06, + "learning_rate": 0.2013755879266883, + "loss": 0.0, + "num_input_tokens_seen": 26644064, + "step": 15550 + }, + { + "epoch": 75.50847457627118, + "grad_norm": 2.391787347733043e-06, + "learning_rate": 0.20132024188262543, + "loss": 0.0, + "num_input_tokens_seen": 26652320, + "step": 15555 + }, + { + "epoch": 75.53268765133171, + "grad_norm": 4.356637361979665e-07, + "learning_rate": 0.2012648879243363, + "loss": 0.0, + "num_input_tokens_seen": 26661152, + "step": 15560 + }, + { + "epoch": 75.55690072639226, + "grad_norm": 5.762280579801882e-06, + "learning_rate": 0.20120952606035725, + "loss": 0.0, + "num_input_tokens_seen": 26669824, + "step": 15565 + }, + { + "epoch": 75.58111380145279, + "grad_norm": 1.8797956045091269e-06, + "learning_rate": 0.20115415629922576, + "loss": 0.0, + "num_input_tokens_seen": 26678592, + "step": 15570 + }, + { + "epoch": 75.60532687651332, + "grad_norm": 3.5890714116249e-06, + "learning_rate": 0.20109877864948048, + "loss": 0.0, + "num_input_tokens_seen": 26686944, + "step": 15575 + }, + { + "epoch": 75.62953995157385, + "grad_norm": 2.0783700165338814e-06, + "learning_rate": 0.20104339311966138, + "loss": 0.0, + "num_input_tokens_seen": 26695520, + "step": 15580 + }, + { + "epoch": 75.65375302663438, + "grad_norm": 2.4447906525892904e-06, + "learning_rate": 0.2009879997183097, + "loss": 0.0, + "num_input_tokens_seen": 26704224, + "step": 15585 + }, + { + "epoch": 75.67796610169492, + "grad_norm": 3.6571211694536032e-06, + "learning_rate": 0.20093259845396763, + "loss": 0.0, + "num_input_tokens_seen": 26712672, + "step": 15590 + }, + { + "epoch": 75.70217917675545, + "grad_norm": 9.773561941983644e-07, + "learning_rate": 0.20087718933517884, + "loss": 0.0, + "num_input_tokens_seen": 26721184, + "step": 15595 + }, + { + "epoch": 75.72639225181598, + "grad_norm": 1.130152668338269e-06, + "learning_rate": 0.20082177237048807, + "loss": 0.0, + "num_input_tokens_seen": 26729856, + "step": 15600 + }, + { + "epoch": 75.72639225181598, + "eval_loss": 0.7694434523582458, + "eval_runtime": 4.6185, + "eval_samples_per_second": 79.463, + "eval_steps_per_second": 19.92, + "num_input_tokens_seen": 26729856, + "step": 15600 + }, + { + "epoch": 75.75060532687651, + "grad_norm": 6.451976446442131e-07, + "learning_rate": 0.20076634756844133, + "loss": 0.0, + "num_input_tokens_seen": 26737920, + "step": 15605 + }, + { + "epoch": 75.77481840193704, + "grad_norm": 1.611349262020667e-06, + "learning_rate": 0.20071091493758586, + "loss": 0.0, + "num_input_tokens_seen": 26746720, + "step": 15610 + }, + { + "epoch": 75.79903147699758, + "grad_norm": 4.925542725686682e-06, + "learning_rate": 0.20065547448647003, + "loss": 0.0, + "num_input_tokens_seen": 26755264, + "step": 15615 + }, + { + "epoch": 75.82324455205811, + "grad_norm": 3.3478456771263154e-06, + "learning_rate": 0.20060002622364348, + "loss": 0.0, + "num_input_tokens_seen": 26763616, + "step": 15620 + }, + { + "epoch": 75.84745762711864, + "grad_norm": 3.038078375539044e-06, + "learning_rate": 0.20054457015765695, + "loss": 0.0, + "num_input_tokens_seen": 26772512, + "step": 15625 + }, + { + "epoch": 75.87167070217917, + "grad_norm": 1.4392750244951458e-06, + "learning_rate": 0.20048910629706254, + "loss": 0.0, + "num_input_tokens_seen": 26781088, + "step": 15630 + }, + { + "epoch": 75.8958837772397, + "grad_norm": 2.38936695495795e-06, + "learning_rate": 0.20043363465041347, + "loss": 0.0, + "num_input_tokens_seen": 26789888, + "step": 15635 + }, + { + "epoch": 75.92009685230025, + "grad_norm": 1.381489369123301e-06, + "learning_rate": 0.2003781552262641, + "loss": 0.0, + "num_input_tokens_seen": 26798496, + "step": 15640 + }, + { + "epoch": 75.94430992736078, + "grad_norm": 1.0674787063180702e-06, + "learning_rate": 0.20032266803317014, + "loss": 0.0, + "num_input_tokens_seen": 26807168, + "step": 15645 + }, + { + "epoch": 75.9685230024213, + "grad_norm": 2.3024631445878185e-06, + "learning_rate": 0.2002671730796884, + "loss": 0.0, + "num_input_tokens_seen": 26815520, + "step": 15650 + }, + { + "epoch": 75.99273607748184, + "grad_norm": 2.76711239166616e-06, + "learning_rate": 0.20021167037437684, + "loss": 0.0, + "num_input_tokens_seen": 26823968, + "step": 15655 + }, + { + "epoch": 76.01937046004842, + "grad_norm": 2.089388317472185e-06, + "learning_rate": 0.20015615992579472, + "loss": 0.0, + "num_input_tokens_seen": 26832640, + "step": 15660 + }, + { + "epoch": 76.04358353510897, + "grad_norm": 5.604726425190165e-07, + "learning_rate": 0.20010064174250244, + "loss": 0.0, + "num_input_tokens_seen": 26841248, + "step": 15665 + }, + { + "epoch": 76.0677966101695, + "grad_norm": 1.8325047221878776e-06, + "learning_rate": 0.2000451158330616, + "loss": 0.0, + "num_input_tokens_seen": 26849824, + "step": 15670 + }, + { + "epoch": 76.09200968523002, + "grad_norm": 2.8041120003763353e-06, + "learning_rate": 0.199989582206035, + "loss": 0.0, + "num_input_tokens_seen": 26858304, + "step": 15675 + }, + { + "epoch": 76.11622276029055, + "grad_norm": 1.1505554766699788e-06, + "learning_rate": 0.1999340408699866, + "loss": 0.0, + "num_input_tokens_seen": 26866880, + "step": 15680 + }, + { + "epoch": 76.14043583535108, + "grad_norm": 4.3337527131370734e-06, + "learning_rate": 0.19987849183348155, + "loss": 0.0, + "num_input_tokens_seen": 26875648, + "step": 15685 + }, + { + "epoch": 76.16464891041163, + "grad_norm": 1.1017681345038e-06, + "learning_rate": 0.19982293510508628, + "loss": 0.0, + "num_input_tokens_seen": 26884448, + "step": 15690 + }, + { + "epoch": 76.18886198547216, + "grad_norm": 2.4180699256248772e-06, + "learning_rate": 0.19976737069336833, + "loss": 0.0, + "num_input_tokens_seen": 26893088, + "step": 15695 + }, + { + "epoch": 76.21307506053269, + "grad_norm": 1.2086774177078041e-06, + "learning_rate": 0.1997117986068964, + "loss": 0.0, + "num_input_tokens_seen": 26901632, + "step": 15700 + }, + { + "epoch": 76.23728813559322, + "grad_norm": 3.172358219671878e-06, + "learning_rate": 0.19965621885424037, + "loss": 0.0, + "num_input_tokens_seen": 26910176, + "step": 15705 + }, + { + "epoch": 76.26150121065375, + "grad_norm": 3.983264832640998e-06, + "learning_rate": 0.19960063144397142, + "loss": 0.0, + "num_input_tokens_seen": 26918464, + "step": 15710 + }, + { + "epoch": 76.28571428571429, + "grad_norm": 2.949550207631546e-06, + "learning_rate": 0.19954503638466176, + "loss": 0.0, + "num_input_tokens_seen": 26926848, + "step": 15715 + }, + { + "epoch": 76.30992736077482, + "grad_norm": 1.8004594721787726e-06, + "learning_rate": 0.1994894336848848, + "loss": 0.0, + "num_input_tokens_seen": 26935648, + "step": 15720 + }, + { + "epoch": 76.33414043583535, + "grad_norm": 3.266758085374022e-06, + "learning_rate": 0.1994338233532153, + "loss": 0.0, + "num_input_tokens_seen": 26944256, + "step": 15725 + }, + { + "epoch": 76.35835351089588, + "grad_norm": 1.1418602525736787e-06, + "learning_rate": 0.19937820539822904, + "loss": 0.0, + "num_input_tokens_seen": 26952640, + "step": 15730 + }, + { + "epoch": 76.38256658595641, + "grad_norm": 2.920815802553989e-07, + "learning_rate": 0.199322579828503, + "loss": 0.0, + "num_input_tokens_seen": 26961152, + "step": 15735 + }, + { + "epoch": 76.40677966101696, + "grad_norm": 2.0410022898431635e-06, + "learning_rate": 0.19926694665261527, + "loss": 0.0, + "num_input_tokens_seen": 26969696, + "step": 15740 + }, + { + "epoch": 76.43099273607749, + "grad_norm": 1.0194854667133768e-06, + "learning_rate": 0.19921130587914526, + "loss": 0.0, + "num_input_tokens_seen": 26978688, + "step": 15745 + }, + { + "epoch": 76.45520581113801, + "grad_norm": 2.4103715077217203e-06, + "learning_rate": 0.19915565751667344, + "loss": 0.0, + "num_input_tokens_seen": 26987168, + "step": 15750 + }, + { + "epoch": 76.47941888619854, + "grad_norm": 3.043941433134023e-06, + "learning_rate": 0.19910000157378152, + "loss": 0.0, + "num_input_tokens_seen": 26995712, + "step": 15755 + }, + { + "epoch": 76.50363196125907, + "grad_norm": 2.1529926925722975e-06, + "learning_rate": 0.1990443380590523, + "loss": 0.0, + "num_input_tokens_seen": 27004000, + "step": 15760 + }, + { + "epoch": 76.52784503631962, + "grad_norm": 2.944089828815777e-06, + "learning_rate": 0.19898866698106984, + "loss": 0.0, + "num_input_tokens_seen": 27012352, + "step": 15765 + }, + { + "epoch": 76.55205811138015, + "grad_norm": 1.9240842448198237e-06, + "learning_rate": 0.19893298834841933, + "loss": 0.0, + "num_input_tokens_seen": 27021056, + "step": 15770 + }, + { + "epoch": 76.57627118644068, + "grad_norm": 2.3166330720414408e-06, + "learning_rate": 0.19887730216968705, + "loss": 0.0, + "num_input_tokens_seen": 27029504, + "step": 15775 + }, + { + "epoch": 76.60048426150121, + "grad_norm": 1.7194550991916913e-06, + "learning_rate": 0.19882160845346053, + "loss": 0.0, + "num_input_tokens_seen": 27037952, + "step": 15780 + }, + { + "epoch": 76.62469733656174, + "grad_norm": 4.053724182995211e-07, + "learning_rate": 0.1987659072083285, + "loss": 0.0, + "num_input_tokens_seen": 27046336, + "step": 15785 + }, + { + "epoch": 76.64891041162228, + "grad_norm": 3.175082156303688e-06, + "learning_rate": 0.1987101984428807, + "loss": 0.0, + "num_input_tokens_seen": 27055072, + "step": 15790 + }, + { + "epoch": 76.67312348668281, + "grad_norm": 2.7547275749384426e-06, + "learning_rate": 0.19865448216570822, + "loss": 0.0, + "num_input_tokens_seen": 27063456, + "step": 15795 + }, + { + "epoch": 76.69733656174334, + "grad_norm": 2.8736303647747263e-06, + "learning_rate": 0.19859875838540317, + "loss": 0.0, + "num_input_tokens_seen": 27072064, + "step": 15800 + }, + { + "epoch": 76.69733656174334, + "eval_loss": 0.7713584303855896, + "eval_runtime": 4.6305, + "eval_samples_per_second": 79.257, + "eval_steps_per_second": 19.868, + "num_input_tokens_seen": 27072064, + "step": 15800 + }, + { + "epoch": 76.72154963680387, + "grad_norm": 1.962827582246973e-06, + "learning_rate": 0.1985430271105588, + "loss": 0.0, + "num_input_tokens_seen": 27080672, + "step": 15805 + }, + { + "epoch": 76.7457627118644, + "grad_norm": 1.5310890830733115e-06, + "learning_rate": 0.19848728834976961, + "loss": 0.0, + "num_input_tokens_seen": 27089408, + "step": 15810 + }, + { + "epoch": 76.76997578692495, + "grad_norm": 2.1995665520080365e-06, + "learning_rate": 0.19843154211163128, + "loss": 0.0, + "num_input_tokens_seen": 27097792, + "step": 15815 + }, + { + "epoch": 76.79418886198548, + "grad_norm": 2.0656118522310862e-06, + "learning_rate": 0.1983757884047405, + "loss": 0.0, + "num_input_tokens_seen": 27106560, + "step": 15820 + }, + { + "epoch": 76.818401937046, + "grad_norm": 2.8830652354372432e-06, + "learning_rate": 0.1983200272376952, + "loss": 0.0, + "num_input_tokens_seen": 27115168, + "step": 15825 + }, + { + "epoch": 76.84261501210653, + "grad_norm": 6.348197985062143e-07, + "learning_rate": 0.1982642586190945, + "loss": 0.0, + "num_input_tokens_seen": 27124320, + "step": 15830 + }, + { + "epoch": 76.86682808716706, + "grad_norm": 2.27842269850953e-06, + "learning_rate": 0.1982084825575386, + "loss": 0.0, + "num_input_tokens_seen": 27132928, + "step": 15835 + }, + { + "epoch": 76.89104116222761, + "grad_norm": 1.2576822427945444e-06, + "learning_rate": 0.19815269906162883, + "loss": 0.0, + "num_input_tokens_seen": 27141344, + "step": 15840 + }, + { + "epoch": 76.91525423728814, + "grad_norm": 2.279202362842625e-06, + "learning_rate": 0.19809690813996775, + "loss": 0.0, + "num_input_tokens_seen": 27150112, + "step": 15845 + }, + { + "epoch": 76.93946731234867, + "grad_norm": 2.6046507173305145e-06, + "learning_rate": 0.19804110980115905, + "loss": 0.0, + "num_input_tokens_seen": 27158880, + "step": 15850 + }, + { + "epoch": 76.9636803874092, + "grad_norm": 2.8481370009103557e-06, + "learning_rate": 0.19798530405380746, + "loss": 0.0, + "num_input_tokens_seen": 27167072, + "step": 15855 + }, + { + "epoch": 76.98789346246973, + "grad_norm": 1.7757949990482302e-06, + "learning_rate": 0.19792949090651893, + "loss": 0.0, + "num_input_tokens_seen": 27175776, + "step": 15860 + }, + { + "epoch": 77.01452784503633, + "grad_norm": 3.818163349933457e-06, + "learning_rate": 0.19787367036790066, + "loss": 0.0, + "num_input_tokens_seen": 27185024, + "step": 15865 + }, + { + "epoch": 77.03874092009686, + "grad_norm": 1.0954980780297774e-06, + "learning_rate": 0.19781784244656075, + "loss": 0.0, + "num_input_tokens_seen": 27193728, + "step": 15870 + }, + { + "epoch": 77.06295399515739, + "grad_norm": 1.0451875596118043e-06, + "learning_rate": 0.19776200715110864, + "loss": 0.0, + "num_input_tokens_seen": 27202368, + "step": 15875 + }, + { + "epoch": 77.08716707021792, + "grad_norm": 9.53260212099849e-07, + "learning_rate": 0.1977061644901548, + "loss": 0.0, + "num_input_tokens_seen": 27211040, + "step": 15880 + }, + { + "epoch": 77.11138014527845, + "grad_norm": 1.4397872973859194e-06, + "learning_rate": 0.1976503144723109, + "loss": 0.0, + "num_input_tokens_seen": 27219936, + "step": 15885 + }, + { + "epoch": 77.13559322033899, + "grad_norm": 2.003404006245546e-06, + "learning_rate": 0.19759445710618967, + "loss": 0.0, + "num_input_tokens_seen": 27228288, + "step": 15890 + }, + { + "epoch": 77.15980629539952, + "grad_norm": 8.375334346055752e-07, + "learning_rate": 0.19753859240040508, + "loss": 0.0, + "num_input_tokens_seen": 27237056, + "step": 15895 + }, + { + "epoch": 77.18401937046005, + "grad_norm": 1.0741083542598062e-06, + "learning_rate": 0.1974827203635721, + "loss": 0.0, + "num_input_tokens_seen": 27244960, + "step": 15900 + }, + { + "epoch": 77.20823244552058, + "grad_norm": 1.0154830079045496e-06, + "learning_rate": 0.19742684100430694, + "loss": 0.0, + "num_input_tokens_seen": 27253152, + "step": 15905 + }, + { + "epoch": 77.23244552058111, + "grad_norm": 1.7226145700988127e-06, + "learning_rate": 0.19737095433122692, + "loss": 0.0, + "num_input_tokens_seen": 27261664, + "step": 15910 + }, + { + "epoch": 77.25665859564165, + "grad_norm": 1.4143895441520726e-06, + "learning_rate": 0.19731506035295046, + "loss": 0.0, + "num_input_tokens_seen": 27269888, + "step": 15915 + }, + { + "epoch": 77.28087167070218, + "grad_norm": 2.315204937985982e-06, + "learning_rate": 0.19725915907809702, + "loss": 0.0, + "num_input_tokens_seen": 27278688, + "step": 15920 + }, + { + "epoch": 77.30508474576271, + "grad_norm": 3.6344770251162117e-06, + "learning_rate": 0.1972032505152874, + "loss": 0.0, + "num_input_tokens_seen": 27287328, + "step": 15925 + }, + { + "epoch": 77.32929782082324, + "grad_norm": 1.2411612715368392e-06, + "learning_rate": 0.19714733467314338, + "loss": 0.0, + "num_input_tokens_seen": 27295648, + "step": 15930 + }, + { + "epoch": 77.35351089588377, + "grad_norm": 1.0300084340997273e-06, + "learning_rate": 0.19709141156028784, + "loss": 0.0, + "num_input_tokens_seen": 27304032, + "step": 15935 + }, + { + "epoch": 77.37772397094432, + "grad_norm": 2.079226078421925e-06, + "learning_rate": 0.1970354811853448, + "loss": 0.0, + "num_input_tokens_seen": 27312544, + "step": 15940 + }, + { + "epoch": 77.40193704600485, + "grad_norm": 1.0369748224547948e-06, + "learning_rate": 0.19697954355693953, + "loss": 0.0, + "num_input_tokens_seen": 27320896, + "step": 15945 + }, + { + "epoch": 77.42615012106538, + "grad_norm": 2.965223075079848e-06, + "learning_rate": 0.19692359868369827, + "loss": 0.0, + "num_input_tokens_seen": 27329664, + "step": 15950 + }, + { + "epoch": 77.4503631961259, + "grad_norm": 3.1667520943301497e-06, + "learning_rate": 0.1968676465742484, + "loss": 0.0, + "num_input_tokens_seen": 27338176, + "step": 15955 + }, + { + "epoch": 77.47457627118644, + "grad_norm": 1.0704052328947e-06, + "learning_rate": 0.19681168723721845, + "loss": 0.0, + "num_input_tokens_seen": 27346720, + "step": 15960 + }, + { + "epoch": 77.49878934624698, + "grad_norm": 9.900993518385803e-07, + "learning_rate": 0.19675572068123803, + "loss": 0.0, + "num_input_tokens_seen": 27355072, + "step": 15965 + }, + { + "epoch": 77.52300242130751, + "grad_norm": 2.436175236653071e-06, + "learning_rate": 0.19669974691493794, + "loss": 0.0, + "num_input_tokens_seen": 27363680, + "step": 15970 + }, + { + "epoch": 77.54721549636804, + "grad_norm": 2.273436848554411e-06, + "learning_rate": 0.19664376594695002, + "loss": 0.0, + "num_input_tokens_seen": 27372352, + "step": 15975 + }, + { + "epoch": 77.57142857142857, + "grad_norm": 1.8705259208218195e-06, + "learning_rate": 0.19658777778590722, + "loss": 0.0, + "num_input_tokens_seen": 27380736, + "step": 15980 + }, + { + "epoch": 77.5956416464891, + "grad_norm": 2.660408881638432e-06, + "learning_rate": 0.19653178244044364, + "loss": 0.0, + "num_input_tokens_seen": 27389408, + "step": 15985 + }, + { + "epoch": 77.61985472154964, + "grad_norm": 1.2373112667773967e-06, + "learning_rate": 0.19647577991919443, + "loss": 0.0, + "num_input_tokens_seen": 27398336, + "step": 15990 + }, + { + "epoch": 77.64406779661017, + "grad_norm": 3.0784510727244196e-06, + "learning_rate": 0.1964197702307959, + "loss": 0.0, + "num_input_tokens_seen": 27407072, + "step": 15995 + }, + { + "epoch": 77.6682808716707, + "grad_norm": 2.2551305391971255e-06, + "learning_rate": 0.19636375338388545, + "loss": 0.0, + "num_input_tokens_seen": 27415968, + "step": 16000 + }, + { + "epoch": 77.6682808716707, + "eval_loss": 0.786064624786377, + "eval_runtime": 4.6254, + "eval_samples_per_second": 79.345, + "eval_steps_per_second": 19.89, + "num_input_tokens_seen": 27415968, + "step": 16000 + }, + { + "epoch": 77.69249394673123, + "grad_norm": 1.2684166676990571e-06, + "learning_rate": 0.1963077293871016, + "loss": 0.0, + "num_input_tokens_seen": 27424768, + "step": 16005 + }, + { + "epoch": 77.71670702179176, + "grad_norm": 1.3537883205572143e-06, + "learning_rate": 0.19625169824908395, + "loss": 0.0, + "num_input_tokens_seen": 27433120, + "step": 16010 + }, + { + "epoch": 77.7409200968523, + "grad_norm": 3.196086026946432e-06, + "learning_rate": 0.19619565997847319, + "loss": 0.0, + "num_input_tokens_seen": 27441632, + "step": 16015 + }, + { + "epoch": 77.76513317191284, + "grad_norm": 3.4614944866007136e-07, + "learning_rate": 0.19613961458391113, + "loss": 0.0, + "num_input_tokens_seen": 27449728, + "step": 16020 + }, + { + "epoch": 77.78934624697337, + "grad_norm": 2.24720452024485e-06, + "learning_rate": 0.19608356207404065, + "loss": 0.0, + "num_input_tokens_seen": 27458208, + "step": 16025 + }, + { + "epoch": 77.8135593220339, + "grad_norm": 9.570021575200371e-07, + "learning_rate": 0.1960275024575058, + "loss": 0.0, + "num_input_tokens_seen": 27466528, + "step": 16030 + }, + { + "epoch": 77.83777239709443, + "grad_norm": 2.8969827781111235e-06, + "learning_rate": 0.19597143574295164, + "loss": 0.0, + "num_input_tokens_seen": 27475136, + "step": 16035 + }, + { + "epoch": 77.86198547215497, + "grad_norm": 2.146044380424428e-06, + "learning_rate": 0.1959153619390244, + "loss": 0.0, + "num_input_tokens_seen": 27483872, + "step": 16040 + }, + { + "epoch": 77.8861985472155, + "grad_norm": 1.4155191365716746e-06, + "learning_rate": 0.1958592810543713, + "loss": 0.0, + "num_input_tokens_seen": 27492512, + "step": 16045 + }, + { + "epoch": 77.91041162227603, + "grad_norm": 6.271241090871627e-07, + "learning_rate": 0.19580319309764077, + "loss": 0.0, + "num_input_tokens_seen": 27501248, + "step": 16050 + }, + { + "epoch": 77.93462469733656, + "grad_norm": 3.938321242458187e-06, + "learning_rate": 0.1957470980774823, + "loss": 0.0, + "num_input_tokens_seen": 27509984, + "step": 16055 + }, + { + "epoch": 77.95883777239709, + "grad_norm": 2.078441184494295e-06, + "learning_rate": 0.19569099600254639, + "loss": 0.0, + "num_input_tokens_seen": 27518656, + "step": 16060 + }, + { + "epoch": 77.98305084745763, + "grad_norm": 2.5468459625699325e-06, + "learning_rate": 0.1956348868814847, + "loss": 0.0, + "num_input_tokens_seen": 27527040, + "step": 16065 + }, + { + "epoch": 78.00968523002422, + "grad_norm": 6.220245722943218e-07, + "learning_rate": 0.19557877072295, + "loss": 0.0, + "num_input_tokens_seen": 27536288, + "step": 16070 + }, + { + "epoch": 78.03389830508475, + "grad_norm": 1.5907392025837908e-06, + "learning_rate": 0.19552264753559603, + "loss": 0.0, + "num_input_tokens_seen": 27544864, + "step": 16075 + }, + { + "epoch": 78.05811138014528, + "grad_norm": 2.4755981939961202e-06, + "learning_rate": 0.19546651732807774, + "loss": 0.0, + "num_input_tokens_seen": 27553472, + "step": 16080 + }, + { + "epoch": 78.08232445520581, + "grad_norm": 1.6215227560678613e-06, + "learning_rate": 0.19541038010905112, + "loss": 0.0, + "num_input_tokens_seen": 27562080, + "step": 16085 + }, + { + "epoch": 78.10653753026634, + "grad_norm": 9.114953627431532e-07, + "learning_rate": 0.19535423588717324, + "loss": 0.0, + "num_input_tokens_seen": 27570816, + "step": 16090 + }, + { + "epoch": 78.13075060532688, + "grad_norm": 1.4444134421864874e-06, + "learning_rate": 0.19529808467110224, + "loss": 0.0, + "num_input_tokens_seen": 27579104, + "step": 16095 + }, + { + "epoch": 78.15496368038741, + "grad_norm": 9.623595360608306e-07, + "learning_rate": 0.19524192646949734, + "loss": 0.0, + "num_input_tokens_seen": 27587776, + "step": 16100 + }, + { + "epoch": 78.17917675544794, + "grad_norm": 1.3937625453763758e-06, + "learning_rate": 0.19518576129101878, + "loss": 0.0, + "num_input_tokens_seen": 27596512, + "step": 16105 + }, + { + "epoch": 78.20338983050847, + "grad_norm": 1.735836235638999e-06, + "learning_rate": 0.19512958914432804, + "loss": 0.0, + "num_input_tokens_seen": 27605216, + "step": 16110 + }, + { + "epoch": 78.227602905569, + "grad_norm": 2.6632142180460505e-06, + "learning_rate": 0.1950734100380875, + "loss": 0.0, + "num_input_tokens_seen": 27613504, + "step": 16115 + }, + { + "epoch": 78.25181598062954, + "grad_norm": 1.638586695662525e-06, + "learning_rate": 0.19501722398096066, + "loss": 0.0, + "num_input_tokens_seen": 27622208, + "step": 16120 + }, + { + "epoch": 78.27602905569007, + "grad_norm": 1.6414489891758421e-06, + "learning_rate": 0.1949610309816122, + "loss": 0.0, + "num_input_tokens_seen": 27630528, + "step": 16125 + }, + { + "epoch": 78.3002421307506, + "grad_norm": 2.147688064724207e-06, + "learning_rate": 0.1949048310487078, + "loss": 0.0, + "num_input_tokens_seen": 27639232, + "step": 16130 + }, + { + "epoch": 78.32445520581113, + "grad_norm": 1.0919613941950956e-06, + "learning_rate": 0.19484862419091406, + "loss": 0.0, + "num_input_tokens_seen": 27647744, + "step": 16135 + }, + { + "epoch": 78.34866828087166, + "grad_norm": 1.256907353308634e-06, + "learning_rate": 0.19479241041689893, + "loss": 0.0, + "num_input_tokens_seen": 27656352, + "step": 16140 + }, + { + "epoch": 78.37288135593221, + "grad_norm": 2.526829803173314e-06, + "learning_rate": 0.19473618973533116, + "loss": 0.0, + "num_input_tokens_seen": 27664928, + "step": 16145 + }, + { + "epoch": 78.39709443099274, + "grad_norm": 1.5395114587590797e-06, + "learning_rate": 0.19467996215488076, + "loss": 0.0, + "num_input_tokens_seen": 27673280, + "step": 16150 + }, + { + "epoch": 78.42130750605327, + "grad_norm": 1.2737283441310865e-06, + "learning_rate": 0.1946237276842187, + "loss": 0.0, + "num_input_tokens_seen": 27681856, + "step": 16155 + }, + { + "epoch": 78.4455205811138, + "grad_norm": 1.0418571037007496e-06, + "learning_rate": 0.19456748633201712, + "loss": 0.0, + "num_input_tokens_seen": 27690272, + "step": 16160 + }, + { + "epoch": 78.46973365617433, + "grad_norm": 3.291247367087635e-06, + "learning_rate": 0.194511238106949, + "loss": 0.0, + "num_input_tokens_seen": 27698464, + "step": 16165 + }, + { + "epoch": 78.49394673123487, + "grad_norm": 2.3098557448975043e-06, + "learning_rate": 0.19445498301768863, + "loss": 0.0, + "num_input_tokens_seen": 27707296, + "step": 16170 + }, + { + "epoch": 78.5181598062954, + "grad_norm": 1.8019414937953115e-06, + "learning_rate": 0.19439872107291126, + "loss": 0.0, + "num_input_tokens_seen": 27716160, + "step": 16175 + }, + { + "epoch": 78.54237288135593, + "grad_norm": 1.8393326399746002e-06, + "learning_rate": 0.1943424522812931, + "loss": 0.0, + "num_input_tokens_seen": 27724896, + "step": 16180 + }, + { + "epoch": 78.56658595641646, + "grad_norm": 8.111335318972124e-07, + "learning_rate": 0.19428617665151157, + "loss": 0.0, + "num_input_tokens_seen": 27733696, + "step": 16185 + }, + { + "epoch": 78.59079903147699, + "grad_norm": 1.894884007924702e-06, + "learning_rate": 0.19422989419224507, + "loss": 0.0, + "num_input_tokens_seen": 27742432, + "step": 16190 + }, + { + "epoch": 78.61501210653753, + "grad_norm": 1.6914538036871818e-06, + "learning_rate": 0.19417360491217303, + "loss": 0.0, + "num_input_tokens_seen": 27750912, + "step": 16195 + }, + { + "epoch": 78.63922518159806, + "grad_norm": 8.805511129139632e-07, + "learning_rate": 0.19411730881997605, + "loss": 0.0, + "num_input_tokens_seen": 27759520, + "step": 16200 + }, + { + "epoch": 78.63922518159806, + "eval_loss": 0.7884824872016907, + "eval_runtime": 4.6112, + "eval_samples_per_second": 79.589, + "eval_steps_per_second": 19.951, + "num_input_tokens_seen": 27759520, + "step": 16200 + }, + { + "epoch": 78.6634382566586, + "grad_norm": 7.895127964729909e-07, + "learning_rate": 0.1940610059243356, + "loss": 0.0, + "num_input_tokens_seen": 27768000, + "step": 16205 + }, + { + "epoch": 78.68765133171912, + "grad_norm": 1.5596250477756257e-06, + "learning_rate": 0.19400469623393435, + "loss": 0.0, + "num_input_tokens_seen": 27776512, + "step": 16210 + }, + { + "epoch": 78.71186440677967, + "grad_norm": 2.8663284865615424e-06, + "learning_rate": 0.1939483797574559, + "loss": 0.0, + "num_input_tokens_seen": 27784928, + "step": 16215 + }, + { + "epoch": 78.7360774818402, + "grad_norm": 3.1163583571469644e-06, + "learning_rate": 0.19389205650358504, + "loss": 0.0, + "num_input_tokens_seen": 27793152, + "step": 16220 + }, + { + "epoch": 78.76029055690073, + "grad_norm": 3.238591943954816e-06, + "learning_rate": 0.19383572648100747, + "loss": 0.0, + "num_input_tokens_seen": 27801120, + "step": 16225 + }, + { + "epoch": 78.78450363196126, + "grad_norm": 4.703795468685712e-07, + "learning_rate": 0.19377938969841, + "loss": 0.0, + "num_input_tokens_seen": 27809760, + "step": 16230 + }, + { + "epoch": 78.80871670702179, + "grad_norm": 1.7556814100316842e-06, + "learning_rate": 0.1937230461644805, + "loss": 0.0, + "num_input_tokens_seen": 27818144, + "step": 16235 + }, + { + "epoch": 78.83292978208233, + "grad_norm": 2.982561682074447e-06, + "learning_rate": 0.19366669588790777, + "loss": 0.0, + "num_input_tokens_seen": 27826848, + "step": 16240 + }, + { + "epoch": 78.85714285714286, + "grad_norm": 2.25657686314662e-06, + "learning_rate": 0.19361033887738185, + "loss": 0.0, + "num_input_tokens_seen": 27835296, + "step": 16245 + }, + { + "epoch": 78.88135593220339, + "grad_norm": 1.5989272696970147e-06, + "learning_rate": 0.19355397514159361, + "loss": 0.0, + "num_input_tokens_seen": 27844064, + "step": 16250 + }, + { + "epoch": 78.90556900726392, + "grad_norm": 1.2248196981090587e-06, + "learning_rate": 0.19349760468923508, + "loss": 0.0, + "num_input_tokens_seen": 27852640, + "step": 16255 + }, + { + "epoch": 78.92978208232445, + "grad_norm": 1.2240147952979896e-06, + "learning_rate": 0.19344122752899925, + "loss": 0.0, + "num_input_tokens_seen": 27861152, + "step": 16260 + }, + { + "epoch": 78.953995157385, + "grad_norm": 1.3964640857011545e-06, + "learning_rate": 0.1933848436695802, + "loss": 0.0, + "num_input_tokens_seen": 27869760, + "step": 16265 + }, + { + "epoch": 78.97820823244552, + "grad_norm": 1.8710940139499144e-06, + "learning_rate": 0.1933284531196731, + "loss": 0.0, + "num_input_tokens_seen": 27878752, + "step": 16270 + }, + { + "epoch": 79.00484261501211, + "grad_norm": 4.778087259182939e-06, + "learning_rate": 0.19327205588797403, + "loss": 0.0, + "num_input_tokens_seen": 27888032, + "step": 16275 + }, + { + "epoch": 79.02905569007264, + "grad_norm": 3.1206141102302354e-06, + "learning_rate": 0.19321565198318014, + "loss": 0.0, + "num_input_tokens_seen": 27896672, + "step": 16280 + }, + { + "epoch": 79.05326876513317, + "grad_norm": 3.829775039321248e-07, + "learning_rate": 0.1931592414139896, + "loss": 0.0, + "num_input_tokens_seen": 27905088, + "step": 16285 + }, + { + "epoch": 79.0774818401937, + "grad_norm": 3.9946183960637427e-07, + "learning_rate": 0.19310282418910169, + "loss": 0.0, + "num_input_tokens_seen": 27913760, + "step": 16290 + }, + { + "epoch": 79.10169491525424, + "grad_norm": 1.8505797925172374e-06, + "learning_rate": 0.1930464003172166, + "loss": 0.0, + "num_input_tokens_seen": 27922528, + "step": 16295 + }, + { + "epoch": 79.12590799031477, + "grad_norm": 4.7214206233547884e-07, + "learning_rate": 0.19298996980703567, + "loss": 0.0, + "num_input_tokens_seen": 27931456, + "step": 16300 + }, + { + "epoch": 79.1501210653753, + "grad_norm": 1.3253360293674632e-06, + "learning_rate": 0.19293353266726113, + "loss": 0.0, + "num_input_tokens_seen": 27939808, + "step": 16305 + }, + { + "epoch": 79.17433414043583, + "grad_norm": 7.871427101235895e-07, + "learning_rate": 0.19287708890659633, + "loss": 0.0, + "num_input_tokens_seen": 27948096, + "step": 16310 + }, + { + "epoch": 79.19854721549636, + "grad_norm": 7.247195412674046e-07, + "learning_rate": 0.19282063853374556, + "loss": 0.0, + "num_input_tokens_seen": 27956608, + "step": 16315 + }, + { + "epoch": 79.2227602905569, + "grad_norm": 2.05603373615304e-06, + "learning_rate": 0.19276418155741423, + "loss": 0.0, + "num_input_tokens_seen": 27965152, + "step": 16320 + }, + { + "epoch": 79.24697336561744, + "grad_norm": 2.8294075491430704e-06, + "learning_rate": 0.19270771798630867, + "loss": 0.0, + "num_input_tokens_seen": 27973536, + "step": 16325 + }, + { + "epoch": 79.27118644067797, + "grad_norm": 1.669251219027501e-06, + "learning_rate": 0.1926512478291363, + "loss": 0.0, + "num_input_tokens_seen": 27981952, + "step": 16330 + }, + { + "epoch": 79.2953995157385, + "grad_norm": 1.0920185786744696e-06, + "learning_rate": 0.19259477109460557, + "loss": 0.0, + "num_input_tokens_seen": 27990688, + "step": 16335 + }, + { + "epoch": 79.31961259079903, + "grad_norm": 1.0919742408077582e-06, + "learning_rate": 0.19253828779142584, + "loss": 0.0, + "num_input_tokens_seen": 27999136, + "step": 16340 + }, + { + "epoch": 79.34382566585957, + "grad_norm": 9.65142476161418e-07, + "learning_rate": 0.19248179792830755, + "loss": 0.0, + "num_input_tokens_seen": 28007584, + "step": 16345 + }, + { + "epoch": 79.3680387409201, + "grad_norm": 9.156053693004651e-07, + "learning_rate": 0.19242530151396217, + "loss": 0.0, + "num_input_tokens_seen": 28016256, + "step": 16350 + }, + { + "epoch": 79.39225181598063, + "grad_norm": 1.6218905329878908e-06, + "learning_rate": 0.19236879855710215, + "loss": 0.0, + "num_input_tokens_seen": 28024704, + "step": 16355 + }, + { + "epoch": 79.41646489104116, + "grad_norm": 9.884105338642257e-07, + "learning_rate": 0.19231228906644096, + "loss": 0.0, + "num_input_tokens_seen": 28033024, + "step": 16360 + }, + { + "epoch": 79.44067796610169, + "grad_norm": 8.450424502370879e-07, + "learning_rate": 0.19225577305069302, + "loss": 0.0, + "num_input_tokens_seen": 28041632, + "step": 16365 + }, + { + "epoch": 79.46489104116223, + "grad_norm": 3.859557864416274e-07, + "learning_rate": 0.1921992505185739, + "loss": 0.0, + "num_input_tokens_seen": 28050464, + "step": 16370 + }, + { + "epoch": 79.48910411622276, + "grad_norm": 2.4472903987771133e-06, + "learning_rate": 0.19214272147880004, + "loss": 0.0, + "num_input_tokens_seen": 28059264, + "step": 16375 + }, + { + "epoch": 79.51331719128329, + "grad_norm": 1.5083637663337868e-06, + "learning_rate": 0.19208618594008892, + "loss": 0.0, + "num_input_tokens_seen": 28067616, + "step": 16380 + }, + { + "epoch": 79.53753026634382, + "grad_norm": 2.4249645775853423e-06, + "learning_rate": 0.19202964391115904, + "loss": 0.0, + "num_input_tokens_seen": 28075872, + "step": 16385 + }, + { + "epoch": 79.56174334140435, + "grad_norm": 1.5373361748061143e-06, + "learning_rate": 0.1919730954007299, + "loss": 0.0, + "num_input_tokens_seen": 28084512, + "step": 16390 + }, + { + "epoch": 79.5859564164649, + "grad_norm": 1.495564561082574e-06, + "learning_rate": 0.19191654041752199, + "loss": 0.0, + "num_input_tokens_seen": 28092928, + "step": 16395 + }, + { + "epoch": 79.61016949152543, + "grad_norm": 1.8958132841362385e-06, + "learning_rate": 0.19185997897025678, + "loss": 0.0, + "num_input_tokens_seen": 28101632, + "step": 16400 + }, + { + "epoch": 79.61016949152543, + "eval_loss": 0.7974054217338562, + "eval_runtime": 4.6316, + "eval_samples_per_second": 79.239, + "eval_steps_per_second": 19.864, + "num_input_tokens_seen": 28101632, + "step": 16400 + }, + { + "epoch": 79.63438256658596, + "grad_norm": 2.028018343480653e-06, + "learning_rate": 0.19180341106765672, + "loss": 0.0, + "num_input_tokens_seen": 28110016, + "step": 16405 + }, + { + "epoch": 79.65859564164649, + "grad_norm": 9.709463029139442e-07, + "learning_rate": 0.19174683671844536, + "loss": 0.0, + "num_input_tokens_seen": 28118656, + "step": 16410 + }, + { + "epoch": 79.68280871670702, + "grad_norm": 2.474318762324401e-06, + "learning_rate": 0.19169025593134717, + "loss": 0.0, + "num_input_tokens_seen": 28127104, + "step": 16415 + }, + { + "epoch": 79.70702179176756, + "grad_norm": 7.105801955731295e-07, + "learning_rate": 0.19163366871508764, + "loss": 0.0, + "num_input_tokens_seen": 28135872, + "step": 16420 + }, + { + "epoch": 79.73123486682809, + "grad_norm": 2.2515089312946657e-06, + "learning_rate": 0.19157707507839317, + "loss": 0.0, + "num_input_tokens_seen": 28144640, + "step": 16425 + }, + { + "epoch": 79.75544794188862, + "grad_norm": 2.5321796783828177e-06, + "learning_rate": 0.19152047502999123, + "loss": 0.0, + "num_input_tokens_seen": 28153120, + "step": 16430 + }, + { + "epoch": 79.77966101694915, + "grad_norm": 1.2122878843001672e-06, + "learning_rate": 0.19146386857861025, + "loss": 0.0, + "num_input_tokens_seen": 28161376, + "step": 16435 + }, + { + "epoch": 79.80387409200968, + "grad_norm": 1.4418648106584442e-06, + "learning_rate": 0.19140725573297968, + "loss": 0.0, + "num_input_tokens_seen": 28170016, + "step": 16440 + }, + { + "epoch": 79.82808716707022, + "grad_norm": 1.3887603245166247e-06, + "learning_rate": 0.19135063650182987, + "loss": 0.0, + "num_input_tokens_seen": 28178560, + "step": 16445 + }, + { + "epoch": 79.85230024213075, + "grad_norm": 1.5361484884124366e-06, + "learning_rate": 0.19129401089389234, + "loss": 0.0, + "num_input_tokens_seen": 28187488, + "step": 16450 + }, + { + "epoch": 79.87651331719128, + "grad_norm": 1.561278850203962e-06, + "learning_rate": 0.19123737891789938, + "loss": 0.0, + "num_input_tokens_seen": 28196064, + "step": 16455 + }, + { + "epoch": 79.90072639225181, + "grad_norm": 1.1322044883854687e-06, + "learning_rate": 0.19118074058258439, + "loss": 0.0, + "num_input_tokens_seen": 28204800, + "step": 16460 + }, + { + "epoch": 79.92493946731234, + "grad_norm": 1.913394726216211e-06, + "learning_rate": 0.1911240958966816, + "loss": 0.0, + "num_input_tokens_seen": 28213408, + "step": 16465 + }, + { + "epoch": 79.94915254237289, + "grad_norm": 2.8297433800617e-06, + "learning_rate": 0.19106744486892652, + "loss": 0.0, + "num_input_tokens_seen": 28222048, + "step": 16470 + }, + { + "epoch": 79.97336561743342, + "grad_norm": 4.149388246332819e-07, + "learning_rate": 0.1910107875080553, + "loss": 0.0, + "num_input_tokens_seen": 28230432, + "step": 16475 + }, + { + "epoch": 79.99757869249395, + "grad_norm": 3.24127427120402e-07, + "learning_rate": 0.19095412382280533, + "loss": 0.0, + "num_input_tokens_seen": 28238944, + "step": 16480 + }, + { + "epoch": 80.02421307506053, + "grad_norm": 9.023956408782396e-07, + "learning_rate": 0.19089745382191473, + "loss": 0.0, + "num_input_tokens_seen": 28247936, + "step": 16485 + }, + { + "epoch": 80.04842615012106, + "grad_norm": 6.484953019025852e-07, + "learning_rate": 0.19084077751412284, + "loss": 0.0, + "num_input_tokens_seen": 28256736, + "step": 16490 + }, + { + "epoch": 80.0726392251816, + "grad_norm": 7.617529718118021e-07, + "learning_rate": 0.19078409490816986, + "loss": 0.0, + "num_input_tokens_seen": 28264928, + "step": 16495 + }, + { + "epoch": 80.09685230024213, + "grad_norm": 8.557635737815872e-07, + "learning_rate": 0.19072740601279686, + "loss": 0.0, + "num_input_tokens_seen": 28273312, + "step": 16500 + }, + { + "epoch": 80.12106537530266, + "grad_norm": 1.4604767102355254e-06, + "learning_rate": 0.19067071083674605, + "loss": 0.0, + "num_input_tokens_seen": 28282016, + "step": 16505 + }, + { + "epoch": 80.1452784503632, + "grad_norm": 1.0506186072234414e-06, + "learning_rate": 0.19061400938876052, + "loss": 0.0, + "num_input_tokens_seen": 28290368, + "step": 16510 + }, + { + "epoch": 80.16949152542372, + "grad_norm": 2.536031388444826e-06, + "learning_rate": 0.1905573016775844, + "loss": 0.0, + "num_input_tokens_seen": 28299264, + "step": 16515 + }, + { + "epoch": 80.19370460048427, + "grad_norm": 5.530065436687437e-07, + "learning_rate": 0.19050058771196263, + "loss": 0.0, + "num_input_tokens_seen": 28307904, + "step": 16520 + }, + { + "epoch": 80.2179176755448, + "grad_norm": 8.903859338715847e-07, + "learning_rate": 0.19044386750064132, + "loss": 0.0, + "num_input_tokens_seen": 28316320, + "step": 16525 + }, + { + "epoch": 80.24213075060533, + "grad_norm": 2.9313700906641316e-06, + "learning_rate": 0.19038714105236737, + "loss": 0.0, + "num_input_tokens_seen": 28325152, + "step": 16530 + }, + { + "epoch": 80.26634382566586, + "grad_norm": 6.94310870130721e-07, + "learning_rate": 0.19033040837588874, + "loss": 0.0, + "num_input_tokens_seen": 28333984, + "step": 16535 + }, + { + "epoch": 80.29055690072639, + "grad_norm": 2.196726427428075e-06, + "learning_rate": 0.1902736694799543, + "loss": 0.0, + "num_input_tokens_seen": 28342944, + "step": 16540 + }, + { + "epoch": 80.31476997578693, + "grad_norm": 2.47209686676797e-06, + "learning_rate": 0.19021692437331392, + "loss": 0.0, + "num_input_tokens_seen": 28351648, + "step": 16545 + }, + { + "epoch": 80.33898305084746, + "grad_norm": 1.0785837503135554e-06, + "learning_rate": 0.1901601730647184, + "loss": 0.0, + "num_input_tokens_seen": 28360448, + "step": 16550 + }, + { + "epoch": 80.36319612590799, + "grad_norm": 1.9715828329935903e-06, + "learning_rate": 0.19010341556291954, + "loss": 0.0, + "num_input_tokens_seen": 28368928, + "step": 16555 + }, + { + "epoch": 80.38740920096852, + "grad_norm": 1.647828753448266e-06, + "learning_rate": 0.19004665187667, + "loss": 0.0, + "num_input_tokens_seen": 28377600, + "step": 16560 + }, + { + "epoch": 80.41162227602905, + "grad_norm": 1.7062210417861934e-06, + "learning_rate": 0.1899898820147235, + "loss": 0.0, + "num_input_tokens_seen": 28386336, + "step": 16565 + }, + { + "epoch": 80.4358353510896, + "grad_norm": 2.9882730814279057e-06, + "learning_rate": 0.18993310598583465, + "loss": 0.0, + "num_input_tokens_seen": 28394848, + "step": 16570 + }, + { + "epoch": 80.46004842615012, + "grad_norm": 1.392740728078934e-06, + "learning_rate": 0.18987632379875904, + "loss": 0.0, + "num_input_tokens_seen": 28403456, + "step": 16575 + }, + { + "epoch": 80.48426150121065, + "grad_norm": 6.771118705728441e-07, + "learning_rate": 0.18981953546225314, + "loss": 0.0, + "num_input_tokens_seen": 28411872, + "step": 16580 + }, + { + "epoch": 80.50847457627118, + "grad_norm": 1.8999409121533972e-06, + "learning_rate": 0.18976274098507445, + "loss": 0.0, + "num_input_tokens_seen": 28420480, + "step": 16585 + }, + { + "epoch": 80.53268765133171, + "grad_norm": 1.2617449556273641e-06, + "learning_rate": 0.18970594037598146, + "loss": 0.0, + "num_input_tokens_seen": 28429024, + "step": 16590 + }, + { + "epoch": 80.55690072639226, + "grad_norm": 8.954451686804532e-07, + "learning_rate": 0.1896491336437335, + "loss": 0.0, + "num_input_tokens_seen": 28437632, + "step": 16595 + }, + { + "epoch": 80.58111380145279, + "grad_norm": 6.389797704287048e-07, + "learning_rate": 0.18959232079709085, + "loss": 0.0, + "num_input_tokens_seen": 28446208, + "step": 16600 + }, + { + "epoch": 80.58111380145279, + "eval_loss": 0.8027089238166809, + "eval_runtime": 4.6328, + "eval_samples_per_second": 79.218, + "eval_steps_per_second": 19.859, + "num_input_tokens_seen": 28446208, + "step": 16600 + }, + { + "epoch": 80.60532687651332, + "grad_norm": 1.4591811350328499e-06, + "learning_rate": 0.18953550184481477, + "loss": 0.0, + "num_input_tokens_seen": 28454912, + "step": 16605 + }, + { + "epoch": 80.62953995157385, + "grad_norm": 1.9949895886384184e-06, + "learning_rate": 0.18947867679566752, + "loss": 0.0, + "num_input_tokens_seen": 28463712, + "step": 16610 + }, + { + "epoch": 80.65375302663438, + "grad_norm": 1.7503786011729972e-06, + "learning_rate": 0.18942184565841216, + "loss": 0.0, + "num_input_tokens_seen": 28472128, + "step": 16615 + }, + { + "epoch": 80.67796610169492, + "grad_norm": 1.9041835912503302e-06, + "learning_rate": 0.18936500844181278, + "loss": 0.0, + "num_input_tokens_seen": 28480448, + "step": 16620 + }, + { + "epoch": 80.70217917675545, + "grad_norm": 8.488029266118247e-07, + "learning_rate": 0.18930816515463436, + "loss": 0.0, + "num_input_tokens_seen": 28488992, + "step": 16625 + }, + { + "epoch": 80.72639225181598, + "grad_norm": 1.303846261180297e-06, + "learning_rate": 0.18925131580564297, + "loss": 0.0, + "num_input_tokens_seen": 28497472, + "step": 16630 + }, + { + "epoch": 80.75060532687651, + "grad_norm": 7.705428970439243e-07, + "learning_rate": 0.1891944604036054, + "loss": 0.0, + "num_input_tokens_seen": 28505632, + "step": 16635 + }, + { + "epoch": 80.77481840193704, + "grad_norm": 1.2092759789084084e-06, + "learning_rate": 0.1891375989572895, + "loss": 0.0, + "num_input_tokens_seen": 28514336, + "step": 16640 + }, + { + "epoch": 80.79903147699758, + "grad_norm": 8.62252818478737e-07, + "learning_rate": 0.18908073147546398, + "loss": 0.0, + "num_input_tokens_seen": 28523136, + "step": 16645 + }, + { + "epoch": 80.82324455205811, + "grad_norm": 1.364038439533033e-06, + "learning_rate": 0.18902385796689858, + "loss": 0.0, + "num_input_tokens_seen": 28531552, + "step": 16650 + }, + { + "epoch": 80.84745762711864, + "grad_norm": 1.5046039152366575e-06, + "learning_rate": 0.18896697844036384, + "loss": 0.0, + "num_input_tokens_seen": 28539552, + "step": 16655 + }, + { + "epoch": 80.87167070217917, + "grad_norm": 1.5017240002634935e-06, + "learning_rate": 0.18891009290463137, + "loss": 0.0, + "num_input_tokens_seen": 28547872, + "step": 16660 + }, + { + "epoch": 80.8958837772397, + "grad_norm": 2.00728936761152e-06, + "learning_rate": 0.18885320136847353, + "loss": 0.0, + "num_input_tokens_seen": 28556576, + "step": 16665 + }, + { + "epoch": 80.92009685230025, + "grad_norm": 1.2990104778509703e-06, + "learning_rate": 0.1887963038406639, + "loss": 0.0, + "num_input_tokens_seen": 28565120, + "step": 16670 + }, + { + "epoch": 80.94430992736078, + "grad_norm": 2.261991539853625e-06, + "learning_rate": 0.18873940032997658, + "loss": 0.0, + "num_input_tokens_seen": 28573728, + "step": 16675 + }, + { + "epoch": 80.9685230024213, + "grad_norm": 4.694463484611333e-07, + "learning_rate": 0.18868249084518693, + "loss": 0.0, + "num_input_tokens_seen": 28582240, + "step": 16680 + }, + { + "epoch": 80.99273607748184, + "grad_norm": 8.96775759429147e-07, + "learning_rate": 0.18862557539507102, + "loss": 0.0, + "num_input_tokens_seen": 28590400, + "step": 16685 + }, + { + "epoch": 81.01937046004842, + "grad_norm": 9.16964097541495e-07, + "learning_rate": 0.18856865398840605, + "loss": 0.0, + "num_input_tokens_seen": 28599584, + "step": 16690 + }, + { + "epoch": 81.04358353510897, + "grad_norm": 1.1676448821162921e-06, + "learning_rate": 0.18851172663396995, + "loss": 0.0, + "num_input_tokens_seen": 28608256, + "step": 16695 + }, + { + "epoch": 81.0677966101695, + "grad_norm": 2.416933966742363e-06, + "learning_rate": 0.1884547933405416, + "loss": 0.0, + "num_input_tokens_seen": 28616576, + "step": 16700 + }, + { + "epoch": 81.09200968523002, + "grad_norm": 1.2655284535867395e-06, + "learning_rate": 0.1883978541169009, + "loss": 0.0, + "num_input_tokens_seen": 28625664, + "step": 16705 + }, + { + "epoch": 81.11622276029055, + "grad_norm": 5.63927358143701e-07, + "learning_rate": 0.18834090897182854, + "loss": 0.0, + "num_input_tokens_seen": 28634240, + "step": 16710 + }, + { + "epoch": 81.14043583535108, + "grad_norm": 2.191165094700409e-06, + "learning_rate": 0.1882839579141062, + "loss": 0.0, + "num_input_tokens_seen": 28642816, + "step": 16715 + }, + { + "epoch": 81.16464891041163, + "grad_norm": 1.1549807368282927e-06, + "learning_rate": 0.18822700095251646, + "loss": 0.0, + "num_input_tokens_seen": 28651648, + "step": 16720 + }, + { + "epoch": 81.18886198547216, + "grad_norm": 1.7832965113484534e-06, + "learning_rate": 0.18817003809584273, + "loss": 0.0, + "num_input_tokens_seen": 28660352, + "step": 16725 + }, + { + "epoch": 81.21307506053269, + "grad_norm": 1.2253893828528817e-06, + "learning_rate": 0.1881130693528695, + "loss": 0.0, + "num_input_tokens_seen": 28669216, + "step": 16730 + }, + { + "epoch": 81.23728813559322, + "grad_norm": 1.1085601272498025e-06, + "learning_rate": 0.18805609473238197, + "loss": 0.0, + "num_input_tokens_seen": 28677824, + "step": 16735 + }, + { + "epoch": 81.26150121065375, + "grad_norm": 1.6095243609015597e-06, + "learning_rate": 0.18799911424316643, + "loss": 0.0, + "num_input_tokens_seen": 28686208, + "step": 16740 + }, + { + "epoch": 81.28571428571429, + "grad_norm": 1.845951373979915e-06, + "learning_rate": 0.18794212789400994, + "loss": 0.0, + "num_input_tokens_seen": 28694624, + "step": 16745 + }, + { + "epoch": 81.30992736077482, + "grad_norm": 1.027684902510373e-06, + "learning_rate": 0.18788513569370052, + "loss": 0.0, + "num_input_tokens_seen": 28703424, + "step": 16750 + }, + { + "epoch": 81.33414043583535, + "grad_norm": 5.594389449470327e-07, + "learning_rate": 0.1878281376510271, + "loss": 0.0, + "num_input_tokens_seen": 28712256, + "step": 16755 + }, + { + "epoch": 81.35835351089588, + "grad_norm": 4.0866643757908605e-07, + "learning_rate": 0.18777113377477941, + "loss": 0.0, + "num_input_tokens_seen": 28720864, + "step": 16760 + }, + { + "epoch": 81.38256658595641, + "grad_norm": 2.236195769000915e-06, + "learning_rate": 0.1877141240737483, + "loss": 0.0, + "num_input_tokens_seen": 28728960, + "step": 16765 + }, + { + "epoch": 81.40677966101696, + "grad_norm": 1.243251062987838e-06, + "learning_rate": 0.18765710855672527, + "loss": 0.0, + "num_input_tokens_seen": 28737696, + "step": 16770 + }, + { + "epoch": 81.43099273607749, + "grad_norm": 5.343634370547079e-07, + "learning_rate": 0.18760008723250288, + "loss": 0.0, + "num_input_tokens_seen": 28746080, + "step": 16775 + }, + { + "epoch": 81.45520581113801, + "grad_norm": 1.8352809547650395e-06, + "learning_rate": 0.18754306010987457, + "loss": 0.0, + "num_input_tokens_seen": 28754496, + "step": 16780 + }, + { + "epoch": 81.47941888619854, + "grad_norm": 8.241515274676203e-07, + "learning_rate": 0.18748602719763457, + "loss": 0.0, + "num_input_tokens_seen": 28762720, + "step": 16785 + }, + { + "epoch": 81.50363196125907, + "grad_norm": 7.082743422870408e-07, + "learning_rate": 0.18742898850457804, + "loss": 0.0, + "num_input_tokens_seen": 28771296, + "step": 16790 + }, + { + "epoch": 81.52784503631962, + "grad_norm": 2.990848543049651e-06, + "learning_rate": 0.1873719440395012, + "loss": 0.0, + "num_input_tokens_seen": 28779552, + "step": 16795 + }, + { + "epoch": 81.55205811138015, + "grad_norm": 5.400763143370568e-07, + "learning_rate": 0.1873148938112009, + "loss": 0.0, + "num_input_tokens_seen": 28787840, + "step": 16800 + }, + { + "epoch": 81.55205811138015, + "eval_loss": 0.8074849843978882, + "eval_runtime": 4.623, + "eval_samples_per_second": 79.386, + "eval_steps_per_second": 19.901, + "num_input_tokens_seen": 28787840, + "step": 16800 + }, + { + "epoch": 81.57627118644068, + "grad_norm": 1.212950792250922e-06, + "learning_rate": 0.18725783782847508, + "loss": 0.0, + "num_input_tokens_seen": 28796320, + "step": 16805 + }, + { + "epoch": 81.60048426150121, + "grad_norm": 5.722578180211713e-07, + "learning_rate": 0.1872007761001224, + "loss": 0.0, + "num_input_tokens_seen": 28804800, + "step": 16810 + }, + { + "epoch": 81.62469733656174, + "grad_norm": 1.076454964277218e-06, + "learning_rate": 0.1871437086349426, + "loss": 0.0, + "num_input_tokens_seen": 28813312, + "step": 16815 + }, + { + "epoch": 81.64891041162228, + "grad_norm": 6.196268600433541e-07, + "learning_rate": 0.18708663544173615, + "loss": 0.0, + "num_input_tokens_seen": 28821504, + "step": 16820 + }, + { + "epoch": 81.67312348668281, + "grad_norm": 1.1458954531917698e-06, + "learning_rate": 0.18702955652930442, + "loss": 0.0, + "num_input_tokens_seen": 28830080, + "step": 16825 + }, + { + "epoch": 81.69733656174334, + "grad_norm": 2.1807386474392843e-06, + "learning_rate": 0.18697247190644972, + "loss": 0.0, + "num_input_tokens_seen": 28838720, + "step": 16830 + }, + { + "epoch": 81.72154963680387, + "grad_norm": 2.5001193080242956e-06, + "learning_rate": 0.18691538158197527, + "loss": 0.0, + "num_input_tokens_seen": 28847104, + "step": 16835 + }, + { + "epoch": 81.7457627118644, + "grad_norm": 8.276677476715122e-07, + "learning_rate": 0.1868582855646851, + "loss": 0.0, + "num_input_tokens_seen": 28855968, + "step": 16840 + }, + { + "epoch": 81.76997578692495, + "grad_norm": 7.949228120196494e-07, + "learning_rate": 0.18680118386338404, + "loss": 0.0, + "num_input_tokens_seen": 28864192, + "step": 16845 + }, + { + "epoch": 81.79418886198548, + "grad_norm": 1.5636575199096114e-06, + "learning_rate": 0.18674407648687794, + "loss": 0.0, + "num_input_tokens_seen": 28872800, + "step": 16850 + }, + { + "epoch": 81.818401937046, + "grad_norm": 7.512660431530094e-07, + "learning_rate": 0.1866869634439736, + "loss": 0.0, + "num_input_tokens_seen": 28881344, + "step": 16855 + }, + { + "epoch": 81.84261501210653, + "grad_norm": 5.161512603990559e-07, + "learning_rate": 0.18662984474347838, + "loss": 0.0, + "num_input_tokens_seen": 28889696, + "step": 16860 + }, + { + "epoch": 81.86682808716706, + "grad_norm": 1.3715482509724097e-06, + "learning_rate": 0.1865727203942008, + "loss": 0.0, + "num_input_tokens_seen": 28898336, + "step": 16865 + }, + { + "epoch": 81.89104116222761, + "grad_norm": 1.5633050907126744e-06, + "learning_rate": 0.1865155904049501, + "loss": 0.0, + "num_input_tokens_seen": 28906720, + "step": 16870 + }, + { + "epoch": 81.91525423728814, + "grad_norm": 1.7235711311514024e-06, + "learning_rate": 0.1864584547845365, + "loss": 0.0, + "num_input_tokens_seen": 28915488, + "step": 16875 + }, + { + "epoch": 81.93946731234867, + "grad_norm": 1.06419543044467e-06, + "learning_rate": 0.186401313541771, + "loss": 0.0, + "num_input_tokens_seen": 28924224, + "step": 16880 + }, + { + "epoch": 81.9636803874092, + "grad_norm": 9.833696594796493e-07, + "learning_rate": 0.18634416668546552, + "loss": 0.0, + "num_input_tokens_seen": 28932864, + "step": 16885 + }, + { + "epoch": 81.98789346246973, + "grad_norm": 9.203642434840731e-07, + "learning_rate": 0.1862870142244328, + "loss": 0.0, + "num_input_tokens_seen": 28941248, + "step": 16890 + }, + { + "epoch": 82.01452784503633, + "grad_norm": 1.250966988664004e-06, + "learning_rate": 0.1862298561674865, + "loss": 0.0, + "num_input_tokens_seen": 28950176, + "step": 16895 + }, + { + "epoch": 82.03874092009686, + "grad_norm": 2.5255626496800687e-06, + "learning_rate": 0.18617269252344104, + "loss": 0.0, + "num_input_tokens_seen": 28958752, + "step": 16900 + }, + { + "epoch": 82.06295399515739, + "grad_norm": 1.658213477639947e-06, + "learning_rate": 0.18611552330111186, + "loss": 0.0, + "num_input_tokens_seen": 28967296, + "step": 16905 + }, + { + "epoch": 82.08716707021792, + "grad_norm": 2.324009784615555e-07, + "learning_rate": 0.18605834850931507, + "loss": 0.0, + "num_input_tokens_seen": 28975872, + "step": 16910 + }, + { + "epoch": 82.11138014527845, + "grad_norm": 8.34839397612086e-07, + "learning_rate": 0.18600116815686787, + "loss": 0.0, + "num_input_tokens_seen": 28984352, + "step": 16915 + }, + { + "epoch": 82.13559322033899, + "grad_norm": 7.296010267054953e-07, + "learning_rate": 0.1859439822525881, + "loss": 0.0, + "num_input_tokens_seen": 28992736, + "step": 16920 + }, + { + "epoch": 82.15980629539952, + "grad_norm": 5.980714377074037e-07, + "learning_rate": 0.18588679080529455, + "loss": 0.0, + "num_input_tokens_seen": 29001312, + "step": 16925 + }, + { + "epoch": 82.18401937046005, + "grad_norm": 5.126956921230885e-07, + "learning_rate": 0.1858295938238069, + "loss": 0.0, + "num_input_tokens_seen": 29009792, + "step": 16930 + }, + { + "epoch": 82.20823244552058, + "grad_norm": 7.980115128702892e-07, + "learning_rate": 0.18577239131694562, + "loss": 0.0, + "num_input_tokens_seen": 29018272, + "step": 16935 + }, + { + "epoch": 82.23244552058111, + "grad_norm": 5.16404838890594e-07, + "learning_rate": 0.18571518329353204, + "loss": 0.0, + "num_input_tokens_seen": 29026784, + "step": 16940 + }, + { + "epoch": 82.25665859564165, + "grad_norm": 1.280589799534937e-06, + "learning_rate": 0.18565796976238838, + "loss": 0.0, + "num_input_tokens_seen": 29035296, + "step": 16945 + }, + { + "epoch": 82.28087167070218, + "grad_norm": 6.032106512066093e-07, + "learning_rate": 0.18560075073233764, + "loss": 0.0, + "num_input_tokens_seen": 29044128, + "step": 16950 + }, + { + "epoch": 82.30508474576271, + "grad_norm": 1.6510401792402263e-06, + "learning_rate": 0.18554352621220377, + "loss": 0.0, + "num_input_tokens_seen": 29052576, + "step": 16955 + }, + { + "epoch": 82.32929782082324, + "grad_norm": 6.68470136133692e-07, + "learning_rate": 0.18548629621081153, + "loss": 0.0, + "num_input_tokens_seen": 29060928, + "step": 16960 + }, + { + "epoch": 82.35351089588377, + "grad_norm": 1.102282681131328e-06, + "learning_rate": 0.18542906073698645, + "loss": 0.0, + "num_input_tokens_seen": 29069344, + "step": 16965 + }, + { + "epoch": 82.37772397094432, + "grad_norm": 9.040101645041432e-07, + "learning_rate": 0.18537181979955494, + "loss": 0.0, + "num_input_tokens_seen": 29078336, + "step": 16970 + }, + { + "epoch": 82.40193704600485, + "grad_norm": 9.197273698191566e-07, + "learning_rate": 0.18531457340734434, + "loss": 0.0, + "num_input_tokens_seen": 29086784, + "step": 16975 + }, + { + "epoch": 82.42615012106538, + "grad_norm": 3.3534511203470174e-06, + "learning_rate": 0.1852573215691827, + "loss": 0.0, + "num_input_tokens_seen": 29095200, + "step": 16980 + }, + { + "epoch": 82.4503631961259, + "grad_norm": 6.679244393126282e-07, + "learning_rate": 0.18520006429389904, + "loss": 0.0, + "num_input_tokens_seen": 29103744, + "step": 16985 + }, + { + "epoch": 82.47457627118644, + "grad_norm": 1.61154821398668e-06, + "learning_rate": 0.1851428015903231, + "loss": 0.0, + "num_input_tokens_seen": 29112320, + "step": 16990 + }, + { + "epoch": 82.49878934624698, + "grad_norm": 9.788540182853467e-07, + "learning_rate": 0.1850855334672855, + "loss": 0.0, + "num_input_tokens_seen": 29120928, + "step": 16995 + }, + { + "epoch": 82.52300242130751, + "grad_norm": 1.5112858591237455e-06, + "learning_rate": 0.1850282599336178, + "loss": 0.0, + "num_input_tokens_seen": 29129536, + "step": 17000 + }, + { + "epoch": 82.52300242130751, + "eval_loss": 0.8158443570137024, + "eval_runtime": 4.6133, + "eval_samples_per_second": 79.552, + "eval_steps_per_second": 19.942, + "num_input_tokens_seen": 29129536, + "step": 17000 + }, + { + "epoch": 82.54721549636804, + "grad_norm": 1.4190394495017244e-06, + "learning_rate": 0.18497098099815215, + "loss": 0.0, + "num_input_tokens_seen": 29137920, + "step": 17005 + }, + { + "epoch": 82.57142857142857, + "grad_norm": 5.569912104874675e-07, + "learning_rate": 0.18491369666972174, + "loss": 0.0, + "num_input_tokens_seen": 29147360, + "step": 17010 + }, + { + "epoch": 82.5956416464891, + "grad_norm": 9.88354486253229e-07, + "learning_rate": 0.1848564069571606, + "loss": 0.0, + "num_input_tokens_seen": 29155840, + "step": 17015 + }, + { + "epoch": 82.61985472154964, + "grad_norm": 1.878475245575828e-06, + "learning_rate": 0.18479911186930348, + "loss": 0.0, + "num_input_tokens_seen": 29164544, + "step": 17020 + }, + { + "epoch": 82.64406779661017, + "grad_norm": 9.242035048373509e-07, + "learning_rate": 0.18474181141498597, + "loss": 0.0, + "num_input_tokens_seen": 29172960, + "step": 17025 + }, + { + "epoch": 82.6682808716707, + "grad_norm": 1.8569430721981917e-06, + "learning_rate": 0.18468450560304453, + "loss": 0.0, + "num_input_tokens_seen": 29181344, + "step": 17030 + }, + { + "epoch": 82.69249394673123, + "grad_norm": 3.008086082445516e-07, + "learning_rate": 0.1846271944423165, + "loss": 0.0, + "num_input_tokens_seen": 29190016, + "step": 17035 + }, + { + "epoch": 82.71670702179176, + "grad_norm": 9.938818266164162e-07, + "learning_rate": 0.18456987794163993, + "loss": 0.0, + "num_input_tokens_seen": 29198176, + "step": 17040 + }, + { + "epoch": 82.7409200968523, + "grad_norm": 1.611699644854525e-06, + "learning_rate": 0.18451255610985373, + "loss": 0.0, + "num_input_tokens_seen": 29206912, + "step": 17045 + }, + { + "epoch": 82.76513317191284, + "grad_norm": 1.3499321767085348e-06, + "learning_rate": 0.18445522895579766, + "loss": 0.0, + "num_input_tokens_seen": 29215296, + "step": 17050 + }, + { + "epoch": 82.78934624697337, + "grad_norm": 1.043776364895166e-06, + "learning_rate": 0.1843978964883123, + "loss": 0.0, + "num_input_tokens_seen": 29224064, + "step": 17055 + }, + { + "epoch": 82.8135593220339, + "grad_norm": 1.1494259979372146e-06, + "learning_rate": 0.18434055871623906, + "loss": 0.0, + "num_input_tokens_seen": 29232832, + "step": 17060 + }, + { + "epoch": 82.83777239709443, + "grad_norm": 2.0116576706641354e-06, + "learning_rate": 0.18428321564842007, + "loss": 0.0, + "num_input_tokens_seen": 29241696, + "step": 17065 + }, + { + "epoch": 82.86198547215497, + "grad_norm": 2.2739229734725086e-06, + "learning_rate": 0.18422586729369841, + "loss": 0.0, + "num_input_tokens_seen": 29250368, + "step": 17070 + }, + { + "epoch": 82.8861985472155, + "grad_norm": 9.17923784982122e-07, + "learning_rate": 0.1841685136609179, + "loss": 0.0, + "num_input_tokens_seen": 29258848, + "step": 17075 + }, + { + "epoch": 82.91041162227603, + "grad_norm": 7.308762519642187e-07, + "learning_rate": 0.18411115475892326, + "loss": 0.0, + "num_input_tokens_seen": 29267456, + "step": 17080 + }, + { + "epoch": 82.93462469733656, + "grad_norm": 7.923450198177306e-07, + "learning_rate": 0.18405379059655982, + "loss": 0.0, + "num_input_tokens_seen": 29276064, + "step": 17085 + }, + { + "epoch": 82.95883777239709, + "grad_norm": 2.570022161307861e-06, + "learning_rate": 0.1839964211826739, + "loss": 0.0, + "num_input_tokens_seen": 29284736, + "step": 17090 + }, + { + "epoch": 82.98305084745763, + "grad_norm": 8.991118534140696e-07, + "learning_rate": 0.18393904652611265, + "loss": 0.0, + "num_input_tokens_seen": 29292960, + "step": 17095 + }, + { + "epoch": 83.00968523002422, + "grad_norm": 1.458367137274763e-06, + "learning_rate": 0.18388166663572392, + "loss": 0.0, + "num_input_tokens_seen": 29302112, + "step": 17100 + }, + { + "epoch": 83.03389830508475, + "grad_norm": 1.0120861588802654e-06, + "learning_rate": 0.18382428152035643, + "loss": 0.0, + "num_input_tokens_seen": 29310336, + "step": 17105 + }, + { + "epoch": 83.05811138014528, + "grad_norm": 1.7208084273079294e-06, + "learning_rate": 0.1837668911888596, + "loss": 0.0, + "num_input_tokens_seen": 29318752, + "step": 17110 + }, + { + "epoch": 83.08232445520581, + "grad_norm": 1.3406170182861388e-06, + "learning_rate": 0.18370949565008388, + "loss": 0.0, + "num_input_tokens_seen": 29327616, + "step": 17115 + }, + { + "epoch": 83.10653753026634, + "grad_norm": 1.3332695516510285e-06, + "learning_rate": 0.1836520949128803, + "loss": 0.0, + "num_input_tokens_seen": 29336160, + "step": 17120 + }, + { + "epoch": 83.13075060532688, + "grad_norm": 1.15050352178514e-06, + "learning_rate": 0.18359468898610076, + "loss": 0.0, + "num_input_tokens_seen": 29344736, + "step": 17125 + }, + { + "epoch": 83.15496368038741, + "grad_norm": 6.238416290216264e-07, + "learning_rate": 0.18353727787859797, + "loss": 0.0, + "num_input_tokens_seen": 29353504, + "step": 17130 + }, + { + "epoch": 83.17917675544794, + "grad_norm": 8.272113518614788e-07, + "learning_rate": 0.18347986159922552, + "loss": 0.0, + "num_input_tokens_seen": 29361600, + "step": 17135 + }, + { + "epoch": 83.20338983050847, + "grad_norm": 4.5156167516324786e-07, + "learning_rate": 0.1834224401568377, + "loss": 0.0, + "num_input_tokens_seen": 29370048, + "step": 17140 + }, + { + "epoch": 83.227602905569, + "grad_norm": 7.122368401724088e-07, + "learning_rate": 0.1833650135602896, + "loss": 0.0, + "num_input_tokens_seen": 29378400, + "step": 17145 + }, + { + "epoch": 83.25181598062954, + "grad_norm": 5.585794724538573e-07, + "learning_rate": 0.18330758181843707, + "loss": 0.0, + "num_input_tokens_seen": 29387264, + "step": 17150 + }, + { + "epoch": 83.27602905569007, + "grad_norm": 1.0013444580181385e-06, + "learning_rate": 0.18325014494013686, + "loss": 0.0, + "num_input_tokens_seen": 29395840, + "step": 17155 + }, + { + "epoch": 83.3002421307506, + "grad_norm": 7.134661359486927e-07, + "learning_rate": 0.18319270293424647, + "loss": 0.0, + "num_input_tokens_seen": 29404288, + "step": 17160 + }, + { + "epoch": 83.32445520581113, + "grad_norm": 1.1493237934701028e-06, + "learning_rate": 0.18313525580962417, + "loss": 0.0, + "num_input_tokens_seen": 29413024, + "step": 17165 + }, + { + "epoch": 83.34866828087166, + "grad_norm": 1.7801544345275033e-06, + "learning_rate": 0.18307780357512896, + "loss": 0.0, + "num_input_tokens_seen": 29421632, + "step": 17170 + }, + { + "epoch": 83.37288135593221, + "grad_norm": 9.428267162547854e-07, + "learning_rate": 0.1830203462396208, + "loss": 0.0, + "num_input_tokens_seen": 29430400, + "step": 17175 + }, + { + "epoch": 83.39709443099274, + "grad_norm": 1.4912379810994025e-06, + "learning_rate": 0.18296288381196033, + "loss": 0.0, + "num_input_tokens_seen": 29438656, + "step": 17180 + }, + { + "epoch": 83.42130750605327, + "grad_norm": 1.0732636610555346e-06, + "learning_rate": 0.1829054163010089, + "loss": 0.0, + "num_input_tokens_seen": 29447456, + "step": 17185 + }, + { + "epoch": 83.4455205811138, + "grad_norm": 6.801346899010241e-07, + "learning_rate": 0.18284794371562874, + "loss": 0.0, + "num_input_tokens_seen": 29455968, + "step": 17190 + }, + { + "epoch": 83.46973365617433, + "grad_norm": 6.263705358833249e-07, + "learning_rate": 0.18279046606468288, + "loss": 0.0, + "num_input_tokens_seen": 29464608, + "step": 17195 + }, + { + "epoch": 83.49394673123487, + "grad_norm": 8.264237862931623e-07, + "learning_rate": 0.1827329833570351, + "loss": 0.0, + "num_input_tokens_seen": 29473344, + "step": 17200 + }, + { + "epoch": 83.49394673123487, + "eval_loss": 0.8238951563835144, + "eval_runtime": 4.6175, + "eval_samples_per_second": 79.481, + "eval_steps_per_second": 19.924, + "num_input_tokens_seen": 29473344, + "step": 17200 + }, + { + "epoch": 83.5181598062954, + "grad_norm": 1.1777857480410603e-06, + "learning_rate": 0.18267549560154991, + "loss": 0.0, + "num_input_tokens_seen": 29481984, + "step": 17205 + }, + { + "epoch": 83.54237288135593, + "grad_norm": 2.480682724126382e-06, + "learning_rate": 0.18261800280709267, + "loss": 0.0, + "num_input_tokens_seen": 29490464, + "step": 17210 + }, + { + "epoch": 83.56658595641646, + "grad_norm": 6.063442015147302e-07, + "learning_rate": 0.18256050498252957, + "loss": 0.0, + "num_input_tokens_seen": 29499488, + "step": 17215 + }, + { + "epoch": 83.59079903147699, + "grad_norm": 1.3241016176834819e-06, + "learning_rate": 0.18250300213672735, + "loss": 0.0, + "num_input_tokens_seen": 29508576, + "step": 17220 + }, + { + "epoch": 83.61501210653753, + "grad_norm": 6.356760309245146e-07, + "learning_rate": 0.18244549427855378, + "loss": 0.0, + "num_input_tokens_seen": 29516736, + "step": 17225 + }, + { + "epoch": 83.63922518159806, + "grad_norm": 6.249773036870465e-07, + "learning_rate": 0.1823879814168772, + "loss": 0.0, + "num_input_tokens_seen": 29525248, + "step": 17230 + }, + { + "epoch": 83.6634382566586, + "grad_norm": 2.9123848435119726e-06, + "learning_rate": 0.18233046356056692, + "loss": 0.0, + "num_input_tokens_seen": 29533632, + "step": 17235 + }, + { + "epoch": 83.68765133171912, + "grad_norm": 6.761551958334167e-07, + "learning_rate": 0.18227294071849284, + "loss": 0.0, + "num_input_tokens_seen": 29542144, + "step": 17240 + }, + { + "epoch": 83.71186440677967, + "grad_norm": 4.3563778717725654e-07, + "learning_rate": 0.18221541289952578, + "loss": 0.0, + "num_input_tokens_seen": 29550688, + "step": 17245 + }, + { + "epoch": 83.7360774818402, + "grad_norm": 1.0824078344739974e-06, + "learning_rate": 0.18215788011253717, + "loss": 0.0, + "num_input_tokens_seen": 29559328, + "step": 17250 + }, + { + "epoch": 83.76029055690073, + "grad_norm": 5.52849201085337e-07, + "learning_rate": 0.18210034236639935, + "loss": 0.0, + "num_input_tokens_seen": 29567776, + "step": 17255 + }, + { + "epoch": 83.78450363196126, + "grad_norm": 8.924785106501076e-07, + "learning_rate": 0.1820427996699853, + "loss": 0.0, + "num_input_tokens_seen": 29576416, + "step": 17260 + }, + { + "epoch": 83.80871670702179, + "grad_norm": 6.007805382068909e-07, + "learning_rate": 0.1819852520321689, + "loss": 0.0, + "num_input_tokens_seen": 29584704, + "step": 17265 + }, + { + "epoch": 83.83292978208233, + "grad_norm": 1.3258172657515388e-06, + "learning_rate": 0.18192769946182466, + "loss": 0.0, + "num_input_tokens_seen": 29593344, + "step": 17270 + }, + { + "epoch": 83.85714285714286, + "grad_norm": 1.1813343689937028e-06, + "learning_rate": 0.18187014196782794, + "loss": 0.0, + "num_input_tokens_seen": 29601952, + "step": 17275 + }, + { + "epoch": 83.88135593220339, + "grad_norm": 6.893957333886647e-07, + "learning_rate": 0.18181257955905486, + "loss": 0.0, + "num_input_tokens_seen": 29610368, + "step": 17280 + }, + { + "epoch": 83.90556900726392, + "grad_norm": 1.8693290257942863e-06, + "learning_rate": 0.18175501224438217, + "loss": 0.0, + "num_input_tokens_seen": 29618592, + "step": 17285 + }, + { + "epoch": 83.92978208232445, + "grad_norm": 9.994178071792703e-07, + "learning_rate": 0.18169744003268756, + "loss": 0.0, + "num_input_tokens_seen": 29627168, + "step": 17290 + }, + { + "epoch": 83.953995157385, + "grad_norm": 9.54991378421255e-07, + "learning_rate": 0.18163986293284937, + "loss": 0.0, + "num_input_tokens_seen": 29635648, + "step": 17295 + }, + { + "epoch": 83.97820823244552, + "grad_norm": 6.710994853165175e-07, + "learning_rate": 0.18158228095374673, + "loss": 0.0, + "num_input_tokens_seen": 29644096, + "step": 17300 + }, + { + "epoch": 84.00484261501211, + "grad_norm": 3.163057272104197e-06, + "learning_rate": 0.18152469410425945, + "loss": 0.0, + "num_input_tokens_seen": 29653088, + "step": 17305 + }, + { + "epoch": 84.02905569007264, + "grad_norm": 1.3591346714747488e-06, + "learning_rate": 0.18146710239326813, + "loss": 0.0, + "num_input_tokens_seen": 29661824, + "step": 17310 + }, + { + "epoch": 84.05326876513317, + "grad_norm": 8.302705509777297e-07, + "learning_rate": 0.18140950582965423, + "loss": 0.0, + "num_input_tokens_seen": 29670560, + "step": 17315 + }, + { + "epoch": 84.0774818401937, + "grad_norm": 1.6151460613400559e-06, + "learning_rate": 0.1813519044222998, + "loss": 0.0, + "num_input_tokens_seen": 29679104, + "step": 17320 + }, + { + "epoch": 84.10169491525424, + "grad_norm": 1.2222096756886458e-06, + "learning_rate": 0.18129429818008772, + "loss": 0.0, + "num_input_tokens_seen": 29687648, + "step": 17325 + }, + { + "epoch": 84.12590799031477, + "grad_norm": 1.0929747986665461e-06, + "learning_rate": 0.18123668711190163, + "loss": 0.0, + "num_input_tokens_seen": 29696544, + "step": 17330 + }, + { + "epoch": 84.1501210653753, + "grad_norm": 5.106183493808203e-07, + "learning_rate": 0.18117907122662583, + "loss": 0.0, + "num_input_tokens_seen": 29705248, + "step": 17335 + }, + { + "epoch": 84.17433414043583, + "grad_norm": 8.790556194071542e-07, + "learning_rate": 0.1811214505331454, + "loss": 0.0, + "num_input_tokens_seen": 29713760, + "step": 17340 + }, + { + "epoch": 84.19854721549636, + "grad_norm": 1.9156173038936686e-06, + "learning_rate": 0.1810638250403462, + "loss": 0.0, + "num_input_tokens_seen": 29722432, + "step": 17345 + }, + { + "epoch": 84.2227602905569, + "grad_norm": 1.8153965584133402e-06, + "learning_rate": 0.1810061947571148, + "loss": 0.0, + "num_input_tokens_seen": 29730848, + "step": 17350 + }, + { + "epoch": 84.24697336561744, + "grad_norm": 6.584631933037599e-07, + "learning_rate": 0.1809485596923385, + "loss": 0.0, + "num_input_tokens_seen": 29739168, + "step": 17355 + }, + { + "epoch": 84.27118644067797, + "grad_norm": 3.2390545356975053e-07, + "learning_rate": 0.18089091985490546, + "loss": 0.0, + "num_input_tokens_seen": 29747168, + "step": 17360 + }, + { + "epoch": 84.2953995157385, + "grad_norm": 5.683610879714251e-07, + "learning_rate": 0.18083327525370432, + "loss": 0.0, + "num_input_tokens_seen": 29755680, + "step": 17365 + }, + { + "epoch": 84.31961259079903, + "grad_norm": 1.1595334399316926e-06, + "learning_rate": 0.18077562589762464, + "loss": 0.0, + "num_input_tokens_seen": 29763808, + "step": 17370 + }, + { + "epoch": 84.34382566585957, + "grad_norm": 5.251661150396103e-07, + "learning_rate": 0.1807179717955567, + "loss": 0.0, + "num_input_tokens_seen": 29772192, + "step": 17375 + }, + { + "epoch": 84.3680387409201, + "grad_norm": 3.2816078032738005e-07, + "learning_rate": 0.1806603129563915, + "loss": 0.0, + "num_input_tokens_seen": 29780992, + "step": 17380 + }, + { + "epoch": 84.39225181598063, + "grad_norm": 9.105318667934625e-07, + "learning_rate": 0.1806026493890208, + "loss": 0.0, + "num_input_tokens_seen": 29790080, + "step": 17385 + }, + { + "epoch": 84.41646489104116, + "grad_norm": 1.229523718393466e-06, + "learning_rate": 0.18054498110233688, + "loss": 0.0, + "num_input_tokens_seen": 29798688, + "step": 17390 + }, + { + "epoch": 84.44067796610169, + "grad_norm": 6.948267241568828e-07, + "learning_rate": 0.1804873081052331, + "loss": 0.0, + "num_input_tokens_seen": 29807104, + "step": 17395 + }, + { + "epoch": 84.46489104116223, + "grad_norm": 1.0665786476238281e-06, + "learning_rate": 0.18042963040660326, + "loss": 0.0, + "num_input_tokens_seen": 29815360, + "step": 17400 + }, + { + "epoch": 84.46489104116223, + "eval_loss": 0.8303042650222778, + "eval_runtime": 4.6408, + "eval_samples_per_second": 79.082, + "eval_steps_per_second": 19.824, + "num_input_tokens_seen": 29815360, + "step": 17400 + }, + { + "epoch": 84.48910411622276, + "grad_norm": 8.569652436563047e-07, + "learning_rate": 0.180371948015342, + "loss": 0.0, + "num_input_tokens_seen": 29823776, + "step": 17405 + }, + { + "epoch": 84.51331719128329, + "grad_norm": 1.5284269920812221e-06, + "learning_rate": 0.18031426094034472, + "loss": 0.0, + "num_input_tokens_seen": 29832512, + "step": 17410 + }, + { + "epoch": 84.53753026634382, + "grad_norm": 1.2748249673677492e-06, + "learning_rate": 0.18025656919050737, + "loss": 0.0, + "num_input_tokens_seen": 29840928, + "step": 17415 + }, + { + "epoch": 84.56174334140435, + "grad_norm": 6.772964411538851e-07, + "learning_rate": 0.18019887277472688, + "loss": 0.0, + "num_input_tokens_seen": 29849344, + "step": 17420 + }, + { + "epoch": 84.5859564164649, + "grad_norm": 7.129123673621507e-07, + "learning_rate": 0.18014117170190067, + "loss": 0.0, + "num_input_tokens_seen": 29857824, + "step": 17425 + }, + { + "epoch": 84.61016949152543, + "grad_norm": 1.4925161622159067e-06, + "learning_rate": 0.18008346598092703, + "loss": 0.0, + "num_input_tokens_seen": 29866528, + "step": 17430 + }, + { + "epoch": 84.63438256658596, + "grad_norm": 1.3294952623255085e-06, + "learning_rate": 0.18002575562070489, + "loss": 0.0, + "num_input_tokens_seen": 29875424, + "step": 17435 + }, + { + "epoch": 84.65859564164649, + "grad_norm": 7.447005145877483e-07, + "learning_rate": 0.1799680406301339, + "loss": 0.0, + "num_input_tokens_seen": 29884064, + "step": 17440 + }, + { + "epoch": 84.68280871670702, + "grad_norm": 1.5991950021998491e-06, + "learning_rate": 0.17991032101811447, + "loss": 0.0, + "num_input_tokens_seen": 29892448, + "step": 17445 + }, + { + "epoch": 84.70702179176756, + "grad_norm": 7.427328796438815e-07, + "learning_rate": 0.1798525967935476, + "loss": 0.0, + "num_input_tokens_seen": 29901152, + "step": 17450 + }, + { + "epoch": 84.73123486682809, + "grad_norm": 1.6438244756500353e-06, + "learning_rate": 0.17979486796533517, + "loss": 0.0, + "num_input_tokens_seen": 29909984, + "step": 17455 + }, + { + "epoch": 84.75544794188862, + "grad_norm": 7.435965017066337e-07, + "learning_rate": 0.1797371345423797, + "loss": 0.0, + "num_input_tokens_seen": 29918528, + "step": 17460 + }, + { + "epoch": 84.77966101694915, + "grad_norm": 5.05412401707872e-07, + "learning_rate": 0.17967939653358436, + "loss": 0.0, + "num_input_tokens_seen": 29927200, + "step": 17465 + }, + { + "epoch": 84.80387409200968, + "grad_norm": 8.172049206223164e-07, + "learning_rate": 0.17962165394785315, + "loss": 0.0, + "num_input_tokens_seen": 29935776, + "step": 17470 + }, + { + "epoch": 84.82808716707022, + "grad_norm": 1.1966136526098126e-06, + "learning_rate": 0.17956390679409057, + "loss": 0.0, + "num_input_tokens_seen": 29944000, + "step": 17475 + }, + { + "epoch": 84.85230024213075, + "grad_norm": 7.912834121270862e-07, + "learning_rate": 0.1795061550812021, + "loss": 0.0, + "num_input_tokens_seen": 29952352, + "step": 17480 + }, + { + "epoch": 84.87651331719128, + "grad_norm": 7.736075531283859e-07, + "learning_rate": 0.1794483988180937, + "loss": 0.0, + "num_input_tokens_seen": 29960832, + "step": 17485 + }, + { + "epoch": 84.90072639225181, + "grad_norm": 7.741447802800394e-07, + "learning_rate": 0.17939063801367214, + "loss": 0.0, + "num_input_tokens_seen": 29969632, + "step": 17490 + }, + { + "epoch": 84.92493946731234, + "grad_norm": 5.808079208691197e-07, + "learning_rate": 0.17933287267684483, + "loss": 0.0, + "num_input_tokens_seen": 29978144, + "step": 17495 + }, + { + "epoch": 84.94915254237289, + "grad_norm": 1.370689687973936e-06, + "learning_rate": 0.17927510281651995, + "loss": 0.0, + "num_input_tokens_seen": 29986848, + "step": 17500 + }, + { + "epoch": 84.97336561743342, + "grad_norm": 1.4289654473031987e-06, + "learning_rate": 0.17921732844160634, + "loss": 0.0, + "num_input_tokens_seen": 29995424, + "step": 17505 + }, + { + "epoch": 84.99757869249395, + "grad_norm": 1.250892637472134e-06, + "learning_rate": 0.17915954956101351, + "loss": 0.0, + "num_input_tokens_seen": 30003840, + "step": 17510 + }, + { + "epoch": 85.02421307506053, + "grad_norm": 9.025496296999336e-07, + "learning_rate": 0.17910176618365165, + "loss": 0.0, + "num_input_tokens_seen": 30012672, + "step": 17515 + }, + { + "epoch": 85.04842615012106, + "grad_norm": 9.608028221919085e-07, + "learning_rate": 0.17904397831843177, + "loss": 0.0, + "num_input_tokens_seen": 30020960, + "step": 17520 + }, + { + "epoch": 85.0726392251816, + "grad_norm": 8.679181178194995e-07, + "learning_rate": 0.17898618597426547, + "loss": 0.0, + "num_input_tokens_seen": 30029472, + "step": 17525 + }, + { + "epoch": 85.09685230024213, + "grad_norm": 9.948298611561768e-07, + "learning_rate": 0.17892838916006495, + "loss": 0.0, + "num_input_tokens_seen": 30037952, + "step": 17530 + }, + { + "epoch": 85.12106537530266, + "grad_norm": 1.0556052529864246e-06, + "learning_rate": 0.17887058788474333, + "loss": 0.0, + "num_input_tokens_seen": 30046688, + "step": 17535 + }, + { + "epoch": 85.1452784503632, + "grad_norm": 8.126902457661345e-07, + "learning_rate": 0.17881278215721427, + "loss": 0.0, + "num_input_tokens_seen": 30055104, + "step": 17540 + }, + { + "epoch": 85.16949152542372, + "grad_norm": 1.842529513851332e-06, + "learning_rate": 0.1787549719863921, + "loss": 0.0, + "num_input_tokens_seen": 30063456, + "step": 17545 + }, + { + "epoch": 85.19370460048427, + "grad_norm": 7.29352848338749e-07, + "learning_rate": 0.17869715738119188, + "loss": 0.0, + "num_input_tokens_seen": 30072224, + "step": 17550 + }, + { + "epoch": 85.2179176755448, + "grad_norm": 1.325970288235112e-06, + "learning_rate": 0.17863933835052936, + "loss": 0.0, + "num_input_tokens_seen": 30081120, + "step": 17555 + }, + { + "epoch": 85.24213075060533, + "grad_norm": 7.749946462354274e-07, + "learning_rate": 0.17858151490332097, + "loss": 0.0, + "num_input_tokens_seen": 30089408, + "step": 17560 + }, + { + "epoch": 85.26634382566586, + "grad_norm": 1.208394905916066e-06, + "learning_rate": 0.17852368704848381, + "loss": 0.0, + "num_input_tokens_seen": 30097888, + "step": 17565 + }, + { + "epoch": 85.29055690072639, + "grad_norm": 7.662209782210994e-07, + "learning_rate": 0.17846585479493565, + "loss": 0.0, + "num_input_tokens_seen": 30106432, + "step": 17570 + }, + { + "epoch": 85.31476997578693, + "grad_norm": 8.729381875127729e-07, + "learning_rate": 0.178408018151595, + "loss": 0.0, + "num_input_tokens_seen": 30115168, + "step": 17575 + }, + { + "epoch": 85.33898305084746, + "grad_norm": 2.868132469302509e-07, + "learning_rate": 0.17835017712738085, + "loss": 0.0, + "num_input_tokens_seen": 30123840, + "step": 17580 + }, + { + "epoch": 85.36319612590799, + "grad_norm": 5.281037829263369e-07, + "learning_rate": 0.17829233173121323, + "loss": 0.0, + "num_input_tokens_seen": 30132480, + "step": 17585 + }, + { + "epoch": 85.38740920096852, + "grad_norm": 4.0806153833727876e-07, + "learning_rate": 0.17823448197201244, + "loss": 0.0, + "num_input_tokens_seen": 30140672, + "step": 17590 + }, + { + "epoch": 85.41162227602905, + "grad_norm": 1.5055284166010097e-06, + "learning_rate": 0.1781766278586997, + "loss": 0.0, + "num_input_tokens_seen": 30149344, + "step": 17595 + }, + { + "epoch": 85.4358353510896, + "grad_norm": 1.00058343832643e-06, + "learning_rate": 0.1781187694001969, + "loss": 0.0, + "num_input_tokens_seen": 30157632, + "step": 17600 + }, + { + "epoch": 85.4358353510896, + "eval_loss": 0.8375508785247803, + "eval_runtime": 4.6197, + "eval_samples_per_second": 79.443, + "eval_steps_per_second": 19.915, + "num_input_tokens_seen": 30157632, + "step": 17600 + }, + { + "epoch": 85.46004842615012, + "grad_norm": 2.062931798718637e-06, + "learning_rate": 0.1780609066054265, + "loss": 0.0, + "num_input_tokens_seen": 30166144, + "step": 17605 + }, + { + "epoch": 85.48426150121065, + "grad_norm": 5.473671649269818e-07, + "learning_rate": 0.17800303948331164, + "loss": 0.0, + "num_input_tokens_seen": 30174848, + "step": 17610 + }, + { + "epoch": 85.50847457627118, + "grad_norm": 7.230008236547292e-07, + "learning_rate": 0.1779451680427762, + "loss": 0.0, + "num_input_tokens_seen": 30183456, + "step": 17615 + }, + { + "epoch": 85.53268765133171, + "grad_norm": 5.67241386306705e-07, + "learning_rate": 0.17788729229274464, + "loss": 0.0, + "num_input_tokens_seen": 30192160, + "step": 17620 + }, + { + "epoch": 85.55690072639226, + "grad_norm": 3.634263805452065e-07, + "learning_rate": 0.17782941224214222, + "loss": 0.0, + "num_input_tokens_seen": 30200224, + "step": 17625 + }, + { + "epoch": 85.58111380145279, + "grad_norm": 6.260424356696603e-07, + "learning_rate": 0.17777152789989464, + "loss": 0.0, + "num_input_tokens_seen": 30208704, + "step": 17630 + }, + { + "epoch": 85.60532687651332, + "grad_norm": 8.08333425084129e-07, + "learning_rate": 0.17771363927492845, + "loss": 0.0, + "num_input_tokens_seen": 30217248, + "step": 17635 + }, + { + "epoch": 85.62953995157385, + "grad_norm": 7.355886282311985e-07, + "learning_rate": 0.17765574637617085, + "loss": 0.0, + "num_input_tokens_seen": 30225792, + "step": 17640 + }, + { + "epoch": 85.65375302663438, + "grad_norm": 5.112082135383389e-07, + "learning_rate": 0.17759784921254962, + "loss": 0.0, + "num_input_tokens_seen": 30234592, + "step": 17645 + }, + { + "epoch": 85.67796610169492, + "grad_norm": 6.183973937368137e-07, + "learning_rate": 0.1775399477929932, + "loss": 0.0, + "num_input_tokens_seen": 30243296, + "step": 17650 + }, + { + "epoch": 85.70217917675545, + "grad_norm": 8.968445399659686e-07, + "learning_rate": 0.17748204212643076, + "loss": 0.0, + "num_input_tokens_seen": 30251872, + "step": 17655 + }, + { + "epoch": 85.72639225181598, + "grad_norm": 2.2736496418929164e-07, + "learning_rate": 0.17742413222179204, + "loss": 0.0, + "num_input_tokens_seen": 30260096, + "step": 17660 + }, + { + "epoch": 85.75060532687651, + "grad_norm": 2.118172687914921e-06, + "learning_rate": 0.17736621808800754, + "loss": 0.0, + "num_input_tokens_seen": 30268832, + "step": 17665 + }, + { + "epoch": 85.77481840193704, + "grad_norm": 3.287372010163381e-07, + "learning_rate": 0.17730829973400827, + "loss": 0.0, + "num_input_tokens_seen": 30277568, + "step": 17670 + }, + { + "epoch": 85.79903147699758, + "grad_norm": 7.860678579163505e-07, + "learning_rate": 0.17725037716872602, + "loss": 0.0, + "num_input_tokens_seen": 30286304, + "step": 17675 + }, + { + "epoch": 85.82324455205811, + "grad_norm": 4.86376961816859e-07, + "learning_rate": 0.17719245040109313, + "loss": 0.0, + "num_input_tokens_seen": 30295360, + "step": 17680 + }, + { + "epoch": 85.84745762711864, + "grad_norm": 1.952111688297009e-06, + "learning_rate": 0.17713451944004271, + "loss": 0.0, + "num_input_tokens_seen": 30303680, + "step": 17685 + }, + { + "epoch": 85.87167070217917, + "grad_norm": 7.303191296159639e-07, + "learning_rate": 0.17707658429450843, + "loss": 0.0, + "num_input_tokens_seen": 30312128, + "step": 17690 + }, + { + "epoch": 85.8958837772397, + "grad_norm": 1.012876509776106e-06, + "learning_rate": 0.1770186449734245, + "loss": 0.0, + "num_input_tokens_seen": 30320928, + "step": 17695 + }, + { + "epoch": 85.92009685230025, + "grad_norm": 1.2394323221087689e-06, + "learning_rate": 0.17696070148572599, + "loss": 0.0, + "num_input_tokens_seen": 30329824, + "step": 17700 + }, + { + "epoch": 85.94430992736078, + "grad_norm": 7.392189900201629e-07, + "learning_rate": 0.17690275384034856, + "loss": 0.0, + "num_input_tokens_seen": 30338176, + "step": 17705 + }, + { + "epoch": 85.9685230024213, + "grad_norm": 4.2041034475914785e-07, + "learning_rate": 0.17684480204622835, + "loss": 0.0, + "num_input_tokens_seen": 30346752, + "step": 17710 + }, + { + "epoch": 85.99273607748184, + "grad_norm": 6.890136319270823e-07, + "learning_rate": 0.1767868461123023, + "loss": 0.0, + "num_input_tokens_seen": 30355552, + "step": 17715 + }, + { + "epoch": 86.01937046004842, + "grad_norm": 3.7542312725236116e-07, + "learning_rate": 0.176728886047508, + "loss": 0.0, + "num_input_tokens_seen": 30364448, + "step": 17720 + }, + { + "epoch": 86.04358353510897, + "grad_norm": 8.849434038893378e-07, + "learning_rate": 0.17667092186078362, + "loss": 0.0, + "num_input_tokens_seen": 30373184, + "step": 17725 + }, + { + "epoch": 86.0677966101695, + "grad_norm": 3.9873492596598226e-07, + "learning_rate": 0.17661295356106785, + "loss": 0.0, + "num_input_tokens_seen": 30381600, + "step": 17730 + }, + { + "epoch": 86.09200968523002, + "grad_norm": 1.0421912293168134e-06, + "learning_rate": 0.1765549811573002, + "loss": 0.0, + "num_input_tokens_seen": 30390208, + "step": 17735 + }, + { + "epoch": 86.11622276029055, + "grad_norm": 5.586370548371633e-07, + "learning_rate": 0.17649700465842078, + "loss": 0.0, + "num_input_tokens_seen": 30398368, + "step": 17740 + }, + { + "epoch": 86.14043583535108, + "grad_norm": 1.0590470083116088e-06, + "learning_rate": 0.17643902407337023, + "loss": 0.0, + "num_input_tokens_seen": 30407168, + "step": 17745 + }, + { + "epoch": 86.16464891041163, + "grad_norm": 7.8423562399621e-07, + "learning_rate": 0.17638103941108993, + "loss": 0.0, + "num_input_tokens_seen": 30415840, + "step": 17750 + }, + { + "epoch": 86.18886198547216, + "grad_norm": 3.176041900587734e-07, + "learning_rate": 0.1763230506805218, + "loss": 0.0, + "num_input_tokens_seen": 30424576, + "step": 17755 + }, + { + "epoch": 86.21307506053269, + "grad_norm": 9.461479066885659e-07, + "learning_rate": 0.1762650578906085, + "loss": 0.0, + "num_input_tokens_seen": 30432928, + "step": 17760 + }, + { + "epoch": 86.23728813559322, + "grad_norm": 1.1527706647029845e-06, + "learning_rate": 0.1762070610502932, + "loss": 0.0, + "num_input_tokens_seen": 30441120, + "step": 17765 + }, + { + "epoch": 86.26150121065375, + "grad_norm": 7.54605139263731e-07, + "learning_rate": 0.17614906016851975, + "loss": 0.0, + "num_input_tokens_seen": 30449536, + "step": 17770 + }, + { + "epoch": 86.28571428571429, + "grad_norm": 6.404734449461102e-07, + "learning_rate": 0.17609105525423258, + "loss": 0.0, + "num_input_tokens_seen": 30458208, + "step": 17775 + }, + { + "epoch": 86.30992736077482, + "grad_norm": 8.974488423518778e-07, + "learning_rate": 0.1760330463163768, + "loss": 0.0, + "num_input_tokens_seen": 30466624, + "step": 17780 + }, + { + "epoch": 86.33414043583535, + "grad_norm": 3.3462362125646905e-07, + "learning_rate": 0.17597503336389816, + "loss": 0.0, + "num_input_tokens_seen": 30475328, + "step": 17785 + }, + { + "epoch": 86.35835351089588, + "grad_norm": 7.014767788859899e-07, + "learning_rate": 0.17591701640574298, + "loss": 0.0, + "num_input_tokens_seen": 30484192, + "step": 17790 + }, + { + "epoch": 86.38256658595641, + "grad_norm": 6.695534011669224e-07, + "learning_rate": 0.17585899545085815, + "loss": 0.0, + "num_input_tokens_seen": 30492736, + "step": 17795 + }, + { + "epoch": 86.40677966101696, + "grad_norm": 6.267552521421749e-07, + "learning_rate": 0.17580097050819124, + "loss": 0.0, + "num_input_tokens_seen": 30501440, + "step": 17800 + }, + { + "epoch": 86.40677966101696, + "eval_loss": 0.8438611030578613, + "eval_runtime": 4.609, + "eval_samples_per_second": 79.627, + "eval_steps_per_second": 19.961, + "num_input_tokens_seen": 30501440, + "step": 17800 + }, + { + "epoch": 86.43099273607749, + "grad_norm": 9.371934197588416e-07, + "learning_rate": 0.17574294158669046, + "loss": 0.0, + "num_input_tokens_seen": 30509664, + "step": 17805 + }, + { + "epoch": 86.45520581113801, + "grad_norm": 7.024223691587395e-07, + "learning_rate": 0.17568490869530456, + "loss": 0.0, + "num_input_tokens_seen": 30518080, + "step": 17810 + }, + { + "epoch": 86.47941888619854, + "grad_norm": 5.448144406727806e-07, + "learning_rate": 0.17562687184298295, + "loss": 0.0, + "num_input_tokens_seen": 30526816, + "step": 17815 + }, + { + "epoch": 86.50363196125907, + "grad_norm": 7.172405958044692e-07, + "learning_rate": 0.1755688310386757, + "loss": 0.0, + "num_input_tokens_seen": 30535744, + "step": 17820 + }, + { + "epoch": 86.52784503631962, + "grad_norm": 8.693339168530656e-07, + "learning_rate": 0.17551078629133335, + "loss": 0.0, + "num_input_tokens_seen": 30544480, + "step": 17825 + }, + { + "epoch": 86.55205811138015, + "grad_norm": 4.772646207129583e-07, + "learning_rate": 0.17545273760990718, + "loss": 0.0, + "num_input_tokens_seen": 30553024, + "step": 17830 + }, + { + "epoch": 86.57627118644068, + "grad_norm": 5.543877819036425e-07, + "learning_rate": 0.17539468500334904, + "loss": 0.0, + "num_input_tokens_seen": 30561440, + "step": 17835 + }, + { + "epoch": 86.60048426150121, + "grad_norm": 7.090213784977095e-07, + "learning_rate": 0.17533662848061132, + "loss": 0.0, + "num_input_tokens_seen": 30570016, + "step": 17840 + }, + { + "epoch": 86.62469733656174, + "grad_norm": 5.267094138616812e-07, + "learning_rate": 0.1752785680506471, + "loss": 0.0, + "num_input_tokens_seen": 30578656, + "step": 17845 + }, + { + "epoch": 86.64891041162228, + "grad_norm": 5.024941742703959e-07, + "learning_rate": 0.17522050372241, + "loss": 0.0, + "num_input_tokens_seen": 30586976, + "step": 17850 + }, + { + "epoch": 86.67312348668281, + "grad_norm": 4.7407189640580327e-07, + "learning_rate": 0.17516243550485425, + "loss": 0.0, + "num_input_tokens_seen": 30595776, + "step": 17855 + }, + { + "epoch": 86.69733656174334, + "grad_norm": 8.160941433743574e-07, + "learning_rate": 0.17510436340693478, + "loss": 0.0, + "num_input_tokens_seen": 30604416, + "step": 17860 + }, + { + "epoch": 86.72154963680387, + "grad_norm": 1.5968507796060294e-06, + "learning_rate": 0.175046287437607, + "loss": 0.0, + "num_input_tokens_seen": 30612896, + "step": 17865 + }, + { + "epoch": 86.7457627118644, + "grad_norm": 3.4875191090577573e-07, + "learning_rate": 0.17498820760582695, + "loss": 0.0, + "num_input_tokens_seen": 30621024, + "step": 17870 + }, + { + "epoch": 86.76997578692495, + "grad_norm": 1.1317166581648053e-06, + "learning_rate": 0.1749301239205512, + "loss": 0.0, + "num_input_tokens_seen": 30629632, + "step": 17875 + }, + { + "epoch": 86.79418886198548, + "grad_norm": 6.495916409221536e-07, + "learning_rate": 0.1748720363907371, + "loss": 0.0, + "num_input_tokens_seen": 30637856, + "step": 17880 + }, + { + "epoch": 86.818401937046, + "grad_norm": 1.770087123986741e-06, + "learning_rate": 0.17481394502534242, + "loss": 0.0, + "num_input_tokens_seen": 30646464, + "step": 17885 + }, + { + "epoch": 86.84261501210653, + "grad_norm": 9.921100172505248e-07, + "learning_rate": 0.17475584983332562, + "loss": 0.0, + "num_input_tokens_seen": 30655232, + "step": 17890 + }, + { + "epoch": 86.86682808716706, + "grad_norm": 3.515285129651602e-07, + "learning_rate": 0.17469775082364558, + "loss": 0.0, + "num_input_tokens_seen": 30663776, + "step": 17895 + }, + { + "epoch": 86.89104116222761, + "grad_norm": 5.267627898319915e-07, + "learning_rate": 0.17463964800526205, + "loss": 0.0, + "num_input_tokens_seen": 30672320, + "step": 17900 + }, + { + "epoch": 86.91525423728814, + "grad_norm": 5.270335918794444e-07, + "learning_rate": 0.17458154138713522, + "loss": 0.0, + "num_input_tokens_seen": 30680832, + "step": 17905 + }, + { + "epoch": 86.93946731234867, + "grad_norm": 1.0624154356264626e-06, + "learning_rate": 0.17452343097822576, + "loss": 0.0, + "num_input_tokens_seen": 30689280, + "step": 17910 + }, + { + "epoch": 86.9636803874092, + "grad_norm": 9.861963690127595e-07, + "learning_rate": 0.17446531678749497, + "loss": 0.0, + "num_input_tokens_seen": 30697472, + "step": 17915 + }, + { + "epoch": 86.98789346246973, + "grad_norm": 9.756709005159792e-07, + "learning_rate": 0.17440719882390496, + "loss": 0.0, + "num_input_tokens_seen": 30705856, + "step": 17920 + }, + { + "epoch": 87.01452784503633, + "grad_norm": 3.2505488434253493e-07, + "learning_rate": 0.17434907709641814, + "loss": 0.0, + "num_input_tokens_seen": 30714912, + "step": 17925 + }, + { + "epoch": 87.03874092009686, + "grad_norm": 5.901997042201401e-07, + "learning_rate": 0.17429095161399769, + "loss": 0.0, + "num_input_tokens_seen": 30723488, + "step": 17930 + }, + { + "epoch": 87.06295399515739, + "grad_norm": 8.171163017323124e-07, + "learning_rate": 0.1742328223856072, + "loss": 0.0, + "num_input_tokens_seen": 30731968, + "step": 17935 + }, + { + "epoch": 87.08716707021792, + "grad_norm": 8.888308684618096e-07, + "learning_rate": 0.174174689420211, + "loss": 0.0, + "num_input_tokens_seen": 30740544, + "step": 17940 + }, + { + "epoch": 87.11138014527845, + "grad_norm": 2.221496657739408e-07, + "learning_rate": 0.1741165527267739, + "loss": 0.0, + "num_input_tokens_seen": 30749184, + "step": 17945 + }, + { + "epoch": 87.13559322033899, + "grad_norm": 8.14647819424863e-07, + "learning_rate": 0.17405841231426125, + "loss": 0.0, + "num_input_tokens_seen": 30757888, + "step": 17950 + }, + { + "epoch": 87.15980629539952, + "grad_norm": 2.9389676114988106e-07, + "learning_rate": 0.1740002681916391, + "loss": 0.0, + "num_input_tokens_seen": 30766400, + "step": 17955 + }, + { + "epoch": 87.18401937046005, + "grad_norm": 4.941040856465406e-07, + "learning_rate": 0.17394212036787401, + "loss": 0.0, + "num_input_tokens_seen": 30774848, + "step": 17960 + }, + { + "epoch": 87.20823244552058, + "grad_norm": 1.852393552326248e-06, + "learning_rate": 0.1738839688519331, + "loss": 0.0, + "num_input_tokens_seen": 30783264, + "step": 17965 + }, + { + "epoch": 87.23244552058111, + "grad_norm": 4.5274316562426975e-07, + "learning_rate": 0.17382581365278402, + "loss": 0.0, + "num_input_tokens_seen": 30791456, + "step": 17970 + }, + { + "epoch": 87.25665859564165, + "grad_norm": 3.0537884754266997e-07, + "learning_rate": 0.17376765477939507, + "loss": 0.0, + "num_input_tokens_seen": 30800032, + "step": 17975 + }, + { + "epoch": 87.28087167070218, + "grad_norm": 5.375276828090136e-07, + "learning_rate": 0.1737094922407351, + "loss": 0.0, + "num_input_tokens_seen": 30808640, + "step": 17980 + }, + { + "epoch": 87.30508474576271, + "grad_norm": 6.185085226206866e-07, + "learning_rate": 0.1736513260457734, + "loss": 0.0, + "num_input_tokens_seen": 30817376, + "step": 17985 + }, + { + "epoch": 87.32929782082324, + "grad_norm": 3.5345033211342525e-07, + "learning_rate": 0.17359315620348006, + "loss": 0.0, + "num_input_tokens_seen": 30825952, + "step": 17990 + }, + { + "epoch": 87.35351089588377, + "grad_norm": 1.0409487458673539e-06, + "learning_rate": 0.17353498272282547, + "loss": 0.0, + "num_input_tokens_seen": 30834176, + "step": 17995 + }, + { + "epoch": 87.37772397094432, + "grad_norm": 5.083019232188235e-07, + "learning_rate": 0.17347680561278087, + "loss": 0.0, + "num_input_tokens_seen": 30843072, + "step": 18000 + }, + { + "epoch": 87.37772397094432, + "eval_loss": 0.8497471809387207, + "eval_runtime": 4.621, + "eval_samples_per_second": 79.419, + "eval_steps_per_second": 19.909, + "num_input_tokens_seen": 30843072, + "step": 18000 + }, + { + "epoch": 87.40193704600485, + "grad_norm": 6.60022749343625e-07, + "learning_rate": 0.1734186248823178, + "loss": 0.0, + "num_input_tokens_seen": 30851488, + "step": 18005 + }, + { + "epoch": 87.42615012106538, + "grad_norm": 7.412168656628637e-07, + "learning_rate": 0.17336044054040844, + "loss": 0.0, + "num_input_tokens_seen": 30859872, + "step": 18010 + }, + { + "epoch": 87.4503631961259, + "grad_norm": 2.9867433681829425e-07, + "learning_rate": 0.1733022525960256, + "loss": 0.0, + "num_input_tokens_seen": 30868480, + "step": 18015 + }, + { + "epoch": 87.47457627118644, + "grad_norm": 7.628489697708574e-07, + "learning_rate": 0.1732440610581426, + "loss": 0.0, + "num_input_tokens_seen": 30876928, + "step": 18020 + }, + { + "epoch": 87.49878934624698, + "grad_norm": 3.496980696127139e-07, + "learning_rate": 0.17318586593573326, + "loss": 0.0, + "num_input_tokens_seen": 30885408, + "step": 18025 + }, + { + "epoch": 87.52300242130751, + "grad_norm": 6.590880161638779e-07, + "learning_rate": 0.17312766723777204, + "loss": 0.0, + "num_input_tokens_seen": 30894080, + "step": 18030 + }, + { + "epoch": 87.54721549636804, + "grad_norm": 6.637450269408873e-07, + "learning_rate": 0.1730694649732339, + "loss": 0.0, + "num_input_tokens_seen": 30902592, + "step": 18035 + }, + { + "epoch": 87.57142857142857, + "grad_norm": 8.94892707492545e-07, + "learning_rate": 0.17301125915109428, + "loss": 0.0, + "num_input_tokens_seen": 30911040, + "step": 18040 + }, + { + "epoch": 87.5956416464891, + "grad_norm": 6.087016117817257e-07, + "learning_rate": 0.17295304978032938, + "loss": 0.0, + "num_input_tokens_seen": 30919552, + "step": 18045 + }, + { + "epoch": 87.61985472154964, + "grad_norm": 3.222069722141896e-07, + "learning_rate": 0.17289483686991577, + "loss": 0.0, + "num_input_tokens_seen": 30927648, + "step": 18050 + }, + { + "epoch": 87.64406779661017, + "grad_norm": 6.162760541883472e-07, + "learning_rate": 0.1728366204288306, + "loss": 0.0, + "num_input_tokens_seen": 30936608, + "step": 18055 + }, + { + "epoch": 87.6682808716707, + "grad_norm": 6.78821777455596e-07, + "learning_rate": 0.17277840046605153, + "loss": 0.0, + "num_input_tokens_seen": 30945376, + "step": 18060 + }, + { + "epoch": 87.69249394673123, + "grad_norm": 3.223358078230376e-07, + "learning_rate": 0.17272017699055686, + "loss": 0.0, + "num_input_tokens_seen": 30954272, + "step": 18065 + }, + { + "epoch": 87.71670702179176, + "grad_norm": 3.2036632546805777e-07, + "learning_rate": 0.17266195001132542, + "loss": 0.0, + "num_input_tokens_seen": 30962976, + "step": 18070 + }, + { + "epoch": 87.7409200968523, + "grad_norm": 5.635807838189066e-07, + "learning_rate": 0.17260371953733647, + "loss": 0.0, + "num_input_tokens_seen": 30971616, + "step": 18075 + }, + { + "epoch": 87.76513317191284, + "grad_norm": 4.19856320377221e-07, + "learning_rate": 0.1725454855775699, + "loss": 0.0, + "num_input_tokens_seen": 30980320, + "step": 18080 + }, + { + "epoch": 87.78934624697337, + "grad_norm": 1.016801093101094e-06, + "learning_rate": 0.17248724814100616, + "loss": 0.0, + "num_input_tokens_seen": 30989376, + "step": 18085 + }, + { + "epoch": 87.8135593220339, + "grad_norm": 6.666593321824621e-07, + "learning_rate": 0.17242900723662619, + "loss": 0.0, + "num_input_tokens_seen": 30998112, + "step": 18090 + }, + { + "epoch": 87.83777239709443, + "grad_norm": 3.6040719919583353e-07, + "learning_rate": 0.1723707628734114, + "loss": 0.0, + "num_input_tokens_seen": 31006784, + "step": 18095 + }, + { + "epoch": 87.86198547215497, + "grad_norm": 4.2004737110801216e-07, + "learning_rate": 0.1723125150603438, + "loss": 0.0, + "num_input_tokens_seen": 31015328, + "step": 18100 + }, + { + "epoch": 87.8861985472155, + "grad_norm": 1.7062731672012887e-07, + "learning_rate": 0.1722542638064061, + "loss": 0.0, + "num_input_tokens_seen": 31023712, + "step": 18105 + }, + { + "epoch": 87.91041162227603, + "grad_norm": 5.244577323537669e-07, + "learning_rate": 0.17219600912058117, + "loss": 0.0, + "num_input_tokens_seen": 31032416, + "step": 18110 + }, + { + "epoch": 87.93462469733656, + "grad_norm": 5.963304943179537e-07, + "learning_rate": 0.17213775101185272, + "loss": 0.0, + "num_input_tokens_seen": 31041216, + "step": 18115 + }, + { + "epoch": 87.95883777239709, + "grad_norm": 3.6204414755047765e-07, + "learning_rate": 0.17207948948920485, + "loss": 0.0, + "num_input_tokens_seen": 31049824, + "step": 18120 + }, + { + "epoch": 87.98305084745763, + "grad_norm": 1.1505134125400218e-06, + "learning_rate": 0.17202122456162228, + "loss": 0.0, + "num_input_tokens_seen": 31058240, + "step": 18125 + }, + { + "epoch": 88.00968523002422, + "grad_norm": 1.2713414889731212e-06, + "learning_rate": 0.17196295623809013, + "loss": 0.0, + "num_input_tokens_seen": 31067104, + "step": 18130 + }, + { + "epoch": 88.03389830508475, + "grad_norm": 6.728031962666137e-07, + "learning_rate": 0.1719046845275941, + "loss": 0.0, + "num_input_tokens_seen": 31075904, + "step": 18135 + }, + { + "epoch": 88.05811138014528, + "grad_norm": 1.0387020665802993e-06, + "learning_rate": 0.17184640943912044, + "loss": 0.0, + "num_input_tokens_seen": 31084352, + "step": 18140 + }, + { + "epoch": 88.08232445520581, + "grad_norm": 3.730042692495772e-07, + "learning_rate": 0.1717881309816559, + "loss": 0.0, + "num_input_tokens_seen": 31092544, + "step": 18145 + }, + { + "epoch": 88.10653753026634, + "grad_norm": 5.198804728934192e-07, + "learning_rate": 0.1717298491641878, + "loss": 0.0, + "num_input_tokens_seen": 31101248, + "step": 18150 + }, + { + "epoch": 88.13075060532688, + "grad_norm": 5.021900619794906e-07, + "learning_rate": 0.17167156399570385, + "loss": 0.0, + "num_input_tokens_seen": 31109792, + "step": 18155 + }, + { + "epoch": 88.15496368038741, + "grad_norm": 3.387317519809585e-07, + "learning_rate": 0.17161327548519242, + "loss": 0.0, + "num_input_tokens_seen": 31118208, + "step": 18160 + }, + { + "epoch": 88.17917675544794, + "grad_norm": 9.377520768794056e-07, + "learning_rate": 0.1715549836416423, + "loss": 0.0, + "num_input_tokens_seen": 31126656, + "step": 18165 + }, + { + "epoch": 88.20338983050847, + "grad_norm": 3.880512906562217e-07, + "learning_rate": 0.17149668847404279, + "loss": 0.0, + "num_input_tokens_seen": 31135232, + "step": 18170 + }, + { + "epoch": 88.227602905569, + "grad_norm": 8.076266340140137e-07, + "learning_rate": 0.1714383899913838, + "loss": 0.0, + "num_input_tokens_seen": 31143872, + "step": 18175 + }, + { + "epoch": 88.25181598062954, + "grad_norm": 3.11833503019443e-07, + "learning_rate": 0.17138008820265563, + "loss": 0.0, + "num_input_tokens_seen": 31152544, + "step": 18180 + }, + { + "epoch": 88.27602905569007, + "grad_norm": 3.8196017726477294e-07, + "learning_rate": 0.17132178311684917, + "loss": 0.0, + "num_input_tokens_seen": 31161312, + "step": 18185 + }, + { + "epoch": 88.3002421307506, + "grad_norm": 2.264857386080621e-07, + "learning_rate": 0.1712634747429559, + "loss": 0.0, + "num_input_tokens_seen": 31170112, + "step": 18190 + }, + { + "epoch": 88.32445520581113, + "grad_norm": 8.277792744593171e-07, + "learning_rate": 0.17120516308996753, + "loss": 0.0, + "num_input_tokens_seen": 31178912, + "step": 18195 + }, + { + "epoch": 88.34866828087166, + "grad_norm": 5.641002189804567e-07, + "learning_rate": 0.17114684816687653, + "loss": 0.0, + "num_input_tokens_seen": 31187360, + "step": 18200 + }, + { + "epoch": 88.34866828087166, + "eval_loss": 0.8595455884933472, + "eval_runtime": 4.7287, + "eval_samples_per_second": 77.611, + "eval_steps_per_second": 19.456, + "num_input_tokens_seen": 31187360, + "step": 18200 + }, + { + "epoch": 88.37288135593221, + "grad_norm": 1.1407017836972955e-06, + "learning_rate": 0.17108852998267585, + "loss": 0.0, + "num_input_tokens_seen": 31196160, + "step": 18205 + }, + { + "epoch": 88.39709443099274, + "grad_norm": 1.0581858305158676e-06, + "learning_rate": 0.17103020854635878, + "loss": 0.0, + "num_input_tokens_seen": 31204512, + "step": 18210 + }, + { + "epoch": 88.42130750605327, + "grad_norm": 6.605417297578242e-07, + "learning_rate": 0.1709718838669193, + "loss": 0.0, + "num_input_tokens_seen": 31212704, + "step": 18215 + }, + { + "epoch": 88.4455205811138, + "grad_norm": 6.808133434788033e-07, + "learning_rate": 0.17091355595335173, + "loss": 0.0, + "num_input_tokens_seen": 31221440, + "step": 18220 + }, + { + "epoch": 88.46973365617433, + "grad_norm": 1.0758591315607191e-06, + "learning_rate": 0.17085522481465107, + "loss": 0.0, + "num_input_tokens_seen": 31229696, + "step": 18225 + }, + { + "epoch": 88.49394673123487, + "grad_norm": 8.784173246567661e-07, + "learning_rate": 0.17079689045981264, + "loss": 0.0, + "num_input_tokens_seen": 31238144, + "step": 18230 + }, + { + "epoch": 88.5181598062954, + "grad_norm": 9.498182294009894e-07, + "learning_rate": 0.17073855289783238, + "loss": 0.0, + "num_input_tokens_seen": 31246784, + "step": 18235 + }, + { + "epoch": 88.54237288135593, + "grad_norm": 2.417812652311113e-07, + "learning_rate": 0.1706802121377066, + "loss": 0.0, + "num_input_tokens_seen": 31255552, + "step": 18240 + }, + { + "epoch": 88.56658595641646, + "grad_norm": 7.419324106194836e-07, + "learning_rate": 0.17062186818843225, + "loss": 0.0, + "num_input_tokens_seen": 31263872, + "step": 18245 + }, + { + "epoch": 88.59079903147699, + "grad_norm": 9.507922413831693e-07, + "learning_rate": 0.17056352105900668, + "loss": 0.0, + "num_input_tokens_seen": 31272576, + "step": 18250 + }, + { + "epoch": 88.61501210653753, + "grad_norm": 1.061639295585337e-06, + "learning_rate": 0.17050517075842772, + "loss": 0.0, + "num_input_tokens_seen": 31281376, + "step": 18255 + }, + { + "epoch": 88.63922518159806, + "grad_norm": 4.538421762845246e-07, + "learning_rate": 0.17044681729569375, + "loss": 0.0, + "num_input_tokens_seen": 31290464, + "step": 18260 + }, + { + "epoch": 88.6634382566586, + "grad_norm": 6.420083877856086e-07, + "learning_rate": 0.17038846067980365, + "loss": 0.0, + "num_input_tokens_seen": 31298944, + "step": 18265 + }, + { + "epoch": 88.68765133171912, + "grad_norm": 4.7125570290518226e-07, + "learning_rate": 0.17033010091975664, + "loss": 0.0, + "num_input_tokens_seen": 31307520, + "step": 18270 + }, + { + "epoch": 88.71186440677967, + "grad_norm": 5.940861456110724e-07, + "learning_rate": 0.17027173802455262, + "loss": 0.0, + "num_input_tokens_seen": 31316064, + "step": 18275 + }, + { + "epoch": 88.7360774818402, + "grad_norm": 4.0082281316244917e-07, + "learning_rate": 0.1702133720031918, + "loss": 0.0, + "num_input_tokens_seen": 31324768, + "step": 18280 + }, + { + "epoch": 88.76029055690073, + "grad_norm": 5.845757868883084e-07, + "learning_rate": 0.17015500286467503, + "loss": 0.0, + "num_input_tokens_seen": 31333088, + "step": 18285 + }, + { + "epoch": 88.78450363196126, + "grad_norm": 1.1836704061352066e-06, + "learning_rate": 0.17009663061800354, + "loss": 0.0, + "num_input_tokens_seen": 31341600, + "step": 18290 + }, + { + "epoch": 88.80871670702179, + "grad_norm": 4.414110605921451e-07, + "learning_rate": 0.17003825527217903, + "loss": 0.0, + "num_input_tokens_seen": 31350016, + "step": 18295 + }, + { + "epoch": 88.83292978208233, + "grad_norm": 1.6086976302176481e-06, + "learning_rate": 0.16997987683620377, + "loss": 0.0, + "num_input_tokens_seen": 31358592, + "step": 18300 + }, + { + "epoch": 88.85714285714286, + "grad_norm": 3.2996914001159894e-07, + "learning_rate": 0.16992149531908043, + "loss": 0.0, + "num_input_tokens_seen": 31367072, + "step": 18305 + }, + { + "epoch": 88.88135593220339, + "grad_norm": 5.335477908374742e-07, + "learning_rate": 0.16986311072981214, + "loss": 0.0, + "num_input_tokens_seen": 31375712, + "step": 18310 + }, + { + "epoch": 88.90556900726392, + "grad_norm": 3.4424169825797435e-07, + "learning_rate": 0.16980472307740255, + "loss": 0.0, + "num_input_tokens_seen": 31384352, + "step": 18315 + }, + { + "epoch": 88.92978208232445, + "grad_norm": 5.180495463719126e-07, + "learning_rate": 0.1697463323708558, + "loss": 0.0, + "num_input_tokens_seen": 31392576, + "step": 18320 + }, + { + "epoch": 88.953995157385, + "grad_norm": 3.708610734065587e-07, + "learning_rate": 0.16968793861917641, + "loss": 0.0, + "num_input_tokens_seen": 31400896, + "step": 18325 + }, + { + "epoch": 88.97820823244552, + "grad_norm": 1.1750125139542433e-07, + "learning_rate": 0.16962954183136952, + "loss": 0.0, + "num_input_tokens_seen": 31409504, + "step": 18330 + }, + { + "epoch": 89.00484261501211, + "grad_norm": 6.90084107191069e-07, + "learning_rate": 0.16957114201644058, + "loss": 0.0, + "num_input_tokens_seen": 31418816, + "step": 18335 + }, + { + "epoch": 89.02905569007264, + "grad_norm": 3.9995521206037665e-07, + "learning_rate": 0.16951273918339563, + "loss": 0.0, + "num_input_tokens_seen": 31427584, + "step": 18340 + }, + { + "epoch": 89.05326876513317, + "grad_norm": 6.48764910238242e-07, + "learning_rate": 0.16945433334124105, + "loss": 0.0, + "num_input_tokens_seen": 31435520, + "step": 18345 + }, + { + "epoch": 89.0774818401937, + "grad_norm": 8.092869165921002e-07, + "learning_rate": 0.1693959244989838, + "loss": 0.0, + "num_input_tokens_seen": 31444160, + "step": 18350 + }, + { + "epoch": 89.10169491525424, + "grad_norm": 7.244987614285492e-07, + "learning_rate": 0.16933751266563127, + "loss": 0.0, + "num_input_tokens_seen": 31452608, + "step": 18355 + }, + { + "epoch": 89.12590799031477, + "grad_norm": 6.702353516629955e-07, + "learning_rate": 0.16927909785019118, + "loss": 0.0, + "num_input_tokens_seen": 31461152, + "step": 18360 + }, + { + "epoch": 89.1501210653753, + "grad_norm": 4.1378140736014757e-07, + "learning_rate": 0.169220680061672, + "loss": 0.0, + "num_input_tokens_seen": 31469408, + "step": 18365 + }, + { + "epoch": 89.17433414043583, + "grad_norm": 1.0141454822587548e-06, + "learning_rate": 0.16916225930908244, + "loss": 0.0, + "num_input_tokens_seen": 31477600, + "step": 18370 + }, + { + "epoch": 89.19854721549636, + "grad_norm": 1.1211211585759884e-06, + "learning_rate": 0.16910383560143163, + "loss": 0.0, + "num_input_tokens_seen": 31485920, + "step": 18375 + }, + { + "epoch": 89.2227602905569, + "grad_norm": 3.8813874425613903e-07, + "learning_rate": 0.16904540894772935, + "loss": 0.0, + "num_input_tokens_seen": 31494560, + "step": 18380 + }, + { + "epoch": 89.24697336561744, + "grad_norm": 6.670362040495093e-07, + "learning_rate": 0.16898697935698562, + "loss": 0.0, + "num_input_tokens_seen": 31503040, + "step": 18385 + }, + { + "epoch": 89.27118644067797, + "grad_norm": 1.06704260360857e-06, + "learning_rate": 0.1689285468382111, + "loss": 0.0, + "num_input_tokens_seen": 31511456, + "step": 18390 + }, + { + "epoch": 89.2953995157385, + "grad_norm": 4.96272434702405e-07, + "learning_rate": 0.16887011140041677, + "loss": 0.0, + "num_input_tokens_seen": 31520032, + "step": 18395 + }, + { + "epoch": 89.31961259079903, + "grad_norm": 8.219379878937616e-07, + "learning_rate": 0.1688116730526141, + "loss": 0.0, + "num_input_tokens_seen": 31528480, + "step": 18400 + }, + { + "epoch": 89.31961259079903, + "eval_loss": 0.8655370473861694, + "eval_runtime": 4.609, + "eval_samples_per_second": 79.626, + "eval_steps_per_second": 19.961, + "num_input_tokens_seen": 31528480, + "step": 18400 + }, + { + "epoch": 89.34382566585957, + "grad_norm": 2.1286376750140334e-07, + "learning_rate": 0.1687532318038151, + "loss": 0.0, + "num_input_tokens_seen": 31537120, + "step": 18405 + }, + { + "epoch": 89.3680387409201, + "grad_norm": 5.155274038770585e-07, + "learning_rate": 0.16869478766303206, + "loss": 0.0, + "num_input_tokens_seen": 31545824, + "step": 18410 + }, + { + "epoch": 89.39225181598063, + "grad_norm": 1.0392855074314866e-06, + "learning_rate": 0.16863634063927788, + "loss": 0.0, + "num_input_tokens_seen": 31554336, + "step": 18415 + }, + { + "epoch": 89.41646489104116, + "grad_norm": 2.5878719611682754e-07, + "learning_rate": 0.16857789074156568, + "loss": 0.0, + "num_input_tokens_seen": 31562976, + "step": 18420 + }, + { + "epoch": 89.44067796610169, + "grad_norm": 3.89478657325526e-07, + "learning_rate": 0.16851943797890928, + "loss": 0.0, + "num_input_tokens_seen": 31571648, + "step": 18425 + }, + { + "epoch": 89.46489104116223, + "grad_norm": 6.036513582330372e-07, + "learning_rate": 0.16846098236032284, + "loss": 0.0, + "num_input_tokens_seen": 31580224, + "step": 18430 + }, + { + "epoch": 89.48910411622276, + "grad_norm": 4.6884801463420445e-07, + "learning_rate": 0.16840252389482097, + "loss": 0.0, + "num_input_tokens_seen": 31588736, + "step": 18435 + }, + { + "epoch": 89.51331719128329, + "grad_norm": 2.8840693744314194e-07, + "learning_rate": 0.16834406259141857, + "loss": 0.0, + "num_input_tokens_seen": 31597376, + "step": 18440 + }, + { + "epoch": 89.53753026634382, + "grad_norm": 2.2198517513061233e-07, + "learning_rate": 0.16828559845913124, + "loss": 0.0, + "num_input_tokens_seen": 31605888, + "step": 18445 + }, + { + "epoch": 89.56174334140435, + "grad_norm": 8.359629077858699e-07, + "learning_rate": 0.16822713150697488, + "loss": 0.0, + "num_input_tokens_seen": 31614592, + "step": 18450 + }, + { + "epoch": 89.5859564164649, + "grad_norm": 8.992508924166032e-07, + "learning_rate": 0.16816866174396575, + "loss": 0.0, + "num_input_tokens_seen": 31623168, + "step": 18455 + }, + { + "epoch": 89.61016949152543, + "grad_norm": 7.8329554753509e-07, + "learning_rate": 0.16811018917912057, + "loss": 0.0, + "num_input_tokens_seen": 31631424, + "step": 18460 + }, + { + "epoch": 89.63438256658596, + "grad_norm": 5.745677640334179e-07, + "learning_rate": 0.16805171382145673, + "loss": 0.0, + "num_input_tokens_seen": 31640384, + "step": 18465 + }, + { + "epoch": 89.65859564164649, + "grad_norm": 3.9426282683052705e-07, + "learning_rate": 0.16799323567999175, + "loss": 0.0, + "num_input_tokens_seen": 31648736, + "step": 18470 + }, + { + "epoch": 89.68280871670702, + "grad_norm": 6.15369515344355e-07, + "learning_rate": 0.16793475476374367, + "loss": 0.0, + "num_input_tokens_seen": 31657632, + "step": 18475 + }, + { + "epoch": 89.70702179176756, + "grad_norm": 4.192077653897286e-07, + "learning_rate": 0.1678762710817311, + "loss": 0.0, + "num_input_tokens_seen": 31666048, + "step": 18480 + }, + { + "epoch": 89.73123486682809, + "grad_norm": 6.951434556867753e-07, + "learning_rate": 0.1678177846429728, + "loss": 0.0, + "num_input_tokens_seen": 31674432, + "step": 18485 + }, + { + "epoch": 89.75544794188862, + "grad_norm": 3.738104510375706e-07, + "learning_rate": 0.16775929545648827, + "loss": 0.0, + "num_input_tokens_seen": 31683232, + "step": 18490 + }, + { + "epoch": 89.77966101694915, + "grad_norm": 3.8400196444854373e-07, + "learning_rate": 0.16770080353129715, + "loss": 0.0, + "num_input_tokens_seen": 31691840, + "step": 18495 + }, + { + "epoch": 89.80387409200968, + "grad_norm": 8.006398388715752e-07, + "learning_rate": 0.16764230887641968, + "loss": 0.0, + "num_input_tokens_seen": 31700160, + "step": 18500 + }, + { + "epoch": 89.82808716707022, + "grad_norm": 7.726230251137167e-07, + "learning_rate": 0.1675838115008765, + "loss": 0.0, + "num_input_tokens_seen": 31708672, + "step": 18505 + }, + { + "epoch": 89.85230024213075, + "grad_norm": 9.075906746147666e-07, + "learning_rate": 0.1675253114136886, + "loss": 0.0, + "num_input_tokens_seen": 31717184, + "step": 18510 + }, + { + "epoch": 89.87651331719128, + "grad_norm": 4.5431391981765046e-07, + "learning_rate": 0.16746680862387747, + "loss": 0.0, + "num_input_tokens_seen": 31725856, + "step": 18515 + }, + { + "epoch": 89.90072639225181, + "grad_norm": 4.6048143076404813e-07, + "learning_rate": 0.16740830314046493, + "loss": 0.0, + "num_input_tokens_seen": 31734528, + "step": 18520 + }, + { + "epoch": 89.92493946731234, + "grad_norm": 3.222271516278852e-07, + "learning_rate": 0.1673497949724733, + "loss": 0.0, + "num_input_tokens_seen": 31743008, + "step": 18525 + }, + { + "epoch": 89.94915254237289, + "grad_norm": 3.345981554048194e-07, + "learning_rate": 0.16729128412892522, + "loss": 0.0, + "num_input_tokens_seen": 31751616, + "step": 18530 + }, + { + "epoch": 89.97336561743342, + "grad_norm": 1.998602527919502e-07, + "learning_rate": 0.16723277061884384, + "loss": 0.0, + "num_input_tokens_seen": 31759968, + "step": 18535 + }, + { + "epoch": 89.99757869249395, + "grad_norm": 4.7771311528777e-07, + "learning_rate": 0.16717425445125267, + "loss": 0.0, + "num_input_tokens_seen": 31769152, + "step": 18540 + }, + { + "epoch": 90.02421307506053, + "grad_norm": 6.142241204543097e-07, + "learning_rate": 0.16711573563517565, + "loss": 0.0, + "num_input_tokens_seen": 31778400, + "step": 18545 + }, + { + "epoch": 90.04842615012106, + "grad_norm": 5.870996915291471e-07, + "learning_rate": 0.1670572141796371, + "loss": 0.0, + "num_input_tokens_seen": 31787072, + "step": 18550 + }, + { + "epoch": 90.0726392251816, + "grad_norm": 4.725124824744853e-07, + "learning_rate": 0.16699869009366175, + "loss": 0.0, + "num_input_tokens_seen": 31795200, + "step": 18555 + }, + { + "epoch": 90.09685230024213, + "grad_norm": 5.358130010790774e-07, + "learning_rate": 0.1669401633862748, + "loss": 0.0, + "num_input_tokens_seen": 31803552, + "step": 18560 + }, + { + "epoch": 90.12106537530266, + "grad_norm": 9.105392564379144e-07, + "learning_rate": 0.16688163406650178, + "loss": 0.0, + "num_input_tokens_seen": 31812224, + "step": 18565 + }, + { + "epoch": 90.1452784503632, + "grad_norm": 7.619604502906441e-07, + "learning_rate": 0.1668231021433686, + "loss": 0.0, + "num_input_tokens_seen": 31820928, + "step": 18570 + }, + { + "epoch": 90.16949152542372, + "grad_norm": 8.754212785788695e-07, + "learning_rate": 0.1667645676259017, + "loss": 0.0, + "num_input_tokens_seen": 31829472, + "step": 18575 + }, + { + "epoch": 90.19370460048427, + "grad_norm": 2.9930450295978517e-07, + "learning_rate": 0.1667060305231277, + "loss": 0.0, + "num_input_tokens_seen": 31838304, + "step": 18580 + }, + { + "epoch": 90.2179176755448, + "grad_norm": 6.684384743493865e-07, + "learning_rate": 0.16664749084407396, + "loss": 0.0, + "num_input_tokens_seen": 31847072, + "step": 18585 + }, + { + "epoch": 90.24213075060533, + "grad_norm": 5.989516012050444e-07, + "learning_rate": 0.16658894859776788, + "loss": 0.0, + "num_input_tokens_seen": 31855328, + "step": 18590 + }, + { + "epoch": 90.26634382566586, + "grad_norm": 3.6675032788480166e-07, + "learning_rate": 0.16653040379323752, + "loss": 0.0, + "num_input_tokens_seen": 31864032, + "step": 18595 + }, + { + "epoch": 90.29055690072639, + "grad_norm": 2.536282863729866e-07, + "learning_rate": 0.16647185643951107, + "loss": 0.0, + "num_input_tokens_seen": 31872544, + "step": 18600 + }, + { + "epoch": 90.29055690072639, + "eval_loss": 0.873073935508728, + "eval_runtime": 4.6129, + "eval_samples_per_second": 79.559, + "eval_steps_per_second": 19.944, + "num_input_tokens_seen": 31872544, + "step": 18600 + }, + { + "epoch": 90.31476997578693, + "grad_norm": 6.166652610772871e-07, + "learning_rate": 0.1664133065456174, + "loss": 0.0, + "num_input_tokens_seen": 31881056, + "step": 18605 + }, + { + "epoch": 90.33898305084746, + "grad_norm": 3.271112518632435e-07, + "learning_rate": 0.1663547541205856, + "loss": 0.0, + "num_input_tokens_seen": 31889536, + "step": 18610 + }, + { + "epoch": 90.36319612590799, + "grad_norm": 3.5578389656620857e-07, + "learning_rate": 0.16629619917344518, + "loss": 0.0, + "num_input_tokens_seen": 31898272, + "step": 18615 + }, + { + "epoch": 90.38740920096852, + "grad_norm": 7.941897024466016e-07, + "learning_rate": 0.16623764171322605, + "loss": 0.0, + "num_input_tokens_seen": 31906720, + "step": 18620 + }, + { + "epoch": 90.41162227602905, + "grad_norm": 2.622719819100894e-07, + "learning_rate": 0.1661790817489585, + "loss": 0.0, + "num_input_tokens_seen": 31915360, + "step": 18625 + }, + { + "epoch": 90.4358353510896, + "grad_norm": 7.98112864686118e-07, + "learning_rate": 0.16612051928967328, + "loss": 0.0, + "num_input_tokens_seen": 31924288, + "step": 18630 + }, + { + "epoch": 90.46004842615012, + "grad_norm": 6.353282060445054e-07, + "learning_rate": 0.16606195434440138, + "loss": 0.0, + "num_input_tokens_seen": 31933056, + "step": 18635 + }, + { + "epoch": 90.48426150121065, + "grad_norm": 5.440494419417519e-07, + "learning_rate": 0.16600338692217426, + "loss": 0.0, + "num_input_tokens_seen": 31941312, + "step": 18640 + }, + { + "epoch": 90.50847457627118, + "grad_norm": 5.179568347557506e-07, + "learning_rate": 0.16594481703202374, + "loss": 0.0, + "num_input_tokens_seen": 31949376, + "step": 18645 + }, + { + "epoch": 90.53268765133171, + "grad_norm": 1.0513888355490053e-06, + "learning_rate": 0.1658862446829821, + "loss": 0.0, + "num_input_tokens_seen": 31958048, + "step": 18650 + }, + { + "epoch": 90.55690072639226, + "grad_norm": 6.410841137949319e-07, + "learning_rate": 0.16582766988408187, + "loss": 0.0, + "num_input_tokens_seen": 31966624, + "step": 18655 + }, + { + "epoch": 90.58111380145279, + "grad_norm": 4.1168641473632306e-07, + "learning_rate": 0.16576909264435608, + "loss": 0.0, + "num_input_tokens_seen": 31975200, + "step": 18660 + }, + { + "epoch": 90.60532687651332, + "grad_norm": 3.220753796995268e-07, + "learning_rate": 0.16571051297283798, + "loss": 0.0, + "num_input_tokens_seen": 31983712, + "step": 18665 + }, + { + "epoch": 90.62953995157385, + "grad_norm": 3.253298643812741e-07, + "learning_rate": 0.16565193087856137, + "loss": 0.0, + "num_input_tokens_seen": 31992128, + "step": 18670 + }, + { + "epoch": 90.65375302663438, + "grad_norm": 5.330762178346049e-07, + "learning_rate": 0.16559334637056033, + "loss": 0.0, + "num_input_tokens_seen": 32000640, + "step": 18675 + }, + { + "epoch": 90.67796610169492, + "grad_norm": 8.04826242983836e-07, + "learning_rate": 0.16553475945786933, + "loss": 0.0, + "num_input_tokens_seen": 32009120, + "step": 18680 + }, + { + "epoch": 90.70217917675545, + "grad_norm": 2.0604423411896278e-07, + "learning_rate": 0.16547617014952318, + "loss": 0.0, + "num_input_tokens_seen": 32017824, + "step": 18685 + }, + { + "epoch": 90.72639225181598, + "grad_norm": 4.532363107045967e-07, + "learning_rate": 0.1654175784545571, + "loss": 0.0, + "num_input_tokens_seen": 32026528, + "step": 18690 + }, + { + "epoch": 90.75060532687651, + "grad_norm": 6.837032060502679e-07, + "learning_rate": 0.1653589843820067, + "loss": 0.0, + "num_input_tokens_seen": 32035168, + "step": 18695 + }, + { + "epoch": 90.77481840193704, + "grad_norm": 5.815256827190751e-07, + "learning_rate": 0.1653003879409079, + "loss": 0.0, + "num_input_tokens_seen": 32043328, + "step": 18700 + }, + { + "epoch": 90.79903147699758, + "grad_norm": 2.482383081314765e-07, + "learning_rate": 0.165241789140297, + "loss": 0.0, + "num_input_tokens_seen": 32051744, + "step": 18705 + }, + { + "epoch": 90.82324455205811, + "grad_norm": 3.9845392052484385e-07, + "learning_rate": 0.16518318798921064, + "loss": 0.0, + "num_input_tokens_seen": 32059968, + "step": 18710 + }, + { + "epoch": 90.84745762711864, + "grad_norm": 5.652372578879294e-07, + "learning_rate": 0.16512458449668593, + "loss": 0.0, + "num_input_tokens_seen": 32068256, + "step": 18715 + }, + { + "epoch": 90.87167070217917, + "grad_norm": 9.45474823765835e-07, + "learning_rate": 0.1650659786717602, + "loss": 0.0, + "num_input_tokens_seen": 32076928, + "step": 18720 + }, + { + "epoch": 90.8958837772397, + "grad_norm": 9.726401231091586e-07, + "learning_rate": 0.1650073705234712, + "loss": 0.0, + "num_input_tokens_seen": 32085248, + "step": 18725 + }, + { + "epoch": 90.92009685230025, + "grad_norm": 9.052931773112505e-07, + "learning_rate": 0.16494876006085712, + "loss": 0.0, + "num_input_tokens_seen": 32094144, + "step": 18730 + }, + { + "epoch": 90.94430992736078, + "grad_norm": 3.136020438887499e-07, + "learning_rate": 0.16489014729295634, + "loss": 0.0, + "num_input_tokens_seen": 32102912, + "step": 18735 + }, + { + "epoch": 90.9685230024213, + "grad_norm": 6.115792530181352e-07, + "learning_rate": 0.16483153222880775, + "loss": 0.0, + "num_input_tokens_seen": 32111328, + "step": 18740 + }, + { + "epoch": 90.99273607748184, + "grad_norm": 5.334499064701959e-07, + "learning_rate": 0.16477291487745052, + "loss": 0.0, + "num_input_tokens_seen": 32119904, + "step": 18745 + }, + { + "epoch": 91.01937046004842, + "grad_norm": 1.3049093183781224e-07, + "learning_rate": 0.16471429524792416, + "loss": 0.0, + "num_input_tokens_seen": 32129024, + "step": 18750 + }, + { + "epoch": 91.04358353510897, + "grad_norm": 5.359506758395582e-07, + "learning_rate": 0.16465567334926856, + "loss": 0.0, + "num_input_tokens_seen": 32137472, + "step": 18755 + }, + { + "epoch": 91.0677966101695, + "grad_norm": 2.1734176414156536e-07, + "learning_rate": 0.16459704919052395, + "loss": 0.0, + "num_input_tokens_seen": 32146144, + "step": 18760 + }, + { + "epoch": 91.09200968523002, + "grad_norm": 9.192663696921954e-07, + "learning_rate": 0.16453842278073086, + "loss": 0.0, + "num_input_tokens_seen": 32154688, + "step": 18765 + }, + { + "epoch": 91.11622276029055, + "grad_norm": 8.610869173253377e-08, + "learning_rate": 0.16447979412893038, + "loss": 0.0, + "num_input_tokens_seen": 32163584, + "step": 18770 + }, + { + "epoch": 91.14043583535108, + "grad_norm": 1.2761005052652763e-07, + "learning_rate": 0.16442116324416367, + "loss": 0.0, + "num_input_tokens_seen": 32172160, + "step": 18775 + }, + { + "epoch": 91.16464891041163, + "grad_norm": 7.661890322196996e-07, + "learning_rate": 0.1643625301354723, + "loss": 0.0, + "num_input_tokens_seen": 32180672, + "step": 18780 + }, + { + "epoch": 91.18886198547216, + "grad_norm": 5.593156515715236e-07, + "learning_rate": 0.16430389481189828, + "loss": 0.0, + "num_input_tokens_seen": 32189056, + "step": 18785 + }, + { + "epoch": 91.21307506053269, + "grad_norm": 5.621344030259934e-07, + "learning_rate": 0.164245257282484, + "loss": 0.0, + "num_input_tokens_seen": 32197376, + "step": 18790 + }, + { + "epoch": 91.23728813559322, + "grad_norm": 3.474253276181116e-07, + "learning_rate": 0.16418661755627195, + "loss": 0.0, + "num_input_tokens_seen": 32205792, + "step": 18795 + }, + { + "epoch": 91.26150121065375, + "grad_norm": 7.152859211601026e-07, + "learning_rate": 0.16412797564230527, + "loss": 0.0, + "num_input_tokens_seen": 32214560, + "step": 18800 + }, + { + "epoch": 91.26150121065375, + "eval_loss": 0.8823966383934021, + "eval_runtime": 4.637, + "eval_samples_per_second": 79.146, + "eval_steps_per_second": 19.84, + "num_input_tokens_seen": 32214560, + "step": 18800 + }, + { + "epoch": 91.28571428571429, + "grad_norm": 4.4408633925741015e-07, + "learning_rate": 0.16406933154962713, + "loss": 0.0, + "num_input_tokens_seen": 32223040, + "step": 18805 + }, + { + "epoch": 91.30992736077482, + "grad_norm": 1.0207120340055553e-06, + "learning_rate": 0.16401068528728133, + "loss": 0.0, + "num_input_tokens_seen": 32231680, + "step": 18810 + }, + { + "epoch": 91.33414043583535, + "grad_norm": 1.5605749581482087e-07, + "learning_rate": 0.16395203686431173, + "loss": 0.0, + "num_input_tokens_seen": 32240512, + "step": 18815 + }, + { + "epoch": 91.35835351089588, + "grad_norm": 7.118302391972975e-07, + "learning_rate": 0.16389338628976277, + "loss": 0.0, + "num_input_tokens_seen": 32249088, + "step": 18820 + }, + { + "epoch": 91.38256658595641, + "grad_norm": 3.044038692223694e-07, + "learning_rate": 0.163834733572679, + "loss": 0.0, + "num_input_tokens_seen": 32257728, + "step": 18825 + }, + { + "epoch": 91.40677966101696, + "grad_norm": 6.28993518603238e-07, + "learning_rate": 0.16377607872210545, + "loss": 0.0, + "num_input_tokens_seen": 32266240, + "step": 18830 + }, + { + "epoch": 91.43099273607749, + "grad_norm": 3.392565304238815e-07, + "learning_rate": 0.16371742174708748, + "loss": 0.0, + "num_input_tokens_seen": 32274400, + "step": 18835 + }, + { + "epoch": 91.45520581113801, + "grad_norm": 3.413675244701153e-07, + "learning_rate": 0.16365876265667065, + "loss": 0.0, + "num_input_tokens_seen": 32282784, + "step": 18840 + }, + { + "epoch": 91.47941888619854, + "grad_norm": 4.1351404433953576e-07, + "learning_rate": 0.163600101459901, + "loss": 0.0, + "num_input_tokens_seen": 32291200, + "step": 18845 + }, + { + "epoch": 91.50363196125907, + "grad_norm": 5.858980216544296e-07, + "learning_rate": 0.16354143816582484, + "loss": 0.0, + "num_input_tokens_seen": 32300288, + "step": 18850 + }, + { + "epoch": 91.52784503631962, + "grad_norm": 9.57608222051931e-07, + "learning_rate": 0.1634827727834887, + "loss": 0.0, + "num_input_tokens_seen": 32308704, + "step": 18855 + }, + { + "epoch": 91.55205811138015, + "grad_norm": 9.207784614773118e-07, + "learning_rate": 0.16342410532193954, + "loss": 0.0, + "num_input_tokens_seen": 32317280, + "step": 18860 + }, + { + "epoch": 91.57627118644068, + "grad_norm": 5.018779916099447e-07, + "learning_rate": 0.16336543579022464, + "loss": 0.0, + "num_input_tokens_seen": 32326496, + "step": 18865 + }, + { + "epoch": 91.60048426150121, + "grad_norm": 8.10946630735998e-07, + "learning_rate": 0.16330676419739157, + "loss": 0.0, + "num_input_tokens_seen": 32335104, + "step": 18870 + }, + { + "epoch": 91.62469733656174, + "grad_norm": 7.120855798348202e-07, + "learning_rate": 0.1632480905524883, + "loss": 0.0, + "num_input_tokens_seen": 32343776, + "step": 18875 + }, + { + "epoch": 91.64891041162228, + "grad_norm": 3.884887860294839e-07, + "learning_rate": 0.16318941486456293, + "loss": 0.0, + "num_input_tokens_seen": 32352096, + "step": 18880 + }, + { + "epoch": 91.67312348668281, + "grad_norm": 3.3114682196355716e-07, + "learning_rate": 0.16313073714266405, + "loss": 0.0, + "num_input_tokens_seen": 32360384, + "step": 18885 + }, + { + "epoch": 91.69733656174334, + "grad_norm": 2.5597364583518356e-07, + "learning_rate": 0.16307205739584052, + "loss": 0.0, + "num_input_tokens_seen": 32368992, + "step": 18890 + }, + { + "epoch": 91.72154963680387, + "grad_norm": 9.776293836694094e-07, + "learning_rate": 0.16301337563314144, + "loss": 0.0, + "num_input_tokens_seen": 32377824, + "step": 18895 + }, + { + "epoch": 91.7457627118644, + "grad_norm": 6.6523983832667e-07, + "learning_rate": 0.1629546918636163, + "loss": 0.0, + "num_input_tokens_seen": 32386592, + "step": 18900 + }, + { + "epoch": 91.76997578692495, + "grad_norm": 5.850287152497913e-07, + "learning_rate": 0.16289600609631485, + "loss": 0.0, + "num_input_tokens_seen": 32395104, + "step": 18905 + }, + { + "epoch": 91.79418886198548, + "grad_norm": 6.000964276609011e-07, + "learning_rate": 0.16283731834028722, + "loss": 0.0, + "num_input_tokens_seen": 32403264, + "step": 18910 + }, + { + "epoch": 91.818401937046, + "grad_norm": 7.774857522235834e-07, + "learning_rate": 0.16277862860458378, + "loss": 0.0, + "num_input_tokens_seen": 32411840, + "step": 18915 + }, + { + "epoch": 91.84261501210653, + "grad_norm": 2.9882241392442666e-07, + "learning_rate": 0.16271993689825526, + "loss": 0.0, + "num_input_tokens_seen": 32420160, + "step": 18920 + }, + { + "epoch": 91.86682808716706, + "grad_norm": 5.057692078480613e-07, + "learning_rate": 0.1626612432303526, + "loss": 0.0, + "num_input_tokens_seen": 32428864, + "step": 18925 + }, + { + "epoch": 91.89104116222761, + "grad_norm": 2.7809775815512694e-07, + "learning_rate": 0.1626025476099271, + "loss": 0.0, + "num_input_tokens_seen": 32437152, + "step": 18930 + }, + { + "epoch": 91.91525423728814, + "grad_norm": 4.124754013901111e-07, + "learning_rate": 0.1625438500460304, + "loss": 0.0, + "num_input_tokens_seen": 32445696, + "step": 18935 + }, + { + "epoch": 91.93946731234867, + "grad_norm": 4.153301063070103e-07, + "learning_rate": 0.16248515054771442, + "loss": 0.0, + "num_input_tokens_seen": 32454112, + "step": 18940 + }, + { + "epoch": 91.9636803874092, + "grad_norm": 6.414644531105296e-07, + "learning_rate": 0.16242644912403123, + "loss": 0.0, + "num_input_tokens_seen": 32462592, + "step": 18945 + }, + { + "epoch": 91.98789346246973, + "grad_norm": 2.686194022771815e-07, + "learning_rate": 0.1623677457840335, + "loss": 0.0, + "num_input_tokens_seen": 32471264, + "step": 18950 + }, + { + "epoch": 92.01452784503633, + "grad_norm": 9.637359426051262e-07, + "learning_rate": 0.16230904053677397, + "loss": 0.0, + "num_input_tokens_seen": 32480704, + "step": 18955 + }, + { + "epoch": 92.03874092009686, + "grad_norm": 6.602462576665857e-07, + "learning_rate": 0.16225033339130568, + "loss": 0.0, + "num_input_tokens_seen": 32489856, + "step": 18960 + }, + { + "epoch": 92.06295399515739, + "grad_norm": 4.98075053201319e-07, + "learning_rate": 0.16219162435668197, + "loss": 0.0, + "num_input_tokens_seen": 32498304, + "step": 18965 + }, + { + "epoch": 92.08716707021792, + "grad_norm": 3.9999426348913403e-07, + "learning_rate": 0.16213291344195666, + "loss": 0.0, + "num_input_tokens_seen": 32506688, + "step": 18970 + }, + { + "epoch": 92.11138014527845, + "grad_norm": 4.988510227121878e-07, + "learning_rate": 0.16207420065618358, + "loss": 0.0, + "num_input_tokens_seen": 32515008, + "step": 18975 + }, + { + "epoch": 92.13559322033899, + "grad_norm": 3.0819910534773953e-07, + "learning_rate": 0.16201548600841706, + "loss": 0.0, + "num_input_tokens_seen": 32523648, + "step": 18980 + }, + { + "epoch": 92.15980629539952, + "grad_norm": 4.4025628653798776e-07, + "learning_rate": 0.16195676950771154, + "loss": 0.0, + "num_input_tokens_seen": 32532512, + "step": 18985 + }, + { + "epoch": 92.18401937046005, + "grad_norm": 9.79760898189852e-07, + "learning_rate": 0.16189805116312198, + "loss": 0.0, + "num_input_tokens_seen": 32541280, + "step": 18990 + }, + { + "epoch": 92.20823244552058, + "grad_norm": 4.1753730783966603e-07, + "learning_rate": 0.16183933098370337, + "loss": 0.0, + "num_input_tokens_seen": 32549440, + "step": 18995 + }, + { + "epoch": 92.23244552058111, + "grad_norm": 2.8533480644910014e-07, + "learning_rate": 0.16178060897851115, + "loss": 0.0, + "num_input_tokens_seen": 32558112, + "step": 19000 + }, + { + "epoch": 92.23244552058111, + "eval_loss": 0.8885065913200378, + "eval_runtime": 4.6198, + "eval_samples_per_second": 79.441, + "eval_steps_per_second": 19.914, + "num_input_tokens_seen": 32558112, + "step": 19000 + }, + { + "epoch": 92.25665859564165, + "grad_norm": 8.887986382433155e-07, + "learning_rate": 0.16172188515660096, + "loss": 0.0, + "num_input_tokens_seen": 32566944, + "step": 19005 + }, + { + "epoch": 92.28087167070218, + "grad_norm": 5.133681497682119e-07, + "learning_rate": 0.16166315952702878, + "loss": 0.0, + "num_input_tokens_seen": 32575488, + "step": 19010 + }, + { + "epoch": 92.30508474576271, + "grad_norm": 5.670726750395261e-07, + "learning_rate": 0.16160443209885084, + "loss": 0.0, + "num_input_tokens_seen": 32583776, + "step": 19015 + }, + { + "epoch": 92.32929782082324, + "grad_norm": 6.852169462945312e-07, + "learning_rate": 0.16154570288112363, + "loss": 0.0, + "num_input_tokens_seen": 32592480, + "step": 19020 + }, + { + "epoch": 92.35351089588377, + "grad_norm": 5.289552404974529e-07, + "learning_rate": 0.16148697188290395, + "loss": 0.0, + "num_input_tokens_seen": 32601088, + "step": 19025 + }, + { + "epoch": 92.37772397094432, + "grad_norm": 4.933913828608638e-07, + "learning_rate": 0.16142823911324888, + "loss": 0.0, + "num_input_tokens_seen": 32609440, + "step": 19030 + }, + { + "epoch": 92.40193704600485, + "grad_norm": 8.36195624742686e-07, + "learning_rate": 0.16136950458121568, + "loss": 0.0, + "num_input_tokens_seen": 32617792, + "step": 19035 + }, + { + "epoch": 92.42615012106538, + "grad_norm": 4.6553248012060067e-07, + "learning_rate": 0.16131076829586205, + "loss": 0.0, + "num_input_tokens_seen": 32626624, + "step": 19040 + }, + { + "epoch": 92.4503631961259, + "grad_norm": 5.85225279792212e-07, + "learning_rate": 0.1612520302662457, + "loss": 0.0, + "num_input_tokens_seen": 32635008, + "step": 19045 + }, + { + "epoch": 92.47457627118644, + "grad_norm": 5.106389266984479e-07, + "learning_rate": 0.16119329050142497, + "loss": 0.0, + "num_input_tokens_seen": 32643840, + "step": 19050 + }, + { + "epoch": 92.49878934624698, + "grad_norm": 2.613142839891225e-07, + "learning_rate": 0.16113454901045818, + "loss": 0.0, + "num_input_tokens_seen": 32652352, + "step": 19055 + }, + { + "epoch": 92.52300242130751, + "grad_norm": 2.723401166804251e-07, + "learning_rate": 0.16107580580240397, + "loss": 0.0, + "num_input_tokens_seen": 32660320, + "step": 19060 + }, + { + "epoch": 92.54721549636804, + "grad_norm": 7.771827199576364e-07, + "learning_rate": 0.16101706088632134, + "loss": 0.0, + "num_input_tokens_seen": 32669280, + "step": 19065 + }, + { + "epoch": 92.57142857142857, + "grad_norm": 7.583092269669578e-07, + "learning_rate": 0.16095831427126947, + "loss": 0.0, + "num_input_tokens_seen": 32677792, + "step": 19070 + }, + { + "epoch": 92.5956416464891, + "grad_norm": 2.7789221235252626e-07, + "learning_rate": 0.16089956596630783, + "loss": 0.0, + "num_input_tokens_seen": 32686496, + "step": 19075 + }, + { + "epoch": 92.61985472154964, + "grad_norm": 2.2794210963184014e-07, + "learning_rate": 0.16084081598049618, + "loss": 0.0, + "num_input_tokens_seen": 32695264, + "step": 19080 + }, + { + "epoch": 92.64406779661017, + "grad_norm": 2.904027098793449e-07, + "learning_rate": 0.1607820643228944, + "loss": 0.0, + "num_input_tokens_seen": 32704096, + "step": 19085 + }, + { + "epoch": 92.6682808716707, + "grad_norm": 2.116995290180057e-07, + "learning_rate": 0.16072331100256285, + "loss": 0.0, + "num_input_tokens_seen": 32712608, + "step": 19090 + }, + { + "epoch": 92.69249394673123, + "grad_norm": 7.732272138127882e-07, + "learning_rate": 0.16066455602856197, + "loss": 0.0, + "num_input_tokens_seen": 32721056, + "step": 19095 + }, + { + "epoch": 92.71670702179176, + "grad_norm": 9.792216815185384e-07, + "learning_rate": 0.16060579940995257, + "loss": 0.0, + "num_input_tokens_seen": 32729984, + "step": 19100 + }, + { + "epoch": 92.7409200968523, + "grad_norm": 7.057035986690607e-07, + "learning_rate": 0.16054704115579557, + "loss": 0.0, + "num_input_tokens_seen": 32738528, + "step": 19105 + }, + { + "epoch": 92.76513317191284, + "grad_norm": 3.3933008580788737e-07, + "learning_rate": 0.1604882812751523, + "loss": 0.0, + "num_input_tokens_seen": 32746848, + "step": 19110 + }, + { + "epoch": 92.78934624697337, + "grad_norm": 3.139099646887189e-07, + "learning_rate": 0.16042951977708425, + "loss": 0.0, + "num_input_tokens_seen": 32755424, + "step": 19115 + }, + { + "epoch": 92.8135593220339, + "grad_norm": 3.567374164958892e-07, + "learning_rate": 0.16037075667065318, + "loss": 0.0, + "num_input_tokens_seen": 32763968, + "step": 19120 + }, + { + "epoch": 92.83777239709443, + "grad_norm": 3.6097569022786047e-07, + "learning_rate": 0.1603119919649211, + "loss": 0.0, + "num_input_tokens_seen": 32772544, + "step": 19125 + }, + { + "epoch": 92.86198547215497, + "grad_norm": 5.922894956711389e-07, + "learning_rate": 0.16025322566895028, + "loss": 0.0, + "num_input_tokens_seen": 32780896, + "step": 19130 + }, + { + "epoch": 92.8861985472155, + "grad_norm": 6.47348656457325e-07, + "learning_rate": 0.16019445779180322, + "loss": 0.0, + "num_input_tokens_seen": 32789472, + "step": 19135 + }, + { + "epoch": 92.91041162227603, + "grad_norm": 4.993684683540778e-07, + "learning_rate": 0.16013568834254271, + "loss": 0.0, + "num_input_tokens_seen": 32798304, + "step": 19140 + }, + { + "epoch": 92.93462469733656, + "grad_norm": 2.6469291469766176e-07, + "learning_rate": 0.1600769173302316, + "loss": 0.0, + "num_input_tokens_seen": 32806656, + "step": 19145 + }, + { + "epoch": 92.95883777239709, + "grad_norm": 2.453676870572963e-07, + "learning_rate": 0.16001814476393322, + "loss": 0.0, + "num_input_tokens_seen": 32814688, + "step": 19150 + }, + { + "epoch": 92.98305084745763, + "grad_norm": 1.8059294859540387e-07, + "learning_rate": 0.15995937065271104, + "loss": 0.0, + "num_input_tokens_seen": 32823520, + "step": 19155 + }, + { + "epoch": 93.00968523002422, + "grad_norm": 3.752715258542594e-07, + "learning_rate": 0.15990059500562873, + "loss": 0.0, + "num_input_tokens_seen": 32832736, + "step": 19160 + }, + { + "epoch": 93.03389830508475, + "grad_norm": 2.5212915488737053e-07, + "learning_rate": 0.15984181783175025, + "loss": 0.0, + "num_input_tokens_seen": 32841280, + "step": 19165 + }, + { + "epoch": 93.05811138014528, + "grad_norm": 3.224961346859345e-07, + "learning_rate": 0.1597830391401398, + "loss": 0.0, + "num_input_tokens_seen": 32849664, + "step": 19170 + }, + { + "epoch": 93.08232445520581, + "grad_norm": 2.0106872966607625e-07, + "learning_rate": 0.15972425893986178, + "loss": 0.0, + "num_input_tokens_seen": 32858016, + "step": 19175 + }, + { + "epoch": 93.10653753026634, + "grad_norm": 1.6208984732202225e-07, + "learning_rate": 0.15966547723998084, + "loss": 0.0, + "num_input_tokens_seen": 32866560, + "step": 19180 + }, + { + "epoch": 93.13075060532688, + "grad_norm": 3.646800337264722e-07, + "learning_rate": 0.15960669404956176, + "loss": 0.0, + "num_input_tokens_seen": 32875008, + "step": 19185 + }, + { + "epoch": 93.15496368038741, + "grad_norm": 4.020597828002792e-07, + "learning_rate": 0.1595479093776698, + "loss": 0.0, + "num_input_tokens_seen": 32883168, + "step": 19190 + }, + { + "epoch": 93.17917675544794, + "grad_norm": 8.54252220960916e-07, + "learning_rate": 0.15948912323337022, + "loss": 0.0, + "num_input_tokens_seen": 32891808, + "step": 19195 + }, + { + "epoch": 93.20338983050847, + "grad_norm": 2.7803196189779555e-07, + "learning_rate": 0.1594303356257286, + "loss": 0.0, + "num_input_tokens_seen": 32900448, + "step": 19200 + }, + { + "epoch": 93.20338983050847, + "eval_loss": 0.8940050601959229, + "eval_runtime": 4.6284, + "eval_samples_per_second": 79.294, + "eval_steps_per_second": 19.877, + "num_input_tokens_seen": 32900448, + "step": 19200 + }, + { + "epoch": 93.227602905569, + "grad_norm": 1.7363231563649606e-07, + "learning_rate": 0.15937154656381072, + "loss": 0.0, + "num_input_tokens_seen": 32908832, + "step": 19205 + }, + { + "epoch": 93.25181598062954, + "grad_norm": 5.4181759878702e-07, + "learning_rate": 0.15931275605668258, + "loss": 0.0, + "num_input_tokens_seen": 32917568, + "step": 19210 + }, + { + "epoch": 93.27602905569007, + "grad_norm": 1.916299794402221e-07, + "learning_rate": 0.1592539641134104, + "loss": 0.0, + "num_input_tokens_seen": 32926272, + "step": 19215 + }, + { + "epoch": 93.3002421307506, + "grad_norm": 1.3822547373365524e-07, + "learning_rate": 0.1591951707430607, + "loss": 0.0, + "num_input_tokens_seen": 32935264, + "step": 19220 + }, + { + "epoch": 93.32445520581113, + "grad_norm": 3.790538301018387e-07, + "learning_rate": 0.15913637595470007, + "loss": 0.0, + "num_input_tokens_seen": 32944032, + "step": 19225 + }, + { + "epoch": 93.34866828087166, + "grad_norm": 1.194317320596383e-07, + "learning_rate": 0.15907757975739548, + "loss": 0.0, + "num_input_tokens_seen": 32952448, + "step": 19230 + }, + { + "epoch": 93.37288135593221, + "grad_norm": 4.3089815449093294e-07, + "learning_rate": 0.159018782160214, + "loss": 0.0, + "num_input_tokens_seen": 32960608, + "step": 19235 + }, + { + "epoch": 93.39709443099274, + "grad_norm": 5.836669743075618e-07, + "learning_rate": 0.158959983172223, + "loss": 0.0, + "num_input_tokens_seen": 32968800, + "step": 19240 + }, + { + "epoch": 93.42130750605327, + "grad_norm": 5.395330617830041e-07, + "learning_rate": 0.15890118280249, + "loss": 0.0, + "num_input_tokens_seen": 32977792, + "step": 19245 + }, + { + "epoch": 93.4455205811138, + "grad_norm": 3.2997596122186224e-07, + "learning_rate": 0.15884238106008275, + "loss": 0.0, + "num_input_tokens_seen": 32986496, + "step": 19250 + }, + { + "epoch": 93.46973365617433, + "grad_norm": 4.3691952100743947e-07, + "learning_rate": 0.15878357795406922, + "loss": 0.0, + "num_input_tokens_seen": 32995264, + "step": 19255 + }, + { + "epoch": 93.49394673123487, + "grad_norm": 5.46911735455069e-07, + "learning_rate": 0.15872477349351757, + "loss": 0.0, + "num_input_tokens_seen": 33003712, + "step": 19260 + }, + { + "epoch": 93.5181598062954, + "grad_norm": 2.939743808383355e-07, + "learning_rate": 0.15866596768749622, + "loss": 0.0, + "num_input_tokens_seen": 33011936, + "step": 19265 + }, + { + "epoch": 93.54237288135593, + "grad_norm": 2.8196103585287347e-07, + "learning_rate": 0.15860716054507373, + "loss": 0.0, + "num_input_tokens_seen": 33020768, + "step": 19270 + }, + { + "epoch": 93.56658595641646, + "grad_norm": 3.7822275089638424e-07, + "learning_rate": 0.1585483520753189, + "loss": 0.0, + "num_input_tokens_seen": 33029024, + "step": 19275 + }, + { + "epoch": 93.59079903147699, + "grad_norm": 6.963023224670906e-07, + "learning_rate": 0.1584895422873008, + "loss": 0.0, + "num_input_tokens_seen": 33037344, + "step": 19280 + }, + { + "epoch": 93.61501210653753, + "grad_norm": 3.9113163552428887e-07, + "learning_rate": 0.1584307311900886, + "loss": 0.0, + "num_input_tokens_seen": 33045952, + "step": 19285 + }, + { + "epoch": 93.63922518159806, + "grad_norm": 3.812973545791465e-07, + "learning_rate": 0.1583719187927517, + "loss": 0.0, + "num_input_tokens_seen": 33054720, + "step": 19290 + }, + { + "epoch": 93.6634382566586, + "grad_norm": 3.2187995202548336e-07, + "learning_rate": 0.15831310510435967, + "loss": 0.0, + "num_input_tokens_seen": 33063168, + "step": 19295 + }, + { + "epoch": 93.68765133171912, + "grad_norm": 4.069351575708424e-07, + "learning_rate": 0.15825429013398243, + "loss": 0.0, + "num_input_tokens_seen": 33071520, + "step": 19300 + }, + { + "epoch": 93.71186440677967, + "grad_norm": 1.653412198265869e-07, + "learning_rate": 0.15819547389068986, + "loss": 0.0, + "num_input_tokens_seen": 33080128, + "step": 19305 + }, + { + "epoch": 93.7360774818402, + "grad_norm": 3.5570570844356553e-07, + "learning_rate": 0.1581366563835522, + "loss": 0.0, + "num_input_tokens_seen": 33088640, + "step": 19310 + }, + { + "epoch": 93.76029055690073, + "grad_norm": 3.233398615520855e-07, + "learning_rate": 0.15807783762163993, + "loss": 0.0, + "num_input_tokens_seen": 33097280, + "step": 19315 + }, + { + "epoch": 93.78450363196126, + "grad_norm": 3.505318488805642e-07, + "learning_rate": 0.15801901761402365, + "loss": 0.0, + "num_input_tokens_seen": 33105824, + "step": 19320 + }, + { + "epoch": 93.80871670702179, + "grad_norm": 3.3975456403823046e-07, + "learning_rate": 0.157960196369774, + "loss": 0.0, + "num_input_tokens_seen": 33114368, + "step": 19325 + }, + { + "epoch": 93.83292978208233, + "grad_norm": 2.333378432695099e-07, + "learning_rate": 0.157901373897962, + "loss": 0.0, + "num_input_tokens_seen": 33123168, + "step": 19330 + }, + { + "epoch": 93.85714285714286, + "grad_norm": 1.8921956268513895e-07, + "learning_rate": 0.15784255020765892, + "loss": 0.0, + "num_input_tokens_seen": 33131648, + "step": 19335 + }, + { + "epoch": 93.88135593220339, + "grad_norm": 5.758303700531542e-07, + "learning_rate": 0.157783725307936, + "loss": 0.0, + "num_input_tokens_seen": 33140576, + "step": 19340 + }, + { + "epoch": 93.90556900726392, + "grad_norm": 2.4146066834873636e-07, + "learning_rate": 0.15772489920786484, + "loss": 0.0, + "num_input_tokens_seen": 33149824, + "step": 19345 + }, + { + "epoch": 93.92978208232445, + "grad_norm": 2.1385216086855507e-07, + "learning_rate": 0.15766607191651713, + "loss": 0.0, + "num_input_tokens_seen": 33158272, + "step": 19350 + }, + { + "epoch": 93.953995157385, + "grad_norm": 5.351786285245907e-07, + "learning_rate": 0.1576072434429648, + "loss": 0.0, + "num_input_tokens_seen": 33167232, + "step": 19355 + }, + { + "epoch": 93.97820823244552, + "grad_norm": 1.6968502336567326e-07, + "learning_rate": 0.15754841379627998, + "loss": 0.0, + "num_input_tokens_seen": 33175840, + "step": 19360 + }, + { + "epoch": 94.00484261501211, + "grad_norm": 8.784004421613645e-07, + "learning_rate": 0.15748958298553484, + "loss": 0.0, + "num_input_tokens_seen": 33184896, + "step": 19365 + }, + { + "epoch": 94.02905569007264, + "grad_norm": 3.3446610814280575e-07, + "learning_rate": 0.1574307510198019, + "loss": 0.0, + "num_input_tokens_seen": 33193856, + "step": 19370 + }, + { + "epoch": 94.05326876513317, + "grad_norm": 2.475121334555297e-07, + "learning_rate": 0.15737191790815375, + "loss": 0.0, + "num_input_tokens_seen": 33202176, + "step": 19375 + }, + { + "epoch": 94.0774818401937, + "grad_norm": 2.1681147188701289e-07, + "learning_rate": 0.15731308365966323, + "loss": 0.0, + "num_input_tokens_seen": 33210720, + "step": 19380 + }, + { + "epoch": 94.10169491525424, + "grad_norm": 3.2003285355131084e-07, + "learning_rate": 0.15725424828340331, + "loss": 0.0, + "num_input_tokens_seen": 33219552, + "step": 19385 + }, + { + "epoch": 94.12590799031477, + "grad_norm": 6.119944941929134e-07, + "learning_rate": 0.15719541178844715, + "loss": 0.0, + "num_input_tokens_seen": 33227968, + "step": 19390 + }, + { + "epoch": 94.1501210653753, + "grad_norm": 1.0161214447634848e-07, + "learning_rate": 0.15713657418386806, + "loss": 0.0, + "num_input_tokens_seen": 33236704, + "step": 19395 + }, + { + "epoch": 94.17433414043583, + "grad_norm": 7.029213406894996e-07, + "learning_rate": 0.15707773547873957, + "loss": 0.0, + "num_input_tokens_seen": 33244800, + "step": 19400 + }, + { + "epoch": 94.17433414043583, + "eval_loss": 0.9025793671607971, + "eval_runtime": 4.6339, + "eval_samples_per_second": 79.198, + "eval_steps_per_second": 19.854, + "num_input_tokens_seen": 33244800, + "step": 19400 + }, + { + "epoch": 94.19854721549636, + "grad_norm": 1.5294045851987903e-07, + "learning_rate": 0.1570188956821353, + "loss": 0.0, + "num_input_tokens_seen": 33252928, + "step": 19405 + }, + { + "epoch": 94.2227602905569, + "grad_norm": 2.9713331173297775e-07, + "learning_rate": 0.1569600548031291, + "loss": 0.0, + "num_input_tokens_seen": 33261344, + "step": 19410 + }, + { + "epoch": 94.24697336561744, + "grad_norm": 2.7358359488971473e-07, + "learning_rate": 0.156901212850795, + "loss": 0.0, + "num_input_tokens_seen": 33269696, + "step": 19415 + }, + { + "epoch": 94.27118644067797, + "grad_norm": 2.952612305762159e-07, + "learning_rate": 0.15684236983420716, + "loss": 0.0, + "num_input_tokens_seen": 33277568, + "step": 19420 + }, + { + "epoch": 94.2953995157385, + "grad_norm": 4.74138573736127e-07, + "learning_rate": 0.1567835257624399, + "loss": 0.0, + "num_input_tokens_seen": 33286112, + "step": 19425 + }, + { + "epoch": 94.31961259079903, + "grad_norm": 1.4738841969119676e-07, + "learning_rate": 0.1567246806445677, + "loss": 0.0, + "num_input_tokens_seen": 33294528, + "step": 19430 + }, + { + "epoch": 94.34382566585957, + "grad_norm": 1.691375359769154e-07, + "learning_rate": 0.15666583448966526, + "loss": 0.0, + "num_input_tokens_seen": 33303296, + "step": 19435 + }, + { + "epoch": 94.3680387409201, + "grad_norm": 3.84877210990453e-07, + "learning_rate": 0.1566069873068074, + "loss": 0.0, + "num_input_tokens_seen": 33311968, + "step": 19440 + }, + { + "epoch": 94.39225181598063, + "grad_norm": 5.811190817439638e-07, + "learning_rate": 0.156548139105069, + "loss": 0.0, + "num_input_tokens_seen": 33320224, + "step": 19445 + }, + { + "epoch": 94.41646489104116, + "grad_norm": 8.036482768147835e-07, + "learning_rate": 0.15648928989352529, + "loss": 0.0, + "num_input_tokens_seen": 33329088, + "step": 19450 + }, + { + "epoch": 94.44067796610169, + "grad_norm": 1.2658489367822767e-07, + "learning_rate": 0.15643043968125156, + "loss": 0.0, + "num_input_tokens_seen": 33337888, + "step": 19455 + }, + { + "epoch": 94.46489104116223, + "grad_norm": 4.861965408053948e-07, + "learning_rate": 0.15637158847732316, + "loss": 0.0, + "num_input_tokens_seen": 33346400, + "step": 19460 + }, + { + "epoch": 94.48910411622276, + "grad_norm": 1.7505142579921085e-07, + "learning_rate": 0.15631273629081582, + "loss": 0.0, + "num_input_tokens_seen": 33355040, + "step": 19465 + }, + { + "epoch": 94.51331719128329, + "grad_norm": 2.112617636385039e-07, + "learning_rate": 0.15625388313080518, + "loss": 0.0, + "num_input_tokens_seen": 33363328, + "step": 19470 + }, + { + "epoch": 94.53753026634382, + "grad_norm": 2.6600619662531244e-07, + "learning_rate": 0.15619502900636714, + "loss": 0.0, + "num_input_tokens_seen": 33371712, + "step": 19475 + }, + { + "epoch": 94.56174334140435, + "grad_norm": 1.9779399451635982e-07, + "learning_rate": 0.15613617392657783, + "loss": 0.0, + "num_input_tokens_seen": 33380416, + "step": 19480 + }, + { + "epoch": 94.5859564164649, + "grad_norm": 3.561412142971676e-07, + "learning_rate": 0.15607731790051335, + "loss": 0.0, + "num_input_tokens_seen": 33389152, + "step": 19485 + }, + { + "epoch": 94.61016949152543, + "grad_norm": 2.689189102511591e-07, + "learning_rate": 0.15601846093725008, + "loss": 0.0, + "num_input_tokens_seen": 33397792, + "step": 19490 + }, + { + "epoch": 94.63438256658596, + "grad_norm": 1.650911656270182e-07, + "learning_rate": 0.1559596030458645, + "loss": 0.0, + "num_input_tokens_seen": 33406336, + "step": 19495 + }, + { + "epoch": 94.65859564164649, + "grad_norm": 3.4388256153761176e-07, + "learning_rate": 0.1559007442354333, + "loss": 0.0, + "num_input_tokens_seen": 33415392, + "step": 19500 + }, + { + "epoch": 94.68280871670702, + "grad_norm": 3.3323439652122033e-07, + "learning_rate": 0.15584188451503314, + "loss": 0.0, + "num_input_tokens_seen": 33423840, + "step": 19505 + }, + { + "epoch": 94.70702179176756, + "grad_norm": 3.535357109285542e-07, + "learning_rate": 0.15578302389374094, + "loss": 0.0, + "num_input_tokens_seen": 33432544, + "step": 19510 + }, + { + "epoch": 94.73123486682809, + "grad_norm": 6.358504833769985e-07, + "learning_rate": 0.1557241623806338, + "loss": 0.0, + "num_input_tokens_seen": 33440960, + "step": 19515 + }, + { + "epoch": 94.75544794188862, + "grad_norm": 5.01008457831631e-07, + "learning_rate": 0.15566529998478887, + "loss": 0.0, + "num_input_tokens_seen": 33449632, + "step": 19520 + }, + { + "epoch": 94.77966101694915, + "grad_norm": 3.5809480891657586e-07, + "learning_rate": 0.15560643671528354, + "loss": 0.0, + "num_input_tokens_seen": 33458336, + "step": 19525 + }, + { + "epoch": 94.80387409200968, + "grad_norm": 1.874289097258952e-07, + "learning_rate": 0.15554757258119514, + "loss": 0.0, + "num_input_tokens_seen": 33467328, + "step": 19530 + }, + { + "epoch": 94.82808716707022, + "grad_norm": 3.0014805929567956e-07, + "learning_rate": 0.1554887075916014, + "loss": 0.0, + "num_input_tokens_seen": 33475744, + "step": 19535 + }, + { + "epoch": 94.85230024213075, + "grad_norm": 2.711926470055914e-07, + "learning_rate": 0.15542984175558, + "loss": 0.0, + "num_input_tokens_seen": 33484320, + "step": 19540 + }, + { + "epoch": 94.87651331719128, + "grad_norm": 4.1233354863834393e-07, + "learning_rate": 0.1553709750822087, + "loss": 0.0, + "num_input_tokens_seen": 33492800, + "step": 19545 + }, + { + "epoch": 94.90072639225181, + "grad_norm": 1.4836103900961461e-07, + "learning_rate": 0.15531210758056554, + "loss": 0.0, + "num_input_tokens_seen": 33501248, + "step": 19550 + }, + { + "epoch": 94.92493946731234, + "grad_norm": 3.612097998484387e-07, + "learning_rate": 0.15525323925972867, + "loss": 0.0, + "num_input_tokens_seen": 33509632, + "step": 19555 + }, + { + "epoch": 94.94915254237289, + "grad_norm": 2.3693081629971857e-07, + "learning_rate": 0.15519437012877627, + "loss": 0.0, + "num_input_tokens_seen": 33518112, + "step": 19560 + }, + { + "epoch": 94.97336561743342, + "grad_norm": 2.3217414479859144e-07, + "learning_rate": 0.15513550019678676, + "loss": 0.0, + "num_input_tokens_seen": 33526688, + "step": 19565 + }, + { + "epoch": 94.99757869249395, + "grad_norm": 2.2730452542418789e-07, + "learning_rate": 0.15507662947283854, + "loss": 0.0, + "num_input_tokens_seen": 33535168, + "step": 19570 + }, + { + "epoch": 95.02421307506053, + "grad_norm": 1.3058159709089523e-07, + "learning_rate": 0.15501775796601028, + "loss": 0.0, + "num_input_tokens_seen": 33543872, + "step": 19575 + }, + { + "epoch": 95.04842615012106, + "grad_norm": 7.057981861180451e-07, + "learning_rate": 0.15495888568538066, + "loss": 0.0, + "num_input_tokens_seen": 33552768, + "step": 19580 + }, + { + "epoch": 95.0726392251816, + "grad_norm": 4.1577999354558415e-07, + "learning_rate": 0.1549000126400286, + "loss": 0.0, + "num_input_tokens_seen": 33561184, + "step": 19585 + }, + { + "epoch": 95.09685230024213, + "grad_norm": 2.8175855959489127e-07, + "learning_rate": 0.15484113883903294, + "loss": 0.0, + "num_input_tokens_seen": 33569856, + "step": 19590 + }, + { + "epoch": 95.12106537530266, + "grad_norm": 3.273259494562808e-07, + "learning_rate": 0.15478226429147288, + "loss": 0.0, + "num_input_tokens_seen": 33578240, + "step": 19595 + }, + { + "epoch": 95.1452784503632, + "grad_norm": 1.58715906195539e-07, + "learning_rate": 0.15472338900642757, + "loss": 0.0, + "num_input_tokens_seen": 33587168, + "step": 19600 + }, + { + "epoch": 95.1452784503632, + "eval_loss": 0.9150127172470093, + "eval_runtime": 4.6163, + "eval_samples_per_second": 79.501, + "eval_steps_per_second": 19.929, + "num_input_tokens_seen": 33587168, + "step": 19600 + }, + { + "epoch": 95.16949152542372, + "grad_norm": 1.596393985892064e-07, + "learning_rate": 0.15466451299297632, + "loss": 0.0, + "num_input_tokens_seen": 33595744, + "step": 19605 + }, + { + "epoch": 95.19370460048427, + "grad_norm": 4.942323812429095e-07, + "learning_rate": 0.15460563626019852, + "loss": 0.0, + "num_input_tokens_seen": 33604640, + "step": 19610 + }, + { + "epoch": 95.2179176755448, + "grad_norm": 4.410102292240481e-07, + "learning_rate": 0.15454675881717375, + "loss": 0.0, + "num_input_tokens_seen": 33613088, + "step": 19615 + }, + { + "epoch": 95.24213075060533, + "grad_norm": 1.2648210656607262e-07, + "learning_rate": 0.1544878806729816, + "loss": 0.0, + "num_input_tokens_seen": 33621536, + "step": 19620 + }, + { + "epoch": 95.26634382566586, + "grad_norm": 4.539954829851922e-07, + "learning_rate": 0.1544290018367019, + "loss": 0.0, + "num_input_tokens_seen": 33629824, + "step": 19625 + }, + { + "epoch": 95.29055690072639, + "grad_norm": 2.6714701562013943e-07, + "learning_rate": 0.15437012231741445, + "loss": 0.0, + "num_input_tokens_seen": 33638240, + "step": 19630 + }, + { + "epoch": 95.31476997578693, + "grad_norm": 3.893824498391041e-07, + "learning_rate": 0.1543112421241992, + "loss": 0.0, + "num_input_tokens_seen": 33646560, + "step": 19635 + }, + { + "epoch": 95.33898305084746, + "grad_norm": 2.931928122507088e-07, + "learning_rate": 0.15425236126613626, + "loss": 0.0, + "num_input_tokens_seen": 33655392, + "step": 19640 + }, + { + "epoch": 95.36319612590799, + "grad_norm": 3.1127788702178805e-07, + "learning_rate": 0.15419347975230577, + "loss": 0.0, + "num_input_tokens_seen": 33663552, + "step": 19645 + }, + { + "epoch": 95.38740920096852, + "grad_norm": 8.370665796064714e-07, + "learning_rate": 0.154134597591788, + "loss": 0.0, + "num_input_tokens_seen": 33671840, + "step": 19650 + }, + { + "epoch": 95.41162227602905, + "grad_norm": 2.3246217040195916e-07, + "learning_rate": 0.1540757147936633, + "loss": 0.0, + "num_input_tokens_seen": 33680128, + "step": 19655 + }, + { + "epoch": 95.4358353510896, + "grad_norm": 3.9667909845775284e-07, + "learning_rate": 0.1540168313670122, + "loss": 0.0, + "num_input_tokens_seen": 33688640, + "step": 19660 + }, + { + "epoch": 95.46004842615012, + "grad_norm": 3.0866931410855614e-07, + "learning_rate": 0.1539579473209152, + "loss": 0.0, + "num_input_tokens_seen": 33696896, + "step": 19665 + }, + { + "epoch": 95.48426150121065, + "grad_norm": 2.638805369770125e-07, + "learning_rate": 0.15389906266445294, + "loss": 0.0, + "num_input_tokens_seen": 33705952, + "step": 19670 + }, + { + "epoch": 95.50847457627118, + "grad_norm": 2.460456300923397e-07, + "learning_rate": 0.15384017740670627, + "loss": 0.0, + "num_input_tokens_seen": 33714592, + "step": 19675 + }, + { + "epoch": 95.53268765133171, + "grad_norm": 3.7534019270424324e-07, + "learning_rate": 0.15378129155675602, + "loss": 0.0, + "num_input_tokens_seen": 33723072, + "step": 19680 + }, + { + "epoch": 95.55690072639226, + "grad_norm": 3.6239421774553193e-07, + "learning_rate": 0.15372240512368307, + "loss": 0.0, + "num_input_tokens_seen": 33731712, + "step": 19685 + }, + { + "epoch": 95.58111380145279, + "grad_norm": 4.42163127445383e-07, + "learning_rate": 0.1536635181165684, + "loss": 0.0, + "num_input_tokens_seen": 33740448, + "step": 19690 + }, + { + "epoch": 95.60532687651332, + "grad_norm": 3.2021270612858643e-07, + "learning_rate": 0.15360463054449328, + "loss": 0.0, + "num_input_tokens_seen": 33748960, + "step": 19695 + }, + { + "epoch": 95.62953995157385, + "grad_norm": 3.2571122687841125e-07, + "learning_rate": 0.1535457424165388, + "loss": 0.0, + "num_input_tokens_seen": 33757408, + "step": 19700 + }, + { + "epoch": 95.65375302663438, + "grad_norm": 2.539363777032122e-07, + "learning_rate": 0.15348685374178628, + "loss": 0.0, + "num_input_tokens_seen": 33765792, + "step": 19705 + }, + { + "epoch": 95.67796610169492, + "grad_norm": 3.155192018766684e-07, + "learning_rate": 0.1534279645293171, + "loss": 0.0, + "num_input_tokens_seen": 33774208, + "step": 19710 + }, + { + "epoch": 95.70217917675545, + "grad_norm": 3.402795698548289e-07, + "learning_rate": 0.1533690747882127, + "loss": 0.0, + "num_input_tokens_seen": 33782880, + "step": 19715 + }, + { + "epoch": 95.72639225181598, + "grad_norm": 3.159793493523466e-07, + "learning_rate": 0.15331018452755465, + "loss": 0.0, + "num_input_tokens_seen": 33792064, + "step": 19720 + }, + { + "epoch": 95.75060532687651, + "grad_norm": 4.632277139648977e-08, + "learning_rate": 0.15325129375642457, + "loss": 0.0, + "num_input_tokens_seen": 33800736, + "step": 19725 + }, + { + "epoch": 95.77481840193704, + "grad_norm": 2.3406592219998856e-07, + "learning_rate": 0.15319240248390406, + "loss": 0.0, + "num_input_tokens_seen": 33809184, + "step": 19730 + }, + { + "epoch": 95.79903147699758, + "grad_norm": 4.911996143164288e-07, + "learning_rate": 0.153133510719075, + "loss": 0.0, + "num_input_tokens_seen": 33817824, + "step": 19735 + }, + { + "epoch": 95.82324455205811, + "grad_norm": 2.47062786229435e-07, + "learning_rate": 0.15307461847101922, + "loss": 0.0, + "num_input_tokens_seen": 33826400, + "step": 19740 + }, + { + "epoch": 95.84745762711864, + "grad_norm": 4.6018118382562534e-07, + "learning_rate": 0.15301572574881864, + "loss": 0.0, + "num_input_tokens_seen": 33834720, + "step": 19745 + }, + { + "epoch": 95.87167070217917, + "grad_norm": 1.5847578538341622e-07, + "learning_rate": 0.15295683256155523, + "loss": 0.0, + "num_input_tokens_seen": 33843200, + "step": 19750 + }, + { + "epoch": 95.8958837772397, + "grad_norm": 5.37666380751034e-07, + "learning_rate": 0.15289793891831113, + "loss": 0.0, + "num_input_tokens_seen": 33851616, + "step": 19755 + }, + { + "epoch": 95.92009685230025, + "grad_norm": 1.5070433789787785e-07, + "learning_rate": 0.15283904482816837, + "loss": 0.0, + "num_input_tokens_seen": 33860160, + "step": 19760 + }, + { + "epoch": 95.94430992736078, + "grad_norm": 1.25943188322708e-07, + "learning_rate": 0.15278015030020928, + "loss": 0.0, + "num_input_tokens_seen": 33869248, + "step": 19765 + }, + { + "epoch": 95.9685230024213, + "grad_norm": 3.2051730158855207e-07, + "learning_rate": 0.152721255343516, + "loss": 0.0, + "num_input_tokens_seen": 33877792, + "step": 19770 + }, + { + "epoch": 95.99273607748184, + "grad_norm": 3.09172293100346e-07, + "learning_rate": 0.15266235996717098, + "loss": 0.0, + "num_input_tokens_seen": 33886176, + "step": 19775 + }, + { + "epoch": 96.01937046004842, + "grad_norm": 3.714007164035138e-07, + "learning_rate": 0.15260346418025664, + "loss": 0.0, + "num_input_tokens_seen": 33894688, + "step": 19780 + }, + { + "epoch": 96.04358353510897, + "grad_norm": 2.2807530797308573e-07, + "learning_rate": 0.15254456799185537, + "loss": 0.0, + "num_input_tokens_seen": 33903264, + "step": 19785 + }, + { + "epoch": 96.0677966101695, + "grad_norm": 3.9320417499766336e-07, + "learning_rate": 0.15248567141104974, + "loss": 0.0, + "num_input_tokens_seen": 33911776, + "step": 19790 + }, + { + "epoch": 96.09200968523002, + "grad_norm": 2.7338319341652095e-07, + "learning_rate": 0.15242677444692232, + "loss": 0.0, + "num_input_tokens_seen": 33920608, + "step": 19795 + }, + { + "epoch": 96.11622276029055, + "grad_norm": 3.862562323320162e-07, + "learning_rate": 0.15236787710855584, + "loss": 0.0, + "num_input_tokens_seen": 33929248, + "step": 19800 + }, + { + "epoch": 96.11622276029055, + "eval_loss": 0.9224525094032288, + "eval_runtime": 4.6225, + "eval_samples_per_second": 79.393, + "eval_steps_per_second": 19.902, + "num_input_tokens_seen": 33929248, + "step": 19800 + }, + { + "epoch": 96.14043583535108, + "grad_norm": 2.98030215617473e-07, + "learning_rate": 0.1523089794050329, + "loss": 0.0, + "num_input_tokens_seen": 33937888, + "step": 19805 + }, + { + "epoch": 96.16464891041163, + "grad_norm": 1.7776135052827158e-07, + "learning_rate": 0.15225008134543633, + "loss": 0.0, + "num_input_tokens_seen": 33946240, + "step": 19810 + }, + { + "epoch": 96.18886198547216, + "grad_norm": 2.413326285477524e-07, + "learning_rate": 0.15219118293884895, + "loss": 0.0, + "num_input_tokens_seen": 33954592, + "step": 19815 + }, + { + "epoch": 96.21307506053269, + "grad_norm": 2.7718726869352395e-07, + "learning_rate": 0.15213228419435362, + "loss": 0.0, + "num_input_tokens_seen": 33963136, + "step": 19820 + }, + { + "epoch": 96.23728813559322, + "grad_norm": 3.3595478043935145e-07, + "learning_rate": 0.15207338512103327, + "loss": 0.0, + "num_input_tokens_seen": 33971904, + "step": 19825 + }, + { + "epoch": 96.26150121065375, + "grad_norm": 3.055512536320748e-07, + "learning_rate": 0.1520144857279709, + "loss": 0.0, + "num_input_tokens_seen": 33980128, + "step": 19830 + }, + { + "epoch": 96.28571428571429, + "grad_norm": 1.4747236320999946e-07, + "learning_rate": 0.1519555860242495, + "loss": 0.0, + "num_input_tokens_seen": 33988640, + "step": 19835 + }, + { + "epoch": 96.30992736077482, + "grad_norm": 4.1011287521541817e-07, + "learning_rate": 0.15189668601895218, + "loss": 0.0, + "num_input_tokens_seen": 33997248, + "step": 19840 + }, + { + "epoch": 96.33414043583535, + "grad_norm": 4.879693733528256e-07, + "learning_rate": 0.151837785721162, + "loss": 0.0, + "num_input_tokens_seen": 34005792, + "step": 19845 + }, + { + "epoch": 96.35835351089588, + "grad_norm": 3.2623799484099436e-07, + "learning_rate": 0.15177888513996218, + "loss": 0.0, + "num_input_tokens_seen": 34014080, + "step": 19850 + }, + { + "epoch": 96.38256658595641, + "grad_norm": 1.3170246404570207e-07, + "learning_rate": 0.15171998428443592, + "loss": 0.0, + "num_input_tokens_seen": 34022688, + "step": 19855 + }, + { + "epoch": 96.40677966101696, + "grad_norm": 1.2679903704793105e-07, + "learning_rate": 0.1516610831636665, + "loss": 0.0, + "num_input_tokens_seen": 34031168, + "step": 19860 + }, + { + "epoch": 96.43099273607749, + "grad_norm": 4.790475713889464e-07, + "learning_rate": 0.15160218178673715, + "loss": 0.0, + "num_input_tokens_seen": 34039680, + "step": 19865 + }, + { + "epoch": 96.45520581113801, + "grad_norm": 2.660232496509707e-07, + "learning_rate": 0.15154328016273122, + "loss": 0.0, + "num_input_tokens_seen": 34048416, + "step": 19870 + }, + { + "epoch": 96.47941888619854, + "grad_norm": 4.2187406279481365e-07, + "learning_rate": 0.1514843783007321, + "loss": 0.0, + "num_input_tokens_seen": 34056864, + "step": 19875 + }, + { + "epoch": 96.50363196125907, + "grad_norm": 4.0482413510289916e-07, + "learning_rate": 0.15142547620982322, + "loss": 0.0, + "num_input_tokens_seen": 34065408, + "step": 19880 + }, + { + "epoch": 96.52784503631962, + "grad_norm": 2.2652046993698605e-07, + "learning_rate": 0.15136657389908797, + "loss": 0.0, + "num_input_tokens_seen": 34074208, + "step": 19885 + }, + { + "epoch": 96.55205811138015, + "grad_norm": 1.725377956063312e-07, + "learning_rate": 0.15130767137760986, + "loss": 0.0, + "num_input_tokens_seen": 34082688, + "step": 19890 + }, + { + "epoch": 96.57627118644068, + "grad_norm": 2.44246592728814e-07, + "learning_rate": 0.15124876865447243, + "loss": 0.0, + "num_input_tokens_seen": 34091168, + "step": 19895 + }, + { + "epoch": 96.60048426150121, + "grad_norm": 6.710798174935917e-07, + "learning_rate": 0.15118986573875912, + "loss": 0.0, + "num_input_tokens_seen": 34099808, + "step": 19900 + }, + { + "epoch": 96.62469733656174, + "grad_norm": 2.805970211738895e-07, + "learning_rate": 0.15113096263955358, + "loss": 0.0, + "num_input_tokens_seen": 34108160, + "step": 19905 + }, + { + "epoch": 96.64891041162228, + "grad_norm": 3.4683915828281897e-07, + "learning_rate": 0.1510720593659394, + "loss": 0.0, + "num_input_tokens_seen": 34116448, + "step": 19910 + }, + { + "epoch": 96.67312348668281, + "grad_norm": 1.652107215477372e-07, + "learning_rate": 0.15101315592700015, + "loss": 0.0, + "num_input_tokens_seen": 34125184, + "step": 19915 + }, + { + "epoch": 96.69733656174334, + "grad_norm": 2.634566556025675e-07, + "learning_rate": 0.15095425233181956, + "loss": 0.0, + "num_input_tokens_seen": 34133920, + "step": 19920 + }, + { + "epoch": 96.72154963680387, + "grad_norm": 5.288839020067826e-07, + "learning_rate": 0.15089534858948128, + "loss": 0.0, + "num_input_tokens_seen": 34142752, + "step": 19925 + }, + { + "epoch": 96.7457627118644, + "grad_norm": 9.961438252048538e-08, + "learning_rate": 0.15083644470906898, + "loss": 0.0, + "num_input_tokens_seen": 34151360, + "step": 19930 + }, + { + "epoch": 96.76997578692495, + "grad_norm": 2.7050595008404343e-07, + "learning_rate": 0.1507775406996664, + "loss": 0.0, + "num_input_tokens_seen": 34160320, + "step": 19935 + }, + { + "epoch": 96.79418886198548, + "grad_norm": 2.1223809199000243e-07, + "learning_rate": 0.15071863657035725, + "loss": 0.0, + "num_input_tokens_seen": 34169024, + "step": 19940 + }, + { + "epoch": 96.818401937046, + "grad_norm": 1.1102897445880444e-07, + "learning_rate": 0.15065973233022534, + "loss": 0.0, + "num_input_tokens_seen": 34177504, + "step": 19945 + }, + { + "epoch": 96.84261501210653, + "grad_norm": 2.047837739382885e-07, + "learning_rate": 0.15060082798835442, + "loss": 0.0, + "num_input_tokens_seen": 34186016, + "step": 19950 + }, + { + "epoch": 96.86682808716706, + "grad_norm": 1.5028042810172337e-07, + "learning_rate": 0.15054192355382823, + "loss": 0.0, + "num_input_tokens_seen": 34194784, + "step": 19955 + }, + { + "epoch": 96.89104116222761, + "grad_norm": 4.599074259203917e-07, + "learning_rate": 0.15048301903573066, + "loss": 0.0, + "num_input_tokens_seen": 34203040, + "step": 19960 + }, + { + "epoch": 96.91525423728814, + "grad_norm": 2.9447997462739295e-07, + "learning_rate": 0.15042411444314546, + "loss": 0.0, + "num_input_tokens_seen": 34211456, + "step": 19965 + }, + { + "epoch": 96.93946731234867, + "grad_norm": 3.6199890018906444e-07, + "learning_rate": 0.1503652097851565, + "loss": 0.0, + "num_input_tokens_seen": 34219840, + "step": 19970 + }, + { + "epoch": 96.9636803874092, + "grad_norm": 3.6592797414414235e-07, + "learning_rate": 0.15030630507084758, + "loss": 0.0, + "num_input_tokens_seen": 34228352, + "step": 19975 + }, + { + "epoch": 96.98789346246973, + "grad_norm": 2.719671101658605e-07, + "learning_rate": 0.1502474003093026, + "loss": 0.0, + "num_input_tokens_seen": 34237056, + "step": 19980 + }, + { + "epoch": 97.01452784503633, + "grad_norm": 3.354834916535765e-07, + "learning_rate": 0.15018849550960536, + "loss": 0.0, + "num_input_tokens_seen": 34245952, + "step": 19985 + }, + { + "epoch": 97.03874092009686, + "grad_norm": 2.0228283403866953e-07, + "learning_rate": 0.15012959068083975, + "loss": 0.0, + "num_input_tokens_seen": 34254624, + "step": 19990 + }, + { + "epoch": 97.06295399515739, + "grad_norm": 1.3033498191816761e-07, + "learning_rate": 0.1500706858320896, + "loss": 0.0, + "num_input_tokens_seen": 34263136, + "step": 19995 + }, + { + "epoch": 97.08716707021792, + "grad_norm": 1.6307407690874243e-07, + "learning_rate": 0.15001178097243886, + "loss": 0.0, + "num_input_tokens_seen": 34271648, + "step": 20000 + }, + { + "epoch": 97.08716707021792, + "eval_loss": 0.9254322648048401, + "eval_runtime": 4.6119, + "eval_samples_per_second": 79.576, + "eval_steps_per_second": 19.948, + "num_input_tokens_seen": 34271648, + "step": 20000 + }, + { + "epoch": 97.11138014527845, + "grad_norm": 2.502374627511017e-07, + "learning_rate": 0.1499528761109713, + "loss": 0.0, + "num_input_tokens_seen": 34280128, + "step": 20005 + }, + { + "epoch": 97.13559322033899, + "grad_norm": 5.415452619672578e-07, + "learning_rate": 0.14989397125677087, + "loss": 0.0, + "num_input_tokens_seen": 34288832, + "step": 20010 + }, + { + "epoch": 97.15980629539952, + "grad_norm": 2.5805945824686205e-07, + "learning_rate": 0.14983506641892141, + "loss": 0.0, + "num_input_tokens_seen": 34297152, + "step": 20015 + }, + { + "epoch": 97.18401937046005, + "grad_norm": 1.5078846615779185e-07, + "learning_rate": 0.14977616160650672, + "loss": 0.0, + "num_input_tokens_seen": 34305856, + "step": 20020 + }, + { + "epoch": 97.20823244552058, + "grad_norm": 2.2799717669386155e-07, + "learning_rate": 0.14971725682861076, + "loss": 0.0, + "num_input_tokens_seen": 34314368, + "step": 20025 + }, + { + "epoch": 97.23244552058111, + "grad_norm": 1.276180512377323e-07, + "learning_rate": 0.14965835209431738, + "loss": 0.0, + "num_input_tokens_seen": 34322976, + "step": 20030 + }, + { + "epoch": 97.25665859564165, + "grad_norm": 1.588122842122175e-07, + "learning_rate": 0.14959944741271036, + "loss": 0.0, + "num_input_tokens_seen": 34331616, + "step": 20035 + }, + { + "epoch": 97.28087167070218, + "grad_norm": 2.2179854397563759e-07, + "learning_rate": 0.14954054279287363, + "loss": 0.0, + "num_input_tokens_seen": 34340000, + "step": 20040 + }, + { + "epoch": 97.30508474576271, + "grad_norm": 3.414839682136517e-07, + "learning_rate": 0.14948163824389094, + "loss": 0.0, + "num_input_tokens_seen": 34348640, + "step": 20045 + }, + { + "epoch": 97.32929782082324, + "grad_norm": 3.6373941725287295e-07, + "learning_rate": 0.14942273377484613, + "loss": 0.0, + "num_input_tokens_seen": 34357280, + "step": 20050 + }, + { + "epoch": 97.35351089588377, + "grad_norm": 2.4705676082703576e-07, + "learning_rate": 0.1493638293948231, + "loss": 0.0, + "num_input_tokens_seen": 34365760, + "step": 20055 + }, + { + "epoch": 97.37772397094432, + "grad_norm": 4.543493048458913e-07, + "learning_rate": 0.14930492511290547, + "loss": 0.0, + "num_input_tokens_seen": 34374112, + "step": 20060 + }, + { + "epoch": 97.40193704600485, + "grad_norm": 3.6207163134349685e-07, + "learning_rate": 0.14924602093817715, + "loss": 0.0, + "num_input_tokens_seen": 34382752, + "step": 20065 + }, + { + "epoch": 97.42615012106538, + "grad_norm": 1.3220216033005272e-07, + "learning_rate": 0.14918711687972194, + "loss": 0.0, + "num_input_tokens_seen": 34391200, + "step": 20070 + }, + { + "epoch": 97.4503631961259, + "grad_norm": 9.786724319837958e-08, + "learning_rate": 0.14912821294662346, + "loss": 0.0, + "num_input_tokens_seen": 34399552, + "step": 20075 + }, + { + "epoch": 97.47457627118644, + "grad_norm": 3.2328082966159855e-07, + "learning_rate": 0.14906930914796554, + "loss": 0.0, + "num_input_tokens_seen": 34407680, + "step": 20080 + }, + { + "epoch": 97.49878934624698, + "grad_norm": 2.758217121368034e-08, + "learning_rate": 0.14901040549283182, + "loss": 0.0, + "num_input_tokens_seen": 34416544, + "step": 20085 + }, + { + "epoch": 97.52300242130751, + "grad_norm": 4.5743240661977325e-07, + "learning_rate": 0.148951501990306, + "loss": 0.0, + "num_input_tokens_seen": 34425120, + "step": 20090 + }, + { + "epoch": 97.54721549636804, + "grad_norm": 1.9647127658117824e-07, + "learning_rate": 0.14889259864947177, + "loss": 0.0, + "num_input_tokens_seen": 34434016, + "step": 20095 + }, + { + "epoch": 97.57142857142857, + "grad_norm": 1.1014638801043475e-07, + "learning_rate": 0.14883369547941272, + "loss": 0.0, + "num_input_tokens_seen": 34442368, + "step": 20100 + }, + { + "epoch": 97.5956416464891, + "grad_norm": 1.0627782387473417e-07, + "learning_rate": 0.14877479248921247, + "loss": 0.0, + "num_input_tokens_seen": 34451296, + "step": 20105 + }, + { + "epoch": 97.61985472154964, + "grad_norm": 2.766446414170787e-07, + "learning_rate": 0.14871588968795468, + "loss": 0.0, + "num_input_tokens_seen": 34460160, + "step": 20110 + }, + { + "epoch": 97.64406779661017, + "grad_norm": 1.9907874104774237e-07, + "learning_rate": 0.1486569870847228, + "loss": 0.0, + "num_input_tokens_seen": 34468704, + "step": 20115 + }, + { + "epoch": 97.6682808716707, + "grad_norm": 4.91249352307932e-08, + "learning_rate": 0.1485980846886004, + "loss": 0.0, + "num_input_tokens_seen": 34477600, + "step": 20120 + }, + { + "epoch": 97.69249394673123, + "grad_norm": 3.1707355674370774e-07, + "learning_rate": 0.14853918250867096, + "loss": 0.0, + "num_input_tokens_seen": 34486432, + "step": 20125 + }, + { + "epoch": 97.71670702179176, + "grad_norm": 3.465749216502445e-07, + "learning_rate": 0.1484802805540179, + "loss": 0.0, + "num_input_tokens_seen": 34494912, + "step": 20130 + }, + { + "epoch": 97.7409200968523, + "grad_norm": 4.7166107464136076e-08, + "learning_rate": 0.14842137883372472, + "loss": 0.0, + "num_input_tokens_seen": 34503520, + "step": 20135 + }, + { + "epoch": 97.76513317191284, + "grad_norm": 4.601623686539824e-07, + "learning_rate": 0.14836247735687474, + "loss": 0.0, + "num_input_tokens_seen": 34512064, + "step": 20140 + }, + { + "epoch": 97.78934624697337, + "grad_norm": 4.5149221250539995e-07, + "learning_rate": 0.14830357613255132, + "loss": 0.0, + "num_input_tokens_seen": 34520608, + "step": 20145 + }, + { + "epoch": 97.8135593220339, + "grad_norm": 4.1868329958560935e-07, + "learning_rate": 0.1482446751698378, + "loss": 0.0, + "num_input_tokens_seen": 34529088, + "step": 20150 + }, + { + "epoch": 97.83777239709443, + "grad_norm": 2.530456129079539e-07, + "learning_rate": 0.14818577447781744, + "loss": 0.0, + "num_input_tokens_seen": 34537472, + "step": 20155 + }, + { + "epoch": 97.86198547215497, + "grad_norm": 2.711565514346148e-07, + "learning_rate": 0.14812687406557346, + "loss": 0.0, + "num_input_tokens_seen": 34545728, + "step": 20160 + }, + { + "epoch": 97.8861985472155, + "grad_norm": 1.9746210000448627e-07, + "learning_rate": 0.14806797394218899, + "loss": 0.0, + "num_input_tokens_seen": 34554304, + "step": 20165 + }, + { + "epoch": 97.91041162227603, + "grad_norm": 5.431662657429115e-07, + "learning_rate": 0.1480090741167472, + "loss": 0.0, + "num_input_tokens_seen": 34562880, + "step": 20170 + }, + { + "epoch": 97.93462469733656, + "grad_norm": 4.176748120698903e-07, + "learning_rate": 0.1479501745983313, + "loss": 0.0, + "num_input_tokens_seen": 34571520, + "step": 20175 + }, + { + "epoch": 97.95883777239709, + "grad_norm": 7.642883304015413e-08, + "learning_rate": 0.14789127539602415, + "loss": 0.0, + "num_input_tokens_seen": 34580128, + "step": 20180 + }, + { + "epoch": 97.98305084745763, + "grad_norm": 5.6088163091772e-07, + "learning_rate": 0.14783237651890885, + "loss": 0.0, + "num_input_tokens_seen": 34588448, + "step": 20185 + }, + { + "epoch": 98.00968523002422, + "grad_norm": 2.49357697157393e-07, + "learning_rate": 0.14777347797606838, + "loss": 0.0, + "num_input_tokens_seen": 34596992, + "step": 20190 + }, + { + "epoch": 98.03389830508475, + "grad_norm": 3.143169067243434e-07, + "learning_rate": 0.14771457977658553, + "loss": 0.0, + "num_input_tokens_seen": 34605376, + "step": 20195 + }, + { + "epoch": 98.05811138014528, + "grad_norm": 2.96363623419893e-07, + "learning_rate": 0.14765568192954326, + "loss": 0.0, + "num_input_tokens_seen": 34613344, + "step": 20200 + }, + { + "epoch": 98.05811138014528, + "eval_loss": 0.9303249716758728, + "eval_runtime": 4.6096, + "eval_samples_per_second": 79.616, + "eval_steps_per_second": 19.958, + "num_input_tokens_seen": 34613344, + "step": 20200 + }, + { + "epoch": 98.08232445520581, + "grad_norm": 7.837755333639507e-08, + "learning_rate": 0.14759678444402421, + "loss": 0.0, + "num_input_tokens_seen": 34621952, + "step": 20205 + }, + { + "epoch": 98.10653753026634, + "grad_norm": 1.4310094798020145e-07, + "learning_rate": 0.14753788732911122, + "loss": 0.0, + "num_input_tokens_seen": 34630752, + "step": 20210 + }, + { + "epoch": 98.13075060532688, + "grad_norm": 4.0770311215965194e-07, + "learning_rate": 0.147478990593887, + "loss": 0.0, + "num_input_tokens_seen": 34639232, + "step": 20215 + }, + { + "epoch": 98.15496368038741, + "grad_norm": 2.6411575504425855e-07, + "learning_rate": 0.14742009424743405, + "loss": 0.0, + "num_input_tokens_seen": 34647840, + "step": 20220 + }, + { + "epoch": 98.17917675544794, + "grad_norm": 2.876755047509505e-07, + "learning_rate": 0.14736119829883504, + "loss": 0.0, + "num_input_tokens_seen": 34656096, + "step": 20225 + }, + { + "epoch": 98.20338983050847, + "grad_norm": 1.1078895312266468e-07, + "learning_rate": 0.14730230275717243, + "loss": 0.0, + "num_input_tokens_seen": 34664576, + "step": 20230 + }, + { + "epoch": 98.227602905569, + "grad_norm": 2.6929203045256145e-07, + "learning_rate": 0.14724340763152854, + "loss": 0.0, + "num_input_tokens_seen": 34673280, + "step": 20235 + }, + { + "epoch": 98.25181598062954, + "grad_norm": 1.5995000524071656e-07, + "learning_rate": 0.14718451293098594, + "loss": 0.0, + "num_input_tokens_seen": 34681952, + "step": 20240 + }, + { + "epoch": 98.27602905569007, + "grad_norm": 2.6511423811825807e-07, + "learning_rate": 0.14712561866462676, + "loss": 0.0, + "num_input_tokens_seen": 34690592, + "step": 20245 + }, + { + "epoch": 98.3002421307506, + "grad_norm": 9.984672288965157e-08, + "learning_rate": 0.1470667248415333, + "loss": 0.0, + "num_input_tokens_seen": 34698976, + "step": 20250 + }, + { + "epoch": 98.32445520581113, + "grad_norm": 3.6921409218848567e-07, + "learning_rate": 0.1470078314707878, + "loss": 0.0, + "num_input_tokens_seen": 34708064, + "step": 20255 + }, + { + "epoch": 98.34866828087166, + "grad_norm": 3.6817317550230655e-07, + "learning_rate": 0.14694893856147223, + "loss": 0.0, + "num_input_tokens_seen": 34716416, + "step": 20260 + }, + { + "epoch": 98.37288135593221, + "grad_norm": 1.7683792918887775e-07, + "learning_rate": 0.14689004612266868, + "loss": 0.0, + "num_input_tokens_seen": 34724896, + "step": 20265 + }, + { + "epoch": 98.39709443099274, + "grad_norm": 2.2201709271030268e-07, + "learning_rate": 0.14683115416345913, + "loss": 0.0, + "num_input_tokens_seen": 34733728, + "step": 20270 + }, + { + "epoch": 98.42130750605327, + "grad_norm": 2.3170406393546727e-07, + "learning_rate": 0.1467722626929254, + "loss": 0.0, + "num_input_tokens_seen": 34742464, + "step": 20275 + }, + { + "epoch": 98.4455205811138, + "grad_norm": 3.9733720313961385e-07, + "learning_rate": 0.14671337172014937, + "loss": 0.0, + "num_input_tokens_seen": 34750656, + "step": 20280 + }, + { + "epoch": 98.46973365617433, + "grad_norm": 1.893558163601483e-07, + "learning_rate": 0.14665448125421265, + "loss": 0.0, + "num_input_tokens_seen": 34759808, + "step": 20285 + }, + { + "epoch": 98.49394673123487, + "grad_norm": 2.0609309103747364e-07, + "learning_rate": 0.146595591304197, + "loss": 0.0, + "num_input_tokens_seen": 34768544, + "step": 20290 + }, + { + "epoch": 98.5181598062954, + "grad_norm": 1.0812251360903247e-07, + "learning_rate": 0.14653670187918397, + "loss": 0.0, + "num_input_tokens_seen": 34777152, + "step": 20295 + }, + { + "epoch": 98.54237288135593, + "grad_norm": 1.879555782124953e-07, + "learning_rate": 0.14647781298825502, + "loss": 0.0, + "num_input_tokens_seen": 34785792, + "step": 20300 + }, + { + "epoch": 98.56658595641646, + "grad_norm": 4.131134971885331e-07, + "learning_rate": 0.14641892464049153, + "loss": 0.0, + "num_input_tokens_seen": 34794304, + "step": 20305 + }, + { + "epoch": 98.59079903147699, + "grad_norm": 1.5002839859334927e-07, + "learning_rate": 0.14636003684497495, + "loss": 0.0, + "num_input_tokens_seen": 34803200, + "step": 20310 + }, + { + "epoch": 98.61501210653753, + "grad_norm": 7.695605290791718e-07, + "learning_rate": 0.14630114961078636, + "loss": 0.0, + "num_input_tokens_seen": 34811712, + "step": 20315 + }, + { + "epoch": 98.63922518159806, + "grad_norm": 5.5451977232223726e-08, + "learning_rate": 0.14624226294700704, + "loss": 0.0, + "num_input_tokens_seen": 34820288, + "step": 20320 + }, + { + "epoch": 98.6634382566586, + "grad_norm": 1.932080238020717e-07, + "learning_rate": 0.14618337686271793, + "loss": 0.0, + "num_input_tokens_seen": 34828928, + "step": 20325 + }, + { + "epoch": 98.68765133171912, + "grad_norm": 2.9420587566164613e-07, + "learning_rate": 0.1461244913670001, + "loss": 0.0, + "num_input_tokens_seen": 34837472, + "step": 20330 + }, + { + "epoch": 98.71186440677967, + "grad_norm": 7.54439426486897e-08, + "learning_rate": 0.1460656064689344, + "loss": 0.0, + "num_input_tokens_seen": 34846112, + "step": 20335 + }, + { + "epoch": 98.7360774818402, + "grad_norm": 2.0523165744634753e-07, + "learning_rate": 0.14600672217760163, + "loss": 0.0, + "num_input_tokens_seen": 34854816, + "step": 20340 + }, + { + "epoch": 98.76029055690073, + "grad_norm": 4.2500283825575025e-07, + "learning_rate": 0.14594783850208248, + "loss": 0.0, + "num_input_tokens_seen": 34863424, + "step": 20345 + }, + { + "epoch": 98.78450363196126, + "grad_norm": 1.343437787681978e-07, + "learning_rate": 0.14588895545145758, + "loss": 0.0, + "num_input_tokens_seen": 34871808, + "step": 20350 + }, + { + "epoch": 98.80871670702179, + "grad_norm": 2.1112690262725664e-07, + "learning_rate": 0.14583007303480738, + "loss": 0.0, + "num_input_tokens_seen": 34880480, + "step": 20355 + }, + { + "epoch": 98.83292978208233, + "grad_norm": 2.709654722821142e-07, + "learning_rate": 0.14577119126121235, + "loss": 0.0, + "num_input_tokens_seen": 34888768, + "step": 20360 + }, + { + "epoch": 98.85714285714286, + "grad_norm": 4.181790131951857e-07, + "learning_rate": 0.14571231013975272, + "loss": 0.0, + "num_input_tokens_seen": 34897152, + "step": 20365 + }, + { + "epoch": 98.88135593220339, + "grad_norm": 3.0673342621412303e-07, + "learning_rate": 0.1456534296795088, + "loss": 0.0, + "num_input_tokens_seen": 34905632, + "step": 20370 + }, + { + "epoch": 98.90556900726392, + "grad_norm": 3.589044865748292e-07, + "learning_rate": 0.14559454988956066, + "loss": 0.0, + "num_input_tokens_seen": 34914272, + "step": 20375 + }, + { + "epoch": 98.92978208232445, + "grad_norm": 2.523222519812407e-07, + "learning_rate": 0.1455356707789882, + "loss": 0.0, + "num_input_tokens_seen": 34922720, + "step": 20380 + }, + { + "epoch": 98.953995157385, + "grad_norm": 3.3944621691262e-07, + "learning_rate": 0.14547679235687147, + "loss": 0.0, + "num_input_tokens_seen": 34931328, + "step": 20385 + }, + { + "epoch": 98.97820823244552, + "grad_norm": 5.005434218219307e-07, + "learning_rate": 0.14541791463229023, + "loss": 0.0, + "num_input_tokens_seen": 34939744, + "step": 20390 + }, + { + "epoch": 99.00484261501211, + "grad_norm": 7.688484515711025e-07, + "learning_rate": 0.14535903761432406, + "loss": 0.0, + "num_input_tokens_seen": 34948640, + "step": 20395 + }, + { + "epoch": 99.02905569007264, + "grad_norm": 2.438217450162483e-07, + "learning_rate": 0.1453001613120527, + "loss": 0.0, + "num_input_tokens_seen": 34957056, + "step": 20400 + }, + { + "epoch": 99.02905569007264, + "eval_loss": 0.9378030300140381, + "eval_runtime": 4.63, + "eval_samples_per_second": 79.265, + "eval_steps_per_second": 19.87, + "num_input_tokens_seen": 34957056, + "step": 20400 + }, + { + "epoch": 99.05326876513317, + "grad_norm": 3.2837064622981416e-07, + "learning_rate": 0.14524128573455547, + "loss": 0.0, + "num_input_tokens_seen": 34965856, + "step": 20405 + }, + { + "epoch": 99.0774818401937, + "grad_norm": 3.621890130034444e-07, + "learning_rate": 0.14518241089091177, + "loss": 0.0, + "num_input_tokens_seen": 34974176, + "step": 20410 + }, + { + "epoch": 99.10169491525424, + "grad_norm": 3.356125830578094e-07, + "learning_rate": 0.1451235367902009, + "loss": 0.0, + "num_input_tokens_seen": 34982912, + "step": 20415 + }, + { + "epoch": 99.12590799031477, + "grad_norm": 1.632634507586772e-07, + "learning_rate": 0.1450646634415019, + "loss": 0.0, + "num_input_tokens_seen": 34991392, + "step": 20420 + }, + { + "epoch": 99.1501210653753, + "grad_norm": 2.2507971664254e-07, + "learning_rate": 0.1450057908538938, + "loss": 0.0, + "num_input_tokens_seen": 35000160, + "step": 20425 + }, + { + "epoch": 99.17433414043583, + "grad_norm": 2.2011168709923368e-07, + "learning_rate": 0.14494691903645557, + "loss": 0.0, + "num_input_tokens_seen": 35008736, + "step": 20430 + }, + { + "epoch": 99.19854721549636, + "grad_norm": 9.757399510590403e-08, + "learning_rate": 0.14488804799826588, + "loss": 0.0, + "num_input_tokens_seen": 35017024, + "step": 20435 + }, + { + "epoch": 99.2227602905569, + "grad_norm": 2.1534923178023746e-07, + "learning_rate": 0.14482917774840348, + "loss": 0.0, + "num_input_tokens_seen": 35026016, + "step": 20440 + }, + { + "epoch": 99.24697336561744, + "grad_norm": 2.914848096224887e-07, + "learning_rate": 0.14477030829594684, + "loss": 0.0, + "num_input_tokens_seen": 35034432, + "step": 20445 + }, + { + "epoch": 99.27118644067797, + "grad_norm": 2.9764859732495097e-07, + "learning_rate": 0.14471143964997432, + "loss": 0.0, + "num_input_tokens_seen": 35042976, + "step": 20450 + }, + { + "epoch": 99.2953995157385, + "grad_norm": 3.442031299982773e-07, + "learning_rate": 0.14465257181956434, + "loss": 0.0, + "num_input_tokens_seen": 35051808, + "step": 20455 + }, + { + "epoch": 99.31961259079903, + "grad_norm": 1.8055219186408067e-07, + "learning_rate": 0.1445937048137949, + "loss": 0.0, + "num_input_tokens_seen": 35060608, + "step": 20460 + }, + { + "epoch": 99.34382566585957, + "grad_norm": 2.0270688594337116e-07, + "learning_rate": 0.14453483864174416, + "loss": 0.0, + "num_input_tokens_seen": 35068928, + "step": 20465 + }, + { + "epoch": 99.3680387409201, + "grad_norm": 2.3198894893994293e-07, + "learning_rate": 0.14447597331249, + "loss": 0.0, + "num_input_tokens_seen": 35077024, + "step": 20470 + }, + { + "epoch": 99.39225181598063, + "grad_norm": 2.4384982566516555e-07, + "learning_rate": 0.1444171088351102, + "loss": 0.0, + "num_input_tokens_seen": 35086144, + "step": 20475 + }, + { + "epoch": 99.41646489104116, + "grad_norm": 1.2959917228272388e-07, + "learning_rate": 0.14435824521868235, + "loss": 0.0, + "num_input_tokens_seen": 35095232, + "step": 20480 + }, + { + "epoch": 99.44067796610169, + "grad_norm": 1.6309542161252466e-07, + "learning_rate": 0.14429938247228397, + "loss": 0.0, + "num_input_tokens_seen": 35104320, + "step": 20485 + }, + { + "epoch": 99.46489104116223, + "grad_norm": 5.345065900996815e-08, + "learning_rate": 0.14424052060499243, + "loss": 0.0, + "num_input_tokens_seen": 35113152, + "step": 20490 + }, + { + "epoch": 99.48910411622276, + "grad_norm": 1.9573548115658923e-07, + "learning_rate": 0.14418165962588506, + "loss": 0.0, + "num_input_tokens_seen": 35121728, + "step": 20495 + }, + { + "epoch": 99.51331719128329, + "grad_norm": 3.6784553003599285e-07, + "learning_rate": 0.1441227995440388, + "loss": 0.0, + "num_input_tokens_seen": 35130112, + "step": 20500 + }, + { + "epoch": 99.53753026634382, + "grad_norm": 1.529769804164971e-07, + "learning_rate": 0.14406394036853082, + "loss": 0.0, + "num_input_tokens_seen": 35138688, + "step": 20505 + }, + { + "epoch": 99.56174334140435, + "grad_norm": 2.407615795618767e-07, + "learning_rate": 0.14400508210843774, + "loss": 0.0, + "num_input_tokens_seen": 35147040, + "step": 20510 + }, + { + "epoch": 99.5859564164649, + "grad_norm": 1.1024097545941913e-07, + "learning_rate": 0.1439462247728364, + "loss": 0.0, + "num_input_tokens_seen": 35155264, + "step": 20515 + }, + { + "epoch": 99.61016949152543, + "grad_norm": 2.207738276638338e-07, + "learning_rate": 0.14388736837080326, + "loss": 0.0, + "num_input_tokens_seen": 35164032, + "step": 20520 + }, + { + "epoch": 99.63438256658596, + "grad_norm": 1.948592540657046e-07, + "learning_rate": 0.14382851291141469, + "loss": 0.0, + "num_input_tokens_seen": 35172384, + "step": 20525 + }, + { + "epoch": 99.65859564164649, + "grad_norm": 2.3189406306300953e-07, + "learning_rate": 0.14376965840374697, + "loss": 0.0, + "num_input_tokens_seen": 35180768, + "step": 20530 + }, + { + "epoch": 99.68280871670702, + "grad_norm": 2.598833930278488e-07, + "learning_rate": 0.14371080485687632, + "loss": 0.0, + "num_input_tokens_seen": 35189024, + "step": 20535 + }, + { + "epoch": 99.70702179176756, + "grad_norm": 3.182423142789048e-07, + "learning_rate": 0.1436519522798785, + "loss": 0.0, + "num_input_tokens_seen": 35197728, + "step": 20540 + }, + { + "epoch": 99.73123486682809, + "grad_norm": 1.884309170918641e-07, + "learning_rate": 0.14359310068182948, + "loss": 0.0, + "num_input_tokens_seen": 35206208, + "step": 20545 + }, + { + "epoch": 99.75544794188862, + "grad_norm": 3.417294465180021e-07, + "learning_rate": 0.14353425007180484, + "loss": 0.0, + "num_input_tokens_seen": 35214816, + "step": 20550 + }, + { + "epoch": 99.77966101694915, + "grad_norm": 1.5267173125721456e-07, + "learning_rate": 0.14347540045888005, + "loss": 0.0, + "num_input_tokens_seen": 35223584, + "step": 20555 + }, + { + "epoch": 99.80387409200968, + "grad_norm": 2.011536537338543e-07, + "learning_rate": 0.14341655185213056, + "loss": 0.0, + "num_input_tokens_seen": 35231680, + "step": 20560 + }, + { + "epoch": 99.82808716707022, + "grad_norm": 9.297970393618016e-08, + "learning_rate": 0.14335770426063144, + "loss": 0.0, + "num_input_tokens_seen": 35240064, + "step": 20565 + }, + { + "epoch": 99.85230024213075, + "grad_norm": 3.1684967893852445e-07, + "learning_rate": 0.1432988576934578, + "loss": 0.0, + "num_input_tokens_seen": 35248384, + "step": 20570 + }, + { + "epoch": 99.87651331719128, + "grad_norm": 2.0594735872236924e-07, + "learning_rate": 0.14324001215968457, + "loss": 0.0, + "num_input_tokens_seen": 35256576, + "step": 20575 + }, + { + "epoch": 99.90072639225181, + "grad_norm": 3.1535427069684374e-07, + "learning_rate": 0.14318116766838637, + "loss": 0.0, + "num_input_tokens_seen": 35265120, + "step": 20580 + }, + { + "epoch": 99.92493946731234, + "grad_norm": 1.1132643606970305e-07, + "learning_rate": 0.14312232422863788, + "loss": 0.0, + "num_input_tokens_seen": 35273664, + "step": 20585 + }, + { + "epoch": 99.94915254237289, + "grad_norm": 2.9123114586582233e-07, + "learning_rate": 0.14306348184951334, + "loss": 0.0, + "num_input_tokens_seen": 35282016, + "step": 20590 + }, + { + "epoch": 99.97336561743342, + "grad_norm": 1.409624559300937e-07, + "learning_rate": 0.1430046405400871, + "loss": 0.0, + "num_input_tokens_seen": 35290528, + "step": 20595 + }, + { + "epoch": 99.99757869249395, + "grad_norm": 2.137759906872816e-07, + "learning_rate": 0.14294580030943324, + "loss": 0.0, + "num_input_tokens_seen": 35299200, + "step": 20600 + }, + { + "epoch": 99.99757869249395, + "eval_loss": 0.9486237168312073, + "eval_runtime": 4.6179, + "eval_samples_per_second": 79.473, + "eval_steps_per_second": 19.922, + "num_input_tokens_seen": 35299200, + "step": 20600 + }, + { + "epoch": 100.02421307506053, + "grad_norm": 1.3886095473480964e-07, + "learning_rate": 0.14288696116662553, + "loss": 0.0, + "num_input_tokens_seen": 35308160, + "step": 20605 + }, + { + "epoch": 100.04842615012106, + "grad_norm": 3.662909762169875e-07, + "learning_rate": 0.1428281231207378, + "loss": 0.0, + "num_input_tokens_seen": 35316192, + "step": 20610 + }, + { + "epoch": 100.0726392251816, + "grad_norm": 1.9484116364765214e-07, + "learning_rate": 0.1427692861808437, + "loss": 0.0, + "num_input_tokens_seen": 35324896, + "step": 20615 + }, + { + "epoch": 100.09685230024213, + "grad_norm": 2.9332130679904367e-07, + "learning_rate": 0.1427104503560165, + "loss": 0.0, + "num_input_tokens_seen": 35333408, + "step": 20620 + }, + { + "epoch": 100.12106537530266, + "grad_norm": 3.212215062831092e-07, + "learning_rate": 0.14265161565532947, + "loss": 0.0, + "num_input_tokens_seen": 35342240, + "step": 20625 + }, + { + "epoch": 100.1452784503632, + "grad_norm": 2.4434658030259016e-07, + "learning_rate": 0.14259278208785564, + "loss": 0.0, + "num_input_tokens_seen": 35350784, + "step": 20630 + }, + { + "epoch": 100.16949152542372, + "grad_norm": 4.2724786908365786e-07, + "learning_rate": 0.14253394966266789, + "loss": 0.0, + "num_input_tokens_seen": 35358944, + "step": 20635 + }, + { + "epoch": 100.19370460048427, + "grad_norm": 1.8705109994243685e-07, + "learning_rate": 0.14247511838883894, + "loss": 0.0, + "num_input_tokens_seen": 35368064, + "step": 20640 + }, + { + "epoch": 100.2179176755448, + "grad_norm": 1.0472129474692338e-07, + "learning_rate": 0.14241628827544126, + "loss": 0.0, + "num_input_tokens_seen": 35376736, + "step": 20645 + }, + { + "epoch": 100.24213075060533, + "grad_norm": 2.452457863455493e-07, + "learning_rate": 0.14235745933154723, + "loss": 0.0, + "num_input_tokens_seen": 35385056, + "step": 20650 + }, + { + "epoch": 100.26634382566586, + "grad_norm": 2.4210089577536564e-07, + "learning_rate": 0.14229863156622907, + "loss": 0.0, + "num_input_tokens_seen": 35394144, + "step": 20655 + }, + { + "epoch": 100.29055690072639, + "grad_norm": 8.477740465195893e-08, + "learning_rate": 0.14223980498855868, + "loss": 0.0, + "num_input_tokens_seen": 35402912, + "step": 20660 + }, + { + "epoch": 100.31476997578693, + "grad_norm": 2.6632778826751746e-07, + "learning_rate": 0.14218097960760792, + "loss": 0.0, + "num_input_tokens_seen": 35411840, + "step": 20665 + }, + { + "epoch": 100.33898305084746, + "grad_norm": 2.3038678875764163e-07, + "learning_rate": 0.1421221554324483, + "loss": 0.0, + "num_input_tokens_seen": 35420448, + "step": 20670 + }, + { + "epoch": 100.36319612590799, + "grad_norm": 1.935724043278242e-07, + "learning_rate": 0.1420633324721513, + "loss": 0.0, + "num_input_tokens_seen": 35428960, + "step": 20675 + }, + { + "epoch": 100.38740920096852, + "grad_norm": 1.3546818422582874e-07, + "learning_rate": 0.14200451073578824, + "loss": 0.0, + "num_input_tokens_seen": 35437728, + "step": 20680 + }, + { + "epoch": 100.41162227602905, + "grad_norm": 3.114191144959477e-07, + "learning_rate": 0.14194569023243003, + "loss": 0.0, + "num_input_tokens_seen": 35446080, + "step": 20685 + }, + { + "epoch": 100.4358353510896, + "grad_norm": 2.3734882859116624e-07, + "learning_rate": 0.14188687097114766, + "loss": 0.0, + "num_input_tokens_seen": 35454400, + "step": 20690 + }, + { + "epoch": 100.46004842615012, + "grad_norm": 1.266610070160823e-07, + "learning_rate": 0.14182805296101172, + "loss": 0.0, + "num_input_tokens_seen": 35462656, + "step": 20695 + }, + { + "epoch": 100.48426150121065, + "grad_norm": 1.0549283757654848e-07, + "learning_rate": 0.14176923621109272, + "loss": 0.0, + "num_input_tokens_seen": 35471264, + "step": 20700 + }, + { + "epoch": 100.50847457627118, + "grad_norm": 2.735762905103911e-07, + "learning_rate": 0.14171042073046097, + "loss": 0.0, + "num_input_tokens_seen": 35480064, + "step": 20705 + }, + { + "epoch": 100.53268765133171, + "grad_norm": 1.7570073396200314e-07, + "learning_rate": 0.14165160652818642, + "loss": 0.0, + "num_input_tokens_seen": 35488992, + "step": 20710 + }, + { + "epoch": 100.55690072639226, + "grad_norm": 2.849845088803704e-07, + "learning_rate": 0.14159279361333907, + "loss": 0.0, + "num_input_tokens_seen": 35497792, + "step": 20715 + }, + { + "epoch": 100.58111380145279, + "grad_norm": 2.722880196870392e-07, + "learning_rate": 0.14153398199498868, + "loss": 0.0, + "num_input_tokens_seen": 35506272, + "step": 20720 + }, + { + "epoch": 100.60532687651332, + "grad_norm": 4.017488492991106e-07, + "learning_rate": 0.14147517168220458, + "loss": 0.0, + "num_input_tokens_seen": 35514688, + "step": 20725 + }, + { + "epoch": 100.62953995157385, + "grad_norm": 9.006672740952126e-08, + "learning_rate": 0.14141636268405616, + "loss": 0.0, + "num_input_tokens_seen": 35523264, + "step": 20730 + }, + { + "epoch": 100.65375302663438, + "grad_norm": 1.8339873975037335e-07, + "learning_rate": 0.14135755500961253, + "loss": 0.0, + "num_input_tokens_seen": 35532032, + "step": 20735 + }, + { + "epoch": 100.67796610169492, + "grad_norm": 1.921024477269384e-07, + "learning_rate": 0.14129874866794245, + "loss": 0.0, + "num_input_tokens_seen": 35540512, + "step": 20740 + }, + { + "epoch": 100.70217917675545, + "grad_norm": 7.535095392086077e-08, + "learning_rate": 0.14123994366811476, + "loss": 0.0, + "num_input_tokens_seen": 35548704, + "step": 20745 + }, + { + "epoch": 100.72639225181598, + "grad_norm": 1.4199366660250234e-07, + "learning_rate": 0.14118114001919774, + "loss": 0.0, + "num_input_tokens_seen": 35557184, + "step": 20750 + }, + { + "epoch": 100.75060532687651, + "grad_norm": 1.7893536607971328e-07, + "learning_rate": 0.14112233773025978, + "loss": 0.0, + "num_input_tokens_seen": 35565472, + "step": 20755 + }, + { + "epoch": 100.77481840193704, + "grad_norm": 1.3997794212627923e-07, + "learning_rate": 0.14106353681036896, + "loss": 0.0, + "num_input_tokens_seen": 35574272, + "step": 20760 + }, + { + "epoch": 100.79903147699758, + "grad_norm": 2.529817777485732e-07, + "learning_rate": 0.14100473726859303, + "loss": 0.0, + "num_input_tokens_seen": 35582464, + "step": 20765 + }, + { + "epoch": 100.82324455205811, + "grad_norm": 3.5862913705386745e-07, + "learning_rate": 0.14094593911399964, + "loss": 0.0, + "num_input_tokens_seen": 35591424, + "step": 20770 + }, + { + "epoch": 100.84745762711864, + "grad_norm": 2.143820410083208e-07, + "learning_rate": 0.14088714235565625, + "loss": 0.0, + "num_input_tokens_seen": 35599872, + "step": 20775 + }, + { + "epoch": 100.87167070217917, + "grad_norm": 4.16897840693764e-08, + "learning_rate": 0.14082834700263, + "loss": 0.0, + "num_input_tokens_seen": 35608480, + "step": 20780 + }, + { + "epoch": 100.8958837772397, + "grad_norm": 2.0218534757532325e-07, + "learning_rate": 0.14076955306398795, + "loss": 0.0, + "num_input_tokens_seen": 35616960, + "step": 20785 + }, + { + "epoch": 100.92009685230025, + "grad_norm": 1.9674047280204832e-07, + "learning_rate": 0.14071076054879675, + "loss": 0.0, + "num_input_tokens_seen": 35625952, + "step": 20790 + }, + { + "epoch": 100.94430992736078, + "grad_norm": 1.925626378351808e-07, + "learning_rate": 0.14065196946612302, + "loss": 0.0, + "num_input_tokens_seen": 35634336, + "step": 20795 + }, + { + "epoch": 100.9685230024213, + "grad_norm": 3.328993969375915e-08, + "learning_rate": 0.1405931798250331, + "loss": 0.0, + "num_input_tokens_seen": 35642464, + "step": 20800 + }, + { + "epoch": 100.9685230024213, + "eval_loss": 0.9553340673446655, + "eval_runtime": 4.6222, + "eval_samples_per_second": 79.4, + "eval_steps_per_second": 19.904, + "num_input_tokens_seen": 35642464, + "step": 20800 + }, + { + "epoch": 100.99273607748184, + "grad_norm": 2.8621971637221577e-07, + "learning_rate": 0.14053439163459308, + "loss": 0.0, + "num_input_tokens_seen": 35651168, + "step": 20805 + }, + { + "epoch": 101.01937046004842, + "grad_norm": 2.654080617503496e-07, + "learning_rate": 0.14047560490386876, + "loss": 0.0, + "num_input_tokens_seen": 35659936, + "step": 20810 + }, + { + "epoch": 101.04358353510897, + "grad_norm": 2.5009063620018424e-07, + "learning_rate": 0.14041681964192593, + "loss": 0.0, + "num_input_tokens_seen": 35668224, + "step": 20815 + }, + { + "epoch": 101.0677966101695, + "grad_norm": 1.721008686672576e-07, + "learning_rate": 0.14035803585782988, + "loss": 0.0, + "num_input_tokens_seen": 35676800, + "step": 20820 + }, + { + "epoch": 101.09200968523002, + "grad_norm": 2.8622326908589457e-07, + "learning_rate": 0.14029925356064593, + "loss": 0.0, + "num_input_tokens_seen": 35685280, + "step": 20825 + }, + { + "epoch": 101.11622276029055, + "grad_norm": 7.285962055902928e-08, + "learning_rate": 0.1402404727594389, + "loss": 0.0, + "num_input_tokens_seen": 35693920, + "step": 20830 + }, + { + "epoch": 101.14043583535108, + "grad_norm": 1.306437269477101e-07, + "learning_rate": 0.1401816934632737, + "loss": 0.0, + "num_input_tokens_seen": 35702816, + "step": 20835 + }, + { + "epoch": 101.16464891041163, + "grad_norm": 1.306407426682199e-07, + "learning_rate": 0.1401229156812147, + "loss": 0.0, + "num_input_tokens_seen": 35712032, + "step": 20840 + }, + { + "epoch": 101.18886198547216, + "grad_norm": 2.0806746192647552e-07, + "learning_rate": 0.14006413942232626, + "loss": 0.0, + "num_input_tokens_seen": 35720256, + "step": 20845 + }, + { + "epoch": 101.21307506053269, + "grad_norm": 2.5200006348313764e-07, + "learning_rate": 0.14000536469567235, + "loss": 0.0, + "num_input_tokens_seen": 35728864, + "step": 20850 + }, + { + "epoch": 101.23728813559322, + "grad_norm": 3.8808462932138355e-07, + "learning_rate": 0.13994659151031685, + "loss": 0.0, + "num_input_tokens_seen": 35737728, + "step": 20855 + }, + { + "epoch": 101.26150121065375, + "grad_norm": 1.6259996016287914e-07, + "learning_rate": 0.13988781987532323, + "loss": 0.0, + "num_input_tokens_seen": 35746432, + "step": 20860 + }, + { + "epoch": 101.28571428571429, + "grad_norm": 9.988420401896292e-08, + "learning_rate": 0.1398290497997549, + "loss": 0.0, + "num_input_tokens_seen": 35754784, + "step": 20865 + }, + { + "epoch": 101.30992736077482, + "grad_norm": 3.0558109642697673e-07, + "learning_rate": 0.13977028129267488, + "loss": 0.0, + "num_input_tokens_seen": 35763296, + "step": 20870 + }, + { + "epoch": 101.33414043583535, + "grad_norm": 5.500701760752236e-08, + "learning_rate": 0.13971151436314605, + "loss": 0.0, + "num_input_tokens_seen": 35772000, + "step": 20875 + }, + { + "epoch": 101.35835351089588, + "grad_norm": 2.3601701570896694e-07, + "learning_rate": 0.13965274902023103, + "loss": 0.0, + "num_input_tokens_seen": 35780832, + "step": 20880 + }, + { + "epoch": 101.38256658595641, + "grad_norm": 3.564347537121648e-07, + "learning_rate": 0.13959398527299208, + "loss": 0.0, + "num_input_tokens_seen": 35789216, + "step": 20885 + }, + { + "epoch": 101.40677966101696, + "grad_norm": 2.4820673161229934e-07, + "learning_rate": 0.13953522313049138, + "loss": 0.0, + "num_input_tokens_seen": 35797472, + "step": 20890 + }, + { + "epoch": 101.43099273607749, + "grad_norm": 2.3847675834076654e-07, + "learning_rate": 0.13947646260179083, + "loss": 0.0, + "num_input_tokens_seen": 35806016, + "step": 20895 + }, + { + "epoch": 101.45520581113801, + "grad_norm": 2.748911640537699e-07, + "learning_rate": 0.13941770369595194, + "loss": 0.0, + "num_input_tokens_seen": 35814496, + "step": 20900 + }, + { + "epoch": 101.47941888619854, + "grad_norm": 1.4508349011066457e-07, + "learning_rate": 0.1393589464220362, + "loss": 0.0, + "num_input_tokens_seen": 35823072, + "step": 20905 + }, + { + "epoch": 101.50363196125907, + "grad_norm": 2.5665661951279617e-07, + "learning_rate": 0.13930019078910455, + "loss": 0.0, + "num_input_tokens_seen": 35831488, + "step": 20910 + }, + { + "epoch": 101.52784503631962, + "grad_norm": 1.6748371933772432e-07, + "learning_rate": 0.139241436806218, + "loss": 0.0, + "num_input_tokens_seen": 35839872, + "step": 20915 + }, + { + "epoch": 101.55205811138015, + "grad_norm": 2.920401982464682e-07, + "learning_rate": 0.13918268448243712, + "loss": 0.0, + "num_input_tokens_seen": 35848192, + "step": 20920 + }, + { + "epoch": 101.57627118644068, + "grad_norm": 1.4346419163757673e-07, + "learning_rate": 0.13912393382682217, + "loss": 0.0, + "num_input_tokens_seen": 35856768, + "step": 20925 + }, + { + "epoch": 101.60048426150121, + "grad_norm": 6.914714845152048e-08, + "learning_rate": 0.1390651848484333, + "loss": 0.0, + "num_input_tokens_seen": 35865408, + "step": 20930 + }, + { + "epoch": 101.62469733656174, + "grad_norm": 1.6745363495829224e-07, + "learning_rate": 0.1390064375563304, + "loss": 0.0, + "num_input_tokens_seen": 35873888, + "step": 20935 + }, + { + "epoch": 101.64891041162228, + "grad_norm": 2.0322764271440974e-07, + "learning_rate": 0.13894769195957293, + "loss": 0.0, + "num_input_tokens_seen": 35882368, + "step": 20940 + }, + { + "epoch": 101.67312348668281, + "grad_norm": 1.652031613730287e-07, + "learning_rate": 0.13888894806722032, + "loss": 0.0, + "num_input_tokens_seen": 35890784, + "step": 20945 + }, + { + "epoch": 101.69733656174334, + "grad_norm": 2.721850194120634e-07, + "learning_rate": 0.1388302058883315, + "loss": 0.0, + "num_input_tokens_seen": 35899488, + "step": 20950 + }, + { + "epoch": 101.72154963680387, + "grad_norm": 1.8616236729940283e-07, + "learning_rate": 0.13877146543196528, + "loss": 0.0, + "num_input_tokens_seen": 35907776, + "step": 20955 + }, + { + "epoch": 101.7457627118644, + "grad_norm": 1.0688623319765611e-07, + "learning_rate": 0.13871272670718027, + "loss": 0.0, + "num_input_tokens_seen": 35916320, + "step": 20960 + }, + { + "epoch": 101.76997578692495, + "grad_norm": 2.903409779264621e-07, + "learning_rate": 0.13865398972303455, + "loss": 0.0, + "num_input_tokens_seen": 35924960, + "step": 20965 + }, + { + "epoch": 101.79418886198548, + "grad_norm": 1.5850029910779995e-07, + "learning_rate": 0.13859525448858623, + "loss": 0.0, + "num_input_tokens_seen": 35933920, + "step": 20970 + }, + { + "epoch": 101.818401937046, + "grad_norm": 1.3447163382807048e-07, + "learning_rate": 0.13853652101289304, + "loss": 0.0, + "num_input_tokens_seen": 35942912, + "step": 20975 + }, + { + "epoch": 101.84261501210653, + "grad_norm": 1.4872128417664499e-07, + "learning_rate": 0.13847778930501234, + "loss": 0.0, + "num_input_tokens_seen": 35951168, + "step": 20980 + }, + { + "epoch": 101.86682808716706, + "grad_norm": 7.538263702144832e-08, + "learning_rate": 0.1384190593740013, + "loss": 0.0, + "num_input_tokens_seen": 35959648, + "step": 20985 + }, + { + "epoch": 101.89104116222761, + "grad_norm": 1.3185734815124306e-07, + "learning_rate": 0.13836033122891686, + "loss": 0.0, + "num_input_tokens_seen": 35968160, + "step": 20990 + }, + { + "epoch": 101.91525423728814, + "grad_norm": 6.42182911292366e-08, + "learning_rate": 0.1383016048788156, + "loss": 0.0, + "num_input_tokens_seen": 35976768, + "step": 20995 + }, + { + "epoch": 101.93946731234867, + "grad_norm": 8.714342669691177e-08, + "learning_rate": 0.13824288033275392, + "loss": 0.0, + "num_input_tokens_seen": 35985280, + "step": 21000 + }, + { + "epoch": 101.93946731234867, + "eval_loss": 0.9646040201187134, + "eval_runtime": 4.6292, + "eval_samples_per_second": 79.279, + "eval_steps_per_second": 19.874, + "num_input_tokens_seen": 35985280, + "step": 21000 + }, + { + "epoch": 101.9636803874092, + "grad_norm": 2.2718546688338392e-07, + "learning_rate": 0.1381841575997878, + "loss": 0.0, + "num_input_tokens_seen": 35993472, + "step": 21005 + }, + { + "epoch": 101.98789346246973, + "grad_norm": 3.215180299775966e-07, + "learning_rate": 0.13812543668897306, + "loss": 0.0, + "num_input_tokens_seen": 36001824, + "step": 21010 + }, + { + "epoch": 102.01452784503633, + "grad_norm": 1.6738269437155395e-07, + "learning_rate": 0.13806671760936526, + "loss": 0.0, + "num_input_tokens_seen": 36011200, + "step": 21015 + }, + { + "epoch": 102.03874092009686, + "grad_norm": 9.000755341048716e-08, + "learning_rate": 0.13800800037001956, + "loss": 0.0, + "num_input_tokens_seen": 36019936, + "step": 21020 + }, + { + "epoch": 102.06295399515739, + "grad_norm": 2.755703008006094e-07, + "learning_rate": 0.13794928497999087, + "loss": 0.0, + "num_input_tokens_seen": 36028608, + "step": 21025 + }, + { + "epoch": 102.08716707021792, + "grad_norm": 6.930369522706314e-08, + "learning_rate": 0.1378905714483339, + "loss": 0.0, + "num_input_tokens_seen": 36037408, + "step": 21030 + }, + { + "epoch": 102.11138014527845, + "grad_norm": 1.8430023374094162e-07, + "learning_rate": 0.13783185978410295, + "loss": 0.0, + "num_input_tokens_seen": 36046080, + "step": 21035 + }, + { + "epoch": 102.13559322033899, + "grad_norm": 9.499921560518487e-08, + "learning_rate": 0.13777314999635218, + "loss": 0.0, + "num_input_tokens_seen": 36054752, + "step": 21040 + }, + { + "epoch": 102.15980629539952, + "grad_norm": 1.6853296358476655e-07, + "learning_rate": 0.1377144420941353, + "loss": 0.0, + "num_input_tokens_seen": 36063456, + "step": 21045 + }, + { + "epoch": 102.18401937046005, + "grad_norm": 1.2901720936042693e-07, + "learning_rate": 0.13765573608650586, + "loss": 0.0, + "num_input_tokens_seen": 36071936, + "step": 21050 + }, + { + "epoch": 102.20823244552058, + "grad_norm": 1.0719433163330905e-07, + "learning_rate": 0.13759703198251702, + "loss": 0.0, + "num_input_tokens_seen": 36080096, + "step": 21055 + }, + { + "epoch": 102.23244552058111, + "grad_norm": 2.6469990643818164e-07, + "learning_rate": 0.13753832979122174, + "loss": 0.0, + "num_input_tokens_seen": 36088736, + "step": 21060 + }, + { + "epoch": 102.25665859564165, + "grad_norm": 1.6233842359270056e-07, + "learning_rate": 0.13747962952167264, + "loss": 0.0, + "num_input_tokens_seen": 36096896, + "step": 21065 + }, + { + "epoch": 102.28087167070218, + "grad_norm": 1.2581200792283198e-07, + "learning_rate": 0.13742093118292192, + "loss": 0.0, + "num_input_tokens_seen": 36105632, + "step": 21070 + }, + { + "epoch": 102.30508474576271, + "grad_norm": 5.590686313894366e-08, + "learning_rate": 0.13736223478402174, + "loss": 0.0, + "num_input_tokens_seen": 36114368, + "step": 21075 + }, + { + "epoch": 102.32929782082324, + "grad_norm": 7.365527210367873e-08, + "learning_rate": 0.1373035403340238, + "loss": 0.0, + "num_input_tokens_seen": 36123264, + "step": 21080 + }, + { + "epoch": 102.35351089588377, + "grad_norm": 1.9978887166871573e-07, + "learning_rate": 0.13724484784197943, + "loss": 0.0, + "num_input_tokens_seen": 36131904, + "step": 21085 + }, + { + "epoch": 102.37772397094432, + "grad_norm": 1.252024333098234e-07, + "learning_rate": 0.13718615731693987, + "loss": 0.0, + "num_input_tokens_seen": 36140160, + "step": 21090 + }, + { + "epoch": 102.40193704600485, + "grad_norm": 4.4743657667822845e-08, + "learning_rate": 0.13712746876795587, + "loss": 0.0, + "num_input_tokens_seen": 36148704, + "step": 21095 + }, + { + "epoch": 102.42615012106538, + "grad_norm": 1.1671731670048757e-07, + "learning_rate": 0.13706878220407792, + "loss": 0.0, + "num_input_tokens_seen": 36157184, + "step": 21100 + }, + { + "epoch": 102.4503631961259, + "grad_norm": 1.0446970577504544e-07, + "learning_rate": 0.13701009763435631, + "loss": 0.0, + "num_input_tokens_seen": 36165568, + "step": 21105 + }, + { + "epoch": 102.47457627118644, + "grad_norm": 1.777732450136682e-07, + "learning_rate": 0.13695141506784084, + "loss": 0.0, + "num_input_tokens_seen": 36174272, + "step": 21110 + }, + { + "epoch": 102.49878934624698, + "grad_norm": 2.648557142492791e-07, + "learning_rate": 0.13689273451358114, + "loss": 0.0, + "num_input_tokens_seen": 36182496, + "step": 21115 + }, + { + "epoch": 102.52300242130751, + "grad_norm": 9.703914827241533e-08, + "learning_rate": 0.13683405598062653, + "loss": 0.0, + "num_input_tokens_seen": 36191040, + "step": 21120 + }, + { + "epoch": 102.54721549636804, + "grad_norm": 8.806669882233109e-08, + "learning_rate": 0.1367753794780259, + "loss": 0.0, + "num_input_tokens_seen": 36199808, + "step": 21125 + }, + { + "epoch": 102.57142857142857, + "grad_norm": 1.7248035533157235e-07, + "learning_rate": 0.13671670501482802, + "loss": 0.0, + "num_input_tokens_seen": 36208320, + "step": 21130 + }, + { + "epoch": 102.5956416464891, + "grad_norm": 1.235488724660172e-07, + "learning_rate": 0.1366580326000811, + "loss": 0.0, + "num_input_tokens_seen": 36216480, + "step": 21135 + }, + { + "epoch": 102.61985472154964, + "grad_norm": 2.284615305825355e-07, + "learning_rate": 0.1365993622428332, + "loss": 0.0, + "num_input_tokens_seen": 36225216, + "step": 21140 + }, + { + "epoch": 102.64406779661017, + "grad_norm": 2.1804235927902482e-07, + "learning_rate": 0.13654069395213211, + "loss": 0.0, + "num_input_tokens_seen": 36233536, + "step": 21145 + }, + { + "epoch": 102.6682808716707, + "grad_norm": 1.3740941540163476e-07, + "learning_rate": 0.13648202773702509, + "loss": 0.0, + "num_input_tokens_seen": 36241696, + "step": 21150 + }, + { + "epoch": 102.69249394673123, + "grad_norm": 7.915917876744061e-08, + "learning_rate": 0.13642336360655927, + "loss": 0.0, + "num_input_tokens_seen": 36250432, + "step": 21155 + }, + { + "epoch": 102.71670702179176, + "grad_norm": 1.7746302205523534e-07, + "learning_rate": 0.13636470156978145, + "loss": 0.0, + "num_input_tokens_seen": 36259104, + "step": 21160 + }, + { + "epoch": 102.7409200968523, + "grad_norm": 2.5895610633597244e-07, + "learning_rate": 0.13630604163573798, + "loss": 0.0, + "num_input_tokens_seen": 36267808, + "step": 21165 + }, + { + "epoch": 102.76513317191284, + "grad_norm": 2.834200927281927e-07, + "learning_rate": 0.13624738381347495, + "loss": 0.0, + "num_input_tokens_seen": 36276448, + "step": 21170 + }, + { + "epoch": 102.78934624697337, + "grad_norm": 1.3739736459683627e-07, + "learning_rate": 0.1361887281120382, + "loss": 0.0, + "num_input_tokens_seen": 36284928, + "step": 21175 + }, + { + "epoch": 102.8135593220339, + "grad_norm": 1.373298061935202e-07, + "learning_rate": 0.13613007454047307, + "loss": 0.0, + "num_input_tokens_seen": 36293184, + "step": 21180 + }, + { + "epoch": 102.83777239709443, + "grad_norm": 2.956747096050094e-07, + "learning_rate": 0.13607142310782486, + "loss": 0.0, + "num_input_tokens_seen": 36301920, + "step": 21185 + }, + { + "epoch": 102.86198547215497, + "grad_norm": 9.929725308666093e-08, + "learning_rate": 0.13601277382313814, + "loss": 0.0, + "num_input_tokens_seen": 36310752, + "step": 21190 + }, + { + "epoch": 102.8861985472155, + "grad_norm": 2.9238302090561774e-07, + "learning_rate": 0.1359541266954575, + "loss": 0.0, + "num_input_tokens_seen": 36319200, + "step": 21195 + }, + { + "epoch": 102.91041162227603, + "grad_norm": 1.1869425264876554e-07, + "learning_rate": 0.13589548173382707, + "loss": 0.0, + "num_input_tokens_seen": 36327840, + "step": 21200 + }, + { + "epoch": 102.91041162227603, + "eval_loss": 0.9643514752388, + "eval_runtime": 4.6209, + "eval_samples_per_second": 79.422, + "eval_steps_per_second": 19.91, + "num_input_tokens_seen": 36327840, + "step": 21200 + }, + { + "epoch": 102.93462469733656, + "grad_norm": 1.5408951981044083e-07, + "learning_rate": 0.1358368389472906, + "loss": 0.0, + "num_input_tokens_seen": 36336576, + "step": 21205 + }, + { + "epoch": 102.95883777239709, + "grad_norm": 1.7577751521002938e-07, + "learning_rate": 0.13577819834489155, + "loss": 0.0, + "num_input_tokens_seen": 36345280, + "step": 21210 + }, + { + "epoch": 102.98305084745763, + "grad_norm": 1.5795784236161126e-07, + "learning_rate": 0.135719559935673, + "loss": 0.0, + "num_input_tokens_seen": 36353568, + "step": 21215 + }, + { + "epoch": 103.00968523002422, + "grad_norm": 4.471011649798129e-08, + "learning_rate": 0.13566092372867775, + "loss": 0.0, + "num_input_tokens_seen": 36362688, + "step": 21220 + }, + { + "epoch": 103.03389830508475, + "grad_norm": 2.0585939353168214e-07, + "learning_rate": 0.13560228973294833, + "loss": 0.0, + "num_input_tokens_seen": 36371168, + "step": 21225 + }, + { + "epoch": 103.05811138014528, + "grad_norm": 6.222439452585604e-08, + "learning_rate": 0.13554365795752668, + "loss": 0.0, + "num_input_tokens_seen": 36379936, + "step": 21230 + }, + { + "epoch": 103.08232445520581, + "grad_norm": 2.691902807328006e-07, + "learning_rate": 0.1354850284114547, + "loss": 0.0, + "num_input_tokens_seen": 36388448, + "step": 21235 + }, + { + "epoch": 103.10653753026634, + "grad_norm": 1.6133463986989227e-07, + "learning_rate": 0.13542640110377374, + "loss": 0.0, + "num_input_tokens_seen": 36396416, + "step": 21240 + }, + { + "epoch": 103.13075060532688, + "grad_norm": 2.476309362009488e-07, + "learning_rate": 0.13536777604352487, + "loss": 0.0, + "num_input_tokens_seen": 36404576, + "step": 21245 + }, + { + "epoch": 103.15496368038741, + "grad_norm": 1.1627196272456786e-07, + "learning_rate": 0.13530915323974887, + "loss": 0.0, + "num_input_tokens_seen": 36413312, + "step": 21250 + }, + { + "epoch": 103.17917675544794, + "grad_norm": 9.724562488599986e-08, + "learning_rate": 0.13525053270148596, + "loss": 0.0, + "num_input_tokens_seen": 36421696, + "step": 21255 + }, + { + "epoch": 103.20338983050847, + "grad_norm": 1.440338195379809e-07, + "learning_rate": 0.13519191443777628, + "loss": 0.0, + "num_input_tokens_seen": 36430368, + "step": 21260 + }, + { + "epoch": 103.227602905569, + "grad_norm": 1.0477150169663219e-07, + "learning_rate": 0.13513329845765953, + "loss": 0.0, + "num_input_tokens_seen": 36439104, + "step": 21265 + }, + { + "epoch": 103.25181598062954, + "grad_norm": 2.981432203341683e-07, + "learning_rate": 0.13507468477017495, + "loss": 0.0, + "num_input_tokens_seen": 36447584, + "step": 21270 + }, + { + "epoch": 103.27602905569007, + "grad_norm": 9.236598685902209e-08, + "learning_rate": 0.13501607338436153, + "loss": 0.0, + "num_input_tokens_seen": 36456448, + "step": 21275 + }, + { + "epoch": 103.3002421307506, + "grad_norm": 7.723767225797928e-08, + "learning_rate": 0.13495746430925798, + "loss": 0.0, + "num_input_tokens_seen": 36464800, + "step": 21280 + }, + { + "epoch": 103.32445520581113, + "grad_norm": 1.6647472023123555e-07, + "learning_rate": 0.13489885755390238, + "loss": 0.0, + "num_input_tokens_seen": 36473056, + "step": 21285 + }, + { + "epoch": 103.34866828087166, + "grad_norm": 1.863536880364336e-07, + "learning_rate": 0.13484025312733275, + "loss": 0.0, + "num_input_tokens_seen": 36481728, + "step": 21290 + }, + { + "epoch": 103.37288135593221, + "grad_norm": 1.9577656473757088e-07, + "learning_rate": 0.13478165103858658, + "loss": 0.0, + "num_input_tokens_seen": 36490592, + "step": 21295 + }, + { + "epoch": 103.39709443099274, + "grad_norm": 7.50943769389778e-08, + "learning_rate": 0.13472305129670106, + "loss": 0.0, + "num_input_tokens_seen": 36499168, + "step": 21300 + }, + { + "epoch": 103.42130750605327, + "grad_norm": 5.835959271394131e-08, + "learning_rate": 0.13466445391071305, + "loss": 0.0, + "num_input_tokens_seen": 36507360, + "step": 21305 + }, + { + "epoch": 103.4455205811138, + "grad_norm": 9.907225262395514e-08, + "learning_rate": 0.13460585888965895, + "loss": 0.0, + "num_input_tokens_seen": 36516032, + "step": 21310 + }, + { + "epoch": 103.46973365617433, + "grad_norm": 2.8925612127750355e-07, + "learning_rate": 0.13454726624257482, + "loss": 0.0, + "num_input_tokens_seen": 36524576, + "step": 21315 + }, + { + "epoch": 103.49394673123487, + "grad_norm": 2.570241122157313e-07, + "learning_rate": 0.1344886759784965, + "loss": 0.0, + "num_input_tokens_seen": 36532960, + "step": 21320 + }, + { + "epoch": 103.5181598062954, + "grad_norm": 1.2762640722030483e-07, + "learning_rate": 0.13443008810645923, + "loss": 0.0, + "num_input_tokens_seen": 36541568, + "step": 21325 + }, + { + "epoch": 103.54237288135593, + "grad_norm": 2.0260060296095617e-07, + "learning_rate": 0.13437150263549807, + "loss": 0.0, + "num_input_tokens_seen": 36550240, + "step": 21330 + }, + { + "epoch": 103.56658595641646, + "grad_norm": 5.6719795082926794e-08, + "learning_rate": 0.13431291957464755, + "loss": 0.0, + "num_input_tokens_seen": 36558592, + "step": 21335 + }, + { + "epoch": 103.59079903147699, + "grad_norm": 2.500278810657619e-07, + "learning_rate": 0.13425433893294197, + "loss": 0.0, + "num_input_tokens_seen": 36566848, + "step": 21340 + }, + { + "epoch": 103.61501210653753, + "grad_norm": 1.8457413375472242e-07, + "learning_rate": 0.13419576071941525, + "loss": 0.0, + "num_input_tokens_seen": 36575648, + "step": 21345 + }, + { + "epoch": 103.63922518159806, + "grad_norm": 1.2488192169257673e-07, + "learning_rate": 0.1341371849431008, + "loss": 0.0, + "num_input_tokens_seen": 36584128, + "step": 21350 + }, + { + "epoch": 103.6634382566586, + "grad_norm": 1.5608429748681374e-07, + "learning_rate": 0.13407861161303178, + "loss": 0.0, + "num_input_tokens_seen": 36592864, + "step": 21355 + }, + { + "epoch": 103.68765133171912, + "grad_norm": 1.0375039494192606e-07, + "learning_rate": 0.13402004073824098, + "loss": 0.0, + "num_input_tokens_seen": 36601536, + "step": 21360 + }, + { + "epoch": 103.71186440677967, + "grad_norm": 1.5181477408532373e-07, + "learning_rate": 0.13396147232776062, + "loss": 0.0, + "num_input_tokens_seen": 36610176, + "step": 21365 + }, + { + "epoch": 103.7360774818402, + "grad_norm": 1.3590857861345285e-07, + "learning_rate": 0.13390290639062288, + "loss": 0.0, + "num_input_tokens_seen": 36619104, + "step": 21370 + }, + { + "epoch": 103.76029055690073, + "grad_norm": 1.3676887533620175e-07, + "learning_rate": 0.13384434293585917, + "loss": 0.0, + "num_input_tokens_seen": 36627552, + "step": 21375 + }, + { + "epoch": 103.78450363196126, + "grad_norm": 9.93629001300178e-08, + "learning_rate": 0.13378578197250088, + "loss": 0.0, + "num_input_tokens_seen": 36635872, + "step": 21380 + }, + { + "epoch": 103.80871670702179, + "grad_norm": 1.460183227663947e-07, + "learning_rate": 0.13372722350957872, + "loss": 0.0, + "num_input_tokens_seen": 36644352, + "step": 21385 + }, + { + "epoch": 103.83292978208233, + "grad_norm": 2.2809895483533182e-07, + "learning_rate": 0.13366866755612322, + "loss": 0.0, + "num_input_tokens_seen": 36652672, + "step": 21390 + }, + { + "epoch": 103.85714285714286, + "grad_norm": 2.0848850112997752e-07, + "learning_rate": 0.13361011412116436, + "loss": 0.0, + "num_input_tokens_seen": 36660896, + "step": 21395 + }, + { + "epoch": 103.88135593220339, + "grad_norm": 1.319401263799591e-07, + "learning_rate": 0.13355156321373196, + "loss": 0.0, + "num_input_tokens_seen": 36669664, + "step": 21400 + }, + { + "epoch": 103.88135593220339, + "eval_loss": 0.9755566716194153, + "eval_runtime": 4.6231, + "eval_samples_per_second": 79.384, + "eval_steps_per_second": 19.9, + "num_input_tokens_seen": 36669664, + "step": 21400 + }, + { + "epoch": 103.90556900726392, + "grad_norm": 2.558136600327998e-07, + "learning_rate": 0.13349301484285514, + "loss": 0.0, + "num_input_tokens_seen": 36678240, + "step": 21405 + }, + { + "epoch": 103.92978208232445, + "grad_norm": 1.9378606452846725e-07, + "learning_rate": 0.13343446901756295, + "loss": 0.0, + "num_input_tokens_seen": 36686944, + "step": 21410 + }, + { + "epoch": 103.953995157385, + "grad_norm": 1.841190595541775e-07, + "learning_rate": 0.13337592574688376, + "loss": 0.0, + "num_input_tokens_seen": 36695840, + "step": 21415 + }, + { + "epoch": 103.97820823244552, + "grad_norm": 2.5631382527535607e-07, + "learning_rate": 0.13331738503984572, + "loss": 0.0, + "num_input_tokens_seen": 36704480, + "step": 21420 + }, + { + "epoch": 104.00484261501211, + "grad_norm": 1.539330014566076e-07, + "learning_rate": 0.1332588469054766, + "loss": 0.0, + "num_input_tokens_seen": 36713664, + "step": 21425 + }, + { + "epoch": 104.02905569007264, + "grad_norm": 2.1228687785423972e-07, + "learning_rate": 0.1332003113528036, + "loss": 0.0, + "num_input_tokens_seen": 36722464, + "step": 21430 + }, + { + "epoch": 104.05326876513317, + "grad_norm": 1.7418467734842125e-07, + "learning_rate": 0.13314177839085373, + "loss": 0.0, + "num_input_tokens_seen": 36731392, + "step": 21435 + }, + { + "epoch": 104.0774818401937, + "grad_norm": 1.560547246981514e-07, + "learning_rate": 0.13308324802865354, + "loss": 0.0, + "num_input_tokens_seen": 36739904, + "step": 21440 + }, + { + "epoch": 104.10169491525424, + "grad_norm": 1.1241690600627408e-07, + "learning_rate": 0.13302472027522905, + "loss": 0.0, + "num_input_tokens_seen": 36748576, + "step": 21445 + }, + { + "epoch": 104.12590799031477, + "grad_norm": 3.086657045514585e-07, + "learning_rate": 0.13296619513960606, + "loss": 0.0, + "num_input_tokens_seen": 36757504, + "step": 21450 + }, + { + "epoch": 104.1501210653753, + "grad_norm": 3.257608227613673e-07, + "learning_rate": 0.1329076726308098, + "loss": 0.0, + "num_input_tokens_seen": 36766080, + "step": 21455 + }, + { + "epoch": 104.17433414043583, + "grad_norm": 1.2176752761661191e-07, + "learning_rate": 0.13284915275786519, + "loss": 0.0, + "num_input_tokens_seen": 36774624, + "step": 21460 + }, + { + "epoch": 104.19854721549636, + "grad_norm": 6.015428510863785e-08, + "learning_rate": 0.1327906355297968, + "loss": 0.0, + "num_input_tokens_seen": 36784096, + "step": 21465 + }, + { + "epoch": 104.2227602905569, + "grad_norm": 8.760730452195276e-08, + "learning_rate": 0.13273212095562867, + "loss": 0.0, + "num_input_tokens_seen": 36792800, + "step": 21470 + }, + { + "epoch": 104.24697336561744, + "grad_norm": 1.1800904786696265e-07, + "learning_rate": 0.13267360904438444, + "loss": 0.0, + "num_input_tokens_seen": 36801344, + "step": 21475 + }, + { + "epoch": 104.27118644067797, + "grad_norm": 9.326696925882061e-08, + "learning_rate": 0.1326150998050875, + "loss": 0.0, + "num_input_tokens_seen": 36809440, + "step": 21480 + }, + { + "epoch": 104.2953995157385, + "grad_norm": 2.3166970208876592e-07, + "learning_rate": 0.1325565932467606, + "loss": 0.0, + "num_input_tokens_seen": 36817920, + "step": 21485 + }, + { + "epoch": 104.31961259079903, + "grad_norm": 1.1391996679321892e-07, + "learning_rate": 0.13249808937842628, + "loss": 0.0, + "num_input_tokens_seen": 36826016, + "step": 21490 + }, + { + "epoch": 104.34382566585957, + "grad_norm": 9.529551192599683e-08, + "learning_rate": 0.1324395882091065, + "loss": 0.0, + "num_input_tokens_seen": 36834368, + "step": 21495 + }, + { + "epoch": 104.3680387409201, + "grad_norm": 1.8266212009621086e-07, + "learning_rate": 0.13238108974782284, + "loss": 0.0, + "num_input_tokens_seen": 36843200, + "step": 21500 + }, + { + "epoch": 104.39225181598063, + "grad_norm": 2.241900602939495e-07, + "learning_rate": 0.13232259400359664, + "loss": 0.0, + "num_input_tokens_seen": 36851648, + "step": 21505 + }, + { + "epoch": 104.41646489104116, + "grad_norm": 1.533405367126761e-07, + "learning_rate": 0.13226410098544852, + "loss": 0.0, + "num_input_tokens_seen": 36860032, + "step": 21510 + }, + { + "epoch": 104.44067796610169, + "grad_norm": 1.1572510771884481e-07, + "learning_rate": 0.13220561070239892, + "loss": 0.0, + "num_input_tokens_seen": 36868096, + "step": 21515 + }, + { + "epoch": 104.46489104116223, + "grad_norm": 5.925928903138811e-08, + "learning_rate": 0.13214712316346783, + "loss": 0.0, + "num_input_tokens_seen": 36876768, + "step": 21520 + }, + { + "epoch": 104.48910411622276, + "grad_norm": 6.101812033421083e-08, + "learning_rate": 0.13208863837767465, + "loss": 0.0, + "num_input_tokens_seen": 36885152, + "step": 21525 + }, + { + "epoch": 104.51331719128329, + "grad_norm": 9.671010303691219e-08, + "learning_rate": 0.13203015635403856, + "loss": 0.0, + "num_input_tokens_seen": 36893536, + "step": 21530 + }, + { + "epoch": 104.53753026634382, + "grad_norm": 1.7028611409841687e-07, + "learning_rate": 0.13197167710157817, + "loss": 0.0, + "num_input_tokens_seen": 36901728, + "step": 21535 + }, + { + "epoch": 104.56174334140435, + "grad_norm": 1.370208622120117e-07, + "learning_rate": 0.13191320062931167, + "loss": 0.0, + "num_input_tokens_seen": 36909600, + "step": 21540 + }, + { + "epoch": 104.5859564164649, + "grad_norm": 1.0773301539757085e-07, + "learning_rate": 0.13185472694625702, + "loss": 0.0, + "num_input_tokens_seen": 36918144, + "step": 21545 + }, + { + "epoch": 104.61016949152543, + "grad_norm": 1.0056353971776844e-07, + "learning_rate": 0.13179625606143142, + "loss": 0.0, + "num_input_tokens_seen": 36926816, + "step": 21550 + }, + { + "epoch": 104.63438256658596, + "grad_norm": 2.442687616621697e-07, + "learning_rate": 0.13173778798385188, + "loss": 0.0, + "num_input_tokens_seen": 36935424, + "step": 21555 + }, + { + "epoch": 104.65859564164649, + "grad_norm": 2.2345470540585666e-07, + "learning_rate": 0.13167932272253505, + "loss": 0.0, + "num_input_tokens_seen": 36944064, + "step": 21560 + }, + { + "epoch": 104.68280871670702, + "grad_norm": 1.411476233670328e-07, + "learning_rate": 0.1316208602864968, + "loss": 0.0, + "num_input_tokens_seen": 36952704, + "step": 21565 + }, + { + "epoch": 104.70702179176756, + "grad_norm": 7.044186389748575e-08, + "learning_rate": 0.13156240068475292, + "loss": 0.0, + "num_input_tokens_seen": 36961504, + "step": 21570 + }, + { + "epoch": 104.73123486682809, + "grad_norm": 1.5081887738688238e-07, + "learning_rate": 0.1315039439263185, + "loss": 0.0, + "num_input_tokens_seen": 36969696, + "step": 21575 + }, + { + "epoch": 104.75544794188862, + "grad_norm": 1.3743924398568197e-07, + "learning_rate": 0.13144549002020833, + "loss": 0.0, + "num_input_tokens_seen": 36978112, + "step": 21580 + }, + { + "epoch": 104.77966101694915, + "grad_norm": 1.4147475724257674e-07, + "learning_rate": 0.13138703897543688, + "loss": 0.0, + "num_input_tokens_seen": 36986912, + "step": 21585 + }, + { + "epoch": 104.80387409200968, + "grad_norm": 1.369969169218166e-07, + "learning_rate": 0.1313285908010178, + "loss": 0.0, + "num_input_tokens_seen": 36995872, + "step": 21590 + }, + { + "epoch": 104.82808716707022, + "grad_norm": 6.25391081143789e-08, + "learning_rate": 0.13127014550596475, + "loss": 0.0, + "num_input_tokens_seen": 37004224, + "step": 21595 + }, + { + "epoch": 104.85230024213075, + "grad_norm": 1.7467030488660384e-07, + "learning_rate": 0.1312117030992906, + "loss": 0.0, + "num_input_tokens_seen": 37012960, + "step": 21600 + }, + { + "epoch": 104.85230024213075, + "eval_loss": 0.9847287535667419, + "eval_runtime": 4.6288, + "eval_samples_per_second": 79.287, + "eval_steps_per_second": 19.876, + "num_input_tokens_seen": 37012960, + "step": 21600 + }, + { + "epoch": 104.87651331719128, + "grad_norm": 3.103157268924406e-07, + "learning_rate": 0.13115326359000795, + "loss": 0.0, + "num_input_tokens_seen": 37021408, + "step": 21605 + }, + { + "epoch": 104.90072639225181, + "grad_norm": 1.3704035950468096e-07, + "learning_rate": 0.13109482698712896, + "loss": 0.0, + "num_input_tokens_seen": 37029920, + "step": 21610 + }, + { + "epoch": 104.92493946731234, + "grad_norm": 1.9063342904246383e-07, + "learning_rate": 0.1310363932996651, + "loss": 0.0, + "num_input_tokens_seen": 37038656, + "step": 21615 + }, + { + "epoch": 104.94915254237289, + "grad_norm": 1.7816880415466585e-07, + "learning_rate": 0.13097796253662775, + "loss": 0.0, + "num_input_tokens_seen": 37047392, + "step": 21620 + }, + { + "epoch": 104.97336561743342, + "grad_norm": 9.383976617982626e-08, + "learning_rate": 0.1309195347070277, + "loss": 0.0, + "num_input_tokens_seen": 37055264, + "step": 21625 + }, + { + "epoch": 104.99757869249395, + "grad_norm": 1.0884547663181365e-07, + "learning_rate": 0.13086110981987506, + "loss": 0.0, + "num_input_tokens_seen": 37063872, + "step": 21630 + }, + { + "epoch": 105.02421307506053, + "grad_norm": 1.271613285780404e-07, + "learning_rate": 0.13080268788417987, + "loss": 0.0, + "num_input_tokens_seen": 37072640, + "step": 21635 + }, + { + "epoch": 105.04842615012106, + "grad_norm": 2.1566275165696425e-07, + "learning_rate": 0.1307442689089515, + "loss": 0.0, + "num_input_tokens_seen": 37081024, + "step": 21640 + }, + { + "epoch": 105.0726392251816, + "grad_norm": 1.3531735021388158e-07, + "learning_rate": 0.13068585290319873, + "loss": 0.0, + "num_input_tokens_seen": 37089984, + "step": 21645 + }, + { + "epoch": 105.09685230024213, + "grad_norm": 1.037258883229697e-07, + "learning_rate": 0.13062743987593026, + "loss": 0.0, + "num_input_tokens_seen": 37098688, + "step": 21650 + }, + { + "epoch": 105.12106537530266, + "grad_norm": 1.3173189472581726e-07, + "learning_rate": 0.13056902983615395, + "loss": 0.0, + "num_input_tokens_seen": 37107424, + "step": 21655 + }, + { + "epoch": 105.1452784503632, + "grad_norm": 1.7364305904266075e-07, + "learning_rate": 0.13051062279287742, + "loss": 0.0, + "num_input_tokens_seen": 37116064, + "step": 21660 + }, + { + "epoch": 105.16949152542372, + "grad_norm": 2.621431747229508e-07, + "learning_rate": 0.13045221875510782, + "loss": 0.0, + "num_input_tokens_seen": 37124608, + "step": 21665 + }, + { + "epoch": 105.19370460048427, + "grad_norm": 1.256265704796533e-07, + "learning_rate": 0.13039381773185174, + "loss": 0.0, + "num_input_tokens_seen": 37133344, + "step": 21670 + }, + { + "epoch": 105.2179176755448, + "grad_norm": 8.137397600194163e-08, + "learning_rate": 0.1303354197321153, + "loss": 0.0, + "num_input_tokens_seen": 37141824, + "step": 21675 + }, + { + "epoch": 105.24213075060533, + "grad_norm": 6.88188137587531e-08, + "learning_rate": 0.13027702476490433, + "loss": 0.0, + "num_input_tokens_seen": 37150368, + "step": 21680 + }, + { + "epoch": 105.26634382566586, + "grad_norm": 7.574762861395357e-08, + "learning_rate": 0.1302186328392239, + "loss": 0.0, + "num_input_tokens_seen": 37158912, + "step": 21685 + }, + { + "epoch": 105.29055690072639, + "grad_norm": 8.433859477463557e-08, + "learning_rate": 0.130160243964079, + "loss": 0.0, + "num_input_tokens_seen": 37167648, + "step": 21690 + }, + { + "epoch": 105.31476997578693, + "grad_norm": 1.1783068742943215e-07, + "learning_rate": 0.13010185814847372, + "loss": 0.0, + "num_input_tokens_seen": 37175968, + "step": 21695 + }, + { + "epoch": 105.33898305084746, + "grad_norm": 2.0612168327716063e-07, + "learning_rate": 0.13004347540141192, + "loss": 0.0, + "num_input_tokens_seen": 37184352, + "step": 21700 + }, + { + "epoch": 105.36319612590799, + "grad_norm": 1.4295078187842591e-07, + "learning_rate": 0.12998509573189712, + "loss": 0.0, + "num_input_tokens_seen": 37192704, + "step": 21705 + }, + { + "epoch": 105.38740920096852, + "grad_norm": 5.4553719763816844e-08, + "learning_rate": 0.12992671914893203, + "loss": 0.0, + "num_input_tokens_seen": 37201344, + "step": 21710 + }, + { + "epoch": 105.41162227602905, + "grad_norm": 9.955547142226351e-08, + "learning_rate": 0.12986834566151909, + "loss": 0.0, + "num_input_tokens_seen": 37209792, + "step": 21715 + }, + { + "epoch": 105.4358353510896, + "grad_norm": 1.1275240296981792e-07, + "learning_rate": 0.12980997527866028, + "loss": 0.0, + "num_input_tokens_seen": 37218720, + "step": 21720 + }, + { + "epoch": 105.46004842615012, + "grad_norm": 1.1523654563916352e-07, + "learning_rate": 0.12975160800935692, + "loss": 0.0, + "num_input_tokens_seen": 37226720, + "step": 21725 + }, + { + "epoch": 105.48426150121065, + "grad_norm": 3.6792087598769285e-08, + "learning_rate": 0.12969324386261016, + "loss": 0.0, + "num_input_tokens_seen": 37235200, + "step": 21730 + }, + { + "epoch": 105.50847457627118, + "grad_norm": 7.459755124727963e-08, + "learning_rate": 0.12963488284742034, + "loss": 0.0, + "num_input_tokens_seen": 37243648, + "step": 21735 + }, + { + "epoch": 105.53268765133171, + "grad_norm": 6.648660644259508e-08, + "learning_rate": 0.12957652497278752, + "loss": 0.0, + "num_input_tokens_seen": 37252160, + "step": 21740 + }, + { + "epoch": 105.55690072639226, + "grad_norm": 1.7393249152064527e-07, + "learning_rate": 0.12951817024771117, + "loss": 0.0, + "num_input_tokens_seen": 37260864, + "step": 21745 + }, + { + "epoch": 105.58111380145279, + "grad_norm": 9.779615339766679e-08, + "learning_rate": 0.12945981868119041, + "loss": 0.0, + "num_input_tokens_seen": 37269408, + "step": 21750 + }, + { + "epoch": 105.60532687651332, + "grad_norm": 1.719914877185147e-07, + "learning_rate": 0.12940147028222376, + "loss": 0.0, + "num_input_tokens_seen": 37278304, + "step": 21755 + }, + { + "epoch": 105.62953995157385, + "grad_norm": 6.833345622681009e-08, + "learning_rate": 0.12934312505980916, + "loss": 0.0, + "num_input_tokens_seen": 37286752, + "step": 21760 + }, + { + "epoch": 105.65375302663438, + "grad_norm": 1.452975766369491e-07, + "learning_rate": 0.1292847830229443, + "loss": 0.0, + "num_input_tokens_seen": 37294976, + "step": 21765 + }, + { + "epoch": 105.67796610169492, + "grad_norm": 9.783838095245301e-08, + "learning_rate": 0.12922644418062626, + "loss": 0.0, + "num_input_tokens_seen": 37303744, + "step": 21770 + }, + { + "epoch": 105.70217917675545, + "grad_norm": 7.501853360736277e-08, + "learning_rate": 0.1291681085418515, + "loss": 0.0, + "num_input_tokens_seen": 37312416, + "step": 21775 + }, + { + "epoch": 105.72639225181598, + "grad_norm": 1.5127547214888182e-07, + "learning_rate": 0.12910977611561628, + "loss": 0.0, + "num_input_tokens_seen": 37321152, + "step": 21780 + }, + { + "epoch": 105.75060532687651, + "grad_norm": 1.3724387315505737e-07, + "learning_rate": 0.1290514469109161, + "loss": 0.0, + "num_input_tokens_seen": 37329952, + "step": 21785 + }, + { + "epoch": 105.77481840193704, + "grad_norm": 1.553342627858001e-07, + "learning_rate": 0.128993120936746, + "loss": 0.0, + "num_input_tokens_seen": 37338368, + "step": 21790 + }, + { + "epoch": 105.79903147699758, + "grad_norm": 1.4026132078015507e-07, + "learning_rate": 0.12893479820210071, + "loss": 0.0, + "num_input_tokens_seen": 37346944, + "step": 21795 + }, + { + "epoch": 105.82324455205811, + "grad_norm": 9.406421241919816e-08, + "learning_rate": 0.1288764787159742, + "loss": 0.0, + "num_input_tokens_seen": 37355968, + "step": 21800 + }, + { + "epoch": 105.82324455205811, + "eval_loss": 0.9930744171142578, + "eval_runtime": 4.6353, + "eval_samples_per_second": 79.176, + "eval_steps_per_second": 19.848, + "num_input_tokens_seen": 37355968, + "step": 21800 + }, + { + "epoch": 105.84745762711864, + "grad_norm": 8.626925307453348e-08, + "learning_rate": 0.1288181624873601, + "loss": 0.0, + "num_input_tokens_seen": 37364288, + "step": 21805 + }, + { + "epoch": 105.87167070217917, + "grad_norm": 1.460716845258503e-07, + "learning_rate": 0.12875984952525163, + "loss": 0.0, + "num_input_tokens_seen": 37372928, + "step": 21810 + }, + { + "epoch": 105.8958837772397, + "grad_norm": 5.53394983171529e-08, + "learning_rate": 0.12870153983864122, + "loss": 0.0, + "num_input_tokens_seen": 37381472, + "step": 21815 + }, + { + "epoch": 105.92009685230025, + "grad_norm": 1.5139434594857448e-07, + "learning_rate": 0.12864323343652104, + "loss": 0.0, + "num_input_tokens_seen": 37390080, + "step": 21820 + }, + { + "epoch": 105.94430992736078, + "grad_norm": 1.5184679114099708e-07, + "learning_rate": 0.12858493032788268, + "loss": 0.0, + "num_input_tokens_seen": 37398464, + "step": 21825 + }, + { + "epoch": 105.9685230024213, + "grad_norm": 1.1122152443476807e-07, + "learning_rate": 0.12852663052171714, + "loss": 0.0, + "num_input_tokens_seen": 37406912, + "step": 21830 + }, + { + "epoch": 105.99273607748184, + "grad_norm": 1.4425438621401554e-07, + "learning_rate": 0.12846833402701507, + "loss": 0.0, + "num_input_tokens_seen": 37415424, + "step": 21835 + }, + { + "epoch": 106.01937046004842, + "grad_norm": 7.263550827474319e-08, + "learning_rate": 0.12841004085276642, + "loss": 0.0, + "num_input_tokens_seen": 37424736, + "step": 21840 + }, + { + "epoch": 106.04358353510897, + "grad_norm": 1.244475811290613e-07, + "learning_rate": 0.12835175100796076, + "loss": 0.0, + "num_input_tokens_seen": 37433536, + "step": 21845 + }, + { + "epoch": 106.0677966101695, + "grad_norm": 1.7127445062214974e-07, + "learning_rate": 0.12829346450158724, + "loss": 0.0, + "num_input_tokens_seen": 37442272, + "step": 21850 + }, + { + "epoch": 106.09200968523002, + "grad_norm": 9.828126934507964e-08, + "learning_rate": 0.12823518134263423, + "loss": 0.0, + "num_input_tokens_seen": 37450816, + "step": 21855 + }, + { + "epoch": 106.11622276029055, + "grad_norm": 8.848365951052983e-08, + "learning_rate": 0.12817690154008973, + "loss": 0.0, + "num_input_tokens_seen": 37459200, + "step": 21860 + }, + { + "epoch": 106.14043583535108, + "grad_norm": 7.238035237833174e-08, + "learning_rate": 0.12811862510294134, + "loss": 0.0, + "num_input_tokens_seen": 37468224, + "step": 21865 + }, + { + "epoch": 106.16464891041163, + "grad_norm": 2.1175793563088519e-07, + "learning_rate": 0.12806035204017585, + "loss": 0.0, + "num_input_tokens_seen": 37476928, + "step": 21870 + }, + { + "epoch": 106.18886198547216, + "grad_norm": 1.752933940224466e-07, + "learning_rate": 0.12800208236077987, + "loss": 0.0, + "num_input_tokens_seen": 37485408, + "step": 21875 + }, + { + "epoch": 106.21307506053269, + "grad_norm": 1.0922833837412327e-07, + "learning_rate": 0.12794381607373917, + "loss": 0.0, + "num_input_tokens_seen": 37493760, + "step": 21880 + }, + { + "epoch": 106.23728813559322, + "grad_norm": 1.156976097149709e-07, + "learning_rate": 0.12788555318803924, + "loss": 0.0, + "num_input_tokens_seen": 37502240, + "step": 21885 + }, + { + "epoch": 106.26150121065375, + "grad_norm": 1.48010187217551e-07, + "learning_rate": 0.1278272937126649, + "loss": 0.0, + "num_input_tokens_seen": 37510368, + "step": 21890 + }, + { + "epoch": 106.28571428571429, + "grad_norm": 1.857085010215087e-07, + "learning_rate": 0.1277690376566005, + "loss": 0.0, + "num_input_tokens_seen": 37518720, + "step": 21895 + }, + { + "epoch": 106.30992736077482, + "grad_norm": 1.840385550622159e-07, + "learning_rate": 0.12771078502882985, + "loss": 0.0, + "num_input_tokens_seen": 37526912, + "step": 21900 + }, + { + "epoch": 106.33414043583535, + "grad_norm": 6.656415507677593e-08, + "learning_rate": 0.12765253583833633, + "loss": 0.0, + "num_input_tokens_seen": 37535520, + "step": 21905 + }, + { + "epoch": 106.35835351089588, + "grad_norm": 9.16290474606285e-08, + "learning_rate": 0.12759429009410256, + "loss": 0.0, + "num_input_tokens_seen": 37544224, + "step": 21910 + }, + { + "epoch": 106.38256658595641, + "grad_norm": 4.9946912383802555e-08, + "learning_rate": 0.12753604780511085, + "loss": 0.0, + "num_input_tokens_seen": 37552704, + "step": 21915 + }, + { + "epoch": 106.40677966101696, + "grad_norm": 6.266196805881918e-08, + "learning_rate": 0.12747780898034283, + "loss": 0.0, + "num_input_tokens_seen": 37561344, + "step": 21920 + }, + { + "epoch": 106.43099273607749, + "grad_norm": 5.47760450331225e-08, + "learning_rate": 0.12741957362877973, + "loss": 0.0, + "num_input_tokens_seen": 37569824, + "step": 21925 + }, + { + "epoch": 106.45520581113801, + "grad_norm": 4.306415846144773e-08, + "learning_rate": 0.12736134175940214, + "loss": 0.0, + "num_input_tokens_seen": 37578528, + "step": 21930 + }, + { + "epoch": 106.47941888619854, + "grad_norm": 1.3463838399729866e-07, + "learning_rate": 0.12730311338119016, + "loss": 0.0, + "num_input_tokens_seen": 37587040, + "step": 21935 + }, + { + "epoch": 106.50363196125907, + "grad_norm": 8.448052568610365e-08, + "learning_rate": 0.12724488850312327, + "loss": 0.0, + "num_input_tokens_seen": 37595776, + "step": 21940 + }, + { + "epoch": 106.52784503631962, + "grad_norm": 5.388463009126099e-08, + "learning_rate": 0.1271866671341806, + "loss": 0.0, + "num_input_tokens_seen": 37604288, + "step": 21945 + }, + { + "epoch": 106.55205811138015, + "grad_norm": 1.246170597823948e-07, + "learning_rate": 0.12712844928334047, + "loss": 0.0, + "num_input_tokens_seen": 37612704, + "step": 21950 + }, + { + "epoch": 106.57627118644068, + "grad_norm": 1.3532314824260538e-07, + "learning_rate": 0.12707023495958095, + "loss": 0.0, + "num_input_tokens_seen": 37621408, + "step": 21955 + }, + { + "epoch": 106.60048426150121, + "grad_norm": 9.490481289731179e-08, + "learning_rate": 0.12701202417187932, + "loss": 0.0, + "num_input_tokens_seen": 37629792, + "step": 21960 + }, + { + "epoch": 106.62469733656174, + "grad_norm": 6.83393537315169e-08, + "learning_rate": 0.12695381692921243, + "loss": 0.0, + "num_input_tokens_seen": 37638208, + "step": 21965 + }, + { + "epoch": 106.64891041162228, + "grad_norm": 1.8292705306066637e-07, + "learning_rate": 0.12689561324055665, + "loss": 0.0, + "num_input_tokens_seen": 37646624, + "step": 21970 + }, + { + "epoch": 106.67312348668281, + "grad_norm": 2.1837126951140817e-07, + "learning_rate": 0.12683741311488758, + "loss": 0.0, + "num_input_tokens_seen": 37655136, + "step": 21975 + }, + { + "epoch": 106.69733656174334, + "grad_norm": 2.136782057959863e-07, + "learning_rate": 0.1267792165611805, + "loss": 0.0, + "num_input_tokens_seen": 37663648, + "step": 21980 + }, + { + "epoch": 106.72154963680387, + "grad_norm": 7.694703185734397e-08, + "learning_rate": 0.1267210235884101, + "loss": 0.0, + "num_input_tokens_seen": 37672128, + "step": 21985 + }, + { + "epoch": 106.7457627118644, + "grad_norm": 1.4152337257655745e-07, + "learning_rate": 0.12666283420555033, + "loss": 0.0, + "num_input_tokens_seen": 37680992, + "step": 21990 + }, + { + "epoch": 106.76997578692495, + "grad_norm": 1.763084043204799e-07, + "learning_rate": 0.12660464842157487, + "loss": 0.0, + "num_input_tokens_seen": 37689600, + "step": 21995 + }, + { + "epoch": 106.79418886198548, + "grad_norm": 1.6910682631987584e-07, + "learning_rate": 0.1265464662454566, + "loss": 0.0, + "num_input_tokens_seen": 37698112, + "step": 22000 + }, + { + "epoch": 106.79418886198548, + "eval_loss": 1.0024558305740356, + "eval_runtime": 4.6157, + "eval_samples_per_second": 79.511, + "eval_steps_per_second": 19.932, + "num_input_tokens_seen": 37698112, + "step": 22000 + }, + { + "epoch": 106.818401937046, + "grad_norm": 8.972043730182122e-08, + "learning_rate": 0.12648828768616793, + "loss": 0.0, + "num_input_tokens_seen": 37706784, + "step": 22005 + }, + { + "epoch": 106.84261501210653, + "grad_norm": 1.0750986234597804e-07, + "learning_rate": 0.12643011275268085, + "loss": 0.0, + "num_input_tokens_seen": 37715200, + "step": 22010 + }, + { + "epoch": 106.86682808716706, + "grad_norm": 1.0484340862149111e-07, + "learning_rate": 0.1263719414539665, + "loss": 0.0, + "num_input_tokens_seen": 37723776, + "step": 22015 + }, + { + "epoch": 106.89104116222761, + "grad_norm": 8.192093758907504e-08, + "learning_rate": 0.1263137737989957, + "loss": 0.0, + "num_input_tokens_seen": 37732352, + "step": 22020 + }, + { + "epoch": 106.91525423728814, + "grad_norm": 8.104931481511812e-08, + "learning_rate": 0.1262556097967387, + "loss": 0.0, + "num_input_tokens_seen": 37740832, + "step": 22025 + }, + { + "epoch": 106.93946731234867, + "grad_norm": 1.1145117184696574e-07, + "learning_rate": 0.126197449456165, + "loss": 0.0, + "num_input_tokens_seen": 37749728, + "step": 22030 + }, + { + "epoch": 106.9636803874092, + "grad_norm": 9.111705168152184e-08, + "learning_rate": 0.12613929278624378, + "loss": 0.0, + "num_input_tokens_seen": 37758240, + "step": 22035 + }, + { + "epoch": 106.98789346246973, + "grad_norm": 6.575640298933649e-08, + "learning_rate": 0.12608113979594343, + "loss": 0.0, + "num_input_tokens_seen": 37766688, + "step": 22040 + }, + { + "epoch": 107.01452784503633, + "grad_norm": 9.443390069918678e-08, + "learning_rate": 0.1260229904942319, + "loss": 0.0, + "num_input_tokens_seen": 37775424, + "step": 22045 + }, + { + "epoch": 107.03874092009686, + "grad_norm": 1.3188213188186637e-07, + "learning_rate": 0.12596484489007662, + "loss": 0.0, + "num_input_tokens_seen": 37784064, + "step": 22050 + }, + { + "epoch": 107.06295399515739, + "grad_norm": 6.841413124902829e-08, + "learning_rate": 0.1259067029924442, + "loss": 0.0, + "num_input_tokens_seen": 37792448, + "step": 22055 + }, + { + "epoch": 107.08716707021792, + "grad_norm": 1.454921232380002e-07, + "learning_rate": 0.12584856481030096, + "loss": 0.0, + "num_input_tokens_seen": 37801312, + "step": 22060 + }, + { + "epoch": 107.11138014527845, + "grad_norm": 9.039226966933711e-08, + "learning_rate": 0.12579043035261261, + "loss": 0.0, + "num_input_tokens_seen": 37809888, + "step": 22065 + }, + { + "epoch": 107.13559322033899, + "grad_norm": 7.461380846507382e-08, + "learning_rate": 0.1257322996283441, + "loss": 0.0, + "num_input_tokens_seen": 37818464, + "step": 22070 + }, + { + "epoch": 107.15980629539952, + "grad_norm": 1.236267337390018e-07, + "learning_rate": 0.12567417264645994, + "loss": 0.0, + "num_input_tokens_seen": 37827104, + "step": 22075 + }, + { + "epoch": 107.18401937046005, + "grad_norm": 1.2965416829047172e-07, + "learning_rate": 0.12561604941592408, + "loss": 0.0, + "num_input_tokens_seen": 37835840, + "step": 22080 + }, + { + "epoch": 107.20823244552058, + "grad_norm": 7.353796149800473e-08, + "learning_rate": 0.12555792994569978, + "loss": 0.0, + "num_input_tokens_seen": 37844416, + "step": 22085 + }, + { + "epoch": 107.23244552058111, + "grad_norm": 1.0997782595723038e-07, + "learning_rate": 0.1254998142447499, + "loss": 0.0, + "num_input_tokens_seen": 37853024, + "step": 22090 + }, + { + "epoch": 107.25665859564165, + "grad_norm": 1.5987308188414318e-07, + "learning_rate": 0.1254417023220365, + "loss": 0.0, + "num_input_tokens_seen": 37861600, + "step": 22095 + }, + { + "epoch": 107.28087167070218, + "grad_norm": 1.3130309639564075e-07, + "learning_rate": 0.12538359418652126, + "loss": 0.0, + "num_input_tokens_seen": 37870432, + "step": 22100 + }, + { + "epoch": 107.30508474576271, + "grad_norm": 1.0870182620692503e-07, + "learning_rate": 0.12532548984716513, + "loss": 0.0, + "num_input_tokens_seen": 37878752, + "step": 22105 + }, + { + "epoch": 107.32929782082324, + "grad_norm": 9.189436411816132e-08, + "learning_rate": 0.12526738931292855, + "loss": 0.0, + "num_input_tokens_seen": 37887744, + "step": 22110 + }, + { + "epoch": 107.35351089588377, + "grad_norm": 8.257446637571775e-08, + "learning_rate": 0.1252092925927714, + "loss": 0.0, + "num_input_tokens_seen": 37895904, + "step": 22115 + }, + { + "epoch": 107.37772397094432, + "grad_norm": 1.188576206345715e-07, + "learning_rate": 0.12515119969565278, + "loss": 0.0, + "num_input_tokens_seen": 37904256, + "step": 22120 + }, + { + "epoch": 107.40193704600485, + "grad_norm": 1.1588333848067123e-07, + "learning_rate": 0.12509311063053144, + "loss": 0.0, + "num_input_tokens_seen": 37912736, + "step": 22125 + }, + { + "epoch": 107.42615012106538, + "grad_norm": 1.249738943442935e-07, + "learning_rate": 0.1250350254063655, + "loss": 0.0, + "num_input_tokens_seen": 37920992, + "step": 22130 + }, + { + "epoch": 107.4503631961259, + "grad_norm": 8.790841832251317e-08, + "learning_rate": 0.1249769440321123, + "loss": 0.0, + "num_input_tokens_seen": 37929376, + "step": 22135 + }, + { + "epoch": 107.47457627118644, + "grad_norm": 5.152976711997326e-08, + "learning_rate": 0.12491886651672884, + "loss": 0.0, + "num_input_tokens_seen": 37938400, + "step": 22140 + }, + { + "epoch": 107.49878934624698, + "grad_norm": 1.1137334610111793e-07, + "learning_rate": 0.12486079286917139, + "loss": 0.0, + "num_input_tokens_seen": 37946816, + "step": 22145 + }, + { + "epoch": 107.52300242130751, + "grad_norm": 1.1839159697046853e-07, + "learning_rate": 0.12480272309839553, + "loss": 0.0, + "num_input_tokens_seen": 37955104, + "step": 22150 + }, + { + "epoch": 107.54721549636804, + "grad_norm": 6.853902334569284e-08, + "learning_rate": 0.12474465721335648, + "loss": 0.0, + "num_input_tokens_seen": 37963584, + "step": 22155 + }, + { + "epoch": 107.57142857142857, + "grad_norm": 2.0762310271038587e-07, + "learning_rate": 0.12468659522300861, + "loss": 0.0, + "num_input_tokens_seen": 37972384, + "step": 22160 + }, + { + "epoch": 107.5956416464891, + "grad_norm": 6.396179230705457e-08, + "learning_rate": 0.12462853713630584, + "loss": 0.0, + "num_input_tokens_seen": 37981088, + "step": 22165 + }, + { + "epoch": 107.61985472154964, + "grad_norm": 7.487555109264576e-08, + "learning_rate": 0.12457048296220156, + "loss": 0.0, + "num_input_tokens_seen": 37989600, + "step": 22170 + }, + { + "epoch": 107.64406779661017, + "grad_norm": 1.1777624564501821e-07, + "learning_rate": 0.12451243270964832, + "loss": 0.0, + "num_input_tokens_seen": 37997984, + "step": 22175 + }, + { + "epoch": 107.6682808716707, + "grad_norm": 2.121497288953833e-07, + "learning_rate": 0.12445438638759827, + "loss": 0.0, + "num_input_tokens_seen": 38006464, + "step": 22180 + }, + { + "epoch": 107.69249394673123, + "grad_norm": 1.3622448591377179e-07, + "learning_rate": 0.1243963440050029, + "loss": 0.0, + "num_input_tokens_seen": 38014912, + "step": 22185 + }, + { + "epoch": 107.71670702179176, + "grad_norm": 4.645377060796818e-08, + "learning_rate": 0.12433830557081298, + "loss": 0.0, + "num_input_tokens_seen": 38023296, + "step": 22190 + }, + { + "epoch": 107.7409200968523, + "grad_norm": 9.791318689167383e-08, + "learning_rate": 0.12428027109397889, + "loss": 0.0, + "num_input_tokens_seen": 38032000, + "step": 22195 + }, + { + "epoch": 107.76513317191284, + "grad_norm": 1.2936814641761885e-07, + "learning_rate": 0.12422224058345015, + "loss": 0.0, + "num_input_tokens_seen": 38040768, + "step": 22200 + }, + { + "epoch": 107.76513317191284, + "eval_loss": 1.0076138973236084, + "eval_runtime": 4.63, + "eval_samples_per_second": 79.266, + "eval_steps_per_second": 19.871, + "num_input_tokens_seen": 38040768, + "step": 22200 + }, + { + "epoch": 107.78934624697337, + "grad_norm": 9.988678328909373e-08, + "learning_rate": 0.12416421404817583, + "loss": 0.0, + "num_input_tokens_seen": 38049216, + "step": 22205 + }, + { + "epoch": 107.8135593220339, + "grad_norm": 1.3507698781722866e-07, + "learning_rate": 0.12410619149710447, + "loss": 0.0, + "num_input_tokens_seen": 38057824, + "step": 22210 + }, + { + "epoch": 107.83777239709443, + "grad_norm": 2.2939845223390876e-07, + "learning_rate": 0.12404817293918374, + "loss": 0.0, + "num_input_tokens_seen": 38066624, + "step": 22215 + }, + { + "epoch": 107.86198547215497, + "grad_norm": 1.1458097759486918e-07, + "learning_rate": 0.12399015838336086, + "loss": 0.0, + "num_input_tokens_seen": 38075104, + "step": 22220 + }, + { + "epoch": 107.8861985472155, + "grad_norm": 1.6779394229615718e-07, + "learning_rate": 0.12393214783858246, + "loss": 0.0, + "num_input_tokens_seen": 38083904, + "step": 22225 + }, + { + "epoch": 107.91041162227603, + "grad_norm": 5.800830038538152e-08, + "learning_rate": 0.1238741413137944, + "loss": 0.0, + "num_input_tokens_seen": 38092384, + "step": 22230 + }, + { + "epoch": 107.93462469733656, + "grad_norm": 1.2592721532200812e-07, + "learning_rate": 0.12381613881794212, + "loss": 0.0, + "num_input_tokens_seen": 38101184, + "step": 22235 + }, + { + "epoch": 107.95883777239709, + "grad_norm": 1.3460282843880123e-07, + "learning_rate": 0.12375814035997022, + "loss": 0.0, + "num_input_tokens_seen": 38109696, + "step": 22240 + }, + { + "epoch": 107.98305084745763, + "grad_norm": 8.303587861746564e-08, + "learning_rate": 0.12370014594882285, + "loss": 0.0, + "num_input_tokens_seen": 38118240, + "step": 22245 + }, + { + "epoch": 108.00968523002422, + "grad_norm": 1.32330583824114e-07, + "learning_rate": 0.12364215559344356, + "loss": 0.0, + "num_input_tokens_seen": 38127168, + "step": 22250 + }, + { + "epoch": 108.03389830508475, + "grad_norm": 6.125946327983911e-08, + "learning_rate": 0.12358416930277506, + "loss": 0.0, + "num_input_tokens_seen": 38135680, + "step": 22255 + }, + { + "epoch": 108.05811138014528, + "grad_norm": 1.66317818184325e-07, + "learning_rate": 0.1235261870857596, + "loss": 0.0, + "num_input_tokens_seen": 38144768, + "step": 22260 + }, + { + "epoch": 108.08232445520581, + "grad_norm": 3.568901973949323e-08, + "learning_rate": 0.12346820895133884, + "loss": 0.0, + "num_input_tokens_seen": 38153504, + "step": 22265 + }, + { + "epoch": 108.10653753026634, + "grad_norm": 5.487900622824782e-08, + "learning_rate": 0.12341023490845361, + "loss": 0.0, + "num_input_tokens_seen": 38161952, + "step": 22270 + }, + { + "epoch": 108.13075060532688, + "grad_norm": 1.0510749604009106e-07, + "learning_rate": 0.12335226496604437, + "loss": 0.0, + "num_input_tokens_seen": 38170368, + "step": 22275 + }, + { + "epoch": 108.15496368038741, + "grad_norm": 8.124297323774954e-08, + "learning_rate": 0.12329429913305069, + "loss": 0.0, + "num_input_tokens_seen": 38179168, + "step": 22280 + }, + { + "epoch": 108.17917675544794, + "grad_norm": 1.6216210951824905e-07, + "learning_rate": 0.12323633741841171, + "loss": 0.0, + "num_input_tokens_seen": 38187616, + "step": 22285 + }, + { + "epoch": 108.20338983050847, + "grad_norm": 1.9372546233853427e-08, + "learning_rate": 0.12317837983106583, + "loss": 0.0, + "num_input_tokens_seen": 38196064, + "step": 22290 + }, + { + "epoch": 108.227602905569, + "grad_norm": 8.742092916236288e-08, + "learning_rate": 0.12312042637995087, + "loss": 0.0, + "num_input_tokens_seen": 38204480, + "step": 22295 + }, + { + "epoch": 108.25181598062954, + "grad_norm": 4.432110500829367e-08, + "learning_rate": 0.12306247707400389, + "loss": 0.0, + "num_input_tokens_seen": 38213440, + "step": 22300 + }, + { + "epoch": 108.27602905569007, + "grad_norm": 4.1502403291815426e-08, + "learning_rate": 0.12300453192216154, + "loss": 0.0, + "num_input_tokens_seen": 38221504, + "step": 22305 + }, + { + "epoch": 108.3002421307506, + "grad_norm": 5.881955900122193e-08, + "learning_rate": 0.12294659093335956, + "loss": 0.0, + "num_input_tokens_seen": 38230176, + "step": 22310 + }, + { + "epoch": 108.32445520581113, + "grad_norm": 5.450944584595163e-08, + "learning_rate": 0.12288865411653327, + "loss": 0.0, + "num_input_tokens_seen": 38238688, + "step": 22315 + }, + { + "epoch": 108.34866828087166, + "grad_norm": 3.9315864341915585e-08, + "learning_rate": 0.12283072148061717, + "loss": 0.0, + "num_input_tokens_seen": 38247328, + "step": 22320 + }, + { + "epoch": 108.37288135593221, + "grad_norm": 4.125050878656111e-08, + "learning_rate": 0.12277279303454529, + "loss": 0.0, + "num_input_tokens_seen": 38256032, + "step": 22325 + }, + { + "epoch": 108.39709443099274, + "grad_norm": 8.356524006103427e-08, + "learning_rate": 0.12271486878725091, + "loss": 0.0, + "num_input_tokens_seen": 38264416, + "step": 22330 + }, + { + "epoch": 108.42130750605327, + "grad_norm": 1.2977243102341163e-07, + "learning_rate": 0.12265694874766658, + "loss": 0.0, + "num_input_tokens_seen": 38272864, + "step": 22335 + }, + { + "epoch": 108.4455205811138, + "grad_norm": 9.538880618720214e-08, + "learning_rate": 0.12259903292472435, + "loss": 0.0, + "num_input_tokens_seen": 38281312, + "step": 22340 + }, + { + "epoch": 108.46973365617433, + "grad_norm": 1.0462959920687354e-07, + "learning_rate": 0.12254112132735567, + "loss": 0.0, + "num_input_tokens_seen": 38289632, + "step": 22345 + }, + { + "epoch": 108.49394673123487, + "grad_norm": 4.7236987654741824e-08, + "learning_rate": 0.12248321396449108, + "loss": 0.0, + "num_input_tokens_seen": 38297952, + "step": 22350 + }, + { + "epoch": 108.5181598062954, + "grad_norm": 1.949293846337241e-07, + "learning_rate": 0.12242531084506075, + "loss": 0.0, + "num_input_tokens_seen": 38306784, + "step": 22355 + }, + { + "epoch": 108.54237288135593, + "grad_norm": 9.848530169165315e-08, + "learning_rate": 0.122367411977994, + "loss": 0.0, + "num_input_tokens_seen": 38315616, + "step": 22360 + }, + { + "epoch": 108.56658595641646, + "grad_norm": 2.6574294764714068e-08, + "learning_rate": 0.12230951737221954, + "loss": 0.0, + "num_input_tokens_seen": 38324192, + "step": 22365 + }, + { + "epoch": 108.59079903147699, + "grad_norm": 1.082752945080756e-07, + "learning_rate": 0.12225162703666555, + "loss": 0.0, + "num_input_tokens_seen": 38332512, + "step": 22370 + }, + { + "epoch": 108.61501210653753, + "grad_norm": 7.187881578829547e-08, + "learning_rate": 0.1221937409802593, + "loss": 0.0, + "num_input_tokens_seen": 38340928, + "step": 22375 + }, + { + "epoch": 108.63922518159806, + "grad_norm": 1.0672827954749664e-07, + "learning_rate": 0.12213585921192768, + "loss": 0.0, + "num_input_tokens_seen": 38349504, + "step": 22380 + }, + { + "epoch": 108.6634382566586, + "grad_norm": 5.6576261897589575e-08, + "learning_rate": 0.1220779817405967, + "loss": 0.0, + "num_input_tokens_seen": 38357696, + "step": 22385 + }, + { + "epoch": 108.68765133171912, + "grad_norm": 9.256300614879365e-08, + "learning_rate": 0.12202010857519181, + "loss": 0.0, + "num_input_tokens_seen": 38366176, + "step": 22390 + }, + { + "epoch": 108.71186440677967, + "grad_norm": 8.292302311474486e-08, + "learning_rate": 0.12196223972463785, + "loss": 0.0, + "num_input_tokens_seen": 38375104, + "step": 22395 + }, + { + "epoch": 108.7360774818402, + "grad_norm": 9.528656619295361e-08, + "learning_rate": 0.12190437519785885, + "loss": 0.0, + "num_input_tokens_seen": 38383744, + "step": 22400 + }, + { + "epoch": 108.7360774818402, + "eval_loss": 1.013824701309204, + "eval_runtime": 4.6544, + "eval_samples_per_second": 78.85, + "eval_steps_per_second": 19.766, + "num_input_tokens_seen": 38383744, + "step": 22400 + }, + { + "epoch": 108.76029055690073, + "grad_norm": 6.431558574604423e-08, + "learning_rate": 0.12184651500377823, + "loss": 0.0, + "num_input_tokens_seen": 38392448, + "step": 22405 + }, + { + "epoch": 108.78450363196126, + "grad_norm": 5.572868744252446e-08, + "learning_rate": 0.12178865915131885, + "loss": 0.0, + "num_input_tokens_seen": 38400768, + "step": 22410 + }, + { + "epoch": 108.80871670702179, + "grad_norm": 1.4313867779947032e-07, + "learning_rate": 0.1217308076494027, + "loss": 0.0, + "num_input_tokens_seen": 38409376, + "step": 22415 + }, + { + "epoch": 108.83292978208233, + "grad_norm": 5.575287786996341e-08, + "learning_rate": 0.12167296050695134, + "loss": 0.0, + "num_input_tokens_seen": 38417984, + "step": 22420 + }, + { + "epoch": 108.85714285714286, + "grad_norm": 5.4395616899682864e-08, + "learning_rate": 0.12161511773288536, + "loss": 0.0, + "num_input_tokens_seen": 38426720, + "step": 22425 + }, + { + "epoch": 108.88135593220339, + "grad_norm": 1.2381543967876496e-07, + "learning_rate": 0.121557279336125, + "loss": 0.0, + "num_input_tokens_seen": 38435328, + "step": 22430 + }, + { + "epoch": 108.90556900726392, + "grad_norm": 1.708667554112253e-07, + "learning_rate": 0.12149944532558957, + "loss": 0.0, + "num_input_tokens_seen": 38443776, + "step": 22435 + }, + { + "epoch": 108.92978208232445, + "grad_norm": 1.3152592259757512e-07, + "learning_rate": 0.12144161571019785, + "loss": 0.0, + "num_input_tokens_seen": 38452160, + "step": 22440 + }, + { + "epoch": 108.953995157385, + "grad_norm": 9.563572689330613e-08, + "learning_rate": 0.12138379049886781, + "loss": 0.0, + "num_input_tokens_seen": 38460992, + "step": 22445 + }, + { + "epoch": 108.97820823244552, + "grad_norm": 7.381081701396397e-08, + "learning_rate": 0.12132596970051697, + "loss": 0.0, + "num_input_tokens_seen": 38469504, + "step": 22450 + }, + { + "epoch": 109.00484261501211, + "grad_norm": 9.3032554104866e-08, + "learning_rate": 0.12126815332406189, + "loss": 0.0, + "num_input_tokens_seen": 38478048, + "step": 22455 + }, + { + "epoch": 109.02905569007264, + "grad_norm": 1.1066126859304859e-07, + "learning_rate": 0.12121034137841868, + "loss": 0.0, + "num_input_tokens_seen": 38486912, + "step": 22460 + }, + { + "epoch": 109.05326876513317, + "grad_norm": 1.5014560972304025e-07, + "learning_rate": 0.12115253387250258, + "loss": 0.0, + "num_input_tokens_seen": 38495520, + "step": 22465 + }, + { + "epoch": 109.0774818401937, + "grad_norm": 9.173493253911147e-08, + "learning_rate": 0.12109473081522831, + "loss": 0.0, + "num_input_tokens_seen": 38503968, + "step": 22470 + }, + { + "epoch": 109.10169491525424, + "grad_norm": 1.2690978223872662e-07, + "learning_rate": 0.12103693221550982, + "loss": 0.0, + "num_input_tokens_seen": 38512640, + "step": 22475 + }, + { + "epoch": 109.12590799031477, + "grad_norm": 1.9001022977249704e-08, + "learning_rate": 0.12097913808226027, + "loss": 0.0, + "num_input_tokens_seen": 38520992, + "step": 22480 + }, + { + "epoch": 109.1501210653753, + "grad_norm": 5.448232087701399e-08, + "learning_rate": 0.12092134842439234, + "loss": 0.0, + "num_input_tokens_seen": 38529568, + "step": 22485 + }, + { + "epoch": 109.17433414043583, + "grad_norm": 1.694149034392467e-07, + "learning_rate": 0.12086356325081798, + "loss": 0.0, + "num_input_tokens_seen": 38538272, + "step": 22490 + }, + { + "epoch": 109.19854721549636, + "grad_norm": 1.5886469384440716e-07, + "learning_rate": 0.12080578257044824, + "loss": 0.0, + "num_input_tokens_seen": 38546560, + "step": 22495 + }, + { + "epoch": 109.2227602905569, + "grad_norm": 7.912734645287856e-08, + "learning_rate": 0.12074800639219378, + "loss": 0.0, + "num_input_tokens_seen": 38555168, + "step": 22500 + }, + { + "epoch": 109.24697336561744, + "grad_norm": 6.170501620772484e-08, + "learning_rate": 0.12069023472496428, + "loss": 0.0, + "num_input_tokens_seen": 38563648, + "step": 22505 + }, + { + "epoch": 109.27118644067797, + "grad_norm": 1.0759193003195833e-07, + "learning_rate": 0.12063246757766893, + "loss": 0.0, + "num_input_tokens_seen": 38572352, + "step": 22510 + }, + { + "epoch": 109.2953995157385, + "grad_norm": 5.205304987043746e-08, + "learning_rate": 0.12057470495921618, + "loss": 0.0, + "num_input_tokens_seen": 38580608, + "step": 22515 + }, + { + "epoch": 109.31961259079903, + "grad_norm": 7.717803640616694e-08, + "learning_rate": 0.12051694687851364, + "loss": 0.0, + "num_input_tokens_seen": 38589280, + "step": 22520 + }, + { + "epoch": 109.34382566585957, + "grad_norm": 1.6851419104568777e-07, + "learning_rate": 0.12045919334446839, + "loss": 0.0, + "num_input_tokens_seen": 38597696, + "step": 22525 + }, + { + "epoch": 109.3680387409201, + "grad_norm": 2.0680109003023972e-07, + "learning_rate": 0.12040144436598683, + "loss": 0.0, + "num_input_tokens_seen": 38606368, + "step": 22530 + }, + { + "epoch": 109.39225181598063, + "grad_norm": 1.964501024076526e-07, + "learning_rate": 0.12034369995197444, + "loss": 0.0, + "num_input_tokens_seen": 38614976, + "step": 22535 + }, + { + "epoch": 109.41646489104116, + "grad_norm": 1.1207340833152557e-07, + "learning_rate": 0.12028596011133627, + "loss": 0.0, + "num_input_tokens_seen": 38623104, + "step": 22540 + }, + { + "epoch": 109.44067796610169, + "grad_norm": 5.311931516871482e-08, + "learning_rate": 0.12022822485297643, + "loss": 0.0, + "num_input_tokens_seen": 38631712, + "step": 22545 + }, + { + "epoch": 109.46489104116223, + "grad_norm": 1.0361836899619448e-07, + "learning_rate": 0.12017049418579843, + "loss": 0.0, + "num_input_tokens_seen": 38640000, + "step": 22550 + }, + { + "epoch": 109.48910411622276, + "grad_norm": 9.383804666640572e-08, + "learning_rate": 0.12011276811870514, + "loss": 0.0, + "num_input_tokens_seen": 38648736, + "step": 22555 + }, + { + "epoch": 109.51331719128329, + "grad_norm": 1.1626076457105228e-07, + "learning_rate": 0.12005504666059852, + "loss": 0.0, + "num_input_tokens_seen": 38657312, + "step": 22560 + }, + { + "epoch": 109.53753026634382, + "grad_norm": 1.5324727087318024e-07, + "learning_rate": 0.11999732982038003, + "loss": 0.0, + "num_input_tokens_seen": 38666112, + "step": 22565 + }, + { + "epoch": 109.56174334140435, + "grad_norm": 6.034650112951567e-08, + "learning_rate": 0.11993961760695038, + "loss": 0.0, + "num_input_tokens_seen": 38674848, + "step": 22570 + }, + { + "epoch": 109.5859564164649, + "grad_norm": 1.248434813305721e-07, + "learning_rate": 0.11988191002920942, + "loss": 0.0, + "num_input_tokens_seen": 38683488, + "step": 22575 + }, + { + "epoch": 109.61016949152543, + "grad_norm": 7.622067954571321e-08, + "learning_rate": 0.11982420709605641, + "loss": 0.0, + "num_input_tokens_seen": 38692256, + "step": 22580 + }, + { + "epoch": 109.63438256658596, + "grad_norm": 6.337529612210346e-08, + "learning_rate": 0.11976650881638991, + "loss": 0.0, + "num_input_tokens_seen": 38700832, + "step": 22585 + }, + { + "epoch": 109.65859564164649, + "grad_norm": 1.2613952549145324e-07, + "learning_rate": 0.11970881519910764, + "loss": 0.0, + "num_input_tokens_seen": 38709632, + "step": 22590 + }, + { + "epoch": 109.68280871670702, + "grad_norm": 1.9177763022071304e-07, + "learning_rate": 0.1196511262531068, + "loss": 0.0, + "num_input_tokens_seen": 38718208, + "step": 22595 + }, + { + "epoch": 109.70702179176756, + "grad_norm": 1.1243852782172326e-07, + "learning_rate": 0.11959344198728361, + "loss": 0.0, + "num_input_tokens_seen": 38726880, + "step": 22600 + }, + { + "epoch": 109.70702179176756, + "eval_loss": 1.0151851177215576, + "eval_runtime": 4.6229, + "eval_samples_per_second": 79.387, + "eval_steps_per_second": 19.901, + "num_input_tokens_seen": 38726880, + "step": 22600 + }, + { + "epoch": 109.73123486682809, + "grad_norm": 9.521001942403018e-08, + "learning_rate": 0.11953576241053378, + "loss": 0.0, + "num_input_tokens_seen": 38734976, + "step": 22605 + }, + { + "epoch": 109.75544794188862, + "grad_norm": 3.7249453299637025e-08, + "learning_rate": 0.11947808753175228, + "loss": 0.0, + "num_input_tokens_seen": 38743264, + "step": 22610 + }, + { + "epoch": 109.77966101694915, + "grad_norm": 5.950654013986423e-08, + "learning_rate": 0.1194204173598332, + "loss": 0.0, + "num_input_tokens_seen": 38751520, + "step": 22615 + }, + { + "epoch": 109.80387409200968, + "grad_norm": 1.0274720096958845e-07, + "learning_rate": 0.11936275190367007, + "loss": 0.0, + "num_input_tokens_seen": 38760064, + "step": 22620 + }, + { + "epoch": 109.82808716707022, + "grad_norm": 7.891017617112084e-08, + "learning_rate": 0.11930509117215563, + "loss": 0.0, + "num_input_tokens_seen": 38768672, + "step": 22625 + }, + { + "epoch": 109.85230024213075, + "grad_norm": 5.9272856844927446e-08, + "learning_rate": 0.11924743517418179, + "loss": 0.0, + "num_input_tokens_seen": 38777248, + "step": 22630 + }, + { + "epoch": 109.87651331719128, + "grad_norm": 3.089210309781265e-08, + "learning_rate": 0.11918978391864, + "loss": 0.0, + "num_input_tokens_seen": 38785984, + "step": 22635 + }, + { + "epoch": 109.90072639225181, + "grad_norm": 8.267634399317103e-08, + "learning_rate": 0.11913213741442065, + "loss": 0.0, + "num_input_tokens_seen": 38794464, + "step": 22640 + }, + { + "epoch": 109.92493946731234, + "grad_norm": 1.1226645568740423e-07, + "learning_rate": 0.11907449567041364, + "loss": 0.0, + "num_input_tokens_seen": 38802880, + "step": 22645 + }, + { + "epoch": 109.94915254237289, + "grad_norm": 8.422440345157156e-08, + "learning_rate": 0.11901685869550803, + "loss": 0.0, + "num_input_tokens_seen": 38811488, + "step": 22650 + }, + { + "epoch": 109.97336561743342, + "grad_norm": 1.470242665391197e-07, + "learning_rate": 0.1189592264985922, + "loss": 0.0, + "num_input_tokens_seen": 38820352, + "step": 22655 + }, + { + "epoch": 109.99757869249395, + "grad_norm": 5.080260478962373e-08, + "learning_rate": 0.11890159908855373, + "loss": 0.0, + "num_input_tokens_seen": 38828864, + "step": 22660 + }, + { + "epoch": 110.02421307506053, + "grad_norm": 1.1813368416824233e-07, + "learning_rate": 0.11884397647427941, + "loss": 0.0, + "num_input_tokens_seen": 38837664, + "step": 22665 + }, + { + "epoch": 110.04842615012106, + "grad_norm": 6.164180632595162e-08, + "learning_rate": 0.11878635866465546, + "loss": 0.0, + "num_input_tokens_seen": 38846176, + "step": 22670 + }, + { + "epoch": 110.0726392251816, + "grad_norm": 1.657113983810632e-07, + "learning_rate": 0.11872874566856734, + "loss": 0.0, + "num_input_tokens_seen": 38854816, + "step": 22675 + }, + { + "epoch": 110.09685230024213, + "grad_norm": 4.4299593326968534e-08, + "learning_rate": 0.11867113749489955, + "loss": 0.0, + "num_input_tokens_seen": 38863456, + "step": 22680 + }, + { + "epoch": 110.12106537530266, + "grad_norm": 7.586461236996911e-08, + "learning_rate": 0.11861353415253607, + "loss": 0.0, + "num_input_tokens_seen": 38872192, + "step": 22685 + }, + { + "epoch": 110.1452784503632, + "grad_norm": 9.643174792017817e-08, + "learning_rate": 0.11855593565036011, + "loss": 0.0, + "num_input_tokens_seen": 38880512, + "step": 22690 + }, + { + "epoch": 110.16949152542372, + "grad_norm": 3.933682535262051e-08, + "learning_rate": 0.11849834199725394, + "loss": 0.0, + "num_input_tokens_seen": 38889088, + "step": 22695 + }, + { + "epoch": 110.19370460048427, + "grad_norm": 6.380586370369201e-08, + "learning_rate": 0.1184407532020994, + "loss": 0.0, + "num_input_tokens_seen": 38897344, + "step": 22700 + }, + { + "epoch": 110.2179176755448, + "grad_norm": 2.6546700837570825e-08, + "learning_rate": 0.11838316927377723, + "loss": 0.0, + "num_input_tokens_seen": 38905920, + "step": 22705 + }, + { + "epoch": 110.24213075060533, + "grad_norm": 8.314097499351192e-08, + "learning_rate": 0.11832559022116766, + "loss": 0.0, + "num_input_tokens_seen": 38914240, + "step": 22710 + }, + { + "epoch": 110.26634382566586, + "grad_norm": 7.257145284711441e-08, + "learning_rate": 0.11826801605315022, + "loss": 0.0, + "num_input_tokens_seen": 38922688, + "step": 22715 + }, + { + "epoch": 110.29055690072639, + "grad_norm": 5.5239791407757366e-08, + "learning_rate": 0.1182104467786034, + "loss": 0.0, + "num_input_tokens_seen": 38931104, + "step": 22720 + }, + { + "epoch": 110.31476997578693, + "grad_norm": 4.3688505257932775e-08, + "learning_rate": 0.1181528824064052, + "loss": 0.0, + "num_input_tokens_seen": 38939808, + "step": 22725 + }, + { + "epoch": 110.33898305084746, + "grad_norm": 5.473125952448754e-08, + "learning_rate": 0.11809532294543279, + "loss": 0.0, + "num_input_tokens_seen": 38948224, + "step": 22730 + }, + { + "epoch": 110.36319612590799, + "grad_norm": 4.863703395585617e-08, + "learning_rate": 0.11803776840456245, + "loss": 0.0, + "num_input_tokens_seen": 38956480, + "step": 22735 + }, + { + "epoch": 110.38740920096852, + "grad_norm": 9.147169066636707e-08, + "learning_rate": 0.11798021879266997, + "loss": 0.0, + "num_input_tokens_seen": 38965440, + "step": 22740 + }, + { + "epoch": 110.41162227602905, + "grad_norm": 6.61005046254104e-08, + "learning_rate": 0.11792267411863006, + "loss": 0.0, + "num_input_tokens_seen": 38973984, + "step": 22745 + }, + { + "epoch": 110.4358353510896, + "grad_norm": 6.98559787792874e-08, + "learning_rate": 0.1178651343913169, + "loss": 0.0, + "num_input_tokens_seen": 38982432, + "step": 22750 + }, + { + "epoch": 110.46004842615012, + "grad_norm": 7.624117159821253e-08, + "learning_rate": 0.11780759961960392, + "loss": 0.0, + "num_input_tokens_seen": 38991040, + "step": 22755 + }, + { + "epoch": 110.48426150121065, + "grad_norm": 5.7474462522577596e-08, + "learning_rate": 0.1177500698123636, + "loss": 0.0, + "num_input_tokens_seen": 39000096, + "step": 22760 + }, + { + "epoch": 110.50847457627118, + "grad_norm": 7.349165542791525e-08, + "learning_rate": 0.11769254497846778, + "loss": 0.0, + "num_input_tokens_seen": 39008672, + "step": 22765 + }, + { + "epoch": 110.53268765133171, + "grad_norm": 3.6409897319344964e-08, + "learning_rate": 0.11763502512678758, + "loss": 0.0, + "num_input_tokens_seen": 39017216, + "step": 22770 + }, + { + "epoch": 110.55690072639226, + "grad_norm": 1.0460785659915928e-07, + "learning_rate": 0.11757751026619315, + "loss": 0.0, + "num_input_tokens_seen": 39025696, + "step": 22775 + }, + { + "epoch": 110.58111380145279, + "grad_norm": 1.7628977388994826e-07, + "learning_rate": 0.11752000040555416, + "loss": 0.0, + "num_input_tokens_seen": 39034272, + "step": 22780 + }, + { + "epoch": 110.60532687651332, + "grad_norm": 4.691309385407294e-08, + "learning_rate": 0.11746249555373921, + "loss": 0.0, + "num_input_tokens_seen": 39043232, + "step": 22785 + }, + { + "epoch": 110.62953995157385, + "grad_norm": 2.9279584978780804e-08, + "learning_rate": 0.11740499571961638, + "loss": 0.0, + "num_input_tokens_seen": 39051872, + "step": 22790 + }, + { + "epoch": 110.65375302663438, + "grad_norm": 5.3428546920031295e-08, + "learning_rate": 0.11734750091205279, + "loss": 0.0, + "num_input_tokens_seen": 39060384, + "step": 22795 + }, + { + "epoch": 110.67796610169492, + "grad_norm": 5.329481211902021e-08, + "learning_rate": 0.11729001113991493, + "loss": 0.0, + "num_input_tokens_seen": 39068512, + "step": 22800 + }, + { + "epoch": 110.67796610169492, + "eval_loss": 1.030229926109314, + "eval_runtime": 4.6108, + "eval_samples_per_second": 79.597, + "eval_steps_per_second": 19.953, + "num_input_tokens_seen": 39068512, + "step": 22800 + }, + { + "epoch": 110.70217917675545, + "grad_norm": 1.1463770022146491e-07, + "learning_rate": 0.11723252641206837, + "loss": 0.0, + "num_input_tokens_seen": 39077152, + "step": 22805 + }, + { + "epoch": 110.72639225181598, + "grad_norm": 8.120916561438207e-08, + "learning_rate": 0.11717504673737808, + "loss": 0.0, + "num_input_tokens_seen": 39085632, + "step": 22810 + }, + { + "epoch": 110.75060532687651, + "grad_norm": 1.1981025238583243e-07, + "learning_rate": 0.11711757212470802, + "loss": 0.0, + "num_input_tokens_seen": 39094208, + "step": 22815 + }, + { + "epoch": 110.77481840193704, + "grad_norm": 1.0002970185496451e-07, + "learning_rate": 0.11706010258292165, + "loss": 0.0, + "num_input_tokens_seen": 39102720, + "step": 22820 + }, + { + "epoch": 110.79903147699758, + "grad_norm": 5.4080501854514296e-08, + "learning_rate": 0.11700263812088131, + "loss": 0.0, + "num_input_tokens_seen": 39111648, + "step": 22825 + }, + { + "epoch": 110.82324455205811, + "grad_norm": 3.2217513989962754e-08, + "learning_rate": 0.11694517874744892, + "loss": 0.0, + "num_input_tokens_seen": 39120256, + "step": 22830 + }, + { + "epoch": 110.84745762711864, + "grad_norm": 5.1037996939840014e-08, + "learning_rate": 0.11688772447148532, + "loss": 0.0, + "num_input_tokens_seen": 39128608, + "step": 22835 + }, + { + "epoch": 110.87167070217917, + "grad_norm": 1.93148480320815e-07, + "learning_rate": 0.11683027530185074, + "loss": 0.0, + "num_input_tokens_seen": 39137248, + "step": 22840 + }, + { + "epoch": 110.8958837772397, + "grad_norm": 9.683281376737796e-08, + "learning_rate": 0.11677283124740451, + "loss": 0.0, + "num_input_tokens_seen": 39145920, + "step": 22845 + }, + { + "epoch": 110.92009685230025, + "grad_norm": 1.3644088880937488e-07, + "learning_rate": 0.11671539231700531, + "loss": 0.0, + "num_input_tokens_seen": 39154272, + "step": 22850 + }, + { + "epoch": 110.94430992736078, + "grad_norm": 5.3274799682867524e-08, + "learning_rate": 0.11665795851951084, + "loss": 0.0, + "num_input_tokens_seen": 39163200, + "step": 22855 + }, + { + "epoch": 110.9685230024213, + "grad_norm": 2.975244939307231e-08, + "learning_rate": 0.11660052986377825, + "loss": 0.0, + "num_input_tokens_seen": 39172064, + "step": 22860 + }, + { + "epoch": 110.99273607748184, + "grad_norm": 8.453328348423383e-08, + "learning_rate": 0.1165431063586636, + "loss": 0.0, + "num_input_tokens_seen": 39180160, + "step": 22865 + }, + { + "epoch": 111.01937046004842, + "grad_norm": 1.5282546428352362e-07, + "learning_rate": 0.11648568801302245, + "loss": 0.0, + "num_input_tokens_seen": 39189408, + "step": 22870 + }, + { + "epoch": 111.04358353510897, + "grad_norm": 3.896265710068292e-08, + "learning_rate": 0.11642827483570937, + "loss": 0.0, + "num_input_tokens_seen": 39198400, + "step": 22875 + }, + { + "epoch": 111.0677966101695, + "grad_norm": 7.76259980739269e-08, + "learning_rate": 0.11637086683557815, + "loss": 0.0, + "num_input_tokens_seen": 39206816, + "step": 22880 + }, + { + "epoch": 111.09200968523002, + "grad_norm": 5.699712346540764e-08, + "learning_rate": 0.11631346402148188, + "loss": 0.0, + "num_input_tokens_seen": 39215360, + "step": 22885 + }, + { + "epoch": 111.11622276029055, + "grad_norm": 6.849461442470783e-08, + "learning_rate": 0.11625606640227285, + "loss": 0.0, + "num_input_tokens_seen": 39223904, + "step": 22890 + }, + { + "epoch": 111.14043583535108, + "grad_norm": 1.2565462270686112e-07, + "learning_rate": 0.11619867398680238, + "loss": 0.0, + "num_input_tokens_seen": 39232352, + "step": 22895 + }, + { + "epoch": 111.16464891041163, + "grad_norm": 6.104190219957673e-08, + "learning_rate": 0.11614128678392119, + "loss": 0.0, + "num_input_tokens_seen": 39241056, + "step": 22900 + }, + { + "epoch": 111.18886198547216, + "grad_norm": 4.828234523301944e-08, + "learning_rate": 0.11608390480247906, + "loss": 0.0, + "num_input_tokens_seen": 39249408, + "step": 22905 + }, + { + "epoch": 111.21307506053269, + "grad_norm": 4.12459186804881e-08, + "learning_rate": 0.11602652805132499, + "loss": 0.0, + "num_input_tokens_seen": 39258528, + "step": 22910 + }, + { + "epoch": 111.23728813559322, + "grad_norm": 4.82680739821717e-08, + "learning_rate": 0.11596915653930731, + "loss": 0.0, + "num_input_tokens_seen": 39266432, + "step": 22915 + }, + { + "epoch": 111.26150121065375, + "grad_norm": 1.1407774280769445e-07, + "learning_rate": 0.11591179027527328, + "loss": 0.0, + "num_input_tokens_seen": 39274912, + "step": 22920 + }, + { + "epoch": 111.28571428571429, + "grad_norm": 1.3662801734426466e-07, + "learning_rate": 0.11585442926806956, + "loss": 0.0, + "num_input_tokens_seen": 39283296, + "step": 22925 + }, + { + "epoch": 111.30992736077482, + "grad_norm": 3.2357394985638166e-08, + "learning_rate": 0.11579707352654202, + "loss": 0.0, + "num_input_tokens_seen": 39292288, + "step": 22930 + }, + { + "epoch": 111.33414043583535, + "grad_norm": 4.153328347911156e-08, + "learning_rate": 0.11573972305953548, + "loss": 0.0, + "num_input_tokens_seen": 39300448, + "step": 22935 + }, + { + "epoch": 111.35835351089588, + "grad_norm": 1.4385963709173666e-07, + "learning_rate": 0.11568237787589426, + "loss": 0.0, + "num_input_tokens_seen": 39308992, + "step": 22940 + }, + { + "epoch": 111.38256658595641, + "grad_norm": 4.166952649597988e-08, + "learning_rate": 0.11562503798446161, + "loss": 0.0, + "num_input_tokens_seen": 39317440, + "step": 22945 + }, + { + "epoch": 111.40677966101696, + "grad_norm": 6.49717861733734e-08, + "learning_rate": 0.11556770339408005, + "loss": 0.0, + "num_input_tokens_seen": 39325824, + "step": 22950 + }, + { + "epoch": 111.43099273607749, + "grad_norm": 6.56313829949795e-08, + "learning_rate": 0.1155103741135914, + "loss": 0.0, + "num_input_tokens_seen": 39334464, + "step": 22955 + }, + { + "epoch": 111.45520581113801, + "grad_norm": 5.976784223094e-08, + "learning_rate": 0.1154530501518364, + "loss": 0.0, + "num_input_tokens_seen": 39342912, + "step": 22960 + }, + { + "epoch": 111.47941888619854, + "grad_norm": 1.435338106148265e-07, + "learning_rate": 0.11539573151765523, + "loss": 0.0, + "num_input_tokens_seen": 39351488, + "step": 22965 + }, + { + "epoch": 111.50363196125907, + "grad_norm": 1.0639618608365708e-07, + "learning_rate": 0.11533841821988719, + "loss": 0.0, + "num_input_tokens_seen": 39359840, + "step": 22970 + }, + { + "epoch": 111.52784503631962, + "grad_norm": 6.192473733790393e-08, + "learning_rate": 0.11528111026737059, + "loss": 0.0, + "num_input_tokens_seen": 39368192, + "step": 22975 + }, + { + "epoch": 111.55205811138015, + "grad_norm": 1.8616941588334157e-07, + "learning_rate": 0.11522380766894312, + "loss": 0.0, + "num_input_tokens_seen": 39376736, + "step": 22980 + }, + { + "epoch": 111.57627118644068, + "grad_norm": 1.5463405134141794e-07, + "learning_rate": 0.11516651043344152, + "loss": 0.0, + "num_input_tokens_seen": 39385344, + "step": 22985 + }, + { + "epoch": 111.60048426150121, + "grad_norm": 6.125790008582044e-08, + "learning_rate": 0.11510921856970172, + "loss": 0.0, + "num_input_tokens_seen": 39394400, + "step": 22990 + }, + { + "epoch": 111.62469733656174, + "grad_norm": 1.1295767876617901e-07, + "learning_rate": 0.11505193208655895, + "loss": 0.0, + "num_input_tokens_seen": 39402976, + "step": 22995 + }, + { + "epoch": 111.64891041162228, + "grad_norm": 1.000828291353173e-07, + "learning_rate": 0.11499465099284738, + "loss": 0.0, + "num_input_tokens_seen": 39411712, + "step": 23000 + }, + { + "epoch": 111.64891041162228, + "eval_loss": 1.0302379131317139, + "eval_runtime": 4.6055, + "eval_samples_per_second": 79.687, + "eval_steps_per_second": 19.976, + "num_input_tokens_seen": 39411712, + "step": 23000 + }, + { + "epoch": 111.67312348668281, + "grad_norm": 3.880646914922181e-08, + "learning_rate": 0.1149373752974006, + "loss": 0.0, + "num_input_tokens_seen": 39420096, + "step": 23005 + }, + { + "epoch": 111.69733656174334, + "grad_norm": 4.183156221415629e-08, + "learning_rate": 0.11488010500905109, + "loss": 0.0, + "num_input_tokens_seen": 39428288, + "step": 23010 + }, + { + "epoch": 111.72154963680387, + "grad_norm": 1.3119914399339905e-07, + "learning_rate": 0.11482284013663077, + "loss": 0.0, + "num_input_tokens_seen": 39436832, + "step": 23015 + }, + { + "epoch": 111.7457627118644, + "grad_norm": 4.51932784528708e-08, + "learning_rate": 0.11476558068897061, + "loss": 0.0, + "num_input_tokens_seen": 39445248, + "step": 23020 + }, + { + "epoch": 111.76997578692495, + "grad_norm": 9.912161402780839e-08, + "learning_rate": 0.11470832667490061, + "loss": 0.0, + "num_input_tokens_seen": 39453952, + "step": 23025 + }, + { + "epoch": 111.79418886198548, + "grad_norm": 1.1875435745878349e-07, + "learning_rate": 0.11465107810325013, + "loss": 0.0, + "num_input_tokens_seen": 39462464, + "step": 23030 + }, + { + "epoch": 111.818401937046, + "grad_norm": 4.450557966606539e-08, + "learning_rate": 0.11459383498284771, + "loss": 0.0, + "num_input_tokens_seen": 39470976, + "step": 23035 + }, + { + "epoch": 111.84261501210653, + "grad_norm": 5.823168081064978e-08, + "learning_rate": 0.11453659732252082, + "loss": 0.0, + "num_input_tokens_seen": 39479648, + "step": 23040 + }, + { + "epoch": 111.86682808716706, + "grad_norm": 9.47912894844194e-08, + "learning_rate": 0.11447936513109633, + "loss": 0.0, + "num_input_tokens_seen": 39488160, + "step": 23045 + }, + { + "epoch": 111.89104116222761, + "grad_norm": 1.2620718337075232e-07, + "learning_rate": 0.11442213841740011, + "loss": 0.0, + "num_input_tokens_seen": 39496416, + "step": 23050 + }, + { + "epoch": 111.91525423728814, + "grad_norm": 6.259332963054476e-08, + "learning_rate": 0.1143649171902572, + "loss": 0.0, + "num_input_tokens_seen": 39504960, + "step": 23055 + }, + { + "epoch": 111.93946731234867, + "grad_norm": 8.750357238795914e-08, + "learning_rate": 0.11430770145849194, + "loss": 0.0, + "num_input_tokens_seen": 39513696, + "step": 23060 + }, + { + "epoch": 111.9636803874092, + "grad_norm": 2.563761469787096e-08, + "learning_rate": 0.11425049123092756, + "loss": 0.0, + "num_input_tokens_seen": 39522400, + "step": 23065 + }, + { + "epoch": 111.98789346246973, + "grad_norm": 1.0429961605495919e-07, + "learning_rate": 0.11419328651638674, + "loss": 0.0, + "num_input_tokens_seen": 39530912, + "step": 23070 + }, + { + "epoch": 112.01452784503633, + "grad_norm": 7.825443049114256e-08, + "learning_rate": 0.11413608732369115, + "loss": 0.0, + "num_input_tokens_seen": 39540160, + "step": 23075 + }, + { + "epoch": 112.03874092009686, + "grad_norm": 5.66156899139969e-08, + "learning_rate": 0.11407889366166153, + "loss": 0.0, + "num_input_tokens_seen": 39548896, + "step": 23080 + }, + { + "epoch": 112.06295399515739, + "grad_norm": 6.932887686161848e-08, + "learning_rate": 0.11402170553911797, + "loss": 0.0, + "num_input_tokens_seen": 39557472, + "step": 23085 + }, + { + "epoch": 112.08716707021792, + "grad_norm": 7.896273501728501e-08, + "learning_rate": 0.11396452296487955, + "loss": 0.0, + "num_input_tokens_seen": 39566144, + "step": 23090 + }, + { + "epoch": 112.11138014527845, + "grad_norm": 6.670477148418286e-08, + "learning_rate": 0.11390734594776449, + "loss": 0.0, + "num_input_tokens_seen": 39574912, + "step": 23095 + }, + { + "epoch": 112.13559322033899, + "grad_norm": 3.561302364119001e-08, + "learning_rate": 0.11385017449659031, + "loss": 0.0, + "num_input_tokens_seen": 39583360, + "step": 23100 + }, + { + "epoch": 112.15980629539952, + "grad_norm": 6.234325411469399e-08, + "learning_rate": 0.11379300862017344, + "loss": 0.0, + "num_input_tokens_seen": 39592128, + "step": 23105 + }, + { + "epoch": 112.18401937046005, + "grad_norm": 7.609516927686855e-08, + "learning_rate": 0.11373584832732966, + "loss": 0.0, + "num_input_tokens_seen": 39600096, + "step": 23110 + }, + { + "epoch": 112.20823244552058, + "grad_norm": 3.9489396641556596e-08, + "learning_rate": 0.11367869362687386, + "loss": 0.0, + "num_input_tokens_seen": 39608576, + "step": 23115 + }, + { + "epoch": 112.23244552058111, + "grad_norm": 6.837309030061078e-08, + "learning_rate": 0.11362154452761988, + "loss": 0.0, + "num_input_tokens_seen": 39617664, + "step": 23120 + }, + { + "epoch": 112.25665859564165, + "grad_norm": 6.891595205615886e-08, + "learning_rate": 0.11356440103838095, + "loss": 0.0, + "num_input_tokens_seen": 39625920, + "step": 23125 + }, + { + "epoch": 112.28087167070218, + "grad_norm": 5.1696616765184444e-08, + "learning_rate": 0.11350726316796922, + "loss": 0.0, + "num_input_tokens_seen": 39634816, + "step": 23130 + }, + { + "epoch": 112.30508474576271, + "grad_norm": 1.1407674804786438e-07, + "learning_rate": 0.11345013092519607, + "loss": 0.0, + "num_input_tokens_seen": 39643328, + "step": 23135 + }, + { + "epoch": 112.32929782082324, + "grad_norm": 3.571779672029152e-08, + "learning_rate": 0.11339300431887213, + "loss": 0.0, + "num_input_tokens_seen": 39651648, + "step": 23140 + }, + { + "epoch": 112.35351089588377, + "grad_norm": 4.0486916930149164e-08, + "learning_rate": 0.11333588335780687, + "loss": 0.0, + "num_input_tokens_seen": 39660384, + "step": 23145 + }, + { + "epoch": 112.37772397094432, + "grad_norm": 9.10360142825084e-08, + "learning_rate": 0.11327876805080916, + "loss": 0.0, + "num_input_tokens_seen": 39668736, + "step": 23150 + }, + { + "epoch": 112.40193704600485, + "grad_norm": 1.048486382160263e-07, + "learning_rate": 0.11322165840668696, + "loss": 0.0, + "num_input_tokens_seen": 39677312, + "step": 23155 + }, + { + "epoch": 112.42615012106538, + "grad_norm": 1.0100946212787676e-07, + "learning_rate": 0.11316455443424717, + "loss": 0.0, + "num_input_tokens_seen": 39686304, + "step": 23160 + }, + { + "epoch": 112.4503631961259, + "grad_norm": 1.7361554682793212e-07, + "learning_rate": 0.11310745614229603, + "loss": 0.0, + "num_input_tokens_seen": 39695008, + "step": 23165 + }, + { + "epoch": 112.47457627118644, + "grad_norm": 6.223553583595276e-08, + "learning_rate": 0.1130503635396387, + "loss": 0.0, + "num_input_tokens_seen": 39703520, + "step": 23170 + }, + { + "epoch": 112.49878934624698, + "grad_norm": 5.420053028615257e-08, + "learning_rate": 0.11299327663507966, + "loss": 0.0, + "num_input_tokens_seen": 39712160, + "step": 23175 + }, + { + "epoch": 112.52300242130751, + "grad_norm": 7.422214309826813e-08, + "learning_rate": 0.11293619543742246, + "loss": 0.0, + "num_input_tokens_seen": 39720512, + "step": 23180 + }, + { + "epoch": 112.54721549636804, + "grad_norm": 4.0541024759477295e-08, + "learning_rate": 0.11287911995546965, + "loss": 0.0, + "num_input_tokens_seen": 39729056, + "step": 23185 + }, + { + "epoch": 112.57142857142857, + "grad_norm": 9.591972371936208e-08, + "learning_rate": 0.11282205019802308, + "loss": 0.0, + "num_input_tokens_seen": 39737696, + "step": 23190 + }, + { + "epoch": 112.5956416464891, + "grad_norm": 4.864543257099285e-08, + "learning_rate": 0.11276498617388354, + "loss": 0.0, + "num_input_tokens_seen": 39746112, + "step": 23195 + }, + { + "epoch": 112.61985472154964, + "grad_norm": 4.305427125927963e-08, + "learning_rate": 0.11270792789185109, + "loss": 0.0, + "num_input_tokens_seen": 39754784, + "step": 23200 + }, + { + "epoch": 112.61985472154964, + "eval_loss": 1.0323305130004883, + "eval_runtime": 4.6289, + "eval_samples_per_second": 79.284, + "eval_steps_per_second": 19.875, + "num_input_tokens_seen": 39754784, + "step": 23200 + }, + { + "epoch": 112.64406779661017, + "grad_norm": 8.263788231488434e-08, + "learning_rate": 0.11265087536072482, + "loss": 0.0, + "num_input_tokens_seen": 39763488, + "step": 23205 + }, + { + "epoch": 112.6682808716707, + "grad_norm": 6.208982483713044e-08, + "learning_rate": 0.11259382858930288, + "loss": 0.0, + "num_input_tokens_seen": 39771936, + "step": 23210 + }, + { + "epoch": 112.69249394673123, + "grad_norm": 1.4186744579092192e-07, + "learning_rate": 0.11253678758638262, + "loss": 0.0, + "num_input_tokens_seen": 39780512, + "step": 23215 + }, + { + "epoch": 112.71670702179176, + "grad_norm": 9.735097705743101e-08, + "learning_rate": 0.11247975236076059, + "loss": 0.0, + "num_input_tokens_seen": 39788832, + "step": 23220 + }, + { + "epoch": 112.7409200968523, + "grad_norm": 6.91987267487093e-08, + "learning_rate": 0.11242272292123218, + "loss": 0.0, + "num_input_tokens_seen": 39797376, + "step": 23225 + }, + { + "epoch": 112.76513317191284, + "grad_norm": 4.663720432063201e-08, + "learning_rate": 0.11236569927659217, + "loss": 0.0, + "num_input_tokens_seen": 39805600, + "step": 23230 + }, + { + "epoch": 112.78934624697337, + "grad_norm": 9.963680014379861e-08, + "learning_rate": 0.11230868143563429, + "loss": 0.0, + "num_input_tokens_seen": 39814368, + "step": 23235 + }, + { + "epoch": 112.8135593220339, + "grad_norm": 3.126800152131182e-08, + "learning_rate": 0.11225166940715131, + "loss": 0.0, + "num_input_tokens_seen": 39822592, + "step": 23240 + }, + { + "epoch": 112.83777239709443, + "grad_norm": 1.126105786397602e-07, + "learning_rate": 0.11219466319993537, + "loss": 0.0, + "num_input_tokens_seen": 39830944, + "step": 23245 + }, + { + "epoch": 112.86198547215497, + "grad_norm": 5.4789389025700075e-08, + "learning_rate": 0.11213766282277739, + "loss": 0.0, + "num_input_tokens_seen": 39839200, + "step": 23250 + }, + { + "epoch": 112.8861985472155, + "grad_norm": 8.796766337582085e-08, + "learning_rate": 0.11208066828446761, + "loss": 0.0, + "num_input_tokens_seen": 39847744, + "step": 23255 + }, + { + "epoch": 112.91041162227603, + "grad_norm": 5.12575013544847e-08, + "learning_rate": 0.11202367959379537, + "loss": 0.0, + "num_input_tokens_seen": 39856608, + "step": 23260 + }, + { + "epoch": 112.93462469733656, + "grad_norm": 5.3771998409501975e-08, + "learning_rate": 0.11196669675954894, + "loss": 0.0, + "num_input_tokens_seen": 39865152, + "step": 23265 + }, + { + "epoch": 112.95883777239709, + "grad_norm": 5.653591017562576e-08, + "learning_rate": 0.1119097197905158, + "loss": 0.0, + "num_input_tokens_seen": 39874272, + "step": 23270 + }, + { + "epoch": 112.98305084745763, + "grad_norm": 3.8184769124427476e-08, + "learning_rate": 0.11185274869548259, + "loss": 0.0, + "num_input_tokens_seen": 39882592, + "step": 23275 + }, + { + "epoch": 113.00968523002422, + "grad_norm": 3.1770010622267364e-08, + "learning_rate": 0.11179578348323486, + "loss": 0.0, + "num_input_tokens_seen": 39891520, + "step": 23280 + }, + { + "epoch": 113.03389830508475, + "grad_norm": 3.654333369240703e-08, + "learning_rate": 0.1117388241625575, + "loss": 0.0, + "num_input_tokens_seen": 39900256, + "step": 23285 + }, + { + "epoch": 113.05811138014528, + "grad_norm": 5.740884034821647e-08, + "learning_rate": 0.11168187074223421, + "loss": 0.0, + "num_input_tokens_seen": 39909056, + "step": 23290 + }, + { + "epoch": 113.08232445520581, + "grad_norm": 9.220751451266551e-08, + "learning_rate": 0.11162492323104796, + "loss": 0.0, + "num_input_tokens_seen": 39917600, + "step": 23295 + }, + { + "epoch": 113.10653753026634, + "grad_norm": 6.515236350423947e-08, + "learning_rate": 0.11156798163778091, + "loss": 0.0, + "num_input_tokens_seen": 39926368, + "step": 23300 + }, + { + "epoch": 113.13075060532688, + "grad_norm": 1.7036478539012023e-07, + "learning_rate": 0.11151104597121399, + "loss": 0.0, + "num_input_tokens_seen": 39935200, + "step": 23305 + }, + { + "epoch": 113.15496368038741, + "grad_norm": 6.798347129688409e-08, + "learning_rate": 0.11145411624012742, + "loss": 0.0, + "num_input_tokens_seen": 39944000, + "step": 23310 + }, + { + "epoch": 113.17917675544794, + "grad_norm": 2.24970015949566e-08, + "learning_rate": 0.11139719245330063, + "loss": 0.0, + "num_input_tokens_seen": 39952704, + "step": 23315 + }, + { + "epoch": 113.20338983050847, + "grad_norm": 8.541336882217365e-08, + "learning_rate": 0.11134027461951179, + "loss": 0.0, + "num_input_tokens_seen": 39960960, + "step": 23320 + }, + { + "epoch": 113.227602905569, + "grad_norm": 1.9386141048016725e-07, + "learning_rate": 0.11128336274753849, + "loss": 0.0, + "num_input_tokens_seen": 39969280, + "step": 23325 + }, + { + "epoch": 113.25181598062954, + "grad_norm": 1.0614157730515217e-07, + "learning_rate": 0.11122645684615715, + "loss": 0.0, + "num_input_tokens_seen": 39977856, + "step": 23330 + }, + { + "epoch": 113.27602905569007, + "grad_norm": 6.375273642333923e-08, + "learning_rate": 0.11116955692414345, + "loss": 0.0, + "num_input_tokens_seen": 39986272, + "step": 23335 + }, + { + "epoch": 113.3002421307506, + "grad_norm": 7.34881098196638e-08, + "learning_rate": 0.11111266299027203, + "loss": 0.0, + "num_input_tokens_seen": 39994656, + "step": 23340 + }, + { + "epoch": 113.32445520581113, + "grad_norm": 7.613983399323843e-08, + "learning_rate": 0.11105577505331668, + "loss": 0.0, + "num_input_tokens_seen": 40003104, + "step": 23345 + }, + { + "epoch": 113.34866828087166, + "grad_norm": 4.8552927012224245e-08, + "learning_rate": 0.11099889312205018, + "loss": 0.0, + "num_input_tokens_seen": 40011520, + "step": 23350 + }, + { + "epoch": 113.37288135593221, + "grad_norm": 5.7429684119369995e-08, + "learning_rate": 0.11094201720524455, + "loss": 0.0, + "num_input_tokens_seen": 40019712, + "step": 23355 + }, + { + "epoch": 113.39709443099274, + "grad_norm": 5.8239830735828946e-08, + "learning_rate": 0.11088514731167064, + "loss": 0.0, + "num_input_tokens_seen": 40028640, + "step": 23360 + }, + { + "epoch": 113.42130750605327, + "grad_norm": 9.361234276639152e-08, + "learning_rate": 0.11082828345009862, + "loss": 0.0, + "num_input_tokens_seen": 40037472, + "step": 23365 + }, + { + "epoch": 113.4455205811138, + "grad_norm": 4.768870454086027e-08, + "learning_rate": 0.11077142562929748, + "loss": 0.0, + "num_input_tokens_seen": 40045952, + "step": 23370 + }, + { + "epoch": 113.46973365617433, + "grad_norm": 1.0045952336668051e-07, + "learning_rate": 0.11071457385803554, + "loss": 0.0, + "num_input_tokens_seen": 40054752, + "step": 23375 + }, + { + "epoch": 113.49394673123487, + "grad_norm": 4.3843410679755834e-08, + "learning_rate": 0.11065772814508001, + "loss": 0.0, + "num_input_tokens_seen": 40063296, + "step": 23380 + }, + { + "epoch": 113.5181598062954, + "grad_norm": 3.73398698627625e-08, + "learning_rate": 0.11060088849919715, + "loss": 0.0, + "num_input_tokens_seen": 40072096, + "step": 23385 + }, + { + "epoch": 113.54237288135593, + "grad_norm": 4.425695721010925e-08, + "learning_rate": 0.11054405492915244, + "loss": 0.0, + "num_input_tokens_seen": 40080320, + "step": 23390 + }, + { + "epoch": 113.56658595641646, + "grad_norm": 4.430938815858099e-08, + "learning_rate": 0.11048722744371031, + "loss": 0.0, + "num_input_tokens_seen": 40088928, + "step": 23395 + }, + { + "epoch": 113.59079903147699, + "grad_norm": 4.200673231480323e-08, + "learning_rate": 0.1104304060516342, + "loss": 0.0, + "num_input_tokens_seen": 40097568, + "step": 23400 + }, + { + "epoch": 113.59079903147699, + "eval_loss": 1.0440937280654907, + "eval_runtime": 4.6232, + "eval_samples_per_second": 79.382, + "eval_steps_per_second": 19.899, + "num_input_tokens_seen": 40097568, + "step": 23400 + }, + { + "epoch": 113.61501210653753, + "grad_norm": 8.280655805492643e-08, + "learning_rate": 0.11037359076168682, + "loss": 0.0, + "num_input_tokens_seen": 40106080, + "step": 23405 + }, + { + "epoch": 113.63922518159806, + "grad_norm": 1.011719632515451e-07, + "learning_rate": 0.11031678158262966, + "loss": 0.0, + "num_input_tokens_seen": 40114784, + "step": 23410 + }, + { + "epoch": 113.6634382566586, + "grad_norm": 5.068744357572541e-08, + "learning_rate": 0.11025997852322349, + "loss": 0.0, + "num_input_tokens_seen": 40123680, + "step": 23415 + }, + { + "epoch": 113.68765133171912, + "grad_norm": 9.291533586974765e-08, + "learning_rate": 0.11020318159222807, + "loss": 0.0, + "num_input_tokens_seen": 40132160, + "step": 23420 + }, + { + "epoch": 113.71186440677967, + "grad_norm": 7.480581842855827e-08, + "learning_rate": 0.1101463907984021, + "loss": 0.0, + "num_input_tokens_seen": 40140576, + "step": 23425 + }, + { + "epoch": 113.7360774818402, + "grad_norm": 9.988457350118551e-08, + "learning_rate": 0.11008960615050352, + "loss": 0.0, + "num_input_tokens_seen": 40149184, + "step": 23430 + }, + { + "epoch": 113.76029055690073, + "grad_norm": 7.599693674364971e-08, + "learning_rate": 0.11003282765728925, + "loss": 0.0, + "num_input_tokens_seen": 40157568, + "step": 23435 + }, + { + "epoch": 113.78450363196126, + "grad_norm": 7.10390608560374e-08, + "learning_rate": 0.10997605532751518, + "loss": 0.0, + "num_input_tokens_seen": 40166368, + "step": 23440 + }, + { + "epoch": 113.80871670702179, + "grad_norm": 7.10951013616068e-08, + "learning_rate": 0.1099192891699364, + "loss": 0.0, + "num_input_tokens_seen": 40175040, + "step": 23445 + }, + { + "epoch": 113.83292978208233, + "grad_norm": 1.2440406038649598e-08, + "learning_rate": 0.10986252919330687, + "loss": 0.0, + "num_input_tokens_seen": 40183424, + "step": 23450 + }, + { + "epoch": 113.85714285714286, + "grad_norm": 4.6501355654982035e-08, + "learning_rate": 0.10980577540637973, + "loss": 0.0, + "num_input_tokens_seen": 40192032, + "step": 23455 + }, + { + "epoch": 113.88135593220339, + "grad_norm": 7.679349067757357e-08, + "learning_rate": 0.10974902781790719, + "loss": 0.0, + "num_input_tokens_seen": 40200640, + "step": 23460 + }, + { + "epoch": 113.90556900726392, + "grad_norm": 3.532531778205339e-08, + "learning_rate": 0.10969228643664032, + "loss": 0.0, + "num_input_tokens_seen": 40209152, + "step": 23465 + }, + { + "epoch": 113.92978208232445, + "grad_norm": 3.6274119707968566e-08, + "learning_rate": 0.10963555127132942, + "loss": 0.0, + "num_input_tokens_seen": 40217920, + "step": 23470 + }, + { + "epoch": 113.953995157385, + "grad_norm": 6.549029762936698e-08, + "learning_rate": 0.10957882233072382, + "loss": 0.0, + "num_input_tokens_seen": 40226432, + "step": 23475 + }, + { + "epoch": 113.97820823244552, + "grad_norm": 2.989840552913847e-08, + "learning_rate": 0.10952209962357176, + "loss": 0.0, + "num_input_tokens_seen": 40234720, + "step": 23480 + }, + { + "epoch": 114.00484261501211, + "grad_norm": 7.039478333581428e-08, + "learning_rate": 0.10946538315862062, + "loss": 0.0, + "num_input_tokens_seen": 40243520, + "step": 23485 + }, + { + "epoch": 114.02905569007264, + "grad_norm": 8.155485176075672e-08, + "learning_rate": 0.10940867294461679, + "loss": 0.0, + "num_input_tokens_seen": 40252128, + "step": 23490 + }, + { + "epoch": 114.05326876513317, + "grad_norm": 1.0117703652667842e-07, + "learning_rate": 0.10935196899030565, + "loss": 0.0, + "num_input_tokens_seen": 40261088, + "step": 23495 + }, + { + "epoch": 114.0774818401937, + "grad_norm": 2.3619172395683563e-08, + "learning_rate": 0.10929527130443177, + "loss": 0.0, + "num_input_tokens_seen": 40269536, + "step": 23500 + }, + { + "epoch": 114.10169491525424, + "grad_norm": 6.186657941498197e-08, + "learning_rate": 0.1092385798957385, + "loss": 0.0, + "num_input_tokens_seen": 40278464, + "step": 23505 + }, + { + "epoch": 114.12590799031477, + "grad_norm": 3.2851165343572575e-08, + "learning_rate": 0.10918189477296848, + "loss": 0.0, + "num_input_tokens_seen": 40286848, + "step": 23510 + }, + { + "epoch": 114.1501210653753, + "grad_norm": 6.290579079859526e-08, + "learning_rate": 0.1091252159448633, + "loss": 0.0, + "num_input_tokens_seen": 40295424, + "step": 23515 + }, + { + "epoch": 114.17433414043583, + "grad_norm": 5.5557123346261506e-08, + "learning_rate": 0.10906854342016345, + "loss": 0.0, + "num_input_tokens_seen": 40304032, + "step": 23520 + }, + { + "epoch": 114.19854721549636, + "grad_norm": 1.482882083791992e-07, + "learning_rate": 0.10901187720760858, + "loss": 0.0, + "num_input_tokens_seen": 40312800, + "step": 23525 + }, + { + "epoch": 114.2227602905569, + "grad_norm": 8.439459264764082e-08, + "learning_rate": 0.10895521731593734, + "loss": 0.0, + "num_input_tokens_seen": 40321568, + "step": 23530 + }, + { + "epoch": 114.24697336561744, + "grad_norm": 3.730126607592865e-08, + "learning_rate": 0.10889856375388733, + "loss": 0.0, + "num_input_tokens_seen": 40330400, + "step": 23535 + }, + { + "epoch": 114.27118644067797, + "grad_norm": 5.030525329630109e-08, + "learning_rate": 0.1088419165301954, + "loss": 0.0, + "num_input_tokens_seen": 40338688, + "step": 23540 + }, + { + "epoch": 114.2953995157385, + "grad_norm": 8.342472312961036e-08, + "learning_rate": 0.1087852756535971, + "loss": 0.0, + "num_input_tokens_seen": 40347584, + "step": 23545 + }, + { + "epoch": 114.31961259079903, + "grad_norm": 2.317200653578766e-08, + "learning_rate": 0.10872864113282725, + "loss": 0.0, + "num_input_tokens_seen": 40356096, + "step": 23550 + }, + { + "epoch": 114.34382566585957, + "grad_norm": 9.903119746468292e-08, + "learning_rate": 0.10867201297661958, + "loss": 0.0, + "num_input_tokens_seen": 40364608, + "step": 23555 + }, + { + "epoch": 114.3680387409201, + "grad_norm": 5.774237266109594e-08, + "learning_rate": 0.10861539119370689, + "loss": 0.0, + "num_input_tokens_seen": 40373120, + "step": 23560 + }, + { + "epoch": 114.39225181598063, + "grad_norm": 5.186113583022234e-08, + "learning_rate": 0.10855877579282096, + "loss": 0.0, + "num_input_tokens_seen": 40381632, + "step": 23565 + }, + { + "epoch": 114.41646489104116, + "grad_norm": 4.752444482392093e-08, + "learning_rate": 0.10850216678269252, + "loss": 0.0, + "num_input_tokens_seen": 40390208, + "step": 23570 + }, + { + "epoch": 114.44067796610169, + "grad_norm": 4.895622396361432e-08, + "learning_rate": 0.10844556417205146, + "loss": 0.0, + "num_input_tokens_seen": 40398816, + "step": 23575 + }, + { + "epoch": 114.46489104116223, + "grad_norm": 6.450754597153718e-08, + "learning_rate": 0.10838896796962669, + "loss": 0.0, + "num_input_tokens_seen": 40407296, + "step": 23580 + }, + { + "epoch": 114.48910411622276, + "grad_norm": 4.2958262724823726e-08, + "learning_rate": 0.1083323781841459, + "loss": 0.0, + "num_input_tokens_seen": 40416096, + "step": 23585 + }, + { + "epoch": 114.51331719128329, + "grad_norm": 7.34304776983663e-08, + "learning_rate": 0.10827579482433607, + "loss": 0.0, + "num_input_tokens_seen": 40424224, + "step": 23590 + }, + { + "epoch": 114.53753026634382, + "grad_norm": 1.0937400674038145e-07, + "learning_rate": 0.10821921789892304, + "loss": 0.0, + "num_input_tokens_seen": 40432832, + "step": 23595 + }, + { + "epoch": 114.56174334140435, + "grad_norm": 3.374914570031251e-08, + "learning_rate": 0.10816264741663158, + "loss": 0.0, + "num_input_tokens_seen": 40441152, + "step": 23600 + }, + { + "epoch": 114.56174334140435, + "eval_loss": 1.0523483753204346, + "eval_runtime": 4.6224, + "eval_samples_per_second": 79.397, + "eval_steps_per_second": 19.903, + "num_input_tokens_seen": 40441152, + "step": 23600 + }, + { + "epoch": 114.5859564164649, + "grad_norm": 3.072331011821916e-08, + "learning_rate": 0.10810608338618573, + "loss": 0.0, + "num_input_tokens_seen": 40449952, + "step": 23605 + }, + { + "epoch": 114.61016949152543, + "grad_norm": 9.53034273720732e-08, + "learning_rate": 0.10804952581630821, + "loss": 0.0, + "num_input_tokens_seen": 40458560, + "step": 23610 + }, + { + "epoch": 114.63438256658596, + "grad_norm": 3.776926149612336e-08, + "learning_rate": 0.10799297471572102, + "loss": 0.0, + "num_input_tokens_seen": 40467264, + "step": 23615 + }, + { + "epoch": 114.65859564164649, + "grad_norm": 3.443804175162768e-08, + "learning_rate": 0.10793643009314507, + "loss": 0.0, + "num_input_tokens_seen": 40475776, + "step": 23620 + }, + { + "epoch": 114.68280871670702, + "grad_norm": 8.226325576288218e-08, + "learning_rate": 0.10787989195730015, + "loss": 0.0, + "num_input_tokens_seen": 40483936, + "step": 23625 + }, + { + "epoch": 114.70702179176756, + "grad_norm": 5.686245074798535e-08, + "learning_rate": 0.10782336031690525, + "loss": 0.0, + "num_input_tokens_seen": 40492480, + "step": 23630 + }, + { + "epoch": 114.73123486682809, + "grad_norm": 5.6668699954798285e-08, + "learning_rate": 0.10776683518067821, + "loss": 0.0, + "num_input_tokens_seen": 40500992, + "step": 23635 + }, + { + "epoch": 114.75544794188862, + "grad_norm": 3.383989977123747e-08, + "learning_rate": 0.10771031655733587, + "loss": 0.0, + "num_input_tokens_seen": 40509600, + "step": 23640 + }, + { + "epoch": 114.77966101694915, + "grad_norm": 9.49346912193505e-08, + "learning_rate": 0.10765380445559422, + "loss": 0.0, + "num_input_tokens_seen": 40518176, + "step": 23645 + }, + { + "epoch": 114.80387409200968, + "grad_norm": 5.1956277502540615e-08, + "learning_rate": 0.10759729888416801, + "loss": 0.0, + "num_input_tokens_seen": 40526848, + "step": 23650 + }, + { + "epoch": 114.82808716707022, + "grad_norm": 2.0595521732502675e-07, + "learning_rate": 0.10754079985177119, + "loss": 0.0, + "num_input_tokens_seen": 40535840, + "step": 23655 + }, + { + "epoch": 114.85230024213075, + "grad_norm": 3.1231575547963075e-08, + "learning_rate": 0.10748430736711667, + "loss": 0.0, + "num_input_tokens_seen": 40544288, + "step": 23660 + }, + { + "epoch": 114.87651331719128, + "grad_norm": 2.5540794368339448e-08, + "learning_rate": 0.10742782143891623, + "loss": 0.0, + "num_input_tokens_seen": 40552608, + "step": 23665 + }, + { + "epoch": 114.90072639225181, + "grad_norm": 3.11511811901255e-08, + "learning_rate": 0.10737134207588069, + "loss": 0.0, + "num_input_tokens_seen": 40561344, + "step": 23670 + }, + { + "epoch": 114.92493946731234, + "grad_norm": 6.212864178678501e-08, + "learning_rate": 0.10731486928671992, + "loss": 0.0, + "num_input_tokens_seen": 40569632, + "step": 23675 + }, + { + "epoch": 114.94915254237289, + "grad_norm": 8.943506202285789e-08, + "learning_rate": 0.10725840308014269, + "loss": 0.0, + "num_input_tokens_seen": 40578240, + "step": 23680 + }, + { + "epoch": 114.97336561743342, + "grad_norm": 4.288381560968446e-08, + "learning_rate": 0.10720194346485688, + "loss": 0.0, + "num_input_tokens_seen": 40586272, + "step": 23685 + }, + { + "epoch": 114.99757869249395, + "grad_norm": 7.504771559752044e-08, + "learning_rate": 0.10714549044956918, + "loss": 0.0, + "num_input_tokens_seen": 40595072, + "step": 23690 + }, + { + "epoch": 115.02421307506053, + "grad_norm": 1.0180465892517532e-07, + "learning_rate": 0.10708904404298542, + "loss": 0.0, + "num_input_tokens_seen": 40603840, + "step": 23695 + }, + { + "epoch": 115.04842615012106, + "grad_norm": 3.851364382967404e-08, + "learning_rate": 0.1070326042538103, + "loss": 0.0, + "num_input_tokens_seen": 40612288, + "step": 23700 + }, + { + "epoch": 115.0726392251816, + "grad_norm": 4.0208703921962297e-08, + "learning_rate": 0.10697617109074758, + "loss": 0.0, + "num_input_tokens_seen": 40621440, + "step": 23705 + }, + { + "epoch": 115.09685230024213, + "grad_norm": 3.6311941897793076e-08, + "learning_rate": 0.10691974456249999, + "loss": 0.0, + "num_input_tokens_seen": 40630240, + "step": 23710 + }, + { + "epoch": 115.12106537530266, + "grad_norm": 2.8396808460229295e-08, + "learning_rate": 0.10686332467776909, + "loss": 0.0, + "num_input_tokens_seen": 40638624, + "step": 23715 + }, + { + "epoch": 115.1452784503632, + "grad_norm": 4.644859075142449e-08, + "learning_rate": 0.10680691144525563, + "loss": 0.0, + "num_input_tokens_seen": 40647360, + "step": 23720 + }, + { + "epoch": 115.16949152542372, + "grad_norm": 7.152482339733979e-08, + "learning_rate": 0.10675050487365928, + "loss": 0.0, + "num_input_tokens_seen": 40656000, + "step": 23725 + }, + { + "epoch": 115.19370460048427, + "grad_norm": 3.536969117590161e-08, + "learning_rate": 0.10669410497167851, + "loss": 0.0, + "num_input_tokens_seen": 40664672, + "step": 23730 + }, + { + "epoch": 115.2179176755448, + "grad_norm": 1.0803461236719158e-08, + "learning_rate": 0.10663771174801102, + "loss": 0.0, + "num_input_tokens_seen": 40673344, + "step": 23735 + }, + { + "epoch": 115.24213075060533, + "grad_norm": 6.876492619767305e-08, + "learning_rate": 0.10658132521135329, + "loss": 0.0, + "num_input_tokens_seen": 40682112, + "step": 23740 + }, + { + "epoch": 115.26634382566586, + "grad_norm": 3.775722134946591e-08, + "learning_rate": 0.10652494537040084, + "loss": 0.0, + "num_input_tokens_seen": 40690368, + "step": 23745 + }, + { + "epoch": 115.29055690072639, + "grad_norm": 3.061614251009814e-08, + "learning_rate": 0.1064685722338482, + "loss": 0.0, + "num_input_tokens_seen": 40699040, + "step": 23750 + }, + { + "epoch": 115.31476997578693, + "grad_norm": 6.392630069740335e-08, + "learning_rate": 0.10641220581038871, + "loss": 0.0, + "num_input_tokens_seen": 40707488, + "step": 23755 + }, + { + "epoch": 115.33898305084746, + "grad_norm": 6.848919298363398e-08, + "learning_rate": 0.10635584610871483, + "loss": 0.0, + "num_input_tokens_seen": 40715776, + "step": 23760 + }, + { + "epoch": 115.36319612590799, + "grad_norm": 6.358882842505409e-08, + "learning_rate": 0.10629949313751803, + "loss": 0.0, + "num_input_tokens_seen": 40724544, + "step": 23765 + }, + { + "epoch": 115.38740920096852, + "grad_norm": 8.96399328098596e-08, + "learning_rate": 0.10624314690548849, + "loss": 0.0, + "num_input_tokens_seen": 40733024, + "step": 23770 + }, + { + "epoch": 115.41162227602905, + "grad_norm": 5.5916419938739637e-08, + "learning_rate": 0.1061868074213156, + "loss": 0.0, + "num_input_tokens_seen": 40741216, + "step": 23775 + }, + { + "epoch": 115.4358353510896, + "grad_norm": 3.967890549461117e-08, + "learning_rate": 0.10613047469368765, + "loss": 0.0, + "num_input_tokens_seen": 40749856, + "step": 23780 + }, + { + "epoch": 115.46004842615012, + "grad_norm": 7.324791795326746e-08, + "learning_rate": 0.10607414873129171, + "loss": 0.0, + "num_input_tokens_seen": 40758528, + "step": 23785 + }, + { + "epoch": 115.48426150121065, + "grad_norm": 4.8558231213746694e-08, + "learning_rate": 0.10601782954281413, + "loss": 0.0, + "num_input_tokens_seen": 40766880, + "step": 23790 + }, + { + "epoch": 115.50847457627118, + "grad_norm": 7.760493758723896e-08, + "learning_rate": 0.1059615171369399, + "loss": 0.0, + "num_input_tokens_seen": 40775616, + "step": 23795 + }, + { + "epoch": 115.53268765133171, + "grad_norm": 8.737524126445351e-08, + "learning_rate": 0.10590521152235312, + "loss": 0.0, + "num_input_tokens_seen": 40784672, + "step": 23800 + }, + { + "epoch": 115.53268765133171, + "eval_loss": 1.0557180643081665, + "eval_runtime": 4.621, + "eval_samples_per_second": 79.42, + "eval_steps_per_second": 19.909, + "num_input_tokens_seen": 40784672, + "step": 23800 + }, + { + "epoch": 115.55690072639226, + "grad_norm": 8.628605741023421e-08, + "learning_rate": 0.1058489127077369, + "loss": 0.0, + "num_input_tokens_seen": 40793152, + "step": 23805 + }, + { + "epoch": 115.58111380145279, + "grad_norm": 6.841478494834519e-08, + "learning_rate": 0.1057926207017732, + "loss": 0.0, + "num_input_tokens_seen": 40801504, + "step": 23810 + }, + { + "epoch": 115.60532687651332, + "grad_norm": 1.1061151639069067e-07, + "learning_rate": 0.10573633551314285, + "loss": 0.0, + "num_input_tokens_seen": 40810208, + "step": 23815 + }, + { + "epoch": 115.62953995157385, + "grad_norm": 1.0498990832275013e-07, + "learning_rate": 0.1056800571505259, + "loss": 0.0, + "num_input_tokens_seen": 40818720, + "step": 23820 + }, + { + "epoch": 115.65375302663438, + "grad_norm": 4.10797085237391e-08, + "learning_rate": 0.10562378562260105, + "loss": 0.0, + "num_input_tokens_seen": 40827136, + "step": 23825 + }, + { + "epoch": 115.67796610169492, + "grad_norm": 5.262808855377443e-08, + "learning_rate": 0.10556752093804615, + "loss": 0.0, + "num_input_tokens_seen": 40835680, + "step": 23830 + }, + { + "epoch": 115.70217917675545, + "grad_norm": 3.299245321386479e-08, + "learning_rate": 0.10551126310553786, + "loss": 0.0, + "num_input_tokens_seen": 40844288, + "step": 23835 + }, + { + "epoch": 115.72639225181598, + "grad_norm": 3.0224608593698576e-08, + "learning_rate": 0.10545501213375187, + "loss": 0.0, + "num_input_tokens_seen": 40852768, + "step": 23840 + }, + { + "epoch": 115.75060532687651, + "grad_norm": 7.232999621464842e-08, + "learning_rate": 0.10539876803136287, + "loss": 0.0, + "num_input_tokens_seen": 40861504, + "step": 23845 + }, + { + "epoch": 115.77481840193704, + "grad_norm": 3.289731509426019e-08, + "learning_rate": 0.10534253080704428, + "loss": 0.0, + "num_input_tokens_seen": 40870304, + "step": 23850 + }, + { + "epoch": 115.79903147699758, + "grad_norm": 1.2151711814567534e-07, + "learning_rate": 0.10528630046946862, + "loss": 0.0, + "num_input_tokens_seen": 40878592, + "step": 23855 + }, + { + "epoch": 115.82324455205811, + "grad_norm": 2.4248555163808305e-08, + "learning_rate": 0.1052300770273074, + "loss": 0.0, + "num_input_tokens_seen": 40887520, + "step": 23860 + }, + { + "epoch": 115.84745762711864, + "grad_norm": 7.308386784643517e-08, + "learning_rate": 0.10517386048923086, + "loss": 0.0, + "num_input_tokens_seen": 40896032, + "step": 23865 + }, + { + "epoch": 115.87167070217917, + "grad_norm": 1.2738786381305545e-07, + "learning_rate": 0.10511765086390841, + "loss": 0.0, + "num_input_tokens_seen": 40904768, + "step": 23870 + }, + { + "epoch": 115.8958837772397, + "grad_norm": 3.026446293574736e-08, + "learning_rate": 0.10506144816000816, + "loss": 0.0, + "num_input_tokens_seen": 40913344, + "step": 23875 + }, + { + "epoch": 115.92009685230025, + "grad_norm": 2.8044254918313527e-08, + "learning_rate": 0.10500525238619736, + "loss": 0.0, + "num_input_tokens_seen": 40921952, + "step": 23880 + }, + { + "epoch": 115.94430992736078, + "grad_norm": 5.040933714894891e-08, + "learning_rate": 0.10494906355114209, + "loss": 0.0, + "num_input_tokens_seen": 40930112, + "step": 23885 + }, + { + "epoch": 115.9685230024213, + "grad_norm": 6.756643244898441e-08, + "learning_rate": 0.10489288166350737, + "loss": 0.0, + "num_input_tokens_seen": 40938432, + "step": 23890 + }, + { + "epoch": 115.99273607748184, + "grad_norm": 6.009438635601327e-08, + "learning_rate": 0.10483670673195711, + "loss": 0.0, + "num_input_tokens_seen": 40946848, + "step": 23895 + }, + { + "epoch": 116.01937046004842, + "grad_norm": 3.1394886690350177e-08, + "learning_rate": 0.10478053876515431, + "loss": 0.0, + "num_input_tokens_seen": 40955840, + "step": 23900 + }, + { + "epoch": 116.04358353510897, + "grad_norm": 3.513421020784335e-08, + "learning_rate": 0.10472437777176061, + "loss": 0.0, + "num_input_tokens_seen": 40964352, + "step": 23905 + }, + { + "epoch": 116.0677966101695, + "grad_norm": 6.466331115007051e-08, + "learning_rate": 0.1046682237604369, + "loss": 0.0, + "num_input_tokens_seen": 40972864, + "step": 23910 + }, + { + "epoch": 116.09200968523002, + "grad_norm": 8.595671374678204e-08, + "learning_rate": 0.1046120767398427, + "loss": 0.0, + "num_input_tokens_seen": 40981664, + "step": 23915 + }, + { + "epoch": 116.11622276029055, + "grad_norm": 6.273205599427456e-08, + "learning_rate": 0.10455593671863667, + "loss": 0.0, + "num_input_tokens_seen": 40990464, + "step": 23920 + }, + { + "epoch": 116.14043583535108, + "grad_norm": 7.519192735117031e-08, + "learning_rate": 0.1044998037054763, + "loss": 0.0, + "num_input_tokens_seen": 40999072, + "step": 23925 + }, + { + "epoch": 116.16464891041163, + "grad_norm": 4.9225793219420666e-08, + "learning_rate": 0.10444367770901794, + "loss": 0.0, + "num_input_tokens_seen": 41007616, + "step": 23930 + }, + { + "epoch": 116.18886198547216, + "grad_norm": 2.799599840841438e-08, + "learning_rate": 0.10438755873791698, + "loss": 0.0, + "num_input_tokens_seen": 41016384, + "step": 23935 + }, + { + "epoch": 116.21307506053269, + "grad_norm": 6.43378470499556e-08, + "learning_rate": 0.10433144680082775, + "loss": 0.0, + "num_input_tokens_seen": 41025120, + "step": 23940 + }, + { + "epoch": 116.23728813559322, + "grad_norm": 3.416192839722498e-08, + "learning_rate": 0.10427534190640322, + "loss": 0.0, + "num_input_tokens_seen": 41033376, + "step": 23945 + }, + { + "epoch": 116.26150121065375, + "grad_norm": 4.624168070677115e-08, + "learning_rate": 0.10421924406329568, + "loss": 0.0, + "num_input_tokens_seen": 41042080, + "step": 23950 + }, + { + "epoch": 116.28571428571429, + "grad_norm": 5.211700226936955e-08, + "learning_rate": 0.10416315328015598, + "loss": 0.0, + "num_input_tokens_seen": 41050496, + "step": 23955 + }, + { + "epoch": 116.30992736077482, + "grad_norm": 5.302394967543478e-08, + "learning_rate": 0.10410706956563402, + "loss": 0.0, + "num_input_tokens_seen": 41059168, + "step": 23960 + }, + { + "epoch": 116.33414043583535, + "grad_norm": 2.4071784565649068e-08, + "learning_rate": 0.10405099292837874, + "loss": 0.0, + "num_input_tokens_seen": 41067584, + "step": 23965 + }, + { + "epoch": 116.35835351089588, + "grad_norm": 9.58036352471936e-08, + "learning_rate": 0.10399492337703771, + "loss": 0.0, + "num_input_tokens_seen": 41075872, + "step": 23970 + }, + { + "epoch": 116.38256658595641, + "grad_norm": 6.87717545133637e-08, + "learning_rate": 0.10393886092025764, + "loss": 0.0, + "num_input_tokens_seen": 41084480, + "step": 23975 + }, + { + "epoch": 116.40677966101696, + "grad_norm": 4.377655216103449e-08, + "learning_rate": 0.10388280556668412, + "loss": 0.0, + "num_input_tokens_seen": 41092768, + "step": 23980 + }, + { + "epoch": 116.43099273607749, + "grad_norm": 1.68674514355871e-08, + "learning_rate": 0.10382675732496145, + "loss": 0.0, + "num_input_tokens_seen": 41101504, + "step": 23985 + }, + { + "epoch": 116.45520581113801, + "grad_norm": 5.250817736168756e-08, + "learning_rate": 0.10377071620373311, + "loss": 0.0, + "num_input_tokens_seen": 41109920, + "step": 23990 + }, + { + "epoch": 116.47941888619854, + "grad_norm": 2.3807739779613257e-08, + "learning_rate": 0.10371468221164128, + "loss": 0.0, + "num_input_tokens_seen": 41118752, + "step": 23995 + }, + { + "epoch": 116.50363196125907, + "grad_norm": 2.404856225268759e-08, + "learning_rate": 0.10365865535732706, + "loss": 0.0, + "num_input_tokens_seen": 41127232, + "step": 24000 + }, + { + "epoch": 116.50363196125907, + "eval_loss": 1.062767744064331, + "eval_runtime": 4.6329, + "eval_samples_per_second": 79.215, + "eval_steps_per_second": 19.858, + "num_input_tokens_seen": 41127232, + "step": 24000 + }, + { + "epoch": 116.52784503631962, + "grad_norm": 6.330205337690131e-08, + "learning_rate": 0.10360263564943062, + "loss": 0.0, + "num_input_tokens_seen": 41135680, + "step": 24005 + }, + { + "epoch": 116.55205811138015, + "grad_norm": 2.8049242928318563e-08, + "learning_rate": 0.10354662309659075, + "loss": 0.0, + "num_input_tokens_seen": 41144000, + "step": 24010 + }, + { + "epoch": 116.57627118644068, + "grad_norm": 7.35719538624835e-08, + "learning_rate": 0.10349061770744537, + "loss": 0.0, + "num_input_tokens_seen": 41152416, + "step": 24015 + }, + { + "epoch": 116.60048426150121, + "grad_norm": 5.18345295574818e-08, + "learning_rate": 0.10343461949063128, + "loss": 0.0, + "num_input_tokens_seen": 41160832, + "step": 24020 + }, + { + "epoch": 116.62469733656174, + "grad_norm": 4.282485477347109e-08, + "learning_rate": 0.103378628454784, + "loss": 0.0, + "num_input_tokens_seen": 41169248, + "step": 24025 + }, + { + "epoch": 116.64891041162228, + "grad_norm": 6.545051434159177e-08, + "learning_rate": 0.10332264460853811, + "loss": 0.0, + "num_input_tokens_seen": 41177664, + "step": 24030 + }, + { + "epoch": 116.67312348668281, + "grad_norm": 4.860385161009617e-08, + "learning_rate": 0.10326666796052701, + "loss": 0.0, + "num_input_tokens_seen": 41186144, + "step": 24035 + }, + { + "epoch": 116.69733656174334, + "grad_norm": 8.000732520940801e-08, + "learning_rate": 0.10321069851938296, + "loss": 0.0, + "num_input_tokens_seen": 41194912, + "step": 24040 + }, + { + "epoch": 116.72154963680387, + "grad_norm": 5.2235346714724074e-08, + "learning_rate": 0.10315473629373724, + "loss": 0.0, + "num_input_tokens_seen": 41203360, + "step": 24045 + }, + { + "epoch": 116.7457627118644, + "grad_norm": 7.036849325459116e-08, + "learning_rate": 0.10309878129221982, + "loss": 0.0, + "num_input_tokens_seen": 41211904, + "step": 24050 + }, + { + "epoch": 116.76997578692495, + "grad_norm": 4.4495873652294904e-08, + "learning_rate": 0.10304283352345973, + "loss": 0.0, + "num_input_tokens_seen": 41220320, + "step": 24055 + }, + { + "epoch": 116.79418886198548, + "grad_norm": 3.822099969852388e-08, + "learning_rate": 0.10298689299608486, + "loss": 0.0, + "num_input_tokens_seen": 41228768, + "step": 24060 + }, + { + "epoch": 116.818401937046, + "grad_norm": 4.316891732969452e-08, + "learning_rate": 0.10293095971872188, + "loss": 0.0, + "num_input_tokens_seen": 41237344, + "step": 24065 + }, + { + "epoch": 116.84261501210653, + "grad_norm": 2.4723311398133774e-08, + "learning_rate": 0.10287503369999645, + "loss": 0.0, + "num_input_tokens_seen": 41245984, + "step": 24070 + }, + { + "epoch": 116.86682808716706, + "grad_norm": 4.0645314669518484e-08, + "learning_rate": 0.10281911494853295, + "loss": 0.0, + "num_input_tokens_seen": 41254272, + "step": 24075 + }, + { + "epoch": 116.89104116222761, + "grad_norm": 4.8678334252372224e-08, + "learning_rate": 0.10276320347295485, + "loss": 0.0, + "num_input_tokens_seen": 41262624, + "step": 24080 + }, + { + "epoch": 116.91525423728814, + "grad_norm": 8.599145218113335e-08, + "learning_rate": 0.10270729928188446, + "loss": 0.0, + "num_input_tokens_seen": 41271456, + "step": 24085 + }, + { + "epoch": 116.93946731234867, + "grad_norm": 6.436190602698844e-08, + "learning_rate": 0.10265140238394276, + "loss": 0.0, + "num_input_tokens_seen": 41280320, + "step": 24090 + }, + { + "epoch": 116.9636803874092, + "grad_norm": 6.283741527113307e-08, + "learning_rate": 0.10259551278774988, + "loss": 0.0, + "num_input_tokens_seen": 41288992, + "step": 24095 + }, + { + "epoch": 116.98789346246973, + "grad_norm": 5.0637460446978366e-08, + "learning_rate": 0.10253963050192462, + "loss": 0.0, + "num_input_tokens_seen": 41297280, + "step": 24100 + }, + { + "epoch": 117.01452784503633, + "grad_norm": 7.887121711291911e-08, + "learning_rate": 0.10248375553508478, + "loss": 0.0, + "num_input_tokens_seen": 41306304, + "step": 24105 + }, + { + "epoch": 117.03874092009686, + "grad_norm": 2.6508697459348696e-08, + "learning_rate": 0.102427887895847, + "loss": 0.0, + "num_input_tokens_seen": 41314720, + "step": 24110 + }, + { + "epoch": 117.06295399515739, + "grad_norm": 3.7254906715133984e-08, + "learning_rate": 0.10237202759282668, + "loss": 0.0, + "num_input_tokens_seen": 41323328, + "step": 24115 + }, + { + "epoch": 117.08716707021792, + "grad_norm": 7.09534901943698e-08, + "learning_rate": 0.10231617463463821, + "loss": 0.0, + "num_input_tokens_seen": 41331936, + "step": 24120 + }, + { + "epoch": 117.11138014527845, + "grad_norm": 4.336508396818317e-08, + "learning_rate": 0.10226032902989492, + "loss": 0.0, + "num_input_tokens_seen": 41340288, + "step": 24125 + }, + { + "epoch": 117.13559322033899, + "grad_norm": 4.083147686628763e-08, + "learning_rate": 0.10220449078720877, + "loss": 0.0, + "num_input_tokens_seen": 41348800, + "step": 24130 + }, + { + "epoch": 117.15980629539952, + "grad_norm": 4.5967333051066817e-08, + "learning_rate": 0.1021486599151908, + "loss": 0.0, + "num_input_tokens_seen": 41357088, + "step": 24135 + }, + { + "epoch": 117.18401937046005, + "grad_norm": 8.141481799839312e-08, + "learning_rate": 0.10209283642245084, + "loss": 0.0, + "num_input_tokens_seen": 41365824, + "step": 24140 + }, + { + "epoch": 117.20823244552058, + "grad_norm": 4.4689688394328186e-08, + "learning_rate": 0.10203702031759748, + "loss": 0.0, + "num_input_tokens_seen": 41374272, + "step": 24145 + }, + { + "epoch": 117.23244552058111, + "grad_norm": 6.930232387958313e-08, + "learning_rate": 0.1019812116092384, + "loss": 0.0, + "num_input_tokens_seen": 41383008, + "step": 24150 + }, + { + "epoch": 117.25665859564165, + "grad_norm": 5.3607973171665435e-08, + "learning_rate": 0.10192541030597986, + "loss": 0.0, + "num_input_tokens_seen": 41392000, + "step": 24155 + }, + { + "epoch": 117.28087167070218, + "grad_norm": 5.896181676234846e-08, + "learning_rate": 0.1018696164164272, + "loss": 0.0, + "num_input_tokens_seen": 41400192, + "step": 24160 + }, + { + "epoch": 117.30508474576271, + "grad_norm": 3.160104711241729e-08, + "learning_rate": 0.10181382994918459, + "loss": 0.0, + "num_input_tokens_seen": 41409024, + "step": 24165 + }, + { + "epoch": 117.32929782082324, + "grad_norm": 6.221543458195811e-08, + "learning_rate": 0.10175805091285492, + "loss": 0.0, + "num_input_tokens_seen": 41417856, + "step": 24170 + }, + { + "epoch": 117.35351089588377, + "grad_norm": 5.786709422750391e-08, + "learning_rate": 0.10170227931603999, + "loss": 0.0, + "num_input_tokens_seen": 41426464, + "step": 24175 + }, + { + "epoch": 117.37772397094432, + "grad_norm": 7.185637684870017e-08, + "learning_rate": 0.10164651516734062, + "loss": 0.0, + "num_input_tokens_seen": 41434848, + "step": 24180 + }, + { + "epoch": 117.40193704600485, + "grad_norm": 5.869159025451154e-08, + "learning_rate": 0.1015907584753562, + "loss": 0.0, + "num_input_tokens_seen": 41443680, + "step": 24185 + }, + { + "epoch": 117.42615012106538, + "grad_norm": 8.812365592802962e-08, + "learning_rate": 0.10153500924868523, + "loss": 0.0, + "num_input_tokens_seen": 41452000, + "step": 24190 + }, + { + "epoch": 117.4503631961259, + "grad_norm": 3.6668787117832835e-08, + "learning_rate": 0.10147926749592483, + "loss": 0.0, + "num_input_tokens_seen": 41460448, + "step": 24195 + }, + { + "epoch": 117.47457627118644, + "grad_norm": 6.080605885472323e-08, + "learning_rate": 0.10142353322567112, + "loss": 0.0, + "num_input_tokens_seen": 41468768, + "step": 24200 + }, + { + "epoch": 117.47457627118644, + "eval_loss": 1.0708931684494019, + "eval_runtime": 4.6254, + "eval_samples_per_second": 79.345, + "eval_steps_per_second": 19.89, + "num_input_tokens_seen": 41468768, + "step": 24200 + }, + { + "epoch": 117.49878934624698, + "grad_norm": 5.118731394304632e-08, + "learning_rate": 0.1013678064465191, + "loss": 0.0, + "num_input_tokens_seen": 41477472, + "step": 24205 + }, + { + "epoch": 117.52300242130751, + "grad_norm": 5.2285844986954544e-08, + "learning_rate": 0.10131208716706244, + "loss": 0.0, + "num_input_tokens_seen": 41485472, + "step": 24210 + }, + { + "epoch": 117.54721549636804, + "grad_norm": 7.045730399113381e-08, + "learning_rate": 0.10125637539589379, + "loss": 0.0, + "num_input_tokens_seen": 41494112, + "step": 24215 + }, + { + "epoch": 117.57142857142857, + "grad_norm": 7.753002506660778e-08, + "learning_rate": 0.10120067114160464, + "loss": 0.0, + "num_input_tokens_seen": 41503136, + "step": 24220 + }, + { + "epoch": 117.5956416464891, + "grad_norm": 5.37073567841162e-08, + "learning_rate": 0.10114497441278517, + "loss": 0.0, + "num_input_tokens_seen": 41511680, + "step": 24225 + }, + { + "epoch": 117.61985472154964, + "grad_norm": 2.3246395031151224e-08, + "learning_rate": 0.10108928521802468, + "loss": 0.0, + "num_input_tokens_seen": 41519968, + "step": 24230 + }, + { + "epoch": 117.64406779661017, + "grad_norm": 3.045744989549348e-08, + "learning_rate": 0.101033603565911, + "loss": 0.0, + "num_input_tokens_seen": 41528768, + "step": 24235 + }, + { + "epoch": 117.6682808716707, + "grad_norm": 1.7563026588618413e-08, + "learning_rate": 0.10097792946503102, + "loss": 0.0, + "num_input_tokens_seen": 41537568, + "step": 24240 + }, + { + "epoch": 117.69249394673123, + "grad_norm": 5.598518981742018e-08, + "learning_rate": 0.10092226292397039, + "loss": 0.0, + "num_input_tokens_seen": 41546144, + "step": 24245 + }, + { + "epoch": 117.71670702179176, + "grad_norm": 9.415327184569833e-08, + "learning_rate": 0.10086660395131354, + "loss": 0.0, + "num_input_tokens_seen": 41554944, + "step": 24250 + }, + { + "epoch": 117.7409200968523, + "grad_norm": 4.870687675406771e-08, + "learning_rate": 0.10081095255564385, + "loss": 0.0, + "num_input_tokens_seen": 41563136, + "step": 24255 + }, + { + "epoch": 117.76513317191284, + "grad_norm": 1.9193771905179347e-08, + "learning_rate": 0.10075530874554335, + "loss": 0.0, + "num_input_tokens_seen": 41571840, + "step": 24260 + }, + { + "epoch": 117.78934624697337, + "grad_norm": 6.209679526136824e-08, + "learning_rate": 0.10069967252959311, + "loss": 0.0, + "num_input_tokens_seen": 41580544, + "step": 24265 + }, + { + "epoch": 117.8135593220339, + "grad_norm": 6.246442296742316e-08, + "learning_rate": 0.10064404391637297, + "loss": 0.0, + "num_input_tokens_seen": 41588608, + "step": 24270 + }, + { + "epoch": 117.83777239709443, + "grad_norm": 3.745063636984014e-08, + "learning_rate": 0.10058842291446145, + "loss": 0.0, + "num_input_tokens_seen": 41596896, + "step": 24275 + }, + { + "epoch": 117.86198547215497, + "grad_norm": 8.65059632815246e-08, + "learning_rate": 0.10053280953243608, + "loss": 0.0, + "num_input_tokens_seen": 41605440, + "step": 24280 + }, + { + "epoch": 117.8861985472155, + "grad_norm": 6.239190053491939e-09, + "learning_rate": 0.10047720377887315, + "loss": 0.0, + "num_input_tokens_seen": 41614112, + "step": 24285 + }, + { + "epoch": 117.91041162227603, + "grad_norm": 8.325579159418339e-08, + "learning_rate": 0.10042160566234767, + "loss": 0.0, + "num_input_tokens_seen": 41622976, + "step": 24290 + }, + { + "epoch": 117.93462469733656, + "grad_norm": 3.742904652881407e-08, + "learning_rate": 0.10036601519143372, + "loss": 0.0, + "num_input_tokens_seen": 41631392, + "step": 24295 + }, + { + "epoch": 117.95883777239709, + "grad_norm": 1.1200114613529877e-07, + "learning_rate": 0.1003104323747039, + "loss": 0.0, + "num_input_tokens_seen": 41639776, + "step": 24300 + }, + { + "epoch": 117.98305084745763, + "grad_norm": 2.1950818052118848e-08, + "learning_rate": 0.10025485722072984, + "loss": 0.0, + "num_input_tokens_seen": 41648352, + "step": 24305 + }, + { + "epoch": 118.00968523002422, + "grad_norm": 7.474598362477991e-08, + "learning_rate": 0.10019928973808201, + "loss": 0.0, + "num_input_tokens_seen": 41657248, + "step": 24310 + }, + { + "epoch": 118.03389830508475, + "grad_norm": 5.289412285947037e-08, + "learning_rate": 0.10014372993532945, + "loss": 0.0, + "num_input_tokens_seen": 41665888, + "step": 24315 + }, + { + "epoch": 118.05811138014528, + "grad_norm": 3.371546952735116e-08, + "learning_rate": 0.1000881778210403, + "loss": 0.0, + "num_input_tokens_seen": 41674176, + "step": 24320 + }, + { + "epoch": 118.08232445520581, + "grad_norm": 4.9467303142591845e-08, + "learning_rate": 0.10003263340378142, + "loss": 0.0, + "num_input_tokens_seen": 41682624, + "step": 24325 + }, + { + "epoch": 118.10653753026634, + "grad_norm": 2.7022023729728062e-08, + "learning_rate": 0.09997709669211834, + "loss": 0.0, + "num_input_tokens_seen": 41691552, + "step": 24330 + }, + { + "epoch": 118.13075060532688, + "grad_norm": 4.9490765263726644e-08, + "learning_rate": 0.0999215676946156, + "loss": 0.0, + "num_input_tokens_seen": 41700000, + "step": 24335 + }, + { + "epoch": 118.15496368038741, + "grad_norm": 5.09708648621654e-08, + "learning_rate": 0.0998660464198364, + "loss": 0.0, + "num_input_tokens_seen": 41708480, + "step": 24340 + }, + { + "epoch": 118.17917675544794, + "grad_norm": 6.170758837242829e-08, + "learning_rate": 0.09981053287634288, + "loss": 0.0, + "num_input_tokens_seen": 41717248, + "step": 24345 + }, + { + "epoch": 118.20338983050847, + "grad_norm": 5.385309975736163e-08, + "learning_rate": 0.09975502707269596, + "loss": 0.0, + "num_input_tokens_seen": 41725888, + "step": 24350 + }, + { + "epoch": 118.227602905569, + "grad_norm": 3.105501278355405e-08, + "learning_rate": 0.09969952901745524, + "loss": 0.0, + "num_input_tokens_seen": 41734688, + "step": 24355 + }, + { + "epoch": 118.25181598062954, + "grad_norm": 3.532451486876198e-08, + "learning_rate": 0.09964403871917925, + "loss": 0.0, + "num_input_tokens_seen": 41743264, + "step": 24360 + }, + { + "epoch": 118.27602905569007, + "grad_norm": 6.735300672744415e-08, + "learning_rate": 0.09958855618642536, + "loss": 0.0, + "num_input_tokens_seen": 41751904, + "step": 24365 + }, + { + "epoch": 118.3002421307506, + "grad_norm": 5.472404041029222e-08, + "learning_rate": 0.09953308142774955, + "loss": 0.0, + "num_input_tokens_seen": 41760256, + "step": 24370 + }, + { + "epoch": 118.32445520581113, + "grad_norm": 3.449466845495408e-08, + "learning_rate": 0.09947761445170686, + "loss": 0.0, + "num_input_tokens_seen": 41768704, + "step": 24375 + }, + { + "epoch": 118.34866828087166, + "grad_norm": 4.773189132833977e-08, + "learning_rate": 0.09942215526685086, + "loss": 0.0, + "num_input_tokens_seen": 41777024, + "step": 24380 + }, + { + "epoch": 118.37288135593221, + "grad_norm": 5.076593012631747e-08, + "learning_rate": 0.09936670388173414, + "loss": 0.0, + "num_input_tokens_seen": 41785408, + "step": 24385 + }, + { + "epoch": 118.39709443099274, + "grad_norm": 3.5233870931961064e-08, + "learning_rate": 0.09931126030490799, + "loss": 0.0, + "num_input_tokens_seen": 41794048, + "step": 24390 + }, + { + "epoch": 118.42130750605327, + "grad_norm": 1.5574114442529208e-08, + "learning_rate": 0.0992558245449225, + "loss": 0.0, + "num_input_tokens_seen": 41802784, + "step": 24395 + }, + { + "epoch": 118.4455205811138, + "grad_norm": 4.077440607375138e-08, + "learning_rate": 0.09920039661032651, + "loss": 0.0, + "num_input_tokens_seen": 41811328, + "step": 24400 + }, + { + "epoch": 118.4455205811138, + "eval_loss": 1.0706900358200073, + "eval_runtime": 4.6164, + "eval_samples_per_second": 79.5, + "eval_steps_per_second": 19.929, + "num_input_tokens_seen": 41811328, + "step": 24400 + }, + { + "epoch": 118.46973365617433, + "grad_norm": 5.0436860021818575e-08, + "learning_rate": 0.09914497650966782, + "loss": 0.0, + "num_input_tokens_seen": 41819712, + "step": 24405 + }, + { + "epoch": 118.49394673123487, + "grad_norm": 5.5603681659022186e-08, + "learning_rate": 0.09908956425149276, + "loss": 0.0, + "num_input_tokens_seen": 41828160, + "step": 24410 + }, + { + "epoch": 118.5181598062954, + "grad_norm": 5.226121047030574e-08, + "learning_rate": 0.09903415984434677, + "loss": 0.0, + "num_input_tokens_seen": 41836800, + "step": 24415 + }, + { + "epoch": 118.54237288135593, + "grad_norm": 4.85675819561493e-08, + "learning_rate": 0.09897876329677373, + "loss": 0.0, + "num_input_tokens_seen": 41845376, + "step": 24420 + }, + { + "epoch": 118.56658595641646, + "grad_norm": 1.9465012712771568e-08, + "learning_rate": 0.09892337461731658, + "loss": 0.0, + "num_input_tokens_seen": 41854464, + "step": 24425 + }, + { + "epoch": 118.59079903147699, + "grad_norm": 3.4995352393707435e-08, + "learning_rate": 0.09886799381451693, + "loss": 0.0, + "num_input_tokens_seen": 41862976, + "step": 24430 + }, + { + "epoch": 118.61501210653753, + "grad_norm": 5.653101808889005e-08, + "learning_rate": 0.09881262089691521, + "loss": 0.0, + "num_input_tokens_seen": 41871360, + "step": 24435 + }, + { + "epoch": 118.63922518159806, + "grad_norm": 3.854088959087676e-08, + "learning_rate": 0.09875725587305059, + "loss": 0.0, + "num_input_tokens_seen": 41880096, + "step": 24440 + }, + { + "epoch": 118.6634382566586, + "grad_norm": 2.6328256907959258e-08, + "learning_rate": 0.09870189875146111, + "loss": 0.0, + "num_input_tokens_seen": 41888416, + "step": 24445 + }, + { + "epoch": 118.68765133171912, + "grad_norm": 2.4710889334755848e-08, + "learning_rate": 0.09864654954068346, + "loss": 0.0, + "num_input_tokens_seen": 41897184, + "step": 24450 + }, + { + "epoch": 118.71186440677967, + "grad_norm": 6.78968277156855e-08, + "learning_rate": 0.09859120824925326, + "loss": 0.0, + "num_input_tokens_seen": 41905952, + "step": 24455 + }, + { + "epoch": 118.7360774818402, + "grad_norm": 6.688784282005145e-08, + "learning_rate": 0.09853587488570474, + "loss": 0.0, + "num_input_tokens_seen": 41914912, + "step": 24460 + }, + { + "epoch": 118.76029055690073, + "grad_norm": 3.109446566895713e-08, + "learning_rate": 0.09848054945857107, + "loss": 0.0, + "num_input_tokens_seen": 41923456, + "step": 24465 + }, + { + "epoch": 118.78450363196126, + "grad_norm": 5.954231596660975e-08, + "learning_rate": 0.09842523197638416, + "loss": 0.0, + "num_input_tokens_seen": 41931840, + "step": 24470 + }, + { + "epoch": 118.80871670702179, + "grad_norm": 6.666088125939496e-08, + "learning_rate": 0.09836992244767452, + "loss": 0.0, + "num_input_tokens_seen": 41940224, + "step": 24475 + }, + { + "epoch": 118.83292978208233, + "grad_norm": 4.4249055974887597e-08, + "learning_rate": 0.09831462088097168, + "loss": 0.0, + "num_input_tokens_seen": 41948704, + "step": 24480 + }, + { + "epoch": 118.85714285714286, + "grad_norm": 7.892343489857012e-08, + "learning_rate": 0.09825932728480385, + "loss": 0.0, + "num_input_tokens_seen": 41957216, + "step": 24485 + }, + { + "epoch": 118.88135593220339, + "grad_norm": 2.9993177719234154e-08, + "learning_rate": 0.09820404166769794, + "loss": 0.0, + "num_input_tokens_seen": 41965728, + "step": 24490 + }, + { + "epoch": 118.90556900726392, + "grad_norm": 2.2207579775113118e-08, + "learning_rate": 0.09814876403817978, + "loss": 0.0, + "num_input_tokens_seen": 41974208, + "step": 24495 + }, + { + "epoch": 118.92978208232445, + "grad_norm": 4.244303397626936e-08, + "learning_rate": 0.09809349440477376, + "loss": 0.0, + "num_input_tokens_seen": 41982656, + "step": 24500 + }, + { + "epoch": 118.953995157385, + "grad_norm": 5.431065375205435e-08, + "learning_rate": 0.09803823277600317, + "loss": 0.0, + "num_input_tokens_seen": 41990944, + "step": 24505 + }, + { + "epoch": 118.97820823244552, + "grad_norm": 2.2981902603191884e-08, + "learning_rate": 0.09798297916039014, + "loss": 0.0, + "num_input_tokens_seen": 41999392, + "step": 24510 + }, + { + "epoch": 119.00484261501211, + "grad_norm": 3.391975553768134e-07, + "learning_rate": 0.09792773356645534, + "loss": 0.0, + "num_input_tokens_seen": 42008928, + "step": 24515 + }, + { + "epoch": 119.02905569007264, + "grad_norm": 3.091804856580893e-08, + "learning_rate": 0.09787249600271843, + "loss": 0.0, + "num_input_tokens_seen": 42017120, + "step": 24520 + }, + { + "epoch": 119.05326876513317, + "grad_norm": 4.938288356015619e-08, + "learning_rate": 0.09781726647769776, + "loss": 0.0, + "num_input_tokens_seen": 42025632, + "step": 24525 + }, + { + "epoch": 119.0774818401937, + "grad_norm": 2.9114421096210208e-08, + "learning_rate": 0.0977620449999103, + "loss": 0.0, + "num_input_tokens_seen": 42034304, + "step": 24530 + }, + { + "epoch": 119.10169491525424, + "grad_norm": 4.780732254516806e-08, + "learning_rate": 0.09770683157787204, + "loss": 0.0, + "num_input_tokens_seen": 42042752, + "step": 24535 + }, + { + "epoch": 119.12590799031477, + "grad_norm": 4.8496087146077116e-08, + "learning_rate": 0.09765162622009745, + "loss": 0.0, + "num_input_tokens_seen": 42051776, + "step": 24540 + }, + { + "epoch": 119.1501210653753, + "grad_norm": 3.658397318417883e-08, + "learning_rate": 0.09759642893509995, + "loss": 0.0, + "num_input_tokens_seen": 42060544, + "step": 24545 + }, + { + "epoch": 119.17433414043583, + "grad_norm": 2.916497088278902e-08, + "learning_rate": 0.09754123973139169, + "loss": 0.0, + "num_input_tokens_seen": 42069440, + "step": 24550 + }, + { + "epoch": 119.19854721549636, + "grad_norm": 2.384374298003422e-08, + "learning_rate": 0.09748605861748345, + "loss": 0.0, + "num_input_tokens_seen": 42078080, + "step": 24555 + }, + { + "epoch": 119.2227602905569, + "grad_norm": 1.0580976095297956e-08, + "learning_rate": 0.0974308856018849, + "loss": 0.0, + "num_input_tokens_seen": 42086496, + "step": 24560 + }, + { + "epoch": 119.24697336561744, + "grad_norm": 3.0705997744462366e-08, + "learning_rate": 0.09737572069310449, + "loss": 0.0, + "num_input_tokens_seen": 42094880, + "step": 24565 + }, + { + "epoch": 119.27118644067797, + "grad_norm": 3.452988295293835e-08, + "learning_rate": 0.09732056389964922, + "loss": 0.0, + "num_input_tokens_seen": 42103584, + "step": 24570 + }, + { + "epoch": 119.2953995157385, + "grad_norm": 1.8453881978075515e-08, + "learning_rate": 0.097265415230025, + "loss": 0.0, + "num_input_tokens_seen": 42111808, + "step": 24575 + }, + { + "epoch": 119.31961259079903, + "grad_norm": 4.8451784806502474e-08, + "learning_rate": 0.09721027469273648, + "loss": 0.0, + "num_input_tokens_seen": 42120192, + "step": 24580 + }, + { + "epoch": 119.34382566585957, + "grad_norm": 5.284425697027473e-08, + "learning_rate": 0.09715514229628695, + "loss": 0.0, + "num_input_tokens_seen": 42128896, + "step": 24585 + }, + { + "epoch": 119.3680387409201, + "grad_norm": 3.1032779901352114e-08, + "learning_rate": 0.09710001804917864, + "loss": 0.0, + "num_input_tokens_seen": 42137376, + "step": 24590 + }, + { + "epoch": 119.39225181598063, + "grad_norm": 2.8069051083434715e-08, + "learning_rate": 0.09704490195991226, + "loss": 0.0, + "num_input_tokens_seen": 42146144, + "step": 24595 + }, + { + "epoch": 119.41646489104116, + "grad_norm": 3.7685321530034344e-08, + "learning_rate": 0.09698979403698753, + "loss": 0.0, + "num_input_tokens_seen": 42154688, + "step": 24600 + }, + { + "epoch": 119.41646489104116, + "eval_loss": 1.0784417390823364, + "eval_runtime": 4.6152, + "eval_samples_per_second": 79.52, + "eval_steps_per_second": 19.934, + "num_input_tokens_seen": 42154688, + "step": 24600 + }, + { + "epoch": 119.44067796610169, + "grad_norm": 4.8170733180086245e-08, + "learning_rate": 0.0969346942889027, + "loss": 0.0, + "num_input_tokens_seen": 42163200, + "step": 24605 + }, + { + "epoch": 119.46489104116223, + "grad_norm": 3.456643327126585e-08, + "learning_rate": 0.09687960272415487, + "loss": 0.0, + "num_input_tokens_seen": 42171872, + "step": 24610 + }, + { + "epoch": 119.48910411622276, + "grad_norm": 3.708328932816585e-08, + "learning_rate": 0.0968245193512399, + "loss": 0.0, + "num_input_tokens_seen": 42180448, + "step": 24615 + }, + { + "epoch": 119.51331719128329, + "grad_norm": 2.5877385567696365e-08, + "learning_rate": 0.09676944417865221, + "loss": 0.0, + "num_input_tokens_seen": 42188960, + "step": 24620 + }, + { + "epoch": 119.53753026634382, + "grad_norm": 5.283155246615934e-08, + "learning_rate": 0.09671437721488517, + "loss": 0.0, + "num_input_tokens_seen": 42197408, + "step": 24625 + }, + { + "epoch": 119.56174334140435, + "grad_norm": 1.4820755289690624e-08, + "learning_rate": 0.09665931846843086, + "loss": 0.0, + "num_input_tokens_seen": 42206080, + "step": 24630 + }, + { + "epoch": 119.5859564164649, + "grad_norm": 5.91257922621935e-08, + "learning_rate": 0.0966042679477799, + "loss": 0.0, + "num_input_tokens_seen": 42214784, + "step": 24635 + }, + { + "epoch": 119.61016949152543, + "grad_norm": 6.791562157104636e-08, + "learning_rate": 0.09654922566142186, + "loss": 0.0, + "num_input_tokens_seen": 42223136, + "step": 24640 + }, + { + "epoch": 119.63438256658596, + "grad_norm": 4.1857536103862e-08, + "learning_rate": 0.09649419161784498, + "loss": 0.0, + "num_input_tokens_seen": 42231680, + "step": 24645 + }, + { + "epoch": 119.65859564164649, + "grad_norm": 1.5268261321921273e-08, + "learning_rate": 0.09643916582553606, + "loss": 0.0, + "num_input_tokens_seen": 42240416, + "step": 24650 + }, + { + "epoch": 119.68280871670702, + "grad_norm": 4.022895794264514e-08, + "learning_rate": 0.09638414829298093, + "loss": 0.0, + "num_input_tokens_seen": 42249248, + "step": 24655 + }, + { + "epoch": 119.70702179176756, + "grad_norm": 6.467055158054791e-08, + "learning_rate": 0.09632913902866386, + "loss": 0.0, + "num_input_tokens_seen": 42257632, + "step": 24660 + }, + { + "epoch": 119.73123486682809, + "grad_norm": 3.744753485079855e-08, + "learning_rate": 0.096274138041068, + "loss": 0.0, + "num_input_tokens_seen": 42265856, + "step": 24665 + }, + { + "epoch": 119.75544794188862, + "grad_norm": 4.336147796379919e-08, + "learning_rate": 0.09621914533867527, + "loss": 0.0, + "num_input_tokens_seen": 42274432, + "step": 24670 + }, + { + "epoch": 119.77966101694915, + "grad_norm": 9.952706392368782e-08, + "learning_rate": 0.09616416092996616, + "loss": 0.0, + "num_input_tokens_seen": 42283008, + "step": 24675 + }, + { + "epoch": 119.80387409200968, + "grad_norm": 4.162611233482494e-08, + "learning_rate": 0.09610918482342, + "loss": 0.0, + "num_input_tokens_seen": 42291264, + "step": 24680 + }, + { + "epoch": 119.82808716707022, + "grad_norm": 2.5089294197755407e-08, + "learning_rate": 0.09605421702751478, + "loss": 0.0, + "num_input_tokens_seen": 42299616, + "step": 24685 + }, + { + "epoch": 119.85230024213075, + "grad_norm": 6.169876343165015e-08, + "learning_rate": 0.09599925755072718, + "loss": 0.0, + "num_input_tokens_seen": 42308224, + "step": 24690 + }, + { + "epoch": 119.87651331719128, + "grad_norm": 1.479315958619054e-08, + "learning_rate": 0.09594430640153273, + "loss": 0.0, + "num_input_tokens_seen": 42316928, + "step": 24695 + }, + { + "epoch": 119.90072639225181, + "grad_norm": 4.9048118455630174e-08, + "learning_rate": 0.09588936358840547, + "loss": 0.0, + "num_input_tokens_seen": 42325632, + "step": 24700 + }, + { + "epoch": 119.92493946731234, + "grad_norm": 1.979868713419819e-08, + "learning_rate": 0.09583442911981836, + "loss": 0.0, + "num_input_tokens_seen": 42334368, + "step": 24705 + }, + { + "epoch": 119.94915254237289, + "grad_norm": 3.673319781682949e-08, + "learning_rate": 0.09577950300424302, + "loss": 0.0, + "num_input_tokens_seen": 42342944, + "step": 24710 + }, + { + "epoch": 119.97336561743342, + "grad_norm": 3.5096707762249935e-08, + "learning_rate": 0.09572458525014967, + "loss": 0.0, + "num_input_tokens_seen": 42351392, + "step": 24715 + }, + { + "epoch": 119.99757869249395, + "grad_norm": 3.031526318864053e-08, + "learning_rate": 0.0956696758660073, + "loss": 0.0, + "num_input_tokens_seen": 42359808, + "step": 24720 + }, + { + "epoch": 120.02421307506053, + "grad_norm": 4.9399126567095664e-08, + "learning_rate": 0.09561477486028373, + "loss": 0.0, + "num_input_tokens_seen": 42368832, + "step": 24725 + }, + { + "epoch": 120.04842615012106, + "grad_norm": 2.2511743580366783e-08, + "learning_rate": 0.09555988224144528, + "loss": 0.0, + "num_input_tokens_seen": 42377376, + "step": 24730 + }, + { + "epoch": 120.0726392251816, + "grad_norm": 3.4611439048148895e-08, + "learning_rate": 0.09550499801795717, + "loss": 0.0, + "num_input_tokens_seen": 42385728, + "step": 24735 + }, + { + "epoch": 120.09685230024213, + "grad_norm": 4.7743846209868934e-08, + "learning_rate": 0.09545012219828314, + "loss": 0.0, + "num_input_tokens_seen": 42394048, + "step": 24740 + }, + { + "epoch": 120.12106537530266, + "grad_norm": 2.0139315992651063e-08, + "learning_rate": 0.09539525479088577, + "loss": 0.0, + "num_input_tokens_seen": 42402816, + "step": 24745 + }, + { + "epoch": 120.1452784503632, + "grad_norm": 4.489132621188219e-08, + "learning_rate": 0.0953403958042264, + "loss": 0.0, + "num_input_tokens_seen": 42411520, + "step": 24750 + }, + { + "epoch": 120.16949152542372, + "grad_norm": 3.5067415637968224e-08, + "learning_rate": 0.09528554524676484, + "loss": 0.0, + "num_input_tokens_seen": 42420160, + "step": 24755 + }, + { + "epoch": 120.19370460048427, + "grad_norm": 9.62337232124355e-09, + "learning_rate": 0.09523070312695978, + "loss": 0.0, + "num_input_tokens_seen": 42428736, + "step": 24760 + }, + { + "epoch": 120.2179176755448, + "grad_norm": 2.7154495541026336e-08, + "learning_rate": 0.09517586945326863, + "loss": 0.0, + "num_input_tokens_seen": 42437504, + "step": 24765 + }, + { + "epoch": 120.24213075060533, + "grad_norm": 4.6920575869080494e-08, + "learning_rate": 0.0951210442341473, + "loss": 0.0, + "num_input_tokens_seen": 42445984, + "step": 24770 + }, + { + "epoch": 120.26634382566586, + "grad_norm": 1.1342912387135584e-07, + "learning_rate": 0.09506622747805066, + "loss": 0.0, + "num_input_tokens_seen": 42454240, + "step": 24775 + }, + { + "epoch": 120.29055690072639, + "grad_norm": 5.276120518260541e-08, + "learning_rate": 0.09501141919343203, + "loss": 0.0, + "num_input_tokens_seen": 42462880, + "step": 24780 + }, + { + "epoch": 120.31476997578693, + "grad_norm": 1.38853595288424e-08, + "learning_rate": 0.09495661938874361, + "loss": 0.0, + "num_input_tokens_seen": 42471296, + "step": 24785 + }, + { + "epoch": 120.33898305084746, + "grad_norm": 2.786043573621555e-08, + "learning_rate": 0.0949018280724362, + "loss": 0.0, + "num_input_tokens_seen": 42479872, + "step": 24790 + }, + { + "epoch": 120.36319612590799, + "grad_norm": 5.3770783381423826e-08, + "learning_rate": 0.09484704525295934, + "loss": 0.0, + "num_input_tokens_seen": 42488768, + "step": 24795 + }, + { + "epoch": 120.38740920096852, + "grad_norm": 5.023854399155425e-08, + "learning_rate": 0.09479227093876112, + "loss": 0.0, + "num_input_tokens_seen": 42497024, + "step": 24800 + }, + { + "epoch": 120.38740920096852, + "eval_loss": 1.086256980895996, + "eval_runtime": 4.6267, + "eval_samples_per_second": 79.323, + "eval_steps_per_second": 19.885, + "num_input_tokens_seen": 42497024, + "step": 24800 + }, + { + "epoch": 120.41162227602905, + "grad_norm": 1.9744241797070572e-08, + "learning_rate": 0.0947375051382886, + "loss": 0.0, + "num_input_tokens_seen": 42505408, + "step": 24805 + }, + { + "epoch": 120.4358353510896, + "grad_norm": 6.541781516489209e-08, + "learning_rate": 0.09468274785998718, + "loss": 0.0, + "num_input_tokens_seen": 42513536, + "step": 24810 + }, + { + "epoch": 120.46004842615012, + "grad_norm": 9.026782521459609e-08, + "learning_rate": 0.09462799911230127, + "loss": 0.0, + "num_input_tokens_seen": 42522080, + "step": 24815 + }, + { + "epoch": 120.48426150121065, + "grad_norm": 2.9688187908050168e-08, + "learning_rate": 0.0945732589036737, + "loss": 0.0, + "num_input_tokens_seen": 42531424, + "step": 24820 + }, + { + "epoch": 120.50847457627118, + "grad_norm": 3.0219517554996855e-08, + "learning_rate": 0.09451852724254614, + "loss": 0.0, + "num_input_tokens_seen": 42540096, + "step": 24825 + }, + { + "epoch": 120.53268765133171, + "grad_norm": 1.53629589050297e-08, + "learning_rate": 0.09446380413735894, + "loss": 0.0, + "num_input_tokens_seen": 42548736, + "step": 24830 + }, + { + "epoch": 120.55690072639226, + "grad_norm": 4.721813340324843e-08, + "learning_rate": 0.09440908959655099, + "loss": 0.0, + "num_input_tokens_seen": 42557248, + "step": 24835 + }, + { + "epoch": 120.58111380145279, + "grad_norm": 2.516670605245963e-08, + "learning_rate": 0.09435438362856004, + "loss": 0.0, + "num_input_tokens_seen": 42565696, + "step": 24840 + }, + { + "epoch": 120.60532687651332, + "grad_norm": 1.8475969199016617e-08, + "learning_rate": 0.0942996862418225, + "loss": 0.0, + "num_input_tokens_seen": 42574240, + "step": 24845 + }, + { + "epoch": 120.62953995157385, + "grad_norm": 2.5000321812740367e-08, + "learning_rate": 0.09424499744477322, + "loss": 0.0, + "num_input_tokens_seen": 42582464, + "step": 24850 + }, + { + "epoch": 120.65375302663438, + "grad_norm": 2.861022174727168e-08, + "learning_rate": 0.09419031724584608, + "loss": 0.0, + "num_input_tokens_seen": 42590560, + "step": 24855 + }, + { + "epoch": 120.67796610169492, + "grad_norm": 2.434903656478582e-08, + "learning_rate": 0.09413564565347331, + "loss": 0.0, + "num_input_tokens_seen": 42598848, + "step": 24860 + }, + { + "epoch": 120.70217917675545, + "grad_norm": 3.005958859603197e-08, + "learning_rate": 0.094080982676086, + "loss": 0.0, + "num_input_tokens_seen": 42607136, + "step": 24865 + }, + { + "epoch": 120.72639225181598, + "grad_norm": 5.520633905575778e-08, + "learning_rate": 0.09402632832211395, + "loss": 0.0, + "num_input_tokens_seen": 42615680, + "step": 24870 + }, + { + "epoch": 120.75060532687651, + "grad_norm": 6.236390248659518e-08, + "learning_rate": 0.09397168259998541, + "loss": 0.0, + "num_input_tokens_seen": 42624288, + "step": 24875 + }, + { + "epoch": 120.77481840193704, + "grad_norm": 2.3648487612604185e-08, + "learning_rate": 0.09391704551812759, + "loss": 0.0, + "num_input_tokens_seen": 42632992, + "step": 24880 + }, + { + "epoch": 120.79903147699758, + "grad_norm": 5.351732212943716e-08, + "learning_rate": 0.09386241708496605, + "loss": 0.0, + "num_input_tokens_seen": 42641696, + "step": 24885 + }, + { + "epoch": 120.82324455205811, + "grad_norm": 3.506330870095553e-08, + "learning_rate": 0.09380779730892527, + "loss": 0.0, + "num_input_tokens_seen": 42650080, + "step": 24890 + }, + { + "epoch": 120.84745762711864, + "grad_norm": 5.7238672468429286e-08, + "learning_rate": 0.09375318619842836, + "loss": 0.0, + "num_input_tokens_seen": 42658752, + "step": 24895 + }, + { + "epoch": 120.87167070217917, + "grad_norm": 1.8796802336851215e-08, + "learning_rate": 0.09369858376189696, + "loss": 0.0, + "num_input_tokens_seen": 42667456, + "step": 24900 + }, + { + "epoch": 120.8958837772397, + "grad_norm": 5.505145850293047e-08, + "learning_rate": 0.09364399000775143, + "loss": 0.0, + "num_input_tokens_seen": 42676032, + "step": 24905 + }, + { + "epoch": 120.92009685230025, + "grad_norm": 2.5003412673640923e-08, + "learning_rate": 0.09358940494441093, + "loss": 0.0, + "num_input_tokens_seen": 42684832, + "step": 24910 + }, + { + "epoch": 120.94430992736078, + "grad_norm": 1.6065431651668405e-08, + "learning_rate": 0.09353482858029301, + "loss": 0.0, + "num_input_tokens_seen": 42693472, + "step": 24915 + }, + { + "epoch": 120.9685230024213, + "grad_norm": 2.8317790778942253e-08, + "learning_rate": 0.09348026092381419, + "loss": 0.0, + "num_input_tokens_seen": 42701984, + "step": 24920 + }, + { + "epoch": 120.99273607748184, + "grad_norm": 4.901934502754557e-08, + "learning_rate": 0.09342570198338931, + "loss": 0.0, + "num_input_tokens_seen": 42710720, + "step": 24925 + }, + { + "epoch": 121.01937046004842, + "grad_norm": 4.293865174531675e-08, + "learning_rate": 0.0933711517674322, + "loss": 0.0, + "num_input_tokens_seen": 42719616, + "step": 24930 + }, + { + "epoch": 121.04358353510897, + "grad_norm": 4.086099991695846e-08, + "learning_rate": 0.09331661028435513, + "loss": 0.0, + "num_input_tokens_seen": 42727968, + "step": 24935 + }, + { + "epoch": 121.0677966101695, + "grad_norm": 4.4497319606762176e-08, + "learning_rate": 0.09326207754256909, + "loss": 0.0, + "num_input_tokens_seen": 42736576, + "step": 24940 + }, + { + "epoch": 121.09200968523002, + "grad_norm": 7.415287228695888e-08, + "learning_rate": 0.09320755355048366, + "loss": 0.0, + "num_input_tokens_seen": 42745344, + "step": 24945 + }, + { + "epoch": 121.11622276029055, + "grad_norm": 5.322709029087491e-08, + "learning_rate": 0.09315303831650722, + "loss": 0.0, + "num_input_tokens_seen": 42754016, + "step": 24950 + }, + { + "epoch": 121.14043583535108, + "grad_norm": 1.5403676556502433e-08, + "learning_rate": 0.09309853184904661, + "loss": 0.0, + "num_input_tokens_seen": 42762176, + "step": 24955 + }, + { + "epoch": 121.16464891041163, + "grad_norm": 2.444408586654845e-08, + "learning_rate": 0.09304403415650753, + "loss": 0.0, + "num_input_tokens_seen": 42770560, + "step": 24960 + }, + { + "epoch": 121.18886198547216, + "grad_norm": 4.321008617580446e-08, + "learning_rate": 0.09298954524729405, + "loss": 0.0, + "num_input_tokens_seen": 42779104, + "step": 24965 + }, + { + "epoch": 121.21307506053269, + "grad_norm": 3.4337674037487886e-08, + "learning_rate": 0.09293506512980916, + "loss": 0.0, + "num_input_tokens_seen": 42787840, + "step": 24970 + }, + { + "epoch": 121.23728813559322, + "grad_norm": 7.081163744260266e-08, + "learning_rate": 0.0928805938124544, + "loss": 0.0, + "num_input_tokens_seen": 42796096, + "step": 24975 + }, + { + "epoch": 121.26150121065375, + "grad_norm": 3.5395345321376226e-08, + "learning_rate": 0.09282613130362982, + "loss": 0.0, + "num_input_tokens_seen": 42804288, + "step": 24980 + }, + { + "epoch": 121.28571428571429, + "grad_norm": 2.4976907653240232e-08, + "learning_rate": 0.09277167761173427, + "loss": 0.0, + "num_input_tokens_seen": 42812544, + "step": 24985 + }, + { + "epoch": 121.30992736077482, + "grad_norm": 2.581952784908026e-08, + "learning_rate": 0.0927172327451653, + "loss": 0.0, + "num_input_tokens_seen": 42821088, + "step": 24990 + }, + { + "epoch": 121.33414043583535, + "grad_norm": 2.6116136808695956e-08, + "learning_rate": 0.09266279671231882, + "loss": 0.0, + "num_input_tokens_seen": 42829504, + "step": 24995 + }, + { + "epoch": 121.35835351089588, + "grad_norm": 5.498770150325072e-08, + "learning_rate": 0.09260836952158967, + "loss": 0.0, + "num_input_tokens_seen": 42838112, + "step": 25000 + }, + { + "epoch": 121.35835351089588, + "eval_loss": 1.089653491973877, + "eval_runtime": 4.6203, + "eval_samples_per_second": 79.432, + "eval_steps_per_second": 19.912, + "num_input_tokens_seen": 42838112, + "step": 25000 + }, + { + "epoch": 121.38256658595641, + "grad_norm": 5.28523846909934e-09, + "learning_rate": 0.09255395118137114, + "loss": 0.0, + "num_input_tokens_seen": 42846144, + "step": 25005 + }, + { + "epoch": 121.40677966101696, + "grad_norm": 4.478303594623867e-08, + "learning_rate": 0.09249954170005527, + "loss": 0.0, + "num_input_tokens_seen": 42854464, + "step": 25010 + }, + { + "epoch": 121.43099273607749, + "grad_norm": 3.1428005087263955e-08, + "learning_rate": 0.0924451410860327, + "loss": 0.0, + "num_input_tokens_seen": 42862944, + "step": 25015 + }, + { + "epoch": 121.45520581113801, + "grad_norm": 2.877895965980315e-08, + "learning_rate": 0.09239074934769258, + "loss": 0.0, + "num_input_tokens_seen": 42871360, + "step": 25020 + }, + { + "epoch": 121.47941888619854, + "grad_norm": 5.518987578057022e-08, + "learning_rate": 0.09233636649342288, + "loss": 0.0, + "num_input_tokens_seen": 42879872, + "step": 25025 + }, + { + "epoch": 121.50363196125907, + "grad_norm": 2.6485947657306497e-08, + "learning_rate": 0.09228199253161017, + "loss": 0.0, + "num_input_tokens_seen": 42888576, + "step": 25030 + }, + { + "epoch": 121.52784503631962, + "grad_norm": 5.1222421859620226e-08, + "learning_rate": 0.09222762747063949, + "loss": 0.0, + "num_input_tokens_seen": 42897248, + "step": 25035 + }, + { + "epoch": 121.55205811138015, + "grad_norm": 1.2060630716348442e-08, + "learning_rate": 0.09217327131889473, + "loss": 0.0, + "num_input_tokens_seen": 42905504, + "step": 25040 + }, + { + "epoch": 121.57627118644068, + "grad_norm": 3.471518184028355e-08, + "learning_rate": 0.09211892408475818, + "loss": 0.0, + "num_input_tokens_seen": 42914080, + "step": 25045 + }, + { + "epoch": 121.60048426150121, + "grad_norm": 2.5969349337628955e-08, + "learning_rate": 0.09206458577661089, + "loss": 0.0, + "num_input_tokens_seen": 42922880, + "step": 25050 + }, + { + "epoch": 121.62469733656174, + "grad_norm": 3.459851072307174e-08, + "learning_rate": 0.09201025640283263, + "loss": 0.0, + "num_input_tokens_seen": 42931552, + "step": 25055 + }, + { + "epoch": 121.64891041162228, + "grad_norm": 3.983891261327699e-08, + "learning_rate": 0.09195593597180148, + "loss": 0.0, + "num_input_tokens_seen": 42940288, + "step": 25060 + }, + { + "epoch": 121.67312348668281, + "grad_norm": 2.0987432236552195e-08, + "learning_rate": 0.09190162449189444, + "loss": 0.0, + "num_input_tokens_seen": 42949088, + "step": 25065 + }, + { + "epoch": 121.69733656174334, + "grad_norm": 3.344735688415312e-08, + "learning_rate": 0.09184732197148705, + "loss": 0.0, + "num_input_tokens_seen": 42957632, + "step": 25070 + }, + { + "epoch": 121.72154963680387, + "grad_norm": 2.009996791230151e-08, + "learning_rate": 0.09179302841895343, + "loss": 0.0, + "num_input_tokens_seen": 42966016, + "step": 25075 + }, + { + "epoch": 121.7457627118644, + "grad_norm": 2.3886729039190868e-08, + "learning_rate": 0.09173874384266625, + "loss": 0.0, + "num_input_tokens_seen": 42974784, + "step": 25080 + }, + { + "epoch": 121.76997578692495, + "grad_norm": 5.054740981336181e-08, + "learning_rate": 0.09168446825099695, + "loss": 0.0, + "num_input_tokens_seen": 42983136, + "step": 25085 + }, + { + "epoch": 121.79418886198548, + "grad_norm": 6.242681394041938e-08, + "learning_rate": 0.09163020165231545, + "loss": 0.0, + "num_input_tokens_seen": 42991776, + "step": 25090 + }, + { + "epoch": 121.818401937046, + "grad_norm": 4.854198465409354e-08, + "learning_rate": 0.09157594405499044, + "loss": 0.0, + "num_input_tokens_seen": 43000384, + "step": 25095 + }, + { + "epoch": 121.84261501210653, + "grad_norm": 5.307172301627361e-08, + "learning_rate": 0.09152169546738899, + "loss": 0.0, + "num_input_tokens_seen": 43009120, + "step": 25100 + }, + { + "epoch": 121.86682808716706, + "grad_norm": 4.979605350285965e-08, + "learning_rate": 0.09146745589787698, + "loss": 0.0, + "num_input_tokens_seen": 43018080, + "step": 25105 + }, + { + "epoch": 121.89104116222761, + "grad_norm": 6.460569323962773e-08, + "learning_rate": 0.09141322535481891, + "loss": 0.0, + "num_input_tokens_seen": 43026528, + "step": 25110 + }, + { + "epoch": 121.91525423728814, + "grad_norm": 3.236975842924039e-08, + "learning_rate": 0.0913590038465777, + "loss": 0.0, + "num_input_tokens_seen": 43035232, + "step": 25115 + }, + { + "epoch": 121.93946731234867, + "grad_norm": 3.184382890708548e-08, + "learning_rate": 0.09130479138151505, + "loss": 0.0, + "num_input_tokens_seen": 43043712, + "step": 25120 + }, + { + "epoch": 121.9636803874092, + "grad_norm": 7.944355928657387e-08, + "learning_rate": 0.09125058796799114, + "loss": 0.0, + "num_input_tokens_seen": 43052736, + "step": 25125 + }, + { + "epoch": 121.98789346246973, + "grad_norm": 4.008331089266903e-08, + "learning_rate": 0.09119639361436485, + "loss": 0.0, + "num_input_tokens_seen": 43061248, + "step": 25130 + }, + { + "epoch": 122.01452784503633, + "grad_norm": 2.151974420883107e-08, + "learning_rate": 0.09114220832899368, + "loss": 0.0, + "num_input_tokens_seen": 43070304, + "step": 25135 + }, + { + "epoch": 122.03874092009686, + "grad_norm": 2.2133772148436037e-08, + "learning_rate": 0.0910880321202336, + "loss": 0.0, + "num_input_tokens_seen": 43079072, + "step": 25140 + }, + { + "epoch": 122.06295399515739, + "grad_norm": 2.441115043438913e-08, + "learning_rate": 0.09103386499643933, + "loss": 0.0, + "num_input_tokens_seen": 43087840, + "step": 25145 + }, + { + "epoch": 122.08716707021792, + "grad_norm": 5.7379388351819216e-08, + "learning_rate": 0.09097970696596407, + "loss": 0.0, + "num_input_tokens_seen": 43096224, + "step": 25150 + }, + { + "epoch": 122.11138014527845, + "grad_norm": 2.6079465698103377e-08, + "learning_rate": 0.09092555803715971, + "loss": 0.0, + "num_input_tokens_seen": 43105088, + "step": 25155 + }, + { + "epoch": 122.13559322033899, + "grad_norm": 8.130707840336981e-08, + "learning_rate": 0.0908714182183767, + "loss": 0.0, + "num_input_tokens_seen": 43113792, + "step": 25160 + }, + { + "epoch": 122.15980629539952, + "grad_norm": 7.136131330298667e-08, + "learning_rate": 0.090817287517964, + "loss": 0.0, + "num_input_tokens_seen": 43122592, + "step": 25165 + }, + { + "epoch": 122.18401937046005, + "grad_norm": 1.719591580240376e-08, + "learning_rate": 0.09076316594426931, + "loss": 0.0, + "num_input_tokens_seen": 43131040, + "step": 25170 + }, + { + "epoch": 122.20823244552058, + "grad_norm": 5.2032245179134406e-08, + "learning_rate": 0.09070905350563888, + "loss": 0.0, + "num_input_tokens_seen": 43139424, + "step": 25175 + }, + { + "epoch": 122.23244552058111, + "grad_norm": 3.156829109229875e-08, + "learning_rate": 0.09065495021041745, + "loss": 0.0, + "num_input_tokens_seen": 43147872, + "step": 25180 + }, + { + "epoch": 122.25665859564165, + "grad_norm": 3.9331297330136294e-08, + "learning_rate": 0.09060085606694851, + "loss": 0.0, + "num_input_tokens_seen": 43156128, + "step": 25185 + }, + { + "epoch": 122.28087167070218, + "grad_norm": 2.5174072604272624e-08, + "learning_rate": 0.09054677108357405, + "loss": 0.0, + "num_input_tokens_seen": 43164576, + "step": 25190 + }, + { + "epoch": 122.30508474576271, + "grad_norm": 6.209908320897739e-08, + "learning_rate": 0.09049269526863457, + "loss": 0.0, + "num_input_tokens_seen": 43173248, + "step": 25195 + }, + { + "epoch": 122.32929782082324, + "grad_norm": 6.752778336505116e-09, + "learning_rate": 0.09043862863046935, + "loss": 0.0, + "num_input_tokens_seen": 43181600, + "step": 25200 + }, + { + "epoch": 122.32929782082324, + "eval_loss": 1.0971332788467407, + "eval_runtime": 4.6145, + "eval_samples_per_second": 79.531, + "eval_steps_per_second": 19.937, + "num_input_tokens_seen": 43181600, + "step": 25200 + }, + { + "epoch": 122.35351089588377, + "grad_norm": 3.7618260506633305e-08, + "learning_rate": 0.09038457117741602, + "loss": 0.0, + "num_input_tokens_seen": 43190016, + "step": 25205 + }, + { + "epoch": 122.37772397094432, + "grad_norm": 5.830489868685618e-08, + "learning_rate": 0.09033052291781099, + "loss": 0.0, + "num_input_tokens_seen": 43198912, + "step": 25210 + }, + { + "epoch": 122.40193704600485, + "grad_norm": 5.126659274878875e-08, + "learning_rate": 0.09027648385998926, + "loss": 0.0, + "num_input_tokens_seen": 43207456, + "step": 25215 + }, + { + "epoch": 122.42615012106538, + "grad_norm": 2.4754131189297368e-08, + "learning_rate": 0.09022245401228417, + "loss": 0.0, + "num_input_tokens_seen": 43216352, + "step": 25220 + }, + { + "epoch": 122.4503631961259, + "grad_norm": 2.071084281851654e-08, + "learning_rate": 0.09016843338302792, + "loss": 0.0, + "num_input_tokens_seen": 43224832, + "step": 25225 + }, + { + "epoch": 122.47457627118644, + "grad_norm": 4.971679246068561e-08, + "learning_rate": 0.09011442198055115, + "loss": 0.0, + "num_input_tokens_seen": 43233216, + "step": 25230 + }, + { + "epoch": 122.49878934624698, + "grad_norm": 2.7643194400184257e-08, + "learning_rate": 0.09006041981318305, + "loss": 0.0, + "num_input_tokens_seen": 43241376, + "step": 25235 + }, + { + "epoch": 122.52300242130751, + "grad_norm": 7.638624310857267e-08, + "learning_rate": 0.09000642688925149, + "loss": 0.0, + "num_input_tokens_seen": 43249760, + "step": 25240 + }, + { + "epoch": 122.54721549636804, + "grad_norm": 4.074024673172971e-08, + "learning_rate": 0.0899524432170828, + "loss": 0.0, + "num_input_tokens_seen": 43258496, + "step": 25245 + }, + { + "epoch": 122.57142857142857, + "grad_norm": 2.3355209322062365e-08, + "learning_rate": 0.08989846880500196, + "loss": 0.0, + "num_input_tokens_seen": 43267360, + "step": 25250 + }, + { + "epoch": 122.5956416464891, + "grad_norm": 3.975835127789651e-08, + "learning_rate": 0.08984450366133256, + "loss": 0.0, + "num_input_tokens_seen": 43275584, + "step": 25255 + }, + { + "epoch": 122.61985472154964, + "grad_norm": 4.5377159807458156e-08, + "learning_rate": 0.08979054779439664, + "loss": 0.0, + "num_input_tokens_seen": 43284192, + "step": 25260 + }, + { + "epoch": 122.64406779661017, + "grad_norm": 6.413302600094539e-08, + "learning_rate": 0.08973660121251485, + "loss": 0.0, + "num_input_tokens_seen": 43292736, + "step": 25265 + }, + { + "epoch": 122.6682808716707, + "grad_norm": 3.447026841740808e-08, + "learning_rate": 0.08968266392400655, + "loss": 0.0, + "num_input_tokens_seen": 43301312, + "step": 25270 + }, + { + "epoch": 122.69249394673123, + "grad_norm": 4.974363676524263e-08, + "learning_rate": 0.0896287359371894, + "loss": 0.0, + "num_input_tokens_seen": 43309664, + "step": 25275 + }, + { + "epoch": 122.71670702179176, + "grad_norm": 1.8191842698911387e-08, + "learning_rate": 0.08957481726037989, + "loss": 0.0, + "num_input_tokens_seen": 43318560, + "step": 25280 + }, + { + "epoch": 122.7409200968523, + "grad_norm": 3.746728083342532e-08, + "learning_rate": 0.08952090790189286, + "loss": 0.0, + "num_input_tokens_seen": 43327136, + "step": 25285 + }, + { + "epoch": 122.76513317191284, + "grad_norm": 4.1806078598938257e-08, + "learning_rate": 0.08946700787004187, + "loss": 0.0, + "num_input_tokens_seen": 43335360, + "step": 25290 + }, + { + "epoch": 122.78934624697337, + "grad_norm": 6.494580873095401e-08, + "learning_rate": 0.08941311717313899, + "loss": 0.0, + "num_input_tokens_seen": 43344320, + "step": 25295 + }, + { + "epoch": 122.8135593220339, + "grad_norm": 4.134143694045633e-08, + "learning_rate": 0.08935923581949483, + "loss": 0.0, + "num_input_tokens_seen": 43352480, + "step": 25300 + }, + { + "epoch": 122.83777239709443, + "grad_norm": 3.962769312693126e-08, + "learning_rate": 0.0893053638174185, + "loss": 0.0, + "num_input_tokens_seen": 43360800, + "step": 25305 + }, + { + "epoch": 122.86198547215497, + "grad_norm": 2.8924922901296668e-08, + "learning_rate": 0.0892515011752179, + "loss": 0.0, + "num_input_tokens_seen": 43369664, + "step": 25310 + }, + { + "epoch": 122.8861985472155, + "grad_norm": 4.819354515461782e-08, + "learning_rate": 0.08919764790119918, + "loss": 0.0, + "num_input_tokens_seen": 43378464, + "step": 25315 + }, + { + "epoch": 122.91041162227603, + "grad_norm": 2.110197883098408e-08, + "learning_rate": 0.08914380400366727, + "loss": 0.0, + "num_input_tokens_seen": 43386912, + "step": 25320 + }, + { + "epoch": 122.93462469733656, + "grad_norm": 5.4609884614365e-08, + "learning_rate": 0.08908996949092551, + "loss": 0.0, + "num_input_tokens_seen": 43395872, + "step": 25325 + }, + { + "epoch": 122.95883777239709, + "grad_norm": 6.350214221129136e-08, + "learning_rate": 0.08903614437127592, + "loss": 0.0, + "num_input_tokens_seen": 43404384, + "step": 25330 + }, + { + "epoch": 122.98305084745763, + "grad_norm": 3.9733219381332674e-08, + "learning_rate": 0.088982328653019, + "loss": 0.0, + "num_input_tokens_seen": 43412960, + "step": 25335 + }, + { + "epoch": 123.00968523002422, + "grad_norm": 3.95030710365063e-08, + "learning_rate": 0.0889285223444538, + "loss": 0.0, + "num_input_tokens_seen": 43421472, + "step": 25340 + }, + { + "epoch": 123.03389830508475, + "grad_norm": 1.5792078755794137e-08, + "learning_rate": 0.08887472545387787, + "loss": 0.0, + "num_input_tokens_seen": 43429824, + "step": 25345 + }, + { + "epoch": 123.05811138014528, + "grad_norm": 1.7027193877083846e-08, + "learning_rate": 0.08882093798958751, + "loss": 0.0, + "num_input_tokens_seen": 43438464, + "step": 25350 + }, + { + "epoch": 123.08232445520581, + "grad_norm": 5.8282488168970303e-08, + "learning_rate": 0.08876715995987726, + "loss": 0.0, + "num_input_tokens_seen": 43447328, + "step": 25355 + }, + { + "epoch": 123.10653753026634, + "grad_norm": 3.17874331301482e-08, + "learning_rate": 0.08871339137304052, + "loss": 0.0, + "num_input_tokens_seen": 43455936, + "step": 25360 + }, + { + "epoch": 123.13075060532688, + "grad_norm": 1.1471009031538415e-08, + "learning_rate": 0.0886596322373689, + "loss": 0.0, + "num_input_tokens_seen": 43464512, + "step": 25365 + }, + { + "epoch": 123.15496368038741, + "grad_norm": 3.336796794428665e-08, + "learning_rate": 0.08860588256115293, + "loss": 0.0, + "num_input_tokens_seen": 43473184, + "step": 25370 + }, + { + "epoch": 123.17917675544794, + "grad_norm": 2.9300998960479774e-08, + "learning_rate": 0.0885521423526814, + "loss": 0.0, + "num_input_tokens_seen": 43481504, + "step": 25375 + }, + { + "epoch": 123.20338983050847, + "grad_norm": 3.470969645036348e-08, + "learning_rate": 0.08849841162024165, + "loss": 0.0, + "num_input_tokens_seen": 43489696, + "step": 25380 + }, + { + "epoch": 123.227602905569, + "grad_norm": 1.4553966920516359e-08, + "learning_rate": 0.08844469037211973, + "loss": 0.0, + "num_input_tokens_seen": 43498304, + "step": 25385 + }, + { + "epoch": 123.25181598062954, + "grad_norm": 2.4016291177986204e-08, + "learning_rate": 0.08839097861660014, + "loss": 0.0, + "num_input_tokens_seen": 43507264, + "step": 25390 + }, + { + "epoch": 123.27602905569007, + "grad_norm": 2.9094328723999752e-08, + "learning_rate": 0.08833727636196585, + "loss": 0.0, + "num_input_tokens_seen": 43515776, + "step": 25395 + }, + { + "epoch": 123.3002421307506, + "grad_norm": 2.474808802332973e-08, + "learning_rate": 0.08828358361649848, + "loss": 0.0, + "num_input_tokens_seen": 43524256, + "step": 25400 + }, + { + "epoch": 123.3002421307506, + "eval_loss": 1.0958924293518066, + "eval_runtime": 4.6155, + "eval_samples_per_second": 79.514, + "eval_steps_per_second": 19.933, + "num_input_tokens_seen": 43524256, + "step": 25400 + }, + { + "epoch": 123.32445520581113, + "grad_norm": 4.4580040992059367e-08, + "learning_rate": 0.08822990038847807, + "loss": 0.0, + "num_input_tokens_seen": 43532864, + "step": 25405 + }, + { + "epoch": 123.34866828087166, + "grad_norm": 2.1395692328951554e-08, + "learning_rate": 0.08817622668618325, + "loss": 0.0, + "num_input_tokens_seen": 43541760, + "step": 25410 + }, + { + "epoch": 123.37288135593221, + "grad_norm": 1.506161773079384e-08, + "learning_rate": 0.08812256251789125, + "loss": 0.0, + "num_input_tokens_seen": 43550208, + "step": 25415 + }, + { + "epoch": 123.39709443099274, + "grad_norm": 3.5110865326259955e-08, + "learning_rate": 0.08806890789187766, + "loss": 0.0, + "num_input_tokens_seen": 43558784, + "step": 25420 + }, + { + "epoch": 123.42130750605327, + "grad_norm": 2.6295749577798233e-08, + "learning_rate": 0.08801526281641672, + "loss": 0.0, + "num_input_tokens_seen": 43567072, + "step": 25425 + }, + { + "epoch": 123.4455205811138, + "grad_norm": 3.7897500249073346e-08, + "learning_rate": 0.0879616272997813, + "loss": 0.0, + "num_input_tokens_seen": 43575584, + "step": 25430 + }, + { + "epoch": 123.46973365617433, + "grad_norm": 1.1744591965623385e-08, + "learning_rate": 0.08790800135024247, + "loss": 0.0, + "num_input_tokens_seen": 43583808, + "step": 25435 + }, + { + "epoch": 123.49394673123487, + "grad_norm": 4.318888713328306e-08, + "learning_rate": 0.08785438497607023, + "loss": 0.0, + "num_input_tokens_seen": 43592512, + "step": 25440 + }, + { + "epoch": 123.5181598062954, + "grad_norm": 2.5484586885227145e-08, + "learning_rate": 0.08780077818553277, + "loss": 0.0, + "num_input_tokens_seen": 43601376, + "step": 25445 + }, + { + "epoch": 123.54237288135593, + "grad_norm": 4.348068927129134e-08, + "learning_rate": 0.0877471809868969, + "loss": 0.0, + "num_input_tokens_seen": 43610048, + "step": 25450 + }, + { + "epoch": 123.56658595641646, + "grad_norm": 4.353003646428988e-08, + "learning_rate": 0.08769359338842811, + "loss": 0.0, + "num_input_tokens_seen": 43618848, + "step": 25455 + }, + { + "epoch": 123.59079903147699, + "grad_norm": 4.142145115793028e-08, + "learning_rate": 0.08764001539839016, + "loss": 0.0, + "num_input_tokens_seen": 43627232, + "step": 25460 + }, + { + "epoch": 123.61501210653753, + "grad_norm": 3.127124870161424e-08, + "learning_rate": 0.08758644702504548, + "loss": 0.0, + "num_input_tokens_seen": 43635968, + "step": 25465 + }, + { + "epoch": 123.63922518159806, + "grad_norm": 8.987029787022038e-08, + "learning_rate": 0.0875328882766551, + "loss": 0.0, + "num_input_tokens_seen": 43644960, + "step": 25470 + }, + { + "epoch": 123.6634382566586, + "grad_norm": 3.8790791023757265e-08, + "learning_rate": 0.08747933916147828, + "loss": 0.0, + "num_input_tokens_seen": 43653632, + "step": 25475 + }, + { + "epoch": 123.68765133171912, + "grad_norm": 5.982744255561556e-08, + "learning_rate": 0.0874257996877731, + "loss": 0.0, + "num_input_tokens_seen": 43662784, + "step": 25480 + }, + { + "epoch": 123.71186440677967, + "grad_norm": 2.0514210774535968e-08, + "learning_rate": 0.08737226986379593, + "loss": 0.0, + "num_input_tokens_seen": 43671360, + "step": 25485 + }, + { + "epoch": 123.7360774818402, + "grad_norm": 2.0379845366846894e-08, + "learning_rate": 0.08731874969780173, + "loss": 0.0, + "num_input_tokens_seen": 43679616, + "step": 25490 + }, + { + "epoch": 123.76029055690073, + "grad_norm": 2.2964178114648348e-08, + "learning_rate": 0.08726523919804412, + "loss": 0.0, + "num_input_tokens_seen": 43687744, + "step": 25495 + }, + { + "epoch": 123.78450363196126, + "grad_norm": 5.028735827750097e-08, + "learning_rate": 0.08721173837277492, + "loss": 0.0, + "num_input_tokens_seen": 43696128, + "step": 25500 + }, + { + "epoch": 123.80871670702179, + "grad_norm": 8.246588301119573e-09, + "learning_rate": 0.08715824723024479, + "loss": 0.0, + "num_input_tokens_seen": 43704608, + "step": 25505 + }, + { + "epoch": 123.83292978208233, + "grad_norm": 2.446596703009618e-08, + "learning_rate": 0.08710476577870258, + "loss": 0.0, + "num_input_tokens_seen": 43713184, + "step": 25510 + }, + { + "epoch": 123.85714285714286, + "grad_norm": 1.906006730223453e-08, + "learning_rate": 0.08705129402639587, + "loss": 0.0, + "num_input_tokens_seen": 43721312, + "step": 25515 + }, + { + "epoch": 123.88135593220339, + "grad_norm": 3.264059955654375e-08, + "learning_rate": 0.08699783198157078, + "loss": 0.0, + "num_input_tokens_seen": 43729728, + "step": 25520 + }, + { + "epoch": 123.90556900726392, + "grad_norm": 4.203625891818774e-08, + "learning_rate": 0.08694437965247163, + "loss": 0.0, + "num_input_tokens_seen": 43738432, + "step": 25525 + }, + { + "epoch": 123.92978208232445, + "grad_norm": 4.680791931832573e-08, + "learning_rate": 0.08689093704734165, + "loss": 0.0, + "num_input_tokens_seen": 43747072, + "step": 25530 + }, + { + "epoch": 123.953995157385, + "grad_norm": 2.9750136576467412e-08, + "learning_rate": 0.08683750417442222, + "loss": 0.0, + "num_input_tokens_seen": 43755488, + "step": 25535 + }, + { + "epoch": 123.97820823244552, + "grad_norm": 2.1176600029093606e-08, + "learning_rate": 0.08678408104195334, + "loss": 0.0, + "num_input_tokens_seen": 43763680, + "step": 25540 + }, + { + "epoch": 124.00484261501211, + "grad_norm": 1.2017109440876084e-07, + "learning_rate": 0.08673066765817365, + "loss": 0.0, + "num_input_tokens_seen": 43772672, + "step": 25545 + }, + { + "epoch": 124.02905569007264, + "grad_norm": 4.16939904823721e-08, + "learning_rate": 0.08667726403132005, + "loss": 0.0, + "num_input_tokens_seen": 43781408, + "step": 25550 + }, + { + "epoch": 124.05326876513317, + "grad_norm": 3.3777208585661356e-08, + "learning_rate": 0.0866238701696281, + "loss": 0.0, + "num_input_tokens_seen": 43789984, + "step": 25555 + }, + { + "epoch": 124.0774818401937, + "grad_norm": 6.839955091209049e-08, + "learning_rate": 0.08657048608133185, + "loss": 0.0, + "num_input_tokens_seen": 43798720, + "step": 25560 + }, + { + "epoch": 124.10169491525424, + "grad_norm": 1.744323263608294e-08, + "learning_rate": 0.08651711177466369, + "loss": 0.0, + "num_input_tokens_seen": 43807232, + "step": 25565 + }, + { + "epoch": 124.12590799031477, + "grad_norm": 1.1188731718903e-08, + "learning_rate": 0.08646374725785466, + "loss": 0.0, + "num_input_tokens_seen": 43815744, + "step": 25570 + }, + { + "epoch": 124.1501210653753, + "grad_norm": 6.456111378838614e-08, + "learning_rate": 0.08641039253913434, + "loss": 0.0, + "num_input_tokens_seen": 43824608, + "step": 25575 + }, + { + "epoch": 124.17433414043583, + "grad_norm": 2.6932575281080062e-08, + "learning_rate": 0.08635704762673052, + "loss": 0.0, + "num_input_tokens_seen": 43833152, + "step": 25580 + }, + { + "epoch": 124.19854721549636, + "grad_norm": 2.700523538123889e-08, + "learning_rate": 0.08630371252886981, + "loss": 0.0, + "num_input_tokens_seen": 43841824, + "step": 25585 + }, + { + "epoch": 124.2227602905569, + "grad_norm": 1.5037260325811985e-08, + "learning_rate": 0.08625038725377704, + "loss": 0.0, + "num_input_tokens_seen": 43850208, + "step": 25590 + }, + { + "epoch": 124.24697336561744, + "grad_norm": 1.999230292426546e-08, + "learning_rate": 0.08619707180967566, + "loss": 0.0, + "num_input_tokens_seen": 43859232, + "step": 25595 + }, + { + "epoch": 124.27118644067797, + "grad_norm": 5.181176376822805e-08, + "learning_rate": 0.08614376620478768, + "loss": 0.0, + "num_input_tokens_seen": 43867840, + "step": 25600 + }, + { + "epoch": 124.27118644067797, + "eval_loss": 1.0985852479934692, + "eval_runtime": 4.6207, + "eval_samples_per_second": 79.425, + "eval_steps_per_second": 19.91, + "num_input_tokens_seen": 43867840, + "step": 25600 + }, + { + "epoch": 124.2953995157385, + "grad_norm": 2.177206503972684e-08, + "learning_rate": 0.08609047044733344, + "loss": 0.0, + "num_input_tokens_seen": 43876224, + "step": 25605 + }, + { + "epoch": 124.31961259079903, + "grad_norm": 9.769534514703082e-09, + "learning_rate": 0.08603718454553168, + "loss": 0.0, + "num_input_tokens_seen": 43884640, + "step": 25610 + }, + { + "epoch": 124.34382566585957, + "grad_norm": 2.0553679647150602e-08, + "learning_rate": 0.08598390850759997, + "loss": 0.0, + "num_input_tokens_seen": 43893120, + "step": 25615 + }, + { + "epoch": 124.3680387409201, + "grad_norm": 1.0667069894054748e-08, + "learning_rate": 0.08593064234175397, + "loss": 0.0, + "num_input_tokens_seen": 43901728, + "step": 25620 + }, + { + "epoch": 124.39225181598063, + "grad_norm": 2.2766286633668642e-08, + "learning_rate": 0.08587738605620815, + "loss": 0.0, + "num_input_tokens_seen": 43909728, + "step": 25625 + }, + { + "epoch": 124.41646489104116, + "grad_norm": 3.3942040289502984e-08, + "learning_rate": 0.08582413965917512, + "loss": 0.0, + "num_input_tokens_seen": 43917920, + "step": 25630 + }, + { + "epoch": 124.44067796610169, + "grad_norm": 3.4772828172435766e-08, + "learning_rate": 0.08577090315886628, + "loss": 0.0, + "num_input_tokens_seen": 43926720, + "step": 25635 + }, + { + "epoch": 124.46489104116223, + "grad_norm": 2.6302380717879714e-08, + "learning_rate": 0.08571767656349136, + "loss": 0.0, + "num_input_tokens_seen": 43934912, + "step": 25640 + }, + { + "epoch": 124.48910411622276, + "grad_norm": 2.3036577090351784e-08, + "learning_rate": 0.08566445988125847, + "loss": 0.0, + "num_input_tokens_seen": 43943488, + "step": 25645 + }, + { + "epoch": 124.51331719128329, + "grad_norm": 1.2090066725534143e-08, + "learning_rate": 0.08561125312037436, + "loss": 0.0, + "num_input_tokens_seen": 43951776, + "step": 25650 + }, + { + "epoch": 124.53753026634382, + "grad_norm": 2.4717186519751522e-08, + "learning_rate": 0.08555805628904424, + "loss": 0.0, + "num_input_tokens_seen": 43960288, + "step": 25655 + }, + { + "epoch": 124.56174334140435, + "grad_norm": 4.134257380883355e-08, + "learning_rate": 0.08550486939547161, + "loss": 0.0, + "num_input_tokens_seen": 43968768, + "step": 25660 + }, + { + "epoch": 124.5859564164649, + "grad_norm": 1.6948906278457798e-08, + "learning_rate": 0.08545169244785869, + "loss": 0.0, + "num_input_tokens_seen": 43976928, + "step": 25665 + }, + { + "epoch": 124.61016949152543, + "grad_norm": 1.0572238196004946e-08, + "learning_rate": 0.08539852545440589, + "loss": 0.0, + "num_input_tokens_seen": 43985472, + "step": 25670 + }, + { + "epoch": 124.63438256658596, + "grad_norm": 1.420076500835421e-08, + "learning_rate": 0.08534536842331235, + "loss": 0.0, + "num_input_tokens_seen": 43994080, + "step": 25675 + }, + { + "epoch": 124.65859564164649, + "grad_norm": 2.0375630072066997e-08, + "learning_rate": 0.08529222136277545, + "loss": 0.0, + "num_input_tokens_seen": 44002656, + "step": 25680 + }, + { + "epoch": 124.68280871670702, + "grad_norm": 3.90639485203792e-08, + "learning_rate": 0.08523908428099125, + "loss": 0.0, + "num_input_tokens_seen": 44011104, + "step": 25685 + }, + { + "epoch": 124.70702179176756, + "grad_norm": 4.546636489521916e-08, + "learning_rate": 0.08518595718615402, + "loss": 0.0, + "num_input_tokens_seen": 44019264, + "step": 25690 + }, + { + "epoch": 124.73123486682809, + "grad_norm": 2.5314992768699085e-08, + "learning_rate": 0.08513284008645675, + "loss": 0.0, + "num_input_tokens_seen": 44027840, + "step": 25695 + }, + { + "epoch": 124.75544794188862, + "grad_norm": 1.5333974090481206e-08, + "learning_rate": 0.08507973299009065, + "loss": 0.0, + "num_input_tokens_seen": 44036544, + "step": 25700 + }, + { + "epoch": 124.77966101694915, + "grad_norm": 4.988309854070394e-08, + "learning_rate": 0.08502663590524563, + "loss": 0.0, + "num_input_tokens_seen": 44044864, + "step": 25705 + }, + { + "epoch": 124.80387409200968, + "grad_norm": 4.7571493411169286e-08, + "learning_rate": 0.08497354884010981, + "loss": 0.0, + "num_input_tokens_seen": 44053792, + "step": 25710 + }, + { + "epoch": 124.82808716707022, + "grad_norm": 2.7653475953570705e-08, + "learning_rate": 0.0849204718028699, + "loss": 0.0, + "num_input_tokens_seen": 44062560, + "step": 25715 + }, + { + "epoch": 124.85230024213075, + "grad_norm": 1.7183502620810032e-08, + "learning_rate": 0.08486740480171118, + "loss": 0.0, + "num_input_tokens_seen": 44070944, + "step": 25720 + }, + { + "epoch": 124.87651331719128, + "grad_norm": 2.296751588914958e-08, + "learning_rate": 0.08481434784481706, + "loss": 0.0, + "num_input_tokens_seen": 44079392, + "step": 25725 + }, + { + "epoch": 124.90072639225181, + "grad_norm": 4.2744357386936827e-08, + "learning_rate": 0.08476130094036968, + "loss": 0.0, + "num_input_tokens_seen": 44088192, + "step": 25730 + }, + { + "epoch": 124.92493946731234, + "grad_norm": 2.2487768092105398e-08, + "learning_rate": 0.08470826409654961, + "loss": 0.0, + "num_input_tokens_seen": 44096928, + "step": 25735 + }, + { + "epoch": 124.94915254237289, + "grad_norm": 1.79555126322839e-08, + "learning_rate": 0.08465523732153564, + "loss": 0.0, + "num_input_tokens_seen": 44105632, + "step": 25740 + }, + { + "epoch": 124.97336561743342, + "grad_norm": 2.4715362201277458e-08, + "learning_rate": 0.08460222062350532, + "loss": 0.0, + "num_input_tokens_seen": 44114144, + "step": 25745 + }, + { + "epoch": 124.99757869249395, + "grad_norm": 5.013135861986484e-08, + "learning_rate": 0.08454921401063442, + "loss": 0.0, + "num_input_tokens_seen": 44122912, + "step": 25750 + }, + { + "epoch": 125.02421307506053, + "grad_norm": 1.4596128750099524e-08, + "learning_rate": 0.08449621749109716, + "loss": 0.0, + "num_input_tokens_seen": 44131904, + "step": 25755 + }, + { + "epoch": 125.04842615012106, + "grad_norm": 6.611011116319787e-08, + "learning_rate": 0.08444323107306641, + "loss": 0.0, + "num_input_tokens_seen": 44140288, + "step": 25760 + }, + { + "epoch": 125.0726392251816, + "grad_norm": 3.663048175894801e-08, + "learning_rate": 0.0843902547647132, + "loss": 0.0, + "num_input_tokens_seen": 44148576, + "step": 25765 + }, + { + "epoch": 125.09685230024213, + "grad_norm": 3.1029316005515284e-08, + "learning_rate": 0.0843372885742072, + "loss": 0.0, + "num_input_tokens_seen": 44156992, + "step": 25770 + }, + { + "epoch": 125.12106537530266, + "grad_norm": 1.2275275018680532e-08, + "learning_rate": 0.08428433250971652, + "loss": 0.0, + "num_input_tokens_seen": 44165664, + "step": 25775 + }, + { + "epoch": 125.1452784503632, + "grad_norm": 1.3301841406132553e-08, + "learning_rate": 0.08423138657940757, + "loss": 0.0, + "num_input_tokens_seen": 44174048, + "step": 25780 + }, + { + "epoch": 125.16949152542372, + "grad_norm": 4.29803712620469e-08, + "learning_rate": 0.08417845079144536, + "loss": 0.0, + "num_input_tokens_seen": 44182496, + "step": 25785 + }, + { + "epoch": 125.19370460048427, + "grad_norm": 2.9837380566277716e-08, + "learning_rate": 0.08412552515399314, + "loss": 0.0, + "num_input_tokens_seen": 44190688, + "step": 25790 + }, + { + "epoch": 125.2179176755448, + "grad_norm": 2.1744646971910697e-08, + "learning_rate": 0.08407260967521278, + "loss": 0.0, + "num_input_tokens_seen": 44199072, + "step": 25795 + }, + { + "epoch": 125.24213075060533, + "grad_norm": 2.168341772801341e-08, + "learning_rate": 0.08401970436326454, + "loss": 0.0, + "num_input_tokens_seen": 44207680, + "step": 25800 + }, + { + "epoch": 125.24213075060533, + "eval_loss": 1.1098402738571167, + "eval_runtime": 4.6161, + "eval_samples_per_second": 79.504, + "eval_steps_per_second": 19.93, + "num_input_tokens_seen": 44207680, + "step": 25800 + }, + { + "epoch": 125.26634382566586, + "grad_norm": 2.790091890858548e-08, + "learning_rate": 0.08396680922630702, + "loss": 0.0, + "num_input_tokens_seen": 44216224, + "step": 25805 + }, + { + "epoch": 125.29055690072639, + "grad_norm": 2.509784735593712e-08, + "learning_rate": 0.08391392427249732, + "loss": 0.0, + "num_input_tokens_seen": 44224704, + "step": 25810 + }, + { + "epoch": 125.31476997578693, + "grad_norm": 2.3717857899896444e-08, + "learning_rate": 0.08386104950999107, + "loss": 0.0, + "num_input_tokens_seen": 44233120, + "step": 25815 + }, + { + "epoch": 125.33898305084746, + "grad_norm": 1.7194064838577106e-08, + "learning_rate": 0.0838081849469421, + "loss": 0.0, + "num_input_tokens_seen": 44241664, + "step": 25820 + }, + { + "epoch": 125.36319612590799, + "grad_norm": 2.5441584838858944e-08, + "learning_rate": 0.08375533059150281, + "loss": 0.0, + "num_input_tokens_seen": 44250080, + "step": 25825 + }, + { + "epoch": 125.38740920096852, + "grad_norm": 8.625677949680721e-09, + "learning_rate": 0.08370248645182406, + "loss": 0.0, + "num_input_tokens_seen": 44258528, + "step": 25830 + }, + { + "epoch": 125.41162227602905, + "grad_norm": 2.7570068894533506e-08, + "learning_rate": 0.083649652536055, + "loss": 0.0, + "num_input_tokens_seen": 44267008, + "step": 25835 + }, + { + "epoch": 125.4358353510896, + "grad_norm": 2.6973237865490773e-08, + "learning_rate": 0.08359682885234339, + "loss": 0.0, + "num_input_tokens_seen": 44275488, + "step": 25840 + }, + { + "epoch": 125.46004842615012, + "grad_norm": 2.5578055229402707e-08, + "learning_rate": 0.08354401540883516, + "loss": 0.0, + "num_input_tokens_seen": 44284256, + "step": 25845 + }, + { + "epoch": 125.48426150121065, + "grad_norm": 1.9437811360489832e-08, + "learning_rate": 0.0834912122136749, + "loss": 0.0, + "num_input_tokens_seen": 44293248, + "step": 25850 + }, + { + "epoch": 125.50847457627118, + "grad_norm": 2.8486663694593517e-08, + "learning_rate": 0.0834384192750056, + "loss": 0.0, + "num_input_tokens_seen": 44301920, + "step": 25855 + }, + { + "epoch": 125.53268765133171, + "grad_norm": 4.027550559726478e-08, + "learning_rate": 0.08338563660096844, + "loss": 0.0, + "num_input_tokens_seen": 44310464, + "step": 25860 + }, + { + "epoch": 125.55690072639226, + "grad_norm": 4.058829006226006e-08, + "learning_rate": 0.08333286419970329, + "loss": 0.0, + "num_input_tokens_seen": 44319008, + "step": 25865 + }, + { + "epoch": 125.58111380145279, + "grad_norm": 2.075118565869616e-08, + "learning_rate": 0.08328010207934824, + "loss": 0.0, + "num_input_tokens_seen": 44327648, + "step": 25870 + }, + { + "epoch": 125.60532687651332, + "grad_norm": 2.369067964025362e-08, + "learning_rate": 0.08322735024803989, + "loss": 0.0, + "num_input_tokens_seen": 44336224, + "step": 25875 + }, + { + "epoch": 125.62953995157385, + "grad_norm": 3.124802105958224e-08, + "learning_rate": 0.08317460871391331, + "loss": 0.0, + "num_input_tokens_seen": 44345056, + "step": 25880 + }, + { + "epoch": 125.65375302663438, + "grad_norm": 2.603641746645735e-08, + "learning_rate": 0.08312187748510179, + "loss": 0.0, + "num_input_tokens_seen": 44353664, + "step": 25885 + }, + { + "epoch": 125.67796610169492, + "grad_norm": 2.3333468490704945e-08, + "learning_rate": 0.08306915656973726, + "loss": 0.0, + "num_input_tokens_seen": 44362688, + "step": 25890 + }, + { + "epoch": 125.70217917675545, + "grad_norm": 2.48853613271649e-08, + "learning_rate": 0.08301644597594988, + "loss": 0.0, + "num_input_tokens_seen": 44371104, + "step": 25895 + }, + { + "epoch": 125.72639225181598, + "grad_norm": 9.19521259135081e-09, + "learning_rate": 0.08296374571186826, + "loss": 0.0, + "num_input_tokens_seen": 44379744, + "step": 25900 + }, + { + "epoch": 125.75060532687651, + "grad_norm": 2.8114449435179267e-08, + "learning_rate": 0.08291105578561955, + "loss": 0.0, + "num_input_tokens_seen": 44388704, + "step": 25905 + }, + { + "epoch": 125.77481840193704, + "grad_norm": 7.6786783154148e-08, + "learning_rate": 0.08285837620532904, + "loss": 0.0, + "num_input_tokens_seen": 44397280, + "step": 25910 + }, + { + "epoch": 125.79903147699758, + "grad_norm": 7.28409066397262e-09, + "learning_rate": 0.0828057069791207, + "loss": 0.0, + "num_input_tokens_seen": 44405728, + "step": 25915 + }, + { + "epoch": 125.82324455205811, + "grad_norm": 4.076412807307861e-08, + "learning_rate": 0.0827530481151168, + "loss": 0.0, + "num_input_tokens_seen": 44414592, + "step": 25920 + }, + { + "epoch": 125.84745762711864, + "grad_norm": 7.021974113285978e-08, + "learning_rate": 0.08270039962143792, + "loss": 0.0, + "num_input_tokens_seen": 44422720, + "step": 25925 + }, + { + "epoch": 125.87167070217917, + "grad_norm": 1.9618235924667715e-08, + "learning_rate": 0.08264776150620314, + "loss": 0.0, + "num_input_tokens_seen": 44431456, + "step": 25930 + }, + { + "epoch": 125.8958837772397, + "grad_norm": 8.265841344723412e-09, + "learning_rate": 0.08259513377753, + "loss": 0.0, + "num_input_tokens_seen": 44440000, + "step": 25935 + }, + { + "epoch": 125.92009685230025, + "grad_norm": 2.632208939701286e-08, + "learning_rate": 0.08254251644353423, + "loss": 0.0, + "num_input_tokens_seen": 44448640, + "step": 25940 + }, + { + "epoch": 125.94430992736078, + "grad_norm": 1.9505872472791452e-08, + "learning_rate": 0.08248990951233022, + "loss": 0.0, + "num_input_tokens_seen": 44457408, + "step": 25945 + }, + { + "epoch": 125.9685230024213, + "grad_norm": 3.4459112896456645e-08, + "learning_rate": 0.08243731299203048, + "loss": 0.0, + "num_input_tokens_seen": 44465824, + "step": 25950 + }, + { + "epoch": 125.99273607748184, + "grad_norm": 2.626160444663128e-08, + "learning_rate": 0.08238472689074612, + "loss": 0.0, + "num_input_tokens_seen": 44474368, + "step": 25955 + }, + { + "epoch": 126.01937046004842, + "grad_norm": 2.1947625938878446e-08, + "learning_rate": 0.08233215121658666, + "loss": 0.0, + "num_input_tokens_seen": 44483136, + "step": 25960 + }, + { + "epoch": 126.04358353510897, + "grad_norm": 1.9873606760256735e-08, + "learning_rate": 0.08227958597765982, + "loss": 0.0, + "num_input_tokens_seen": 44491552, + "step": 25965 + }, + { + "epoch": 126.0677966101695, + "grad_norm": 4.223911176381989e-08, + "learning_rate": 0.08222703118207181, + "loss": 0.0, + "num_input_tokens_seen": 44499936, + "step": 25970 + }, + { + "epoch": 126.09200968523002, + "grad_norm": 5.3148568213146063e-08, + "learning_rate": 0.08217448683792734, + "loss": 0.0, + "num_input_tokens_seen": 44508384, + "step": 25975 + }, + { + "epoch": 126.11622276029055, + "grad_norm": 1.964437679191633e-08, + "learning_rate": 0.08212195295332926, + "loss": 0.0, + "num_input_tokens_seen": 44516992, + "step": 25980 + }, + { + "epoch": 126.14043583535108, + "grad_norm": 1.8426275616434395e-08, + "learning_rate": 0.08206942953637915, + "loss": 0.0, + "num_input_tokens_seen": 44525472, + "step": 25985 + }, + { + "epoch": 126.16464891041163, + "grad_norm": 4.050923507747939e-08, + "learning_rate": 0.08201691659517658, + "loss": 0.0, + "num_input_tokens_seen": 44534016, + "step": 25990 + }, + { + "epoch": 126.18886198547216, + "grad_norm": 5.707035910518243e-08, + "learning_rate": 0.08196441413781981, + "loss": 0.0, + "num_input_tokens_seen": 44542816, + "step": 25995 + }, + { + "epoch": 126.21307506053269, + "grad_norm": 3.6318255070000305e-08, + "learning_rate": 0.08191192217240544, + "loss": 0.0, + "num_input_tokens_seen": 44551232, + "step": 26000 + }, + { + "epoch": 126.21307506053269, + "eval_loss": 1.1184113025665283, + "eval_runtime": 4.6194, + "eval_samples_per_second": 79.448, + "eval_steps_per_second": 19.916, + "num_input_tokens_seen": 44551232, + "step": 26000 + }, + { + "epoch": 126.23728813559322, + "grad_norm": 2.150230571373868e-08, + "learning_rate": 0.08185944070702823, + "loss": 0.0, + "num_input_tokens_seen": 44559904, + "step": 26005 + }, + { + "epoch": 126.26150121065375, + "grad_norm": 3.81221383349839e-08, + "learning_rate": 0.08180696974978159, + "loss": 0.0, + "num_input_tokens_seen": 44568384, + "step": 26010 + }, + { + "epoch": 126.28571428571429, + "grad_norm": 8.8425444744189e-09, + "learning_rate": 0.08175450930875724, + "loss": 0.0, + "num_input_tokens_seen": 44576896, + "step": 26015 + }, + { + "epoch": 126.30992736077482, + "grad_norm": 2.297291246122768e-08, + "learning_rate": 0.08170205939204513, + "loss": 0.0, + "num_input_tokens_seen": 44585536, + "step": 26020 + }, + { + "epoch": 126.33414043583535, + "grad_norm": 2.8572262777970536e-08, + "learning_rate": 0.08164962000773379, + "loss": 0.0, + "num_input_tokens_seen": 44594400, + "step": 26025 + }, + { + "epoch": 126.35835351089588, + "grad_norm": 1.2396689008653539e-08, + "learning_rate": 0.08159719116390995, + "loss": 0.0, + "num_input_tokens_seen": 44602912, + "step": 26030 + }, + { + "epoch": 126.38256658595641, + "grad_norm": 3.4787703384608903e-08, + "learning_rate": 0.08154477286865887, + "loss": 0.0, + "num_input_tokens_seen": 44611104, + "step": 26035 + }, + { + "epoch": 126.40677966101696, + "grad_norm": 6.501650773316214e-08, + "learning_rate": 0.08149236513006404, + "loss": 0.0, + "num_input_tokens_seen": 44619616, + "step": 26040 + }, + { + "epoch": 126.43099273607749, + "grad_norm": 3.775080870127567e-08, + "learning_rate": 0.08143996795620746, + "loss": 0.0, + "num_input_tokens_seen": 44628096, + "step": 26045 + }, + { + "epoch": 126.45520581113801, + "grad_norm": 3.9129769646706336e-08, + "learning_rate": 0.08138758135516938, + "loss": 0.0, + "num_input_tokens_seen": 44636544, + "step": 26050 + }, + { + "epoch": 126.47941888619854, + "grad_norm": 3.455197372659313e-08, + "learning_rate": 0.08133520533502851, + "loss": 0.0, + "num_input_tokens_seen": 44645024, + "step": 26055 + }, + { + "epoch": 126.50363196125907, + "grad_norm": 3.210100274486649e-08, + "learning_rate": 0.08128283990386184, + "loss": 0.0, + "num_input_tokens_seen": 44653824, + "step": 26060 + }, + { + "epoch": 126.52784503631962, + "grad_norm": 3.259228975593942e-08, + "learning_rate": 0.08123048506974488, + "loss": 0.0, + "num_input_tokens_seen": 44662208, + "step": 26065 + }, + { + "epoch": 126.55205811138015, + "grad_norm": 1.7505168870002308e-08, + "learning_rate": 0.08117814084075124, + "loss": 0.0, + "num_input_tokens_seen": 44670880, + "step": 26070 + }, + { + "epoch": 126.57627118644068, + "grad_norm": 4.51217196939524e-08, + "learning_rate": 0.08112580722495318, + "loss": 0.0, + "num_input_tokens_seen": 44679648, + "step": 26075 + }, + { + "epoch": 126.60048426150121, + "grad_norm": 2.931922971072254e-08, + "learning_rate": 0.08107348423042122, + "loss": 0.0, + "num_input_tokens_seen": 44687936, + "step": 26080 + }, + { + "epoch": 126.62469733656174, + "grad_norm": 2.5075850729194826e-08, + "learning_rate": 0.08102117186522413, + "loss": 0.0, + "num_input_tokens_seen": 44696448, + "step": 26085 + }, + { + "epoch": 126.64891041162228, + "grad_norm": 2.3805345250593746e-08, + "learning_rate": 0.08096887013742916, + "loss": 0.0, + "num_input_tokens_seen": 44705152, + "step": 26090 + }, + { + "epoch": 126.67312348668281, + "grad_norm": 1.2560578355191865e-08, + "learning_rate": 0.08091657905510198, + "loss": 0.0, + "num_input_tokens_seen": 44713248, + "step": 26095 + }, + { + "epoch": 126.69733656174334, + "grad_norm": 1.897313950394164e-08, + "learning_rate": 0.08086429862630642, + "loss": 0.0, + "num_input_tokens_seen": 44721856, + "step": 26100 + }, + { + "epoch": 126.72154963680387, + "grad_norm": 1.6621543252881565e-08, + "learning_rate": 0.08081202885910488, + "loss": 0.0, + "num_input_tokens_seen": 44730304, + "step": 26105 + }, + { + "epoch": 126.7457627118644, + "grad_norm": 1.4543972248759474e-08, + "learning_rate": 0.08075976976155795, + "loss": 0.0, + "num_input_tokens_seen": 44738688, + "step": 26110 + }, + { + "epoch": 126.76997578692495, + "grad_norm": 2.2530405985321522e-08, + "learning_rate": 0.08070752134172461, + "loss": 0.0, + "num_input_tokens_seen": 44747616, + "step": 26115 + }, + { + "epoch": 126.79418886198548, + "grad_norm": 2.7163464366708467e-08, + "learning_rate": 0.08065528360766229, + "loss": 0.0, + "num_input_tokens_seen": 44756128, + "step": 26120 + }, + { + "epoch": 126.818401937046, + "grad_norm": 4.4294527157262564e-08, + "learning_rate": 0.08060305656742664, + "loss": 0.0, + "num_input_tokens_seen": 44764736, + "step": 26125 + }, + { + "epoch": 126.84261501210653, + "grad_norm": 2.0390356070265625e-08, + "learning_rate": 0.08055084022907182, + "loss": 0.0, + "num_input_tokens_seen": 44773824, + "step": 26130 + }, + { + "epoch": 126.86682808716706, + "grad_norm": 5.7863736913077446e-08, + "learning_rate": 0.08049863460065014, + "loss": 0.0, + "num_input_tokens_seen": 44782272, + "step": 26135 + }, + { + "epoch": 126.89104116222761, + "grad_norm": 3.4992538644473825e-08, + "learning_rate": 0.0804464396902124, + "loss": 0.0, + "num_input_tokens_seen": 44790656, + "step": 26140 + }, + { + "epoch": 126.91525423728814, + "grad_norm": 2.582950742180401e-08, + "learning_rate": 0.08039425550580777, + "loss": 0.0, + "num_input_tokens_seen": 44798976, + "step": 26145 + }, + { + "epoch": 126.93946731234867, + "grad_norm": 2.8911577132362254e-08, + "learning_rate": 0.08034208205548363, + "loss": 0.0, + "num_input_tokens_seen": 44807648, + "step": 26150 + }, + { + "epoch": 126.9636803874092, + "grad_norm": 2.0114757859346355e-08, + "learning_rate": 0.08028991934728581, + "loss": 0.0, + "num_input_tokens_seen": 44816480, + "step": 26155 + }, + { + "epoch": 126.98789346246973, + "grad_norm": 1.2156963435927537e-08, + "learning_rate": 0.0802377673892585, + "loss": 0.0, + "num_input_tokens_seen": 44825184, + "step": 26160 + }, + { + "epoch": 127.01452784503633, + "grad_norm": 2.5842666673270287e-08, + "learning_rate": 0.0801856261894441, + "loss": 0.0, + "num_input_tokens_seen": 44834208, + "step": 26165 + }, + { + "epoch": 127.03874092009686, + "grad_norm": 1.6941363867317705e-08, + "learning_rate": 0.08013349575588354, + "loss": 0.0, + "num_input_tokens_seen": 44842528, + "step": 26170 + }, + { + "epoch": 127.06295399515739, + "grad_norm": 2.4395765407803083e-08, + "learning_rate": 0.08008137609661586, + "loss": 0.0, + "num_input_tokens_seen": 44851040, + "step": 26175 + }, + { + "epoch": 127.08716707021792, + "grad_norm": 4.248284213304032e-08, + "learning_rate": 0.08002926721967872, + "loss": 0.0, + "num_input_tokens_seen": 44860160, + "step": 26180 + }, + { + "epoch": 127.11138014527845, + "grad_norm": 1.8253610178931012e-08, + "learning_rate": 0.07997716913310782, + "loss": 0.0, + "num_input_tokens_seen": 44868960, + "step": 26185 + }, + { + "epoch": 127.13559322033899, + "grad_norm": 3.7786346496204715e-08, + "learning_rate": 0.07992508184493745, + "loss": 0.0, + "num_input_tokens_seen": 44877536, + "step": 26190 + }, + { + "epoch": 127.15980629539952, + "grad_norm": 1.8856633587915894e-08, + "learning_rate": 0.07987300536320001, + "loss": 0.0, + "num_input_tokens_seen": 44886144, + "step": 26195 + }, + { + "epoch": 127.18401937046005, + "grad_norm": 3.660550973449972e-08, + "learning_rate": 0.07982093969592649, + "loss": 0.0, + "num_input_tokens_seen": 44894816, + "step": 26200 + }, + { + "epoch": 127.18401937046005, + "eval_loss": 1.1155551671981812, + "eval_runtime": 4.6339, + "eval_samples_per_second": 79.199, + "eval_steps_per_second": 19.854, + "num_input_tokens_seen": 44894816, + "step": 26200 + }, + { + "epoch": 127.20823244552058, + "grad_norm": 8.858435762704175e-09, + "learning_rate": 0.07976888485114592, + "loss": 0.0, + "num_input_tokens_seen": 44903488, + "step": 26205 + }, + { + "epoch": 127.23244552058111, + "grad_norm": 2.1663648652747725e-08, + "learning_rate": 0.07971684083688595, + "loss": 0.0, + "num_input_tokens_seen": 44911904, + "step": 26210 + }, + { + "epoch": 127.25665859564165, + "grad_norm": 3.578584184538158e-08, + "learning_rate": 0.0796648076611723, + "loss": 0.0, + "num_input_tokens_seen": 44920608, + "step": 26215 + }, + { + "epoch": 127.28087167070218, + "grad_norm": 2.6295268185094756e-08, + "learning_rate": 0.07961278533202922, + "loss": 0.0, + "num_input_tokens_seen": 44929184, + "step": 26220 + }, + { + "epoch": 127.30508474576271, + "grad_norm": 2.6527361640660274e-08, + "learning_rate": 0.07956077385747919, + "loss": 0.0, + "num_input_tokens_seen": 44937440, + "step": 26225 + }, + { + "epoch": 127.32929782082324, + "grad_norm": 3.5267341047529044e-08, + "learning_rate": 0.079508773245543, + "loss": 0.0, + "num_input_tokens_seen": 44946016, + "step": 26230 + }, + { + "epoch": 127.35351089588377, + "grad_norm": 3.073368404216126e-08, + "learning_rate": 0.07945678350423982, + "loss": 0.0, + "num_input_tokens_seen": 44954560, + "step": 26235 + }, + { + "epoch": 127.37772397094432, + "grad_norm": 3.469491005603231e-08, + "learning_rate": 0.07940480464158717, + "loss": 0.0, + "num_input_tokens_seen": 44963072, + "step": 26240 + }, + { + "epoch": 127.40193704600485, + "grad_norm": 1.902982660340058e-08, + "learning_rate": 0.07935283666560076, + "loss": 0.0, + "num_input_tokens_seen": 44971424, + "step": 26245 + }, + { + "epoch": 127.42615012106538, + "grad_norm": 3.003808402013419e-08, + "learning_rate": 0.07930087958429478, + "loss": 0.0, + "num_input_tokens_seen": 44980160, + "step": 26250 + }, + { + "epoch": 127.4503631961259, + "grad_norm": 1.667025628648844e-08, + "learning_rate": 0.07924893340568159, + "loss": 0.0, + "num_input_tokens_seen": 44988704, + "step": 26255 + }, + { + "epoch": 127.47457627118644, + "grad_norm": 1.6121669332846977e-08, + "learning_rate": 0.07919699813777205, + "loss": 0.0, + "num_input_tokens_seen": 44997184, + "step": 26260 + }, + { + "epoch": 127.49878934624698, + "grad_norm": 2.7175969918857845e-08, + "learning_rate": 0.07914507378857515, + "loss": 0.0, + "num_input_tokens_seen": 45005824, + "step": 26265 + }, + { + "epoch": 127.52300242130751, + "grad_norm": 1.4989094410111647e-08, + "learning_rate": 0.07909316036609822, + "loss": 0.0, + "num_input_tokens_seen": 45014464, + "step": 26270 + }, + { + "epoch": 127.54721549636804, + "grad_norm": 3.3962297418810294e-09, + "learning_rate": 0.07904125787834704, + "loss": 0.0, + "num_input_tokens_seen": 45022912, + "step": 26275 + }, + { + "epoch": 127.57142857142857, + "grad_norm": 2.220500050498231e-08, + "learning_rate": 0.07898936633332569, + "loss": 0.0, + "num_input_tokens_seen": 45031584, + "step": 26280 + }, + { + "epoch": 127.5956416464891, + "grad_norm": 1.564477969395739e-08, + "learning_rate": 0.07893748573903635, + "loss": 0.0, + "num_input_tokens_seen": 45040192, + "step": 26285 + }, + { + "epoch": 127.61985472154964, + "grad_norm": 3.733407893946605e-08, + "learning_rate": 0.0788856161034798, + "loss": 0.0, + "num_input_tokens_seen": 45049088, + "step": 26290 + }, + { + "epoch": 127.64406779661017, + "grad_norm": 1.4732603581535386e-08, + "learning_rate": 0.07883375743465487, + "loss": 0.0, + "num_input_tokens_seen": 45057504, + "step": 26295 + }, + { + "epoch": 127.6682808716707, + "grad_norm": 1.2701382168245345e-08, + "learning_rate": 0.07878190974055888, + "loss": 0.0, + "num_input_tokens_seen": 45065856, + "step": 26300 + }, + { + "epoch": 127.69249394673123, + "grad_norm": 1.5517404250431355e-08, + "learning_rate": 0.07873007302918746, + "loss": 0.0, + "num_input_tokens_seen": 45074464, + "step": 26305 + }, + { + "epoch": 127.71670702179176, + "grad_norm": 1.4771595502338641e-08, + "learning_rate": 0.07867824730853433, + "loss": 0.0, + "num_input_tokens_seen": 45082912, + "step": 26310 + }, + { + "epoch": 127.7409200968523, + "grad_norm": 1.1719540893295743e-08, + "learning_rate": 0.07862643258659176, + "loss": 0.0, + "num_input_tokens_seen": 45091680, + "step": 26315 + }, + { + "epoch": 127.76513317191284, + "grad_norm": 1.2215350508881784e-08, + "learning_rate": 0.07857462887135026, + "loss": 0.0, + "num_input_tokens_seen": 45100480, + "step": 26320 + }, + { + "epoch": 127.78934624697337, + "grad_norm": 2.0470642070335998e-08, + "learning_rate": 0.0785228361707986, + "loss": 0.0, + "num_input_tokens_seen": 45108704, + "step": 26325 + }, + { + "epoch": 127.8135593220339, + "grad_norm": 1.893756795823265e-08, + "learning_rate": 0.07847105449292378, + "loss": 0.0, + "num_input_tokens_seen": 45117344, + "step": 26330 + }, + { + "epoch": 127.83777239709443, + "grad_norm": 1.7526302187320653e-08, + "learning_rate": 0.0784192838457113, + "loss": 0.0, + "num_input_tokens_seen": 45126080, + "step": 26335 + }, + { + "epoch": 127.86198547215497, + "grad_norm": 4.495327488029943e-08, + "learning_rate": 0.07836752423714473, + "loss": 0.0, + "num_input_tokens_seen": 45135040, + "step": 26340 + }, + { + "epoch": 127.8861985472155, + "grad_norm": 1.1687673051596903e-08, + "learning_rate": 0.07831577567520616, + "loss": 0.0, + "num_input_tokens_seen": 45143328, + "step": 26345 + }, + { + "epoch": 127.91041162227603, + "grad_norm": 1.4216266386313237e-08, + "learning_rate": 0.07826403816787579, + "loss": 0.0, + "num_input_tokens_seen": 45152000, + "step": 26350 + }, + { + "epoch": 127.93462469733656, + "grad_norm": 3.731829423259114e-08, + "learning_rate": 0.0782123117231322, + "loss": 0.0, + "num_input_tokens_seen": 45160128, + "step": 26355 + }, + { + "epoch": 127.95883777239709, + "grad_norm": 3.946177073999024e-08, + "learning_rate": 0.07816059634895237, + "loss": 0.0, + "num_input_tokens_seen": 45168704, + "step": 26360 + }, + { + "epoch": 127.98305084745763, + "grad_norm": 2.6244270756592414e-08, + "learning_rate": 0.0781088920533113, + "loss": 0.0, + "num_input_tokens_seen": 45177152, + "step": 26365 + }, + { + "epoch": 128.00968523002422, + "grad_norm": 2.8734570278743377e-08, + "learning_rate": 0.07805719884418257, + "loss": 0.0, + "num_input_tokens_seen": 45186208, + "step": 26370 + }, + { + "epoch": 128.03389830508473, + "grad_norm": 1.9778171989059956e-08, + "learning_rate": 0.07800551672953779, + "loss": 0.0, + "num_input_tokens_seen": 45194624, + "step": 26375 + }, + { + "epoch": 128.05811138014528, + "grad_norm": 2.8429290921394568e-08, + "learning_rate": 0.07795384571734709, + "loss": 0.0, + "num_input_tokens_seen": 45203040, + "step": 26380 + }, + { + "epoch": 128.08232445520582, + "grad_norm": 2.9459991779390293e-08, + "learning_rate": 0.07790218581557883, + "loss": 0.0, + "num_input_tokens_seen": 45211200, + "step": 26385 + }, + { + "epoch": 128.10653753026634, + "grad_norm": 2.3186499831240326e-08, + "learning_rate": 0.07785053703219949, + "loss": 0.0, + "num_input_tokens_seen": 45219936, + "step": 26390 + }, + { + "epoch": 128.13075060532688, + "grad_norm": 1.1866969629181767e-08, + "learning_rate": 0.07779889937517409, + "loss": 0.0, + "num_input_tokens_seen": 45228128, + "step": 26395 + }, + { + "epoch": 128.1549636803874, + "grad_norm": 2.5623966948273846e-08, + "learning_rate": 0.0777472728524657, + "loss": 0.0, + "num_input_tokens_seen": 45236928, + "step": 26400 + }, + { + "epoch": 128.1549636803874, + "eval_loss": 1.1207616329193115, + "eval_runtime": 4.6198, + "eval_samples_per_second": 79.441, + "eval_steps_per_second": 19.914, + "num_input_tokens_seen": 45236928, + "step": 26400 + }, + { + "epoch": 128.17917675544794, + "grad_norm": 2.4786372065932483e-08, + "learning_rate": 0.07769565747203584, + "loss": 0.0, + "num_input_tokens_seen": 45245088, + "step": 26405 + }, + { + "epoch": 128.20338983050848, + "grad_norm": 1.7594883772176217e-08, + "learning_rate": 0.07764405324184427, + "loss": 0.0, + "num_input_tokens_seen": 45253344, + "step": 26410 + }, + { + "epoch": 128.227602905569, + "grad_norm": 3.650524504905661e-08, + "learning_rate": 0.07759246016984889, + "loss": 0.0, + "num_input_tokens_seen": 45261728, + "step": 26415 + }, + { + "epoch": 128.25181598062954, + "grad_norm": 2.425935008432134e-08, + "learning_rate": 0.07754087826400609, + "loss": 0.0, + "num_input_tokens_seen": 45270048, + "step": 26420 + }, + { + "epoch": 128.27602905569006, + "grad_norm": 1.01038715172308e-08, + "learning_rate": 0.0774893075322705, + "loss": 0.0, + "num_input_tokens_seen": 45278656, + "step": 26425 + }, + { + "epoch": 128.3002421307506, + "grad_norm": 1.6034102046091903e-08, + "learning_rate": 0.07743774798259484, + "loss": 0.0, + "num_input_tokens_seen": 45287392, + "step": 26430 + }, + { + "epoch": 128.32445520581115, + "grad_norm": 2.4828130662513104e-08, + "learning_rate": 0.07738619962293032, + "loss": 0.0, + "num_input_tokens_seen": 45296000, + "step": 26435 + }, + { + "epoch": 128.34866828087166, + "grad_norm": 2.9697799774908162e-08, + "learning_rate": 0.0773346624612264, + "loss": 0.0, + "num_input_tokens_seen": 45305120, + "step": 26440 + }, + { + "epoch": 128.3728813559322, + "grad_norm": 2.0001319711582255e-08, + "learning_rate": 0.07728313650543066, + "loss": 0.0, + "num_input_tokens_seen": 45313568, + "step": 26445 + }, + { + "epoch": 128.39709443099272, + "grad_norm": 4.0421308966642755e-08, + "learning_rate": 0.07723162176348913, + "loss": 0.0, + "num_input_tokens_seen": 45321824, + "step": 26450 + }, + { + "epoch": 128.42130750605327, + "grad_norm": 8.221229919058715e-09, + "learning_rate": 0.07718011824334593, + "loss": 0.0, + "num_input_tokens_seen": 45330560, + "step": 26455 + }, + { + "epoch": 128.4455205811138, + "grad_norm": 1.5984241485966777e-08, + "learning_rate": 0.07712862595294363, + "loss": 0.0, + "num_input_tokens_seen": 45339232, + "step": 26460 + }, + { + "epoch": 128.46973365617433, + "grad_norm": 2.080055949704729e-08, + "learning_rate": 0.07707714490022301, + "loss": 0.0, + "num_input_tokens_seen": 45347904, + "step": 26465 + }, + { + "epoch": 128.49394673123487, + "grad_norm": 9.339252038387258e-09, + "learning_rate": 0.07702567509312298, + "loss": 0.0, + "num_input_tokens_seen": 45356480, + "step": 26470 + }, + { + "epoch": 128.5181598062954, + "grad_norm": 2.2958746015433462e-08, + "learning_rate": 0.07697421653958098, + "loss": 0.0, + "num_input_tokens_seen": 45364992, + "step": 26475 + }, + { + "epoch": 128.54237288135593, + "grad_norm": 2.954384825670786e-08, + "learning_rate": 0.07692276924753247, + "loss": 0.0, + "num_input_tokens_seen": 45373632, + "step": 26480 + }, + { + "epoch": 128.56658595641647, + "grad_norm": 1.9979381704615662e-08, + "learning_rate": 0.07687133322491124, + "loss": 0.0, + "num_input_tokens_seen": 45382432, + "step": 26485 + }, + { + "epoch": 128.590799031477, + "grad_norm": 2.0024145896968548e-08, + "learning_rate": 0.07681990847964948, + "loss": 0.0, + "num_input_tokens_seen": 45391008, + "step": 26490 + }, + { + "epoch": 128.61501210653753, + "grad_norm": 3.140431559245371e-08, + "learning_rate": 0.0767684950196774, + "loss": 0.0, + "num_input_tokens_seen": 45399488, + "step": 26495 + }, + { + "epoch": 128.63922518159805, + "grad_norm": 1.6607126340772993e-08, + "learning_rate": 0.0767170928529237, + "loss": 0.0, + "num_input_tokens_seen": 45408448, + "step": 26500 + }, + { + "epoch": 128.6634382566586, + "grad_norm": 2.7815074687964625e-08, + "learning_rate": 0.07666570198731526, + "loss": 0.0, + "num_input_tokens_seen": 45417312, + "step": 26505 + }, + { + "epoch": 128.68765133171914, + "grad_norm": 1.6283449255638516e-08, + "learning_rate": 0.07661432243077708, + "loss": 0.0, + "num_input_tokens_seen": 45426112, + "step": 26510 + }, + { + "epoch": 128.71186440677965, + "grad_norm": 5.3037137348610486e-08, + "learning_rate": 0.0765629541912326, + "loss": 0.0, + "num_input_tokens_seen": 45434688, + "step": 26515 + }, + { + "epoch": 128.7360774818402, + "grad_norm": 2.406926213893712e-08, + "learning_rate": 0.07651159727660352, + "loss": 0.0, + "num_input_tokens_seen": 45443360, + "step": 26520 + }, + { + "epoch": 128.7602905569007, + "grad_norm": 2.8667724194519906e-08, + "learning_rate": 0.07646025169480959, + "loss": 0.0, + "num_input_tokens_seen": 45451712, + "step": 26525 + }, + { + "epoch": 128.78450363196126, + "grad_norm": 2.522549813477326e-08, + "learning_rate": 0.07640891745376908, + "loss": 0.0, + "num_input_tokens_seen": 45460224, + "step": 26530 + }, + { + "epoch": 128.8087167070218, + "grad_norm": 5.029801641853737e-08, + "learning_rate": 0.07635759456139822, + "loss": 0.0, + "num_input_tokens_seen": 45468928, + "step": 26535 + }, + { + "epoch": 128.83292978208232, + "grad_norm": 9.615630069959025e-09, + "learning_rate": 0.0763062830256118, + "loss": 0.0, + "num_input_tokens_seen": 45477248, + "step": 26540 + }, + { + "epoch": 128.85714285714286, + "grad_norm": 2.7281448211624593e-08, + "learning_rate": 0.07625498285432258, + "loss": 0.0, + "num_input_tokens_seen": 45485760, + "step": 26545 + }, + { + "epoch": 128.88135593220338, + "grad_norm": 7.842500870935964e-09, + "learning_rate": 0.07620369405544176, + "loss": 0.0, + "num_input_tokens_seen": 45494336, + "step": 26550 + }, + { + "epoch": 128.90556900726392, + "grad_norm": 2.490814132727337e-08, + "learning_rate": 0.07615241663687868, + "loss": 0.0, + "num_input_tokens_seen": 45502976, + "step": 26555 + }, + { + "epoch": 128.92978208232446, + "grad_norm": 3.3684678157897e-08, + "learning_rate": 0.07610115060654106, + "loss": 0.0, + "num_input_tokens_seen": 45511872, + "step": 26560 + }, + { + "epoch": 128.95399515738498, + "grad_norm": 2.2530695531486344e-08, + "learning_rate": 0.07604989597233458, + "loss": 0.0, + "num_input_tokens_seen": 45520352, + "step": 26565 + }, + { + "epoch": 128.97820823244552, + "grad_norm": 1.0981378473218228e-08, + "learning_rate": 0.07599865274216352, + "loss": 0.0, + "num_input_tokens_seen": 45528672, + "step": 26570 + }, + { + "epoch": 129.0048426150121, + "grad_norm": 1.433712615295235e-08, + "learning_rate": 0.07594742092393013, + "loss": 0.0, + "num_input_tokens_seen": 45537440, + "step": 26575 + }, + { + "epoch": 129.02905569007265, + "grad_norm": 2.0757706664653597e-08, + "learning_rate": 0.07589620052553503, + "loss": 0.0, + "num_input_tokens_seen": 45545632, + "step": 26580 + }, + { + "epoch": 129.05326876513317, + "grad_norm": 1.3154506817159017e-08, + "learning_rate": 0.0758449915548771, + "loss": 0.0, + "num_input_tokens_seen": 45554240, + "step": 26585 + }, + { + "epoch": 129.0774818401937, + "grad_norm": 3.963714689803055e-08, + "learning_rate": 0.07579379401985332, + "loss": 0.0, + "num_input_tokens_seen": 45562720, + "step": 26590 + }, + { + "epoch": 129.10169491525423, + "grad_norm": 2.2249059483669953e-08, + "learning_rate": 0.07574260792835905, + "loss": 0.0, + "num_input_tokens_seen": 45571264, + "step": 26595 + }, + { + "epoch": 129.12590799031477, + "grad_norm": 1.2094500512205286e-08, + "learning_rate": 0.07569143328828784, + "loss": 0.0, + "num_input_tokens_seen": 45579584, + "step": 26600 + }, + { + "epoch": 129.12590799031477, + "eval_loss": 1.1199369430541992, + "eval_runtime": 4.6212, + "eval_samples_per_second": 79.416, + "eval_steps_per_second": 19.908, + "num_input_tokens_seen": 45579584, + "step": 26600 + }, + { + "epoch": 129.15012106537532, + "grad_norm": 2.771681906210688e-08, + "learning_rate": 0.0756402701075314, + "loss": 0.0, + "num_input_tokens_seen": 45588416, + "step": 26605 + }, + { + "epoch": 129.17433414043583, + "grad_norm": 5.0260869244311834e-08, + "learning_rate": 0.07558911839397982, + "loss": 0.0, + "num_input_tokens_seen": 45597280, + "step": 26610 + }, + { + "epoch": 129.19854721549638, + "grad_norm": 1.59556226009272e-08, + "learning_rate": 0.07553797815552123, + "loss": 0.0, + "num_input_tokens_seen": 45606016, + "step": 26615 + }, + { + "epoch": 129.2227602905569, + "grad_norm": 1.813751104862149e-08, + "learning_rate": 0.07548684940004222, + "loss": 0.0, + "num_input_tokens_seen": 45614560, + "step": 26620 + }, + { + "epoch": 129.24697336561744, + "grad_norm": 1.6385458323497915e-08, + "learning_rate": 0.07543573213542744, + "loss": 0.0, + "num_input_tokens_seen": 45622976, + "step": 26625 + }, + { + "epoch": 129.27118644067798, + "grad_norm": 2.518457264955032e-08, + "learning_rate": 0.0753846263695597, + "loss": 0.0, + "num_input_tokens_seen": 45631616, + "step": 26630 + }, + { + "epoch": 129.2953995157385, + "grad_norm": 2.7714868622297217e-08, + "learning_rate": 0.07533353211032029, + "loss": 0.0, + "num_input_tokens_seen": 45640160, + "step": 26635 + }, + { + "epoch": 129.31961259079904, + "grad_norm": 2.1638321356931556e-08, + "learning_rate": 0.07528244936558857, + "loss": 0.0, + "num_input_tokens_seen": 45649088, + "step": 26640 + }, + { + "epoch": 129.34382566585955, + "grad_norm": 6.295031607095325e-09, + "learning_rate": 0.07523137814324206, + "loss": 0.0, + "num_input_tokens_seen": 45657536, + "step": 26645 + }, + { + "epoch": 129.3680387409201, + "grad_norm": 2.213461947064843e-08, + "learning_rate": 0.07518031845115672, + "loss": 0.0, + "num_input_tokens_seen": 45666400, + "step": 26650 + }, + { + "epoch": 129.39225181598064, + "grad_norm": 1.949661054823082e-08, + "learning_rate": 0.07512927029720647, + "loss": 0.0, + "num_input_tokens_seen": 45674752, + "step": 26655 + }, + { + "epoch": 129.41646489104116, + "grad_norm": 1.662141357883229e-08, + "learning_rate": 0.0750782336892636, + "loss": 0.0, + "num_input_tokens_seen": 45683168, + "step": 26660 + }, + { + "epoch": 129.4406779661017, + "grad_norm": 4.711382217692517e-08, + "learning_rate": 0.0750272086351987, + "loss": 0.0, + "num_input_tokens_seen": 45691776, + "step": 26665 + }, + { + "epoch": 129.46489104116222, + "grad_norm": 4.1336750911113995e-08, + "learning_rate": 0.07497619514288031, + "loss": 0.0, + "num_input_tokens_seen": 45700288, + "step": 26670 + }, + { + "epoch": 129.48910411622276, + "grad_norm": 2.2014001288539475e-08, + "learning_rate": 0.07492519322017545, + "loss": 0.0, + "num_input_tokens_seen": 45708864, + "step": 26675 + }, + { + "epoch": 129.5133171912833, + "grad_norm": 1.5520212670594447e-08, + "learning_rate": 0.0748742028749493, + "loss": 0.0, + "num_input_tokens_seen": 45716928, + "step": 26680 + }, + { + "epoch": 129.53753026634382, + "grad_norm": 3.9087126424419694e-08, + "learning_rate": 0.0748232241150651, + "loss": 0.0, + "num_input_tokens_seen": 45725408, + "step": 26685 + }, + { + "epoch": 129.56174334140437, + "grad_norm": 6.747924885530665e-08, + "learning_rate": 0.07477225694838453, + "loss": 0.0, + "num_input_tokens_seen": 45733920, + "step": 26690 + }, + { + "epoch": 129.58595641646488, + "grad_norm": 2.5300593620158907e-08, + "learning_rate": 0.07472130138276731, + "loss": 0.0, + "num_input_tokens_seen": 45742624, + "step": 26695 + }, + { + "epoch": 129.61016949152543, + "grad_norm": 2.1693644214337837e-08, + "learning_rate": 0.07467035742607138, + "loss": 0.0, + "num_input_tokens_seen": 45750976, + "step": 26700 + }, + { + "epoch": 129.63438256658597, + "grad_norm": 2.9271694401700188e-08, + "learning_rate": 0.07461942508615303, + "loss": 0.0, + "num_input_tokens_seen": 45759520, + "step": 26705 + }, + { + "epoch": 129.65859564164649, + "grad_norm": 3.518036706395833e-08, + "learning_rate": 0.07456850437086657, + "loss": 0.0, + "num_input_tokens_seen": 45767968, + "step": 26710 + }, + { + "epoch": 129.68280871670703, + "grad_norm": 3.715897634037901e-08, + "learning_rate": 0.07451759528806468, + "loss": 0.0, + "num_input_tokens_seen": 45776736, + "step": 26715 + }, + { + "epoch": 129.70702179176754, + "grad_norm": 1.9544405205351723e-08, + "learning_rate": 0.0744666978455982, + "loss": 0.0, + "num_input_tokens_seen": 45785344, + "step": 26720 + }, + { + "epoch": 129.7312348668281, + "grad_norm": 2.6913646422599413e-08, + "learning_rate": 0.07441581205131609, + "loss": 0.0, + "num_input_tokens_seen": 45793600, + "step": 26725 + }, + { + "epoch": 129.75544794188863, + "grad_norm": 2.2044353897854307e-08, + "learning_rate": 0.07436493791306566, + "loss": 0.0, + "num_input_tokens_seen": 45802272, + "step": 26730 + }, + { + "epoch": 129.77966101694915, + "grad_norm": 1.557940976226746e-08, + "learning_rate": 0.07431407543869223, + "loss": 0.0, + "num_input_tokens_seen": 45810976, + "step": 26735 + }, + { + "epoch": 129.8038740920097, + "grad_norm": 2.3278623473288462e-08, + "learning_rate": 0.0742632246360395, + "loss": 0.0, + "num_input_tokens_seen": 45819584, + "step": 26740 + }, + { + "epoch": 129.8280871670702, + "grad_norm": 1.3759860806317192e-08, + "learning_rate": 0.07421238551294934, + "loss": 0.0, + "num_input_tokens_seen": 45827968, + "step": 26745 + }, + { + "epoch": 129.85230024213075, + "grad_norm": 2.913887975353191e-08, + "learning_rate": 0.07416155807726171, + "loss": 0.0, + "num_input_tokens_seen": 45836640, + "step": 26750 + }, + { + "epoch": 129.8765133171913, + "grad_norm": 3.268237946940644e-08, + "learning_rate": 0.07411074233681492, + "loss": 0.0, + "num_input_tokens_seen": 45845248, + "step": 26755 + }, + { + "epoch": 129.9007263922518, + "grad_norm": 1.50333026027738e-08, + "learning_rate": 0.07405993829944528, + "loss": 0.0, + "num_input_tokens_seen": 45853920, + "step": 26760 + }, + { + "epoch": 129.92493946731236, + "grad_norm": 1.0625981872181e-08, + "learning_rate": 0.07400914597298755, + "loss": 0.0, + "num_input_tokens_seen": 45862624, + "step": 26765 + }, + { + "epoch": 129.94915254237287, + "grad_norm": 1.672039751099419e-08, + "learning_rate": 0.07395836536527445, + "loss": 0.0, + "num_input_tokens_seen": 45870944, + "step": 26770 + }, + { + "epoch": 129.97336561743342, + "grad_norm": 3.157910200002334e-08, + "learning_rate": 0.07390759648413696, + "loss": 0.0, + "num_input_tokens_seen": 45879840, + "step": 26775 + }, + { + "epoch": 129.99757869249396, + "grad_norm": 2.3292130890695262e-08, + "learning_rate": 0.07385683933740435, + "loss": 0.0, + "num_input_tokens_seen": 45887968, + "step": 26780 + }, + { + "epoch": 130.02421307506054, + "grad_norm": 2.8920931427478536e-08, + "learning_rate": 0.07380609393290402, + "loss": 0.0, + "num_input_tokens_seen": 45897024, + "step": 26785 + }, + { + "epoch": 130.04842615012106, + "grad_norm": 3.947690530026193e-08, + "learning_rate": 0.07375536027846147, + "loss": 0.0, + "num_input_tokens_seen": 45905760, + "step": 26790 + }, + { + "epoch": 130.0726392251816, + "grad_norm": 2.0292644009600735e-08, + "learning_rate": 0.07370463838190057, + "loss": 0.0, + "num_input_tokens_seen": 45914784, + "step": 26795 + }, + { + "epoch": 130.09685230024212, + "grad_norm": 2.0448412740847743e-08, + "learning_rate": 0.07365392825104317, + "loss": 0.0, + "num_input_tokens_seen": 45923328, + "step": 26800 + }, + { + "epoch": 130.09685230024212, + "eval_loss": 1.1278047561645508, + "eval_runtime": 4.6232, + "eval_samples_per_second": 79.382, + "eval_steps_per_second": 19.899, + "num_input_tokens_seen": 45923328, + "step": 26800 + }, + { + "epoch": 130.12106537530266, + "grad_norm": 2.74273883604792e-08, + "learning_rate": 0.07360322989370945, + "loss": 0.0, + "num_input_tokens_seen": 45931968, + "step": 26805 + }, + { + "epoch": 130.1452784503632, + "grad_norm": 2.608191351782807e-08, + "learning_rate": 0.07355254331771781, + "loss": 0.0, + "num_input_tokens_seen": 45940448, + "step": 26810 + }, + { + "epoch": 130.16949152542372, + "grad_norm": 9.917935805958678e-09, + "learning_rate": 0.07350186853088461, + "loss": 0.0, + "num_input_tokens_seen": 45948736, + "step": 26815 + }, + { + "epoch": 130.19370460048427, + "grad_norm": 2.7124432477876326e-08, + "learning_rate": 0.07345120554102462, + "loss": 0.0, + "num_input_tokens_seen": 45956960, + "step": 26820 + }, + { + "epoch": 130.21791767554478, + "grad_norm": 1.399687565850627e-08, + "learning_rate": 0.07340055435595079, + "loss": 0.0, + "num_input_tokens_seen": 45965216, + "step": 26825 + }, + { + "epoch": 130.24213075060533, + "grad_norm": 2.548450872552621e-08, + "learning_rate": 0.07334991498347401, + "loss": 0.0, + "num_input_tokens_seen": 45973760, + "step": 26830 + }, + { + "epoch": 130.26634382566587, + "grad_norm": 5.636731525981986e-09, + "learning_rate": 0.07329928743140365, + "loss": 0.0, + "num_input_tokens_seen": 45982080, + "step": 26835 + }, + { + "epoch": 130.2905569007264, + "grad_norm": 1.6221779475245057e-08, + "learning_rate": 0.07324867170754705, + "loss": 0.0, + "num_input_tokens_seen": 45990752, + "step": 26840 + }, + { + "epoch": 130.31476997578693, + "grad_norm": 1.6431284777240762e-08, + "learning_rate": 0.07319806781970974, + "loss": 0.0, + "num_input_tokens_seen": 45999232, + "step": 26845 + }, + { + "epoch": 130.33898305084745, + "grad_norm": 3.22035802469145e-08, + "learning_rate": 0.07314747577569555, + "loss": 0.0, + "num_input_tokens_seen": 46008032, + "step": 26850 + }, + { + "epoch": 130.363196125908, + "grad_norm": 9.802056943897242e-09, + "learning_rate": 0.07309689558330636, + "loss": 0.0, + "num_input_tokens_seen": 46016672, + "step": 26855 + }, + { + "epoch": 130.38740920096853, + "grad_norm": 2.981702706961187e-08, + "learning_rate": 0.0730463272503423, + "loss": 0.0, + "num_input_tokens_seen": 46025248, + "step": 26860 + }, + { + "epoch": 130.41162227602905, + "grad_norm": 3.60981502467439e-08, + "learning_rate": 0.07299577078460168, + "loss": 0.0, + "num_input_tokens_seen": 46033728, + "step": 26865 + }, + { + "epoch": 130.4358353510896, + "grad_norm": 2.1079570089455046e-08, + "learning_rate": 0.07294522619388083, + "loss": 0.0, + "num_input_tokens_seen": 46042432, + "step": 26870 + }, + { + "epoch": 130.4600484261501, + "grad_norm": 2.7120307777295238e-08, + "learning_rate": 0.07289469348597452, + "loss": 0.0, + "num_input_tokens_seen": 46050816, + "step": 26875 + }, + { + "epoch": 130.48426150121065, + "grad_norm": 1.7046009048726773e-08, + "learning_rate": 0.07284417266867535, + "loss": 0.0, + "num_input_tokens_seen": 46059200, + "step": 26880 + }, + { + "epoch": 130.5084745762712, + "grad_norm": 1.899123169835093e-08, + "learning_rate": 0.07279366374977439, + "loss": 0.0, + "num_input_tokens_seen": 46067840, + "step": 26885 + }, + { + "epoch": 130.5326876513317, + "grad_norm": 1.4047202512301737e-08, + "learning_rate": 0.07274316673706074, + "loss": 0.0, + "num_input_tokens_seen": 46076576, + "step": 26890 + }, + { + "epoch": 130.55690072639226, + "grad_norm": 2.281107924773096e-08, + "learning_rate": 0.07269268163832161, + "loss": 0.0, + "num_input_tokens_seen": 46084832, + "step": 26895 + }, + { + "epoch": 130.58111380145277, + "grad_norm": 1.3878375781928298e-08, + "learning_rate": 0.07264220846134248, + "loss": 0.0, + "num_input_tokens_seen": 46093184, + "step": 26900 + }, + { + "epoch": 130.60532687651332, + "grad_norm": 1.725420162301816e-08, + "learning_rate": 0.07259174721390699, + "loss": 0.0, + "num_input_tokens_seen": 46101344, + "step": 26905 + }, + { + "epoch": 130.62953995157386, + "grad_norm": 1.3353727013054595e-08, + "learning_rate": 0.07254129790379686, + "loss": 0.0, + "num_input_tokens_seen": 46109760, + "step": 26910 + }, + { + "epoch": 130.65375302663438, + "grad_norm": 3.408125692772046e-08, + "learning_rate": 0.072490860538792, + "loss": 0.0, + "num_input_tokens_seen": 46118272, + "step": 26915 + }, + { + "epoch": 130.67796610169492, + "grad_norm": 4.06934796970404e-09, + "learning_rate": 0.07244043512667042, + "loss": 0.0, + "num_input_tokens_seen": 46126560, + "step": 26920 + }, + { + "epoch": 130.70217917675544, + "grad_norm": 1.572000662974915e-08, + "learning_rate": 0.07239002167520843, + "loss": 0.0, + "num_input_tokens_seen": 46135072, + "step": 26925 + }, + { + "epoch": 130.72639225181598, + "grad_norm": 4.467080216841168e-08, + "learning_rate": 0.07233962019218045, + "loss": 0.0, + "num_input_tokens_seen": 46143776, + "step": 26930 + }, + { + "epoch": 130.75060532687652, + "grad_norm": 1.6347842191066775e-08, + "learning_rate": 0.07228923068535892, + "loss": 0.0, + "num_input_tokens_seen": 46152224, + "step": 26935 + }, + { + "epoch": 130.77481840193704, + "grad_norm": 2.2378431552283473e-08, + "learning_rate": 0.0722388531625146, + "loss": 0.0, + "num_input_tokens_seen": 46160768, + "step": 26940 + }, + { + "epoch": 130.79903147699758, + "grad_norm": 1.824748530054876e-08, + "learning_rate": 0.07218848763141639, + "loss": 0.0, + "num_input_tokens_seen": 46169664, + "step": 26945 + }, + { + "epoch": 130.8232445520581, + "grad_norm": 2.566699208728096e-08, + "learning_rate": 0.07213813409983118, + "loss": 0.0, + "num_input_tokens_seen": 46178240, + "step": 26950 + }, + { + "epoch": 130.84745762711864, + "grad_norm": 2.5913964307733295e-08, + "learning_rate": 0.0720877925755242, + "loss": 0.0, + "num_input_tokens_seen": 46187008, + "step": 26955 + }, + { + "epoch": 130.8716707021792, + "grad_norm": 2.687015054902986e-08, + "learning_rate": 0.07203746306625866, + "loss": 0.0, + "num_input_tokens_seen": 46195584, + "step": 26960 + }, + { + "epoch": 130.8958837772397, + "grad_norm": 2.6967166277813703e-08, + "learning_rate": 0.07198714557979606, + "loss": 0.0, + "num_input_tokens_seen": 46204000, + "step": 26965 + }, + { + "epoch": 130.92009685230025, + "grad_norm": 1.6408241876320062e-08, + "learning_rate": 0.07193684012389602, + "loss": 0.0, + "num_input_tokens_seen": 46213024, + "step": 26970 + }, + { + "epoch": 130.94430992736076, + "grad_norm": 2.3688436101565458e-08, + "learning_rate": 0.07188654670631621, + "loss": 0.0, + "num_input_tokens_seen": 46221312, + "step": 26975 + }, + { + "epoch": 130.9685230024213, + "grad_norm": 9.385058064026452e-09, + "learning_rate": 0.07183626533481258, + "loss": 0.0, + "num_input_tokens_seen": 46229984, + "step": 26980 + }, + { + "epoch": 130.99273607748185, + "grad_norm": 1.8810775159749937e-08, + "learning_rate": 0.07178599601713909, + "loss": 0.0, + "num_input_tokens_seen": 46238560, + "step": 26985 + }, + { + "epoch": 131.01937046004844, + "grad_norm": 2.715050939627872e-08, + "learning_rate": 0.07173573876104786, + "loss": 0.0, + "num_input_tokens_seen": 46247776, + "step": 26990 + }, + { + "epoch": 131.04358353510895, + "grad_norm": 1.0815300655053761e-08, + "learning_rate": 0.0716854935742893, + "loss": 0.0, + "num_input_tokens_seen": 46255936, + "step": 26995 + }, + { + "epoch": 131.0677966101695, + "grad_norm": 1.875256927519331e-08, + "learning_rate": 0.07163526046461174, + "loss": 0.0, + "num_input_tokens_seen": 46264032, + "step": 27000 + }, + { + "epoch": 131.0677966101695, + "eval_loss": 1.1318058967590332, + "eval_runtime": 4.625, + "eval_samples_per_second": 79.352, + "eval_steps_per_second": 19.892, + "num_input_tokens_seen": 46264032, + "step": 27000 + }, + { + "epoch": 131.09200968523, + "grad_norm": 4.804641307032398e-08, + "learning_rate": 0.07158503943976181, + "loss": 0.0, + "num_input_tokens_seen": 46272608, + "step": 27005 + }, + { + "epoch": 131.11622276029055, + "grad_norm": 2.4271859189184397e-08, + "learning_rate": 0.07153483050748427, + "loss": 0.0, + "num_input_tokens_seen": 46281056, + "step": 27010 + }, + { + "epoch": 131.1404358353511, + "grad_norm": 2.187472425418946e-08, + "learning_rate": 0.07148463367552188, + "loss": 0.0, + "num_input_tokens_seen": 46289312, + "step": 27015 + }, + { + "epoch": 131.16464891041161, + "grad_norm": 1.3470152104844146e-08, + "learning_rate": 0.07143444895161565, + "loss": 0.0, + "num_input_tokens_seen": 46297984, + "step": 27020 + }, + { + "epoch": 131.18886198547216, + "grad_norm": 1.86482242980901e-08, + "learning_rate": 0.07138427634350476, + "loss": 0.0, + "num_input_tokens_seen": 46306592, + "step": 27025 + }, + { + "epoch": 131.21307506053267, + "grad_norm": 2.0541477852020762e-08, + "learning_rate": 0.07133411585892636, + "loss": 0.0, + "num_input_tokens_seen": 46314976, + "step": 27030 + }, + { + "epoch": 131.23728813559322, + "grad_norm": 2.0805302369808487e-08, + "learning_rate": 0.07128396750561593, + "loss": 0.0, + "num_input_tokens_seen": 46323744, + "step": 27035 + }, + { + "epoch": 131.26150121065376, + "grad_norm": 1.4228268341298644e-08, + "learning_rate": 0.07123383129130685, + "loss": 0.0, + "num_input_tokens_seen": 46332288, + "step": 27040 + }, + { + "epoch": 131.28571428571428, + "grad_norm": 2.8287329811860218e-08, + "learning_rate": 0.07118370722373084, + "loss": 0.0, + "num_input_tokens_seen": 46341216, + "step": 27045 + }, + { + "epoch": 131.30992736077482, + "grad_norm": 5.894434718101138e-09, + "learning_rate": 0.07113359531061769, + "loss": 0.0, + "num_input_tokens_seen": 46349664, + "step": 27050 + }, + { + "epoch": 131.33414043583534, + "grad_norm": 1.1441334990536234e-08, + "learning_rate": 0.07108349555969525, + "loss": 0.0, + "num_input_tokens_seen": 46358176, + "step": 27055 + }, + { + "epoch": 131.35835351089588, + "grad_norm": 2.103042184842252e-08, + "learning_rate": 0.07103340797868944, + "loss": 0.0, + "num_input_tokens_seen": 46367168, + "step": 27060 + }, + { + "epoch": 131.38256658595643, + "grad_norm": 2.1338502520507063e-08, + "learning_rate": 0.07098333257532453, + "loss": 0.0, + "num_input_tokens_seen": 46375648, + "step": 27065 + }, + { + "epoch": 131.40677966101694, + "grad_norm": 2.3472860988249522e-08, + "learning_rate": 0.07093326935732269, + "loss": 0.0, + "num_input_tokens_seen": 46384096, + "step": 27070 + }, + { + "epoch": 131.43099273607749, + "grad_norm": 2.5741702103232456e-08, + "learning_rate": 0.0708832183324044, + "loss": 0.0, + "num_input_tokens_seen": 46392672, + "step": 27075 + }, + { + "epoch": 131.455205811138, + "grad_norm": 1.9766535075405045e-08, + "learning_rate": 0.07083317950828799, + "loss": 0.0, + "num_input_tokens_seen": 46401024, + "step": 27080 + }, + { + "epoch": 131.47941888619854, + "grad_norm": 3.094151779237109e-08, + "learning_rate": 0.0707831528926902, + "loss": 0.0, + "num_input_tokens_seen": 46409920, + "step": 27085 + }, + { + "epoch": 131.5036319612591, + "grad_norm": 1.4170280060454843e-08, + "learning_rate": 0.07073313849332578, + "loss": 0.0, + "num_input_tokens_seen": 46418720, + "step": 27090 + }, + { + "epoch": 131.5278450363196, + "grad_norm": 1.9879580648307638e-08, + "learning_rate": 0.07068313631790749, + "loss": 0.0, + "num_input_tokens_seen": 46426816, + "step": 27095 + }, + { + "epoch": 131.55205811138015, + "grad_norm": 3.0500896031071534e-08, + "learning_rate": 0.07063314637414632, + "loss": 0.0, + "num_input_tokens_seen": 46435488, + "step": 27100 + }, + { + "epoch": 131.57627118644066, + "grad_norm": 3.2608735267558586e-08, + "learning_rate": 0.07058316866975144, + "loss": 0.0, + "num_input_tokens_seen": 46444000, + "step": 27105 + }, + { + "epoch": 131.6004842615012, + "grad_norm": 2.277535848804746e-08, + "learning_rate": 0.0705332032124299, + "loss": 0.0, + "num_input_tokens_seen": 46452512, + "step": 27110 + }, + { + "epoch": 131.62469733656175, + "grad_norm": 1.0218666801620202e-08, + "learning_rate": 0.0704832500098871, + "loss": 0.0, + "num_input_tokens_seen": 46460800, + "step": 27115 + }, + { + "epoch": 131.64891041162227, + "grad_norm": 1.9690002517336325e-08, + "learning_rate": 0.07043330906982641, + "loss": 0.0, + "num_input_tokens_seen": 46469376, + "step": 27120 + }, + { + "epoch": 131.6731234866828, + "grad_norm": 1.7385403339176264e-08, + "learning_rate": 0.07038338039994936, + "loss": 0.0, + "num_input_tokens_seen": 46478176, + "step": 27125 + }, + { + "epoch": 131.69733656174333, + "grad_norm": 3.045455798655894e-08, + "learning_rate": 0.07033346400795562, + "loss": 0.0, + "num_input_tokens_seen": 46487104, + "step": 27130 + }, + { + "epoch": 131.72154963680387, + "grad_norm": 2.7197541996315522e-08, + "learning_rate": 0.07028355990154282, + "loss": 0.0, + "num_input_tokens_seen": 46496032, + "step": 27135 + }, + { + "epoch": 131.74576271186442, + "grad_norm": 2.7145839354147938e-08, + "learning_rate": 0.07023366808840685, + "loss": 0.0, + "num_input_tokens_seen": 46504672, + "step": 27140 + }, + { + "epoch": 131.76997578692493, + "grad_norm": 2.4222408967489173e-08, + "learning_rate": 0.07018378857624172, + "loss": 0.0, + "num_input_tokens_seen": 46513088, + "step": 27145 + }, + { + "epoch": 131.79418886198548, + "grad_norm": 1.2908685675938614e-08, + "learning_rate": 0.0701339213727394, + "loss": 0.0, + "num_input_tokens_seen": 46521504, + "step": 27150 + }, + { + "epoch": 131.818401937046, + "grad_norm": 4.52509780757282e-08, + "learning_rate": 0.07008406648559008, + "loss": 0.0, + "num_input_tokens_seen": 46530208, + "step": 27155 + }, + { + "epoch": 131.84261501210653, + "grad_norm": 3.854477625964137e-08, + "learning_rate": 0.07003422392248196, + "loss": 0.0, + "num_input_tokens_seen": 46538688, + "step": 27160 + }, + { + "epoch": 131.86682808716708, + "grad_norm": 1.3145261767988359e-08, + "learning_rate": 0.06998439369110142, + "loss": 0.0, + "num_input_tokens_seen": 46547296, + "step": 27165 + }, + { + "epoch": 131.8910411622276, + "grad_norm": 2.615415617412964e-08, + "learning_rate": 0.06993457579913295, + "loss": 0.0, + "num_input_tokens_seen": 46555616, + "step": 27170 + }, + { + "epoch": 131.91525423728814, + "grad_norm": 1.9372720316823688e-08, + "learning_rate": 0.06988477025425903, + "loss": 0.0, + "num_input_tokens_seen": 46564448, + "step": 27175 + }, + { + "epoch": 131.93946731234865, + "grad_norm": 7.184517869518459e-09, + "learning_rate": 0.06983497706416032, + "loss": 0.0, + "num_input_tokens_seen": 46572800, + "step": 27180 + }, + { + "epoch": 131.9636803874092, + "grad_norm": 1.505987334837755e-08, + "learning_rate": 0.0697851962365156, + "loss": 0.0, + "num_input_tokens_seen": 46581280, + "step": 27185 + }, + { + "epoch": 131.98789346246974, + "grad_norm": 2.3875074361967563e-08, + "learning_rate": 0.06973542777900163, + "loss": 0.0, + "num_input_tokens_seen": 46590080, + "step": 27190 + }, + { + "epoch": 132.01452784503633, + "grad_norm": 3.3966149004527324e-08, + "learning_rate": 0.06968567169929342, + "loss": 0.0, + "num_input_tokens_seen": 46599008, + "step": 27195 + }, + { + "epoch": 132.03874092009684, + "grad_norm": 3.830975359164768e-08, + "learning_rate": 0.06963592800506392, + "loss": 0.0, + "num_input_tokens_seen": 46607776, + "step": 27200 + }, + { + "epoch": 132.03874092009684, + "eval_loss": 1.1410518884658813, + "eval_runtime": 4.6255, + "eval_samples_per_second": 79.343, + "eval_steps_per_second": 19.89, + "num_input_tokens_seen": 46607776, + "step": 27200 + }, + { + "epoch": 132.0629539951574, + "grad_norm": 1.3891807704169423e-08, + "learning_rate": 0.06958619670398417, + "loss": 0.0, + "num_input_tokens_seen": 46616256, + "step": 27205 + }, + { + "epoch": 132.08716707021793, + "grad_norm": 1.8147449765137935e-08, + "learning_rate": 0.0695364778037235, + "loss": 0.0, + "num_input_tokens_seen": 46624640, + "step": 27210 + }, + { + "epoch": 132.11138014527845, + "grad_norm": 2.4922913510749822e-08, + "learning_rate": 0.06948677131194907, + "loss": 0.0, + "num_input_tokens_seen": 46633632, + "step": 27215 + }, + { + "epoch": 132.135593220339, + "grad_norm": 2.1537092109724654e-08, + "learning_rate": 0.06943707723632629, + "loss": 0.0, + "num_input_tokens_seen": 46642272, + "step": 27220 + }, + { + "epoch": 132.1598062953995, + "grad_norm": 1.1445147052313587e-08, + "learning_rate": 0.06938739558451867, + "loss": 0.0, + "num_input_tokens_seen": 46650656, + "step": 27225 + }, + { + "epoch": 132.18401937046005, + "grad_norm": 2.0626826469083426e-08, + "learning_rate": 0.06933772636418763, + "loss": 0.0, + "num_input_tokens_seen": 46659072, + "step": 27230 + }, + { + "epoch": 132.2082324455206, + "grad_norm": 2.928160647286404e-08, + "learning_rate": 0.06928806958299293, + "loss": 0.0, + "num_input_tokens_seen": 46667584, + "step": 27235 + }, + { + "epoch": 132.2324455205811, + "grad_norm": 1.3143493404754736e-08, + "learning_rate": 0.06923842524859211, + "loss": 0.0, + "num_input_tokens_seen": 46676128, + "step": 27240 + }, + { + "epoch": 132.25665859564165, + "grad_norm": 1.4311277496403818e-08, + "learning_rate": 0.06918879336864105, + "loss": 0.0, + "num_input_tokens_seen": 46684992, + "step": 27245 + }, + { + "epoch": 132.28087167070217, + "grad_norm": 3.616487020963177e-08, + "learning_rate": 0.06913917395079362, + "loss": 0.0, + "num_input_tokens_seen": 46693600, + "step": 27250 + }, + { + "epoch": 132.3050847457627, + "grad_norm": 3.7807843966675136e-08, + "learning_rate": 0.0690895670027017, + "loss": 0.0, + "num_input_tokens_seen": 46701920, + "step": 27255 + }, + { + "epoch": 132.32929782082326, + "grad_norm": 1.876980526560601e-08, + "learning_rate": 0.06903997253201531, + "loss": 0.0, + "num_input_tokens_seen": 46710560, + "step": 27260 + }, + { + "epoch": 132.35351089588377, + "grad_norm": 6.171392286091759e-09, + "learning_rate": 0.06899039054638263, + "loss": 0.0, + "num_input_tokens_seen": 46718944, + "step": 27265 + }, + { + "epoch": 132.37772397094432, + "grad_norm": 1.6816910530792484e-08, + "learning_rate": 0.06894082105344976, + "loss": 0.0, + "num_input_tokens_seen": 46727104, + "step": 27270 + }, + { + "epoch": 132.40193704600483, + "grad_norm": 2.4169111156879808e-08, + "learning_rate": 0.06889126406086087, + "loss": 0.0, + "num_input_tokens_seen": 46735552, + "step": 27275 + }, + { + "epoch": 132.42615012106538, + "grad_norm": 2.9898245657022926e-08, + "learning_rate": 0.0688417195762584, + "loss": 0.0, + "num_input_tokens_seen": 46744320, + "step": 27280 + }, + { + "epoch": 132.45036319612592, + "grad_norm": 2.5247635093705867e-08, + "learning_rate": 0.06879218760728262, + "loss": 0.0, + "num_input_tokens_seen": 46752800, + "step": 27285 + }, + { + "epoch": 132.47457627118644, + "grad_norm": 2.0643351916760366e-08, + "learning_rate": 0.06874266816157207, + "loss": 0.0, + "num_input_tokens_seen": 46761408, + "step": 27290 + }, + { + "epoch": 132.49878934624698, + "grad_norm": 2.5591571528593704e-08, + "learning_rate": 0.06869316124676321, + "loss": 0.0, + "num_input_tokens_seen": 46769856, + "step": 27295 + }, + { + "epoch": 132.5230024213075, + "grad_norm": 1.8514423771875954e-08, + "learning_rate": 0.06864366687049062, + "loss": 0.0, + "num_input_tokens_seen": 46778112, + "step": 27300 + }, + { + "epoch": 132.54721549636804, + "grad_norm": 1.0731976196609594e-08, + "learning_rate": 0.06859418504038704, + "loss": 0.0, + "num_input_tokens_seen": 46786912, + "step": 27305 + }, + { + "epoch": 132.57142857142858, + "grad_norm": 1.2328967180508243e-08, + "learning_rate": 0.06854471576408311, + "loss": 0.0, + "num_input_tokens_seen": 46795104, + "step": 27310 + }, + { + "epoch": 132.5956416464891, + "grad_norm": 2.612826754955222e-08, + "learning_rate": 0.06849525904920767, + "loss": 0.0, + "num_input_tokens_seen": 46803680, + "step": 27315 + }, + { + "epoch": 132.61985472154964, + "grad_norm": 3.7452810630611566e-08, + "learning_rate": 0.06844581490338748, + "loss": 0.0, + "num_input_tokens_seen": 46812032, + "step": 27320 + }, + { + "epoch": 132.64406779661016, + "grad_norm": 5.078967912908183e-09, + "learning_rate": 0.06839638333424752, + "loss": 0.0, + "num_input_tokens_seen": 46820704, + "step": 27325 + }, + { + "epoch": 132.6682808716707, + "grad_norm": 1.9208490797950617e-08, + "learning_rate": 0.06834696434941082, + "loss": 0.0, + "num_input_tokens_seen": 46829248, + "step": 27330 + }, + { + "epoch": 132.69249394673125, + "grad_norm": 2.7221368270602397e-08, + "learning_rate": 0.06829755795649824, + "loss": 0.0, + "num_input_tokens_seen": 46837824, + "step": 27335 + }, + { + "epoch": 132.71670702179176, + "grad_norm": 9.662651123676369e-09, + "learning_rate": 0.06824816416312904, + "loss": 0.0, + "num_input_tokens_seen": 46846624, + "step": 27340 + }, + { + "epoch": 132.7409200968523, + "grad_norm": 8.082286839794506e-09, + "learning_rate": 0.06819878297692027, + "loss": 0.0, + "num_input_tokens_seen": 46855456, + "step": 27345 + }, + { + "epoch": 132.76513317191282, + "grad_norm": 2.674584109740863e-08, + "learning_rate": 0.0681494144054871, + "loss": 0.0, + "num_input_tokens_seen": 46864288, + "step": 27350 + }, + { + "epoch": 132.78934624697337, + "grad_norm": 2.348527594620009e-08, + "learning_rate": 0.06810005845644286, + "loss": 0.0, + "num_input_tokens_seen": 46872928, + "step": 27355 + }, + { + "epoch": 132.8135593220339, + "grad_norm": 1.1030786062349307e-08, + "learning_rate": 0.06805071513739878, + "loss": 0.0, + "num_input_tokens_seen": 46881728, + "step": 27360 + }, + { + "epoch": 132.83777239709443, + "grad_norm": 1.4126018577087507e-08, + "learning_rate": 0.06800138445596428, + "loss": 0.0, + "num_input_tokens_seen": 46890336, + "step": 27365 + }, + { + "epoch": 132.86198547215497, + "grad_norm": 3.707779328010474e-08, + "learning_rate": 0.06795206641974678, + "loss": 0.0, + "num_input_tokens_seen": 46899040, + "step": 27370 + }, + { + "epoch": 132.88619854721549, + "grad_norm": 1.6052814189038145e-08, + "learning_rate": 0.06790276103635169, + "loss": 0.0, + "num_input_tokens_seen": 46907936, + "step": 27375 + }, + { + "epoch": 132.91041162227603, + "grad_norm": 1.3567427181726543e-08, + "learning_rate": 0.0678534683133826, + "loss": 0.0, + "num_input_tokens_seen": 46916576, + "step": 27380 + }, + { + "epoch": 132.93462469733657, + "grad_norm": 1.5842900324969378e-08, + "learning_rate": 0.06780418825844095, + "loss": 0.0, + "num_input_tokens_seen": 46925088, + "step": 27385 + }, + { + "epoch": 132.9588377723971, + "grad_norm": 8.741769086384465e-09, + "learning_rate": 0.0677549208791264, + "loss": 0.0, + "num_input_tokens_seen": 46933920, + "step": 27390 + }, + { + "epoch": 132.98305084745763, + "grad_norm": 2.2161065871273422e-08, + "learning_rate": 0.06770566618303668, + "loss": 0.0, + "num_input_tokens_seen": 46942016, + "step": 27395 + }, + { + "epoch": 133.00968523002422, + "grad_norm": 1.866119880844508e-08, + "learning_rate": 0.06765642417776736, + "loss": 0.0, + "num_input_tokens_seen": 46950752, + "step": 27400 + }, + { + "epoch": 133.00968523002422, + "eval_loss": 1.1394373178482056, + "eval_runtime": 4.6145, + "eval_samples_per_second": 79.532, + "eval_steps_per_second": 19.937, + "num_input_tokens_seen": 46950752, + "step": 27400 + }, + { + "epoch": 133.03389830508473, + "grad_norm": 8.252029282118656e-09, + "learning_rate": 0.0676071948709122, + "loss": 0.0, + "num_input_tokens_seen": 46959200, + "step": 27405 + }, + { + "epoch": 133.05811138014528, + "grad_norm": 1.7392071782751373e-08, + "learning_rate": 0.06755797827006307, + "loss": 0.0, + "num_input_tokens_seen": 46967776, + "step": 27410 + }, + { + "epoch": 133.08232445520582, + "grad_norm": 2.6265231767297337e-08, + "learning_rate": 0.06750877438280974, + "loss": 0.0, + "num_input_tokens_seen": 46976512, + "step": 27415 + }, + { + "epoch": 133.10653753026634, + "grad_norm": 1.1240162578474155e-08, + "learning_rate": 0.06745958321673998, + "loss": 0.0, + "num_input_tokens_seen": 46985024, + "step": 27420 + }, + { + "epoch": 133.13075060532688, + "grad_norm": 2.6557209764632717e-08, + "learning_rate": 0.0674104047794398, + "loss": 0.0, + "num_input_tokens_seen": 46993888, + "step": 27425 + }, + { + "epoch": 133.1549636803874, + "grad_norm": 1.7504619975738933e-08, + "learning_rate": 0.06736123907849303, + "loss": 0.0, + "num_input_tokens_seen": 47002368, + "step": 27430 + }, + { + "epoch": 133.17917675544794, + "grad_norm": 1.101232616207426e-08, + "learning_rate": 0.06731208612148178, + "loss": 0.0, + "num_input_tokens_seen": 47011200, + "step": 27435 + }, + { + "epoch": 133.20338983050848, + "grad_norm": 2.5325025632128018e-08, + "learning_rate": 0.0672629459159859, + "loss": 0.0, + "num_input_tokens_seen": 47019712, + "step": 27440 + }, + { + "epoch": 133.227602905569, + "grad_norm": 1.0455038612633416e-08, + "learning_rate": 0.0672138184695835, + "loss": 0.0, + "num_input_tokens_seen": 47028352, + "step": 27445 + }, + { + "epoch": 133.25181598062954, + "grad_norm": 1.4676865944807105e-08, + "learning_rate": 0.0671647037898507, + "loss": 0.0, + "num_input_tokens_seen": 47037056, + "step": 27450 + }, + { + "epoch": 133.27602905569006, + "grad_norm": 1.983864272858682e-08, + "learning_rate": 0.0671156018843615, + "loss": 0.0, + "num_input_tokens_seen": 47045408, + "step": 27455 + }, + { + "epoch": 133.3002421307506, + "grad_norm": 7.581978600512684e-09, + "learning_rate": 0.06706651276068812, + "loss": 0.0, + "num_input_tokens_seen": 47053536, + "step": 27460 + }, + { + "epoch": 133.32445520581115, + "grad_norm": 2.7736790642052256e-08, + "learning_rate": 0.06701743642640064, + "loss": 0.0, + "num_input_tokens_seen": 47062272, + "step": 27465 + }, + { + "epoch": 133.34866828087166, + "grad_norm": 2.913028573914289e-08, + "learning_rate": 0.06696837288906729, + "loss": 0.0, + "num_input_tokens_seen": 47070944, + "step": 27470 + }, + { + "epoch": 133.3728813559322, + "grad_norm": 3.6404235181919375e-09, + "learning_rate": 0.06691932215625432, + "loss": 0.0, + "num_input_tokens_seen": 47079488, + "step": 27475 + }, + { + "epoch": 133.39709443099272, + "grad_norm": 1.2388561287934863e-08, + "learning_rate": 0.06687028423552589, + "loss": 0.0, + "num_input_tokens_seen": 47088128, + "step": 27480 + }, + { + "epoch": 133.42130750605327, + "grad_norm": 1.9357528913133137e-08, + "learning_rate": 0.06682125913444435, + "loss": 0.0, + "num_input_tokens_seen": 47096640, + "step": 27485 + }, + { + "epoch": 133.4455205811138, + "grad_norm": 1.8825700109914578e-08, + "learning_rate": 0.0667722468605699, + "loss": 0.0, + "num_input_tokens_seen": 47105184, + "step": 27490 + }, + { + "epoch": 133.46973365617433, + "grad_norm": 1.1833738433608687e-08, + "learning_rate": 0.06672324742146094, + "loss": 0.0, + "num_input_tokens_seen": 47114112, + "step": 27495 + }, + { + "epoch": 133.49394673123487, + "grad_norm": 2.1330743393832563e-08, + "learning_rate": 0.06667426082467373, + "loss": 0.0, + "num_input_tokens_seen": 47122560, + "step": 27500 + }, + { + "epoch": 133.5181598062954, + "grad_norm": 1.8131748547034476e-08, + "learning_rate": 0.0666252870777626, + "loss": 0.0, + "num_input_tokens_seen": 47131424, + "step": 27505 + }, + { + "epoch": 133.54237288135593, + "grad_norm": 1.1631299479120116e-08, + "learning_rate": 0.06657632618827995, + "loss": 0.0, + "num_input_tokens_seen": 47139936, + "step": 27510 + }, + { + "epoch": 133.56658595641647, + "grad_norm": 2.046959757251443e-08, + "learning_rate": 0.06652737816377623, + "loss": 0.0, + "num_input_tokens_seen": 47148768, + "step": 27515 + }, + { + "epoch": 133.590799031477, + "grad_norm": 2.5718403406926882e-08, + "learning_rate": 0.06647844301179971, + "loss": 0.0, + "num_input_tokens_seen": 47157216, + "step": 27520 + }, + { + "epoch": 133.61501210653753, + "grad_norm": 6.743620328819588e-09, + "learning_rate": 0.06642952073989689, + "loss": 0.0, + "num_input_tokens_seen": 47165632, + "step": 27525 + }, + { + "epoch": 133.63922518159805, + "grad_norm": 1.81743864402506e-08, + "learning_rate": 0.06638061135561223, + "loss": 0.0, + "num_input_tokens_seen": 47174240, + "step": 27530 + }, + { + "epoch": 133.6634382566586, + "grad_norm": 6.740098257296268e-08, + "learning_rate": 0.06633171486648808, + "loss": 0.0, + "num_input_tokens_seen": 47182976, + "step": 27535 + }, + { + "epoch": 133.68765133171914, + "grad_norm": 2.2447231629030284e-08, + "learning_rate": 0.06628283128006499, + "loss": 0.0, + "num_input_tokens_seen": 47191520, + "step": 27540 + }, + { + "epoch": 133.71186440677965, + "grad_norm": 2.2860294990323382e-08, + "learning_rate": 0.0662339606038813, + "loss": 0.0, + "num_input_tokens_seen": 47200288, + "step": 27545 + }, + { + "epoch": 133.7360774818402, + "grad_norm": 2.958599942814999e-08, + "learning_rate": 0.06618510284547358, + "loss": 0.0, + "num_input_tokens_seen": 47209280, + "step": 27550 + }, + { + "epoch": 133.7602905569007, + "grad_norm": 2.834064005696746e-08, + "learning_rate": 0.06613625801237633, + "loss": 0.0, + "num_input_tokens_seen": 47217952, + "step": 27555 + }, + { + "epoch": 133.78450363196126, + "grad_norm": 1.1100833141597377e-08, + "learning_rate": 0.066087426112122, + "loss": 0.0, + "num_input_tokens_seen": 47226304, + "step": 27560 + }, + { + "epoch": 133.8087167070218, + "grad_norm": 1.992106390957815e-08, + "learning_rate": 0.06603860715224101, + "loss": 0.0, + "num_input_tokens_seen": 47234720, + "step": 27565 + }, + { + "epoch": 133.83292978208232, + "grad_norm": 9.514316445802251e-09, + "learning_rate": 0.06598980114026198, + "loss": 0.0, + "num_input_tokens_seen": 47242944, + "step": 27570 + }, + { + "epoch": 133.85714285714286, + "grad_norm": 2.3788532033108822e-08, + "learning_rate": 0.06594100808371128, + "loss": 0.0, + "num_input_tokens_seen": 47251456, + "step": 27575 + }, + { + "epoch": 133.88135593220338, + "grad_norm": 1.6459324569950695e-08, + "learning_rate": 0.06589222799011357, + "loss": 0.0, + "num_input_tokens_seen": 47259680, + "step": 27580 + }, + { + "epoch": 133.90556900726392, + "grad_norm": 1.2308356112100682e-08, + "learning_rate": 0.0658434608669912, + "loss": 0.0, + "num_input_tokens_seen": 47268352, + "step": 27585 + }, + { + "epoch": 133.92978208232446, + "grad_norm": 6.262935059453412e-09, + "learning_rate": 0.06579470672186473, + "loss": 0.0, + "num_input_tokens_seen": 47277056, + "step": 27590 + }, + { + "epoch": 133.95399515738498, + "grad_norm": 3.241499868522624e-08, + "learning_rate": 0.06574596556225275, + "loss": 0.0, + "num_input_tokens_seen": 47285600, + "step": 27595 + }, + { + "epoch": 133.97820823244552, + "grad_norm": 1.7275537445016198e-08, + "learning_rate": 0.06569723739567161, + "loss": 0.0, + "num_input_tokens_seen": 47293824, + "step": 27600 + }, + { + "epoch": 133.97820823244552, + "eval_loss": 1.1429383754730225, + "eval_runtime": 4.6278, + "eval_samples_per_second": 79.303, + "eval_steps_per_second": 19.88, + "num_input_tokens_seen": 47293824, + "step": 27600 + }, + { + "epoch": 134.0048426150121, + "grad_norm": 4.3384144277069936e-08, + "learning_rate": 0.06564852222963588, + "loss": 0.0, + "num_input_tokens_seen": 47302560, + "step": 27605 + }, + { + "epoch": 134.02905569007265, + "grad_norm": 4.0075896379221376e-08, + "learning_rate": 0.06559982007165813, + "loss": 0.0, + "num_input_tokens_seen": 47311328, + "step": 27610 + }, + { + "epoch": 134.05326876513317, + "grad_norm": 2.250411412774156e-08, + "learning_rate": 0.06555113092924868, + "loss": 0.0, + "num_input_tokens_seen": 47319840, + "step": 27615 + }, + { + "epoch": 134.0774818401937, + "grad_norm": 1.2717335629019999e-08, + "learning_rate": 0.06550245480991615, + "loss": 0.0, + "num_input_tokens_seen": 47328288, + "step": 27620 + }, + { + "epoch": 134.10169491525423, + "grad_norm": 6.7614474019705995e-09, + "learning_rate": 0.0654537917211669, + "loss": 0.0, + "num_input_tokens_seen": 47337056, + "step": 27625 + }, + { + "epoch": 134.12590799031477, + "grad_norm": 1.7619541381463932e-08, + "learning_rate": 0.0654051416705055, + "loss": 0.0, + "num_input_tokens_seen": 47345248, + "step": 27630 + }, + { + "epoch": 134.15012106537532, + "grad_norm": 3.9789373573739795e-08, + "learning_rate": 0.06535650466543427, + "loss": 0.0, + "num_input_tokens_seen": 47353728, + "step": 27635 + }, + { + "epoch": 134.17433414043583, + "grad_norm": 9.207862028404179e-09, + "learning_rate": 0.0653078807134538, + "loss": 0.0, + "num_input_tokens_seen": 47362400, + "step": 27640 + }, + { + "epoch": 134.19854721549638, + "grad_norm": 2.299408485839649e-08, + "learning_rate": 0.06525926982206236, + "loss": 0.0, + "num_input_tokens_seen": 47370848, + "step": 27645 + }, + { + "epoch": 134.2227602905569, + "grad_norm": 6.593580792468856e-09, + "learning_rate": 0.06521067199875648, + "loss": 0.0, + "num_input_tokens_seen": 47379008, + "step": 27650 + }, + { + "epoch": 134.24697336561744, + "grad_norm": 1.3232973827825845e-08, + "learning_rate": 0.06516208725103047, + "loss": 0.0, + "num_input_tokens_seen": 47387232, + "step": 27655 + }, + { + "epoch": 134.27118644067798, + "grad_norm": 1.5685778009810747e-08, + "learning_rate": 0.06511351558637678, + "loss": 0.0, + "num_input_tokens_seen": 47395616, + "step": 27660 + }, + { + "epoch": 134.2953995157385, + "grad_norm": 6.201523738980086e-09, + "learning_rate": 0.06506495701228569, + "loss": 0.0, + "num_input_tokens_seen": 47404032, + "step": 27665 + }, + { + "epoch": 134.31961259079904, + "grad_norm": 1.872413157855135e-08, + "learning_rate": 0.06501641153624559, + "loss": 0.0, + "num_input_tokens_seen": 47412608, + "step": 27670 + }, + { + "epoch": 134.34382566585955, + "grad_norm": 5.2754131729670917e-08, + "learning_rate": 0.06496787916574286, + "loss": 0.0, + "num_input_tokens_seen": 47421408, + "step": 27675 + }, + { + "epoch": 134.3680387409201, + "grad_norm": 2.5540622061726026e-08, + "learning_rate": 0.06491935990826168, + "loss": 0.0, + "num_input_tokens_seen": 47430144, + "step": 27680 + }, + { + "epoch": 134.39225181598064, + "grad_norm": 1.6600219865381405e-08, + "learning_rate": 0.0648708537712844, + "loss": 0.0, + "num_input_tokens_seen": 47438848, + "step": 27685 + }, + { + "epoch": 134.41646489104116, + "grad_norm": 2.82118524097541e-08, + "learning_rate": 0.06482236076229132, + "loss": 0.0, + "num_input_tokens_seen": 47447424, + "step": 27690 + }, + { + "epoch": 134.4406779661017, + "grad_norm": 1.526027304521449e-08, + "learning_rate": 0.06477388088876056, + "loss": 0.0, + "num_input_tokens_seen": 47455776, + "step": 27695 + }, + { + "epoch": 134.46489104116222, + "grad_norm": 1.8576434612782577e-08, + "learning_rate": 0.06472541415816846, + "loss": 0.0, + "num_input_tokens_seen": 47464288, + "step": 27700 + }, + { + "epoch": 134.48910411622276, + "grad_norm": 9.491836649999641e-09, + "learning_rate": 0.06467696057798909, + "loss": 0.0, + "num_input_tokens_seen": 47472832, + "step": 27705 + }, + { + "epoch": 134.5133171912833, + "grad_norm": 9.556096358664945e-09, + "learning_rate": 0.0646285201556946, + "loss": 0.0, + "num_input_tokens_seen": 47481824, + "step": 27710 + }, + { + "epoch": 134.53753026634382, + "grad_norm": 4.053140756354878e-08, + "learning_rate": 0.06458009289875521, + "loss": 0.0, + "num_input_tokens_seen": 47490784, + "step": 27715 + }, + { + "epoch": 134.56174334140437, + "grad_norm": 2.385491626455405e-08, + "learning_rate": 0.0645316788146389, + "loss": 0.0, + "num_input_tokens_seen": 47499168, + "step": 27720 + }, + { + "epoch": 134.58595641646488, + "grad_norm": 9.353496643882409e-09, + "learning_rate": 0.06448327791081175, + "loss": 0.0, + "num_input_tokens_seen": 47507776, + "step": 27725 + }, + { + "epoch": 134.61016949152543, + "grad_norm": 2.1289590534934177e-08, + "learning_rate": 0.0644348901947379, + "loss": 0.0, + "num_input_tokens_seen": 47516352, + "step": 27730 + }, + { + "epoch": 134.63438256658597, + "grad_norm": 9.623349228604638e-09, + "learning_rate": 0.06438651567387917, + "loss": 0.0, + "num_input_tokens_seen": 47525056, + "step": 27735 + }, + { + "epoch": 134.65859564164649, + "grad_norm": 1.6232865718279754e-08, + "learning_rate": 0.0643381543556957, + "loss": 0.0, + "num_input_tokens_seen": 47533856, + "step": 27740 + }, + { + "epoch": 134.68280871670703, + "grad_norm": 2.102710672247099e-09, + "learning_rate": 0.06428980624764526, + "loss": 0.0, + "num_input_tokens_seen": 47542656, + "step": 27745 + }, + { + "epoch": 134.70702179176754, + "grad_norm": 1.359625834140843e-08, + "learning_rate": 0.06424147135718378, + "loss": 0.0, + "num_input_tokens_seen": 47551360, + "step": 27750 + }, + { + "epoch": 134.7312348668281, + "grad_norm": 1.8525849299066977e-08, + "learning_rate": 0.06419314969176519, + "loss": 0.0, + "num_input_tokens_seen": 47559936, + "step": 27755 + }, + { + "epoch": 134.75544794188863, + "grad_norm": 7.840951887772007e-09, + "learning_rate": 0.06414484125884118, + "loss": 0.0, + "num_input_tokens_seen": 47568800, + "step": 27760 + }, + { + "epoch": 134.77966101694915, + "grad_norm": 1.3194203951627514e-08, + "learning_rate": 0.06409654606586157, + "loss": 0.0, + "num_input_tokens_seen": 47577280, + "step": 27765 + }, + { + "epoch": 134.8038740920097, + "grad_norm": 1.809133820529496e-08, + "learning_rate": 0.06404826412027415, + "loss": 0.0, + "num_input_tokens_seen": 47585344, + "step": 27770 + }, + { + "epoch": 134.8280871670702, + "grad_norm": 1.9835896480913107e-08, + "learning_rate": 0.06399999542952453, + "loss": 0.0, + "num_input_tokens_seen": 47594048, + "step": 27775 + }, + { + "epoch": 134.85230024213075, + "grad_norm": 2.5652164836742486e-08, + "learning_rate": 0.0639517400010563, + "loss": 0.0, + "num_input_tokens_seen": 47602880, + "step": 27780 + }, + { + "epoch": 134.8765133171913, + "grad_norm": 1.3543825616579852e-08, + "learning_rate": 0.06390349784231118, + "loss": 0.0, + "num_input_tokens_seen": 47611552, + "step": 27785 + }, + { + "epoch": 134.9007263922518, + "grad_norm": 1.2924524561697126e-08, + "learning_rate": 0.06385526896072859, + "loss": 0.0, + "num_input_tokens_seen": 47620064, + "step": 27790 + }, + { + "epoch": 134.92493946731236, + "grad_norm": 2.1162035679367364e-08, + "learning_rate": 0.06380705336374613, + "loss": 0.0, + "num_input_tokens_seen": 47628704, + "step": 27795 + }, + { + "epoch": 134.94915254237287, + "grad_norm": 1.0917140969013417e-08, + "learning_rate": 0.06375885105879918, + "loss": 0.0, + "num_input_tokens_seen": 47637248, + "step": 27800 + }, + { + "epoch": 134.94915254237287, + "eval_loss": 1.1486748456954956, + "eval_runtime": 4.6308, + "eval_samples_per_second": 79.251, + "eval_steps_per_second": 19.867, + "num_input_tokens_seen": 47637248, + "step": 27800 + }, + { + "epoch": 134.97336561743342, + "grad_norm": 1.211957645352868e-08, + "learning_rate": 0.06371066205332115, + "loss": 0.0, + "num_input_tokens_seen": 47645536, + "step": 27805 + }, + { + "epoch": 134.99757869249396, + "grad_norm": 2.0488283070108082e-08, + "learning_rate": 0.06366248635474347, + "loss": 0.0, + "num_input_tokens_seen": 47654016, + "step": 27810 + }, + { + "epoch": 135.02421307506054, + "grad_norm": 1.7336979851734213e-08, + "learning_rate": 0.06361432397049532, + "loss": 0.0, + "num_input_tokens_seen": 47662944, + "step": 27815 + }, + { + "epoch": 135.04842615012106, + "grad_norm": 1.8168588411526798e-08, + "learning_rate": 0.06356617490800408, + "loss": 0.0, + "num_input_tokens_seen": 47671392, + "step": 27820 + }, + { + "epoch": 135.0726392251816, + "grad_norm": 1.498407797839718e-08, + "learning_rate": 0.06351803917469478, + "loss": 0.0, + "num_input_tokens_seen": 47680096, + "step": 27825 + }, + { + "epoch": 135.09685230024212, + "grad_norm": 2.5947041848439767e-08, + "learning_rate": 0.06346991677799067, + "loss": 0.0, + "num_input_tokens_seen": 47688736, + "step": 27830 + }, + { + "epoch": 135.12106537530266, + "grad_norm": 1.1750318940073612e-08, + "learning_rate": 0.06342180772531283, + "loss": 0.0, + "num_input_tokens_seen": 47697216, + "step": 27835 + }, + { + "epoch": 135.1452784503632, + "grad_norm": 1.0929078086974187e-08, + "learning_rate": 0.06337371202408021, + "loss": 0.0, + "num_input_tokens_seen": 47705600, + "step": 27840 + }, + { + "epoch": 135.16949152542372, + "grad_norm": 1.3930360864833347e-08, + "learning_rate": 0.06332562968170984, + "loss": 0.0, + "num_input_tokens_seen": 47714144, + "step": 27845 + }, + { + "epoch": 135.19370460048427, + "grad_norm": 1.8024259418325528e-08, + "learning_rate": 0.06327756070561656, + "loss": 0.0, + "num_input_tokens_seen": 47722528, + "step": 27850 + }, + { + "epoch": 135.21791767554478, + "grad_norm": 1.7731931478692786e-08, + "learning_rate": 0.06322950510321329, + "loss": 0.0, + "num_input_tokens_seen": 47731104, + "step": 27855 + }, + { + "epoch": 135.24213075060533, + "grad_norm": 9.090070030026709e-09, + "learning_rate": 0.06318146288191076, + "loss": 0.0, + "num_input_tokens_seen": 47739584, + "step": 27860 + }, + { + "epoch": 135.26634382566587, + "grad_norm": 1.2273240201920999e-08, + "learning_rate": 0.06313343404911763, + "loss": 0.0, + "num_input_tokens_seen": 47748224, + "step": 27865 + }, + { + "epoch": 135.2905569007264, + "grad_norm": 1.9240726345515213e-08, + "learning_rate": 0.0630854186122406, + "loss": 0.0, + "num_input_tokens_seen": 47757088, + "step": 27870 + }, + { + "epoch": 135.31476997578693, + "grad_norm": 9.86766846011733e-09, + "learning_rate": 0.06303741657868431, + "loss": 0.0, + "num_input_tokens_seen": 47765664, + "step": 27875 + }, + { + "epoch": 135.33898305084745, + "grad_norm": 1.5223497129568386e-08, + "learning_rate": 0.06298942795585115, + "loss": 0.0, + "num_input_tokens_seen": 47774016, + "step": 27880 + }, + { + "epoch": 135.363196125908, + "grad_norm": 2.604068960465611e-08, + "learning_rate": 0.06294145275114167, + "loss": 0.0, + "num_input_tokens_seen": 47782432, + "step": 27885 + }, + { + "epoch": 135.38740920096853, + "grad_norm": 1.069532196140699e-08, + "learning_rate": 0.06289349097195428, + "loss": 0.0, + "num_input_tokens_seen": 47790912, + "step": 27890 + }, + { + "epoch": 135.41162227602905, + "grad_norm": 3.133815340561341e-08, + "learning_rate": 0.06284554262568516, + "loss": 0.0, + "num_input_tokens_seen": 47799360, + "step": 27895 + }, + { + "epoch": 135.4358353510896, + "grad_norm": 1.6994823326399455e-08, + "learning_rate": 0.06279760771972868, + "loss": 0.0, + "num_input_tokens_seen": 47807840, + "step": 27900 + }, + { + "epoch": 135.4600484261501, + "grad_norm": 1.9284371433059277e-08, + "learning_rate": 0.06274968626147688, + "loss": 0.0, + "num_input_tokens_seen": 47816800, + "step": 27905 + }, + { + "epoch": 135.48426150121065, + "grad_norm": 3.342085364010927e-08, + "learning_rate": 0.06270177825831993, + "loss": 0.0, + "num_input_tokens_seen": 47825376, + "step": 27910 + }, + { + "epoch": 135.5084745762712, + "grad_norm": 1.917723757571821e-08, + "learning_rate": 0.06265388371764587, + "loss": 0.0, + "num_input_tokens_seen": 47834016, + "step": 27915 + }, + { + "epoch": 135.5326876513317, + "grad_norm": 1.2629171486366886e-08, + "learning_rate": 0.0626060026468406, + "loss": 0.0, + "num_input_tokens_seen": 47842368, + "step": 27920 + }, + { + "epoch": 135.55690072639226, + "grad_norm": 7.579062710760809e-09, + "learning_rate": 0.06255813505328794, + "loss": 0.0, + "num_input_tokens_seen": 47851104, + "step": 27925 + }, + { + "epoch": 135.58111380145277, + "grad_norm": 9.900301911613951e-09, + "learning_rate": 0.06251028094436978, + "loss": 0.0, + "num_input_tokens_seen": 47859936, + "step": 27930 + }, + { + "epoch": 135.60532687651332, + "grad_norm": 9.154480729023362e-09, + "learning_rate": 0.06246244032746568, + "loss": 0.0, + "num_input_tokens_seen": 47868160, + "step": 27935 + }, + { + "epoch": 135.62953995157386, + "grad_norm": 1.8612009711205246e-08, + "learning_rate": 0.06241461320995342, + "loss": 0.0, + "num_input_tokens_seen": 47876704, + "step": 27940 + }, + { + "epoch": 135.65375302663438, + "grad_norm": 1.5149989707197165e-08, + "learning_rate": 0.062366799599208426, + "loss": 0.0, + "num_input_tokens_seen": 47885088, + "step": 27945 + }, + { + "epoch": 135.67796610169492, + "grad_norm": 1.2203273058730701e-08, + "learning_rate": 0.06231899950260418, + "loss": 0.0, + "num_input_tokens_seen": 47893664, + "step": 27950 + }, + { + "epoch": 135.70217917675544, + "grad_norm": 2.092376938378493e-08, + "learning_rate": 0.06227121292751214, + "loss": 0.0, + "num_input_tokens_seen": 47902112, + "step": 27955 + }, + { + "epoch": 135.72639225181598, + "grad_norm": 1.253648118648698e-08, + "learning_rate": 0.062223439881301496, + "loss": 0.0, + "num_input_tokens_seen": 47910688, + "step": 27960 + }, + { + "epoch": 135.75060532687652, + "grad_norm": 2.0067332684448047e-08, + "learning_rate": 0.06217568037133948, + "loss": 0.0, + "num_input_tokens_seen": 47919360, + "step": 27965 + }, + { + "epoch": 135.77481840193704, + "grad_norm": 7.521432365820147e-09, + "learning_rate": 0.06212793440499126, + "loss": 0.0, + "num_input_tokens_seen": 47927776, + "step": 27970 + }, + { + "epoch": 135.79903147699758, + "grad_norm": 9.542922185801217e-08, + "learning_rate": 0.062080201989619783, + "loss": 0.0, + "num_input_tokens_seen": 47935968, + "step": 27975 + }, + { + "epoch": 135.8232445520581, + "grad_norm": 2.169349144764965e-08, + "learning_rate": 0.062032483132586094, + "loss": 0.0, + "num_input_tokens_seen": 47944864, + "step": 27980 + }, + { + "epoch": 135.84745762711864, + "grad_norm": 8.7353280164848e-09, + "learning_rate": 0.0619847778412489, + "loss": 0.0, + "num_input_tokens_seen": 47953664, + "step": 27985 + }, + { + "epoch": 135.8716707021792, + "grad_norm": 7.984466421362413e-09, + "learning_rate": 0.06193708612296509, + "loss": 0.0, + "num_input_tokens_seen": 47962208, + "step": 27990 + }, + { + "epoch": 135.8958837772397, + "grad_norm": 1.3974910118008665e-08, + "learning_rate": 0.06188940798508923, + "loss": 0.0, + "num_input_tokens_seen": 47971008, + "step": 27995 + }, + { + "epoch": 135.92009685230025, + "grad_norm": 1.469707999746106e-08, + "learning_rate": 0.06184174343497397, + "loss": 0.0, + "num_input_tokens_seen": 47979552, + "step": 28000 + }, + { + "epoch": 135.92009685230025, + "eval_loss": 1.1460644006729126, + "eval_runtime": 4.6183, + "eval_samples_per_second": 79.466, + "eval_steps_per_second": 19.921, + "num_input_tokens_seen": 47979552, + "step": 28000 + }, + { + "epoch": 135.94430992736076, + "grad_norm": 2.472228866068349e-09, + "learning_rate": 0.061794092479969726, + "loss": 0.0, + "num_input_tokens_seen": 47988224, + "step": 28005 + }, + { + "epoch": 135.9685230024213, + "grad_norm": 8.573822540824949e-09, + "learning_rate": 0.06174645512742485, + "loss": 0.0, + "num_input_tokens_seen": 47996800, + "step": 28010 + }, + { + "epoch": 135.99273607748185, + "grad_norm": 1.257400850107615e-08, + "learning_rate": 0.06169883138468565, + "loss": 0.0, + "num_input_tokens_seen": 48005344, + "step": 28015 + }, + { + "epoch": 136.01937046004844, + "grad_norm": 2.183642955344567e-08, + "learning_rate": 0.06165122125909637, + "loss": 0.0, + "num_input_tokens_seen": 48014432, + "step": 28020 + }, + { + "epoch": 136.04358353510895, + "grad_norm": 2.0298756453485112e-08, + "learning_rate": 0.061603624757998965, + "loss": 0.0, + "num_input_tokens_seen": 48023168, + "step": 28025 + }, + { + "epoch": 136.0677966101695, + "grad_norm": 1.2011750705198665e-08, + "learning_rate": 0.0615560418887335, + "loss": 0.0, + "num_input_tokens_seen": 48031680, + "step": 28030 + }, + { + "epoch": 136.09200968523, + "grad_norm": 1.3473365534366621e-08, + "learning_rate": 0.06150847265863787, + "loss": 0.0, + "num_input_tokens_seen": 48040320, + "step": 28035 + }, + { + "epoch": 136.11622276029055, + "grad_norm": 1.797937798642124e-08, + "learning_rate": 0.061460917075047757, + "loss": 0.0, + "num_input_tokens_seen": 48048736, + "step": 28040 + }, + { + "epoch": 136.1404358353511, + "grad_norm": 8.33468405403437e-09, + "learning_rate": 0.06141337514529694, + "loss": 0.0, + "num_input_tokens_seen": 48057216, + "step": 28045 + }, + { + "epoch": 136.16464891041161, + "grad_norm": 7.672632307276217e-09, + "learning_rate": 0.06136584687671687, + "loss": 0.0, + "num_input_tokens_seen": 48065504, + "step": 28050 + }, + { + "epoch": 136.18886198547216, + "grad_norm": 2.0599424388478837e-08, + "learning_rate": 0.061318332276637064, + "loss": 0.0, + "num_input_tokens_seen": 48073856, + "step": 28055 + }, + { + "epoch": 136.21307506053267, + "grad_norm": 2.6565091459929135e-08, + "learning_rate": 0.06127083135238491, + "loss": 0.0, + "num_input_tokens_seen": 48082432, + "step": 28060 + }, + { + "epoch": 136.23728813559322, + "grad_norm": 1.9980076260139867e-08, + "learning_rate": 0.06122334411128555, + "loss": 0.0, + "num_input_tokens_seen": 48091136, + "step": 28065 + }, + { + "epoch": 136.26150121065376, + "grad_norm": 2.3191072173744942e-08, + "learning_rate": 0.06117587056066223, + "loss": 0.0, + "num_input_tokens_seen": 48099680, + "step": 28070 + }, + { + "epoch": 136.28571428571428, + "grad_norm": 6.832783672194864e-09, + "learning_rate": 0.06112841070783589, + "loss": 0.0, + "num_input_tokens_seen": 48108064, + "step": 28075 + }, + { + "epoch": 136.30992736077482, + "grad_norm": 1.1170186553499661e-08, + "learning_rate": 0.061080964560125406, + "loss": 0.0, + "num_input_tokens_seen": 48116352, + "step": 28080 + }, + { + "epoch": 136.33414043583534, + "grad_norm": 1.8830693448990132e-08, + "learning_rate": 0.06103353212484766, + "loss": 0.0, + "num_input_tokens_seen": 48124960, + "step": 28085 + }, + { + "epoch": 136.35835351089588, + "grad_norm": 1.3044656022032086e-08, + "learning_rate": 0.06098611340931722, + "loss": 0.0, + "num_input_tokens_seen": 48133824, + "step": 28090 + }, + { + "epoch": 136.38256658595643, + "grad_norm": 4.55322357595378e-09, + "learning_rate": 0.06093870842084672, + "loss": 0.0, + "num_input_tokens_seen": 48142400, + "step": 28095 + }, + { + "epoch": 136.40677966101694, + "grad_norm": 2.846767266362349e-08, + "learning_rate": 0.06089131716674666, + "loss": 0.0, + "num_input_tokens_seen": 48151008, + "step": 28100 + }, + { + "epoch": 136.43099273607749, + "grad_norm": 1.1564305069100556e-08, + "learning_rate": 0.060843939654325226, + "loss": 0.0, + "num_input_tokens_seen": 48159424, + "step": 28105 + }, + { + "epoch": 136.455205811138, + "grad_norm": 1.3914510432755378e-08, + "learning_rate": 0.06079657589088873, + "loss": 0.0, + "num_input_tokens_seen": 48167744, + "step": 28110 + }, + { + "epoch": 136.47941888619854, + "grad_norm": 1.350295963931103e-08, + "learning_rate": 0.06074922588374126, + "loss": 0.0, + "num_input_tokens_seen": 48176320, + "step": 28115 + }, + { + "epoch": 136.5036319612591, + "grad_norm": 1.4590470165387615e-08, + "learning_rate": 0.06070188964018472, + "loss": 0.0, + "num_input_tokens_seen": 48185024, + "step": 28120 + }, + { + "epoch": 136.5278450363196, + "grad_norm": 1.0227720004252205e-08, + "learning_rate": 0.06065456716751902, + "loss": 0.0, + "num_input_tokens_seen": 48193536, + "step": 28125 + }, + { + "epoch": 136.55205811138015, + "grad_norm": 2.2357481199719587e-08, + "learning_rate": 0.06060725847304182, + "loss": 0.0, + "num_input_tokens_seen": 48201856, + "step": 28130 + }, + { + "epoch": 136.57627118644066, + "grad_norm": 1.0821144869055388e-08, + "learning_rate": 0.06055996356404877, + "loss": 0.0, + "num_input_tokens_seen": 48210368, + "step": 28135 + }, + { + "epoch": 136.6004842615012, + "grad_norm": 1.3593778547260627e-08, + "learning_rate": 0.06051268244783327, + "loss": 0.0, + "num_input_tokens_seen": 48218880, + "step": 28140 + }, + { + "epoch": 136.62469733656175, + "grad_norm": 1.0100621672393117e-08, + "learning_rate": 0.06046541513168676, + "loss": 0.0, + "num_input_tokens_seen": 48227392, + "step": 28145 + }, + { + "epoch": 136.64891041162227, + "grad_norm": 2.1707741382215318e-08, + "learning_rate": 0.060418161622898356, + "loss": 0.0, + "num_input_tokens_seen": 48236032, + "step": 28150 + }, + { + "epoch": 136.6731234866828, + "grad_norm": 7.696065118523165e-09, + "learning_rate": 0.06037092192875521, + "loss": 0.0, + "num_input_tokens_seen": 48244896, + "step": 28155 + }, + { + "epoch": 136.69733656174333, + "grad_norm": 1.1070030225823757e-08, + "learning_rate": 0.060323696056542225, + "loss": 0.0, + "num_input_tokens_seen": 48253856, + "step": 28160 + }, + { + "epoch": 136.72154963680387, + "grad_norm": 2.3872315679795975e-08, + "learning_rate": 0.06027648401354229, + "loss": 0.0, + "num_input_tokens_seen": 48262848, + "step": 28165 + }, + { + "epoch": 136.74576271186442, + "grad_norm": 9.506215370436166e-09, + "learning_rate": 0.06022928580703601, + "loss": 0.0, + "num_input_tokens_seen": 48271104, + "step": 28170 + }, + { + "epoch": 136.76997578692493, + "grad_norm": 3.994290409536916e-08, + "learning_rate": 0.060182101444301986, + "loss": 0.0, + "num_input_tokens_seen": 48279456, + "step": 28175 + }, + { + "epoch": 136.79418886198548, + "grad_norm": 1.169887742236142e-08, + "learning_rate": 0.06013493093261669, + "loss": 0.0, + "num_input_tokens_seen": 48288192, + "step": 28180 + }, + { + "epoch": 136.818401937046, + "grad_norm": 8.742265578121078e-09, + "learning_rate": 0.06008777427925432, + "loss": 0.0, + "num_input_tokens_seen": 48297088, + "step": 28185 + }, + { + "epoch": 136.84261501210653, + "grad_norm": 7.517130029555119e-09, + "learning_rate": 0.06004063149148705, + "loss": 0.0, + "num_input_tokens_seen": 48305440, + "step": 28190 + }, + { + "epoch": 136.86682808716708, + "grad_norm": 3.195178877035687e-08, + "learning_rate": 0.05999350257658497, + "loss": 0.0, + "num_input_tokens_seen": 48313920, + "step": 28195 + }, + { + "epoch": 136.8910411622276, + "grad_norm": 2.3451466546475785e-08, + "learning_rate": 0.05994638754181582, + "loss": 0.0, + "num_input_tokens_seen": 48322528, + "step": 28200 + }, + { + "epoch": 136.8910411622276, + "eval_loss": 1.1441251039505005, + "eval_runtime": 4.611, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 19.952, + "num_input_tokens_seen": 48322528, + "step": 28200 + }, + { + "epoch": 136.91525423728814, + "grad_norm": 1.6805708824563226e-08, + "learning_rate": 0.059899286394445445, + "loss": 0.0, + "num_input_tokens_seen": 48330880, + "step": 28205 + }, + { + "epoch": 136.93946731234865, + "grad_norm": 1.6283248527315664e-08, + "learning_rate": 0.059852199141737346, + "loss": 0.0, + "num_input_tokens_seen": 48339488, + "step": 28210 + }, + { + "epoch": 136.9636803874092, + "grad_norm": 2.49750442549157e-08, + "learning_rate": 0.05980512579095304, + "loss": 0.0, + "num_input_tokens_seen": 48348064, + "step": 28215 + }, + { + "epoch": 136.98789346246974, + "grad_norm": 1.8394409551092394e-08, + "learning_rate": 0.05975806634935181, + "loss": 0.0, + "num_input_tokens_seen": 48356480, + "step": 28220 + }, + { + "epoch": 137.01452784503633, + "grad_norm": 1.60546065330891e-08, + "learning_rate": 0.05971102082419076, + "loss": 0.0, + "num_input_tokens_seen": 48365152, + "step": 28225 + }, + { + "epoch": 137.03874092009684, + "grad_norm": 1.542021443867725e-08, + "learning_rate": 0.05966398922272492, + "loss": 0.0, + "num_input_tokens_seen": 48373856, + "step": 28230 + }, + { + "epoch": 137.0629539951574, + "grad_norm": 1.7383769090884016e-08, + "learning_rate": 0.059616971552207236, + "loss": 0.0, + "num_input_tokens_seen": 48382112, + "step": 28235 + }, + { + "epoch": 137.08716707021793, + "grad_norm": 1.6519665635428282e-08, + "learning_rate": 0.059569967819888305, + "loss": 0.0, + "num_input_tokens_seen": 48390912, + "step": 28240 + }, + { + "epoch": 137.11138014527845, + "grad_norm": 9.755830809865529e-09, + "learning_rate": 0.05952297803301681, + "loss": 0.0, + "num_input_tokens_seen": 48399840, + "step": 28245 + }, + { + "epoch": 137.135593220339, + "grad_norm": 1.0235027048111078e-08, + "learning_rate": 0.059476002198839056, + "loss": 0.0, + "num_input_tokens_seen": 48408576, + "step": 28250 + }, + { + "epoch": 137.1598062953995, + "grad_norm": 2.090634865226093e-08, + "learning_rate": 0.05942904032459935, + "loss": 0.0, + "num_input_tokens_seen": 48416736, + "step": 28255 + }, + { + "epoch": 137.18401937046005, + "grad_norm": 1.8406783652835657e-08, + "learning_rate": 0.05938209241753987, + "loss": 0.0, + "num_input_tokens_seen": 48425184, + "step": 28260 + }, + { + "epoch": 137.2082324455206, + "grad_norm": 2.0992622751236922e-08, + "learning_rate": 0.05933515848490046, + "loss": 0.0, + "num_input_tokens_seen": 48433696, + "step": 28265 + }, + { + "epoch": 137.2324455205811, + "grad_norm": 1.1553814793785477e-08, + "learning_rate": 0.059288238533918985, + "loss": 0.0, + "num_input_tokens_seen": 48442144, + "step": 28270 + }, + { + "epoch": 137.25665859564165, + "grad_norm": 2.896884332415084e-08, + "learning_rate": 0.05924133257183113, + "loss": 0.0, + "num_input_tokens_seen": 48450848, + "step": 28275 + }, + { + "epoch": 137.28087167070217, + "grad_norm": 2.7461366514103247e-08, + "learning_rate": 0.059194440605870285, + "loss": 0.0, + "num_input_tokens_seen": 48458944, + "step": 28280 + }, + { + "epoch": 137.3050847457627, + "grad_norm": 1.373401303794708e-08, + "learning_rate": 0.059147562643267884, + "loss": 0.0, + "num_input_tokens_seen": 48467776, + "step": 28285 + }, + { + "epoch": 137.32929782082326, + "grad_norm": 1.5060805935718236e-08, + "learning_rate": 0.059100698691253055, + "loss": 0.0, + "num_input_tokens_seen": 48476576, + "step": 28290 + }, + { + "epoch": 137.35351089588377, + "grad_norm": 3.355485489464627e-08, + "learning_rate": 0.05905384875705273, + "loss": 0.0, + "num_input_tokens_seen": 48485088, + "step": 28295 + }, + { + "epoch": 137.37772397094432, + "grad_norm": 1.6220129239741254e-08, + "learning_rate": 0.05900701284789189, + "loss": 0.0, + "num_input_tokens_seen": 48493792, + "step": 28300 + }, + { + "epoch": 137.40193704600483, + "grad_norm": 2.3516070868367933e-08, + "learning_rate": 0.058960190970993115, + "loss": 0.0, + "num_input_tokens_seen": 48502176, + "step": 28305 + }, + { + "epoch": 137.42615012106538, + "grad_norm": 8.334778200946857e-09, + "learning_rate": 0.058913383133576955, + "loss": 0.0, + "num_input_tokens_seen": 48510944, + "step": 28310 + }, + { + "epoch": 137.45036319612592, + "grad_norm": 1.3157555933673848e-08, + "learning_rate": 0.05886658934286185, + "loss": 0.0, + "num_input_tokens_seen": 48519072, + "step": 28315 + }, + { + "epoch": 137.47457627118644, + "grad_norm": 1.64812270497805e-08, + "learning_rate": 0.058819809606063846, + "loss": 0.0, + "num_input_tokens_seen": 48527840, + "step": 28320 + }, + { + "epoch": 137.49878934624698, + "grad_norm": 7.379778566019013e-09, + "learning_rate": 0.05877304393039711, + "loss": 0.0, + "num_input_tokens_seen": 48536128, + "step": 28325 + }, + { + "epoch": 137.5230024213075, + "grad_norm": 8.51076098484782e-09, + "learning_rate": 0.05872629232307338, + "loss": 0.0, + "num_input_tokens_seen": 48544704, + "step": 28330 + }, + { + "epoch": 137.54721549636804, + "grad_norm": 1.0294433749891141e-08, + "learning_rate": 0.05867955479130239, + "loss": 0.0, + "num_input_tokens_seen": 48553056, + "step": 28335 + }, + { + "epoch": 137.57142857142858, + "grad_norm": 2.342686578060693e-08, + "learning_rate": 0.058632831342291705, + "loss": 0.0, + "num_input_tokens_seen": 48561344, + "step": 28340 + }, + { + "epoch": 137.5956416464891, + "grad_norm": 2.513466235143369e-08, + "learning_rate": 0.05858612198324655, + "loss": 0.0, + "num_input_tokens_seen": 48569984, + "step": 28345 + }, + { + "epoch": 137.61985472154964, + "grad_norm": 3.239588153292061e-08, + "learning_rate": 0.05853942672137025, + "loss": 0.0, + "num_input_tokens_seen": 48578272, + "step": 28350 + }, + { + "epoch": 137.64406779661016, + "grad_norm": 2.3277165084323315e-08, + "learning_rate": 0.05849274556386363, + "loss": 0.0, + "num_input_tokens_seen": 48587168, + "step": 28355 + }, + { + "epoch": 137.6682808716707, + "grad_norm": 2.9340691654056172e-08, + "learning_rate": 0.05844607851792567, + "loss": 0.0, + "num_input_tokens_seen": 48595264, + "step": 28360 + }, + { + "epoch": 137.69249394673125, + "grad_norm": 1.6903122457279096e-08, + "learning_rate": 0.058399425590752924, + "loss": 0.0, + "num_input_tokens_seen": 48603712, + "step": 28365 + }, + { + "epoch": 137.71670702179176, + "grad_norm": 1.0977752928909013e-08, + "learning_rate": 0.05835278678953985, + "loss": 0.0, + "num_input_tokens_seen": 48612544, + "step": 28370 + }, + { + "epoch": 137.7409200968523, + "grad_norm": 1.1778207742452196e-08, + "learning_rate": 0.05830616212147874, + "loss": 0.0, + "num_input_tokens_seen": 48621312, + "step": 28375 + }, + { + "epoch": 137.76513317191282, + "grad_norm": 2.5440698436796083e-08, + "learning_rate": 0.058259551593759784, + "loss": 0.0, + "num_input_tokens_seen": 48629696, + "step": 28380 + }, + { + "epoch": 137.78934624697337, + "grad_norm": 2.3813784721937736e-08, + "learning_rate": 0.058212955213570804, + "loss": 0.0, + "num_input_tokens_seen": 48637696, + "step": 28385 + }, + { + "epoch": 137.8135593220339, + "grad_norm": 8.63580940091424e-09, + "learning_rate": 0.0581663729880976, + "loss": 0.0, + "num_input_tokens_seen": 48646016, + "step": 28390 + }, + { + "epoch": 137.83777239709443, + "grad_norm": 6.3273293271493e-09, + "learning_rate": 0.05811980492452379, + "loss": 0.0, + "num_input_tokens_seen": 48654816, + "step": 28395 + }, + { + "epoch": 137.86198547215497, + "grad_norm": 1.7708215338529953e-08, + "learning_rate": 0.058073251030030644, + "loss": 0.0, + "num_input_tokens_seen": 48663488, + "step": 28400 + }, + { + "epoch": 137.86198547215497, + "eval_loss": 1.1561288833618164, + "eval_runtime": 4.6126, + "eval_samples_per_second": 79.566, + "eval_steps_per_second": 19.946, + "num_input_tokens_seen": 48663488, + "step": 28400 + }, + { + "epoch": 137.88619854721549, + "grad_norm": 1.7212698821822414e-08, + "learning_rate": 0.05802671131179747, + "loss": 0.0, + "num_input_tokens_seen": 48672000, + "step": 28405 + }, + { + "epoch": 137.91041162227603, + "grad_norm": 1.248844139212224e-08, + "learning_rate": 0.057980185777001154, + "loss": 0.0, + "num_input_tokens_seen": 48680896, + "step": 28410 + }, + { + "epoch": 137.93462469733657, + "grad_norm": 2.5034411876845297e-08, + "learning_rate": 0.057933674432816606, + "loss": 0.0, + "num_input_tokens_seen": 48689824, + "step": 28415 + }, + { + "epoch": 137.9588377723971, + "grad_norm": 2.9342583474090134e-08, + "learning_rate": 0.05788717728641648, + "loss": 0.0, + "num_input_tokens_seen": 48698688, + "step": 28420 + }, + { + "epoch": 137.98305084745763, + "grad_norm": 1.7821699671571878e-08, + "learning_rate": 0.057840694344971126, + "loss": 0.0, + "num_input_tokens_seen": 48707424, + "step": 28425 + }, + { + "epoch": 138.00968523002422, + "grad_norm": 2.2070890892678108e-08, + "learning_rate": 0.0577942256156489, + "loss": 0.0, + "num_input_tokens_seen": 48716064, + "step": 28430 + }, + { + "epoch": 138.03389830508473, + "grad_norm": 3.4900008216709466e-08, + "learning_rate": 0.057747771105615804, + "loss": 0.0, + "num_input_tokens_seen": 48724384, + "step": 28435 + }, + { + "epoch": 138.05811138014528, + "grad_norm": 6.976046407203285e-09, + "learning_rate": 0.05770133082203568, + "loss": 0.0, + "num_input_tokens_seen": 48732704, + "step": 28440 + }, + { + "epoch": 138.08232445520582, + "grad_norm": 4.004618503472557e-08, + "learning_rate": 0.0576549047720703, + "loss": 0.0, + "num_input_tokens_seen": 48741408, + "step": 28445 + }, + { + "epoch": 138.10653753026634, + "grad_norm": 1.4681051041520732e-08, + "learning_rate": 0.05760849296287902, + "loss": 0.0, + "num_input_tokens_seen": 48749824, + "step": 28450 + }, + { + "epoch": 138.13075060532688, + "grad_norm": 1.6920873591175223e-08, + "learning_rate": 0.05756209540161919, + "loss": 0.0, + "num_input_tokens_seen": 48758432, + "step": 28455 + }, + { + "epoch": 138.1549636803874, + "grad_norm": 1.2215315869923415e-08, + "learning_rate": 0.05751571209544595, + "loss": 0.0, + "num_input_tokens_seen": 48767328, + "step": 28460 + }, + { + "epoch": 138.17917675544794, + "grad_norm": 1.5028284394702496e-08, + "learning_rate": 0.057469343051512085, + "loss": 0.0, + "num_input_tokens_seen": 48775872, + "step": 28465 + }, + { + "epoch": 138.20338983050848, + "grad_norm": 1.6475580011388047e-08, + "learning_rate": 0.057422988276968324, + "loss": 0.0, + "num_input_tokens_seen": 48784896, + "step": 28470 + }, + { + "epoch": 138.227602905569, + "grad_norm": 8.865809419944526e-09, + "learning_rate": 0.05737664777896323, + "loss": 0.0, + "num_input_tokens_seen": 48793600, + "step": 28475 + }, + { + "epoch": 138.25181598062954, + "grad_norm": 7.360616116613983e-09, + "learning_rate": 0.057330321564642975, + "loss": 0.0, + "num_input_tokens_seen": 48802496, + "step": 28480 + }, + { + "epoch": 138.27602905569006, + "grad_norm": 1.0406108863492136e-08, + "learning_rate": 0.05728400964115174, + "loss": 0.0, + "num_input_tokens_seen": 48811424, + "step": 28485 + }, + { + "epoch": 138.3002421307506, + "grad_norm": 2.869504633906672e-08, + "learning_rate": 0.057237712015631305, + "loss": 0.0, + "num_input_tokens_seen": 48819808, + "step": 28490 + }, + { + "epoch": 138.32445520581115, + "grad_norm": 1.9587290012168523e-08, + "learning_rate": 0.057191428695221425, + "loss": 0.0, + "num_input_tokens_seen": 48828320, + "step": 28495 + }, + { + "epoch": 138.34866828087166, + "grad_norm": 2.7476062314235605e-08, + "learning_rate": 0.05714515968705958, + "loss": 0.0, + "num_input_tokens_seen": 48837600, + "step": 28500 + }, + { + "epoch": 138.3728813559322, + "grad_norm": 1.891031331524573e-08, + "learning_rate": 0.05709890499828099, + "loss": 0.0, + "num_input_tokens_seen": 48846176, + "step": 28505 + }, + { + "epoch": 138.39709443099272, + "grad_norm": 1.3042236624016823e-08, + "learning_rate": 0.05705266463601868, + "loss": 0.0, + "num_input_tokens_seen": 48855040, + "step": 28510 + }, + { + "epoch": 138.42130750605327, + "grad_norm": 5.374671374624995e-08, + "learning_rate": 0.057006438607403565, + "loss": 0.0, + "num_input_tokens_seen": 48863360, + "step": 28515 + }, + { + "epoch": 138.4455205811138, + "grad_norm": 1.4235137513196605e-08, + "learning_rate": 0.056960226919564205, + "loss": 0.0, + "num_input_tokens_seen": 48871680, + "step": 28520 + }, + { + "epoch": 138.46973365617433, + "grad_norm": 2.0563980740462284e-08, + "learning_rate": 0.05691402957962713, + "loss": 0.0, + "num_input_tokens_seen": 48880416, + "step": 28525 + }, + { + "epoch": 138.49394673123487, + "grad_norm": 1.1636220875743675e-08, + "learning_rate": 0.05686784659471642, + "loss": 0.0, + "num_input_tokens_seen": 48889152, + "step": 28530 + }, + { + "epoch": 138.5181598062954, + "grad_norm": 1.4379578416878758e-08, + "learning_rate": 0.056821677971954136, + "loss": 0.0, + "num_input_tokens_seen": 48897760, + "step": 28535 + }, + { + "epoch": 138.54237288135593, + "grad_norm": 2.0169219183685527e-08, + "learning_rate": 0.05677552371846012, + "loss": 0.0, + "num_input_tokens_seen": 48906176, + "step": 28540 + }, + { + "epoch": 138.56658595641647, + "grad_norm": 6.412446129644422e-09, + "learning_rate": 0.05672938384135182, + "loss": 0.0, + "num_input_tokens_seen": 48914592, + "step": 28545 + }, + { + "epoch": 138.590799031477, + "grad_norm": 1.1596283044923439e-08, + "learning_rate": 0.05668325834774465, + "loss": 0.0, + "num_input_tokens_seen": 48923232, + "step": 28550 + }, + { + "epoch": 138.61501210653753, + "grad_norm": 1.7836841337270926e-08, + "learning_rate": 0.05663714724475177, + "loss": 0.0, + "num_input_tokens_seen": 48931552, + "step": 28555 + }, + { + "epoch": 138.63922518159805, + "grad_norm": 1.2599148391245762e-08, + "learning_rate": 0.05659105053948403, + "loss": 0.0, + "num_input_tokens_seen": 48939872, + "step": 28560 + }, + { + "epoch": 138.6634382566586, + "grad_norm": 2.8215925595986846e-08, + "learning_rate": 0.056544968239050176, + "loss": 0.0, + "num_input_tokens_seen": 48948224, + "step": 28565 + }, + { + "epoch": 138.68765133171914, + "grad_norm": 9.536627487705118e-09, + "learning_rate": 0.056498900350556616, + "loss": 0.0, + "num_input_tokens_seen": 48956768, + "step": 28570 + }, + { + "epoch": 138.71186440677965, + "grad_norm": 1.2196837317901554e-08, + "learning_rate": 0.05645284688110766, + "loss": 0.0, + "num_input_tokens_seen": 48965152, + "step": 28575 + }, + { + "epoch": 138.7360774818402, + "grad_norm": 1.3655604647055952e-08, + "learning_rate": 0.05640680783780532, + "loss": 0.0, + "num_input_tokens_seen": 48973472, + "step": 28580 + }, + { + "epoch": 138.7602905569007, + "grad_norm": 1.6144829473319078e-08, + "learning_rate": 0.056360783227749324, + "loss": 0.0, + "num_input_tokens_seen": 48982112, + "step": 28585 + }, + { + "epoch": 138.78450363196126, + "grad_norm": 1.9849268895200112e-08, + "learning_rate": 0.05631477305803728, + "loss": 0.0, + "num_input_tokens_seen": 48990592, + "step": 28590 + }, + { + "epoch": 138.8087167070218, + "grad_norm": 1.710211350314239e-08, + "learning_rate": 0.05626877733576462, + "loss": 0.0, + "num_input_tokens_seen": 48999328, + "step": 28595 + }, + { + "epoch": 138.83292978208232, + "grad_norm": 1.5385607454732053e-08, + "learning_rate": 0.05622279606802435, + "loss": 0.0, + "num_input_tokens_seen": 49008000, + "step": 28600 + }, + { + "epoch": 138.83292978208232, + "eval_loss": 1.151678204536438, + "eval_runtime": 4.6257, + "eval_samples_per_second": 79.339, + "eval_steps_per_second": 19.889, + "num_input_tokens_seen": 49008000, + "step": 28600 + }, + { + "epoch": 138.85714285714286, + "grad_norm": 1.89628899249783e-08, + "learning_rate": 0.05617682926190744, + "loss": 0.0, + "num_input_tokens_seen": 49016320, + "step": 28605 + }, + { + "epoch": 138.88135593220338, + "grad_norm": 1.4918724033918807e-08, + "learning_rate": 0.05613087692450248, + "loss": 0.0, + "num_input_tokens_seen": 49025056, + "step": 28610 + }, + { + "epoch": 138.90556900726392, + "grad_norm": 1.0849470655216464e-08, + "learning_rate": 0.05608493906289592, + "loss": 0.0, + "num_input_tokens_seen": 49033408, + "step": 28615 + }, + { + "epoch": 138.92978208232446, + "grad_norm": 1.9272235363132495e-08, + "learning_rate": 0.05603901568417201, + "loss": 0.0, + "num_input_tokens_seen": 49041888, + "step": 28620 + }, + { + "epoch": 138.95399515738498, + "grad_norm": 1.1998280591285493e-08, + "learning_rate": 0.055993106795412625, + "loss": 0.0, + "num_input_tokens_seen": 49050208, + "step": 28625 + }, + { + "epoch": 138.97820823244552, + "grad_norm": 1.676653837989761e-08, + "learning_rate": 0.05594721240369759, + "loss": 0.0, + "num_input_tokens_seen": 49058464, + "step": 28630 + }, + { + "epoch": 139.0048426150121, + "grad_norm": 6.457902657075465e-08, + "learning_rate": 0.055901332516104296, + "loss": 0.0, + "num_input_tokens_seen": 49067584, + "step": 28635 + }, + { + "epoch": 139.02905569007265, + "grad_norm": 1.9882811841398507e-08, + "learning_rate": 0.05585546713970804, + "loss": 0.0, + "num_input_tokens_seen": 49076096, + "step": 28640 + }, + { + "epoch": 139.05326876513317, + "grad_norm": 9.499118824862762e-09, + "learning_rate": 0.05580961628158189, + "loss": 0.0, + "num_input_tokens_seen": 49084480, + "step": 28645 + }, + { + "epoch": 139.0774818401937, + "grad_norm": 8.481899627099665e-09, + "learning_rate": 0.05576377994879659, + "loss": 0.0, + "num_input_tokens_seen": 49092928, + "step": 28650 + }, + { + "epoch": 139.10169491525423, + "grad_norm": 1.2927752202074316e-08, + "learning_rate": 0.05571795814842063, + "loss": 0.0, + "num_input_tokens_seen": 49101344, + "step": 28655 + }, + { + "epoch": 139.12590799031477, + "grad_norm": 1.2173497765388674e-08, + "learning_rate": 0.05567215088752037, + "loss": 0.0, + "num_input_tokens_seen": 49110016, + "step": 28660 + }, + { + "epoch": 139.15012106537532, + "grad_norm": 1.3609452231833075e-08, + "learning_rate": 0.05562635817315981, + "loss": 0.0, + "num_input_tokens_seen": 49118720, + "step": 28665 + }, + { + "epoch": 139.17433414043583, + "grad_norm": 3.231766143585446e-08, + "learning_rate": 0.05558058001240083, + "loss": 0.0, + "num_input_tokens_seen": 49127392, + "step": 28670 + }, + { + "epoch": 139.19854721549638, + "grad_norm": 1.2074427679920063e-08, + "learning_rate": 0.055534816412302915, + "loss": 0.0, + "num_input_tokens_seen": 49136128, + "step": 28675 + }, + { + "epoch": 139.2227602905569, + "grad_norm": 1.8444735516709443e-08, + "learning_rate": 0.055489067379923436, + "loss": 0.0, + "num_input_tokens_seen": 49144768, + "step": 28680 + }, + { + "epoch": 139.24697336561744, + "grad_norm": 1.1923694032134335e-08, + "learning_rate": 0.055443332922317505, + "loss": 0.0, + "num_input_tokens_seen": 49153280, + "step": 28685 + }, + { + "epoch": 139.27118644067798, + "grad_norm": 1.9319763566727488e-08, + "learning_rate": 0.055397613046537876, + "loss": 0.0, + "num_input_tokens_seen": 49162272, + "step": 28690 + }, + { + "epoch": 139.2953995157385, + "grad_norm": 4.618351479024341e-09, + "learning_rate": 0.055351907759635145, + "loss": 0.0, + "num_input_tokens_seen": 49170944, + "step": 28695 + }, + { + "epoch": 139.31961259079904, + "grad_norm": 1.5311021783759315e-08, + "learning_rate": 0.05530621706865772, + "loss": 0.0, + "num_input_tokens_seen": 49179808, + "step": 28700 + }, + { + "epoch": 139.34382566585955, + "grad_norm": 1.425698048507229e-08, + "learning_rate": 0.055260540980651564, + "loss": 0.0, + "num_input_tokens_seen": 49188320, + "step": 28705 + }, + { + "epoch": 139.3680387409201, + "grad_norm": 1.2150724870707563e-08, + "learning_rate": 0.05521487950266062, + "loss": 0.0, + "num_input_tokens_seen": 49196864, + "step": 28710 + }, + { + "epoch": 139.39225181598064, + "grad_norm": 1.4625974209536707e-08, + "learning_rate": 0.055169232641726344, + "loss": 0.0, + "num_input_tokens_seen": 49205344, + "step": 28715 + }, + { + "epoch": 139.41646489104116, + "grad_norm": 1.4979887552613036e-08, + "learning_rate": 0.055123600404888166, + "loss": 0.0, + "num_input_tokens_seen": 49214176, + "step": 28720 + }, + { + "epoch": 139.4406779661017, + "grad_norm": 3.009504823126008e-08, + "learning_rate": 0.05507798279918309, + "loss": 0.0, + "num_input_tokens_seen": 49222912, + "step": 28725 + }, + { + "epoch": 139.46489104116222, + "grad_norm": 6.366321692041765e-09, + "learning_rate": 0.0550323798316459, + "loss": 0.0, + "num_input_tokens_seen": 49231424, + "step": 28730 + }, + { + "epoch": 139.48910411622276, + "grad_norm": 1.4086228183884941e-08, + "learning_rate": 0.05498679150930916, + "loss": 0.0, + "num_input_tokens_seen": 49240064, + "step": 28735 + }, + { + "epoch": 139.5133171912833, + "grad_norm": 2.5453461560687174e-08, + "learning_rate": 0.05494121783920323, + "loss": 0.0, + "num_input_tokens_seen": 49248576, + "step": 28740 + }, + { + "epoch": 139.53753026634382, + "grad_norm": 2.4481561666789275e-08, + "learning_rate": 0.05489565882835605, + "loss": 0.0, + "num_input_tokens_seen": 49257056, + "step": 28745 + }, + { + "epoch": 139.56174334140437, + "grad_norm": 2.316451030992539e-08, + "learning_rate": 0.05485011448379348, + "loss": 0.0, + "num_input_tokens_seen": 49265728, + "step": 28750 + }, + { + "epoch": 139.58595641646488, + "grad_norm": 9.45931688534074e-09, + "learning_rate": 0.05480458481253893, + "loss": 0.0, + "num_input_tokens_seen": 49274016, + "step": 28755 + }, + { + "epoch": 139.61016949152543, + "grad_norm": 4.7223833732346066e-09, + "learning_rate": 0.054759069821613715, + "loss": 0.0, + "num_input_tokens_seen": 49282624, + "step": 28760 + }, + { + "epoch": 139.63438256658597, + "grad_norm": 1.4490034949687924e-08, + "learning_rate": 0.05471356951803683, + "loss": 0.0, + "num_input_tokens_seen": 49290848, + "step": 28765 + }, + { + "epoch": 139.65859564164649, + "grad_norm": 1.2004663929587878e-08, + "learning_rate": 0.054668083908824945, + "loss": 0.0, + "num_input_tokens_seen": 49299104, + "step": 28770 + }, + { + "epoch": 139.68280871670703, + "grad_norm": 9.023066738222951e-09, + "learning_rate": 0.054622613000992526, + "loss": 0.0, + "num_input_tokens_seen": 49307488, + "step": 28775 + }, + { + "epoch": 139.70702179176754, + "grad_norm": 1.9865900924287416e-08, + "learning_rate": 0.05457715680155182, + "loss": 0.0, + "num_input_tokens_seen": 49316000, + "step": 28780 + }, + { + "epoch": 139.7312348668281, + "grad_norm": 1.2448057695735315e-08, + "learning_rate": 0.05453171531751265, + "loss": 0.0, + "num_input_tokens_seen": 49324384, + "step": 28785 + }, + { + "epoch": 139.75544794188863, + "grad_norm": 9.898069919245245e-09, + "learning_rate": 0.05448628855588276, + "loss": 0.0, + "num_input_tokens_seen": 49332768, + "step": 28790 + }, + { + "epoch": 139.77966101694915, + "grad_norm": 1.5999967573065987e-08, + "learning_rate": 0.05444087652366746, + "loss": 0.0, + "num_input_tokens_seen": 49341440, + "step": 28795 + }, + { + "epoch": 139.8038740920097, + "grad_norm": 8.102587933933592e-09, + "learning_rate": 0.05439547922786984, + "loss": 0.0, + "num_input_tokens_seen": 49350304, + "step": 28800 + }, + { + "epoch": 139.8038740920097, + "eval_loss": 1.1506969928741455, + "eval_runtime": 4.6095, + "eval_samples_per_second": 79.618, + "eval_steps_per_second": 19.959, + "num_input_tokens_seen": 49350304, + "step": 28800 + }, + { + "epoch": 139.8280871670702, + "grad_norm": 2.1224019874921396e-08, + "learning_rate": 0.0543500966754908, + "loss": 0.0, + "num_input_tokens_seen": 49359072, + "step": 28805 + }, + { + "epoch": 139.85230024213075, + "grad_norm": 2.0675933853908646e-08, + "learning_rate": 0.05430472887352882, + "loss": 0.0, + "num_input_tokens_seen": 49367712, + "step": 28810 + }, + { + "epoch": 139.8765133171913, + "grad_norm": 1.2636790280851073e-08, + "learning_rate": 0.05425937582898023, + "loss": 0.0, + "num_input_tokens_seen": 49376288, + "step": 28815 + }, + { + "epoch": 139.9007263922518, + "grad_norm": 2.025210932288246e-08, + "learning_rate": 0.054214037548839085, + "loss": 0.0, + "num_input_tokens_seen": 49384928, + "step": 28820 + }, + { + "epoch": 139.92493946731236, + "grad_norm": 2.3200160015335314e-08, + "learning_rate": 0.05416871404009703, + "loss": 0.0, + "num_input_tokens_seen": 49393376, + "step": 28825 + }, + { + "epoch": 139.94915254237287, + "grad_norm": 1.5440342338024493e-08, + "learning_rate": 0.054123405309743605, + "loss": 0.0, + "num_input_tokens_seen": 49401728, + "step": 28830 + }, + { + "epoch": 139.97336561743342, + "grad_norm": 6.761184057069158e-09, + "learning_rate": 0.0540781113647659, + "loss": 0.0, + "num_input_tokens_seen": 49410560, + "step": 28835 + }, + { + "epoch": 139.99757869249396, + "grad_norm": 2.439915469665266e-08, + "learning_rate": 0.054032832212148836, + "loss": 0.0, + "num_input_tokens_seen": 49418688, + "step": 28840 + }, + { + "epoch": 140.02421307506054, + "grad_norm": 1.2705423380054981e-08, + "learning_rate": 0.0539875678588751, + "loss": 0.0, + "num_input_tokens_seen": 49427392, + "step": 28845 + }, + { + "epoch": 140.04842615012106, + "grad_norm": 5.821255921745205e-09, + "learning_rate": 0.05394231831192492, + "loss": 0.0, + "num_input_tokens_seen": 49435968, + "step": 28850 + }, + { + "epoch": 140.0726392251816, + "grad_norm": 1.527561188652271e-08, + "learning_rate": 0.05389708357827639, + "loss": 0.0, + "num_input_tokens_seen": 49444864, + "step": 28855 + }, + { + "epoch": 140.09685230024212, + "grad_norm": 1.5230071426231007e-08, + "learning_rate": 0.05385186366490533, + "loss": 0.0, + "num_input_tokens_seen": 49453440, + "step": 28860 + }, + { + "epoch": 140.12106537530266, + "grad_norm": 1.4439738293958726e-08, + "learning_rate": 0.053806658578785166, + "loss": 0.0, + "num_input_tokens_seen": 49462176, + "step": 28865 + }, + { + "epoch": 140.1452784503632, + "grad_norm": 1.08397699705165e-08, + "learning_rate": 0.05376146832688705, + "loss": 0.0, + "num_input_tokens_seen": 49470912, + "step": 28870 + }, + { + "epoch": 140.16949152542372, + "grad_norm": 8.993785272082278e-09, + "learning_rate": 0.053716292916179964, + "loss": 0.0, + "num_input_tokens_seen": 49479264, + "step": 28875 + }, + { + "epoch": 140.19370460048427, + "grad_norm": 1.0595532451418421e-08, + "learning_rate": 0.05367113235363045, + "loss": 0.0, + "num_input_tokens_seen": 49487680, + "step": 28880 + }, + { + "epoch": 140.21791767554478, + "grad_norm": 1.1223685092431879e-08, + "learning_rate": 0.05362598664620289, + "loss": 0.0, + "num_input_tokens_seen": 49496352, + "step": 28885 + }, + { + "epoch": 140.24213075060533, + "grad_norm": 2.1489881874003913e-08, + "learning_rate": 0.053580855800859285, + "loss": 0.0, + "num_input_tokens_seen": 49504960, + "step": 28890 + }, + { + "epoch": 140.26634382566587, + "grad_norm": 1.0559393359699243e-08, + "learning_rate": 0.05353573982455938, + "loss": 0.0, + "num_input_tokens_seen": 49513664, + "step": 28895 + }, + { + "epoch": 140.2905569007264, + "grad_norm": 1.7594407708543258e-08, + "learning_rate": 0.053490638724260686, + "loss": 0.0, + "num_input_tokens_seen": 49522080, + "step": 28900 + }, + { + "epoch": 140.31476997578693, + "grad_norm": 1.1824305978791472e-08, + "learning_rate": 0.05344555250691827, + "loss": 0.0, + "num_input_tokens_seen": 49531072, + "step": 28905 + }, + { + "epoch": 140.33898305084745, + "grad_norm": 1.8258075940025265e-08, + "learning_rate": 0.053400481179485086, + "loss": 0.0, + "num_input_tokens_seen": 49539840, + "step": 28910 + }, + { + "epoch": 140.363196125908, + "grad_norm": 1.4534106362873445e-08, + "learning_rate": 0.05335542474891159, + "loss": 0.0, + "num_input_tokens_seen": 49548448, + "step": 28915 + }, + { + "epoch": 140.38740920096853, + "grad_norm": 1.3771114915073213e-08, + "learning_rate": 0.053310383222146124, + "loss": 0.0, + "num_input_tokens_seen": 49556832, + "step": 28920 + }, + { + "epoch": 140.41162227602905, + "grad_norm": 1.1292678792074184e-08, + "learning_rate": 0.053265356606134684, + "loss": 0.0, + "num_input_tokens_seen": 49565664, + "step": 28925 + }, + { + "epoch": 140.4358353510896, + "grad_norm": 2.1361094226790556e-08, + "learning_rate": 0.053220344907820856, + "loss": 0.0, + "num_input_tokens_seen": 49574144, + "step": 28930 + }, + { + "epoch": 140.4600484261501, + "grad_norm": 7.420311476380448e-09, + "learning_rate": 0.05317534813414608, + "loss": 0.0, + "num_input_tokens_seen": 49582688, + "step": 28935 + }, + { + "epoch": 140.48426150121065, + "grad_norm": 7.0552994557715465e-09, + "learning_rate": 0.05313036629204942, + "loss": 0.0, + "num_input_tokens_seen": 49591680, + "step": 28940 + }, + { + "epoch": 140.5084745762712, + "grad_norm": 1.505134505919159e-08, + "learning_rate": 0.05308539938846756, + "loss": 0.0, + "num_input_tokens_seen": 49600224, + "step": 28945 + }, + { + "epoch": 140.5326876513317, + "grad_norm": 6.76461553439367e-09, + "learning_rate": 0.05304044743033507, + "loss": 0.0, + "num_input_tokens_seen": 49608768, + "step": 28950 + }, + { + "epoch": 140.55690072639226, + "grad_norm": 8.844020626952442e-09, + "learning_rate": 0.05299551042458401, + "loss": 0.0, + "num_input_tokens_seen": 49617568, + "step": 28955 + }, + { + "epoch": 140.58111380145277, + "grad_norm": 1.2488817979772193e-08, + "learning_rate": 0.052950588378144266, + "loss": 0.0, + "num_input_tokens_seen": 49625792, + "step": 28960 + }, + { + "epoch": 140.60532687651332, + "grad_norm": 1.9906341464093202e-08, + "learning_rate": 0.052905681297943465, + "loss": 0.0, + "num_input_tokens_seen": 49634496, + "step": 28965 + }, + { + "epoch": 140.62953995157386, + "grad_norm": 9.29179577724426e-09, + "learning_rate": 0.0528607891909067, + "loss": 0.0, + "num_input_tokens_seen": 49643488, + "step": 28970 + }, + { + "epoch": 140.65375302663438, + "grad_norm": 1.7647394656705728e-08, + "learning_rate": 0.05281591206395697, + "loss": 0.0, + "num_input_tokens_seen": 49651808, + "step": 28975 + }, + { + "epoch": 140.67796610169492, + "grad_norm": 9.619187224529924e-09, + "learning_rate": 0.05277104992401496, + "loss": 0.0, + "num_input_tokens_seen": 49660512, + "step": 28980 + }, + { + "epoch": 140.70217917675544, + "grad_norm": 1.3954966071594299e-08, + "learning_rate": 0.05272620277799884, + "loss": 0.0, + "num_input_tokens_seen": 49668928, + "step": 28985 + }, + { + "epoch": 140.72639225181598, + "grad_norm": 8.302106557778188e-09, + "learning_rate": 0.05268137063282473, + "loss": 0.0, + "num_input_tokens_seen": 49677504, + "step": 28990 + }, + { + "epoch": 140.75060532687652, + "grad_norm": 1.746808209190931e-08, + "learning_rate": 0.0526365534954062, + "loss": 0.0, + "num_input_tokens_seen": 49686048, + "step": 28995 + }, + { + "epoch": 140.77481840193704, + "grad_norm": 1.3082680716536288e-08, + "learning_rate": 0.052591751372654656, + "loss": 0.0, + "num_input_tokens_seen": 49694528, + "step": 29000 + }, + { + "epoch": 140.77481840193704, + "eval_loss": 1.1618506908416748, + "eval_runtime": 4.616, + "eval_samples_per_second": 79.505, + "eval_steps_per_second": 19.93, + "num_input_tokens_seen": 49694528, + "step": 29000 + }, + { + "epoch": 140.79903147699758, + "grad_norm": 1.02488240116827e-08, + "learning_rate": 0.05254696427147921, + "loss": 0.0, + "num_input_tokens_seen": 49703104, + "step": 29005 + }, + { + "epoch": 140.8232445520581, + "grad_norm": 1.2280618300053447e-08, + "learning_rate": 0.052502192198786546, + "loss": 0.0, + "num_input_tokens_seen": 49711616, + "step": 29010 + }, + { + "epoch": 140.84745762711864, + "grad_norm": 1.7746659253248254e-08, + "learning_rate": 0.05245743516148103, + "loss": 0.0, + "num_input_tokens_seen": 49720384, + "step": 29015 + }, + { + "epoch": 140.8716707021792, + "grad_norm": 2.70778901523272e-08, + "learning_rate": 0.05241269316646486, + "loss": 0.0, + "num_input_tokens_seen": 49728704, + "step": 29020 + }, + { + "epoch": 140.8958837772397, + "grad_norm": 1.0926836324642863e-08, + "learning_rate": 0.052367966220637725, + "loss": 0.0, + "num_input_tokens_seen": 49737120, + "step": 29025 + }, + { + "epoch": 140.92009685230025, + "grad_norm": 7.2676535900484396e-09, + "learning_rate": 0.05232325433089716, + "loss": 0.0, + "num_input_tokens_seen": 49745312, + "step": 29030 + }, + { + "epoch": 140.94430992736076, + "grad_norm": 1.3169984214300712e-08, + "learning_rate": 0.052278557504138214, + "loss": 0.0, + "num_input_tokens_seen": 49753824, + "step": 29035 + }, + { + "epoch": 140.9685230024213, + "grad_norm": 1.8094851839123294e-08, + "learning_rate": 0.05223387574725372, + "loss": 0.0, + "num_input_tokens_seen": 49762400, + "step": 29040 + }, + { + "epoch": 140.99273607748185, + "grad_norm": 3.239663115550684e-08, + "learning_rate": 0.05218920906713428, + "loss": 0.0, + "num_input_tokens_seen": 49770848, + "step": 29045 + }, + { + "epoch": 141.01937046004844, + "grad_norm": 1.403262395172078e-08, + "learning_rate": 0.05214455747066789, + "loss": 0.0, + "num_input_tokens_seen": 49780032, + "step": 29050 + }, + { + "epoch": 141.04358353510895, + "grad_norm": 1.041115993416497e-08, + "learning_rate": 0.05209992096474048, + "loss": 0.0, + "num_input_tokens_seen": 49788608, + "step": 29055 + }, + { + "epoch": 141.0677966101695, + "grad_norm": 1.5028254196636226e-08, + "learning_rate": 0.05205529955623559, + "loss": 0.0, + "num_input_tokens_seen": 49797152, + "step": 29060 + }, + { + "epoch": 141.09200968523, + "grad_norm": 1.266194527005382e-08, + "learning_rate": 0.052010693252034314, + "loss": 0.0, + "num_input_tokens_seen": 49805600, + "step": 29065 + }, + { + "epoch": 141.11622276029055, + "grad_norm": 7.857103412334254e-09, + "learning_rate": 0.0519661020590156, + "loss": 0.0, + "num_input_tokens_seen": 49814080, + "step": 29070 + }, + { + "epoch": 141.1404358353511, + "grad_norm": 4.8497983407003176e-09, + "learning_rate": 0.05192152598405586, + "loss": 0.0, + "num_input_tokens_seen": 49822368, + "step": 29075 + }, + { + "epoch": 141.16464891041161, + "grad_norm": 1.4626759359259722e-08, + "learning_rate": 0.05187696503402941, + "loss": 0.0, + "num_input_tokens_seen": 49830880, + "step": 29080 + }, + { + "epoch": 141.18886198547216, + "grad_norm": 1.818315453760988e-08, + "learning_rate": 0.05183241921580798, + "loss": 0.0, + "num_input_tokens_seen": 49839680, + "step": 29085 + }, + { + "epoch": 141.21307506053267, + "grad_norm": 9.681823343044016e-09, + "learning_rate": 0.051787888536261206, + "loss": 0.0, + "num_input_tokens_seen": 49848224, + "step": 29090 + }, + { + "epoch": 141.23728813559322, + "grad_norm": 1.6365396149353728e-08, + "learning_rate": 0.051743373002256184, + "loss": 0.0, + "num_input_tokens_seen": 49856800, + "step": 29095 + }, + { + "epoch": 141.26150121065376, + "grad_norm": 4.404913056532678e-08, + "learning_rate": 0.05169887262065787, + "loss": 0.0, + "num_input_tokens_seen": 49865280, + "step": 29100 + }, + { + "epoch": 141.28571428571428, + "grad_norm": 1.1699639479445523e-08, + "learning_rate": 0.051654387398328665, + "loss": 0.0, + "num_input_tokens_seen": 49873536, + "step": 29105 + }, + { + "epoch": 141.30992736077482, + "grad_norm": 8.513643123819747e-09, + "learning_rate": 0.05160991734212888, + "loss": 0.0, + "num_input_tokens_seen": 49882240, + "step": 29110 + }, + { + "epoch": 141.33414043583534, + "grad_norm": 1.5183784896066754e-08, + "learning_rate": 0.051565462458916224, + "loss": 0.0, + "num_input_tokens_seen": 49890400, + "step": 29115 + }, + { + "epoch": 141.35835351089588, + "grad_norm": 5.2723514443187014e-08, + "learning_rate": 0.05152102275554627, + "loss": 0.0, + "num_input_tokens_seen": 49899008, + "step": 29120 + }, + { + "epoch": 141.38256658595643, + "grad_norm": 3.6019174309842583e-09, + "learning_rate": 0.05147659823887222, + "loss": 0.0, + "num_input_tokens_seen": 49907488, + "step": 29125 + }, + { + "epoch": 141.40677966101694, + "grad_norm": 8.768806125658557e-09, + "learning_rate": 0.05143218891574479, + "loss": 0.0, + "num_input_tokens_seen": 49916320, + "step": 29130 + }, + { + "epoch": 141.43099273607749, + "grad_norm": 1.951583072923313e-08, + "learning_rate": 0.0513877947930125, + "loss": 0.0, + "num_input_tokens_seen": 49924832, + "step": 29135 + }, + { + "epoch": 141.455205811138, + "grad_norm": 2.0019852442487718e-08, + "learning_rate": 0.051343415877521566, + "loss": 0.0, + "num_input_tokens_seen": 49933504, + "step": 29140 + }, + { + "epoch": 141.47941888619854, + "grad_norm": 2.5390722413476396e-08, + "learning_rate": 0.051299052176115634, + "loss": 0.0, + "num_input_tokens_seen": 49941952, + "step": 29145 + }, + { + "epoch": 141.5036319612591, + "grad_norm": 1.615939559940216e-08, + "learning_rate": 0.051254703695636256, + "loss": 0.0, + "num_input_tokens_seen": 49950496, + "step": 29150 + }, + { + "epoch": 141.5278450363196, + "grad_norm": 7.647947164457491e-09, + "learning_rate": 0.05121037044292249, + "loss": 0.0, + "num_input_tokens_seen": 49959104, + "step": 29155 + }, + { + "epoch": 141.55205811138015, + "grad_norm": 1.4274711190864764e-08, + "learning_rate": 0.05116605242481101, + "loss": 0.0, + "num_input_tokens_seen": 49967424, + "step": 29160 + }, + { + "epoch": 141.57627118644066, + "grad_norm": 8.302283305283709e-09, + "learning_rate": 0.05112174964813634, + "loss": 0.0, + "num_input_tokens_seen": 49975968, + "step": 29165 + }, + { + "epoch": 141.6004842615012, + "grad_norm": 9.863567740353574e-09, + "learning_rate": 0.05107746211973038, + "loss": 0.0, + "num_input_tokens_seen": 49984416, + "step": 29170 + }, + { + "epoch": 141.62469733656175, + "grad_norm": 1.728963283653684e-08, + "learning_rate": 0.05103318984642291, + "loss": 0.0, + "num_input_tokens_seen": 49993632, + "step": 29175 + }, + { + "epoch": 141.64891041162227, + "grad_norm": 1.1571183122782713e-08, + "learning_rate": 0.05098893283504131, + "loss": 0.0, + "num_input_tokens_seen": 50002208, + "step": 29180 + }, + { + "epoch": 141.6731234866828, + "grad_norm": 1.2174203867232336e-08, + "learning_rate": 0.050944691092410475, + "loss": 0.0, + "num_input_tokens_seen": 50010848, + "step": 29185 + }, + { + "epoch": 141.69733656174333, + "grad_norm": 7.294823856085486e-09, + "learning_rate": 0.05090046462535313, + "loss": 0.0, + "num_input_tokens_seen": 50019264, + "step": 29190 + }, + { + "epoch": 141.72154963680387, + "grad_norm": 1.3821559896598501e-08, + "learning_rate": 0.050856253440689454, + "loss": 0.0, + "num_input_tokens_seen": 50027488, + "step": 29195 + }, + { + "epoch": 141.74576271186442, + "grad_norm": 9.018022772977474e-09, + "learning_rate": 0.050812057545237405, + "loss": 0.0, + "num_input_tokens_seen": 50035616, + "step": 29200 + }, + { + "epoch": 141.74576271186442, + "eval_loss": 1.162516713142395, + "eval_runtime": 4.6352, + "eval_samples_per_second": 79.176, + "eval_steps_per_second": 19.848, + "num_input_tokens_seen": 50035616, + "step": 29200 + }, + { + "epoch": 141.76997578692493, + "grad_norm": 5.747449183246545e-09, + "learning_rate": 0.0507678769458126, + "loss": 0.0, + "num_input_tokens_seen": 50044320, + "step": 29205 + }, + { + "epoch": 141.79418886198548, + "grad_norm": 9.283716906338668e-09, + "learning_rate": 0.050723711649228155, + "loss": 0.0, + "num_input_tokens_seen": 50052992, + "step": 29210 + }, + { + "epoch": 141.818401937046, + "grad_norm": 1.4934663283838745e-08, + "learning_rate": 0.05067956166229496, + "loss": 0.0, + "num_input_tokens_seen": 50061728, + "step": 29215 + }, + { + "epoch": 141.84261501210653, + "grad_norm": 2.495189121987096e-08, + "learning_rate": 0.05063542699182155, + "loss": 0.0, + "num_input_tokens_seen": 50070496, + "step": 29220 + }, + { + "epoch": 141.86682808716708, + "grad_norm": 1.1634647911762386e-08, + "learning_rate": 0.050591307644613996, + "loss": 0.0, + "num_input_tokens_seen": 50078976, + "step": 29225 + }, + { + "epoch": 141.8910411622276, + "grad_norm": 6.749011571827168e-09, + "learning_rate": 0.05054720362747599, + "loss": 0.0, + "num_input_tokens_seen": 50087360, + "step": 29230 + }, + { + "epoch": 141.91525423728814, + "grad_norm": 8.405548257428563e-09, + "learning_rate": 0.050503114947209035, + "loss": 0.0, + "num_input_tokens_seen": 50095744, + "step": 29235 + }, + { + "epoch": 141.93946731234865, + "grad_norm": 1.3571301415993275e-08, + "learning_rate": 0.05045904161061207, + "loss": 0.0, + "num_input_tokens_seen": 50104672, + "step": 29240 + }, + { + "epoch": 141.9636803874092, + "grad_norm": 2.8498167381485473e-08, + "learning_rate": 0.05041498362448185, + "loss": 0.0, + "num_input_tokens_seen": 50113440, + "step": 29245 + }, + { + "epoch": 141.98789346246974, + "grad_norm": 1.2296963447511189e-08, + "learning_rate": 0.05037094099561256, + "loss": 0.0, + "num_input_tokens_seen": 50121952, + "step": 29250 + }, + { + "epoch": 142.01452784503633, + "grad_norm": 7.376505184453208e-09, + "learning_rate": 0.05032691373079624, + "loss": 0.0, + "num_input_tokens_seen": 50131168, + "step": 29255 + }, + { + "epoch": 142.03874092009684, + "grad_norm": 1.5235290362625165e-08, + "learning_rate": 0.05028290183682234, + "loss": 0.0, + "num_input_tokens_seen": 50139968, + "step": 29260 + }, + { + "epoch": 142.0629539951574, + "grad_norm": 9.935116729309357e-09, + "learning_rate": 0.050238905320478096, + "loss": 0.0, + "num_input_tokens_seen": 50148544, + "step": 29265 + }, + { + "epoch": 142.08716707021793, + "grad_norm": 8.754646785291698e-09, + "learning_rate": 0.05019492418854838, + "loss": 0.0, + "num_input_tokens_seen": 50156896, + "step": 29270 + }, + { + "epoch": 142.11138014527845, + "grad_norm": 1.3351901806402111e-08, + "learning_rate": 0.05015095844781554, + "loss": 0.0, + "num_input_tokens_seen": 50165312, + "step": 29275 + }, + { + "epoch": 142.135593220339, + "grad_norm": 1.6324912976983796e-08, + "learning_rate": 0.05010700810505968, + "loss": 0.0, + "num_input_tokens_seen": 50174112, + "step": 29280 + }, + { + "epoch": 142.1598062953995, + "grad_norm": 1.4279287086083059e-08, + "learning_rate": 0.05006307316705856, + "loss": 0.0, + "num_input_tokens_seen": 50182336, + "step": 29285 + }, + { + "epoch": 142.18401937046005, + "grad_norm": 3.290701489078174e-09, + "learning_rate": 0.0500191536405874, + "loss": 0.0, + "num_input_tokens_seen": 50190880, + "step": 29290 + }, + { + "epoch": 142.2082324455206, + "grad_norm": 5.039916928240018e-09, + "learning_rate": 0.04997524953241922, + "loss": 0.0, + "num_input_tokens_seen": 50199520, + "step": 29295 + }, + { + "epoch": 142.2324455205811, + "grad_norm": 1.2148460015737328e-08, + "learning_rate": 0.049931360849324556, + "loss": 0.0, + "num_input_tokens_seen": 50207744, + "step": 29300 + }, + { + "epoch": 142.25665859564165, + "grad_norm": 1.3379852781270074e-08, + "learning_rate": 0.04988748759807155, + "loss": 0.0, + "num_input_tokens_seen": 50216448, + "step": 29305 + }, + { + "epoch": 142.28087167070217, + "grad_norm": 1.17712648517454e-08, + "learning_rate": 0.0498436297854261, + "loss": 0.0, + "num_input_tokens_seen": 50225120, + "step": 29310 + }, + { + "epoch": 142.3050847457627, + "grad_norm": 3.0046045651488384e-08, + "learning_rate": 0.04979978741815152, + "loss": 0.0, + "num_input_tokens_seen": 50233696, + "step": 29315 + }, + { + "epoch": 142.32929782082326, + "grad_norm": 2.0421833113459797e-08, + "learning_rate": 0.04975596050300891, + "loss": 0.0, + "num_input_tokens_seen": 50242432, + "step": 29320 + }, + { + "epoch": 142.35351089588377, + "grad_norm": 5.940467229237356e-09, + "learning_rate": 0.049712149046757005, + "loss": 0.0, + "num_input_tokens_seen": 50251040, + "step": 29325 + }, + { + "epoch": 142.37772397094432, + "grad_norm": 1.2895068124407771e-08, + "learning_rate": 0.04966835305615194, + "loss": 0.0, + "num_input_tokens_seen": 50259584, + "step": 29330 + }, + { + "epoch": 142.40193704600483, + "grad_norm": 5.676533465504008e-09, + "learning_rate": 0.049624572537947755, + "loss": 0.0, + "num_input_tokens_seen": 50268384, + "step": 29335 + }, + { + "epoch": 142.42615012106538, + "grad_norm": 2.560822487396308e-08, + "learning_rate": 0.04958080749889582, + "loss": 0.0, + "num_input_tokens_seen": 50277024, + "step": 29340 + }, + { + "epoch": 142.45036319612592, + "grad_norm": 1.6086307841334246e-09, + "learning_rate": 0.049537057945745304, + "loss": 0.0, + "num_input_tokens_seen": 50285376, + "step": 29345 + }, + { + "epoch": 142.47457627118644, + "grad_norm": 1.1927959064905735e-08, + "learning_rate": 0.049493323885243, + "loss": 0.0, + "num_input_tokens_seen": 50293856, + "step": 29350 + }, + { + "epoch": 142.49878934624698, + "grad_norm": 2.1294331631338537e-08, + "learning_rate": 0.04944960532413318, + "loss": 0.0, + "num_input_tokens_seen": 50302048, + "step": 29355 + }, + { + "epoch": 142.5230024213075, + "grad_norm": 2.069564430939863e-08, + "learning_rate": 0.049405902269157774, + "loss": 0.0, + "num_input_tokens_seen": 50310720, + "step": 29360 + }, + { + "epoch": 142.54721549636804, + "grad_norm": 1.3485618843844804e-08, + "learning_rate": 0.04936221472705646, + "loss": 0.0, + "num_input_tokens_seen": 50319392, + "step": 29365 + }, + { + "epoch": 142.57142857142858, + "grad_norm": 1.7036262178748984e-08, + "learning_rate": 0.04931854270456632, + "loss": 0.0, + "num_input_tokens_seen": 50327648, + "step": 29370 + }, + { + "epoch": 142.5956416464891, + "grad_norm": 1.697280715973193e-08, + "learning_rate": 0.049274886208422075, + "loss": 0.0, + "num_input_tokens_seen": 50335776, + "step": 29375 + }, + { + "epoch": 142.61985472154964, + "grad_norm": 2.1372228431459916e-08, + "learning_rate": 0.049231245245356235, + "loss": 0.0, + "num_input_tokens_seen": 50344448, + "step": 29380 + }, + { + "epoch": 142.64406779661016, + "grad_norm": 1.6090462295892394e-08, + "learning_rate": 0.049187619822098655, + "loss": 0.0, + "num_input_tokens_seen": 50353088, + "step": 29385 + }, + { + "epoch": 142.6682808716707, + "grad_norm": 6.861715640127386e-09, + "learning_rate": 0.04914400994537705, + "loss": 0.0, + "num_input_tokens_seen": 50361568, + "step": 29390 + }, + { + "epoch": 142.69249394673125, + "grad_norm": 1.625121548443076e-08, + "learning_rate": 0.049100415621916485, + "loss": 0.0, + "num_input_tokens_seen": 50370336, + "step": 29395 + }, + { + "epoch": 142.71670702179176, + "grad_norm": 8.309641863490924e-09, + "learning_rate": 0.04905683685843981, + "loss": 0.0, + "num_input_tokens_seen": 50378912, + "step": 29400 + }, + { + "epoch": 142.71670702179176, + "eval_loss": 1.1645675897598267, + "eval_runtime": 4.6369, + "eval_samples_per_second": 79.148, + "eval_steps_per_second": 19.841, + "num_input_tokens_seen": 50378912, + "step": 29400 + }, + { + "epoch": 142.7409200968523, + "grad_norm": 2.9877897933516806e-09, + "learning_rate": 0.049013273661667495, + "loss": 0.0, + "num_input_tokens_seen": 50387488, + "step": 29405 + }, + { + "epoch": 142.76513317191282, + "grad_norm": 8.78265993264904e-09, + "learning_rate": 0.048969726038317396, + "loss": 0.0, + "num_input_tokens_seen": 50396064, + "step": 29410 + }, + { + "epoch": 142.78934624697337, + "grad_norm": 8.02964184032362e-09, + "learning_rate": 0.048926193995105206, + "loss": 0.0, + "num_input_tokens_seen": 50404576, + "step": 29415 + }, + { + "epoch": 142.8135593220339, + "grad_norm": 1.050442932637452e-08, + "learning_rate": 0.048882677538744035, + "loss": 0.0, + "num_input_tokens_seen": 50413248, + "step": 29420 + }, + { + "epoch": 142.83777239709443, + "grad_norm": 1.2667971560631486e-08, + "learning_rate": 0.048839176675944715, + "loss": 0.0, + "num_input_tokens_seen": 50421664, + "step": 29425 + }, + { + "epoch": 142.86198547215497, + "grad_norm": 1.339236543884681e-08, + "learning_rate": 0.04879569141341566, + "loss": 0.0, + "num_input_tokens_seen": 50430336, + "step": 29430 + }, + { + "epoch": 142.88619854721549, + "grad_norm": 1.1215077755366565e-08, + "learning_rate": 0.04875222175786274, + "loss": 0.0, + "num_input_tokens_seen": 50438880, + "step": 29435 + }, + { + "epoch": 142.91041162227603, + "grad_norm": 1.2308978725172892e-08, + "learning_rate": 0.04870876771598966, + "loss": 0.0, + "num_input_tokens_seen": 50447648, + "step": 29440 + }, + { + "epoch": 142.93462469733657, + "grad_norm": 5.62656143898721e-09, + "learning_rate": 0.04866532929449744, + "loss": 0.0, + "num_input_tokens_seen": 50456544, + "step": 29445 + }, + { + "epoch": 142.9588377723971, + "grad_norm": 2.0226007535484314e-08, + "learning_rate": 0.048621906500084945, + "loss": 0.0, + "num_input_tokens_seen": 50465120, + "step": 29450 + }, + { + "epoch": 142.98305084745763, + "grad_norm": 4.650881013645858e-09, + "learning_rate": 0.04857849933944845, + "loss": 0.0, + "num_input_tokens_seen": 50473504, + "step": 29455 + }, + { + "epoch": 143.00968523002422, + "grad_norm": 5.737527786209284e-09, + "learning_rate": 0.048535107819281866, + "loss": 0.0, + "num_input_tokens_seen": 50482560, + "step": 29460 + }, + { + "epoch": 143.03389830508473, + "grad_norm": 1.4360137079449942e-08, + "learning_rate": 0.04849173194627675, + "loss": 0.0, + "num_input_tokens_seen": 50491360, + "step": 29465 + }, + { + "epoch": 143.05811138014528, + "grad_norm": 9.55880619102345e-09, + "learning_rate": 0.04844837172712223, + "loss": 0.0, + "num_input_tokens_seen": 50499520, + "step": 29470 + }, + { + "epoch": 143.08232445520582, + "grad_norm": 1.2170206176165266e-08, + "learning_rate": 0.04840502716850494, + "loss": 0.0, + "num_input_tokens_seen": 50508320, + "step": 29475 + }, + { + "epoch": 143.10653753026634, + "grad_norm": 4.433833744599269e-09, + "learning_rate": 0.04836169827710916, + "loss": 0.0, + "num_input_tokens_seen": 50516960, + "step": 29480 + }, + { + "epoch": 143.13075060532688, + "grad_norm": 1.7128428453361266e-08, + "learning_rate": 0.04831838505961684, + "loss": 0.0, + "num_input_tokens_seen": 50525248, + "step": 29485 + }, + { + "epoch": 143.1549636803874, + "grad_norm": 1.6448758799469942e-08, + "learning_rate": 0.048275087522707295, + "loss": 0.0, + "num_input_tokens_seen": 50533728, + "step": 29490 + }, + { + "epoch": 143.17917675544794, + "grad_norm": 2.485252004191807e-08, + "learning_rate": 0.04823180567305766, + "loss": 0.0, + "num_input_tokens_seen": 50542304, + "step": 29495 + }, + { + "epoch": 143.20338983050848, + "grad_norm": 8.895162828537195e-09, + "learning_rate": 0.04818853951734244, + "loss": 0.0, + "num_input_tokens_seen": 50550656, + "step": 29500 + }, + { + "epoch": 143.227602905569, + "grad_norm": 8.790576266903827e-09, + "learning_rate": 0.04814528906223387, + "loss": 0.0, + "num_input_tokens_seen": 50559680, + "step": 29505 + }, + { + "epoch": 143.25181598062954, + "grad_norm": 1.0610592404702857e-08, + "learning_rate": 0.04810205431440177, + "loss": 0.0, + "num_input_tokens_seen": 50568256, + "step": 29510 + }, + { + "epoch": 143.27602905569006, + "grad_norm": 2.0253615673482273e-08, + "learning_rate": 0.04805883528051341, + "loss": 0.0, + "num_input_tokens_seen": 50576864, + "step": 29515 + }, + { + "epoch": 143.3002421307506, + "grad_norm": 1.7606556212967917e-08, + "learning_rate": 0.048015631967233685, + "loss": 0.0, + "num_input_tokens_seen": 50585408, + "step": 29520 + }, + { + "epoch": 143.32445520581115, + "grad_norm": 1.671660321278523e-08, + "learning_rate": 0.04797244438122517, + "loss": 0.0, + "num_input_tokens_seen": 50594176, + "step": 29525 + }, + { + "epoch": 143.34866828087166, + "grad_norm": 8.876884116659767e-09, + "learning_rate": 0.04792927252914784, + "loss": 0.0, + "num_input_tokens_seen": 50602624, + "step": 29530 + }, + { + "epoch": 143.3728813559322, + "grad_norm": 9.038078729872723e-09, + "learning_rate": 0.04788611641765944, + "loss": 0.0, + "num_input_tokens_seen": 50611072, + "step": 29535 + }, + { + "epoch": 143.39709443099272, + "grad_norm": 1.1938527499921747e-08, + "learning_rate": 0.04784297605341508, + "loss": 0.0, + "num_input_tokens_seen": 50619296, + "step": 29540 + }, + { + "epoch": 143.42130750605327, + "grad_norm": 4.9115143063716005e-09, + "learning_rate": 0.04779985144306761, + "loss": 0.0, + "num_input_tokens_seen": 50627744, + "step": 29545 + }, + { + "epoch": 143.4455205811138, + "grad_norm": 1.846882291545171e-08, + "learning_rate": 0.047756742593267405, + "loss": 0.0, + "num_input_tokens_seen": 50636640, + "step": 29550 + }, + { + "epoch": 143.46973365617433, + "grad_norm": 1.1562034885059802e-08, + "learning_rate": 0.047713649510662315, + "loss": 0.0, + "num_input_tokens_seen": 50645120, + "step": 29555 + }, + { + "epoch": 143.49394673123487, + "grad_norm": 2.101208451676939e-08, + "learning_rate": 0.04767057220189789, + "loss": 0.0, + "num_input_tokens_seen": 50653216, + "step": 29560 + }, + { + "epoch": 143.5181598062954, + "grad_norm": 1.9037932119658763e-08, + "learning_rate": 0.04762751067361722, + "loss": 0.0, + "num_input_tokens_seen": 50661664, + "step": 29565 + }, + { + "epoch": 143.54237288135593, + "grad_norm": 1.9197479161903175e-08, + "learning_rate": 0.04758446493246086, + "loss": 0.0, + "num_input_tokens_seen": 50670240, + "step": 29570 + }, + { + "epoch": 143.56658595641647, + "grad_norm": 2.243293906190047e-08, + "learning_rate": 0.047541434985067084, + "loss": 0.0, + "num_input_tokens_seen": 50678592, + "step": 29575 + }, + { + "epoch": 143.590799031477, + "grad_norm": 1.1521754217369562e-08, + "learning_rate": 0.047498420838071556, + "loss": 0.0, + "num_input_tokens_seen": 50687296, + "step": 29580 + }, + { + "epoch": 143.61501210653753, + "grad_norm": 1.9294487785259662e-08, + "learning_rate": 0.04745542249810772, + "loss": 0.0, + "num_input_tokens_seen": 50696128, + "step": 29585 + }, + { + "epoch": 143.63922518159805, + "grad_norm": 9.932181299632248e-09, + "learning_rate": 0.047412439971806324, + "loss": 0.0, + "num_input_tokens_seen": 50704512, + "step": 29590 + }, + { + "epoch": 143.6634382566586, + "grad_norm": 2.7246734646269033e-08, + "learning_rate": 0.04736947326579592, + "loss": 0.0, + "num_input_tokens_seen": 50713408, + "step": 29595 + }, + { + "epoch": 143.68765133171914, + "grad_norm": 2.2333738414204163e-08, + "learning_rate": 0.04732652238670245, + "loss": 0.0, + "num_input_tokens_seen": 50722400, + "step": 29600 + }, + { + "epoch": 143.68765133171914, + "eval_loss": 1.1668256521224976, + "eval_runtime": 4.6205, + "eval_samples_per_second": 79.429, + "eval_steps_per_second": 19.911, + "num_input_tokens_seen": 50722400, + "step": 29600 + }, + { + "epoch": 143.71186440677965, + "grad_norm": 1.6158937299337595e-08, + "learning_rate": 0.04728358734114952, + "loss": 0.0, + "num_input_tokens_seen": 50731264, + "step": 29605 + }, + { + "epoch": 143.7360774818402, + "grad_norm": 3.968781747687444e-09, + "learning_rate": 0.04724066813575821, + "loss": 0.0, + "num_input_tokens_seen": 50739680, + "step": 29610 + }, + { + "epoch": 143.7602905569007, + "grad_norm": 1.9759543334885166e-08, + "learning_rate": 0.04719776477714729, + "loss": 0.0, + "num_input_tokens_seen": 50748384, + "step": 29615 + }, + { + "epoch": 143.78450363196126, + "grad_norm": 6.948734032619086e-09, + "learning_rate": 0.047154877271932856, + "loss": 0.0, + "num_input_tokens_seen": 50756864, + "step": 29620 + }, + { + "epoch": 143.8087167070218, + "grad_norm": 2.1607171163395833e-08, + "learning_rate": 0.0471120056267288, + "loss": 0.0, + "num_input_tokens_seen": 50765408, + "step": 29625 + }, + { + "epoch": 143.83292978208232, + "grad_norm": 2.6198263114451947e-08, + "learning_rate": 0.047069149848146495, + "loss": 0.0, + "num_input_tokens_seen": 50774112, + "step": 29630 + }, + { + "epoch": 143.85714285714286, + "grad_norm": 8.759491798571162e-09, + "learning_rate": 0.04702630994279473, + "loss": 0.0, + "num_input_tokens_seen": 50782528, + "step": 29635 + }, + { + "epoch": 143.88135593220338, + "grad_norm": 2.0724035820762765e-08, + "learning_rate": 0.046983485917280035, + "loss": 0.0, + "num_input_tokens_seen": 50791232, + "step": 29640 + }, + { + "epoch": 143.90556900726392, + "grad_norm": 1.4762105315924146e-08, + "learning_rate": 0.04694067777820644, + "loss": 0.0, + "num_input_tokens_seen": 50799616, + "step": 29645 + }, + { + "epoch": 143.92978208232446, + "grad_norm": 2.6101455219418312e-08, + "learning_rate": 0.046897885532175415, + "loss": 0.0, + "num_input_tokens_seen": 50808256, + "step": 29650 + }, + { + "epoch": 143.95399515738498, + "grad_norm": 1.2907722002353239e-08, + "learning_rate": 0.04685510918578613, + "loss": 0.0, + "num_input_tokens_seen": 50816864, + "step": 29655 + }, + { + "epoch": 143.97820823244552, + "grad_norm": 1.1486865680865321e-08, + "learning_rate": 0.04681234874563519, + "loss": 0.0, + "num_input_tokens_seen": 50824928, + "step": 29660 + }, + { + "epoch": 144.0048426150121, + "grad_norm": 2.5421183380558432e-08, + "learning_rate": 0.046769604218316836, + "loss": 0.0, + "num_input_tokens_seen": 50834144, + "step": 29665 + }, + { + "epoch": 144.02905569007265, + "grad_norm": 7.065265261729792e-09, + "learning_rate": 0.04672687561042279, + "loss": 0.0, + "num_input_tokens_seen": 50842528, + "step": 29670 + }, + { + "epoch": 144.05326876513317, + "grad_norm": 1.3492952533056268e-08, + "learning_rate": 0.046684162928542286, + "loss": 0.0, + "num_input_tokens_seen": 50850624, + "step": 29675 + }, + { + "epoch": 144.0774818401937, + "grad_norm": 1.9351579894077986e-08, + "learning_rate": 0.04664146617926222, + "loss": 0.0, + "num_input_tokens_seen": 50859200, + "step": 29680 + }, + { + "epoch": 144.10169491525423, + "grad_norm": 2.1099783253930582e-08, + "learning_rate": 0.046598785369167, + "loss": 0.0, + "num_input_tokens_seen": 50867872, + "step": 29685 + }, + { + "epoch": 144.12590799031477, + "grad_norm": 9.485546570431325e-09, + "learning_rate": 0.046556120504838434, + "loss": 0.0, + "num_input_tokens_seen": 50876352, + "step": 29690 + }, + { + "epoch": 144.15012106537532, + "grad_norm": 3.6256908586551617e-09, + "learning_rate": 0.04651347159285609, + "loss": 0.0, + "num_input_tokens_seen": 50885152, + "step": 29695 + }, + { + "epoch": 144.17433414043583, + "grad_norm": 7.312515926116703e-09, + "learning_rate": 0.04647083863979688, + "loss": 0.0, + "num_input_tokens_seen": 50893600, + "step": 29700 + }, + { + "epoch": 144.19854721549638, + "grad_norm": 1.4037246920395319e-08, + "learning_rate": 0.04642822165223538, + "loss": 0.0, + "num_input_tokens_seen": 50902080, + "step": 29705 + }, + { + "epoch": 144.2227602905569, + "grad_norm": 2.3518161640367907e-08, + "learning_rate": 0.046385620636743716, + "loss": 0.0, + "num_input_tokens_seen": 50910400, + "step": 29710 + }, + { + "epoch": 144.24697336561744, + "grad_norm": 1.0445818432458509e-08, + "learning_rate": 0.04634303559989141, + "loss": 0.0, + "num_input_tokens_seen": 50919072, + "step": 29715 + }, + { + "epoch": 144.27118644067798, + "grad_norm": 9.61111457087327e-09, + "learning_rate": 0.046300466548245635, + "loss": 0.0, + "num_input_tokens_seen": 50927552, + "step": 29720 + }, + { + "epoch": 144.2953995157385, + "grad_norm": 5.9583231681870075e-09, + "learning_rate": 0.04625791348837114, + "loss": 0.0, + "num_input_tokens_seen": 50936160, + "step": 29725 + }, + { + "epoch": 144.31961259079904, + "grad_norm": 1.0401483230282338e-08, + "learning_rate": 0.046215376426830095, + "loss": 0.0, + "num_input_tokens_seen": 50944800, + "step": 29730 + }, + { + "epoch": 144.34382566585955, + "grad_norm": 1.0818165918635714e-08, + "learning_rate": 0.04617285537018219, + "loss": 0.0, + "num_input_tokens_seen": 50953632, + "step": 29735 + }, + { + "epoch": 144.3680387409201, + "grad_norm": 1.8507318344518353e-08, + "learning_rate": 0.046130350324984803, + "loss": 0.0, + "num_input_tokens_seen": 50962144, + "step": 29740 + }, + { + "epoch": 144.39225181598064, + "grad_norm": 9.90919080123831e-09, + "learning_rate": 0.046087861297792666, + "loss": 0.0, + "num_input_tokens_seen": 50970720, + "step": 29745 + }, + { + "epoch": 144.41646489104116, + "grad_norm": 1.2191214260326433e-08, + "learning_rate": 0.0460453882951582, + "loss": 0.0, + "num_input_tokens_seen": 50978976, + "step": 29750 + }, + { + "epoch": 144.4406779661017, + "grad_norm": 1.7726524248473652e-08, + "learning_rate": 0.04600293132363119, + "loss": 0.0, + "num_input_tokens_seen": 50987936, + "step": 29755 + }, + { + "epoch": 144.46489104116222, + "grad_norm": 7.299660431669963e-09, + "learning_rate": 0.045960490389759086, + "loss": 0.0, + "num_input_tokens_seen": 50996448, + "step": 29760 + }, + { + "epoch": 144.48910411622276, + "grad_norm": 1.0223473623227619e-08, + "learning_rate": 0.04591806550008685, + "loss": 0.0, + "num_input_tokens_seen": 51004960, + "step": 29765 + }, + { + "epoch": 144.5133171912833, + "grad_norm": 1.877573296837909e-08, + "learning_rate": 0.045875656661156825, + "loss": 0.0, + "num_input_tokens_seen": 51013312, + "step": 29770 + }, + { + "epoch": 144.53753026634382, + "grad_norm": 1.3394889641915597e-08, + "learning_rate": 0.04583326387950911, + "loss": 0.0, + "num_input_tokens_seen": 51021888, + "step": 29775 + }, + { + "epoch": 144.56174334140437, + "grad_norm": 1.0473195644067346e-08, + "learning_rate": 0.0457908871616811, + "loss": 0.0, + "num_input_tokens_seen": 51030272, + "step": 29780 + }, + { + "epoch": 144.58595641646488, + "grad_norm": 1.3693172817852428e-08, + "learning_rate": 0.04574852651420786, + "loss": 0.0, + "num_input_tokens_seen": 51038976, + "step": 29785 + }, + { + "epoch": 144.61016949152543, + "grad_norm": 1.3650698349465529e-08, + "learning_rate": 0.045706181943621985, + "loss": 0.0, + "num_input_tokens_seen": 51047328, + "step": 29790 + }, + { + "epoch": 144.63438256658597, + "grad_norm": 2.473205817921098e-08, + "learning_rate": 0.04566385345645344, + "loss": 0.0, + "num_input_tokens_seen": 51056000, + "step": 29795 + }, + { + "epoch": 144.65859564164649, + "grad_norm": 3.740466425483646e-08, + "learning_rate": 0.04562154105922993, + "loss": 0.0, + "num_input_tokens_seen": 51064768, + "step": 29800 + }, + { + "epoch": 144.65859564164649, + "eval_loss": 1.1661797761917114, + "eval_runtime": 4.613, + "eval_samples_per_second": 79.558, + "eval_steps_per_second": 19.944, + "num_input_tokens_seen": 51064768, + "step": 29800 + }, + { + "epoch": 144.68280871670703, + "grad_norm": 2.450746627857825e-08, + "learning_rate": 0.04557924475847642, + "loss": 0.0, + "num_input_tokens_seen": 51073152, + "step": 29805 + }, + { + "epoch": 144.70702179176754, + "grad_norm": 1.787858927571051e-08, + "learning_rate": 0.04553696456071567, + "loss": 0.0, + "num_input_tokens_seen": 51081824, + "step": 29810 + }, + { + "epoch": 144.7312348668281, + "grad_norm": 5.335109687365502e-09, + "learning_rate": 0.045494700472467724, + "loss": 0.0, + "num_input_tokens_seen": 51090400, + "step": 29815 + }, + { + "epoch": 144.75544794188863, + "grad_norm": 7.1078631869170295e-09, + "learning_rate": 0.04545245250025024, + "loss": 0.0, + "num_input_tokens_seen": 51098944, + "step": 29820 + }, + { + "epoch": 144.77966101694915, + "grad_norm": 1.2330819920691738e-08, + "learning_rate": 0.045410220650578384, + "loss": 0.0, + "num_input_tokens_seen": 51107520, + "step": 29825 + }, + { + "epoch": 144.8038740920097, + "grad_norm": 2.0831853575487003e-08, + "learning_rate": 0.04536800492996492, + "loss": 0.0, + "num_input_tokens_seen": 51116224, + "step": 29830 + }, + { + "epoch": 144.8280871670702, + "grad_norm": 9.94606441651058e-09, + "learning_rate": 0.04532580534491994, + "loss": 0.0, + "num_input_tokens_seen": 51125056, + "step": 29835 + }, + { + "epoch": 144.85230024213075, + "grad_norm": 7.0817782749088565e-09, + "learning_rate": 0.045283621901951183, + "loss": 0.0, + "num_input_tokens_seen": 51133984, + "step": 29840 + }, + { + "epoch": 144.8765133171913, + "grad_norm": 4.5000771997649736e-09, + "learning_rate": 0.04524145460756393, + "loss": 0.0, + "num_input_tokens_seen": 51142560, + "step": 29845 + }, + { + "epoch": 144.9007263922518, + "grad_norm": 1.719144826495267e-08, + "learning_rate": 0.045199303468260794, + "loss": 0.0, + "num_input_tokens_seen": 51150976, + "step": 29850 + }, + { + "epoch": 144.92493946731236, + "grad_norm": 1.4833583250606353e-08, + "learning_rate": 0.04515716849054214, + "loss": 0.0, + "num_input_tokens_seen": 51159648, + "step": 29855 + }, + { + "epoch": 144.94915254237287, + "grad_norm": 1.6039432892966943e-08, + "learning_rate": 0.04511504968090558, + "loss": 0.0, + "num_input_tokens_seen": 51168096, + "step": 29860 + }, + { + "epoch": 144.97336561743342, + "grad_norm": 1.901202750786979e-08, + "learning_rate": 0.04507294704584644, + "loss": 0.0, + "num_input_tokens_seen": 51176640, + "step": 29865 + }, + { + "epoch": 144.99757869249396, + "grad_norm": 1.775576663476386e-08, + "learning_rate": 0.04503086059185749, + "loss": 0.0, + "num_input_tokens_seen": 51184896, + "step": 29870 + }, + { + "epoch": 145.02421307506054, + "grad_norm": 1.1740039163044003e-08, + "learning_rate": 0.04498879032542893, + "loss": 0.0, + "num_input_tokens_seen": 51193952, + "step": 29875 + }, + { + "epoch": 145.04842615012106, + "grad_norm": 1.9090087732820393e-08, + "learning_rate": 0.0449467362530486, + "loss": 0.0, + "num_input_tokens_seen": 51202368, + "step": 29880 + }, + { + "epoch": 145.0726392251816, + "grad_norm": 2.7510516531492613e-08, + "learning_rate": 0.04490469838120171, + "loss": 0.0, + "num_input_tokens_seen": 51210496, + "step": 29885 + }, + { + "epoch": 145.09685230024212, + "grad_norm": 1.2214685263245428e-08, + "learning_rate": 0.04486267671637101, + "loss": 0.0, + "num_input_tokens_seen": 51219072, + "step": 29890 + }, + { + "epoch": 145.12106537530266, + "grad_norm": 6.509273120514081e-09, + "learning_rate": 0.04482067126503683, + "loss": 0.0, + "num_input_tokens_seen": 51227712, + "step": 29895 + }, + { + "epoch": 145.1452784503632, + "grad_norm": 1.5290488875052688e-08, + "learning_rate": 0.04477868203367687, + "loss": 0.0, + "num_input_tokens_seen": 51235680, + "step": 29900 + }, + { + "epoch": 145.16949152542372, + "grad_norm": 5.814180692453874e-09, + "learning_rate": 0.044736709028766426, + "loss": 0.0, + "num_input_tokens_seen": 51244320, + "step": 29905 + }, + { + "epoch": 145.19370460048427, + "grad_norm": 5.87301540733165e-09, + "learning_rate": 0.04469475225677832, + "loss": 0.0, + "num_input_tokens_seen": 51252672, + "step": 29910 + }, + { + "epoch": 145.21791767554478, + "grad_norm": 6.854616430018723e-09, + "learning_rate": 0.04465281172418273, + "loss": 0.0, + "num_input_tokens_seen": 51261152, + "step": 29915 + }, + { + "epoch": 145.24213075060533, + "grad_norm": 1.3861801484438274e-08, + "learning_rate": 0.044610887437447476, + "loss": 0.0, + "num_input_tokens_seen": 51270016, + "step": 29920 + }, + { + "epoch": 145.26634382566587, + "grad_norm": 1.4741323717260002e-08, + "learning_rate": 0.044568979403037744, + "loss": 0.0, + "num_input_tokens_seen": 51278752, + "step": 29925 + }, + { + "epoch": 145.2905569007264, + "grad_norm": 6.459093704336283e-09, + "learning_rate": 0.04452708762741631, + "loss": 0.0, + "num_input_tokens_seen": 51287360, + "step": 29930 + }, + { + "epoch": 145.31476997578693, + "grad_norm": 9.719947513531224e-09, + "learning_rate": 0.044485212117043475, + "loss": 0.0, + "num_input_tokens_seen": 51296160, + "step": 29935 + }, + { + "epoch": 145.33898305084745, + "grad_norm": 1.0695166530183542e-08, + "learning_rate": 0.04444335287837687, + "loss": 0.0, + "num_input_tokens_seen": 51304928, + "step": 29940 + }, + { + "epoch": 145.363196125908, + "grad_norm": 9.670357847824107e-09, + "learning_rate": 0.04440150991787179, + "loss": 0.0, + "num_input_tokens_seen": 51313632, + "step": 29945 + }, + { + "epoch": 145.38740920096853, + "grad_norm": 5.386124879436238e-09, + "learning_rate": 0.04435968324198088, + "loss": 0.0, + "num_input_tokens_seen": 51322176, + "step": 29950 + }, + { + "epoch": 145.41162227602905, + "grad_norm": 1.1459714954753508e-08, + "learning_rate": 0.04431787285715442, + "loss": 0.0, + "num_input_tokens_seen": 51330944, + "step": 29955 + }, + { + "epoch": 145.4358353510896, + "grad_norm": 1.0923687732145027e-08, + "learning_rate": 0.04427607876984004, + "loss": 0.0, + "num_input_tokens_seen": 51339808, + "step": 29960 + }, + { + "epoch": 145.4600484261501, + "grad_norm": 1.1452724990590468e-08, + "learning_rate": 0.044234300986482886, + "loss": 0.0, + "num_input_tokens_seen": 51348320, + "step": 29965 + }, + { + "epoch": 145.48426150121065, + "grad_norm": 9.25264398432546e-09, + "learning_rate": 0.04419253951352566, + "loss": 0.0, + "num_input_tokens_seen": 51356704, + "step": 29970 + }, + { + "epoch": 145.5084745762712, + "grad_norm": 9.061460026771329e-09, + "learning_rate": 0.044150794357408533, + "loss": 0.0, + "num_input_tokens_seen": 51365728, + "step": 29975 + }, + { + "epoch": 145.5326876513317, + "grad_norm": 2.708677016016736e-08, + "learning_rate": 0.044109065524569065, + "loss": 0.0, + "num_input_tokens_seen": 51374048, + "step": 29980 + }, + { + "epoch": 145.55690072639226, + "grad_norm": 1.1374179820222707e-08, + "learning_rate": 0.0440673530214424, + "loss": 0.0, + "num_input_tokens_seen": 51382528, + "step": 29985 + }, + { + "epoch": 145.58111380145277, + "grad_norm": 1.194485754751895e-08, + "learning_rate": 0.04402565685446117, + "loss": 0.0, + "num_input_tokens_seen": 51390912, + "step": 29990 + }, + { + "epoch": 145.60532687651332, + "grad_norm": 6.434011545763951e-09, + "learning_rate": 0.04398397703005536, + "loss": 0.0, + "num_input_tokens_seen": 51399552, + "step": 29995 + }, + { + "epoch": 145.62953995157386, + "grad_norm": 1.33853452766175e-08, + "learning_rate": 0.043942313554652626, + "loss": 0.0, + "num_input_tokens_seen": 51407840, + "step": 30000 + }, + { + "epoch": 145.62953995157386, + "eval_loss": 1.1683433055877686, + "eval_runtime": 4.6215, + "eval_samples_per_second": 79.411, + "eval_steps_per_second": 19.907, + "num_input_tokens_seen": 51407840, + "step": 30000 + }, + { + "epoch": 145.65375302663438, + "grad_norm": 1.1024515522706224e-08, + "learning_rate": 0.0439006664346779, + "loss": 0.0, + "num_input_tokens_seen": 51416160, + "step": 30005 + }, + { + "epoch": 145.67796610169492, + "grad_norm": 2.1316676424021352e-08, + "learning_rate": 0.043859035676553755, + "loss": 0.0, + "num_input_tokens_seen": 51424896, + "step": 30010 + }, + { + "epoch": 145.70217917675544, + "grad_norm": 1.2408065686031478e-08, + "learning_rate": 0.043817421286700194, + "loss": 0.0, + "num_input_tokens_seen": 51433248, + "step": 30015 + }, + { + "epoch": 145.72639225181598, + "grad_norm": 1.879956812445016e-08, + "learning_rate": 0.043775823271534585, + "loss": 0.0, + "num_input_tokens_seen": 51441696, + "step": 30020 + }, + { + "epoch": 145.75060532687652, + "grad_norm": 5.020411197875774e-09, + "learning_rate": 0.04373424163747197, + "loss": 0.0, + "num_input_tokens_seen": 51450336, + "step": 30025 + }, + { + "epoch": 145.77481840193704, + "grad_norm": 2.500454598930446e-08, + "learning_rate": 0.04369267639092473, + "loss": 0.0, + "num_input_tokens_seen": 51459008, + "step": 30030 + }, + { + "epoch": 145.79903147699758, + "grad_norm": 1.2329616438933044e-08, + "learning_rate": 0.04365112753830268, + "loss": 0.0, + "num_input_tokens_seen": 51467456, + "step": 30035 + }, + { + "epoch": 145.8232445520581, + "grad_norm": 1.6985136852554206e-08, + "learning_rate": 0.04360959508601327, + "loss": 0.0, + "num_input_tokens_seen": 51476128, + "step": 30040 + }, + { + "epoch": 145.84745762711864, + "grad_norm": 1.2438194474384545e-08, + "learning_rate": 0.04356807904046123, + "loss": 0.0, + "num_input_tokens_seen": 51484896, + "step": 30045 + }, + { + "epoch": 145.8716707021792, + "grad_norm": 6.739917068898649e-09, + "learning_rate": 0.04352657940804892, + "loss": 0.0, + "num_input_tokens_seen": 51493536, + "step": 30050 + }, + { + "epoch": 145.8958837772397, + "grad_norm": 1.274997174505188e-08, + "learning_rate": 0.04348509619517613, + "loss": 0.0, + "num_input_tokens_seen": 51502208, + "step": 30055 + }, + { + "epoch": 145.92009685230025, + "grad_norm": 1.935846682954434e-08, + "learning_rate": 0.04344362940824002, + "loss": 0.0, + "num_input_tokens_seen": 51510976, + "step": 30060 + }, + { + "epoch": 145.94430992736076, + "grad_norm": 1.0653393722748206e-08, + "learning_rate": 0.04340217905363533, + "loss": 0.0, + "num_input_tokens_seen": 51519392, + "step": 30065 + }, + { + "epoch": 145.9685230024213, + "grad_norm": 1.8326289819015074e-08, + "learning_rate": 0.04336074513775425, + "loss": 0.0, + "num_input_tokens_seen": 51527776, + "step": 30070 + }, + { + "epoch": 145.99273607748185, + "grad_norm": 1.5972704048294872e-08, + "learning_rate": 0.04331932766698636, + "loss": 0.0, + "num_input_tokens_seen": 51536192, + "step": 30075 + }, + { + "epoch": 146.01937046004844, + "grad_norm": 1.904365376503847e-08, + "learning_rate": 0.0432779266477188, + "loss": 0.0, + "num_input_tokens_seen": 51544864, + "step": 30080 + }, + { + "epoch": 146.04358353510895, + "grad_norm": 6.353989778773439e-09, + "learning_rate": 0.04323654208633607, + "loss": 0.0, + "num_input_tokens_seen": 51553568, + "step": 30085 + }, + { + "epoch": 146.0677966101695, + "grad_norm": 1.4489788036087248e-08, + "learning_rate": 0.04319517398922024, + "loss": 0.0, + "num_input_tokens_seen": 51561952, + "step": 30090 + }, + { + "epoch": 146.09200968523, + "grad_norm": 2.3570386531446275e-08, + "learning_rate": 0.04315382236275079, + "loss": 0.0, + "num_input_tokens_seen": 51570528, + "step": 30095 + }, + { + "epoch": 146.11622276029055, + "grad_norm": 8.927732331187599e-09, + "learning_rate": 0.043112487213304664, + "loss": 0.0, + "num_input_tokens_seen": 51579200, + "step": 30100 + }, + { + "epoch": 146.1404358353511, + "grad_norm": 1.0005399175838647e-08, + "learning_rate": 0.04307116854725618, + "loss": 0.0, + "num_input_tokens_seen": 51587776, + "step": 30105 + }, + { + "epoch": 146.16464891041161, + "grad_norm": 1.0069094003029022e-08, + "learning_rate": 0.043029866370977325, + "loss": 0.0, + "num_input_tokens_seen": 51596160, + "step": 30110 + }, + { + "epoch": 146.18886198547216, + "grad_norm": 6.542530073261332e-09, + "learning_rate": 0.04298858069083728, + "loss": 0.0, + "num_input_tokens_seen": 51604544, + "step": 30115 + }, + { + "epoch": 146.21307506053267, + "grad_norm": 7.819640046591303e-09, + "learning_rate": 0.04294731151320295, + "loss": 0.0, + "num_input_tokens_seen": 51613280, + "step": 30120 + }, + { + "epoch": 146.23728813559322, + "grad_norm": 1.1565765234422543e-08, + "learning_rate": 0.04290605884443841, + "loss": 0.0, + "num_input_tokens_seen": 51621504, + "step": 30125 + }, + { + "epoch": 146.26150121065376, + "grad_norm": 1.306407249046515e-08, + "learning_rate": 0.04286482269090545, + "loss": 0.0, + "num_input_tokens_seen": 51629984, + "step": 30130 + }, + { + "epoch": 146.28571428571428, + "grad_norm": 1.7752405767623713e-08, + "learning_rate": 0.04282360305896323, + "loss": 0.0, + "num_input_tokens_seen": 51638848, + "step": 30135 + }, + { + "epoch": 146.30992736077482, + "grad_norm": 1.9395240968833605e-08, + "learning_rate": 0.04278239995496822, + "loss": 0.0, + "num_input_tokens_seen": 51647392, + "step": 30140 + }, + { + "epoch": 146.33414043583534, + "grad_norm": 1.886644795945358e-08, + "learning_rate": 0.042741213385274514, + "loss": 0.0, + "num_input_tokens_seen": 51656000, + "step": 30145 + }, + { + "epoch": 146.35835351089588, + "grad_norm": 1.5897740013315342e-08, + "learning_rate": 0.04270004335623366, + "loss": 0.0, + "num_input_tokens_seen": 51664672, + "step": 30150 + }, + { + "epoch": 146.38256658595643, + "grad_norm": 1.3003412568934891e-08, + "learning_rate": 0.04265888987419448, + "loss": 0.0, + "num_input_tokens_seen": 51673408, + "step": 30155 + }, + { + "epoch": 146.40677966101694, + "grad_norm": 9.489296459719299e-09, + "learning_rate": 0.04261775294550346, + "loss": 0.0, + "num_input_tokens_seen": 51681952, + "step": 30160 + }, + { + "epoch": 146.43099273607749, + "grad_norm": 1.1431519730820128e-08, + "learning_rate": 0.042576632576504354, + "loss": 0.0, + "num_input_tokens_seen": 51690464, + "step": 30165 + }, + { + "epoch": 146.455205811138, + "grad_norm": 1.462167187327168e-08, + "learning_rate": 0.0425355287735385, + "loss": 0.0, + "num_input_tokens_seen": 51699168, + "step": 30170 + }, + { + "epoch": 146.47941888619854, + "grad_norm": 1.2507026525554465e-08, + "learning_rate": 0.0424944415429446, + "loss": 0.0, + "num_input_tokens_seen": 51707552, + "step": 30175 + }, + { + "epoch": 146.5036319612591, + "grad_norm": 1.709083186085536e-08, + "learning_rate": 0.04245337089105877, + "loss": 0.0, + "num_input_tokens_seen": 51716064, + "step": 30180 + }, + { + "epoch": 146.5278450363196, + "grad_norm": 1.334702037780744e-08, + "learning_rate": 0.04241231682421467, + "loss": 0.0, + "num_input_tokens_seen": 51724256, + "step": 30185 + }, + { + "epoch": 146.55205811138015, + "grad_norm": 1.6065607510995505e-08, + "learning_rate": 0.04237127934874337, + "loss": 0.0, + "num_input_tokens_seen": 51732608, + "step": 30190 + }, + { + "epoch": 146.57627118644066, + "grad_norm": 1.7009924135891197e-08, + "learning_rate": 0.042330258470973305, + "loss": 0.0, + "num_input_tokens_seen": 51741056, + "step": 30195 + }, + { + "epoch": 146.6004842615012, + "grad_norm": 2.670191179277026e-08, + "learning_rate": 0.042289254197230515, + "loss": 0.0, + "num_input_tokens_seen": 51749792, + "step": 30200 + }, + { + "epoch": 146.6004842615012, + "eval_loss": 1.1743130683898926, + "eval_runtime": 4.6331, + "eval_samples_per_second": 79.213, + "eval_steps_per_second": 19.857, + "num_input_tokens_seen": 51749792, + "step": 30200 + }, + { + "epoch": 146.62469733656175, + "grad_norm": 1.1872691274561475e-08, + "learning_rate": 0.04224826653383823, + "loss": 0.0, + "num_input_tokens_seen": 51758240, + "step": 30205 + }, + { + "epoch": 146.64891041162227, + "grad_norm": 1.9411585228112926e-08, + "learning_rate": 0.04220729548711735, + "loss": 0.0, + "num_input_tokens_seen": 51766880, + "step": 30210 + }, + { + "epoch": 146.6731234866828, + "grad_norm": 6.539329522325943e-09, + "learning_rate": 0.04216634106338616, + "loss": 0.0, + "num_input_tokens_seen": 51775392, + "step": 30215 + }, + { + "epoch": 146.69733656174333, + "grad_norm": 1.6099916066991682e-08, + "learning_rate": 0.04212540326896025, + "loss": 0.0, + "num_input_tokens_seen": 51784160, + "step": 30220 + }, + { + "epoch": 146.72154963680387, + "grad_norm": 1.4311758889107296e-08, + "learning_rate": 0.0420844821101528, + "loss": 0.0, + "num_input_tokens_seen": 51792512, + "step": 30225 + }, + { + "epoch": 146.74576271186442, + "grad_norm": 9.776970344432812e-09, + "learning_rate": 0.04204357759327441, + "loss": 0.0, + "num_input_tokens_seen": 51800992, + "step": 30230 + }, + { + "epoch": 146.76997578692493, + "grad_norm": 2.024218659357757e-08, + "learning_rate": 0.042002689724632954, + "loss": 0.0, + "num_input_tokens_seen": 51809504, + "step": 30235 + }, + { + "epoch": 146.79418886198548, + "grad_norm": 7.129047574494507e-09, + "learning_rate": 0.04196181851053398, + "loss": 0.0, + "num_input_tokens_seen": 51818336, + "step": 30240 + }, + { + "epoch": 146.818401937046, + "grad_norm": 8.050467847908749e-09, + "learning_rate": 0.041920963957280295, + "loss": 0.0, + "num_input_tokens_seen": 51826976, + "step": 30245 + }, + { + "epoch": 146.84261501210653, + "grad_norm": 1.3860691261413649e-08, + "learning_rate": 0.04188012607117212, + "loss": 0.0, + "num_input_tokens_seen": 51836000, + "step": 30250 + }, + { + "epoch": 146.86682808716708, + "grad_norm": 1.5653712992502733e-08, + "learning_rate": 0.04183930485850725, + "loss": 0.0, + "num_input_tokens_seen": 51844864, + "step": 30255 + }, + { + "epoch": 146.8910411622276, + "grad_norm": 9.84515846624845e-09, + "learning_rate": 0.04179850032558078, + "loss": 0.0, + "num_input_tokens_seen": 51853696, + "step": 30260 + }, + { + "epoch": 146.91525423728814, + "grad_norm": 9.65814628273165e-09, + "learning_rate": 0.041757712478685295, + "loss": 0.0, + "num_input_tokens_seen": 51862336, + "step": 30265 + }, + { + "epoch": 146.93946731234865, + "grad_norm": 1.1152340384512627e-08, + "learning_rate": 0.04171694132411085, + "loss": 0.0, + "num_input_tokens_seen": 51870784, + "step": 30270 + }, + { + "epoch": 146.9636803874092, + "grad_norm": 7.765874165954756e-09, + "learning_rate": 0.04167618686814479, + "loss": 0.0, + "num_input_tokens_seen": 51879488, + "step": 30275 + }, + { + "epoch": 146.98789346246974, + "grad_norm": 1.5585296608833232e-08, + "learning_rate": 0.041635449117072024, + "loss": 0.0, + "num_input_tokens_seen": 51887648, + "step": 30280 + }, + { + "epoch": 147.01452784503633, + "grad_norm": 1.1592022453044137e-08, + "learning_rate": 0.04159472807717477, + "loss": 0.0, + "num_input_tokens_seen": 51896448, + "step": 30285 + }, + { + "epoch": 147.03874092009684, + "grad_norm": 7.219319364537569e-09, + "learning_rate": 0.041554023754732744, + "loss": 0.0, + "num_input_tokens_seen": 51905088, + "step": 30290 + }, + { + "epoch": 147.0629539951574, + "grad_norm": 1.3195297299262165e-08, + "learning_rate": 0.04151333615602311, + "loss": 0.0, + "num_input_tokens_seen": 51913888, + "step": 30295 + }, + { + "epoch": 147.08716707021793, + "grad_norm": 1.5919496831884317e-08, + "learning_rate": 0.04147266528732034, + "loss": 0.0, + "num_input_tokens_seen": 51922688, + "step": 30300 + }, + { + "epoch": 147.11138014527845, + "grad_norm": 9.950601231878409e-09, + "learning_rate": 0.0414320111548964, + "loss": 0.0, + "num_input_tokens_seen": 51931360, + "step": 30305 + }, + { + "epoch": 147.135593220339, + "grad_norm": 8.061603828934949e-09, + "learning_rate": 0.04139137376502076, + "loss": 0.0, + "num_input_tokens_seen": 51939744, + "step": 30310 + }, + { + "epoch": 147.1598062953995, + "grad_norm": 9.311283299950901e-09, + "learning_rate": 0.04135075312396014, + "loss": 0.0, + "num_input_tokens_seen": 51947840, + "step": 30315 + }, + { + "epoch": 147.18401937046005, + "grad_norm": 1.6900246535556107e-08, + "learning_rate": 0.04131014923797875, + "loss": 0.0, + "num_input_tokens_seen": 51956512, + "step": 30320 + }, + { + "epoch": 147.2082324455206, + "grad_norm": 1.5468653913330854e-08, + "learning_rate": 0.04126956211333819, + "loss": 0.0, + "num_input_tokens_seen": 51965024, + "step": 30325 + }, + { + "epoch": 147.2324455205811, + "grad_norm": 7.684366920557295e-09, + "learning_rate": 0.041228991756297545, + "loss": 0.0, + "num_input_tokens_seen": 51973312, + "step": 30330 + }, + { + "epoch": 147.25665859564165, + "grad_norm": 1.173631680728704e-08, + "learning_rate": 0.04118843817311332, + "loss": 0.0, + "num_input_tokens_seen": 51981888, + "step": 30335 + }, + { + "epoch": 147.28087167070217, + "grad_norm": 1.4024086780750622e-08, + "learning_rate": 0.0411479013700393, + "loss": 0.0, + "num_input_tokens_seen": 51990144, + "step": 30340 + }, + { + "epoch": 147.3050847457627, + "grad_norm": 1.3375200502707685e-08, + "learning_rate": 0.0411073813533268, + "loss": 0.0, + "num_input_tokens_seen": 51998848, + "step": 30345 + }, + { + "epoch": 147.32929782082326, + "grad_norm": 1.0711882936220718e-08, + "learning_rate": 0.04106687812922456, + "loss": 0.0, + "num_input_tokens_seen": 52007488, + "step": 30350 + }, + { + "epoch": 147.35351089588377, + "grad_norm": 1.3936619858156973e-08, + "learning_rate": 0.041026391703978635, + "loss": 0.0, + "num_input_tokens_seen": 52015712, + "step": 30355 + }, + { + "epoch": 147.37772397094432, + "grad_norm": 6.257645068785678e-09, + "learning_rate": 0.04098592208383259, + "loss": 0.0, + "num_input_tokens_seen": 52024544, + "step": 30360 + }, + { + "epoch": 147.40193704600483, + "grad_norm": 1.4087973454479652e-08, + "learning_rate": 0.040945469275027256, + "loss": 0.0, + "num_input_tokens_seen": 52033408, + "step": 30365 + }, + { + "epoch": 147.42615012106538, + "grad_norm": 1.0670822447877981e-08, + "learning_rate": 0.04090503328380104, + "loss": 0.0, + "num_input_tokens_seen": 52042080, + "step": 30370 + }, + { + "epoch": 147.45036319612592, + "grad_norm": 1.552272443916536e-08, + "learning_rate": 0.04086461411638971, + "loss": 0.0, + "num_input_tokens_seen": 52050336, + "step": 30375 + }, + { + "epoch": 147.47457627118644, + "grad_norm": 4.50619008773856e-09, + "learning_rate": 0.04082421177902631, + "loss": 0.0, + "num_input_tokens_seen": 52058976, + "step": 30380 + }, + { + "epoch": 147.49878934624698, + "grad_norm": 9.380284105020564e-09, + "learning_rate": 0.04078382627794149, + "loss": 0.0, + "num_input_tokens_seen": 52067744, + "step": 30385 + }, + { + "epoch": 147.5230024213075, + "grad_norm": 1.1119143827897915e-08, + "learning_rate": 0.04074345761936316, + "loss": 0.0, + "num_input_tokens_seen": 52076512, + "step": 30390 + }, + { + "epoch": 147.54721549636804, + "grad_norm": 8.994273770213113e-09, + "learning_rate": 0.04070310580951663, + "loss": 0.0, + "num_input_tokens_seen": 52085376, + "step": 30395 + }, + { + "epoch": 147.57142857142858, + "grad_norm": 6.256805296089851e-09, + "learning_rate": 0.040662770854624726, + "loss": 0.0, + "num_input_tokens_seen": 52094304, + "step": 30400 + }, + { + "epoch": 147.57142857142858, + "eval_loss": 1.174845576286316, + "eval_runtime": 4.617, + "eval_samples_per_second": 79.489, + "eval_steps_per_second": 19.926, + "num_input_tokens_seen": 52094304, + "step": 30400 + }, + { + "epoch": 147.5956416464891, + "grad_norm": 1.5715130530224997e-08, + "learning_rate": 0.040622452760907535, + "loss": 0.0, + "num_input_tokens_seen": 52102944, + "step": 30405 + }, + { + "epoch": 147.61985472154964, + "grad_norm": 7.66190400014466e-09, + "learning_rate": 0.04058215153458265, + "loss": 0.0, + "num_input_tokens_seen": 52111616, + "step": 30410 + }, + { + "epoch": 147.64406779661016, + "grad_norm": 1.5790302398954736e-08, + "learning_rate": 0.04054186718186507, + "loss": 0.0, + "num_input_tokens_seen": 52120032, + "step": 30415 + }, + { + "epoch": 147.6682808716707, + "grad_norm": 1.179273656504165e-08, + "learning_rate": 0.04050159970896708, + "loss": 0.0, + "num_input_tokens_seen": 52128416, + "step": 30420 + }, + { + "epoch": 147.69249394673125, + "grad_norm": 1.1085135476207597e-08, + "learning_rate": 0.04046134912209843, + "loss": 0.0, + "num_input_tokens_seen": 52137120, + "step": 30425 + }, + { + "epoch": 147.71670702179176, + "grad_norm": 1.1044433811946419e-08, + "learning_rate": 0.040421115427466354, + "loss": 0.0, + "num_input_tokens_seen": 52145664, + "step": 30430 + }, + { + "epoch": 147.7409200968523, + "grad_norm": 6.93050594691158e-09, + "learning_rate": 0.04038089863127529, + "loss": 0.0, + "num_input_tokens_seen": 52154080, + "step": 30435 + }, + { + "epoch": 147.76513317191282, + "grad_norm": 1.4070336007421247e-08, + "learning_rate": 0.04034069873972727, + "loss": 0.0, + "num_input_tokens_seen": 52162880, + "step": 30440 + }, + { + "epoch": 147.78934624697337, + "grad_norm": 1.7905264826367784e-08, + "learning_rate": 0.040300515759021514, + "loss": 0.0, + "num_input_tokens_seen": 52171040, + "step": 30445 + }, + { + "epoch": 147.8135593220339, + "grad_norm": 1.8160980275183647e-08, + "learning_rate": 0.04026034969535478, + "loss": 0.0, + "num_input_tokens_seen": 52179584, + "step": 30450 + }, + { + "epoch": 147.83777239709443, + "grad_norm": 9.613298601607312e-09, + "learning_rate": 0.040220200554921266, + "loss": 0.0, + "num_input_tokens_seen": 52188032, + "step": 30455 + }, + { + "epoch": 147.86198547215497, + "grad_norm": 7.028855275592605e-09, + "learning_rate": 0.0401800683439124, + "loss": 0.0, + "num_input_tokens_seen": 52196192, + "step": 30460 + }, + { + "epoch": 147.88619854721549, + "grad_norm": 1.2082498557219878e-08, + "learning_rate": 0.04013995306851704, + "loss": 0.0, + "num_input_tokens_seen": 52204704, + "step": 30465 + }, + { + "epoch": 147.91041162227603, + "grad_norm": 1.7648970285222276e-08, + "learning_rate": 0.040099854734921545, + "loss": 0.0, + "num_input_tokens_seen": 52213280, + "step": 30470 + }, + { + "epoch": 147.93462469733657, + "grad_norm": 8.734660994491605e-09, + "learning_rate": 0.0400597733493095, + "loss": 0.0, + "num_input_tokens_seen": 52222176, + "step": 30475 + }, + { + "epoch": 147.9588377723971, + "grad_norm": 7.0304704280488295e-09, + "learning_rate": 0.04001970891786203, + "loss": 0.0, + "num_input_tokens_seen": 52230272, + "step": 30480 + }, + { + "epoch": 147.98305084745763, + "grad_norm": 1.1989767401132667e-08, + "learning_rate": 0.03997966144675752, + "loss": 0.0, + "num_input_tokens_seen": 52238656, + "step": 30485 + }, + { + "epoch": 148.00968523002422, + "grad_norm": 1.0997050381433837e-08, + "learning_rate": 0.039939630942171796, + "loss": 0.0, + "num_input_tokens_seen": 52247520, + "step": 30490 + }, + { + "epoch": 148.03389830508473, + "grad_norm": 1.2994079590100682e-08, + "learning_rate": 0.03989961741027815, + "loss": 0.0, + "num_input_tokens_seen": 52256128, + "step": 30495 + }, + { + "epoch": 148.05811138014528, + "grad_norm": 6.521697404338056e-09, + "learning_rate": 0.03985962085724704, + "loss": 0.0, + "num_input_tokens_seen": 52264544, + "step": 30500 + }, + { + "epoch": 148.08232445520582, + "grad_norm": 9.838768022518707e-09, + "learning_rate": 0.03981964128924656, + "loss": 0.0, + "num_input_tokens_seen": 52272832, + "step": 30505 + }, + { + "epoch": 148.10653753026634, + "grad_norm": 6.370004967948262e-09, + "learning_rate": 0.03977967871244197, + "loss": 0.0, + "num_input_tokens_seen": 52281728, + "step": 30510 + }, + { + "epoch": 148.13075060532688, + "grad_norm": 2.0098221753528378e-08, + "learning_rate": 0.03973973313299602, + "loss": 0.0, + "num_input_tokens_seen": 52290400, + "step": 30515 + }, + { + "epoch": 148.1549636803874, + "grad_norm": 6.356193793521925e-09, + "learning_rate": 0.0396998045570689, + "loss": 0.0, + "num_input_tokens_seen": 52298848, + "step": 30520 + }, + { + "epoch": 148.17917675544794, + "grad_norm": 1.8911062937831957e-08, + "learning_rate": 0.03965989299081798, + "loss": 0.0, + "num_input_tokens_seen": 52307136, + "step": 30525 + }, + { + "epoch": 148.20338983050848, + "grad_norm": 1.551587658354947e-08, + "learning_rate": 0.039619998440398235, + "loss": 0.0, + "num_input_tokens_seen": 52315808, + "step": 30530 + }, + { + "epoch": 148.227602905569, + "grad_norm": 1.280192307717698e-08, + "learning_rate": 0.03958012091196184, + "loss": 0.0, + "num_input_tokens_seen": 52324256, + "step": 30535 + }, + { + "epoch": 148.25181598062954, + "grad_norm": 8.887250935174507e-09, + "learning_rate": 0.039540260411658396, + "loss": 0.0, + "num_input_tokens_seen": 52332672, + "step": 30540 + }, + { + "epoch": 148.27602905569006, + "grad_norm": 8.982012467129152e-09, + "learning_rate": 0.03950041694563496, + "loss": 0.0, + "num_input_tokens_seen": 52341216, + "step": 30545 + }, + { + "epoch": 148.3002421307506, + "grad_norm": 1.1407072619817882e-08, + "learning_rate": 0.0394605905200358, + "loss": 0.0, + "num_input_tokens_seen": 52349952, + "step": 30550 + }, + { + "epoch": 148.32445520581115, + "grad_norm": 8.872219403599502e-09, + "learning_rate": 0.03942078114100272, + "loss": 0.0, + "num_input_tokens_seen": 52358752, + "step": 30555 + }, + { + "epoch": 148.34866828087166, + "grad_norm": 1.521942749604932e-08, + "learning_rate": 0.03938098881467485, + "loss": 0.0, + "num_input_tokens_seen": 52367168, + "step": 30560 + }, + { + "epoch": 148.3728813559322, + "grad_norm": 1.0102545466850188e-08, + "learning_rate": 0.039341213547188586, + "loss": 0.0, + "num_input_tokens_seen": 52375776, + "step": 30565 + }, + { + "epoch": 148.39709443099272, + "grad_norm": 3.605069354151169e-09, + "learning_rate": 0.03930145534467782, + "loss": 0.0, + "num_input_tokens_seen": 52384384, + "step": 30570 + }, + { + "epoch": 148.42130750605327, + "grad_norm": 1.1349076345368303e-08, + "learning_rate": 0.0392617142132738, + "loss": 0.0, + "num_input_tokens_seen": 52393504, + "step": 30575 + }, + { + "epoch": 148.4455205811138, + "grad_norm": 2.638230789386853e-08, + "learning_rate": 0.03922199015910504, + "loss": 0.0, + "num_input_tokens_seen": 52402208, + "step": 30580 + }, + { + "epoch": 148.46973365617433, + "grad_norm": 1.0251618220991077e-08, + "learning_rate": 0.039182283188297556, + "loss": 0.0, + "num_input_tokens_seen": 52410368, + "step": 30585 + }, + { + "epoch": 148.49394673123487, + "grad_norm": 7.218542208420331e-09, + "learning_rate": 0.039142593306974595, + "loss": 0.0, + "num_input_tokens_seen": 52418528, + "step": 30590 + }, + { + "epoch": 148.5181598062954, + "grad_norm": 1.7827131770786764e-08, + "learning_rate": 0.039102920521256856, + "loss": 0.0, + "num_input_tokens_seen": 52427040, + "step": 30595 + }, + { + "epoch": 148.54237288135593, + "grad_norm": 7.419153735810369e-09, + "learning_rate": 0.03906326483726243, + "loss": 0.0, + "num_input_tokens_seen": 52436000, + "step": 30600 + }, + { + "epoch": 148.54237288135593, + "eval_loss": 1.174591064453125, + "eval_runtime": 4.6213, + "eval_samples_per_second": 79.416, + "eval_steps_per_second": 19.908, + "num_input_tokens_seen": 52436000, + "step": 30600 + }, + { + "epoch": 148.56658595641647, + "grad_norm": 1.4364773370800776e-08, + "learning_rate": 0.039023626261106704, + "loss": 0.0, + "num_input_tokens_seen": 52444896, + "step": 30605 + }, + { + "epoch": 148.590799031477, + "grad_norm": 9.436705639132015e-09, + "learning_rate": 0.03898400479890237, + "loss": 0.0, + "num_input_tokens_seen": 52453376, + "step": 30610 + }, + { + "epoch": 148.61501210653753, + "grad_norm": 1.7166899013432158e-08, + "learning_rate": 0.038944400456759655, + "loss": 0.0, + "num_input_tokens_seen": 52461984, + "step": 30615 + }, + { + "epoch": 148.63922518159805, + "grad_norm": 6.116234629871542e-09, + "learning_rate": 0.038904813240785964, + "loss": 0.0, + "num_input_tokens_seen": 52470592, + "step": 30620 + }, + { + "epoch": 148.6634382566586, + "grad_norm": 1.8509650701048486e-08, + "learning_rate": 0.03886524315708621, + "loss": 0.0, + "num_input_tokens_seen": 52479168, + "step": 30625 + }, + { + "epoch": 148.68765133171914, + "grad_norm": 8.06722599833165e-09, + "learning_rate": 0.03882569021176255, + "loss": 0.0, + "num_input_tokens_seen": 52487776, + "step": 30630 + }, + { + "epoch": 148.71186440677965, + "grad_norm": 9.682526780352418e-09, + "learning_rate": 0.038786154410914535, + "loss": 0.0, + "num_input_tokens_seen": 52496192, + "step": 30635 + }, + { + "epoch": 148.7360774818402, + "grad_norm": 6.041338096451909e-09, + "learning_rate": 0.03874663576063917, + "loss": 0.0, + "num_input_tokens_seen": 52504768, + "step": 30640 + }, + { + "epoch": 148.7602905569007, + "grad_norm": 2.5010840065675666e-09, + "learning_rate": 0.038707134267030624, + "loss": 0.0, + "num_input_tokens_seen": 52512800, + "step": 30645 + }, + { + "epoch": 148.78450363196126, + "grad_norm": 6.204585734082002e-09, + "learning_rate": 0.038667649936180555, + "loss": 0.0, + "num_input_tokens_seen": 52520992, + "step": 30650 + }, + { + "epoch": 148.8087167070218, + "grad_norm": 6.762584270347816e-09, + "learning_rate": 0.038628182774178, + "loss": 0.0, + "num_input_tokens_seen": 52529280, + "step": 30655 + }, + { + "epoch": 148.83292978208232, + "grad_norm": 1.0583160126031999e-08, + "learning_rate": 0.038588732787109226, + "loss": 0.0, + "num_input_tokens_seen": 52537632, + "step": 30660 + }, + { + "epoch": 148.85714285714286, + "grad_norm": 7.3464239136455944e-09, + "learning_rate": 0.03854929998105795, + "loss": 0.0, + "num_input_tokens_seen": 52546176, + "step": 30665 + }, + { + "epoch": 148.88135593220338, + "grad_norm": 1.0599113586806652e-08, + "learning_rate": 0.03850988436210518, + "loss": 0.0, + "num_input_tokens_seen": 52554656, + "step": 30670 + }, + { + "epoch": 148.90556900726392, + "grad_norm": 1.0920721216223228e-08, + "learning_rate": 0.03847048593632933, + "loss": 0.0, + "num_input_tokens_seen": 52562976, + "step": 30675 + }, + { + "epoch": 148.92978208232446, + "grad_norm": 7.954934488907384e-09, + "learning_rate": 0.038431104709806096, + "loss": 0.0, + "num_input_tokens_seen": 52571712, + "step": 30680 + }, + { + "epoch": 148.95399515738498, + "grad_norm": 5.170474715043838e-09, + "learning_rate": 0.0383917406886086, + "loss": 0.0, + "num_input_tokens_seen": 52580576, + "step": 30685 + }, + { + "epoch": 148.97820823244552, + "grad_norm": 7.194491669082481e-09, + "learning_rate": 0.03835239387880722, + "loss": 0.0, + "num_input_tokens_seen": 52589120, + "step": 30690 + }, + { + "epoch": 149.0048426150121, + "grad_norm": 1.031265295381445e-08, + "learning_rate": 0.03831306428646979, + "loss": 0.0, + "num_input_tokens_seen": 52597792, + "step": 30695 + }, + { + "epoch": 149.02905569007265, + "grad_norm": 1.0585327281376067e-08, + "learning_rate": 0.03827375191766135, + "loss": 0.0, + "num_input_tokens_seen": 52606432, + "step": 30700 + }, + { + "epoch": 149.05326876513317, + "grad_norm": 5.752681442316998e-09, + "learning_rate": 0.03823445677844446, + "loss": 0.0, + "num_input_tokens_seen": 52614816, + "step": 30705 + }, + { + "epoch": 149.0774818401937, + "grad_norm": 1.0939065653303714e-08, + "learning_rate": 0.03819517887487881, + "loss": 0.0, + "num_input_tokens_seen": 52623456, + "step": 30710 + }, + { + "epoch": 149.10169491525423, + "grad_norm": 8.580036237049171e-09, + "learning_rate": 0.03815591821302161, + "loss": 0.0, + "num_input_tokens_seen": 52632064, + "step": 30715 + }, + { + "epoch": 149.12590799031477, + "grad_norm": 1.2047713049412323e-08, + "learning_rate": 0.03811667479892739, + "loss": 0.0, + "num_input_tokens_seen": 52640832, + "step": 30720 + }, + { + "epoch": 149.15012106537532, + "grad_norm": 5.773595379565677e-09, + "learning_rate": 0.03807744863864788, + "loss": 0.0, + "num_input_tokens_seen": 52649248, + "step": 30725 + }, + { + "epoch": 149.17433414043583, + "grad_norm": 1.2455781295273027e-08, + "learning_rate": 0.03803823973823229, + "loss": 0.0, + "num_input_tokens_seen": 52657888, + "step": 30730 + }, + { + "epoch": 149.19854721549638, + "grad_norm": 2.572312363113838e-09, + "learning_rate": 0.03799904810372719, + "loss": 0.0, + "num_input_tokens_seen": 52666368, + "step": 30735 + }, + { + "epoch": 149.2227602905569, + "grad_norm": 1.2640305691036247e-08, + "learning_rate": 0.03795987374117632, + "loss": 0.0, + "num_input_tokens_seen": 52674624, + "step": 30740 + }, + { + "epoch": 149.24697336561744, + "grad_norm": 7.343976538010111e-09, + "learning_rate": 0.03792071665662093, + "loss": 0.0, + "num_input_tokens_seen": 52683072, + "step": 30745 + }, + { + "epoch": 149.27118644067798, + "grad_norm": 7.68441310583512e-09, + "learning_rate": 0.03788157685609952, + "loss": 0.0, + "num_input_tokens_seen": 52691808, + "step": 30750 + }, + { + "epoch": 149.2953995157385, + "grad_norm": 2.0014889301478433e-08, + "learning_rate": 0.037842454345647876, + "loss": 0.0, + "num_input_tokens_seen": 52700512, + "step": 30755 + }, + { + "epoch": 149.31961259079904, + "grad_norm": 2.2540877608889787e-08, + "learning_rate": 0.03780334913129929, + "loss": 0.0, + "num_input_tokens_seen": 52708928, + "step": 30760 + }, + { + "epoch": 149.34382566585955, + "grad_norm": 7.732416484884652e-09, + "learning_rate": 0.037764261219084175, + "loss": 0.0, + "num_input_tokens_seen": 52717792, + "step": 30765 + }, + { + "epoch": 149.3680387409201, + "grad_norm": 1.7589581347010608e-08, + "learning_rate": 0.037725190615030414, + "loss": 0.0, + "num_input_tokens_seen": 52726720, + "step": 30770 + }, + { + "epoch": 149.39225181598064, + "grad_norm": 1.3152082090073236e-08, + "learning_rate": 0.037686137325163224, + "loss": 0.0, + "num_input_tokens_seen": 52735328, + "step": 30775 + }, + { + "epoch": 149.41646489104116, + "grad_norm": 1.6881978481819715e-08, + "learning_rate": 0.037647101355505065, + "loss": 0.0, + "num_input_tokens_seen": 52743936, + "step": 30780 + }, + { + "epoch": 149.4406779661017, + "grad_norm": 1.3332486226147466e-08, + "learning_rate": 0.03760808271207581, + "loss": 0.0, + "num_input_tokens_seen": 52752320, + "step": 30785 + }, + { + "epoch": 149.46489104116222, + "grad_norm": 1.1495362883806592e-08, + "learning_rate": 0.03756908140089258, + "loss": 0.0, + "num_input_tokens_seen": 52761024, + "step": 30790 + }, + { + "epoch": 149.48910411622276, + "grad_norm": 1.2373304159041254e-08, + "learning_rate": 0.03753009742796989, + "loss": 0.0, + "num_input_tokens_seen": 52769312, + "step": 30795 + }, + { + "epoch": 149.5133171912833, + "grad_norm": 2.4002195786465563e-08, + "learning_rate": 0.037491130799319615, + "loss": 0.0, + "num_input_tokens_seen": 52777984, + "step": 30800 + }, + { + "epoch": 149.5133171912833, + "eval_loss": 1.1759440898895264, + "eval_runtime": 4.6244, + "eval_samples_per_second": 79.362, + "eval_steps_per_second": 19.894, + "num_input_tokens_seen": 52777984, + "step": 30800 + }, + { + "epoch": 149.53753026634382, + "grad_norm": 1.345103761707378e-08, + "learning_rate": 0.03745218152095079, + "loss": 0.0, + "num_input_tokens_seen": 52786336, + "step": 30805 + }, + { + "epoch": 149.56174334140437, + "grad_norm": 7.243169175552566e-09, + "learning_rate": 0.037413249598869935, + "loss": 0.0, + "num_input_tokens_seen": 52794784, + "step": 30810 + }, + { + "epoch": 149.58595641646488, + "grad_norm": 8.47589998187459e-09, + "learning_rate": 0.037374335039080886, + "loss": 0.0, + "num_input_tokens_seen": 52803200, + "step": 30815 + }, + { + "epoch": 149.61016949152543, + "grad_norm": 5.180414319738702e-09, + "learning_rate": 0.037335437847584724, + "loss": 0.0, + "num_input_tokens_seen": 52811840, + "step": 30820 + }, + { + "epoch": 149.63438256658597, + "grad_norm": 1.529634374719535e-08, + "learning_rate": 0.03729655803037983, + "loss": 0.0, + "num_input_tokens_seen": 52820224, + "step": 30825 + }, + { + "epoch": 149.65859564164649, + "grad_norm": 1.3250289754296318e-08, + "learning_rate": 0.03725769559346207, + "loss": 0.0, + "num_input_tokens_seen": 52828544, + "step": 30830 + }, + { + "epoch": 149.68280871670703, + "grad_norm": 1.2232356461083782e-08, + "learning_rate": 0.03721885054282439, + "loss": 0.0, + "num_input_tokens_seen": 52836736, + "step": 30835 + }, + { + "epoch": 149.70702179176754, + "grad_norm": 8.900223669172647e-09, + "learning_rate": 0.03718002288445731, + "loss": 0.0, + "num_input_tokens_seen": 52845024, + "step": 30840 + }, + { + "epoch": 149.7312348668281, + "grad_norm": 7.475910557275256e-09, + "learning_rate": 0.03714121262434844, + "loss": 0.0, + "num_input_tokens_seen": 52853376, + "step": 30845 + }, + { + "epoch": 149.75544794188863, + "grad_norm": 8.33183921855607e-09, + "learning_rate": 0.037102419768482844, + "loss": 0.0, + "num_input_tokens_seen": 52862112, + "step": 30850 + }, + { + "epoch": 149.77966101694915, + "grad_norm": 1.433976226650202e-08, + "learning_rate": 0.03706364432284293, + "loss": 0.0, + "num_input_tokens_seen": 52870368, + "step": 30855 + }, + { + "epoch": 149.8038740920097, + "grad_norm": 3.769192513658481e-09, + "learning_rate": 0.03702488629340828, + "loss": 0.0, + "num_input_tokens_seen": 52879072, + "step": 30860 + }, + { + "epoch": 149.8280871670702, + "grad_norm": 1.0953217888243216e-08, + "learning_rate": 0.036986145686155915, + "loss": 0.0, + "num_input_tokens_seen": 52887776, + "step": 30865 + }, + { + "epoch": 149.85230024213075, + "grad_norm": 1.4065776987592926e-08, + "learning_rate": 0.036947422507060075, + "loss": 0.0, + "num_input_tokens_seen": 52896384, + "step": 30870 + }, + { + "epoch": 149.8765133171913, + "grad_norm": 2.855471192830805e-09, + "learning_rate": 0.0369087167620924, + "loss": 0.0, + "num_input_tokens_seen": 52904576, + "step": 30875 + }, + { + "epoch": 149.9007263922518, + "grad_norm": 6.045723033309969e-09, + "learning_rate": 0.03687002845722183, + "loss": 0.0, + "num_input_tokens_seen": 52913280, + "step": 30880 + }, + { + "epoch": 149.92493946731236, + "grad_norm": 7.043011063245785e-09, + "learning_rate": 0.03683135759841451, + "loss": 0.0, + "num_input_tokens_seen": 52921920, + "step": 30885 + }, + { + "epoch": 149.94915254237287, + "grad_norm": 1.1674437416786532e-08, + "learning_rate": 0.03679270419163406, + "loss": 0.0, + "num_input_tokens_seen": 52930720, + "step": 30890 + }, + { + "epoch": 149.97336561743342, + "grad_norm": 1.0778809844680382e-08, + "learning_rate": 0.03675406824284127, + "loss": 0.0, + "num_input_tokens_seen": 52939744, + "step": 30895 + }, + { + "epoch": 149.99757869249396, + "grad_norm": 2.9585591754255347e-09, + "learning_rate": 0.03671544975799425, + "loss": 0.0, + "num_input_tokens_seen": 52948448, + "step": 30900 + }, + { + "epoch": 150.02421307506054, + "grad_norm": 8.996677181016821e-09, + "learning_rate": 0.03667684874304854, + "loss": 0.0, + "num_input_tokens_seen": 52957408, + "step": 30905 + }, + { + "epoch": 150.04842615012106, + "grad_norm": 4.30013535890339e-09, + "learning_rate": 0.03663826520395683, + "loss": 0.0, + "num_input_tokens_seen": 52965856, + "step": 30910 + }, + { + "epoch": 150.0726392251816, + "grad_norm": 5.2290078933481254e-09, + "learning_rate": 0.03659969914666922, + "loss": 0.0, + "num_input_tokens_seen": 52974784, + "step": 30915 + }, + { + "epoch": 150.09685230024212, + "grad_norm": 9.583247084776758e-09, + "learning_rate": 0.036561150577133106, + "loss": 0.0, + "num_input_tokens_seen": 52983264, + "step": 30920 + }, + { + "epoch": 150.12106537530266, + "grad_norm": 8.70340155501026e-09, + "learning_rate": 0.036522619501293103, + "loss": 0.0, + "num_input_tokens_seen": 52991808, + "step": 30925 + }, + { + "epoch": 150.1452784503632, + "grad_norm": 1.8943294932682875e-08, + "learning_rate": 0.03648410592509122, + "loss": 0.0, + "num_input_tokens_seen": 53000672, + "step": 30930 + }, + { + "epoch": 150.16949152542372, + "grad_norm": 1.8220896791376617e-08, + "learning_rate": 0.03644560985446676, + "loss": 0.0, + "num_input_tokens_seen": 53008896, + "step": 30935 + }, + { + "epoch": 150.19370460048427, + "grad_norm": 1.0618387946692565e-08, + "learning_rate": 0.036407131295356256, + "loss": 0.0, + "num_input_tokens_seen": 53017376, + "step": 30940 + }, + { + "epoch": 150.21791767554478, + "grad_norm": 1.3923739494714482e-08, + "learning_rate": 0.03636867025369362, + "loss": 0.0, + "num_input_tokens_seen": 53026496, + "step": 30945 + }, + { + "epoch": 150.24213075060533, + "grad_norm": 9.677733281421297e-09, + "learning_rate": 0.03633022673540999, + "loss": 0.0, + "num_input_tokens_seen": 53035040, + "step": 30950 + }, + { + "epoch": 150.26634382566587, + "grad_norm": 1.064369747894034e-08, + "learning_rate": 0.03629180074643385, + "loss": 0.0, + "num_input_tokens_seen": 53043552, + "step": 30955 + }, + { + "epoch": 150.2905569007264, + "grad_norm": 7.625452269621746e-09, + "learning_rate": 0.03625339229269102, + "loss": 0.0, + "num_input_tokens_seen": 53052256, + "step": 30960 + }, + { + "epoch": 150.31476997578693, + "grad_norm": 1.2213565270258187e-08, + "learning_rate": 0.036215001380104535, + "loss": 0.0, + "num_input_tokens_seen": 53060928, + "step": 30965 + }, + { + "epoch": 150.33898305084745, + "grad_norm": 1.4671229564555688e-08, + "learning_rate": 0.03617662801459471, + "loss": 0.0, + "num_input_tokens_seen": 53069376, + "step": 30970 + }, + { + "epoch": 150.363196125908, + "grad_norm": 1.9120184546750352e-08, + "learning_rate": 0.036138272202079276, + "loss": 0.0, + "num_input_tokens_seen": 53077792, + "step": 30975 + }, + { + "epoch": 150.38740920096853, + "grad_norm": 8.489030811631437e-09, + "learning_rate": 0.036099933948473106, + "loss": 0.0, + "num_input_tokens_seen": 53086432, + "step": 30980 + }, + { + "epoch": 150.41162227602905, + "grad_norm": 9.792923805207465e-09, + "learning_rate": 0.03606161325968851, + "loss": 0.0, + "num_input_tokens_seen": 53094592, + "step": 30985 + }, + { + "epoch": 150.4358353510896, + "grad_norm": 7.391207645923714e-09, + "learning_rate": 0.03602331014163496, + "loss": 0.0, + "num_input_tokens_seen": 53102880, + "step": 30990 + }, + { + "epoch": 150.4600484261501, + "grad_norm": 2.456664383032603e-08, + "learning_rate": 0.035985024600219295, + "loss": 0.0, + "num_input_tokens_seen": 53111488, + "step": 30995 + }, + { + "epoch": 150.48426150121065, + "grad_norm": 1.7475203506478465e-08, + "learning_rate": 0.03594675664134569, + "loss": 0.0, + "num_input_tokens_seen": 53119904, + "step": 31000 + }, + { + "epoch": 150.48426150121065, + "eval_loss": 1.177126169204712, + "eval_runtime": 4.6162, + "eval_samples_per_second": 79.502, + "eval_steps_per_second": 19.93, + "num_input_tokens_seen": 53119904, + "step": 31000 + }, + { + "epoch": 150.5084745762712, + "grad_norm": 1.568991159217603e-08, + "learning_rate": 0.03590850627091545, + "loss": 0.0, + "num_input_tokens_seen": 53128064, + "step": 31005 + }, + { + "epoch": 150.5326876513317, + "grad_norm": 3.229231548829148e-09, + "learning_rate": 0.03587027349482731, + "loss": 0.0, + "num_input_tokens_seen": 53136288, + "step": 31010 + }, + { + "epoch": 150.55690072639226, + "grad_norm": 5.4020139472754636e-09, + "learning_rate": 0.035832058318977275, + "loss": 0.0, + "num_input_tokens_seen": 53144896, + "step": 31015 + }, + { + "epoch": 150.58111380145277, + "grad_norm": 9.851221172141322e-09, + "learning_rate": 0.03579386074925853, + "loss": 0.0, + "num_input_tokens_seen": 53153856, + "step": 31020 + }, + { + "epoch": 150.60532687651332, + "grad_norm": 1.1925378018418087e-08, + "learning_rate": 0.035755680791561696, + "loss": 0.0, + "num_input_tokens_seen": 53162336, + "step": 31025 + }, + { + "epoch": 150.62953995157386, + "grad_norm": 5.33763344634508e-09, + "learning_rate": 0.03571751845177454, + "loss": 0.0, + "num_input_tokens_seen": 53170848, + "step": 31030 + }, + { + "epoch": 150.65375302663438, + "grad_norm": 2.070651916596944e-08, + "learning_rate": 0.03567937373578225, + "loss": 0.0, + "num_input_tokens_seen": 53179456, + "step": 31035 + }, + { + "epoch": 150.67796610169492, + "grad_norm": 1.2598312615352825e-08, + "learning_rate": 0.03564124664946711, + "loss": 0.0, + "num_input_tokens_seen": 53187680, + "step": 31040 + }, + { + "epoch": 150.70217917675544, + "grad_norm": 1.6121134649438318e-08, + "learning_rate": 0.035603137198708924, + "loss": 0.0, + "num_input_tokens_seen": 53196288, + "step": 31045 + }, + { + "epoch": 150.72639225181598, + "grad_norm": 8.850729926734857e-09, + "learning_rate": 0.035565045389384514, + "loss": 0.0, + "num_input_tokens_seen": 53204864, + "step": 31050 + }, + { + "epoch": 150.75060532687652, + "grad_norm": 1.3271098886491473e-08, + "learning_rate": 0.03552697122736823, + "loss": 0.0, + "num_input_tokens_seen": 53213312, + "step": 31055 + }, + { + "epoch": 150.77481840193704, + "grad_norm": 1.1420787870974891e-08, + "learning_rate": 0.03548891471853153, + "loss": 0.0, + "num_input_tokens_seen": 53221760, + "step": 31060 + }, + { + "epoch": 150.79903147699758, + "grad_norm": 1.4014032601039617e-08, + "learning_rate": 0.03545087586874322, + "loss": 0.0, + "num_input_tokens_seen": 53230624, + "step": 31065 + }, + { + "epoch": 150.8232445520581, + "grad_norm": 1.1121030318861358e-08, + "learning_rate": 0.03541285468386935, + "loss": 0.0, + "num_input_tokens_seen": 53239136, + "step": 31070 + }, + { + "epoch": 150.84745762711864, + "grad_norm": 1.570685270735339e-08, + "learning_rate": 0.03537485116977327, + "loss": 0.0, + "num_input_tokens_seen": 53248032, + "step": 31075 + }, + { + "epoch": 150.8716707021792, + "grad_norm": 1.8408510271683554e-08, + "learning_rate": 0.03533686533231565, + "loss": 0.0, + "num_input_tokens_seen": 53256352, + "step": 31080 + }, + { + "epoch": 150.8958837772397, + "grad_norm": 2.45686262445588e-08, + "learning_rate": 0.0352988971773543, + "loss": 0.0, + "num_input_tokens_seen": 53265056, + "step": 31085 + }, + { + "epoch": 150.92009685230025, + "grad_norm": 1.4502807843541632e-08, + "learning_rate": 0.03526094671074443, + "loss": 0.0, + "num_input_tokens_seen": 53273760, + "step": 31090 + }, + { + "epoch": 150.94430992736076, + "grad_norm": 1.539796912197744e-08, + "learning_rate": 0.03522301393833852, + "loss": 0.0, + "num_input_tokens_seen": 53282272, + "step": 31095 + }, + { + "epoch": 150.9685230024213, + "grad_norm": 1.2976044239110252e-08, + "learning_rate": 0.035185098865986204, + "loss": 0.0, + "num_input_tokens_seen": 53291104, + "step": 31100 + }, + { + "epoch": 150.99273607748185, + "grad_norm": 1.6217201803669923e-08, + "learning_rate": 0.03514720149953453, + "loss": 0.0, + "num_input_tokens_seen": 53300000, + "step": 31105 + }, + { + "epoch": 151.01937046004844, + "grad_norm": 9.929328470548171e-09, + "learning_rate": 0.03510932184482773, + "loss": 0.0, + "num_input_tokens_seen": 53308928, + "step": 31110 + }, + { + "epoch": 151.04358353510895, + "grad_norm": 1.339508681752477e-08, + "learning_rate": 0.03507145990770724, + "loss": 0.0, + "num_input_tokens_seen": 53317728, + "step": 31115 + }, + { + "epoch": 151.0677966101695, + "grad_norm": 1.5966893585073194e-08, + "learning_rate": 0.035033615694011984, + "loss": 0.0, + "num_input_tokens_seen": 53326080, + "step": 31120 + }, + { + "epoch": 151.09200968523, + "grad_norm": 1.496290202851469e-08, + "learning_rate": 0.03499578920957788, + "loss": 0.0, + "num_input_tokens_seen": 53334432, + "step": 31125 + }, + { + "epoch": 151.11622276029055, + "grad_norm": 1.3115142749597908e-08, + "learning_rate": 0.034957980460238375, + "loss": 0.0, + "num_input_tokens_seen": 53343072, + "step": 31130 + }, + { + "epoch": 151.1404358353511, + "grad_norm": 1.5553784038502272e-08, + "learning_rate": 0.03492018945182393, + "loss": 0.0, + "num_input_tokens_seen": 53351520, + "step": 31135 + }, + { + "epoch": 151.16464891041161, + "grad_norm": 1.0585335274981844e-08, + "learning_rate": 0.03488241619016247, + "loss": 0.0, + "num_input_tokens_seen": 53360256, + "step": 31140 + }, + { + "epoch": 151.18886198547216, + "grad_norm": 1.8737857487849396e-08, + "learning_rate": 0.03484466068107913, + "loss": 0.0, + "num_input_tokens_seen": 53368896, + "step": 31145 + }, + { + "epoch": 151.21307506053267, + "grad_norm": 9.875384954227684e-09, + "learning_rate": 0.034806922930396195, + "loss": 0.0, + "num_input_tokens_seen": 53377088, + "step": 31150 + }, + { + "epoch": 151.23728813559322, + "grad_norm": 5.884084330887163e-09, + "learning_rate": 0.03476920294393337, + "loss": 0.0, + "num_input_tokens_seen": 53385632, + "step": 31155 + }, + { + "epoch": 151.26150121065376, + "grad_norm": 1.3534134701842504e-08, + "learning_rate": 0.03473150072750755, + "loss": 0.0, + "num_input_tokens_seen": 53394144, + "step": 31160 + }, + { + "epoch": 151.28571428571428, + "grad_norm": 8.156118447288918e-09, + "learning_rate": 0.03469381628693284, + "loss": 0.0, + "num_input_tokens_seen": 53402624, + "step": 31165 + }, + { + "epoch": 151.30992736077482, + "grad_norm": 1.0864354749173799e-08, + "learning_rate": 0.03465614962802072, + "loss": 0.0, + "num_input_tokens_seen": 53411296, + "step": 31170 + }, + { + "epoch": 151.33414043583534, + "grad_norm": 1.3788784336554727e-08, + "learning_rate": 0.0346185007565798, + "loss": 0.0, + "num_input_tokens_seen": 53419584, + "step": 31175 + }, + { + "epoch": 151.35835351089588, + "grad_norm": 1.0619634061015404e-08, + "learning_rate": 0.03458086967841609, + "loss": 0.0, + "num_input_tokens_seen": 53428448, + "step": 31180 + }, + { + "epoch": 151.38256658595643, + "grad_norm": 8.782193638978697e-09, + "learning_rate": 0.03454325639933266, + "loss": 0.0, + "num_input_tokens_seen": 53437184, + "step": 31185 + }, + { + "epoch": 151.40677966101694, + "grad_norm": 9.940211320724757e-09, + "learning_rate": 0.03450566092513007, + "loss": 0.0, + "num_input_tokens_seen": 53445792, + "step": 31190 + }, + { + "epoch": 151.43099273607749, + "grad_norm": 8.703836762435913e-09, + "learning_rate": 0.034468083261605914, + "loss": 0.0, + "num_input_tokens_seen": 53454368, + "step": 31195 + }, + { + "epoch": 151.455205811138, + "grad_norm": 2.1524559912222685e-08, + "learning_rate": 0.03443052341455522, + "loss": 0.0, + "num_input_tokens_seen": 53462560, + "step": 31200 + }, + { + "epoch": 151.455205811138, + "eval_loss": 1.1725577116012573, + "eval_runtime": 4.6152, + "eval_samples_per_second": 79.52, + "eval_steps_per_second": 19.934, + "num_input_tokens_seen": 53462560, + "step": 31200 + }, + { + "epoch": 151.47941888619854, + "grad_norm": 9.219009555749835e-09, + "learning_rate": 0.0343929813897701, + "loss": 0.0, + "num_input_tokens_seen": 53470944, + "step": 31205 + }, + { + "epoch": 151.5036319612591, + "grad_norm": 9.708473136527118e-09, + "learning_rate": 0.034355457193040125, + "loss": 0.0, + "num_input_tokens_seen": 53479104, + "step": 31210 + }, + { + "epoch": 151.5278450363196, + "grad_norm": 1.4907294954014105e-08, + "learning_rate": 0.03431795083015186, + "loss": 0.0, + "num_input_tokens_seen": 53487744, + "step": 31215 + }, + { + "epoch": 151.55205811138015, + "grad_norm": 1.5381424134375266e-08, + "learning_rate": 0.03428046230688936, + "loss": 0.0, + "num_input_tokens_seen": 53496416, + "step": 31220 + }, + { + "epoch": 151.57627118644066, + "grad_norm": 1.241022040687767e-08, + "learning_rate": 0.034242991629033805, + "loss": 0.0, + "num_input_tokens_seen": 53505024, + "step": 31225 + }, + { + "epoch": 151.6004842615012, + "grad_norm": 9.489001584483958e-09, + "learning_rate": 0.03420553880236362, + "loss": 0.0, + "num_input_tokens_seen": 53513632, + "step": 31230 + }, + { + "epoch": 151.62469733656175, + "grad_norm": 1.157972295828813e-08, + "learning_rate": 0.03416810383265449, + "loss": 0.0, + "num_input_tokens_seen": 53522144, + "step": 31235 + }, + { + "epoch": 151.64891041162227, + "grad_norm": 1.3117756658687085e-08, + "learning_rate": 0.03413068672567944, + "loss": 0.0, + "num_input_tokens_seen": 53530784, + "step": 31240 + }, + { + "epoch": 151.6731234866828, + "grad_norm": 1.4260511882469018e-08, + "learning_rate": 0.034093287487208565, + "loss": 0.0, + "num_input_tokens_seen": 53539584, + "step": 31245 + }, + { + "epoch": 151.69733656174333, + "grad_norm": 1.2660600567926394e-08, + "learning_rate": 0.03405590612300937, + "loss": 0.0, + "num_input_tokens_seen": 53548256, + "step": 31250 + }, + { + "epoch": 151.72154963680387, + "grad_norm": 1.7690348741439266e-08, + "learning_rate": 0.03401854263884646, + "loss": 0.0, + "num_input_tokens_seen": 53556992, + "step": 31255 + }, + { + "epoch": 151.74576271186442, + "grad_norm": 8.275696572468405e-09, + "learning_rate": 0.033981197040481824, + "loss": 0.0, + "num_input_tokens_seen": 53565728, + "step": 31260 + }, + { + "epoch": 151.76997578692493, + "grad_norm": 1.4909167234122833e-08, + "learning_rate": 0.03394386933367459, + "loss": 0.0, + "num_input_tokens_seen": 53574464, + "step": 31265 + }, + { + "epoch": 151.79418886198548, + "grad_norm": 1.2822698458592185e-08, + "learning_rate": 0.033906559524181104, + "loss": 0.0, + "num_input_tokens_seen": 53582784, + "step": 31270 + }, + { + "epoch": 151.818401937046, + "grad_norm": 5.963959992527634e-09, + "learning_rate": 0.033869267617755085, + "loss": 0.0, + "num_input_tokens_seen": 53591328, + "step": 31275 + }, + { + "epoch": 151.84261501210653, + "grad_norm": 1.606749044924527e-08, + "learning_rate": 0.0338319936201474, + "loss": 0.0, + "num_input_tokens_seen": 53599872, + "step": 31280 + }, + { + "epoch": 151.86682808716708, + "grad_norm": 8.80049100260294e-09, + "learning_rate": 0.033794737537106136, + "loss": 0.0, + "num_input_tokens_seen": 53608544, + "step": 31285 + }, + { + "epoch": 151.8910411622276, + "grad_norm": 7.846240102082902e-09, + "learning_rate": 0.03375749937437671, + "loss": 0.0, + "num_input_tokens_seen": 53616960, + "step": 31290 + }, + { + "epoch": 151.91525423728814, + "grad_norm": 7.2987562660387084e-09, + "learning_rate": 0.033720279137701634, + "loss": 0.0, + "num_input_tokens_seen": 53625600, + "step": 31295 + }, + { + "epoch": 151.93946731234865, + "grad_norm": 1.2075152433510539e-08, + "learning_rate": 0.03368307683282078, + "loss": 0.0, + "num_input_tokens_seen": 53634144, + "step": 31300 + }, + { + "epoch": 151.9636803874092, + "grad_norm": 7.3704944369978875e-09, + "learning_rate": 0.033645892465471235, + "loss": 0.0, + "num_input_tokens_seen": 53642656, + "step": 31305 + }, + { + "epoch": 151.98789346246974, + "grad_norm": 1.1629349927488875e-08, + "learning_rate": 0.03360872604138724, + "loss": 0.0, + "num_input_tokens_seen": 53651584, + "step": 31310 + }, + { + "epoch": 152.01452784503633, + "grad_norm": 1.632161072961935e-08, + "learning_rate": 0.03357157756630034, + "loss": 0.0, + "num_input_tokens_seen": 53660544, + "step": 31315 + }, + { + "epoch": 152.03874092009684, + "grad_norm": 9.876831796873375e-09, + "learning_rate": 0.033534447045939365, + "loss": 0.0, + "num_input_tokens_seen": 53669120, + "step": 31320 + }, + { + "epoch": 152.0629539951574, + "grad_norm": 1.7579486311092296e-08, + "learning_rate": 0.03349733448603026, + "loss": 0.0, + "num_input_tokens_seen": 53677568, + "step": 31325 + }, + { + "epoch": 152.08716707021793, + "grad_norm": 1.2661370618616274e-08, + "learning_rate": 0.03346023989229619, + "loss": 0.0, + "num_input_tokens_seen": 53686176, + "step": 31330 + }, + { + "epoch": 152.11138014527845, + "grad_norm": 7.645188482285903e-09, + "learning_rate": 0.03342316327045769, + "loss": 0.0, + "num_input_tokens_seen": 53694688, + "step": 31335 + }, + { + "epoch": 152.135593220339, + "grad_norm": 3.4478246924152245e-09, + "learning_rate": 0.033386104626232385, + "loss": 0.0, + "num_input_tokens_seen": 53702848, + "step": 31340 + }, + { + "epoch": 152.1598062953995, + "grad_norm": 1.912936831161005e-08, + "learning_rate": 0.03334906396533525, + "loss": 0.0, + "num_input_tokens_seen": 53711072, + "step": 31345 + }, + { + "epoch": 152.18401937046005, + "grad_norm": 1.211430422642934e-08, + "learning_rate": 0.033312041293478326, + "loss": 0.0, + "num_input_tokens_seen": 53720128, + "step": 31350 + }, + { + "epoch": 152.2082324455206, + "grad_norm": 1.8045069438699102e-08, + "learning_rate": 0.03327503661637103, + "loss": 0.0, + "num_input_tokens_seen": 53729024, + "step": 31355 + }, + { + "epoch": 152.2324455205811, + "grad_norm": 1.24081749319771e-08, + "learning_rate": 0.03323804993971998, + "loss": 0.0, + "num_input_tokens_seen": 53737536, + "step": 31360 + }, + { + "epoch": 152.25665859564165, + "grad_norm": 1.1391369625357584e-08, + "learning_rate": 0.033201081269228924, + "loss": 0.0, + "num_input_tokens_seen": 53746272, + "step": 31365 + }, + { + "epoch": 152.28087167070217, + "grad_norm": 1.5184280499624947e-08, + "learning_rate": 0.03316413061059895, + "loss": 0.0, + "num_input_tokens_seen": 53754368, + "step": 31370 + }, + { + "epoch": 152.3050847457627, + "grad_norm": 1.5282168419616937e-08, + "learning_rate": 0.03312719796952827, + "loss": 0.0, + "num_input_tokens_seen": 53763328, + "step": 31375 + }, + { + "epoch": 152.32929782082326, + "grad_norm": 8.808450857600292e-09, + "learning_rate": 0.03309028335171236, + "loss": 0.0, + "num_input_tokens_seen": 53771744, + "step": 31380 + }, + { + "epoch": 152.35351089588377, + "grad_norm": 1.0459342725255283e-08, + "learning_rate": 0.03305338676284398, + "loss": 0.0, + "num_input_tokens_seen": 53780384, + "step": 31385 + }, + { + "epoch": 152.37772397094432, + "grad_norm": 1.0964596341977995e-08, + "learning_rate": 0.03301650820861296, + "loss": 0.0, + "num_input_tokens_seen": 53788832, + "step": 31390 + }, + { + "epoch": 152.40193704600483, + "grad_norm": 6.2463638705878566e-09, + "learning_rate": 0.03297964769470652, + "loss": 0.0, + "num_input_tokens_seen": 53797376, + "step": 31395 + }, + { + "epoch": 152.42615012106538, + "grad_norm": 8.76711059305535e-09, + "learning_rate": 0.032942805226808945, + "loss": 0.0, + "num_input_tokens_seen": 53806272, + "step": 31400 + }, + { + "epoch": 152.42615012106538, + "eval_loss": 1.187717080116272, + "eval_runtime": 4.6273, + "eval_samples_per_second": 79.312, + "eval_steps_per_second": 19.882, + "num_input_tokens_seen": 53806272, + "step": 31400 + }, + { + "epoch": 152.45036319612592, + "grad_norm": 1.4018751492983483e-08, + "learning_rate": 0.03290598081060187, + "loss": 0.0, + "num_input_tokens_seen": 53814464, + "step": 31405 + }, + { + "epoch": 152.47457627118644, + "grad_norm": 1.3780788954420586e-08, + "learning_rate": 0.03286917445176407, + "loss": 0.0, + "num_input_tokens_seen": 53823008, + "step": 31410 + }, + { + "epoch": 152.49878934624698, + "grad_norm": 6.471775559901971e-09, + "learning_rate": 0.032832386155971456, + "loss": 0.0, + "num_input_tokens_seen": 53831296, + "step": 31415 + }, + { + "epoch": 152.5230024213075, + "grad_norm": 5.370774491808561e-09, + "learning_rate": 0.032795615928897334, + "loss": 0.0, + "num_input_tokens_seen": 53840064, + "step": 31420 + }, + { + "epoch": 152.54721549636804, + "grad_norm": 1.1841454039540622e-08, + "learning_rate": 0.03275886377621215, + "loss": 0.0, + "num_input_tokens_seen": 53848480, + "step": 31425 + }, + { + "epoch": 152.57142857142858, + "grad_norm": 4.6027994748953915e-09, + "learning_rate": 0.03272212970358348, + "loss": 0.0, + "num_input_tokens_seen": 53857184, + "step": 31430 + }, + { + "epoch": 152.5956416464891, + "grad_norm": 6.648845474188647e-09, + "learning_rate": 0.032685413716676215, + "loss": 0.0, + "num_input_tokens_seen": 53865376, + "step": 31435 + }, + { + "epoch": 152.61985472154964, + "grad_norm": 4.720397406288157e-09, + "learning_rate": 0.032648715821152474, + "loss": 0.0, + "num_input_tokens_seen": 53873920, + "step": 31440 + }, + { + "epoch": 152.64406779661016, + "grad_norm": 1.0006003137164043e-08, + "learning_rate": 0.03261203602267143, + "loss": 0.0, + "num_input_tokens_seen": 53882592, + "step": 31445 + }, + { + "epoch": 152.6682808716707, + "grad_norm": 1.0901727520717941e-08, + "learning_rate": 0.03257537432688966, + "loss": 0.0, + "num_input_tokens_seen": 53891008, + "step": 31450 + }, + { + "epoch": 152.69249394673125, + "grad_norm": 9.529599331870031e-09, + "learning_rate": 0.03253873073946077, + "loss": 0.0, + "num_input_tokens_seen": 53899584, + "step": 31455 + }, + { + "epoch": 152.71670702179176, + "grad_norm": 5.892529575390881e-09, + "learning_rate": 0.03250210526603572, + "loss": 0.0, + "num_input_tokens_seen": 53907840, + "step": 31460 + }, + { + "epoch": 152.7409200968523, + "grad_norm": 8.448211907818859e-09, + "learning_rate": 0.03246549791226266, + "loss": 0.0, + "num_input_tokens_seen": 53916288, + "step": 31465 + }, + { + "epoch": 152.76513317191282, + "grad_norm": 7.144642211187602e-09, + "learning_rate": 0.03242890868378679, + "loss": 0.0, + "num_input_tokens_seen": 53924704, + "step": 31470 + }, + { + "epoch": 152.78934624697337, + "grad_norm": 1.0709462650027035e-08, + "learning_rate": 0.03239233758625074, + "loss": 0.0, + "num_input_tokens_seen": 53933376, + "step": 31475 + }, + { + "epoch": 152.8135593220339, + "grad_norm": 1.4138967330268315e-08, + "learning_rate": 0.032355784625294204, + "loss": 0.0, + "num_input_tokens_seen": 53942528, + "step": 31480 + }, + { + "epoch": 152.83777239709443, + "grad_norm": 1.2066427856893824e-08, + "learning_rate": 0.03231924980655402, + "loss": 0.0, + "num_input_tokens_seen": 53950976, + "step": 31485 + }, + { + "epoch": 152.86198547215497, + "grad_norm": 7.222489717406688e-09, + "learning_rate": 0.032282733135664446, + "loss": 0.0, + "num_input_tokens_seen": 53959264, + "step": 31490 + }, + { + "epoch": 152.88619854721549, + "grad_norm": 6.271398067525524e-09, + "learning_rate": 0.03224623461825669, + "loss": 0.0, + "num_input_tokens_seen": 53967840, + "step": 31495 + }, + { + "epoch": 152.91041162227603, + "grad_norm": 3.4168112783561355e-09, + "learning_rate": 0.03220975425995937, + "loss": 0.0, + "num_input_tokens_seen": 53976672, + "step": 31500 + }, + { + "epoch": 152.93462469733657, + "grad_norm": 5.814111858626347e-09, + "learning_rate": 0.032173292066398206, + "loss": 0.0, + "num_input_tokens_seen": 53985536, + "step": 31505 + }, + { + "epoch": 152.9588377723971, + "grad_norm": 1.5802925190655515e-08, + "learning_rate": 0.03213684804319606, + "loss": 0.0, + "num_input_tokens_seen": 53993760, + "step": 31510 + }, + { + "epoch": 152.98305084745763, + "grad_norm": 1.2335750199099493e-08, + "learning_rate": 0.03210042219597312, + "loss": 0.0, + "num_input_tokens_seen": 54002496, + "step": 31515 + }, + { + "epoch": 153.00968523002422, + "grad_norm": 7.434060922406616e-09, + "learning_rate": 0.03206401453034675, + "loss": 0.0, + "num_input_tokens_seen": 54011520, + "step": 31520 + }, + { + "epoch": 153.03389830508473, + "grad_norm": 1.5583221824044813e-08, + "learning_rate": 0.03202762505193136, + "loss": 0.0, + "num_input_tokens_seen": 54020064, + "step": 31525 + }, + { + "epoch": 153.05811138014528, + "grad_norm": 1.206101885031785e-08, + "learning_rate": 0.031991253766338754, + "loss": 0.0, + "num_input_tokens_seen": 54028672, + "step": 31530 + }, + { + "epoch": 153.08232445520582, + "grad_norm": 1.2807324090147176e-08, + "learning_rate": 0.03195490067917778, + "loss": 0.0, + "num_input_tokens_seen": 54037280, + "step": 31535 + }, + { + "epoch": 153.10653753026634, + "grad_norm": 1.6684163384184103e-08, + "learning_rate": 0.03191856579605461, + "loss": 0.0, + "num_input_tokens_seen": 54045728, + "step": 31540 + }, + { + "epoch": 153.13075060532688, + "grad_norm": 4.799519892628723e-09, + "learning_rate": 0.031882249122572454, + "loss": 0.0, + "num_input_tokens_seen": 54054368, + "step": 31545 + }, + { + "epoch": 153.1549636803874, + "grad_norm": 8.056430189640196e-09, + "learning_rate": 0.03184595066433188, + "loss": 0.0, + "num_input_tokens_seen": 54063104, + "step": 31550 + }, + { + "epoch": 153.17917675544794, + "grad_norm": 7.327980000582102e-09, + "learning_rate": 0.03180967042693049, + "loss": 0.0, + "num_input_tokens_seen": 54071776, + "step": 31555 + }, + { + "epoch": 153.20338983050848, + "grad_norm": 2.165054269198663e-08, + "learning_rate": 0.03177340841596323, + "loss": 0.0, + "num_input_tokens_seen": 54080448, + "step": 31560 + }, + { + "epoch": 153.227602905569, + "grad_norm": 1.3607356130762582e-08, + "learning_rate": 0.03173716463702209, + "loss": 0.0, + "num_input_tokens_seen": 54088960, + "step": 31565 + }, + { + "epoch": 153.25181598062954, + "grad_norm": 1.11890878784493e-08, + "learning_rate": 0.03170093909569638, + "loss": 0.0, + "num_input_tokens_seen": 54097536, + "step": 31570 + }, + { + "epoch": 153.27602905569006, + "grad_norm": 1.8989496197718836e-08, + "learning_rate": 0.03166473179757246, + "loss": 0.0, + "num_input_tokens_seen": 54106144, + "step": 31575 + }, + { + "epoch": 153.3002421307506, + "grad_norm": 7.942914770353582e-09, + "learning_rate": 0.031628542748234005, + "loss": 0.0, + "num_input_tokens_seen": 54114656, + "step": 31580 + }, + { + "epoch": 153.32445520581115, + "grad_norm": 8.531061190808487e-09, + "learning_rate": 0.03159237195326184, + "loss": 0.0, + "num_input_tokens_seen": 54123296, + "step": 31585 + }, + { + "epoch": 153.34866828087166, + "grad_norm": 9.712308290943383e-09, + "learning_rate": 0.031556219418233875, + "loss": 0.0, + "num_input_tokens_seen": 54131776, + "step": 31590 + }, + { + "epoch": 153.3728813559322, + "grad_norm": 1.3818316269009756e-08, + "learning_rate": 0.03152008514872533, + "loss": 0.0, + "num_input_tokens_seen": 54140256, + "step": 31595 + }, + { + "epoch": 153.39709443099272, + "grad_norm": 7.537675372759622e-09, + "learning_rate": 0.03148396915030862, + "loss": 0.0, + "num_input_tokens_seen": 54148640, + "step": 31600 + }, + { + "epoch": 153.39709443099272, + "eval_loss": 1.18183171749115, + "eval_runtime": 4.6254, + "eval_samples_per_second": 79.345, + "eval_steps_per_second": 19.89, + "num_input_tokens_seen": 54148640, + "step": 31600 + }, + { + "epoch": 153.42130750605327, + "grad_norm": 7.365692500371779e-09, + "learning_rate": 0.03144787142855318, + "loss": 0.0, + "num_input_tokens_seen": 54157280, + "step": 31605 + }, + { + "epoch": 153.4455205811138, + "grad_norm": 5.9140408126268085e-09, + "learning_rate": 0.031411791989025835, + "loss": 0.0, + "num_input_tokens_seen": 54165632, + "step": 31610 + }, + { + "epoch": 153.46973365617433, + "grad_norm": 2.2676323041537216e-08, + "learning_rate": 0.031375730837290394, + "loss": 0.0, + "num_input_tokens_seen": 54174048, + "step": 31615 + }, + { + "epoch": 153.49394673123487, + "grad_norm": 2.6684597642656627e-08, + "learning_rate": 0.031339687978908015, + "loss": 0.0, + "num_input_tokens_seen": 54183264, + "step": 31620 + }, + { + "epoch": 153.5181598062954, + "grad_norm": 1.0496497893086598e-08, + "learning_rate": 0.03130366341943694, + "loss": 0.0, + "num_input_tokens_seen": 54192032, + "step": 31625 + }, + { + "epoch": 153.54237288135593, + "grad_norm": 9.130539879720345e-09, + "learning_rate": 0.031267657164432555, + "loss": 0.0, + "num_input_tokens_seen": 54200416, + "step": 31630 + }, + { + "epoch": 153.56658595641647, + "grad_norm": 6.127566898328496e-09, + "learning_rate": 0.03123166921944752, + "loss": 0.0, + "num_input_tokens_seen": 54208896, + "step": 31635 + }, + { + "epoch": 153.590799031477, + "grad_norm": 2.3431123707950974e-08, + "learning_rate": 0.031195699590031666, + "loss": 0.0, + "num_input_tokens_seen": 54217248, + "step": 31640 + }, + { + "epoch": 153.61501210653753, + "grad_norm": 1.0293108587688948e-08, + "learning_rate": 0.031159748281731885, + "loss": 0.0, + "num_input_tokens_seen": 54225824, + "step": 31645 + }, + { + "epoch": 153.63922518159805, + "grad_norm": 1.9988144472904423e-08, + "learning_rate": 0.031123815300092394, + "loss": 0.0, + "num_input_tokens_seen": 54234144, + "step": 31650 + }, + { + "epoch": 153.6634382566586, + "grad_norm": 4.665343666943045e-09, + "learning_rate": 0.031087900650654424, + "loss": 0.0, + "num_input_tokens_seen": 54242944, + "step": 31655 + }, + { + "epoch": 153.68765133171914, + "grad_norm": 3.911850399163086e-09, + "learning_rate": 0.031052004338956534, + "loss": 0.0, + "num_input_tokens_seen": 54251232, + "step": 31660 + }, + { + "epoch": 153.71186440677965, + "grad_norm": 5.255064383646868e-09, + "learning_rate": 0.031016126370534407, + "loss": 0.0, + "num_input_tokens_seen": 54260000, + "step": 31665 + }, + { + "epoch": 153.7360774818402, + "grad_norm": 8.970662435103804e-09, + "learning_rate": 0.030980266750920804, + "loss": 0.0, + "num_input_tokens_seen": 54268416, + "step": 31670 + }, + { + "epoch": 153.7602905569007, + "grad_norm": 7.899734200123021e-09, + "learning_rate": 0.030944425485645747, + "loss": 0.0, + "num_input_tokens_seen": 54276576, + "step": 31675 + }, + { + "epoch": 153.78450363196126, + "grad_norm": 1.1293418644697795e-08, + "learning_rate": 0.03090860258023647, + "loss": 0.0, + "num_input_tokens_seen": 54285216, + "step": 31680 + }, + { + "epoch": 153.8087167070218, + "grad_norm": 8.190068179203536e-09, + "learning_rate": 0.030872798040217236, + "loss": 0.0, + "num_input_tokens_seen": 54293504, + "step": 31685 + }, + { + "epoch": 153.83292978208232, + "grad_norm": 9.877949125325358e-09, + "learning_rate": 0.03083701187110964, + "loss": 0.0, + "num_input_tokens_seen": 54302336, + "step": 31690 + }, + { + "epoch": 153.85714285714286, + "grad_norm": 1.2404600902016227e-08, + "learning_rate": 0.030801244078432294, + "loss": 0.0, + "num_input_tokens_seen": 54310816, + "step": 31695 + }, + { + "epoch": 153.88135593220338, + "grad_norm": 1.011358818914232e-08, + "learning_rate": 0.030765494667701024, + "loss": 0.0, + "num_input_tokens_seen": 54319424, + "step": 31700 + }, + { + "epoch": 153.90556900726392, + "grad_norm": 5.762185395496999e-09, + "learning_rate": 0.030729763644428913, + "loss": 0.0, + "num_input_tokens_seen": 54327840, + "step": 31705 + }, + { + "epoch": 153.92978208232446, + "grad_norm": 1.3705064638713793e-08, + "learning_rate": 0.030694051014126048, + "loss": 0.0, + "num_input_tokens_seen": 54336448, + "step": 31710 + }, + { + "epoch": 153.95399515738498, + "grad_norm": 9.444396376068198e-09, + "learning_rate": 0.030658356782299792, + "loss": 0.0, + "num_input_tokens_seen": 54345056, + "step": 31715 + }, + { + "epoch": 153.97820823244552, + "grad_norm": 7.3217072404929695e-09, + "learning_rate": 0.030622680954454726, + "loss": 0.0, + "num_input_tokens_seen": 54353312, + "step": 31720 + }, + { + "epoch": 154.0048426150121, + "grad_norm": 2.934857512570943e-08, + "learning_rate": 0.030587023536092398, + "loss": 0.0, + "num_input_tokens_seen": 54362336, + "step": 31725 + }, + { + "epoch": 154.02905569007265, + "grad_norm": 2.246658148408187e-08, + "learning_rate": 0.03055138453271171, + "loss": 0.0, + "num_input_tokens_seen": 54371104, + "step": 31730 + }, + { + "epoch": 154.05326876513317, + "grad_norm": 1.455343312528612e-08, + "learning_rate": 0.03051576394980858, + "loss": 0.0, + "num_input_tokens_seen": 54379328, + "step": 31735 + }, + { + "epoch": 154.0774818401937, + "grad_norm": 1.2247030944934068e-08, + "learning_rate": 0.030480161792876187, + "loss": 0.0, + "num_input_tokens_seen": 54388160, + "step": 31740 + }, + { + "epoch": 154.10169491525423, + "grad_norm": 9.470828565838474e-09, + "learning_rate": 0.030444578067404846, + "loss": 0.0, + "num_input_tokens_seen": 54396672, + "step": 31745 + }, + { + "epoch": 154.12590799031477, + "grad_norm": 4.446067958241429e-09, + "learning_rate": 0.030409012778881975, + "loss": 0.0, + "num_input_tokens_seen": 54405024, + "step": 31750 + }, + { + "epoch": 154.15012106537532, + "grad_norm": 1.106883473767084e-08, + "learning_rate": 0.030373465932792235, + "loss": 0.0, + "num_input_tokens_seen": 54413664, + "step": 31755 + }, + { + "epoch": 154.17433414043583, + "grad_norm": 9.994569616367244e-09, + "learning_rate": 0.030337937534617342, + "loss": 0.0, + "num_input_tokens_seen": 54422208, + "step": 31760 + }, + { + "epoch": 154.19854721549638, + "grad_norm": 1.561993023813102e-08, + "learning_rate": 0.030302427589836277, + "loss": 0.0, + "num_input_tokens_seen": 54430528, + "step": 31765 + }, + { + "epoch": 154.2227602905569, + "grad_norm": 7.973894433632722e-09, + "learning_rate": 0.030266936103925095, + "loss": 0.0, + "num_input_tokens_seen": 54438816, + "step": 31770 + }, + { + "epoch": 154.24697336561744, + "grad_norm": 1.01584287648393e-08, + "learning_rate": 0.030231463082356982, + "loss": 0.0, + "num_input_tokens_seen": 54447040, + "step": 31775 + }, + { + "epoch": 154.27118644067798, + "grad_norm": 1.159296036945534e-08, + "learning_rate": 0.030196008530602367, + "loss": 0.0, + "num_input_tokens_seen": 54455936, + "step": 31780 + }, + { + "epoch": 154.2953995157385, + "grad_norm": 8.490925296200658e-09, + "learning_rate": 0.030160572454128842, + "loss": 0.0, + "num_input_tokens_seen": 54464800, + "step": 31785 + }, + { + "epoch": 154.31961259079904, + "grad_norm": 6.493118043238155e-09, + "learning_rate": 0.03012515485840098, + "loss": 0.0, + "num_input_tokens_seen": 54473216, + "step": 31790 + }, + { + "epoch": 154.34382566585955, + "grad_norm": 1.6283490111845822e-08, + "learning_rate": 0.030089755748880734, + "loss": 0.0, + "num_input_tokens_seen": 54481184, + "step": 31795 + }, + { + "epoch": 154.3680387409201, + "grad_norm": 1.6867723218183528e-08, + "learning_rate": 0.030054375131027003, + "loss": 0.0, + "num_input_tokens_seen": 54489984, + "step": 31800 + }, + { + "epoch": 154.3680387409201, + "eval_loss": 1.1786577701568604, + "eval_runtime": 4.6253, + "eval_samples_per_second": 79.346, + "eval_steps_per_second": 19.891, + "num_input_tokens_seen": 54489984, + "step": 31800 + }, + { + "epoch": 154.39225181598064, + "grad_norm": 8.156477271370477e-09, + "learning_rate": 0.030019013010295942, + "loss": 0.0, + "num_input_tokens_seen": 54499136, + "step": 31805 + }, + { + "epoch": 154.41646489104116, + "grad_norm": 1.1353849416195771e-08, + "learning_rate": 0.029983669392140897, + "loss": 0.0, + "num_input_tokens_seen": 54507744, + "step": 31810 + }, + { + "epoch": 154.4406779661017, + "grad_norm": 9.932120015321289e-09, + "learning_rate": 0.029948344282012217, + "loss": 0.0, + "num_input_tokens_seen": 54516128, + "step": 31815 + }, + { + "epoch": 154.46489104116222, + "grad_norm": 1.7664193663335936e-08, + "learning_rate": 0.029913037685357507, + "loss": 0.0, + "num_input_tokens_seen": 54524672, + "step": 31820 + }, + { + "epoch": 154.48910411622276, + "grad_norm": 9.779956400279843e-09, + "learning_rate": 0.029877749607621528, + "loss": 0.0, + "num_input_tokens_seen": 54533280, + "step": 31825 + }, + { + "epoch": 154.5133171912833, + "grad_norm": 1.1580111980435959e-08, + "learning_rate": 0.029842480054246077, + "loss": 0.0, + "num_input_tokens_seen": 54541792, + "step": 31830 + }, + { + "epoch": 154.53753026634382, + "grad_norm": 7.634687548829788e-09, + "learning_rate": 0.02980722903067022, + "loss": 0.0, + "num_input_tokens_seen": 54550336, + "step": 31835 + }, + { + "epoch": 154.56174334140437, + "grad_norm": 8.27352053534014e-09, + "learning_rate": 0.029771996542330113, + "loss": 0.0, + "num_input_tokens_seen": 54558720, + "step": 31840 + }, + { + "epoch": 154.58595641646488, + "grad_norm": 5.4705777685626344e-09, + "learning_rate": 0.029736782594658954, + "loss": 0.0, + "num_input_tokens_seen": 54567424, + "step": 31845 + }, + { + "epoch": 154.61016949152543, + "grad_norm": 7.827699377571662e-09, + "learning_rate": 0.029701587193087284, + "loss": 0.0, + "num_input_tokens_seen": 54576256, + "step": 31850 + }, + { + "epoch": 154.63438256658597, + "grad_norm": 1.70895848583541e-08, + "learning_rate": 0.0296664103430426, + "loss": 0.0, + "num_input_tokens_seen": 54584576, + "step": 31855 + }, + { + "epoch": 154.65859564164649, + "grad_norm": 1.1408245903510306e-08, + "learning_rate": 0.029631252049949652, + "loss": 0.0, + "num_input_tokens_seen": 54593056, + "step": 31860 + }, + { + "epoch": 154.68280871670703, + "grad_norm": 8.720113520155337e-09, + "learning_rate": 0.02959611231923031, + "loss": 0.0, + "num_input_tokens_seen": 54601760, + "step": 31865 + }, + { + "epoch": 154.70702179176754, + "grad_norm": 9.293342984051378e-09, + "learning_rate": 0.029560991156303507, + "loss": 0.0, + "num_input_tokens_seen": 54610304, + "step": 31870 + }, + { + "epoch": 154.7312348668281, + "grad_norm": 1.4650928470416602e-08, + "learning_rate": 0.02952588856658544, + "loss": 0.0, + "num_input_tokens_seen": 54618688, + "step": 31875 + }, + { + "epoch": 154.75544794188863, + "grad_norm": 7.601309803817458e-09, + "learning_rate": 0.029490804555489296, + "loss": 0.0, + "num_input_tokens_seen": 54627200, + "step": 31880 + }, + { + "epoch": 154.77966101694915, + "grad_norm": 8.312535548782307e-09, + "learning_rate": 0.029455739128425484, + "loss": 0.0, + "num_input_tokens_seen": 54635808, + "step": 31885 + }, + { + "epoch": 154.8038740920097, + "grad_norm": 1.8101120602409537e-08, + "learning_rate": 0.029420692290801607, + "loss": 0.0, + "num_input_tokens_seen": 54644320, + "step": 31890 + }, + { + "epoch": 154.8280871670702, + "grad_norm": 6.55367404789331e-09, + "learning_rate": 0.02938566404802223, + "loss": 0.0, + "num_input_tokens_seen": 54652736, + "step": 31895 + }, + { + "epoch": 154.85230024213075, + "grad_norm": 6.974179012075865e-09, + "learning_rate": 0.029350654405489195, + "loss": 0.0, + "num_input_tokens_seen": 54660928, + "step": 31900 + }, + { + "epoch": 154.8765133171913, + "grad_norm": 1.0737065458954476e-08, + "learning_rate": 0.02931566336860145, + "loss": 0.0, + "num_input_tokens_seen": 54669536, + "step": 31905 + }, + { + "epoch": 154.9007263922518, + "grad_norm": 1.182546505162918e-08, + "learning_rate": 0.02928069094275505, + "loss": 0.0, + "num_input_tokens_seen": 54678080, + "step": 31910 + }, + { + "epoch": 154.92493946731236, + "grad_norm": 1.6515668832539632e-08, + "learning_rate": 0.02924573713334314, + "loss": 0.0, + "num_input_tokens_seen": 54686848, + "step": 31915 + }, + { + "epoch": 154.94915254237287, + "grad_norm": 1.2594288278933163e-08, + "learning_rate": 0.02921080194575603, + "loss": 0.0, + "num_input_tokens_seen": 54694656, + "step": 31920 + }, + { + "epoch": 154.97336561743342, + "grad_norm": 1.3614565474995288e-08, + "learning_rate": 0.029175885385381177, + "loss": 0.0, + "num_input_tokens_seen": 54703328, + "step": 31925 + }, + { + "epoch": 154.99757869249396, + "grad_norm": 8.553954877754677e-09, + "learning_rate": 0.029140987457603223, + "loss": 0.0, + "num_input_tokens_seen": 54712224, + "step": 31930 + }, + { + "epoch": 155.02421307506054, + "grad_norm": 2.086181005722665e-09, + "learning_rate": 0.029106108167803763, + "loss": 0.0, + "num_input_tokens_seen": 54721056, + "step": 31935 + }, + { + "epoch": 155.04842615012106, + "grad_norm": 1.2995830189765911e-08, + "learning_rate": 0.029071247521361674, + "loss": 0.0, + "num_input_tokens_seen": 54729216, + "step": 31940 + }, + { + "epoch": 155.0726392251816, + "grad_norm": 5.9407030406077865e-09, + "learning_rate": 0.029036405523652945, + "loss": 0.0, + "num_input_tokens_seen": 54737664, + "step": 31945 + }, + { + "epoch": 155.09685230024212, + "grad_norm": 7.537426682802106e-09, + "learning_rate": 0.029001582180050577, + "loss": 0.0, + "num_input_tokens_seen": 54746016, + "step": 31950 + }, + { + "epoch": 155.12106537530266, + "grad_norm": 1.2966070883635439e-08, + "learning_rate": 0.02896677749592482, + "loss": 0.0, + "num_input_tokens_seen": 54754368, + "step": 31955 + }, + { + "epoch": 155.1452784503632, + "grad_norm": 1.1211104933295246e-08, + "learning_rate": 0.028931991476642938, + "loss": 0.0, + "num_input_tokens_seen": 54763328, + "step": 31960 + }, + { + "epoch": 155.16949152542372, + "grad_norm": 1.0809849904092061e-08, + "learning_rate": 0.028897224127569412, + "loss": 0.0, + "num_input_tokens_seen": 54772128, + "step": 31965 + }, + { + "epoch": 155.19370460048427, + "grad_norm": 5.0078448055046465e-09, + "learning_rate": 0.028862475454065832, + "loss": 0.0, + "num_input_tokens_seen": 54780960, + "step": 31970 + }, + { + "epoch": 155.21791767554478, + "grad_norm": 9.624655739060017e-09, + "learning_rate": 0.028827745461490806, + "loss": 0.0, + "num_input_tokens_seen": 54789280, + "step": 31975 + }, + { + "epoch": 155.24213075060533, + "grad_norm": 1.07552988737325e-08, + "learning_rate": 0.028793034155200212, + "loss": 0.0, + "num_input_tokens_seen": 54797728, + "step": 31980 + }, + { + "epoch": 155.26634382566587, + "grad_norm": 6.689027109985091e-09, + "learning_rate": 0.028758341540546944, + "loss": 0.0, + "num_input_tokens_seen": 54806432, + "step": 31985 + }, + { + "epoch": 155.2905569007264, + "grad_norm": 1.469921251384676e-08, + "learning_rate": 0.02872366762288098, + "loss": 0.0, + "num_input_tokens_seen": 54814880, + "step": 31990 + }, + { + "epoch": 155.31476997578693, + "grad_norm": 8.777107041169074e-09, + "learning_rate": 0.028689012407549567, + "loss": 0.0, + "num_input_tokens_seen": 54823616, + "step": 31995 + }, + { + "epoch": 155.33898305084745, + "grad_norm": 4.592398905600703e-09, + "learning_rate": 0.028654375899896892, + "loss": 0.0, + "num_input_tokens_seen": 54832032, + "step": 32000 + }, + { + "epoch": 155.33898305084745, + "eval_loss": 1.1768851280212402, + "eval_runtime": 4.6179, + "eval_samples_per_second": 79.473, + "eval_steps_per_second": 19.922, + "num_input_tokens_seen": 54832032, + "step": 32000 + }, + { + "epoch": 155.363196125908, + "grad_norm": 1.0518753867927444e-08, + "learning_rate": 0.02861975810526437, + "loss": 0.0, + "num_input_tokens_seen": 54840864, + "step": 32005 + }, + { + "epoch": 155.38740920096853, + "grad_norm": 1.0825400131864171e-08, + "learning_rate": 0.02858515902899056, + "loss": 0.0, + "num_input_tokens_seen": 54849280, + "step": 32010 + }, + { + "epoch": 155.41162227602905, + "grad_norm": 1.467729493498382e-08, + "learning_rate": 0.028550578676410976, + "loss": 0.0, + "num_input_tokens_seen": 54857760, + "step": 32015 + }, + { + "epoch": 155.4358353510896, + "grad_norm": 1.1228928897821788e-08, + "learning_rate": 0.02851601705285837, + "loss": 0.0, + "num_input_tokens_seen": 54866112, + "step": 32020 + }, + { + "epoch": 155.4600484261501, + "grad_norm": 1.3276571841913665e-08, + "learning_rate": 0.028481474163662666, + "loss": 0.0, + "num_input_tokens_seen": 54875040, + "step": 32025 + }, + { + "epoch": 155.48426150121065, + "grad_norm": 7.520247535808267e-09, + "learning_rate": 0.028446950014150683, + "loss": 0.0, + "num_input_tokens_seen": 54883904, + "step": 32030 + }, + { + "epoch": 155.5084745762712, + "grad_norm": 9.789447474872759e-09, + "learning_rate": 0.028412444609646596, + "loss": 0.0, + "num_input_tokens_seen": 54892608, + "step": 32035 + }, + { + "epoch": 155.5326876513317, + "grad_norm": 6.448670486491892e-09, + "learning_rate": 0.028377957955471465, + "loss": 0.0, + "num_input_tokens_seen": 54901184, + "step": 32040 + }, + { + "epoch": 155.55690072639226, + "grad_norm": 4.968512712366646e-09, + "learning_rate": 0.0283434900569436, + "loss": 0.0, + "num_input_tokens_seen": 54909888, + "step": 32045 + }, + { + "epoch": 155.58111380145277, + "grad_norm": 1.64214579712052e-08, + "learning_rate": 0.028309040919378456, + "loss": 0.0, + "num_input_tokens_seen": 54918144, + "step": 32050 + }, + { + "epoch": 155.60532687651332, + "grad_norm": 1.8201383511495806e-08, + "learning_rate": 0.02827461054808848, + "loss": 0.0, + "num_input_tokens_seen": 54926624, + "step": 32055 + }, + { + "epoch": 155.62953995157386, + "grad_norm": 8.409326568425968e-09, + "learning_rate": 0.028240198948383186, + "loss": 0.0, + "num_input_tokens_seen": 54935168, + "step": 32060 + }, + { + "epoch": 155.65375302663438, + "grad_norm": 7.29851512559776e-09, + "learning_rate": 0.028205806125569402, + "loss": 0.0, + "num_input_tokens_seen": 54943776, + "step": 32065 + }, + { + "epoch": 155.67796610169492, + "grad_norm": 1.2340454880188645e-08, + "learning_rate": 0.028171432084950834, + "loss": 0.0, + "num_input_tokens_seen": 54952320, + "step": 32070 + }, + { + "epoch": 155.70217917675544, + "grad_norm": 6.656783124725507e-09, + "learning_rate": 0.028137076831828478, + "loss": 0.0, + "num_input_tokens_seen": 54961024, + "step": 32075 + }, + { + "epoch": 155.72639225181598, + "grad_norm": 1.3534084963851e-08, + "learning_rate": 0.028102740371500238, + "loss": 0.0, + "num_input_tokens_seen": 54969568, + "step": 32080 + }, + { + "epoch": 155.75060532687652, + "grad_norm": 6.178021649816401e-09, + "learning_rate": 0.0280684227092613, + "loss": 0.0, + "num_input_tokens_seen": 54978176, + "step": 32085 + }, + { + "epoch": 155.77481840193704, + "grad_norm": 8.100511372788333e-09, + "learning_rate": 0.02803412385040392, + "loss": 0.0, + "num_input_tokens_seen": 54986400, + "step": 32090 + }, + { + "epoch": 155.79903147699758, + "grad_norm": 1.909912938913294e-08, + "learning_rate": 0.027999843800217306, + "loss": 0.0, + "num_input_tokens_seen": 54994784, + "step": 32095 + }, + { + "epoch": 155.8232445520581, + "grad_norm": 6.434229149476778e-09, + "learning_rate": 0.027965582563987932, + "loss": 0.0, + "num_input_tokens_seen": 55003168, + "step": 32100 + }, + { + "epoch": 155.84745762711864, + "grad_norm": 1.0379365811274965e-08, + "learning_rate": 0.027931340146999346, + "loss": 0.0, + "num_input_tokens_seen": 55011776, + "step": 32105 + }, + { + "epoch": 155.8716707021792, + "grad_norm": 1.4313064511384255e-08, + "learning_rate": 0.02789711655453208, + "loss": 0.0, + "num_input_tokens_seen": 55020224, + "step": 32110 + }, + { + "epoch": 155.8958837772397, + "grad_norm": 9.10700759249039e-09, + "learning_rate": 0.02786291179186392, + "loss": 0.0, + "num_input_tokens_seen": 55028992, + "step": 32115 + }, + { + "epoch": 155.92009685230025, + "grad_norm": 6.822757470104079e-09, + "learning_rate": 0.02782872586426961, + "loss": 0.0, + "num_input_tokens_seen": 55037760, + "step": 32120 + }, + { + "epoch": 155.94430992736076, + "grad_norm": 1.4220272070986084e-08, + "learning_rate": 0.027794558777021083, + "loss": 0.0, + "num_input_tokens_seen": 55046464, + "step": 32125 + }, + { + "epoch": 155.9685230024213, + "grad_norm": 1.8315095218213173e-08, + "learning_rate": 0.02776041053538734, + "loss": 0.0, + "num_input_tokens_seen": 55054560, + "step": 32130 + }, + { + "epoch": 155.99273607748185, + "grad_norm": 8.306887622211434e-09, + "learning_rate": 0.027726281144634407, + "loss": 0.0, + "num_input_tokens_seen": 55063104, + "step": 32135 + }, + { + "epoch": 156.01937046004844, + "grad_norm": 1.2320354514372411e-08, + "learning_rate": 0.02769217061002552, + "loss": 0.0, + "num_input_tokens_seen": 55072032, + "step": 32140 + }, + { + "epoch": 156.04358353510895, + "grad_norm": 8.564375875153019e-09, + "learning_rate": 0.027658078936820967, + "loss": 0.0, + "num_input_tokens_seen": 55080864, + "step": 32145 + }, + { + "epoch": 156.0677966101695, + "grad_norm": 6.673186891958949e-09, + "learning_rate": 0.02762400613027805, + "loss": 0.0, + "num_input_tokens_seen": 55089312, + "step": 32150 + }, + { + "epoch": 156.09200968523, + "grad_norm": 1.4143700433066897e-08, + "learning_rate": 0.027589952195651295, + "loss": 0.0, + "num_input_tokens_seen": 55097792, + "step": 32155 + }, + { + "epoch": 156.11622276029055, + "grad_norm": 1.814918348941319e-08, + "learning_rate": 0.027555917138192186, + "loss": 0.0, + "num_input_tokens_seen": 55106208, + "step": 32160 + }, + { + "epoch": 156.1404358353511, + "grad_norm": 7.15406534013141e-09, + "learning_rate": 0.027521900963149375, + "loss": 0.0, + "num_input_tokens_seen": 55114912, + "step": 32165 + }, + { + "epoch": 156.16464891041161, + "grad_norm": 1.900813906274834e-08, + "learning_rate": 0.027487903675768633, + "loss": 0.0, + "num_input_tokens_seen": 55123072, + "step": 32170 + }, + { + "epoch": 156.18886198547216, + "grad_norm": 1.7880466884889756e-08, + "learning_rate": 0.027453925281292677, + "loss": 0.0, + "num_input_tokens_seen": 55131520, + "step": 32175 + }, + { + "epoch": 156.21307506053267, + "grad_norm": 1.0186292698222132e-08, + "learning_rate": 0.027419965784961475, + "loss": 0.0, + "num_input_tokens_seen": 55139712, + "step": 32180 + }, + { + "epoch": 156.23728813559322, + "grad_norm": 5.389068302719124e-09, + "learning_rate": 0.027386025192012015, + "loss": 0.0, + "num_input_tokens_seen": 55148384, + "step": 32185 + }, + { + "epoch": 156.26150121065376, + "grad_norm": 1.154696338545591e-08, + "learning_rate": 0.027352103507678277, + "loss": 0.0, + "num_input_tokens_seen": 55156800, + "step": 32190 + }, + { + "epoch": 156.28571428571428, + "grad_norm": 1.16016103390848e-08, + "learning_rate": 0.027318200737191527, + "loss": 0.0, + "num_input_tokens_seen": 55165088, + "step": 32195 + }, + { + "epoch": 156.30992736077482, + "grad_norm": 1.5193210245456612e-08, + "learning_rate": 0.027284316885779935, + "loss": 0.0, + "num_input_tokens_seen": 55173664, + "step": 32200 + }, + { + "epoch": 156.30992736077482, + "eval_loss": 1.183863878250122, + "eval_runtime": 4.6239, + "eval_samples_per_second": 79.369, + "eval_steps_per_second": 19.896, + "num_input_tokens_seen": 55173664, + "step": 32200 + }, + { + "epoch": 156.33414043583534, + "grad_norm": 1.1194407179004884e-08, + "learning_rate": 0.027250451958668785, + "loss": 0.0, + "num_input_tokens_seen": 55181792, + "step": 32205 + }, + { + "epoch": 156.35835351089588, + "grad_norm": 6.886433201458431e-09, + "learning_rate": 0.027216605961080536, + "loss": 0.0, + "num_input_tokens_seen": 55190432, + "step": 32210 + }, + { + "epoch": 156.38256658595643, + "grad_norm": 8.442571974853763e-09, + "learning_rate": 0.02718277889823461, + "loss": 0.0, + "num_input_tokens_seen": 55199168, + "step": 32215 + }, + { + "epoch": 156.40677966101694, + "grad_norm": 5.465205177301868e-09, + "learning_rate": 0.027148970775347604, + "loss": 0.0, + "num_input_tokens_seen": 55208288, + "step": 32220 + }, + { + "epoch": 156.43099273607749, + "grad_norm": 9.888319496553777e-09, + "learning_rate": 0.027115181597633174, + "loss": 0.0, + "num_input_tokens_seen": 55216992, + "step": 32225 + }, + { + "epoch": 156.455205811138, + "grad_norm": 1.3135623255777773e-08, + "learning_rate": 0.027081411370301976, + "loss": 0.0, + "num_input_tokens_seen": 55225920, + "step": 32230 + }, + { + "epoch": 156.47941888619854, + "grad_norm": 3.481665178384219e-09, + "learning_rate": 0.027047660098561875, + "loss": 0.0, + "num_input_tokens_seen": 55234208, + "step": 32235 + }, + { + "epoch": 156.5036319612591, + "grad_norm": 4.264689490440787e-09, + "learning_rate": 0.02701392778761766, + "loss": 0.0, + "num_input_tokens_seen": 55242880, + "step": 32240 + }, + { + "epoch": 156.5278450363196, + "grad_norm": 1.3117075425839175e-08, + "learning_rate": 0.02698021444267133, + "loss": 0.0, + "num_input_tokens_seen": 55251296, + "step": 32245 + }, + { + "epoch": 156.55205811138015, + "grad_norm": 8.122410299904459e-09, + "learning_rate": 0.026946520068921915, + "loss": 0.0, + "num_input_tokens_seen": 55260032, + "step": 32250 + }, + { + "epoch": 156.57627118644066, + "grad_norm": 1.5619969317981486e-08, + "learning_rate": 0.02691284467156547, + "loss": 0.0, + "num_input_tokens_seen": 55268320, + "step": 32255 + }, + { + "epoch": 156.6004842615012, + "grad_norm": 8.723586297776365e-09, + "learning_rate": 0.026879188255795182, + "loss": 0.0, + "num_input_tokens_seen": 55276448, + "step": 32260 + }, + { + "epoch": 156.62469733656175, + "grad_norm": 6.281114739437044e-09, + "learning_rate": 0.026845550826801328, + "loss": 0.0, + "num_input_tokens_seen": 55284928, + "step": 32265 + }, + { + "epoch": 156.64891041162227, + "grad_norm": 8.838908271968648e-09, + "learning_rate": 0.02681193238977121, + "loss": 0.0, + "num_input_tokens_seen": 55293760, + "step": 32270 + }, + { + "epoch": 156.6731234866828, + "grad_norm": 1.0399202388100548e-08, + "learning_rate": 0.026778332949889145, + "loss": 0.0, + "num_input_tokens_seen": 55302400, + "step": 32275 + }, + { + "epoch": 156.69733656174333, + "grad_norm": 8.052024824678483e-09, + "learning_rate": 0.026744752512336673, + "loss": 0.0, + "num_input_tokens_seen": 55310752, + "step": 32280 + }, + { + "epoch": 156.72154963680387, + "grad_norm": 1.9285000263380425e-08, + "learning_rate": 0.02671119108229225, + "loss": 0.0, + "num_input_tokens_seen": 55319520, + "step": 32285 + }, + { + "epoch": 156.74576271186442, + "grad_norm": 6.039496902587871e-09, + "learning_rate": 0.026677648664931556, + "loss": 0.0, + "num_input_tokens_seen": 55328352, + "step": 32290 + }, + { + "epoch": 156.76997578692493, + "grad_norm": 4.6842085765774755e-09, + "learning_rate": 0.026644125265427154, + "loss": 0.0, + "num_input_tokens_seen": 55337216, + "step": 32295 + }, + { + "epoch": 156.79418886198548, + "grad_norm": 6.5364962331670995e-09, + "learning_rate": 0.026610620888948822, + "loss": 0.0, + "num_input_tokens_seen": 55345728, + "step": 32300 + }, + { + "epoch": 156.818401937046, + "grad_norm": 6.181364753388152e-09, + "learning_rate": 0.026577135540663408, + "loss": 0.0, + "num_input_tokens_seen": 55354144, + "step": 32305 + }, + { + "epoch": 156.84261501210653, + "grad_norm": 1.3810725896234999e-08, + "learning_rate": 0.026543669225734673, + "loss": 0.0, + "num_input_tokens_seen": 55362976, + "step": 32310 + }, + { + "epoch": 156.86682808716708, + "grad_norm": 5.910887779236873e-09, + "learning_rate": 0.02651022194932363, + "loss": 0.0, + "num_input_tokens_seen": 55371200, + "step": 32315 + }, + { + "epoch": 156.8910411622276, + "grad_norm": 7.384870492899154e-09, + "learning_rate": 0.026476793716588194, + "loss": 0.0, + "num_input_tokens_seen": 55379936, + "step": 32320 + }, + { + "epoch": 156.91525423728814, + "grad_norm": 1.6326465512861432e-08, + "learning_rate": 0.026443384532683467, + "loss": 0.0, + "num_input_tokens_seen": 55388416, + "step": 32325 + }, + { + "epoch": 156.93946731234865, + "grad_norm": 1.3727689207598814e-08, + "learning_rate": 0.026409994402761584, + "loss": 0.0, + "num_input_tokens_seen": 55397184, + "step": 32330 + }, + { + "epoch": 156.9636803874092, + "grad_norm": 2.3442201069201474e-08, + "learning_rate": 0.026376623331971653, + "loss": 0.0, + "num_input_tokens_seen": 55405728, + "step": 32335 + }, + { + "epoch": 156.98789346246974, + "grad_norm": 5.435043082258062e-09, + "learning_rate": 0.026343271325459997, + "loss": 0.0, + "num_input_tokens_seen": 55414240, + "step": 32340 + }, + { + "epoch": 157.01452784503633, + "grad_norm": 8.434186682393374e-09, + "learning_rate": 0.02630993838836987, + "loss": 0.0, + "num_input_tokens_seen": 55422944, + "step": 32345 + }, + { + "epoch": 157.03874092009684, + "grad_norm": 7.943879332117376e-09, + "learning_rate": 0.026276624525841584, + "loss": 0.0, + "num_input_tokens_seen": 55431744, + "step": 32350 + }, + { + "epoch": 157.0629539951574, + "grad_norm": 5.194138896769118e-09, + "learning_rate": 0.026243329743012637, + "loss": 0.0, + "num_input_tokens_seen": 55440448, + "step": 32355 + }, + { + "epoch": 157.08716707021793, + "grad_norm": 8.281127783504871e-09, + "learning_rate": 0.026210054045017438, + "loss": 0.0, + "num_input_tokens_seen": 55448864, + "step": 32360 + }, + { + "epoch": 157.11138014527845, + "grad_norm": 7.255966938402025e-09, + "learning_rate": 0.02617679743698755, + "loss": 0.0, + "num_input_tokens_seen": 55457536, + "step": 32365 + }, + { + "epoch": 157.135593220339, + "grad_norm": 7.071443874906436e-09, + "learning_rate": 0.02614355992405158, + "loss": 0.0, + "num_input_tokens_seen": 55466208, + "step": 32370 + }, + { + "epoch": 157.1598062953995, + "grad_norm": 6.006628527899238e-09, + "learning_rate": 0.026110341511335115, + "loss": 0.0, + "num_input_tokens_seen": 55474336, + "step": 32375 + }, + { + "epoch": 157.18401937046005, + "grad_norm": 4.71763383913526e-09, + "learning_rate": 0.02607714220396093, + "loss": 0.0, + "num_input_tokens_seen": 55482816, + "step": 32380 + }, + { + "epoch": 157.2082324455206, + "grad_norm": 1.1587030890325423e-08, + "learning_rate": 0.02604396200704869, + "loss": 0.0, + "num_input_tokens_seen": 55491808, + "step": 32385 + }, + { + "epoch": 157.2324455205811, + "grad_norm": 7.704665350161122e-09, + "learning_rate": 0.02601080092571523, + "loss": 0.0, + "num_input_tokens_seen": 55500288, + "step": 32390 + }, + { + "epoch": 157.25665859564165, + "grad_norm": 7.257400902460631e-09, + "learning_rate": 0.025977658965074455, + "loss": 0.0, + "num_input_tokens_seen": 55508704, + "step": 32395 + }, + { + "epoch": 157.28087167070217, + "grad_norm": 8.486463975998504e-09, + "learning_rate": 0.02594453613023719, + "loss": 0.0, + "num_input_tokens_seen": 55517376, + "step": 32400 + }, + { + "epoch": 157.28087167070217, + "eval_loss": 1.1903668642044067, + "eval_runtime": 4.6426, + "eval_samples_per_second": 79.051, + "eval_steps_per_second": 19.817, + "num_input_tokens_seen": 55517376, + "step": 32400 + }, + { + "epoch": 157.3050847457627, + "grad_norm": 1.0071166123282183e-08, + "learning_rate": 0.025911432426311443, + "loss": 0.0, + "num_input_tokens_seen": 55525824, + "step": 32405 + }, + { + "epoch": 157.32929782082326, + "grad_norm": 3.4790865743872246e-09, + "learning_rate": 0.025878347858402234, + "loss": 0.0, + "num_input_tokens_seen": 55534368, + "step": 32410 + }, + { + "epoch": 157.35351089588377, + "grad_norm": 6.054377887920737e-09, + "learning_rate": 0.025845282431611598, + "loss": 0.0, + "num_input_tokens_seen": 55543136, + "step": 32415 + }, + { + "epoch": 157.37772397094432, + "grad_norm": 1.1636484664734326e-08, + "learning_rate": 0.025812236151038608, + "loss": 0.0, + "num_input_tokens_seen": 55551744, + "step": 32420 + }, + { + "epoch": 157.40193704600483, + "grad_norm": 8.459039690933423e-09, + "learning_rate": 0.025779209021779468, + "loss": 0.0, + "num_input_tokens_seen": 55560288, + "step": 32425 + }, + { + "epoch": 157.42615012106538, + "grad_norm": 1.3264467746409991e-08, + "learning_rate": 0.025746201048927324, + "loss": 0.0, + "num_input_tokens_seen": 55568832, + "step": 32430 + }, + { + "epoch": 157.45036319612592, + "grad_norm": 9.713163606761555e-09, + "learning_rate": 0.025713212237572485, + "loss": 0.0, + "num_input_tokens_seen": 55577184, + "step": 32435 + }, + { + "epoch": 157.47457627118644, + "grad_norm": 8.058738565352996e-09, + "learning_rate": 0.025680242592802164, + "loss": 0.0, + "num_input_tokens_seen": 55585632, + "step": 32440 + }, + { + "epoch": 157.49878934624698, + "grad_norm": 7.460844386741883e-09, + "learning_rate": 0.02564729211970073, + "loss": 0.0, + "num_input_tokens_seen": 55594048, + "step": 32445 + }, + { + "epoch": 157.5230024213075, + "grad_norm": 3.6614020704206496e-09, + "learning_rate": 0.025614360823349617, + "loss": 0.0, + "num_input_tokens_seen": 55602688, + "step": 32450 + }, + { + "epoch": 157.54721549636804, + "grad_norm": 1.0908794756403495e-08, + "learning_rate": 0.025581448708827146, + "loss": 0.0, + "num_input_tokens_seen": 55611680, + "step": 32455 + }, + { + "epoch": 157.57142857142858, + "grad_norm": 2.0574011827534378e-08, + "learning_rate": 0.025548555781208876, + "loss": 0.0, + "num_input_tokens_seen": 55620416, + "step": 32460 + }, + { + "epoch": 157.5956416464891, + "grad_norm": 7.831751247522334e-09, + "learning_rate": 0.02551568204556721, + "loss": 0.0, + "num_input_tokens_seen": 55628992, + "step": 32465 + }, + { + "epoch": 157.61985472154964, + "grad_norm": 4.3055381482304256e-09, + "learning_rate": 0.02548282750697173, + "loss": 0.0, + "num_input_tokens_seen": 55637664, + "step": 32470 + }, + { + "epoch": 157.64406779661016, + "grad_norm": 2.7717875994426322e-08, + "learning_rate": 0.02544999217048909, + "loss": 0.0, + "num_input_tokens_seen": 55646336, + "step": 32475 + }, + { + "epoch": 157.6682808716707, + "grad_norm": 1.598651522272121e-08, + "learning_rate": 0.025417176041182793, + "loss": 0.0, + "num_input_tokens_seen": 55654688, + "step": 32480 + }, + { + "epoch": 157.69249394673125, + "grad_norm": 8.337933898872052e-09, + "learning_rate": 0.025384379124113596, + "loss": 0.0, + "num_input_tokens_seen": 55663104, + "step": 32485 + }, + { + "epoch": 157.71670702179176, + "grad_norm": 8.043766541732111e-09, + "learning_rate": 0.025351601424339124, + "loss": 0.0, + "num_input_tokens_seen": 55671712, + "step": 32490 + }, + { + "epoch": 157.7409200968523, + "grad_norm": 1.4129309278132496e-08, + "learning_rate": 0.025318842946914184, + "loss": 0.0, + "num_input_tokens_seen": 55680448, + "step": 32495 + }, + { + "epoch": 157.76513317191282, + "grad_norm": 1.442512598259782e-08, + "learning_rate": 0.025286103696890494, + "loss": 0.0, + "num_input_tokens_seen": 55688992, + "step": 32500 + }, + { + "epoch": 157.78934624697337, + "grad_norm": 1.8276018920460047e-08, + "learning_rate": 0.025253383679316836, + "loss": 0.0, + "num_input_tokens_seen": 55697824, + "step": 32505 + }, + { + "epoch": 157.8135593220339, + "grad_norm": 8.783402449807909e-09, + "learning_rate": 0.025220682899239077, + "loss": 0.0, + "num_input_tokens_seen": 55706144, + "step": 32510 + }, + { + "epoch": 157.83777239709443, + "grad_norm": 9.17661147070703e-09, + "learning_rate": 0.02518800136170013, + "loss": 0.0, + "num_input_tokens_seen": 55714272, + "step": 32515 + }, + { + "epoch": 157.86198547215497, + "grad_norm": 1.642341196372854e-08, + "learning_rate": 0.02515533907173981, + "loss": 0.0, + "num_input_tokens_seen": 55723168, + "step": 32520 + }, + { + "epoch": 157.88619854721549, + "grad_norm": 9.577558301998579e-09, + "learning_rate": 0.025122696034395115, + "loss": 0.0, + "num_input_tokens_seen": 55731712, + "step": 32525 + }, + { + "epoch": 157.91041162227603, + "grad_norm": 8.526408024067678e-09, + "learning_rate": 0.025090072254700023, + "loss": 0.0, + "num_input_tokens_seen": 55740256, + "step": 32530 + }, + { + "epoch": 157.93462469733657, + "grad_norm": 1.0401577377194826e-08, + "learning_rate": 0.025057467737685468, + "loss": 0.0, + "num_input_tokens_seen": 55748544, + "step": 32535 + }, + { + "epoch": 157.9588377723971, + "grad_norm": 1.1097968766193844e-08, + "learning_rate": 0.025024882488379557, + "loss": 0.0, + "num_input_tokens_seen": 55757088, + "step": 32540 + }, + { + "epoch": 157.98305084745763, + "grad_norm": 1.738745503132577e-08, + "learning_rate": 0.02499231651180727, + "loss": 0.0, + "num_input_tokens_seen": 55766048, + "step": 32545 + }, + { + "epoch": 158.00968523002422, + "grad_norm": 1.3235199602945613e-08, + "learning_rate": 0.024959769812990713, + "loss": 0.0, + "num_input_tokens_seen": 55775104, + "step": 32550 + }, + { + "epoch": 158.03389830508473, + "grad_norm": 6.525965989823135e-09, + "learning_rate": 0.024927242396949045, + "loss": 0.0, + "num_input_tokens_seen": 55783808, + "step": 32555 + }, + { + "epoch": 158.05811138014528, + "grad_norm": 8.644887472541996e-09, + "learning_rate": 0.02489473426869836, + "loss": 0.0, + "num_input_tokens_seen": 55792640, + "step": 32560 + }, + { + "epoch": 158.08232445520582, + "grad_norm": 4.469450587407664e-09, + "learning_rate": 0.024862245433251776, + "loss": 0.0, + "num_input_tokens_seen": 55801056, + "step": 32565 + }, + { + "epoch": 158.10653753026634, + "grad_norm": 1.9119527294719774e-08, + "learning_rate": 0.024829775895619577, + "loss": 0.0, + "num_input_tokens_seen": 55809728, + "step": 32570 + }, + { + "epoch": 158.13075060532688, + "grad_norm": 1.1613599859572332e-08, + "learning_rate": 0.024797325660808882, + "loss": 0.0, + "num_input_tokens_seen": 55818368, + "step": 32575 + }, + { + "epoch": 158.1549636803874, + "grad_norm": 7.29792226650261e-09, + "learning_rate": 0.02476489473382401, + "loss": 0.0, + "num_input_tokens_seen": 55826848, + "step": 32580 + }, + { + "epoch": 158.17917675544794, + "grad_norm": 7.181347960738549e-09, + "learning_rate": 0.024732483119666127, + "loss": 0.0, + "num_input_tokens_seen": 55835264, + "step": 32585 + }, + { + "epoch": 158.20338983050848, + "grad_norm": 1.1390257625976119e-08, + "learning_rate": 0.024700090823333548, + "loss": 0.0, + "num_input_tokens_seen": 55843776, + "step": 32590 + }, + { + "epoch": 158.227602905569, + "grad_norm": 9.029482939126865e-09, + "learning_rate": 0.02466771784982163, + "loss": 0.0, + "num_input_tokens_seen": 55852480, + "step": 32595 + }, + { + "epoch": 158.25181598062954, + "grad_norm": 1.3490570438534633e-08, + "learning_rate": 0.024635364204122594, + "loss": 0.0, + "num_input_tokens_seen": 55861088, + "step": 32600 + }, + { + "epoch": 158.25181598062954, + "eval_loss": 1.1880522966384888, + "eval_runtime": 4.6179, + "eval_samples_per_second": 79.473, + "eval_steps_per_second": 19.922, + "num_input_tokens_seen": 55861088, + "step": 32600 + }, + { + "epoch": 158.27602905569006, + "grad_norm": 9.319410132491157e-09, + "learning_rate": 0.024603029891225852, + "loss": 0.0, + "num_input_tokens_seen": 55869472, + "step": 32605 + }, + { + "epoch": 158.3002421307506, + "grad_norm": 1.574613328614305e-08, + "learning_rate": 0.024570714916117748, + "loss": 0.0, + "num_input_tokens_seen": 55877856, + "step": 32610 + }, + { + "epoch": 158.32445520581115, + "grad_norm": 1.0424035856715363e-08, + "learning_rate": 0.024538419283781625, + "loss": 0.0, + "num_input_tokens_seen": 55886432, + "step": 32615 + }, + { + "epoch": 158.34866828087166, + "grad_norm": 3.1844937797842476e-09, + "learning_rate": 0.024506142999197938, + "loss": 0.0, + "num_input_tokens_seen": 55894784, + "step": 32620 + }, + { + "epoch": 158.3728813559322, + "grad_norm": 6.522198781055977e-09, + "learning_rate": 0.024473886067344002, + "loss": 0.0, + "num_input_tokens_seen": 55903456, + "step": 32625 + }, + { + "epoch": 158.39709443099272, + "grad_norm": 9.131752243263236e-09, + "learning_rate": 0.02444164849319434, + "loss": 0.0, + "num_input_tokens_seen": 55912064, + "step": 32630 + }, + { + "epoch": 158.42130750605327, + "grad_norm": 8.912818927342414e-09, + "learning_rate": 0.024409430281720306, + "loss": 0.0, + "num_input_tokens_seen": 55920608, + "step": 32635 + }, + { + "epoch": 158.4455205811138, + "grad_norm": 1.7755594328150437e-08, + "learning_rate": 0.024377231437890428, + "loss": 0.0, + "num_input_tokens_seen": 55929312, + "step": 32640 + }, + { + "epoch": 158.46973365617433, + "grad_norm": 1.3428738121490369e-08, + "learning_rate": 0.024345051966670115, + "loss": 0.0, + "num_input_tokens_seen": 55937120, + "step": 32645 + }, + { + "epoch": 158.49394673123487, + "grad_norm": 1.7777873395630195e-08, + "learning_rate": 0.024312891873021884, + "loss": 0.0, + "num_input_tokens_seen": 55946080, + "step": 32650 + }, + { + "epoch": 158.5181598062954, + "grad_norm": 1.002745531053506e-08, + "learning_rate": 0.024280751161905183, + "loss": 0.0, + "num_input_tokens_seen": 55954752, + "step": 32655 + }, + { + "epoch": 158.54237288135593, + "grad_norm": 4.694104660529774e-09, + "learning_rate": 0.02424862983827658, + "loss": 0.0, + "num_input_tokens_seen": 55963392, + "step": 32660 + }, + { + "epoch": 158.56658595641647, + "grad_norm": 5.34254995798733e-09, + "learning_rate": 0.024216527907089495, + "loss": 0.0, + "num_input_tokens_seen": 55972320, + "step": 32665 + }, + { + "epoch": 158.590799031477, + "grad_norm": 7.169504101511848e-09, + "learning_rate": 0.024184445373294505, + "loss": 0.0, + "num_input_tokens_seen": 55980576, + "step": 32670 + }, + { + "epoch": 158.61501210653753, + "grad_norm": 2.3032686868873498e-08, + "learning_rate": 0.02415238224183918, + "loss": 0.0, + "num_input_tokens_seen": 55988864, + "step": 32675 + }, + { + "epoch": 158.63922518159805, + "grad_norm": 4.668963438092533e-09, + "learning_rate": 0.024120338517667973, + "loss": 0.0, + "num_input_tokens_seen": 55997664, + "step": 32680 + }, + { + "epoch": 158.6634382566586, + "grad_norm": 8.22222290253194e-09, + "learning_rate": 0.02408831420572247, + "loss": 0.0, + "num_input_tokens_seen": 56005792, + "step": 32685 + }, + { + "epoch": 158.68765133171914, + "grad_norm": 9.44320799334264e-09, + "learning_rate": 0.024056309310941264, + "loss": 0.0, + "num_input_tokens_seen": 56013952, + "step": 32690 + }, + { + "epoch": 158.71186440677965, + "grad_norm": 1.2616598432657611e-08, + "learning_rate": 0.02402432383825982, + "loss": 0.0, + "num_input_tokens_seen": 56022912, + "step": 32695 + }, + { + "epoch": 158.7360774818402, + "grad_norm": 1.0754222401487823e-08, + "learning_rate": 0.023992357792610792, + "loss": 0.0, + "num_input_tokens_seen": 56031520, + "step": 32700 + }, + { + "epoch": 158.7602905569007, + "grad_norm": 7.5662356380235e-09, + "learning_rate": 0.0239604111789237, + "loss": 0.0, + "num_input_tokens_seen": 56039808, + "step": 32705 + }, + { + "epoch": 158.78450363196126, + "grad_norm": 1.133472160574911e-08, + "learning_rate": 0.023928484002125095, + "loss": 0.0, + "num_input_tokens_seen": 56048640, + "step": 32710 + }, + { + "epoch": 158.8087167070218, + "grad_norm": 7.493429876603841e-09, + "learning_rate": 0.023896576267138595, + "loss": 0.0, + "num_input_tokens_seen": 56057280, + "step": 32715 + }, + { + "epoch": 158.83292978208232, + "grad_norm": 1.697662099786612e-08, + "learning_rate": 0.02386468797888471, + "loss": 0.0, + "num_input_tokens_seen": 56065408, + "step": 32720 + }, + { + "epoch": 158.85714285714286, + "grad_norm": 1.5264200570186404e-08, + "learning_rate": 0.023832819142281057, + "loss": 0.0, + "num_input_tokens_seen": 56074016, + "step": 32725 + }, + { + "epoch": 158.88135593220338, + "grad_norm": 7.547074076796889e-09, + "learning_rate": 0.02380096976224225, + "loss": 0.0, + "num_input_tokens_seen": 56082304, + "step": 32730 + }, + { + "epoch": 158.90556900726392, + "grad_norm": 1.541416949635277e-08, + "learning_rate": 0.023769139843679777, + "loss": 0.0, + "num_input_tokens_seen": 56091072, + "step": 32735 + }, + { + "epoch": 158.92978208232446, + "grad_norm": 1.1245080422384035e-08, + "learning_rate": 0.023737329391502287, + "loss": 0.0, + "num_input_tokens_seen": 56099584, + "step": 32740 + }, + { + "epoch": 158.95399515738498, + "grad_norm": 6.601148516693911e-09, + "learning_rate": 0.023705538410615293, + "loss": 0.0, + "num_input_tokens_seen": 56108000, + "step": 32745 + }, + { + "epoch": 158.97820823244552, + "grad_norm": 1.9157109676370965e-08, + "learning_rate": 0.023673766905921396, + "loss": 0.0, + "num_input_tokens_seen": 56116608, + "step": 32750 + }, + { + "epoch": 159.0048426150121, + "grad_norm": 1.1844197622679076e-08, + "learning_rate": 0.0236420148823202, + "loss": 0.0, + "num_input_tokens_seen": 56125856, + "step": 32755 + }, + { + "epoch": 159.02905569007265, + "grad_norm": 7.49396367183408e-09, + "learning_rate": 0.02361028234470816, + "loss": 0.0, + "num_input_tokens_seen": 56134592, + "step": 32760 + }, + { + "epoch": 159.05326876513317, + "grad_norm": 4.98609420418461e-09, + "learning_rate": 0.023578569297978913, + "loss": 0.0, + "num_input_tokens_seen": 56143232, + "step": 32765 + }, + { + "epoch": 159.0774818401937, + "grad_norm": 1.640422375714934e-08, + "learning_rate": 0.023546875747023025, + "loss": 0.0, + "num_input_tokens_seen": 56151552, + "step": 32770 + }, + { + "epoch": 159.10169491525423, + "grad_norm": 9.57018198022297e-09, + "learning_rate": 0.02351520169672801, + "loss": 0.0, + "num_input_tokens_seen": 56160288, + "step": 32775 + }, + { + "epoch": 159.12590799031477, + "grad_norm": 1.605904564883076e-08, + "learning_rate": 0.023483547151978357, + "loss": 0.0, + "num_input_tokens_seen": 56168768, + "step": 32780 + }, + { + "epoch": 159.15012106537532, + "grad_norm": 9.013234603116871e-09, + "learning_rate": 0.023451912117655675, + "loss": 0.0, + "num_input_tokens_seen": 56177632, + "step": 32785 + }, + { + "epoch": 159.17433414043583, + "grad_norm": 1.1236842567541316e-08, + "learning_rate": 0.023420296598638417, + "loss": 0.0, + "num_input_tokens_seen": 56186208, + "step": 32790 + }, + { + "epoch": 159.19854721549638, + "grad_norm": 7.177743732711406e-09, + "learning_rate": 0.023388700599802165, + "loss": 0.0, + "num_input_tokens_seen": 56194656, + "step": 32795 + }, + { + "epoch": 159.2227602905569, + "grad_norm": 7.960994530264998e-09, + "learning_rate": 0.023357124126019334, + "loss": 0.0, + "num_input_tokens_seen": 56203392, + "step": 32800 + }, + { + "epoch": 159.2227602905569, + "eval_loss": 1.193556785583496, + "eval_runtime": 4.6232, + "eval_samples_per_second": 79.381, + "eval_steps_per_second": 19.899, + "num_input_tokens_seen": 56203392, + "step": 32800 + }, + { + "epoch": 159.24697336561744, + "grad_norm": 9.98738158841661e-09, + "learning_rate": 0.02332556718215945, + "loss": 0.0, + "num_input_tokens_seen": 56212128, + "step": 32805 + }, + { + "epoch": 159.27118644067798, + "grad_norm": 1.3451090019600542e-08, + "learning_rate": 0.023294029773089035, + "loss": 0.0, + "num_input_tokens_seen": 56220288, + "step": 32810 + }, + { + "epoch": 159.2953995157385, + "grad_norm": 1.0570992969860527e-08, + "learning_rate": 0.023262511903671484, + "loss": 0.0, + "num_input_tokens_seen": 56228576, + "step": 32815 + }, + { + "epoch": 159.31961259079904, + "grad_norm": 9.5801020449926e-09, + "learning_rate": 0.023231013578767324, + "loss": 0.0, + "num_input_tokens_seen": 56237056, + "step": 32820 + }, + { + "epoch": 159.34382566585955, + "grad_norm": 1.0506351344474751e-08, + "learning_rate": 0.0231995348032339, + "loss": 0.0, + "num_input_tokens_seen": 56246112, + "step": 32825 + }, + { + "epoch": 159.3680387409201, + "grad_norm": 7.140535274174908e-09, + "learning_rate": 0.023168075581925685, + "loss": 0.0, + "num_input_tokens_seen": 56254656, + "step": 32830 + }, + { + "epoch": 159.39225181598064, + "grad_norm": 1.2175871866304533e-08, + "learning_rate": 0.023136635919694126, + "loss": 0.0, + "num_input_tokens_seen": 56263232, + "step": 32835 + }, + { + "epoch": 159.41646489104116, + "grad_norm": 1.0048831100561983e-08, + "learning_rate": 0.02310521582138753, + "loss": 0.0, + "num_input_tokens_seen": 56271616, + "step": 32840 + }, + { + "epoch": 159.4406779661017, + "grad_norm": 9.977270565286744e-09, + "learning_rate": 0.023073815291851357, + "loss": 0.0, + "num_input_tokens_seen": 56279936, + "step": 32845 + }, + { + "epoch": 159.46489104116222, + "grad_norm": 8.391454642264762e-09, + "learning_rate": 0.02304243433592788, + "loss": 0.0, + "num_input_tokens_seen": 56288448, + "step": 32850 + }, + { + "epoch": 159.48910411622276, + "grad_norm": 6.3974558983659335e-09, + "learning_rate": 0.023011072958456513, + "loss": 0.0, + "num_input_tokens_seen": 56296928, + "step": 32855 + }, + { + "epoch": 159.5133171912833, + "grad_norm": 1.3201014503749775e-08, + "learning_rate": 0.022979731164273536, + "loss": 0.0, + "num_input_tokens_seen": 56305152, + "step": 32860 + }, + { + "epoch": 159.53753026634382, + "grad_norm": 1.0058923471945036e-08, + "learning_rate": 0.022948408958212218, + "loss": 0.0, + "num_input_tokens_seen": 56313376, + "step": 32865 + }, + { + "epoch": 159.56174334140437, + "grad_norm": 1.2086555756241069e-08, + "learning_rate": 0.022917106345102876, + "loss": 0.0, + "num_input_tokens_seen": 56321920, + "step": 32870 + }, + { + "epoch": 159.58595641646488, + "grad_norm": 1.6007373204729447e-08, + "learning_rate": 0.022885823329772785, + "loss": 0.0, + "num_input_tokens_seen": 56330304, + "step": 32875 + }, + { + "epoch": 159.61016949152543, + "grad_norm": 1.32248745288166e-08, + "learning_rate": 0.02285455991704612, + "loss": 0.0, + "num_input_tokens_seen": 56339232, + "step": 32880 + }, + { + "epoch": 159.63438256658597, + "grad_norm": 1.720678888261773e-08, + "learning_rate": 0.022823316111744117, + "loss": 0.0, + "num_input_tokens_seen": 56347616, + "step": 32885 + }, + { + "epoch": 159.65859564164649, + "grad_norm": 2.1905457003867923e-08, + "learning_rate": 0.022792091918685014, + "loss": 0.0, + "num_input_tokens_seen": 56355968, + "step": 32890 + }, + { + "epoch": 159.68280871670703, + "grad_norm": 5.431011640411043e-09, + "learning_rate": 0.022760887342683906, + "loss": 0.0, + "num_input_tokens_seen": 56364544, + "step": 32895 + }, + { + "epoch": 159.70702179176754, + "grad_norm": 1.1470374872146749e-08, + "learning_rate": 0.022729702388552975, + "loss": 0.0, + "num_input_tokens_seen": 56373120, + "step": 32900 + }, + { + "epoch": 159.7312348668281, + "grad_norm": 9.726597305359519e-09, + "learning_rate": 0.022698537061101292, + "loss": 0.0, + "num_input_tokens_seen": 56381792, + "step": 32905 + }, + { + "epoch": 159.75544794188863, + "grad_norm": 1.1186815918051707e-08, + "learning_rate": 0.022667391365134962, + "loss": 0.0, + "num_input_tokens_seen": 56390176, + "step": 32910 + }, + { + "epoch": 159.77966101694915, + "grad_norm": 5.000270419941444e-09, + "learning_rate": 0.022636265305457065, + "loss": 0.0, + "num_input_tokens_seen": 56398528, + "step": 32915 + }, + { + "epoch": 159.8038740920097, + "grad_norm": 6.633893434582205e-09, + "learning_rate": 0.02260515888686764, + "loss": 0.0, + "num_input_tokens_seen": 56407040, + "step": 32920 + }, + { + "epoch": 159.8280871670702, + "grad_norm": 1.035790742065501e-08, + "learning_rate": 0.022574072114163596, + "loss": 0.0, + "num_input_tokens_seen": 56416096, + "step": 32925 + }, + { + "epoch": 159.85230024213075, + "grad_norm": 1.726320597583708e-08, + "learning_rate": 0.022543004992139005, + "loss": 0.0, + "num_input_tokens_seen": 56424992, + "step": 32930 + }, + { + "epoch": 159.8765133171913, + "grad_norm": 2.187948489051905e-09, + "learning_rate": 0.022511957525584745, + "loss": 0.0, + "num_input_tokens_seen": 56433760, + "step": 32935 + }, + { + "epoch": 159.9007263922518, + "grad_norm": 2.333639059770576e-09, + "learning_rate": 0.022480929719288778, + "loss": 0.0, + "num_input_tokens_seen": 56442464, + "step": 32940 + }, + { + "epoch": 159.92493946731236, + "grad_norm": 1.0749629630879554e-08, + "learning_rate": 0.02244992157803592, + "loss": 0.0, + "num_input_tokens_seen": 56451200, + "step": 32945 + }, + { + "epoch": 159.94915254237287, + "grad_norm": 1.8285168934539797e-08, + "learning_rate": 0.022418933106608047, + "loss": 0.0, + "num_input_tokens_seen": 56459552, + "step": 32950 + }, + { + "epoch": 159.97336561743342, + "grad_norm": 1.7038695787618963e-08, + "learning_rate": 0.022387964309784018, + "loss": 0.0, + "num_input_tokens_seen": 56467968, + "step": 32955 + }, + { + "epoch": 159.99757869249396, + "grad_norm": 5.239607414608827e-09, + "learning_rate": 0.022357015192339517, + "loss": 0.0, + "num_input_tokens_seen": 56476832, + "step": 32960 + }, + { + "epoch": 160.02421307506054, + "grad_norm": 7.770441179388854e-09, + "learning_rate": 0.02232608575904734, + "loss": 0.0, + "num_input_tokens_seen": 56485888, + "step": 32965 + }, + { + "epoch": 160.04842615012106, + "grad_norm": 7.9649575823737e-09, + "learning_rate": 0.022295176014677225, + "loss": 0.0, + "num_input_tokens_seen": 56494304, + "step": 32970 + }, + { + "epoch": 160.0726392251816, + "grad_norm": 7.520884359735192e-09, + "learning_rate": 0.02226428596399577, + "loss": 0.0, + "num_input_tokens_seen": 56502720, + "step": 32975 + }, + { + "epoch": 160.09685230024212, + "grad_norm": 6.776406102915189e-09, + "learning_rate": 0.02223341561176669, + "loss": 0.0, + "num_input_tokens_seen": 56511360, + "step": 32980 + }, + { + "epoch": 160.12106537530266, + "grad_norm": 5.855965490297876e-09, + "learning_rate": 0.0222025649627505, + "loss": 0.0, + "num_input_tokens_seen": 56520128, + "step": 32985 + }, + { + "epoch": 160.1452784503632, + "grad_norm": 2.5650678026067908e-08, + "learning_rate": 0.022171734021704814, + "loss": 0.0, + "num_input_tokens_seen": 56528672, + "step": 32990 + }, + { + "epoch": 160.16949152542372, + "grad_norm": 1.0081092405300751e-08, + "learning_rate": 0.022140922793384116, + "loss": 0.0, + "num_input_tokens_seen": 56536960, + "step": 32995 + }, + { + "epoch": 160.19370460048427, + "grad_norm": 1.975390517827691e-08, + "learning_rate": 0.022110131282539934, + "loss": 0.0, + "num_input_tokens_seen": 56545632, + "step": 33000 + }, + { + "epoch": 160.19370460048427, + "eval_loss": 1.182714819908142, + "eval_runtime": 4.6371, + "eval_samples_per_second": 79.145, + "eval_steps_per_second": 19.84, + "num_input_tokens_seen": 56545632, + "step": 33000 + }, + { + "epoch": 160.21791767554478, + "grad_norm": 3.651807967131049e-09, + "learning_rate": 0.022079359493920675, + "loss": 0.0, + "num_input_tokens_seen": 56553888, + "step": 33005 + }, + { + "epoch": 160.24213075060533, + "grad_norm": 8.075733859413958e-09, + "learning_rate": 0.02204860743227169, + "loss": 0.0, + "num_input_tokens_seen": 56562752, + "step": 33010 + }, + { + "epoch": 160.26634382566587, + "grad_norm": 9.547530765985357e-09, + "learning_rate": 0.022017875102335365, + "loss": 0.0, + "num_input_tokens_seen": 56570944, + "step": 33015 + }, + { + "epoch": 160.2905569007264, + "grad_norm": 3.4151492744882717e-09, + "learning_rate": 0.02198716250885108, + "loss": 0.0, + "num_input_tokens_seen": 56579392, + "step": 33020 + }, + { + "epoch": 160.31476997578693, + "grad_norm": 9.776832676777758e-09, + "learning_rate": 0.021956469656555, + "loss": 0.0, + "num_input_tokens_seen": 56587936, + "step": 33025 + }, + { + "epoch": 160.33898305084745, + "grad_norm": 1.746050060091875e-08, + "learning_rate": 0.0219257965501804, + "loss": 0.0, + "num_input_tokens_seen": 56596224, + "step": 33030 + }, + { + "epoch": 160.363196125908, + "grad_norm": 1.1911799546737711e-08, + "learning_rate": 0.021895143194457494, + "loss": 0.0, + "num_input_tokens_seen": 56605120, + "step": 33035 + }, + { + "epoch": 160.38740920096853, + "grad_norm": 2.801540333052799e-09, + "learning_rate": 0.021864509594113322, + "loss": 0.0, + "num_input_tokens_seen": 56613696, + "step": 33040 + }, + { + "epoch": 160.41162227602905, + "grad_norm": 6.467887114780524e-09, + "learning_rate": 0.02183389575387207, + "loss": 0.0, + "num_input_tokens_seen": 56622112, + "step": 33045 + }, + { + "epoch": 160.4358353510896, + "grad_norm": 6.262922180866326e-09, + "learning_rate": 0.021803301678454682, + "loss": 0.0, + "num_input_tokens_seen": 56630752, + "step": 33050 + }, + { + "epoch": 160.4600484261501, + "grad_norm": 6.349660353066611e-09, + "learning_rate": 0.021772727372579213, + "loss": 0.0, + "num_input_tokens_seen": 56639360, + "step": 33055 + }, + { + "epoch": 160.48426150121065, + "grad_norm": 9.566938352634224e-09, + "learning_rate": 0.02174217284096061, + "loss": 0.0, + "num_input_tokens_seen": 56647648, + "step": 33060 + }, + { + "epoch": 160.5084745762712, + "grad_norm": 6.8476957437724195e-09, + "learning_rate": 0.0217116380883107, + "loss": 0.0, + "num_input_tokens_seen": 56656608, + "step": 33065 + }, + { + "epoch": 160.5326876513317, + "grad_norm": 8.204654733390271e-09, + "learning_rate": 0.021681123119338425, + "loss": 0.0, + "num_input_tokens_seen": 56665536, + "step": 33070 + }, + { + "epoch": 160.55690072639226, + "grad_norm": 9.481195384353214e-09, + "learning_rate": 0.02165062793874951, + "loss": 0.0, + "num_input_tokens_seen": 56674240, + "step": 33075 + }, + { + "epoch": 160.58111380145277, + "grad_norm": 7.771393306654772e-09, + "learning_rate": 0.021620152551246666, + "loss": 0.0, + "num_input_tokens_seen": 56682656, + "step": 33080 + }, + { + "epoch": 160.60532687651332, + "grad_norm": 1.5487310989215075e-08, + "learning_rate": 0.02158969696152967, + "loss": 0.0, + "num_input_tokens_seen": 56691456, + "step": 33085 + }, + { + "epoch": 160.62953995157386, + "grad_norm": 5.107932743442234e-09, + "learning_rate": 0.021559261174295057, + "loss": 0.0, + "num_input_tokens_seen": 56700512, + "step": 33090 + }, + { + "epoch": 160.65375302663438, + "grad_norm": 6.937955987496025e-09, + "learning_rate": 0.02152884519423646, + "loss": 0.0, + "num_input_tokens_seen": 56708992, + "step": 33095 + }, + { + "epoch": 160.67796610169492, + "grad_norm": 7.9024609078715e-09, + "learning_rate": 0.021498449026044447, + "loss": 0.0, + "num_input_tokens_seen": 56717600, + "step": 33100 + }, + { + "epoch": 160.70217917675544, + "grad_norm": 5.555757187636345e-09, + "learning_rate": 0.021468072674406414, + "loss": 0.0, + "num_input_tokens_seen": 56726272, + "step": 33105 + }, + { + "epoch": 160.72639225181598, + "grad_norm": 1.8392512401987915e-08, + "learning_rate": 0.021437716144006795, + "loss": 0.0, + "num_input_tokens_seen": 56735040, + "step": 33110 + }, + { + "epoch": 160.75060532687652, + "grad_norm": 5.903489253000771e-09, + "learning_rate": 0.021407379439527002, + "loss": 0.0, + "num_input_tokens_seen": 56743328, + "step": 33115 + }, + { + "epoch": 160.77481840193704, + "grad_norm": 9.034033965349408e-09, + "learning_rate": 0.021377062565645255, + "loss": 0.0, + "num_input_tokens_seen": 56752064, + "step": 33120 + }, + { + "epoch": 160.79903147699758, + "grad_norm": 1.2258379200602576e-08, + "learning_rate": 0.02134676552703688, + "loss": 0.0, + "num_input_tokens_seen": 56760800, + "step": 33125 + }, + { + "epoch": 160.8232445520581, + "grad_norm": 8.83487683012163e-09, + "learning_rate": 0.02131648832837398, + "loss": 0.0, + "num_input_tokens_seen": 56769216, + "step": 33130 + }, + { + "epoch": 160.84745762711864, + "grad_norm": 7.021325743039597e-09, + "learning_rate": 0.02128623097432574, + "loss": 0.0, + "num_input_tokens_seen": 56777664, + "step": 33135 + }, + { + "epoch": 160.8716707021792, + "grad_norm": 6.394011098365127e-09, + "learning_rate": 0.021255993469558192, + "loss": 0.0, + "num_input_tokens_seen": 56785920, + "step": 33140 + }, + { + "epoch": 160.8958837772397, + "grad_norm": 6.949095965325114e-09, + "learning_rate": 0.021225775818734364, + "loss": 0.0, + "num_input_tokens_seen": 56793952, + "step": 33145 + }, + { + "epoch": 160.92009685230025, + "grad_norm": 9.742896267539436e-09, + "learning_rate": 0.021195578026514166, + "loss": 0.0, + "num_input_tokens_seen": 56802720, + "step": 33150 + }, + { + "epoch": 160.94430992736076, + "grad_norm": 1.1846606362553302e-08, + "learning_rate": 0.02116540009755452, + "loss": 0.0, + "num_input_tokens_seen": 56811232, + "step": 33155 + }, + { + "epoch": 160.9685230024213, + "grad_norm": 1.3608095983386193e-08, + "learning_rate": 0.021135242036509173, + "loss": 0.0, + "num_input_tokens_seen": 56819584, + "step": 33160 + }, + { + "epoch": 160.99273607748185, + "grad_norm": 9.004473611184949e-09, + "learning_rate": 0.021105103848028967, + "loss": 0.0, + "num_input_tokens_seen": 56827744, + "step": 33165 + }, + { + "epoch": 161.01937046004844, + "grad_norm": 7.961538983636274e-09, + "learning_rate": 0.021074985536761504, + "loss": 0.0, + "num_input_tokens_seen": 56836928, + "step": 33170 + }, + { + "epoch": 161.04358353510895, + "grad_norm": 5.15980502768798e-09, + "learning_rate": 0.021044887107351435, + "loss": 0.0, + "num_input_tokens_seen": 56845472, + "step": 33175 + }, + { + "epoch": 161.0677966101695, + "grad_norm": 1.9825865393841013e-08, + "learning_rate": 0.021014808564440362, + "loss": 0.0, + "num_input_tokens_seen": 56853728, + "step": 33180 + }, + { + "epoch": 161.09200968523, + "grad_norm": 4.4887245032043666e-09, + "learning_rate": 0.02098474991266671, + "loss": 0.0, + "num_input_tokens_seen": 56862336, + "step": 33185 + }, + { + "epoch": 161.11622276029055, + "grad_norm": 5.751677800702737e-09, + "learning_rate": 0.02095471115666592, + "loss": 0.0, + "num_input_tokens_seen": 56870816, + "step": 33190 + }, + { + "epoch": 161.1404358353511, + "grad_norm": 5.890019583176809e-09, + "learning_rate": 0.020924692301070406, + "loss": 0.0, + "num_input_tokens_seen": 56879584, + "step": 33195 + }, + { + "epoch": 161.16464891041161, + "grad_norm": 1.2220598755163792e-08, + "learning_rate": 0.020894693350509346, + "loss": 0.0, + "num_input_tokens_seen": 56888352, + "step": 33200 + }, + { + "epoch": 161.16464891041161, + "eval_loss": 1.1904945373535156, + "eval_runtime": 4.6528, + "eval_samples_per_second": 78.878, + "eval_steps_per_second": 19.773, + "num_input_tokens_seen": 56888352, + "step": 33200 + }, + { + "epoch": 161.18886198547216, + "grad_norm": 1.3649919416991452e-08, + "learning_rate": 0.020864714309609057, + "loss": 0.0, + "num_input_tokens_seen": 56896928, + "step": 33205 + }, + { + "epoch": 161.21307506053267, + "grad_norm": 6.7535186332179364e-09, + "learning_rate": 0.020834755182992604, + "loss": 0.0, + "num_input_tokens_seen": 56905600, + "step": 33210 + }, + { + "epoch": 161.23728813559322, + "grad_norm": 1.0903335123657598e-08, + "learning_rate": 0.02080481597528011, + "loss": 0.0, + "num_input_tokens_seen": 56914496, + "step": 33215 + }, + { + "epoch": 161.26150121065376, + "grad_norm": 6.059538204539194e-09, + "learning_rate": 0.020774896691088583, + "loss": 0.0, + "num_input_tokens_seen": 56922624, + "step": 33220 + }, + { + "epoch": 161.28571428571428, + "grad_norm": 1.772247593123666e-08, + "learning_rate": 0.020744997335031882, + "loss": 0.0, + "num_input_tokens_seen": 56931392, + "step": 33225 + }, + { + "epoch": 161.30992736077482, + "grad_norm": 1.1703932933926353e-08, + "learning_rate": 0.02071511791172092, + "loss": 0.0, + "num_input_tokens_seen": 56940160, + "step": 33230 + }, + { + "epoch": 161.33414043583534, + "grad_norm": 7.132308965651646e-09, + "learning_rate": 0.02068525842576351, + "loss": 0.0, + "num_input_tokens_seen": 56948640, + "step": 33235 + }, + { + "epoch": 161.35835351089588, + "grad_norm": 1.552168704677115e-08, + "learning_rate": 0.020655418881764264, + "loss": 0.0, + "num_input_tokens_seen": 56956928, + "step": 33240 + }, + { + "epoch": 161.38256658595643, + "grad_norm": 4.157506783286635e-09, + "learning_rate": 0.020625599284324923, + "loss": 0.0, + "num_input_tokens_seen": 56965696, + "step": 33245 + }, + { + "epoch": 161.40677966101694, + "grad_norm": 4.01526456528245e-09, + "learning_rate": 0.02059579963804396, + "loss": 0.0, + "num_input_tokens_seen": 56974272, + "step": 33250 + }, + { + "epoch": 161.43099273607749, + "grad_norm": 8.581337418434032e-09, + "learning_rate": 0.02056601994751688, + "loss": 0.0, + "num_input_tokens_seen": 56982880, + "step": 33255 + }, + { + "epoch": 161.455205811138, + "grad_norm": 9.276608814445808e-09, + "learning_rate": 0.02053626021733614, + "loss": 0.0, + "num_input_tokens_seen": 56991808, + "step": 33260 + }, + { + "epoch": 161.47941888619854, + "grad_norm": 8.712831345292216e-09, + "learning_rate": 0.02050652045209097, + "loss": 0.0, + "num_input_tokens_seen": 57000320, + "step": 33265 + }, + { + "epoch": 161.5036319612591, + "grad_norm": 7.1785262178991616e-09, + "learning_rate": 0.020476800656367672, + "loss": 0.0, + "num_input_tokens_seen": 57009152, + "step": 33270 + }, + { + "epoch": 161.5278450363196, + "grad_norm": 9.991230065509171e-09, + "learning_rate": 0.020447100834749425, + "loss": 0.0, + "num_input_tokens_seen": 57017184, + "step": 33275 + }, + { + "epoch": 161.55205811138015, + "grad_norm": 2.9528131051392847e-09, + "learning_rate": 0.02041742099181627, + "loss": 0.0, + "num_input_tokens_seen": 57025632, + "step": 33280 + }, + { + "epoch": 161.57627118644066, + "grad_norm": 1.5565797539807136e-08, + "learning_rate": 0.02038776113214526, + "loss": 0.0, + "num_input_tokens_seen": 57034144, + "step": 33285 + }, + { + "epoch": 161.6004842615012, + "grad_norm": 8.391482175795772e-09, + "learning_rate": 0.0203581212603103, + "loss": 0.0, + "num_input_tokens_seen": 57042720, + "step": 33290 + }, + { + "epoch": 161.62469733656175, + "grad_norm": 1.3037437796015183e-08, + "learning_rate": 0.02032850138088219, + "loss": 0.0, + "num_input_tokens_seen": 57051232, + "step": 33295 + }, + { + "epoch": 161.64891041162227, + "grad_norm": 2.0933732525207915e-09, + "learning_rate": 0.020298901498428754, + "loss": 0.0, + "num_input_tokens_seen": 57059552, + "step": 33300 + }, + { + "epoch": 161.6731234866828, + "grad_norm": 4.459534519440922e-09, + "learning_rate": 0.020269321617514595, + "loss": 0.0, + "num_input_tokens_seen": 57068032, + "step": 33305 + }, + { + "epoch": 161.69733656174333, + "grad_norm": 1.130107385449719e-08, + "learning_rate": 0.020239761742701343, + "loss": 0.0, + "num_input_tokens_seen": 57076384, + "step": 33310 + }, + { + "epoch": 161.72154963680387, + "grad_norm": 1.1427617963022385e-08, + "learning_rate": 0.02021022187854754, + "loss": 0.0, + "num_input_tokens_seen": 57085216, + "step": 33315 + }, + { + "epoch": 161.74576271186442, + "grad_norm": 1.2068126054032291e-08, + "learning_rate": 0.020180702029608522, + "loss": 0.0, + "num_input_tokens_seen": 57093888, + "step": 33320 + }, + { + "epoch": 161.76997578692493, + "grad_norm": 8.624564173942417e-09, + "learning_rate": 0.020151202200436695, + "loss": 0.0, + "num_input_tokens_seen": 57102656, + "step": 33325 + }, + { + "epoch": 161.79418886198548, + "grad_norm": 1.1258254772883447e-08, + "learning_rate": 0.020121722395581226, + "loss": 0.0, + "num_input_tokens_seen": 57111072, + "step": 33330 + }, + { + "epoch": 161.818401937046, + "grad_norm": 5.590984564207702e-09, + "learning_rate": 0.020092262619588342, + "loss": 0.0, + "num_input_tokens_seen": 57119584, + "step": 33335 + }, + { + "epoch": 161.84261501210653, + "grad_norm": 6.150917108982412e-09, + "learning_rate": 0.02006282287700109, + "loss": 0.0, + "num_input_tokens_seen": 57128224, + "step": 33340 + }, + { + "epoch": 161.86682808716708, + "grad_norm": 1.184551745581075e-08, + "learning_rate": 0.020033403172359427, + "loss": 0.0, + "num_input_tokens_seen": 57136672, + "step": 33345 + }, + { + "epoch": 161.8910411622276, + "grad_norm": 1.0407633865838761e-08, + "learning_rate": 0.020004003510200284, + "loss": 0.0, + "num_input_tokens_seen": 57145216, + "step": 33350 + }, + { + "epoch": 161.91525423728814, + "grad_norm": 9.693527758258824e-09, + "learning_rate": 0.019974623895057407, + "loss": 0.0, + "num_input_tokens_seen": 57153984, + "step": 33355 + }, + { + "epoch": 161.93946731234865, + "grad_norm": 9.046488891328863e-09, + "learning_rate": 0.019945264331461553, + "loss": 0.0, + "num_input_tokens_seen": 57162656, + "step": 33360 + }, + { + "epoch": 161.9636803874092, + "grad_norm": 1.3265445630850081e-08, + "learning_rate": 0.019915924823940317, + "loss": 0.0, + "num_input_tokens_seen": 57170976, + "step": 33365 + }, + { + "epoch": 161.98789346246974, + "grad_norm": 1.5312334511463632e-08, + "learning_rate": 0.01988660537701816, + "loss": 0.0, + "num_input_tokens_seen": 57179488, + "step": 33370 + }, + { + "epoch": 162.01452784503633, + "grad_norm": 7.826279180278561e-09, + "learning_rate": 0.01985730599521659, + "loss": 0.0, + "num_input_tokens_seen": 57188544, + "step": 33375 + }, + { + "epoch": 162.03874092009684, + "grad_norm": 7.4469128641396765e-09, + "learning_rate": 0.019828026683053918, + "loss": 0.0, + "num_input_tokens_seen": 57197312, + "step": 33380 + }, + { + "epoch": 162.0629539951574, + "grad_norm": 4.983332413388553e-09, + "learning_rate": 0.01979876744504535, + "loss": 0.0, + "num_input_tokens_seen": 57206016, + "step": 33385 + }, + { + "epoch": 162.08716707021793, + "grad_norm": 8.140214724505768e-09, + "learning_rate": 0.019769528285703046, + "loss": 0.0, + "num_input_tokens_seen": 57214368, + "step": 33390 + }, + { + "epoch": 162.11138014527845, + "grad_norm": 9.322371319342437e-09, + "learning_rate": 0.019740309209536098, + "loss": 0.0, + "num_input_tokens_seen": 57223008, + "step": 33395 + }, + { + "epoch": 162.135593220339, + "grad_norm": 9.454804050790244e-09, + "learning_rate": 0.019711110221050387, + "loss": 0.0, + "num_input_tokens_seen": 57231584, + "step": 33400 + }, + { + "epoch": 162.135593220339, + "eval_loss": 1.187892198562622, + "eval_runtime": 4.6201, + "eval_samples_per_second": 79.436, + "eval_steps_per_second": 19.913, + "num_input_tokens_seen": 57231584, + "step": 33400 + }, + { + "epoch": 162.1598062953995, + "grad_norm": 5.686678239413823e-09, + "learning_rate": 0.019681931324748825, + "loss": 0.0, + "num_input_tokens_seen": 57240448, + "step": 33405 + }, + { + "epoch": 162.18401937046005, + "grad_norm": 1.3008442323325653e-08, + "learning_rate": 0.019652772525131094, + "loss": 0.0, + "num_input_tokens_seen": 57249024, + "step": 33410 + }, + { + "epoch": 162.2082324455206, + "grad_norm": 1.2845112529191738e-08, + "learning_rate": 0.019623633826693885, + "loss": 0.0, + "num_input_tokens_seen": 57257376, + "step": 33415 + }, + { + "epoch": 162.2324455205811, + "grad_norm": 2.6233462069313873e-09, + "learning_rate": 0.019594515233930788, + "loss": 0.0, + "num_input_tokens_seen": 57265472, + "step": 33420 + }, + { + "epoch": 162.25665859564165, + "grad_norm": 8.537882401071784e-09, + "learning_rate": 0.019565416751332186, + "loss": 0.0, + "num_input_tokens_seen": 57273984, + "step": 33425 + }, + { + "epoch": 162.28087167070217, + "grad_norm": 6.33414254380682e-09, + "learning_rate": 0.019536338383385497, + "loss": 0.0, + "num_input_tokens_seen": 57282624, + "step": 33430 + }, + { + "epoch": 162.3050847457627, + "grad_norm": 1.002680694028868e-08, + "learning_rate": 0.019507280134574933, + "loss": 0.0, + "num_input_tokens_seen": 57291072, + "step": 33435 + }, + { + "epoch": 162.32929782082326, + "grad_norm": 7.755155628785815e-09, + "learning_rate": 0.019478242009381624, + "loss": 0.0, + "num_input_tokens_seen": 57299712, + "step": 33440 + }, + { + "epoch": 162.35351089588377, + "grad_norm": 6.7647114576629974e-09, + "learning_rate": 0.01944922401228367, + "loss": 0.0, + "num_input_tokens_seen": 57308448, + "step": 33445 + }, + { + "epoch": 162.37772397094432, + "grad_norm": 9.067923301131486e-09, + "learning_rate": 0.01942022614775593, + "loss": 0.0, + "num_input_tokens_seen": 57316832, + "step": 33450 + }, + { + "epoch": 162.40193704600483, + "grad_norm": 8.98501095747406e-09, + "learning_rate": 0.01939124842027029, + "loss": 0.0, + "num_input_tokens_seen": 57325408, + "step": 33455 + }, + { + "epoch": 162.42615012106538, + "grad_norm": 1.282997263984953e-08, + "learning_rate": 0.01936229083429551, + "loss": 0.0, + "num_input_tokens_seen": 57334112, + "step": 33460 + }, + { + "epoch": 162.45036319612592, + "grad_norm": 6.067184088465183e-09, + "learning_rate": 0.019333353394297148, + "loss": 0.0, + "num_input_tokens_seen": 57342432, + "step": 33465 + }, + { + "epoch": 162.47457627118644, + "grad_norm": 5.580278905625846e-09, + "learning_rate": 0.019304436104737754, + "loss": 0.0, + "num_input_tokens_seen": 57351136, + "step": 33470 + }, + { + "epoch": 162.49878934624698, + "grad_norm": 9.612988627338837e-09, + "learning_rate": 0.019275538970076778, + "loss": 0.0, + "num_input_tokens_seen": 57360000, + "step": 33475 + }, + { + "epoch": 162.5230024213075, + "grad_norm": 2.0680495538272226e-08, + "learning_rate": 0.019246661994770434, + "loss": 0.0, + "num_input_tokens_seen": 57368128, + "step": 33480 + }, + { + "epoch": 162.54721549636804, + "grad_norm": 1.594232479362745e-08, + "learning_rate": 0.019217805183271985, + "loss": 0.0, + "num_input_tokens_seen": 57376736, + "step": 33485 + }, + { + "epoch": 162.57142857142858, + "grad_norm": 2.030938972552576e-08, + "learning_rate": 0.019188968540031465, + "loss": 0.0, + "num_input_tokens_seen": 57385024, + "step": 33490 + }, + { + "epoch": 162.5956416464891, + "grad_norm": 1.1291339419017277e-08, + "learning_rate": 0.019160152069495867, + "loss": 0.0, + "num_input_tokens_seen": 57393952, + "step": 33495 + }, + { + "epoch": 162.61985472154964, + "grad_norm": 8.467010204071812e-09, + "learning_rate": 0.019131355776109103, + "loss": 0.0, + "num_input_tokens_seen": 57402272, + "step": 33500 + }, + { + "epoch": 162.64406779661016, + "grad_norm": 1.0581662657216384e-08, + "learning_rate": 0.019102579664311857, + "loss": 0.0, + "num_input_tokens_seen": 57410912, + "step": 33505 + }, + { + "epoch": 162.6682808716707, + "grad_norm": 6.91267754149294e-09, + "learning_rate": 0.019073823738541763, + "loss": 0.0, + "num_input_tokens_seen": 57419616, + "step": 33510 + }, + { + "epoch": 162.69249394673125, + "grad_norm": 1.317336373318767e-08, + "learning_rate": 0.0190450880032334, + "loss": 0.0, + "num_input_tokens_seen": 57428416, + "step": 33515 + }, + { + "epoch": 162.71670702179176, + "grad_norm": 8.248949079359136e-09, + "learning_rate": 0.019016372462818114, + "loss": 0.0, + "num_input_tokens_seen": 57436704, + "step": 33520 + }, + { + "epoch": 162.7409200968523, + "grad_norm": 9.6904608781756e-09, + "learning_rate": 0.018987677121724278, + "loss": 0.0, + "num_input_tokens_seen": 57445376, + "step": 33525 + }, + { + "epoch": 162.76513317191282, + "grad_norm": 5.518586476682685e-09, + "learning_rate": 0.018959001984377, + "loss": 0.0, + "num_input_tokens_seen": 57454048, + "step": 33530 + }, + { + "epoch": 162.78934624697337, + "grad_norm": 7.782106514753195e-09, + "learning_rate": 0.018930347055198377, + "loss": 0.0, + "num_input_tokens_seen": 57462560, + "step": 33535 + }, + { + "epoch": 162.8135593220339, + "grad_norm": 1.5056487612241654e-08, + "learning_rate": 0.01890171233860739, + "loss": 0.0, + "num_input_tokens_seen": 57470816, + "step": 33540 + }, + { + "epoch": 162.83777239709443, + "grad_norm": 2.463147374953678e-08, + "learning_rate": 0.018873097839019807, + "loss": 0.0, + "num_input_tokens_seen": 57479520, + "step": 33545 + }, + { + "epoch": 162.86198547215497, + "grad_norm": 8.662444983542628e-09, + "learning_rate": 0.0188445035608484, + "loss": 0.0, + "num_input_tokens_seen": 57488128, + "step": 33550 + }, + { + "epoch": 162.88619854721549, + "grad_norm": 1.2974216367922509e-08, + "learning_rate": 0.018815929508502777, + "loss": 0.0, + "num_input_tokens_seen": 57496736, + "step": 33555 + }, + { + "epoch": 162.91041162227603, + "grad_norm": 8.526535921760114e-09, + "learning_rate": 0.01878737568638934, + "loss": 0.0, + "num_input_tokens_seen": 57505536, + "step": 33560 + }, + { + "epoch": 162.93462469733657, + "grad_norm": 9.235034958976485e-09, + "learning_rate": 0.01875884209891152, + "loss": 0.0, + "num_input_tokens_seen": 57513984, + "step": 33565 + }, + { + "epoch": 162.9588377723971, + "grad_norm": 6.4711342950829476e-09, + "learning_rate": 0.018730328750469514, + "loss": 0.0, + "num_input_tokens_seen": 57522624, + "step": 33570 + }, + { + "epoch": 162.98305084745763, + "grad_norm": 1.1856305270896428e-08, + "learning_rate": 0.018701835645460473, + "loss": 0.0, + "num_input_tokens_seen": 57531200, + "step": 33575 + }, + { + "epoch": 163.00968523002422, + "grad_norm": 8.597619505223975e-09, + "learning_rate": 0.01867336278827838, + "loss": 0.0, + "num_input_tokens_seen": 57539968, + "step": 33580 + }, + { + "epoch": 163.03389830508473, + "grad_norm": 1.4113353152822583e-08, + "learning_rate": 0.018644910183314056, + "loss": 0.0, + "num_input_tokens_seen": 57548384, + "step": 33585 + }, + { + "epoch": 163.05811138014528, + "grad_norm": 2.2840383806510545e-08, + "learning_rate": 0.01861647783495531, + "loss": 0.0, + "num_input_tokens_seen": 57556992, + "step": 33590 + }, + { + "epoch": 163.08232445520582, + "grad_norm": 2.27907612782019e-08, + "learning_rate": 0.01858806574758676, + "loss": 0.0, + "num_input_tokens_seen": 57565408, + "step": 33595 + }, + { + "epoch": 163.10653753026634, + "grad_norm": 1.3647932561866583e-08, + "learning_rate": 0.01855967392558988, + "loss": 0.0, + "num_input_tokens_seen": 57574112, + "step": 33600 + }, + { + "epoch": 163.10653753026634, + "eval_loss": 1.1913050413131714, + "eval_runtime": 4.6278, + "eval_samples_per_second": 79.304, + "eval_steps_per_second": 19.88, + "num_input_tokens_seen": 57574112, + "step": 33600 + }, + { + "epoch": 163.13075060532688, + "grad_norm": 1.1460306481581028e-08, + "learning_rate": 0.018531302373343096, + "loss": 0.0, + "num_input_tokens_seen": 57582624, + "step": 33605 + }, + { + "epoch": 163.1549636803874, + "grad_norm": 2.6858901769344357e-09, + "learning_rate": 0.018502951095221588, + "loss": 0.0, + "num_input_tokens_seen": 57591008, + "step": 33610 + }, + { + "epoch": 163.17917675544794, + "grad_norm": 2.1654702475615295e-09, + "learning_rate": 0.01847462009559751, + "loss": 0.0, + "num_input_tokens_seen": 57599904, + "step": 33615 + }, + { + "epoch": 163.20338983050848, + "grad_norm": 7.196343521087556e-09, + "learning_rate": 0.01844630937883992, + "loss": 0.0, + "num_input_tokens_seen": 57608160, + "step": 33620 + }, + { + "epoch": 163.227602905569, + "grad_norm": 9.331618144869935e-09, + "learning_rate": 0.018418018949314573, + "loss": 0.0, + "num_input_tokens_seen": 57617056, + "step": 33625 + }, + { + "epoch": 163.25181598062954, + "grad_norm": 7.399754142767279e-09, + "learning_rate": 0.018389748811384315, + "loss": 0.0, + "num_input_tokens_seen": 57625344, + "step": 33630 + }, + { + "epoch": 163.27602905569006, + "grad_norm": 1.7592379109032663e-08, + "learning_rate": 0.018361498969408658, + "loss": 0.0, + "num_input_tokens_seen": 57633728, + "step": 33635 + }, + { + "epoch": 163.3002421307506, + "grad_norm": 1.0352602330954142e-08, + "learning_rate": 0.01833326942774415, + "loss": 0.0, + "num_input_tokens_seen": 57642400, + "step": 33640 + }, + { + "epoch": 163.32445520581115, + "grad_norm": 1.5894562110929655e-08, + "learning_rate": 0.018305060190744155, + "loss": 0.0, + "num_input_tokens_seen": 57651328, + "step": 33645 + }, + { + "epoch": 163.34866828087166, + "grad_norm": 7.099891341511011e-09, + "learning_rate": 0.018276871262758846, + "loss": 0.0, + "num_input_tokens_seen": 57659680, + "step": 33650 + }, + { + "epoch": 163.3728813559322, + "grad_norm": 1.1497951923900018e-08, + "learning_rate": 0.0182487026481353, + "loss": 0.0, + "num_input_tokens_seen": 57668256, + "step": 33655 + }, + { + "epoch": 163.39709443099272, + "grad_norm": 1.2268680293914258e-08, + "learning_rate": 0.018220554351217538, + "loss": 0.0, + "num_input_tokens_seen": 57676864, + "step": 33660 + }, + { + "epoch": 163.42130750605327, + "grad_norm": 1.3340113902415851e-08, + "learning_rate": 0.01819242637634629, + "loss": 0.0, + "num_input_tokens_seen": 57685888, + "step": 33665 + }, + { + "epoch": 163.4455205811138, + "grad_norm": 1.1435061786357892e-08, + "learning_rate": 0.01816431872785933, + "loss": 0.0, + "num_input_tokens_seen": 57694080, + "step": 33670 + }, + { + "epoch": 163.46973365617433, + "grad_norm": 1.251314252215252e-08, + "learning_rate": 0.018136231410091148, + "loss": 0.0, + "num_input_tokens_seen": 57702944, + "step": 33675 + }, + { + "epoch": 163.49394673123487, + "grad_norm": 7.505981614031043e-09, + "learning_rate": 0.018108164427373175, + "loss": 0.0, + "num_input_tokens_seen": 57711456, + "step": 33680 + }, + { + "epoch": 163.5181598062954, + "grad_norm": 1.0995688803916437e-08, + "learning_rate": 0.01808011778403375, + "loss": 0.0, + "num_input_tokens_seen": 57719744, + "step": 33685 + }, + { + "epoch": 163.54237288135593, + "grad_norm": 1.0014981732808792e-08, + "learning_rate": 0.01805209148439793, + "loss": 0.0, + "num_input_tokens_seen": 57728416, + "step": 33690 + }, + { + "epoch": 163.56658595641647, + "grad_norm": 1.1761302154411624e-08, + "learning_rate": 0.018024085532787757, + "loss": 0.0, + "num_input_tokens_seen": 57737056, + "step": 33695 + }, + { + "epoch": 163.590799031477, + "grad_norm": 1.3272789090024162e-08, + "learning_rate": 0.017996099933522164, + "loss": 0.0, + "num_input_tokens_seen": 57745280, + "step": 33700 + }, + { + "epoch": 163.61501210653753, + "grad_norm": 5.0226334202818634e-09, + "learning_rate": 0.017968134690916775, + "loss": 0.0, + "num_input_tokens_seen": 57754112, + "step": 33705 + }, + { + "epoch": 163.63922518159805, + "grad_norm": 1.0557203999894682e-08, + "learning_rate": 0.017940189809284263, + "loss": 0.0, + "num_input_tokens_seen": 57762976, + "step": 33710 + }, + { + "epoch": 163.6634382566586, + "grad_norm": 4.597837666153737e-09, + "learning_rate": 0.017912265292934024, + "loss": 0.0, + "num_input_tokens_seen": 57771552, + "step": 33715 + }, + { + "epoch": 163.68765133171914, + "grad_norm": 1.5507536588188486e-08, + "learning_rate": 0.017884361146172423, + "loss": 0.0, + "num_input_tokens_seen": 57780032, + "step": 33720 + }, + { + "epoch": 163.71186440677965, + "grad_norm": 9.517249210944101e-09, + "learning_rate": 0.01785647737330261, + "loss": 0.0, + "num_input_tokens_seen": 57789152, + "step": 33725 + }, + { + "epoch": 163.7360774818402, + "grad_norm": 1.848168018625529e-08, + "learning_rate": 0.017828613978624563, + "loss": 0.0, + "num_input_tokens_seen": 57797728, + "step": 33730 + }, + { + "epoch": 163.7602905569007, + "grad_norm": 8.32609448053745e-09, + "learning_rate": 0.01780077096643523, + "loss": 0.0, + "num_input_tokens_seen": 57806080, + "step": 33735 + }, + { + "epoch": 163.78450363196126, + "grad_norm": 1.3873901139049849e-08, + "learning_rate": 0.017772948341028345, + "loss": 0.0, + "num_input_tokens_seen": 57814880, + "step": 33740 + }, + { + "epoch": 163.8087167070218, + "grad_norm": 9.43458910995787e-09, + "learning_rate": 0.01774514610669447, + "loss": 0.0, + "num_input_tokens_seen": 57823808, + "step": 33745 + }, + { + "epoch": 163.83292978208232, + "grad_norm": 7.64819052534449e-09, + "learning_rate": 0.017717364267721112, + "loss": 0.0, + "num_input_tokens_seen": 57832160, + "step": 33750 + }, + { + "epoch": 163.85714285714286, + "grad_norm": 1.2260939818986571e-08, + "learning_rate": 0.017689602828392513, + "loss": 0.0, + "num_input_tokens_seen": 57840608, + "step": 33755 + }, + { + "epoch": 163.88135593220338, + "grad_norm": 9.095297848205064e-09, + "learning_rate": 0.017661861792989897, + "loss": 0.0, + "num_input_tokens_seen": 57848960, + "step": 33760 + }, + { + "epoch": 163.90556900726392, + "grad_norm": 6.631656113142981e-09, + "learning_rate": 0.017634141165791272, + "loss": 0.0, + "num_input_tokens_seen": 57857504, + "step": 33765 + }, + { + "epoch": 163.92978208232446, + "grad_norm": 6.615908265672488e-09, + "learning_rate": 0.017606440951071455, + "loss": 0.0, + "num_input_tokens_seen": 57866048, + "step": 33770 + }, + { + "epoch": 163.95399515738498, + "grad_norm": 7.799404677655275e-09, + "learning_rate": 0.017578761153102213, + "loss": 0.0, + "num_input_tokens_seen": 57874464, + "step": 33775 + }, + { + "epoch": 163.97820823244552, + "grad_norm": 1.567407359459594e-08, + "learning_rate": 0.017551101776152146, + "loss": 0.0, + "num_input_tokens_seen": 57883008, + "step": 33780 + }, + { + "epoch": 164.0048426150121, + "grad_norm": 3.8076727548741474e-08, + "learning_rate": 0.017523462824486608, + "loss": 0.0, + "num_input_tokens_seen": 57891680, + "step": 33785 + }, + { + "epoch": 164.02905569007265, + "grad_norm": 9.165004755118389e-09, + "learning_rate": 0.01749584430236794, + "loss": 0.0, + "num_input_tokens_seen": 57900448, + "step": 33790 + }, + { + "epoch": 164.05326876513317, + "grad_norm": 2.4388320252199946e-09, + "learning_rate": 0.01746824621405524, + "loss": 0.0, + "num_input_tokens_seen": 57909056, + "step": 33795 + }, + { + "epoch": 164.0774818401937, + "grad_norm": 1.0516629345147521e-08, + "learning_rate": 0.017440668563804412, + "loss": 0.0, + "num_input_tokens_seen": 57917728, + "step": 33800 + }, + { + "epoch": 164.0774818401937, + "eval_loss": 1.1952580213546753, + "eval_runtime": 4.6276, + "eval_samples_per_second": 79.307, + "eval_steps_per_second": 19.881, + "num_input_tokens_seen": 57917728, + "step": 33800 + }, + { + "epoch": 164.10169491525423, + "grad_norm": 6.411354558366611e-09, + "learning_rate": 0.017413111355868392, + "loss": 0.0, + "num_input_tokens_seen": 57925952, + "step": 33805 + }, + { + "epoch": 164.12590799031477, + "grad_norm": 7.843508065263904e-09, + "learning_rate": 0.017385574594496748, + "loss": 0.0, + "num_input_tokens_seen": 57934816, + "step": 33810 + }, + { + "epoch": 164.15012106537532, + "grad_norm": 6.440978861377289e-09, + "learning_rate": 0.01735805828393605, + "loss": 0.0, + "num_input_tokens_seen": 57943392, + "step": 33815 + }, + { + "epoch": 164.17433414043583, + "grad_norm": 4.2820924583963915e-09, + "learning_rate": 0.017330562428429667, + "loss": 0.0, + "num_input_tokens_seen": 57951648, + "step": 33820 + }, + { + "epoch": 164.19854721549638, + "grad_norm": 6.056052992420291e-09, + "learning_rate": 0.01730308703221776, + "loss": 0.0, + "num_input_tokens_seen": 57960064, + "step": 33825 + }, + { + "epoch": 164.2227602905569, + "grad_norm": 1.603837418429066e-08, + "learning_rate": 0.01727563209953744, + "loss": 0.0, + "num_input_tokens_seen": 57968640, + "step": 33830 + }, + { + "epoch": 164.24697336561744, + "grad_norm": 8.755866254261946e-09, + "learning_rate": 0.017248197634622535, + "loss": 0.0, + "num_input_tokens_seen": 57977056, + "step": 33835 + }, + { + "epoch": 164.27118644067798, + "grad_norm": 9.485515484186635e-09, + "learning_rate": 0.01722078364170383, + "loss": 0.0, + "num_input_tokens_seen": 57985472, + "step": 33840 + }, + { + "epoch": 164.2953995157385, + "grad_norm": 1.131434324008751e-08, + "learning_rate": 0.017193390125008905, + "loss": 0.0, + "num_input_tokens_seen": 57994048, + "step": 33845 + }, + { + "epoch": 164.31961259079904, + "grad_norm": 8.663522343965724e-09, + "learning_rate": 0.017166017088762153, + "loss": 0.0, + "num_input_tokens_seen": 58002816, + "step": 33850 + }, + { + "epoch": 164.34382566585955, + "grad_norm": 1.2915883473851864e-08, + "learning_rate": 0.017138664537184878, + "loss": 0.0, + "num_input_tokens_seen": 58011296, + "step": 33855 + }, + { + "epoch": 164.3680387409201, + "grad_norm": 8.153334896121578e-09, + "learning_rate": 0.017111332474495172, + "loss": 0.0, + "num_input_tokens_seen": 58019872, + "step": 33860 + }, + { + "epoch": 164.39225181598064, + "grad_norm": 3.1424038926530784e-09, + "learning_rate": 0.017084020904907998, + "loss": 0.0, + "num_input_tokens_seen": 58028000, + "step": 33865 + }, + { + "epoch": 164.41646489104116, + "grad_norm": 4.392135544151188e-09, + "learning_rate": 0.017056729832635103, + "loss": 0.0, + "num_input_tokens_seen": 58036224, + "step": 33870 + }, + { + "epoch": 164.4406779661017, + "grad_norm": 6.992752155099424e-09, + "learning_rate": 0.017029459261885153, + "loss": 0.0, + "num_input_tokens_seen": 58044800, + "step": 33875 + }, + { + "epoch": 164.46489104116222, + "grad_norm": 8.17877321424021e-09, + "learning_rate": 0.01700220919686359, + "loss": 0.0, + "num_input_tokens_seen": 58053248, + "step": 33880 + }, + { + "epoch": 164.48910411622276, + "grad_norm": 7.1672654478049935e-09, + "learning_rate": 0.016974979641772723, + "loss": 0.0, + "num_input_tokens_seen": 58061856, + "step": 33885 + }, + { + "epoch": 164.5133171912833, + "grad_norm": 1.3204012105916263e-08, + "learning_rate": 0.01694777060081169, + "loss": 0.0, + "num_input_tokens_seen": 58070592, + "step": 33890 + }, + { + "epoch": 164.53753026634382, + "grad_norm": 8.910792992367078e-09, + "learning_rate": 0.016920582078176444, + "loss": 0.0, + "num_input_tokens_seen": 58079104, + "step": 33895 + }, + { + "epoch": 164.56174334140437, + "grad_norm": 3.6739307152089395e-09, + "learning_rate": 0.016893414078059863, + "loss": 0.0, + "num_input_tokens_seen": 58087520, + "step": 33900 + }, + { + "epoch": 164.58595641646488, + "grad_norm": 8.92284202080873e-09, + "learning_rate": 0.016866266604651535, + "loss": 0.0, + "num_input_tokens_seen": 58096384, + "step": 33905 + }, + { + "epoch": 164.61016949152543, + "grad_norm": 5.7915481299630756e-09, + "learning_rate": 0.016839139662137976, + "loss": 0.0, + "num_input_tokens_seen": 58105184, + "step": 33910 + }, + { + "epoch": 164.63438256658597, + "grad_norm": 6.0274105706525916e-09, + "learning_rate": 0.01681203325470245, + "loss": 0.0, + "num_input_tokens_seen": 58113856, + "step": 33915 + }, + { + "epoch": 164.65859564164649, + "grad_norm": 5.221556520496051e-09, + "learning_rate": 0.016784947386525157, + "loss": 0.0, + "num_input_tokens_seen": 58122688, + "step": 33920 + }, + { + "epoch": 164.68280871670703, + "grad_norm": 7.379556077324878e-09, + "learning_rate": 0.01675788206178308, + "loss": 0.0, + "num_input_tokens_seen": 58131328, + "step": 33925 + }, + { + "epoch": 164.70702179176754, + "grad_norm": 1.1335583138816219e-08, + "learning_rate": 0.016730837284649986, + "loss": 0.0, + "num_input_tokens_seen": 58139840, + "step": 33930 + }, + { + "epoch": 164.7312348668281, + "grad_norm": 1.0095217106709242e-08, + "learning_rate": 0.016703813059296583, + "loss": 0.0, + "num_input_tokens_seen": 58148480, + "step": 33935 + }, + { + "epoch": 164.75544794188863, + "grad_norm": 6.330445057045608e-09, + "learning_rate": 0.016676809389890294, + "loss": 0.0, + "num_input_tokens_seen": 58157152, + "step": 33940 + }, + { + "epoch": 164.77966101694915, + "grad_norm": 1.21827232746341e-08, + "learning_rate": 0.016649826280595435, + "loss": 0.0, + "num_input_tokens_seen": 58165440, + "step": 33945 + }, + { + "epoch": 164.8038740920097, + "grad_norm": 8.705676179943111e-09, + "learning_rate": 0.016622863735573163, + "loss": 0.0, + "num_input_tokens_seen": 58173920, + "step": 33950 + }, + { + "epoch": 164.8280871670702, + "grad_norm": 6.341331904025083e-09, + "learning_rate": 0.016595921758981395, + "loss": 0.0, + "num_input_tokens_seen": 58182848, + "step": 33955 + }, + { + "epoch": 164.85230024213075, + "grad_norm": 6.043008760059365e-09, + "learning_rate": 0.01656900035497495, + "loss": 0.0, + "num_input_tokens_seen": 58191840, + "step": 33960 + }, + { + "epoch": 164.8765133171913, + "grad_norm": 3.5794160968549704e-09, + "learning_rate": 0.016542099527705485, + "loss": 0.0, + "num_input_tokens_seen": 58200448, + "step": 33965 + }, + { + "epoch": 164.9007263922518, + "grad_norm": 9.59301615921504e-09, + "learning_rate": 0.01651521928132138, + "loss": 0.0, + "num_input_tokens_seen": 58208672, + "step": 33970 + }, + { + "epoch": 164.92493946731236, + "grad_norm": 1.1752118389551924e-08, + "learning_rate": 0.01648835961996794, + "loss": 0.0, + "num_input_tokens_seen": 58216960, + "step": 33975 + }, + { + "epoch": 164.94915254237287, + "grad_norm": 1.3611699323234916e-08, + "learning_rate": 0.016461520547787285, + "loss": 0.0, + "num_input_tokens_seen": 58225504, + "step": 33980 + }, + { + "epoch": 164.97336561743342, + "grad_norm": 7.002515900467188e-09, + "learning_rate": 0.016434702068918266, + "loss": 0.0, + "num_input_tokens_seen": 58234080, + "step": 33985 + }, + { + "epoch": 164.99757869249396, + "grad_norm": 7.888676378797754e-09, + "learning_rate": 0.01640790418749673, + "loss": 0.0, + "num_input_tokens_seen": 58242912, + "step": 33990 + }, + { + "epoch": 165.02421307506054, + "grad_norm": 5.285761606188544e-09, + "learning_rate": 0.016381126907655134, + "loss": 0.0, + "num_input_tokens_seen": 58252256, + "step": 33995 + }, + { + "epoch": 165.04842615012106, + "grad_norm": 4.070678460976751e-09, + "learning_rate": 0.016354370233522948, + "loss": 0.0, + "num_input_tokens_seen": 58261184, + "step": 34000 + }, + { + "epoch": 165.04842615012106, + "eval_loss": 1.1932696104049683, + "eval_runtime": 4.6209, + "eval_samples_per_second": 79.422, + "eval_steps_per_second": 19.91, + "num_input_tokens_seen": 58261184, + "step": 34000 + }, + { + "epoch": 165.0726392251816, + "grad_norm": 1.0275494233269455e-08, + "learning_rate": 0.016327634169226394, + "loss": 0.0, + "num_input_tokens_seen": 58269568, + "step": 34005 + }, + { + "epoch": 165.09685230024212, + "grad_norm": 3.419321048525603e-09, + "learning_rate": 0.016300918718888485, + "loss": 0.0, + "num_input_tokens_seen": 58277920, + "step": 34010 + }, + { + "epoch": 165.12106537530266, + "grad_norm": 4.71985472927372e-09, + "learning_rate": 0.016274223886629052, + "loss": 0.0, + "num_input_tokens_seen": 58286624, + "step": 34015 + }, + { + "epoch": 165.1452784503632, + "grad_norm": 7.537146018421481e-09, + "learning_rate": 0.01624754967656482, + "loss": 0.0, + "num_input_tokens_seen": 58294944, + "step": 34020 + }, + { + "epoch": 165.16949152542372, + "grad_norm": 8.098874459960825e-09, + "learning_rate": 0.016220896092809235, + "loss": 0.0, + "num_input_tokens_seen": 58303776, + "step": 34025 + }, + { + "epoch": 165.19370460048427, + "grad_norm": 1.2974918028874072e-08, + "learning_rate": 0.01619426313947267, + "loss": 0.0, + "num_input_tokens_seen": 58312320, + "step": 34030 + }, + { + "epoch": 165.21791767554478, + "grad_norm": 4.13988736625015e-08, + "learning_rate": 0.016167650820662228, + "loss": 0.0, + "num_input_tokens_seen": 58321024, + "step": 34035 + }, + { + "epoch": 165.24213075060533, + "grad_norm": 7.1161769810146325e-09, + "learning_rate": 0.016141059140481855, + "loss": 0.0, + "num_input_tokens_seen": 58329536, + "step": 34040 + }, + { + "epoch": 165.26634382566587, + "grad_norm": 1.298440022168279e-08, + "learning_rate": 0.016114488103032374, + "loss": 0.0, + "num_input_tokens_seen": 58338144, + "step": 34045 + }, + { + "epoch": 165.2905569007264, + "grad_norm": 8.739339030228166e-09, + "learning_rate": 0.016087937712411293, + "loss": 0.0, + "num_input_tokens_seen": 58346816, + "step": 34050 + }, + { + "epoch": 165.31476997578693, + "grad_norm": 4.043684498356015e-09, + "learning_rate": 0.01606140797271308, + "loss": 0.0, + "num_input_tokens_seen": 58355456, + "step": 34055 + }, + { + "epoch": 165.33898305084745, + "grad_norm": 1.0006266926154694e-08, + "learning_rate": 0.01603489888802897, + "loss": 0.0, + "num_input_tokens_seen": 58364000, + "step": 34060 + }, + { + "epoch": 165.363196125908, + "grad_norm": 9.028350511641747e-09, + "learning_rate": 0.016008410462446918, + "loss": 0.0, + "num_input_tokens_seen": 58372256, + "step": 34065 + }, + { + "epoch": 165.38740920096853, + "grad_norm": 3.962507655330683e-09, + "learning_rate": 0.01598194270005185, + "loss": 0.0, + "num_input_tokens_seen": 58380544, + "step": 34070 + }, + { + "epoch": 165.41162227602905, + "grad_norm": 1.803805638189715e-08, + "learning_rate": 0.015955495604925356, + "loss": 0.0, + "num_input_tokens_seen": 58389152, + "step": 34075 + }, + { + "epoch": 165.4358353510896, + "grad_norm": 1.2695581474986284e-08, + "learning_rate": 0.01592906918114598, + "loss": 0.0, + "num_input_tokens_seen": 58397728, + "step": 34080 + }, + { + "epoch": 165.4600484261501, + "grad_norm": 6.293097154497218e-09, + "learning_rate": 0.015902663432788965, + "loss": 0.0, + "num_input_tokens_seen": 58406368, + "step": 34085 + }, + { + "epoch": 165.48426150121065, + "grad_norm": 8.117065242174704e-09, + "learning_rate": 0.01587627836392643, + "loss": 0.0, + "num_input_tokens_seen": 58414752, + "step": 34090 + }, + { + "epoch": 165.5084745762712, + "grad_norm": 1.2187587827838797e-08, + "learning_rate": 0.01584991397862726, + "loss": 0.0, + "num_input_tokens_seen": 58423200, + "step": 34095 + }, + { + "epoch": 165.5326876513317, + "grad_norm": 6.884702141718435e-09, + "learning_rate": 0.015823570280957214, + "loss": 0.0, + "num_input_tokens_seen": 58431648, + "step": 34100 + }, + { + "epoch": 165.55690072639226, + "grad_norm": 4.764310279625761e-09, + "learning_rate": 0.015797247274978766, + "loss": 0.0, + "num_input_tokens_seen": 58440352, + "step": 34105 + }, + { + "epoch": 165.58111380145277, + "grad_norm": 9.908998954699655e-09, + "learning_rate": 0.015770944964751326, + "loss": 0.0, + "num_input_tokens_seen": 58448960, + "step": 34110 + }, + { + "epoch": 165.60532687651332, + "grad_norm": 4.287449506534813e-09, + "learning_rate": 0.015744663354330956, + "loss": 0.0, + "num_input_tokens_seen": 58457536, + "step": 34115 + }, + { + "epoch": 165.62953995157386, + "grad_norm": 1.2675666738459768e-08, + "learning_rate": 0.015718402447770664, + "loss": 0.0, + "num_input_tokens_seen": 58466272, + "step": 34120 + }, + { + "epoch": 165.65375302663438, + "grad_norm": 7.46879180724136e-09, + "learning_rate": 0.015692162249120224, + "loss": 0.0, + "num_input_tokens_seen": 58475040, + "step": 34125 + }, + { + "epoch": 165.67796610169492, + "grad_norm": 1.0313486065172128e-08, + "learning_rate": 0.01566594276242615, + "loss": 0.0, + "num_input_tokens_seen": 58483584, + "step": 34130 + }, + { + "epoch": 165.70217917675544, + "grad_norm": 8.449810628974319e-09, + "learning_rate": 0.015639743991731857, + "loss": 0.0, + "num_input_tokens_seen": 58492480, + "step": 34135 + }, + { + "epoch": 165.72639225181598, + "grad_norm": 1.0008580630938013e-08, + "learning_rate": 0.01561356594107755, + "loss": 0.0, + "num_input_tokens_seen": 58501152, + "step": 34140 + }, + { + "epoch": 165.75060532687652, + "grad_norm": 1.1284709167114215e-08, + "learning_rate": 0.015587408614500147, + "loss": 0.0, + "num_input_tokens_seen": 58509824, + "step": 34145 + }, + { + "epoch": 165.77481840193704, + "grad_norm": 6.101007254954993e-09, + "learning_rate": 0.015561272016033505, + "loss": 0.0, + "num_input_tokens_seen": 58518720, + "step": 34150 + }, + { + "epoch": 165.79903147699758, + "grad_norm": 8.638044945996626e-09, + "learning_rate": 0.015535156149708167, + "loss": 0.0, + "num_input_tokens_seen": 58527648, + "step": 34155 + }, + { + "epoch": 165.8232445520581, + "grad_norm": 6.960719556303729e-09, + "learning_rate": 0.015509061019551528, + "loss": 0.0, + "num_input_tokens_seen": 58536352, + "step": 34160 + }, + { + "epoch": 165.84745762711864, + "grad_norm": 3.813811932928957e-09, + "learning_rate": 0.015482986629587818, + "loss": 0.0, + "num_input_tokens_seen": 58544480, + "step": 34165 + }, + { + "epoch": 165.8716707021792, + "grad_norm": 1.0923010940189215e-08, + "learning_rate": 0.01545693298383799, + "loss": 0.0, + "num_input_tokens_seen": 58553120, + "step": 34170 + }, + { + "epoch": 165.8958837772397, + "grad_norm": 1.8465760476260584e-08, + "learning_rate": 0.015430900086319858, + "loss": 0.0, + "num_input_tokens_seen": 58561344, + "step": 34175 + }, + { + "epoch": 165.92009685230025, + "grad_norm": 9.043164439503926e-09, + "learning_rate": 0.015404887941048084, + "loss": 0.0, + "num_input_tokens_seen": 58569568, + "step": 34180 + }, + { + "epoch": 165.94430992736076, + "grad_norm": 1.26952395262947e-08, + "learning_rate": 0.01537889655203397, + "loss": 0.0, + "num_input_tokens_seen": 58578144, + "step": 34185 + }, + { + "epoch": 165.9685230024213, + "grad_norm": 1.0767221780838554e-08, + "learning_rate": 0.015352925923285798, + "loss": 0.0, + "num_input_tokens_seen": 58586816, + "step": 34190 + }, + { + "epoch": 165.99273607748185, + "grad_norm": 1.7392977724739467e-08, + "learning_rate": 0.015326976058808511, + "loss": 0.0, + "num_input_tokens_seen": 58595168, + "step": 34195 + }, + { + "epoch": 166.01937046004844, + "grad_norm": 6.2864082828184564e-09, + "learning_rate": 0.015301046962603908, + "loss": 0.0, + "num_input_tokens_seen": 58604352, + "step": 34200 + }, + { + "epoch": 166.01937046004844, + "eval_loss": 1.1970213651657104, + "eval_runtime": 4.6202, + "eval_samples_per_second": 79.435, + "eval_steps_per_second": 19.913, + "num_input_tokens_seen": 58604352, + "step": 34200 + }, + { + "epoch": 166.04358353510895, + "grad_norm": 6.56276677446499e-09, + "learning_rate": 0.015275138638670626, + "loss": 0.0, + "num_input_tokens_seen": 58612640, + "step": 34205 + }, + { + "epoch": 166.0677966101695, + "grad_norm": 5.622577958774855e-09, + "learning_rate": 0.015249251091004001, + "loss": 0.0, + "num_input_tokens_seen": 58621440, + "step": 34210 + }, + { + "epoch": 166.09200968523, + "grad_norm": 1.059358289978718e-08, + "learning_rate": 0.01522338432359624, + "loss": 0.0, + "num_input_tokens_seen": 58629888, + "step": 34215 + }, + { + "epoch": 166.11622276029055, + "grad_norm": 6.2422440549880776e-09, + "learning_rate": 0.01519753834043635, + "loss": 0.0, + "num_input_tokens_seen": 58638400, + "step": 34220 + }, + { + "epoch": 166.1404358353511, + "grad_norm": 5.384025669741277e-09, + "learning_rate": 0.015171713145510095, + "loss": 0.0, + "num_input_tokens_seen": 58646784, + "step": 34225 + }, + { + "epoch": 166.16464891041161, + "grad_norm": 6.0124416556561755e-09, + "learning_rate": 0.01514590874279999, + "loss": 0.0, + "num_input_tokens_seen": 58655552, + "step": 34230 + }, + { + "epoch": 166.18886198547216, + "grad_norm": 1.5865454727759243e-08, + "learning_rate": 0.015120125136285467, + "loss": 0.0, + "num_input_tokens_seen": 58664000, + "step": 34235 + }, + { + "epoch": 166.21307506053267, + "grad_norm": 5.761158217154616e-09, + "learning_rate": 0.015094362329942629, + "loss": 0.0, + "num_input_tokens_seen": 58672736, + "step": 34240 + }, + { + "epoch": 166.23728813559322, + "grad_norm": 1.3970976375787814e-08, + "learning_rate": 0.01506862032774448, + "loss": 0.0, + "num_input_tokens_seen": 58681280, + "step": 34245 + }, + { + "epoch": 166.26150121065376, + "grad_norm": 6.001959373946875e-09, + "learning_rate": 0.015042899133660697, + "loss": 0.0, + "num_input_tokens_seen": 58689600, + "step": 34250 + }, + { + "epoch": 166.28571428571428, + "grad_norm": 4.396960129326999e-09, + "learning_rate": 0.01501719875165789, + "loss": 0.0, + "num_input_tokens_seen": 58698656, + "step": 34255 + }, + { + "epoch": 166.30992736077482, + "grad_norm": 5.8451750106769396e-09, + "learning_rate": 0.014991519185699286, + "loss": 0.0, + "num_input_tokens_seen": 58707744, + "step": 34260 + }, + { + "epoch": 166.33414043583534, + "grad_norm": 7.930080592188915e-09, + "learning_rate": 0.014965860439745054, + "loss": 0.0, + "num_input_tokens_seen": 58715968, + "step": 34265 + }, + { + "epoch": 166.35835351089588, + "grad_norm": 1.495360280046043e-08, + "learning_rate": 0.01494022251775211, + "loss": 0.0, + "num_input_tokens_seen": 58724448, + "step": 34270 + }, + { + "epoch": 166.38256658595643, + "grad_norm": 1.2535112503542223e-08, + "learning_rate": 0.014914605423674109, + "loss": 0.0, + "num_input_tokens_seen": 58732960, + "step": 34275 + }, + { + "epoch": 166.40677966101694, + "grad_norm": 9.196971184621816e-09, + "learning_rate": 0.014889009161461525, + "loss": 0.0, + "num_input_tokens_seen": 58741952, + "step": 34280 + }, + { + "epoch": 166.43099273607749, + "grad_norm": 9.8679331372864e-09, + "learning_rate": 0.014863433735061665, + "loss": 0.0, + "num_input_tokens_seen": 58750688, + "step": 34285 + }, + { + "epoch": 166.455205811138, + "grad_norm": 1.0200023936590696e-08, + "learning_rate": 0.014837879148418541, + "loss": 0.0, + "num_input_tokens_seen": 58759328, + "step": 34290 + }, + { + "epoch": 166.47941888619854, + "grad_norm": 1.7122960827009592e-08, + "learning_rate": 0.01481234540547302, + "loss": 0.0, + "num_input_tokens_seen": 58767904, + "step": 34295 + }, + { + "epoch": 166.5036319612591, + "grad_norm": 1.2206426092120637e-08, + "learning_rate": 0.014786832510162717, + "loss": 0.0, + "num_input_tokens_seen": 58776736, + "step": 34300 + }, + { + "epoch": 166.5278450363196, + "grad_norm": 1.6542966108090695e-08, + "learning_rate": 0.014761340466422017, + "loss": 0.0, + "num_input_tokens_seen": 58785152, + "step": 34305 + }, + { + "epoch": 166.55205811138015, + "grad_norm": 1.6154576343296867e-08, + "learning_rate": 0.014735869278182144, + "loss": 0.0, + "num_input_tokens_seen": 58793472, + "step": 34310 + }, + { + "epoch": 166.57627118644066, + "grad_norm": 7.3523738208791656e-09, + "learning_rate": 0.014710418949371057, + "loss": 0.0, + "num_input_tokens_seen": 58802144, + "step": 34315 + }, + { + "epoch": 166.6004842615012, + "grad_norm": 8.389995365121194e-09, + "learning_rate": 0.014684989483913495, + "loss": 0.0, + "num_input_tokens_seen": 58810720, + "step": 34320 + }, + { + "epoch": 166.62469733656175, + "grad_norm": 1.6248980827526793e-08, + "learning_rate": 0.014659580885731077, + "loss": 0.0, + "num_input_tokens_seen": 58819232, + "step": 34325 + }, + { + "epoch": 166.64891041162227, + "grad_norm": 5.503657085625946e-09, + "learning_rate": 0.014634193158742047, + "loss": 0.0, + "num_input_tokens_seen": 58827360, + "step": 34330 + }, + { + "epoch": 166.6731234866828, + "grad_norm": 1.132654769975261e-08, + "learning_rate": 0.014608826306861576, + "loss": 0.0, + "num_input_tokens_seen": 58835808, + "step": 34335 + }, + { + "epoch": 166.69733656174333, + "grad_norm": 1.0042906062324164e-08, + "learning_rate": 0.014583480334001486, + "loss": 0.0, + "num_input_tokens_seen": 58844608, + "step": 34340 + }, + { + "epoch": 166.72154963680387, + "grad_norm": 1.7492771675620133e-08, + "learning_rate": 0.014558155244070496, + "loss": 0.0, + "num_input_tokens_seen": 58852928, + "step": 34345 + }, + { + "epoch": 166.74576271186442, + "grad_norm": 8.829301734181172e-09, + "learning_rate": 0.014532851040974036, + "loss": 0.0, + "num_input_tokens_seen": 58861344, + "step": 34350 + }, + { + "epoch": 166.76997578692493, + "grad_norm": 3.172985652000193e-09, + "learning_rate": 0.014507567728614335, + "loss": 0.0, + "num_input_tokens_seen": 58869952, + "step": 34355 + }, + { + "epoch": 166.79418886198548, + "grad_norm": 1.1244200237570112e-08, + "learning_rate": 0.01448230531089037, + "loss": 0.0, + "num_input_tokens_seen": 58878816, + "step": 34360 + }, + { + "epoch": 166.818401937046, + "grad_norm": 6.458608314829917e-09, + "learning_rate": 0.014457063791697993, + "loss": 0.0, + "num_input_tokens_seen": 58887456, + "step": 34365 + }, + { + "epoch": 166.84261501210653, + "grad_norm": 6.121755991017608e-09, + "learning_rate": 0.01443184317492971, + "loss": 0.0, + "num_input_tokens_seen": 58895584, + "step": 34370 + }, + { + "epoch": 166.86682808716708, + "grad_norm": 4.8792920814833e-09, + "learning_rate": 0.014406643464474822, + "loss": 0.0, + "num_input_tokens_seen": 58903936, + "step": 34375 + }, + { + "epoch": 166.8910411622276, + "grad_norm": 9.734472783407e-09, + "learning_rate": 0.014381464664219539, + "loss": 0.0, + "num_input_tokens_seen": 58912480, + "step": 34380 + }, + { + "epoch": 166.91525423728814, + "grad_norm": 7.434266979799986e-09, + "learning_rate": 0.014356306778046656, + "loss": 0.0, + "num_input_tokens_seen": 58920576, + "step": 34385 + }, + { + "epoch": 166.93946731234865, + "grad_norm": 5.9541616082015025e-09, + "learning_rate": 0.014331169809835885, + "loss": 0.0, + "num_input_tokens_seen": 58928736, + "step": 34390 + }, + { + "epoch": 166.9636803874092, + "grad_norm": 3.407595983162537e-09, + "learning_rate": 0.014306053763463644, + "loss": 0.0, + "num_input_tokens_seen": 58937568, + "step": 34395 + }, + { + "epoch": 166.98789346246974, + "grad_norm": 6.375511230061193e-09, + "learning_rate": 0.014280958642803147, + "loss": 0.0, + "num_input_tokens_seen": 58946112, + "step": 34400 + }, + { + "epoch": 166.98789346246974, + "eval_loss": 1.1914087533950806, + "eval_runtime": 4.6211, + "eval_samples_per_second": 79.419, + "eval_steps_per_second": 19.909, + "num_input_tokens_seen": 58946112, + "step": 34400 + }, + { + "epoch": 167.01452784503633, + "grad_norm": 1.063682208979344e-08, + "learning_rate": 0.014255884451724404, + "loss": 0.0, + "num_input_tokens_seen": 58955104, + "step": 34405 + }, + { + "epoch": 167.03874092009684, + "grad_norm": 9.244872423153083e-09, + "learning_rate": 0.014230831194094101, + "loss": 0.0, + "num_input_tokens_seen": 58963520, + "step": 34410 + }, + { + "epoch": 167.0629539951574, + "grad_norm": 1.6440786509974714e-08, + "learning_rate": 0.014205798873775865, + "loss": 0.0, + "num_input_tokens_seen": 58972128, + "step": 34415 + }, + { + "epoch": 167.08716707021793, + "grad_norm": 1.326340548502003e-08, + "learning_rate": 0.014180787494629893, + "loss": 0.0, + "num_input_tokens_seen": 58980672, + "step": 34420 + }, + { + "epoch": 167.11138014527845, + "grad_norm": 1.2477386235332233e-08, + "learning_rate": 0.014155797060513314, + "loss": 0.0, + "num_input_tokens_seen": 58988896, + "step": 34425 + }, + { + "epoch": 167.135593220339, + "grad_norm": 9.193707128929418e-09, + "learning_rate": 0.014130827575279963, + "loss": 0.0, + "num_input_tokens_seen": 58997824, + "step": 34430 + }, + { + "epoch": 167.1598062953995, + "grad_norm": 1.1688205070470303e-08, + "learning_rate": 0.014105879042780427, + "loss": 0.0, + "num_input_tokens_seen": 59006560, + "step": 34435 + }, + { + "epoch": 167.18401937046005, + "grad_norm": 1.4308508156091193e-08, + "learning_rate": 0.014080951466862113, + "loss": 0.0, + "num_input_tokens_seen": 59014944, + "step": 34440 + }, + { + "epoch": 167.2082324455206, + "grad_norm": 7.504173282768534e-09, + "learning_rate": 0.014056044851369126, + "loss": 0.0, + "num_input_tokens_seen": 59023520, + "step": 34445 + }, + { + "epoch": 167.2324455205811, + "grad_norm": 6.432692600810697e-09, + "learning_rate": 0.014031159200142428, + "loss": 0.0, + "num_input_tokens_seen": 59031968, + "step": 34450 + }, + { + "epoch": 167.25665859564165, + "grad_norm": 7.83979015039904e-09, + "learning_rate": 0.014006294517019667, + "loss": 0.0, + "num_input_tokens_seen": 59040352, + "step": 34455 + }, + { + "epoch": 167.28087167070217, + "grad_norm": 9.479157903058422e-09, + "learning_rate": 0.013981450805835276, + "loss": 0.0, + "num_input_tokens_seen": 59048800, + "step": 34460 + }, + { + "epoch": 167.3050847457627, + "grad_norm": 9.355034968905329e-09, + "learning_rate": 0.01395662807042049, + "loss": 0.0, + "num_input_tokens_seen": 59057728, + "step": 34465 + }, + { + "epoch": 167.32929782082326, + "grad_norm": 9.804473677377246e-09, + "learning_rate": 0.013931826314603296, + "loss": 0.0, + "num_input_tokens_seen": 59066560, + "step": 34470 + }, + { + "epoch": 167.35351089588377, + "grad_norm": 5.577856398986114e-09, + "learning_rate": 0.013907045542208401, + "loss": 0.0, + "num_input_tokens_seen": 59074816, + "step": 34475 + }, + { + "epoch": 167.37772397094432, + "grad_norm": 8.389958949805987e-09, + "learning_rate": 0.013882285757057333, + "loss": 0.0, + "num_input_tokens_seen": 59084032, + "step": 34480 + }, + { + "epoch": 167.40193704600483, + "grad_norm": 1.922101233731155e-08, + "learning_rate": 0.013857546962968403, + "loss": 0.0, + "num_input_tokens_seen": 59092576, + "step": 34485 + }, + { + "epoch": 167.42615012106538, + "grad_norm": 2.6099169492255214e-09, + "learning_rate": 0.013832829163756577, + "loss": 0.0, + "num_input_tokens_seen": 59101152, + "step": 34490 + }, + { + "epoch": 167.45036319612592, + "grad_norm": 1.293428475435121e-08, + "learning_rate": 0.013808132363233689, + "loss": 0.0, + "num_input_tokens_seen": 59109888, + "step": 34495 + }, + { + "epoch": 167.47457627118644, + "grad_norm": 4.28390389828337e-09, + "learning_rate": 0.013783456565208256, + "loss": 0.0, + "num_input_tokens_seen": 59118432, + "step": 34500 + }, + { + "epoch": 167.49878934624698, + "grad_norm": 5.385300205773547e-09, + "learning_rate": 0.01375880177348564, + "loss": 0.0, + "num_input_tokens_seen": 59127008, + "step": 34505 + }, + { + "epoch": 167.5230024213075, + "grad_norm": 1.0884060763771686e-08, + "learning_rate": 0.013734167991867928, + "loss": 0.0, + "num_input_tokens_seen": 59135616, + "step": 34510 + }, + { + "epoch": 167.54721549636804, + "grad_norm": 7.720296402169424e-09, + "learning_rate": 0.013709555224153935, + "loss": 0.0, + "num_input_tokens_seen": 59144000, + "step": 34515 + }, + { + "epoch": 167.57142857142858, + "grad_norm": 4.9884980590775285e-09, + "learning_rate": 0.013684963474139222, + "loss": 0.0, + "num_input_tokens_seen": 59152672, + "step": 34520 + }, + { + "epoch": 167.5956416464891, + "grad_norm": 3.495843836631707e-09, + "learning_rate": 0.013660392745616224, + "loss": 0.0, + "num_input_tokens_seen": 59161152, + "step": 34525 + }, + { + "epoch": 167.61985472154964, + "grad_norm": 9.103089837481093e-09, + "learning_rate": 0.013635843042373974, + "loss": 0.0, + "num_input_tokens_seen": 59169664, + "step": 34530 + }, + { + "epoch": 167.64406779661016, + "grad_norm": 4.575948064911017e-09, + "learning_rate": 0.01361131436819843, + "loss": 0.0, + "num_input_tokens_seen": 59178336, + "step": 34535 + }, + { + "epoch": 167.6682808716707, + "grad_norm": 1.5727197322235043e-08, + "learning_rate": 0.013586806726872147, + "loss": 0.0, + "num_input_tokens_seen": 59186496, + "step": 34540 + }, + { + "epoch": 167.69249394673125, + "grad_norm": 6.193838331114421e-09, + "learning_rate": 0.013562320122174537, + "loss": 0.0, + "num_input_tokens_seen": 59195200, + "step": 34545 + }, + { + "epoch": 167.71670702179176, + "grad_norm": 5.3249915588082786e-09, + "learning_rate": 0.013537854557881762, + "loss": 0.0, + "num_input_tokens_seen": 59203744, + "step": 34550 + }, + { + "epoch": 167.7409200968523, + "grad_norm": 1.262666948775859e-08, + "learning_rate": 0.013513410037766687, + "loss": 0.0, + "num_input_tokens_seen": 59212384, + "step": 34555 + }, + { + "epoch": 167.76513317191282, + "grad_norm": 3.788505065216441e-09, + "learning_rate": 0.013488986565598998, + "loss": 0.0, + "num_input_tokens_seen": 59220992, + "step": 34560 + }, + { + "epoch": 167.78934624697337, + "grad_norm": 4.6940389353267165e-09, + "learning_rate": 0.013464584145145097, + "loss": 0.0, + "num_input_tokens_seen": 59229472, + "step": 34565 + }, + { + "epoch": 167.8135593220339, + "grad_norm": 6.266881236172139e-09, + "learning_rate": 0.013440202780168109, + "loss": 0.0, + "num_input_tokens_seen": 59237728, + "step": 34570 + }, + { + "epoch": 167.83777239709443, + "grad_norm": 1.1751786210822956e-08, + "learning_rate": 0.01341584247442799, + "loss": 0.0, + "num_input_tokens_seen": 59246560, + "step": 34575 + }, + { + "epoch": 167.86198547215497, + "grad_norm": 9.975472003986852e-09, + "learning_rate": 0.013391503231681355, + "loss": 0.0, + "num_input_tokens_seen": 59255264, + "step": 34580 + }, + { + "epoch": 167.88619854721549, + "grad_norm": 9.112790522181058e-09, + "learning_rate": 0.013367185055681685, + "loss": 0.0, + "num_input_tokens_seen": 59263808, + "step": 34585 + }, + { + "epoch": 167.91041162227603, + "grad_norm": 1.8032553228408688e-08, + "learning_rate": 0.013342887950179095, + "loss": 0.0, + "num_input_tokens_seen": 59272480, + "step": 34590 + }, + { + "epoch": 167.93462469733657, + "grad_norm": 9.443162696243235e-09, + "learning_rate": 0.013318611918920554, + "loss": 0.0, + "num_input_tokens_seen": 59280640, + "step": 34595 + }, + { + "epoch": 167.9588377723971, + "grad_norm": 7.930220924379228e-09, + "learning_rate": 0.01329435696564965, + "loss": 0.0, + "num_input_tokens_seen": 59289344, + "step": 34600 + }, + { + "epoch": 167.9588377723971, + "eval_loss": 1.1899081468582153, + "eval_runtime": 4.6345, + "eval_samples_per_second": 79.188, + "eval_steps_per_second": 19.851, + "num_input_tokens_seen": 59289344, + "step": 34600 + }, + { + "epoch": 167.98305084745763, + "grad_norm": 9.108917176092746e-09, + "learning_rate": 0.013270123094106894, + "loss": 0.0, + "num_input_tokens_seen": 59297856, + "step": 34605 + }, + { + "epoch": 168.00968523002422, + "grad_norm": 1.6598376006982107e-08, + "learning_rate": 0.013245910308029395, + "loss": 0.0, + "num_input_tokens_seen": 59306656, + "step": 34610 + }, + { + "epoch": 168.03389830508473, + "grad_norm": 8.407012863642649e-09, + "learning_rate": 0.0132217186111511, + "loss": 0.0, + "num_input_tokens_seen": 59315104, + "step": 34615 + }, + { + "epoch": 168.05811138014528, + "grad_norm": 1.0780817127908904e-08, + "learning_rate": 0.013197548007202626, + "loss": 0.0, + "num_input_tokens_seen": 59323552, + "step": 34620 + }, + { + "epoch": 168.08232445520582, + "grad_norm": 1.1315746561990636e-08, + "learning_rate": 0.01317339849991142, + "loss": 0.0, + "num_input_tokens_seen": 59332160, + "step": 34625 + }, + { + "epoch": 168.10653753026634, + "grad_norm": 2.5972792805362133e-09, + "learning_rate": 0.013149270093001675, + "loss": 0.0, + "num_input_tokens_seen": 59340736, + "step": 34630 + }, + { + "epoch": 168.13075060532688, + "grad_norm": 8.467917034238326e-09, + "learning_rate": 0.013125162790194227, + "loss": 0.0, + "num_input_tokens_seen": 59349504, + "step": 34635 + }, + { + "epoch": 168.1549636803874, + "grad_norm": 9.563967395820328e-09, + "learning_rate": 0.01310107659520674, + "loss": 0.0, + "num_input_tokens_seen": 59357920, + "step": 34640 + }, + { + "epoch": 168.17917675544794, + "grad_norm": 1.6831338101042093e-08, + "learning_rate": 0.013077011511753655, + "loss": 0.0, + "num_input_tokens_seen": 59366528, + "step": 34645 + }, + { + "epoch": 168.20338983050848, + "grad_norm": 8.574953191953227e-09, + "learning_rate": 0.013052967543546056, + "loss": 0.0, + "num_input_tokens_seen": 59375136, + "step": 34650 + }, + { + "epoch": 168.227602905569, + "grad_norm": 9.688077895475544e-09, + "learning_rate": 0.01302894469429186, + "loss": 0.0, + "num_input_tokens_seen": 59383808, + "step": 34655 + }, + { + "epoch": 168.25181598062954, + "grad_norm": 7.036098814694469e-09, + "learning_rate": 0.013004942967695653, + "loss": 0.0, + "num_input_tokens_seen": 59392640, + "step": 34660 + }, + { + "epoch": 168.27602905569006, + "grad_norm": 1.0639483072338862e-08, + "learning_rate": 0.012980962367458859, + "loss": 0.0, + "num_input_tokens_seen": 59400928, + "step": 34665 + }, + { + "epoch": 168.3002421307506, + "grad_norm": 1.1657809828591326e-08, + "learning_rate": 0.012957002897279567, + "loss": 0.0, + "num_input_tokens_seen": 59409184, + "step": 34670 + }, + { + "epoch": 168.32445520581115, + "grad_norm": 1.1299319702118282e-08, + "learning_rate": 0.012933064560852576, + "loss": 0.0, + "num_input_tokens_seen": 59417824, + "step": 34675 + }, + { + "epoch": 168.34866828087166, + "grad_norm": 1.1868156235550487e-08, + "learning_rate": 0.012909147361869527, + "loss": 0.0, + "num_input_tokens_seen": 59426368, + "step": 34680 + }, + { + "epoch": 168.3728813559322, + "grad_norm": 1.3570615742253267e-08, + "learning_rate": 0.012885251304018774, + "loss": 0.0, + "num_input_tokens_seen": 59434432, + "step": 34685 + }, + { + "epoch": 168.39709443099272, + "grad_norm": 6.0556608616479934e-09, + "learning_rate": 0.012861376390985335, + "loss": 0.0, + "num_input_tokens_seen": 59442976, + "step": 34690 + }, + { + "epoch": 168.42130750605327, + "grad_norm": 1.3412410737601022e-08, + "learning_rate": 0.012837522626451063, + "loss": 0.0, + "num_input_tokens_seen": 59451616, + "step": 34695 + }, + { + "epoch": 168.4455205811138, + "grad_norm": 4.46490000527433e-09, + "learning_rate": 0.01281369001409447, + "loss": 0.0, + "num_input_tokens_seen": 59460032, + "step": 34700 + }, + { + "epoch": 168.46973365617433, + "grad_norm": 5.846825246180742e-09, + "learning_rate": 0.012789878557590877, + "loss": 0.0, + "num_input_tokens_seen": 59468800, + "step": 34705 + }, + { + "epoch": 168.49394673123487, + "grad_norm": 1.5200983582985828e-08, + "learning_rate": 0.012766088260612334, + "loss": 0.0, + "num_input_tokens_seen": 59477920, + "step": 34710 + }, + { + "epoch": 168.5181598062954, + "grad_norm": 7.279248315228415e-09, + "learning_rate": 0.012742319126827523, + "loss": 0.0, + "num_input_tokens_seen": 59486336, + "step": 34715 + }, + { + "epoch": 168.54237288135593, + "grad_norm": 9.529895983462211e-09, + "learning_rate": 0.012718571159902008, + "loss": 0.0, + "num_input_tokens_seen": 59494784, + "step": 34720 + }, + { + "epoch": 168.56658595641647, + "grad_norm": 1.09100843914689e-08, + "learning_rate": 0.01269484436349803, + "loss": 0.0, + "num_input_tokens_seen": 59502976, + "step": 34725 + }, + { + "epoch": 168.590799031477, + "grad_norm": 6.430710186577926e-09, + "learning_rate": 0.012671138741274528, + "loss": 0.0, + "num_input_tokens_seen": 59511648, + "step": 34730 + }, + { + "epoch": 168.61501210653753, + "grad_norm": 1.305018226815946e-08, + "learning_rate": 0.012647454296887194, + "loss": 0.0, + "num_input_tokens_seen": 59520320, + "step": 34735 + }, + { + "epoch": 168.63922518159805, + "grad_norm": 7.675176050270238e-09, + "learning_rate": 0.012623791033988507, + "loss": 0.0, + "num_input_tokens_seen": 59528704, + "step": 34740 + }, + { + "epoch": 168.6634382566586, + "grad_norm": 9.004483381147566e-09, + "learning_rate": 0.012600148956227597, + "loss": 0.0, + "num_input_tokens_seen": 59537248, + "step": 34745 + }, + { + "epoch": 168.68765133171914, + "grad_norm": 1.0027377150834127e-08, + "learning_rate": 0.012576528067250414, + "loss": 0.0, + "num_input_tokens_seen": 59545824, + "step": 34750 + }, + { + "epoch": 168.71186440677965, + "grad_norm": 1.300809682192039e-08, + "learning_rate": 0.012552928370699561, + "loss": 0.0, + "num_input_tokens_seen": 59554208, + "step": 34755 + }, + { + "epoch": 168.7360774818402, + "grad_norm": 1.1880977979217278e-08, + "learning_rate": 0.012529349870214411, + "loss": 0.0, + "num_input_tokens_seen": 59562528, + "step": 34760 + }, + { + "epoch": 168.7602905569007, + "grad_norm": 4.01399136151781e-09, + "learning_rate": 0.012505792569431106, + "loss": 0.0, + "num_input_tokens_seen": 59570848, + "step": 34765 + }, + { + "epoch": 168.78450363196126, + "grad_norm": 9.438860359978207e-09, + "learning_rate": 0.012482256471982422, + "loss": 0.0, + "num_input_tokens_seen": 59579872, + "step": 34770 + }, + { + "epoch": 168.8087167070218, + "grad_norm": 7.462538142988251e-09, + "learning_rate": 0.012458741581497956, + "loss": 0.0, + "num_input_tokens_seen": 59588448, + "step": 34775 + }, + { + "epoch": 168.83292978208232, + "grad_norm": 3.5831271283370825e-09, + "learning_rate": 0.012435247901603974, + "loss": 0.0, + "num_input_tokens_seen": 59597184, + "step": 34780 + }, + { + "epoch": 168.85714285714286, + "grad_norm": 6.296773769065567e-09, + "learning_rate": 0.012411775435923528, + "loss": 0.0, + "num_input_tokens_seen": 59605696, + "step": 34785 + }, + { + "epoch": 168.88135593220338, + "grad_norm": 9.09465747156446e-09, + "learning_rate": 0.012388324188076354, + "loss": 0.0, + "num_input_tokens_seen": 59614624, + "step": 34790 + }, + { + "epoch": 168.90556900726392, + "grad_norm": 1.5294261856979574e-08, + "learning_rate": 0.012364894161678913, + "loss": 0.0, + "num_input_tokens_seen": 59623136, + "step": 34795 + }, + { + "epoch": 168.92978208232446, + "grad_norm": 9.704935521881453e-09, + "learning_rate": 0.012341485360344445, + "loss": 0.0, + "num_input_tokens_seen": 59631584, + "step": 34800 + }, + { + "epoch": 168.92978208232446, + "eval_loss": 1.192289113998413, + "eval_runtime": 4.6262, + "eval_samples_per_second": 79.33, + "eval_steps_per_second": 19.887, + "num_input_tokens_seen": 59631584, + "step": 34800 + }, + { + "epoch": 168.95399515738498, + "grad_norm": 1.541899585788542e-08, + "learning_rate": 0.01231809778768283, + "loss": 0.0, + "num_input_tokens_seen": 59640480, + "step": 34805 + }, + { + "epoch": 168.97820823244552, + "grad_norm": 1.6175734529610963e-08, + "learning_rate": 0.012294731447300799, + "loss": 0.0, + "num_input_tokens_seen": 59649056, + "step": 34810 + }, + { + "epoch": 169.0048426150121, + "grad_norm": 3.679360816022381e-08, + "learning_rate": 0.012271386342801671, + "loss": 0.0, + "num_input_tokens_seen": 59657952, + "step": 34815 + }, + { + "epoch": 169.02905569007265, + "grad_norm": 1.0924496862685373e-08, + "learning_rate": 0.012248062477785565, + "loss": 0.0, + "num_input_tokens_seen": 59666656, + "step": 34820 + }, + { + "epoch": 169.05326876513317, + "grad_norm": 6.4550489398129685e-09, + "learning_rate": 0.012224759855849305, + "loss": 0.0, + "num_input_tokens_seen": 59675136, + "step": 34825 + }, + { + "epoch": 169.0774818401937, + "grad_norm": 7.393160750268635e-09, + "learning_rate": 0.012201478480586513, + "loss": 0.0, + "num_input_tokens_seen": 59683488, + "step": 34830 + }, + { + "epoch": 169.10169491525423, + "grad_norm": 7.212130448408516e-09, + "learning_rate": 0.012178218355587389, + "loss": 0.0, + "num_input_tokens_seen": 59691904, + "step": 34835 + }, + { + "epoch": 169.12590799031477, + "grad_norm": 8.743071155947746e-09, + "learning_rate": 0.01215497948443896, + "loss": 0.0, + "num_input_tokens_seen": 59700800, + "step": 34840 + }, + { + "epoch": 169.15012106537532, + "grad_norm": 1.734460042257524e-08, + "learning_rate": 0.012131761870724993, + "loss": 0.0, + "num_input_tokens_seen": 59709664, + "step": 34845 + }, + { + "epoch": 169.17433414043583, + "grad_norm": 7.632911191990388e-09, + "learning_rate": 0.012108565518025893, + "loss": 0.0, + "num_input_tokens_seen": 59718240, + "step": 34850 + }, + { + "epoch": 169.19854721549638, + "grad_norm": 1.2205597421655057e-08, + "learning_rate": 0.012085390429918862, + "loss": 0.0, + "num_input_tokens_seen": 59726912, + "step": 34855 + }, + { + "epoch": 169.2227602905569, + "grad_norm": 5.485080833977918e-09, + "learning_rate": 0.012062236609977744, + "loss": 0.0, + "num_input_tokens_seen": 59735360, + "step": 34860 + }, + { + "epoch": 169.24697336561744, + "grad_norm": 9.889185470512984e-09, + "learning_rate": 0.01203910406177318, + "loss": 0.0, + "num_input_tokens_seen": 59744160, + "step": 34865 + }, + { + "epoch": 169.27118644067798, + "grad_norm": 8.399157813698821e-09, + "learning_rate": 0.01201599278887252, + "loss": 0.0, + "num_input_tokens_seen": 59752608, + "step": 34870 + }, + { + "epoch": 169.2953995157385, + "grad_norm": 7.004627100570815e-09, + "learning_rate": 0.011992902794839744, + "loss": 0.0, + "num_input_tokens_seen": 59761280, + "step": 34875 + }, + { + "epoch": 169.31961259079904, + "grad_norm": 6.166034349774918e-09, + "learning_rate": 0.011969834083235703, + "loss": 0.0, + "num_input_tokens_seen": 59769696, + "step": 34880 + }, + { + "epoch": 169.34382566585955, + "grad_norm": 7.312892069677446e-09, + "learning_rate": 0.011946786657617836, + "loss": 0.0, + "num_input_tokens_seen": 59778112, + "step": 34885 + }, + { + "epoch": 169.3680387409201, + "grad_norm": 4.186575530695791e-09, + "learning_rate": 0.011923760521540332, + "loss": 0.0, + "num_input_tokens_seen": 59786848, + "step": 34890 + }, + { + "epoch": 169.39225181598064, + "grad_norm": 4.515510632074893e-09, + "learning_rate": 0.011900755678554153, + "loss": 0.0, + "num_input_tokens_seen": 59795456, + "step": 34895 + }, + { + "epoch": 169.41646489104116, + "grad_norm": 6.10634698361423e-09, + "learning_rate": 0.011877772132206893, + "loss": 0.0, + "num_input_tokens_seen": 59804064, + "step": 34900 + }, + { + "epoch": 169.4406779661017, + "grad_norm": 1.4778554380256992e-08, + "learning_rate": 0.011854809886042915, + "loss": 0.0, + "num_input_tokens_seen": 59812544, + "step": 34905 + }, + { + "epoch": 169.46489104116222, + "grad_norm": 1.213684441836449e-08, + "learning_rate": 0.011831868943603325, + "loss": 0.0, + "num_input_tokens_seen": 59820896, + "step": 34910 + }, + { + "epoch": 169.48910411622276, + "grad_norm": 6.5934875337347876e-09, + "learning_rate": 0.011808949308425836, + "loss": 0.0, + "num_input_tokens_seen": 59829152, + "step": 34915 + }, + { + "epoch": 169.5133171912833, + "grad_norm": 6.988013723230324e-09, + "learning_rate": 0.01178605098404501, + "loss": 0.0, + "num_input_tokens_seen": 59837696, + "step": 34920 + }, + { + "epoch": 169.53753026634382, + "grad_norm": 7.435011273315695e-09, + "learning_rate": 0.011763173973992002, + "loss": 0.0, + "num_input_tokens_seen": 59846112, + "step": 34925 + }, + { + "epoch": 169.56174334140437, + "grad_norm": 4.227020511393675e-09, + "learning_rate": 0.011740318281794776, + "loss": 0.0, + "num_input_tokens_seen": 59854720, + "step": 34930 + }, + { + "epoch": 169.58595641646488, + "grad_norm": 9.125827205025416e-09, + "learning_rate": 0.01171748391097796, + "loss": 0.0, + "num_input_tokens_seen": 59863392, + "step": 34935 + }, + { + "epoch": 169.61016949152543, + "grad_norm": 8.957013797328273e-09, + "learning_rate": 0.011694670865062873, + "loss": 0.0, + "num_input_tokens_seen": 59871872, + "step": 34940 + }, + { + "epoch": 169.63438256658597, + "grad_norm": 1.3827844647096299e-08, + "learning_rate": 0.011671879147567616, + "loss": 0.0, + "num_input_tokens_seen": 59880768, + "step": 34945 + }, + { + "epoch": 169.65859564164649, + "grad_norm": 4.728423430577777e-09, + "learning_rate": 0.011649108762006893, + "loss": 0.0, + "num_input_tokens_seen": 59889376, + "step": 34950 + }, + { + "epoch": 169.68280871670703, + "grad_norm": 4.697783051454962e-09, + "learning_rate": 0.011626359711892265, + "loss": 0.0, + "num_input_tokens_seen": 59897856, + "step": 34955 + }, + { + "epoch": 169.70702179176754, + "grad_norm": 8.621509728357069e-09, + "learning_rate": 0.01160363200073189, + "loss": 0.0, + "num_input_tokens_seen": 59906240, + "step": 34960 + }, + { + "epoch": 169.7312348668281, + "grad_norm": 1.1424289958483769e-08, + "learning_rate": 0.011580925632030614, + "loss": 0.0, + "num_input_tokens_seen": 59915264, + "step": 34965 + }, + { + "epoch": 169.75544794188863, + "grad_norm": 1.3782543994977914e-08, + "learning_rate": 0.011558240609290104, + "loss": 0.0, + "num_input_tokens_seen": 59923712, + "step": 34970 + }, + { + "epoch": 169.77966101694915, + "grad_norm": 4.139057985241834e-09, + "learning_rate": 0.011535576936008679, + "loss": 0.0, + "num_input_tokens_seen": 59932576, + "step": 34975 + }, + { + "epoch": 169.8038740920097, + "grad_norm": 8.702686571382401e-09, + "learning_rate": 0.011512934615681309, + "loss": 0.0, + "num_input_tokens_seen": 59940864, + "step": 34980 + }, + { + "epoch": 169.8280871670702, + "grad_norm": 1.0288796481461304e-08, + "learning_rate": 0.011490313651799765, + "loss": 0.0, + "num_input_tokens_seen": 59949440, + "step": 34985 + }, + { + "epoch": 169.85230024213075, + "grad_norm": 1.2088204215388032e-08, + "learning_rate": 0.011467714047852512, + "loss": 0.0, + "num_input_tokens_seen": 59957856, + "step": 34990 + }, + { + "epoch": 169.8765133171913, + "grad_norm": 8.86742412831154e-09, + "learning_rate": 0.011445135807324624, + "loss": 0.0, + "num_input_tokens_seen": 59966656, + "step": 34995 + }, + { + "epoch": 169.9007263922518, + "grad_norm": 4.987791513144657e-09, + "learning_rate": 0.011422578933698002, + "loss": 0.0, + "num_input_tokens_seen": 59974880, + "step": 35000 + }, + { + "epoch": 169.9007263922518, + "eval_loss": 1.197524070739746, + "eval_runtime": 4.6226, + "eval_samples_per_second": 79.392, + "eval_steps_per_second": 19.902, + "num_input_tokens_seen": 59974880, + "step": 35000 + }, + { + "epoch": 169.92493946731236, + "grad_norm": 1.8562493764306964e-08, + "learning_rate": 0.011400043430451161, + "loss": 0.0, + "num_input_tokens_seen": 59983392, + "step": 35005 + }, + { + "epoch": 169.94915254237287, + "grad_norm": 4.8630881543942905e-09, + "learning_rate": 0.011377529301059392, + "loss": 0.0, + "num_input_tokens_seen": 59992448, + "step": 35010 + }, + { + "epoch": 169.97336561743342, + "grad_norm": 1.1883924067035423e-08, + "learning_rate": 0.011355036548994646, + "loss": 0.0, + "num_input_tokens_seen": 60000512, + "step": 35015 + }, + { + "epoch": 169.99757869249396, + "grad_norm": 8.042815302644613e-09, + "learning_rate": 0.011332565177725584, + "loss": 0.0, + "num_input_tokens_seen": 60009600, + "step": 35020 + }, + { + "epoch": 170.02421307506054, + "grad_norm": 1.1428405777280659e-08, + "learning_rate": 0.011310115190717585, + "loss": 0.0, + "num_input_tokens_seen": 60018656, + "step": 35025 + }, + { + "epoch": 170.04842615012106, + "grad_norm": 9.032563141886385e-09, + "learning_rate": 0.01128768659143271, + "loss": 0.0, + "num_input_tokens_seen": 60027168, + "step": 35030 + }, + { + "epoch": 170.0726392251816, + "grad_norm": 7.252030975735124e-09, + "learning_rate": 0.011265279383329713, + "loss": 0.0, + "num_input_tokens_seen": 60035424, + "step": 35035 + }, + { + "epoch": 170.09685230024212, + "grad_norm": 5.284991555498664e-09, + "learning_rate": 0.01124289356986411, + "loss": 0.0, + "num_input_tokens_seen": 60044128, + "step": 35040 + }, + { + "epoch": 170.12106537530266, + "grad_norm": 2.2724373494042993e-09, + "learning_rate": 0.011220529154488023, + "loss": 0.0, + "num_input_tokens_seen": 60052448, + "step": 35045 + }, + { + "epoch": 170.1452784503632, + "grad_norm": 5.721006779424442e-09, + "learning_rate": 0.011198186140650346, + "loss": 0.0, + "num_input_tokens_seen": 60061056, + "step": 35050 + }, + { + "epoch": 170.16949152542372, + "grad_norm": 9.622893593075332e-09, + "learning_rate": 0.011175864531796685, + "loss": 0.0, + "num_input_tokens_seen": 60069152, + "step": 35055 + }, + { + "epoch": 170.19370460048427, + "grad_norm": 3.7048166756648016e-09, + "learning_rate": 0.011153564331369258, + "loss": 0.0, + "num_input_tokens_seen": 60077696, + "step": 35060 + }, + { + "epoch": 170.21791767554478, + "grad_norm": 1.3143516497393648e-08, + "learning_rate": 0.011131285542807078, + "loss": 0.0, + "num_input_tokens_seen": 60086400, + "step": 35065 + }, + { + "epoch": 170.24213075060533, + "grad_norm": 5.0122785921757895e-09, + "learning_rate": 0.011109028169545815, + "loss": 0.0, + "num_input_tokens_seen": 60094720, + "step": 35070 + }, + { + "epoch": 170.26634382566587, + "grad_norm": 1.8057485506872695e-09, + "learning_rate": 0.011086792215017804, + "loss": 0.0, + "num_input_tokens_seen": 60103200, + "step": 35075 + }, + { + "epoch": 170.2905569007264, + "grad_norm": 9.980594128933262e-09, + "learning_rate": 0.011064577682652137, + "loss": 0.0, + "num_input_tokens_seen": 60111840, + "step": 35080 + }, + { + "epoch": 170.31476997578693, + "grad_norm": 9.643708054341005e-09, + "learning_rate": 0.011042384575874559, + "loss": 0.0, + "num_input_tokens_seen": 60120128, + "step": 35085 + }, + { + "epoch": 170.33898305084745, + "grad_norm": 5.935927305245059e-09, + "learning_rate": 0.011020212898107512, + "loss": 0.0, + "num_input_tokens_seen": 60128704, + "step": 35090 + }, + { + "epoch": 170.363196125908, + "grad_norm": 3.902334899663629e-09, + "learning_rate": 0.010998062652770197, + "loss": 0.0, + "num_input_tokens_seen": 60137536, + "step": 35095 + }, + { + "epoch": 170.38740920096853, + "grad_norm": 5.556130666661829e-09, + "learning_rate": 0.010975933843278428, + "loss": 0.0, + "num_input_tokens_seen": 60146208, + "step": 35100 + }, + { + "epoch": 170.41162227602905, + "grad_norm": 4.644026940781032e-09, + "learning_rate": 0.010953826473044714, + "loss": 0.0, + "num_input_tokens_seen": 60155104, + "step": 35105 + }, + { + "epoch": 170.4358353510896, + "grad_norm": 9.14592934719849e-09, + "learning_rate": 0.010931740545478357, + "loss": 0.0, + "num_input_tokens_seen": 60163776, + "step": 35110 + }, + { + "epoch": 170.4600484261501, + "grad_norm": 5.796209290309662e-09, + "learning_rate": 0.010909676063985218, + "loss": 0.0, + "num_input_tokens_seen": 60172320, + "step": 35115 + }, + { + "epoch": 170.48426150121065, + "grad_norm": 1.1120493859095859e-08, + "learning_rate": 0.010887633031967974, + "loss": 0.0, + "num_input_tokens_seen": 60180896, + "step": 35120 + }, + { + "epoch": 170.5084745762712, + "grad_norm": 9.482076457345556e-09, + "learning_rate": 0.01086561145282589, + "loss": 0.0, + "num_input_tokens_seen": 60189088, + "step": 35125 + }, + { + "epoch": 170.5326876513317, + "grad_norm": 1.6448808537461446e-08, + "learning_rate": 0.010843611329954983, + "loss": 0.0, + "num_input_tokens_seen": 60197792, + "step": 35130 + }, + { + "epoch": 170.55690072639226, + "grad_norm": 9.820765534129805e-09, + "learning_rate": 0.010821632666747988, + "loss": 0.0, + "num_input_tokens_seen": 60206368, + "step": 35135 + }, + { + "epoch": 170.58111380145277, + "grad_norm": 3.5678831000751643e-09, + "learning_rate": 0.010799675466594244, + "loss": 0.0, + "num_input_tokens_seen": 60215136, + "step": 35140 + }, + { + "epoch": 170.60532687651332, + "grad_norm": 1.0095377867003208e-08, + "learning_rate": 0.010777739732879826, + "loss": 0.0, + "num_input_tokens_seen": 60223840, + "step": 35145 + }, + { + "epoch": 170.62953995157386, + "grad_norm": 1.0649564785580878e-08, + "learning_rate": 0.010755825468987562, + "loss": 0.0, + "num_input_tokens_seen": 60232768, + "step": 35150 + }, + { + "epoch": 170.65375302663438, + "grad_norm": 7.9092146165749e-09, + "learning_rate": 0.010733932678296814, + "loss": 0.0, + "num_input_tokens_seen": 60241376, + "step": 35155 + }, + { + "epoch": 170.67796610169492, + "grad_norm": 3.903813716732429e-09, + "learning_rate": 0.010712061364183817, + "loss": 0.0, + "num_input_tokens_seen": 60249792, + "step": 35160 + }, + { + "epoch": 170.70217917675544, + "grad_norm": 4.725879687583756e-09, + "learning_rate": 0.010690211530021337, + "loss": 0.0, + "num_input_tokens_seen": 60258496, + "step": 35165 + }, + { + "epoch": 170.72639225181598, + "grad_norm": 4.879958215298075e-09, + "learning_rate": 0.01066838317917893, + "loss": 0.0, + "num_input_tokens_seen": 60267296, + "step": 35170 + }, + { + "epoch": 170.75060532687652, + "grad_norm": 4.748360815653996e-09, + "learning_rate": 0.010646576315022787, + "loss": 0.0, + "num_input_tokens_seen": 60275936, + "step": 35175 + }, + { + "epoch": 170.77481840193704, + "grad_norm": 8.023588016214944e-09, + "learning_rate": 0.010624790940915785, + "loss": 0.0, + "num_input_tokens_seen": 60284352, + "step": 35180 + }, + { + "epoch": 170.79903147699758, + "grad_norm": 1.0638564695852892e-08, + "learning_rate": 0.0106030270602175, + "loss": 0.0, + "num_input_tokens_seen": 60292576, + "step": 35185 + }, + { + "epoch": 170.8232445520581, + "grad_norm": 1.123624038257276e-08, + "learning_rate": 0.010581284676284252, + "loss": 0.0, + "num_input_tokens_seen": 60301344, + "step": 35190 + }, + { + "epoch": 170.84745762711864, + "grad_norm": 8.654181371525738e-09, + "learning_rate": 0.010559563792468923, + "loss": 0.0, + "num_input_tokens_seen": 60310112, + "step": 35195 + }, + { + "epoch": 170.8716707021792, + "grad_norm": 2.5533208880546e-09, + "learning_rate": 0.010537864412121217, + "loss": 0.0, + "num_input_tokens_seen": 60318560, + "step": 35200 + }, + { + "epoch": 170.8716707021792, + "eval_loss": 1.1924669742584229, + "eval_runtime": 4.6289, + "eval_samples_per_second": 79.285, + "eval_steps_per_second": 19.875, + "num_input_tokens_seen": 60318560, + "step": 35200 + }, + { + "epoch": 170.8958837772397, + "grad_norm": 7.743703456242201e-09, + "learning_rate": 0.010516186538587357, + "loss": 0.0, + "num_input_tokens_seen": 60327200, + "step": 35205 + }, + { + "epoch": 170.92009685230025, + "grad_norm": 9.275855639145902e-09, + "learning_rate": 0.01049453017521042, + "loss": 0.0, + "num_input_tokens_seen": 60335520, + "step": 35210 + }, + { + "epoch": 170.94430992736076, + "grad_norm": 7.187012762699396e-09, + "learning_rate": 0.010472895325330083, + "loss": 0.0, + "num_input_tokens_seen": 60343712, + "step": 35215 + }, + { + "epoch": 170.9685230024213, + "grad_norm": 5.267013492016304e-09, + "learning_rate": 0.010451281992282662, + "loss": 0.0, + "num_input_tokens_seen": 60352352, + "step": 35220 + }, + { + "epoch": 170.99273607748185, + "grad_norm": 8.738037848843305e-09, + "learning_rate": 0.01042969017940124, + "loss": 0.0, + "num_input_tokens_seen": 60361120, + "step": 35225 + }, + { + "epoch": 171.01937046004844, + "grad_norm": 1.0158645480373707e-08, + "learning_rate": 0.01040811989001557, + "loss": 0.0, + "num_input_tokens_seen": 60370208, + "step": 35230 + }, + { + "epoch": 171.04358353510895, + "grad_norm": 1.3738670645579987e-08, + "learning_rate": 0.010386571127451992, + "loss": 0.0, + "num_input_tokens_seen": 60378816, + "step": 35235 + }, + { + "epoch": 171.0677966101695, + "grad_norm": 6.382288919581924e-09, + "learning_rate": 0.010365043895033682, + "loss": 0.0, + "num_input_tokens_seen": 60387136, + "step": 35240 + }, + { + "epoch": 171.09200968523, + "grad_norm": 1.2849542763149202e-08, + "learning_rate": 0.010343538196080365, + "loss": 0.0, + "num_input_tokens_seen": 60395712, + "step": 35245 + }, + { + "epoch": 171.11622276029055, + "grad_norm": 8.016300512281305e-09, + "learning_rate": 0.010322054033908457, + "loss": 0.0, + "num_input_tokens_seen": 60404320, + "step": 35250 + }, + { + "epoch": 171.1404358353511, + "grad_norm": 7.431221415998834e-09, + "learning_rate": 0.010300591411831156, + "loss": 0.0, + "num_input_tokens_seen": 60412896, + "step": 35255 + }, + { + "epoch": 171.16464891041161, + "grad_norm": 1.3104361151761168e-08, + "learning_rate": 0.010279150333158198, + "loss": 0.0, + "num_input_tokens_seen": 60421728, + "step": 35260 + }, + { + "epoch": 171.18886198547216, + "grad_norm": 1.2764227008688067e-08, + "learning_rate": 0.010257730801196107, + "loss": 0.0, + "num_input_tokens_seen": 60430176, + "step": 35265 + }, + { + "epoch": 171.21307506053267, + "grad_norm": 7.3796524446834155e-09, + "learning_rate": 0.010236332819248056, + "loss": 0.0, + "num_input_tokens_seen": 60439040, + "step": 35270 + }, + { + "epoch": 171.23728813559322, + "grad_norm": 2.406721710812576e-09, + "learning_rate": 0.010214956390613854, + "loss": 0.0, + "num_input_tokens_seen": 60447936, + "step": 35275 + }, + { + "epoch": 171.26150121065376, + "grad_norm": 4.91696461324409e-09, + "learning_rate": 0.010193601518590034, + "loss": 0.0, + "num_input_tokens_seen": 60456224, + "step": 35280 + }, + { + "epoch": 171.28571428571428, + "grad_norm": 1.5392108920764258e-08, + "learning_rate": 0.010172268206469758, + "loss": 0.0, + "num_input_tokens_seen": 60465024, + "step": 35285 + }, + { + "epoch": 171.30992736077482, + "grad_norm": 1.1945795463930153e-08, + "learning_rate": 0.010150956457542897, + "loss": 0.0, + "num_input_tokens_seen": 60473600, + "step": 35290 + }, + { + "epoch": 171.33414043583534, + "grad_norm": 3.1408429634893764e-08, + "learning_rate": 0.010129666275096054, + "loss": 0.0, + "num_input_tokens_seen": 60482112, + "step": 35295 + }, + { + "epoch": 171.35835351089588, + "grad_norm": 1.0674479966610306e-08, + "learning_rate": 0.010108397662412338, + "loss": 0.0, + "num_input_tokens_seen": 60490688, + "step": 35300 + }, + { + "epoch": 171.38256658595643, + "grad_norm": 7.485575714838433e-09, + "learning_rate": 0.010087150622771707, + "loss": 0.0, + "num_input_tokens_seen": 60499040, + "step": 35305 + }, + { + "epoch": 171.40677966101694, + "grad_norm": 6.3209726341995065e-09, + "learning_rate": 0.010065925159450739, + "loss": 0.0, + "num_input_tokens_seen": 60507648, + "step": 35310 + }, + { + "epoch": 171.43099273607749, + "grad_norm": 7.001417223762019e-09, + "learning_rate": 0.010044721275722618, + "loss": 0.0, + "num_input_tokens_seen": 60516128, + "step": 35315 + }, + { + "epoch": 171.455205811138, + "grad_norm": 6.43792619214878e-09, + "learning_rate": 0.01002353897485726, + "loss": 0.0, + "num_input_tokens_seen": 60525120, + "step": 35320 + }, + { + "epoch": 171.47941888619854, + "grad_norm": 5.124462632011273e-09, + "learning_rate": 0.010002378260121236, + "loss": 0.0, + "num_input_tokens_seen": 60534016, + "step": 35325 + }, + { + "epoch": 171.5036319612591, + "grad_norm": 8.944607721161901e-09, + "learning_rate": 0.009981239134777786, + "loss": 0.0, + "num_input_tokens_seen": 60542624, + "step": 35330 + }, + { + "epoch": 171.5278450363196, + "grad_norm": 8.08657141249114e-09, + "learning_rate": 0.009960121602086884, + "loss": 0.0, + "num_input_tokens_seen": 60551232, + "step": 35335 + }, + { + "epoch": 171.55205811138015, + "grad_norm": 4.207818982138178e-09, + "learning_rate": 0.009939025665305062, + "loss": 0.0, + "num_input_tokens_seen": 60559808, + "step": 35340 + }, + { + "epoch": 171.57627118644066, + "grad_norm": 6.8808359010574804e-09, + "learning_rate": 0.009917951327685597, + "loss": 0.0, + "num_input_tokens_seen": 60568128, + "step": 35345 + }, + { + "epoch": 171.6004842615012, + "grad_norm": 1.2380730218808367e-08, + "learning_rate": 0.009896898592478425, + "loss": 0.0, + "num_input_tokens_seen": 60577056, + "step": 35350 + }, + { + "epoch": 171.62469733656175, + "grad_norm": 1.8573741655814047e-08, + "learning_rate": 0.009875867462930132, + "loss": 0.0, + "num_input_tokens_seen": 60585536, + "step": 35355 + }, + { + "epoch": 171.64891041162227, + "grad_norm": 9.214276452951253e-09, + "learning_rate": 0.009854857942284006, + "loss": 0.0, + "num_input_tokens_seen": 60593920, + "step": 35360 + }, + { + "epoch": 171.6731234866828, + "grad_norm": 6.199019519925741e-09, + "learning_rate": 0.009833870033779923, + "loss": 0.0, + "num_input_tokens_seen": 60602784, + "step": 35365 + }, + { + "epoch": 171.69733656174333, + "grad_norm": 1.0395783789363122e-08, + "learning_rate": 0.009812903740654527, + "loss": 0.0, + "num_input_tokens_seen": 60611424, + "step": 35370 + }, + { + "epoch": 171.72154963680387, + "grad_norm": 7.661184575624702e-09, + "learning_rate": 0.009791959066141097, + "loss": 0.0, + "num_input_tokens_seen": 60619744, + "step": 35375 + }, + { + "epoch": 171.74576271186442, + "grad_norm": 9.467895800696624e-09, + "learning_rate": 0.009771036013469537, + "loss": 0.0, + "num_input_tokens_seen": 60628160, + "step": 35380 + }, + { + "epoch": 171.76997578692493, + "grad_norm": 5.007019687752745e-09, + "learning_rate": 0.00975013458586646, + "loss": 0.0, + "num_input_tokens_seen": 60636704, + "step": 35385 + }, + { + "epoch": 171.79418886198548, + "grad_norm": 4.367759487422518e-09, + "learning_rate": 0.009729254786555107, + "loss": 0.0, + "num_input_tokens_seen": 60644992, + "step": 35390 + }, + { + "epoch": 171.818401937046, + "grad_norm": 7.2059083144893066e-09, + "learning_rate": 0.009708396618755421, + "loss": 0.0, + "num_input_tokens_seen": 60653120, + "step": 35395 + }, + { + "epoch": 171.84261501210653, + "grad_norm": 7.368632815030196e-09, + "learning_rate": 0.009687560085683994, + "loss": 0.0, + "num_input_tokens_seen": 60662016, + "step": 35400 + }, + { + "epoch": 171.84261501210653, + "eval_loss": 1.1931350231170654, + "eval_runtime": 4.626, + "eval_samples_per_second": 79.334, + "eval_steps_per_second": 19.888, + "num_input_tokens_seen": 60662016, + "step": 35400 + }, + { + "epoch": 171.86682808716708, + "grad_norm": 1.0026342422975176e-08, + "learning_rate": 0.009666745190554054, + "loss": 0.0, + "num_input_tokens_seen": 60670848, + "step": 35405 + }, + { + "epoch": 171.8910411622276, + "grad_norm": 9.070436846059238e-09, + "learning_rate": 0.009645951936575553, + "loss": 0.0, + "num_input_tokens_seen": 60679520, + "step": 35410 + }, + { + "epoch": 171.91525423728814, + "grad_norm": 1.5503573536079784e-08, + "learning_rate": 0.00962518032695509, + "loss": 0.0, + "num_input_tokens_seen": 60688000, + "step": 35415 + }, + { + "epoch": 171.93946731234865, + "grad_norm": 3.81788867187538e-09, + "learning_rate": 0.009604430364895855, + "loss": 0.0, + "num_input_tokens_seen": 60696384, + "step": 35420 + }, + { + "epoch": 171.9636803874092, + "grad_norm": 8.609969626149905e-09, + "learning_rate": 0.00958370205359777, + "loss": 0.0, + "num_input_tokens_seen": 60704800, + "step": 35425 + }, + { + "epoch": 171.98789346246974, + "grad_norm": 1.2028073648195914e-08, + "learning_rate": 0.009562995396257445, + "loss": 0.0, + "num_input_tokens_seen": 60713312, + "step": 35430 + }, + { + "epoch": 172.01452784503633, + "grad_norm": 9.502843845154985e-09, + "learning_rate": 0.009542310396068026, + "loss": 0.0, + "num_input_tokens_seen": 60721824, + "step": 35435 + }, + { + "epoch": 172.03874092009684, + "grad_norm": 6.464016877316681e-09, + "learning_rate": 0.009521647056219495, + "loss": 0.0, + "num_input_tokens_seen": 60730400, + "step": 35440 + }, + { + "epoch": 172.0629539951574, + "grad_norm": 8.591388045431358e-09, + "learning_rate": 0.00950100537989832, + "loss": 0.0, + "num_input_tokens_seen": 60739264, + "step": 35445 + }, + { + "epoch": 172.08716707021793, + "grad_norm": 9.955062552080562e-09, + "learning_rate": 0.00948038537028772, + "loss": 0.0, + "num_input_tokens_seen": 60747520, + "step": 35450 + }, + { + "epoch": 172.11138014527845, + "grad_norm": 1.6026342919417402e-08, + "learning_rate": 0.009459787030567617, + "loss": 0.0, + "num_input_tokens_seen": 60755808, + "step": 35455 + }, + { + "epoch": 172.135593220339, + "grad_norm": 4.010042076174614e-09, + "learning_rate": 0.00943921036391449, + "loss": 0.0, + "num_input_tokens_seen": 60763968, + "step": 35460 + }, + { + "epoch": 172.1598062953995, + "grad_norm": 1.4407819826089963e-08, + "learning_rate": 0.009418655373501483, + "loss": 0.0, + "num_input_tokens_seen": 60772672, + "step": 35465 + }, + { + "epoch": 172.18401937046005, + "grad_norm": 9.13111808387157e-09, + "learning_rate": 0.00939812206249851, + "loss": 0.0, + "num_input_tokens_seen": 60781568, + "step": 35470 + }, + { + "epoch": 172.2082324455206, + "grad_norm": 3.7218559345575386e-09, + "learning_rate": 0.009377610434072004, + "loss": 0.0, + "num_input_tokens_seen": 60790656, + "step": 35475 + }, + { + "epoch": 172.2324455205811, + "grad_norm": 9.794590027922823e-09, + "learning_rate": 0.009357120491385167, + "loss": 0.0, + "num_input_tokens_seen": 60799040, + "step": 35480 + }, + { + "epoch": 172.25665859564165, + "grad_norm": 1.0651231896474656e-08, + "learning_rate": 0.009336652237597743, + "loss": 0.0, + "num_input_tokens_seen": 60807808, + "step": 35485 + }, + { + "epoch": 172.28087167070217, + "grad_norm": 5.057411378572851e-09, + "learning_rate": 0.009316205675866251, + "loss": 0.0, + "num_input_tokens_seen": 60816544, + "step": 35490 + }, + { + "epoch": 172.3050847457627, + "grad_norm": 4.469545622498572e-09, + "learning_rate": 0.00929578080934379, + "loss": 0.0, + "num_input_tokens_seen": 60825184, + "step": 35495 + }, + { + "epoch": 172.32929782082326, + "grad_norm": 6.27141005793419e-09, + "learning_rate": 0.00927537764118012, + "loss": 0.0, + "num_input_tokens_seen": 60833440, + "step": 35500 + }, + { + "epoch": 172.35351089588377, + "grad_norm": 9.2338048318652e-09, + "learning_rate": 0.009254996174521678, + "loss": 0.0, + "num_input_tokens_seen": 60841728, + "step": 35505 + }, + { + "epoch": 172.37772397094432, + "grad_norm": 7.285168024395716e-09, + "learning_rate": 0.009234636412511531, + "loss": 0.0, + "num_input_tokens_seen": 60850496, + "step": 35510 + }, + { + "epoch": 172.40193704600483, + "grad_norm": 1.2463254428496384e-08, + "learning_rate": 0.009214298358289418, + "loss": 0.0, + "num_input_tokens_seen": 60858976, + "step": 35515 + }, + { + "epoch": 172.42615012106538, + "grad_norm": 5.23177812183917e-09, + "learning_rate": 0.00919398201499173, + "loss": 0.0, + "num_input_tokens_seen": 60867424, + "step": 35520 + }, + { + "epoch": 172.45036319612592, + "grad_norm": 6.556596154894123e-09, + "learning_rate": 0.009173687385751495, + "loss": 0.0, + "num_input_tokens_seen": 60876064, + "step": 35525 + }, + { + "epoch": 172.47457627118644, + "grad_norm": 5.392944313342696e-09, + "learning_rate": 0.009153414473698407, + "loss": 0.0, + "num_input_tokens_seen": 60884576, + "step": 35530 + }, + { + "epoch": 172.49878934624698, + "grad_norm": 9.0050304990541e-09, + "learning_rate": 0.009133163281958784, + "loss": 0.0, + "num_input_tokens_seen": 60892928, + "step": 35535 + }, + { + "epoch": 172.5230024213075, + "grad_norm": 1.2903283774789998e-08, + "learning_rate": 0.009112933813655627, + "loss": 0.0, + "num_input_tokens_seen": 60901664, + "step": 35540 + }, + { + "epoch": 172.54721549636804, + "grad_norm": 7.077411545708401e-09, + "learning_rate": 0.009092726071908573, + "loss": 0.0, + "num_input_tokens_seen": 60910240, + "step": 35545 + }, + { + "epoch": 172.57142857142858, + "grad_norm": 7.781279620644455e-09, + "learning_rate": 0.0090725400598339, + "loss": 0.0, + "num_input_tokens_seen": 60919136, + "step": 35550 + }, + { + "epoch": 172.5956416464891, + "grad_norm": 7.528894840902467e-09, + "learning_rate": 0.009052375780544563, + "loss": 0.0, + "num_input_tokens_seen": 60927360, + "step": 35555 + }, + { + "epoch": 172.61985472154964, + "grad_norm": 9.078736873391335e-09, + "learning_rate": 0.009032233237150144, + "loss": 0.0, + "num_input_tokens_seen": 60935712, + "step": 35560 + }, + { + "epoch": 172.64406779661016, + "grad_norm": 6.084181602972194e-09, + "learning_rate": 0.009012112432756875, + "loss": 0.0, + "num_input_tokens_seen": 60944224, + "step": 35565 + }, + { + "epoch": 172.6682808716707, + "grad_norm": 1.0911874959163015e-08, + "learning_rate": 0.008992013370467605, + "loss": 0.0, + "num_input_tokens_seen": 60952896, + "step": 35570 + }, + { + "epoch": 172.69249394673125, + "grad_norm": 2.3687201533562074e-08, + "learning_rate": 0.008971936053381924, + "loss": 0.0, + "num_input_tokens_seen": 60961280, + "step": 35575 + }, + { + "epoch": 172.71670702179176, + "grad_norm": 7.098844623243394e-09, + "learning_rate": 0.008951880484595953, + "loss": 0.0, + "num_input_tokens_seen": 60969696, + "step": 35580 + }, + { + "epoch": 172.7409200968523, + "grad_norm": 8.04984257030128e-09, + "learning_rate": 0.008931846667202552, + "loss": 0.0, + "num_input_tokens_seen": 60978144, + "step": 35585 + }, + { + "epoch": 172.76513317191282, + "grad_norm": 6.292987464462385e-09, + "learning_rate": 0.008911834604291152, + "loss": 0.0, + "num_input_tokens_seen": 60986784, + "step": 35590 + }, + { + "epoch": 172.78934624697337, + "grad_norm": 7.0287020648152065e-09, + "learning_rate": 0.008891844298947882, + "loss": 0.0, + "num_input_tokens_seen": 60995360, + "step": 35595 + }, + { + "epoch": 172.8135593220339, + "grad_norm": 9.625054531170463e-09, + "learning_rate": 0.008871875754255508, + "loss": 0.0, + "num_input_tokens_seen": 61004352, + "step": 35600 + }, + { + "epoch": 172.8135593220339, + "eval_loss": 1.1947826147079468, + "eval_runtime": 4.6314, + "eval_samples_per_second": 79.242, + "eval_steps_per_second": 19.864, + "num_input_tokens_seen": 61004352, + "step": 35600 + }, + { + "epoch": 172.83777239709443, + "grad_norm": 9.246756249581267e-09, + "learning_rate": 0.008851928973293422, + "loss": 0.0, + "num_input_tokens_seen": 61013312, + "step": 35605 + }, + { + "epoch": 172.86198547215497, + "grad_norm": 5.555146120883592e-09, + "learning_rate": 0.00883200395913764, + "loss": 0.0, + "num_input_tokens_seen": 61021856, + "step": 35610 + }, + { + "epoch": 172.88619854721549, + "grad_norm": 9.308423365439467e-09, + "learning_rate": 0.00881210071486091, + "loss": 0.0, + "num_input_tokens_seen": 61030912, + "step": 35615 + }, + { + "epoch": 172.91041162227603, + "grad_norm": 4.525609220706883e-09, + "learning_rate": 0.008792219243532505, + "loss": 0.0, + "num_input_tokens_seen": 61039264, + "step": 35620 + }, + { + "epoch": 172.93462469733657, + "grad_norm": 8.767399251041752e-09, + "learning_rate": 0.008772359548218428, + "loss": 0.0, + "num_input_tokens_seen": 61047776, + "step": 35625 + }, + { + "epoch": 172.9588377723971, + "grad_norm": 9.986990789911943e-09, + "learning_rate": 0.008752521631981274, + "loss": 0.0, + "num_input_tokens_seen": 61056576, + "step": 35630 + }, + { + "epoch": 172.98305084745763, + "grad_norm": 2.7279938308311102e-09, + "learning_rate": 0.008732705497880315, + "loss": 0.0, + "num_input_tokens_seen": 61064960, + "step": 35635 + }, + { + "epoch": 173.00968523002422, + "grad_norm": 1.127213788976178e-08, + "learning_rate": 0.008712911148971459, + "loss": 0.0, + "num_input_tokens_seen": 61073920, + "step": 35640 + }, + { + "epoch": 173.03389830508473, + "grad_norm": 8.629074343957654e-09, + "learning_rate": 0.008693138588307208, + "loss": 0.0, + "num_input_tokens_seen": 61082304, + "step": 35645 + }, + { + "epoch": 173.05811138014528, + "grad_norm": 9.14613629277028e-09, + "learning_rate": 0.008673387818936762, + "loss": 0.0, + "num_input_tokens_seen": 61090784, + "step": 35650 + }, + { + "epoch": 173.08232445520582, + "grad_norm": 6.271550834213713e-09, + "learning_rate": 0.008653658843905948, + "loss": 0.0, + "num_input_tokens_seen": 61098912, + "step": 35655 + }, + { + "epoch": 173.10653753026634, + "grad_norm": 1.728930598687839e-08, + "learning_rate": 0.0086339516662572, + "loss": 0.0, + "num_input_tokens_seen": 61107488, + "step": 35660 + }, + { + "epoch": 173.13075060532688, + "grad_norm": 2.2195646209866027e-08, + "learning_rate": 0.008614266289029638, + "loss": 0.0, + "num_input_tokens_seen": 61115968, + "step": 35665 + }, + { + "epoch": 173.1549636803874, + "grad_norm": 1.2295899409764388e-08, + "learning_rate": 0.008594602715258965, + "loss": 0.0, + "num_input_tokens_seen": 61124512, + "step": 35670 + }, + { + "epoch": 173.17917675544794, + "grad_norm": 7.939005008950062e-09, + "learning_rate": 0.008574960947977573, + "loss": 0.0, + "num_input_tokens_seen": 61133248, + "step": 35675 + }, + { + "epoch": 173.20338983050848, + "grad_norm": 2.2568599433725467e-08, + "learning_rate": 0.008555340990214438, + "loss": 0.0, + "num_input_tokens_seen": 61141504, + "step": 35680 + }, + { + "epoch": 173.227602905569, + "grad_norm": 4.265950259707552e-09, + "learning_rate": 0.008535742844995258, + "loss": 0.0, + "num_input_tokens_seen": 61150208, + "step": 35685 + }, + { + "epoch": 173.25181598062954, + "grad_norm": 1.239697944299678e-08, + "learning_rate": 0.008516166515342266, + "loss": 0.0, + "num_input_tokens_seen": 61159072, + "step": 35690 + }, + { + "epoch": 173.27602905569006, + "grad_norm": 7.377235267114202e-09, + "learning_rate": 0.008496612004274411, + "loss": 0.0, + "num_input_tokens_seen": 61168000, + "step": 35695 + }, + { + "epoch": 173.3002421307506, + "grad_norm": 8.317074140506975e-09, + "learning_rate": 0.008477079314807201, + "loss": 0.0, + "num_input_tokens_seen": 61176896, + "step": 35700 + }, + { + "epoch": 173.32445520581115, + "grad_norm": 4.2503756070289e-09, + "learning_rate": 0.008457568449952874, + "loss": 0.0, + "num_input_tokens_seen": 61185440, + "step": 35705 + }, + { + "epoch": 173.34866828087166, + "grad_norm": 1.131434235190909e-08, + "learning_rate": 0.008438079412720189, + "loss": 0.0, + "num_input_tokens_seen": 61194176, + "step": 35710 + }, + { + "epoch": 173.3728813559322, + "grad_norm": 5.4585407305296485e-09, + "learning_rate": 0.00841861220611466, + "loss": 0.0, + "num_input_tokens_seen": 61202784, + "step": 35715 + }, + { + "epoch": 173.39709443099272, + "grad_norm": 6.329902824120381e-09, + "learning_rate": 0.008399166833138355, + "loss": 0.0, + "num_input_tokens_seen": 61211264, + "step": 35720 + }, + { + "epoch": 173.42130750605327, + "grad_norm": 4.743498926984557e-09, + "learning_rate": 0.008379743296789987, + "loss": 0.0, + "num_input_tokens_seen": 61219712, + "step": 35725 + }, + { + "epoch": 173.4455205811138, + "grad_norm": 1.399956683911796e-08, + "learning_rate": 0.008360341600064896, + "loss": 0.0, + "num_input_tokens_seen": 61228192, + "step": 35730 + }, + { + "epoch": 173.46973365617433, + "grad_norm": 9.907611620008083e-09, + "learning_rate": 0.008340961745955121, + "loss": 0.0, + "num_input_tokens_seen": 61236544, + "step": 35735 + }, + { + "epoch": 173.49394673123487, + "grad_norm": 9.330647365857203e-09, + "learning_rate": 0.008321603737449224, + "loss": 0.0, + "num_input_tokens_seen": 61245120, + "step": 35740 + }, + { + "epoch": 173.5181598062954, + "grad_norm": 5.2710125153510035e-09, + "learning_rate": 0.008302267577532479, + "loss": 0.0, + "num_input_tokens_seen": 61253248, + "step": 35745 + }, + { + "epoch": 173.54237288135593, + "grad_norm": 6.468730440190029e-09, + "learning_rate": 0.008282953269186771, + "loss": 0.0, + "num_input_tokens_seen": 61261728, + "step": 35750 + }, + { + "epoch": 173.56658595641647, + "grad_norm": 5.931428237460068e-09, + "learning_rate": 0.008263660815390567, + "loss": 0.0, + "num_input_tokens_seen": 61270560, + "step": 35755 + }, + { + "epoch": 173.590799031477, + "grad_norm": 1.4228346500999578e-08, + "learning_rate": 0.008244390219119069, + "loss": 0.0, + "num_input_tokens_seen": 61279040, + "step": 35760 + }, + { + "epoch": 173.61501210653753, + "grad_norm": 1.8150966951679948e-08, + "learning_rate": 0.008225141483343967, + "loss": 0.0, + "num_input_tokens_seen": 61287712, + "step": 35765 + }, + { + "epoch": 173.63922518159805, + "grad_norm": 2.0366581310327092e-08, + "learning_rate": 0.00820591461103372, + "loss": 0.0, + "num_input_tokens_seen": 61296320, + "step": 35770 + }, + { + "epoch": 173.6634382566586, + "grad_norm": 7.99063393230881e-09, + "learning_rate": 0.008186709605153358, + "loss": 0.0, + "num_input_tokens_seen": 61304992, + "step": 35775 + }, + { + "epoch": 173.68765133171914, + "grad_norm": 5.827914151268487e-09, + "learning_rate": 0.008167526468664492, + "loss": 0.0, + "num_input_tokens_seen": 61313248, + "step": 35780 + }, + { + "epoch": 173.71186440677965, + "grad_norm": 1.1954253587020958e-08, + "learning_rate": 0.008148365204525443, + "loss": 0.0, + "num_input_tokens_seen": 61321888, + "step": 35785 + }, + { + "epoch": 173.7360774818402, + "grad_norm": 5.2806767847357605e-09, + "learning_rate": 0.00812922581569106, + "loss": 0.0, + "num_input_tokens_seen": 61330272, + "step": 35790 + }, + { + "epoch": 173.7602905569007, + "grad_norm": 1.683824812914736e-08, + "learning_rate": 0.008110108305112934, + "loss": 0.0, + "num_input_tokens_seen": 61338528, + "step": 35795 + }, + { + "epoch": 173.78450363196126, + "grad_norm": 7.189364215065552e-09, + "learning_rate": 0.008091012675739223, + "loss": 0.0, + "num_input_tokens_seen": 61347296, + "step": 35800 + }, + { + "epoch": 173.78450363196126, + "eval_loss": 1.1954607963562012, + "eval_runtime": 4.617, + "eval_samples_per_second": 79.488, + "eval_steps_per_second": 19.926, + "num_input_tokens_seen": 61347296, + "step": 35800 + }, + { + "epoch": 173.8087167070218, + "grad_norm": 1.0304291642171393e-08, + "learning_rate": 0.008071938930514671, + "loss": 0.0, + "num_input_tokens_seen": 61356096, + "step": 35805 + }, + { + "epoch": 173.83292978208232, + "grad_norm": 1.0198825783902521e-08, + "learning_rate": 0.008052887072380726, + "loss": 0.0, + "num_input_tokens_seen": 61364384, + "step": 35810 + }, + { + "epoch": 173.85714285714286, + "grad_norm": 8.536436446604512e-09, + "learning_rate": 0.008033857104275437, + "loss": 0.0, + "num_input_tokens_seen": 61372928, + "step": 35815 + }, + { + "epoch": 173.88135593220338, + "grad_norm": 1.4729501174315374e-08, + "learning_rate": 0.008014849029133424, + "loss": 0.0, + "num_input_tokens_seen": 61381696, + "step": 35820 + }, + { + "epoch": 173.90556900726392, + "grad_norm": 6.361645432662044e-09, + "learning_rate": 0.007995862849885975, + "loss": 0.0, + "num_input_tokens_seen": 61390112, + "step": 35825 + }, + { + "epoch": 173.92978208232446, + "grad_norm": 8.333004508642716e-09, + "learning_rate": 0.007976898569461032, + "loss": 0.0, + "num_input_tokens_seen": 61398272, + "step": 35830 + }, + { + "epoch": 173.95399515738498, + "grad_norm": 7.86884601922111e-09, + "learning_rate": 0.007957956190783088, + "loss": 0.0, + "num_input_tokens_seen": 61407040, + "step": 35835 + }, + { + "epoch": 173.97820823244552, + "grad_norm": 1.2558479589586113e-08, + "learning_rate": 0.007939035716773324, + "loss": 0.0, + "num_input_tokens_seen": 61415552, + "step": 35840 + }, + { + "epoch": 174.0048426150121, + "grad_norm": 1.29516735114521e-08, + "learning_rate": 0.007920137150349487, + "loss": 0.0, + "num_input_tokens_seen": 61424512, + "step": 35845 + }, + { + "epoch": 174.02905569007265, + "grad_norm": 6.820466413870463e-09, + "learning_rate": 0.007901260494425981, + "loss": 0.0, + "num_input_tokens_seen": 61433120, + "step": 35850 + }, + { + "epoch": 174.05326876513317, + "grad_norm": 6.480032954669923e-09, + "learning_rate": 0.007882405751913861, + "loss": 0.0, + "num_input_tokens_seen": 61441632, + "step": 35855 + }, + { + "epoch": 174.0774818401937, + "grad_norm": 1.0698464336655888e-08, + "learning_rate": 0.007863572925720702, + "loss": 0.0, + "num_input_tokens_seen": 61450144, + "step": 35860 + }, + { + "epoch": 174.10169491525423, + "grad_norm": 3.86080722947213e-09, + "learning_rate": 0.007844762018750827, + "loss": 0.0, + "num_input_tokens_seen": 61459136, + "step": 35865 + }, + { + "epoch": 174.12590799031477, + "grad_norm": 1.536177052230414e-08, + "learning_rate": 0.007825973033905054, + "loss": 0.0, + "num_input_tokens_seen": 61467840, + "step": 35870 + }, + { + "epoch": 174.15012106537532, + "grad_norm": 7.646133504124464e-09, + "learning_rate": 0.007807205974080927, + "loss": 0.0, + "num_input_tokens_seen": 61476416, + "step": 35875 + }, + { + "epoch": 174.17433414043583, + "grad_norm": 8.228695946854714e-09, + "learning_rate": 0.007788460842172551, + "loss": 0.0, + "num_input_tokens_seen": 61484768, + "step": 35880 + }, + { + "epoch": 174.19854721549638, + "grad_norm": 7.993293138497393e-09, + "learning_rate": 0.0077697376410706285, + "loss": 0.0, + "num_input_tokens_seen": 61493568, + "step": 35885 + }, + { + "epoch": 174.2227602905569, + "grad_norm": 6.517979489473191e-09, + "learning_rate": 0.007751036373662567, + "loss": 0.0, + "num_input_tokens_seen": 61501856, + "step": 35890 + }, + { + "epoch": 174.24697336561744, + "grad_norm": 7.659552991867713e-09, + "learning_rate": 0.00773235704283231, + "loss": 0.0, + "num_input_tokens_seen": 61510080, + "step": 35895 + }, + { + "epoch": 174.27118644067798, + "grad_norm": 8.190044198386204e-09, + "learning_rate": 0.007713699651460437, + "loss": 0.0, + "num_input_tokens_seen": 61518752, + "step": 35900 + }, + { + "epoch": 174.2953995157385, + "grad_norm": 6.426289722583078e-09, + "learning_rate": 0.007695064202424162, + "loss": 0.0, + "num_input_tokens_seen": 61527296, + "step": 35905 + }, + { + "epoch": 174.31961259079904, + "grad_norm": 8.566130027531926e-09, + "learning_rate": 0.007676450698597286, + "loss": 0.0, + "num_input_tokens_seen": 61535744, + "step": 35910 + }, + { + "epoch": 174.34382566585955, + "grad_norm": 7.684952230135877e-09, + "learning_rate": 0.007657859142850265, + "loss": 0.0, + "num_input_tokens_seen": 61544448, + "step": 35915 + }, + { + "epoch": 174.3680387409201, + "grad_norm": 4.7005253023257865e-09, + "learning_rate": 0.0076392895380501535, + "loss": 0.0, + "num_input_tokens_seen": 61553024, + "step": 35920 + }, + { + "epoch": 174.39225181598064, + "grad_norm": 3.397276460148646e-09, + "learning_rate": 0.007620741887060611, + "loss": 0.0, + "num_input_tokens_seen": 61561600, + "step": 35925 + }, + { + "epoch": 174.41646489104116, + "grad_norm": 6.571631683272017e-09, + "learning_rate": 0.007602216192741901, + "loss": 0.0, + "num_input_tokens_seen": 61570432, + "step": 35930 + }, + { + "epoch": 174.4406779661017, + "grad_norm": 4.643449624808227e-09, + "learning_rate": 0.007583712457950969, + "loss": 0.0, + "num_input_tokens_seen": 61579072, + "step": 35935 + }, + { + "epoch": 174.46489104116222, + "grad_norm": 5.588394191846646e-09, + "learning_rate": 0.007565230685541269, + "loss": 0.0, + "num_input_tokens_seen": 61587488, + "step": 35940 + }, + { + "epoch": 174.48910411622276, + "grad_norm": 8.844178722711149e-09, + "learning_rate": 0.007546770878362968, + "loss": 0.0, + "num_input_tokens_seen": 61596160, + "step": 35945 + }, + { + "epoch": 174.5133171912833, + "grad_norm": 1.0763780089462216e-08, + "learning_rate": 0.0075283330392627405, + "loss": 0.0, + "num_input_tokens_seen": 61604608, + "step": 35950 + }, + { + "epoch": 174.53753026634382, + "grad_norm": 9.379227172701121e-09, + "learning_rate": 0.007509917171083979, + "loss": 0.0, + "num_input_tokens_seen": 61613312, + "step": 35955 + }, + { + "epoch": 174.56174334140437, + "grad_norm": 5.985507200989559e-09, + "learning_rate": 0.007491523276666662, + "loss": 0.0, + "num_input_tokens_seen": 61621728, + "step": 35960 + }, + { + "epoch": 174.58595641646488, + "grad_norm": 4.075451975893429e-09, + "learning_rate": 0.007473151358847318, + "loss": 0.0, + "num_input_tokens_seen": 61630464, + "step": 35965 + }, + { + "epoch": 174.61016949152543, + "grad_norm": 4.201648806656522e-09, + "learning_rate": 0.007454801420459117, + "loss": 0.0, + "num_input_tokens_seen": 61638592, + "step": 35970 + }, + { + "epoch": 174.63438256658597, + "grad_norm": 7.230375409505996e-09, + "learning_rate": 0.0074364734643319105, + "loss": 0.0, + "num_input_tokens_seen": 61647264, + "step": 35975 + }, + { + "epoch": 174.65859564164649, + "grad_norm": 1.3832894829590714e-08, + "learning_rate": 0.007418167493292022, + "loss": 0.0, + "num_input_tokens_seen": 61655936, + "step": 35980 + }, + { + "epoch": 174.68280871670703, + "grad_norm": 1.2806961713351939e-08, + "learning_rate": 0.0073998835101625245, + "loss": 0.0, + "num_input_tokens_seen": 61664160, + "step": 35985 + }, + { + "epoch": 174.70702179176754, + "grad_norm": 7.4173152064815895e-09, + "learning_rate": 0.007381621517762998, + "loss": 0.0, + "num_input_tokens_seen": 61672416, + "step": 35990 + }, + { + "epoch": 174.7312348668281, + "grad_norm": 6.244834427349133e-09, + "learning_rate": 0.007363381518909689, + "loss": 0.0, + "num_input_tokens_seen": 61681152, + "step": 35995 + }, + { + "epoch": 174.75544794188863, + "grad_norm": 7.571793858573983e-09, + "learning_rate": 0.007345163516415448, + "loss": 0.0, + "num_input_tokens_seen": 61689824, + "step": 36000 + }, + { + "epoch": 174.75544794188863, + "eval_loss": 1.1908310651779175, + "eval_runtime": 4.6276, + "eval_samples_per_second": 79.306, + "eval_steps_per_second": 19.881, + "num_input_tokens_seen": 61689824, + "step": 36000 + }, + { + "epoch": 174.77966101694915, + "grad_norm": 1.0235577718731292e-08, + "learning_rate": 0.007326967513089693, + "loss": 0.0, + "num_input_tokens_seen": 61698368, + "step": 36005 + }, + { + "epoch": 174.8038740920097, + "grad_norm": 4.3776560154640265e-09, + "learning_rate": 0.0073087935117384815, + "loss": 0.0, + "num_input_tokens_seen": 61707136, + "step": 36010 + }, + { + "epoch": 174.8280871670702, + "grad_norm": 1.0492690272201344e-08, + "learning_rate": 0.007290641515164503, + "loss": 0.0, + "num_input_tokens_seen": 61715712, + "step": 36015 + }, + { + "epoch": 174.85230024213075, + "grad_norm": 1.172695363038656e-08, + "learning_rate": 0.007272511526166986, + "loss": 0.0, + "num_input_tokens_seen": 61724576, + "step": 36020 + }, + { + "epoch": 174.8765133171913, + "grad_norm": 7.297189963395567e-09, + "learning_rate": 0.0072544035475418265, + "loss": 0.0, + "num_input_tokens_seen": 61732960, + "step": 36025 + }, + { + "epoch": 174.9007263922518, + "grad_norm": 2.701975088115205e-09, + "learning_rate": 0.007236317582081475, + "loss": 0.0, + "num_input_tokens_seen": 61741952, + "step": 36030 + }, + { + "epoch": 174.92493946731236, + "grad_norm": 1.3450538460801909e-08, + "learning_rate": 0.007218253632575066, + "loss": 0.0, + "num_input_tokens_seen": 61750592, + "step": 36035 + }, + { + "epoch": 174.94915254237287, + "grad_norm": 8.35639113461184e-09, + "learning_rate": 0.007200211701808223, + "loss": 0.0, + "num_input_tokens_seen": 61759392, + "step": 36040 + }, + { + "epoch": 174.97336561743342, + "grad_norm": 1.0390461824272279e-08, + "learning_rate": 0.007182191792563286, + "loss": 0.0, + "num_input_tokens_seen": 61767904, + "step": 36045 + }, + { + "epoch": 174.99757869249396, + "grad_norm": 1.1485790096799064e-08, + "learning_rate": 0.0071641939076191145, + "loss": 0.0, + "num_input_tokens_seen": 61776384, + "step": 36050 + }, + { + "epoch": 175.02421307506054, + "grad_norm": 1.595662624254146e-08, + "learning_rate": 0.007146218049751257, + "loss": 0.0, + "num_input_tokens_seen": 61785344, + "step": 36055 + }, + { + "epoch": 175.04842615012106, + "grad_norm": 5.224510601919974e-09, + "learning_rate": 0.0071282642217317775, + "loss": 0.0, + "num_input_tokens_seen": 61793728, + "step": 36060 + }, + { + "epoch": 175.0726392251816, + "grad_norm": 1.0862918564669144e-08, + "learning_rate": 0.007110332426329396, + "loss": 0.0, + "num_input_tokens_seen": 61802080, + "step": 36065 + }, + { + "epoch": 175.09685230024212, + "grad_norm": 5.011973946977832e-09, + "learning_rate": 0.007092422666309417, + "loss": 0.0, + "num_input_tokens_seen": 61810816, + "step": 36070 + }, + { + "epoch": 175.12106537530266, + "grad_norm": 5.55446888483857e-09, + "learning_rate": 0.0070745349444337295, + "loss": 0.0, + "num_input_tokens_seen": 61819168, + "step": 36075 + }, + { + "epoch": 175.1452784503632, + "grad_norm": 8.68728733394164e-09, + "learning_rate": 0.007056669263460913, + "loss": 0.0, + "num_input_tokens_seen": 61827456, + "step": 36080 + }, + { + "epoch": 175.16949152542372, + "grad_norm": 1.0348870205234562e-08, + "learning_rate": 0.007038825626145995, + "loss": 0.0, + "num_input_tokens_seen": 61835968, + "step": 36085 + }, + { + "epoch": 175.19370460048427, + "grad_norm": 6.621779569115915e-09, + "learning_rate": 0.007021004035240724, + "loss": 0.0, + "num_input_tokens_seen": 61843840, + "step": 36090 + }, + { + "epoch": 175.21791767554478, + "grad_norm": 8.384165361974283e-09, + "learning_rate": 0.007003204493493453, + "loss": 0.0, + "num_input_tokens_seen": 61852448, + "step": 36095 + }, + { + "epoch": 175.24213075060533, + "grad_norm": 8.790874694852846e-09, + "learning_rate": 0.006985427003649036, + "loss": 0.0, + "num_input_tokens_seen": 61861280, + "step": 36100 + }, + { + "epoch": 175.26634382566587, + "grad_norm": 8.628513903374824e-09, + "learning_rate": 0.006967671568449013, + "loss": 0.0, + "num_input_tokens_seen": 61870112, + "step": 36105 + }, + { + "epoch": 175.2905569007264, + "grad_norm": 9.049160532015321e-09, + "learning_rate": 0.006949938190631511, + "loss": 0.0, + "num_input_tokens_seen": 61878624, + "step": 36110 + }, + { + "epoch": 175.31476997578693, + "grad_norm": 3.487230726406665e-09, + "learning_rate": 0.0069322268729311905, + "loss": 0.0, + "num_input_tokens_seen": 61887360, + "step": 36115 + }, + { + "epoch": 175.33898305084745, + "grad_norm": 6.7267258430092625e-09, + "learning_rate": 0.006914537618079403, + "loss": 0.0, + "num_input_tokens_seen": 61895904, + "step": 36120 + }, + { + "epoch": 175.363196125908, + "grad_norm": 1.69035239139248e-08, + "learning_rate": 0.006896870428804031, + "loss": 0.0, + "num_input_tokens_seen": 61904576, + "step": 36125 + }, + { + "epoch": 175.38740920096853, + "grad_norm": 4.621816263039591e-09, + "learning_rate": 0.006879225307829595, + "loss": 0.0, + "num_input_tokens_seen": 61913312, + "step": 36130 + }, + { + "epoch": 175.41162227602905, + "grad_norm": 6.3417826545730804e-09, + "learning_rate": 0.00686160225787717, + "loss": 0.0, + "num_input_tokens_seen": 61921824, + "step": 36135 + }, + { + "epoch": 175.4358353510896, + "grad_norm": 8.572940579654187e-09, + "learning_rate": 0.006844001281664463, + "loss": 0.0, + "num_input_tokens_seen": 61930304, + "step": 36140 + }, + { + "epoch": 175.4600484261501, + "grad_norm": 5.4380566716361045e-09, + "learning_rate": 0.006826422381905789, + "loss": 0.0, + "num_input_tokens_seen": 61939200, + "step": 36145 + }, + { + "epoch": 175.48426150121065, + "grad_norm": 7.206386154479105e-09, + "learning_rate": 0.006808865561311994, + "loss": 0.0, + "num_input_tokens_seen": 61947616, + "step": 36150 + }, + { + "epoch": 175.5084745762712, + "grad_norm": 1.1447480297022139e-08, + "learning_rate": 0.00679133082259058, + "loss": 0.0, + "num_input_tokens_seen": 61956800, + "step": 36155 + }, + { + "epoch": 175.5326876513317, + "grad_norm": 3.01458191742654e-09, + "learning_rate": 0.00677381816844565, + "loss": 0.0, + "num_input_tokens_seen": 61965248, + "step": 36160 + }, + { + "epoch": 175.55690072639226, + "grad_norm": 4.3264876126158924e-09, + "learning_rate": 0.0067563276015778434, + "loss": 0.0, + "num_input_tokens_seen": 61973408, + "step": 36165 + }, + { + "epoch": 175.58111380145277, + "grad_norm": 8.906543946807233e-09, + "learning_rate": 0.006738859124684437, + "loss": 0.0, + "num_input_tokens_seen": 61981952, + "step": 36170 + }, + { + "epoch": 175.60532687651332, + "grad_norm": 1.0140298378757961e-08, + "learning_rate": 0.006721412740459259, + "loss": 0.0, + "num_input_tokens_seen": 61990592, + "step": 36175 + }, + { + "epoch": 175.62953995157386, + "grad_norm": 9.687550317494242e-09, + "learning_rate": 0.006703988451592824, + "loss": 0.0, + "num_input_tokens_seen": 61999072, + "step": 36180 + }, + { + "epoch": 175.65375302663438, + "grad_norm": 9.275670009856185e-09, + "learning_rate": 0.006686586260772114, + "loss": 0.0, + "num_input_tokens_seen": 62007552, + "step": 36185 + }, + { + "epoch": 175.67796610169492, + "grad_norm": 1.5428812005779946e-08, + "learning_rate": 0.006669206170680819, + "loss": 0.0, + "num_input_tokens_seen": 62016352, + "step": 36190 + }, + { + "epoch": 175.70217917675544, + "grad_norm": 3.900158862535363e-09, + "learning_rate": 0.0066518481839991095, + "loss": 0.0, + "num_input_tokens_seen": 62025120, + "step": 36195 + }, + { + "epoch": 175.72639225181598, + "grad_norm": 1.0670652805799818e-08, + "learning_rate": 0.006634512303403861, + "loss": 0.0, + "num_input_tokens_seen": 62033792, + "step": 36200 + }, + { + "epoch": 175.72639225181598, + "eval_loss": 1.1989526748657227, + "eval_runtime": 4.6346, + "eval_samples_per_second": 79.187, + "eval_steps_per_second": 19.851, + "num_input_tokens_seen": 62033792, + "step": 36200 + }, + { + "epoch": 175.75060532687652, + "grad_norm": 6.595157753253034e-09, + "learning_rate": 0.0066171985315684355, + "loss": 0.0, + "num_input_tokens_seen": 62042304, + "step": 36205 + }, + { + "epoch": 175.77481840193704, + "grad_norm": 1.3779836827154668e-08, + "learning_rate": 0.0065999068711628806, + "loss": 0.0, + "num_input_tokens_seen": 62051328, + "step": 36210 + }, + { + "epoch": 175.79903147699758, + "grad_norm": 8.203040025023256e-09, + "learning_rate": 0.0065826373248537295, + "loss": 0.0, + "num_input_tokens_seen": 62060256, + "step": 36215 + }, + { + "epoch": 175.8232445520581, + "grad_norm": 6.413800157645255e-09, + "learning_rate": 0.006565389895304218, + "loss": 0.0, + "num_input_tokens_seen": 62068896, + "step": 36220 + }, + { + "epoch": 175.84745762711864, + "grad_norm": 1.2273964955511474e-08, + "learning_rate": 0.006548164585174104, + "loss": 0.0, + "num_input_tokens_seen": 62077216, + "step": 36225 + }, + { + "epoch": 175.8716707021792, + "grad_norm": 4.3278172157101835e-09, + "learning_rate": 0.006530961397119728, + "loss": 0.0, + "num_input_tokens_seen": 62085888, + "step": 36230 + }, + { + "epoch": 175.8958837772397, + "grad_norm": 1.472551591774618e-08, + "learning_rate": 0.00651378033379405, + "loss": 0.0, + "num_input_tokens_seen": 62094048, + "step": 36235 + }, + { + "epoch": 175.92009685230025, + "grad_norm": 5.29486587907968e-09, + "learning_rate": 0.006496621397846619, + "loss": 0.0, + "num_input_tokens_seen": 62102400, + "step": 36240 + }, + { + "epoch": 175.94430992736076, + "grad_norm": 5.594311680567898e-09, + "learning_rate": 0.006479484591923518, + "loss": 0.0, + "num_input_tokens_seen": 62111104, + "step": 36245 + }, + { + "epoch": 175.9685230024213, + "grad_norm": 6.344547998082817e-09, + "learning_rate": 0.006462369918667515, + "loss": 0.0, + "num_input_tokens_seen": 62119808, + "step": 36250 + }, + { + "epoch": 175.99273607748185, + "grad_norm": 5.1945527879126985e-09, + "learning_rate": 0.006445277380717851, + "loss": 0.0, + "num_input_tokens_seen": 62128416, + "step": 36255 + }, + { + "epoch": 176.01937046004844, + "grad_norm": 1.206146382770612e-08, + "learning_rate": 0.006428206980710466, + "loss": 0.0, + "num_input_tokens_seen": 62137088, + "step": 36260 + }, + { + "epoch": 176.04358353510895, + "grad_norm": 6.317725009807873e-09, + "learning_rate": 0.006411158721277788, + "loss": 0.0, + "num_input_tokens_seen": 62145504, + "step": 36265 + }, + { + "epoch": 176.0677966101695, + "grad_norm": 1.1626911877726798e-08, + "learning_rate": 0.00639413260504888, + "loss": 0.0, + "num_input_tokens_seen": 62153920, + "step": 36270 + }, + { + "epoch": 176.09200968523, + "grad_norm": 8.212900581838767e-09, + "learning_rate": 0.006377128634649376, + "loss": 0.0, + "num_input_tokens_seen": 62162528, + "step": 36275 + }, + { + "epoch": 176.11622276029055, + "grad_norm": 5.728343133171165e-09, + "learning_rate": 0.006360146812701528, + "loss": 0.0, + "num_input_tokens_seen": 62171008, + "step": 36280 + }, + { + "epoch": 176.1404358353511, + "grad_norm": 1.9576320564596017e-09, + "learning_rate": 0.006343187141824125, + "loss": 0.0, + "num_input_tokens_seen": 62179552, + "step": 36285 + }, + { + "epoch": 176.16464891041161, + "grad_norm": 1.5050947155259564e-08, + "learning_rate": 0.00632624962463259, + "loss": 0.0, + "num_input_tokens_seen": 62188032, + "step": 36290 + }, + { + "epoch": 176.18886198547216, + "grad_norm": 5.762757826488496e-09, + "learning_rate": 0.006309334263738853, + "loss": 0.0, + "num_input_tokens_seen": 62196576, + "step": 36295 + }, + { + "epoch": 176.21307506053267, + "grad_norm": 9.782073817632408e-09, + "learning_rate": 0.006292441061751508, + "loss": 0.0, + "num_input_tokens_seen": 62204864, + "step": 36300 + }, + { + "epoch": 176.23728813559322, + "grad_norm": 8.000633933136214e-09, + "learning_rate": 0.0062755700212757054, + "loss": 0.0, + "num_input_tokens_seen": 62213504, + "step": 36305 + }, + { + "epoch": 176.26150121065376, + "grad_norm": 1.4608987797259942e-08, + "learning_rate": 0.006258721144913148, + "loss": 0.0, + "num_input_tokens_seen": 62221952, + "step": 36310 + }, + { + "epoch": 176.28571428571428, + "grad_norm": 1.4255912006433391e-08, + "learning_rate": 0.0062418944352621575, + "loss": 0.0, + "num_input_tokens_seen": 62230912, + "step": 36315 + }, + { + "epoch": 176.30992736077482, + "grad_norm": 6.967783683364814e-09, + "learning_rate": 0.0062250898949176405, + "loss": 0.0, + "num_input_tokens_seen": 62239488, + "step": 36320 + }, + { + "epoch": 176.33414043583534, + "grad_norm": 5.8119584700477844e-09, + "learning_rate": 0.006208307526471041, + "loss": 0.0, + "num_input_tokens_seen": 62248000, + "step": 36325 + }, + { + "epoch": 176.35835351089588, + "grad_norm": 6.127984342185755e-09, + "learning_rate": 0.006191547332510405, + "loss": 0.0, + "num_input_tokens_seen": 62256416, + "step": 36330 + }, + { + "epoch": 176.38256658595643, + "grad_norm": 1.3757807337810846e-08, + "learning_rate": 0.006174809315620416, + "loss": 0.0, + "num_input_tokens_seen": 62264960, + "step": 36335 + }, + { + "epoch": 176.40677966101694, + "grad_norm": 1.1334479133040531e-08, + "learning_rate": 0.00615809347838221, + "loss": 0.0, + "num_input_tokens_seen": 62273440, + "step": 36340 + }, + { + "epoch": 176.43099273607749, + "grad_norm": 7.881181929292325e-09, + "learning_rate": 0.006141399823373655, + "loss": 0.0, + "num_input_tokens_seen": 62281952, + "step": 36345 + }, + { + "epoch": 176.455205811138, + "grad_norm": 4.324013147538608e-09, + "learning_rate": 0.0061247283531690455, + "loss": 0.0, + "num_input_tokens_seen": 62290688, + "step": 36350 + }, + { + "epoch": 176.47941888619854, + "grad_norm": 7.177327621121776e-09, + "learning_rate": 0.0061080790703393895, + "loss": 0.0, + "num_input_tokens_seen": 62299136, + "step": 36355 + }, + { + "epoch": 176.5036319612591, + "grad_norm": 9.123463762250594e-09, + "learning_rate": 0.006091451977452217, + "loss": 0.0, + "num_input_tokens_seen": 62307456, + "step": 36360 + }, + { + "epoch": 176.5278450363196, + "grad_norm": 6.1606457713025975e-09, + "learning_rate": 0.00607484707707161, + "loss": 0.0, + "num_input_tokens_seen": 62316128, + "step": 36365 + }, + { + "epoch": 176.55205811138015, + "grad_norm": 1.3559209754987478e-08, + "learning_rate": 0.006058264371758254, + "loss": 0.0, + "num_input_tokens_seen": 62324608, + "step": 36370 + }, + { + "epoch": 176.57627118644066, + "grad_norm": 7.459110662466628e-09, + "learning_rate": 0.00604170386406942, + "loss": 0.0, + "num_input_tokens_seen": 62333280, + "step": 36375 + }, + { + "epoch": 176.6004842615012, + "grad_norm": 8.413064023216066e-09, + "learning_rate": 0.006025165556558931, + "loss": 0.0, + "num_input_tokens_seen": 62341760, + "step": 36380 + }, + { + "epoch": 176.62469733656175, + "grad_norm": 1.8665614831547828e-08, + "learning_rate": 0.006008649451777248, + "loss": 0.0, + "num_input_tokens_seen": 62350400, + "step": 36385 + }, + { + "epoch": 176.64891041162227, + "grad_norm": 1.1133567845433845e-08, + "learning_rate": 0.005992155552271283, + "loss": 0.0, + "num_input_tokens_seen": 62358752, + "step": 36390 + }, + { + "epoch": 176.6731234866828, + "grad_norm": 6.093906268489491e-09, + "learning_rate": 0.005975683860584685, + "loss": 0.0, + "num_input_tokens_seen": 62367584, + "step": 36395 + }, + { + "epoch": 176.69733656174333, + "grad_norm": 9.64647561829679e-09, + "learning_rate": 0.0059592343792575385, + "loss": 0.0, + "num_input_tokens_seen": 62376224, + "step": 36400 + }, + { + "epoch": 176.69733656174333, + "eval_loss": 1.1992243528366089, + "eval_runtime": 4.6251, + "eval_samples_per_second": 79.35, + "eval_steps_per_second": 19.892, + "num_input_tokens_seen": 62376224, + "step": 36400 + }, + { + "epoch": 176.72154963680387, + "grad_norm": 6.001924734988506e-09, + "learning_rate": 0.0059428071108265975, + "loss": 0.0, + "num_input_tokens_seen": 62384896, + "step": 36405 + }, + { + "epoch": 176.74576271186442, + "grad_norm": 9.76873870683903e-09, + "learning_rate": 0.005926402057825136, + "loss": 0.0, + "num_input_tokens_seen": 62393664, + "step": 36410 + }, + { + "epoch": 176.76997578692493, + "grad_norm": 6.742247204982732e-09, + "learning_rate": 0.005910019222782997, + "loss": 0.0, + "num_input_tokens_seen": 62402048, + "step": 36415 + }, + { + "epoch": 176.79418886198548, + "grad_norm": 9.678145396208038e-09, + "learning_rate": 0.005893658608226643, + "loss": 0.0, + "num_input_tokens_seen": 62410688, + "step": 36420 + }, + { + "epoch": 176.818401937046, + "grad_norm": 1.8823746117391238e-08, + "learning_rate": 0.0058773202166791045, + "loss": 0.0, + "num_input_tokens_seen": 62419200, + "step": 36425 + }, + { + "epoch": 176.84261501210653, + "grad_norm": 1.0596072463897599e-08, + "learning_rate": 0.005861004050659918, + "loss": 0.0, + "num_input_tokens_seen": 62428128, + "step": 36430 + }, + { + "epoch": 176.86682808716708, + "grad_norm": 5.367810196332812e-09, + "learning_rate": 0.005844710112685286, + "loss": 0.0, + "num_input_tokens_seen": 62436608, + "step": 36435 + }, + { + "epoch": 176.8910411622276, + "grad_norm": 1.2973864649268307e-08, + "learning_rate": 0.005828438405267933, + "loss": 0.0, + "num_input_tokens_seen": 62445760, + "step": 36440 + }, + { + "epoch": 176.91525423728814, + "grad_norm": 2.7548925363163335e-09, + "learning_rate": 0.00581218893091715, + "loss": 0.0, + "num_input_tokens_seen": 62454304, + "step": 36445 + }, + { + "epoch": 176.93946731234865, + "grad_norm": 3.064031472987949e-09, + "learning_rate": 0.005795961692138801, + "loss": 0.0, + "num_input_tokens_seen": 62462752, + "step": 36450 + }, + { + "epoch": 176.9636803874092, + "grad_norm": 8.206990642634082e-09, + "learning_rate": 0.00577975669143535, + "loss": 0.0, + "num_input_tokens_seen": 62471040, + "step": 36455 + }, + { + "epoch": 176.98789346246974, + "grad_norm": 1.1473712646647982e-08, + "learning_rate": 0.005763573931305782, + "loss": 0.0, + "num_input_tokens_seen": 62479744, + "step": 36460 + }, + { + "epoch": 177.01452784503633, + "grad_norm": 1.6249991574568412e-08, + "learning_rate": 0.005747413414245733, + "loss": 0.0, + "num_input_tokens_seen": 62488832, + "step": 36465 + }, + { + "epoch": 177.03874092009684, + "grad_norm": 3.273365356548652e-09, + "learning_rate": 0.005731275142747294, + "loss": 0.0, + "num_input_tokens_seen": 62497664, + "step": 36470 + }, + { + "epoch": 177.0629539951574, + "grad_norm": 8.247242888614892e-09, + "learning_rate": 0.005715159119299256, + "loss": 0.0, + "num_input_tokens_seen": 62506080, + "step": 36475 + }, + { + "epoch": 177.08716707021793, + "grad_norm": 2.8732263235298205e-09, + "learning_rate": 0.005699065346386867, + "loss": 0.0, + "num_input_tokens_seen": 62514400, + "step": 36480 + }, + { + "epoch": 177.11138014527845, + "grad_norm": 1.543570604667366e-08, + "learning_rate": 0.0056829938264919885, + "loss": 0.0, + "num_input_tokens_seen": 62522848, + "step": 36485 + }, + { + "epoch": 177.135593220339, + "grad_norm": 1.504388080775243e-08, + "learning_rate": 0.005666944562093074, + "loss": 0.0, + "num_input_tokens_seen": 62531584, + "step": 36490 + }, + { + "epoch": 177.1598062953995, + "grad_norm": 1.054364684449638e-08, + "learning_rate": 0.005650917555665108, + "loss": 0.0, + "num_input_tokens_seen": 62540224, + "step": 36495 + }, + { + "epoch": 177.18401937046005, + "grad_norm": 6.8042340650720234e-09, + "learning_rate": 0.005634912809679632, + "loss": 0.0, + "num_input_tokens_seen": 62548640, + "step": 36500 + }, + { + "epoch": 177.2082324455206, + "grad_norm": 7.70579422493256e-09, + "learning_rate": 0.005618930326604854, + "loss": 0.0, + "num_input_tokens_seen": 62556960, + "step": 36505 + }, + { + "epoch": 177.2324455205811, + "grad_norm": 1.0338261802189663e-08, + "learning_rate": 0.005602970108905386, + "loss": 0.0, + "num_input_tokens_seen": 62565216, + "step": 36510 + }, + { + "epoch": 177.25665859564165, + "grad_norm": 1.0662539295935858e-08, + "learning_rate": 0.005587032159042543, + "loss": 0.0, + "num_input_tokens_seen": 62573696, + "step": 36515 + }, + { + "epoch": 177.28087167070217, + "grad_norm": 9.288218194569708e-09, + "learning_rate": 0.005571116479474158, + "loss": 0.0, + "num_input_tokens_seen": 62582496, + "step": 36520 + }, + { + "epoch": 177.3050847457627, + "grad_norm": 5.2988773369122555e-09, + "learning_rate": 0.005555223072654619, + "loss": 0.0, + "num_input_tokens_seen": 62590944, + "step": 36525 + }, + { + "epoch": 177.32929782082326, + "grad_norm": 1.5113757356743918e-08, + "learning_rate": 0.005539351941034881, + "loss": 0.0, + "num_input_tokens_seen": 62599744, + "step": 36530 + }, + { + "epoch": 177.35351089588377, + "grad_norm": 6.96764734797739e-09, + "learning_rate": 0.0055235030870624865, + "loss": 0.0, + "num_input_tokens_seen": 62607872, + "step": 36535 + }, + { + "epoch": 177.37772397094432, + "grad_norm": 8.08763456205952e-09, + "learning_rate": 0.005507676513181514, + "loss": 0.0, + "num_input_tokens_seen": 62616480, + "step": 36540 + }, + { + "epoch": 177.40193704600483, + "grad_norm": 3.2754245982147268e-09, + "learning_rate": 0.005491872221832628, + "loss": 0.0, + "num_input_tokens_seen": 62625280, + "step": 36545 + }, + { + "epoch": 177.42615012106538, + "grad_norm": 1.6884266429428862e-08, + "learning_rate": 0.005476090215453061, + "loss": 0.0, + "num_input_tokens_seen": 62634304, + "step": 36550 + }, + { + "epoch": 177.45036319612592, + "grad_norm": 5.833260097176662e-09, + "learning_rate": 0.0054603304964765675, + "loss": 0.0, + "num_input_tokens_seen": 62642880, + "step": 36555 + }, + { + "epoch": 177.47457627118644, + "grad_norm": 5.657085910826254e-09, + "learning_rate": 0.005444593067333519, + "loss": 0.0, + "num_input_tokens_seen": 62651296, + "step": 36560 + }, + { + "epoch": 177.49878934624698, + "grad_norm": 6.643249061966117e-09, + "learning_rate": 0.00542887793045081, + "loss": 0.0, + "num_input_tokens_seen": 62659904, + "step": 36565 + }, + { + "epoch": 177.5230024213075, + "grad_norm": 1.307550512308353e-08, + "learning_rate": 0.005413185088251932, + "loss": 0.0, + "num_input_tokens_seen": 62668352, + "step": 36570 + }, + { + "epoch": 177.54721549636804, + "grad_norm": 1.0534010996821053e-08, + "learning_rate": 0.005397514543156884, + "loss": 0.0, + "num_input_tokens_seen": 62677120, + "step": 36575 + }, + { + "epoch": 177.57142857142858, + "grad_norm": 1.2855615239004692e-08, + "learning_rate": 0.0053818662975822825, + "loss": 0.0, + "num_input_tokens_seen": 62685760, + "step": 36580 + }, + { + "epoch": 177.5956416464891, + "grad_norm": 1.124938808771958e-08, + "learning_rate": 0.005366240353941315, + "loss": 0.0, + "num_input_tokens_seen": 62694368, + "step": 36585 + }, + { + "epoch": 177.61985472154964, + "grad_norm": 9.047075977264285e-09, + "learning_rate": 0.005350636714643636, + "loss": 0.0, + "num_input_tokens_seen": 62703072, + "step": 36590 + }, + { + "epoch": 177.64406779661016, + "grad_norm": 8.343491231244116e-09, + "learning_rate": 0.005335055382095555, + "loss": 0.0, + "num_input_tokens_seen": 62711552, + "step": 36595 + }, + { + "epoch": 177.6682808716707, + "grad_norm": 3.493906275409131e-09, + "learning_rate": 0.005319496358699915, + "loss": 0.0, + "num_input_tokens_seen": 62720096, + "step": 36600 + }, + { + "epoch": 177.6682808716707, + "eval_loss": 1.1939719915390015, + "eval_runtime": 4.6272, + "eval_samples_per_second": 79.314, + "eval_steps_per_second": 19.882, + "num_input_tokens_seen": 62720096, + "step": 36600 + }, + { + "epoch": 177.69249394673125, + "grad_norm": 7.683291336491038e-09, + "learning_rate": 0.005303959646856099, + "loss": 0.0, + "num_input_tokens_seen": 62728640, + "step": 36605 + }, + { + "epoch": 177.71670702179176, + "grad_norm": 1.1165784741251628e-08, + "learning_rate": 0.005288445248960089, + "loss": 0.0, + "num_input_tokens_seen": 62737376, + "step": 36610 + }, + { + "epoch": 177.7409200968523, + "grad_norm": 8.459442035757547e-09, + "learning_rate": 0.005272953167404354, + "loss": 0.0, + "num_input_tokens_seen": 62745600, + "step": 36615 + }, + { + "epoch": 177.76513317191282, + "grad_norm": 7.408064384151203e-09, + "learning_rate": 0.005257483404578017, + "loss": 0.0, + "num_input_tokens_seen": 62754080, + "step": 36620 + }, + { + "epoch": 177.78934624697337, + "grad_norm": 1.0979231745977813e-08, + "learning_rate": 0.0052420359628666865, + "loss": 0.0, + "num_input_tokens_seen": 62762880, + "step": 36625 + }, + { + "epoch": 177.8135593220339, + "grad_norm": 5.537996727866812e-09, + "learning_rate": 0.00522661084465254, + "loss": 0.0, + "num_input_tokens_seen": 62771328, + "step": 36630 + }, + { + "epoch": 177.83777239709443, + "grad_norm": 5.498572264173163e-09, + "learning_rate": 0.005211208052314326, + "loss": 0.0, + "num_input_tokens_seen": 62780032, + "step": 36635 + }, + { + "epoch": 177.86198547215497, + "grad_norm": 1.4632841605077829e-08, + "learning_rate": 0.005195827588227391, + "loss": 0.0, + "num_input_tokens_seen": 62788672, + "step": 36640 + }, + { + "epoch": 177.88619854721549, + "grad_norm": 1.4640671786025905e-08, + "learning_rate": 0.0051804694547635255, + "loss": 0.0, + "num_input_tokens_seen": 62797248, + "step": 36645 + }, + { + "epoch": 177.91041162227603, + "grad_norm": 5.935537394918811e-09, + "learning_rate": 0.005165133654291232, + "loss": 0.0, + "num_input_tokens_seen": 62805920, + "step": 36650 + }, + { + "epoch": 177.93462469733657, + "grad_norm": 3.128375114513915e-09, + "learning_rate": 0.005149820189175402, + "loss": 0.0, + "num_input_tokens_seen": 62814272, + "step": 36655 + }, + { + "epoch": 177.9588377723971, + "grad_norm": 2.2675705757535525e-09, + "learning_rate": 0.005134529061777598, + "loss": 0.0, + "num_input_tokens_seen": 62822592, + "step": 36660 + }, + { + "epoch": 177.98305084745763, + "grad_norm": 4.50376047567147e-09, + "learning_rate": 0.005119260274455933, + "loss": 0.0, + "num_input_tokens_seen": 62831136, + "step": 36665 + }, + { + "epoch": 178.00968523002422, + "grad_norm": 8.68995186920074e-09, + "learning_rate": 0.005104013829565007, + "loss": 0.0, + "num_input_tokens_seen": 62840480, + "step": 36670 + }, + { + "epoch": 178.03389830508473, + "grad_norm": 8.531343631545951e-09, + "learning_rate": 0.005088789729456006, + "loss": 0.0, + "num_input_tokens_seen": 62849600, + "step": 36675 + }, + { + "epoch": 178.05811138014528, + "grad_norm": 1.1741838612522315e-08, + "learning_rate": 0.005073587976476735, + "loss": 0.0, + "num_input_tokens_seen": 62858464, + "step": 36680 + }, + { + "epoch": 178.08232445520582, + "grad_norm": 5.078101494859766e-09, + "learning_rate": 0.005058408572971418, + "loss": 0.0, + "num_input_tokens_seen": 62867200, + "step": 36685 + }, + { + "epoch": 178.10653753026634, + "grad_norm": 1.1590964632546275e-08, + "learning_rate": 0.005043251521280983, + "loss": 0.0, + "num_input_tokens_seen": 62875360, + "step": 36690 + }, + { + "epoch": 178.13075060532688, + "grad_norm": 9.31054522368413e-09, + "learning_rate": 0.005028116823742795, + "loss": 0.0, + "num_input_tokens_seen": 62884032, + "step": 36695 + }, + { + "epoch": 178.1549636803874, + "grad_norm": 1.1636181795893208e-08, + "learning_rate": 0.005013004482690819, + "loss": 0.0, + "num_input_tokens_seen": 62892896, + "step": 36700 + }, + { + "epoch": 178.17917675544794, + "grad_norm": 8.390997230378616e-09, + "learning_rate": 0.0049979145004555746, + "loss": 0.0, + "num_input_tokens_seen": 62901312, + "step": 36705 + }, + { + "epoch": 178.20338983050848, + "grad_norm": 8.677001339663093e-09, + "learning_rate": 0.004982846879364116, + "loss": 0.0, + "num_input_tokens_seen": 62909728, + "step": 36710 + }, + { + "epoch": 178.227602905569, + "grad_norm": 7.363315734920661e-09, + "learning_rate": 0.0049678016217400535, + "loss": 0.0, + "num_input_tokens_seen": 62917984, + "step": 36715 + }, + { + "epoch": 178.25181598062954, + "grad_norm": 9.1144425340417e-09, + "learning_rate": 0.004952778729903595, + "loss": 0.0, + "num_input_tokens_seen": 62926432, + "step": 36720 + }, + { + "epoch": 178.27602905569006, + "grad_norm": 7.471422591720511e-09, + "learning_rate": 0.004937778206171422, + "loss": 0.0, + "num_input_tokens_seen": 62934688, + "step": 36725 + }, + { + "epoch": 178.3002421307506, + "grad_norm": 8.236184179111206e-09, + "learning_rate": 0.004922800052856835, + "loss": 0.0, + "num_input_tokens_seen": 62943104, + "step": 36730 + }, + { + "epoch": 178.32445520581115, + "grad_norm": 3.6087948185326013e-09, + "learning_rate": 0.004907844272269602, + "loss": 0.0, + "num_input_tokens_seen": 62951648, + "step": 36735 + }, + { + "epoch": 178.34866828087166, + "grad_norm": 5.430715876997283e-09, + "learning_rate": 0.004892910866716144, + "loss": 0.0, + "num_input_tokens_seen": 62960192, + "step": 36740 + }, + { + "epoch": 178.3728813559322, + "grad_norm": 8.23729706667109e-09, + "learning_rate": 0.004877999838499369, + "loss": 0.0, + "num_input_tokens_seen": 62968608, + "step": 36745 + }, + { + "epoch": 178.39709443099272, + "grad_norm": 3.0604880851825556e-09, + "learning_rate": 0.0048631111899187065, + "loss": 0.0, + "num_input_tokens_seen": 62976928, + "step": 36750 + }, + { + "epoch": 178.42130750605327, + "grad_norm": 3.5304772438848886e-09, + "learning_rate": 0.0048482449232702335, + "loss": 0.0, + "num_input_tokens_seen": 62985376, + "step": 36755 + }, + { + "epoch": 178.4455205811138, + "grad_norm": 1.792364123787138e-08, + "learning_rate": 0.004833401040846469, + "loss": 0.0, + "num_input_tokens_seen": 62993792, + "step": 36760 + }, + { + "epoch": 178.46973365617433, + "grad_norm": 7.235087640111715e-09, + "learning_rate": 0.004818579544936546, + "loss": 0.0, + "num_input_tokens_seen": 63002368, + "step": 36765 + }, + { + "epoch": 178.49394673123487, + "grad_norm": 1.2430915852235103e-08, + "learning_rate": 0.004803780437826121, + "loss": 0.0, + "num_input_tokens_seen": 63010848, + "step": 36770 + }, + { + "epoch": 178.5181598062954, + "grad_norm": 9.902183961685296e-09, + "learning_rate": 0.004789003721797402, + "loss": 0.0, + "num_input_tokens_seen": 63019520, + "step": 36775 + }, + { + "epoch": 178.54237288135593, + "grad_norm": 6.250455708567415e-09, + "learning_rate": 0.004774249399129132, + "loss": 0.0, + "num_input_tokens_seen": 63027936, + "step": 36780 + }, + { + "epoch": 178.56658595641647, + "grad_norm": 1.4132424119850384e-08, + "learning_rate": 0.004759517472096642, + "loss": 0.0, + "num_input_tokens_seen": 63036448, + "step": 36785 + }, + { + "epoch": 178.590799031477, + "grad_norm": 6.802785890158702e-09, + "learning_rate": 0.004744807942971746, + "loss": 0.0, + "num_input_tokens_seen": 63045216, + "step": 36790 + }, + { + "epoch": 178.61501210653753, + "grad_norm": 1.3558341116493011e-08, + "learning_rate": 0.004730120814022881, + "loss": 0.0, + "num_input_tokens_seen": 63053728, + "step": 36795 + }, + { + "epoch": 178.63922518159805, + "grad_norm": 2.0401587086382733e-09, + "learning_rate": 0.004715456087514935, + "loss": 0.0, + "num_input_tokens_seen": 63062656, + "step": 36800 + }, + { + "epoch": 178.63922518159805, + "eval_loss": 1.1968333721160889, + "eval_runtime": 4.615, + "eval_samples_per_second": 79.524, + "eval_steps_per_second": 19.935, + "num_input_tokens_seen": 63062656, + "step": 36800 + }, + { + "epoch": 178.6634382566586, + "grad_norm": 7.193171835950807e-09, + "learning_rate": 0.004700813765709432, + "loss": 0.0, + "num_input_tokens_seen": 63071040, + "step": 36805 + }, + { + "epoch": 178.68765133171914, + "grad_norm": 9.60030099861342e-09, + "learning_rate": 0.004686193850864401, + "loss": 0.0, + "num_input_tokens_seen": 63079808, + "step": 36810 + }, + { + "epoch": 178.71186440677965, + "grad_norm": 9.828300839842541e-09, + "learning_rate": 0.004671596345234385, + "loss": 0.0, + "num_input_tokens_seen": 63088512, + "step": 36815 + }, + { + "epoch": 178.7360774818402, + "grad_norm": 8.16873146902708e-09, + "learning_rate": 0.00465702125107052, + "loss": 0.0, + "num_input_tokens_seen": 63096960, + "step": 36820 + }, + { + "epoch": 178.7602905569007, + "grad_norm": 2.0996338889744948e-08, + "learning_rate": 0.004642468570620506, + "loss": 0.0, + "num_input_tokens_seen": 63105536, + "step": 36825 + }, + { + "epoch": 178.78450363196126, + "grad_norm": 3.248909141717604e-09, + "learning_rate": 0.004627938306128482, + "loss": 0.0, + "num_input_tokens_seen": 63113792, + "step": 36830 + }, + { + "epoch": 178.8087167070218, + "grad_norm": 9.678974066673618e-09, + "learning_rate": 0.004613430459835255, + "loss": 0.0, + "num_input_tokens_seen": 63122464, + "step": 36835 + }, + { + "epoch": 178.83292978208232, + "grad_norm": 1.4453354069132729e-08, + "learning_rate": 0.004598945033978085, + "loss": 0.0, + "num_input_tokens_seen": 63131104, + "step": 36840 + }, + { + "epoch": 178.85714285714286, + "grad_norm": 1.3747733618174607e-08, + "learning_rate": 0.004584482030790804, + "loss": 0.0, + "num_input_tokens_seen": 63139584, + "step": 36845 + }, + { + "epoch": 178.88135593220338, + "grad_norm": 7.77297071152816e-09, + "learning_rate": 0.004570041452503826, + "loss": 0.0, + "num_input_tokens_seen": 63147968, + "step": 36850 + }, + { + "epoch": 178.90556900726392, + "grad_norm": 7.813641289544648e-09, + "learning_rate": 0.004555623301344003, + "loss": 0.0, + "num_input_tokens_seen": 63156640, + "step": 36855 + }, + { + "epoch": 178.92978208232446, + "grad_norm": 9.587195926030745e-09, + "learning_rate": 0.004541227579534857, + "loss": 0.0, + "num_input_tokens_seen": 63165248, + "step": 36860 + }, + { + "epoch": 178.95399515738498, + "grad_norm": 1.09485869259629e-08, + "learning_rate": 0.004526854289296378, + "loss": 0.0, + "num_input_tokens_seen": 63173920, + "step": 36865 + }, + { + "epoch": 178.97820823244552, + "grad_norm": 1.9202106571469812e-08, + "learning_rate": 0.004512503432845078, + "loss": 0.0, + "num_input_tokens_seen": 63182336, + "step": 36870 + }, + { + "epoch": 179.0048426150121, + "grad_norm": 1.2819896255678032e-08, + "learning_rate": 0.004498175012394068, + "loss": 0.0, + "num_input_tokens_seen": 63191648, + "step": 36875 + }, + { + "epoch": 179.02905569007265, + "grad_norm": 1.0231253178005772e-08, + "learning_rate": 0.004483869030152965, + "loss": 0.0, + "num_input_tokens_seen": 63200160, + "step": 36880 + }, + { + "epoch": 179.05326876513317, + "grad_norm": 2.607515980912467e-09, + "learning_rate": 0.004469585488327904, + "loss": 0.0, + "num_input_tokens_seen": 63208768, + "step": 36885 + }, + { + "epoch": 179.0774818401937, + "grad_norm": 6.070916214184763e-09, + "learning_rate": 0.0044553243891216395, + "loss": 0.0, + "num_input_tokens_seen": 63217216, + "step": 36890 + }, + { + "epoch": 179.10169491525423, + "grad_norm": 5.836167993322761e-09, + "learning_rate": 0.004441085734733363, + "loss": 0.0, + "num_input_tokens_seen": 63225760, + "step": 36895 + }, + { + "epoch": 179.12590799031477, + "grad_norm": 4.9709640848050185e-09, + "learning_rate": 0.004426869527358884, + "loss": 0.0, + "num_input_tokens_seen": 63234208, + "step": 36900 + }, + { + "epoch": 179.15012106537532, + "grad_norm": 5.097942956666657e-09, + "learning_rate": 0.0044126757691905156, + "loss": 0.0, + "num_input_tokens_seen": 63242560, + "step": 36905 + }, + { + "epoch": 179.17433414043583, + "grad_norm": 3.739473619646105e-09, + "learning_rate": 0.004398504462417107, + "loss": 0.0, + "num_input_tokens_seen": 63251200, + "step": 36910 + }, + { + "epoch": 179.19854721549638, + "grad_norm": 1.442065045154095e-08, + "learning_rate": 0.0043843556092240605, + "loss": 0.0, + "num_input_tokens_seen": 63260000, + "step": 36915 + }, + { + "epoch": 179.2227602905569, + "grad_norm": 7.802166024362123e-09, + "learning_rate": 0.004370229211793281, + "loss": 0.0, + "num_input_tokens_seen": 63268544, + "step": 36920 + }, + { + "epoch": 179.24697336561744, + "grad_norm": 1.69084426460131e-08, + "learning_rate": 0.0043561252723032405, + "loss": 0.0, + "num_input_tokens_seen": 63277088, + "step": 36925 + }, + { + "epoch": 179.27118644067798, + "grad_norm": 4.967713795878126e-09, + "learning_rate": 0.004342043792929001, + "loss": 0.0, + "num_input_tokens_seen": 63285472, + "step": 36930 + }, + { + "epoch": 179.2953995157385, + "grad_norm": 3.7246759010400865e-09, + "learning_rate": 0.004327984775842025, + "loss": 0.0, + "num_input_tokens_seen": 63293888, + "step": 36935 + }, + { + "epoch": 179.31961259079904, + "grad_norm": 9.120626032199652e-09, + "learning_rate": 0.004313948223210428, + "loss": 0.0, + "num_input_tokens_seen": 63302112, + "step": 36940 + }, + { + "epoch": 179.34382566585955, + "grad_norm": 1.5300191336109492e-08, + "learning_rate": 0.004299934137198846, + "loss": 0.0, + "num_input_tokens_seen": 63310784, + "step": 36945 + }, + { + "epoch": 179.3680387409201, + "grad_norm": 6.418750864156664e-09, + "learning_rate": 0.004285942519968383, + "loss": 0.0, + "num_input_tokens_seen": 63319456, + "step": 36950 + }, + { + "epoch": 179.39225181598064, + "grad_norm": 6.1886558100354705e-09, + "learning_rate": 0.004271973373676746, + "loss": 0.0, + "num_input_tokens_seen": 63327712, + "step": 36955 + }, + { + "epoch": 179.41646489104116, + "grad_norm": 1.3722393887860562e-08, + "learning_rate": 0.004258026700478146, + "loss": 0.0, + "num_input_tokens_seen": 63336480, + "step": 36960 + }, + { + "epoch": 179.4406779661017, + "grad_norm": 1.058445775470318e-08, + "learning_rate": 0.004244102502523328, + "loss": 0.0, + "num_input_tokens_seen": 63345568, + "step": 36965 + }, + { + "epoch": 179.46489104116222, + "grad_norm": 9.37354194263662e-09, + "learning_rate": 0.004230200781959592, + "loss": 0.0, + "num_input_tokens_seen": 63354112, + "step": 36970 + }, + { + "epoch": 179.48910411622276, + "grad_norm": 4.380127371916842e-09, + "learning_rate": 0.004216321540930756, + "loss": 0.0, + "num_input_tokens_seen": 63362624, + "step": 36975 + }, + { + "epoch": 179.5133171912833, + "grad_norm": 1.490173850982046e-08, + "learning_rate": 0.004202464781577175, + "loss": 0.0, + "num_input_tokens_seen": 63371392, + "step": 36980 + }, + { + "epoch": 179.53753026634382, + "grad_norm": 3.873879883542486e-09, + "learning_rate": 0.00418863050603574, + "loss": 0.0, + "num_input_tokens_seen": 63379584, + "step": 36985 + }, + { + "epoch": 179.56174334140437, + "grad_norm": 9.648676524420807e-09, + "learning_rate": 0.004174818716439843, + "loss": 0.0, + "num_input_tokens_seen": 63388480, + "step": 36990 + }, + { + "epoch": 179.58595641646488, + "grad_norm": 3.803235948396377e-09, + "learning_rate": 0.004161029414919464, + "loss": 0.0, + "num_input_tokens_seen": 63396800, + "step": 36995 + }, + { + "epoch": 179.61016949152543, + "grad_norm": 6.971658805809966e-09, + "learning_rate": 0.004147262603601071, + "loss": 0.0, + "num_input_tokens_seen": 63405504, + "step": 37000 + }, + { + "epoch": 179.61016949152543, + "eval_loss": 1.1963489055633545, + "eval_runtime": 4.6309, + "eval_samples_per_second": 79.251, + "eval_steps_per_second": 19.867, + "num_input_tokens_seen": 63405504, + "step": 37000 + }, + { + "epoch": 179.63438256658597, + "grad_norm": 1.56529811334849e-08, + "learning_rate": 0.004133518284607679, + "loss": 0.0, + "num_input_tokens_seen": 63413888, + "step": 37005 + }, + { + "epoch": 179.65859564164649, + "grad_norm": 9.38139077533151e-09, + "learning_rate": 0.004119796460058861, + "loss": 0.0, + "num_input_tokens_seen": 63422368, + "step": 37010 + }, + { + "epoch": 179.68280871670703, + "grad_norm": 2.623053774186701e-09, + "learning_rate": 0.00410609713207064, + "loss": 0.0, + "num_input_tokens_seen": 63430848, + "step": 37015 + }, + { + "epoch": 179.70702179176754, + "grad_norm": 5.87665738294163e-09, + "learning_rate": 0.004092420302755678, + "loss": 0.0, + "num_input_tokens_seen": 63439168, + "step": 37020 + }, + { + "epoch": 179.7312348668281, + "grad_norm": 6.941882624289519e-09, + "learning_rate": 0.004078765974223103, + "loss": 0.0, + "num_input_tokens_seen": 63447360, + "step": 37025 + }, + { + "epoch": 179.75544794188863, + "grad_norm": 7.246972799634932e-09, + "learning_rate": 0.004065134148578564, + "loss": 0.0, + "num_input_tokens_seen": 63456192, + "step": 37030 + }, + { + "epoch": 179.77966101694915, + "grad_norm": 2.6530946328762184e-09, + "learning_rate": 0.004051524827924279, + "loss": 0.0, + "num_input_tokens_seen": 63464672, + "step": 37035 + }, + { + "epoch": 179.8038740920097, + "grad_norm": 1.9627734104687988e-08, + "learning_rate": 0.004037938014358955, + "loss": 0.0, + "num_input_tokens_seen": 63473536, + "step": 37040 + }, + { + "epoch": 179.8280871670702, + "grad_norm": 8.482638591544855e-09, + "learning_rate": 0.004024373709977863, + "loss": 0.0, + "num_input_tokens_seen": 63481952, + "step": 37045 + }, + { + "epoch": 179.85230024213075, + "grad_norm": 6.1044302945845175e-09, + "learning_rate": 0.004010831916872814, + "loss": 0.0, + "num_input_tokens_seen": 63490816, + "step": 37050 + }, + { + "epoch": 179.8765133171913, + "grad_norm": 6.456162715551272e-09, + "learning_rate": 0.003997312637132089, + "loss": 0.0, + "num_input_tokens_seen": 63499552, + "step": 37055 + }, + { + "epoch": 179.9007263922518, + "grad_norm": 1.3137776200267126e-08, + "learning_rate": 0.003983815872840535, + "loss": 0.0, + "num_input_tokens_seen": 63508096, + "step": 37060 + }, + { + "epoch": 179.92493946731236, + "grad_norm": 7.831650883360908e-09, + "learning_rate": 0.003970341626079521, + "loss": 0.0, + "num_input_tokens_seen": 63517024, + "step": 37065 + }, + { + "epoch": 179.94915254237287, + "grad_norm": 1.768029278537142e-08, + "learning_rate": 0.003956889898926952, + "loss": 0.0, + "num_input_tokens_seen": 63525344, + "step": 37070 + }, + { + "epoch": 179.97336561743342, + "grad_norm": 4.846775869538078e-09, + "learning_rate": 0.0039434606934572675, + "loss": 0.0, + "num_input_tokens_seen": 63533632, + "step": 37075 + }, + { + "epoch": 179.99757869249396, + "grad_norm": 6.55322995868346e-09, + "learning_rate": 0.003930054011741396, + "loss": 0.0, + "num_input_tokens_seen": 63542112, + "step": 37080 + }, + { + "epoch": 180.02421307506054, + "grad_norm": 6.217854675583112e-09, + "learning_rate": 0.0039166698558468155, + "loss": 0.0, + "num_input_tokens_seen": 63551200, + "step": 37085 + }, + { + "epoch": 180.04842615012106, + "grad_norm": 6.0858980077682645e-09, + "learning_rate": 0.0039033082278375594, + "loss": 0.0, + "num_input_tokens_seen": 63560000, + "step": 37090 + }, + { + "epoch": 180.0726392251816, + "grad_norm": 5.423223203848693e-09, + "learning_rate": 0.003889969129774112, + "loss": 0.0, + "num_input_tokens_seen": 63568448, + "step": 37095 + }, + { + "epoch": 180.09685230024212, + "grad_norm": 1.239176672385156e-08, + "learning_rate": 0.0038766525637135784, + "loss": 0.0, + "num_input_tokens_seen": 63576768, + "step": 37100 + }, + { + "epoch": 180.12106537530266, + "grad_norm": 6.752013170796545e-09, + "learning_rate": 0.0038633585317095318, + "loss": 0.0, + "num_input_tokens_seen": 63585376, + "step": 37105 + }, + { + "epoch": 180.1452784503632, + "grad_norm": 1.4319629038084258e-08, + "learning_rate": 0.00385008703581205, + "loss": 0.0, + "num_input_tokens_seen": 63594016, + "step": 37110 + }, + { + "epoch": 180.16949152542372, + "grad_norm": 8.396245476660624e-09, + "learning_rate": 0.0038368380780677944, + "loss": 0.0, + "num_input_tokens_seen": 63602784, + "step": 37115 + }, + { + "epoch": 180.19370460048427, + "grad_norm": 9.193124483886095e-09, + "learning_rate": 0.003823611660519882, + "loss": 0.0, + "num_input_tokens_seen": 63611136, + "step": 37120 + }, + { + "epoch": 180.21791767554478, + "grad_norm": 5.040383221910361e-09, + "learning_rate": 0.0038104077852080475, + "loss": 0.0, + "num_input_tokens_seen": 63619648, + "step": 37125 + }, + { + "epoch": 180.24213075060533, + "grad_norm": 6.548023012697968e-09, + "learning_rate": 0.003797226454168462, + "loss": 0.0, + "num_input_tokens_seen": 63628160, + "step": 37130 + }, + { + "epoch": 180.26634382566587, + "grad_norm": 3.7611151526562026e-08, + "learning_rate": 0.003784067669433849, + "loss": 0.0, + "num_input_tokens_seen": 63637056, + "step": 37135 + }, + { + "epoch": 180.2905569007264, + "grad_norm": 6.237910188389151e-09, + "learning_rate": 0.0037709314330334528, + "loss": 0.0, + "num_input_tokens_seen": 63645472, + "step": 37140 + }, + { + "epoch": 180.31476997578693, + "grad_norm": 7.984854555331822e-09, + "learning_rate": 0.003757817746993086, + "loss": 0.0, + "num_input_tokens_seen": 63654080, + "step": 37145 + }, + { + "epoch": 180.33898305084745, + "grad_norm": 7.532460877257563e-09, + "learning_rate": 0.0037447266133349977, + "loss": 0.0, + "num_input_tokens_seen": 63662880, + "step": 37150 + }, + { + "epoch": 180.363196125908, + "grad_norm": 9.281198032340399e-09, + "learning_rate": 0.003731658034078039, + "loss": 0.0, + "num_input_tokens_seen": 63671328, + "step": 37155 + }, + { + "epoch": 180.38740920096853, + "grad_norm": 1.062257570794145e-08, + "learning_rate": 0.0037186120112375153, + "loss": 0.0, + "num_input_tokens_seen": 63680192, + "step": 37160 + }, + { + "epoch": 180.41162227602905, + "grad_norm": 5.114604739731021e-09, + "learning_rate": 0.003705588546825317, + "loss": 0.0, + "num_input_tokens_seen": 63688736, + "step": 37165 + }, + { + "epoch": 180.4358353510896, + "grad_norm": 7.7971966661039e-09, + "learning_rate": 0.0036925876428498205, + "loss": 0.0, + "num_input_tokens_seen": 63697504, + "step": 37170 + }, + { + "epoch": 180.4600484261501, + "grad_norm": 8.612936142071703e-09, + "learning_rate": 0.0036796093013159057, + "loss": 0.0, + "num_input_tokens_seen": 63706368, + "step": 37175 + }, + { + "epoch": 180.48426150121065, + "grad_norm": 7.272639823696636e-09, + "learning_rate": 0.0036666535242250217, + "loss": 0.0, + "num_input_tokens_seen": 63714816, + "step": 37180 + }, + { + "epoch": 180.5084745762712, + "grad_norm": 8.171239684884313e-09, + "learning_rate": 0.003653720313575104, + "loss": 0.0, + "num_input_tokens_seen": 63723232, + "step": 37185 + }, + { + "epoch": 180.5326876513317, + "grad_norm": 3.775881385337243e-09, + "learning_rate": 0.003640809671360623, + "loss": 0.0, + "num_input_tokens_seen": 63731840, + "step": 37190 + }, + { + "epoch": 180.55690072639226, + "grad_norm": 7.805005530769904e-09, + "learning_rate": 0.003627921599572553, + "loss": 0.0, + "num_input_tokens_seen": 63740544, + "step": 37195 + }, + { + "epoch": 180.58111380145277, + "grad_norm": 6.183645151480732e-09, + "learning_rate": 0.003615056100198405, + "loss": 0.0, + "num_input_tokens_seen": 63748768, + "step": 37200 + }, + { + "epoch": 180.58111380145277, + "eval_loss": 1.1958789825439453, + "eval_runtime": 4.6358, + "eval_samples_per_second": 79.167, + "eval_steps_per_second": 19.846, + "num_input_tokens_seen": 63748768, + "step": 37200 + }, + { + "epoch": 180.60532687651332, + "grad_norm": 5.070826425424002e-09, + "learning_rate": 0.003602213175222174, + "loss": 0.0, + "num_input_tokens_seen": 63757216, + "step": 37205 + }, + { + "epoch": 180.62953995157386, + "grad_norm": 1.1938261046395837e-08, + "learning_rate": 0.0035893928266244432, + "loss": 0.0, + "num_input_tokens_seen": 63765568, + "step": 37210 + }, + { + "epoch": 180.65375302663438, + "grad_norm": 3.4220559719244648e-09, + "learning_rate": 0.003576595056382248, + "loss": 0.0, + "num_input_tokens_seen": 63774048, + "step": 37215 + }, + { + "epoch": 180.67796610169492, + "grad_norm": 9.07726960264199e-09, + "learning_rate": 0.0035638198664691423, + "loss": 0.0, + "num_input_tokens_seen": 63782336, + "step": 37220 + }, + { + "epoch": 180.70217917675544, + "grad_norm": 1.2192892029361246e-08, + "learning_rate": 0.003551067258855267, + "loss": 0.0, + "num_input_tokens_seen": 63791072, + "step": 37225 + }, + { + "epoch": 180.72639225181598, + "grad_norm": 6.208565661580678e-09, + "learning_rate": 0.0035383372355071996, + "loss": 0.0, + "num_input_tokens_seen": 63799680, + "step": 37230 + }, + { + "epoch": 180.75060532687652, + "grad_norm": 7.726346673564422e-09, + "learning_rate": 0.0035256297983881023, + "loss": 0.0, + "num_input_tokens_seen": 63808352, + "step": 37235 + }, + { + "epoch": 180.77481840193704, + "grad_norm": 8.525367967138209e-09, + "learning_rate": 0.0035129449494575747, + "loss": 0.0, + "num_input_tokens_seen": 63817088, + "step": 37240 + }, + { + "epoch": 180.79903147699758, + "grad_norm": 7.309141292211052e-09, + "learning_rate": 0.0035002826906718187, + "loss": 0.0, + "num_input_tokens_seen": 63825824, + "step": 37245 + }, + { + "epoch": 180.8232445520581, + "grad_norm": 9.694614888644537e-09, + "learning_rate": 0.003487643023983522, + "loss": 0.0, + "num_input_tokens_seen": 63834528, + "step": 37250 + }, + { + "epoch": 180.84745762711864, + "grad_norm": 1.064135712880443e-08, + "learning_rate": 0.003475025951341842, + "loss": 0.0, + "num_input_tokens_seen": 63842816, + "step": 37255 + }, + { + "epoch": 180.8716707021792, + "grad_norm": 1.6464422714079774e-08, + "learning_rate": 0.00346243147469249, + "loss": 0.0, + "num_input_tokens_seen": 63851552, + "step": 37260 + }, + { + "epoch": 180.8958837772397, + "grad_norm": 1.0901325175893817e-08, + "learning_rate": 0.0034498595959777446, + "loss": 0.0, + "num_input_tokens_seen": 63860352, + "step": 37265 + }, + { + "epoch": 180.92009685230025, + "grad_norm": 5.5315290126145555e-09, + "learning_rate": 0.003437310317136305, + "loss": 0.0, + "num_input_tokens_seen": 63868768, + "step": 37270 + }, + { + "epoch": 180.94430992736076, + "grad_norm": 2.7603486163485513e-09, + "learning_rate": 0.0034247836401034236, + "loss": 0.0, + "num_input_tokens_seen": 63877376, + "step": 37275 + }, + { + "epoch": 180.9685230024213, + "grad_norm": 5.354389820411143e-09, + "learning_rate": 0.003412279566810905, + "loss": 0.0, + "num_input_tokens_seen": 63885760, + "step": 37280 + }, + { + "epoch": 180.99273607748185, + "grad_norm": 1.0347122270104592e-08, + "learning_rate": 0.00339979809918699, + "loss": 0.0, + "num_input_tokens_seen": 63894432, + "step": 37285 + }, + { + "epoch": 181.01937046004844, + "grad_norm": 4.818735188649725e-09, + "learning_rate": 0.0033873392391565228, + "loss": 0.0, + "num_input_tokens_seen": 63903136, + "step": 37290 + }, + { + "epoch": 181.04358353510895, + "grad_norm": 6.856034850954984e-09, + "learning_rate": 0.003374902988640782, + "loss": 0.0, + "num_input_tokens_seen": 63911296, + "step": 37295 + }, + { + "epoch": 181.0677966101695, + "grad_norm": 1.1053616688627699e-08, + "learning_rate": 0.0033624893495576014, + "loss": 0.0, + "num_input_tokens_seen": 63919584, + "step": 37300 + }, + { + "epoch": 181.09200968523, + "grad_norm": 8.29125923473839e-09, + "learning_rate": 0.0033500983238213323, + "loss": 0.0, + "num_input_tokens_seen": 63928160, + "step": 37305 + }, + { + "epoch": 181.11622276029055, + "grad_norm": 5.816938042357833e-09, + "learning_rate": 0.0033377299133428126, + "loss": 0.0, + "num_input_tokens_seen": 63936672, + "step": 37310 + }, + { + "epoch": 181.1404358353511, + "grad_norm": 3.7893652660159205e-09, + "learning_rate": 0.003325384120029434, + "loss": 0.0, + "num_input_tokens_seen": 63945248, + "step": 37315 + }, + { + "epoch": 181.16464891041161, + "grad_norm": 6.989645751076523e-09, + "learning_rate": 0.0033130609457850233, + "loss": 0.0, + "num_input_tokens_seen": 63954304, + "step": 37320 + }, + { + "epoch": 181.18886198547216, + "grad_norm": 1.654845682708128e-08, + "learning_rate": 0.0033007603925100104, + "loss": 0.0, + "num_input_tokens_seen": 63962912, + "step": 37325 + }, + { + "epoch": 181.21307506053267, + "grad_norm": 1.9242877513647727e-08, + "learning_rate": 0.003288482462101294, + "loss": 0.0, + "num_input_tokens_seen": 63970976, + "step": 37330 + }, + { + "epoch": 181.23728813559322, + "grad_norm": 1.048236519807233e-08, + "learning_rate": 0.0032762271564522605, + "loss": 0.0, + "num_input_tokens_seen": 63979424, + "step": 37335 + }, + { + "epoch": 181.26150121065376, + "grad_norm": 1.0846271436548705e-08, + "learning_rate": 0.003263994477452864, + "loss": 0.0, + "num_input_tokens_seen": 63988128, + "step": 37340 + }, + { + "epoch": 181.28571428571428, + "grad_norm": 1.4971190509527332e-08, + "learning_rate": 0.0032517844269895125, + "loss": 0.0, + "num_input_tokens_seen": 63996896, + "step": 37345 + }, + { + "epoch": 181.30992736077482, + "grad_norm": 8.362973424880238e-09, + "learning_rate": 0.0032395970069451496, + "loss": 0.0, + "num_input_tokens_seen": 64005760, + "step": 37350 + }, + { + "epoch": 181.33414043583534, + "grad_norm": 1.2679965522011116e-08, + "learning_rate": 0.0032274322191992388, + "loss": 0.0, + "num_input_tokens_seen": 64014592, + "step": 37355 + }, + { + "epoch": 181.35835351089588, + "grad_norm": 8.88629259065965e-09, + "learning_rate": 0.0032152900656277294, + "loss": 0.0, + "num_input_tokens_seen": 64023104, + "step": 37360 + }, + { + "epoch": 181.38256658595643, + "grad_norm": 6.452025580472309e-09, + "learning_rate": 0.0032031705481030902, + "loss": 0.0, + "num_input_tokens_seen": 64031712, + "step": 37365 + }, + { + "epoch": 181.40677966101694, + "grad_norm": 9.049752058842842e-09, + "learning_rate": 0.0031910736684943428, + "loss": 0.0, + "num_input_tokens_seen": 64039872, + "step": 37370 + }, + { + "epoch": 181.43099273607749, + "grad_norm": 1.2382328051785407e-08, + "learning_rate": 0.0031789994286669453, + "loss": 0.0, + "num_input_tokens_seen": 64048512, + "step": 37375 + }, + { + "epoch": 181.455205811138, + "grad_norm": 1.3898460160532977e-08, + "learning_rate": 0.003166947830482908, + "loss": 0.0, + "num_input_tokens_seen": 64057216, + "step": 37380 + }, + { + "epoch": 181.47941888619854, + "grad_norm": 1.1408285871539192e-08, + "learning_rate": 0.003154918875800727, + "loss": 0.0, + "num_input_tokens_seen": 64066176, + "step": 37385 + }, + { + "epoch": 181.5036319612591, + "grad_norm": 9.22305165573789e-09, + "learning_rate": 0.00314291256647542, + "loss": 0.0, + "num_input_tokens_seen": 64075072, + "step": 37390 + }, + { + "epoch": 181.5278450363196, + "grad_norm": 7.946044711104605e-09, + "learning_rate": 0.0031309289043585375, + "loss": 0.0, + "num_input_tokens_seen": 64083520, + "step": 37395 + }, + { + "epoch": 181.55205811138015, + "grad_norm": 1.2152845840773807e-08, + "learning_rate": 0.003118967891298069, + "loss": 0.0, + "num_input_tokens_seen": 64092416, + "step": 37400 + }, + { + "epoch": 181.55205811138015, + "eval_loss": 1.1894118785858154, + "eval_runtime": 4.6157, + "eval_samples_per_second": 79.512, + "eval_steps_per_second": 19.932, + "num_input_tokens_seen": 64092416, + "step": 37400 + }, + { + "epoch": 181.57627118644066, + "grad_norm": 1.2654618686269714e-08, + "learning_rate": 0.003107029529138572, + "loss": 0.0, + "num_input_tokens_seen": 64101056, + "step": 37405 + }, + { + "epoch": 181.6004842615012, + "grad_norm": 1.426394380388274e-08, + "learning_rate": 0.0030951138197211235, + "loss": 0.0, + "num_input_tokens_seen": 64109728, + "step": 37410 + }, + { + "epoch": 181.62469733656175, + "grad_norm": 9.17376841158557e-09, + "learning_rate": 0.0030832207648832377, + "loss": 0.0, + "num_input_tokens_seen": 64117984, + "step": 37415 + }, + { + "epoch": 181.64891041162227, + "grad_norm": 6.503916516464869e-09, + "learning_rate": 0.0030713503664589635, + "loss": 0.0, + "num_input_tokens_seen": 64126560, + "step": 37420 + }, + { + "epoch": 181.6731234866828, + "grad_norm": 1.0131665284518476e-08, + "learning_rate": 0.0030595026262788872, + "loss": 0.0, + "num_input_tokens_seen": 64135200, + "step": 37425 + }, + { + "epoch": 181.69733656174333, + "grad_norm": 1.6995711504819155e-08, + "learning_rate": 0.00304767754617008, + "loss": 0.0, + "num_input_tokens_seen": 64143648, + "step": 37430 + }, + { + "epoch": 181.72154963680387, + "grad_norm": 5.016273174618391e-09, + "learning_rate": 0.003035875127956117, + "loss": 0.0, + "num_input_tokens_seen": 64152320, + "step": 37435 + }, + { + "epoch": 181.74576271186442, + "grad_norm": 8.37463698388774e-09, + "learning_rate": 0.0030240953734570752, + "loss": 0.0, + "num_input_tokens_seen": 64161056, + "step": 37440 + }, + { + "epoch": 181.76997578692493, + "grad_norm": 8.777736759668642e-09, + "learning_rate": 0.003012338284489535, + "loss": 0.0, + "num_input_tokens_seen": 64169760, + "step": 37445 + }, + { + "epoch": 181.79418886198548, + "grad_norm": 8.58993320917989e-09, + "learning_rate": 0.0030006038628665964, + "loss": 0.0, + "num_input_tokens_seen": 64177856, + "step": 37450 + }, + { + "epoch": 181.818401937046, + "grad_norm": 6.935529484053404e-09, + "learning_rate": 0.002988892110397845, + "loss": 0.0, + "num_input_tokens_seen": 64186688, + "step": 37455 + }, + { + "epoch": 181.84261501210653, + "grad_norm": 1.3064691550823682e-08, + "learning_rate": 0.0029772030288894025, + "loss": 0.0, + "num_input_tokens_seen": 64195616, + "step": 37460 + }, + { + "epoch": 181.86682808716708, + "grad_norm": 1.2262557191888845e-08, + "learning_rate": 0.0029655366201438438, + "loss": 0.0, + "num_input_tokens_seen": 64204512, + "step": 37465 + }, + { + "epoch": 181.8910411622276, + "grad_norm": 5.782862189107618e-09, + "learning_rate": 0.0029538928859602965, + "loss": 0.0, + "num_input_tokens_seen": 64212768, + "step": 37470 + }, + { + "epoch": 181.91525423728814, + "grad_norm": 5.120763368893222e-09, + "learning_rate": 0.002942271828134374, + "loss": 0.0, + "num_input_tokens_seen": 64220832, + "step": 37475 + }, + { + "epoch": 181.93946731234865, + "grad_norm": 5.353110399397565e-09, + "learning_rate": 0.00293067344845816, + "loss": 0.0, + "num_input_tokens_seen": 64229184, + "step": 37480 + }, + { + "epoch": 181.9636803874092, + "grad_norm": 2.239626972766473e-08, + "learning_rate": 0.0029190977487202896, + "loss": 0.0, + "num_input_tokens_seen": 64238016, + "step": 37485 + }, + { + "epoch": 181.98789346246974, + "grad_norm": 5.8816680414963685e-09, + "learning_rate": 0.0029075447307058853, + "loss": 0.0, + "num_input_tokens_seen": 64246752, + "step": 37490 + }, + { + "epoch": 182.01452784503633, + "grad_norm": 8.12859735077609e-09, + "learning_rate": 0.0028960143961965722, + "loss": 0.0, + "num_input_tokens_seen": 64255936, + "step": 37495 + }, + { + "epoch": 182.03874092009684, + "grad_norm": 1.3084504146831932e-08, + "learning_rate": 0.002884506746970461, + "loss": 0.0, + "num_input_tokens_seen": 64264512, + "step": 37500 + }, + { + "epoch": 182.0629539951574, + "grad_norm": 9.591572869283027e-09, + "learning_rate": 0.0028730217848021654, + "loss": 0.0, + "num_input_tokens_seen": 64273440, + "step": 37505 + }, + { + "epoch": 182.08716707021793, + "grad_norm": 7.941136637157342e-09, + "learning_rate": 0.0028615595114628188, + "loss": 0.0, + "num_input_tokens_seen": 64281824, + "step": 37510 + }, + { + "epoch": 182.11138014527845, + "grad_norm": 9.808966083824089e-09, + "learning_rate": 0.002850119928720074, + "loss": 0.0, + "num_input_tokens_seen": 64290528, + "step": 37515 + }, + { + "epoch": 182.135593220339, + "grad_norm": 2.385130315474271e-09, + "learning_rate": 0.0028387030383380195, + "loss": 0.0, + "num_input_tokens_seen": 64298912, + "step": 37520 + }, + { + "epoch": 182.1598062953995, + "grad_norm": 8.305311993694886e-09, + "learning_rate": 0.0028273088420772974, + "loss": 0.0, + "num_input_tokens_seen": 64307520, + "step": 37525 + }, + { + "epoch": 182.18401937046005, + "grad_norm": 7.405705382268479e-09, + "learning_rate": 0.002815937341695068, + "loss": 0.0, + "num_input_tokens_seen": 64315712, + "step": 37530 + }, + { + "epoch": 182.2082324455206, + "grad_norm": 1.6558020732304612e-08, + "learning_rate": 0.0028045885389448963, + "loss": 0.0, + "num_input_tokens_seen": 64323872, + "step": 37535 + }, + { + "epoch": 182.2324455205811, + "grad_norm": 5.0896358239072015e-09, + "learning_rate": 0.002793262435576965, + "loss": 0.0, + "num_input_tokens_seen": 64332608, + "step": 37540 + }, + { + "epoch": 182.25665859564165, + "grad_norm": 7.246133915117525e-09, + "learning_rate": 0.0027819590333378772, + "loss": 0.0, + "num_input_tokens_seen": 64341376, + "step": 37545 + }, + { + "epoch": 182.28087167070217, + "grad_norm": 1.5517215956606378e-08, + "learning_rate": 0.002770678333970755, + "loss": 0.0, + "num_input_tokens_seen": 64350016, + "step": 37550 + }, + { + "epoch": 182.3050847457627, + "grad_norm": 6.171066768700939e-09, + "learning_rate": 0.0027594203392152573, + "loss": 0.0, + "num_input_tokens_seen": 64358720, + "step": 37555 + }, + { + "epoch": 182.32929782082326, + "grad_norm": 5.168658390175551e-09, + "learning_rate": 0.002748185050807478, + "loss": 0.0, + "num_input_tokens_seen": 64367424, + "step": 37560 + }, + { + "epoch": 182.35351089588377, + "grad_norm": 9.831150116212939e-09, + "learning_rate": 0.002736972470480031, + "loss": 0.0, + "num_input_tokens_seen": 64375904, + "step": 37565 + }, + { + "epoch": 182.37772397094432, + "grad_norm": 1.0837203134883566e-08, + "learning_rate": 0.002725782599962068, + "loss": 0.0, + "num_input_tokens_seen": 64384896, + "step": 37570 + }, + { + "epoch": 182.40193704600483, + "grad_norm": 8.637900172914215e-09, + "learning_rate": 0.0027146154409791734, + "loss": 0.0, + "num_input_tokens_seen": 64393536, + "step": 37575 + }, + { + "epoch": 182.42615012106538, + "grad_norm": 5.922056178775392e-09, + "learning_rate": 0.002703470995253504, + "loss": 0.0, + "num_input_tokens_seen": 64401952, + "step": 37580 + }, + { + "epoch": 182.45036319612592, + "grad_norm": 1.1442454095345056e-08, + "learning_rate": 0.0026923492645036184, + "loss": 0.0, + "num_input_tokens_seen": 64410880, + "step": 37585 + }, + { + "epoch": 182.47457627118644, + "grad_norm": 1.4600669118181031e-08, + "learning_rate": 0.0026812502504446776, + "loss": 0.0, + "num_input_tokens_seen": 64419264, + "step": 37590 + }, + { + "epoch": 182.49878934624698, + "grad_norm": 8.936176243423688e-09, + "learning_rate": 0.0026701739547882798, + "loss": 0.0, + "num_input_tokens_seen": 64428384, + "step": 37595 + }, + { + "epoch": 182.5230024213075, + "grad_norm": 1.6210828235330155e-08, + "learning_rate": 0.0026591203792425077, + "loss": 0.0, + "num_input_tokens_seen": 64436992, + "step": 37600 + }, + { + "epoch": 182.5230024213075, + "eval_loss": 1.199663758277893, + "eval_runtime": 4.6142, + "eval_samples_per_second": 79.537, + "eval_steps_per_second": 19.938, + "num_input_tokens_seen": 64436992, + "step": 37600 + }, + { + "epoch": 182.54721549636804, + "grad_norm": 1.2571518936965731e-08, + "learning_rate": 0.0026480895255119818, + "loss": 0.0, + "num_input_tokens_seen": 64445376, + "step": 37605 + }, + { + "epoch": 182.57142857142858, + "grad_norm": 1.0053812893318081e-08, + "learning_rate": 0.002637081395297791, + "loss": 0.0, + "num_input_tokens_seen": 64453920, + "step": 37610 + }, + { + "epoch": 182.5956416464891, + "grad_norm": 9.059103689423864e-09, + "learning_rate": 0.0026260959902975113, + "loss": 0.0, + "num_input_tokens_seen": 64462432, + "step": 37615 + }, + { + "epoch": 182.61985472154964, + "grad_norm": 7.2606320955515e-09, + "learning_rate": 0.00261513331220527, + "loss": 0.0, + "num_input_tokens_seen": 64470848, + "step": 37620 + }, + { + "epoch": 182.64406779661016, + "grad_norm": 7.0160433018884305e-09, + "learning_rate": 0.0026041933627116154, + "loss": 0.0, + "num_input_tokens_seen": 64479456, + "step": 37625 + }, + { + "epoch": 182.6682808716707, + "grad_norm": 7.309581828707223e-09, + "learning_rate": 0.0025932761435036476, + "loss": 0.0, + "num_input_tokens_seen": 64487840, + "step": 37630 + }, + { + "epoch": 182.69249394673125, + "grad_norm": 8.906939186204e-09, + "learning_rate": 0.002582381656264904, + "loss": 0.0, + "num_input_tokens_seen": 64496320, + "step": 37635 + }, + { + "epoch": 182.71670702179176, + "grad_norm": 1.099721735897674e-08, + "learning_rate": 0.0025715099026754895, + "loss": 0.0, + "num_input_tokens_seen": 64504992, + "step": 37640 + }, + { + "epoch": 182.7409200968523, + "grad_norm": 1.2552473727112101e-08, + "learning_rate": 0.002560660884411947, + "loss": 0.0, + "num_input_tokens_seen": 64513632, + "step": 37645 + }, + { + "epoch": 182.76513317191282, + "grad_norm": 1.4439121898135454e-08, + "learning_rate": 0.0025498346031473385, + "loss": 0.0, + "num_input_tokens_seen": 64522144, + "step": 37650 + }, + { + "epoch": 182.78934624697337, + "grad_norm": 1.050306952521396e-08, + "learning_rate": 0.0025390310605511945, + "loss": 0.0, + "num_input_tokens_seen": 64530592, + "step": 37655 + }, + { + "epoch": 182.8135593220339, + "grad_norm": 1.3157689160436803e-08, + "learning_rate": 0.0025282502582895995, + "loss": 0.0, + "num_input_tokens_seen": 64539232, + "step": 37660 + }, + { + "epoch": 182.83777239709443, + "grad_norm": 6.16481443671546e-09, + "learning_rate": 0.002517492198025023, + "loss": 0.0, + "num_input_tokens_seen": 64547776, + "step": 37665 + }, + { + "epoch": 182.86198547215497, + "grad_norm": 1.4274787574208858e-08, + "learning_rate": 0.0025067568814165554, + "loss": 0.0, + "num_input_tokens_seen": 64556096, + "step": 37670 + }, + { + "epoch": 182.88619854721549, + "grad_norm": 3.4479816779509065e-09, + "learning_rate": 0.0024960443101196884, + "loss": 0.0, + "num_input_tokens_seen": 64564736, + "step": 37675 + }, + { + "epoch": 182.91041162227603, + "grad_norm": 5.491965548998223e-09, + "learning_rate": 0.002485354485786434, + "loss": 0.0, + "num_input_tokens_seen": 64573120, + "step": 37680 + }, + { + "epoch": 182.93462469733657, + "grad_norm": 3.5687446331422734e-09, + "learning_rate": 0.002474687410065307, + "loss": 0.0, + "num_input_tokens_seen": 64581440, + "step": 37685 + }, + { + "epoch": 182.9588377723971, + "grad_norm": 1.2868261123344382e-08, + "learning_rate": 0.002464043084601308, + "loss": 0.0, + "num_input_tokens_seen": 64589952, + "step": 37690 + }, + { + "epoch": 182.98305084745763, + "grad_norm": 4.969246791830528e-09, + "learning_rate": 0.0024534215110358915, + "loss": 0.0, + "num_input_tokens_seen": 64598656, + "step": 37695 + }, + { + "epoch": 183.00968523002422, + "grad_norm": 6.938346341911483e-09, + "learning_rate": 0.002442822691007096, + "loss": 0.0, + "num_input_tokens_seen": 64607584, + "step": 37700 + }, + { + "epoch": 183.03389830508473, + "grad_norm": 1.1764711871364852e-08, + "learning_rate": 0.002432246626149348, + "loss": 0.0, + "num_input_tokens_seen": 64615840, + "step": 37705 + }, + { + "epoch": 183.05811138014528, + "grad_norm": 5.909197575704184e-09, + "learning_rate": 0.002421693318093626, + "loss": 0.0, + "num_input_tokens_seen": 64624128, + "step": 37710 + }, + { + "epoch": 183.08232445520582, + "grad_norm": 1.1107023745182687e-08, + "learning_rate": 0.0024111627684673784, + "loss": 0.0, + "num_input_tokens_seen": 64632672, + "step": 37715 + }, + { + "epoch": 183.10653753026634, + "grad_norm": 1.7933812657133785e-08, + "learning_rate": 0.0024006549788945395, + "loss": 0.0, + "num_input_tokens_seen": 64641024, + "step": 37720 + }, + { + "epoch": 183.13075060532688, + "grad_norm": 1.5715601264787438e-08, + "learning_rate": 0.0023901699509955463, + "loss": 0.0, + "num_input_tokens_seen": 64649376, + "step": 37725 + }, + { + "epoch": 183.1549636803874, + "grad_norm": 6.026454890672994e-09, + "learning_rate": 0.0023797076863873554, + "loss": 0.0, + "num_input_tokens_seen": 64658080, + "step": 37730 + }, + { + "epoch": 183.17917675544794, + "grad_norm": 1.1414372558249397e-08, + "learning_rate": 0.0023692681866833262, + "loss": 0.0, + "num_input_tokens_seen": 64666752, + "step": 37735 + }, + { + "epoch": 183.20338983050848, + "grad_norm": 2.6852182699599325e-09, + "learning_rate": 0.0023588514534934046, + "loss": 0.0, + "num_input_tokens_seen": 64675424, + "step": 37740 + }, + { + "epoch": 183.227602905569, + "grad_norm": 1.724594334007179e-08, + "learning_rate": 0.002348457488423955, + "loss": 0.0, + "num_input_tokens_seen": 64684032, + "step": 37745 + }, + { + "epoch": 183.25181598062954, + "grad_norm": 8.183052457866324e-09, + "learning_rate": 0.0023380862930778624, + "loss": 0.0, + "num_input_tokens_seen": 64692832, + "step": 37750 + }, + { + "epoch": 183.27602905569006, + "grad_norm": 9.327049355078998e-09, + "learning_rate": 0.0023277378690545135, + "loss": 0.0, + "num_input_tokens_seen": 64701120, + "step": 37755 + }, + { + "epoch": 183.3002421307506, + "grad_norm": 2.0668586842020886e-08, + "learning_rate": 0.0023174122179497325, + "loss": 0.0, + "num_input_tokens_seen": 64709824, + "step": 37760 + }, + { + "epoch": 183.32445520581115, + "grad_norm": 8.117501337778776e-09, + "learning_rate": 0.0023071093413558784, + "loss": 0.0, + "num_input_tokens_seen": 64718592, + "step": 37765 + }, + { + "epoch": 183.34866828087166, + "grad_norm": 7.844805693935086e-09, + "learning_rate": 0.002296829240861814, + "loss": 0.0, + "num_input_tokens_seen": 64727040, + "step": 37770 + }, + { + "epoch": 183.3728813559322, + "grad_norm": 4.137393094794106e-09, + "learning_rate": 0.002286571918052821, + "loss": 0.0, + "num_input_tokens_seen": 64736032, + "step": 37775 + }, + { + "epoch": 183.39709443099272, + "grad_norm": 9.330050509959165e-09, + "learning_rate": 0.0022763373745107174, + "loss": 0.0, + "num_input_tokens_seen": 64744320, + "step": 37780 + }, + { + "epoch": 183.42130750605327, + "grad_norm": 8.526395589569802e-09, + "learning_rate": 0.0022661256118138074, + "loss": 0.0, + "num_input_tokens_seen": 64753024, + "step": 37785 + }, + { + "epoch": 183.4455205811138, + "grad_norm": 1.040348873715402e-08, + "learning_rate": 0.0022559366315368645, + "loss": 0.0, + "num_input_tokens_seen": 64761344, + "step": 37790 + }, + { + "epoch": 183.46973365617433, + "grad_norm": 1.687186390597617e-08, + "learning_rate": 0.002245770435251182, + "loss": 0.0, + "num_input_tokens_seen": 64769696, + "step": 37795 + }, + { + "epoch": 183.49394673123487, + "grad_norm": 8.40141289870644e-09, + "learning_rate": 0.002235627024524456, + "loss": 0.0, + "num_input_tokens_seen": 64777984, + "step": 37800 + }, + { + "epoch": 183.49394673123487, + "eval_loss": 1.1967915296554565, + "eval_runtime": 4.6205, + "eval_samples_per_second": 79.428, + "eval_steps_per_second": 19.911, + "num_input_tokens_seen": 64777984, + "step": 37800 + }, + { + "epoch": 183.5181598062954, + "grad_norm": 1.2365332757724445e-08, + "learning_rate": 0.0022255064009209847, + "loss": 0.0, + "num_input_tokens_seen": 64786784, + "step": 37805 + }, + { + "epoch": 183.54237288135593, + "grad_norm": 9.108789278400309e-09, + "learning_rate": 0.0022154085660014864, + "loss": 0.0, + "num_input_tokens_seen": 64795104, + "step": 37810 + }, + { + "epoch": 183.56658595641647, + "grad_norm": 6.7700809402992945e-09, + "learning_rate": 0.0022053335213231494, + "loss": 0.0, + "num_input_tokens_seen": 64803840, + "step": 37815 + }, + { + "epoch": 183.590799031477, + "grad_norm": 4.014315546641001e-09, + "learning_rate": 0.002195281268439697, + "loss": 0.0, + "num_input_tokens_seen": 64812096, + "step": 37820 + }, + { + "epoch": 183.61501210653753, + "grad_norm": 4.786746110596596e-09, + "learning_rate": 0.002185251808901306, + "loss": 0.0, + "num_input_tokens_seen": 64820672, + "step": 37825 + }, + { + "epoch": 183.63922518159805, + "grad_norm": 8.570868459401026e-09, + "learning_rate": 0.0021752451442546227, + "loss": 0.0, + "num_input_tokens_seen": 64829344, + "step": 37830 + }, + { + "epoch": 183.6634382566586, + "grad_norm": 5.735403263429362e-09, + "learning_rate": 0.0021652612760428456, + "loss": 0.0, + "num_input_tokens_seen": 64837856, + "step": 37835 + }, + { + "epoch": 183.68765133171914, + "grad_norm": 8.404098750247613e-09, + "learning_rate": 0.0021553002058055603, + "loss": 0.0, + "num_input_tokens_seen": 64846304, + "step": 37840 + }, + { + "epoch": 183.71186440677965, + "grad_norm": 6.354944126485407e-09, + "learning_rate": 0.0021453619350789376, + "loss": 0.0, + "num_input_tokens_seen": 64854944, + "step": 37845 + }, + { + "epoch": 183.7360774818402, + "grad_norm": 2.1889487555881715e-08, + "learning_rate": 0.0021354464653955516, + "loss": 0.0, + "num_input_tokens_seen": 64863488, + "step": 37850 + }, + { + "epoch": 183.7602905569007, + "grad_norm": 1.1280647527200927e-08, + "learning_rate": 0.002125553798284513, + "loss": 0.0, + "num_input_tokens_seen": 64872000, + "step": 37855 + }, + { + "epoch": 183.78450363196126, + "grad_norm": 4.5140811089083854e-09, + "learning_rate": 0.002115683935271384, + "loss": 0.0, + "num_input_tokens_seen": 64880640, + "step": 37860 + }, + { + "epoch": 183.8087167070218, + "grad_norm": 1.0026441898958183e-08, + "learning_rate": 0.0021058368778782144, + "loss": 0.0, + "num_input_tokens_seen": 64889152, + "step": 37865 + }, + { + "epoch": 183.83292978208232, + "grad_norm": 1.321841747170538e-08, + "learning_rate": 0.002096012627623539, + "loss": 0.0, + "num_input_tokens_seen": 64897600, + "step": 37870 + }, + { + "epoch": 183.85714285714286, + "grad_norm": 3.0320128630023646e-09, + "learning_rate": 0.00208621118602243, + "loss": 0.0, + "num_input_tokens_seen": 64906080, + "step": 37875 + }, + { + "epoch": 183.88135593220338, + "grad_norm": 6.615475278692884e-09, + "learning_rate": 0.002076432554586327, + "loss": 0.0, + "num_input_tokens_seen": 64914784, + "step": 37880 + }, + { + "epoch": 183.90556900726392, + "grad_norm": 1.8175859040070463e-08, + "learning_rate": 0.002066676734823258, + "loss": 0.0, + "num_input_tokens_seen": 64923136, + "step": 37885 + }, + { + "epoch": 183.92978208232446, + "grad_norm": 1.1562680590770924e-08, + "learning_rate": 0.0020569437282376866, + "loss": 0.0, + "num_input_tokens_seen": 64931488, + "step": 37890 + }, + { + "epoch": 183.95399515738498, + "grad_norm": 1.6422241344571376e-08, + "learning_rate": 0.002047233536330545, + "loss": 0.0, + "num_input_tokens_seen": 64940224, + "step": 37895 + }, + { + "epoch": 183.97820823244552, + "grad_norm": 1.0324246346726795e-08, + "learning_rate": 0.0020375461605993015, + "loss": 0.0, + "num_input_tokens_seen": 64949248, + "step": 37900 + }, + { + "epoch": 184.0048426150121, + "grad_norm": 3.811076254578438e-08, + "learning_rate": 0.002027881602537845, + "loss": 0.0, + "num_input_tokens_seen": 64958080, + "step": 37905 + }, + { + "epoch": 184.02905569007265, + "grad_norm": 1.8452048777817254e-08, + "learning_rate": 0.002018239863636567, + "loss": 0.0, + "num_input_tokens_seen": 64966432, + "step": 37910 + }, + { + "epoch": 184.05326876513317, + "grad_norm": 1.1096910945695981e-08, + "learning_rate": 0.002008620945382378, + "loss": 0.0, + "num_input_tokens_seen": 64974880, + "step": 37915 + }, + { + "epoch": 184.0774818401937, + "grad_norm": 1.1788808151891317e-08, + "learning_rate": 0.001999024849258607, + "loss": 0.0, + "num_input_tokens_seen": 64983328, + "step": 37920 + }, + { + "epoch": 184.10169491525423, + "grad_norm": 8.135861762070817e-09, + "learning_rate": 0.001989451576745105, + "loss": 0.0, + "num_input_tokens_seen": 64991680, + "step": 37925 + }, + { + "epoch": 184.12590799031477, + "grad_norm": 3.549878391240213e-09, + "learning_rate": 0.00197990112931819, + "loss": 0.0, + "num_input_tokens_seen": 64999936, + "step": 37930 + }, + { + "epoch": 184.15012106537532, + "grad_norm": 1.1373709085660266e-08, + "learning_rate": 0.0019703735084506345, + "loss": 0.0, + "num_input_tokens_seen": 65008544, + "step": 37935 + }, + { + "epoch": 184.17433414043583, + "grad_norm": 9.860613658929651e-09, + "learning_rate": 0.001960868715611763, + "loss": 0.0, + "num_input_tokens_seen": 65016896, + "step": 37940 + }, + { + "epoch": 184.19854721549638, + "grad_norm": 6.897693083374179e-09, + "learning_rate": 0.0019513867522673034, + "loss": 0.0, + "num_input_tokens_seen": 65025216, + "step": 37945 + }, + { + "epoch": 184.2227602905569, + "grad_norm": 7.626276499195228e-09, + "learning_rate": 0.001941927619879502, + "loss": 0.0, + "num_input_tokens_seen": 65033632, + "step": 37950 + }, + { + "epoch": 184.24697336561744, + "grad_norm": 1.1252661025196176e-08, + "learning_rate": 0.0019324913199070758, + "loss": 0.0, + "num_input_tokens_seen": 65042112, + "step": 37955 + }, + { + "epoch": 184.27118644067798, + "grad_norm": 4.997821712038331e-09, + "learning_rate": 0.0019230778538052106, + "loss": 0.0, + "num_input_tokens_seen": 65051008, + "step": 37960 + }, + { + "epoch": 184.2953995157385, + "grad_norm": 9.56964729681431e-09, + "learning_rate": 0.0019136872230255952, + "loss": 0.0, + "num_input_tokens_seen": 65059680, + "step": 37965 + }, + { + "epoch": 184.31961259079904, + "grad_norm": 7.85347253895452e-09, + "learning_rate": 0.0019043194290164045, + "loss": 0.0, + "num_input_tokens_seen": 65068512, + "step": 37970 + }, + { + "epoch": 184.34382566585955, + "grad_norm": 5.815513848261844e-09, + "learning_rate": 0.0018949744732222162, + "loss": 0.0, + "num_input_tokens_seen": 65077184, + "step": 37975 + }, + { + "epoch": 184.3680387409201, + "grad_norm": 7.0958647846453005e-09, + "learning_rate": 0.0018856523570841776, + "loss": 0.0, + "num_input_tokens_seen": 65085856, + "step": 37980 + }, + { + "epoch": 184.39225181598064, + "grad_norm": 1.2020873185747405e-08, + "learning_rate": 0.0018763530820398555, + "loss": 0.0, + "num_input_tokens_seen": 65094752, + "step": 37985 + }, + { + "epoch": 184.41646489104116, + "grad_norm": 1.1462400806294681e-08, + "learning_rate": 0.0018670766495233525, + "loss": 0.0, + "num_input_tokens_seen": 65103072, + "step": 37990 + }, + { + "epoch": 184.4406779661017, + "grad_norm": 2.257750786327506e-08, + "learning_rate": 0.001857823060965158, + "loss": 0.0, + "num_input_tokens_seen": 65111840, + "step": 37995 + }, + { + "epoch": 184.46489104116222, + "grad_norm": 1.0783442583317537e-08, + "learning_rate": 0.0018485923177923467, + "loss": 0.0, + "num_input_tokens_seen": 65120224, + "step": 38000 + }, + { + "epoch": 184.46489104116222, + "eval_loss": 1.1957893371582031, + "eval_runtime": 4.6302, + "eval_samples_per_second": 79.262, + "eval_steps_per_second": 19.87, + "num_input_tokens_seen": 65120224, + "step": 38000 + }, + { + "epoch": 184.48910411622276, + "grad_norm": 8.151558539282178e-09, + "learning_rate": 0.001839384421428364, + "loss": 0.0, + "num_input_tokens_seen": 65128832, + "step": 38005 + }, + { + "epoch": 184.5133171912833, + "grad_norm": 6.774468097603403e-09, + "learning_rate": 0.0018301993732932065, + "loss": 0.0, + "num_input_tokens_seen": 65137248, + "step": 38010 + }, + { + "epoch": 184.53753026634382, + "grad_norm": 8.541316098842344e-09, + "learning_rate": 0.0018210371748033248, + "loss": 0.0, + "num_input_tokens_seen": 65145920, + "step": 38015 + }, + { + "epoch": 184.56174334140437, + "grad_norm": 5.566738625617518e-09, + "learning_rate": 0.0018118978273716556, + "loss": 0.0, + "num_input_tokens_seen": 65154496, + "step": 38020 + }, + { + "epoch": 184.58595641646488, + "grad_norm": 7.026936366116843e-09, + "learning_rate": 0.001802781332407588, + "loss": 0.0, + "num_input_tokens_seen": 65163136, + "step": 38025 + }, + { + "epoch": 184.61016949152543, + "grad_norm": 1.3143010235694419e-08, + "learning_rate": 0.0017936876913169806, + "loss": 0.0, + "num_input_tokens_seen": 65171584, + "step": 38030 + }, + { + "epoch": 184.63438256658597, + "grad_norm": 1.2293774886984465e-08, + "learning_rate": 0.0017846169055022287, + "loss": 0.0, + "num_input_tokens_seen": 65179840, + "step": 38035 + }, + { + "epoch": 184.65859564164649, + "grad_norm": 1.0152689355891198e-08, + "learning_rate": 0.0017755689763621295, + "loss": 0.0, + "num_input_tokens_seen": 65188544, + "step": 38040 + }, + { + "epoch": 184.68280871670703, + "grad_norm": 9.178681814603351e-09, + "learning_rate": 0.0017665439052920173, + "loss": 0.0, + "num_input_tokens_seen": 65197280, + "step": 38045 + }, + { + "epoch": 184.70702179176754, + "grad_norm": 4.85432183339185e-09, + "learning_rate": 0.0017575416936836286, + "loss": 0.0, + "num_input_tokens_seen": 65205728, + "step": 38050 + }, + { + "epoch": 184.7312348668281, + "grad_norm": 7.007295188543594e-09, + "learning_rate": 0.0017485623429252528, + "loss": 0.0, + "num_input_tokens_seen": 65214432, + "step": 38055 + }, + { + "epoch": 184.75544794188863, + "grad_norm": 5.782366141460216e-09, + "learning_rate": 0.0017396058544016156, + "loss": 0.0, + "num_input_tokens_seen": 65222816, + "step": 38060 + }, + { + "epoch": 184.77966101694915, + "grad_norm": 6.656198259236135e-09, + "learning_rate": 0.0017306722294938958, + "loss": 0.0, + "num_input_tokens_seen": 65231616, + "step": 38065 + }, + { + "epoch": 184.8038740920097, + "grad_norm": 6.944282482379549e-09, + "learning_rate": 0.0017217614695798078, + "loss": 0.0, + "num_input_tokens_seen": 65240224, + "step": 38070 + }, + { + "epoch": 184.8280871670702, + "grad_norm": 7.683740754771407e-09, + "learning_rate": 0.001712873576033469, + "loss": 0.0, + "num_input_tokens_seen": 65248672, + "step": 38075 + }, + { + "epoch": 184.85230024213075, + "grad_norm": 6.248942696629456e-09, + "learning_rate": 0.0017040085502255163, + "loss": 0.0, + "num_input_tokens_seen": 65257088, + "step": 38080 + }, + { + "epoch": 184.8765133171913, + "grad_norm": 6.927963536185189e-09, + "learning_rate": 0.0016951663935230565, + "loss": 0.0, + "num_input_tokens_seen": 65265568, + "step": 38085 + }, + { + "epoch": 184.9007263922518, + "grad_norm": 1.3104456186852076e-08, + "learning_rate": 0.0016863471072896485, + "loss": 0.0, + "num_input_tokens_seen": 65274528, + "step": 38090 + }, + { + "epoch": 184.92493946731236, + "grad_norm": 6.370943772537885e-09, + "learning_rate": 0.0016775506928853377, + "loss": 0.0, + "num_input_tokens_seen": 65283072, + "step": 38095 + }, + { + "epoch": 184.94915254237287, + "grad_norm": 1.0006857564803795e-08, + "learning_rate": 0.001668777151666656, + "loss": 0.0, + "num_input_tokens_seen": 65291680, + "step": 38100 + }, + { + "epoch": 184.97336561743342, + "grad_norm": 5.2631961011684325e-09, + "learning_rate": 0.0016600264849865709, + "loss": 0.0, + "num_input_tokens_seen": 65300096, + "step": 38105 + }, + { + "epoch": 184.99757869249396, + "grad_norm": 1.6444452910491236e-08, + "learning_rate": 0.0016512986941945695, + "loss": 0.0, + "num_input_tokens_seen": 65308672, + "step": 38110 + }, + { + "epoch": 185.02421307506054, + "grad_norm": 6.2007106116368504e-09, + "learning_rate": 0.0016425937806365753, + "loss": 0.0, + "num_input_tokens_seen": 65317184, + "step": 38115 + }, + { + "epoch": 185.04842615012106, + "grad_norm": 8.968863873803912e-09, + "learning_rate": 0.0016339117456549979, + "loss": 0.0, + "num_input_tokens_seen": 65325728, + "step": 38120 + }, + { + "epoch": 185.0726392251816, + "grad_norm": 1.1803027000212296e-08, + "learning_rate": 0.0016252525905886995, + "loss": 0.0, + "num_input_tokens_seen": 65333888, + "step": 38125 + }, + { + "epoch": 185.09685230024212, + "grad_norm": 4.627692451464327e-09, + "learning_rate": 0.0016166163167730617, + "loss": 0.0, + "num_input_tokens_seen": 65342304, + "step": 38130 + }, + { + "epoch": 185.12106537530266, + "grad_norm": 7.51532081011419e-09, + "learning_rate": 0.0016080029255398864, + "loss": 0.0, + "num_input_tokens_seen": 65351168, + "step": 38135 + }, + { + "epoch": 185.1452784503632, + "grad_norm": 6.204720293112587e-09, + "learning_rate": 0.0015994124182174606, + "loss": 0.0, + "num_input_tokens_seen": 65359552, + "step": 38140 + }, + { + "epoch": 185.16949152542372, + "grad_norm": 4.086566640637557e-09, + "learning_rate": 0.001590844796130575, + "loss": 0.0, + "num_input_tokens_seen": 65368000, + "step": 38145 + }, + { + "epoch": 185.19370460048427, + "grad_norm": 9.245814780456385e-09, + "learning_rate": 0.001582300060600439, + "loss": 0.0, + "num_input_tokens_seen": 65376800, + "step": 38150 + }, + { + "epoch": 185.21791767554478, + "grad_norm": 8.63800231343248e-09, + "learning_rate": 0.0015737782129447652, + "loss": 0.0, + "num_input_tokens_seen": 65385024, + "step": 38155 + }, + { + "epoch": 185.24213075060533, + "grad_norm": 7.98900767762234e-09, + "learning_rate": 0.0015652792544777361, + "loss": 0.0, + "num_input_tokens_seen": 65393504, + "step": 38160 + }, + { + "epoch": 185.26634382566587, + "grad_norm": 9.377989940162479e-09, + "learning_rate": 0.0015568031865099863, + "loss": 0.0, + "num_input_tokens_seen": 65401952, + "step": 38165 + }, + { + "epoch": 185.2905569007264, + "grad_norm": 1.1534075028407642e-08, + "learning_rate": 0.0015483500103486369, + "loss": 0.0, + "num_input_tokens_seen": 65410592, + "step": 38170 + }, + { + "epoch": 185.31476997578693, + "grad_norm": 8.213134172763148e-09, + "learning_rate": 0.0015399197272972787, + "loss": 0.0, + "num_input_tokens_seen": 65419008, + "step": 38175 + }, + { + "epoch": 185.33898305084745, + "grad_norm": 8.026908027147783e-09, + "learning_rate": 0.0015315123386559714, + "loss": 0.0, + "num_input_tokens_seen": 65427360, + "step": 38180 + }, + { + "epoch": 185.363196125908, + "grad_norm": 7.175353200494783e-09, + "learning_rate": 0.0015231278457212283, + "loss": 0.0, + "num_input_tokens_seen": 65436000, + "step": 38185 + }, + { + "epoch": 185.38740920096853, + "grad_norm": 8.874900814248576e-09, + "learning_rate": 0.001514766249786048, + "loss": 0.0, + "num_input_tokens_seen": 65444896, + "step": 38190 + }, + { + "epoch": 185.41162227602905, + "grad_norm": 5.400796254662055e-09, + "learning_rate": 0.0015064275521398994, + "loss": 0.0, + "num_input_tokens_seen": 65453728, + "step": 38195 + }, + { + "epoch": 185.4358353510896, + "grad_norm": 7.51185691427736e-09, + "learning_rate": 0.0014981117540686872, + "loss": 0.0, + "num_input_tokens_seen": 65462240, + "step": 38200 + }, + { + "epoch": 185.4358353510896, + "eval_loss": 1.1939797401428223, + "eval_runtime": 4.6247, + "eval_samples_per_second": 79.356, + "eval_steps_per_second": 19.893, + "num_input_tokens_seen": 65462240, + "step": 38200 + }, + { + "epoch": 185.4600484261501, + "grad_norm": 3.0464284428433075e-09, + "learning_rate": 0.0014898188568548687, + "loss": 0.0, + "num_input_tokens_seen": 65470976, + "step": 38205 + }, + { + "epoch": 185.48426150121065, + "grad_norm": 8.904355475181092e-09, + "learning_rate": 0.0014815488617772542, + "loss": 0.0, + "num_input_tokens_seen": 65479552, + "step": 38210 + }, + { + "epoch": 185.5084745762712, + "grad_norm": 8.746334323461724e-09, + "learning_rate": 0.0014733017701112072, + "loss": 0.0, + "num_input_tokens_seen": 65488288, + "step": 38215 + }, + { + "epoch": 185.5326876513317, + "grad_norm": 2.9176425719867893e-09, + "learning_rate": 0.0014650775831285435, + "loss": 0.0, + "num_input_tokens_seen": 65496960, + "step": 38220 + }, + { + "epoch": 185.55690072639226, + "grad_norm": 1.3677566634839877e-08, + "learning_rate": 0.001456876302097515, + "loss": 0.0, + "num_input_tokens_seen": 65505568, + "step": 38225 + }, + { + "epoch": 185.58111380145277, + "grad_norm": 4.001392994723574e-09, + "learning_rate": 0.0014486979282828604, + "loss": 0.0, + "num_input_tokens_seen": 65514272, + "step": 38230 + }, + { + "epoch": 185.60532687651332, + "grad_norm": 1.2275454430721311e-08, + "learning_rate": 0.001440542462945804, + "loss": 0.0, + "num_input_tokens_seen": 65522784, + "step": 38235 + }, + { + "epoch": 185.62953995157386, + "grad_norm": 7.2691910268929405e-09, + "learning_rate": 0.0014324099073440232, + "loss": 0.0, + "num_input_tokens_seen": 65531456, + "step": 38240 + }, + { + "epoch": 185.65375302663438, + "grad_norm": 3.0396549721700694e-09, + "learning_rate": 0.0014243002627316482, + "loss": 0.0, + "num_input_tokens_seen": 65540160, + "step": 38245 + }, + { + "epoch": 185.67796610169492, + "grad_norm": 4.728506031170809e-09, + "learning_rate": 0.0014162135303592781, + "loss": 0.0, + "num_input_tokens_seen": 65548416, + "step": 38250 + }, + { + "epoch": 185.70217917675544, + "grad_norm": 9.749263618630266e-09, + "learning_rate": 0.001408149711474016, + "loss": 0.0, + "num_input_tokens_seen": 65557504, + "step": 38255 + }, + { + "epoch": 185.72639225181598, + "grad_norm": 1.0062135125110672e-08, + "learning_rate": 0.0014001088073193834, + "loss": 0.0, + "num_input_tokens_seen": 65565824, + "step": 38260 + }, + { + "epoch": 185.75060532687652, + "grad_norm": 6.591180046200407e-09, + "learning_rate": 0.0013920908191354052, + "loss": 0.0, + "num_input_tokens_seen": 65574208, + "step": 38265 + }, + { + "epoch": 185.77481840193704, + "grad_norm": 2.784973807123947e-09, + "learning_rate": 0.001384095748158526, + "loss": 0.0, + "num_input_tokens_seen": 65582528, + "step": 38270 + }, + { + "epoch": 185.79903147699758, + "grad_norm": 4.68072869352909e-09, + "learning_rate": 0.0013761235956217255, + "loss": 0.0, + "num_input_tokens_seen": 65590848, + "step": 38275 + }, + { + "epoch": 185.8232445520581, + "grad_norm": 5.762530452813053e-09, + "learning_rate": 0.0013681743627543873, + "loss": 0.0, + "num_input_tokens_seen": 65599328, + "step": 38280 + }, + { + "epoch": 185.84745762711864, + "grad_norm": 9.337747464144286e-09, + "learning_rate": 0.001360248050782381, + "loss": 0.0, + "num_input_tokens_seen": 65607904, + "step": 38285 + }, + { + "epoch": 185.8716707021792, + "grad_norm": 7.80389086685318e-09, + "learning_rate": 0.001352344660928062, + "loss": 0.0, + "num_input_tokens_seen": 65616864, + "step": 38290 + }, + { + "epoch": 185.8958837772397, + "grad_norm": 7.802189117001035e-09, + "learning_rate": 0.0013444641944102052, + "loss": 0.0, + "num_input_tokens_seen": 65625280, + "step": 38295 + }, + { + "epoch": 185.92009685230025, + "grad_norm": 1.0445734055508638e-08, + "learning_rate": 0.0013366066524441056, + "loss": 0.0, + "num_input_tokens_seen": 65633568, + "step": 38300 + }, + { + "epoch": 185.94430992736076, + "grad_norm": 9.492987729231572e-09, + "learning_rate": 0.0013287720362414768, + "loss": 0.0, + "num_input_tokens_seen": 65641888, + "step": 38305 + }, + { + "epoch": 185.9685230024213, + "grad_norm": 6.862116652683881e-09, + "learning_rate": 0.0013209603470105025, + "loss": 0.0, + "num_input_tokens_seen": 65650816, + "step": 38310 + }, + { + "epoch": 185.99273607748185, + "grad_norm": 9.295465730474461e-09, + "learning_rate": 0.0013131715859558857, + "loss": 0.0, + "num_input_tokens_seen": 65659360, + "step": 38315 + }, + { + "epoch": 186.01937046004844, + "grad_norm": 7.767839704797552e-09, + "learning_rate": 0.001305405754278699, + "loss": 0.0, + "num_input_tokens_seen": 65667936, + "step": 38320 + }, + { + "epoch": 186.04358353510895, + "grad_norm": 6.294545329410539e-09, + "learning_rate": 0.0012976628531765843, + "loss": 0.0, + "num_input_tokens_seen": 65676640, + "step": 38325 + }, + { + "epoch": 186.0677966101695, + "grad_norm": 1.474420407987509e-08, + "learning_rate": 0.0012899428838435533, + "loss": 0.0, + "num_input_tokens_seen": 65685440, + "step": 38330 + }, + { + "epoch": 186.09200968523, + "grad_norm": 8.927798944569076e-09, + "learning_rate": 0.001282245847470137, + "loss": 0.0, + "num_input_tokens_seen": 65693664, + "step": 38335 + }, + { + "epoch": 186.11622276029055, + "grad_norm": 3.922922431343068e-09, + "learning_rate": 0.001274571745243319, + "loss": 0.0, + "num_input_tokens_seen": 65702304, + "step": 38340 + }, + { + "epoch": 186.1404358353511, + "grad_norm": 8.571089615827532e-09, + "learning_rate": 0.0012669205783465364, + "loss": 0.0, + "num_input_tokens_seen": 65711072, + "step": 38345 + }, + { + "epoch": 186.16464891041161, + "grad_norm": 7.730948325956888e-09, + "learning_rate": 0.001259292347959695, + "loss": 0.0, + "num_input_tokens_seen": 65719360, + "step": 38350 + }, + { + "epoch": 186.18886198547216, + "grad_norm": 9.918762700067418e-09, + "learning_rate": 0.0012516870552591707, + "loss": 0.0, + "num_input_tokens_seen": 65727904, + "step": 38355 + }, + { + "epoch": 186.21307506053267, + "grad_norm": 3.172648810334522e-09, + "learning_rate": 0.001244104701417792, + "loss": 0.0, + "num_input_tokens_seen": 65736416, + "step": 38360 + }, + { + "epoch": 186.23728813559322, + "grad_norm": 1.2984273212168773e-08, + "learning_rate": 0.0012365452876048565, + "loss": 0.0, + "num_input_tokens_seen": 65745312, + "step": 38365 + }, + { + "epoch": 186.26150121065376, + "grad_norm": 6.069529767671611e-09, + "learning_rate": 0.001229008814986099, + "loss": 0.0, + "num_input_tokens_seen": 65754112, + "step": 38370 + }, + { + "epoch": 186.28571428571428, + "grad_norm": 8.589940314607247e-09, + "learning_rate": 0.0012214952847237725, + "loss": 0.0, + "num_input_tokens_seen": 65763008, + "step": 38375 + }, + { + "epoch": 186.30992736077482, + "grad_norm": 1.2390419357188875e-08, + "learning_rate": 0.0012140046979765339, + "loss": 0.0, + "num_input_tokens_seen": 65771424, + "step": 38380 + }, + { + "epoch": 186.33414043583534, + "grad_norm": 6.495334492484517e-09, + "learning_rate": 0.0012065370558995258, + "loss": 0.0, + "num_input_tokens_seen": 65779872, + "step": 38385 + }, + { + "epoch": 186.35835351089588, + "grad_norm": 1.5452105373015e-08, + "learning_rate": 0.0011990923596443602, + "loss": 0.0, + "num_input_tokens_seen": 65788288, + "step": 38390 + }, + { + "epoch": 186.38256658595643, + "grad_norm": 1.1929971677204776e-08, + "learning_rate": 0.001191670610359119, + "loss": 0.0, + "num_input_tokens_seen": 65796768, + "step": 38395 + }, + { + "epoch": 186.40677966101694, + "grad_norm": 3.477375720706277e-09, + "learning_rate": 0.0011842718091882865, + "loss": 0.0, + "num_input_tokens_seen": 65805504, + "step": 38400 + }, + { + "epoch": 186.40677966101694, + "eval_loss": 1.1946483850479126, + "eval_runtime": 4.6281, + "eval_samples_per_second": 79.298, + "eval_steps_per_second": 19.879, + "num_input_tokens_seen": 65805504, + "step": 38400 + }, + { + "epoch": 186.43099273607749, + "grad_norm": 6.941963448525712e-09, + "learning_rate": 0.0011768959572729, + "loss": 0.0, + "num_input_tokens_seen": 65814368, + "step": 38405 + }, + { + "epoch": 186.455205811138, + "grad_norm": 1.0184156629122754e-08, + "learning_rate": 0.001169543055750366, + "loss": 0.0, + "num_input_tokens_seen": 65822976, + "step": 38410 + }, + { + "epoch": 186.47941888619854, + "grad_norm": 2.374052732179166e-09, + "learning_rate": 0.0011622131057546115, + "loss": 0.0, + "num_input_tokens_seen": 65831840, + "step": 38415 + }, + { + "epoch": 186.5036319612591, + "grad_norm": 6.477372860302921e-09, + "learning_rate": 0.0011549061084160316, + "loss": 0.0, + "num_input_tokens_seen": 65840352, + "step": 38420 + }, + { + "epoch": 186.5278450363196, + "grad_norm": 9.65793844898144e-09, + "learning_rate": 0.0011476220648614088, + "loss": 0.0, + "num_input_tokens_seen": 65848864, + "step": 38425 + }, + { + "epoch": 186.55205811138015, + "grad_norm": 9.650840127051197e-09, + "learning_rate": 0.0011403609762140777, + "loss": 0.0, + "num_input_tokens_seen": 65857408, + "step": 38430 + }, + { + "epoch": 186.57627118644066, + "grad_norm": 9.421591506963978e-09, + "learning_rate": 0.0011331228435937756, + "loss": 0.0, + "num_input_tokens_seen": 65865920, + "step": 38435 + }, + { + "epoch": 186.6004842615012, + "grad_norm": 2.1000639449653136e-08, + "learning_rate": 0.0011259076681166935, + "loss": 0.0, + "num_input_tokens_seen": 65874624, + "step": 38440 + }, + { + "epoch": 186.62469733656175, + "grad_norm": 2.0953368817799856e-08, + "learning_rate": 0.0011187154508955244, + "loss": 0.0, + "num_input_tokens_seen": 65883136, + "step": 38445 + }, + { + "epoch": 186.64891041162227, + "grad_norm": 4.554955523872195e-09, + "learning_rate": 0.001111546193039381, + "loss": 0.0, + "num_input_tokens_seen": 65891360, + "step": 38450 + }, + { + "epoch": 186.6731234866828, + "grad_norm": 1.0939189110104053e-08, + "learning_rate": 0.0011043998956538792, + "loss": 0.0, + "num_input_tokens_seen": 65899648, + "step": 38455 + }, + { + "epoch": 186.69733656174333, + "grad_norm": 8.444676069530033e-09, + "learning_rate": 0.0010972765598410538, + "loss": 0.0, + "num_input_tokens_seen": 65908384, + "step": 38460 + }, + { + "epoch": 186.72154963680387, + "grad_norm": 6.001040997460905e-09, + "learning_rate": 0.0010901761866993931, + "loss": 0.0, + "num_input_tokens_seen": 65916864, + "step": 38465 + }, + { + "epoch": 186.74576271186442, + "grad_norm": 9.347463247877386e-09, + "learning_rate": 0.0010830987773238876, + "loss": 0.0, + "num_input_tokens_seen": 65925312, + "step": 38470 + }, + { + "epoch": 186.76997578692493, + "grad_norm": 1.936495230836499e-08, + "learning_rate": 0.0010760443328059644, + "loss": 0.0, + "num_input_tokens_seen": 65933760, + "step": 38475 + }, + { + "epoch": 186.79418886198548, + "grad_norm": 8.625674396967042e-09, + "learning_rate": 0.001069012854233503, + "loss": 0.0, + "num_input_tokens_seen": 65942336, + "step": 38480 + }, + { + "epoch": 186.818401937046, + "grad_norm": 8.927340644504511e-09, + "learning_rate": 0.0010620043426908365, + "loss": 0.0, + "num_input_tokens_seen": 65950656, + "step": 38485 + }, + { + "epoch": 186.84261501210653, + "grad_norm": 4.980395651443814e-09, + "learning_rate": 0.0010550187992587833, + "loss": 0.0, + "num_input_tokens_seen": 65959040, + "step": 38490 + }, + { + "epoch": 186.86682808716708, + "grad_norm": 1.094769697118636e-08, + "learning_rate": 0.0010480562250145653, + "loss": 0.0, + "num_input_tokens_seen": 65967872, + "step": 38495 + }, + { + "epoch": 186.8910411622276, + "grad_norm": 7.2248496074678314e-09, + "learning_rate": 0.0010411166210319567, + "loss": 0.0, + "num_input_tokens_seen": 65976768, + "step": 38500 + }, + { + "epoch": 186.91525423728814, + "grad_norm": 8.205413237760695e-09, + "learning_rate": 0.0010341999883810848, + "loss": 0.0, + "num_input_tokens_seen": 65985248, + "step": 38505 + }, + { + "epoch": 186.93946731234865, + "grad_norm": 1.636660229564768e-08, + "learning_rate": 0.0010273063281285965, + "loss": 0.0, + "num_input_tokens_seen": 65993760, + "step": 38510 + }, + { + "epoch": 186.9636803874092, + "grad_norm": 1.2053983589055406e-08, + "learning_rate": 0.0010204356413375747, + "loss": 0.0, + "num_input_tokens_seen": 66002560, + "step": 38515 + }, + { + "epoch": 186.98789346246974, + "grad_norm": 2.9221389752365212e-09, + "learning_rate": 0.001013587929067572, + "loss": 0.0, + "num_input_tokens_seen": 66011040, + "step": 38520 + }, + { + "epoch": 187.01452784503633, + "grad_norm": 3.4298670570365175e-09, + "learning_rate": 0.00100676319237461, + "loss": 0.0, + "num_input_tokens_seen": 66019968, + "step": 38525 + }, + { + "epoch": 187.03874092009684, + "grad_norm": 9.720060312190526e-09, + "learning_rate": 0.0009999614323110972, + "loss": 0.0, + "num_input_tokens_seen": 66028256, + "step": 38530 + }, + { + "epoch": 187.0629539951574, + "grad_norm": 6.865874091488422e-09, + "learning_rate": 0.000993182649926011, + "loss": 0.0, + "num_input_tokens_seen": 66036544, + "step": 38535 + }, + { + "epoch": 187.08716707021793, + "grad_norm": 8.597238476681923e-09, + "learning_rate": 0.000986426846264682, + "loss": 0.0, + "num_input_tokens_seen": 66045248, + "step": 38540 + }, + { + "epoch": 187.11138014527845, + "grad_norm": 7.645397204214532e-09, + "learning_rate": 0.00097969402236896, + "loss": 0.0, + "num_input_tokens_seen": 66053728, + "step": 38545 + }, + { + "epoch": 187.135593220339, + "grad_norm": 5.629987587241203e-09, + "learning_rate": 0.0009729841792771143, + "loss": 0.0, + "num_input_tokens_seen": 66062016, + "step": 38550 + }, + { + "epoch": 187.1598062953995, + "grad_norm": 1.182357145523838e-08, + "learning_rate": 0.0009662973180239176, + "loss": 0.0, + "num_input_tokens_seen": 66070528, + "step": 38555 + }, + { + "epoch": 187.18401937046005, + "grad_norm": 6.113748618474801e-09, + "learning_rate": 0.0009596334396405448, + "loss": 0.0, + "num_input_tokens_seen": 66079104, + "step": 38560 + }, + { + "epoch": 187.2082324455206, + "grad_norm": 5.783867607078719e-09, + "learning_rate": 0.0009529925451546406, + "loss": 0.0, + "num_input_tokens_seen": 66088160, + "step": 38565 + }, + { + "epoch": 187.2324455205811, + "grad_norm": 1.1010090616991874e-08, + "learning_rate": 0.0009463746355903357, + "loss": 0.0, + "num_input_tokens_seen": 66096576, + "step": 38570 + }, + { + "epoch": 187.25665859564165, + "grad_norm": 1.2006172944722948e-08, + "learning_rate": 0.0009397797119681971, + "loss": 0.0, + "num_input_tokens_seen": 66105376, + "step": 38575 + }, + { + "epoch": 187.28087167070217, + "grad_norm": 4.809383558068703e-09, + "learning_rate": 0.0009332077753052281, + "loss": 0.0, + "num_input_tokens_seen": 66113760, + "step": 38580 + }, + { + "epoch": 187.3050847457627, + "grad_norm": 2.4308441481935006e-08, + "learning_rate": 0.0009266588266149011, + "loss": 0.0, + "num_input_tokens_seen": 66122336, + "step": 38585 + }, + { + "epoch": 187.32929782082326, + "grad_norm": 8.697599973572778e-09, + "learning_rate": 0.0009201328669071584, + "loss": 0.0, + "num_input_tokens_seen": 66131200, + "step": 38590 + }, + { + "epoch": 187.35351089588377, + "grad_norm": 1.1918046993741882e-08, + "learning_rate": 0.0009136298971883949, + "loss": 0.0, + "num_input_tokens_seen": 66139808, + "step": 38595 + }, + { + "epoch": 187.37772397094432, + "grad_norm": 3.227975664543692e-09, + "learning_rate": 0.0009071499184614251, + "loss": 0.0, + "num_input_tokens_seen": 66148448, + "step": 38600 + }, + { + "epoch": 187.37772397094432, + "eval_loss": 1.198365330696106, + "eval_runtime": 4.6404, + "eval_samples_per_second": 79.087, + "eval_steps_per_second": 19.826, + "num_input_tokens_seen": 66148448, + "step": 38600 + }, + { + "epoch": 187.40193704600483, + "grad_norm": 1.3484846128619665e-08, + "learning_rate": 0.0009006929317255663, + "loss": 0.0, + "num_input_tokens_seen": 66157344, + "step": 38605 + }, + { + "epoch": 187.42615012106538, + "grad_norm": 8.43790370907982e-09, + "learning_rate": 0.0008942589379765387, + "loss": 0.0, + "num_input_tokens_seen": 66165920, + "step": 38610 + }, + { + "epoch": 187.45036319612592, + "grad_norm": 5.691680016184364e-09, + "learning_rate": 0.0008878479382065817, + "loss": 0.0, + "num_input_tokens_seen": 66174176, + "step": 38615 + }, + { + "epoch": 187.47457627118644, + "grad_norm": 4.54886883716199e-09, + "learning_rate": 0.0008814599334043215, + "loss": 0.0, + "num_input_tokens_seen": 66182560, + "step": 38620 + }, + { + "epoch": 187.49878934624698, + "grad_norm": 6.546869713019987e-09, + "learning_rate": 0.0008750949245548866, + "loss": 0.0, + "num_input_tokens_seen": 66190944, + "step": 38625 + }, + { + "epoch": 187.5230024213075, + "grad_norm": 5.95724847229917e-09, + "learning_rate": 0.0008687529126398252, + "loss": 0.0, + "num_input_tokens_seen": 66199360, + "step": 38630 + }, + { + "epoch": 187.54721549636804, + "grad_norm": 8.644700955073858e-09, + "learning_rate": 0.0008624338986371715, + "loss": 0.0, + "num_input_tokens_seen": 66208064, + "step": 38635 + }, + { + "epoch": 187.57142857142858, + "grad_norm": 1.7186849277095462e-08, + "learning_rate": 0.0008561378835213962, + "loss": 0.0, + "num_input_tokens_seen": 66216480, + "step": 38640 + }, + { + "epoch": 187.5956416464891, + "grad_norm": 1.2980451380428804e-08, + "learning_rate": 0.0008498648682634058, + "loss": 0.0, + "num_input_tokens_seen": 66224768, + "step": 38645 + }, + { + "epoch": 187.61985472154964, + "grad_norm": 1.1578534575562571e-08, + "learning_rate": 0.0008436148538306099, + "loss": 0.0, + "num_input_tokens_seen": 66233184, + "step": 38650 + }, + { + "epoch": 187.64406779661016, + "grad_norm": 6.771213367784412e-09, + "learning_rate": 0.0008373878411868041, + "loss": 0.0, + "num_input_tokens_seen": 66241568, + "step": 38655 + }, + { + "epoch": 187.6682808716707, + "grad_norm": 1.394942295007695e-08, + "learning_rate": 0.000831183831292287, + "loss": 0.0, + "num_input_tokens_seen": 66250016, + "step": 38660 + }, + { + "epoch": 187.69249394673125, + "grad_norm": 8.617265123689322e-09, + "learning_rate": 0.0008250028251037933, + "loss": 0.0, + "num_input_tokens_seen": 66258816, + "step": 38665 + }, + { + "epoch": 187.71670702179176, + "grad_norm": 4.698622380061579e-09, + "learning_rate": 0.0008188448235745271, + "loss": 0.0, + "num_input_tokens_seen": 66267200, + "step": 38670 + }, + { + "epoch": 187.7409200968523, + "grad_norm": 1.376937586172744e-08, + "learning_rate": 0.0008127098276541122, + "loss": 0.0, + "num_input_tokens_seen": 66275968, + "step": 38675 + }, + { + "epoch": 187.76513317191282, + "grad_norm": 1.0081869561417989e-08, + "learning_rate": 0.0008065978382886418, + "loss": 0.0, + "num_input_tokens_seen": 66284512, + "step": 38680 + }, + { + "epoch": 187.78934624697337, + "grad_norm": 6.5825291883925274e-09, + "learning_rate": 0.0008005088564206785, + "loss": 0.0, + "num_input_tokens_seen": 66292768, + "step": 38685 + }, + { + "epoch": 187.8135593220339, + "grad_norm": 8.291952013905757e-09, + "learning_rate": 0.0007944428829891881, + "loss": 0.0, + "num_input_tokens_seen": 66301120, + "step": 38690 + }, + { + "epoch": 187.83777239709443, + "grad_norm": 1.3020893696591429e-08, + "learning_rate": 0.0007883999189296386, + "loss": 0.0, + "num_input_tokens_seen": 66309472, + "step": 38695 + }, + { + "epoch": 187.86198547215497, + "grad_norm": 1.7860589451856868e-08, + "learning_rate": 0.0007823799651739515, + "loss": 0.0, + "num_input_tokens_seen": 66318464, + "step": 38700 + }, + { + "epoch": 187.88619854721549, + "grad_norm": 5.379172218766826e-09, + "learning_rate": 0.0007763830226504509, + "loss": 0.0, + "num_input_tokens_seen": 66327104, + "step": 38705 + }, + { + "epoch": 187.91041162227603, + "grad_norm": 5.342148945430836e-09, + "learning_rate": 0.0007704090922839468, + "loss": 0.0, + "num_input_tokens_seen": 66335584, + "step": 38710 + }, + { + "epoch": 187.93462469733657, + "grad_norm": 1.0283108586861545e-08, + "learning_rate": 0.0007644581749957025, + "loss": 0.0, + "num_input_tokens_seen": 66344224, + "step": 38715 + }, + { + "epoch": 187.9588377723971, + "grad_norm": 9.48582545845511e-09, + "learning_rate": 0.000758530271703417, + "loss": 0.0, + "num_input_tokens_seen": 66352640, + "step": 38720 + }, + { + "epoch": 187.98305084745763, + "grad_norm": 1.740887078938158e-08, + "learning_rate": 0.0007526253833212426, + "loss": 0.0, + "num_input_tokens_seen": 66361216, + "step": 38725 + }, + { + "epoch": 188.00968523002422, + "grad_norm": 5.977215611352449e-09, + "learning_rate": 0.0007467435107598008, + "loss": 0.0, + "num_input_tokens_seen": 66370336, + "step": 38730 + }, + { + "epoch": 188.03389830508473, + "grad_norm": 1.2032366214498325e-08, + "learning_rate": 0.0007408846549261328, + "loss": 0.0, + "num_input_tokens_seen": 66378688, + "step": 38735 + }, + { + "epoch": 188.05811138014528, + "grad_norm": 5.122345214658708e-09, + "learning_rate": 0.0007350488167237656, + "loss": 0.0, + "num_input_tokens_seen": 66387040, + "step": 38740 + }, + { + "epoch": 188.08232445520582, + "grad_norm": 4.8454653622798105e-09, + "learning_rate": 0.0007292359970526629, + "loss": 0.0, + "num_input_tokens_seen": 66395456, + "step": 38745 + }, + { + "epoch": 188.10653753026634, + "grad_norm": 2.0321788696264775e-08, + "learning_rate": 0.0007234461968092076, + "loss": 0.0, + "num_input_tokens_seen": 66403744, + "step": 38750 + }, + { + "epoch": 188.13075060532688, + "grad_norm": 7.206179208907315e-09, + "learning_rate": 0.0007176794168862854, + "loss": 0.0, + "num_input_tokens_seen": 66412416, + "step": 38755 + }, + { + "epoch": 188.1549636803874, + "grad_norm": 1.621504708282373e-08, + "learning_rate": 0.000711935658173185, + "loss": 0.0, + "num_input_tokens_seen": 66421120, + "step": 38760 + }, + { + "epoch": 188.17917675544794, + "grad_norm": 6.833908106074205e-09, + "learning_rate": 0.0007062149215556812, + "loss": 0.0, + "num_input_tokens_seen": 66429952, + "step": 38765 + }, + { + "epoch": 188.20338983050848, + "grad_norm": 8.421816133363791e-09, + "learning_rate": 0.0007005172079159849, + "loss": 0.0, + "num_input_tokens_seen": 66438880, + "step": 38770 + }, + { + "epoch": 188.227602905569, + "grad_norm": 5.82535886195501e-09, + "learning_rate": 0.0006948425181327267, + "loss": 0.0, + "num_input_tokens_seen": 66447488, + "step": 38775 + }, + { + "epoch": 188.25181598062954, + "grad_norm": 1.4436473350087908e-08, + "learning_rate": 0.000689190853081073, + "loss": 0.0, + "num_input_tokens_seen": 66456224, + "step": 38780 + }, + { + "epoch": 188.27602905569006, + "grad_norm": 1.0653089077550248e-08, + "learning_rate": 0.000683562213632527, + "loss": 0.0, + "num_input_tokens_seen": 66464832, + "step": 38785 + }, + { + "epoch": 188.3002421307506, + "grad_norm": 6.131733787384519e-09, + "learning_rate": 0.0006779566006551108, + "loss": 0.0, + "num_input_tokens_seen": 66473344, + "step": 38790 + }, + { + "epoch": 188.32445520581115, + "grad_norm": 5.584794404711602e-09, + "learning_rate": 0.0006723740150132995, + "loss": 0.0, + "num_input_tokens_seen": 66481728, + "step": 38795 + }, + { + "epoch": 188.34866828087166, + "grad_norm": 1.105290170499984e-08, + "learning_rate": 0.0006668144575679713, + "loss": 0.0, + "num_input_tokens_seen": 66490240, + "step": 38800 + }, + { + "epoch": 188.34866828087166, + "eval_loss": 1.194576621055603, + "eval_runtime": 4.6172, + "eval_samples_per_second": 79.485, + "eval_steps_per_second": 19.926, + "num_input_tokens_seen": 66490240, + "step": 38800 + }, + { + "epoch": 188.3728813559322, + "grad_norm": 1.4626593269895238e-08, + "learning_rate": 0.0006612779291765069, + "loss": 0.0, + "num_input_tokens_seen": 66498656, + "step": 38805 + }, + { + "epoch": 188.39709443099272, + "grad_norm": 1.1960907819741351e-08, + "learning_rate": 0.0006557644306926736, + "loss": 0.0, + "num_input_tokens_seen": 66507104, + "step": 38810 + }, + { + "epoch": 188.42130750605327, + "grad_norm": 7.917435596027644e-09, + "learning_rate": 0.0006502739629667575, + "loss": 0.0, + "num_input_tokens_seen": 66515776, + "step": 38815 + }, + { + "epoch": 188.4455205811138, + "grad_norm": 1.9956235774998277e-08, + "learning_rate": 0.0006448065268454317, + "loss": 0.0, + "num_input_tokens_seen": 66523840, + "step": 38820 + }, + { + "epoch": 188.46973365617433, + "grad_norm": 1.4096088740700452e-08, + "learning_rate": 0.0006393621231718549, + "loss": 0.0, + "num_input_tokens_seen": 66532064, + "step": 38825 + }, + { + "epoch": 188.49394673123487, + "grad_norm": 3.1697326985380414e-09, + "learning_rate": 0.0006339407527856389, + "loss": 0.0, + "num_input_tokens_seen": 66540704, + "step": 38830 + }, + { + "epoch": 188.5181598062954, + "grad_norm": 6.493322768363896e-09, + "learning_rate": 0.0006285424165227982, + "loss": 0.0, + "num_input_tokens_seen": 66549184, + "step": 38835 + }, + { + "epoch": 188.54237288135593, + "grad_norm": 6.409980546351335e-09, + "learning_rate": 0.0006231671152158169, + "loss": 0.0, + "num_input_tokens_seen": 66557632, + "step": 38840 + }, + { + "epoch": 188.56658595641647, + "grad_norm": 2.6051241164282146e-09, + "learning_rate": 0.0006178148496936819, + "loss": 0.0, + "num_input_tokens_seen": 66566080, + "step": 38845 + }, + { + "epoch": 188.590799031477, + "grad_norm": 8.93806806345765e-09, + "learning_rate": 0.000612485620781733, + "loss": 0.0, + "num_input_tokens_seen": 66574880, + "step": 38850 + }, + { + "epoch": 188.61501210653753, + "grad_norm": 1.2266864857224391e-08, + "learning_rate": 0.0006071794293018296, + "loss": 0.0, + "num_input_tokens_seen": 66583392, + "step": 38855 + }, + { + "epoch": 188.63922518159805, + "grad_norm": 8.77907346819029e-09, + "learning_rate": 0.0006018962760722501, + "loss": 0.0, + "num_input_tokens_seen": 66591776, + "step": 38860 + }, + { + "epoch": 188.6634382566586, + "grad_norm": 9.365224151736129e-09, + "learning_rate": 0.0005966361619077098, + "loss": 0.0, + "num_input_tokens_seen": 66600576, + "step": 38865 + }, + { + "epoch": 188.68765133171914, + "grad_norm": 1.1534146970859638e-08, + "learning_rate": 0.000591399087619393, + "loss": 0.0, + "num_input_tokens_seen": 66609088, + "step": 38870 + }, + { + "epoch": 188.71186440677965, + "grad_norm": 8.69116689727889e-09, + "learning_rate": 0.0005861850540149371, + "loss": 0.0, + "num_input_tokens_seen": 66617952, + "step": 38875 + }, + { + "epoch": 188.7360774818402, + "grad_norm": 1.2810103200422418e-08, + "learning_rate": 0.0005809940618983822, + "loss": 0.0, + "num_input_tokens_seen": 66626656, + "step": 38880 + }, + { + "epoch": 188.7602905569007, + "grad_norm": 3.826169603371454e-09, + "learning_rate": 0.0005758261120702712, + "loss": 0.0, + "num_input_tokens_seen": 66635264, + "step": 38885 + }, + { + "epoch": 188.78450363196126, + "grad_norm": 8.490863123711279e-09, + "learning_rate": 0.0005706812053275501, + "loss": 0.0, + "num_input_tokens_seen": 66643552, + "step": 38890 + }, + { + "epoch": 188.8087167070218, + "grad_norm": 8.29706614524639e-09, + "learning_rate": 0.0005655593424636173, + "loss": 0.0, + "num_input_tokens_seen": 66652256, + "step": 38895 + }, + { + "epoch": 188.83292978208232, + "grad_norm": 8.09962941161757e-09, + "learning_rate": 0.0005604605242683746, + "loss": 0.0, + "num_input_tokens_seen": 66660384, + "step": 38900 + }, + { + "epoch": 188.85714285714286, + "grad_norm": 5.257489998911069e-09, + "learning_rate": 0.0005553847515280596, + "loss": 0.0, + "num_input_tokens_seen": 66669248, + "step": 38905 + }, + { + "epoch": 188.88135593220338, + "grad_norm": 8.092450265451134e-09, + "learning_rate": 0.0005503320250254795, + "loss": 0.0, + "num_input_tokens_seen": 66678048, + "step": 38910 + }, + { + "epoch": 188.90556900726392, + "grad_norm": 1.2238236202222197e-08, + "learning_rate": 0.0005453023455397943, + "loss": 0.0, + "num_input_tokens_seen": 66686688, + "step": 38915 + }, + { + "epoch": 188.92978208232446, + "grad_norm": 8.52367421089184e-09, + "learning_rate": 0.0005402957138466502, + "loss": 0.0, + "num_input_tokens_seen": 66695424, + "step": 38920 + }, + { + "epoch": 188.95399515738498, + "grad_norm": 5.73774761036816e-09, + "learning_rate": 0.0005353121307181463, + "loss": 0.0, + "num_input_tokens_seen": 66704352, + "step": 38925 + }, + { + "epoch": 188.97820823244552, + "grad_norm": 4.985912127608572e-09, + "learning_rate": 0.0005303515969227845, + "loss": 0.0, + "num_input_tokens_seen": 66712864, + "step": 38930 + }, + { + "epoch": 189.0048426150121, + "grad_norm": 5.444530160048089e-08, + "learning_rate": 0.0005254141132255862, + "loss": 0.0, + "num_input_tokens_seen": 66721824, + "step": 38935 + }, + { + "epoch": 189.02905569007265, + "grad_norm": 6.129850849134755e-09, + "learning_rate": 0.0005204996803879258, + "loss": 0.0, + "num_input_tokens_seen": 66729952, + "step": 38940 + }, + { + "epoch": 189.05326876513317, + "grad_norm": 1.1190345539091595e-08, + "learning_rate": 0.0005156082991676969, + "loss": 0.0, + "num_input_tokens_seen": 66738432, + "step": 38945 + }, + { + "epoch": 189.0774818401937, + "grad_norm": 1.3729098746750878e-08, + "learning_rate": 0.0005107399703192127, + "loss": 0.0, + "num_input_tokens_seen": 66746912, + "step": 38950 + }, + { + "epoch": 189.10169491525423, + "grad_norm": 4.1345757928468174e-09, + "learning_rate": 0.0005058946945932063, + "loss": 0.0, + "num_input_tokens_seen": 66755072, + "step": 38955 + }, + { + "epoch": 189.12590799031477, + "grad_norm": 7.001899948733126e-09, + "learning_rate": 0.0005010724727369131, + "loss": 0.0, + "num_input_tokens_seen": 66763936, + "step": 38960 + }, + { + "epoch": 189.15012106537532, + "grad_norm": 4.691750543628359e-09, + "learning_rate": 0.000496273305493955, + "loss": 0.0, + "num_input_tokens_seen": 66772544, + "step": 38965 + }, + { + "epoch": 189.17433414043583, + "grad_norm": 8.78428707551393e-09, + "learning_rate": 0.0004914971936044399, + "loss": 0.0, + "num_input_tokens_seen": 66781120, + "step": 38970 + }, + { + "epoch": 189.19854721549638, + "grad_norm": 4.17881818037813e-09, + "learning_rate": 0.00048674413780491196, + "loss": 0.0, + "num_input_tokens_seen": 66789568, + "step": 38975 + }, + { + "epoch": 189.2227602905569, + "grad_norm": 6.383389816733143e-09, + "learning_rate": 0.0004820141388283183, + "loss": 0.0, + "num_input_tokens_seen": 66798144, + "step": 38980 + }, + { + "epoch": 189.24697336561744, + "grad_norm": 7.392922718452155e-09, + "learning_rate": 0.00047730719740410874, + "loss": 0.0, + "num_input_tokens_seen": 66806560, + "step": 38985 + }, + { + "epoch": 189.27118644067798, + "grad_norm": 3.1256892629727417e-09, + "learning_rate": 0.00047262331425816927, + "loss": 0.0, + "num_input_tokens_seen": 66814976, + "step": 38990 + }, + { + "epoch": 189.2953995157385, + "grad_norm": 3.6758891486243783e-09, + "learning_rate": 0.00046796249011277213, + "loss": 0.0, + "num_input_tokens_seen": 66823968, + "step": 38995 + }, + { + "epoch": 189.31961259079904, + "grad_norm": 1.822550466101802e-08, + "learning_rate": 0.00046332472568669236, + "loss": 0.0, + "num_input_tokens_seen": 66832256, + "step": 39000 + }, + { + "epoch": 189.31961259079904, + "eval_loss": 1.1952766180038452, + "eval_runtime": 4.6185, + "eval_samples_per_second": 79.463, + "eval_steps_per_second": 19.92, + "num_input_tokens_seen": 66832256, + "step": 39000 + }, + { + "epoch": 189.34382566585955, + "grad_norm": 5.846611195181595e-09, + "learning_rate": 0.0004587100216951578, + "loss": 0.0, + "num_input_tokens_seen": 66840864, + "step": 39005 + }, + { + "epoch": 189.3680387409201, + "grad_norm": 1.3476285865010595e-08, + "learning_rate": 0.00045411837884978265, + "loss": 0.0, + "num_input_tokens_seen": 66849440, + "step": 39010 + }, + { + "epoch": 189.39225181598064, + "grad_norm": 1.2707621621643739e-08, + "learning_rate": 0.00044954979785865045, + "loss": 0.0, + "num_input_tokens_seen": 66857824, + "step": 39015 + }, + { + "epoch": 189.41646489104116, + "grad_norm": 1.2364088419758446e-08, + "learning_rate": 0.00044500427942631426, + "loss": 0.0, + "num_input_tokens_seen": 66866240, + "step": 39020 + }, + { + "epoch": 189.4406779661017, + "grad_norm": 1.2385497960565317e-08, + "learning_rate": 0.0004404818242537467, + "loss": 0.0, + "num_input_tokens_seen": 66874848, + "step": 39025 + }, + { + "epoch": 189.46489104116222, + "grad_norm": 8.169723564321885e-09, + "learning_rate": 0.00043598243303837324, + "loss": 0.0, + "num_input_tokens_seen": 66883552, + "step": 39030 + }, + { + "epoch": 189.48910411622276, + "grad_norm": 4.010761056605361e-09, + "learning_rate": 0.00043150610647403885, + "loss": 0.0, + "num_input_tokens_seen": 66891968, + "step": 39035 + }, + { + "epoch": 189.5133171912833, + "grad_norm": 5.623205456828373e-09, + "learning_rate": 0.00042705284525104134, + "loss": 0.0, + "num_input_tokens_seen": 66900512, + "step": 39040 + }, + { + "epoch": 189.53753026634382, + "grad_norm": 5.216632903426444e-09, + "learning_rate": 0.0004226226500561647, + "loss": 0.0, + "num_input_tokens_seen": 66909376, + "step": 39045 + }, + { + "epoch": 189.56174334140437, + "grad_norm": 2.15681983384286e-09, + "learning_rate": 0.0004182155215725791, + "loss": 0.0, + "num_input_tokens_seen": 66918080, + "step": 39050 + }, + { + "epoch": 189.58595641646488, + "grad_norm": 5.418954174274404e-09, + "learning_rate": 0.00041383146047992424, + "loss": 0.0, + "num_input_tokens_seen": 66926208, + "step": 39055 + }, + { + "epoch": 189.61016949152543, + "grad_norm": 8.319688937774572e-09, + "learning_rate": 0.00040947046745427597, + "loss": 0.0, + "num_input_tokens_seen": 66934752, + "step": 39060 + }, + { + "epoch": 189.63438256658597, + "grad_norm": 7.774081822731205e-09, + "learning_rate": 0.00040513254316814625, + "loss": 0.0, + "num_input_tokens_seen": 66943328, + "step": 39065 + }, + { + "epoch": 189.65859564164649, + "grad_norm": 2.9389644051747155e-09, + "learning_rate": 0.0004008176882905168, + "loss": 0.0, + "num_input_tokens_seen": 66952256, + "step": 39070 + }, + { + "epoch": 189.68280871670703, + "grad_norm": 7.71708297264695e-09, + "learning_rate": 0.00039652590348677184, + "loss": 0.0, + "num_input_tokens_seen": 66960672, + "step": 39075 + }, + { + "epoch": 189.70702179176754, + "grad_norm": 1.5315640311541756e-08, + "learning_rate": 0.00039225718941878206, + "loss": 0.0, + "num_input_tokens_seen": 66969312, + "step": 39080 + }, + { + "epoch": 189.7312348668281, + "grad_norm": 5.942575764805724e-09, + "learning_rate": 0.00038801154674480417, + "loss": 0.0, + "num_input_tokens_seen": 66977952, + "step": 39085 + }, + { + "epoch": 189.75544794188863, + "grad_norm": 1.4204833753694857e-08, + "learning_rate": 0.00038378897611959784, + "loss": 0.0, + "num_input_tokens_seen": 66986496, + "step": 39090 + }, + { + "epoch": 189.77966101694915, + "grad_norm": 4.7103938527470746e-09, + "learning_rate": 0.00037958947819430875, + "loss": 0.0, + "num_input_tokens_seen": 66994944, + "step": 39095 + }, + { + "epoch": 189.8038740920097, + "grad_norm": 1.300770868795098e-08, + "learning_rate": 0.0003754130536165856, + "loss": 0.0, + "num_input_tokens_seen": 67003456, + "step": 39100 + }, + { + "epoch": 189.8280871670702, + "grad_norm": 1.3951168220671661e-08, + "learning_rate": 0.0003712597030304632, + "loss": 0.0, + "num_input_tokens_seen": 67012160, + "step": 39105 + }, + { + "epoch": 189.85230024213075, + "grad_norm": 7.322072281823466e-09, + "learning_rate": 0.00036712942707646247, + "loss": 0.0, + "num_input_tokens_seen": 67020672, + "step": 39110 + }, + { + "epoch": 189.8765133171913, + "grad_norm": 9.57035695137165e-09, + "learning_rate": 0.00036302222639149063, + "loss": 0.0, + "num_input_tokens_seen": 67029280, + "step": 39115 + }, + { + "epoch": 189.9007263922518, + "grad_norm": 6.748716696591828e-09, + "learning_rate": 0.000358938101608941, + "loss": 0.0, + "num_input_tokens_seen": 67037888, + "step": 39120 + }, + { + "epoch": 189.92493946731236, + "grad_norm": 9.546230472778916e-09, + "learning_rate": 0.0003548770533586598, + "loss": 0.0, + "num_input_tokens_seen": 67046592, + "step": 39125 + }, + { + "epoch": 189.94915254237287, + "grad_norm": 1.1071576544452455e-08, + "learning_rate": 0.0003508390822668961, + "loss": 0.0, + "num_input_tokens_seen": 67055232, + "step": 39130 + }, + { + "epoch": 189.97336561743342, + "grad_norm": 8.71921823630828e-09, + "learning_rate": 0.00034682418895633503, + "loss": 0.0, + "num_input_tokens_seen": 67063488, + "step": 39135 + }, + { + "epoch": 189.99757869249396, + "grad_norm": 1.4347718568785695e-08, + "learning_rate": 0.0003428323740461647, + "loss": 0.0, + "num_input_tokens_seen": 67072064, + "step": 39140 + }, + { + "epoch": 190.02421307506054, + "grad_norm": 5.6820996796602685e-09, + "learning_rate": 0.00033886363815194276, + "loss": 0.0, + "num_input_tokens_seen": 67080640, + "step": 39145 + }, + { + "epoch": 190.04842615012106, + "grad_norm": 4.605981818173177e-09, + "learning_rate": 0.0003349179818857129, + "loss": 0.0, + "num_input_tokens_seen": 67088800, + "step": 39150 + }, + { + "epoch": 190.0726392251816, + "grad_norm": 6.17843953776287e-09, + "learning_rate": 0.0003309954058559383, + "loss": 0.0, + "num_input_tokens_seen": 67097088, + "step": 39155 + }, + { + "epoch": 190.09685230024212, + "grad_norm": 1.9303243448121066e-08, + "learning_rate": 0.0003270959106675186, + "loss": 0.0, + "num_input_tokens_seen": 67105696, + "step": 39160 + }, + { + "epoch": 190.12106537530266, + "grad_norm": 1.4509273782437049e-08, + "learning_rate": 0.0003232194969218227, + "loss": 0.0, + "num_input_tokens_seen": 67114528, + "step": 39165 + }, + { + "epoch": 190.1452784503632, + "grad_norm": 1.4650616719791287e-08, + "learning_rate": 0.00031936616521663905, + "loss": 0.0, + "num_input_tokens_seen": 67122976, + "step": 39170 + }, + { + "epoch": 190.16949152542372, + "grad_norm": 1.3124859421509427e-08, + "learning_rate": 0.00031553591614619236, + "loss": 0.0, + "num_input_tokens_seen": 67131968, + "step": 39175 + }, + { + "epoch": 190.19370460048427, + "grad_norm": 1.0743437250937404e-08, + "learning_rate": 0.00031172875030117676, + "loss": 0.0, + "num_input_tokens_seen": 67140576, + "step": 39180 + }, + { + "epoch": 190.21791767554478, + "grad_norm": 5.571853645136571e-09, + "learning_rate": 0.0003079446682686726, + "loss": 0.0, + "num_input_tokens_seen": 67149152, + "step": 39185 + }, + { + "epoch": 190.24213075060533, + "grad_norm": 7.353683884048223e-09, + "learning_rate": 0.0003041836706322465, + "loss": 0.0, + "num_input_tokens_seen": 67157760, + "step": 39190 + }, + { + "epoch": 190.26634382566587, + "grad_norm": 4.884728177501074e-09, + "learning_rate": 0.0003004457579719011, + "loss": 0.0, + "num_input_tokens_seen": 67165824, + "step": 39195 + }, + { + "epoch": 190.2905569007264, + "grad_norm": 1.618066569619714e-08, + "learning_rate": 0.00029673093086405867, + "loss": 0.0, + "num_input_tokens_seen": 67174336, + "step": 39200 + }, + { + "epoch": 190.2905569007264, + "eval_loss": 1.1944750547409058, + "eval_runtime": 4.62, + "eval_samples_per_second": 79.438, + "eval_steps_per_second": 19.914, + "num_input_tokens_seen": 67174336, + "step": 39200 + }, + { + "epoch": 190.31476997578693, + "grad_norm": 1.515009095953701e-08, + "learning_rate": 0.00029303918988159426, + "loss": 0.0, + "num_input_tokens_seen": 67182560, + "step": 39205 + }, + { + "epoch": 190.33898305084745, + "grad_norm": 7.371517174448172e-09, + "learning_rate": 0.0002893705355938192, + "loss": 0.0, + "num_input_tokens_seen": 67191296, + "step": 39210 + }, + { + "epoch": 190.363196125908, + "grad_norm": 9.847291870812569e-09, + "learning_rate": 0.0002857249685664975, + "loss": 0.0, + "num_input_tokens_seen": 67199712, + "step": 39215 + }, + { + "epoch": 190.38740920096853, + "grad_norm": 9.678352341779828e-09, + "learning_rate": 0.0002821024893618129, + "loss": 0.0, + "num_input_tokens_seen": 67208192, + "step": 39220 + }, + { + "epoch": 190.41162227602905, + "grad_norm": 4.072946424571455e-09, + "learning_rate": 0.0002785030985383852, + "loss": 0.0, + "num_input_tokens_seen": 67216768, + "step": 39225 + }, + { + "epoch": 190.4358353510896, + "grad_norm": 8.560398612189601e-09, + "learning_rate": 0.00027492679665130356, + "loss": 0.0, + "num_input_tokens_seen": 67225760, + "step": 39230 + }, + { + "epoch": 190.4600484261501, + "grad_norm": 1.6582440309775848e-08, + "learning_rate": 0.000271373584252077, + "loss": 0.0, + "num_input_tokens_seen": 67234368, + "step": 39235 + }, + { + "epoch": 190.48426150121065, + "grad_norm": 8.638573412156347e-09, + "learning_rate": 0.00026784346188865046, + "loss": 0.0, + "num_input_tokens_seen": 67243072, + "step": 39240 + }, + { + "epoch": 190.5084745762712, + "grad_norm": 1.1973061653236527e-08, + "learning_rate": 0.0002643364301054218, + "loss": 0.0, + "num_input_tokens_seen": 67251648, + "step": 39245 + }, + { + "epoch": 190.5326876513317, + "grad_norm": 1.5986115542432344e-08, + "learning_rate": 0.0002608524894431918, + "loss": 0.0, + "num_input_tokens_seen": 67260128, + "step": 39250 + }, + { + "epoch": 190.55690072639226, + "grad_norm": 1.493266665875126e-08, + "learning_rate": 0.000257391640439264, + "loss": 0.0, + "num_input_tokens_seen": 67268704, + "step": 39255 + }, + { + "epoch": 190.58111380145277, + "grad_norm": 1.430232643429008e-08, + "learning_rate": 0.00025395388362732806, + "loss": 0.0, + "num_input_tokens_seen": 67277120, + "step": 39260 + }, + { + "epoch": 190.60532687651332, + "grad_norm": 6.066584568031885e-09, + "learning_rate": 0.00025053921953751, + "loss": 0.0, + "num_input_tokens_seen": 67285568, + "step": 39265 + }, + { + "epoch": 190.62953995157386, + "grad_norm": 5.761445098784179e-09, + "learning_rate": 0.00024714764869643855, + "loss": 0.0, + "num_input_tokens_seen": 67294144, + "step": 39270 + }, + { + "epoch": 190.65375302663438, + "grad_norm": 9.33382526824289e-09, + "learning_rate": 0.0002437791716270954, + "loss": 0.0, + "num_input_tokens_seen": 67302656, + "step": 39275 + }, + { + "epoch": 190.67796610169492, + "grad_norm": 1.3317158042980282e-08, + "learning_rate": 0.00024043378884896493, + "loss": 0.0, + "num_input_tokens_seen": 67311616, + "step": 39280 + }, + { + "epoch": 190.70217917675544, + "grad_norm": 8.223864256251545e-09, + "learning_rate": 0.00023711150087793453, + "loss": 0.0, + "num_input_tokens_seen": 67320384, + "step": 39285 + }, + { + "epoch": 190.72639225181598, + "grad_norm": 1.7682822317510727e-08, + "learning_rate": 0.000233812308226361, + "loss": 0.0, + "num_input_tokens_seen": 67329024, + "step": 39290 + }, + { + "epoch": 190.75060532687652, + "grad_norm": 1.3121586484032832e-08, + "learning_rate": 0.00023053621140300406, + "loss": 0.0, + "num_input_tokens_seen": 67337440, + "step": 39295 + }, + { + "epoch": 190.77481840193704, + "grad_norm": 4.243288831418113e-09, + "learning_rate": 0.00022728321091307623, + "loss": 0.0, + "num_input_tokens_seen": 67346080, + "step": 39300 + }, + { + "epoch": 190.79903147699758, + "grad_norm": 4.3873087385293275e-09, + "learning_rate": 0.0002240533072582429, + "loss": 0.0, + "num_input_tokens_seen": 67354624, + "step": 39305 + }, + { + "epoch": 190.8232445520581, + "grad_norm": 4.949847642876648e-09, + "learning_rate": 0.00022084650093658897, + "loss": 0.0, + "num_input_tokens_seen": 67362944, + "step": 39310 + }, + { + "epoch": 190.84745762711864, + "grad_norm": 5.238068201407486e-09, + "learning_rate": 0.0002176627924426522, + "loss": 0.0, + "num_input_tokens_seen": 67371584, + "step": 39315 + }, + { + "epoch": 190.8716707021792, + "grad_norm": 7.46819761587858e-09, + "learning_rate": 0.0002145021822673898, + "loss": 0.0, + "num_input_tokens_seen": 67380256, + "step": 39320 + }, + { + "epoch": 190.8958837772397, + "grad_norm": 1.3887547112290122e-08, + "learning_rate": 0.00021136467089822862, + "loss": 0.0, + "num_input_tokens_seen": 67388832, + "step": 39325 + }, + { + "epoch": 190.92009685230025, + "grad_norm": 7.137455071415388e-09, + "learning_rate": 0.00020825025881898162, + "loss": 0.0, + "num_input_tokens_seen": 67397280, + "step": 39330 + }, + { + "epoch": 190.94430992736076, + "grad_norm": 6.187137913116203e-09, + "learning_rate": 0.0002051589465099479, + "loss": 0.0, + "num_input_tokens_seen": 67405408, + "step": 39335 + }, + { + "epoch": 190.9685230024213, + "grad_norm": 4.799572295155485e-09, + "learning_rate": 0.0002020907344478462, + "loss": 0.0, + "num_input_tokens_seen": 67414112, + "step": 39340 + }, + { + "epoch": 190.99273607748185, + "grad_norm": 9.359869324043757e-09, + "learning_rate": 0.0001990456231058313, + "loss": 0.0, + "num_input_tokens_seen": 67423168, + "step": 39345 + }, + { + "epoch": 191.01937046004844, + "grad_norm": 1.0358212065852967e-08, + "learning_rate": 0.00019602361295349423, + "loss": 0.0, + "num_input_tokens_seen": 67432384, + "step": 39350 + }, + { + "epoch": 191.04358353510895, + "grad_norm": 7.517324540629033e-09, + "learning_rate": 0.0001930247044568789, + "loss": 0.0, + "num_input_tokens_seen": 67441024, + "step": 39355 + }, + { + "epoch": 191.0677966101695, + "grad_norm": 1.0582906995182384e-08, + "learning_rate": 0.00019004889807843205, + "loss": 0.0, + "num_input_tokens_seen": 67449440, + "step": 39360 + }, + { + "epoch": 191.09200968523, + "grad_norm": 1.3871010118293725e-08, + "learning_rate": 0.00018709619427708656, + "loss": 0.0, + "num_input_tokens_seen": 67458496, + "step": 39365 + }, + { + "epoch": 191.11622276029055, + "grad_norm": 7.045049432718997e-09, + "learning_rate": 0.00018416659350817822, + "loss": 0.0, + "num_input_tokens_seen": 67466880, + "step": 39370 + }, + { + "epoch": 191.1404358353511, + "grad_norm": 8.072897017541436e-09, + "learning_rate": 0.00018126009622346229, + "loss": 0.0, + "num_input_tokens_seen": 67475040, + "step": 39375 + }, + { + "epoch": 191.16464891041161, + "grad_norm": 1.5068330583289935e-08, + "learning_rate": 0.00017837670287119687, + "loss": 0.0, + "num_input_tokens_seen": 67483840, + "step": 39380 + }, + { + "epoch": 191.18886198547216, + "grad_norm": 4.5595420772315265e-09, + "learning_rate": 0.00017551641389602633, + "loss": 0.0, + "num_input_tokens_seen": 67492224, + "step": 39385 + }, + { + "epoch": 191.21307506053267, + "grad_norm": 6.914371741828518e-09, + "learning_rate": 0.00017267922973903115, + "loss": 0.0, + "num_input_tokens_seen": 67500800, + "step": 39390 + }, + { + "epoch": 191.23728813559322, + "grad_norm": 3.1583327064765854e-09, + "learning_rate": 0.00016986515083774467, + "loss": 0.0, + "num_input_tokens_seen": 67509312, + "step": 39395 + }, + { + "epoch": 191.26150121065376, + "grad_norm": 7.406929736220036e-09, + "learning_rate": 0.00016707417762611975, + "loss": 0.0, + "num_input_tokens_seen": 67517920, + "step": 39400 + }, + { + "epoch": 191.26150121065376, + "eval_loss": 1.201240062713623, + "eval_runtime": 4.6352, + "eval_samples_per_second": 79.177, + "eval_steps_per_second": 19.848, + "num_input_tokens_seen": 67517920, + "step": 39400 + }, + { + "epoch": 191.28571428571428, + "grad_norm": 1.2340436228441831e-08, + "learning_rate": 0.00016430631053459543, + "loss": 0.0, + "num_input_tokens_seen": 67526112, + "step": 39405 + }, + { + "epoch": 191.30992736077482, + "grad_norm": 1.1947215661223254e-08, + "learning_rate": 0.0001615615499899803, + "loss": 0.0, + "num_input_tokens_seen": 67534464, + "step": 39410 + }, + { + "epoch": 191.33414043583534, + "grad_norm": 7.028488457905269e-09, + "learning_rate": 0.00015883989641556905, + "loss": 0.0, + "num_input_tokens_seen": 67542720, + "step": 39415 + }, + { + "epoch": 191.35835351089588, + "grad_norm": 5.8972235983389965e-09, + "learning_rate": 0.00015614135023105934, + "loss": 0.0, + "num_input_tokens_seen": 67551360, + "step": 39420 + }, + { + "epoch": 191.38256658595643, + "grad_norm": 3.438298534774731e-09, + "learning_rate": 0.00015346591185261827, + "loss": 0.0, + "num_input_tokens_seen": 67560192, + "step": 39425 + }, + { + "epoch": 191.40677966101694, + "grad_norm": 6.428038989980678e-09, + "learning_rate": 0.00015081358169281576, + "loss": 0.0, + "num_input_tokens_seen": 67568768, + "step": 39430 + }, + { + "epoch": 191.43099273607749, + "grad_norm": 3.6541347725460582e-09, + "learning_rate": 0.00014818436016069135, + "loss": 0.0, + "num_input_tokens_seen": 67577248, + "step": 39435 + }, + { + "epoch": 191.455205811138, + "grad_norm": 4.580877011051143e-09, + "learning_rate": 0.00014557824766168735, + "loss": 0.0, + "num_input_tokens_seen": 67585376, + "step": 39440 + }, + { + "epoch": 191.47941888619854, + "grad_norm": 9.504365294787931e-09, + "learning_rate": 0.00014299524459769896, + "loss": 0.0, + "num_input_tokens_seen": 67593920, + "step": 39445 + }, + { + "epoch": 191.5036319612591, + "grad_norm": 8.019592101504713e-09, + "learning_rate": 0.0001404353513670742, + "loss": 0.0, + "num_input_tokens_seen": 67602272, + "step": 39450 + }, + { + "epoch": 191.5278450363196, + "grad_norm": 6.366318583417296e-09, + "learning_rate": 0.0001378985683645806, + "loss": 0.0, + "num_input_tokens_seen": 67611264, + "step": 39455 + }, + { + "epoch": 191.55205811138015, + "grad_norm": 4.725667412941448e-09, + "learning_rate": 0.0001353848959813886, + "loss": 0.0, + "num_input_tokens_seen": 67620032, + "step": 39460 + }, + { + "epoch": 191.57627118644066, + "grad_norm": 1.0721924681433848e-08, + "learning_rate": 0.00013289433460517142, + "loss": 0.0, + "num_input_tokens_seen": 67628928, + "step": 39465 + }, + { + "epoch": 191.6004842615012, + "grad_norm": 5.36020428043571e-09, + "learning_rate": 0.00013042688462000518, + "loss": 0.0, + "num_input_tokens_seen": 67637376, + "step": 39470 + }, + { + "epoch": 191.62469733656175, + "grad_norm": 8.139846130461592e-09, + "learning_rate": 0.0001279825464063855, + "loss": 0.0, + "num_input_tokens_seen": 67646112, + "step": 39475 + }, + { + "epoch": 191.64891041162227, + "grad_norm": 6.180640443886887e-09, + "learning_rate": 0.00012556132034126087, + "loss": 0.0, + "num_input_tokens_seen": 67654624, + "step": 39480 + }, + { + "epoch": 191.6731234866828, + "grad_norm": 6.9996080043210895e-09, + "learning_rate": 0.0001231632067980326, + "loss": 0.0, + "num_input_tokens_seen": 67663232, + "step": 39485 + }, + { + "epoch": 191.69733656174333, + "grad_norm": 9.942501044690744e-09, + "learning_rate": 0.00012078820614650486, + "loss": 0.0, + "num_input_tokens_seen": 67671680, + "step": 39490 + }, + { + "epoch": 191.72154963680387, + "grad_norm": 9.357330021941834e-09, + "learning_rate": 0.00011843631875291804, + "loss": 0.0, + "num_input_tokens_seen": 67679968, + "step": 39495 + }, + { + "epoch": 191.74576271186442, + "grad_norm": 7.395710266422384e-09, + "learning_rate": 0.00011610754497999863, + "loss": 0.0, + "num_input_tokens_seen": 67688416, + "step": 39500 + }, + { + "epoch": 191.76997578692493, + "grad_norm": 1.0303194741823063e-08, + "learning_rate": 0.0001138018851868594, + "loss": 0.0, + "num_input_tokens_seen": 67697056, + "step": 39505 + }, + { + "epoch": 191.79418886198548, + "grad_norm": 3.081352728528941e-09, + "learning_rate": 0.0001115193397290326, + "loss": 0.0, + "num_input_tokens_seen": 67705920, + "step": 39510 + }, + { + "epoch": 191.818401937046, + "grad_norm": 8.66504556995551e-09, + "learning_rate": 0.00010925990895856996, + "loss": 0.0, + "num_input_tokens_seen": 67714304, + "step": 39515 + }, + { + "epoch": 191.84261501210653, + "grad_norm": 1.0344512801907513e-08, + "learning_rate": 0.00010702359322385946, + "loss": 0.0, + "num_input_tokens_seen": 67723328, + "step": 39520 + }, + { + "epoch": 191.86682808716708, + "grad_norm": 8.57935855691494e-09, + "learning_rate": 0.00010481039286977523, + "loss": 0.0, + "num_input_tokens_seen": 67731808, + "step": 39525 + }, + { + "epoch": 191.8910411622276, + "grad_norm": 5.461352259317209e-09, + "learning_rate": 0.00010262030823764423, + "loss": 0.0, + "num_input_tokens_seen": 67740000, + "step": 39530 + }, + { + "epoch": 191.91525423728814, + "grad_norm": 1.2672317417639078e-08, + "learning_rate": 0.00010045333966517966, + "loss": 0.0, + "num_input_tokens_seen": 67748832, + "step": 39535 + }, + { + "epoch": 191.93946731234865, + "grad_norm": 1.2980903463244431e-08, + "learning_rate": 9.83094874865642e-05, + "loss": 0.0, + "num_input_tokens_seen": 67757536, + "step": 39540 + }, + { + "epoch": 191.9636803874092, + "grad_norm": 7.187736184022242e-09, + "learning_rate": 9.618875203241672e-05, + "loss": 0.0, + "num_input_tokens_seen": 67766048, + "step": 39545 + }, + { + "epoch": 191.98789346246974, + "grad_norm": 1.049791187313076e-08, + "learning_rate": 9.409113362977561e-05, + "loss": 0.0, + "num_input_tokens_seen": 67774592, + "step": 39550 + }, + { + "epoch": 192.01452784503633, + "grad_norm": 5.122563262460744e-09, + "learning_rate": 9.20166326020988e-05, + "loss": 0.0, + "num_input_tokens_seen": 67783200, + "step": 39555 + }, + { + "epoch": 192.03874092009684, + "grad_norm": 7.087893827417702e-09, + "learning_rate": 8.996524926933035e-05, + "loss": 0.0, + "num_input_tokens_seen": 67791744, + "step": 39560 + }, + { + "epoch": 192.0629539951574, + "grad_norm": 1.6647934586444535e-08, + "learning_rate": 8.793698394781723e-05, + "loss": 0.0, + "num_input_tokens_seen": 67800512, + "step": 39565 + }, + { + "epoch": 192.08716707021793, + "grad_norm": 8.324517075664062e-09, + "learning_rate": 8.593183695030926e-05, + "loss": 0.0, + "num_input_tokens_seen": 67808992, + "step": 39570 + }, + { + "epoch": 192.11138014527845, + "grad_norm": 8.33554025803096e-09, + "learning_rate": 8.39498085860757e-05, + "loss": 0.0, + "num_input_tokens_seen": 67817312, + "step": 39575 + }, + { + "epoch": 192.135593220339, + "grad_norm": 1.7934054241663944e-08, + "learning_rate": 8.199089916072211e-05, + "loss": 0.0, + "num_input_tokens_seen": 67825856, + "step": 39580 + }, + { + "epoch": 192.1598062953995, + "grad_norm": 1.1661906107462983e-08, + "learning_rate": 8.005510897637346e-05, + "loss": 0.0, + "num_input_tokens_seen": 67834688, + "step": 39585 + }, + { + "epoch": 192.18401937046005, + "grad_norm": 9.70885327689075e-09, + "learning_rate": 7.8142438331541e-05, + "loss": 0.0, + "num_input_tokens_seen": 67843424, + "step": 39590 + }, + { + "epoch": 192.2082324455206, + "grad_norm": 8.07418842896368e-09, + "learning_rate": 7.625288752117209e-05, + "loss": 0.0, + "num_input_tokens_seen": 67851552, + "step": 39595 + }, + { + "epoch": 192.2324455205811, + "grad_norm": 7.1299552928394405e-09, + "learning_rate": 7.4386456836667e-05, + "loss": 0.0, + "num_input_tokens_seen": 67860384, + "step": 39600 + }, + { + "epoch": 192.2324455205811, + "eval_loss": 1.2012585401535034, + "eval_runtime": 4.6314, + "eval_samples_per_second": 79.241, + "eval_steps_per_second": 19.864, + "num_input_tokens_seen": 67860384, + "step": 39600 + }, + { + "epoch": 192.25665859564165, + "grad_norm": 8.02380828446303e-09, + "learning_rate": 7.254314656586214e-05, + "loss": 0.0, + "num_input_tokens_seen": 67869056, + "step": 39605 + }, + { + "epoch": 192.28087167070217, + "grad_norm": 4.171614609305152e-09, + "learning_rate": 7.07229569929968e-05, + "loss": 0.0, + "num_input_tokens_seen": 67877664, + "step": 39610 + }, + { + "epoch": 192.3050847457627, + "grad_norm": 5.970178573733165e-09, + "learning_rate": 6.892588839879643e-05, + "loss": 0.0, + "num_input_tokens_seen": 67886048, + "step": 39615 + }, + { + "epoch": 192.32929782082326, + "grad_norm": 6.397711693750807e-09, + "learning_rate": 6.71519410603727e-05, + "loss": 0.0, + "num_input_tokens_seen": 67894720, + "step": 39620 + }, + { + "epoch": 192.35351089588377, + "grad_norm": 1.5048685853003008e-08, + "learning_rate": 6.540111525129011e-05, + "loss": 0.0, + "num_input_tokens_seen": 67903264, + "step": 39625 + }, + { + "epoch": 192.37772397094432, + "grad_norm": 3.8603218399657635e-09, + "learning_rate": 6.367341124154934e-05, + "loss": 0.0, + "num_input_tokens_seen": 67912064, + "step": 39630 + }, + { + "epoch": 192.40193704600483, + "grad_norm": 5.599810393164262e-09, + "learning_rate": 6.19688292975873e-05, + "loss": 0.0, + "num_input_tokens_seen": 67920672, + "step": 39635 + }, + { + "epoch": 192.42615012106538, + "grad_norm": 9.790128707720669e-09, + "learning_rate": 6.0287369682260336e-05, + "loss": 0.0, + "num_input_tokens_seen": 67929376, + "step": 39640 + }, + { + "epoch": 192.45036319612592, + "grad_norm": 4.5638572920836395e-09, + "learning_rate": 5.8629032654894384e-05, + "loss": 0.0, + "num_input_tokens_seen": 67937984, + "step": 39645 + }, + { + "epoch": 192.47457627118644, + "grad_norm": 8.59153193033535e-09, + "learning_rate": 5.699381847120155e-05, + "loss": 0.0, + "num_input_tokens_seen": 67946592, + "step": 39650 + }, + { + "epoch": 192.49878934624698, + "grad_norm": 1.8082653596707132e-08, + "learning_rate": 5.5381727383380094e-05, + "loss": 0.0, + "num_input_tokens_seen": 67955456, + "step": 39655 + }, + { + "epoch": 192.5230024213075, + "grad_norm": 2.7125679480377585e-09, + "learning_rate": 5.379275964001451e-05, + "loss": 0.0, + "num_input_tokens_seen": 67964224, + "step": 39660 + }, + { + "epoch": 192.54721549636804, + "grad_norm": 9.531119893324558e-09, + "learning_rate": 5.222691548614211e-05, + "loss": 0.0, + "num_input_tokens_seen": 67972576, + "step": 39665 + }, + { + "epoch": 192.57142857142858, + "grad_norm": 5.0360560166495816e-09, + "learning_rate": 5.068419516323641e-05, + "loss": 0.0, + "num_input_tokens_seen": 67981184, + "step": 39670 + }, + { + "epoch": 192.5956416464891, + "grad_norm": 7.706735694057443e-09, + "learning_rate": 4.91645989092071e-05, + "loss": 0.0, + "num_input_tokens_seen": 67989536, + "step": 39675 + }, + { + "epoch": 192.61985472154964, + "grad_norm": 2.321115744052804e-08, + "learning_rate": 4.7668126958400056e-05, + "loss": 0.0, + "num_input_tokens_seen": 67998176, + "step": 39680 + }, + { + "epoch": 192.64406779661016, + "grad_norm": 4.4944052923767686e-09, + "learning_rate": 4.619477954159734e-05, + "loss": 0.0, + "num_input_tokens_seen": 68006784, + "step": 39685 + }, + { + "epoch": 192.6682808716707, + "grad_norm": 7.1476087271094e-09, + "learning_rate": 4.4744556885983884e-05, + "loss": 0.0, + "num_input_tokens_seen": 68015264, + "step": 39690 + }, + { + "epoch": 192.69249394673125, + "grad_norm": 4.5045736030147054e-09, + "learning_rate": 4.331745921523078e-05, + "loss": 0.0, + "num_input_tokens_seen": 68023712, + "step": 39695 + }, + { + "epoch": 192.71670702179176, + "grad_norm": 1.1168723723642415e-08, + "learning_rate": 4.191348674937867e-05, + "loss": 0.0, + "num_input_tokens_seen": 68032032, + "step": 39700 + }, + { + "epoch": 192.7409200968523, + "grad_norm": 3.5343863391545938e-09, + "learning_rate": 4.0532639704971006e-05, + "loss": 0.0, + "num_input_tokens_seen": 68040768, + "step": 39705 + }, + { + "epoch": 192.76513317191282, + "grad_norm": 9.162683056729293e-09, + "learning_rate": 3.917491829493747e-05, + "loss": 0.0, + "num_input_tokens_seen": 68049536, + "step": 39710 + }, + { + "epoch": 192.78934624697337, + "grad_norm": 6.6644263441162366e-09, + "learning_rate": 3.78403227286439e-05, + "loss": 0.0, + "num_input_tokens_seen": 68058208, + "step": 39715 + }, + { + "epoch": 192.8135593220339, + "grad_norm": 1.1155426804521085e-08, + "learning_rate": 3.652885321192567e-05, + "loss": 0.0, + "num_input_tokens_seen": 68066880, + "step": 39720 + }, + { + "epoch": 192.83777239709443, + "grad_norm": 1.9236185977433706e-09, + "learning_rate": 3.524050994702099e-05, + "loss": 0.0, + "num_input_tokens_seen": 68075296, + "step": 39725 + }, + { + "epoch": 192.86198547215497, + "grad_norm": 7.474444174704331e-09, + "learning_rate": 3.3975293132604276e-05, + "loss": 0.0, + "num_input_tokens_seen": 68083520, + "step": 39730 + }, + { + "epoch": 192.88619854721549, + "grad_norm": 1.1781502884389283e-08, + "learning_rate": 3.2733202963786125e-05, + "loss": 0.0, + "num_input_tokens_seen": 68091968, + "step": 39735 + }, + { + "epoch": 192.91041162227603, + "grad_norm": 1.495798862549691e-08, + "learning_rate": 3.15142396321133e-05, + "loss": 0.0, + "num_input_tokens_seen": 68100544, + "step": 39740 + }, + { + "epoch": 192.93462469733657, + "grad_norm": 8.997370848362607e-09, + "learning_rate": 3.0318403325552132e-05, + "loss": 0.0, + "num_input_tokens_seen": 68108832, + "step": 39745 + }, + { + "epoch": 192.9588377723971, + "grad_norm": 1.231182622518645e-08, + "learning_rate": 2.914569422855506e-05, + "loss": 0.0, + "num_input_tokens_seen": 68117056, + "step": 39750 + }, + { + "epoch": 192.98305084745763, + "grad_norm": 7.461319562196422e-09, + "learning_rate": 2.7996112521927462e-05, + "loss": 0.0, + "num_input_tokens_seen": 68126016, + "step": 39755 + }, + { + "epoch": 193.00968523002422, + "grad_norm": 6.549600861660565e-09, + "learning_rate": 2.68696583829775e-05, + "loss": 0.0, + "num_input_tokens_seen": 68134432, + "step": 39760 + }, + { + "epoch": 193.03389830508473, + "grad_norm": 6.465103563613184e-09, + "learning_rate": 2.576633198539957e-05, + "loss": 0.0, + "num_input_tokens_seen": 68143008, + "step": 39765 + }, + { + "epoch": 193.05811138014528, + "grad_norm": 7.471937735203937e-09, + "learning_rate": 2.46861334993409e-05, + "loss": 0.0, + "num_input_tokens_seen": 68151488, + "step": 39770 + }, + { + "epoch": 193.08232445520582, + "grad_norm": 4.277898035809358e-09, + "learning_rate": 2.3629063091384903e-05, + "loss": 0.0, + "num_input_tokens_seen": 68160608, + "step": 39775 + }, + { + "epoch": 193.10653753026634, + "grad_norm": 1.0671558747787913e-08, + "learning_rate": 2.2595120924567834e-05, + "loss": 0.0, + "num_input_tokens_seen": 68168832, + "step": 39780 + }, + { + "epoch": 193.13075060532688, + "grad_norm": 2.0730572813931758e-08, + "learning_rate": 2.158430715829551e-05, + "loss": 0.0, + "num_input_tokens_seen": 68177600, + "step": 39785 + }, + { + "epoch": 193.1549636803874, + "grad_norm": 8.453669764207916e-09, + "learning_rate": 2.059662194849321e-05, + "loss": 0.0, + "num_input_tokens_seen": 68185984, + "step": 39790 + }, + { + "epoch": 193.17917675544794, + "grad_norm": 4.135311648667539e-09, + "learning_rate": 1.9632065447422463e-05, + "loss": 0.0, + "num_input_tokens_seen": 68194592, + "step": 39795 + }, + { + "epoch": 193.20338983050848, + "grad_norm": 6.011432240882186e-09, + "learning_rate": 1.8690637803880916e-05, + "loss": 0.0, + "num_input_tokens_seen": 68203104, + "step": 39800 + }, + { + "epoch": 193.20338983050848, + "eval_loss": 1.1948976516723633, + "eval_runtime": 4.629, + "eval_samples_per_second": 79.282, + "eval_steps_per_second": 19.875, + "num_input_tokens_seen": 68203104, + "step": 39800 + }, + { + "epoch": 193.227602905569, + "grad_norm": 8.936114959112729e-09, + "learning_rate": 1.7772339163019123e-05, + "loss": 0.0, + "num_input_tokens_seen": 68211744, + "step": 39805 + }, + { + "epoch": 193.25181598062954, + "grad_norm": 7.549651570570859e-09, + "learning_rate": 1.6877169666457138e-05, + "loss": 0.0, + "num_input_tokens_seen": 68220192, + "step": 39810 + }, + { + "epoch": 193.27602905569006, + "grad_norm": 7.3670767264388815e-09, + "learning_rate": 1.6005129452234532e-05, + "loss": 0.0, + "num_input_tokens_seen": 68228576, + "step": 39815 + }, + { + "epoch": 193.3002421307506, + "grad_norm": 1.1678288558414351e-08, + "learning_rate": 1.5156218654843733e-05, + "loss": 0.0, + "num_input_tokens_seen": 68236768, + "step": 39820 + }, + { + "epoch": 193.32445520581115, + "grad_norm": 1.2802464866012997e-08, + "learning_rate": 1.4330437405196683e-05, + "loss": 0.0, + "num_input_tokens_seen": 68245216, + "step": 39825 + }, + { + "epoch": 193.34866828087166, + "grad_norm": 7.139688396051724e-09, + "learning_rate": 1.352778583062486e-05, + "loss": 0.0, + "num_input_tokens_seen": 68253696, + "step": 39830 + }, + { + "epoch": 193.3728813559322, + "grad_norm": 1.0564473740259928e-08, + "learning_rate": 1.2748264054929237e-05, + "loss": 0.0, + "num_input_tokens_seen": 68262624, + "step": 39835 + }, + { + "epoch": 193.39709443099272, + "grad_norm": 8.246963112412686e-09, + "learning_rate": 1.1991872198297004e-05, + "loss": 0.0, + "num_input_tokens_seen": 68270976, + "step": 39840 + }, + { + "epoch": 193.42130750605327, + "grad_norm": 3.969837791828468e-09, + "learning_rate": 1.1258610377384847e-05, + "loss": 0.0, + "num_input_tokens_seen": 68279360, + "step": 39845 + }, + { + "epoch": 193.4455205811138, + "grad_norm": 8.597496936602056e-09, + "learning_rate": 1.0548478705268982e-05, + "loss": 0.0, + "num_input_tokens_seen": 68287712, + "step": 39850 + }, + { + "epoch": 193.46973365617433, + "grad_norm": 1.0073179623759643e-08, + "learning_rate": 9.86147729147846e-06, + "loss": 0.0, + "num_input_tokens_seen": 68295968, + "step": 39855 + }, + { + "epoch": 193.49394673123487, + "grad_norm": 7.214366437580111e-09, + "learning_rate": 9.197606241928557e-06, + "loss": 0.0, + "num_input_tokens_seen": 68304800, + "step": 39860 + }, + { + "epoch": 193.5181598062954, + "grad_norm": 8.031278753151128e-09, + "learning_rate": 8.556865659004042e-06, + "loss": 0.0, + "num_input_tokens_seen": 68313472, + "step": 39865 + }, + { + "epoch": 193.54237288135593, + "grad_norm": 9.800484868094372e-09, + "learning_rate": 7.939255641525867e-06, + "loss": 0.0, + "num_input_tokens_seen": 68321792, + "step": 39870 + }, + { + "epoch": 193.56658595641647, + "grad_norm": 6.337202318462687e-09, + "learning_rate": 7.344776284751164e-06, + "loss": 0.0, + "num_input_tokens_seen": 68330656, + "step": 39875 + }, + { + "epoch": 193.590799031477, + "grad_norm": 1.4799685033040078e-08, + "learning_rate": 6.773427680323296e-06, + "loss": 0.0, + "num_input_tokens_seen": 68339232, + "step": 39880 + }, + { + "epoch": 193.61501210653753, + "grad_norm": 7.927944523089536e-09, + "learning_rate": 6.225209916355112e-06, + "loss": 0.0, + "num_input_tokens_seen": 68347744, + "step": 39885 + }, + { + "epoch": 193.63922518159805, + "grad_norm": 2.9216314700875046e-08, + "learning_rate": 5.7001230774123e-06, + "loss": 0.0, + "num_input_tokens_seen": 68356192, + "step": 39890 + }, + { + "epoch": 193.6634382566586, + "grad_norm": 8.98770835533469e-09, + "learning_rate": 5.198167244446772e-06, + "loss": 0.0, + "num_input_tokens_seen": 68364768, + "step": 39895 + }, + { + "epoch": 193.68765133171914, + "grad_norm": 1.4785298318997775e-08, + "learning_rate": 4.71934249487993e-06, + "loss": 0.0, + "num_input_tokens_seen": 68373568, + "step": 39900 + }, + { + "epoch": 193.71186440677965, + "grad_norm": 7.754361597278603e-09, + "learning_rate": 4.2636489025527075e-06, + "loss": 0.0, + "num_input_tokens_seen": 68382272, + "step": 39905 + }, + { + "epoch": 193.7360774818402, + "grad_norm": 4.130647379696484e-09, + "learning_rate": 3.831086537742223e-06, + "loss": 0.0, + "num_input_tokens_seen": 68390784, + "step": 39910 + }, + { + "epoch": 193.7602905569007, + "grad_norm": 7.225145370881592e-09, + "learning_rate": 3.4216554671451236e-06, + "loss": 0.0, + "num_input_tokens_seen": 68399008, + "step": 39915 + }, + { + "epoch": 193.78450363196126, + "grad_norm": 1.3592209135993016e-08, + "learning_rate": 3.035355753894242e-06, + "loss": 0.0, + "num_input_tokens_seen": 68407520, + "step": 39920 + }, + { + "epoch": 193.8087167070218, + "grad_norm": 7.988923300672468e-09, + "learning_rate": 2.6721874575752477e-06, + "loss": 0.0, + "num_input_tokens_seen": 68416192, + "step": 39925 + }, + { + "epoch": 193.83292978208232, + "grad_norm": 5.3434527913509555e-09, + "learning_rate": 2.3321506341933418e-06, + "loss": 0.0, + "num_input_tokens_seen": 68425056, + "step": 39930 + }, + { + "epoch": 193.85714285714286, + "grad_norm": 9.710205972623953e-09, + "learning_rate": 2.0152453361732546e-06, + "loss": 0.0, + "num_input_tokens_seen": 68433856, + "step": 39935 + }, + { + "epoch": 193.88135593220338, + "grad_norm": 1.2193986265174317e-08, + "learning_rate": 1.7214716123925554e-06, + "loss": 0.0, + "num_input_tokens_seen": 68441984, + "step": 39940 + }, + { + "epoch": 193.90556900726392, + "grad_norm": 1.013553507789311e-08, + "learning_rate": 1.4508295081649968e-06, + "loss": 0.0, + "num_input_tokens_seen": 68450912, + "step": 39945 + }, + { + "epoch": 193.92978208232446, + "grad_norm": 7.52465201259156e-09, + "learning_rate": 1.2033190652238623e-06, + "loss": 0.0, + "num_input_tokens_seen": 68459296, + "step": 39950 + }, + { + "epoch": 193.95399515738498, + "grad_norm": 6.60289822818072e-09, + "learning_rate": 9.78940321721966e-07, + "loss": 0.0, + "num_input_tokens_seen": 68467744, + "step": 39955 + }, + { + "epoch": 193.97820823244552, + "grad_norm": 1.2542222371791922e-08, + "learning_rate": 7.776933122816132e-07, + "loss": 0.0, + "num_input_tokens_seen": 68476256, + "step": 39960 + }, + { + "epoch": 194.0048426150121, + "grad_norm": 1.5228719618676223e-08, + "learning_rate": 5.99578067927986e-07, + "loss": 0.0, + "num_input_tokens_seen": 68485248, + "step": 39965 + }, + { + "epoch": 194.02905569007265, + "grad_norm": 1.1641703601128484e-08, + "learning_rate": 4.445946161224512e-07, + "loss": 0.0, + "num_input_tokens_seen": 68493536, + "step": 39970 + }, + { + "epoch": 194.05326876513317, + "grad_norm": 6.591942103284509e-09, + "learning_rate": 3.127429807792126e-07, + "loss": 0.0, + "num_input_tokens_seen": 68502304, + "step": 39975 + }, + { + "epoch": 194.0774818401937, + "grad_norm": 1.8309997074084094e-08, + "learning_rate": 2.040231822320049e-07, + "loss": 0.0, + "num_input_tokens_seen": 68510848, + "step": 39980 + }, + { + "epoch": 194.10169491525423, + "grad_norm": 1.2873580423899966e-08, + "learning_rate": 1.1843523723409354e-07, + "loss": 0.0, + "num_input_tokens_seen": 68519488, + "step": 39985 + }, + { + "epoch": 194.12590799031477, + "grad_norm": 4.760449368035324e-09, + "learning_rate": 5.597915897492811e-08, + "loss": 0.0, + "num_input_tokens_seen": 68527808, + "step": 39990 + }, + { + "epoch": 194.15012106537532, + "grad_norm": 9.320007876567615e-09, + "learning_rate": 1.6654957113448885e-08, + "loss": 0.0, + "num_input_tokens_seen": 68536288, + "step": 39995 + }, + { + "epoch": 194.17433414043583, + "grad_norm": 9.812540113784962e-09, + "learning_rate": 4.626377114735902e-10, + "loss": 0.0, + "num_input_tokens_seen": 68544800, + "step": 40000 + }, + { + "epoch": 194.17433414043583, + "eval_loss": 1.1948976516723633, + "eval_runtime": 4.6236, + "eval_samples_per_second": 79.376, + "eval_steps_per_second": 19.898, + "num_input_tokens_seen": 68544800, + "step": 40000 + }, + { + "epoch": 194.17433414043583, + "num_input_tokens_seen": 68544800, + "step": 40000, + "total_flos": 2.870228767660032e+17, + "train_loss": 0.021644555192604012, + "train_runtime": 17219.8108, + "train_samples_per_second": 37.166, + "train_steps_per_second": 2.323 + } + ], + "logging_steps": 5, + "max_steps": 40000, + "num_input_tokens_seen": 68544800, + "num_train_epochs": 195, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.870228767660032e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}