diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,76497 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.959226430298147, + "eval_steps": 100, + "global_step": 10800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006446414182111201, + "grad_norm": 43.225736299194885, + "learning_rate": 3.3333333333333333e-06, + "loss": 9.105, + "step": 1 + }, + { + "epoch": 0.0012892828364222402, + "grad_norm": 41.23441491434792, + "learning_rate": 6.666666666666667e-06, + "loss": 9.0819, + "step": 2 + }, + { + "epoch": 0.0019339242546333602, + "grad_norm": 15.907340472671878, + "learning_rate": 1e-05, + "loss": 9.1065, + "step": 3 + }, + { + "epoch": 0.0025785656728444803, + "grad_norm": 12.458171856079126, + "learning_rate": 1.3333333333333333e-05, + "loss": 8.9377, + "step": 4 + }, + { + "epoch": 0.0032232070910556, + "grad_norm": 12.926673548679421, + "learning_rate": 1.6666666666666667e-05, + "loss": 8.8016, + "step": 5 + }, + { + "epoch": 0.0038678485092667205, + "grad_norm": 68.52883535141684, + "learning_rate": 2e-05, + "loss": 9.0373, + "step": 6 + }, + { + "epoch": 0.004512489927477841, + "grad_norm": 9.872945865284116, + "learning_rate": 2.3333333333333336e-05, + "loss": 8.8323, + "step": 7 + }, + { + "epoch": 0.005157131345688961, + "grad_norm": 10.212500729686894, + "learning_rate": 2.6666666666666667e-05, + "loss": 8.7798, + "step": 8 + }, + { + "epoch": 0.0058017727639000805, + "grad_norm": 14.469522094055176, + "learning_rate": 3e-05, + "loss": 8.8002, + "step": 9 + }, + { + "epoch": 0.0064464141821112, + "grad_norm": 9.079036742307167, + "learning_rate": 3.3333333333333335e-05, + "loss": 8.9482, + "step": 10 + }, + { + "epoch": 0.007091055600322321, + "grad_norm": 10.951942290689393, + "learning_rate": 3.6666666666666666e-05, + "loss": 8.755, + "step": 11 + }, + { + "epoch": 0.007735697018533441, + "grad_norm": 7.991712319215799, + "learning_rate": 4e-05, + "loss": 8.6958, + "step": 12 + }, + { + "epoch": 0.00838033843674456, + "grad_norm": 11.340541381601554, + "learning_rate": 4.3333333333333334e-05, + "loss": 8.5721, + "step": 13 + }, + { + "epoch": 0.009024979854955682, + "grad_norm": 15.466454418032095, + "learning_rate": 4.666666666666667e-05, + "loss": 8.7281, + "step": 14 + }, + { + "epoch": 0.009669621273166801, + "grad_norm": 8.76134915548763, + "learning_rate": 5e-05, + "loss": 8.6837, + "step": 15 + }, + { + "epoch": 0.010314262691377921, + "grad_norm": 10.378502687879507, + "learning_rate": 5.333333333333333e-05, + "loss": 8.5217, + "step": 16 + }, + { + "epoch": 0.010958904109589041, + "grad_norm": 8.639790726421793, + "learning_rate": 5.666666666666667e-05, + "loss": 8.4443, + "step": 17 + }, + { + "epoch": 0.011603545527800161, + "grad_norm": 97.26940703124015, + "learning_rate": 6e-05, + "loss": 9.5095, + "step": 18 + }, + { + "epoch": 0.012248186946011281, + "grad_norm": 11.305773878631868, + "learning_rate": 6.333333333333333e-05, + "loss": 8.6443, + "step": 19 + }, + { + "epoch": 0.0128928283642224, + "grad_norm": 6.89176356833831, + "learning_rate": 6.666666666666667e-05, + "loss": 8.6874, + "step": 20 + }, + { + "epoch": 0.01353746978243352, + "grad_norm": 6.247628046272242, + "learning_rate": 7e-05, + "loss": 8.6289, + "step": 21 + }, + { + "epoch": 0.014182111200644642, + "grad_norm": 9.579877841635856, + "learning_rate": 7.333333333333333e-05, + "loss": 8.5533, + "step": 22 + }, + { + "epoch": 0.014826752618855762, + "grad_norm": 5.310685387322702, + "learning_rate": 7.666666666666667e-05, + "loss": 8.56, + "step": 23 + }, + { + "epoch": 0.015471394037066882, + "grad_norm": 5.730095679095229, + "learning_rate": 8e-05, + "loss": 8.4114, + "step": 24 + }, + { + "epoch": 0.016116035455278, + "grad_norm": 6.915980361818145, + "learning_rate": 8.333333333333334e-05, + "loss": 8.3424, + "step": 25 + }, + { + "epoch": 0.01676067687348912, + "grad_norm": 9.983692591682646, + "learning_rate": 8.666666666666667e-05, + "loss": 8.3078, + "step": 26 + }, + { + "epoch": 0.017405318291700243, + "grad_norm": 13.813936829695146, + "learning_rate": 9e-05, + "loss": 8.3936, + "step": 27 + }, + { + "epoch": 0.018049959709911363, + "grad_norm": 8.820514442868676, + "learning_rate": 9.333333333333334e-05, + "loss": 8.4065, + "step": 28 + }, + { + "epoch": 0.018694601128122483, + "grad_norm": 8.070674197286117, + "learning_rate": 9.666666666666667e-05, + "loss": 8.2562, + "step": 29 + }, + { + "epoch": 0.019339242546333603, + "grad_norm": 13.83187694563091, + "learning_rate": 0.0001, + "loss": 8.3914, + "step": 30 + }, + { + "epoch": 0.019983883964544723, + "grad_norm": 11.53240436854111, + "learning_rate": 9.999999995935123e-05, + "loss": 8.4987, + "step": 31 + }, + { + "epoch": 0.020628525382755843, + "grad_norm": 5.39255566455533, + "learning_rate": 9.999999983740493e-05, + "loss": 8.2928, + "step": 32 + }, + { + "epoch": 0.021273166800966962, + "grad_norm": 5.727942816526761, + "learning_rate": 9.999999963416107e-05, + "loss": 8.1656, + "step": 33 + }, + { + "epoch": 0.021917808219178082, + "grad_norm": 7.507389291876974, + "learning_rate": 9.999999934961967e-05, + "loss": 7.9432, + "step": 34 + }, + { + "epoch": 0.022562449637389202, + "grad_norm": 8.04975169184875, + "learning_rate": 9.999999898378072e-05, + "loss": 7.9081, + "step": 35 + }, + { + "epoch": 0.023207091055600322, + "grad_norm": 8.681221304167282, + "learning_rate": 9.999999853664426e-05, + "loss": 8.1007, + "step": 36 + }, + { + "epoch": 0.023851732473811442, + "grad_norm": 6.171665878474078, + "learning_rate": 9.999999800821023e-05, + "loss": 7.948, + "step": 37 + }, + { + "epoch": 0.024496373892022562, + "grad_norm": 5.620555471416655, + "learning_rate": 9.999999739847867e-05, + "loss": 7.7609, + "step": 38 + }, + { + "epoch": 0.02514101531023368, + "grad_norm": 5.1586695531381235, + "learning_rate": 9.999999670744958e-05, + "loss": 7.6551, + "step": 39 + }, + { + "epoch": 0.0257856567284448, + "grad_norm": 4.934223336037897, + "learning_rate": 9.999999593512296e-05, + "loss": 7.5509, + "step": 40 + }, + { + "epoch": 0.02643029814665592, + "grad_norm": 6.109246435612051, + "learning_rate": 9.999999508149877e-05, + "loss": 7.3739, + "step": 41 + }, + { + "epoch": 0.02707493956486704, + "grad_norm": 7.316858636364242, + "learning_rate": 9.999999414657707e-05, + "loss": 7.3475, + "step": 42 + }, + { + "epoch": 0.02771958098307816, + "grad_norm": 13.26656226400544, + "learning_rate": 9.999999313035785e-05, + "loss": 7.5775, + "step": 43 + }, + { + "epoch": 0.028364222401289284, + "grad_norm": 12.118609322742586, + "learning_rate": 9.999999203284109e-05, + "loss": 7.3643, + "step": 44 + }, + { + "epoch": 0.029008863819500404, + "grad_norm": 9.528426931921327, + "learning_rate": 9.999999085402679e-05, + "loss": 7.1046, + "step": 45 + }, + { + "epoch": 0.029653505237711524, + "grad_norm": 5.290710256395323, + "learning_rate": 9.999998959391498e-05, + "loss": 6.9564, + "step": 46 + }, + { + "epoch": 0.030298146655922644, + "grad_norm": 7.670909914294247, + "learning_rate": 9.999998825250562e-05, + "loss": 6.9162, + "step": 47 + }, + { + "epoch": 0.030942788074133764, + "grad_norm": 4.829678768541371, + "learning_rate": 9.999998682979875e-05, + "loss": 6.7778, + "step": 48 + }, + { + "epoch": 0.03158742949234488, + "grad_norm": 7.405710160381378, + "learning_rate": 9.999998532579436e-05, + "loss": 6.7805, + "step": 49 + }, + { + "epoch": 0.032232070910556, + "grad_norm": 14.171925690264661, + "learning_rate": 9.999998374049244e-05, + "loss": 6.925, + "step": 50 + }, + { + "epoch": 0.03287671232876712, + "grad_norm": 10.146138664227712, + "learning_rate": 9.999998207389302e-05, + "loss": 6.6256, + "step": 51 + }, + { + "epoch": 0.03352135374697824, + "grad_norm": 6.518742158375556, + "learning_rate": 9.99999803259961e-05, + "loss": 6.9816, + "step": 52 + }, + { + "epoch": 0.03416599516518937, + "grad_norm": 7.120173184841264, + "learning_rate": 9.999997849680164e-05, + "loss": 6.4697, + "step": 53 + }, + { + "epoch": 0.03481063658340049, + "grad_norm": 10.604411127792721, + "learning_rate": 9.999997658630968e-05, + "loss": 6.6151, + "step": 54 + }, + { + "epoch": 0.035455278001611606, + "grad_norm": 9.379751636432653, + "learning_rate": 9.999997459452024e-05, + "loss": 6.9064, + "step": 55 + }, + { + "epoch": 0.036099919419822726, + "grad_norm": 5.910756342750617, + "learning_rate": 9.999997252143329e-05, + "loss": 6.3922, + "step": 56 + }, + { + "epoch": 0.036744560838033846, + "grad_norm": 6.702806551432813, + "learning_rate": 9.999997036704883e-05, + "loss": 6.5398, + "step": 57 + }, + { + "epoch": 0.037389202256244966, + "grad_norm": 7.743873074047672, + "learning_rate": 9.999996813136687e-05, + "loss": 5.8836, + "step": 58 + }, + { + "epoch": 0.038033843674456086, + "grad_norm": 13.979442241050457, + "learning_rate": 9.999996581438743e-05, + "loss": 6.7299, + "step": 59 + }, + { + "epoch": 0.038678485092667206, + "grad_norm": 11.590766964176805, + "learning_rate": 9.999996341611051e-05, + "loss": 7.031, + "step": 60 + }, + { + "epoch": 0.039323126510878326, + "grad_norm": 7.990833354630332, + "learning_rate": 9.99999609365361e-05, + "loss": 6.3883, + "step": 61 + }, + { + "epoch": 0.039967767929089446, + "grad_norm": 7.565783507357148, + "learning_rate": 9.999995837566422e-05, + "loss": 6.1486, + "step": 62 + }, + { + "epoch": 0.040612409347300565, + "grad_norm": 7.521787895065162, + "learning_rate": 9.999995573349487e-05, + "loss": 6.2271, + "step": 63 + }, + { + "epoch": 0.041257050765511685, + "grad_norm": 5.20384245243941, + "learning_rate": 9.999995301002803e-05, + "loss": 6.4071, + "step": 64 + }, + { + "epoch": 0.041901692183722805, + "grad_norm": 5.313064805883646, + "learning_rate": 9.999995020526375e-05, + "loss": 6.1491, + "step": 65 + }, + { + "epoch": 0.042546333601933925, + "grad_norm": 5.6325028560515875, + "learning_rate": 9.9999947319202e-05, + "loss": 6.1839, + "step": 66 + }, + { + "epoch": 0.043190975020145045, + "grad_norm": 5.450268049870203, + "learning_rate": 9.999994435184278e-05, + "loss": 6.0881, + "step": 67 + }, + { + "epoch": 0.043835616438356165, + "grad_norm": 6.149115049047647, + "learning_rate": 9.999994130318613e-05, + "loss": 5.6261, + "step": 68 + }, + { + "epoch": 0.044480257856567285, + "grad_norm": 8.094124950923076, + "learning_rate": 9.999993817323202e-05, + "loss": 6.0578, + "step": 69 + }, + { + "epoch": 0.045124899274778404, + "grad_norm": 10.996564511371796, + "learning_rate": 9.999993496198048e-05, + "loss": 5.9195, + "step": 70 + }, + { + "epoch": 0.045769540692989524, + "grad_norm": 7.474384468877641, + "learning_rate": 9.999993166943149e-05, + "loss": 6.1172, + "step": 71 + }, + { + "epoch": 0.046414182111200644, + "grad_norm": 4.165412421773972, + "learning_rate": 9.999992829558507e-05, + "loss": 6.2218, + "step": 72 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 7.425355215810523, + "learning_rate": 9.999992484044124e-05, + "loss": 5.6763, + "step": 73 + }, + { + "epoch": 0.047703464947622884, + "grad_norm": 10.342151521836893, + "learning_rate": 9.9999921304e-05, + "loss": 5.6944, + "step": 74 + }, + { + "epoch": 0.048348106365834004, + "grad_norm": 12.732699042360029, + "learning_rate": 9.999991768626132e-05, + "loss": 5.7244, + "step": 75 + }, + { + "epoch": 0.048992747784045124, + "grad_norm": 6.524805606062701, + "learning_rate": 9.999991398722525e-05, + "loss": 6.1394, + "step": 76 + }, + { + "epoch": 0.049637389202256244, + "grad_norm": 8.39198678550137, + "learning_rate": 9.999991020689177e-05, + "loss": 5.6479, + "step": 77 + }, + { + "epoch": 0.05028203062046736, + "grad_norm": 6.399611304321509, + "learning_rate": 9.99999063452609e-05, + "loss": 5.9905, + "step": 78 + }, + { + "epoch": 0.05092667203867848, + "grad_norm": 9.17079609192792, + "learning_rate": 9.999990240233264e-05, + "loss": 5.744, + "step": 79 + }, + { + "epoch": 0.0515713134568896, + "grad_norm": 6.7944445749176055, + "learning_rate": 9.999989837810699e-05, + "loss": 5.634, + "step": 80 + }, + { + "epoch": 0.05221595487510072, + "grad_norm": 6.0802614579770795, + "learning_rate": 9.999989427258398e-05, + "loss": 5.4834, + "step": 81 + }, + { + "epoch": 0.05286059629331184, + "grad_norm": 4.886095739669595, + "learning_rate": 9.99998900857636e-05, + "loss": 5.6753, + "step": 82 + }, + { + "epoch": 0.05350523771152296, + "grad_norm": 5.181058181034736, + "learning_rate": 9.999988581764585e-05, + "loss": 5.8262, + "step": 83 + }, + { + "epoch": 0.05414987912973408, + "grad_norm": 4.829824415884137, + "learning_rate": 9.999988146823077e-05, + "loss": 5.5697, + "step": 84 + }, + { + "epoch": 0.0547945205479452, + "grad_norm": 3.9497783712424472, + "learning_rate": 9.99998770375183e-05, + "loss": 5.9824, + "step": 85 + }, + { + "epoch": 0.05543916196615632, + "grad_norm": 4.589811306862617, + "learning_rate": 9.999987252550851e-05, + "loss": 5.872, + "step": 86 + }, + { + "epoch": 0.05608380338436744, + "grad_norm": 3.0634180083702978, + "learning_rate": 9.99998679322014e-05, + "loss": 5.9285, + "step": 87 + }, + { + "epoch": 0.05672844480257857, + "grad_norm": 5.963753726245155, + "learning_rate": 9.999986325759695e-05, + "loss": 5.3921, + "step": 88 + }, + { + "epoch": 0.05737308622078969, + "grad_norm": 7.184885264134631, + "learning_rate": 9.999985850169519e-05, + "loss": 5.6798, + "step": 89 + }, + { + "epoch": 0.05801772763900081, + "grad_norm": 9.922878581255617, + "learning_rate": 9.999985366449611e-05, + "loss": 5.7425, + "step": 90 + }, + { + "epoch": 0.05866236905721193, + "grad_norm": 9.640504025921846, + "learning_rate": 9.999984874599974e-05, + "loss": 5.898, + "step": 91 + }, + { + "epoch": 0.05930701047542305, + "grad_norm": 6.052807762462658, + "learning_rate": 9.999984374620607e-05, + "loss": 5.5247, + "step": 92 + }, + { + "epoch": 0.05995165189363417, + "grad_norm": 8.662194848356444, + "learning_rate": 9.999983866511511e-05, + "loss": 5.4294, + "step": 93 + }, + { + "epoch": 0.06059629331184529, + "grad_norm": 7.37400852999737, + "learning_rate": 9.999983350272688e-05, + "loss": 5.8414, + "step": 94 + }, + { + "epoch": 0.06124093473005641, + "grad_norm": 5.980850673085141, + "learning_rate": 9.999982825904138e-05, + "loss": 5.4159, + "step": 95 + }, + { + "epoch": 0.06188557614826753, + "grad_norm": 3.660752737849657, + "learning_rate": 9.999982293405861e-05, + "loss": 6.0144, + "step": 96 + }, + { + "epoch": 0.06253021756647864, + "grad_norm": 5.4149059251885046, + "learning_rate": 9.999981752777859e-05, + "loss": 5.6584, + "step": 97 + }, + { + "epoch": 0.06317485898468976, + "grad_norm": 4.792895750557159, + "learning_rate": 9.999981204020135e-05, + "loss": 5.6016, + "step": 98 + }, + { + "epoch": 0.06381950040290088, + "grad_norm": 5.247353673265185, + "learning_rate": 9.999980647132686e-05, + "loss": 5.7303, + "step": 99 + }, + { + "epoch": 0.064464141821112, + "grad_norm": 4.899451896007287, + "learning_rate": 9.999980082115515e-05, + "loss": 5.3114, + "step": 100 + }, + { + "epoch": 0.064464141821112, + "eval_loss": 5.579843044281006, + "eval_runtime": 2.9339, + "eval_samples_per_second": 34.084, + "eval_steps_per_second": 4.431, + "step": 100 + }, + { + "epoch": 0.06510878323932312, + "grad_norm": 4.3326854484652815, + "learning_rate": 9.999979508968622e-05, + "loss": 5.6477, + "step": 101 + }, + { + "epoch": 0.06575342465753424, + "grad_norm": 4.618845080260888, + "learning_rate": 9.999978927692008e-05, + "loss": 5.4135, + "step": 102 + }, + { + "epoch": 0.06639806607574536, + "grad_norm": 4.224630638781767, + "learning_rate": 9.999978338285676e-05, + "loss": 5.0982, + "step": 103 + }, + { + "epoch": 0.06704270749395648, + "grad_norm": 8.64022781308353, + "learning_rate": 9.999977740749624e-05, + "loss": 5.1099, + "step": 104 + }, + { + "epoch": 0.06768734891216761, + "grad_norm": 15.992988741070668, + "learning_rate": 9.999977135083855e-05, + "loss": 5.5541, + "step": 105 + }, + { + "epoch": 0.06833199033037873, + "grad_norm": 13.556834019882018, + "learning_rate": 9.999976521288367e-05, + "loss": 5.8743, + "step": 106 + }, + { + "epoch": 0.06897663174858985, + "grad_norm": 5.695374654039294, + "learning_rate": 9.999975899363167e-05, + "loss": 5.5746, + "step": 107 + }, + { + "epoch": 0.06962127316680097, + "grad_norm": 5.768590754844079, + "learning_rate": 9.999975269308249e-05, + "loss": 5.5058, + "step": 108 + }, + { + "epoch": 0.0702659145850121, + "grad_norm": 4.707287430839085, + "learning_rate": 9.999974631123622e-05, + "loss": 5.5941, + "step": 109 + }, + { + "epoch": 0.07091055600322321, + "grad_norm": 5.124624173071176, + "learning_rate": 9.999973984809279e-05, + "loss": 5.6159, + "step": 110 + }, + { + "epoch": 0.07155519742143433, + "grad_norm": 4.3310383588830605, + "learning_rate": 9.999973330365225e-05, + "loss": 5.4112, + "step": 111 + }, + { + "epoch": 0.07219983883964545, + "grad_norm": 3.7355619837082292, + "learning_rate": 9.99997266779146e-05, + "loss": 5.231, + "step": 112 + }, + { + "epoch": 0.07284448025785657, + "grad_norm": 4.178340019675862, + "learning_rate": 9.999971997087987e-05, + "loss": 5.4078, + "step": 113 + }, + { + "epoch": 0.07348912167606769, + "grad_norm": 3.996413908813565, + "learning_rate": 9.999971318254804e-05, + "loss": 5.3049, + "step": 114 + }, + { + "epoch": 0.07413376309427881, + "grad_norm": 3.516721335367199, + "learning_rate": 9.999970631291915e-05, + "loss": 5.7028, + "step": 115 + }, + { + "epoch": 0.07477840451248993, + "grad_norm": 3.681008265070767, + "learning_rate": 9.99996993619932e-05, + "loss": 5.3903, + "step": 116 + }, + { + "epoch": 0.07542304593070105, + "grad_norm": 4.464236323062881, + "learning_rate": 9.99996923297702e-05, + "loss": 5.1775, + "step": 117 + }, + { + "epoch": 0.07606768734891217, + "grad_norm": 9.366619521190778, + "learning_rate": 9.999968521625016e-05, + "loss": 5.2574, + "step": 118 + }, + { + "epoch": 0.07671232876712329, + "grad_norm": 18.221613243809884, + "learning_rate": 9.999967802143311e-05, + "loss": 5.4616, + "step": 119 + }, + { + "epoch": 0.07735697018533441, + "grad_norm": 9.317885941700492, + "learning_rate": 9.999967074531902e-05, + "loss": 5.6286, + "step": 120 + }, + { + "epoch": 0.07800161160354553, + "grad_norm": 9.849651688866306, + "learning_rate": 9.999966338790795e-05, + "loss": 5.2229, + "step": 121 + }, + { + "epoch": 0.07864625302175665, + "grad_norm": 7.1823719075069885, + "learning_rate": 9.99996559491999e-05, + "loss": 5.2653, + "step": 122 + }, + { + "epoch": 0.07929089443996777, + "grad_norm": 6.917019020588738, + "learning_rate": 9.999964842919485e-05, + "loss": 5.851, + "step": 123 + }, + { + "epoch": 0.07993553585817889, + "grad_norm": 3.932244153320896, + "learning_rate": 9.999964082789284e-05, + "loss": 5.3134, + "step": 124 + }, + { + "epoch": 0.08058017727639001, + "grad_norm": 5.880913228999087, + "learning_rate": 9.999963314529389e-05, + "loss": 5.7289, + "step": 125 + }, + { + "epoch": 0.08122481869460113, + "grad_norm": 4.336444655313896, + "learning_rate": 9.999962538139798e-05, + "loss": 5.7306, + "step": 126 + }, + { + "epoch": 0.08186946011281225, + "grad_norm": 4.780068336037182, + "learning_rate": 9.999961753620516e-05, + "loss": 5.0133, + "step": 127 + }, + { + "epoch": 0.08251410153102337, + "grad_norm": 5.8897131603436845, + "learning_rate": 9.999960960971541e-05, + "loss": 5.485, + "step": 128 + }, + { + "epoch": 0.08315874294923449, + "grad_norm": 3.6264522886884705, + "learning_rate": 9.999960160192877e-05, + "loss": 5.8617, + "step": 129 + }, + { + "epoch": 0.08380338436744561, + "grad_norm": 4.099823697781331, + "learning_rate": 9.999959351284524e-05, + "loss": 5.5206, + "step": 130 + }, + { + "epoch": 0.08444802578565673, + "grad_norm": 4.659672472755925, + "learning_rate": 9.999958534246485e-05, + "loss": 5.5683, + "step": 131 + }, + { + "epoch": 0.08509266720386785, + "grad_norm": 3.3817056100123675, + "learning_rate": 9.999957709078758e-05, + "loss": 5.6798, + "step": 132 + }, + { + "epoch": 0.08573730862207897, + "grad_norm": 3.472333412533513, + "learning_rate": 9.999956875781347e-05, + "loss": 5.2953, + "step": 133 + }, + { + "epoch": 0.08638195004029009, + "grad_norm": 2.4786250309461955, + "learning_rate": 9.999956034354252e-05, + "loss": 5.831, + "step": 134 + }, + { + "epoch": 0.08702659145850121, + "grad_norm": 3.76288381464272, + "learning_rate": 9.999955184797475e-05, + "loss": 5.4385, + "step": 135 + }, + { + "epoch": 0.08767123287671233, + "grad_norm": 5.791207326301022, + "learning_rate": 9.999954327111019e-05, + "loss": 5.0939, + "step": 136 + }, + { + "epoch": 0.08831587429492345, + "grad_norm": 6.771127700052826, + "learning_rate": 9.999953461294882e-05, + "loss": 5.4708, + "step": 137 + }, + { + "epoch": 0.08896051571313457, + "grad_norm": 3.979817290056849, + "learning_rate": 9.999952587349067e-05, + "loss": 5.5888, + "step": 138 + }, + { + "epoch": 0.08960515713134569, + "grad_norm": 3.449064269089117, + "learning_rate": 9.999951705273578e-05, + "loss": 5.8011, + "step": 139 + }, + { + "epoch": 0.09024979854955681, + "grad_norm": 4.947891652846447, + "learning_rate": 9.99995081506841e-05, + "loss": 5.2569, + "step": 140 + }, + { + "epoch": 0.09089443996776793, + "grad_norm": 7.1840722354806505, + "learning_rate": 9.999949916733572e-05, + "loss": 5.4505, + "step": 141 + }, + { + "epoch": 0.09153908138597905, + "grad_norm": 5.8459968829712325, + "learning_rate": 9.999949010269063e-05, + "loss": 5.331, + "step": 142 + }, + { + "epoch": 0.09218372280419017, + "grad_norm": 4.256222878055927, + "learning_rate": 9.999948095674881e-05, + "loss": 5.2742, + "step": 143 + }, + { + "epoch": 0.09282836422240129, + "grad_norm": 5.165337341236512, + "learning_rate": 9.99994717295103e-05, + "loss": 5.4469, + "step": 144 + }, + { + "epoch": 0.09347300564061241, + "grad_norm": 5.51068320546613, + "learning_rate": 9.999946242097512e-05, + "loss": 5.3663, + "step": 145 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 4.572035479383961, + "learning_rate": 9.999945303114328e-05, + "loss": 5.362, + "step": 146 + }, + { + "epoch": 0.09476228847703465, + "grad_norm": 5.352271372496922, + "learning_rate": 9.999944356001481e-05, + "loss": 4.8973, + "step": 147 + }, + { + "epoch": 0.09540692989524577, + "grad_norm": 9.890339858762893, + "learning_rate": 9.999943400758969e-05, + "loss": 5.178, + "step": 148 + }, + { + "epoch": 0.09605157131345689, + "grad_norm": 10.858422165721173, + "learning_rate": 9.999942437386797e-05, + "loss": 5.2198, + "step": 149 + }, + { + "epoch": 0.09669621273166801, + "grad_norm": 4.2170558626908, + "learning_rate": 9.999941465884964e-05, + "loss": 5.3309, + "step": 150 + }, + { + "epoch": 0.09734085414987913, + "grad_norm": 10.110169156619758, + "learning_rate": 9.999940486253475e-05, + "loss": 4.8918, + "step": 151 + }, + { + "epoch": 0.09798549556809025, + "grad_norm": 11.021633145582525, + "learning_rate": 9.999939498492328e-05, + "loss": 4.9036, + "step": 152 + }, + { + "epoch": 0.09863013698630137, + "grad_norm": 4.965507093271856, + "learning_rate": 9.999938502601526e-05, + "loss": 5.129, + "step": 153 + }, + { + "epoch": 0.09927477840451249, + "grad_norm": 7.25036015249297, + "learning_rate": 9.999937498581073e-05, + "loss": 5.1785, + "step": 154 + }, + { + "epoch": 0.0999194198227236, + "grad_norm": 4.858606802971374, + "learning_rate": 9.999936486430967e-05, + "loss": 5.1516, + "step": 155 + }, + { + "epoch": 0.10056406124093473, + "grad_norm": 5.438682217738336, + "learning_rate": 9.99993546615121e-05, + "loss": 4.8438, + "step": 156 + }, + { + "epoch": 0.10120870265914585, + "grad_norm": 4.965898615366433, + "learning_rate": 9.999934437741806e-05, + "loss": 5.4529, + "step": 157 + }, + { + "epoch": 0.10185334407735697, + "grad_norm": 4.5726714955167616, + "learning_rate": 9.999933401202755e-05, + "loss": 5.3825, + "step": 158 + }, + { + "epoch": 0.10249798549556809, + "grad_norm": 3.9583610114178147, + "learning_rate": 9.99993235653406e-05, + "loss": 5.1649, + "step": 159 + }, + { + "epoch": 0.1031426269137792, + "grad_norm": 3.5156499436799082, + "learning_rate": 9.99993130373572e-05, + "loss": 5.4595, + "step": 160 + }, + { + "epoch": 0.10378726833199033, + "grad_norm": 3.1946584266362317, + "learning_rate": 9.99993024280774e-05, + "loss": 5.5664, + "step": 161 + }, + { + "epoch": 0.10443190975020145, + "grad_norm": 3.5985491799277862, + "learning_rate": 9.999929173750121e-05, + "loss": 5.1838, + "step": 162 + }, + { + "epoch": 0.10507655116841257, + "grad_norm": 3.759106511385916, + "learning_rate": 9.999928096562863e-05, + "loss": 5.0436, + "step": 163 + }, + { + "epoch": 0.10572119258662369, + "grad_norm": 3.9329671886457085, + "learning_rate": 9.999927011245969e-05, + "loss": 5.2177, + "step": 164 + }, + { + "epoch": 0.1063658340048348, + "grad_norm": 3.860172759993439, + "learning_rate": 9.99992591779944e-05, + "loss": 4.9539, + "step": 165 + }, + { + "epoch": 0.10701047542304593, + "grad_norm": 2.990270505135037, + "learning_rate": 9.99992481622328e-05, + "loss": 5.2678, + "step": 166 + }, + { + "epoch": 0.10765511684125705, + "grad_norm": 3.6323964393962043, + "learning_rate": 9.99992370651749e-05, + "loss": 5.13, + "step": 167 + }, + { + "epoch": 0.10829975825946817, + "grad_norm": 4.374275691910658, + "learning_rate": 9.999922588682068e-05, + "loss": 5.0327, + "step": 168 + }, + { + "epoch": 0.10894439967767929, + "grad_norm": 3.4791636788537494, + "learning_rate": 9.99992146271702e-05, + "loss": 5.5595, + "step": 169 + }, + { + "epoch": 0.1095890410958904, + "grad_norm": 2.90450116553435, + "learning_rate": 9.999920328622348e-05, + "loss": 4.9961, + "step": 170 + }, + { + "epoch": 0.11023368251410152, + "grad_norm": 4.025352912676699, + "learning_rate": 9.999919186398052e-05, + "loss": 4.6688, + "step": 171 + }, + { + "epoch": 0.11087832393231264, + "grad_norm": 5.985784826080111, + "learning_rate": 9.999918036044135e-05, + "loss": 4.9104, + "step": 172 + }, + { + "epoch": 0.11152296535052376, + "grad_norm": 6.864153923608067, + "learning_rate": 9.999916877560596e-05, + "loss": 5.2944, + "step": 173 + }, + { + "epoch": 0.11216760676873488, + "grad_norm": 4.054484162664398, + "learning_rate": 9.999915710947443e-05, + "loss": 5.22, + "step": 174 + }, + { + "epoch": 0.11281224818694602, + "grad_norm": 4.822630442035845, + "learning_rate": 9.99991453620467e-05, + "loss": 4.7398, + "step": 175 + }, + { + "epoch": 0.11345688960515714, + "grad_norm": 9.00057836198595, + "learning_rate": 9.999913353332287e-05, + "loss": 4.3381, + "step": 176 + }, + { + "epoch": 0.11410153102336826, + "grad_norm": 10.168472278321952, + "learning_rate": 9.99991216233029e-05, + "loss": 5.4225, + "step": 177 + }, + { + "epoch": 0.11474617244157938, + "grad_norm": 6.647747146701357, + "learning_rate": 9.999910963198684e-05, + "loss": 4.9923, + "step": 178 + }, + { + "epoch": 0.1153908138597905, + "grad_norm": 6.6596466966752175, + "learning_rate": 9.99990975593747e-05, + "loss": 4.9179, + "step": 179 + }, + { + "epoch": 0.11603545527800162, + "grad_norm": 6.019737683332903, + "learning_rate": 9.99990854054665e-05, + "loss": 4.8442, + "step": 180 + }, + { + "epoch": 0.11668009669621274, + "grad_norm": 5.512115690553869, + "learning_rate": 9.999907317026225e-05, + "loss": 4.9278, + "step": 181 + }, + { + "epoch": 0.11732473811442386, + "grad_norm": 4.290290031748646, + "learning_rate": 9.999906085376199e-05, + "loss": 4.7681, + "step": 182 + }, + { + "epoch": 0.11796937953263498, + "grad_norm": 3.7956521105619543, + "learning_rate": 9.999904845596572e-05, + "loss": 4.9949, + "step": 183 + }, + { + "epoch": 0.1186140209508461, + "grad_norm": 4.5812097995542835, + "learning_rate": 9.999903597687347e-05, + "loss": 4.8115, + "step": 184 + }, + { + "epoch": 0.11925866236905722, + "grad_norm": 3.1559798847366856, + "learning_rate": 9.999902341648527e-05, + "loss": 5.0741, + "step": 185 + }, + { + "epoch": 0.11990330378726834, + "grad_norm": 3.278844716282679, + "learning_rate": 9.999901077480112e-05, + "loss": 4.7518, + "step": 186 + }, + { + "epoch": 0.12054794520547946, + "grad_norm": 3.3407160029348315, + "learning_rate": 9.999899805182108e-05, + "loss": 5.3026, + "step": 187 + }, + { + "epoch": 0.12119258662369058, + "grad_norm": 3.46831194569787, + "learning_rate": 9.999898524754513e-05, + "loss": 5.0216, + "step": 188 + }, + { + "epoch": 0.1218372280419017, + "grad_norm": 4.520875290390497, + "learning_rate": 9.99989723619733e-05, + "loss": 5.0006, + "step": 189 + }, + { + "epoch": 0.12248186946011282, + "grad_norm": 5.34213540890158, + "learning_rate": 9.999895939510561e-05, + "loss": 5.041, + "step": 190 + }, + { + "epoch": 0.12312651087832394, + "grad_norm": 6.724514855075693, + "learning_rate": 9.99989463469421e-05, + "loss": 5.1447, + "step": 191 + }, + { + "epoch": 0.12377115229653506, + "grad_norm": 4.972529025364966, + "learning_rate": 9.999893321748276e-05, + "loss": 5.2691, + "step": 192 + }, + { + "epoch": 0.12441579371474618, + "grad_norm": 3.7265798084419623, + "learning_rate": 9.999892000672763e-05, + "loss": 4.9824, + "step": 193 + }, + { + "epoch": 0.12506043513295728, + "grad_norm": 10.362187023871595, + "learning_rate": 9.999890671467676e-05, + "loss": 4.9157, + "step": 194 + }, + { + "epoch": 0.12570507655116842, + "grad_norm": 13.64735166163176, + "learning_rate": 9.999889334133012e-05, + "loss": 5.2685, + "step": 195 + }, + { + "epoch": 0.12634971796937952, + "grad_norm": 9.579911655926098, + "learning_rate": 9.999887988668776e-05, + "loss": 4.7333, + "step": 196 + }, + { + "epoch": 0.12699435938759066, + "grad_norm": 4.087301068729188, + "learning_rate": 9.999886635074969e-05, + "loss": 5.3468, + "step": 197 + }, + { + "epoch": 0.12763900080580176, + "grad_norm": 5.091720377941156, + "learning_rate": 9.999885273351595e-05, + "loss": 5.1433, + "step": 198 + }, + { + "epoch": 0.1282836422240129, + "grad_norm": 5.015247100114651, + "learning_rate": 9.999883903498655e-05, + "loss": 4.8933, + "step": 199 + }, + { + "epoch": 0.128928283642224, + "grad_norm": 5.4071273251098875, + "learning_rate": 9.999882525516152e-05, + "loss": 4.9437, + "step": 200 + }, + { + "epoch": 0.128928283642224, + "eval_loss": 5.09897518157959, + "eval_runtime": 2.9252, + "eval_samples_per_second": 34.186, + "eval_steps_per_second": 4.444, + "step": 200 + }, + { + "epoch": 0.12957292506043513, + "grad_norm": 3.355122544309545, + "learning_rate": 9.999881139404086e-05, + "loss": 4.6434, + "step": 201 + }, + { + "epoch": 0.13021756647864624, + "grad_norm": 3.7502567201152264, + "learning_rate": 9.999879745162463e-05, + "loss": 4.8991, + "step": 202 + }, + { + "epoch": 0.13086220789685737, + "grad_norm": 2.933867887637151, + "learning_rate": 9.999878342791282e-05, + "loss": 5.3459, + "step": 203 + }, + { + "epoch": 0.13150684931506848, + "grad_norm": 4.456476083808473, + "learning_rate": 9.999876932290547e-05, + "loss": 5.2759, + "step": 204 + }, + { + "epoch": 0.13215149073327961, + "grad_norm": 2.7896395563406426, + "learning_rate": 9.999875513660261e-05, + "loss": 5.1996, + "step": 205 + }, + { + "epoch": 0.13279613215149072, + "grad_norm": 5.027410294851459, + "learning_rate": 9.999874086900422e-05, + "loss": 4.7426, + "step": 206 + }, + { + "epoch": 0.13344077356970185, + "grad_norm": 7.918682544889744, + "learning_rate": 9.99987265201104e-05, + "loss": 4.773, + "step": 207 + }, + { + "epoch": 0.13408541498791296, + "grad_norm": 6.935053527457102, + "learning_rate": 9.99987120899211e-05, + "loss": 5.2676, + "step": 208 + }, + { + "epoch": 0.1347300564061241, + "grad_norm": 4.462261981836091, + "learning_rate": 9.999869757843637e-05, + "loss": 4.5648, + "step": 209 + }, + { + "epoch": 0.13537469782433523, + "grad_norm": 7.936747116657285, + "learning_rate": 9.999868298565626e-05, + "loss": 4.7778, + "step": 210 + }, + { + "epoch": 0.13601933924254633, + "grad_norm": 7.962340141096699, + "learning_rate": 9.999866831158076e-05, + "loss": 4.5058, + "step": 211 + }, + { + "epoch": 0.13666398066075747, + "grad_norm": 8.929401776861692, + "learning_rate": 9.99986535562099e-05, + "loss": 5.044, + "step": 212 + }, + { + "epoch": 0.13730862207896857, + "grad_norm": 6.5837564748361155, + "learning_rate": 9.99986387195437e-05, + "loss": 4.5309, + "step": 213 + }, + { + "epoch": 0.1379532634971797, + "grad_norm": 4.627883250661504, + "learning_rate": 9.999862380158221e-05, + "loss": 5.2746, + "step": 214 + }, + { + "epoch": 0.1385979049153908, + "grad_norm": 3.473910103278132, + "learning_rate": 9.999860880232543e-05, + "loss": 5.5459, + "step": 215 + }, + { + "epoch": 0.13924254633360195, + "grad_norm": 4.178251829731227, + "learning_rate": 9.999859372177339e-05, + "loss": 5.0182, + "step": 216 + }, + { + "epoch": 0.13988718775181305, + "grad_norm": 4.447585938509419, + "learning_rate": 9.999857855992612e-05, + "loss": 4.6576, + "step": 217 + }, + { + "epoch": 0.1405318291700242, + "grad_norm": 3.279792229001329, + "learning_rate": 9.999856331678365e-05, + "loss": 5.0255, + "step": 218 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 3.4997190880191753, + "learning_rate": 9.999854799234599e-05, + "loss": 5.0859, + "step": 219 + }, + { + "epoch": 0.14182111200644643, + "grad_norm": 3.5814419997058726, + "learning_rate": 9.999853258661319e-05, + "loss": 5.0192, + "step": 220 + }, + { + "epoch": 0.14246575342465753, + "grad_norm": 4.962336366143524, + "learning_rate": 9.999851709958524e-05, + "loss": 4.8675, + "step": 221 + }, + { + "epoch": 0.14311039484286867, + "grad_norm": 4.123806135568619, + "learning_rate": 9.999850153126217e-05, + "loss": 5.0481, + "step": 222 + }, + { + "epoch": 0.14375503626107977, + "grad_norm": 3.705299808724894, + "learning_rate": 9.999848588164404e-05, + "loss": 4.9193, + "step": 223 + }, + { + "epoch": 0.1443996776792909, + "grad_norm": 3.817484932735612, + "learning_rate": 9.999847015073085e-05, + "loss": 5.1289, + "step": 224 + }, + { + "epoch": 0.145044319097502, + "grad_norm": 5.575431463917602, + "learning_rate": 9.999845433852264e-05, + "loss": 4.9049, + "step": 225 + }, + { + "epoch": 0.14568896051571315, + "grad_norm": 5.611789233608045, + "learning_rate": 9.999843844501941e-05, + "loss": 5.2368, + "step": 226 + }, + { + "epoch": 0.14633360193392425, + "grad_norm": 4.18702234197498, + "learning_rate": 9.99984224702212e-05, + "loss": 4.7178, + "step": 227 + }, + { + "epoch": 0.14697824335213538, + "grad_norm": 3.8455276382564896, + "learning_rate": 9.999840641412806e-05, + "loss": 4.6198, + "step": 228 + }, + { + "epoch": 0.1476228847703465, + "grad_norm": 4.720937057348135, + "learning_rate": 9.999839027673998e-05, + "loss": 4.3429, + "step": 229 + }, + { + "epoch": 0.14826752618855762, + "grad_norm": 5.034387109255324, + "learning_rate": 9.9998374058057e-05, + "loss": 4.7661, + "step": 230 + }, + { + "epoch": 0.14891216760676873, + "grad_norm": 5.251234591020346, + "learning_rate": 9.999835775807916e-05, + "loss": 4.8577, + "step": 231 + }, + { + "epoch": 0.14955680902497986, + "grad_norm": 5.436595499614909, + "learning_rate": 9.999834137680646e-05, + "loss": 4.7187, + "step": 232 + }, + { + "epoch": 0.15020145044319097, + "grad_norm": 4.788927815371221, + "learning_rate": 9.999832491423893e-05, + "loss": 5.3405, + "step": 233 + }, + { + "epoch": 0.1508460918614021, + "grad_norm": 4.6549188298408, + "learning_rate": 9.999830837037665e-05, + "loss": 4.7878, + "step": 234 + }, + { + "epoch": 0.1514907332796132, + "grad_norm": 6.137849896127002, + "learning_rate": 9.999829174521959e-05, + "loss": 4.8756, + "step": 235 + }, + { + "epoch": 0.15213537469782434, + "grad_norm": 5.3186456143919445, + "learning_rate": 9.999827503876778e-05, + "loss": 4.9678, + "step": 236 + }, + { + "epoch": 0.15278001611603545, + "grad_norm": 4.086711121958693, + "learning_rate": 9.999825825102128e-05, + "loss": 4.8436, + "step": 237 + }, + { + "epoch": 0.15342465753424658, + "grad_norm": 7.006623909422808, + "learning_rate": 9.999824138198008e-05, + "loss": 4.5141, + "step": 238 + }, + { + "epoch": 0.1540692989524577, + "grad_norm": 7.236116686093367, + "learning_rate": 9.999822443164422e-05, + "loss": 4.6871, + "step": 239 + }, + { + "epoch": 0.15471394037066882, + "grad_norm": 4.434597978176569, + "learning_rate": 9.999820740001375e-05, + "loss": 5.0417, + "step": 240 + }, + { + "epoch": 0.15535858178887993, + "grad_norm": 5.224869671509863, + "learning_rate": 9.999819028708867e-05, + "loss": 4.6435, + "step": 241 + }, + { + "epoch": 0.15600322320709106, + "grad_norm": 5.377373492726496, + "learning_rate": 9.999817309286905e-05, + "loss": 4.8459, + "step": 242 + }, + { + "epoch": 0.15664786462530217, + "grad_norm": 3.863652782785485, + "learning_rate": 9.999815581735486e-05, + "loss": 5.0905, + "step": 243 + }, + { + "epoch": 0.1572925060435133, + "grad_norm": 5.476463834163656, + "learning_rate": 9.999813846054617e-05, + "loss": 3.9883, + "step": 244 + }, + { + "epoch": 0.1579371474617244, + "grad_norm": 9.783205730108081, + "learning_rate": 9.999812102244298e-05, + "loss": 4.856, + "step": 245 + }, + { + "epoch": 0.15858178887993554, + "grad_norm": 8.091024165414996, + "learning_rate": 9.999810350304534e-05, + "loss": 4.8511, + "step": 246 + }, + { + "epoch": 0.15922643029814665, + "grad_norm": 6.066806050589971, + "learning_rate": 9.999808590235328e-05, + "loss": 5.2685, + "step": 247 + }, + { + "epoch": 0.15987107171635778, + "grad_norm": 4.27690378446299, + "learning_rate": 9.999806822036682e-05, + "loss": 4.7878, + "step": 248 + }, + { + "epoch": 0.1605157131345689, + "grad_norm": 6.304294418984425, + "learning_rate": 9.999805045708598e-05, + "loss": 4.7467, + "step": 249 + }, + { + "epoch": 0.16116035455278002, + "grad_norm": 6.4556509667633355, + "learning_rate": 9.999803261251081e-05, + "loss": 4.5159, + "step": 250 + }, + { + "epoch": 0.16180499597099113, + "grad_norm": 6.882359133782237, + "learning_rate": 9.999801468664132e-05, + "loss": 5.1018, + "step": 251 + }, + { + "epoch": 0.16244963738920226, + "grad_norm": 3.3226721493814946, + "learning_rate": 9.999799667947757e-05, + "loss": 4.9359, + "step": 252 + }, + { + "epoch": 0.16309427880741337, + "grad_norm": 5.181349765298118, + "learning_rate": 9.999797859101954e-05, + "loss": 4.8525, + "step": 253 + }, + { + "epoch": 0.1637389202256245, + "grad_norm": 5.289253855098666, + "learning_rate": 9.99979604212673e-05, + "loss": 4.5463, + "step": 254 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 4.5392390917591285, + "learning_rate": 9.999794217022085e-05, + "loss": 4.8316, + "step": 255 + }, + { + "epoch": 0.16502820306204674, + "grad_norm": 3.0387677976142595, + "learning_rate": 9.999792383788029e-05, + "loss": 4.6178, + "step": 256 + }, + { + "epoch": 0.16567284448025785, + "grad_norm": 3.8854755606172686, + "learning_rate": 9.999790542424555e-05, + "loss": 4.4447, + "step": 257 + }, + { + "epoch": 0.16631748589846898, + "grad_norm": 5.031420601524187, + "learning_rate": 9.99978869293167e-05, + "loss": 4.5958, + "step": 258 + }, + { + "epoch": 0.1669621273166801, + "grad_norm": 5.723851337105397, + "learning_rate": 9.999786835309381e-05, + "loss": 4.4145, + "step": 259 + }, + { + "epoch": 0.16760676873489122, + "grad_norm": 4.771295438201126, + "learning_rate": 9.999784969557688e-05, + "loss": 4.8616, + "step": 260 + }, + { + "epoch": 0.16825141015310233, + "grad_norm": 3.509958628671012, + "learning_rate": 9.999783095676592e-05, + "loss": 4.834, + "step": 261 + }, + { + "epoch": 0.16889605157131346, + "grad_norm": 3.5659920043661835, + "learning_rate": 9.999781213666098e-05, + "loss": 4.8557, + "step": 262 + }, + { + "epoch": 0.16954069298952457, + "grad_norm": 3.9190768017110478, + "learning_rate": 9.999779323526209e-05, + "loss": 4.485, + "step": 263 + }, + { + "epoch": 0.1701853344077357, + "grad_norm": 4.557375684352304, + "learning_rate": 9.99977742525693e-05, + "loss": 4.6651, + "step": 264 + }, + { + "epoch": 0.1708299758259468, + "grad_norm": 3.7900726142753363, + "learning_rate": 9.99977551885826e-05, + "loss": 4.775, + "step": 265 + }, + { + "epoch": 0.17147461724415794, + "grad_norm": 3.4056268736468303, + "learning_rate": 9.999773604330207e-05, + "loss": 4.5574, + "step": 266 + }, + { + "epoch": 0.17211925866236905, + "grad_norm": 3.94668185832934, + "learning_rate": 9.999771681672772e-05, + "loss": 4.7805, + "step": 267 + }, + { + "epoch": 0.17276390008058018, + "grad_norm": 4.040032376957788, + "learning_rate": 9.999769750885955e-05, + "loss": 4.4523, + "step": 268 + }, + { + "epoch": 0.17340854149879129, + "grad_norm": 3.765139453661529, + "learning_rate": 9.999767811969763e-05, + "loss": 4.6642, + "step": 269 + }, + { + "epoch": 0.17405318291700242, + "grad_norm": 3.391238147495879, + "learning_rate": 9.999765864924197e-05, + "loss": 4.7555, + "step": 270 + }, + { + "epoch": 0.17469782433521353, + "grad_norm": 4.786488125548618, + "learning_rate": 9.999763909749263e-05, + "loss": 4.9038, + "step": 271 + }, + { + "epoch": 0.17534246575342466, + "grad_norm": 3.6808356459816594, + "learning_rate": 9.999761946444962e-05, + "loss": 4.7447, + "step": 272 + }, + { + "epoch": 0.17598710717163576, + "grad_norm": 3.5548755283537874, + "learning_rate": 9.999759975011297e-05, + "loss": 4.1055, + "step": 273 + }, + { + "epoch": 0.1766317485898469, + "grad_norm": 7.018244663658134, + "learning_rate": 9.999757995448274e-05, + "loss": 4.6238, + "step": 274 + }, + { + "epoch": 0.177276390008058, + "grad_norm": 6.573395995791948, + "learning_rate": 9.999756007755893e-05, + "loss": 4.4882, + "step": 275 + }, + { + "epoch": 0.17792103142626914, + "grad_norm": 2.8241709858845376, + "learning_rate": 9.999754011934158e-05, + "loss": 4.4233, + "step": 276 + }, + { + "epoch": 0.17856567284448024, + "grad_norm": 4.887268486506562, + "learning_rate": 9.999752007983075e-05, + "loss": 4.6911, + "step": 277 + }, + { + "epoch": 0.17921031426269138, + "grad_norm": 3.7626074990682223, + "learning_rate": 9.999749995902642e-05, + "loss": 4.9047, + "step": 278 + }, + { + "epoch": 0.1798549556809025, + "grad_norm": 5.0323398063014615, + "learning_rate": 9.999747975692867e-05, + "loss": 4.8371, + "step": 279 + }, + { + "epoch": 0.18049959709911362, + "grad_norm": 4.402616026180502, + "learning_rate": 9.999745947353752e-05, + "loss": 4.5517, + "step": 280 + }, + { + "epoch": 0.18114423851732475, + "grad_norm": 3.8144584550338387, + "learning_rate": 9.999743910885298e-05, + "loss": 4.6945, + "step": 281 + }, + { + "epoch": 0.18178887993553586, + "grad_norm": 5.323030221504235, + "learning_rate": 9.999741866287512e-05, + "loss": 4.7921, + "step": 282 + }, + { + "epoch": 0.182433521353747, + "grad_norm": 5.087054087973355, + "learning_rate": 9.999739813560396e-05, + "loss": 4.205, + "step": 283 + }, + { + "epoch": 0.1830781627719581, + "grad_norm": 5.209701675196639, + "learning_rate": 9.999737752703954e-05, + "loss": 4.6644, + "step": 284 + }, + { + "epoch": 0.18372280419016923, + "grad_norm": 6.01219834972663, + "learning_rate": 9.999735683718186e-05, + "loss": 4.2488, + "step": 285 + }, + { + "epoch": 0.18436744560838034, + "grad_norm": 5.809444467377353, + "learning_rate": 9.9997336066031e-05, + "loss": 4.8188, + "step": 286 + }, + { + "epoch": 0.18501208702659147, + "grad_norm": 5.416332747384424, + "learning_rate": 9.999731521358696e-05, + "loss": 3.9966, + "step": 287 + }, + { + "epoch": 0.18565672844480258, + "grad_norm": 7.716575640567441, + "learning_rate": 9.99972942798498e-05, + "loss": 4.5401, + "step": 288 + }, + { + "epoch": 0.1863013698630137, + "grad_norm": 8.117625089309078, + "learning_rate": 9.999727326481953e-05, + "loss": 4.4945, + "step": 289 + }, + { + "epoch": 0.18694601128122482, + "grad_norm": 4.981787572544719, + "learning_rate": 9.99972521684962e-05, + "loss": 4.5782, + "step": 290 + }, + { + "epoch": 0.18759065269943595, + "grad_norm": 7.166635043900352, + "learning_rate": 9.999723099087984e-05, + "loss": 4.4426, + "step": 291 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 7.31574096252708, + "learning_rate": 9.999720973197048e-05, + "loss": 4.5737, + "step": 292 + }, + { + "epoch": 0.1888799355358582, + "grad_norm": 6.603299290156071, + "learning_rate": 9.999718839176817e-05, + "loss": 4.3012, + "step": 293 + }, + { + "epoch": 0.1895245769540693, + "grad_norm": 9.39682571136314, + "learning_rate": 9.999716697027292e-05, + "loss": 4.3474, + "step": 294 + }, + { + "epoch": 0.19016921837228043, + "grad_norm": 7.783367851852369, + "learning_rate": 9.99971454674848e-05, + "loss": 4.4627, + "step": 295 + }, + { + "epoch": 0.19081385979049154, + "grad_norm": 4.1983182344975, + "learning_rate": 9.999712388340382e-05, + "loss": 4.338, + "step": 296 + }, + { + "epoch": 0.19145850120870267, + "grad_norm": 5.30263494966984, + "learning_rate": 9.999710221803002e-05, + "loss": 4.7181, + "step": 297 + }, + { + "epoch": 0.19210314262691378, + "grad_norm": 3.039758204959515, + "learning_rate": 9.999708047136343e-05, + "loss": 4.411, + "step": 298 + }, + { + "epoch": 0.1927477840451249, + "grad_norm": 3.7672918387674876, + "learning_rate": 9.999705864340411e-05, + "loss": 4.7751, + "step": 299 + }, + { + "epoch": 0.19339242546333602, + "grad_norm": 2.979531374529863, + "learning_rate": 9.999703673415205e-05, + "loss": 4.6332, + "step": 300 + }, + { + "epoch": 0.19339242546333602, + "eval_loss": 4.650531768798828, + "eval_runtime": 2.926, + "eval_samples_per_second": 34.176, + "eval_steps_per_second": 4.443, + "step": 300 + }, + { + "epoch": 0.19403706688154715, + "grad_norm": 4.389392785389915, + "learning_rate": 9.999701474360735e-05, + "loss": 4.6983, + "step": 301 + }, + { + "epoch": 0.19468170829975825, + "grad_norm": 3.4727756150411846, + "learning_rate": 9.999699267176999e-05, + "loss": 4.5761, + "step": 302 + }, + { + "epoch": 0.1953263497179694, + "grad_norm": 3.124109882385939, + "learning_rate": 9.999697051864002e-05, + "loss": 4.8055, + "step": 303 + }, + { + "epoch": 0.1959709911361805, + "grad_norm": 4.399142427744377, + "learning_rate": 9.999694828421749e-05, + "loss": 4.6083, + "step": 304 + }, + { + "epoch": 0.19661563255439163, + "grad_norm": 4.655276126107738, + "learning_rate": 9.999692596850245e-05, + "loss": 4.548, + "step": 305 + }, + { + "epoch": 0.19726027397260273, + "grad_norm": 4.560263279992175, + "learning_rate": 9.99969035714949e-05, + "loss": 4.4804, + "step": 306 + }, + { + "epoch": 0.19790491539081387, + "grad_norm": 5.975935789469558, + "learning_rate": 9.999688109319489e-05, + "loss": 4.6216, + "step": 307 + }, + { + "epoch": 0.19854955680902497, + "grad_norm": 4.644992969300892, + "learning_rate": 9.999685853360246e-05, + "loss": 4.7535, + "step": 308 + }, + { + "epoch": 0.1991941982272361, + "grad_norm": 3.4100646333598346, + "learning_rate": 9.999683589271766e-05, + "loss": 4.4564, + "step": 309 + }, + { + "epoch": 0.1998388396454472, + "grad_norm": 3.9168102770872357, + "learning_rate": 9.999681317054049e-05, + "loss": 5.0187, + "step": 310 + }, + { + "epoch": 0.20048348106365835, + "grad_norm": 3.3336488384144665, + "learning_rate": 9.999679036707102e-05, + "loss": 4.4727, + "step": 311 + }, + { + "epoch": 0.20112812248186945, + "grad_norm": 5.191511795209375, + "learning_rate": 9.99967674823093e-05, + "loss": 4.8548, + "step": 312 + }, + { + "epoch": 0.2017727639000806, + "grad_norm": 5.617409210941599, + "learning_rate": 9.999674451625531e-05, + "loss": 4.5892, + "step": 313 + }, + { + "epoch": 0.2024174053182917, + "grad_norm": 5.27154801581341, + "learning_rate": 9.999672146890916e-05, + "loss": 4.3564, + "step": 314 + }, + { + "epoch": 0.20306204673650283, + "grad_norm": 4.706478687787812, + "learning_rate": 9.999669834027085e-05, + "loss": 4.7537, + "step": 315 + }, + { + "epoch": 0.20370668815471393, + "grad_norm": 4.133183500102297, + "learning_rate": 9.99966751303404e-05, + "loss": 4.0012, + "step": 316 + }, + { + "epoch": 0.20435132957292507, + "grad_norm": 4.207820439512977, + "learning_rate": 9.999665183911789e-05, + "loss": 4.6511, + "step": 317 + }, + { + "epoch": 0.20499597099113617, + "grad_norm": 4.509492103988937, + "learning_rate": 9.99966284666033e-05, + "loss": 4.3946, + "step": 318 + }, + { + "epoch": 0.2056406124093473, + "grad_norm": 5.766471822167658, + "learning_rate": 9.999660501279673e-05, + "loss": 4.6475, + "step": 319 + }, + { + "epoch": 0.2062852538275584, + "grad_norm": 3.9425272374585925, + "learning_rate": 9.999658147769821e-05, + "loss": 4.3873, + "step": 320 + }, + { + "epoch": 0.20692989524576955, + "grad_norm": 3.0305110270069844, + "learning_rate": 9.999655786130774e-05, + "loss": 5.0665, + "step": 321 + }, + { + "epoch": 0.20757453666398065, + "grad_norm": 5.158481745141194, + "learning_rate": 9.999653416362538e-05, + "loss": 4.5321, + "step": 322 + }, + { + "epoch": 0.20821917808219179, + "grad_norm": 9.237604268723633, + "learning_rate": 9.999651038465118e-05, + "loss": 4.5647, + "step": 323 + }, + { + "epoch": 0.2088638195004029, + "grad_norm": 8.8480069773693, + "learning_rate": 9.999648652438517e-05, + "loss": 4.4195, + "step": 324 + }, + { + "epoch": 0.20950846091861403, + "grad_norm": 4.904433832824293, + "learning_rate": 9.999646258282739e-05, + "loss": 4.5649, + "step": 325 + }, + { + "epoch": 0.21015310233682513, + "grad_norm": 5.154605724858217, + "learning_rate": 9.999643855997787e-05, + "loss": 4.6621, + "step": 326 + }, + { + "epoch": 0.21079774375503627, + "grad_norm": 3.8979447832934393, + "learning_rate": 9.999641445583666e-05, + "loss": 4.4269, + "step": 327 + }, + { + "epoch": 0.21144238517324737, + "grad_norm": 4.364872474616429, + "learning_rate": 9.99963902704038e-05, + "loss": 4.2818, + "step": 328 + }, + { + "epoch": 0.2120870265914585, + "grad_norm": 4.083412125534887, + "learning_rate": 9.999636600367932e-05, + "loss": 4.5471, + "step": 329 + }, + { + "epoch": 0.2127316680096696, + "grad_norm": 3.4845773909107, + "learning_rate": 9.999634165566327e-05, + "loss": 4.7991, + "step": 330 + }, + { + "epoch": 0.21337630942788074, + "grad_norm": 4.935479408002655, + "learning_rate": 9.999631722635567e-05, + "loss": 4.2821, + "step": 331 + }, + { + "epoch": 0.21402095084609185, + "grad_norm": 6.298957328672417, + "learning_rate": 9.99962927157566e-05, + "loss": 4.6441, + "step": 332 + }, + { + "epoch": 0.21466559226430298, + "grad_norm": 3.5499386728989584, + "learning_rate": 9.999626812386607e-05, + "loss": 4.7932, + "step": 333 + }, + { + "epoch": 0.2153102336825141, + "grad_norm": 3.7193372934236084, + "learning_rate": 9.999624345068413e-05, + "loss": 4.5787, + "step": 334 + }, + { + "epoch": 0.21595487510072522, + "grad_norm": 4.941515779244685, + "learning_rate": 9.999621869621081e-05, + "loss": 4.8077, + "step": 335 + }, + { + "epoch": 0.21659951651893633, + "grad_norm": 3.763871532896083, + "learning_rate": 9.999619386044614e-05, + "loss": 4.3453, + "step": 336 + }, + { + "epoch": 0.21724415793714746, + "grad_norm": 4.200108829491212, + "learning_rate": 9.999616894339021e-05, + "loss": 4.5263, + "step": 337 + }, + { + "epoch": 0.21788879935535857, + "grad_norm": 5.614216724770857, + "learning_rate": 9.999614394504301e-05, + "loss": 4.1261, + "step": 338 + }, + { + "epoch": 0.2185334407735697, + "grad_norm": 4.02344892914904, + "learning_rate": 9.999611886540462e-05, + "loss": 4.4603, + "step": 339 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 4.33783094831668, + "learning_rate": 9.999609370447504e-05, + "loss": 4.7328, + "step": 340 + }, + { + "epoch": 0.21982272360999194, + "grad_norm": 4.007865292728331, + "learning_rate": 9.999606846225436e-05, + "loss": 4.7112, + "step": 341 + }, + { + "epoch": 0.22046736502820305, + "grad_norm": 3.0769846107443994, + "learning_rate": 9.999604313874257e-05, + "loss": 4.1506, + "step": 342 + }, + { + "epoch": 0.22111200644641418, + "grad_norm": 4.153802237028372, + "learning_rate": 9.999601773393975e-05, + "loss": 3.8488, + "step": 343 + }, + { + "epoch": 0.2217566478646253, + "grad_norm": 7.404698682008955, + "learning_rate": 9.999599224784592e-05, + "loss": 4.0069, + "step": 344 + }, + { + "epoch": 0.22240128928283642, + "grad_norm": 11.275679821701262, + "learning_rate": 9.999596668046114e-05, + "loss": 4.3692, + "step": 345 + }, + { + "epoch": 0.22304593070104753, + "grad_norm": 8.932485254082923, + "learning_rate": 9.999594103178542e-05, + "loss": 4.5137, + "step": 346 + }, + { + "epoch": 0.22369057211925866, + "grad_norm": 4.42306120985609, + "learning_rate": 9.999591530181886e-05, + "loss": 4.3178, + "step": 347 + }, + { + "epoch": 0.22433521353746977, + "grad_norm": 7.133029755263635, + "learning_rate": 9.999588949056143e-05, + "loss": 4.2569, + "step": 348 + }, + { + "epoch": 0.2249798549556809, + "grad_norm": 5.627032058905624, + "learning_rate": 9.999586359801323e-05, + "loss": 4.5362, + "step": 349 + }, + { + "epoch": 0.22562449637389204, + "grad_norm": 4.930741614785397, + "learning_rate": 9.999583762417427e-05, + "loss": 4.4726, + "step": 350 + }, + { + "epoch": 0.22626913779210314, + "grad_norm": 6.087858416763526, + "learning_rate": 9.999581156904461e-05, + "loss": 4.3316, + "step": 351 + }, + { + "epoch": 0.22691377921031428, + "grad_norm": 5.594185600733333, + "learning_rate": 9.99957854326243e-05, + "loss": 4.7247, + "step": 352 + }, + { + "epoch": 0.22755842062852538, + "grad_norm": 6.013723078953912, + "learning_rate": 9.999575921491336e-05, + "loss": 4.4058, + "step": 353 + }, + { + "epoch": 0.22820306204673652, + "grad_norm": 4.416803950108259, + "learning_rate": 9.999573291591185e-05, + "loss": 4.5351, + "step": 354 + }, + { + "epoch": 0.22884770346494762, + "grad_norm": 4.245368853246479, + "learning_rate": 9.999570653561979e-05, + "loss": 4.708, + "step": 355 + }, + { + "epoch": 0.22949234488315876, + "grad_norm": 4.275110663217722, + "learning_rate": 9.999568007403724e-05, + "loss": 4.5568, + "step": 356 + }, + { + "epoch": 0.23013698630136986, + "grad_norm": 4.411446225351622, + "learning_rate": 9.999565353116425e-05, + "loss": 4.4112, + "step": 357 + }, + { + "epoch": 0.230781627719581, + "grad_norm": 4.432298112106817, + "learning_rate": 9.999562690700083e-05, + "loss": 4.3902, + "step": 358 + }, + { + "epoch": 0.2314262691377921, + "grad_norm": 4.705107142345392, + "learning_rate": 9.999560020154711e-05, + "loss": 4.4321, + "step": 359 + }, + { + "epoch": 0.23207091055600323, + "grad_norm": 6.373856183078859, + "learning_rate": 9.9995573414803e-05, + "loss": 4.4968, + "step": 360 + }, + { + "epoch": 0.23271555197421434, + "grad_norm": 5.041878486298434, + "learning_rate": 9.999554654676867e-05, + "loss": 4.5995, + "step": 361 + }, + { + "epoch": 0.23336019339242547, + "grad_norm": 4.383194551843206, + "learning_rate": 9.999551959744411e-05, + "loss": 4.3408, + "step": 362 + }, + { + "epoch": 0.23400483481063658, + "grad_norm": 5.222011931285447, + "learning_rate": 9.999549256682935e-05, + "loss": 4.215, + "step": 363 + }, + { + "epoch": 0.23464947622884771, + "grad_norm": 3.5153078169547736, + "learning_rate": 9.999546545492445e-05, + "loss": 4.3196, + "step": 364 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 5.398684757226614, + "learning_rate": 9.999543826172946e-05, + "loss": 4.4195, + "step": 365 + }, + { + "epoch": 0.23593875906526995, + "grad_norm": 5.061415116042662, + "learning_rate": 9.999541098724442e-05, + "loss": 4.6249, + "step": 366 + }, + { + "epoch": 0.23658340048348106, + "grad_norm": 2.8530379879509105, + "learning_rate": 9.999538363146939e-05, + "loss": 4.2743, + "step": 367 + }, + { + "epoch": 0.2372280419016922, + "grad_norm": 3.7170144402782404, + "learning_rate": 9.999535619440437e-05, + "loss": 4.4936, + "step": 368 + }, + { + "epoch": 0.2378726833199033, + "grad_norm": 4.649545185267731, + "learning_rate": 9.999532867604946e-05, + "loss": 4.402, + "step": 369 + }, + { + "epoch": 0.23851732473811443, + "grad_norm": 3.4138899903320357, + "learning_rate": 9.999530107640466e-05, + "loss": 4.2557, + "step": 370 + }, + { + "epoch": 0.23916196615632554, + "grad_norm": 3.986278648037336, + "learning_rate": 9.999527339547005e-05, + "loss": 3.9708, + "step": 371 + }, + { + "epoch": 0.23980660757453667, + "grad_norm": 4.520644385777344, + "learning_rate": 9.999524563324564e-05, + "loss": 4.9425, + "step": 372 + }, + { + "epoch": 0.24045124899274778, + "grad_norm": 3.202692563957076, + "learning_rate": 9.999521778973151e-05, + "loss": 4.5522, + "step": 373 + }, + { + "epoch": 0.2410958904109589, + "grad_norm": 4.994993990895645, + "learning_rate": 9.99951898649277e-05, + "loss": 4.0718, + "step": 374 + }, + { + "epoch": 0.24174053182917002, + "grad_norm": 4.813720929882255, + "learning_rate": 9.999516185883426e-05, + "loss": 4.7841, + "step": 375 + }, + { + "epoch": 0.24238517324738115, + "grad_norm": 2.6278706820224507, + "learning_rate": 9.999513377145117e-05, + "loss": 4.2211, + "step": 376 + }, + { + "epoch": 0.24302981466559226, + "grad_norm": 18.933863033463027, + "learning_rate": 9.999510560277858e-05, + "loss": 4.7922, + "step": 377 + }, + { + "epoch": 0.2436744560838034, + "grad_norm": 3.1100410306274116, + "learning_rate": 9.999507735281647e-05, + "loss": 4.5795, + "step": 378 + }, + { + "epoch": 0.2443190975020145, + "grad_norm": 5.425656438109345, + "learning_rate": 9.99950490215649e-05, + "loss": 4.1079, + "step": 379 + }, + { + "epoch": 0.24496373892022563, + "grad_norm": 3.4667251108095947, + "learning_rate": 9.999502060902392e-05, + "loss": 4.7242, + "step": 380 + }, + { + "epoch": 0.24560838033843674, + "grad_norm": 4.845626450509548, + "learning_rate": 9.999499211519359e-05, + "loss": 4.2621, + "step": 381 + }, + { + "epoch": 0.24625302175664787, + "grad_norm": 4.765358831240654, + "learning_rate": 9.999496354007393e-05, + "loss": 4.7861, + "step": 382 + }, + { + "epoch": 0.24689766317485898, + "grad_norm": 4.595596369014694, + "learning_rate": 9.9994934883665e-05, + "loss": 4.2936, + "step": 383 + }, + { + "epoch": 0.2475423045930701, + "grad_norm": 4.231156873958068, + "learning_rate": 9.999490614596683e-05, + "loss": 4.0816, + "step": 384 + }, + { + "epoch": 0.24818694601128122, + "grad_norm": 3.9561810312353347, + "learning_rate": 9.999487732697952e-05, + "loss": 4.1834, + "step": 385 + }, + { + "epoch": 0.24883158742949235, + "grad_norm": 4.141731784625404, + "learning_rate": 9.999484842670306e-05, + "loss": 3.9568, + "step": 386 + }, + { + "epoch": 0.24947622884770346, + "grad_norm": 4.142647207021182, + "learning_rate": 9.999481944513751e-05, + "loss": 4.5638, + "step": 387 + }, + { + "epoch": 0.25012087026591456, + "grad_norm": 3.508346939111211, + "learning_rate": 9.999479038228294e-05, + "loss": 4.2736, + "step": 388 + }, + { + "epoch": 0.2507655116841257, + "grad_norm": 3.4594665255391197, + "learning_rate": 9.999476123813938e-05, + "loss": 4.4831, + "step": 389 + }, + { + "epoch": 0.25141015310233683, + "grad_norm": 2.854838115057171, + "learning_rate": 9.999473201270688e-05, + "loss": 4.5071, + "step": 390 + }, + { + "epoch": 0.25205479452054796, + "grad_norm": 2.981329309637292, + "learning_rate": 9.99947027059855e-05, + "loss": 4.0324, + "step": 391 + }, + { + "epoch": 0.25269943593875904, + "grad_norm": 2.559106635118357, + "learning_rate": 9.999467331797527e-05, + "loss": 4.8482, + "step": 392 + }, + { + "epoch": 0.2533440773569702, + "grad_norm": 3.3086982584781555, + "learning_rate": 9.999464384867626e-05, + "loss": 4.3315, + "step": 393 + }, + { + "epoch": 0.2539887187751813, + "grad_norm": 4.450959077541257, + "learning_rate": 9.999461429808849e-05, + "loss": 4.6949, + "step": 394 + }, + { + "epoch": 0.25463336019339244, + "grad_norm": 6.34541492482599, + "learning_rate": 9.999458466621202e-05, + "loss": 4.5104, + "step": 395 + }, + { + "epoch": 0.2552780016116035, + "grad_norm": 9.826389138382163, + "learning_rate": 9.999455495304689e-05, + "loss": 4.112, + "step": 396 + }, + { + "epoch": 0.25592264302981466, + "grad_norm": 14.108611627948848, + "learning_rate": 9.999452515859317e-05, + "loss": 4.5592, + "step": 397 + }, + { + "epoch": 0.2565672844480258, + "grad_norm": 6.411701988844554, + "learning_rate": 9.999449528285092e-05, + "loss": 4.5854, + "step": 398 + }, + { + "epoch": 0.2572119258662369, + "grad_norm": 7.191308453651435, + "learning_rate": 9.999446532582016e-05, + "loss": 4.4135, + "step": 399 + }, + { + "epoch": 0.257856567284448, + "grad_norm": 6.176342028894822, + "learning_rate": 9.999443528750093e-05, + "loss": 4.2409, + "step": 400 + }, + { + "epoch": 0.257856567284448, + "eval_loss": 4.588252067565918, + "eval_runtime": 2.9471, + "eval_samples_per_second": 33.931, + "eval_steps_per_second": 4.411, + "step": 400 + }, + { + "epoch": 0.25850120870265914, + "grad_norm": 6.332843029073648, + "learning_rate": 9.999440516789331e-05, + "loss": 4.1796, + "step": 401 + }, + { + "epoch": 0.25914585012087027, + "grad_norm": 4.247987123999801, + "learning_rate": 9.999437496699734e-05, + "loss": 4.2972, + "step": 402 + }, + { + "epoch": 0.2597904915390814, + "grad_norm": 3.327232127093582, + "learning_rate": 9.999434468481307e-05, + "loss": 4.3679, + "step": 403 + }, + { + "epoch": 0.2604351329572925, + "grad_norm": 4.182650746409697, + "learning_rate": 9.999431432134053e-05, + "loss": 4.3137, + "step": 404 + }, + { + "epoch": 0.2610797743755036, + "grad_norm": 3.122242418065241, + "learning_rate": 9.99942838765798e-05, + "loss": 4.5759, + "step": 405 + }, + { + "epoch": 0.26172441579371475, + "grad_norm": 2.9622390498188804, + "learning_rate": 9.999425335053091e-05, + "loss": 4.6096, + "step": 406 + }, + { + "epoch": 0.2623690572119259, + "grad_norm": 3.4975203777228443, + "learning_rate": 9.999422274319392e-05, + "loss": 4.012, + "step": 407 + }, + { + "epoch": 0.26301369863013696, + "grad_norm": 2.6459359127627673, + "learning_rate": 9.999419205456887e-05, + "loss": 4.8545, + "step": 408 + }, + { + "epoch": 0.2636583400483481, + "grad_norm": 2.4589248300024424, + "learning_rate": 9.99941612846558e-05, + "loss": 4.7323, + "step": 409 + }, + { + "epoch": 0.26430298146655923, + "grad_norm": 2.5956787280041103, + "learning_rate": 9.99941304334548e-05, + "loss": 4.7541, + "step": 410 + }, + { + "epoch": 0.26494762288477036, + "grad_norm": 2.7108742783051634, + "learning_rate": 9.999409950096592e-05, + "loss": 4.4577, + "step": 411 + }, + { + "epoch": 0.26559226430298144, + "grad_norm": 2.780299379523002, + "learning_rate": 9.999406848718914e-05, + "loss": 4.5946, + "step": 412 + }, + { + "epoch": 0.2662369057211926, + "grad_norm": 2.472361093664556, + "learning_rate": 9.99940373921246e-05, + "loss": 4.2963, + "step": 413 + }, + { + "epoch": 0.2668815471394037, + "grad_norm": 3.3547193845150787, + "learning_rate": 9.999400621577229e-05, + "loss": 4.3317, + "step": 414 + }, + { + "epoch": 0.26752618855761484, + "grad_norm": 4.038655975370842, + "learning_rate": 9.999397495813227e-05, + "loss": 4.616, + "step": 415 + }, + { + "epoch": 0.2681708299758259, + "grad_norm": 2.9702799038533048, + "learning_rate": 9.999394361920463e-05, + "loss": 4.7413, + "step": 416 + }, + { + "epoch": 0.26881547139403705, + "grad_norm": 4.1677677988894555, + "learning_rate": 9.999391219898938e-05, + "loss": 4.3084, + "step": 417 + }, + { + "epoch": 0.2694601128122482, + "grad_norm": 3.5183890399543025, + "learning_rate": 9.999388069748658e-05, + "loss": 4.8924, + "step": 418 + }, + { + "epoch": 0.2701047542304593, + "grad_norm": 3.538996554446731, + "learning_rate": 9.99938491146963e-05, + "loss": 4.1616, + "step": 419 + }, + { + "epoch": 0.27074939564867045, + "grad_norm": 5.0360474959954615, + "learning_rate": 9.999381745061858e-05, + "loss": 4.0548, + "step": 420 + }, + { + "epoch": 0.27139403706688153, + "grad_norm": 3.9127538988882122, + "learning_rate": 9.999378570525347e-05, + "loss": 4.3292, + "step": 421 + }, + { + "epoch": 0.27203867848509267, + "grad_norm": 3.4882806672915896, + "learning_rate": 9.999375387860101e-05, + "loss": 4.2188, + "step": 422 + }, + { + "epoch": 0.2726833199033038, + "grad_norm": 3.653410191170186, + "learning_rate": 9.999372197066129e-05, + "loss": 4.2427, + "step": 423 + }, + { + "epoch": 0.27332796132151493, + "grad_norm": 3.0004234240404952, + "learning_rate": 9.999368998143432e-05, + "loss": 4.2745, + "step": 424 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 4.030002572445162, + "learning_rate": 9.999365791092018e-05, + "loss": 4.159, + "step": 425 + }, + { + "epoch": 0.27461724415793715, + "grad_norm": 3.4566464423689247, + "learning_rate": 9.999362575911891e-05, + "loss": 4.3491, + "step": 426 + }, + { + "epoch": 0.2752618855761483, + "grad_norm": 3.2926950151980345, + "learning_rate": 9.999359352603057e-05, + "loss": 4.1616, + "step": 427 + }, + { + "epoch": 0.2759065269943594, + "grad_norm": 4.548405710908879, + "learning_rate": 9.999356121165521e-05, + "loss": 4.7391, + "step": 428 + }, + { + "epoch": 0.2765511684125705, + "grad_norm": 3.8321386181178454, + "learning_rate": 9.999352881599287e-05, + "loss": 4.3283, + "step": 429 + }, + { + "epoch": 0.2771958098307816, + "grad_norm": 2.7511098431321797, + "learning_rate": 9.999349633904364e-05, + "loss": 4.2724, + "step": 430 + }, + { + "epoch": 0.27784045124899276, + "grad_norm": 3.909287662836966, + "learning_rate": 9.999346378080754e-05, + "loss": 4.3856, + "step": 431 + }, + { + "epoch": 0.2784850926672039, + "grad_norm": 3.45063060253309, + "learning_rate": 9.999343114128464e-05, + "loss": 4.7075, + "step": 432 + }, + { + "epoch": 0.27912973408541497, + "grad_norm": 2.2196593337519195, + "learning_rate": 9.999339842047497e-05, + "loss": 4.5199, + "step": 433 + }, + { + "epoch": 0.2797743755036261, + "grad_norm": 4.191806366439665, + "learning_rate": 9.99933656183786e-05, + "loss": 4.4347, + "step": 434 + }, + { + "epoch": 0.28041901692183724, + "grad_norm": 2.743059622043392, + "learning_rate": 9.999333273499558e-05, + "loss": 4.4499, + "step": 435 + }, + { + "epoch": 0.2810636583400484, + "grad_norm": 3.781461926659898, + "learning_rate": 9.999329977032598e-05, + "loss": 4.4468, + "step": 436 + }, + { + "epoch": 0.28170829975825945, + "grad_norm": 4.334395429517808, + "learning_rate": 9.999326672436985e-05, + "loss": 4.4354, + "step": 437 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 3.123131171678534, + "learning_rate": 9.999323359712723e-05, + "loss": 4.4219, + "step": 438 + }, + { + "epoch": 0.2829975825946817, + "grad_norm": 4.866319093331747, + "learning_rate": 9.999320038859817e-05, + "loss": 4.109, + "step": 439 + }, + { + "epoch": 0.28364222401289285, + "grad_norm": 7.019062552380983, + "learning_rate": 9.999316709878274e-05, + "loss": 3.9365, + "step": 440 + }, + { + "epoch": 0.28428686543110393, + "grad_norm": 8.231468678120729, + "learning_rate": 9.999313372768099e-05, + "loss": 4.4313, + "step": 441 + }, + { + "epoch": 0.28493150684931506, + "grad_norm": 6.018391983915858, + "learning_rate": 9.999310027529297e-05, + "loss": 4.7, + "step": 442 + }, + { + "epoch": 0.2855761482675262, + "grad_norm": 3.0552392127942216, + "learning_rate": 9.999306674161876e-05, + "loss": 4.5613, + "step": 443 + }, + { + "epoch": 0.28622078968573733, + "grad_norm": 5.50149453092289, + "learning_rate": 9.999303312665837e-05, + "loss": 4.6387, + "step": 444 + }, + { + "epoch": 0.2868654311039484, + "grad_norm": 3.6296600691552885, + "learning_rate": 9.999299943041188e-05, + "loss": 4.1092, + "step": 445 + }, + { + "epoch": 0.28751007252215954, + "grad_norm": 4.342115697420752, + "learning_rate": 9.999296565287937e-05, + "loss": 4.3458, + "step": 446 + }, + { + "epoch": 0.2881547139403707, + "grad_norm": 3.0586478936734456, + "learning_rate": 9.999293179406086e-05, + "loss": 4.2841, + "step": 447 + }, + { + "epoch": 0.2887993553585818, + "grad_norm": 3.3323709254148635, + "learning_rate": 9.999289785395641e-05, + "loss": 4.5821, + "step": 448 + }, + { + "epoch": 0.2894439967767929, + "grad_norm": 2.7834358225131113, + "learning_rate": 9.999286383256608e-05, + "loss": 3.9942, + "step": 449 + }, + { + "epoch": 0.290088638195004, + "grad_norm": 4.451487366734443, + "learning_rate": 9.999282972988994e-05, + "loss": 4.0548, + "step": 450 + }, + { + "epoch": 0.29073327961321516, + "grad_norm": 5.194821696961356, + "learning_rate": 9.999279554592802e-05, + "loss": 4.1564, + "step": 451 + }, + { + "epoch": 0.2913779210314263, + "grad_norm": 3.248131987855001, + "learning_rate": 9.999276128068041e-05, + "loss": 4.3048, + "step": 452 + }, + { + "epoch": 0.29202256244963737, + "grad_norm": 4.85824241899812, + "learning_rate": 9.999272693414713e-05, + "loss": 4.2067, + "step": 453 + }, + { + "epoch": 0.2926672038678485, + "grad_norm": 3.8576418301478452, + "learning_rate": 9.999269250632826e-05, + "loss": 4.1839, + "step": 454 + }, + { + "epoch": 0.29331184528605964, + "grad_norm": 2.8363953651371228, + "learning_rate": 9.999265799722383e-05, + "loss": 4.5545, + "step": 455 + }, + { + "epoch": 0.29395648670427077, + "grad_norm": 3.280606469833546, + "learning_rate": 9.999262340683394e-05, + "loss": 4.4923, + "step": 456 + }, + { + "epoch": 0.29460112812248185, + "grad_norm": 2.4051745477798576, + "learning_rate": 9.999258873515863e-05, + "loss": 4.3853, + "step": 457 + }, + { + "epoch": 0.295245769540693, + "grad_norm": 3.160244732684399, + "learning_rate": 9.999255398219794e-05, + "loss": 4.3092, + "step": 458 + }, + { + "epoch": 0.2958904109589041, + "grad_norm": 3.9600785929811635, + "learning_rate": 9.999251914795192e-05, + "loss": 3.9517, + "step": 459 + }, + { + "epoch": 0.29653505237711525, + "grad_norm": 5.780492063199603, + "learning_rate": 9.999248423242065e-05, + "loss": 4.0961, + "step": 460 + }, + { + "epoch": 0.2971796937953263, + "grad_norm": 7.52082207789231, + "learning_rate": 9.99924492356042e-05, + "loss": 4.3205, + "step": 461 + }, + { + "epoch": 0.29782433521353746, + "grad_norm": 5.713564942287188, + "learning_rate": 9.99924141575026e-05, + "loss": 4.1788, + "step": 462 + }, + { + "epoch": 0.2984689766317486, + "grad_norm": 4.199495295296684, + "learning_rate": 9.99923789981159e-05, + "loss": 4.3214, + "step": 463 + }, + { + "epoch": 0.29911361804995973, + "grad_norm": 4.777423534460928, + "learning_rate": 9.99923437574442e-05, + "loss": 4.108, + "step": 464 + }, + { + "epoch": 0.2997582594681708, + "grad_norm": 6.134948959653359, + "learning_rate": 9.999230843548752e-05, + "loss": 4.3123, + "step": 465 + }, + { + "epoch": 0.30040290088638194, + "grad_norm": 4.396327229042961, + "learning_rate": 9.999227303224593e-05, + "loss": 4.2812, + "step": 466 + }, + { + "epoch": 0.3010475423045931, + "grad_norm": 3.430226771393313, + "learning_rate": 9.999223754771949e-05, + "loss": 4.6665, + "step": 467 + }, + { + "epoch": 0.3016921837228042, + "grad_norm": 4.8756457932981805, + "learning_rate": 9.999220198190825e-05, + "loss": 4.2368, + "step": 468 + }, + { + "epoch": 0.3023368251410153, + "grad_norm": 4.597331637686352, + "learning_rate": 9.999216633481227e-05, + "loss": 4.4904, + "step": 469 + }, + { + "epoch": 0.3029814665592264, + "grad_norm": 3.9906899232847124, + "learning_rate": 9.999213060643165e-05, + "loss": 4.5384, + "step": 470 + }, + { + "epoch": 0.30362610797743755, + "grad_norm": 3.9515564575606645, + "learning_rate": 9.999209479676637e-05, + "loss": 4.5237, + "step": 471 + }, + { + "epoch": 0.3042707493956487, + "grad_norm": 3.1974376854623676, + "learning_rate": 9.999205890581656e-05, + "loss": 4.1285, + "step": 472 + }, + { + "epoch": 0.30491539081385977, + "grad_norm": 3.4576893923819676, + "learning_rate": 9.999202293358223e-05, + "loss": 4.3805, + "step": 473 + }, + { + "epoch": 0.3055600322320709, + "grad_norm": 3.408113254046555, + "learning_rate": 9.999198688006347e-05, + "loss": 4.0232, + "step": 474 + }, + { + "epoch": 0.30620467365028203, + "grad_norm": 3.1822485031740677, + "learning_rate": 9.999195074526032e-05, + "loss": 4.4779, + "step": 475 + }, + { + "epoch": 0.30684931506849317, + "grad_norm": 2.4030375055183715, + "learning_rate": 9.999191452917285e-05, + "loss": 4.6855, + "step": 476 + }, + { + "epoch": 0.30749395648670425, + "grad_norm": 2.8740967304136555, + "learning_rate": 9.999187823180112e-05, + "loss": 4.4484, + "step": 477 + }, + { + "epoch": 0.3081385979049154, + "grad_norm": 3.3334687627987196, + "learning_rate": 9.999184185314519e-05, + "loss": 4.2326, + "step": 478 + }, + { + "epoch": 0.3087832393231265, + "grad_norm": 3.208270267393602, + "learning_rate": 9.999180539320512e-05, + "loss": 4.5342, + "step": 479 + }, + { + "epoch": 0.30942788074133765, + "grad_norm": 2.405816415479978, + "learning_rate": 9.999176885198095e-05, + "loss": 4.4128, + "step": 480 + }, + { + "epoch": 0.3100725221595487, + "grad_norm": 3.4372476890635046, + "learning_rate": 9.999173222947277e-05, + "loss": 4.1078, + "step": 481 + }, + { + "epoch": 0.31071716357775986, + "grad_norm": 3.0460663641555974, + "learning_rate": 9.999169552568063e-05, + "loss": 4.7502, + "step": 482 + }, + { + "epoch": 0.311361804995971, + "grad_norm": 2.805390003920221, + "learning_rate": 9.999165874060457e-05, + "loss": 4.3811, + "step": 483 + }, + { + "epoch": 0.3120064464141821, + "grad_norm": 3.3613051126228193, + "learning_rate": 9.999162187424468e-05, + "loss": 4.2666, + "step": 484 + }, + { + "epoch": 0.3126510878323932, + "grad_norm": 3.3920507691079633, + "learning_rate": 9.999158492660101e-05, + "loss": 4.3607, + "step": 485 + }, + { + "epoch": 0.31329572925060434, + "grad_norm": 2.727250641344863, + "learning_rate": 9.999154789767361e-05, + "loss": 4.2999, + "step": 486 + }, + { + "epoch": 0.31394037066881547, + "grad_norm": 2.838519935943817, + "learning_rate": 9.999151078746255e-05, + "loss": 4.4901, + "step": 487 + }, + { + "epoch": 0.3145850120870266, + "grad_norm": 3.1708993013026783, + "learning_rate": 9.99914735959679e-05, + "loss": 4.717, + "step": 488 + }, + { + "epoch": 0.31522965350523774, + "grad_norm": 3.3278339939802666, + "learning_rate": 9.999143632318971e-05, + "loss": 4.3141, + "step": 489 + }, + { + "epoch": 0.3158742949234488, + "grad_norm": 3.8553612892614426, + "learning_rate": 9.999139896912802e-05, + "loss": 4.385, + "step": 490 + }, + { + "epoch": 0.31651893634165995, + "grad_norm": 2.6569572932823635, + "learning_rate": 9.999136153378293e-05, + "loss": 4.1723, + "step": 491 + }, + { + "epoch": 0.3171635777598711, + "grad_norm": 2.469444888360799, + "learning_rate": 9.99913240171545e-05, + "loss": 4.1911, + "step": 492 + }, + { + "epoch": 0.3178082191780822, + "grad_norm": 3.357547191553188, + "learning_rate": 9.999128641924276e-05, + "loss": 4.1389, + "step": 493 + }, + { + "epoch": 0.3184528605962933, + "grad_norm": 2.9306192255302457, + "learning_rate": 9.999124874004779e-05, + "loss": 4.2541, + "step": 494 + }, + { + "epoch": 0.31909750201450443, + "grad_norm": 3.094332898286635, + "learning_rate": 9.999121097956966e-05, + "loss": 4.4653, + "step": 495 + }, + { + "epoch": 0.31974214343271556, + "grad_norm": 2.8024277913939355, + "learning_rate": 9.999117313780841e-05, + "loss": 4.1734, + "step": 496 + }, + { + "epoch": 0.3203867848509267, + "grad_norm": 2.0223495349537592, + "learning_rate": 9.999113521476411e-05, + "loss": 4.2226, + "step": 497 + }, + { + "epoch": 0.3210314262691378, + "grad_norm": 3.1510906355909443, + "learning_rate": 9.999109721043683e-05, + "loss": 4.2343, + "step": 498 + }, + { + "epoch": 0.3216760676873489, + "grad_norm": 3.8416658159191206, + "learning_rate": 9.999105912482666e-05, + "loss": 4.7125, + "step": 499 + }, + { + "epoch": 0.32232070910556004, + "grad_norm": 4.015829730767605, + "learning_rate": 9.999102095793359e-05, + "loss": 4.4351, + "step": 500 + }, + { + "epoch": 0.32232070910556004, + "eval_loss": 4.428322792053223, + "eval_runtime": 2.9406, + "eval_samples_per_second": 34.006, + "eval_steps_per_second": 4.421, + "step": 500 + }, + { + "epoch": 0.3229653505237712, + "grad_norm": 3.8051160869983, + "learning_rate": 9.999098270975775e-05, + "loss": 4.3878, + "step": 501 + }, + { + "epoch": 0.32360999194198226, + "grad_norm": 3.6687758491321203, + "learning_rate": 9.999094438029918e-05, + "loss": 4.1169, + "step": 502 + }, + { + "epoch": 0.3242546333601934, + "grad_norm": 4.422860296921639, + "learning_rate": 9.999090596955793e-05, + "loss": 4.44, + "step": 503 + }, + { + "epoch": 0.3248992747784045, + "grad_norm": 3.2062409209957003, + "learning_rate": 9.999086747753408e-05, + "loss": 4.4772, + "step": 504 + }, + { + "epoch": 0.32554391619661566, + "grad_norm": 3.4428233492810674, + "learning_rate": 9.999082890422766e-05, + "loss": 4.6317, + "step": 505 + }, + { + "epoch": 0.32618855761482674, + "grad_norm": 3.5534343509945, + "learning_rate": 9.99907902496388e-05, + "loss": 4.2954, + "step": 506 + }, + { + "epoch": 0.32683319903303787, + "grad_norm": 2.4530932593617485, + "learning_rate": 9.999075151376749e-05, + "loss": 4.3902, + "step": 507 + }, + { + "epoch": 0.327477840451249, + "grad_norm": 3.490729321733633, + "learning_rate": 9.999071269661387e-05, + "loss": 4.635, + "step": 508 + }, + { + "epoch": 0.32812248186946014, + "grad_norm": 2.541276283737507, + "learning_rate": 9.999067379817792e-05, + "loss": 4.6167, + "step": 509 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 2.6188527735888916, + "learning_rate": 9.999063481845975e-05, + "loss": 4.575, + "step": 510 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 3.734921104173677, + "learning_rate": 9.999059575745943e-05, + "loss": 4.0005, + "step": 511 + }, + { + "epoch": 0.3300564061240935, + "grad_norm": 4.713487161764103, + "learning_rate": 9.999055661517701e-05, + "loss": 4.316, + "step": 512 + }, + { + "epoch": 0.3307010475423046, + "grad_norm": 4.746919707957659, + "learning_rate": 9.999051739161255e-05, + "loss": 3.8809, + "step": 513 + }, + { + "epoch": 0.3313456889605157, + "grad_norm": 3.867161107763998, + "learning_rate": 9.999047808676615e-05, + "loss": 4.3885, + "step": 514 + }, + { + "epoch": 0.33199033037872683, + "grad_norm": 2.6673953137673916, + "learning_rate": 9.999043870063783e-05, + "loss": 4.3744, + "step": 515 + }, + { + "epoch": 0.33263497179693796, + "grad_norm": 3.317729633088108, + "learning_rate": 9.999039923322766e-05, + "loss": 4.4639, + "step": 516 + }, + { + "epoch": 0.3332796132151491, + "grad_norm": 2.3005301853152664, + "learning_rate": 9.999035968453572e-05, + "loss": 4.4787, + "step": 517 + }, + { + "epoch": 0.3339242546333602, + "grad_norm": 3.295942324837071, + "learning_rate": 9.999032005456209e-05, + "loss": 4.5948, + "step": 518 + }, + { + "epoch": 0.3345688960515713, + "grad_norm": 3.641321421534829, + "learning_rate": 9.999028034330678e-05, + "loss": 4.3913, + "step": 519 + }, + { + "epoch": 0.33521353746978244, + "grad_norm": 2.4846592860941095, + "learning_rate": 9.99902405507699e-05, + "loss": 4.6604, + "step": 520 + }, + { + "epoch": 0.3358581788879936, + "grad_norm": 2.8407157099527605, + "learning_rate": 9.999020067695153e-05, + "loss": 4.6097, + "step": 521 + }, + { + "epoch": 0.33650282030620465, + "grad_norm": 3.1288964248733797, + "learning_rate": 9.999016072185172e-05, + "loss": 4.2923, + "step": 522 + }, + { + "epoch": 0.3371474617244158, + "grad_norm": 4.176262158460531, + "learning_rate": 9.999012068547048e-05, + "loss": 4.2351, + "step": 523 + }, + { + "epoch": 0.3377921031426269, + "grad_norm": 3.881204876633328, + "learning_rate": 9.999008056780796e-05, + "loss": 4.2999, + "step": 524 + }, + { + "epoch": 0.33843674456083805, + "grad_norm": 2.706529793206679, + "learning_rate": 9.999004036886419e-05, + "loss": 4.4527, + "step": 525 + }, + { + "epoch": 0.33908138597904913, + "grad_norm": 3.5069815852072708, + "learning_rate": 9.999000008863922e-05, + "loss": 4.4757, + "step": 526 + }, + { + "epoch": 0.33972602739726027, + "grad_norm": 3.0241870615578477, + "learning_rate": 9.998995972713313e-05, + "loss": 4.4055, + "step": 527 + }, + { + "epoch": 0.3403706688154714, + "grad_norm": 2.337803241726457, + "learning_rate": 9.998991928434599e-05, + "loss": 4.1505, + "step": 528 + }, + { + "epoch": 0.34101531023368253, + "grad_norm": 3.1357092335706764, + "learning_rate": 9.998987876027788e-05, + "loss": 4.1283, + "step": 529 + }, + { + "epoch": 0.3416599516518936, + "grad_norm": 4.054590980251277, + "learning_rate": 9.998983815492884e-05, + "loss": 4.3274, + "step": 530 + }, + { + "epoch": 0.34230459307010475, + "grad_norm": 3.9538706005835023, + "learning_rate": 9.998979746829892e-05, + "loss": 4.052, + "step": 531 + }, + { + "epoch": 0.3429492344883159, + "grad_norm": 2.7848101125416775, + "learning_rate": 9.998975670038824e-05, + "loss": 4.5859, + "step": 532 + }, + { + "epoch": 0.343593875906527, + "grad_norm": 2.679816886360122, + "learning_rate": 9.998971585119685e-05, + "loss": 4.5066, + "step": 533 + }, + { + "epoch": 0.3442385173247381, + "grad_norm": 3.128620727345067, + "learning_rate": 9.998967492072481e-05, + "loss": 4.393, + "step": 534 + }, + { + "epoch": 0.3448831587429492, + "grad_norm": 3.7527717079874585, + "learning_rate": 9.998963390897217e-05, + "loss": 4.1287, + "step": 535 + }, + { + "epoch": 0.34552780016116036, + "grad_norm": 4.572195494857459, + "learning_rate": 9.998959281593902e-05, + "loss": 4.1495, + "step": 536 + }, + { + "epoch": 0.3461724415793715, + "grad_norm": 4.367722510164231, + "learning_rate": 9.998955164162541e-05, + "loss": 4.4022, + "step": 537 + }, + { + "epoch": 0.34681708299758257, + "grad_norm": 3.3798062389338948, + "learning_rate": 9.998951038603143e-05, + "loss": 4.1217, + "step": 538 + }, + { + "epoch": 0.3474617244157937, + "grad_norm": 4.478268045791718, + "learning_rate": 9.998946904915713e-05, + "loss": 4.4272, + "step": 539 + }, + { + "epoch": 0.34810636583400484, + "grad_norm": 4.644899257388471, + "learning_rate": 9.99894276310026e-05, + "loss": 4.4998, + "step": 540 + }, + { + "epoch": 0.34875100725221597, + "grad_norm": 3.381457069413849, + "learning_rate": 9.998938613156788e-05, + "loss": 4.4746, + "step": 541 + }, + { + "epoch": 0.34939564867042705, + "grad_norm": 3.0533276045596733, + "learning_rate": 9.998934455085304e-05, + "loss": 4.4894, + "step": 542 + }, + { + "epoch": 0.3500402900886382, + "grad_norm": 4.2782352101392815, + "learning_rate": 9.998930288885815e-05, + "loss": 4.6242, + "step": 543 + }, + { + "epoch": 0.3506849315068493, + "grad_norm": 3.9813806232914937, + "learning_rate": 9.998926114558332e-05, + "loss": 4.5509, + "step": 544 + }, + { + "epoch": 0.35132957292506045, + "grad_norm": 3.165402198453505, + "learning_rate": 9.998921932102855e-05, + "loss": 4.4679, + "step": 545 + }, + { + "epoch": 0.35197421434327153, + "grad_norm": 2.4643821157728896, + "learning_rate": 9.998917741519395e-05, + "loss": 4.6252, + "step": 546 + }, + { + "epoch": 0.35261885576148266, + "grad_norm": 2.768901187157743, + "learning_rate": 9.99891354280796e-05, + "loss": 4.1865, + "step": 547 + }, + { + "epoch": 0.3532634971796938, + "grad_norm": 3.115122024511177, + "learning_rate": 9.998909335968556e-05, + "loss": 4.7389, + "step": 548 + }, + { + "epoch": 0.35390813859790493, + "grad_norm": 2.4499572223350787, + "learning_rate": 9.998905121001187e-05, + "loss": 4.5633, + "step": 549 + }, + { + "epoch": 0.354552780016116, + "grad_norm": 3.302357345728687, + "learning_rate": 9.998900897905862e-05, + "loss": 4.0625, + "step": 550 + }, + { + "epoch": 0.35519742143432714, + "grad_norm": 4.256284898349101, + "learning_rate": 9.998896666682588e-05, + "loss": 4.469, + "step": 551 + }, + { + "epoch": 0.3558420628525383, + "grad_norm": 4.966972938153561, + "learning_rate": 9.998892427331372e-05, + "loss": 4.257, + "step": 552 + }, + { + "epoch": 0.3564867042707494, + "grad_norm": 5.370348100462023, + "learning_rate": 9.998888179852221e-05, + "loss": 4.2034, + "step": 553 + }, + { + "epoch": 0.3571313456889605, + "grad_norm": 4.567878304563083, + "learning_rate": 9.998883924245142e-05, + "loss": 4.2437, + "step": 554 + }, + { + "epoch": 0.3577759871071716, + "grad_norm": 4.436550109028233, + "learning_rate": 9.998879660510141e-05, + "loss": 4.3487, + "step": 555 + }, + { + "epoch": 0.35842062852538276, + "grad_norm": 3.5994474760102033, + "learning_rate": 9.998875388647227e-05, + "loss": 4.5588, + "step": 556 + }, + { + "epoch": 0.3590652699435939, + "grad_norm": 3.018601341801269, + "learning_rate": 9.998871108656405e-05, + "loss": 4.2863, + "step": 557 + }, + { + "epoch": 0.359709911361805, + "grad_norm": 3.544973912500787, + "learning_rate": 9.998866820537685e-05, + "loss": 4.3731, + "step": 558 + }, + { + "epoch": 0.3603545527800161, + "grad_norm": 3.4396024047849334, + "learning_rate": 9.998862524291069e-05, + "loss": 4.1782, + "step": 559 + }, + { + "epoch": 0.36099919419822724, + "grad_norm": 3.828966233852844, + "learning_rate": 9.998858219916567e-05, + "loss": 4.0464, + "step": 560 + }, + { + "epoch": 0.36164383561643837, + "grad_norm": 2.918472278318506, + "learning_rate": 9.998853907414186e-05, + "loss": 4.3819, + "step": 561 + }, + { + "epoch": 0.3622884770346495, + "grad_norm": 3.326051872994878, + "learning_rate": 9.998849586783936e-05, + "loss": 4.4898, + "step": 562 + }, + { + "epoch": 0.3629331184528606, + "grad_norm": 2.4061197555349714, + "learning_rate": 9.998845258025819e-05, + "loss": 4.5044, + "step": 563 + }, + { + "epoch": 0.3635777598710717, + "grad_norm": 3.2165666748899433, + "learning_rate": 9.998840921139844e-05, + "loss": 4.0675, + "step": 564 + }, + { + "epoch": 0.36422240128928285, + "grad_norm": 3.930892568520273, + "learning_rate": 9.998836576126018e-05, + "loss": 4.2683, + "step": 565 + }, + { + "epoch": 0.364867042707494, + "grad_norm": 3.7416788482812824, + "learning_rate": 9.99883222298435e-05, + "loss": 3.985, + "step": 566 + }, + { + "epoch": 0.36551168412570506, + "grad_norm": 3.257650671187985, + "learning_rate": 9.998827861714846e-05, + "loss": 4.5129, + "step": 567 + }, + { + "epoch": 0.3661563255439162, + "grad_norm": 4.8828205550998245, + "learning_rate": 9.99882349231751e-05, + "loss": 4.096, + "step": 568 + }, + { + "epoch": 0.36680096696212733, + "grad_norm": 4.2346245739432815, + "learning_rate": 9.998819114792356e-05, + "loss": 4.4247, + "step": 569 + }, + { + "epoch": 0.36744560838033846, + "grad_norm": 2.9650287963336743, + "learning_rate": 9.998814729139384e-05, + "loss": 4.2944, + "step": 570 + }, + { + "epoch": 0.36809024979854954, + "grad_norm": 3.6645401431581814, + "learning_rate": 9.998810335358607e-05, + "loss": 4.4103, + "step": 571 + }, + { + "epoch": 0.3687348912167607, + "grad_norm": 2.750552621580325, + "learning_rate": 9.998805933450028e-05, + "loss": 4.2112, + "step": 572 + }, + { + "epoch": 0.3693795326349718, + "grad_norm": 3.5436915405897556, + "learning_rate": 9.998801523413656e-05, + "loss": 4.3208, + "step": 573 + }, + { + "epoch": 0.37002417405318294, + "grad_norm": 2.9813313877694876, + "learning_rate": 9.9987971052495e-05, + "loss": 4.1389, + "step": 574 + }, + { + "epoch": 0.370668815471394, + "grad_norm": 3.66360859775685, + "learning_rate": 9.998792678957563e-05, + "loss": 4.2813, + "step": 575 + }, + { + "epoch": 0.37131345688960515, + "grad_norm": 3.1073590071865365, + "learning_rate": 9.998788244537856e-05, + "loss": 4.3521, + "step": 576 + }, + { + "epoch": 0.3719580983078163, + "grad_norm": 3.3061399265860483, + "learning_rate": 9.998783801990384e-05, + "loss": 4.233, + "step": 577 + }, + { + "epoch": 0.3726027397260274, + "grad_norm": 2.6903926496323027, + "learning_rate": 9.998779351315155e-05, + "loss": 4.2742, + "step": 578 + }, + { + "epoch": 0.3732473811442385, + "grad_norm": 4.125773221634888, + "learning_rate": 9.998774892512178e-05, + "loss": 4.339, + "step": 579 + }, + { + "epoch": 0.37389202256244963, + "grad_norm": 5.3939191941570765, + "learning_rate": 9.998770425581457e-05, + "loss": 4.0135, + "step": 580 + }, + { + "epoch": 0.37453666398066077, + "grad_norm": 5.111467886841874, + "learning_rate": 9.998765950523003e-05, + "loss": 4.2397, + "step": 581 + }, + { + "epoch": 0.3751813053988719, + "grad_norm": 4.5085046049572925, + "learning_rate": 9.99876146733682e-05, + "loss": 3.9465, + "step": 582 + }, + { + "epoch": 0.375825946817083, + "grad_norm": 4.958692908622997, + "learning_rate": 9.998756976022918e-05, + "loss": 4.2285, + "step": 583 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 6.25010214822441, + "learning_rate": 9.998752476581303e-05, + "loss": 4.1966, + "step": 584 + }, + { + "epoch": 0.37711522965350525, + "grad_norm": 5.662315007845015, + "learning_rate": 9.998747969011981e-05, + "loss": 4.4128, + "step": 585 + }, + { + "epoch": 0.3777598710717164, + "grad_norm": 2.658539572857615, + "learning_rate": 9.998743453314965e-05, + "loss": 4.3899, + "step": 586 + }, + { + "epoch": 0.37840451248992746, + "grad_norm": 5.186445361965719, + "learning_rate": 9.998738929490255e-05, + "loss": 3.9884, + "step": 587 + }, + { + "epoch": 0.3790491539081386, + "grad_norm": 4.510243710506069, + "learning_rate": 9.998734397537863e-05, + "loss": 4.4201, + "step": 588 + }, + { + "epoch": 0.3796937953263497, + "grad_norm": 3.1553560180054805, + "learning_rate": 9.998729857457796e-05, + "loss": 4.4837, + "step": 589 + }, + { + "epoch": 0.38033843674456086, + "grad_norm": 4.039701345430421, + "learning_rate": 9.99872530925006e-05, + "loss": 4.4051, + "step": 590 + }, + { + "epoch": 0.38098307816277194, + "grad_norm": 2.06387284202075, + "learning_rate": 9.998720752914665e-05, + "loss": 4.547, + "step": 591 + }, + { + "epoch": 0.38162771958098307, + "grad_norm": 3.419356483908562, + "learning_rate": 9.998716188451614e-05, + "loss": 4.62, + "step": 592 + }, + { + "epoch": 0.3822723609991942, + "grad_norm": 3.054515151632714, + "learning_rate": 9.99871161586092e-05, + "loss": 4.2113, + "step": 593 + }, + { + "epoch": 0.38291700241740534, + "grad_norm": 3.9246272939345315, + "learning_rate": 9.998707035142586e-05, + "loss": 4.3301, + "step": 594 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 5.840385975191879, + "learning_rate": 9.99870244629662e-05, + "loss": 3.9415, + "step": 595 + }, + { + "epoch": 0.38420628525382755, + "grad_norm": 6.838360817172449, + "learning_rate": 9.998697849323034e-05, + "loss": 3.7231, + "step": 596 + }, + { + "epoch": 0.3848509266720387, + "grad_norm": 5.533358864040446, + "learning_rate": 9.998693244221831e-05, + "loss": 4.2719, + "step": 597 + }, + { + "epoch": 0.3854955680902498, + "grad_norm": 2.6318380725632604, + "learning_rate": 9.998688630993021e-05, + "loss": 4.3593, + "step": 598 + }, + { + "epoch": 0.3861402095084609, + "grad_norm": 3.4785393997892817, + "learning_rate": 9.998684009636609e-05, + "loss": 4.5399, + "step": 599 + }, + { + "epoch": 0.38678485092667203, + "grad_norm": 2.8088821375108606, + "learning_rate": 9.998679380152604e-05, + "loss": 4.2197, + "step": 600 + }, + { + "epoch": 0.38678485092667203, + "eval_loss": 4.3780622482299805, + "eval_runtime": 2.9274, + "eval_samples_per_second": 34.16, + "eval_steps_per_second": 4.441, + "step": 600 + }, + { + "epoch": 0.38742949234488316, + "grad_norm": 3.731779081348514, + "learning_rate": 9.998674742541015e-05, + "loss": 4.4238, + "step": 601 + }, + { + "epoch": 0.3880741337630943, + "grad_norm": 2.9174043568703305, + "learning_rate": 9.998670096801848e-05, + "loss": 4.3881, + "step": 602 + }, + { + "epoch": 0.3887187751813054, + "grad_norm": 2.9235494217021927, + "learning_rate": 9.998665442935111e-05, + "loss": 4.3511, + "step": 603 + }, + { + "epoch": 0.3893634165995165, + "grad_norm": 3.01924364330406, + "learning_rate": 9.998660780940811e-05, + "loss": 4.3651, + "step": 604 + }, + { + "epoch": 0.39000805801772764, + "grad_norm": 2.4195685214794778, + "learning_rate": 9.998656110818956e-05, + "loss": 4.4992, + "step": 605 + }, + { + "epoch": 0.3906526994359388, + "grad_norm": 2.663983937067608, + "learning_rate": 9.998651432569556e-05, + "loss": 3.9234, + "step": 606 + }, + { + "epoch": 0.39129734085414986, + "grad_norm": 3.1863067365510616, + "learning_rate": 9.998646746192614e-05, + "loss": 4.4248, + "step": 607 + }, + { + "epoch": 0.391941982272361, + "grad_norm": 3.4404829797210295, + "learning_rate": 9.998642051688144e-05, + "loss": 4.7302, + "step": 608 + }, + { + "epoch": 0.3925866236905721, + "grad_norm": 2.6081394390798223, + "learning_rate": 9.998637349056146e-05, + "loss": 4.5946, + "step": 609 + }, + { + "epoch": 0.39323126510878326, + "grad_norm": 2.586129092239953, + "learning_rate": 9.998632638296634e-05, + "loss": 4.3985, + "step": 610 + }, + { + "epoch": 0.39387590652699433, + "grad_norm": 5.1069434992306935, + "learning_rate": 9.998627919409613e-05, + "loss": 4.3396, + "step": 611 + }, + { + "epoch": 0.39452054794520547, + "grad_norm": 5.518018442423091, + "learning_rate": 9.998623192395091e-05, + "loss": 4.3258, + "step": 612 + }, + { + "epoch": 0.3951651893634166, + "grad_norm": 3.073318522200895, + "learning_rate": 9.998618457253076e-05, + "loss": 4.2677, + "step": 613 + }, + { + "epoch": 0.39580983078162774, + "grad_norm": 3.1607764483888987, + "learning_rate": 9.998613713983576e-05, + "loss": 4.5198, + "step": 614 + }, + { + "epoch": 0.3964544721998388, + "grad_norm": 3.200951622335752, + "learning_rate": 9.998608962586597e-05, + "loss": 4.3588, + "step": 615 + }, + { + "epoch": 0.39709911361804995, + "grad_norm": 2.5385261844836653, + "learning_rate": 9.998604203062151e-05, + "loss": 4.147, + "step": 616 + }, + { + "epoch": 0.3977437550362611, + "grad_norm": 3.904436319587839, + "learning_rate": 9.998599435410242e-05, + "loss": 3.7326, + "step": 617 + }, + { + "epoch": 0.3983883964544722, + "grad_norm": 2.0334823326661544, + "learning_rate": 9.998594659630878e-05, + "loss": 4.5869, + "step": 618 + }, + { + "epoch": 0.3990330378726833, + "grad_norm": 3.043313096297158, + "learning_rate": 9.998589875724071e-05, + "loss": 4.4239, + "step": 619 + }, + { + "epoch": 0.3996776792908944, + "grad_norm": 3.0183256294475416, + "learning_rate": 9.998585083689822e-05, + "loss": 4.1614, + "step": 620 + }, + { + "epoch": 0.40032232070910556, + "grad_norm": 2.8324979378621435, + "learning_rate": 9.998580283528145e-05, + "loss": 4.3064, + "step": 621 + }, + { + "epoch": 0.4009669621273167, + "grad_norm": 3.4514775319314857, + "learning_rate": 9.998575475239045e-05, + "loss": 4.2254, + "step": 622 + }, + { + "epoch": 0.4016116035455278, + "grad_norm": 3.1258153351899667, + "learning_rate": 9.998570658822531e-05, + "loss": 3.9879, + "step": 623 + }, + { + "epoch": 0.4022562449637389, + "grad_norm": 2.256989729553059, + "learning_rate": 9.998565834278609e-05, + "loss": 4.1545, + "step": 624 + }, + { + "epoch": 0.40290088638195004, + "grad_norm": 2.9717809020643773, + "learning_rate": 9.998561001607289e-05, + "loss": 4.0136, + "step": 625 + }, + { + "epoch": 0.4035455278001612, + "grad_norm": 3.5607359788207513, + "learning_rate": 9.998556160808577e-05, + "loss": 4.1022, + "step": 626 + }, + { + "epoch": 0.40419016921837225, + "grad_norm": 3.4119532267111174, + "learning_rate": 9.998551311882483e-05, + "loss": 4.2503, + "step": 627 + }, + { + "epoch": 0.4048348106365834, + "grad_norm": 3.8585520907210578, + "learning_rate": 9.998546454829015e-05, + "loss": 4.33, + "step": 628 + }, + { + "epoch": 0.4054794520547945, + "grad_norm": 2.705538033697984, + "learning_rate": 9.998541589648179e-05, + "loss": 4.2235, + "step": 629 + }, + { + "epoch": 0.40612409347300565, + "grad_norm": 3.054846234166915, + "learning_rate": 9.998536716339983e-05, + "loss": 4.2672, + "step": 630 + }, + { + "epoch": 0.4067687348912168, + "grad_norm": 3.556898925742873, + "learning_rate": 9.998531834904438e-05, + "loss": 3.9021, + "step": 631 + }, + { + "epoch": 0.40741337630942787, + "grad_norm": 2.9034639299836726, + "learning_rate": 9.998526945341548e-05, + "loss": 4.4876, + "step": 632 + }, + { + "epoch": 0.408058017727639, + "grad_norm": 3.032592724435467, + "learning_rate": 9.998522047651324e-05, + "loss": 4.5993, + "step": 633 + }, + { + "epoch": 0.40870265914585013, + "grad_norm": 3.9351777155339103, + "learning_rate": 9.998517141833775e-05, + "loss": 4.2444, + "step": 634 + }, + { + "epoch": 0.40934730056406127, + "grad_norm": 4.200860184354178, + "learning_rate": 9.998512227888905e-05, + "loss": 4.0788, + "step": 635 + }, + { + "epoch": 0.40999194198227235, + "grad_norm": 3.4402738218452864, + "learning_rate": 9.998507305816726e-05, + "loss": 4.2597, + "step": 636 + }, + { + "epoch": 0.4106365834004835, + "grad_norm": 3.3734220579712804, + "learning_rate": 9.998502375617243e-05, + "loss": 4.3355, + "step": 637 + }, + { + "epoch": 0.4112812248186946, + "grad_norm": 3.156364018076257, + "learning_rate": 9.998497437290465e-05, + "loss": 4.3646, + "step": 638 + }, + { + "epoch": 0.41192586623690575, + "grad_norm": 3.5373798128590592, + "learning_rate": 9.9984924908364e-05, + "loss": 3.9836, + "step": 639 + }, + { + "epoch": 0.4125705076551168, + "grad_norm": 4.678229368633622, + "learning_rate": 9.998487536255059e-05, + "loss": 4.112, + "step": 640 + }, + { + "epoch": 0.41321514907332796, + "grad_norm": 5.009686757119095, + "learning_rate": 9.998482573546447e-05, + "loss": 4.2055, + "step": 641 + }, + { + "epoch": 0.4138597904915391, + "grad_norm": 2.3825599341466095, + "learning_rate": 9.998477602710573e-05, + "loss": 4.3502, + "step": 642 + }, + { + "epoch": 0.4145044319097502, + "grad_norm": 3.2785952891065118, + "learning_rate": 9.998472623747445e-05, + "loss": 4.3188, + "step": 643 + }, + { + "epoch": 0.4151490733279613, + "grad_norm": 3.2435544130308958, + "learning_rate": 9.99846763665707e-05, + "loss": 4.4555, + "step": 644 + }, + { + "epoch": 0.41579371474617244, + "grad_norm": 2.9703562629130307, + "learning_rate": 9.99846264143946e-05, + "loss": 4.2752, + "step": 645 + }, + { + "epoch": 0.41643835616438357, + "grad_norm": 3.307231933119747, + "learning_rate": 9.998457638094618e-05, + "loss": 4.3994, + "step": 646 + }, + { + "epoch": 0.4170829975825947, + "grad_norm": 2.450142429773788, + "learning_rate": 9.998452626622557e-05, + "loss": 4.3527, + "step": 647 + }, + { + "epoch": 0.4177276390008058, + "grad_norm": 2.5963450943616024, + "learning_rate": 9.99844760702328e-05, + "loss": 4.3858, + "step": 648 + }, + { + "epoch": 0.4183722804190169, + "grad_norm": 2.888742336318682, + "learning_rate": 9.998442579296803e-05, + "loss": 4.4775, + "step": 649 + }, + { + "epoch": 0.41901692183722805, + "grad_norm": 1.6550859385987382, + "learning_rate": 9.998437543443126e-05, + "loss": 4.2404, + "step": 650 + }, + { + "epoch": 0.4196615632554392, + "grad_norm": 2.80231830318171, + "learning_rate": 9.998432499462263e-05, + "loss": 4.3415, + "step": 651 + }, + { + "epoch": 0.42030620467365026, + "grad_norm": 3.094471253136349, + "learning_rate": 9.99842744735422e-05, + "loss": 4.2314, + "step": 652 + }, + { + "epoch": 0.4209508460918614, + "grad_norm": 2.8145562495187533, + "learning_rate": 9.998422387119004e-05, + "loss": 4.4445, + "step": 653 + }, + { + "epoch": 0.42159548751007253, + "grad_norm": 3.1766387243223644, + "learning_rate": 9.998417318756627e-05, + "loss": 4.4345, + "step": 654 + }, + { + "epoch": 0.42224012892828366, + "grad_norm": 2.912474415387016, + "learning_rate": 9.998412242267093e-05, + "loss": 4.0654, + "step": 655 + }, + { + "epoch": 0.42288477034649474, + "grad_norm": 3.5848531036104414, + "learning_rate": 9.998407157650413e-05, + "loss": 4.4894, + "step": 656 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 3.478877143086779, + "learning_rate": 9.998402064906596e-05, + "loss": 4.4099, + "step": 657 + }, + { + "epoch": 0.424174053182917, + "grad_norm": 2.6927565772962683, + "learning_rate": 9.998396964035647e-05, + "loss": 3.9742, + "step": 658 + }, + { + "epoch": 0.42481869460112814, + "grad_norm": 3.7913887808031315, + "learning_rate": 9.99839185503758e-05, + "loss": 4.324, + "step": 659 + }, + { + "epoch": 0.4254633360193392, + "grad_norm": 4.525450581945123, + "learning_rate": 9.998386737912398e-05, + "loss": 4.2713, + "step": 660 + }, + { + "epoch": 0.42610797743755036, + "grad_norm": 3.109247130797736, + "learning_rate": 9.998381612660111e-05, + "loss": 4.4459, + "step": 661 + }, + { + "epoch": 0.4267526188557615, + "grad_norm": 3.1266931336900825, + "learning_rate": 9.998376479280727e-05, + "loss": 4.2065, + "step": 662 + }, + { + "epoch": 0.4273972602739726, + "grad_norm": 3.4832209074906655, + "learning_rate": 9.998371337774257e-05, + "loss": 4.0386, + "step": 663 + }, + { + "epoch": 0.4280419016921837, + "grad_norm": 3.124653710016953, + "learning_rate": 9.998366188140707e-05, + "loss": 4.3943, + "step": 664 + }, + { + "epoch": 0.42868654311039484, + "grad_norm": 2.603699839433056, + "learning_rate": 9.998361030380087e-05, + "loss": 4.3751, + "step": 665 + }, + { + "epoch": 0.42933118452860597, + "grad_norm": 3.8255236468124485, + "learning_rate": 9.998355864492403e-05, + "loss": 3.9236, + "step": 666 + }, + { + "epoch": 0.4299758259468171, + "grad_norm": 4.110445570315154, + "learning_rate": 9.998350690477665e-05, + "loss": 4.5248, + "step": 667 + }, + { + "epoch": 0.4306204673650282, + "grad_norm": 2.7340351272602526, + "learning_rate": 9.998345508335882e-05, + "loss": 4.4602, + "step": 668 + }, + { + "epoch": 0.4312651087832393, + "grad_norm": 3.8847329316061163, + "learning_rate": 9.998340318067063e-05, + "loss": 4.1063, + "step": 669 + }, + { + "epoch": 0.43190975020145045, + "grad_norm": 5.421454177114278, + "learning_rate": 9.998335119671215e-05, + "loss": 4.3304, + "step": 670 + }, + { + "epoch": 0.4325543916196616, + "grad_norm": 6.683454170987391, + "learning_rate": 9.998329913148347e-05, + "loss": 4.3523, + "step": 671 + }, + { + "epoch": 0.43319903303787266, + "grad_norm": 4.724114701626809, + "learning_rate": 9.998324698498466e-05, + "loss": 4.4926, + "step": 672 + }, + { + "epoch": 0.4338436744560838, + "grad_norm": 3.0649927133282526, + "learning_rate": 9.998319475721586e-05, + "loss": 4.0698, + "step": 673 + }, + { + "epoch": 0.43448831587429493, + "grad_norm": 5.543518942444433, + "learning_rate": 9.99831424481771e-05, + "loss": 4.4897, + "step": 674 + }, + { + "epoch": 0.43513295729250606, + "grad_norm": 2.916071613084921, + "learning_rate": 9.998309005786848e-05, + "loss": 4.0331, + "step": 675 + }, + { + "epoch": 0.43577759871071714, + "grad_norm": 5.785958247757137, + "learning_rate": 9.998303758629009e-05, + "loss": 4.4655, + "step": 676 + }, + { + "epoch": 0.4364222401289283, + "grad_norm": 2.944418242615444, + "learning_rate": 9.998298503344203e-05, + "loss": 4.6773, + "step": 677 + }, + { + "epoch": 0.4370668815471394, + "grad_norm": 4.139614645303387, + "learning_rate": 9.998293239932434e-05, + "loss": 4.547, + "step": 678 + }, + { + "epoch": 0.43771152296535054, + "grad_norm": 3.5665343596374677, + "learning_rate": 9.998287968393718e-05, + "loss": 4.2705, + "step": 679 + }, + { + "epoch": 0.4383561643835616, + "grad_norm": 2.827092162404123, + "learning_rate": 9.998282688728058e-05, + "loss": 4.5292, + "step": 680 + }, + { + "epoch": 0.43900080580177275, + "grad_norm": 2.7939483609136295, + "learning_rate": 9.998277400935464e-05, + "loss": 4.2579, + "step": 681 + }, + { + "epoch": 0.4396454472199839, + "grad_norm": 2.9165809459647307, + "learning_rate": 9.998272105015943e-05, + "loss": 4.1022, + "step": 682 + }, + { + "epoch": 0.440290088638195, + "grad_norm": 2.847994079011674, + "learning_rate": 9.998266800969509e-05, + "loss": 4.2643, + "step": 683 + }, + { + "epoch": 0.4409347300564061, + "grad_norm": 3.129320883454279, + "learning_rate": 9.998261488796165e-05, + "loss": 4.1066, + "step": 684 + }, + { + "epoch": 0.44157937147461723, + "grad_norm": 1.7582078319287464, + "learning_rate": 9.998256168495922e-05, + "loss": 4.5547, + "step": 685 + }, + { + "epoch": 0.44222401289282837, + "grad_norm": 3.539630809628449, + "learning_rate": 9.99825084006879e-05, + "loss": 4.5788, + "step": 686 + }, + { + "epoch": 0.4428686543110395, + "grad_norm": 2.581378410038623, + "learning_rate": 9.998245503514776e-05, + "loss": 4.0027, + "step": 687 + }, + { + "epoch": 0.4435132957292506, + "grad_norm": 2.882643360982598, + "learning_rate": 9.998240158833891e-05, + "loss": 4.4329, + "step": 688 + }, + { + "epoch": 0.4441579371474617, + "grad_norm": 2.4124106375898275, + "learning_rate": 9.99823480602614e-05, + "loss": 4.207, + "step": 689 + }, + { + "epoch": 0.44480257856567285, + "grad_norm": 2.8790040802020576, + "learning_rate": 9.998229445091534e-05, + "loss": 4.1466, + "step": 690 + }, + { + "epoch": 0.445447219983884, + "grad_norm": 3.341457431520915, + "learning_rate": 9.998224076030083e-05, + "loss": 4.3112, + "step": 691 + }, + { + "epoch": 0.44609186140209506, + "grad_norm": 3.8334115332616565, + "learning_rate": 9.998218698841793e-05, + "loss": 4.1942, + "step": 692 + }, + { + "epoch": 0.4467365028203062, + "grad_norm": 4.061596527770865, + "learning_rate": 9.998213313526675e-05, + "loss": 4.2523, + "step": 693 + }, + { + "epoch": 0.4473811442385173, + "grad_norm": 3.0205902472929997, + "learning_rate": 9.998207920084738e-05, + "loss": 4.3426, + "step": 694 + }, + { + "epoch": 0.44802578565672846, + "grad_norm": 3.7595560535162327, + "learning_rate": 9.99820251851599e-05, + "loss": 4.2876, + "step": 695 + }, + { + "epoch": 0.44867042707493954, + "grad_norm": 4.7343801352845105, + "learning_rate": 9.998197108820439e-05, + "loss": 4.1203, + "step": 696 + }, + { + "epoch": 0.44931506849315067, + "grad_norm": 5.84312913434973, + "learning_rate": 9.998191690998096e-05, + "loss": 4.0523, + "step": 697 + }, + { + "epoch": 0.4499597099113618, + "grad_norm": 3.7447963572546543, + "learning_rate": 9.998186265048966e-05, + "loss": 4.5144, + "step": 698 + }, + { + "epoch": 0.45060435132957294, + "grad_norm": 3.028201387887933, + "learning_rate": 9.998180830973064e-05, + "loss": 4.2845, + "step": 699 + }, + { + "epoch": 0.4512489927477841, + "grad_norm": 5.052146584951575, + "learning_rate": 9.998175388770394e-05, + "loss": 3.9155, + "step": 700 + }, + { + "epoch": 0.4512489927477841, + "eval_loss": 4.357644081115723, + "eval_runtime": 2.9405, + "eval_samples_per_second": 34.008, + "eval_steps_per_second": 4.421, + "step": 700 + }, + { + "epoch": 0.45189363416599515, + "grad_norm": 4.507088432423085, + "learning_rate": 9.998169938440966e-05, + "loss": 4.2316, + "step": 701 + }, + { + "epoch": 0.4525382755842063, + "grad_norm": 2.4619238544204576, + "learning_rate": 9.99816447998479e-05, + "loss": 4.3332, + "step": 702 + }, + { + "epoch": 0.4531829170024174, + "grad_norm": 3.341651309622693, + "learning_rate": 9.998159013401875e-05, + "loss": 4.3683, + "step": 703 + }, + { + "epoch": 0.45382755842062855, + "grad_norm": 1.6580189831030758, + "learning_rate": 9.998153538692227e-05, + "loss": 4.2503, + "step": 704 + }, + { + "epoch": 0.45447219983883963, + "grad_norm": 2.906028513094157, + "learning_rate": 9.998148055855858e-05, + "loss": 4.2035, + "step": 705 + }, + { + "epoch": 0.45511684125705076, + "grad_norm": 2.302980052361796, + "learning_rate": 9.99814256489278e-05, + "loss": 4.3944, + "step": 706 + }, + { + "epoch": 0.4557614826752619, + "grad_norm": 2.4051834697523145, + "learning_rate": 9.998137065802993e-05, + "loss": 4.5177, + "step": 707 + }, + { + "epoch": 0.45640612409347303, + "grad_norm": 3.2230221418799965, + "learning_rate": 9.998131558586516e-05, + "loss": 4.621, + "step": 708 + }, + { + "epoch": 0.4570507655116841, + "grad_norm": 2.064701224339921, + "learning_rate": 9.998126043243352e-05, + "loss": 4.5224, + "step": 709 + }, + { + "epoch": 0.45769540692989524, + "grad_norm": 2.3638181510848715, + "learning_rate": 9.99812051977351e-05, + "loss": 4.5534, + "step": 710 + }, + { + "epoch": 0.4583400483481064, + "grad_norm": 2.7532031155355825, + "learning_rate": 9.998114988177e-05, + "loss": 4.4962, + "step": 711 + }, + { + "epoch": 0.4589846897663175, + "grad_norm": 1.7541154761407216, + "learning_rate": 9.998109448453833e-05, + "loss": 4.4863, + "step": 712 + }, + { + "epoch": 0.4596293311845286, + "grad_norm": 2.896121911684507, + "learning_rate": 9.99810390060402e-05, + "loss": 4.0556, + "step": 713 + }, + { + "epoch": 0.4602739726027397, + "grad_norm": 2.8909442537419423, + "learning_rate": 9.998098344627562e-05, + "loss": 3.9542, + "step": 714 + }, + { + "epoch": 0.46091861402095086, + "grad_norm": 2.3864389377718993, + "learning_rate": 9.998092780524476e-05, + "loss": 4.4662, + "step": 715 + }, + { + "epoch": 0.461563255439162, + "grad_norm": 2.4056201484273023, + "learning_rate": 9.998087208294767e-05, + "loss": 4.5248, + "step": 716 + }, + { + "epoch": 0.46220789685737307, + "grad_norm": 2.5050487741505756, + "learning_rate": 9.998081627938446e-05, + "loss": 4.504, + "step": 717 + }, + { + "epoch": 0.4628525382755842, + "grad_norm": 1.6578751313045783, + "learning_rate": 9.99807603945552e-05, + "loss": 4.4802, + "step": 718 + }, + { + "epoch": 0.46349717969379534, + "grad_norm": 2.4002621371197326, + "learning_rate": 9.998070442846001e-05, + "loss": 4.3241, + "step": 719 + }, + { + "epoch": 0.46414182111200647, + "grad_norm": 3.0279557746730124, + "learning_rate": 9.998064838109896e-05, + "loss": 4.3143, + "step": 720 + }, + { + "epoch": 0.46478646253021755, + "grad_norm": 3.946128476158663, + "learning_rate": 9.998059225247216e-05, + "loss": 4.2842, + "step": 721 + }, + { + "epoch": 0.4654311039484287, + "grad_norm": 3.9513359708498608, + "learning_rate": 9.998053604257968e-05, + "loss": 4.3655, + "step": 722 + }, + { + "epoch": 0.4660757453666398, + "grad_norm": 3.437784175923075, + "learning_rate": 9.998047975142164e-05, + "loss": 4.1736, + "step": 723 + }, + { + "epoch": 0.46672038678485095, + "grad_norm": 2.5725694136682105, + "learning_rate": 9.998042337899811e-05, + "loss": 4.2917, + "step": 724 + }, + { + "epoch": 0.467365028203062, + "grad_norm": 3.106817822753629, + "learning_rate": 9.99803669253092e-05, + "loss": 4.2261, + "step": 725 + }, + { + "epoch": 0.46800966962127316, + "grad_norm": 3.132052857868221, + "learning_rate": 9.998031039035498e-05, + "loss": 4.2491, + "step": 726 + }, + { + "epoch": 0.4686543110394843, + "grad_norm": 3.280286801065297, + "learning_rate": 9.998025377413555e-05, + "loss": 4.4513, + "step": 727 + }, + { + "epoch": 0.46929895245769543, + "grad_norm": 2.7359059788444897, + "learning_rate": 9.998019707665102e-05, + "loss": 4.4575, + "step": 728 + }, + { + "epoch": 0.4699435938759065, + "grad_norm": 2.9734154093674676, + "learning_rate": 9.998014029790146e-05, + "loss": 3.9306, + "step": 729 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 3.4289893920671184, + "learning_rate": 9.998008343788699e-05, + "loss": 4.3295, + "step": 730 + }, + { + "epoch": 0.4712328767123288, + "grad_norm": 3.826894925474338, + "learning_rate": 9.998002649660767e-05, + "loss": 4.293, + "step": 731 + }, + { + "epoch": 0.4718775181305399, + "grad_norm": 3.233104639206533, + "learning_rate": 9.997996947406362e-05, + "loss": 4.4768, + "step": 732 + }, + { + "epoch": 0.472522159548751, + "grad_norm": 1.8880023700991337, + "learning_rate": 9.997991237025495e-05, + "loss": 4.1052, + "step": 733 + }, + { + "epoch": 0.4731668009669621, + "grad_norm": 2.95137247986563, + "learning_rate": 9.99798551851817e-05, + "loss": 4.1869, + "step": 734 + }, + { + "epoch": 0.47381144238517325, + "grad_norm": 2.8685960024785624, + "learning_rate": 9.9979797918844e-05, + "loss": 4.624, + "step": 735 + }, + { + "epoch": 0.4744560838033844, + "grad_norm": 2.271757256747844, + "learning_rate": 9.997974057124193e-05, + "loss": 4.3203, + "step": 736 + }, + { + "epoch": 0.47510072522159547, + "grad_norm": 3.142227653419826, + "learning_rate": 9.997968314237561e-05, + "loss": 4.1836, + "step": 737 + }, + { + "epoch": 0.4757453666398066, + "grad_norm": 2.92312290815153, + "learning_rate": 9.997962563224509e-05, + "loss": 4.3374, + "step": 738 + }, + { + "epoch": 0.47639000805801773, + "grad_norm": 2.5255844062087958, + "learning_rate": 9.997956804085051e-05, + "loss": 4.1722, + "step": 739 + }, + { + "epoch": 0.47703464947622887, + "grad_norm": 2.3792192583263647, + "learning_rate": 9.997951036819194e-05, + "loss": 4.3837, + "step": 740 + }, + { + "epoch": 0.47767929089443995, + "grad_norm": 2.8867962665494358, + "learning_rate": 9.997945261426948e-05, + "loss": 4.1489, + "step": 741 + }, + { + "epoch": 0.4783239323126511, + "grad_norm": 3.911533269553655, + "learning_rate": 9.997939477908322e-05, + "loss": 3.8153, + "step": 742 + }, + { + "epoch": 0.4789685737308622, + "grad_norm": 3.9340482551604947, + "learning_rate": 9.997933686263326e-05, + "loss": 4.3648, + "step": 743 + }, + { + "epoch": 0.47961321514907335, + "grad_norm": 4.2857561165777085, + "learning_rate": 9.99792788649197e-05, + "loss": 4.2657, + "step": 744 + }, + { + "epoch": 0.4802578565672844, + "grad_norm": 5.033375303790812, + "learning_rate": 9.997922078594263e-05, + "loss": 4.2173, + "step": 745 + }, + { + "epoch": 0.48090249798549556, + "grad_norm": 4.664658366897333, + "learning_rate": 9.997916262570214e-05, + "loss": 4.0622, + "step": 746 + }, + { + "epoch": 0.4815471394037067, + "grad_norm": 5.081312085407976, + "learning_rate": 9.997910438419833e-05, + "loss": 3.872, + "step": 747 + }, + { + "epoch": 0.4821917808219178, + "grad_norm": 3.0814939303217344, + "learning_rate": 9.99790460614313e-05, + "loss": 4.0255, + "step": 748 + }, + { + "epoch": 0.4828364222401289, + "grad_norm": 3.1313985325885283, + "learning_rate": 9.997898765740113e-05, + "loss": 4.3115, + "step": 749 + }, + { + "epoch": 0.48348106365834004, + "grad_norm": 2.8799762947868612, + "learning_rate": 9.997892917210793e-05, + "loss": 4.3154, + "step": 750 + }, + { + "epoch": 0.48412570507655117, + "grad_norm": 2.1094165432413177, + "learning_rate": 9.997887060555181e-05, + "loss": 4.1489, + "step": 751 + }, + { + "epoch": 0.4847703464947623, + "grad_norm": 2.772850187375905, + "learning_rate": 9.997881195773286e-05, + "loss": 4.4439, + "step": 752 + }, + { + "epoch": 0.4854149879129734, + "grad_norm": 3.4221937810945238, + "learning_rate": 9.997875322865115e-05, + "loss": 4.2917, + "step": 753 + }, + { + "epoch": 0.4860596293311845, + "grad_norm": 2.8242899896308677, + "learning_rate": 9.997869441830678e-05, + "loss": 4.6388, + "step": 754 + }, + { + "epoch": 0.48670427074939565, + "grad_norm": 3.026930675558695, + "learning_rate": 9.997863552669987e-05, + "loss": 4.4734, + "step": 755 + }, + { + "epoch": 0.4873489121676068, + "grad_norm": 3.9538967176722406, + "learning_rate": 9.99785765538305e-05, + "loss": 4.0815, + "step": 756 + }, + { + "epoch": 0.48799355358581786, + "grad_norm": 3.2237319714672585, + "learning_rate": 9.997851749969878e-05, + "loss": 4.0892, + "step": 757 + }, + { + "epoch": 0.488638195004029, + "grad_norm": 2.894028333373964, + "learning_rate": 9.99784583643048e-05, + "loss": 4.2814, + "step": 758 + }, + { + "epoch": 0.48928283642224013, + "grad_norm": 1.6155845332482786, + "learning_rate": 9.997839914764866e-05, + "loss": 4.3124, + "step": 759 + }, + { + "epoch": 0.48992747784045126, + "grad_norm": 2.3476289349109445, + "learning_rate": 9.997833984973047e-05, + "loss": 4.0982, + "step": 760 + }, + { + "epoch": 0.49057211925866234, + "grad_norm": 2.5997378302827543, + "learning_rate": 9.997828047055028e-05, + "loss": 4.4452, + "step": 761 + }, + { + "epoch": 0.4912167606768735, + "grad_norm": 2.547994033965668, + "learning_rate": 9.997822101010824e-05, + "loss": 4.1606, + "step": 762 + }, + { + "epoch": 0.4918614020950846, + "grad_norm": 2.8014664050638247, + "learning_rate": 9.997816146840441e-05, + "loss": 4.3646, + "step": 763 + }, + { + "epoch": 0.49250604351329574, + "grad_norm": 3.1359648803698663, + "learning_rate": 9.997810184543891e-05, + "loss": 4.0199, + "step": 764 + }, + { + "epoch": 0.4931506849315068, + "grad_norm": 4.2511521172340565, + "learning_rate": 9.997804214121185e-05, + "loss": 4.309, + "step": 765 + }, + { + "epoch": 0.49379532634971796, + "grad_norm": 3.6647026081507454, + "learning_rate": 9.99779823557233e-05, + "loss": 3.8876, + "step": 766 + }, + { + "epoch": 0.4944399677679291, + "grad_norm": 2.8890113713782086, + "learning_rate": 9.997792248897336e-05, + "loss": 3.6931, + "step": 767 + }, + { + "epoch": 0.4950846091861402, + "grad_norm": 2.6245565424369066, + "learning_rate": 9.997786254096214e-05, + "loss": 4.1873, + "step": 768 + }, + { + "epoch": 0.49572925060435136, + "grad_norm": 3.0581099634247924, + "learning_rate": 9.997780251168976e-05, + "loss": 4.2836, + "step": 769 + }, + { + "epoch": 0.49637389202256244, + "grad_norm": 2.9245338722040084, + "learning_rate": 9.997774240115627e-05, + "loss": 3.5821, + "step": 770 + }, + { + "epoch": 0.49701853344077357, + "grad_norm": 2.9482138712314963, + "learning_rate": 9.99776822093618e-05, + "loss": 4.3543, + "step": 771 + }, + { + "epoch": 0.4976631748589847, + "grad_norm": 1.7948145387769587, + "learning_rate": 9.997762193630645e-05, + "loss": 4.5295, + "step": 772 + }, + { + "epoch": 0.49830781627719584, + "grad_norm": 2.4850317224703615, + "learning_rate": 9.997756158199028e-05, + "loss": 4.492, + "step": 773 + }, + { + "epoch": 0.4989524576954069, + "grad_norm": 2.5791026403638324, + "learning_rate": 9.997750114641345e-05, + "loss": 4.5033, + "step": 774 + }, + { + "epoch": 0.49959709911361805, + "grad_norm": 3.0861291428072497, + "learning_rate": 9.997744062957602e-05, + "loss": 4.2037, + "step": 775 + }, + { + "epoch": 0.5002417405318291, + "grad_norm": 3.039639573949948, + "learning_rate": 9.997738003147811e-05, + "loss": 4.2002, + "step": 776 + }, + { + "epoch": 0.5008863819500403, + "grad_norm": 2.9182818700906235, + "learning_rate": 9.997731935211979e-05, + "loss": 4.4282, + "step": 777 + }, + { + "epoch": 0.5015310233682514, + "grad_norm": 1.8705359160739787, + "learning_rate": 9.997725859150118e-05, + "loss": 4.2428, + "step": 778 + }, + { + "epoch": 0.5021756647864626, + "grad_norm": 2.273488076261551, + "learning_rate": 9.997719774962239e-05, + "loss": 4.0786, + "step": 779 + }, + { + "epoch": 0.5028203062046737, + "grad_norm": 2.8920881436847026, + "learning_rate": 9.997713682648352e-05, + "loss": 3.9969, + "step": 780 + }, + { + "epoch": 0.5034649476228847, + "grad_norm": 3.1180304371625533, + "learning_rate": 9.997707582208462e-05, + "loss": 4.2628, + "step": 781 + }, + { + "epoch": 0.5041095890410959, + "grad_norm": 2.6649711210445024, + "learning_rate": 9.997701473642585e-05, + "loss": 4.0589, + "step": 782 + }, + { + "epoch": 0.504754230459307, + "grad_norm": 3.7454714230793247, + "learning_rate": 9.997695356950728e-05, + "loss": 4.0539, + "step": 783 + }, + { + "epoch": 0.5053988718775181, + "grad_norm": 5.0824123601031514, + "learning_rate": 9.997689232132903e-05, + "loss": 4.5836, + "step": 784 + }, + { + "epoch": 0.5060435132957293, + "grad_norm": 3.8219046758399515, + "learning_rate": 9.997683099189118e-05, + "loss": 4.4585, + "step": 785 + }, + { + "epoch": 0.5066881547139404, + "grad_norm": 2.336428338116268, + "learning_rate": 9.997676958119384e-05, + "loss": 4.3563, + "step": 786 + }, + { + "epoch": 0.5073327961321515, + "grad_norm": 3.1549985428891696, + "learning_rate": 9.997670808923711e-05, + "loss": 4.4574, + "step": 787 + }, + { + "epoch": 0.5079774375503626, + "grad_norm": 3.039176917923678, + "learning_rate": 9.99766465160211e-05, + "loss": 3.8315, + "step": 788 + }, + { + "epoch": 0.5086220789685737, + "grad_norm": 3.367827422245641, + "learning_rate": 9.99765848615459e-05, + "loss": 4.118, + "step": 789 + }, + { + "epoch": 0.5092667203867849, + "grad_norm": 2.1932642471355117, + "learning_rate": 9.997652312581162e-05, + "loss": 4.0643, + "step": 790 + }, + { + "epoch": 0.509911361804996, + "grad_norm": 3.046609648205813, + "learning_rate": 9.997646130881834e-05, + "loss": 4.4276, + "step": 791 + }, + { + "epoch": 0.510556003223207, + "grad_norm": 3.822364094111963, + "learning_rate": 9.997639941056619e-05, + "loss": 4.0091, + "step": 792 + }, + { + "epoch": 0.5112006446414182, + "grad_norm": 2.017262722274449, + "learning_rate": 9.997633743105524e-05, + "loss": 4.3128, + "step": 793 + }, + { + "epoch": 0.5118452860596293, + "grad_norm": 3.260524619616141, + "learning_rate": 9.997627537028564e-05, + "loss": 4.1303, + "step": 794 + }, + { + "epoch": 0.5124899274778405, + "grad_norm": 3.5852413717511866, + "learning_rate": 9.997621322825746e-05, + "loss": 4.3303, + "step": 795 + }, + { + "epoch": 0.5131345688960516, + "grad_norm": 1.7469134783508664, + "learning_rate": 9.997615100497078e-05, + "loss": 4.3625, + "step": 796 + }, + { + "epoch": 0.5137792103142627, + "grad_norm": 3.731425435013653, + "learning_rate": 9.997608870042574e-05, + "loss": 3.9359, + "step": 797 + }, + { + "epoch": 0.5144238517324738, + "grad_norm": 4.199353720065884, + "learning_rate": 9.997602631462243e-05, + "loss": 3.9778, + "step": 798 + }, + { + "epoch": 0.5150684931506849, + "grad_norm": 4.728780915379706, + "learning_rate": 9.997596384756095e-05, + "loss": 4.1874, + "step": 799 + }, + { + "epoch": 0.515713134568896, + "grad_norm": 6.1210312919521455, + "learning_rate": 9.99759012992414e-05, + "loss": 3.9734, + "step": 800 + }, + { + "epoch": 0.515713134568896, + "eval_loss": 4.321937084197998, + "eval_runtime": 2.9205, + "eval_samples_per_second": 34.241, + "eval_steps_per_second": 4.451, + "step": 800 + }, + { + "epoch": 0.5163577759871072, + "grad_norm": 4.733940210518701, + "learning_rate": 9.997583866966389e-05, + "loss": 4.4409, + "step": 801 + }, + { + "epoch": 0.5170024174053183, + "grad_norm": 3.5147934751941743, + "learning_rate": 9.997577595882851e-05, + "loss": 4.1712, + "step": 802 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 4.579405169079223, + "learning_rate": 9.99757131667354e-05, + "loss": 4.1926, + "step": 803 + }, + { + "epoch": 0.5182917002417405, + "grad_norm": 3.1731622720618935, + "learning_rate": 9.997565029338462e-05, + "loss": 4.2644, + "step": 804 + }, + { + "epoch": 0.5189363416599516, + "grad_norm": 3.6410162806941884, + "learning_rate": 9.997558733877628e-05, + "loss": 4.1954, + "step": 805 + }, + { + "epoch": 0.5195809830781628, + "grad_norm": 2.2544005534574376, + "learning_rate": 9.997552430291051e-05, + "loss": 4.424, + "step": 806 + }, + { + "epoch": 0.5202256244963739, + "grad_norm": 3.814973482610472, + "learning_rate": 9.997546118578737e-05, + "loss": 3.7973, + "step": 807 + }, + { + "epoch": 0.520870265914585, + "grad_norm": 4.511385090937999, + "learning_rate": 9.9975397987407e-05, + "loss": 4.3413, + "step": 808 + }, + { + "epoch": 0.5215149073327962, + "grad_norm": 3.659606750313287, + "learning_rate": 9.997533470776951e-05, + "loss": 4.2255, + "step": 809 + }, + { + "epoch": 0.5221595487510072, + "grad_norm": 3.613007404976541, + "learning_rate": 9.997527134687497e-05, + "loss": 4.4887, + "step": 810 + }, + { + "epoch": 0.5228041901692184, + "grad_norm": 3.135444109785103, + "learning_rate": 9.997520790472352e-05, + "loss": 3.9211, + "step": 811 + }, + { + "epoch": 0.5234488315874295, + "grad_norm": 5.030678275320933, + "learning_rate": 9.997514438131524e-05, + "loss": 3.9414, + "step": 812 + }, + { + "epoch": 0.5240934730056406, + "grad_norm": 3.613719318139986, + "learning_rate": 9.997508077665024e-05, + "loss": 4.3211, + "step": 813 + }, + { + "epoch": 0.5247381144238518, + "grad_norm": 4.982873111326261, + "learning_rate": 9.99750170907286e-05, + "loss": 4.0555, + "step": 814 + }, + { + "epoch": 0.5253827558420628, + "grad_norm": 3.6598536536044626, + "learning_rate": 9.997495332355047e-05, + "loss": 4.1718, + "step": 815 + }, + { + "epoch": 0.5260273972602739, + "grad_norm": 4.1151274622294745, + "learning_rate": 9.997488947511593e-05, + "loss": 4.1342, + "step": 816 + }, + { + "epoch": 0.5266720386784851, + "grad_norm": 3.4599906472531345, + "learning_rate": 9.997482554542511e-05, + "loss": 4.0484, + "step": 817 + }, + { + "epoch": 0.5273166800966962, + "grad_norm": 2.6536934875895386, + "learning_rate": 9.997476153447806e-05, + "loss": 4.4761, + "step": 818 + }, + { + "epoch": 0.5279613215149074, + "grad_norm": 2.2541950148219057, + "learning_rate": 9.997469744227493e-05, + "loss": 4.4154, + "step": 819 + }, + { + "epoch": 0.5286059629331185, + "grad_norm": 2.9201146746155815, + "learning_rate": 9.997463326881582e-05, + "loss": 4.0596, + "step": 820 + }, + { + "epoch": 0.5292506043513295, + "grad_norm": 2.2177066820437625, + "learning_rate": 9.997456901410083e-05, + "loss": 4.1665, + "step": 821 + }, + { + "epoch": 0.5298952457695407, + "grad_norm": 2.114159270761397, + "learning_rate": 9.997450467813007e-05, + "loss": 4.3309, + "step": 822 + }, + { + "epoch": 0.5305398871877518, + "grad_norm": 2.0822148958644346, + "learning_rate": 9.997444026090362e-05, + "loss": 4.4779, + "step": 823 + }, + { + "epoch": 0.5311845286059629, + "grad_norm": 2.455643218333259, + "learning_rate": 9.997437576242163e-05, + "loss": 4.3556, + "step": 824 + }, + { + "epoch": 0.5318291700241741, + "grad_norm": 3.660307275617132, + "learning_rate": 9.997431118268417e-05, + "loss": 3.8957, + "step": 825 + }, + { + "epoch": 0.5324738114423851, + "grad_norm": 5.076275912128078, + "learning_rate": 9.997424652169137e-05, + "loss": 3.8826, + "step": 826 + }, + { + "epoch": 0.5331184528605963, + "grad_norm": 4.510790301530441, + "learning_rate": 9.997418177944329e-05, + "loss": 4.3604, + "step": 827 + }, + { + "epoch": 0.5337630942788074, + "grad_norm": 2.2600190309912103, + "learning_rate": 9.99741169559401e-05, + "loss": 4.3792, + "step": 828 + }, + { + "epoch": 0.5344077356970185, + "grad_norm": 4.091583786113386, + "learning_rate": 9.997405205118186e-05, + "loss": 4.1621, + "step": 829 + }, + { + "epoch": 0.5350523771152297, + "grad_norm": 4.051110242939901, + "learning_rate": 9.997398706516872e-05, + "loss": 4.4228, + "step": 830 + }, + { + "epoch": 0.5356970185334408, + "grad_norm": 2.7903598344571687, + "learning_rate": 9.997392199790074e-05, + "loss": 3.7325, + "step": 831 + }, + { + "epoch": 0.5363416599516518, + "grad_norm": 3.1087192114846935, + "learning_rate": 9.997385684937807e-05, + "loss": 4.2689, + "step": 832 + }, + { + "epoch": 0.536986301369863, + "grad_norm": 2.1931346937251246, + "learning_rate": 9.997379161960076e-05, + "loss": 4.2829, + "step": 833 + }, + { + "epoch": 0.5376309427880741, + "grad_norm": 2.8573026840765836, + "learning_rate": 9.997372630856899e-05, + "loss": 4.3753, + "step": 834 + }, + { + "epoch": 0.5382755842062853, + "grad_norm": 1.9243522827402146, + "learning_rate": 9.997366091628282e-05, + "loss": 4.4473, + "step": 835 + }, + { + "epoch": 0.5389202256244964, + "grad_norm": 2.5620859116277246, + "learning_rate": 9.997359544274234e-05, + "loss": 4.1234, + "step": 836 + }, + { + "epoch": 0.5395648670427075, + "grad_norm": 3.021880379630453, + "learning_rate": 9.99735298879477e-05, + "loss": 4.2865, + "step": 837 + }, + { + "epoch": 0.5402095084609186, + "grad_norm": 1.868378950804378, + "learning_rate": 9.997346425189901e-05, + "loss": 4.2288, + "step": 838 + }, + { + "epoch": 0.5408541498791297, + "grad_norm": 2.9128237539795347, + "learning_rate": 9.997339853459634e-05, + "loss": 4.5772, + "step": 839 + }, + { + "epoch": 0.5414987912973409, + "grad_norm": 3.211301346481348, + "learning_rate": 9.99733327360398e-05, + "loss": 4.3013, + "step": 840 + }, + { + "epoch": 0.542143432715552, + "grad_norm": 2.4368013762542926, + "learning_rate": 9.997326685622954e-05, + "loss": 4.467, + "step": 841 + }, + { + "epoch": 0.5427880741337631, + "grad_norm": 2.3268203429831544, + "learning_rate": 9.997320089516564e-05, + "loss": 3.9674, + "step": 842 + }, + { + "epoch": 0.5434327155519743, + "grad_norm": 2.5257256388054525, + "learning_rate": 9.997313485284822e-05, + "loss": 4.0898, + "step": 843 + }, + { + "epoch": 0.5440773569701853, + "grad_norm": 1.925516271701914, + "learning_rate": 9.997306872927737e-05, + "loss": 4.1648, + "step": 844 + }, + { + "epoch": 0.5447219983883964, + "grad_norm": 1.81349533344554, + "learning_rate": 9.99730025244532e-05, + "loss": 4.7102, + "step": 845 + }, + { + "epoch": 0.5453666398066076, + "grad_norm": 2.447870675847326, + "learning_rate": 9.997293623837585e-05, + "loss": 4.2223, + "step": 846 + }, + { + "epoch": 0.5460112812248187, + "grad_norm": 2.475635662860096, + "learning_rate": 9.997286987104539e-05, + "loss": 4.4465, + "step": 847 + }, + { + "epoch": 0.5466559226430299, + "grad_norm": 1.8264979205749363, + "learning_rate": 9.997280342246195e-05, + "loss": 4.5102, + "step": 848 + }, + { + "epoch": 0.547300564061241, + "grad_norm": 2.2102320551037025, + "learning_rate": 9.997273689262562e-05, + "loss": 4.1035, + "step": 849 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 2.2174815366857685, + "learning_rate": 9.997267028153656e-05, + "loss": 4.1436, + "step": 850 + }, + { + "epoch": 0.5485898468976632, + "grad_norm": 2.858132997278373, + "learning_rate": 9.997260358919482e-05, + "loss": 4.3312, + "step": 851 + }, + { + "epoch": 0.5492344883158743, + "grad_norm": 3.3870846841600386, + "learning_rate": 9.997253681560053e-05, + "loss": 4.153, + "step": 852 + }, + { + "epoch": 0.5498791297340854, + "grad_norm": 2.8090512447408575, + "learning_rate": 9.99724699607538e-05, + "loss": 4.4049, + "step": 853 + }, + { + "epoch": 0.5505237711522966, + "grad_norm": 4.074539530252481, + "learning_rate": 9.997240302465476e-05, + "loss": 3.6982, + "step": 854 + }, + { + "epoch": 0.5511684125705076, + "grad_norm": 4.830006198859739, + "learning_rate": 9.99723360073035e-05, + "loss": 4.0094, + "step": 855 + }, + { + "epoch": 0.5518130539887188, + "grad_norm": 4.3943609538801685, + "learning_rate": 9.997226890870011e-05, + "loss": 4.1984, + "step": 856 + }, + { + "epoch": 0.5524576954069299, + "grad_norm": 2.4957906717685314, + "learning_rate": 9.997220172884474e-05, + "loss": 4.4988, + "step": 857 + }, + { + "epoch": 0.553102336825141, + "grad_norm": 4.238879467062463, + "learning_rate": 9.997213446773749e-05, + "loss": 3.9451, + "step": 858 + }, + { + "epoch": 0.5537469782433522, + "grad_norm": 2.9149431941095463, + "learning_rate": 9.997206712537843e-05, + "loss": 4.1813, + "step": 859 + }, + { + "epoch": 0.5543916196615633, + "grad_norm": 2.516385811946381, + "learning_rate": 9.997199970176773e-05, + "loss": 4.4085, + "step": 860 + }, + { + "epoch": 0.5550362610797743, + "grad_norm": 3.1761883449592827, + "learning_rate": 9.997193219690548e-05, + "loss": 4.0423, + "step": 861 + }, + { + "epoch": 0.5556809024979855, + "grad_norm": 4.552447769247537, + "learning_rate": 9.997186461079177e-05, + "loss": 4.5207, + "step": 862 + }, + { + "epoch": 0.5563255439161966, + "grad_norm": 2.7557232789830985, + "learning_rate": 9.997179694342676e-05, + "loss": 4.274, + "step": 863 + }, + { + "epoch": 0.5569701853344078, + "grad_norm": 3.1973658547576878, + "learning_rate": 9.997172919481049e-05, + "loss": 4.0855, + "step": 864 + }, + { + "epoch": 0.5576148267526189, + "grad_norm": 2.850748165616256, + "learning_rate": 9.997166136494313e-05, + "loss": 4.1048, + "step": 865 + }, + { + "epoch": 0.5582594681708299, + "grad_norm": 2.6531604807199174, + "learning_rate": 9.997159345382475e-05, + "loss": 4.1164, + "step": 866 + }, + { + "epoch": 0.5589041095890411, + "grad_norm": 2.67825602512455, + "learning_rate": 9.997152546145551e-05, + "loss": 4.2262, + "step": 867 + }, + { + "epoch": 0.5595487510072522, + "grad_norm": 2.8961636626590637, + "learning_rate": 9.997145738783546e-05, + "loss": 4.2795, + "step": 868 + }, + { + "epoch": 0.5601933924254633, + "grad_norm": 3.4973447308893593, + "learning_rate": 9.997138923296477e-05, + "loss": 4.525, + "step": 869 + }, + { + "epoch": 0.5608380338436745, + "grad_norm": 3.202100447907203, + "learning_rate": 9.997132099684352e-05, + "loss": 3.884, + "step": 870 + }, + { + "epoch": 0.5614826752618856, + "grad_norm": 3.515737622636855, + "learning_rate": 9.997125267947184e-05, + "loss": 4.2842, + "step": 871 + }, + { + "epoch": 0.5621273166800967, + "grad_norm": 2.63298587915086, + "learning_rate": 9.997118428084981e-05, + "loss": 4.3326, + "step": 872 + }, + { + "epoch": 0.5627719580983078, + "grad_norm": 2.9814413957072516, + "learning_rate": 9.997111580097758e-05, + "loss": 4.1358, + "step": 873 + }, + { + "epoch": 0.5634165995165189, + "grad_norm": 2.361634691192679, + "learning_rate": 9.997104723985524e-05, + "loss": 3.9414, + "step": 874 + }, + { + "epoch": 0.5640612409347301, + "grad_norm": 2.5878690820221495, + "learning_rate": 9.997097859748291e-05, + "loss": 4.1424, + "step": 875 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 2.2312055198252225, + "learning_rate": 9.997090987386071e-05, + "loss": 4.0738, + "step": 876 + }, + { + "epoch": 0.5653505237711522, + "grad_norm": 2.273460863351967, + "learning_rate": 9.997084106898873e-05, + "loss": 4.1255, + "step": 877 + }, + { + "epoch": 0.5659951651893634, + "grad_norm": 2.3199866042536312, + "learning_rate": 9.997077218286711e-05, + "loss": 4.2103, + "step": 878 + }, + { + "epoch": 0.5666398066075745, + "grad_norm": 2.019520579525638, + "learning_rate": 9.997070321549595e-05, + "loss": 4.1822, + "step": 879 + }, + { + "epoch": 0.5672844480257857, + "grad_norm": 2.5059048905447225, + "learning_rate": 9.997063416687535e-05, + "loss": 4.3967, + "step": 880 + }, + { + "epoch": 0.5679290894439968, + "grad_norm": 3.1295349424626626, + "learning_rate": 9.997056503700546e-05, + "loss": 4.0412, + "step": 881 + }, + { + "epoch": 0.5685737308622079, + "grad_norm": 3.430671263450968, + "learning_rate": 9.997049582588635e-05, + "loss": 4.4384, + "step": 882 + }, + { + "epoch": 0.569218372280419, + "grad_norm": 2.9483614796951767, + "learning_rate": 9.997042653351816e-05, + "loss": 4.4615, + "step": 883 + }, + { + "epoch": 0.5698630136986301, + "grad_norm": 2.375561148004583, + "learning_rate": 9.997035715990101e-05, + "loss": 4.1714, + "step": 884 + }, + { + "epoch": 0.5705076551168412, + "grad_norm": 2.2315297324990984, + "learning_rate": 9.9970287705035e-05, + "loss": 4.4242, + "step": 885 + }, + { + "epoch": 0.5711522965350524, + "grad_norm": 2.119680314555951, + "learning_rate": 9.997021816892023e-05, + "loss": 4.3045, + "step": 886 + }, + { + "epoch": 0.5717969379532635, + "grad_norm": 2.1089378279326634, + "learning_rate": 9.997014855155685e-05, + "loss": 3.8475, + "step": 887 + }, + { + "epoch": 0.5724415793714747, + "grad_norm": 3.2052818913424197, + "learning_rate": 9.997007885294495e-05, + "loss": 3.9782, + "step": 888 + }, + { + "epoch": 0.5730862207896857, + "grad_norm": 3.1954126242740446, + "learning_rate": 9.997000907308464e-05, + "loss": 4.6412, + "step": 889 + }, + { + "epoch": 0.5737308622078968, + "grad_norm": 2.741575353557163, + "learning_rate": 9.996993921197605e-05, + "loss": 4.4093, + "step": 890 + }, + { + "epoch": 0.574375503626108, + "grad_norm": 2.0671963236951307, + "learning_rate": 9.99698692696193e-05, + "loss": 4.0924, + "step": 891 + }, + { + "epoch": 0.5750201450443191, + "grad_norm": 2.7359668422881054, + "learning_rate": 9.996979924601447e-05, + "loss": 4.216, + "step": 892 + }, + { + "epoch": 0.5756647864625302, + "grad_norm": 4.189141706196489, + "learning_rate": 9.996972914116172e-05, + "loss": 4.1836, + "step": 893 + }, + { + "epoch": 0.5763094278807414, + "grad_norm": 3.1354263715915285, + "learning_rate": 9.996965895506114e-05, + "loss": 4.3628, + "step": 894 + }, + { + "epoch": 0.5769540692989524, + "grad_norm": 2.7548276003779235, + "learning_rate": 9.996958868771284e-05, + "loss": 4.4412, + "step": 895 + }, + { + "epoch": 0.5775987107171636, + "grad_norm": 3.134973048172452, + "learning_rate": 9.996951833911695e-05, + "loss": 4.1675, + "step": 896 + }, + { + "epoch": 0.5782433521353747, + "grad_norm": 2.4289591784718425, + "learning_rate": 9.996944790927359e-05, + "loss": 4.3836, + "step": 897 + }, + { + "epoch": 0.5788879935535858, + "grad_norm": 2.5135092500255185, + "learning_rate": 9.996937739818285e-05, + "loss": 4.2138, + "step": 898 + }, + { + "epoch": 0.579532634971797, + "grad_norm": 3.1738141071454287, + "learning_rate": 9.996930680584487e-05, + "loss": 3.982, + "step": 899 + }, + { + "epoch": 0.580177276390008, + "grad_norm": 3.8647236613080405, + "learning_rate": 9.996923613225973e-05, + "loss": 4.0537, + "step": 900 + }, + { + "epoch": 0.580177276390008, + "eval_loss": 4.28272819519043, + "eval_runtime": 2.9379, + "eval_samples_per_second": 34.038, + "eval_steps_per_second": 4.425, + "step": 900 + }, + { + "epoch": 0.5808219178082191, + "grad_norm": 2.908975371850775, + "learning_rate": 9.99691653774276e-05, + "loss": 4.301, + "step": 901 + }, + { + "epoch": 0.5814665592264303, + "grad_norm": 2.909057468528378, + "learning_rate": 9.996909454134858e-05, + "loss": 4.1487, + "step": 902 + }, + { + "epoch": 0.5821112006446414, + "grad_norm": 3.009815663992221, + "learning_rate": 9.996902362402276e-05, + "loss": 4.1153, + "step": 903 + }, + { + "epoch": 0.5827558420628526, + "grad_norm": 2.6022322136314737, + "learning_rate": 9.996895262545029e-05, + "loss": 3.936, + "step": 904 + }, + { + "epoch": 0.5834004834810637, + "grad_norm": 2.7112459154499997, + "learning_rate": 9.996888154563124e-05, + "loss": 4.0914, + "step": 905 + }, + { + "epoch": 0.5840451248992747, + "grad_norm": 2.2512924617580943, + "learning_rate": 9.996881038456578e-05, + "loss": 4.0044, + "step": 906 + }, + { + "epoch": 0.5846897663174859, + "grad_norm": 1.789408128217528, + "learning_rate": 9.996873914225398e-05, + "loss": 4.0087, + "step": 907 + }, + { + "epoch": 0.585334407735697, + "grad_norm": 2.342506439523912, + "learning_rate": 9.9968667818696e-05, + "loss": 3.9866, + "step": 908 + }, + { + "epoch": 0.5859790491539082, + "grad_norm": 2.356290716637038, + "learning_rate": 9.996859641389194e-05, + "loss": 4.2436, + "step": 909 + }, + { + "epoch": 0.5866236905721193, + "grad_norm": 1.515713868586456, + "learning_rate": 9.99685249278419e-05, + "loss": 4.3481, + "step": 910 + }, + { + "epoch": 0.5872683319903304, + "grad_norm": 2.06827140489324, + "learning_rate": 9.996845336054603e-05, + "loss": 4.349, + "step": 911 + }, + { + "epoch": 0.5879129734085415, + "grad_norm": 2.641052059154104, + "learning_rate": 9.99683817120044e-05, + "loss": 4.2166, + "step": 912 + }, + { + "epoch": 0.5885576148267526, + "grad_norm": 2.686601004176138, + "learning_rate": 9.996830998221718e-05, + "loss": 3.7878, + "step": 913 + }, + { + "epoch": 0.5892022562449637, + "grad_norm": 1.9694444350170452, + "learning_rate": 9.996823817118446e-05, + "loss": 4.3429, + "step": 914 + }, + { + "epoch": 0.5898468976631749, + "grad_norm": 2.26949472486655, + "learning_rate": 9.996816627890638e-05, + "loss": 4.2187, + "step": 915 + }, + { + "epoch": 0.590491539081386, + "grad_norm": 3.6400956632076977, + "learning_rate": 9.9968094305383e-05, + "loss": 4.309, + "step": 916 + }, + { + "epoch": 0.5911361804995972, + "grad_norm": 5.107733278346077, + "learning_rate": 9.996802225061451e-05, + "loss": 3.9093, + "step": 917 + }, + { + "epoch": 0.5917808219178082, + "grad_norm": 4.254204049892344, + "learning_rate": 9.996795011460098e-05, + "loss": 4.4134, + "step": 918 + }, + { + "epoch": 0.5924254633360193, + "grad_norm": 1.8052752138014918, + "learning_rate": 9.996787789734255e-05, + "loss": 4.2094, + "step": 919 + }, + { + "epoch": 0.5930701047542305, + "grad_norm": 4.215075656998475, + "learning_rate": 9.996780559883933e-05, + "loss": 3.9853, + "step": 920 + }, + { + "epoch": 0.5937147461724416, + "grad_norm": 3.2886127458954486, + "learning_rate": 9.996773321909145e-05, + "loss": 4.2257, + "step": 921 + }, + { + "epoch": 0.5943593875906527, + "grad_norm": 2.8802879331190634, + "learning_rate": 9.996766075809903e-05, + "loss": 4.0636, + "step": 922 + }, + { + "epoch": 0.5950040290088638, + "grad_norm": 3.207787956977339, + "learning_rate": 9.996758821586218e-05, + "loss": 4.4574, + "step": 923 + }, + { + "epoch": 0.5956486704270749, + "grad_norm": 2.486899236569256, + "learning_rate": 9.9967515592381e-05, + "loss": 3.8737, + "step": 924 + }, + { + "epoch": 0.5962933118452861, + "grad_norm": 4.433519748041602, + "learning_rate": 9.996744288765565e-05, + "loss": 4.3517, + "step": 925 + }, + { + "epoch": 0.5969379532634972, + "grad_norm": 2.063083064064984, + "learning_rate": 9.996737010168622e-05, + "loss": 4.0695, + "step": 926 + }, + { + "epoch": 0.5975825946817083, + "grad_norm": 3.407979470501135, + "learning_rate": 9.996729723447286e-05, + "loss": 4.2283, + "step": 927 + }, + { + "epoch": 0.5982272360999195, + "grad_norm": 4.255222200151769, + "learning_rate": 9.996722428601564e-05, + "loss": 4.0607, + "step": 928 + }, + { + "epoch": 0.5988718775181305, + "grad_norm": 2.5280830044727898, + "learning_rate": 9.996715125631471e-05, + "loss": 4.4163, + "step": 929 + }, + { + "epoch": 0.5995165189363416, + "grad_norm": 2.356337431547483, + "learning_rate": 9.99670781453702e-05, + "loss": 4.425, + "step": 930 + }, + { + "epoch": 0.6001611603545528, + "grad_norm": 2.1081744822549355, + "learning_rate": 9.996700495318219e-05, + "loss": 3.9806, + "step": 931 + }, + { + "epoch": 0.6008058017727639, + "grad_norm": 2.7809219628177413, + "learning_rate": 9.996693167975087e-05, + "loss": 3.9827, + "step": 932 + }, + { + "epoch": 0.6014504431909751, + "grad_norm": 3.5620792867602775, + "learning_rate": 9.99668583250763e-05, + "loss": 4.1012, + "step": 933 + }, + { + "epoch": 0.6020950846091861, + "grad_norm": 4.4900607921844955, + "learning_rate": 9.99667848891586e-05, + "loss": 4.3577, + "step": 934 + }, + { + "epoch": 0.6027397260273972, + "grad_norm": 2.8518223991698273, + "learning_rate": 9.996671137199794e-05, + "loss": 4.2599, + "step": 935 + }, + { + "epoch": 0.6033843674456084, + "grad_norm": 2.6823173613818776, + "learning_rate": 9.996663777359438e-05, + "loss": 4.4395, + "step": 936 + }, + { + "epoch": 0.6040290088638195, + "grad_norm": 2.6028966416868906, + "learning_rate": 9.99665640939481e-05, + "loss": 4.0577, + "step": 937 + }, + { + "epoch": 0.6046736502820306, + "grad_norm": 3.562895147728426, + "learning_rate": 9.996649033305917e-05, + "loss": 4.1902, + "step": 938 + }, + { + "epoch": 0.6053182917002418, + "grad_norm": 3.6459068473998357, + "learning_rate": 9.996641649092774e-05, + "loss": 4.0405, + "step": 939 + }, + { + "epoch": 0.6059629331184528, + "grad_norm": 2.3304947723137426, + "learning_rate": 9.996634256755394e-05, + "loss": 4.2316, + "step": 940 + }, + { + "epoch": 0.606607574536664, + "grad_norm": 2.9517218055028667, + "learning_rate": 9.996626856293786e-05, + "loss": 4.0048, + "step": 941 + }, + { + "epoch": 0.6072522159548751, + "grad_norm": 3.4806060341037726, + "learning_rate": 9.996619447707965e-05, + "loss": 4.4033, + "step": 942 + }, + { + "epoch": 0.6078968573730862, + "grad_norm": 1.824506086382404, + "learning_rate": 9.99661203099794e-05, + "loss": 4.2134, + "step": 943 + }, + { + "epoch": 0.6085414987912974, + "grad_norm": 4.432343842168142, + "learning_rate": 9.996604606163725e-05, + "loss": 4.0937, + "step": 944 + }, + { + "epoch": 0.6091861402095085, + "grad_norm": 4.530741496864704, + "learning_rate": 9.996597173205334e-05, + "loss": 3.9487, + "step": 945 + }, + { + "epoch": 0.6098307816277195, + "grad_norm": 4.64584585337868, + "learning_rate": 9.996589732122776e-05, + "loss": 4.1352, + "step": 946 + }, + { + "epoch": 0.6104754230459307, + "grad_norm": 3.5805957761894645, + "learning_rate": 9.996582282916067e-05, + "loss": 4.1831, + "step": 947 + }, + { + "epoch": 0.6111200644641418, + "grad_norm": 3.123201445792184, + "learning_rate": 9.996574825585214e-05, + "loss": 4.0038, + "step": 948 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 4.554431464196258, + "learning_rate": 9.996567360130235e-05, + "loss": 3.5161, + "step": 949 + }, + { + "epoch": 0.6124093473005641, + "grad_norm": 5.370368004690488, + "learning_rate": 9.996559886551136e-05, + "loss": 4.072, + "step": 950 + }, + { + "epoch": 0.6130539887187751, + "grad_norm": 3.3730099667257756, + "learning_rate": 9.996552404847937e-05, + "loss": 4.3922, + "step": 951 + }, + { + "epoch": 0.6136986301369863, + "grad_norm": 2.6650723464863417, + "learning_rate": 9.996544915020643e-05, + "loss": 4.5479, + "step": 952 + }, + { + "epoch": 0.6143432715551974, + "grad_norm": 2.994817871359553, + "learning_rate": 9.996537417069272e-05, + "loss": 4.3562, + "step": 953 + }, + { + "epoch": 0.6149879129734085, + "grad_norm": 2.5943354526030964, + "learning_rate": 9.996529910993831e-05, + "loss": 4.0781, + "step": 954 + }, + { + "epoch": 0.6156325543916197, + "grad_norm": 3.1654988678065648, + "learning_rate": 9.996522396794335e-05, + "loss": 4.3311, + "step": 955 + }, + { + "epoch": 0.6162771958098308, + "grad_norm": 2.6988196446649906, + "learning_rate": 9.996514874470797e-05, + "loss": 3.9195, + "step": 956 + }, + { + "epoch": 0.616921837228042, + "grad_norm": 2.8005904408436386, + "learning_rate": 9.99650734402323e-05, + "loss": 4.0499, + "step": 957 + }, + { + "epoch": 0.617566478646253, + "grad_norm": 1.9161755461674657, + "learning_rate": 9.996499805451644e-05, + "loss": 4.3542, + "step": 958 + }, + { + "epoch": 0.6182111200644641, + "grad_norm": 2.6219513582966116, + "learning_rate": 9.996492258756051e-05, + "loss": 4.136, + "step": 959 + }, + { + "epoch": 0.6188557614826753, + "grad_norm": 1.9319306229467654, + "learning_rate": 9.996484703936467e-05, + "loss": 4.2736, + "step": 960 + }, + { + "epoch": 0.6195004029008864, + "grad_norm": 2.4277366251635573, + "learning_rate": 9.9964771409929e-05, + "loss": 3.7209, + "step": 961 + }, + { + "epoch": 0.6201450443190974, + "grad_norm": 2.2055390109569393, + "learning_rate": 9.996469569925367e-05, + "loss": 4.1342, + "step": 962 + }, + { + "epoch": 0.6207896857373086, + "grad_norm": 2.981787786492267, + "learning_rate": 9.996461990733877e-05, + "loss": 3.8742, + "step": 963 + }, + { + "epoch": 0.6214343271555197, + "grad_norm": 4.530333466881121, + "learning_rate": 9.996454403418442e-05, + "loss": 4.0847, + "step": 964 + }, + { + "epoch": 0.6220789685737309, + "grad_norm": 2.8458298496253382, + "learning_rate": 9.996446807979078e-05, + "loss": 4.3228, + "step": 965 + }, + { + "epoch": 0.622723609991942, + "grad_norm": 2.5847022822238124, + "learning_rate": 9.996439204415795e-05, + "loss": 3.8485, + "step": 966 + }, + { + "epoch": 0.6233682514101531, + "grad_norm": 2.4277341224777795, + "learning_rate": 9.996431592728606e-05, + "loss": 4.0475, + "step": 967 + }, + { + "epoch": 0.6240128928283643, + "grad_norm": 2.6891341350299958, + "learning_rate": 9.996423972917524e-05, + "loss": 4.0331, + "step": 968 + }, + { + "epoch": 0.6246575342465753, + "grad_norm": 2.4083055875835906, + "learning_rate": 9.99641634498256e-05, + "loss": 4.2967, + "step": 969 + }, + { + "epoch": 0.6253021756647864, + "grad_norm": 2.525453677859455, + "learning_rate": 9.996408708923726e-05, + "loss": 3.9254, + "step": 970 + }, + { + "epoch": 0.6259468170829976, + "grad_norm": 3.019233976080675, + "learning_rate": 9.99640106474104e-05, + "loss": 4.1947, + "step": 971 + }, + { + "epoch": 0.6265914585012087, + "grad_norm": 2.815177087892475, + "learning_rate": 9.996393412434507e-05, + "loss": 4.4786, + "step": 972 + }, + { + "epoch": 0.6272360999194199, + "grad_norm": 2.511110676806012, + "learning_rate": 9.996385752004144e-05, + "loss": 4.0733, + "step": 973 + }, + { + "epoch": 0.6278807413376309, + "grad_norm": 3.386172708333759, + "learning_rate": 9.996378083449964e-05, + "loss": 4.3034, + "step": 974 + }, + { + "epoch": 0.628525382755842, + "grad_norm": 2.6566503396215317, + "learning_rate": 9.996370406771978e-05, + "loss": 4.3659, + "step": 975 + }, + { + "epoch": 0.6291700241740532, + "grad_norm": 2.5746166802204837, + "learning_rate": 9.996362721970199e-05, + "loss": 4.2454, + "step": 976 + }, + { + "epoch": 0.6298146655922643, + "grad_norm": 2.6297610486418272, + "learning_rate": 9.996355029044637e-05, + "loss": 4.0542, + "step": 977 + }, + { + "epoch": 0.6304593070104755, + "grad_norm": 2.4929513319128973, + "learning_rate": 9.99634732799531e-05, + "loss": 4.0828, + "step": 978 + }, + { + "epoch": 0.6311039484286866, + "grad_norm": 2.6792307116347325, + "learning_rate": 9.996339618822228e-05, + "loss": 4.4295, + "step": 979 + }, + { + "epoch": 0.6317485898468976, + "grad_norm": 2.2943259614094575, + "learning_rate": 9.996331901525402e-05, + "loss": 3.8227, + "step": 980 + }, + { + "epoch": 0.6323932312651088, + "grad_norm": 3.941805465109759, + "learning_rate": 9.996324176104847e-05, + "loss": 3.9155, + "step": 981 + }, + { + "epoch": 0.6330378726833199, + "grad_norm": 3.6868103838929045, + "learning_rate": 9.996316442560574e-05, + "loss": 3.8752, + "step": 982 + }, + { + "epoch": 0.633682514101531, + "grad_norm": 3.343111997547363, + "learning_rate": 9.996308700892598e-05, + "loss": 4.0165, + "step": 983 + }, + { + "epoch": 0.6343271555197422, + "grad_norm": 2.8952353801845643, + "learning_rate": 9.99630095110093e-05, + "loss": 4.3125, + "step": 984 + }, + { + "epoch": 0.6349717969379532, + "grad_norm": 2.682931583756805, + "learning_rate": 9.996293193185581e-05, + "loss": 3.7225, + "step": 985 + }, + { + "epoch": 0.6356164383561644, + "grad_norm": 6.579196841874286, + "learning_rate": 9.996285427146566e-05, + "loss": 3.9553, + "step": 986 + }, + { + "epoch": 0.6362610797743755, + "grad_norm": 3.744118564341561, + "learning_rate": 9.9962776529839e-05, + "loss": 4.0896, + "step": 987 + }, + { + "epoch": 0.6369057211925866, + "grad_norm": 2.5447213119668044, + "learning_rate": 9.996269870697592e-05, + "loss": 4.7148, + "step": 988 + }, + { + "epoch": 0.6375503626107978, + "grad_norm": 2.331357999385912, + "learning_rate": 9.996262080287656e-05, + "loss": 4.4261, + "step": 989 + }, + { + "epoch": 0.6381950040290089, + "grad_norm": 2.5908235733999168, + "learning_rate": 9.996254281754104e-05, + "loss": 4.0866, + "step": 990 + }, + { + "epoch": 0.6388396454472199, + "grad_norm": 2.856540856101297, + "learning_rate": 9.99624647509695e-05, + "loss": 4.0996, + "step": 991 + }, + { + "epoch": 0.6394842868654311, + "grad_norm": 2.4116521604870065, + "learning_rate": 9.996238660316208e-05, + "loss": 4.3658, + "step": 992 + }, + { + "epoch": 0.6401289282836422, + "grad_norm": 2.238950212238452, + "learning_rate": 9.996230837411889e-05, + "loss": 4.2676, + "step": 993 + }, + { + "epoch": 0.6407735697018534, + "grad_norm": 2.8004565066309715, + "learning_rate": 9.996223006384004e-05, + "loss": 3.7006, + "step": 994 + }, + { + "epoch": 0.6414182111200645, + "grad_norm": 2.301032294171345, + "learning_rate": 9.99621516723257e-05, + "loss": 3.9852, + "step": 995 + }, + { + "epoch": 0.6420628525382756, + "grad_norm": 2.3716308708696583, + "learning_rate": 9.996207319957596e-05, + "loss": 4.1057, + "step": 996 + }, + { + "epoch": 0.6427074939564867, + "grad_norm": 3.373587968188969, + "learning_rate": 9.996199464559099e-05, + "loss": 4.2617, + "step": 997 + }, + { + "epoch": 0.6433521353746978, + "grad_norm": 2.7494882690550853, + "learning_rate": 9.996191601037088e-05, + "loss": 4.192, + "step": 998 + }, + { + "epoch": 0.6439967767929089, + "grad_norm": 2.470434215201713, + "learning_rate": 9.996183729391579e-05, + "loss": 4.1489, + "step": 999 + }, + { + "epoch": 0.6446414182111201, + "grad_norm": 1.8987018292960405, + "learning_rate": 9.996175849622583e-05, + "loss": 4.5403, + "step": 1000 + }, + { + "epoch": 0.6446414182111201, + "eval_loss": 4.245604991912842, + "eval_runtime": 2.9338, + "eval_samples_per_second": 34.086, + "eval_steps_per_second": 4.431, + "step": 1000 + }, + { + "epoch": 0.6452860596293312, + "grad_norm": 2.5911203410161443, + "learning_rate": 9.996167961730113e-05, + "loss": 4.1823, + "step": 1001 + }, + { + "epoch": 0.6459307010475424, + "grad_norm": 2.5159495477618186, + "learning_rate": 9.996160065714184e-05, + "loss": 4.3221, + "step": 1002 + }, + { + "epoch": 0.6465753424657534, + "grad_norm": 3.3811496979652915, + "learning_rate": 9.996152161574806e-05, + "loss": 4.0506, + "step": 1003 + }, + { + "epoch": 0.6472199838839645, + "grad_norm": 4.957577693668301, + "learning_rate": 9.996144249311993e-05, + "loss": 4.3066, + "step": 1004 + }, + { + "epoch": 0.6478646253021757, + "grad_norm": 3.8655735350095313, + "learning_rate": 9.996136328925759e-05, + "loss": 4.6401, + "step": 1005 + }, + { + "epoch": 0.6485092667203868, + "grad_norm": 2.9251952697804677, + "learning_rate": 9.996128400416116e-05, + "loss": 4.4236, + "step": 1006 + }, + { + "epoch": 0.6491539081385979, + "grad_norm": 2.9138668882158414, + "learning_rate": 9.996120463783077e-05, + "loss": 4.2662, + "step": 1007 + }, + { + "epoch": 0.649798549556809, + "grad_norm": 3.001233419289977, + "learning_rate": 9.996112519026657e-05, + "loss": 4.1329, + "step": 1008 + }, + { + "epoch": 0.6504431909750201, + "grad_norm": 2.550215904259304, + "learning_rate": 9.996104566146867e-05, + "loss": 4.2206, + "step": 1009 + }, + { + "epoch": 0.6510878323932313, + "grad_norm": 2.8699494353390884, + "learning_rate": 9.99609660514372e-05, + "loss": 4.0454, + "step": 1010 + }, + { + "epoch": 0.6517324738114424, + "grad_norm": 1.9651042460535255, + "learning_rate": 9.99608863601723e-05, + "loss": 3.9544, + "step": 1011 + }, + { + "epoch": 0.6523771152296535, + "grad_norm": 2.4303160158250687, + "learning_rate": 9.996080658767408e-05, + "loss": 4.3831, + "step": 1012 + }, + { + "epoch": 0.6530217566478647, + "grad_norm": 1.8031959861436866, + "learning_rate": 9.99607267339427e-05, + "loss": 4.4819, + "step": 1013 + }, + { + "epoch": 0.6536663980660757, + "grad_norm": 2.2185132589177905, + "learning_rate": 9.99606467989783e-05, + "loss": 4.3207, + "step": 1014 + }, + { + "epoch": 0.6543110394842868, + "grad_norm": 3.249924780275078, + "learning_rate": 9.996056678278094e-05, + "loss": 3.9627, + "step": 1015 + }, + { + "epoch": 0.654955680902498, + "grad_norm": 4.518208728097048, + "learning_rate": 9.996048668535085e-05, + "loss": 4.11, + "step": 1016 + }, + { + "epoch": 0.6556003223207091, + "grad_norm": 4.461465056541282, + "learning_rate": 9.996040650668809e-05, + "loss": 4.1086, + "step": 1017 + }, + { + "epoch": 0.6562449637389203, + "grad_norm": 2.1151405632238394, + "learning_rate": 9.996032624679283e-05, + "loss": 4.0381, + "step": 1018 + }, + { + "epoch": 0.6568896051571314, + "grad_norm": 2.7846080405525284, + "learning_rate": 9.996024590566519e-05, + "loss": 3.9667, + "step": 1019 + }, + { + "epoch": 0.6575342465753424, + "grad_norm": 3.423048794015953, + "learning_rate": 9.996016548330528e-05, + "loss": 4.0718, + "step": 1020 + }, + { + "epoch": 0.6581788879935536, + "grad_norm": 2.2937939138887367, + "learning_rate": 9.996008497971326e-05, + "loss": 4.3595, + "step": 1021 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 2.68367999373843, + "learning_rate": 9.996000439488926e-05, + "loss": 4.2672, + "step": 1022 + }, + { + "epoch": 0.6594681708299758, + "grad_norm": 2.2681986377417975, + "learning_rate": 9.99599237288334e-05, + "loss": 4.2968, + "step": 1023 + }, + { + "epoch": 0.660112812248187, + "grad_norm": 1.9570802682990005, + "learning_rate": 9.99598429815458e-05, + "loss": 4.3988, + "step": 1024 + }, + { + "epoch": 0.660757453666398, + "grad_norm": 2.7457895195045863, + "learning_rate": 9.995976215302664e-05, + "loss": 4.1321, + "step": 1025 + }, + { + "epoch": 0.6614020950846092, + "grad_norm": 1.8448695774142905, + "learning_rate": 9.9959681243276e-05, + "loss": 4.731, + "step": 1026 + }, + { + "epoch": 0.6620467365028203, + "grad_norm": 3.074481661002905, + "learning_rate": 9.995960025229406e-05, + "loss": 4.0286, + "step": 1027 + }, + { + "epoch": 0.6626913779210314, + "grad_norm": 4.806998044631118, + "learning_rate": 9.995951918008091e-05, + "loss": 4.388, + "step": 1028 + }, + { + "epoch": 0.6633360193392426, + "grad_norm": 2.614935833546167, + "learning_rate": 9.995943802663671e-05, + "loss": 4.5855, + "step": 1029 + }, + { + "epoch": 0.6639806607574537, + "grad_norm": 2.4769087161718524, + "learning_rate": 9.995935679196157e-05, + "loss": 4.2296, + "step": 1030 + }, + { + "epoch": 0.6646253021756647, + "grad_norm": 3.4067560782886797, + "learning_rate": 9.995927547605567e-05, + "loss": 4.0258, + "step": 1031 + }, + { + "epoch": 0.6652699435938759, + "grad_norm": 2.687760365259961, + "learning_rate": 9.995919407891908e-05, + "loss": 3.9536, + "step": 1032 + }, + { + "epoch": 0.665914585012087, + "grad_norm": 3.0102522408235277, + "learning_rate": 9.995911260055198e-05, + "loss": 4.0683, + "step": 1033 + }, + { + "epoch": 0.6665592264302982, + "grad_norm": 2.0563374556058527, + "learning_rate": 9.99590310409545e-05, + "loss": 4.3881, + "step": 1034 + }, + { + "epoch": 0.6672038678485093, + "grad_norm": 2.7748730507677646, + "learning_rate": 9.995894940012676e-05, + "loss": 4.0939, + "step": 1035 + }, + { + "epoch": 0.6678485092667203, + "grad_norm": 3.2328661570221833, + "learning_rate": 9.995886767806889e-05, + "loss": 4.0172, + "step": 1036 + }, + { + "epoch": 0.6684931506849315, + "grad_norm": 2.3228063851122847, + "learning_rate": 9.995878587478103e-05, + "loss": 3.8729, + "step": 1037 + }, + { + "epoch": 0.6691377921031426, + "grad_norm": 1.9231745968184195, + "learning_rate": 9.995870399026334e-05, + "loss": 4.6828, + "step": 1038 + }, + { + "epoch": 0.6697824335213537, + "grad_norm": 1.452184235751207, + "learning_rate": 9.99586220245159e-05, + "loss": 4.5895, + "step": 1039 + }, + { + "epoch": 0.6704270749395649, + "grad_norm": 2.180214955313519, + "learning_rate": 9.995853997753891e-05, + "loss": 3.8443, + "step": 1040 + }, + { + "epoch": 0.671071716357776, + "grad_norm": 2.430379478859441, + "learning_rate": 9.995845784933244e-05, + "loss": 4.1963, + "step": 1041 + }, + { + "epoch": 0.6717163577759871, + "grad_norm": 2.3410046717911994, + "learning_rate": 9.995837563989668e-05, + "loss": 4.1029, + "step": 1042 + }, + { + "epoch": 0.6723609991941982, + "grad_norm": 2.151375232615351, + "learning_rate": 9.995829334923173e-05, + "loss": 3.996, + "step": 1043 + }, + { + "epoch": 0.6730056406124093, + "grad_norm": 1.6117839477766145, + "learning_rate": 9.995821097733773e-05, + "loss": 3.9865, + "step": 1044 + }, + { + "epoch": 0.6736502820306205, + "grad_norm": 2.430738940140261, + "learning_rate": 9.995812852421482e-05, + "loss": 4.319, + "step": 1045 + }, + { + "epoch": 0.6742949234488316, + "grad_norm": 3.4380335897167296, + "learning_rate": 9.995804598986313e-05, + "loss": 4.1714, + "step": 1046 + }, + { + "epoch": 0.6749395648670428, + "grad_norm": 4.553727562830632, + "learning_rate": 9.995796337428282e-05, + "loss": 4.0662, + "step": 1047 + }, + { + "epoch": 0.6755842062852538, + "grad_norm": 5.51235582540501, + "learning_rate": 9.9957880677474e-05, + "loss": 3.9968, + "step": 1048 + }, + { + "epoch": 0.6762288477034649, + "grad_norm": 3.8667314082537025, + "learning_rate": 9.995779789943682e-05, + "loss": 4.3406, + "step": 1049 + }, + { + "epoch": 0.6768734891216761, + "grad_norm": 2.319830598796395, + "learning_rate": 9.995771504017139e-05, + "loss": 4.3448, + "step": 1050 + }, + { + "epoch": 0.6775181305398872, + "grad_norm": 2.99248077952314, + "learning_rate": 9.995763209967789e-05, + "loss": 4.3491, + "step": 1051 + }, + { + "epoch": 0.6781627719580983, + "grad_norm": 2.480856811319744, + "learning_rate": 9.995754907795643e-05, + "loss": 4.0792, + "step": 1052 + }, + { + "epoch": 0.6788074133763095, + "grad_norm": 4.04665824035188, + "learning_rate": 9.995746597500713e-05, + "loss": 4.1625, + "step": 1053 + }, + { + "epoch": 0.6794520547945205, + "grad_norm": 4.130836384503813, + "learning_rate": 9.995738279083017e-05, + "loss": 4.051, + "step": 1054 + }, + { + "epoch": 0.6800966962127317, + "grad_norm": 2.6672956638252594, + "learning_rate": 9.995729952542564e-05, + "loss": 4.7044, + "step": 1055 + }, + { + "epoch": 0.6807413376309428, + "grad_norm": 4.203302379263184, + "learning_rate": 9.99572161787937e-05, + "loss": 4.4607, + "step": 1056 + }, + { + "epoch": 0.6813859790491539, + "grad_norm": 2.8304185503922143, + "learning_rate": 9.995713275093449e-05, + "loss": 4.1736, + "step": 1057 + }, + { + "epoch": 0.6820306204673651, + "grad_norm": 3.4852039686058265, + "learning_rate": 9.995704924184814e-05, + "loss": 4.1559, + "step": 1058 + }, + { + "epoch": 0.6826752618855761, + "grad_norm": 2.303074599671627, + "learning_rate": 9.99569656515348e-05, + "loss": 4.1933, + "step": 1059 + }, + { + "epoch": 0.6833199033037872, + "grad_norm": 3.083065803885712, + "learning_rate": 9.995688197999456e-05, + "loss": 4.1095, + "step": 1060 + }, + { + "epoch": 0.6839645447219984, + "grad_norm": 2.089266272835482, + "learning_rate": 9.995679822722763e-05, + "loss": 4.3666, + "step": 1061 + }, + { + "epoch": 0.6846091861402095, + "grad_norm": 2.8729394836478384, + "learning_rate": 9.995671439323411e-05, + "loss": 4.1399, + "step": 1062 + }, + { + "epoch": 0.6852538275584207, + "grad_norm": 2.7079621125525977, + "learning_rate": 9.995663047801412e-05, + "loss": 4.3535, + "step": 1063 + }, + { + "epoch": 0.6858984689766318, + "grad_norm": 2.3127781797667684, + "learning_rate": 9.995654648156782e-05, + "loss": 4.1489, + "step": 1064 + }, + { + "epoch": 0.6865431103948428, + "grad_norm": 3.352229270128944, + "learning_rate": 9.995646240389536e-05, + "loss": 3.9175, + "step": 1065 + }, + { + "epoch": 0.687187751813054, + "grad_norm": 2.725887057443685, + "learning_rate": 9.995637824499685e-05, + "loss": 3.8997, + "step": 1066 + }, + { + "epoch": 0.6878323932312651, + "grad_norm": 2.1814540832339024, + "learning_rate": 9.995629400487244e-05, + "loss": 4.0242, + "step": 1067 + }, + { + "epoch": 0.6884770346494762, + "grad_norm": 2.6896938781943427, + "learning_rate": 9.995620968352228e-05, + "loss": 4.1053, + "step": 1068 + }, + { + "epoch": 0.6891216760676874, + "grad_norm": 2.8218345741893542, + "learning_rate": 9.99561252809465e-05, + "loss": 4.1694, + "step": 1069 + }, + { + "epoch": 0.6897663174858985, + "grad_norm": 3.0272826395270314, + "learning_rate": 9.995604079714522e-05, + "loss": 3.8502, + "step": 1070 + }, + { + "epoch": 0.6904109589041096, + "grad_norm": 2.804027470833558, + "learning_rate": 9.995595623211861e-05, + "loss": 3.9789, + "step": 1071 + }, + { + "epoch": 0.6910556003223207, + "grad_norm": 1.7566658198700273, + "learning_rate": 9.995587158586678e-05, + "loss": 4.2686, + "step": 1072 + }, + { + "epoch": 0.6917002417405318, + "grad_norm": 2.309104125319435, + "learning_rate": 9.995578685838989e-05, + "loss": 3.8987, + "step": 1073 + }, + { + "epoch": 0.692344883158743, + "grad_norm": 2.142774099991435, + "learning_rate": 9.995570204968807e-05, + "loss": 4.3066, + "step": 1074 + }, + { + "epoch": 0.6929895245769541, + "grad_norm": 2.5419150963942734, + "learning_rate": 9.995561715976149e-05, + "loss": 4.1133, + "step": 1075 + }, + { + "epoch": 0.6936341659951651, + "grad_norm": 2.0059264508830603, + "learning_rate": 9.995553218861023e-05, + "loss": 4.2777, + "step": 1076 + }, + { + "epoch": 0.6942788074133763, + "grad_norm": 2.4977799151223845, + "learning_rate": 9.995544713623447e-05, + "loss": 4.3091, + "step": 1077 + }, + { + "epoch": 0.6949234488315874, + "grad_norm": 2.774857131420172, + "learning_rate": 9.995536200263432e-05, + "loss": 4.0982, + "step": 1078 + }, + { + "epoch": 0.6955680902497986, + "grad_norm": 2.1945167040655553, + "learning_rate": 9.995527678780996e-05, + "loss": 3.9417, + "step": 1079 + }, + { + "epoch": 0.6962127316680097, + "grad_norm": 2.460544912920611, + "learning_rate": 9.995519149176151e-05, + "loss": 4.2288, + "step": 1080 + }, + { + "epoch": 0.6968573730862208, + "grad_norm": 2.98963572105773, + "learning_rate": 9.99551061144891e-05, + "loss": 4.2584, + "step": 1081 + }, + { + "epoch": 0.6975020145044319, + "grad_norm": 3.078691083405916, + "learning_rate": 9.99550206559929e-05, + "loss": 3.9706, + "step": 1082 + }, + { + "epoch": 0.698146655922643, + "grad_norm": 2.8571141805767675, + "learning_rate": 9.9954935116273e-05, + "loss": 4.1487, + "step": 1083 + }, + { + "epoch": 0.6987912973408541, + "grad_norm": 1.9192087616494442, + "learning_rate": 9.99548494953296e-05, + "loss": 4.0448, + "step": 1084 + }, + { + "epoch": 0.6994359387590653, + "grad_norm": 1.536496871099247, + "learning_rate": 9.99547637931628e-05, + "loss": 4.5561, + "step": 1085 + }, + { + "epoch": 0.7000805801772764, + "grad_norm": 2.3179253656678487, + "learning_rate": 9.995467800977275e-05, + "loss": 4.2899, + "step": 1086 + }, + { + "epoch": 0.7007252215954876, + "grad_norm": 2.4290601984536107, + "learning_rate": 9.995459214515959e-05, + "loss": 4.2545, + "step": 1087 + }, + { + "epoch": 0.7013698630136986, + "grad_norm": 2.0879232335197955, + "learning_rate": 9.995450619932348e-05, + "loss": 4.0834, + "step": 1088 + }, + { + "epoch": 0.7020145044319097, + "grad_norm": 2.1174327451405923, + "learning_rate": 9.995442017226453e-05, + "loss": 3.9747, + "step": 1089 + }, + { + "epoch": 0.7026591458501209, + "grad_norm": 2.1103966055467116, + "learning_rate": 9.995433406398291e-05, + "loss": 4.0536, + "step": 1090 + }, + { + "epoch": 0.703303787268332, + "grad_norm": 2.419059965174679, + "learning_rate": 9.995424787447875e-05, + "loss": 4.1703, + "step": 1091 + }, + { + "epoch": 0.7039484286865431, + "grad_norm": 2.624660625709817, + "learning_rate": 9.995416160375219e-05, + "loss": 4.1081, + "step": 1092 + }, + { + "epoch": 0.7045930701047542, + "grad_norm": 2.2476378098533116, + "learning_rate": 9.995407525180336e-05, + "loss": 3.9803, + "step": 1093 + }, + { + "epoch": 0.7052377115229653, + "grad_norm": 1.837020641908962, + "learning_rate": 9.995398881863243e-05, + "loss": 4.3822, + "step": 1094 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.0352957284013318, + "learning_rate": 9.99539023042395e-05, + "loss": 4.1062, + "step": 1095 + }, + { + "epoch": 0.7065269943593876, + "grad_norm": 2.6186952868765148, + "learning_rate": 9.995381570862475e-05, + "loss": 4.168, + "step": 1096 + }, + { + "epoch": 0.7071716357775987, + "grad_norm": 2.5924045333517207, + "learning_rate": 9.995372903178832e-05, + "loss": 4.0626, + "step": 1097 + }, + { + "epoch": 0.7078162771958099, + "grad_norm": 2.647172460734636, + "learning_rate": 9.995364227373032e-05, + "loss": 4.3794, + "step": 1098 + }, + { + "epoch": 0.7084609186140209, + "grad_norm": 2.105611892739831, + "learning_rate": 9.995355543445094e-05, + "loss": 4.054, + "step": 1099 + }, + { + "epoch": 0.709105560032232, + "grad_norm": 4.191575702341112, + "learning_rate": 9.995346851395026e-05, + "loss": 3.8873, + "step": 1100 + }, + { + "epoch": 0.709105560032232, + "eval_loss": 4.244497299194336, + "eval_runtime": 2.9389, + "eval_samples_per_second": 34.026, + "eval_steps_per_second": 4.423, + "step": 1100 + }, + { + "epoch": 0.7097502014504432, + "grad_norm": 6.1025979025250505, + "learning_rate": 9.99533815122285e-05, + "loss": 4.0967, + "step": 1101 + }, + { + "epoch": 0.7103948428686543, + "grad_norm": 5.039081920383538, + "learning_rate": 9.995329442928574e-05, + "loss": 3.866, + "step": 1102 + }, + { + "epoch": 0.7110394842868655, + "grad_norm": 4.090046164927685, + "learning_rate": 9.995320726512216e-05, + "loss": 4.0025, + "step": 1103 + }, + { + "epoch": 0.7116841257050766, + "grad_norm": 3.7580016546959563, + "learning_rate": 9.995312001973788e-05, + "loss": 3.9014, + "step": 1104 + }, + { + "epoch": 0.7123287671232876, + "grad_norm": 2.254099254515131, + "learning_rate": 9.995303269313307e-05, + "loss": 4.2213, + "step": 1105 + }, + { + "epoch": 0.7129734085414988, + "grad_norm": 3.8858339718127493, + "learning_rate": 9.995294528530781e-05, + "loss": 4.1039, + "step": 1106 + }, + { + "epoch": 0.7136180499597099, + "grad_norm": 5.0596739247127775, + "learning_rate": 9.995285779626235e-05, + "loss": 4.3029, + "step": 1107 + }, + { + "epoch": 0.714262691377921, + "grad_norm": 2.3964755237409654, + "learning_rate": 9.995277022599673e-05, + "loss": 4.1199, + "step": 1108 + }, + { + "epoch": 0.7149073327961322, + "grad_norm": 4.072595872004978, + "learning_rate": 9.995268257451115e-05, + "loss": 3.8906, + "step": 1109 + }, + { + "epoch": 0.7155519742143432, + "grad_norm": 3.1685474769309714, + "learning_rate": 9.995259484180573e-05, + "loss": 4.0894, + "step": 1110 + }, + { + "epoch": 0.7161966156325544, + "grad_norm": 2.982259301242794, + "learning_rate": 9.995250702788064e-05, + "loss": 4.1294, + "step": 1111 + }, + { + "epoch": 0.7168412570507655, + "grad_norm": 3.8663908794458135, + "learning_rate": 9.995241913273601e-05, + "loss": 4.1428, + "step": 1112 + }, + { + "epoch": 0.7174858984689766, + "grad_norm": 3.221012466850802, + "learning_rate": 9.995233115637199e-05, + "loss": 4.3181, + "step": 1113 + }, + { + "epoch": 0.7181305398871878, + "grad_norm": 2.3847672544904097, + "learning_rate": 9.995224309878868e-05, + "loss": 4.5703, + "step": 1114 + }, + { + "epoch": 0.7187751813053989, + "grad_norm": 2.7018712322462974, + "learning_rate": 9.99521549599863e-05, + "loss": 4.1921, + "step": 1115 + }, + { + "epoch": 0.71941982272361, + "grad_norm": 3.0260070378312327, + "learning_rate": 9.995206673996494e-05, + "loss": 4.2068, + "step": 1116 + }, + { + "epoch": 0.7200644641418211, + "grad_norm": 2.198074519294714, + "learning_rate": 9.995197843872477e-05, + "loss": 3.7888, + "step": 1117 + }, + { + "epoch": 0.7207091055600322, + "grad_norm": 2.279606412806249, + "learning_rate": 9.995189005626592e-05, + "loss": 4.2975, + "step": 1118 + }, + { + "epoch": 0.7213537469782434, + "grad_norm": 1.9486843835226426, + "learning_rate": 9.995180159258854e-05, + "loss": 4.4797, + "step": 1119 + }, + { + "epoch": 0.7219983883964545, + "grad_norm": 1.9984029086183022, + "learning_rate": 9.995171304769278e-05, + "loss": 4.4093, + "step": 1120 + }, + { + "epoch": 0.7226430298146655, + "grad_norm": 2.6405120646029503, + "learning_rate": 9.99516244215788e-05, + "loss": 3.8818, + "step": 1121 + }, + { + "epoch": 0.7232876712328767, + "grad_norm": 2.85031724687732, + "learning_rate": 9.995153571424672e-05, + "loss": 4.2861, + "step": 1122 + }, + { + "epoch": 0.7239323126510878, + "grad_norm": 1.8273397965930247, + "learning_rate": 9.995144692569668e-05, + "loss": 4.3016, + "step": 1123 + }, + { + "epoch": 0.724576954069299, + "grad_norm": 2.543527005227009, + "learning_rate": 9.995135805592885e-05, + "loss": 4.5646, + "step": 1124 + }, + { + "epoch": 0.7252215954875101, + "grad_norm": 2.2339711167243075, + "learning_rate": 9.995126910494337e-05, + "loss": 4.2526, + "step": 1125 + }, + { + "epoch": 0.7258662369057212, + "grad_norm": 1.770460607032549, + "learning_rate": 9.995118007274038e-05, + "loss": 4.1059, + "step": 1126 + }, + { + "epoch": 0.7265108783239324, + "grad_norm": 2.2389469857198603, + "learning_rate": 9.995109095932002e-05, + "loss": 4.4061, + "step": 1127 + }, + { + "epoch": 0.7271555197421434, + "grad_norm": 1.6119978690643821, + "learning_rate": 9.995100176468245e-05, + "loss": 4.0244, + "step": 1128 + }, + { + "epoch": 0.7278001611603545, + "grad_norm": 2.1154071391496134, + "learning_rate": 9.995091248882781e-05, + "loss": 4.1248, + "step": 1129 + }, + { + "epoch": 0.7284448025785657, + "grad_norm": 1.6453676103965778, + "learning_rate": 9.995082313175625e-05, + "loss": 4.0771, + "step": 1130 + }, + { + "epoch": 0.7290894439967768, + "grad_norm": 1.8537274077606691, + "learning_rate": 9.995073369346791e-05, + "loss": 4.2524, + "step": 1131 + }, + { + "epoch": 0.729734085414988, + "grad_norm": 1.735330839930639, + "learning_rate": 9.995064417396295e-05, + "loss": 4.2917, + "step": 1132 + }, + { + "epoch": 0.730378726833199, + "grad_norm": 1.5402736305702924, + "learning_rate": 9.99505545732415e-05, + "loss": 4.2546, + "step": 1133 + }, + { + "epoch": 0.7310233682514101, + "grad_norm": 1.7950080158493316, + "learning_rate": 9.995046489130373e-05, + "loss": 3.7005, + "step": 1134 + }, + { + "epoch": 0.7316680096696213, + "grad_norm": 1.6363939102738714, + "learning_rate": 9.995037512814977e-05, + "loss": 4.1963, + "step": 1135 + }, + { + "epoch": 0.7323126510878324, + "grad_norm": 2.2968129283607115, + "learning_rate": 9.995028528377977e-05, + "loss": 4.0252, + "step": 1136 + }, + { + "epoch": 0.7329572925060435, + "grad_norm": 3.1559933774973903, + "learning_rate": 9.995019535819386e-05, + "loss": 4.0717, + "step": 1137 + }, + { + "epoch": 0.7336019339242547, + "grad_norm": 2.5970195897801385, + "learning_rate": 9.995010535139223e-05, + "loss": 4.0368, + "step": 1138 + }, + { + "epoch": 0.7342465753424657, + "grad_norm": 2.87049146400981, + "learning_rate": 9.995001526337499e-05, + "loss": 4.0962, + "step": 1139 + }, + { + "epoch": 0.7348912167606769, + "grad_norm": 4.149669428972564, + "learning_rate": 9.994992509414231e-05, + "loss": 3.6944, + "step": 1140 + }, + { + "epoch": 0.735535858178888, + "grad_norm": 3.807618420851895, + "learning_rate": 9.994983484369431e-05, + "loss": 4.498, + "step": 1141 + }, + { + "epoch": 0.7361804995970991, + "grad_norm": 2.5984420173976757, + "learning_rate": 9.994974451203117e-05, + "loss": 4.1189, + "step": 1142 + }, + { + "epoch": 0.7368251410153103, + "grad_norm": 3.664950893254266, + "learning_rate": 9.994965409915303e-05, + "loss": 4.0792, + "step": 1143 + }, + { + "epoch": 0.7374697824335213, + "grad_norm": 3.9650374662725705, + "learning_rate": 9.994956360506003e-05, + "loss": 4.2802, + "step": 1144 + }, + { + "epoch": 0.7381144238517324, + "grad_norm": 2.709852993314551, + "learning_rate": 9.994947302975233e-05, + "loss": 4.2399, + "step": 1145 + }, + { + "epoch": 0.7387590652699436, + "grad_norm": 2.671830795533795, + "learning_rate": 9.994938237323004e-05, + "loss": 4.0171, + "step": 1146 + }, + { + "epoch": 0.7394037066881547, + "grad_norm": 4.23579879108681, + "learning_rate": 9.994929163549338e-05, + "loss": 4.0517, + "step": 1147 + }, + { + "epoch": 0.7400483481063659, + "grad_norm": 3.0238184830482866, + "learning_rate": 9.994920081654246e-05, + "loss": 3.9979, + "step": 1148 + }, + { + "epoch": 0.740692989524577, + "grad_norm": 2.4557554964439623, + "learning_rate": 9.99491099163774e-05, + "loss": 3.9405, + "step": 1149 + }, + { + "epoch": 0.741337630942788, + "grad_norm": 2.693442102152216, + "learning_rate": 9.99490189349984e-05, + "loss": 4.3266, + "step": 1150 + }, + { + "epoch": 0.7419822723609992, + "grad_norm": 3.1927369249839517, + "learning_rate": 9.994892787240559e-05, + "loss": 4.0661, + "step": 1151 + }, + { + "epoch": 0.7426269137792103, + "grad_norm": 4.256968942997105, + "learning_rate": 9.99488367285991e-05, + "loss": 4.0454, + "step": 1152 + }, + { + "epoch": 0.7432715551974214, + "grad_norm": 3.162418696465399, + "learning_rate": 9.99487455035791e-05, + "loss": 4.3496, + "step": 1153 + }, + { + "epoch": 0.7439161966156326, + "grad_norm": 3.1493587091465267, + "learning_rate": 9.994865419734576e-05, + "loss": 4.0364, + "step": 1154 + }, + { + "epoch": 0.7445608380338437, + "grad_norm": 4.644933758066119, + "learning_rate": 9.994856280989919e-05, + "loss": 4.2744, + "step": 1155 + }, + { + "epoch": 0.7452054794520548, + "grad_norm": 3.5177976601983945, + "learning_rate": 9.994847134123954e-05, + "loss": 4.3153, + "step": 1156 + }, + { + "epoch": 0.7458501208702659, + "grad_norm": 2.2901420929428573, + "learning_rate": 9.9948379791367e-05, + "loss": 4.061, + "step": 1157 + }, + { + "epoch": 0.746494762288477, + "grad_norm": 3.095399805744926, + "learning_rate": 9.99482881602817e-05, + "loss": 3.807, + "step": 1158 + }, + { + "epoch": 0.7471394037066882, + "grad_norm": 2.5291845702710765, + "learning_rate": 9.994819644798377e-05, + "loss": 3.8435, + "step": 1159 + }, + { + "epoch": 0.7477840451248993, + "grad_norm": 2.158924627534014, + "learning_rate": 9.99481046544734e-05, + "loss": 4.0129, + "step": 1160 + }, + { + "epoch": 0.7484286865431103, + "grad_norm": 2.547285483885909, + "learning_rate": 9.99480127797507e-05, + "loss": 4.22, + "step": 1161 + }, + { + "epoch": 0.7490733279613215, + "grad_norm": 2.8730233331394164, + "learning_rate": 9.994792082381585e-05, + "loss": 4.1698, + "step": 1162 + }, + { + "epoch": 0.7497179693795326, + "grad_norm": 2.9704581867950286, + "learning_rate": 9.9947828786669e-05, + "loss": 4.0149, + "step": 1163 + }, + { + "epoch": 0.7503626107977438, + "grad_norm": 2.2102248400700937, + "learning_rate": 9.994773666831027e-05, + "loss": 3.8832, + "step": 1164 + }, + { + "epoch": 0.7510072522159549, + "grad_norm": 2.5907866965848787, + "learning_rate": 9.994764446873986e-05, + "loss": 3.7024, + "step": 1165 + }, + { + "epoch": 0.751651893634166, + "grad_norm": 3.3959191896421306, + "learning_rate": 9.994755218795786e-05, + "loss": 4.134, + "step": 1166 + }, + { + "epoch": 0.7522965350523771, + "grad_norm": 3.4196188820051456, + "learning_rate": 9.994745982596448e-05, + "loss": 3.8101, + "step": 1167 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 3.101115789454838, + "learning_rate": 9.994736738275986e-05, + "loss": 3.8564, + "step": 1168 + }, + { + "epoch": 0.7535858178887993, + "grad_norm": 2.1420826933871906, + "learning_rate": 9.994727485834412e-05, + "loss": 4.1728, + "step": 1169 + }, + { + "epoch": 0.7542304593070105, + "grad_norm": 2.0382889103822652, + "learning_rate": 9.994718225271744e-05, + "loss": 4.3544, + "step": 1170 + }, + { + "epoch": 0.7548751007252216, + "grad_norm": 2.4060716656851624, + "learning_rate": 9.994708956587996e-05, + "loss": 3.7915, + "step": 1171 + }, + { + "epoch": 0.7555197421434328, + "grad_norm": 2.844602718351255, + "learning_rate": 9.994699679783185e-05, + "loss": 3.6794, + "step": 1172 + }, + { + "epoch": 0.7561643835616438, + "grad_norm": 2.1609209998869954, + "learning_rate": 9.994690394857323e-05, + "loss": 3.9725, + "step": 1173 + }, + { + "epoch": 0.7568090249798549, + "grad_norm": 1.952409704200252, + "learning_rate": 9.994681101810429e-05, + "loss": 4.245, + "step": 1174 + }, + { + "epoch": 0.7574536663980661, + "grad_norm": 1.887453221697848, + "learning_rate": 9.994671800642514e-05, + "loss": 4.3488, + "step": 1175 + }, + { + "epoch": 0.7580983078162772, + "grad_norm": 1.763899117096623, + "learning_rate": 9.994662491353598e-05, + "loss": 4.0276, + "step": 1176 + }, + { + "epoch": 0.7587429492344883, + "grad_norm": 2.022411906629917, + "learning_rate": 9.994653173943693e-05, + "loss": 4.3518, + "step": 1177 + }, + { + "epoch": 0.7593875906526995, + "grad_norm": 2.559341723726075, + "learning_rate": 9.994643848412815e-05, + "loss": 3.8671, + "step": 1178 + }, + { + "epoch": 0.7600322320709105, + "grad_norm": 2.6355034497061793, + "learning_rate": 9.994634514760979e-05, + "loss": 4.0366, + "step": 1179 + }, + { + "epoch": 0.7606768734891217, + "grad_norm": 1.5956935318063472, + "learning_rate": 9.994625172988204e-05, + "loss": 4.1217, + "step": 1180 + }, + { + "epoch": 0.7613215149073328, + "grad_norm": 2.1404286483835344, + "learning_rate": 9.994615823094497e-05, + "loss": 4.3343, + "step": 1181 + }, + { + "epoch": 0.7619661563255439, + "grad_norm": 2.4450186693437046, + "learning_rate": 9.994606465079883e-05, + "loss": 4.47, + "step": 1182 + }, + { + "epoch": 0.7626107977437551, + "grad_norm": 2.0119877411394738, + "learning_rate": 9.994597098944372e-05, + "loss": 4.0673, + "step": 1183 + }, + { + "epoch": 0.7632554391619661, + "grad_norm": 2.557980282554674, + "learning_rate": 9.99458772468798e-05, + "loss": 4.0762, + "step": 1184 + }, + { + "epoch": 0.7639000805801772, + "grad_norm": 3.289378986555992, + "learning_rate": 9.994578342310724e-05, + "loss": 4.0082, + "step": 1185 + }, + { + "epoch": 0.7645447219983884, + "grad_norm": 2.801886012241697, + "learning_rate": 9.994568951812617e-05, + "loss": 4.2677, + "step": 1186 + }, + { + "epoch": 0.7651893634165995, + "grad_norm": 2.286741461554111, + "learning_rate": 9.994559553193676e-05, + "loss": 3.906, + "step": 1187 + }, + { + "epoch": 0.7658340048348107, + "grad_norm": 2.574092790141579, + "learning_rate": 9.994550146453914e-05, + "loss": 3.9077, + "step": 1188 + }, + { + "epoch": 0.7664786462530218, + "grad_norm": 2.9444331344519235, + "learning_rate": 9.99454073159335e-05, + "loss": 3.9036, + "step": 1189 + }, + { + "epoch": 0.7671232876712328, + "grad_norm": 1.8676671327834409, + "learning_rate": 9.994531308611998e-05, + "loss": 4.3876, + "step": 1190 + }, + { + "epoch": 0.767767929089444, + "grad_norm": 2.855428578319293, + "learning_rate": 9.994521877509875e-05, + "loss": 4.1585, + "step": 1191 + }, + { + "epoch": 0.7684125705076551, + "grad_norm": 3.072950212739594, + "learning_rate": 9.994512438286991e-05, + "loss": 4.1489, + "step": 1192 + }, + { + "epoch": 0.7690572119258663, + "grad_norm": 1.6979085453576435, + "learning_rate": 9.994502990943367e-05, + "loss": 4.263, + "step": 1193 + }, + { + "epoch": 0.7697018533440774, + "grad_norm": 3.3288214639756166, + "learning_rate": 9.99449353547902e-05, + "loss": 4.1095, + "step": 1194 + }, + { + "epoch": 0.7703464947622884, + "grad_norm": 2.4631211328233187, + "learning_rate": 9.994484071893959e-05, + "loss": 4.2566, + "step": 1195 + }, + { + "epoch": 0.7709911361804996, + "grad_norm": 2.520282095293977, + "learning_rate": 9.994474600188204e-05, + "loss": 4.3165, + "step": 1196 + }, + { + "epoch": 0.7716357775987107, + "grad_norm": 2.112999601007035, + "learning_rate": 9.994465120361771e-05, + "loss": 4.216, + "step": 1197 + }, + { + "epoch": 0.7722804190169218, + "grad_norm": 3.153879409681399, + "learning_rate": 9.994455632414672e-05, + "loss": 3.7841, + "step": 1198 + }, + { + "epoch": 0.772925060435133, + "grad_norm": 3.479536255854951, + "learning_rate": 9.994446136346926e-05, + "loss": 4.2363, + "step": 1199 + }, + { + "epoch": 0.7735697018533441, + "grad_norm": 1.882485961336376, + "learning_rate": 9.994436632158547e-05, + "loss": 3.96, + "step": 1200 + }, + { + "epoch": 0.7735697018533441, + "eval_loss": 4.2120819091796875, + "eval_runtime": 2.9666, + "eval_samples_per_second": 33.709, + "eval_steps_per_second": 4.382, + "step": 1200 + }, + { + "epoch": 0.7742143432715552, + "grad_norm": 2.3165118638394033, + "learning_rate": 9.994427119849551e-05, + "loss": 3.9916, + "step": 1201 + }, + { + "epoch": 0.7748589846897663, + "grad_norm": 2.281663992212712, + "learning_rate": 9.994417599419952e-05, + "loss": 4.2601, + "step": 1202 + }, + { + "epoch": 0.7755036261079774, + "grad_norm": 2.167202552025709, + "learning_rate": 9.994408070869769e-05, + "loss": 4.1979, + "step": 1203 + }, + { + "epoch": 0.7761482675261886, + "grad_norm": 1.96939967545405, + "learning_rate": 9.994398534199016e-05, + "loss": 4.188, + "step": 1204 + }, + { + "epoch": 0.7767929089443997, + "grad_norm": 2.232947262722584, + "learning_rate": 9.994388989407708e-05, + "loss": 4.4629, + "step": 1205 + }, + { + "epoch": 0.7774375503626108, + "grad_norm": 2.145410494468569, + "learning_rate": 9.994379436495859e-05, + "loss": 3.7653, + "step": 1206 + }, + { + "epoch": 0.7780821917808219, + "grad_norm": 1.9242609102298314, + "learning_rate": 9.99436987546349e-05, + "loss": 4.5287, + "step": 1207 + }, + { + "epoch": 0.778726833199033, + "grad_norm": 2.208468747249909, + "learning_rate": 9.994360306310611e-05, + "loss": 4.3114, + "step": 1208 + }, + { + "epoch": 0.7793714746172442, + "grad_norm": 1.5457335636856957, + "learning_rate": 9.994350729037243e-05, + "loss": 4.3736, + "step": 1209 + }, + { + "epoch": 0.7800161160354553, + "grad_norm": 2.5752470085866066, + "learning_rate": 9.994341143643397e-05, + "loss": 4.0088, + "step": 1210 + }, + { + "epoch": 0.7806607574536664, + "grad_norm": 3.047428881100571, + "learning_rate": 9.994331550129092e-05, + "loss": 3.7437, + "step": 1211 + }, + { + "epoch": 0.7813053988718776, + "grad_norm": 4.228750537390787, + "learning_rate": 9.994321948494342e-05, + "loss": 3.8797, + "step": 1212 + }, + { + "epoch": 0.7819500402900886, + "grad_norm": 3.204751200872402, + "learning_rate": 9.994312338739163e-05, + "loss": 4.6635, + "step": 1213 + }, + { + "epoch": 0.7825946817082997, + "grad_norm": 1.998839645056302, + "learning_rate": 9.99430272086357e-05, + "loss": 4.4017, + "step": 1214 + }, + { + "epoch": 0.7832393231265109, + "grad_norm": 3.91670353166577, + "learning_rate": 9.994293094867581e-05, + "loss": 4.0409, + "step": 1215 + }, + { + "epoch": 0.783883964544722, + "grad_norm": 2.2023409903767073, + "learning_rate": 9.994283460751212e-05, + "loss": 4.0959, + "step": 1216 + }, + { + "epoch": 0.7845286059629332, + "grad_norm": 2.572028799939178, + "learning_rate": 9.994273818514475e-05, + "loss": 3.9645, + "step": 1217 + }, + { + "epoch": 0.7851732473811442, + "grad_norm": 3.571739524419931, + "learning_rate": 9.99426416815739e-05, + "loss": 3.9, + "step": 1218 + }, + { + "epoch": 0.7858178887993553, + "grad_norm": 2.5506102734565266, + "learning_rate": 9.99425450967997e-05, + "loss": 3.9381, + "step": 1219 + }, + { + "epoch": 0.7864625302175665, + "grad_norm": 2.709382386965575, + "learning_rate": 9.994244843082233e-05, + "loss": 4.138, + "step": 1220 + }, + { + "epoch": 0.7871071716357776, + "grad_norm": 3.0944776608014863, + "learning_rate": 9.994235168364194e-05, + "loss": 4.2828, + "step": 1221 + }, + { + "epoch": 0.7877518130539887, + "grad_norm": 2.6908152993385057, + "learning_rate": 9.994225485525868e-05, + "loss": 4.0754, + "step": 1222 + }, + { + "epoch": 0.7883964544721999, + "grad_norm": 3.197311154590536, + "learning_rate": 9.994215794567271e-05, + "loss": 4.4399, + "step": 1223 + }, + { + "epoch": 0.7890410958904109, + "grad_norm": 2.648642115319857, + "learning_rate": 9.99420609548842e-05, + "loss": 4.2137, + "step": 1224 + }, + { + "epoch": 0.7896857373086221, + "grad_norm": 1.7575460951848267, + "learning_rate": 9.994196388289333e-05, + "loss": 4.2965, + "step": 1225 + }, + { + "epoch": 0.7903303787268332, + "grad_norm": 1.8401080032356385, + "learning_rate": 9.994186672970022e-05, + "loss": 4.1667, + "step": 1226 + }, + { + "epoch": 0.7909750201450443, + "grad_norm": 2.2921869369898284, + "learning_rate": 9.994176949530502e-05, + "loss": 4.1251, + "step": 1227 + }, + { + "epoch": 0.7916196615632555, + "grad_norm": 2.09790772261732, + "learning_rate": 9.994167217970794e-05, + "loss": 3.9801, + "step": 1228 + }, + { + "epoch": 0.7922643029814666, + "grad_norm": 1.3864051192748648, + "learning_rate": 9.994157478290912e-05, + "loss": 4.2403, + "step": 1229 + }, + { + "epoch": 0.7929089443996776, + "grad_norm": 2.090129295466916, + "learning_rate": 9.99414773049087e-05, + "loss": 3.9148, + "step": 1230 + }, + { + "epoch": 0.7935535858178888, + "grad_norm": 3.4948926825062903, + "learning_rate": 9.994137974570686e-05, + "loss": 4.0314, + "step": 1231 + }, + { + "epoch": 0.7941982272360999, + "grad_norm": 4.280993499914027, + "learning_rate": 9.994128210530374e-05, + "loss": 3.9842, + "step": 1232 + }, + { + "epoch": 0.7948428686543111, + "grad_norm": 3.795785789513831, + "learning_rate": 9.994118438369952e-05, + "loss": 4.2356, + "step": 1233 + }, + { + "epoch": 0.7954875100725222, + "grad_norm": 3.064322673750678, + "learning_rate": 9.994108658089436e-05, + "loss": 4.3256, + "step": 1234 + }, + { + "epoch": 0.7961321514907332, + "grad_norm": 3.34238963424384, + "learning_rate": 9.994098869688842e-05, + "loss": 3.6785, + "step": 1235 + }, + { + "epoch": 0.7967767929089444, + "grad_norm": 3.943326830395087, + "learning_rate": 9.994089073168186e-05, + "loss": 4.3458, + "step": 1236 + }, + { + "epoch": 0.7974214343271555, + "grad_norm": 2.5361238468239735, + "learning_rate": 9.994079268527484e-05, + "loss": 4.451, + "step": 1237 + }, + { + "epoch": 0.7980660757453666, + "grad_norm": 2.4228372615708293, + "learning_rate": 9.99406945576675e-05, + "loss": 4.5425, + "step": 1238 + }, + { + "epoch": 0.7987107171635778, + "grad_norm": 2.6811360111161116, + "learning_rate": 9.994059634886003e-05, + "loss": 3.9639, + "step": 1239 + }, + { + "epoch": 0.7993553585817889, + "grad_norm": 2.2369710922935195, + "learning_rate": 9.994049805885258e-05, + "loss": 4.1833, + "step": 1240 + }, + { + "epoch": 0.8, + "grad_norm": 2.6416216537172232, + "learning_rate": 9.994039968764531e-05, + "loss": 4.2453, + "step": 1241 + }, + { + "epoch": 0.8006446414182111, + "grad_norm": 2.683135375799, + "learning_rate": 9.994030123523838e-05, + "loss": 3.9934, + "step": 1242 + }, + { + "epoch": 0.8012892828364222, + "grad_norm": 1.8450998918139334, + "learning_rate": 9.994020270163196e-05, + "loss": 4.4098, + "step": 1243 + }, + { + "epoch": 0.8019339242546334, + "grad_norm": 1.8983789153560318, + "learning_rate": 9.99401040868262e-05, + "loss": 3.7429, + "step": 1244 + }, + { + "epoch": 0.8025785656728445, + "grad_norm": 2.0703274386640946, + "learning_rate": 9.994000539082129e-05, + "loss": 4.2558, + "step": 1245 + }, + { + "epoch": 0.8032232070910555, + "grad_norm": 1.9920681517576027, + "learning_rate": 9.993990661361736e-05, + "loss": 4.1038, + "step": 1246 + }, + { + "epoch": 0.8038678485092667, + "grad_norm": 2.4782929411892707, + "learning_rate": 9.993980775521457e-05, + "loss": 3.9033, + "step": 1247 + }, + { + "epoch": 0.8045124899274778, + "grad_norm": 1.262414618646948, + "learning_rate": 9.993970881561312e-05, + "loss": 4.2757, + "step": 1248 + }, + { + "epoch": 0.805157131345689, + "grad_norm": 2.3238417376080074, + "learning_rate": 9.993960979481313e-05, + "loss": 3.9509, + "step": 1249 + }, + { + "epoch": 0.8058017727639001, + "grad_norm": 2.7396126574758877, + "learning_rate": 9.993951069281478e-05, + "loss": 4.0819, + "step": 1250 + }, + { + "epoch": 0.8064464141821112, + "grad_norm": 3.4959266701329503, + "learning_rate": 9.993941150961826e-05, + "loss": 4.3598, + "step": 1251 + }, + { + "epoch": 0.8070910556003223, + "grad_norm": 2.3044385311187447, + "learning_rate": 9.993931224522369e-05, + "loss": 4.1839, + "step": 1252 + }, + { + "epoch": 0.8077356970185334, + "grad_norm": 3.23374027847794, + "learning_rate": 9.993921289963124e-05, + "loss": 4.1195, + "step": 1253 + }, + { + "epoch": 0.8083803384367445, + "grad_norm": 2.7323417247396975, + "learning_rate": 9.993911347284109e-05, + "loss": 4.1684, + "step": 1254 + }, + { + "epoch": 0.8090249798549557, + "grad_norm": 2.102812963135701, + "learning_rate": 9.99390139648534e-05, + "loss": 4.0231, + "step": 1255 + }, + { + "epoch": 0.8096696212731668, + "grad_norm": 3.0194934535569664, + "learning_rate": 9.993891437566833e-05, + "loss": 4.0623, + "step": 1256 + }, + { + "epoch": 0.810314262691378, + "grad_norm": 2.7593626424836306, + "learning_rate": 9.993881470528604e-05, + "loss": 4.1281, + "step": 1257 + }, + { + "epoch": 0.810958904109589, + "grad_norm": 1.8258281693118672, + "learning_rate": 9.99387149537067e-05, + "loss": 4.1541, + "step": 1258 + }, + { + "epoch": 0.8116035455278001, + "grad_norm": 2.429468376627721, + "learning_rate": 9.993861512093047e-05, + "loss": 4.1618, + "step": 1259 + }, + { + "epoch": 0.8122481869460113, + "grad_norm": 2.3378835780788108, + "learning_rate": 9.99385152069575e-05, + "loss": 3.951, + "step": 1260 + }, + { + "epoch": 0.8128928283642224, + "grad_norm": 2.265705509643553, + "learning_rate": 9.9938415211788e-05, + "loss": 4.1418, + "step": 1261 + }, + { + "epoch": 0.8135374697824336, + "grad_norm": 2.0602376853202915, + "learning_rate": 9.993831513542206e-05, + "loss": 4.4632, + "step": 1262 + }, + { + "epoch": 0.8141821112006447, + "grad_norm": 2.088905915488605, + "learning_rate": 9.993821497785994e-05, + "loss": 4.031, + "step": 1263 + }, + { + "epoch": 0.8148267526188557, + "grad_norm": 2.0754118692129238, + "learning_rate": 9.99381147391017e-05, + "loss": 4.2855, + "step": 1264 + }, + { + "epoch": 0.8154713940370669, + "grad_norm": 2.552451748230588, + "learning_rate": 9.993801441914759e-05, + "loss": 3.8914, + "step": 1265 + }, + { + "epoch": 0.816116035455278, + "grad_norm": 2.8078877253263426, + "learning_rate": 9.993791401799774e-05, + "loss": 3.9501, + "step": 1266 + }, + { + "epoch": 0.8167606768734891, + "grad_norm": 2.9170416402793666, + "learning_rate": 9.99378135356523e-05, + "loss": 4.2524, + "step": 1267 + }, + { + "epoch": 0.8174053182917003, + "grad_norm": 2.579966381216297, + "learning_rate": 9.993771297211147e-05, + "loss": 3.8206, + "step": 1268 + }, + { + "epoch": 0.8180499597099113, + "grad_norm": 3.2077957736087424, + "learning_rate": 9.993761232737538e-05, + "loss": 4.2745, + "step": 1269 + }, + { + "epoch": 0.8186946011281225, + "grad_norm": 2.4561881647826795, + "learning_rate": 9.993751160144421e-05, + "loss": 4.4788, + "step": 1270 + }, + { + "epoch": 0.8193392425463336, + "grad_norm": 2.6630405841834666, + "learning_rate": 9.993741079431815e-05, + "loss": 4.1256, + "step": 1271 + }, + { + "epoch": 0.8199838839645447, + "grad_norm": 3.1721063998943997, + "learning_rate": 9.993730990599732e-05, + "loss": 4.0382, + "step": 1272 + }, + { + "epoch": 0.8206285253827559, + "grad_norm": 2.082610049647515, + "learning_rate": 9.993720893648191e-05, + "loss": 4.1313, + "step": 1273 + }, + { + "epoch": 0.821273166800967, + "grad_norm": 2.0831040778473273, + "learning_rate": 9.99371078857721e-05, + "loss": 4.4142, + "step": 1274 + }, + { + "epoch": 0.821917808219178, + "grad_norm": 2.5059697538934254, + "learning_rate": 9.993700675386803e-05, + "loss": 3.9917, + "step": 1275 + }, + { + "epoch": 0.8225624496373892, + "grad_norm": 1.878181602498454, + "learning_rate": 9.993690554076988e-05, + "loss": 4.1126, + "step": 1276 + }, + { + "epoch": 0.8232070910556003, + "grad_norm": 2.3104666372566482, + "learning_rate": 9.99368042464778e-05, + "loss": 3.9431, + "step": 1277 + }, + { + "epoch": 0.8238517324738115, + "grad_norm": 3.2808345004017987, + "learning_rate": 9.993670287099199e-05, + "loss": 4.1748, + "step": 1278 + }, + { + "epoch": 0.8244963738920226, + "grad_norm": 2.7918763785367213, + "learning_rate": 9.993660141431257e-05, + "loss": 4.2082, + "step": 1279 + }, + { + "epoch": 0.8251410153102336, + "grad_norm": 1.8103987526161756, + "learning_rate": 9.993649987643975e-05, + "loss": 3.9535, + "step": 1280 + }, + { + "epoch": 0.8257856567284448, + "grad_norm": 3.0903303702005966, + "learning_rate": 9.993639825737368e-05, + "loss": 3.991, + "step": 1281 + }, + { + "epoch": 0.8264302981466559, + "grad_norm": 3.2798821161046985, + "learning_rate": 9.993629655711452e-05, + "loss": 3.8791, + "step": 1282 + }, + { + "epoch": 0.827074939564867, + "grad_norm": 2.7527870647004047, + "learning_rate": 9.993619477566244e-05, + "loss": 3.9824, + "step": 1283 + }, + { + "epoch": 0.8277195809830782, + "grad_norm": 2.8807007411823378, + "learning_rate": 9.993609291301761e-05, + "loss": 4.027, + "step": 1284 + }, + { + "epoch": 0.8283642224012893, + "grad_norm": 2.842254859986483, + "learning_rate": 9.993599096918021e-05, + "loss": 4.5057, + "step": 1285 + }, + { + "epoch": 0.8290088638195005, + "grad_norm": 2.4062068078018357, + "learning_rate": 9.993588894415037e-05, + "loss": 4.2387, + "step": 1286 + }, + { + "epoch": 0.8296535052377115, + "grad_norm": 3.4156764683673626, + "learning_rate": 9.99357868379283e-05, + "loss": 3.5732, + "step": 1287 + }, + { + "epoch": 0.8302981466559226, + "grad_norm": 3.011393534161188, + "learning_rate": 9.993568465051416e-05, + "loss": 3.8852, + "step": 1288 + }, + { + "epoch": 0.8309427880741338, + "grad_norm": 2.5977875962613477, + "learning_rate": 9.993558238190808e-05, + "loss": 4.1857, + "step": 1289 + }, + { + "epoch": 0.8315874294923449, + "grad_norm": 2.1333631368208215, + "learning_rate": 9.993548003211028e-05, + "loss": 4.2131, + "step": 1290 + }, + { + "epoch": 0.832232070910556, + "grad_norm": 2.9506339857937607, + "learning_rate": 9.993537760112089e-05, + "loss": 4.193, + "step": 1291 + }, + { + "epoch": 0.8328767123287671, + "grad_norm": 1.787331204488234, + "learning_rate": 9.99352750889401e-05, + "loss": 4.1435, + "step": 1292 + }, + { + "epoch": 0.8335213537469782, + "grad_norm": 2.777691153059358, + "learning_rate": 9.993517249556805e-05, + "loss": 4.3318, + "step": 1293 + }, + { + "epoch": 0.8341659951651894, + "grad_norm": 2.1606557272334777, + "learning_rate": 9.993506982100497e-05, + "loss": 4.2744, + "step": 1294 + }, + { + "epoch": 0.8348106365834005, + "grad_norm": 2.55744707717894, + "learning_rate": 9.993496706525094e-05, + "loss": 4.416, + "step": 1295 + }, + { + "epoch": 0.8354552780016116, + "grad_norm": 3.0369958731289826, + "learning_rate": 9.99348642283062e-05, + "loss": 4.2897, + "step": 1296 + }, + { + "epoch": 0.8360999194198228, + "grad_norm": 3.108765283957298, + "learning_rate": 9.993476131017089e-05, + "loss": 3.6912, + "step": 1297 + }, + { + "epoch": 0.8367445608380338, + "grad_norm": 2.0184293645111087, + "learning_rate": 9.99346583108452e-05, + "loss": 4.6414, + "step": 1298 + }, + { + "epoch": 0.8373892022562449, + "grad_norm": 2.8629241129854113, + "learning_rate": 9.993455523032928e-05, + "loss": 4.5143, + "step": 1299 + }, + { + "epoch": 0.8380338436744561, + "grad_norm": 3.3514899989301137, + "learning_rate": 9.993445206862328e-05, + "loss": 4.3096, + "step": 1300 + }, + { + "epoch": 0.8380338436744561, + "eval_loss": 4.201112747192383, + "eval_runtime": 2.9494, + "eval_samples_per_second": 33.905, + "eval_steps_per_second": 4.408, + "step": 1300 + }, + { + "epoch": 0.8386784850926672, + "grad_norm": 2.68298357386762, + "learning_rate": 9.993434882572741e-05, + "loss": 3.9165, + "step": 1301 + }, + { + "epoch": 0.8393231265108784, + "grad_norm": 2.5692036612624145, + "learning_rate": 9.993424550164182e-05, + "loss": 4.0958, + "step": 1302 + }, + { + "epoch": 0.8399677679290894, + "grad_norm": 2.674154895731774, + "learning_rate": 9.993414209636666e-05, + "loss": 4.2436, + "step": 1303 + }, + { + "epoch": 0.8406124093473005, + "grad_norm": 2.8176847835553986, + "learning_rate": 9.993403860990216e-05, + "loss": 4.2678, + "step": 1304 + }, + { + "epoch": 0.8412570507655117, + "grad_norm": 2.320552045642087, + "learning_rate": 9.993393504224842e-05, + "loss": 4.5101, + "step": 1305 + }, + { + "epoch": 0.8419016921837228, + "grad_norm": 2.851340185168477, + "learning_rate": 9.993383139340567e-05, + "loss": 4.3946, + "step": 1306 + }, + { + "epoch": 0.8425463336019339, + "grad_norm": 3.5185154430135173, + "learning_rate": 9.993372766337402e-05, + "loss": 4.2573, + "step": 1307 + }, + { + "epoch": 0.8431909750201451, + "grad_norm": 1.918428970280574, + "learning_rate": 9.993362385215369e-05, + "loss": 4.2395, + "step": 1308 + }, + { + "epoch": 0.8438356164383561, + "grad_norm": 2.770884751644612, + "learning_rate": 9.993351995974482e-05, + "loss": 3.7656, + "step": 1309 + }, + { + "epoch": 0.8444802578565673, + "grad_norm": 3.56830883053597, + "learning_rate": 9.993341598614761e-05, + "loss": 3.832, + "step": 1310 + }, + { + "epoch": 0.8451248992747784, + "grad_norm": 2.24535789750023, + "learning_rate": 9.99333119313622e-05, + "loss": 4.1481, + "step": 1311 + }, + { + "epoch": 0.8457695406929895, + "grad_norm": 2.686265914042191, + "learning_rate": 9.993320779538879e-05, + "loss": 3.9989, + "step": 1312 + }, + { + "epoch": 0.8464141821112007, + "grad_norm": 1.8769712656808006, + "learning_rate": 9.99331035782275e-05, + "loss": 4.1228, + "step": 1313 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 2.6359618609292497, + "learning_rate": 9.993299927987858e-05, + "loss": 3.896, + "step": 1314 + }, + { + "epoch": 0.8477034649476228, + "grad_norm": 2.848945804919617, + "learning_rate": 9.993289490034213e-05, + "loss": 4.2436, + "step": 1315 + }, + { + "epoch": 0.848348106365834, + "grad_norm": 2.310777846861372, + "learning_rate": 9.993279043961836e-05, + "loss": 4.3938, + "step": 1316 + }, + { + "epoch": 0.8489927477840451, + "grad_norm": 2.0197036865343403, + "learning_rate": 9.993268589770742e-05, + "loss": 4.2067, + "step": 1317 + }, + { + "epoch": 0.8496373892022563, + "grad_norm": 2.1817312667582747, + "learning_rate": 9.993258127460952e-05, + "loss": 3.8399, + "step": 1318 + }, + { + "epoch": 0.8502820306204674, + "grad_norm": 2.958472619836966, + "learning_rate": 9.993247657032478e-05, + "loss": 3.9593, + "step": 1319 + }, + { + "epoch": 0.8509266720386784, + "grad_norm": 1.8822404439170504, + "learning_rate": 9.993237178485341e-05, + "loss": 4.1654, + "step": 1320 + }, + { + "epoch": 0.8515713134568896, + "grad_norm": 2.861759132703459, + "learning_rate": 9.993226691819556e-05, + "loss": 3.9261, + "step": 1321 + }, + { + "epoch": 0.8522159548751007, + "grad_norm": 3.1697095814919223, + "learning_rate": 9.99321619703514e-05, + "loss": 4.0994, + "step": 1322 + }, + { + "epoch": 0.8528605962933118, + "grad_norm": 2.790782368593025, + "learning_rate": 9.993205694132114e-05, + "loss": 4.1858, + "step": 1323 + }, + { + "epoch": 0.853505237711523, + "grad_norm": 1.7520504228136338, + "learning_rate": 9.99319518311049e-05, + "loss": 4.1557, + "step": 1324 + }, + { + "epoch": 0.8541498791297341, + "grad_norm": 3.158499889922895, + "learning_rate": 9.99318466397029e-05, + "loss": 4.3115, + "step": 1325 + }, + { + "epoch": 0.8547945205479452, + "grad_norm": 2.700042257296401, + "learning_rate": 9.993174136711528e-05, + "loss": 4.095, + "step": 1326 + }, + { + "epoch": 0.8554391619661563, + "grad_norm": 2.817576107050785, + "learning_rate": 9.993163601334223e-05, + "loss": 3.8507, + "step": 1327 + }, + { + "epoch": 0.8560838033843674, + "grad_norm": 3.3732175951698316, + "learning_rate": 9.993153057838391e-05, + "loss": 4.1601, + "step": 1328 + }, + { + "epoch": 0.8567284448025786, + "grad_norm": 1.7044438440932654, + "learning_rate": 9.993142506224051e-05, + "loss": 4.4277, + "step": 1329 + }, + { + "epoch": 0.8573730862207897, + "grad_norm": 2.6956579018582625, + "learning_rate": 9.993131946491218e-05, + "loss": 4.3002, + "step": 1330 + }, + { + "epoch": 0.8580177276390009, + "grad_norm": 2.42149522382192, + "learning_rate": 9.993121378639911e-05, + "loss": 4.0815, + "step": 1331 + }, + { + "epoch": 0.8586623690572119, + "grad_norm": 1.8428117288567416, + "learning_rate": 9.993110802670146e-05, + "loss": 4.1866, + "step": 1332 + }, + { + "epoch": 0.859307010475423, + "grad_norm": 2.594322791676015, + "learning_rate": 9.993100218581944e-05, + "loss": 3.7426, + "step": 1333 + }, + { + "epoch": 0.8599516518936342, + "grad_norm": 2.8680237367728805, + "learning_rate": 9.99308962637532e-05, + "loss": 4.4826, + "step": 1334 + }, + { + "epoch": 0.8605962933118453, + "grad_norm": 3.220688518863623, + "learning_rate": 9.993079026050288e-05, + "loss": 4.0763, + "step": 1335 + }, + { + "epoch": 0.8612409347300564, + "grad_norm": 2.940829428881643, + "learning_rate": 9.993068417606872e-05, + "loss": 4.1589, + "step": 1336 + }, + { + "epoch": 0.8618855761482676, + "grad_norm": 2.1517208729963757, + "learning_rate": 9.993057801045085e-05, + "loss": 3.9164, + "step": 1337 + }, + { + "epoch": 0.8625302175664786, + "grad_norm": 1.8883643964250614, + "learning_rate": 9.993047176364944e-05, + "loss": 4.5327, + "step": 1338 + }, + { + "epoch": 0.8631748589846898, + "grad_norm": 1.933157876798501, + "learning_rate": 9.993036543566468e-05, + "loss": 4.1955, + "step": 1339 + }, + { + "epoch": 0.8638195004029009, + "grad_norm": 2.7368127206874364, + "learning_rate": 9.993025902649676e-05, + "loss": 4.3733, + "step": 1340 + }, + { + "epoch": 0.864464141821112, + "grad_norm": 1.9331457297738865, + "learning_rate": 9.993015253614584e-05, + "loss": 4.2895, + "step": 1341 + }, + { + "epoch": 0.8651087832393232, + "grad_norm": 3.7254414153181745, + "learning_rate": 9.993004596461207e-05, + "loss": 4.032, + "step": 1342 + }, + { + "epoch": 0.8657534246575342, + "grad_norm": 2.8475636970302953, + "learning_rate": 9.992993931189566e-05, + "loss": 4.22, + "step": 1343 + }, + { + "epoch": 0.8663980660757453, + "grad_norm": 3.5822724649984576, + "learning_rate": 9.992983257799679e-05, + "loss": 4.3033, + "step": 1344 + }, + { + "epoch": 0.8670427074939565, + "grad_norm": 2.043131474160878, + "learning_rate": 9.992972576291561e-05, + "loss": 4.2937, + "step": 1345 + }, + { + "epoch": 0.8676873489121676, + "grad_norm": 5.494167233146733, + "learning_rate": 9.99296188666523e-05, + "loss": 4.3298, + "step": 1346 + }, + { + "epoch": 0.8683319903303788, + "grad_norm": 2.910991643505098, + "learning_rate": 9.992951188920705e-05, + "loss": 3.9723, + "step": 1347 + }, + { + "epoch": 0.8689766317485899, + "grad_norm": 3.8328525504922832, + "learning_rate": 9.992940483058001e-05, + "loss": 4.2485, + "step": 1348 + }, + { + "epoch": 0.8696212731668009, + "grad_norm": 1.709673681848429, + "learning_rate": 9.992929769077138e-05, + "loss": 3.9826, + "step": 1349 + }, + { + "epoch": 0.8702659145850121, + "grad_norm": 2.99247703269188, + "learning_rate": 9.992919046978132e-05, + "loss": 3.9082, + "step": 1350 + }, + { + "epoch": 0.8709105560032232, + "grad_norm": 2.400670265200452, + "learning_rate": 9.992908316761002e-05, + "loss": 4.2306, + "step": 1351 + }, + { + "epoch": 0.8715551974214343, + "grad_norm": 2.6469275637380907, + "learning_rate": 9.992897578425764e-05, + "loss": 4.1268, + "step": 1352 + }, + { + "epoch": 0.8721998388396455, + "grad_norm": 2.6871036805454662, + "learning_rate": 9.992886831972437e-05, + "loss": 3.8767, + "step": 1353 + }, + { + "epoch": 0.8728444802578565, + "grad_norm": 2.8673681142750973, + "learning_rate": 9.99287607740104e-05, + "loss": 3.992, + "step": 1354 + }, + { + "epoch": 0.8734891216760677, + "grad_norm": 2.3105780523887067, + "learning_rate": 9.992865314711587e-05, + "loss": 4.2008, + "step": 1355 + }, + { + "epoch": 0.8741337630942788, + "grad_norm": 2.3853698369526466, + "learning_rate": 9.992854543904099e-05, + "loss": 3.9243, + "step": 1356 + }, + { + "epoch": 0.8747784045124899, + "grad_norm": 2.3091611104502423, + "learning_rate": 9.992843764978591e-05, + "loss": 4.0585, + "step": 1357 + }, + { + "epoch": 0.8754230459307011, + "grad_norm": 2.4674804194942213, + "learning_rate": 9.992832977935082e-05, + "loss": 4.3199, + "step": 1358 + }, + { + "epoch": 0.8760676873489122, + "grad_norm": 2.053783285426729, + "learning_rate": 9.992822182773592e-05, + "loss": 4.2049, + "step": 1359 + }, + { + "epoch": 0.8767123287671232, + "grad_norm": 1.9649409770200865, + "learning_rate": 9.992811379494133e-05, + "loss": 3.787, + "step": 1360 + }, + { + "epoch": 0.8773569701853344, + "grad_norm": 1.8448730602665784, + "learning_rate": 9.992800568096729e-05, + "loss": 4.1186, + "step": 1361 + }, + { + "epoch": 0.8780016116035455, + "grad_norm": 1.701026185561265, + "learning_rate": 9.992789748581393e-05, + "loss": 3.8943, + "step": 1362 + }, + { + "epoch": 0.8786462530217567, + "grad_norm": 2.2862273420153514, + "learning_rate": 9.992778920948147e-05, + "loss": 4.0924, + "step": 1363 + }, + { + "epoch": 0.8792908944399678, + "grad_norm": 2.2222886338240038, + "learning_rate": 9.992768085197003e-05, + "loss": 3.9288, + "step": 1364 + }, + { + "epoch": 0.8799355358581789, + "grad_norm": 2.5183287886053374, + "learning_rate": 9.992757241327984e-05, + "loss": 4.1141, + "step": 1365 + }, + { + "epoch": 0.88058017727639, + "grad_norm": 3.0541167792555246, + "learning_rate": 9.992746389341108e-05, + "loss": 4.0341, + "step": 1366 + }, + { + "epoch": 0.8812248186946011, + "grad_norm": 3.2519384409853758, + "learning_rate": 9.99273552923639e-05, + "loss": 4.0155, + "step": 1367 + }, + { + "epoch": 0.8818694601128122, + "grad_norm": 2.0512935999597723, + "learning_rate": 9.992724661013847e-05, + "loss": 3.8051, + "step": 1368 + }, + { + "epoch": 0.8825141015310234, + "grad_norm": 2.5526840469716747, + "learning_rate": 9.9927137846735e-05, + "loss": 4.4303, + "step": 1369 + }, + { + "epoch": 0.8831587429492345, + "grad_norm": 2.3991780050371867, + "learning_rate": 9.992702900215365e-05, + "loss": 3.7905, + "step": 1370 + }, + { + "epoch": 0.8838033843674457, + "grad_norm": 2.024422033823165, + "learning_rate": 9.992692007639461e-05, + "loss": 4.2582, + "step": 1371 + }, + { + "epoch": 0.8844480257856567, + "grad_norm": 2.7250872512422952, + "learning_rate": 9.992681106945803e-05, + "loss": 4.3022, + "step": 1372 + }, + { + "epoch": 0.8850926672038678, + "grad_norm": 2.2355636203965674, + "learning_rate": 9.992670198134414e-05, + "loss": 4.198, + "step": 1373 + }, + { + "epoch": 0.885737308622079, + "grad_norm": 2.7137016086957244, + "learning_rate": 9.992659281205307e-05, + "loss": 4.0256, + "step": 1374 + }, + { + "epoch": 0.8863819500402901, + "grad_norm": 2.481047555959722, + "learning_rate": 9.992648356158504e-05, + "loss": 4.4181, + "step": 1375 + }, + { + "epoch": 0.8870265914585012, + "grad_norm": 2.9197552897034833, + "learning_rate": 9.99263742299402e-05, + "loss": 4.1563, + "step": 1376 + }, + { + "epoch": 0.8876712328767123, + "grad_norm": 2.690820692986983, + "learning_rate": 9.992626481711873e-05, + "loss": 3.9695, + "step": 1377 + }, + { + "epoch": 0.8883158742949234, + "grad_norm": 2.3420771448820785, + "learning_rate": 9.992615532312084e-05, + "loss": 4.3911, + "step": 1378 + }, + { + "epoch": 0.8889605157131346, + "grad_norm": 2.7153393180325374, + "learning_rate": 9.992604574794667e-05, + "loss": 4.0228, + "step": 1379 + }, + { + "epoch": 0.8896051571313457, + "grad_norm": 2.1028714432036963, + "learning_rate": 9.992593609159643e-05, + "loss": 4.425, + "step": 1380 + }, + { + "epoch": 0.8902497985495568, + "grad_norm": 3.7968456332619915, + "learning_rate": 9.992582635407027e-05, + "loss": 3.5428, + "step": 1381 + }, + { + "epoch": 0.890894439967768, + "grad_norm": 4.815568888698795, + "learning_rate": 9.99257165353684e-05, + "loss": 4.2162, + "step": 1382 + }, + { + "epoch": 0.891539081385979, + "grad_norm": 1.9063639020424212, + "learning_rate": 9.9925606635491e-05, + "loss": 4.1733, + "step": 1383 + }, + { + "epoch": 0.8921837228041901, + "grad_norm": 4.0241519451787395, + "learning_rate": 9.992549665443823e-05, + "loss": 4.142, + "step": 1384 + }, + { + "epoch": 0.8928283642224013, + "grad_norm": 2.378961051070429, + "learning_rate": 9.992538659221027e-05, + "loss": 4.4847, + "step": 1385 + }, + { + "epoch": 0.8934730056406124, + "grad_norm": 2.7056744540890594, + "learning_rate": 9.992527644880733e-05, + "loss": 4.2277, + "step": 1386 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 2.499822689932413, + "learning_rate": 9.992516622422956e-05, + "loss": 4.3102, + "step": 1387 + }, + { + "epoch": 0.8947622884770347, + "grad_norm": 2.4329207892115465, + "learning_rate": 9.992505591847715e-05, + "loss": 4.1613, + "step": 1388 + }, + { + "epoch": 0.8954069298952457, + "grad_norm": 2.182424209379419, + "learning_rate": 9.99249455315503e-05, + "loss": 4.2273, + "step": 1389 + }, + { + "epoch": 0.8960515713134569, + "grad_norm": 1.9555800838726254, + "learning_rate": 9.992483506344915e-05, + "loss": 3.9828, + "step": 1390 + }, + { + "epoch": 0.896696212731668, + "grad_norm": 2.2717620132729834, + "learning_rate": 9.992472451417393e-05, + "loss": 3.9601, + "step": 1391 + }, + { + "epoch": 0.8973408541498791, + "grad_norm": 2.3443495052361443, + "learning_rate": 9.992461388372479e-05, + "loss": 3.8392, + "step": 1392 + }, + { + "epoch": 0.8979854955680903, + "grad_norm": 2.2934261957164943, + "learning_rate": 9.992450317210191e-05, + "loss": 4.1115, + "step": 1393 + }, + { + "epoch": 0.8986301369863013, + "grad_norm": 1.967446165741754, + "learning_rate": 9.99243923793055e-05, + "loss": 4.0405, + "step": 1394 + }, + { + "epoch": 0.8992747784045125, + "grad_norm": 2.0833428660602453, + "learning_rate": 9.992428150533571e-05, + "loss": 4.01, + "step": 1395 + }, + { + "epoch": 0.8999194198227236, + "grad_norm": 2.0358468880304432, + "learning_rate": 9.992417055019276e-05, + "loss": 4.0745, + "step": 1396 + }, + { + "epoch": 0.9005640612409347, + "grad_norm": 1.4120910269002385, + "learning_rate": 9.992405951387678e-05, + "loss": 4.492, + "step": 1397 + }, + { + "epoch": 0.9012087026591459, + "grad_norm": 1.986309492401045, + "learning_rate": 9.992394839638799e-05, + "loss": 4.2633, + "step": 1398 + }, + { + "epoch": 0.901853344077357, + "grad_norm": 1.5808352674572084, + "learning_rate": 9.992383719772656e-05, + "loss": 4.1204, + "step": 1399 + }, + { + "epoch": 0.9024979854955681, + "grad_norm": 1.7880996705301455, + "learning_rate": 9.992372591789269e-05, + "loss": 4.1297, + "step": 1400 + }, + { + "epoch": 0.9024979854955681, + "eval_loss": 4.180212497711182, + "eval_runtime": 2.9283, + "eval_samples_per_second": 34.15, + "eval_steps_per_second": 4.44, + "step": 1400 + }, + { + "epoch": 0.9031426269137792, + "grad_norm": 2.6396349739231204, + "learning_rate": 9.992361455688654e-05, + "loss": 3.6326, + "step": 1401 + }, + { + "epoch": 0.9037872683319903, + "grad_norm": 4.12949364781833, + "learning_rate": 9.992350311470829e-05, + "loss": 4.1035, + "step": 1402 + }, + { + "epoch": 0.9044319097502015, + "grad_norm": 4.356576419463853, + "learning_rate": 9.992339159135813e-05, + "loss": 4.4295, + "step": 1403 + }, + { + "epoch": 0.9050765511684126, + "grad_norm": 1.7883633301806503, + "learning_rate": 9.992327998683626e-05, + "loss": 4.3049, + "step": 1404 + }, + { + "epoch": 0.9057211925866236, + "grad_norm": 3.6722034155472394, + "learning_rate": 9.992316830114285e-05, + "loss": 4.3898, + "step": 1405 + }, + { + "epoch": 0.9063658340048348, + "grad_norm": 3.2656923205005093, + "learning_rate": 9.99230565342781e-05, + "loss": 4.4015, + "step": 1406 + }, + { + "epoch": 0.9070104754230459, + "grad_norm": 3.136687283061932, + "learning_rate": 9.992294468624215e-05, + "loss": 3.872, + "step": 1407 + }, + { + "epoch": 0.9076551168412571, + "grad_norm": 3.7198129731272522, + "learning_rate": 9.992283275703523e-05, + "loss": 3.9701, + "step": 1408 + }, + { + "epoch": 0.9082997582594682, + "grad_norm": 2.5236598842090996, + "learning_rate": 9.99227207466575e-05, + "loss": 4.1762, + "step": 1409 + }, + { + "epoch": 0.9089443996776793, + "grad_norm": 3.4186083963076483, + "learning_rate": 9.992260865510915e-05, + "loss": 4.2126, + "step": 1410 + }, + { + "epoch": 0.9095890410958904, + "grad_norm": 2.8753527309810143, + "learning_rate": 9.992249648239037e-05, + "loss": 4.1343, + "step": 1411 + }, + { + "epoch": 0.9102336825141015, + "grad_norm": 2.732744968987052, + "learning_rate": 9.992238422850132e-05, + "loss": 4.1289, + "step": 1412 + }, + { + "epoch": 0.9108783239323126, + "grad_norm": 2.469044627945555, + "learning_rate": 9.99222718934422e-05, + "loss": 4.6316, + "step": 1413 + }, + { + "epoch": 0.9115229653505238, + "grad_norm": 2.4534316951236, + "learning_rate": 9.992215947721322e-05, + "loss": 3.9426, + "step": 1414 + }, + { + "epoch": 0.9121676067687349, + "grad_norm": 3.2773582361470996, + "learning_rate": 9.992204697981453e-05, + "loss": 4.1884, + "step": 1415 + }, + { + "epoch": 0.9128122481869461, + "grad_norm": 2.563937561627475, + "learning_rate": 9.992193440124633e-05, + "loss": 3.9222, + "step": 1416 + }, + { + "epoch": 0.9134568896051571, + "grad_norm": 4.390665944765846, + "learning_rate": 9.992182174150879e-05, + "loss": 4.132, + "step": 1417 + }, + { + "epoch": 0.9141015310233682, + "grad_norm": 2.086600523065258, + "learning_rate": 9.99217090006021e-05, + "loss": 4.1674, + "step": 1418 + }, + { + "epoch": 0.9147461724415794, + "grad_norm": 4.406509760057099, + "learning_rate": 9.992159617852649e-05, + "loss": 3.8655, + "step": 1419 + }, + { + "epoch": 0.9153908138597905, + "grad_norm": 3.364249337528094, + "learning_rate": 9.992148327528206e-05, + "loss": 4.1913, + "step": 1420 + }, + { + "epoch": 0.9160354552780016, + "grad_norm": 3.0031303634219997, + "learning_rate": 9.992137029086906e-05, + "loss": 3.8897, + "step": 1421 + }, + { + "epoch": 0.9166800966962128, + "grad_norm": 3.5165313814850836, + "learning_rate": 9.992125722528767e-05, + "loss": 4.3266, + "step": 1422 + }, + { + "epoch": 0.9173247381144238, + "grad_norm": 2.025248324070038, + "learning_rate": 9.992114407853805e-05, + "loss": 4.6713, + "step": 1423 + }, + { + "epoch": 0.917969379532635, + "grad_norm": 1.9873632311149552, + "learning_rate": 9.99210308506204e-05, + "loss": 4.1152, + "step": 1424 + }, + { + "epoch": 0.9186140209508461, + "grad_norm": 2.6505737279161496, + "learning_rate": 9.99209175415349e-05, + "loss": 3.9954, + "step": 1425 + }, + { + "epoch": 0.9192586623690572, + "grad_norm": 1.5168193907849137, + "learning_rate": 9.992080415128174e-05, + "loss": 4.2126, + "step": 1426 + }, + { + "epoch": 0.9199033037872684, + "grad_norm": 1.7380096957213524, + "learning_rate": 9.992069067986111e-05, + "loss": 4.266, + "step": 1427 + }, + { + "epoch": 0.9205479452054794, + "grad_norm": 2.027240012013055, + "learning_rate": 9.992057712727321e-05, + "loss": 4.1872, + "step": 1428 + }, + { + "epoch": 0.9211925866236905, + "grad_norm": 2.1039849098493932, + "learning_rate": 9.992046349351819e-05, + "loss": 4.164, + "step": 1429 + }, + { + "epoch": 0.9218372280419017, + "grad_norm": 2.1722083486622634, + "learning_rate": 9.992034977859627e-05, + "loss": 4.3776, + "step": 1430 + }, + { + "epoch": 0.9224818694601128, + "grad_norm": 2.139636880435321, + "learning_rate": 9.99202359825076e-05, + "loss": 4.3067, + "step": 1431 + }, + { + "epoch": 0.923126510878324, + "grad_norm": 3.2886633954443227, + "learning_rate": 9.992012210525241e-05, + "loss": 3.9749, + "step": 1432 + }, + { + "epoch": 0.9237711522965351, + "grad_norm": 1.9156121140449165, + "learning_rate": 9.992000814683087e-05, + "loss": 4.2902, + "step": 1433 + }, + { + "epoch": 0.9244157937147461, + "grad_norm": 2.3663441744299174, + "learning_rate": 9.991989410724315e-05, + "loss": 4.2111, + "step": 1434 + }, + { + "epoch": 0.9250604351329573, + "grad_norm": 2.770042000386222, + "learning_rate": 9.991977998648946e-05, + "loss": 3.8976, + "step": 1435 + }, + { + "epoch": 0.9257050765511684, + "grad_norm": 2.9463284877700575, + "learning_rate": 9.991966578456999e-05, + "loss": 4.4926, + "step": 1436 + }, + { + "epoch": 0.9263497179693795, + "grad_norm": 2.5054672098250634, + "learning_rate": 9.99195515014849e-05, + "loss": 3.7425, + "step": 1437 + }, + { + "epoch": 0.9269943593875907, + "grad_norm": 2.4279335391174683, + "learning_rate": 9.99194371372344e-05, + "loss": 4.2352, + "step": 1438 + }, + { + "epoch": 0.9276390008058018, + "grad_norm": 3.8917891421869957, + "learning_rate": 9.991932269181866e-05, + "loss": 3.9069, + "step": 1439 + }, + { + "epoch": 0.9282836422240129, + "grad_norm": 3.6682934454916007, + "learning_rate": 9.991920816523789e-05, + "loss": 4.4159, + "step": 1440 + }, + { + "epoch": 0.928928283642224, + "grad_norm": 1.914201293258703, + "learning_rate": 9.991909355749227e-05, + "loss": 4.1288, + "step": 1441 + }, + { + "epoch": 0.9295729250604351, + "grad_norm": 2.531729931459934, + "learning_rate": 9.991897886858199e-05, + "loss": 4.0719, + "step": 1442 + }, + { + "epoch": 0.9302175664786463, + "grad_norm": 2.467343340579263, + "learning_rate": 9.991886409850724e-05, + "loss": 3.9548, + "step": 1443 + }, + { + "epoch": 0.9308622078968574, + "grad_norm": 2.1467237697159565, + "learning_rate": 9.991874924726818e-05, + "loss": 4.1788, + "step": 1444 + }, + { + "epoch": 0.9315068493150684, + "grad_norm": 2.011359786774267, + "learning_rate": 9.991863431486504e-05, + "loss": 3.8237, + "step": 1445 + }, + { + "epoch": 0.9321514907332796, + "grad_norm": 2.723788674877364, + "learning_rate": 9.9918519301298e-05, + "loss": 4.1399, + "step": 1446 + }, + { + "epoch": 0.9327961321514907, + "grad_norm": 2.233346360796615, + "learning_rate": 9.991840420656721e-05, + "loss": 4.0767, + "step": 1447 + }, + { + "epoch": 0.9334407735697019, + "grad_norm": 1.7154902227840274, + "learning_rate": 9.991828903067292e-05, + "loss": 4.4223, + "step": 1448 + }, + { + "epoch": 0.934085414987913, + "grad_norm": 1.9108180110390813, + "learning_rate": 9.991817377361526e-05, + "loss": 4.2891, + "step": 1449 + }, + { + "epoch": 0.934730056406124, + "grad_norm": 1.5007295970561159, + "learning_rate": 9.991805843539445e-05, + "loss": 4.1857, + "step": 1450 + }, + { + "epoch": 0.9353746978243352, + "grad_norm": 1.9658627853797161, + "learning_rate": 9.99179430160107e-05, + "loss": 3.7521, + "step": 1451 + }, + { + "epoch": 0.9360193392425463, + "grad_norm": 2.673711058789136, + "learning_rate": 9.991782751546416e-05, + "loss": 4.1564, + "step": 1452 + }, + { + "epoch": 0.9366639806607574, + "grad_norm": 2.235230610129986, + "learning_rate": 9.991771193375504e-05, + "loss": 3.9593, + "step": 1453 + }, + { + "epoch": 0.9373086220789686, + "grad_norm": 2.2057046599487924, + "learning_rate": 9.991759627088352e-05, + "loss": 4.211, + "step": 1454 + }, + { + "epoch": 0.9379532634971797, + "grad_norm": 3.396301724858227, + "learning_rate": 9.991748052684981e-05, + "loss": 4.1091, + "step": 1455 + }, + { + "epoch": 0.9385979049153909, + "grad_norm": 2.5387624382364047, + "learning_rate": 9.991736470165408e-05, + "loss": 4.0185, + "step": 1456 + }, + { + "epoch": 0.9392425463336019, + "grad_norm": 3.7332788212536516, + "learning_rate": 9.99172487952965e-05, + "loss": 3.7959, + "step": 1457 + }, + { + "epoch": 0.939887187751813, + "grad_norm": 2.278701879874191, + "learning_rate": 9.991713280777732e-05, + "loss": 4.2442, + "step": 1458 + }, + { + "epoch": 0.9405318291700242, + "grad_norm": 2.6038302421522936, + "learning_rate": 9.991701673909668e-05, + "loss": 4.3653, + "step": 1459 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 3.4776910533437984, + "learning_rate": 9.991690058925479e-05, + "loss": 4.4158, + "step": 1460 + }, + { + "epoch": 0.9418211120064464, + "grad_norm": 2.013795783498799, + "learning_rate": 9.991678435825185e-05, + "loss": 4.0961, + "step": 1461 + }, + { + "epoch": 0.9424657534246575, + "grad_norm": 2.616714029247372, + "learning_rate": 9.991666804608801e-05, + "loss": 4.1851, + "step": 1462 + }, + { + "epoch": 0.9431103948428686, + "grad_norm": 2.644033732158587, + "learning_rate": 9.991655165276353e-05, + "loss": 4.0182, + "step": 1463 + }, + { + "epoch": 0.9437550362610798, + "grad_norm": 1.7477544669006815, + "learning_rate": 9.991643517827853e-05, + "loss": 4.009, + "step": 1464 + }, + { + "epoch": 0.9443996776792909, + "grad_norm": 2.439855787670872, + "learning_rate": 9.991631862263324e-05, + "loss": 4.4117, + "step": 1465 + }, + { + "epoch": 0.945044319097502, + "grad_norm": 2.160995992385545, + "learning_rate": 9.991620198582785e-05, + "loss": 3.4822, + "step": 1466 + }, + { + "epoch": 0.9456889605157132, + "grad_norm": 3.0263503692726106, + "learning_rate": 9.991608526786254e-05, + "loss": 3.9088, + "step": 1467 + }, + { + "epoch": 0.9463336019339242, + "grad_norm": 1.8331026494821114, + "learning_rate": 9.99159684687375e-05, + "loss": 4.2451, + "step": 1468 + }, + { + "epoch": 0.9469782433521354, + "grad_norm": 2.201186699860381, + "learning_rate": 9.991585158845296e-05, + "loss": 4.189, + "step": 1469 + }, + { + "epoch": 0.9476228847703465, + "grad_norm": 2.1203828657745083, + "learning_rate": 9.991573462700904e-05, + "loss": 4.0359, + "step": 1470 + }, + { + "epoch": 0.9482675261885576, + "grad_norm": 1.9366486973787185, + "learning_rate": 9.991561758440601e-05, + "loss": 3.9805, + "step": 1471 + }, + { + "epoch": 0.9489121676067688, + "grad_norm": 1.8721955321195813, + "learning_rate": 9.991550046064399e-05, + "loss": 4.4441, + "step": 1472 + }, + { + "epoch": 0.9495568090249799, + "grad_norm": 1.677851343489257, + "learning_rate": 9.991538325572324e-05, + "loss": 3.8847, + "step": 1473 + }, + { + "epoch": 0.9502014504431909, + "grad_norm": 3.1078023751407127, + "learning_rate": 9.99152659696439e-05, + "loss": 4.0149, + "step": 1474 + }, + { + "epoch": 0.9508460918614021, + "grad_norm": 2.261785119012118, + "learning_rate": 9.99151486024062e-05, + "loss": 3.9895, + "step": 1475 + }, + { + "epoch": 0.9514907332796132, + "grad_norm": 2.170626527236146, + "learning_rate": 9.99150311540103e-05, + "loss": 4.2196, + "step": 1476 + }, + { + "epoch": 0.9521353746978244, + "grad_norm": 2.261822405852417, + "learning_rate": 9.991491362445641e-05, + "loss": 3.7297, + "step": 1477 + }, + { + "epoch": 0.9527800161160355, + "grad_norm": 2.4559591097825146, + "learning_rate": 9.991479601374471e-05, + "loss": 4.2469, + "step": 1478 + }, + { + "epoch": 0.9534246575342465, + "grad_norm": 1.955601757587832, + "learning_rate": 9.991467832187543e-05, + "loss": 4.1767, + "step": 1479 + }, + { + "epoch": 0.9540692989524577, + "grad_norm": 1.9738941090904956, + "learning_rate": 9.991456054884873e-05, + "loss": 4.0059, + "step": 1480 + }, + { + "epoch": 0.9547139403706688, + "grad_norm": 2.878945531830995, + "learning_rate": 9.991444269466481e-05, + "loss": 4.008, + "step": 1481 + }, + { + "epoch": 0.9553585817888799, + "grad_norm": 2.4344091808165484, + "learning_rate": 9.991432475932386e-05, + "loss": 4.1629, + "step": 1482 + }, + { + "epoch": 0.9560032232070911, + "grad_norm": 1.9906483144711655, + "learning_rate": 9.991420674282607e-05, + "loss": 4.0329, + "step": 1483 + }, + { + "epoch": 0.9566478646253022, + "grad_norm": 1.8677475760607485, + "learning_rate": 9.991408864517167e-05, + "loss": 3.8763, + "step": 1484 + }, + { + "epoch": 0.9572925060435133, + "grad_norm": 2.230317585886976, + "learning_rate": 9.991397046636081e-05, + "loss": 4.1159, + "step": 1485 + }, + { + "epoch": 0.9579371474617244, + "grad_norm": 2.0477510696178065, + "learning_rate": 9.99138522063937e-05, + "loss": 3.9271, + "step": 1486 + }, + { + "epoch": 0.9585817888799355, + "grad_norm": 1.762815797198688, + "learning_rate": 9.991373386527053e-05, + "loss": 4.1969, + "step": 1487 + }, + { + "epoch": 0.9592264302981467, + "grad_norm": 2.2809047098318183, + "learning_rate": 9.991361544299151e-05, + "loss": 3.9757, + "step": 1488 + }, + { + "epoch": 0.9598710717163578, + "grad_norm": 1.9927321130157287, + "learning_rate": 9.991349693955681e-05, + "loss": 4.2209, + "step": 1489 + }, + { + "epoch": 0.9605157131345688, + "grad_norm": 2.76543900315631, + "learning_rate": 9.991337835496666e-05, + "loss": 4.1781, + "step": 1490 + }, + { + "epoch": 0.96116035455278, + "grad_norm": 1.6304909011647222, + "learning_rate": 9.991325968922122e-05, + "loss": 4.2102, + "step": 1491 + }, + { + "epoch": 0.9618049959709911, + "grad_norm": 2.3693611108170654, + "learning_rate": 9.99131409423207e-05, + "loss": 4.0451, + "step": 1492 + }, + { + "epoch": 0.9624496373892023, + "grad_norm": 2.1965647075490993, + "learning_rate": 9.991302211426527e-05, + "loss": 3.7688, + "step": 1493 + }, + { + "epoch": 0.9630942788074134, + "grad_norm": 2.0201211961700545, + "learning_rate": 9.991290320505517e-05, + "loss": 4.2242, + "step": 1494 + }, + { + "epoch": 0.9637389202256245, + "grad_norm": 1.756911151055617, + "learning_rate": 9.991278421469058e-05, + "loss": 4.325, + "step": 1495 + }, + { + "epoch": 0.9643835616438357, + "grad_norm": 1.526382539444706, + "learning_rate": 9.991266514317168e-05, + "loss": 4.1866, + "step": 1496 + }, + { + "epoch": 0.9650282030620467, + "grad_norm": 1.8400113530373483, + "learning_rate": 9.991254599049869e-05, + "loss": 4.2295, + "step": 1497 + }, + { + "epoch": 0.9656728444802578, + "grad_norm": 1.9314075959752988, + "learning_rate": 9.991242675667176e-05, + "loss": 4.4928, + "step": 1498 + }, + { + "epoch": 0.966317485898469, + "grad_norm": 1.9430455809792504, + "learning_rate": 9.991230744169114e-05, + "loss": 4.2642, + "step": 1499 + }, + { + "epoch": 0.9669621273166801, + "grad_norm": 2.1241365752961054, + "learning_rate": 9.991218804555699e-05, + "loss": 4.0299, + "step": 1500 + }, + { + "epoch": 0.9669621273166801, + "eval_loss": 4.164045810699463, + "eval_runtime": 2.9464, + "eval_samples_per_second": 33.94, + "eval_steps_per_second": 4.412, + "step": 1500 + }, + { + "epoch": 0.9676067687348913, + "grad_norm": 1.7998614611790402, + "learning_rate": 9.991206856826951e-05, + "loss": 4.2083, + "step": 1501 + }, + { + "epoch": 0.9682514101531023, + "grad_norm": 1.962550464849981, + "learning_rate": 9.991194900982892e-05, + "loss": 4.0389, + "step": 1502 + }, + { + "epoch": 0.9688960515713134, + "grad_norm": 2.1136838516318597, + "learning_rate": 9.99118293702354e-05, + "loss": 4.0209, + "step": 1503 + }, + { + "epoch": 0.9695406929895246, + "grad_norm": 2.7686081358142483, + "learning_rate": 9.991170964948916e-05, + "loss": 3.9957, + "step": 1504 + }, + { + "epoch": 0.9701853344077357, + "grad_norm": 2.5228212134472163, + "learning_rate": 9.991158984759035e-05, + "loss": 4.2712, + "step": 1505 + }, + { + "epoch": 0.9708299758259468, + "grad_norm": 2.337849430733804, + "learning_rate": 9.991146996453921e-05, + "loss": 4.2827, + "step": 1506 + }, + { + "epoch": 0.971474617244158, + "grad_norm": 2.177038291924365, + "learning_rate": 9.991135000033594e-05, + "loss": 3.8201, + "step": 1507 + }, + { + "epoch": 0.972119258662369, + "grad_norm": 2.109083864936888, + "learning_rate": 9.991122995498072e-05, + "loss": 4.4073, + "step": 1508 + }, + { + "epoch": 0.9727639000805802, + "grad_norm": 2.418930554535946, + "learning_rate": 9.991110982847373e-05, + "loss": 3.9133, + "step": 1509 + }, + { + "epoch": 0.9734085414987913, + "grad_norm": 2.442503849224056, + "learning_rate": 9.991098962081522e-05, + "loss": 3.9843, + "step": 1510 + }, + { + "epoch": 0.9740531829170024, + "grad_norm": 2.400625677061702, + "learning_rate": 9.991086933200535e-05, + "loss": 4.0743, + "step": 1511 + }, + { + "epoch": 0.9746978243352136, + "grad_norm": 2.762287527313859, + "learning_rate": 9.99107489620443e-05, + "loss": 4.1113, + "step": 1512 + }, + { + "epoch": 0.9753424657534246, + "grad_norm": 2.2878365516364236, + "learning_rate": 9.991062851093232e-05, + "loss": 4.0554, + "step": 1513 + }, + { + "epoch": 0.9759871071716357, + "grad_norm": 2.640349477144453, + "learning_rate": 9.991050797866957e-05, + "loss": 4.1663, + "step": 1514 + }, + { + "epoch": 0.9766317485898469, + "grad_norm": 2.643748417430668, + "learning_rate": 9.991038736525626e-05, + "loss": 4.354, + "step": 1515 + }, + { + "epoch": 0.977276390008058, + "grad_norm": 2.0117093388644083, + "learning_rate": 9.991026667069258e-05, + "loss": 4.2141, + "step": 1516 + }, + { + "epoch": 0.9779210314262692, + "grad_norm": 2.1914618385795994, + "learning_rate": 9.991014589497873e-05, + "loss": 4.2607, + "step": 1517 + }, + { + "epoch": 0.9785656728444803, + "grad_norm": 3.0792024152609008, + "learning_rate": 9.991002503811492e-05, + "loss": 4.0489, + "step": 1518 + }, + { + "epoch": 0.9792103142626913, + "grad_norm": 2.3764334323757157, + "learning_rate": 9.990990410010134e-05, + "loss": 3.9798, + "step": 1519 + }, + { + "epoch": 0.9798549556809025, + "grad_norm": 2.786174712000678, + "learning_rate": 9.990978308093818e-05, + "loss": 4.0848, + "step": 1520 + }, + { + "epoch": 0.9804995970991136, + "grad_norm": 2.1575478640911845, + "learning_rate": 9.990966198062565e-05, + "loss": 4.2127, + "step": 1521 + }, + { + "epoch": 0.9811442385173247, + "grad_norm": 2.442086963684339, + "learning_rate": 9.990954079916396e-05, + "loss": 3.998, + "step": 1522 + }, + { + "epoch": 0.9817888799355359, + "grad_norm": 3.4980985793693704, + "learning_rate": 9.990941953655329e-05, + "loss": 3.8805, + "step": 1523 + }, + { + "epoch": 0.982433521353747, + "grad_norm": 3.059077276798565, + "learning_rate": 9.990929819279383e-05, + "loss": 4.0238, + "step": 1524 + }, + { + "epoch": 0.9830781627719581, + "grad_norm": 2.6103125401582705, + "learning_rate": 9.99091767678858e-05, + "loss": 4.052, + "step": 1525 + }, + { + "epoch": 0.9837228041901692, + "grad_norm": 2.481695565806437, + "learning_rate": 9.990905526182941e-05, + "loss": 4.1489, + "step": 1526 + }, + { + "epoch": 0.9843674456083803, + "grad_norm": 3.1723001770070702, + "learning_rate": 9.990893367462483e-05, + "loss": 4.1451, + "step": 1527 + }, + { + "epoch": 0.9850120870265915, + "grad_norm": 2.5077011955787865, + "learning_rate": 9.990881200627227e-05, + "loss": 4.0259, + "step": 1528 + }, + { + "epoch": 0.9856567284448026, + "grad_norm": 2.7651879695660466, + "learning_rate": 9.990869025677194e-05, + "loss": 4.2319, + "step": 1529 + }, + { + "epoch": 0.9863013698630136, + "grad_norm": 1.8157315240344423, + "learning_rate": 9.990856842612403e-05, + "loss": 4.7255, + "step": 1530 + }, + { + "epoch": 0.9869460112812248, + "grad_norm": 2.4266797814885304, + "learning_rate": 9.990844651432875e-05, + "loss": 3.9361, + "step": 1531 + }, + { + "epoch": 0.9875906526994359, + "grad_norm": 2.9617573489789817, + "learning_rate": 9.990832452138628e-05, + "loss": 4.015, + "step": 1532 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 5.479562515571908, + "learning_rate": 9.990820244729683e-05, + "loss": 3.6683, + "step": 1533 + }, + { + "epoch": 0.9888799355358582, + "grad_norm": 6.1058094515684385, + "learning_rate": 9.990808029206062e-05, + "loss": 4.1375, + "step": 1534 + }, + { + "epoch": 0.9895245769540693, + "grad_norm": 2.744881488717138, + "learning_rate": 9.990795805567782e-05, + "loss": 4.2229, + "step": 1535 + }, + { + "epoch": 0.9901692183722804, + "grad_norm": 3.700387317348618, + "learning_rate": 9.990783573814865e-05, + "loss": 3.8171, + "step": 1536 + }, + { + "epoch": 0.9908138597904915, + "grad_norm": 4.3202617042110525, + "learning_rate": 9.990771333947332e-05, + "loss": 3.8057, + "step": 1537 + }, + { + "epoch": 0.9914585012087027, + "grad_norm": 2.03192764067503, + "learning_rate": 9.9907590859652e-05, + "loss": 4.2439, + "step": 1538 + }, + { + "epoch": 0.9921031426269138, + "grad_norm": 2.9397231930781555, + "learning_rate": 9.990746829868491e-05, + "loss": 3.9079, + "step": 1539 + }, + { + "epoch": 0.9927477840451249, + "grad_norm": 2.6328837804708773, + "learning_rate": 9.990734565657224e-05, + "loss": 3.9175, + "step": 1540 + }, + { + "epoch": 0.9933924254633361, + "grad_norm": 2.751058115797257, + "learning_rate": 9.990722293331423e-05, + "loss": 3.9979, + "step": 1541 + }, + { + "epoch": 0.9940370668815471, + "grad_norm": 2.3476121142061825, + "learning_rate": 9.990710012891104e-05, + "loss": 3.9367, + "step": 1542 + }, + { + "epoch": 0.9946817082997582, + "grad_norm": 2.84894537158981, + "learning_rate": 9.990697724336288e-05, + "loss": 4.2277, + "step": 1543 + }, + { + "epoch": 0.9953263497179694, + "grad_norm": 2.341162029814536, + "learning_rate": 9.990685427666995e-05, + "loss": 3.8654, + "step": 1544 + }, + { + "epoch": 0.9959709911361805, + "grad_norm": 2.3626089794040483, + "learning_rate": 9.990673122883246e-05, + "loss": 4.208, + "step": 1545 + }, + { + "epoch": 0.9966156325543917, + "grad_norm": 1.7481441810048228, + "learning_rate": 9.990660809985061e-05, + "loss": 4.2769, + "step": 1546 + }, + { + "epoch": 0.9972602739726028, + "grad_norm": 1.9644458011522228, + "learning_rate": 9.990648488972463e-05, + "loss": 3.7496, + "step": 1547 + }, + { + "epoch": 0.9979049153908138, + "grad_norm": 2.119049246841261, + "learning_rate": 9.990636159845466e-05, + "loss": 4.1091, + "step": 1548 + }, + { + "epoch": 0.998549556809025, + "grad_norm": 1.8755904714487162, + "learning_rate": 9.990623822604093e-05, + "loss": 4.352, + "step": 1549 + }, + { + "epoch": 0.9991941982272361, + "grad_norm": 1.71683090334246, + "learning_rate": 9.99061147724837e-05, + "loss": 4.0733, + "step": 1550 + }, + { + "epoch": 0.9998388396454472, + "grad_norm": 1.988065327825825, + "learning_rate": 9.990599123778308e-05, + "loss": 4.3645, + "step": 1551 + }, + { + "epoch": 1.0, + "grad_norm": 1.988065327825825, + "learning_rate": 9.990586762193931e-05, + "loss": 0.8908, + "step": 1552 + }, + { + "epoch": 1.000644641418211, + "grad_norm": 2.003033949839073, + "learning_rate": 9.990574392495262e-05, + "loss": 3.9573, + "step": 1553 + }, + { + "epoch": 1.0012892828364222, + "grad_norm": 1.8373420325734051, + "learning_rate": 9.99056201468232e-05, + "loss": 3.7822, + "step": 1554 + }, + { + "epoch": 1.0019339242546335, + "grad_norm": 2.166132433613854, + "learning_rate": 9.990549628755123e-05, + "loss": 3.9379, + "step": 1555 + }, + { + "epoch": 1.0025785656728445, + "grad_norm": 2.2692884578988464, + "learning_rate": 9.990537234713693e-05, + "loss": 4.1969, + "step": 1556 + }, + { + "epoch": 1.0032232070910556, + "grad_norm": 1.917802209539256, + "learning_rate": 9.99052483255805e-05, + "loss": 3.5924, + "step": 1557 + }, + { + "epoch": 1.0038678485092667, + "grad_norm": 1.8232668102124443, + "learning_rate": 9.990512422288214e-05, + "loss": 3.9044, + "step": 1558 + }, + { + "epoch": 1.0045124899274778, + "grad_norm": 1.6570061861050436, + "learning_rate": 9.990500003904207e-05, + "loss": 3.8117, + "step": 1559 + }, + { + "epoch": 1.005157131345689, + "grad_norm": 2.3526566753055875, + "learning_rate": 9.990487577406047e-05, + "loss": 3.7573, + "step": 1560 + }, + { + "epoch": 1.0058017727639001, + "grad_norm": 2.1000801429604756, + "learning_rate": 9.990475142793757e-05, + "loss": 3.8345, + "step": 1561 + }, + { + "epoch": 1.0064464141821112, + "grad_norm": 1.546377747863194, + "learning_rate": 9.990462700067356e-05, + "loss": 3.7085, + "step": 1562 + }, + { + "epoch": 1.0070910556003223, + "grad_norm": 2.1496966754561315, + "learning_rate": 9.990450249226864e-05, + "loss": 4.5081, + "step": 1563 + }, + { + "epoch": 1.0077356970185334, + "grad_norm": 2.064578190042294, + "learning_rate": 9.990437790272303e-05, + "loss": 4.0127, + "step": 1564 + }, + { + "epoch": 1.0083803384367445, + "grad_norm": 1.751248492897941, + "learning_rate": 9.990425323203691e-05, + "loss": 3.7155, + "step": 1565 + }, + { + "epoch": 1.0090249798549558, + "grad_norm": 2.760870831331999, + "learning_rate": 9.990412848021052e-05, + "loss": 4.1768, + "step": 1566 + }, + { + "epoch": 1.0096696212731668, + "grad_norm": 2.698590884684101, + "learning_rate": 9.990400364724403e-05, + "loss": 3.8627, + "step": 1567 + }, + { + "epoch": 1.010314262691378, + "grad_norm": 2.1381379609330864, + "learning_rate": 9.990387873313767e-05, + "loss": 3.8629, + "step": 1568 + }, + { + "epoch": 1.010958904109589, + "grad_norm": 2.2131965853462727, + "learning_rate": 9.990375373789164e-05, + "loss": 4.027, + "step": 1569 + }, + { + "epoch": 1.0116035455278, + "grad_norm": 2.7136858408078224, + "learning_rate": 9.990362866150612e-05, + "loss": 4.3517, + "step": 1570 + }, + { + "epoch": 1.0122481869460114, + "grad_norm": 2.8092319982862, + "learning_rate": 9.990350350398133e-05, + "loss": 3.6553, + "step": 1571 + }, + { + "epoch": 1.0128928283642225, + "grad_norm": 2.0386969783206377, + "learning_rate": 9.99033782653175e-05, + "loss": 4.1341, + "step": 1572 + }, + { + "epoch": 1.0135374697824335, + "grad_norm": 1.7274286798806264, + "learning_rate": 9.990325294551483e-05, + "loss": 4.0092, + "step": 1573 + }, + { + "epoch": 1.0141821112006446, + "grad_norm": 1.8295843365249442, + "learning_rate": 9.990312754457349e-05, + "loss": 3.8569, + "step": 1574 + }, + { + "epoch": 1.0148267526188557, + "grad_norm": 1.6069094862535729, + "learning_rate": 9.990300206249371e-05, + "loss": 3.6786, + "step": 1575 + }, + { + "epoch": 1.015471394037067, + "grad_norm": 2.3966815058277495, + "learning_rate": 9.990287649927569e-05, + "loss": 4.0425, + "step": 1576 + }, + { + "epoch": 1.016116035455278, + "grad_norm": 2.6501930479405975, + "learning_rate": 9.990275085491967e-05, + "loss": 4.114, + "step": 1577 + }, + { + "epoch": 1.0167606768734891, + "grad_norm": 1.8107060368286536, + "learning_rate": 9.990262512942578e-05, + "loss": 4.2002, + "step": 1578 + }, + { + "epoch": 1.0174053182917002, + "grad_norm": 2.4878267587032803, + "learning_rate": 9.990249932279431e-05, + "loss": 3.9009, + "step": 1579 + }, + { + "epoch": 1.0180499597099113, + "grad_norm": 2.9941311661328003, + "learning_rate": 9.990237343502544e-05, + "loss": 3.8908, + "step": 1580 + }, + { + "epoch": 1.0186946011281224, + "grad_norm": 1.8769089212230876, + "learning_rate": 9.990224746611935e-05, + "loss": 4.0129, + "step": 1581 + }, + { + "epoch": 1.0193392425463337, + "grad_norm": 2.613338942595072, + "learning_rate": 9.990212141607624e-05, + "loss": 3.4751, + "step": 1582 + }, + { + "epoch": 1.0199838839645448, + "grad_norm": 3.8002984523154466, + "learning_rate": 9.990199528489639e-05, + "loss": 4.2505, + "step": 1583 + }, + { + "epoch": 1.0206285253827558, + "grad_norm": 2.19926851421206, + "learning_rate": 9.990186907257992e-05, + "loss": 3.9855, + "step": 1584 + }, + { + "epoch": 1.021273166800967, + "grad_norm": 2.5862093417916414, + "learning_rate": 9.990174277912708e-05, + "loss": 3.6891, + "step": 1585 + }, + { + "epoch": 1.021917808219178, + "grad_norm": 2.8316635419258396, + "learning_rate": 9.99016164045381e-05, + "loss": 4.0644, + "step": 1586 + }, + { + "epoch": 1.0225624496373893, + "grad_norm": 1.9223346304655007, + "learning_rate": 9.990148994881312e-05, + "loss": 4.0486, + "step": 1587 + }, + { + "epoch": 1.0232070910556004, + "grad_norm": 3.715093157507281, + "learning_rate": 9.990136341195242e-05, + "loss": 3.8477, + "step": 1588 + }, + { + "epoch": 1.0238517324738114, + "grad_norm": 3.640444026830007, + "learning_rate": 9.990123679395617e-05, + "loss": 3.6647, + "step": 1589 + }, + { + "epoch": 1.0244963738920225, + "grad_norm": 1.669448521611858, + "learning_rate": 9.990111009482458e-05, + "loss": 3.8342, + "step": 1590 + }, + { + "epoch": 1.0251410153102336, + "grad_norm": 3.349055278619222, + "learning_rate": 9.990098331455785e-05, + "loss": 3.8846, + "step": 1591 + }, + { + "epoch": 1.025785656728445, + "grad_norm": 2.829914619466856, + "learning_rate": 9.990085645315623e-05, + "loss": 3.9582, + "step": 1592 + }, + { + "epoch": 1.026430298146656, + "grad_norm": 1.9019400270919455, + "learning_rate": 9.990072951061987e-05, + "loss": 4.1208, + "step": 1593 + }, + { + "epoch": 1.027074939564867, + "grad_norm": 2.807072784450782, + "learning_rate": 9.990060248694902e-05, + "loss": 4.2439, + "step": 1594 + }, + { + "epoch": 1.0277195809830781, + "grad_norm": 2.2877827328367872, + "learning_rate": 9.990047538214386e-05, + "loss": 4.2013, + "step": 1595 + }, + { + "epoch": 1.0283642224012892, + "grad_norm": 2.681579475474968, + "learning_rate": 9.990034819620463e-05, + "loss": 4.022, + "step": 1596 + }, + { + "epoch": 1.0290088638195003, + "grad_norm": 1.9072522987484506, + "learning_rate": 9.990022092913153e-05, + "loss": 3.9764, + "step": 1597 + }, + { + "epoch": 1.0296535052377116, + "grad_norm": 2.1968839663331763, + "learning_rate": 9.990009358092475e-05, + "loss": 4.0846, + "step": 1598 + }, + { + "epoch": 1.0302981466559227, + "grad_norm": 1.9105667484512296, + "learning_rate": 9.989996615158452e-05, + "loss": 3.666, + "step": 1599 + }, + { + "epoch": 1.0309427880741338, + "grad_norm": 2.5169499440419947, + "learning_rate": 9.989983864111104e-05, + "loss": 3.7193, + "step": 1600 + }, + { + "epoch": 1.0309427880741338, + "eval_loss": 4.174848556518555, + "eval_runtime": 2.9653, + "eval_samples_per_second": 33.723, + "eval_steps_per_second": 4.384, + "step": 1600 + }, + { + "epoch": 1.0315874294923448, + "grad_norm": 2.9522112602501793, + "learning_rate": 9.989971104950451e-05, + "loss": 3.4683, + "step": 1601 + }, + { + "epoch": 1.032232070910556, + "grad_norm": 2.081784180532212, + "learning_rate": 9.989958337676516e-05, + "loss": 3.9789, + "step": 1602 + }, + { + "epoch": 1.0328767123287672, + "grad_norm": 1.9782369762420446, + "learning_rate": 9.989945562289319e-05, + "loss": 4.0776, + "step": 1603 + }, + { + "epoch": 1.0335213537469783, + "grad_norm": 1.9532403682175483, + "learning_rate": 9.989932778788879e-05, + "loss": 3.8881, + "step": 1604 + }, + { + "epoch": 1.0341659951651894, + "grad_norm": 1.8006132339324428, + "learning_rate": 9.98991998717522e-05, + "loss": 4.0446, + "step": 1605 + }, + { + "epoch": 1.0348106365834004, + "grad_norm": 1.9841322356523032, + "learning_rate": 9.989907187448363e-05, + "loss": 4.1025, + "step": 1606 + }, + { + "epoch": 1.0354552780016115, + "grad_norm": 1.8951627662116266, + "learning_rate": 9.989894379608328e-05, + "loss": 3.9916, + "step": 1607 + }, + { + "epoch": 1.0360999194198228, + "grad_norm": 1.8419164345722556, + "learning_rate": 9.989881563655135e-05, + "loss": 4.1194, + "step": 1608 + }, + { + "epoch": 1.036744560838034, + "grad_norm": 1.5350240116277494, + "learning_rate": 9.989868739588805e-05, + "loss": 4.2487, + "step": 1609 + }, + { + "epoch": 1.037389202256245, + "grad_norm": 2.3650184874461733, + "learning_rate": 9.989855907409362e-05, + "loss": 4.1009, + "step": 1610 + }, + { + "epoch": 1.038033843674456, + "grad_norm": 1.5622367604770306, + "learning_rate": 9.989843067116824e-05, + "loss": 4.0171, + "step": 1611 + }, + { + "epoch": 1.0386784850926671, + "grad_norm": 2.1396308164540785, + "learning_rate": 9.989830218711215e-05, + "loss": 4.2009, + "step": 1612 + }, + { + "epoch": 1.0393231265108782, + "grad_norm": 1.9674938204139127, + "learning_rate": 9.989817362192552e-05, + "loss": 4.3475, + "step": 1613 + }, + { + "epoch": 1.0399677679290895, + "grad_norm": 2.1336332602035317, + "learning_rate": 9.98980449756086e-05, + "loss": 4.0328, + "step": 1614 + }, + { + "epoch": 1.0406124093473006, + "grad_norm": 1.8449614766799225, + "learning_rate": 9.98979162481616e-05, + "loss": 4.1189, + "step": 1615 + }, + { + "epoch": 1.0412570507655117, + "grad_norm": 2.2527841251031826, + "learning_rate": 9.989778743958468e-05, + "loss": 3.8065, + "step": 1616 + }, + { + "epoch": 1.0419016921837227, + "grad_norm": 2.319548071555923, + "learning_rate": 9.989765854987812e-05, + "loss": 4.2641, + "step": 1617 + }, + { + "epoch": 1.0425463336019338, + "grad_norm": 2.448717407409401, + "learning_rate": 9.989752957904209e-05, + "loss": 3.7273, + "step": 1618 + }, + { + "epoch": 1.0431909750201451, + "grad_norm": 4.8451862033194155, + "learning_rate": 9.989740052707682e-05, + "loss": 4.0234, + "step": 1619 + }, + { + "epoch": 1.0438356164383562, + "grad_norm": 6.10597755622432, + "learning_rate": 9.98972713939825e-05, + "loss": 3.7916, + "step": 1620 + }, + { + "epoch": 1.0444802578565673, + "grad_norm": 3.4455623894550595, + "learning_rate": 9.989714217975936e-05, + "loss": 3.8017, + "step": 1621 + }, + { + "epoch": 1.0451248992747784, + "grad_norm": 2.841127554663438, + "learning_rate": 9.989701288440762e-05, + "loss": 4.1496, + "step": 1622 + }, + { + "epoch": 1.0457695406929894, + "grad_norm": 3.2442582968401603, + "learning_rate": 9.989688350792747e-05, + "loss": 3.8385, + "step": 1623 + }, + { + "epoch": 1.0464141821112007, + "grad_norm": 2.217954790758494, + "learning_rate": 9.989675405031915e-05, + "loss": 4.3124, + "step": 1624 + }, + { + "epoch": 1.0470588235294118, + "grad_norm": 3.153212613700778, + "learning_rate": 9.989662451158284e-05, + "loss": 4.2115, + "step": 1625 + }, + { + "epoch": 1.047703464947623, + "grad_norm": 2.468657159080837, + "learning_rate": 9.989649489171879e-05, + "loss": 3.7662, + "step": 1626 + }, + { + "epoch": 1.048348106365834, + "grad_norm": 2.631615652814648, + "learning_rate": 9.989636519072715e-05, + "loss": 3.9367, + "step": 1627 + }, + { + "epoch": 1.048992747784045, + "grad_norm": 2.4615008203473625, + "learning_rate": 9.989623540860822e-05, + "loss": 3.8861, + "step": 1628 + }, + { + "epoch": 1.0496373892022564, + "grad_norm": 2.7059626152988048, + "learning_rate": 9.989610554536214e-05, + "loss": 3.7547, + "step": 1629 + }, + { + "epoch": 1.0502820306204674, + "grad_norm": 2.2086484087494638, + "learning_rate": 9.989597560098916e-05, + "loss": 3.8371, + "step": 1630 + }, + { + "epoch": 1.0509266720386785, + "grad_norm": 2.981456198040469, + "learning_rate": 9.989584557548949e-05, + "loss": 4.0454, + "step": 1631 + }, + { + "epoch": 1.0515713134568896, + "grad_norm": 2.247541593576782, + "learning_rate": 9.989571546886333e-05, + "loss": 4.0057, + "step": 1632 + }, + { + "epoch": 1.0522159548751007, + "grad_norm": 3.2292066884790307, + "learning_rate": 9.989558528111092e-05, + "loss": 3.8109, + "step": 1633 + }, + { + "epoch": 1.0528605962933117, + "grad_norm": 2.5168069128047574, + "learning_rate": 9.989545501223243e-05, + "loss": 4.069, + "step": 1634 + }, + { + "epoch": 1.053505237711523, + "grad_norm": 2.13085919060896, + "learning_rate": 9.989532466222812e-05, + "loss": 4.018, + "step": 1635 + }, + { + "epoch": 1.0541498791297341, + "grad_norm": 1.6724418483261907, + "learning_rate": 9.989519423109818e-05, + "loss": 4.006, + "step": 1636 + }, + { + "epoch": 1.0547945205479452, + "grad_norm": 2.471873719208773, + "learning_rate": 9.98950637188428e-05, + "loss": 3.7679, + "step": 1637 + }, + { + "epoch": 1.0554391619661563, + "grad_norm": 2.1523973333901445, + "learning_rate": 9.989493312546225e-05, + "loss": 4.2439, + "step": 1638 + }, + { + "epoch": 1.0560838033843674, + "grad_norm": 2.3194101203587203, + "learning_rate": 9.989480245095672e-05, + "loss": 4.0572, + "step": 1639 + }, + { + "epoch": 1.0567284448025787, + "grad_norm": 1.6848020137600248, + "learning_rate": 9.989467169532641e-05, + "loss": 4.2436, + "step": 1640 + }, + { + "epoch": 1.0573730862207897, + "grad_norm": 1.8802854103289632, + "learning_rate": 9.989454085857156e-05, + "loss": 4.2225, + "step": 1641 + }, + { + "epoch": 1.0580177276390008, + "grad_norm": 1.9663257808629886, + "learning_rate": 9.989440994069236e-05, + "loss": 3.8273, + "step": 1642 + }, + { + "epoch": 1.058662369057212, + "grad_norm": 2.171120446713258, + "learning_rate": 9.989427894168903e-05, + "loss": 4.054, + "step": 1643 + }, + { + "epoch": 1.059307010475423, + "grad_norm": 2.4717867479650586, + "learning_rate": 9.989414786156181e-05, + "loss": 4.132, + "step": 1644 + }, + { + "epoch": 1.0599516518936343, + "grad_norm": 2.034615373319026, + "learning_rate": 9.989401670031088e-05, + "loss": 4.5167, + "step": 1645 + }, + { + "epoch": 1.0605962933118454, + "grad_norm": 1.7545673961438044, + "learning_rate": 9.989388545793649e-05, + "loss": 3.882, + "step": 1646 + }, + { + "epoch": 1.0612409347300564, + "grad_norm": 1.8911040422816, + "learning_rate": 9.989375413443882e-05, + "loss": 4.1214, + "step": 1647 + }, + { + "epoch": 1.0618855761482675, + "grad_norm": 1.4458414911422692, + "learning_rate": 9.98936227298181e-05, + "loss": 4.0149, + "step": 1648 + }, + { + "epoch": 1.0625302175664786, + "grad_norm": 1.9604709341489464, + "learning_rate": 9.989349124407457e-05, + "loss": 3.8983, + "step": 1649 + }, + { + "epoch": 1.0631748589846897, + "grad_norm": 2.1167253133735184, + "learning_rate": 9.98933596772084e-05, + "loss": 4.0498, + "step": 1650 + }, + { + "epoch": 1.063819500402901, + "grad_norm": 2.31456302248963, + "learning_rate": 9.989322802921984e-05, + "loss": 3.7374, + "step": 1651 + }, + { + "epoch": 1.064464141821112, + "grad_norm": 1.8711957953232585, + "learning_rate": 9.989309630010911e-05, + "loss": 4.0039, + "step": 1652 + }, + { + "epoch": 1.0651087832393231, + "grad_norm": 1.8573141928385586, + "learning_rate": 9.98929644898764e-05, + "loss": 3.9539, + "step": 1653 + }, + { + "epoch": 1.0657534246575342, + "grad_norm": 1.5773976479827314, + "learning_rate": 9.989283259852196e-05, + "loss": 4.1813, + "step": 1654 + }, + { + "epoch": 1.0663980660757453, + "grad_norm": 1.9506339472140766, + "learning_rate": 9.989270062604596e-05, + "loss": 3.7305, + "step": 1655 + }, + { + "epoch": 1.0670427074939566, + "grad_norm": 1.787590813617889, + "learning_rate": 9.989256857244867e-05, + "loss": 3.9127, + "step": 1656 + }, + { + "epoch": 1.0676873489121677, + "grad_norm": 2.018083986682579, + "learning_rate": 9.989243643773026e-05, + "loss": 4.2811, + "step": 1657 + }, + { + "epoch": 1.0683319903303787, + "grad_norm": 2.1816710765748097, + "learning_rate": 9.989230422189096e-05, + "loss": 3.8697, + "step": 1658 + }, + { + "epoch": 1.0689766317485898, + "grad_norm": 2.4005840000177647, + "learning_rate": 9.989217192493102e-05, + "loss": 3.9789, + "step": 1659 + }, + { + "epoch": 1.069621273166801, + "grad_norm": 2.3724795820795763, + "learning_rate": 9.989203954685063e-05, + "loss": 4.1671, + "step": 1660 + }, + { + "epoch": 1.0702659145850122, + "grad_norm": 1.5748560916495937, + "learning_rate": 9.989190708765e-05, + "loss": 4.0743, + "step": 1661 + }, + { + "epoch": 1.0709105560032233, + "grad_norm": 2.4663890676438474, + "learning_rate": 9.989177454732935e-05, + "loss": 3.9368, + "step": 1662 + }, + { + "epoch": 1.0715551974214343, + "grad_norm": 3.1127130603451656, + "learning_rate": 9.989164192588892e-05, + "loss": 4.2971, + "step": 1663 + }, + { + "epoch": 1.0721998388396454, + "grad_norm": 1.9525638136869143, + "learning_rate": 9.989150922332889e-05, + "loss": 4.0206, + "step": 1664 + }, + { + "epoch": 1.0728444802578565, + "grad_norm": 2.2169153840534817, + "learning_rate": 9.989137643964952e-05, + "loss": 4.0424, + "step": 1665 + }, + { + "epoch": 1.0734891216760678, + "grad_norm": 2.815278404626884, + "learning_rate": 9.989124357485099e-05, + "loss": 4.0747, + "step": 1666 + }, + { + "epoch": 1.0741337630942789, + "grad_norm": 1.5967459926227936, + "learning_rate": 9.989111062893355e-05, + "loss": 3.8258, + "step": 1667 + }, + { + "epoch": 1.07477840451249, + "grad_norm": 3.9678061463699383, + "learning_rate": 9.98909776018974e-05, + "loss": 3.8521, + "step": 1668 + }, + { + "epoch": 1.075423045930701, + "grad_norm": 2.780424535090114, + "learning_rate": 9.989084449374278e-05, + "loss": 4.1547, + "step": 1669 + }, + { + "epoch": 1.0760676873489121, + "grad_norm": 2.269751780915827, + "learning_rate": 9.989071130446987e-05, + "loss": 4.0325, + "step": 1670 + }, + { + "epoch": 1.0767123287671232, + "grad_norm": 2.098384754163906, + "learning_rate": 9.989057803407893e-05, + "loss": 4.1171, + "step": 1671 + }, + { + "epoch": 1.0773569701853345, + "grad_norm": 1.5577559962247365, + "learning_rate": 9.989044468257014e-05, + "loss": 4.0192, + "step": 1672 + }, + { + "epoch": 1.0780016116035456, + "grad_norm": 2.120078688257591, + "learning_rate": 9.989031124994375e-05, + "loss": 4.0611, + "step": 1673 + }, + { + "epoch": 1.0786462530217567, + "grad_norm": 1.765335938666819, + "learning_rate": 9.989017773619997e-05, + "loss": 3.9715, + "step": 1674 + }, + { + "epoch": 1.0792908944399677, + "grad_norm": 2.1250392512696363, + "learning_rate": 9.989004414133902e-05, + "loss": 3.7627, + "step": 1675 + }, + { + "epoch": 1.0799355358581788, + "grad_norm": 2.0473010756262404, + "learning_rate": 9.988991046536111e-05, + "loss": 4.0205, + "step": 1676 + }, + { + "epoch": 1.08058017727639, + "grad_norm": 2.2017287574511877, + "learning_rate": 9.988977670826647e-05, + "loss": 4.038, + "step": 1677 + }, + { + "epoch": 1.0812248186946012, + "grad_norm": 2.618819690016994, + "learning_rate": 9.98896428700553e-05, + "loss": 4.5017, + "step": 1678 + }, + { + "epoch": 1.0818694601128123, + "grad_norm": 1.633509137948601, + "learning_rate": 9.988950895072784e-05, + "loss": 3.7669, + "step": 1679 + }, + { + "epoch": 1.0825141015310233, + "grad_norm": 2.4923677968805054, + "learning_rate": 9.988937495028431e-05, + "loss": 3.4521, + "step": 1680 + }, + { + "epoch": 1.0831587429492344, + "grad_norm": 2.1996753971614473, + "learning_rate": 9.988924086872494e-05, + "loss": 4.4848, + "step": 1681 + }, + { + "epoch": 1.0838033843674455, + "grad_norm": 1.4554861037545916, + "learning_rate": 9.988910670604991e-05, + "loss": 4.2968, + "step": 1682 + }, + { + "epoch": 1.0844480257856568, + "grad_norm": 1.6558917607610368, + "learning_rate": 9.988897246225948e-05, + "loss": 3.8912, + "step": 1683 + }, + { + "epoch": 1.0850926672038679, + "grad_norm": 1.7608322602531004, + "learning_rate": 9.988883813735386e-05, + "loss": 4.0925, + "step": 1684 + }, + { + "epoch": 1.085737308622079, + "grad_norm": 2.0289388701073063, + "learning_rate": 9.988870373133326e-05, + "loss": 3.9955, + "step": 1685 + }, + { + "epoch": 1.08638195004029, + "grad_norm": 1.671929454523853, + "learning_rate": 9.988856924419791e-05, + "loss": 3.9375, + "step": 1686 + }, + { + "epoch": 1.0870265914585011, + "grad_norm": 1.5553868507638393, + "learning_rate": 9.988843467594804e-05, + "loss": 4.2969, + "step": 1687 + }, + { + "epoch": 1.0876712328767124, + "grad_norm": 1.8650964456897894, + "learning_rate": 9.988830002658385e-05, + "loss": 3.8296, + "step": 1688 + }, + { + "epoch": 1.0883158742949235, + "grad_norm": 1.8742779288293305, + "learning_rate": 9.988816529610557e-05, + "loss": 4.2066, + "step": 1689 + }, + { + "epoch": 1.0889605157131346, + "grad_norm": 1.8001216899385712, + "learning_rate": 9.988803048451342e-05, + "loss": 4.0371, + "step": 1690 + }, + { + "epoch": 1.0896051571313456, + "grad_norm": 2.080063346296714, + "learning_rate": 9.988789559180762e-05, + "loss": 4.0618, + "step": 1691 + }, + { + "epoch": 1.0902497985495567, + "grad_norm": 2.0072640385685325, + "learning_rate": 9.988776061798842e-05, + "loss": 4.009, + "step": 1692 + }, + { + "epoch": 1.090894439967768, + "grad_norm": 2.097403870728721, + "learning_rate": 9.9887625563056e-05, + "loss": 3.8818, + "step": 1693 + }, + { + "epoch": 1.091539081385979, + "grad_norm": 2.6788268125993273, + "learning_rate": 9.98874904270106e-05, + "loss": 4.0341, + "step": 1694 + }, + { + "epoch": 1.0921837228041902, + "grad_norm": 2.2458643114599575, + "learning_rate": 9.988735520985245e-05, + "loss": 3.8973, + "step": 1695 + }, + { + "epoch": 1.0928283642224013, + "grad_norm": 1.9219576891010872, + "learning_rate": 9.988721991158173e-05, + "loss": 4.048, + "step": 1696 + }, + { + "epoch": 1.0934730056406123, + "grad_norm": 1.999577672171787, + "learning_rate": 9.988708453219873e-05, + "loss": 3.8526, + "step": 1697 + }, + { + "epoch": 1.0941176470588236, + "grad_norm": 2.0612703218999027, + "learning_rate": 9.98869490717036e-05, + "loss": 4.0981, + "step": 1698 + }, + { + "epoch": 1.0947622884770347, + "grad_norm": 2.9455676428524424, + "learning_rate": 9.988681353009664e-05, + "loss": 4.0124, + "step": 1699 + }, + { + "epoch": 1.0954069298952458, + "grad_norm": 3.893944546332383, + "learning_rate": 9.9886677907378e-05, + "loss": 4.0464, + "step": 1700 + }, + { + "epoch": 1.0954069298952458, + "eval_loss": 4.151337623596191, + "eval_runtime": 2.9755, + "eval_samples_per_second": 33.608, + "eval_steps_per_second": 4.369, + "step": 1700 + }, + { + "epoch": 1.0960515713134569, + "grad_norm": 2.286090850887206, + "learning_rate": 9.988654220354795e-05, + "loss": 3.9937, + "step": 1701 + }, + { + "epoch": 1.096696212731668, + "grad_norm": 2.4799286041178705, + "learning_rate": 9.988640641860672e-05, + "loss": 3.7009, + "step": 1702 + }, + { + "epoch": 1.097340854149879, + "grad_norm": 3.269036445966811, + "learning_rate": 9.988627055255448e-05, + "loss": 4.2686, + "step": 1703 + }, + { + "epoch": 1.0979854955680903, + "grad_norm": 2.122919576469794, + "learning_rate": 9.98861346053915e-05, + "loss": 4.113, + "step": 1704 + }, + { + "epoch": 1.0986301369863014, + "grad_norm": 2.459850331072532, + "learning_rate": 9.988599857711796e-05, + "loss": 3.9982, + "step": 1705 + }, + { + "epoch": 1.0992747784045125, + "grad_norm": 1.986101207930958, + "learning_rate": 9.988586246773415e-05, + "loss": 4.0301, + "step": 1706 + }, + { + "epoch": 1.0999194198227236, + "grad_norm": 2.971704182410924, + "learning_rate": 9.988572627724025e-05, + "loss": 3.8354, + "step": 1707 + }, + { + "epoch": 1.1005640612409346, + "grad_norm": 1.8463928821909004, + "learning_rate": 9.988559000563645e-05, + "loss": 3.9625, + "step": 1708 + }, + { + "epoch": 1.101208702659146, + "grad_norm": 2.474889377043244, + "learning_rate": 9.988545365292304e-05, + "loss": 4.2057, + "step": 1709 + }, + { + "epoch": 1.101853344077357, + "grad_norm": 2.3656901303332023, + "learning_rate": 9.988531721910023e-05, + "loss": 3.8039, + "step": 1710 + }, + { + "epoch": 1.102497985495568, + "grad_norm": 2.564114795717795, + "learning_rate": 9.98851807041682e-05, + "loss": 3.8806, + "step": 1711 + }, + { + "epoch": 1.1031426269137792, + "grad_norm": 2.7047102705448514, + "learning_rate": 9.988504410812721e-05, + "loss": 3.8367, + "step": 1712 + }, + { + "epoch": 1.1037872683319903, + "grad_norm": 2.7166827940684857, + "learning_rate": 9.988490743097751e-05, + "loss": 4.0502, + "step": 1713 + }, + { + "epoch": 1.1044319097502013, + "grad_norm": 2.9296132724081967, + "learning_rate": 9.988477067271924e-05, + "loss": 3.7991, + "step": 1714 + }, + { + "epoch": 1.1050765511684126, + "grad_norm": 3.0824288163461286, + "learning_rate": 9.988463383335271e-05, + "loss": 4.4135, + "step": 1715 + }, + { + "epoch": 1.1057211925866237, + "grad_norm": 2.352577514668447, + "learning_rate": 9.988449691287812e-05, + "loss": 4.3055, + "step": 1716 + }, + { + "epoch": 1.1063658340048348, + "grad_norm": 2.266663303691391, + "learning_rate": 9.988435991129567e-05, + "loss": 4.329, + "step": 1717 + }, + { + "epoch": 1.1070104754230459, + "grad_norm": 1.9851905679827282, + "learning_rate": 9.98842228286056e-05, + "loss": 3.8681, + "step": 1718 + }, + { + "epoch": 1.107655116841257, + "grad_norm": 2.639253488479525, + "learning_rate": 9.988408566480814e-05, + "loss": 4.0991, + "step": 1719 + }, + { + "epoch": 1.1082997582594682, + "grad_norm": 1.8860733468185769, + "learning_rate": 9.988394841990352e-05, + "loss": 4.2269, + "step": 1720 + }, + { + "epoch": 1.1089443996776793, + "grad_norm": 1.9731321519824423, + "learning_rate": 9.988381109389195e-05, + "loss": 4.2461, + "step": 1721 + }, + { + "epoch": 1.1095890410958904, + "grad_norm": 2.5698376131180303, + "learning_rate": 9.988367368677368e-05, + "loss": 4.2563, + "step": 1722 + }, + { + "epoch": 1.1102336825141015, + "grad_norm": 1.674584530406128, + "learning_rate": 9.98835361985489e-05, + "loss": 3.9962, + "step": 1723 + }, + { + "epoch": 1.1108783239323126, + "grad_norm": 1.9388546816462662, + "learning_rate": 9.988339862921786e-05, + "loss": 3.9599, + "step": 1724 + }, + { + "epoch": 1.1115229653505239, + "grad_norm": 1.7115321366622684, + "learning_rate": 9.988326097878079e-05, + "loss": 4.153, + "step": 1725 + }, + { + "epoch": 1.112167606768735, + "grad_norm": 1.8565266668323264, + "learning_rate": 9.988312324723791e-05, + "loss": 4.0143, + "step": 1726 + }, + { + "epoch": 1.112812248186946, + "grad_norm": 1.7288227006430414, + "learning_rate": 9.988298543458941e-05, + "loss": 3.9377, + "step": 1727 + }, + { + "epoch": 1.113456889605157, + "grad_norm": 2.4479815748465996, + "learning_rate": 9.988284754083557e-05, + "loss": 3.9424, + "step": 1728 + }, + { + "epoch": 1.1141015310233682, + "grad_norm": 1.5772052453832803, + "learning_rate": 9.98827095659766e-05, + "loss": 3.9653, + "step": 1729 + }, + { + "epoch": 1.1147461724415795, + "grad_norm": 2.0074475668130187, + "learning_rate": 9.988257151001274e-05, + "loss": 3.9701, + "step": 1730 + }, + { + "epoch": 1.1153908138597906, + "grad_norm": 2.1643254762267623, + "learning_rate": 9.988243337294418e-05, + "loss": 3.7033, + "step": 1731 + }, + { + "epoch": 1.1160354552780016, + "grad_norm": 1.8933556616409832, + "learning_rate": 9.988229515477118e-05, + "loss": 3.7641, + "step": 1732 + }, + { + "epoch": 1.1166800966962127, + "grad_norm": 2.180558157945113, + "learning_rate": 9.988215685549393e-05, + "loss": 3.7359, + "step": 1733 + }, + { + "epoch": 1.1173247381144238, + "grad_norm": 2.942179193261661, + "learning_rate": 9.988201847511269e-05, + "loss": 4.0519, + "step": 1734 + }, + { + "epoch": 1.117969379532635, + "grad_norm": 2.7741404057660852, + "learning_rate": 9.988188001362768e-05, + "loss": 3.9713, + "step": 1735 + }, + { + "epoch": 1.1186140209508462, + "grad_norm": 2.4337591566791335, + "learning_rate": 9.988174147103911e-05, + "loss": 4.3089, + "step": 1736 + }, + { + "epoch": 1.1192586623690572, + "grad_norm": 3.0544409926898926, + "learning_rate": 9.988160284734724e-05, + "loss": 3.8599, + "step": 1737 + }, + { + "epoch": 1.1199033037872683, + "grad_norm": 2.658545160549155, + "learning_rate": 9.988146414255229e-05, + "loss": 4.1745, + "step": 1738 + }, + { + "epoch": 1.1205479452054794, + "grad_norm": 3.4932073781742785, + "learning_rate": 9.988132535665445e-05, + "loss": 4.0687, + "step": 1739 + }, + { + "epoch": 1.1211925866236905, + "grad_norm": 2.5024079971396485, + "learning_rate": 9.988118648965399e-05, + "loss": 3.9291, + "step": 1740 + }, + { + "epoch": 1.1218372280419018, + "grad_norm": 1.8132680208186058, + "learning_rate": 9.988104754155111e-05, + "loss": 4.3266, + "step": 1741 + }, + { + "epoch": 1.1224818694601129, + "grad_norm": 3.1812323413774233, + "learning_rate": 9.988090851234608e-05, + "loss": 3.9593, + "step": 1742 + }, + { + "epoch": 1.123126510878324, + "grad_norm": 2.4020888427856297, + "learning_rate": 9.988076940203908e-05, + "loss": 3.7655, + "step": 1743 + }, + { + "epoch": 1.123771152296535, + "grad_norm": 2.3847625814418953, + "learning_rate": 9.988063021063037e-05, + "loss": 4.1167, + "step": 1744 + }, + { + "epoch": 1.124415793714746, + "grad_norm": 2.876723898483236, + "learning_rate": 9.988049093812018e-05, + "loss": 4.1772, + "step": 1745 + }, + { + "epoch": 1.1250604351329572, + "grad_norm": 1.8987797039045657, + "learning_rate": 9.988035158450871e-05, + "loss": 4.2816, + "step": 1746 + }, + { + "epoch": 1.1257050765511685, + "grad_norm": 2.2422532035571394, + "learning_rate": 9.988021214979621e-05, + "loss": 3.9692, + "step": 1747 + }, + { + "epoch": 1.1263497179693795, + "grad_norm": 1.9880372979015941, + "learning_rate": 9.988007263398289e-05, + "loss": 3.8898, + "step": 1748 + }, + { + "epoch": 1.1269943593875906, + "grad_norm": 2.9479020219952723, + "learning_rate": 9.9879933037069e-05, + "loss": 3.9322, + "step": 1749 + }, + { + "epoch": 1.1276390008058017, + "grad_norm": 2.084125301661699, + "learning_rate": 9.987979335905477e-05, + "loss": 4.1222, + "step": 1750 + }, + { + "epoch": 1.1282836422240128, + "grad_norm": 2.529869952251121, + "learning_rate": 9.987965359994042e-05, + "loss": 3.7897, + "step": 1751 + }, + { + "epoch": 1.128928283642224, + "grad_norm": 2.551560303946689, + "learning_rate": 9.987951375972618e-05, + "loss": 4.2093, + "step": 1752 + }, + { + "epoch": 1.1295729250604352, + "grad_norm": 1.980282159130903, + "learning_rate": 9.987937383841228e-05, + "loss": 4.2115, + "step": 1753 + }, + { + "epoch": 1.1302175664786462, + "grad_norm": 2.2734969051022493, + "learning_rate": 9.987923383599894e-05, + "loss": 4.2315, + "step": 1754 + }, + { + "epoch": 1.1308622078968573, + "grad_norm": 2.17710654842764, + "learning_rate": 9.987909375248643e-05, + "loss": 3.8797, + "step": 1755 + }, + { + "epoch": 1.1315068493150684, + "grad_norm": 2.1425692704912445, + "learning_rate": 9.987895358787492e-05, + "loss": 3.7631, + "step": 1756 + }, + { + "epoch": 1.1321514907332797, + "grad_norm": 1.8979234856505818, + "learning_rate": 9.98788133421647e-05, + "loss": 3.5245, + "step": 1757 + }, + { + "epoch": 1.1327961321514908, + "grad_norm": 1.4760517510167022, + "learning_rate": 9.987867301535595e-05, + "loss": 3.6849, + "step": 1758 + }, + { + "epoch": 1.1334407735697019, + "grad_norm": 1.6809150954130114, + "learning_rate": 9.987853260744894e-05, + "loss": 4.019, + "step": 1759 + }, + { + "epoch": 1.134085414987913, + "grad_norm": 2.1377553802986062, + "learning_rate": 9.987839211844387e-05, + "loss": 4.3091, + "step": 1760 + }, + { + "epoch": 1.134730056406124, + "grad_norm": 2.031132280375137, + "learning_rate": 9.987825154834099e-05, + "loss": 4.0106, + "step": 1761 + }, + { + "epoch": 1.1353746978243353, + "grad_norm": 2.0547532849785735, + "learning_rate": 9.98781108971405e-05, + "loss": 4.1345, + "step": 1762 + }, + { + "epoch": 1.1360193392425464, + "grad_norm": 2.3890742627583403, + "learning_rate": 9.987797016484269e-05, + "loss": 4.2415, + "step": 1763 + }, + { + "epoch": 1.1366639806607575, + "grad_norm": 1.5850732464410109, + "learning_rate": 9.987782935144775e-05, + "loss": 3.9407, + "step": 1764 + }, + { + "epoch": 1.1373086220789685, + "grad_norm": 2.2391394662775483, + "learning_rate": 9.987768845695589e-05, + "loss": 3.9583, + "step": 1765 + }, + { + "epoch": 1.1379532634971796, + "grad_norm": 3.2693234968494482, + "learning_rate": 9.98775474813674e-05, + "loss": 3.7463, + "step": 1766 + }, + { + "epoch": 1.138597904915391, + "grad_norm": 2.6571194417296504, + "learning_rate": 9.987740642468248e-05, + "loss": 3.9873, + "step": 1767 + }, + { + "epoch": 1.139242546333602, + "grad_norm": 1.6439502716426346, + "learning_rate": 9.987726528690134e-05, + "loss": 3.981, + "step": 1768 + }, + { + "epoch": 1.139887187751813, + "grad_norm": 2.1120313002512185, + "learning_rate": 9.987712406802424e-05, + "loss": 4.168, + "step": 1769 + }, + { + "epoch": 1.1405318291700242, + "grad_norm": 1.7331021424633688, + "learning_rate": 9.987698276805142e-05, + "loss": 4.0185, + "step": 1770 + }, + { + "epoch": 1.1411764705882352, + "grad_norm": 1.6622923797221245, + "learning_rate": 9.98768413869831e-05, + "loss": 3.8232, + "step": 1771 + }, + { + "epoch": 1.1418211120064465, + "grad_norm": 2.1216525309085834, + "learning_rate": 9.98766999248195e-05, + "loss": 4.3118, + "step": 1772 + }, + { + "epoch": 1.1424657534246576, + "grad_norm": 1.49690562734132, + "learning_rate": 9.987655838156084e-05, + "loss": 4.4311, + "step": 1773 + }, + { + "epoch": 1.1431103948428687, + "grad_norm": 1.8971208215317883, + "learning_rate": 9.987641675720743e-05, + "loss": 4.2814, + "step": 1774 + }, + { + "epoch": 1.1437550362610798, + "grad_norm": 2.085487411117836, + "learning_rate": 9.987627505175941e-05, + "loss": 3.7476, + "step": 1775 + }, + { + "epoch": 1.1443996776792908, + "grad_norm": 1.9589998010563194, + "learning_rate": 9.987613326521705e-05, + "loss": 4.0861, + "step": 1776 + }, + { + "epoch": 1.145044319097502, + "grad_norm": 2.5269041166185486, + "learning_rate": 9.98759913975806e-05, + "loss": 4.1722, + "step": 1777 + }, + { + "epoch": 1.1456889605157132, + "grad_norm": 2.0610434476757007, + "learning_rate": 9.987584944885025e-05, + "loss": 4.3665, + "step": 1778 + }, + { + "epoch": 1.1463336019339243, + "grad_norm": 1.5199547469886772, + "learning_rate": 9.987570741902627e-05, + "loss": 4.1803, + "step": 1779 + }, + { + "epoch": 1.1469782433521354, + "grad_norm": 1.726974181981382, + "learning_rate": 9.98755653081089e-05, + "loss": 4.3246, + "step": 1780 + }, + { + "epoch": 1.1476228847703465, + "grad_norm": 1.9197713857097998, + "learning_rate": 9.987542311609835e-05, + "loss": 4.0594, + "step": 1781 + }, + { + "epoch": 1.1482675261885575, + "grad_norm": 1.5661541499528553, + "learning_rate": 9.987528084299487e-05, + "loss": 3.8765, + "step": 1782 + }, + { + "epoch": 1.1489121676067686, + "grad_norm": 1.782424143523922, + "learning_rate": 9.987513848879866e-05, + "loss": 4.4166, + "step": 1783 + }, + { + "epoch": 1.14955680902498, + "grad_norm": 2.2091872558945127, + "learning_rate": 9.987499605350999e-05, + "loss": 4.0629, + "step": 1784 + }, + { + "epoch": 1.150201450443191, + "grad_norm": 1.7570237374099884, + "learning_rate": 9.987485353712906e-05, + "loss": 3.9182, + "step": 1785 + }, + { + "epoch": 1.150846091861402, + "grad_norm": 2.0084259680736958, + "learning_rate": 9.987471093965614e-05, + "loss": 4.2801, + "step": 1786 + }, + { + "epoch": 1.1514907332796132, + "grad_norm": 1.7527227165332855, + "learning_rate": 9.987456826109148e-05, + "loss": 3.9127, + "step": 1787 + }, + { + "epoch": 1.1521353746978242, + "grad_norm": 2.6534583797898343, + "learning_rate": 9.987442550143525e-05, + "loss": 3.6899, + "step": 1788 + }, + { + "epoch": 1.1527800161160355, + "grad_norm": 2.5342483932656976, + "learning_rate": 9.98742826606877e-05, + "loss": 4.0666, + "step": 1789 + }, + { + "epoch": 1.1534246575342466, + "grad_norm": 2.6166236706295174, + "learning_rate": 9.987413973884911e-05, + "loss": 3.5084, + "step": 1790 + }, + { + "epoch": 1.1540692989524577, + "grad_norm": 2.7890576150850235, + "learning_rate": 9.987399673591968e-05, + "loss": 3.9769, + "step": 1791 + }, + { + "epoch": 1.1547139403706688, + "grad_norm": 1.6208209074052944, + "learning_rate": 9.987385365189967e-05, + "loss": 4.0587, + "step": 1792 + }, + { + "epoch": 1.1553585817888798, + "grad_norm": 2.3232307141746587, + "learning_rate": 9.987371048678927e-05, + "loss": 4.1813, + "step": 1793 + }, + { + "epoch": 1.1560032232070911, + "grad_norm": 2.3318106837904287, + "learning_rate": 9.987356724058877e-05, + "loss": 4.2278, + "step": 1794 + }, + { + "epoch": 1.1566478646253022, + "grad_norm": 1.5794673561017498, + "learning_rate": 9.987342391329836e-05, + "loss": 3.7768, + "step": 1795 + }, + { + "epoch": 1.1572925060435133, + "grad_norm": 2.5252951824321603, + "learning_rate": 9.98732805049183e-05, + "loss": 4.0385, + "step": 1796 + }, + { + "epoch": 1.1579371474617244, + "grad_norm": 2.534682626525382, + "learning_rate": 9.98731370154488e-05, + "loss": 4.1502, + "step": 1797 + }, + { + "epoch": 1.1585817888799355, + "grad_norm": 2.1021397039328993, + "learning_rate": 9.987299344489014e-05, + "loss": 3.8213, + "step": 1798 + }, + { + "epoch": 1.1592264302981468, + "grad_norm": 1.6689985927992406, + "learning_rate": 9.987284979324252e-05, + "loss": 4.2314, + "step": 1799 + }, + { + "epoch": 1.1598710717163578, + "grad_norm": 1.7009512041430892, + "learning_rate": 9.987270606050619e-05, + "loss": 3.6096, + "step": 1800 + }, + { + "epoch": 1.1598710717163578, + "eval_loss": 4.131776332855225, + "eval_runtime": 2.9785, + "eval_samples_per_second": 33.574, + "eval_steps_per_second": 4.365, + "step": 1800 + }, + { + "epoch": 1.160515713134569, + "grad_norm": 1.6801225916382196, + "learning_rate": 9.987256224668138e-05, + "loss": 3.9875, + "step": 1801 + }, + { + "epoch": 1.16116035455278, + "grad_norm": 1.6360670761234484, + "learning_rate": 9.987241835176832e-05, + "loss": 3.8988, + "step": 1802 + }, + { + "epoch": 1.161804995970991, + "grad_norm": 1.8889609741239772, + "learning_rate": 9.987227437576726e-05, + "loss": 4.0203, + "step": 1803 + }, + { + "epoch": 1.1624496373892024, + "grad_norm": 1.601432185646929, + "learning_rate": 9.987213031867844e-05, + "loss": 4.2486, + "step": 1804 + }, + { + "epoch": 1.1630942788074135, + "grad_norm": 1.715981552923504, + "learning_rate": 9.987198618050208e-05, + "loss": 4.1872, + "step": 1805 + }, + { + "epoch": 1.1637389202256245, + "grad_norm": 1.7674404355735864, + "learning_rate": 9.987184196123842e-05, + "loss": 3.8717, + "step": 1806 + }, + { + "epoch": 1.1643835616438356, + "grad_norm": 2.0625861584955834, + "learning_rate": 9.987169766088771e-05, + "loss": 3.623, + "step": 1807 + }, + { + "epoch": 1.1650282030620467, + "grad_norm": 3.157136276514062, + "learning_rate": 9.987155327945017e-05, + "loss": 3.8269, + "step": 1808 + }, + { + "epoch": 1.1656728444802578, + "grad_norm": 2.8097444929024498, + "learning_rate": 9.987140881692605e-05, + "loss": 3.9898, + "step": 1809 + }, + { + "epoch": 1.166317485898469, + "grad_norm": 2.122664422648581, + "learning_rate": 9.987126427331557e-05, + "loss": 3.6724, + "step": 1810 + }, + { + "epoch": 1.1669621273166801, + "grad_norm": 2.0916606678185032, + "learning_rate": 9.9871119648619e-05, + "loss": 4.002, + "step": 1811 + }, + { + "epoch": 1.1676067687348912, + "grad_norm": 1.9974877598066305, + "learning_rate": 9.987097494283655e-05, + "loss": 3.7406, + "step": 1812 + }, + { + "epoch": 1.1682514101531023, + "grad_norm": 1.8863723668319896, + "learning_rate": 9.987083015596847e-05, + "loss": 3.7292, + "step": 1813 + }, + { + "epoch": 1.1688960515713134, + "grad_norm": 1.3878788729530598, + "learning_rate": 9.987068528801498e-05, + "loss": 4.0767, + "step": 1814 + }, + { + "epoch": 1.1695406929895245, + "grad_norm": 1.8098728947442713, + "learning_rate": 9.987054033897634e-05, + "loss": 4.0636, + "step": 1815 + }, + { + "epoch": 1.1701853344077358, + "grad_norm": 1.7317687479071933, + "learning_rate": 9.987039530885277e-05, + "loss": 4.0332, + "step": 1816 + }, + { + "epoch": 1.1708299758259468, + "grad_norm": 1.8976469064391464, + "learning_rate": 9.987025019764453e-05, + "loss": 4.0595, + "step": 1817 + }, + { + "epoch": 1.171474617244158, + "grad_norm": 2.0599858087710845, + "learning_rate": 9.987010500535184e-05, + "loss": 4.3376, + "step": 1818 + }, + { + "epoch": 1.172119258662369, + "grad_norm": 1.875850623547248, + "learning_rate": 9.986995973197494e-05, + "loss": 3.8919, + "step": 1819 + }, + { + "epoch": 1.17276390008058, + "grad_norm": 1.6043036693256487, + "learning_rate": 9.986981437751409e-05, + "loss": 3.9493, + "step": 1820 + }, + { + "epoch": 1.1734085414987914, + "grad_norm": 1.6013770765786242, + "learning_rate": 9.98696689419695e-05, + "loss": 4.0795, + "step": 1821 + }, + { + "epoch": 1.1740531829170024, + "grad_norm": 1.3727263803066676, + "learning_rate": 9.986952342534142e-05, + "loss": 4.0635, + "step": 1822 + }, + { + "epoch": 1.1746978243352135, + "grad_norm": 1.4447399377727432, + "learning_rate": 9.986937782763008e-05, + "loss": 4.1039, + "step": 1823 + }, + { + "epoch": 1.1753424657534246, + "grad_norm": 1.774554463730614, + "learning_rate": 9.986923214883573e-05, + "loss": 4.2899, + "step": 1824 + }, + { + "epoch": 1.1759871071716357, + "grad_norm": 2.4303274002068034, + "learning_rate": 9.986908638895862e-05, + "loss": 3.918, + "step": 1825 + }, + { + "epoch": 1.176631748589847, + "grad_norm": 2.80084862936718, + "learning_rate": 9.986894054799898e-05, + "loss": 4.1367, + "step": 1826 + }, + { + "epoch": 1.177276390008058, + "grad_norm": 1.6789991280861813, + "learning_rate": 9.986879462595703e-05, + "loss": 4.2136, + "step": 1827 + }, + { + "epoch": 1.1779210314262691, + "grad_norm": 2.16655514659322, + "learning_rate": 9.986864862283303e-05, + "loss": 3.8378, + "step": 1828 + }, + { + "epoch": 1.1785656728444802, + "grad_norm": 2.772615064961051, + "learning_rate": 9.986850253862722e-05, + "loss": 3.9919, + "step": 1829 + }, + { + "epoch": 1.1792103142626913, + "grad_norm": 2.3431594929312336, + "learning_rate": 9.986835637333984e-05, + "loss": 4.064, + "step": 1830 + }, + { + "epoch": 1.1798549556809026, + "grad_norm": 1.5476625713981236, + "learning_rate": 9.986821012697112e-05, + "loss": 3.8515, + "step": 1831 + }, + { + "epoch": 1.1804995970991137, + "grad_norm": 2.9488840590848833, + "learning_rate": 9.986806379952132e-05, + "loss": 4.1497, + "step": 1832 + }, + { + "epoch": 1.1811442385173248, + "grad_norm": 1.8553261256061133, + "learning_rate": 9.986791739099065e-05, + "loss": 3.9807, + "step": 1833 + }, + { + "epoch": 1.1817888799355358, + "grad_norm": 2.134202121498314, + "learning_rate": 9.986777090137935e-05, + "loss": 4.133, + "step": 1834 + }, + { + "epoch": 1.182433521353747, + "grad_norm": 3.1209150475644876, + "learning_rate": 9.986762433068772e-05, + "loss": 4.0715, + "step": 1835 + }, + { + "epoch": 1.1830781627719582, + "grad_norm": 2.3175013274605516, + "learning_rate": 9.986747767891592e-05, + "loss": 4.2672, + "step": 1836 + }, + { + "epoch": 1.1837228041901693, + "grad_norm": 2.175040690691584, + "learning_rate": 9.986733094606424e-05, + "loss": 3.7198, + "step": 1837 + }, + { + "epoch": 1.1843674456083804, + "grad_norm": 2.8428579371650295, + "learning_rate": 9.986718413213293e-05, + "loss": 3.9031, + "step": 1838 + }, + { + "epoch": 1.1850120870265914, + "grad_norm": 2.2383096545365624, + "learning_rate": 9.98670372371222e-05, + "loss": 3.8312, + "step": 1839 + }, + { + "epoch": 1.1856567284448025, + "grad_norm": 2.7866878950880807, + "learning_rate": 9.98668902610323e-05, + "loss": 3.8492, + "step": 1840 + }, + { + "epoch": 1.1863013698630138, + "grad_norm": 2.5423352851298975, + "learning_rate": 9.986674320386346e-05, + "loss": 4.1903, + "step": 1841 + }, + { + "epoch": 1.186946011281225, + "grad_norm": 1.859085369910911, + "learning_rate": 9.986659606561594e-05, + "loss": 4.091, + "step": 1842 + }, + { + "epoch": 1.187590652699436, + "grad_norm": 2.0884184491257467, + "learning_rate": 9.986644884629e-05, + "loss": 4.0382, + "step": 1843 + }, + { + "epoch": 1.188235294117647, + "grad_norm": 2.050854090722569, + "learning_rate": 9.986630154588585e-05, + "loss": 3.8352, + "step": 1844 + }, + { + "epoch": 1.1888799355358581, + "grad_norm": 2.4697500731174244, + "learning_rate": 9.986615416440372e-05, + "loss": 4.0702, + "step": 1845 + }, + { + "epoch": 1.1895245769540692, + "grad_norm": 2.4672137494199906, + "learning_rate": 9.98660067018439e-05, + "loss": 4.0664, + "step": 1846 + }, + { + "epoch": 1.1901692183722805, + "grad_norm": 2.0677000909482564, + "learning_rate": 9.986585915820658e-05, + "loss": 4.025, + "step": 1847 + }, + { + "epoch": 1.1908138597904916, + "grad_norm": 1.7155849338451088, + "learning_rate": 9.986571153349205e-05, + "loss": 4.1867, + "step": 1848 + }, + { + "epoch": 1.1914585012087027, + "grad_norm": 2.5331026464749846, + "learning_rate": 9.986556382770052e-05, + "loss": 3.7619, + "step": 1849 + }, + { + "epoch": 1.1921031426269137, + "grad_norm": 2.330599639154227, + "learning_rate": 9.986541604083224e-05, + "loss": 3.6707, + "step": 1850 + }, + { + "epoch": 1.1927477840451248, + "grad_norm": 2.5154336467444334, + "learning_rate": 9.986526817288745e-05, + "loss": 3.9183, + "step": 1851 + }, + { + "epoch": 1.193392425463336, + "grad_norm": 2.4791924961139338, + "learning_rate": 9.986512022386642e-05, + "loss": 4.271, + "step": 1852 + }, + { + "epoch": 1.1940370668815472, + "grad_norm": 1.8826250302523608, + "learning_rate": 9.986497219376934e-05, + "loss": 4.1961, + "step": 1853 + }, + { + "epoch": 1.1946817082997583, + "grad_norm": 2.3583334605443733, + "learning_rate": 9.986482408259652e-05, + "loss": 4.1329, + "step": 1854 + }, + { + "epoch": 1.1953263497179694, + "grad_norm": 2.8431665010066283, + "learning_rate": 9.986467589034814e-05, + "loss": 3.8533, + "step": 1855 + }, + { + "epoch": 1.1959709911361804, + "grad_norm": 1.4818192688216174, + "learning_rate": 9.986452761702448e-05, + "loss": 4.0488, + "step": 1856 + }, + { + "epoch": 1.1966156325543915, + "grad_norm": 2.01594074806603, + "learning_rate": 9.986437926262576e-05, + "loss": 4.2512, + "step": 1857 + }, + { + "epoch": 1.1972602739726028, + "grad_norm": 1.8458960952790953, + "learning_rate": 9.986423082715228e-05, + "loss": 4.2576, + "step": 1858 + }, + { + "epoch": 1.197904915390814, + "grad_norm": 1.7349328197369136, + "learning_rate": 9.986408231060419e-05, + "loss": 4.2661, + "step": 1859 + }, + { + "epoch": 1.198549556809025, + "grad_norm": 1.851187014584169, + "learning_rate": 9.986393371298181e-05, + "loss": 3.8556, + "step": 1860 + }, + { + "epoch": 1.199194198227236, + "grad_norm": 1.6495377852471031, + "learning_rate": 9.986378503428536e-05, + "loss": 4.3747, + "step": 1861 + }, + { + "epoch": 1.1998388396454471, + "grad_norm": 1.7000236591447193, + "learning_rate": 9.986363627451508e-05, + "loss": 3.9121, + "step": 1862 + }, + { + "epoch": 1.2004834810636584, + "grad_norm": 1.7319747233503275, + "learning_rate": 9.986348743367123e-05, + "loss": 3.972, + "step": 1863 + }, + { + "epoch": 1.2011281224818695, + "grad_norm": 2.265485603242088, + "learning_rate": 9.986333851175402e-05, + "loss": 3.9694, + "step": 1864 + }, + { + "epoch": 1.2017727639000806, + "grad_norm": 2.2976973491182, + "learning_rate": 9.986318950876373e-05, + "loss": 4.334, + "step": 1865 + }, + { + "epoch": 1.2024174053182917, + "grad_norm": 1.9337860937154356, + "learning_rate": 9.986304042470058e-05, + "loss": 4.0627, + "step": 1866 + }, + { + "epoch": 1.2030620467365027, + "grad_norm": 1.6215821942061577, + "learning_rate": 9.986289125956485e-05, + "loss": 4.117, + "step": 1867 + }, + { + "epoch": 1.203706688154714, + "grad_norm": 1.5040038503453739, + "learning_rate": 9.986274201335674e-05, + "loss": 4.0313, + "step": 1868 + }, + { + "epoch": 1.2043513295729251, + "grad_norm": 1.733823603777781, + "learning_rate": 9.986259268607654e-05, + "loss": 3.9554, + "step": 1869 + }, + { + "epoch": 1.2049959709911362, + "grad_norm": 2.0830244750968263, + "learning_rate": 9.986244327772446e-05, + "loss": 3.7168, + "step": 1870 + }, + { + "epoch": 1.2056406124093473, + "grad_norm": 1.9132040124551664, + "learning_rate": 9.986229378830075e-05, + "loss": 3.9762, + "step": 1871 + }, + { + "epoch": 1.2062852538275584, + "grad_norm": 2.269411552294148, + "learning_rate": 9.986214421780567e-05, + "loss": 3.8522, + "step": 1872 + }, + { + "epoch": 1.2069298952457697, + "grad_norm": 1.5273391653519295, + "learning_rate": 9.986199456623945e-05, + "loss": 4.1851, + "step": 1873 + }, + { + "epoch": 1.2075745366639807, + "grad_norm": 1.974767732117611, + "learning_rate": 9.986184483360235e-05, + "loss": 4.2032, + "step": 1874 + }, + { + "epoch": 1.2082191780821918, + "grad_norm": 1.5911415511377123, + "learning_rate": 9.986169501989461e-05, + "loss": 3.8849, + "step": 1875 + }, + { + "epoch": 1.208863819500403, + "grad_norm": 1.3807526429452746, + "learning_rate": 9.986154512511649e-05, + "loss": 4.1909, + "step": 1876 + }, + { + "epoch": 1.209508460918614, + "grad_norm": 1.8871692772426834, + "learning_rate": 9.98613951492682e-05, + "loss": 3.8135, + "step": 1877 + }, + { + "epoch": 1.210153102336825, + "grad_norm": 2.9111826835307415, + "learning_rate": 9.986124509235002e-05, + "loss": 3.8694, + "step": 1878 + }, + { + "epoch": 1.2107977437550363, + "grad_norm": 3.957302382645077, + "learning_rate": 9.986109495436219e-05, + "loss": 4.0089, + "step": 1879 + }, + { + "epoch": 1.2114423851732474, + "grad_norm": 2.6737197667530714, + "learning_rate": 9.986094473530494e-05, + "loss": 4.3266, + "step": 1880 + }, + { + "epoch": 1.2120870265914585, + "grad_norm": 2.3654447737375457, + "learning_rate": 9.986079443517854e-05, + "loss": 3.9425, + "step": 1881 + }, + { + "epoch": 1.2127316680096696, + "grad_norm": 3.439810218037016, + "learning_rate": 9.986064405398322e-05, + "loss": 4.0516, + "step": 1882 + }, + { + "epoch": 1.2133763094278807, + "grad_norm": 2.062506654771073, + "learning_rate": 9.986049359171922e-05, + "loss": 4.0357, + "step": 1883 + }, + { + "epoch": 1.2140209508460917, + "grad_norm": 2.857493449736241, + "learning_rate": 9.986034304838683e-05, + "loss": 4.2558, + "step": 1884 + }, + { + "epoch": 1.214665592264303, + "grad_norm": 2.9560883747642417, + "learning_rate": 9.986019242398625e-05, + "loss": 3.9058, + "step": 1885 + }, + { + "epoch": 1.2153102336825141, + "grad_norm": 2.5524764512917617, + "learning_rate": 9.986004171851774e-05, + "loss": 4.1414, + "step": 1886 + }, + { + "epoch": 1.2159548751007252, + "grad_norm": 2.7582331455524387, + "learning_rate": 9.985989093198156e-05, + "loss": 3.9104, + "step": 1887 + }, + { + "epoch": 1.2165995165189363, + "grad_norm": 2.20140967955542, + "learning_rate": 9.985974006437794e-05, + "loss": 3.9097, + "step": 1888 + }, + { + "epoch": 1.2172441579371474, + "grad_norm": 3.519238648717434, + "learning_rate": 9.985958911570714e-05, + "loss": 3.9908, + "step": 1889 + }, + { + "epoch": 1.2178887993553587, + "grad_norm": 2.8999923622551127, + "learning_rate": 9.985943808596942e-05, + "loss": 3.9385, + "step": 1890 + }, + { + "epoch": 1.2185334407735697, + "grad_norm": 2.9471330574802113, + "learning_rate": 9.985928697516499e-05, + "loss": 4.3205, + "step": 1891 + }, + { + "epoch": 1.2191780821917808, + "grad_norm": 2.226854990725804, + "learning_rate": 9.985913578329413e-05, + "loss": 4.1293, + "step": 1892 + }, + { + "epoch": 1.2198227236099919, + "grad_norm": 2.759523191726537, + "learning_rate": 9.985898451035709e-05, + "loss": 4.1158, + "step": 1893 + }, + { + "epoch": 1.220467365028203, + "grad_norm": 2.5280312774570355, + "learning_rate": 9.98588331563541e-05, + "loss": 3.943, + "step": 1894 + }, + { + "epoch": 1.2211120064464143, + "grad_norm": 2.761842795864244, + "learning_rate": 9.985868172128541e-05, + "loss": 3.7472, + "step": 1895 + }, + { + "epoch": 1.2217566478646253, + "grad_norm": 2.908388157096695, + "learning_rate": 9.985853020515127e-05, + "loss": 3.8778, + "step": 1896 + }, + { + "epoch": 1.2224012892828364, + "grad_norm": 2.2698888978656973, + "learning_rate": 9.985837860795196e-05, + "loss": 4.1103, + "step": 1897 + }, + { + "epoch": 1.2230459307010475, + "grad_norm": 2.6574436939677764, + "learning_rate": 9.985822692968768e-05, + "loss": 3.742, + "step": 1898 + }, + { + "epoch": 1.2236905721192586, + "grad_norm": 2.2448543233798675, + "learning_rate": 9.985807517035871e-05, + "loss": 3.7552, + "step": 1899 + }, + { + "epoch": 1.2243352135374699, + "grad_norm": 2.3275738575748792, + "learning_rate": 9.985792332996529e-05, + "loss": 4.112, + "step": 1900 + }, + { + "epoch": 1.2243352135374699, + "eval_loss": 4.146688461303711, + "eval_runtime": 2.9707, + "eval_samples_per_second": 33.662, + "eval_steps_per_second": 4.376, + "step": 1900 + }, + { + "epoch": 1.224979854955681, + "grad_norm": 2.6490544598441734, + "learning_rate": 9.985777140850767e-05, + "loss": 3.9491, + "step": 1901 + }, + { + "epoch": 1.225624496373892, + "grad_norm": 2.530789543181635, + "learning_rate": 9.985761940598611e-05, + "loss": 3.6989, + "step": 1902 + }, + { + "epoch": 1.2262691377921031, + "grad_norm": 3.4379751980997324, + "learning_rate": 9.985746732240085e-05, + "loss": 4.0391, + "step": 1903 + }, + { + "epoch": 1.2269137792103142, + "grad_norm": 2.2070676845167805, + "learning_rate": 9.985731515775213e-05, + "loss": 4.1249, + "step": 1904 + }, + { + "epoch": 1.2275584206285255, + "grad_norm": 2.3092824061302655, + "learning_rate": 9.98571629120402e-05, + "loss": 4.0335, + "step": 1905 + }, + { + "epoch": 1.2282030620467366, + "grad_norm": 2.2973240122313485, + "learning_rate": 9.985701058526535e-05, + "loss": 4.3205, + "step": 1906 + }, + { + "epoch": 1.2288477034649476, + "grad_norm": 1.7455785535504513, + "learning_rate": 9.985685817742778e-05, + "loss": 4.0768, + "step": 1907 + }, + { + "epoch": 1.2294923448831587, + "grad_norm": 1.6193026184362642, + "learning_rate": 9.985670568852777e-05, + "loss": 4.0962, + "step": 1908 + }, + { + "epoch": 1.2301369863013698, + "grad_norm": 2.012500261160069, + "learning_rate": 9.985655311856554e-05, + "loss": 4.3611, + "step": 1909 + }, + { + "epoch": 1.230781627719581, + "grad_norm": 1.6591548814205679, + "learning_rate": 9.985640046754137e-05, + "loss": 3.7873, + "step": 1910 + }, + { + "epoch": 1.2314262691377922, + "grad_norm": 2.634720435683717, + "learning_rate": 9.985624773545552e-05, + "loss": 3.6252, + "step": 1911 + }, + { + "epoch": 1.2320709105560033, + "grad_norm": 1.6684732143478502, + "learning_rate": 9.985609492230821e-05, + "loss": 4.1499, + "step": 1912 + }, + { + "epoch": 1.2327155519742143, + "grad_norm": 1.9691998783440425, + "learning_rate": 9.98559420280997e-05, + "loss": 4.301, + "step": 1913 + }, + { + "epoch": 1.2333601933924254, + "grad_norm": 1.8395030498665421, + "learning_rate": 9.985578905283025e-05, + "loss": 3.9287, + "step": 1914 + }, + { + "epoch": 1.2340048348106365, + "grad_norm": 1.9141132004239265, + "learning_rate": 9.98556359965001e-05, + "loss": 3.9044, + "step": 1915 + }, + { + "epoch": 1.2346494762288478, + "grad_norm": 1.8419752057820133, + "learning_rate": 9.985548285910952e-05, + "loss": 3.8697, + "step": 1916 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 2.1681516350351093, + "learning_rate": 9.985532964065873e-05, + "loss": 4.0776, + "step": 1917 + }, + { + "epoch": 1.23593875906527, + "grad_norm": 1.554843531415167, + "learning_rate": 9.985517634114802e-05, + "loss": 3.9256, + "step": 1918 + }, + { + "epoch": 1.236583400483481, + "grad_norm": 2.0855807495650374, + "learning_rate": 9.985502296057761e-05, + "loss": 4.2105, + "step": 1919 + }, + { + "epoch": 1.237228041901692, + "grad_norm": 1.513862967923368, + "learning_rate": 9.985486949894777e-05, + "loss": 4.0076, + "step": 1920 + }, + { + "epoch": 1.2378726833199032, + "grad_norm": 2.502535367726221, + "learning_rate": 9.985471595625877e-05, + "loss": 3.9514, + "step": 1921 + }, + { + "epoch": 1.2385173247381145, + "grad_norm": 2.1722162241526513, + "learning_rate": 9.985456233251081e-05, + "loss": 3.8428, + "step": 1922 + }, + { + "epoch": 1.2391619661563256, + "grad_norm": 1.4417520051662895, + "learning_rate": 9.985440862770418e-05, + "loss": 4.007, + "step": 1923 + }, + { + "epoch": 1.2398066075745366, + "grad_norm": 2.2968008150128463, + "learning_rate": 9.98542548418391e-05, + "loss": 3.7432, + "step": 1924 + }, + { + "epoch": 1.2404512489927477, + "grad_norm": 2.259001528023464, + "learning_rate": 9.985410097491587e-05, + "loss": 3.5786, + "step": 1925 + }, + { + "epoch": 1.2410958904109588, + "grad_norm": 2.048480789293144, + "learning_rate": 9.985394702693473e-05, + "loss": 3.9654, + "step": 1926 + }, + { + "epoch": 1.24174053182917, + "grad_norm": 2.2975768257591525, + "learning_rate": 9.985379299789589e-05, + "loss": 4.0615, + "step": 1927 + }, + { + "epoch": 1.2423851732473812, + "grad_norm": 2.1126994398036536, + "learning_rate": 9.985363888779965e-05, + "loss": 4.2437, + "step": 1928 + }, + { + "epoch": 1.2430298146655923, + "grad_norm": 2.0655043782175633, + "learning_rate": 9.985348469664624e-05, + "loss": 4.0206, + "step": 1929 + }, + { + "epoch": 1.2436744560838033, + "grad_norm": 1.7014231475012214, + "learning_rate": 9.985333042443593e-05, + "loss": 4.3014, + "step": 1930 + }, + { + "epoch": 1.2443190975020144, + "grad_norm": 2.052057524201329, + "learning_rate": 9.985317607116898e-05, + "loss": 4.1587, + "step": 1931 + }, + { + "epoch": 1.2449637389202257, + "grad_norm": 2.187265308056579, + "learning_rate": 9.985302163684559e-05, + "loss": 4.118, + "step": 1932 + }, + { + "epoch": 1.2456083803384368, + "grad_norm": 2.278905151228911, + "learning_rate": 9.985286712146608e-05, + "loss": 3.942, + "step": 1933 + }, + { + "epoch": 1.2462530217566479, + "grad_norm": 2.5553617838041967, + "learning_rate": 9.985271252503067e-05, + "loss": 3.962, + "step": 1934 + }, + { + "epoch": 1.246897663174859, + "grad_norm": 2.1882606754904423, + "learning_rate": 9.985255784753961e-05, + "loss": 3.9849, + "step": 1935 + }, + { + "epoch": 1.24754230459307, + "grad_norm": 1.7201581870459972, + "learning_rate": 9.985240308899317e-05, + "loss": 3.8905, + "step": 1936 + }, + { + "epoch": 1.2481869460112813, + "grad_norm": 1.8491826334146604, + "learning_rate": 9.985224824939159e-05, + "loss": 4.1301, + "step": 1937 + }, + { + "epoch": 1.2488315874294924, + "grad_norm": 1.5108413091606745, + "learning_rate": 9.985209332873514e-05, + "loss": 3.9685, + "step": 1938 + }, + { + "epoch": 1.2494762288477035, + "grad_norm": 1.8812434913441822, + "learning_rate": 9.985193832702406e-05, + "loss": 4.1439, + "step": 1939 + }, + { + "epoch": 1.2501208702659146, + "grad_norm": 1.479853170792131, + "learning_rate": 9.985178324425861e-05, + "loss": 3.5786, + "step": 1940 + }, + { + "epoch": 1.2507655116841256, + "grad_norm": 1.9606326912159509, + "learning_rate": 9.985162808043905e-05, + "loss": 3.914, + "step": 1941 + }, + { + "epoch": 1.251410153102337, + "grad_norm": 1.775860767545027, + "learning_rate": 9.985147283556563e-05, + "loss": 3.8876, + "step": 1942 + }, + { + "epoch": 1.252054794520548, + "grad_norm": 2.5201695540751574, + "learning_rate": 9.985131750963859e-05, + "loss": 3.5268, + "step": 1943 + }, + { + "epoch": 1.252699435938759, + "grad_norm": 2.311758486158248, + "learning_rate": 9.985116210265822e-05, + "loss": 3.8375, + "step": 1944 + }, + { + "epoch": 1.2533440773569702, + "grad_norm": 1.8405003378563496, + "learning_rate": 9.985100661462476e-05, + "loss": 4.303, + "step": 1945 + }, + { + "epoch": 1.2539887187751813, + "grad_norm": 1.7675926854399695, + "learning_rate": 9.985085104553845e-05, + "loss": 4.2124, + "step": 1946 + }, + { + "epoch": 1.2546333601933926, + "grad_norm": 1.6666974027208041, + "learning_rate": 9.985069539539954e-05, + "loss": 3.9817, + "step": 1947 + }, + { + "epoch": 1.2552780016116034, + "grad_norm": 1.7835527093020447, + "learning_rate": 9.985053966420833e-05, + "loss": 3.8697, + "step": 1948 + }, + { + "epoch": 1.2559226430298147, + "grad_norm": 2.312531757896441, + "learning_rate": 9.985038385196504e-05, + "loss": 4.2301, + "step": 1949 + }, + { + "epoch": 1.2565672844480258, + "grad_norm": 1.5240278261847862, + "learning_rate": 9.985022795866991e-05, + "loss": 3.9399, + "step": 1950 + }, + { + "epoch": 1.2572119258662369, + "grad_norm": 2.4911780933240233, + "learning_rate": 9.985007198432326e-05, + "loss": 4.0531, + "step": 1951 + }, + { + "epoch": 1.257856567284448, + "grad_norm": 2.0613557910577436, + "learning_rate": 9.984991592892527e-05, + "loss": 4.2339, + "step": 1952 + }, + { + "epoch": 1.258501208702659, + "grad_norm": 1.8147120303560689, + "learning_rate": 9.984975979247625e-05, + "loss": 4.1186, + "step": 1953 + }, + { + "epoch": 1.2591458501208703, + "grad_norm": 2.964435465034282, + "learning_rate": 9.984960357497644e-05, + "loss": 4.1043, + "step": 1954 + }, + { + "epoch": 1.2597904915390814, + "grad_norm": 2.393842934017144, + "learning_rate": 9.984944727642609e-05, + "loss": 4.236, + "step": 1955 + }, + { + "epoch": 1.2604351329572925, + "grad_norm": 1.8860636495144578, + "learning_rate": 9.984929089682547e-05, + "loss": 4.0808, + "step": 1956 + }, + { + "epoch": 1.2610797743755036, + "grad_norm": 2.072350839078069, + "learning_rate": 9.984913443617481e-05, + "loss": 3.9394, + "step": 1957 + }, + { + "epoch": 1.2617244157937146, + "grad_norm": 2.35439714206629, + "learning_rate": 9.984897789447439e-05, + "loss": 3.7513, + "step": 1958 + }, + { + "epoch": 1.262369057211926, + "grad_norm": 1.8553809701465476, + "learning_rate": 9.984882127172448e-05, + "loss": 4.0506, + "step": 1959 + }, + { + "epoch": 1.263013698630137, + "grad_norm": 2.0250861135470686, + "learning_rate": 9.98486645679253e-05, + "loss": 3.9265, + "step": 1960 + }, + { + "epoch": 1.263658340048348, + "grad_norm": 2.0007881138613706, + "learning_rate": 9.984850778307714e-05, + "loss": 4.1735, + "step": 1961 + }, + { + "epoch": 1.2643029814665592, + "grad_norm": 2.101144238165606, + "learning_rate": 9.984835091718024e-05, + "loss": 4.4083, + "step": 1962 + }, + { + "epoch": 1.2649476228847703, + "grad_norm": 1.6050404445133826, + "learning_rate": 9.984819397023486e-05, + "loss": 4.2484, + "step": 1963 + }, + { + "epoch": 1.2655922643029816, + "grad_norm": 1.628058126126927, + "learning_rate": 9.984803694224127e-05, + "loss": 4.1329, + "step": 1964 + }, + { + "epoch": 1.2662369057211926, + "grad_norm": 1.4886598415306376, + "learning_rate": 9.984787983319971e-05, + "loss": 3.8977, + "step": 1965 + }, + { + "epoch": 1.2668815471394037, + "grad_norm": 1.2410781779344384, + "learning_rate": 9.984772264311044e-05, + "loss": 4.0005, + "step": 1966 + }, + { + "epoch": 1.2675261885576148, + "grad_norm": 1.915169848189784, + "learning_rate": 9.984756537197373e-05, + "loss": 3.9365, + "step": 1967 + }, + { + "epoch": 1.2681708299758259, + "grad_norm": 2.0248110112531386, + "learning_rate": 9.984740801978985e-05, + "loss": 3.7101, + "step": 1968 + }, + { + "epoch": 1.2688154713940372, + "grad_norm": 1.438572921314123, + "learning_rate": 9.984725058655901e-05, + "loss": 3.9656, + "step": 1969 + }, + { + "epoch": 1.2694601128122482, + "grad_norm": 1.9162525803576458, + "learning_rate": 9.984709307228153e-05, + "loss": 3.9308, + "step": 1970 + }, + { + "epoch": 1.2701047542304593, + "grad_norm": 1.4179489753068883, + "learning_rate": 9.984693547695761e-05, + "loss": 3.8029, + "step": 1971 + }, + { + "epoch": 1.2707493956486704, + "grad_norm": 1.5886789318186032, + "learning_rate": 9.984677780058755e-05, + "loss": 3.9651, + "step": 1972 + }, + { + "epoch": 1.2713940370668815, + "grad_norm": 1.7849188881136149, + "learning_rate": 9.98466200431716e-05, + "loss": 3.8461, + "step": 1973 + }, + { + "epoch": 1.2720386784850928, + "grad_norm": 1.2702413155219725, + "learning_rate": 9.984646220471002e-05, + "loss": 3.9414, + "step": 1974 + }, + { + "epoch": 1.2726833199033039, + "grad_norm": 1.772907932918725, + "learning_rate": 9.984630428520305e-05, + "loss": 4.1419, + "step": 1975 + }, + { + "epoch": 1.273327961321515, + "grad_norm": 1.6343875822211082, + "learning_rate": 9.984614628465097e-05, + "loss": 4.0882, + "step": 1976 + }, + { + "epoch": 1.273972602739726, + "grad_norm": 1.2696322704569547, + "learning_rate": 9.984598820305402e-05, + "loss": 4.1981, + "step": 1977 + }, + { + "epoch": 1.274617244157937, + "grad_norm": 1.2259423464939572, + "learning_rate": 9.984583004041249e-05, + "loss": 3.9588, + "step": 1978 + }, + { + "epoch": 1.2752618855761484, + "grad_norm": 1.5367958254415375, + "learning_rate": 9.984567179672661e-05, + "loss": 4.0326, + "step": 1979 + }, + { + "epoch": 1.2759065269943595, + "grad_norm": 1.8250082250375497, + "learning_rate": 9.984551347199667e-05, + "loss": 4.0807, + "step": 1980 + }, + { + "epoch": 1.2765511684125705, + "grad_norm": 1.6293666742972488, + "learning_rate": 9.98453550662229e-05, + "loss": 3.7704, + "step": 1981 + }, + { + "epoch": 1.2771958098307816, + "grad_norm": 1.6832570049120543, + "learning_rate": 9.984519657940558e-05, + "loss": 3.7406, + "step": 1982 + }, + { + "epoch": 1.2778404512489927, + "grad_norm": 1.9898583007323325, + "learning_rate": 9.984503801154497e-05, + "loss": 3.7155, + "step": 1983 + }, + { + "epoch": 1.278485092667204, + "grad_norm": 1.9341337276779835, + "learning_rate": 9.98448793626413e-05, + "loss": 4.3344, + "step": 1984 + }, + { + "epoch": 1.2791297340854149, + "grad_norm": 2.231324565262406, + "learning_rate": 9.984472063269485e-05, + "loss": 3.8755, + "step": 1985 + }, + { + "epoch": 1.2797743755036262, + "grad_norm": 1.7392920614597838, + "learning_rate": 9.984456182170591e-05, + "loss": 3.8261, + "step": 1986 + }, + { + "epoch": 1.2804190169218372, + "grad_norm": 1.9237432041723592, + "learning_rate": 9.984440292967471e-05, + "loss": 4.2905, + "step": 1987 + }, + { + "epoch": 1.2810636583400483, + "grad_norm": 2.6597219654003896, + "learning_rate": 9.984424395660152e-05, + "loss": 3.9846, + "step": 1988 + }, + { + "epoch": 1.2817082997582594, + "grad_norm": 2.2337772277424164, + "learning_rate": 9.984408490248659e-05, + "loss": 3.9149, + "step": 1989 + }, + { + "epoch": 1.2823529411764705, + "grad_norm": 2.16928554864474, + "learning_rate": 9.984392576733019e-05, + "loss": 3.722, + "step": 1990 + }, + { + "epoch": 1.2829975825946818, + "grad_norm": 2.689156868936116, + "learning_rate": 9.984376655113257e-05, + "loss": 3.6246, + "step": 1991 + }, + { + "epoch": 1.2836422240128929, + "grad_norm": 2.851276904812941, + "learning_rate": 9.984360725389402e-05, + "loss": 3.9802, + "step": 1992 + }, + { + "epoch": 1.284286865431104, + "grad_norm": 1.7433336517647702, + "learning_rate": 9.984344787561477e-05, + "loss": 3.5967, + "step": 1993 + }, + { + "epoch": 1.284931506849315, + "grad_norm": 2.2649196437752126, + "learning_rate": 9.98432884162951e-05, + "loss": 4.0556, + "step": 1994 + }, + { + "epoch": 1.285576148267526, + "grad_norm": 2.0813105441948228, + "learning_rate": 9.984312887593526e-05, + "loss": 4.0828, + "step": 1995 + }, + { + "epoch": 1.2862207896857374, + "grad_norm": 1.9946139250515096, + "learning_rate": 9.984296925453553e-05, + "loss": 4.0121, + "step": 1996 + }, + { + "epoch": 1.2868654311039485, + "grad_norm": 2.2464272715222195, + "learning_rate": 9.984280955209615e-05, + "loss": 3.7861, + "step": 1997 + }, + { + "epoch": 1.2875100725221595, + "grad_norm": 2.1735506659696973, + "learning_rate": 9.984264976861741e-05, + "loss": 3.789, + "step": 1998 + }, + { + "epoch": 1.2881547139403706, + "grad_norm": 2.367119114672446, + "learning_rate": 9.984248990409953e-05, + "loss": 3.7142, + "step": 1999 + }, + { + "epoch": 1.2887993553585817, + "grad_norm": 2.3666676413192227, + "learning_rate": 9.984232995854283e-05, + "loss": 3.8719, + "step": 2000 + }, + { + "epoch": 1.2887993553585817, + "eval_loss": 4.128237247467041, + "eval_runtime": 2.9616, + "eval_samples_per_second": 33.765, + "eval_steps_per_second": 4.39, + "step": 2000 + }, + { + "epoch": 1.289443996776793, + "grad_norm": 2.718179628020237, + "learning_rate": 9.984216993194751e-05, + "loss": 4.2339, + "step": 2001 + }, + { + "epoch": 1.290088638195004, + "grad_norm": 3.0188496763200967, + "learning_rate": 9.98420098243139e-05, + "loss": 3.5405, + "step": 2002 + }, + { + "epoch": 1.2907332796132152, + "grad_norm": 2.067614472929055, + "learning_rate": 9.984184963564219e-05, + "loss": 4.1121, + "step": 2003 + }, + { + "epoch": 1.2913779210314262, + "grad_norm": 2.791737527502173, + "learning_rate": 9.98416893659327e-05, + "loss": 4.1271, + "step": 2004 + }, + { + "epoch": 1.2920225624496373, + "grad_norm": 2.5059965015037453, + "learning_rate": 9.984152901518567e-05, + "loss": 3.9689, + "step": 2005 + }, + { + "epoch": 1.2926672038678486, + "grad_norm": 1.8557114334400189, + "learning_rate": 9.984136858340136e-05, + "loss": 3.883, + "step": 2006 + }, + { + "epoch": 1.2933118452860597, + "grad_norm": 2.1613268432842716, + "learning_rate": 9.984120807058005e-05, + "loss": 3.7374, + "step": 2007 + }, + { + "epoch": 1.2939564867042708, + "grad_norm": 1.577312993907106, + "learning_rate": 9.984104747672198e-05, + "loss": 3.6512, + "step": 2008 + }, + { + "epoch": 1.2946011281224818, + "grad_norm": 2.319686330734809, + "learning_rate": 9.984088680182745e-05, + "loss": 3.8357, + "step": 2009 + }, + { + "epoch": 1.295245769540693, + "grad_norm": 1.4917424469807388, + "learning_rate": 9.984072604589667e-05, + "loss": 3.8565, + "step": 2010 + }, + { + "epoch": 1.2958904109589042, + "grad_norm": 2.1705959175867005, + "learning_rate": 9.984056520892996e-05, + "loss": 3.8896, + "step": 2011 + }, + { + "epoch": 1.2965350523771153, + "grad_norm": 1.9485745536519974, + "learning_rate": 9.984040429092755e-05, + "loss": 4.1875, + "step": 2012 + }, + { + "epoch": 1.2971796937953264, + "grad_norm": 1.4599449848312016, + "learning_rate": 9.984024329188972e-05, + "loss": 4.2601, + "step": 2013 + }, + { + "epoch": 1.2978243352135375, + "grad_norm": 1.8044392226407004, + "learning_rate": 9.984008221181672e-05, + "loss": 4.2381, + "step": 2014 + }, + { + "epoch": 1.2984689766317485, + "grad_norm": 1.4321623429697508, + "learning_rate": 9.983992105070882e-05, + "loss": 4.203, + "step": 2015 + }, + { + "epoch": 1.2991136180499598, + "grad_norm": 1.892345421217022, + "learning_rate": 9.98397598085663e-05, + "loss": 4.1289, + "step": 2016 + }, + { + "epoch": 1.2997582594681707, + "grad_norm": 1.2978513364248956, + "learning_rate": 9.983959848538941e-05, + "loss": 4.1526, + "step": 2017 + }, + { + "epoch": 1.300402900886382, + "grad_norm": 2.5441441997787893, + "learning_rate": 9.983943708117841e-05, + "loss": 3.5984, + "step": 2018 + }, + { + "epoch": 1.301047542304593, + "grad_norm": 2.74128928030036, + "learning_rate": 9.983927559593358e-05, + "loss": 4.2854, + "step": 2019 + }, + { + "epoch": 1.3016921837228042, + "grad_norm": 2.1606193211235487, + "learning_rate": 9.983911402965517e-05, + "loss": 3.9025, + "step": 2020 + }, + { + "epoch": 1.3023368251410152, + "grad_norm": 1.6868657528107762, + "learning_rate": 9.983895238234346e-05, + "loss": 3.9239, + "step": 2021 + }, + { + "epoch": 1.3029814665592263, + "grad_norm": 2.2462911723699426, + "learning_rate": 9.98387906539987e-05, + "loss": 3.6667, + "step": 2022 + }, + { + "epoch": 1.3036261079774376, + "grad_norm": 2.1170201321498268, + "learning_rate": 9.983862884462118e-05, + "loss": 4.0947, + "step": 2023 + }, + { + "epoch": 1.3042707493956487, + "grad_norm": 2.758451975429622, + "learning_rate": 9.983846695421111e-05, + "loss": 4.0036, + "step": 2024 + }, + { + "epoch": 1.3049153908138598, + "grad_norm": 3.5407893276206877, + "learning_rate": 9.983830498276883e-05, + "loss": 3.9263, + "step": 2025 + }, + { + "epoch": 1.3055600322320708, + "grad_norm": 1.5916458088845855, + "learning_rate": 9.983814293029458e-05, + "loss": 4.0865, + "step": 2026 + }, + { + "epoch": 1.306204673650282, + "grad_norm": 2.8021459542984846, + "learning_rate": 9.983798079678859e-05, + "loss": 4.1954, + "step": 2027 + }, + { + "epoch": 1.3068493150684932, + "grad_norm": 1.8732066946536927, + "learning_rate": 9.983781858225118e-05, + "loss": 4.2852, + "step": 2028 + }, + { + "epoch": 1.3074939564867043, + "grad_norm": 2.272730020914282, + "learning_rate": 9.983765628668256e-05, + "loss": 3.949, + "step": 2029 + }, + { + "epoch": 1.3081385979049154, + "grad_norm": 2.3151538482324643, + "learning_rate": 9.983749391008305e-05, + "loss": 3.8695, + "step": 2030 + }, + { + "epoch": 1.3087832393231265, + "grad_norm": 1.8072320644056568, + "learning_rate": 9.98373314524529e-05, + "loss": 4.2409, + "step": 2031 + }, + { + "epoch": 1.3094278807413375, + "grad_norm": 1.8464338743908653, + "learning_rate": 9.983716891379235e-05, + "loss": 4.3346, + "step": 2032 + }, + { + "epoch": 1.3100725221595488, + "grad_norm": 2.454394514976431, + "learning_rate": 9.983700629410169e-05, + "loss": 4.1157, + "step": 2033 + }, + { + "epoch": 1.31071716357776, + "grad_norm": 2.4024045933309623, + "learning_rate": 9.98368435933812e-05, + "loss": 4.0864, + "step": 2034 + }, + { + "epoch": 1.311361804995971, + "grad_norm": 2.863939310384546, + "learning_rate": 9.983668081163112e-05, + "loss": 4.4104, + "step": 2035 + }, + { + "epoch": 1.312006446414182, + "grad_norm": 2.3333363322622716, + "learning_rate": 9.983651794885172e-05, + "loss": 3.9418, + "step": 2036 + }, + { + "epoch": 1.3126510878323931, + "grad_norm": 2.6128475777728237, + "learning_rate": 9.983635500504329e-05, + "loss": 3.8273, + "step": 2037 + }, + { + "epoch": 1.3132957292506044, + "grad_norm": 3.777435803702139, + "learning_rate": 9.983619198020609e-05, + "loss": 3.8507, + "step": 2038 + }, + { + "epoch": 1.3139403706688155, + "grad_norm": 2.4965576356563193, + "learning_rate": 9.983602887434038e-05, + "loss": 4.1422, + "step": 2039 + }, + { + "epoch": 1.3145850120870266, + "grad_norm": 2.4937908253122414, + "learning_rate": 9.983586568744642e-05, + "loss": 4.2763, + "step": 2040 + }, + { + "epoch": 1.3152296535052377, + "grad_norm": 2.800433843055833, + "learning_rate": 9.983570241952448e-05, + "loss": 3.5925, + "step": 2041 + }, + { + "epoch": 1.3158742949234488, + "grad_norm": 2.207749610903992, + "learning_rate": 9.983553907057485e-05, + "loss": 3.876, + "step": 2042 + }, + { + "epoch": 1.31651893634166, + "grad_norm": 2.2225071050748144, + "learning_rate": 9.983537564059779e-05, + "loss": 4.2307, + "step": 2043 + }, + { + "epoch": 1.3171635777598711, + "grad_norm": 2.5949865898027977, + "learning_rate": 9.983521212959355e-05, + "loss": 4.0585, + "step": 2044 + }, + { + "epoch": 1.3178082191780822, + "grad_norm": 1.688257386190153, + "learning_rate": 9.983504853756242e-05, + "loss": 3.9996, + "step": 2045 + }, + { + "epoch": 1.3184528605962933, + "grad_norm": 3.072526269852436, + "learning_rate": 9.983488486450465e-05, + "loss": 3.872, + "step": 2046 + }, + { + "epoch": 1.3190975020145044, + "grad_norm": 2.4064012246323387, + "learning_rate": 9.983472111042051e-05, + "loss": 4.2169, + "step": 2047 + }, + { + "epoch": 1.3197421434327157, + "grad_norm": 2.1543867215978323, + "learning_rate": 9.983455727531031e-05, + "loss": 3.9554, + "step": 2048 + }, + { + "epoch": 1.3203867848509268, + "grad_norm": 1.885667361864381, + "learning_rate": 9.983439335917426e-05, + "loss": 3.8991, + "step": 2049 + }, + { + "epoch": 1.3210314262691378, + "grad_norm": 1.7541246217028583, + "learning_rate": 9.983422936201267e-05, + "loss": 4.0731, + "step": 2050 + }, + { + "epoch": 1.321676067687349, + "grad_norm": 1.8010639792765257, + "learning_rate": 9.983406528382578e-05, + "loss": 3.9983, + "step": 2051 + }, + { + "epoch": 1.32232070910556, + "grad_norm": 1.9032326643215474, + "learning_rate": 9.98339011246139e-05, + "loss": 3.9555, + "step": 2052 + }, + { + "epoch": 1.3229653505237713, + "grad_norm": 2.374898855025377, + "learning_rate": 9.983373688437725e-05, + "loss": 4.2143, + "step": 2053 + }, + { + "epoch": 1.3236099919419821, + "grad_norm": 1.3853309438910781, + "learning_rate": 9.983357256311615e-05, + "loss": 4.0644, + "step": 2054 + }, + { + "epoch": 1.3242546333601934, + "grad_norm": 2.179431049397923, + "learning_rate": 9.983340816083081e-05, + "loss": 4.1853, + "step": 2055 + }, + { + "epoch": 1.3248992747784045, + "grad_norm": 1.5806449211781373, + "learning_rate": 9.983324367752155e-05, + "loss": 4.3078, + "step": 2056 + }, + { + "epoch": 1.3255439161966156, + "grad_norm": 1.8252815454361213, + "learning_rate": 9.983307911318863e-05, + "loss": 3.9143, + "step": 2057 + }, + { + "epoch": 1.3261885576148267, + "grad_norm": 1.6257146998466954, + "learning_rate": 9.983291446783231e-05, + "loss": 4.213, + "step": 2058 + }, + { + "epoch": 1.3268331990330378, + "grad_norm": 1.6189770641678551, + "learning_rate": 9.983274974145287e-05, + "loss": 3.9036, + "step": 2059 + }, + { + "epoch": 1.327477840451249, + "grad_norm": 1.5530319516557975, + "learning_rate": 9.983258493405055e-05, + "loss": 3.9275, + "step": 2060 + }, + { + "epoch": 1.3281224818694601, + "grad_norm": 1.6150608356556426, + "learning_rate": 9.983242004562567e-05, + "loss": 4.0922, + "step": 2061 + }, + { + "epoch": 1.3287671232876712, + "grad_norm": 1.4180558972764319, + "learning_rate": 9.983225507617847e-05, + "loss": 4.164, + "step": 2062 + }, + { + "epoch": 1.3294117647058823, + "grad_norm": 1.9178102474915735, + "learning_rate": 9.983209002570923e-05, + "loss": 3.7801, + "step": 2063 + }, + { + "epoch": 1.3300564061240934, + "grad_norm": 1.31914688912374, + "learning_rate": 9.983192489421823e-05, + "loss": 4.0543, + "step": 2064 + }, + { + "epoch": 1.3307010475423047, + "grad_norm": 1.8392997190059504, + "learning_rate": 9.983175968170571e-05, + "loss": 3.9102, + "step": 2065 + }, + { + "epoch": 1.3313456889605157, + "grad_norm": 1.2891383304156383, + "learning_rate": 9.983159438817198e-05, + "loss": 4.2966, + "step": 2066 + }, + { + "epoch": 1.3319903303787268, + "grad_norm": 1.9609380194113153, + "learning_rate": 9.983142901361729e-05, + "loss": 3.9041, + "step": 2067 + }, + { + "epoch": 1.332634971796938, + "grad_norm": 1.9389222603867586, + "learning_rate": 9.98312635580419e-05, + "loss": 4.1892, + "step": 2068 + }, + { + "epoch": 1.333279613215149, + "grad_norm": 2.0557647940138355, + "learning_rate": 9.98310980214461e-05, + "loss": 4.0378, + "step": 2069 + }, + { + "epoch": 1.3339242546333603, + "grad_norm": 1.6257043318154578, + "learning_rate": 9.983093240383015e-05, + "loss": 3.9401, + "step": 2070 + }, + { + "epoch": 1.3345688960515714, + "grad_norm": 1.7345196385466548, + "learning_rate": 9.983076670519435e-05, + "loss": 4.0149, + "step": 2071 + }, + { + "epoch": 1.3352135374697824, + "grad_norm": 1.7659159905101791, + "learning_rate": 9.983060092553896e-05, + "loss": 3.9082, + "step": 2072 + }, + { + "epoch": 1.3358581788879935, + "grad_norm": 2.02165942046072, + "learning_rate": 9.983043506486422e-05, + "loss": 4.1177, + "step": 2073 + }, + { + "epoch": 1.3365028203062046, + "grad_norm": 2.0094903042418815, + "learning_rate": 9.983026912317043e-05, + "loss": 4.3177, + "step": 2074 + }, + { + "epoch": 1.337147461724416, + "grad_norm": 1.4770733857182181, + "learning_rate": 9.983010310045786e-05, + "loss": 4.3957, + "step": 2075 + }, + { + "epoch": 1.337792103142627, + "grad_norm": 1.3194085386927066, + "learning_rate": 9.982993699672679e-05, + "loss": 3.9907, + "step": 2076 + }, + { + "epoch": 1.338436744560838, + "grad_norm": 1.7154187893795683, + "learning_rate": 9.982977081197749e-05, + "loss": 4.0689, + "step": 2077 + }, + { + "epoch": 1.3390813859790491, + "grad_norm": 1.6819969827399992, + "learning_rate": 9.982960454621022e-05, + "loss": 4.3641, + "step": 2078 + }, + { + "epoch": 1.3397260273972602, + "grad_norm": 1.4381468476471508, + "learning_rate": 9.982943819942527e-05, + "loss": 4.1854, + "step": 2079 + }, + { + "epoch": 1.3403706688154715, + "grad_norm": 1.2099432312722054, + "learning_rate": 9.982927177162289e-05, + "loss": 3.6513, + "step": 2080 + }, + { + "epoch": 1.3410153102336826, + "grad_norm": 1.304872491889683, + "learning_rate": 9.982910526280338e-05, + "loss": 3.9664, + "step": 2081 + }, + { + "epoch": 1.3416599516518937, + "grad_norm": 1.4458514995347242, + "learning_rate": 9.982893867296697e-05, + "loss": 4.019, + "step": 2082 + }, + { + "epoch": 1.3423045930701047, + "grad_norm": 1.5515668213588372, + "learning_rate": 9.9828772002114e-05, + "loss": 4.115, + "step": 2083 + }, + { + "epoch": 1.3429492344883158, + "grad_norm": 1.6651388629447776, + "learning_rate": 9.98286052502447e-05, + "loss": 3.9648, + "step": 2084 + }, + { + "epoch": 1.3435938759065271, + "grad_norm": 1.8769921870175534, + "learning_rate": 9.982843841735935e-05, + "loss": 4.0878, + "step": 2085 + }, + { + "epoch": 1.344238517324738, + "grad_norm": 1.4198752625055426, + "learning_rate": 9.982827150345823e-05, + "loss": 4.1894, + "step": 2086 + }, + { + "epoch": 1.3448831587429493, + "grad_norm": 1.5817531501258684, + "learning_rate": 9.98281045085416e-05, + "loss": 3.7049, + "step": 2087 + }, + { + "epoch": 1.3455278001611604, + "grad_norm": 2.1962423507680495, + "learning_rate": 9.982793743260975e-05, + "loss": 3.9634, + "step": 2088 + }, + { + "epoch": 1.3461724415793714, + "grad_norm": 2.614631902275045, + "learning_rate": 9.982777027566295e-05, + "loss": 3.9808, + "step": 2089 + }, + { + "epoch": 1.3468170829975825, + "grad_norm": 1.8113062594320117, + "learning_rate": 9.982760303770147e-05, + "loss": 4.1972, + "step": 2090 + }, + { + "epoch": 1.3474617244157936, + "grad_norm": 1.4823558832230452, + "learning_rate": 9.982743571872558e-05, + "loss": 4.0516, + "step": 2091 + }, + { + "epoch": 1.348106365834005, + "grad_norm": 2.0644086973730693, + "learning_rate": 9.982726831873558e-05, + "loss": 3.8256, + "step": 2092 + }, + { + "epoch": 1.348751007252216, + "grad_norm": 2.934664309446722, + "learning_rate": 9.982710083773172e-05, + "loss": 3.5998, + "step": 2093 + }, + { + "epoch": 1.349395648670427, + "grad_norm": 2.971321023828635, + "learning_rate": 9.98269332757143e-05, + "loss": 4.0546, + "step": 2094 + }, + { + "epoch": 1.3500402900886381, + "grad_norm": 1.5707764737301042, + "learning_rate": 9.982676563268354e-05, + "loss": 3.8227, + "step": 2095 + }, + { + "epoch": 1.3506849315068492, + "grad_norm": 2.5216216626267207, + "learning_rate": 9.982659790863978e-05, + "loss": 4.1097, + "step": 2096 + }, + { + "epoch": 1.3513295729250605, + "grad_norm": 3.6053860528063897, + "learning_rate": 9.982643010358329e-05, + "loss": 3.8196, + "step": 2097 + }, + { + "epoch": 1.3519742143432716, + "grad_norm": 1.9059156333879215, + "learning_rate": 9.982626221751429e-05, + "loss": 3.9813, + "step": 2098 + }, + { + "epoch": 1.3526188557614827, + "grad_norm": 1.7739662479032234, + "learning_rate": 9.982609425043309e-05, + "loss": 4.0668, + "step": 2099 + }, + { + "epoch": 1.3532634971796937, + "grad_norm": 2.0561130463038557, + "learning_rate": 9.982592620233998e-05, + "loss": 4.1058, + "step": 2100 + }, + { + "epoch": 1.3532634971796937, + "eval_loss": 4.120633125305176, + "eval_runtime": 2.9675, + "eval_samples_per_second": 33.699, + "eval_steps_per_second": 4.381, + "step": 2100 + }, + { + "epoch": 1.3539081385979048, + "grad_norm": 1.2722986805763004, + "learning_rate": 9.982575807323524e-05, + "loss": 4.1368, + "step": 2101 + }, + { + "epoch": 1.3545527800161161, + "grad_norm": 1.957853830487355, + "learning_rate": 9.98255898631191e-05, + "loss": 3.9799, + "step": 2102 + }, + { + "epoch": 1.3551974214343272, + "grad_norm": 1.8331068456870834, + "learning_rate": 9.982542157199186e-05, + "loss": 4.3044, + "step": 2103 + }, + { + "epoch": 1.3558420628525383, + "grad_norm": 1.9551700279246405, + "learning_rate": 9.982525319985382e-05, + "loss": 3.8447, + "step": 2104 + }, + { + "epoch": 1.3564867042707494, + "grad_norm": 2.6570665644400786, + "learning_rate": 9.982508474670522e-05, + "loss": 4.1672, + "step": 2105 + }, + { + "epoch": 1.3571313456889604, + "grad_norm": 3.0109945959847018, + "learning_rate": 9.982491621254637e-05, + "loss": 3.8588, + "step": 2106 + }, + { + "epoch": 1.3577759871071717, + "grad_norm": 2.1436733117030493, + "learning_rate": 9.982474759737752e-05, + "loss": 3.8773, + "step": 2107 + }, + { + "epoch": 1.3584206285253828, + "grad_norm": 2.298826701166528, + "learning_rate": 9.982457890119896e-05, + "loss": 3.5297, + "step": 2108 + }, + { + "epoch": 1.359065269943594, + "grad_norm": 2.7940695158083644, + "learning_rate": 9.982441012401098e-05, + "loss": 4.055, + "step": 2109 + }, + { + "epoch": 1.359709911361805, + "grad_norm": 2.458127538547821, + "learning_rate": 9.982424126581382e-05, + "loss": 3.7797, + "step": 2110 + }, + { + "epoch": 1.360354552780016, + "grad_norm": 1.716531953922474, + "learning_rate": 9.98240723266078e-05, + "loss": 4.5109, + "step": 2111 + }, + { + "epoch": 1.3609991941982273, + "grad_norm": 1.7046508067341326, + "learning_rate": 9.982390330639317e-05, + "loss": 4.0946, + "step": 2112 + }, + { + "epoch": 1.3616438356164384, + "grad_norm": 2.004260875537598, + "learning_rate": 9.982373420517021e-05, + "loss": 4.03, + "step": 2113 + }, + { + "epoch": 1.3622884770346495, + "grad_norm": 1.7027638645855834, + "learning_rate": 9.98235650229392e-05, + "loss": 3.7987, + "step": 2114 + }, + { + "epoch": 1.3629331184528606, + "grad_norm": 2.1219383202991406, + "learning_rate": 9.982339575970041e-05, + "loss": 3.8874, + "step": 2115 + }, + { + "epoch": 1.3635777598710717, + "grad_norm": 1.5154907093983794, + "learning_rate": 9.982322641545416e-05, + "loss": 4.4091, + "step": 2116 + }, + { + "epoch": 1.364222401289283, + "grad_norm": 2.001277510831595, + "learning_rate": 9.982305699020067e-05, + "loss": 3.8955, + "step": 2117 + }, + { + "epoch": 1.364867042707494, + "grad_norm": 2.1411866030965614, + "learning_rate": 9.982288748394025e-05, + "loss": 4.3536, + "step": 2118 + }, + { + "epoch": 1.3655116841257051, + "grad_norm": 2.4177165789861546, + "learning_rate": 9.982271789667317e-05, + "loss": 3.9717, + "step": 2119 + }, + { + "epoch": 1.3661563255439162, + "grad_norm": 2.0112017208106647, + "learning_rate": 9.982254822839971e-05, + "loss": 4.0283, + "step": 2120 + }, + { + "epoch": 1.3668009669621273, + "grad_norm": 1.6289949303704176, + "learning_rate": 9.982237847912018e-05, + "loss": 3.9145, + "step": 2121 + }, + { + "epoch": 1.3674456083803386, + "grad_norm": 1.748459692484323, + "learning_rate": 9.982220864883479e-05, + "loss": 4.1137, + "step": 2122 + }, + { + "epoch": 1.3680902497985494, + "grad_norm": 1.7384931963840256, + "learning_rate": 9.982203873754387e-05, + "loss": 4.3582, + "step": 2123 + }, + { + "epoch": 1.3687348912167607, + "grad_norm": 1.8863129217637427, + "learning_rate": 9.982186874524769e-05, + "loss": 3.8605, + "step": 2124 + }, + { + "epoch": 1.3693795326349718, + "grad_norm": 1.872628775201897, + "learning_rate": 9.982169867194654e-05, + "loss": 3.86, + "step": 2125 + }, + { + "epoch": 1.3700241740531829, + "grad_norm": 2.1104747666413743, + "learning_rate": 9.982152851764065e-05, + "loss": 3.8935, + "step": 2126 + }, + { + "epoch": 1.370668815471394, + "grad_norm": 2.5777146507601474, + "learning_rate": 9.982135828233036e-05, + "loss": 3.8022, + "step": 2127 + }, + { + "epoch": 1.371313456889605, + "grad_norm": 2.94891706423159, + "learning_rate": 9.982118796601594e-05, + "loss": 3.7606, + "step": 2128 + }, + { + "epoch": 1.3719580983078163, + "grad_norm": 1.5575511940568378, + "learning_rate": 9.982101756869763e-05, + "loss": 3.9988, + "step": 2129 + }, + { + "epoch": 1.3726027397260274, + "grad_norm": 2.1788044587423596, + "learning_rate": 9.982084709037573e-05, + "loss": 4.1021, + "step": 2130 + }, + { + "epoch": 1.3732473811442385, + "grad_norm": 1.7903801642426902, + "learning_rate": 9.982067653105054e-05, + "loss": 4.2001, + "step": 2131 + }, + { + "epoch": 1.3738920225624496, + "grad_norm": 1.96588639921719, + "learning_rate": 9.982050589072231e-05, + "loss": 4.0774, + "step": 2132 + }, + { + "epoch": 1.3745366639806607, + "grad_norm": 1.8785090965514717, + "learning_rate": 9.982033516939135e-05, + "loss": 4.2797, + "step": 2133 + }, + { + "epoch": 1.375181305398872, + "grad_norm": 2.789282015564159, + "learning_rate": 9.982016436705792e-05, + "loss": 3.7437, + "step": 2134 + }, + { + "epoch": 1.375825946817083, + "grad_norm": 3.128416066350626, + "learning_rate": 9.981999348372229e-05, + "loss": 3.5712, + "step": 2135 + }, + { + "epoch": 1.3764705882352941, + "grad_norm": 1.6610474284647199, + "learning_rate": 9.981982251938477e-05, + "loss": 3.7589, + "step": 2136 + }, + { + "epoch": 1.3771152296535052, + "grad_norm": 2.8140798423132876, + "learning_rate": 9.981965147404562e-05, + "loss": 3.6826, + "step": 2137 + }, + { + "epoch": 1.3777598710717163, + "grad_norm": 3.6725807282230916, + "learning_rate": 9.981948034770513e-05, + "loss": 4.0667, + "step": 2138 + }, + { + "epoch": 1.3784045124899276, + "grad_norm": 2.8344745264334312, + "learning_rate": 9.981930914036358e-05, + "loss": 3.8808, + "step": 2139 + }, + { + "epoch": 1.3790491539081386, + "grad_norm": 2.5487984749445216, + "learning_rate": 9.981913785202126e-05, + "loss": 3.7162, + "step": 2140 + }, + { + "epoch": 1.3796937953263497, + "grad_norm": 1.7227724274593248, + "learning_rate": 9.981896648267843e-05, + "loss": 3.9497, + "step": 2141 + }, + { + "epoch": 1.3803384367445608, + "grad_norm": 2.3847025298053475, + "learning_rate": 9.981879503233537e-05, + "loss": 4.1149, + "step": 2142 + }, + { + "epoch": 1.3809830781627719, + "grad_norm": 2.0094381913579817, + "learning_rate": 9.98186235009924e-05, + "loss": 3.9913, + "step": 2143 + }, + { + "epoch": 1.3816277195809832, + "grad_norm": 2.0875416939168687, + "learning_rate": 9.981845188864976e-05, + "loss": 4.1504, + "step": 2144 + }, + { + "epoch": 1.3822723609991943, + "grad_norm": 2.2111431985340673, + "learning_rate": 9.981828019530775e-05, + "loss": 3.6672, + "step": 2145 + }, + { + "epoch": 1.3829170024174053, + "grad_norm": 2.1431960874030174, + "learning_rate": 9.981810842096666e-05, + "loss": 3.7203, + "step": 2146 + }, + { + "epoch": 1.3835616438356164, + "grad_norm": 1.4221087903666039, + "learning_rate": 9.981793656562675e-05, + "loss": 4.2492, + "step": 2147 + }, + { + "epoch": 1.3842062852538275, + "grad_norm": 2.3331318589445114, + "learning_rate": 9.981776462928831e-05, + "loss": 4.0644, + "step": 2148 + }, + { + "epoch": 1.3848509266720388, + "grad_norm": 2.078195688183096, + "learning_rate": 9.981759261195164e-05, + "loss": 3.9286, + "step": 2149 + }, + { + "epoch": 1.3854955680902499, + "grad_norm": 1.8702806906887854, + "learning_rate": 9.9817420513617e-05, + "loss": 4.1084, + "step": 2150 + }, + { + "epoch": 1.386140209508461, + "grad_norm": 1.5481289065993198, + "learning_rate": 9.981724833428469e-05, + "loss": 4.1982, + "step": 2151 + }, + { + "epoch": 1.386784850926672, + "grad_norm": 1.7220563218667357, + "learning_rate": 9.981707607395498e-05, + "loss": 4.254, + "step": 2152 + }, + { + "epoch": 1.387429492344883, + "grad_norm": 2.049231304312407, + "learning_rate": 9.981690373262817e-05, + "loss": 3.9304, + "step": 2153 + }, + { + "epoch": 1.3880741337630944, + "grad_norm": 1.7097524652126979, + "learning_rate": 9.98167313103045e-05, + "loss": 4.1546, + "step": 2154 + }, + { + "epoch": 1.3887187751813053, + "grad_norm": 1.7144148466649693, + "learning_rate": 9.981655880698431e-05, + "loss": 4.4076, + "step": 2155 + }, + { + "epoch": 1.3893634165995166, + "grad_norm": 1.3203514415482824, + "learning_rate": 9.981638622266784e-05, + "loss": 3.9579, + "step": 2156 + }, + { + "epoch": 1.3900080580177276, + "grad_norm": 1.2947409100725409, + "learning_rate": 9.981621355735541e-05, + "loss": 3.8797, + "step": 2157 + }, + { + "epoch": 1.3906526994359387, + "grad_norm": 1.6022969482064633, + "learning_rate": 9.981604081104727e-05, + "loss": 4.1265, + "step": 2158 + }, + { + "epoch": 1.3912973408541498, + "grad_norm": 1.5886111675090446, + "learning_rate": 9.981586798374371e-05, + "loss": 4.0634, + "step": 2159 + }, + { + "epoch": 1.3919419822723609, + "grad_norm": 1.8324165850930318, + "learning_rate": 9.981569507544504e-05, + "loss": 4.1051, + "step": 2160 + }, + { + "epoch": 1.3925866236905722, + "grad_norm": 1.705580948664618, + "learning_rate": 9.981552208615151e-05, + "loss": 4.013, + "step": 2161 + }, + { + "epoch": 1.3932312651087833, + "grad_norm": 1.787150172194081, + "learning_rate": 9.981534901586343e-05, + "loss": 4.2094, + "step": 2162 + }, + { + "epoch": 1.3938759065269943, + "grad_norm": 2.4602874201635796, + "learning_rate": 9.981517586458107e-05, + "loss": 3.8997, + "step": 2163 + }, + { + "epoch": 1.3945205479452054, + "grad_norm": 2.9287162969637315, + "learning_rate": 9.981500263230471e-05, + "loss": 3.8478, + "step": 2164 + }, + { + "epoch": 1.3951651893634165, + "grad_norm": 2.157666632614995, + "learning_rate": 9.981482931903466e-05, + "loss": 4.1212, + "step": 2165 + }, + { + "epoch": 1.3958098307816278, + "grad_norm": 1.6088139845435605, + "learning_rate": 9.981465592477118e-05, + "loss": 4.0338, + "step": 2166 + }, + { + "epoch": 1.3964544721998389, + "grad_norm": 1.9674356462266793, + "learning_rate": 9.981448244951457e-05, + "loss": 4.157, + "step": 2167 + }, + { + "epoch": 1.39709911361805, + "grad_norm": 1.5191436729639574, + "learning_rate": 9.98143088932651e-05, + "loss": 3.9599, + "step": 2168 + }, + { + "epoch": 1.397743755036261, + "grad_norm": 2.450742247314485, + "learning_rate": 9.981413525602306e-05, + "loss": 4.1098, + "step": 2169 + }, + { + "epoch": 1.398388396454472, + "grad_norm": 3.4999114340603783, + "learning_rate": 9.981396153778874e-05, + "loss": 3.6824, + "step": 2170 + }, + { + "epoch": 1.3990330378726834, + "grad_norm": 2.4702605392137467, + "learning_rate": 9.981378773856241e-05, + "loss": 4.0644, + "step": 2171 + }, + { + "epoch": 1.3996776792908945, + "grad_norm": 1.9436450191262633, + "learning_rate": 9.981361385834438e-05, + "loss": 3.9649, + "step": 2172 + }, + { + "epoch": 1.4003223207091056, + "grad_norm": 2.048054449460824, + "learning_rate": 9.981343989713493e-05, + "loss": 4.1094, + "step": 2173 + }, + { + "epoch": 1.4009669621273166, + "grad_norm": 2.8429449104902904, + "learning_rate": 9.981326585493431e-05, + "loss": 3.688, + "step": 2174 + }, + { + "epoch": 1.4016116035455277, + "grad_norm": 2.539126897021565, + "learning_rate": 9.981309173174286e-05, + "loss": 3.9589, + "step": 2175 + }, + { + "epoch": 1.402256244963739, + "grad_norm": 1.6075758137956777, + "learning_rate": 9.981291752756085e-05, + "loss": 3.9555, + "step": 2176 + }, + { + "epoch": 1.40290088638195, + "grad_norm": 2.0684103109728196, + "learning_rate": 9.981274324238855e-05, + "loss": 3.9728, + "step": 2177 + }, + { + "epoch": 1.4035455278001612, + "grad_norm": 2.0370736243267893, + "learning_rate": 9.981256887622624e-05, + "loss": 3.8748, + "step": 2178 + }, + { + "epoch": 1.4041901692183723, + "grad_norm": 1.9322122700870628, + "learning_rate": 9.981239442907423e-05, + "loss": 3.8787, + "step": 2179 + }, + { + "epoch": 1.4048348106365833, + "grad_norm": 2.154719487275065, + "learning_rate": 9.981221990093279e-05, + "loss": 3.8535, + "step": 2180 + }, + { + "epoch": 1.4054794520547946, + "grad_norm": 2.085311200733991, + "learning_rate": 9.981204529180222e-05, + "loss": 4.2643, + "step": 2181 + }, + { + "epoch": 1.4061240934730057, + "grad_norm": 1.6537727914835076, + "learning_rate": 9.981187060168278e-05, + "loss": 3.8048, + "step": 2182 + }, + { + "epoch": 1.4067687348912168, + "grad_norm": 2.45113434307199, + "learning_rate": 9.981169583057481e-05, + "loss": 4.2618, + "step": 2183 + }, + { + "epoch": 1.4074133763094279, + "grad_norm": 1.622478931777901, + "learning_rate": 9.981152097847856e-05, + "loss": 4.1304, + "step": 2184 + }, + { + "epoch": 1.408058017727639, + "grad_norm": 2.231103787718885, + "learning_rate": 9.981134604539431e-05, + "loss": 4.163, + "step": 2185 + }, + { + "epoch": 1.4087026591458502, + "grad_norm": 3.574660599216614, + "learning_rate": 9.981117103132235e-05, + "loss": 4.104, + "step": 2186 + }, + { + "epoch": 1.4093473005640613, + "grad_norm": 2.281652888333202, + "learning_rate": 9.981099593626298e-05, + "loss": 3.9176, + "step": 2187 + }, + { + "epoch": 1.4099919419822724, + "grad_norm": 2.338308900191729, + "learning_rate": 9.981082076021649e-05, + "loss": 3.9387, + "step": 2188 + }, + { + "epoch": 1.4106365834004835, + "grad_norm": 2.2156838815024087, + "learning_rate": 9.981064550318317e-05, + "loss": 3.9681, + "step": 2189 + }, + { + "epoch": 1.4112812248186946, + "grad_norm": 2.3971962021117807, + "learning_rate": 9.981047016516327e-05, + "loss": 4.0965, + "step": 2190 + }, + { + "epoch": 1.4119258662369059, + "grad_norm": 2.6759632667633317, + "learning_rate": 9.981029474615712e-05, + "loss": 3.8315, + "step": 2191 + }, + { + "epoch": 1.4125705076551167, + "grad_norm": 1.6003407514881096, + "learning_rate": 9.9810119246165e-05, + "loss": 4.1291, + "step": 2192 + }, + { + "epoch": 1.413215149073328, + "grad_norm": 2.1289935935986413, + "learning_rate": 9.980994366518719e-05, + "loss": 4.009, + "step": 2193 + }, + { + "epoch": 1.413859790491539, + "grad_norm": 1.6430210430593224, + "learning_rate": 9.9809768003224e-05, + "loss": 3.9686, + "step": 2194 + }, + { + "epoch": 1.4145044319097502, + "grad_norm": 1.9151536570044956, + "learning_rate": 9.980959226027566e-05, + "loss": 4.1421, + "step": 2195 + }, + { + "epoch": 1.4151490733279612, + "grad_norm": 1.6889416923170286, + "learning_rate": 9.980941643634251e-05, + "loss": 3.8849, + "step": 2196 + }, + { + "epoch": 1.4157937147461723, + "grad_norm": 2.171301113006166, + "learning_rate": 9.980924053142486e-05, + "loss": 3.788, + "step": 2197 + }, + { + "epoch": 1.4164383561643836, + "grad_norm": 2.1026832696188986, + "learning_rate": 9.980906454552294e-05, + "loss": 3.9348, + "step": 2198 + }, + { + "epoch": 1.4170829975825947, + "grad_norm": 2.290013151664431, + "learning_rate": 9.980888847863707e-05, + "loss": 4.3298, + "step": 2199 + }, + { + "epoch": 1.4177276390008058, + "grad_norm": 1.6599753182725923, + "learning_rate": 9.980871233076752e-05, + "loss": 3.9353, + "step": 2200 + }, + { + "epoch": 1.4177276390008058, + "eval_loss": 4.1037187576293945, + "eval_runtime": 2.9718, + "eval_samples_per_second": 33.65, + "eval_steps_per_second": 4.375, + "step": 2200 + }, + { + "epoch": 1.4183722804190169, + "grad_norm": 2.170169773165853, + "learning_rate": 9.980853610191463e-05, + "loss": 4.0166, + "step": 2201 + }, + { + "epoch": 1.419016921837228, + "grad_norm": 2.153223587409801, + "learning_rate": 9.980835979207863e-05, + "loss": 3.9974, + "step": 2202 + }, + { + "epoch": 1.4196615632554392, + "grad_norm": 1.521524779922555, + "learning_rate": 9.980818340125984e-05, + "loss": 4.3435, + "step": 2203 + }, + { + "epoch": 1.4203062046736503, + "grad_norm": 2.4770791198198725, + "learning_rate": 9.980800692945854e-05, + "loss": 4.0181, + "step": 2204 + }, + { + "epoch": 1.4209508460918614, + "grad_norm": 2.080007994227895, + "learning_rate": 9.9807830376675e-05, + "loss": 3.9041, + "step": 2205 + }, + { + "epoch": 1.4215954875100725, + "grad_norm": 1.6929287041141645, + "learning_rate": 9.980765374290957e-05, + "loss": 3.9822, + "step": 2206 + }, + { + "epoch": 1.4222401289282836, + "grad_norm": 2.663235877793047, + "learning_rate": 9.980747702816248e-05, + "loss": 3.6846, + "step": 2207 + }, + { + "epoch": 1.4228847703464949, + "grad_norm": 1.7401672100938808, + "learning_rate": 9.980730023243404e-05, + "loss": 4.2698, + "step": 2208 + }, + { + "epoch": 1.423529411764706, + "grad_norm": 2.2135364612275077, + "learning_rate": 9.980712335572455e-05, + "loss": 3.723, + "step": 2209 + }, + { + "epoch": 1.424174053182917, + "grad_norm": 1.9012984905130617, + "learning_rate": 9.98069463980343e-05, + "loss": 4.3105, + "step": 2210 + }, + { + "epoch": 1.424818694601128, + "grad_norm": 1.664017753247487, + "learning_rate": 9.980676935936356e-05, + "loss": 3.8807, + "step": 2211 + }, + { + "epoch": 1.4254633360193392, + "grad_norm": 2.4348018743861566, + "learning_rate": 9.980659223971265e-05, + "loss": 3.9529, + "step": 2212 + }, + { + "epoch": 1.4261079774375505, + "grad_norm": 1.6986723109414639, + "learning_rate": 9.980641503908182e-05, + "loss": 3.8984, + "step": 2213 + }, + { + "epoch": 1.4267526188557615, + "grad_norm": 1.8378033205579276, + "learning_rate": 9.980623775747143e-05, + "loss": 3.9074, + "step": 2214 + }, + { + "epoch": 1.4273972602739726, + "grad_norm": 2.052702989595387, + "learning_rate": 9.98060603948817e-05, + "loss": 3.9686, + "step": 2215 + }, + { + "epoch": 1.4280419016921837, + "grad_norm": 1.903790184621587, + "learning_rate": 9.980588295131295e-05, + "loss": 3.7968, + "step": 2216 + }, + { + "epoch": 1.4286865431103948, + "grad_norm": 2.2697252001161736, + "learning_rate": 9.980570542676546e-05, + "loss": 3.5494, + "step": 2217 + }, + { + "epoch": 1.429331184528606, + "grad_norm": 1.6362391377171446, + "learning_rate": 9.980552782123955e-05, + "loss": 4.0411, + "step": 2218 + }, + { + "epoch": 1.4299758259468172, + "grad_norm": 1.900046054937347, + "learning_rate": 9.980535013473549e-05, + "loss": 3.9812, + "step": 2219 + }, + { + "epoch": 1.4306204673650282, + "grad_norm": 1.8300220110636434, + "learning_rate": 9.980517236725358e-05, + "loss": 4.2204, + "step": 2220 + }, + { + "epoch": 1.4312651087832393, + "grad_norm": 1.7572469601037453, + "learning_rate": 9.980499451879411e-05, + "loss": 4.0747, + "step": 2221 + }, + { + "epoch": 1.4319097502014504, + "grad_norm": 1.864589125760284, + "learning_rate": 9.980481658935735e-05, + "loss": 4.1421, + "step": 2222 + }, + { + "epoch": 1.4325543916196617, + "grad_norm": 1.7025154155451083, + "learning_rate": 9.980463857894363e-05, + "loss": 3.9339, + "step": 2223 + }, + { + "epoch": 1.4331990330378725, + "grad_norm": 1.871401082222568, + "learning_rate": 9.980446048755321e-05, + "loss": 3.951, + "step": 2224 + }, + { + "epoch": 1.4338436744560838, + "grad_norm": 2.0366489404014634, + "learning_rate": 9.98042823151864e-05, + "loss": 3.7568, + "step": 2225 + }, + { + "epoch": 1.434488315874295, + "grad_norm": 1.7920992257788089, + "learning_rate": 9.98041040618435e-05, + "loss": 4.0905, + "step": 2226 + }, + { + "epoch": 1.435132957292506, + "grad_norm": 2.0841955204608267, + "learning_rate": 9.98039257275248e-05, + "loss": 3.8082, + "step": 2227 + }, + { + "epoch": 1.435777598710717, + "grad_norm": 1.8823232530325316, + "learning_rate": 9.980374731223056e-05, + "loss": 4.0966, + "step": 2228 + }, + { + "epoch": 1.4364222401289282, + "grad_norm": 1.4420507023624094, + "learning_rate": 9.980356881596111e-05, + "loss": 4.3655, + "step": 2229 + }, + { + "epoch": 1.4370668815471395, + "grad_norm": 2.0922734184725265, + "learning_rate": 9.980339023871674e-05, + "loss": 4.2432, + "step": 2230 + }, + { + "epoch": 1.4377115229653505, + "grad_norm": 2.5410743304332066, + "learning_rate": 9.980321158049773e-05, + "loss": 4.0949, + "step": 2231 + }, + { + "epoch": 1.4383561643835616, + "grad_norm": 2.115182725450714, + "learning_rate": 9.980303284130438e-05, + "loss": 3.8986, + "step": 2232 + }, + { + "epoch": 1.4390008058017727, + "grad_norm": 1.8118491858559294, + "learning_rate": 9.980285402113698e-05, + "loss": 4.0616, + "step": 2233 + }, + { + "epoch": 1.4396454472199838, + "grad_norm": 1.9859094969780005, + "learning_rate": 9.980267511999581e-05, + "loss": 4.0863, + "step": 2234 + }, + { + "epoch": 1.440290088638195, + "grad_norm": 2.3801433383521453, + "learning_rate": 9.980249613788119e-05, + "loss": 4.0996, + "step": 2235 + }, + { + "epoch": 1.4409347300564062, + "grad_norm": 1.5357268242972117, + "learning_rate": 9.98023170747934e-05, + "loss": 3.9054, + "step": 2236 + }, + { + "epoch": 1.4415793714746172, + "grad_norm": 2.574566782447034, + "learning_rate": 9.980213793073276e-05, + "loss": 3.8175, + "step": 2237 + }, + { + "epoch": 1.4422240128928283, + "grad_norm": 3.176807237301294, + "learning_rate": 9.98019587056995e-05, + "loss": 4.119, + "step": 2238 + }, + { + "epoch": 1.4428686543110394, + "grad_norm": 1.841525516238065, + "learning_rate": 9.9801779399694e-05, + "loss": 3.9152, + "step": 2239 + }, + { + "epoch": 1.4435132957292507, + "grad_norm": 2.638385027211402, + "learning_rate": 9.980160001271648e-05, + "loss": 4.1099, + "step": 2240 + }, + { + "epoch": 1.4441579371474618, + "grad_norm": 3.124135029660807, + "learning_rate": 9.980142054476727e-05, + "loss": 3.8442, + "step": 2241 + }, + { + "epoch": 1.4448025785656728, + "grad_norm": 2.3496965894842328, + "learning_rate": 9.980124099584665e-05, + "loss": 4.0736, + "step": 2242 + }, + { + "epoch": 1.445447219983884, + "grad_norm": 2.315126016566155, + "learning_rate": 9.980106136595495e-05, + "loss": 4.2237, + "step": 2243 + }, + { + "epoch": 1.446091861402095, + "grad_norm": 2.42208592275253, + "learning_rate": 9.980088165509244e-05, + "loss": 3.9445, + "step": 2244 + }, + { + "epoch": 1.4467365028203063, + "grad_norm": 2.652369956607507, + "learning_rate": 9.980070186325938e-05, + "loss": 4.1879, + "step": 2245 + }, + { + "epoch": 1.4473811442385174, + "grad_norm": 1.810349591660638, + "learning_rate": 9.980052199045614e-05, + "loss": 3.9254, + "step": 2246 + }, + { + "epoch": 1.4480257856567285, + "grad_norm": 3.3665292610191964, + "learning_rate": 9.980034203668295e-05, + "loss": 3.5221, + "step": 2247 + }, + { + "epoch": 1.4486704270749395, + "grad_norm": 3.142120641944308, + "learning_rate": 9.980016200194013e-05, + "loss": 4.2632, + "step": 2248 + }, + { + "epoch": 1.4493150684931506, + "grad_norm": 2.1812087118310255, + "learning_rate": 9.9799981886228e-05, + "loss": 3.4293, + "step": 2249 + }, + { + "epoch": 1.449959709911362, + "grad_norm": 3.1585956406380964, + "learning_rate": 9.979980168954682e-05, + "loss": 3.9345, + "step": 2250 + }, + { + "epoch": 1.450604351329573, + "grad_norm": 2.511226969635951, + "learning_rate": 9.97996214118969e-05, + "loss": 3.9649, + "step": 2251 + }, + { + "epoch": 1.451248992747784, + "grad_norm": 1.97167739729174, + "learning_rate": 9.979944105327852e-05, + "loss": 4.1248, + "step": 2252 + }, + { + "epoch": 1.4518936341659952, + "grad_norm": 2.1507233589063866, + "learning_rate": 9.979926061369202e-05, + "loss": 3.7012, + "step": 2253 + }, + { + "epoch": 1.4525382755842062, + "grad_norm": 2.4492519825270413, + "learning_rate": 9.979908009313763e-05, + "loss": 3.9349, + "step": 2254 + }, + { + "epoch": 1.4531829170024175, + "grad_norm": 1.7782177771229024, + "learning_rate": 9.97988994916157e-05, + "loss": 3.8703, + "step": 2255 + }, + { + "epoch": 1.4538275584206286, + "grad_norm": 2.715463908263355, + "learning_rate": 9.979871880912653e-05, + "loss": 4.036, + "step": 2256 + }, + { + "epoch": 1.4544721998388397, + "grad_norm": 2.0766337089747524, + "learning_rate": 9.979853804567036e-05, + "loss": 3.9761, + "step": 2257 + }, + { + "epoch": 1.4551168412570508, + "grad_norm": 2.062909203410658, + "learning_rate": 9.979835720124755e-05, + "loss": 3.9601, + "step": 2258 + }, + { + "epoch": 1.4557614826752618, + "grad_norm": 2.6866085108085223, + "learning_rate": 9.979817627585835e-05, + "loss": 3.9389, + "step": 2259 + }, + { + "epoch": 1.4564061240934731, + "grad_norm": 1.8068125727538389, + "learning_rate": 9.97979952695031e-05, + "loss": 3.9735, + "step": 2260 + }, + { + "epoch": 1.457050765511684, + "grad_norm": 3.0914710483158063, + "learning_rate": 9.979781418218208e-05, + "loss": 3.5211, + "step": 2261 + }, + { + "epoch": 1.4576954069298953, + "grad_norm": 3.0829772326743083, + "learning_rate": 9.979763301389557e-05, + "loss": 3.7573, + "step": 2262 + }, + { + "epoch": 1.4583400483481064, + "grad_norm": 2.0781558757427443, + "learning_rate": 9.979745176464386e-05, + "loss": 3.8122, + "step": 2263 + }, + { + "epoch": 1.4589846897663175, + "grad_norm": 1.845701996200521, + "learning_rate": 9.979727043442729e-05, + "loss": 3.9589, + "step": 2264 + }, + { + "epoch": 1.4596293311845285, + "grad_norm": 2.2284949895062125, + "learning_rate": 9.979708902324614e-05, + "loss": 3.9045, + "step": 2265 + }, + { + "epoch": 1.4602739726027396, + "grad_norm": 1.8794445837643985, + "learning_rate": 9.979690753110069e-05, + "loss": 3.7318, + "step": 2266 + }, + { + "epoch": 1.460918614020951, + "grad_norm": 2.500466509141429, + "learning_rate": 9.979672595799126e-05, + "loss": 3.8552, + "step": 2267 + }, + { + "epoch": 1.461563255439162, + "grad_norm": 1.707700020387214, + "learning_rate": 9.979654430391813e-05, + "loss": 4.134, + "step": 2268 + }, + { + "epoch": 1.462207896857373, + "grad_norm": 2.746767744865056, + "learning_rate": 9.979636256888165e-05, + "loss": 3.8872, + "step": 2269 + }, + { + "epoch": 1.4628525382755841, + "grad_norm": 1.3013457046746333, + "learning_rate": 9.979618075288203e-05, + "loss": 4.2166, + "step": 2270 + }, + { + "epoch": 1.4634971796937952, + "grad_norm": 2.3176195714723438, + "learning_rate": 9.979599885591962e-05, + "loss": 4.2421, + "step": 2271 + }, + { + "epoch": 1.4641418211120065, + "grad_norm": 1.443504108228214, + "learning_rate": 9.979581687799473e-05, + "loss": 4.0396, + "step": 2272 + }, + { + "epoch": 1.4647864625302176, + "grad_norm": 1.7994746874837892, + "learning_rate": 9.979563481910763e-05, + "loss": 4.4214, + "step": 2273 + }, + { + "epoch": 1.4654311039484287, + "grad_norm": 2.083269506705622, + "learning_rate": 9.979545267925864e-05, + "loss": 3.758, + "step": 2274 + }, + { + "epoch": 1.4660757453666398, + "grad_norm": 2.035770170128351, + "learning_rate": 9.979527045844805e-05, + "loss": 4.0704, + "step": 2275 + }, + { + "epoch": 1.4667203867848508, + "grad_norm": 1.9982720699257386, + "learning_rate": 9.979508815667615e-05, + "loss": 4.1297, + "step": 2276 + }, + { + "epoch": 1.4673650282030621, + "grad_norm": 1.9560435747388298, + "learning_rate": 9.979490577394328e-05, + "loss": 3.8415, + "step": 2277 + }, + { + "epoch": 1.4680096696212732, + "grad_norm": 1.659925718316218, + "learning_rate": 9.979472331024967e-05, + "loss": 3.7888, + "step": 2278 + }, + { + "epoch": 1.4686543110394843, + "grad_norm": 1.9462209775499963, + "learning_rate": 9.979454076559567e-05, + "loss": 3.8771, + "step": 2279 + }, + { + "epoch": 1.4692989524576954, + "grad_norm": 1.5893621430281502, + "learning_rate": 9.979435813998158e-05, + "loss": 4.2124, + "step": 2280 + }, + { + "epoch": 1.4699435938759065, + "grad_norm": 2.062036096098237, + "learning_rate": 9.97941754334077e-05, + "loss": 3.6775, + "step": 2281 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 2.38768012099808, + "learning_rate": 9.97939926458743e-05, + "loss": 3.8149, + "step": 2282 + }, + { + "epoch": 1.4712328767123288, + "grad_norm": 2.8352959543735126, + "learning_rate": 9.97938097773817e-05, + "loss": 3.849, + "step": 2283 + }, + { + "epoch": 1.47187751813054, + "grad_norm": 2.179814774644278, + "learning_rate": 9.97936268279302e-05, + "loss": 3.9726, + "step": 2284 + }, + { + "epoch": 1.472522159548751, + "grad_norm": 9.132832689440205, + "learning_rate": 9.979344379752011e-05, + "loss": 4.3697, + "step": 2285 + }, + { + "epoch": 1.473166800966962, + "grad_norm": 1.9841696669456663, + "learning_rate": 9.979326068615171e-05, + "loss": 3.923, + "step": 2286 + }, + { + "epoch": 1.4738114423851734, + "grad_norm": 1.7447759316044336, + "learning_rate": 9.979307749382532e-05, + "loss": 3.9983, + "step": 2287 + }, + { + "epoch": 1.4744560838033844, + "grad_norm": 2.40095596797202, + "learning_rate": 9.979289422054122e-05, + "loss": 4.0188, + "step": 2288 + }, + { + "epoch": 1.4751007252215955, + "grad_norm": 1.7460414477085218, + "learning_rate": 9.979271086629973e-05, + "loss": 4.1879, + "step": 2289 + }, + { + "epoch": 1.4757453666398066, + "grad_norm": 1.8427734810083567, + "learning_rate": 9.979252743110114e-05, + "loss": 4.2692, + "step": 2290 + }, + { + "epoch": 1.4763900080580177, + "grad_norm": 2.160802149021169, + "learning_rate": 9.979234391494575e-05, + "loss": 4.0604, + "step": 2291 + }, + { + "epoch": 1.477034649476229, + "grad_norm": 1.932337221728522, + "learning_rate": 9.979216031783389e-05, + "loss": 4.0354, + "step": 2292 + }, + { + "epoch": 1.4776792908944398, + "grad_norm": 1.7893359049450739, + "learning_rate": 9.979197663976581e-05, + "loss": 4.1028, + "step": 2293 + }, + { + "epoch": 1.4783239323126511, + "grad_norm": 2.281334675030731, + "learning_rate": 9.979179288074186e-05, + "loss": 3.7363, + "step": 2294 + }, + { + "epoch": 1.4789685737308622, + "grad_norm": 1.66082756592752, + "learning_rate": 9.979160904076233e-05, + "loss": 4.4012, + "step": 2295 + }, + { + "epoch": 1.4796132151490733, + "grad_norm": 2.0911412289995583, + "learning_rate": 9.979142511982749e-05, + "loss": 3.8645, + "step": 2296 + }, + { + "epoch": 1.4802578565672844, + "grad_norm": 2.0933371651307118, + "learning_rate": 9.979124111793768e-05, + "loss": 4.034, + "step": 2297 + }, + { + "epoch": 1.4809024979854954, + "grad_norm": 1.6104353424989208, + "learning_rate": 9.979105703509317e-05, + "loss": 3.8127, + "step": 2298 + }, + { + "epoch": 1.4815471394037067, + "grad_norm": 1.6381306498978578, + "learning_rate": 9.97908728712943e-05, + "loss": 4.1229, + "step": 2299 + }, + { + "epoch": 1.4821917808219178, + "grad_norm": 2.332442381528007, + "learning_rate": 9.979068862654133e-05, + "loss": 3.824, + "step": 2300 + }, + { + "epoch": 1.4821917808219178, + "eval_loss": 4.09398078918457, + "eval_runtime": 2.9741, + "eval_samples_per_second": 33.623, + "eval_steps_per_second": 4.371, + "step": 2300 + }, + { + "epoch": 1.482836422240129, + "grad_norm": 2.0813302166508976, + "learning_rate": 9.979050430083462e-05, + "loss": 4.1241, + "step": 2301 + }, + { + "epoch": 1.48348106365834, + "grad_norm": 1.8361839920039238, + "learning_rate": 9.979031989417443e-05, + "loss": 4.1323, + "step": 2302 + }, + { + "epoch": 1.484125705076551, + "grad_norm": 1.8195506630424907, + "learning_rate": 9.979013540656106e-05, + "loss": 4.159, + "step": 2303 + }, + { + "epoch": 1.4847703464947624, + "grad_norm": 1.748668311958494, + "learning_rate": 9.978995083799482e-05, + "loss": 4.1609, + "step": 2304 + }, + { + "epoch": 1.4854149879129734, + "grad_norm": 2.512772007321459, + "learning_rate": 9.978976618847602e-05, + "loss": 3.6554, + "step": 2305 + }, + { + "epoch": 1.4860596293311845, + "grad_norm": 1.881017033651195, + "learning_rate": 9.978958145800497e-05, + "loss": 4.0901, + "step": 2306 + }, + { + "epoch": 1.4867042707493956, + "grad_norm": 1.7171435927863006, + "learning_rate": 9.978939664658196e-05, + "loss": 3.9605, + "step": 2307 + }, + { + "epoch": 1.4873489121676067, + "grad_norm": 1.649250787854772, + "learning_rate": 9.978921175420728e-05, + "loss": 4.2588, + "step": 2308 + }, + { + "epoch": 1.487993553585818, + "grad_norm": 1.6011595464199553, + "learning_rate": 9.978902678088128e-05, + "loss": 3.9942, + "step": 2309 + }, + { + "epoch": 1.488638195004029, + "grad_norm": 1.7230930286804045, + "learning_rate": 9.978884172660422e-05, + "loss": 3.5497, + "step": 2310 + }, + { + "epoch": 1.4892828364222401, + "grad_norm": 1.866922074196695, + "learning_rate": 9.978865659137642e-05, + "loss": 3.9852, + "step": 2311 + }, + { + "epoch": 1.4899274778404512, + "grad_norm": 2.0245461568482535, + "learning_rate": 9.978847137519818e-05, + "loss": 4.2606, + "step": 2312 + }, + { + "epoch": 1.4905721192586623, + "grad_norm": 2.149102550623704, + "learning_rate": 9.978828607806982e-05, + "loss": 3.6788, + "step": 2313 + }, + { + "epoch": 1.4912167606768736, + "grad_norm": 1.5338345548724592, + "learning_rate": 9.978810069999161e-05, + "loss": 3.9309, + "step": 2314 + }, + { + "epoch": 1.4918614020950847, + "grad_norm": 1.2610858176963313, + "learning_rate": 9.978791524096388e-05, + "loss": 4.658, + "step": 2315 + }, + { + "epoch": 1.4925060435132957, + "grad_norm": 1.458718087743192, + "learning_rate": 9.978772970098694e-05, + "loss": 3.8009, + "step": 2316 + }, + { + "epoch": 1.4931506849315068, + "grad_norm": 2.1849987507927815, + "learning_rate": 9.978754408006107e-05, + "loss": 3.5805, + "step": 2317 + }, + { + "epoch": 1.493795326349718, + "grad_norm": 3.0235155247981655, + "learning_rate": 9.978735837818661e-05, + "loss": 3.9128, + "step": 2318 + }, + { + "epoch": 1.4944399677679292, + "grad_norm": 2.750668323199533, + "learning_rate": 9.978717259536385e-05, + "loss": 4.1514, + "step": 2319 + }, + { + "epoch": 1.4950846091861403, + "grad_norm": 1.6407401400417925, + "learning_rate": 9.978698673159306e-05, + "loss": 4.1288, + "step": 2320 + }, + { + "epoch": 1.4957292506043514, + "grad_norm": 2.0089109273859433, + "learning_rate": 9.978680078687459e-05, + "loss": 3.5532, + "step": 2321 + }, + { + "epoch": 1.4963738920225624, + "grad_norm": 1.9929799124788214, + "learning_rate": 9.978661476120875e-05, + "loss": 4.1512, + "step": 2322 + }, + { + "epoch": 1.4970185334407735, + "grad_norm": 1.431828576351427, + "learning_rate": 9.978642865459581e-05, + "loss": 4.0788, + "step": 2323 + }, + { + "epoch": 1.4976631748589848, + "grad_norm": 2.4428151462280434, + "learning_rate": 9.978624246703608e-05, + "loss": 4.1114, + "step": 2324 + }, + { + "epoch": 1.498307816277196, + "grad_norm": 1.5404838361841957, + "learning_rate": 9.978605619852989e-05, + "loss": 4.1447, + "step": 2325 + }, + { + "epoch": 1.498952457695407, + "grad_norm": 1.7990794407253625, + "learning_rate": 9.978586984907753e-05, + "loss": 4.185, + "step": 2326 + }, + { + "epoch": 1.499597099113618, + "grad_norm": 2.0083616287226045, + "learning_rate": 9.978568341867932e-05, + "loss": 4.0402, + "step": 2327 + }, + { + "epoch": 1.5002417405318291, + "grad_norm": 1.2818960156220676, + "learning_rate": 9.978549690733556e-05, + "loss": 4.1729, + "step": 2328 + }, + { + "epoch": 1.5008863819500404, + "grad_norm": 1.776132448643161, + "learning_rate": 9.978531031504652e-05, + "loss": 4.0696, + "step": 2329 + }, + { + "epoch": 1.5015310233682513, + "grad_norm": 1.7785928721272617, + "learning_rate": 9.978512364181256e-05, + "loss": 3.9111, + "step": 2330 + }, + { + "epoch": 1.5021756647864626, + "grad_norm": 1.477941019900265, + "learning_rate": 9.978493688763397e-05, + "loss": 4.4435, + "step": 2331 + }, + { + "epoch": 1.5028203062046737, + "grad_norm": 1.3958745349332522, + "learning_rate": 9.978475005251105e-05, + "loss": 3.9081, + "step": 2332 + }, + { + "epoch": 1.5034649476228847, + "grad_norm": 1.1904818080503572, + "learning_rate": 9.97845631364441e-05, + "loss": 3.9364, + "step": 2333 + }, + { + "epoch": 1.504109589041096, + "grad_norm": 1.3590539234507764, + "learning_rate": 9.978437613943345e-05, + "loss": 3.9495, + "step": 2334 + }, + { + "epoch": 1.504754230459307, + "grad_norm": 1.4866070296302967, + "learning_rate": 9.978418906147937e-05, + "loss": 3.5882, + "step": 2335 + }, + { + "epoch": 1.5053988718775182, + "grad_norm": 1.7643394418777736, + "learning_rate": 9.978400190258221e-05, + "loss": 4.0508, + "step": 2336 + }, + { + "epoch": 1.5060435132957293, + "grad_norm": 1.5591537276391318, + "learning_rate": 9.978381466274224e-05, + "loss": 4.2765, + "step": 2337 + }, + { + "epoch": 1.5066881547139404, + "grad_norm": 1.937465687833817, + "learning_rate": 9.978362734195981e-05, + "loss": 3.9661, + "step": 2338 + }, + { + "epoch": 1.5073327961321517, + "grad_norm": 2.1321070697925295, + "learning_rate": 9.978343994023519e-05, + "loss": 4.3587, + "step": 2339 + }, + { + "epoch": 1.5079774375503625, + "grad_norm": 1.5343827963538221, + "learning_rate": 9.97832524575687e-05, + "loss": 4.1301, + "step": 2340 + }, + { + "epoch": 1.5086220789685738, + "grad_norm": 2.1881095115786233, + "learning_rate": 9.978306489396064e-05, + "loss": 3.9447, + "step": 2341 + }, + { + "epoch": 1.5092667203867849, + "grad_norm": 1.8774637936337921, + "learning_rate": 9.978287724941134e-05, + "loss": 4.2998, + "step": 2342 + }, + { + "epoch": 1.509911361804996, + "grad_norm": 1.3659188239179711, + "learning_rate": 9.978268952392107e-05, + "loss": 3.8628, + "step": 2343 + }, + { + "epoch": 1.510556003223207, + "grad_norm": 1.6876303752017638, + "learning_rate": 9.97825017174902e-05, + "loss": 3.7911, + "step": 2344 + }, + { + "epoch": 1.5112006446414181, + "grad_norm": 1.9152780635651387, + "learning_rate": 9.978231383011898e-05, + "loss": 3.8229, + "step": 2345 + }, + { + "epoch": 1.5118452860596294, + "grad_norm": 1.1699835079752556, + "learning_rate": 9.978212586180775e-05, + "loss": 4.0197, + "step": 2346 + }, + { + "epoch": 1.5124899274778405, + "grad_norm": 2.597544953006601, + "learning_rate": 9.97819378125568e-05, + "loss": 4.0169, + "step": 2347 + }, + { + "epoch": 1.5131345688960516, + "grad_norm": 2.8840460136847508, + "learning_rate": 9.978174968236645e-05, + "loss": 3.9572, + "step": 2348 + }, + { + "epoch": 1.5137792103142627, + "grad_norm": 2.544929836525109, + "learning_rate": 9.9781561471237e-05, + "loss": 3.7257, + "step": 2349 + }, + { + "epoch": 1.5144238517324737, + "grad_norm": 2.0917445129019305, + "learning_rate": 9.978137317916879e-05, + "loss": 3.9588, + "step": 2350 + }, + { + "epoch": 1.515068493150685, + "grad_norm": 2.0066074189642764, + "learning_rate": 9.978118480616209e-05, + "loss": 3.9803, + "step": 2351 + }, + { + "epoch": 1.515713134568896, + "grad_norm": 2.0988886568102263, + "learning_rate": 9.978099635221721e-05, + "loss": 3.9335, + "step": 2352 + }, + { + "epoch": 1.5163577759871072, + "grad_norm": 1.7247255108802622, + "learning_rate": 9.978080781733449e-05, + "loss": 4.0637, + "step": 2353 + }, + { + "epoch": 1.5170024174053183, + "grad_norm": 1.2729532013750722, + "learning_rate": 9.978061920151421e-05, + "loss": 4.2147, + "step": 2354 + }, + { + "epoch": 1.5176470588235293, + "grad_norm": 1.7208581455347212, + "learning_rate": 9.978043050475672e-05, + "loss": 3.9512, + "step": 2355 + }, + { + "epoch": 1.5182917002417406, + "grad_norm": 1.657756285021262, + "learning_rate": 9.978024172706229e-05, + "loss": 3.9558, + "step": 2356 + }, + { + "epoch": 1.5189363416599515, + "grad_norm": 1.6170677692201931, + "learning_rate": 9.978005286843123e-05, + "loss": 4.2961, + "step": 2357 + }, + { + "epoch": 1.5195809830781628, + "grad_norm": 1.4616877677165963, + "learning_rate": 9.977986392886387e-05, + "loss": 3.9161, + "step": 2358 + }, + { + "epoch": 1.5202256244963739, + "grad_norm": 2.244575578900248, + "learning_rate": 9.977967490836052e-05, + "loss": 4.1288, + "step": 2359 + }, + { + "epoch": 1.520870265914585, + "grad_norm": 2.188674272894347, + "learning_rate": 9.977948580692149e-05, + "loss": 3.8905, + "step": 2360 + }, + { + "epoch": 1.5215149073327963, + "grad_norm": 1.7821510857239498, + "learning_rate": 9.977929662454706e-05, + "loss": 3.8674, + "step": 2361 + }, + { + "epoch": 1.5221595487510071, + "grad_norm": 2.401768679871879, + "learning_rate": 9.977910736123758e-05, + "loss": 3.9313, + "step": 2362 + }, + { + "epoch": 1.5228041901692184, + "grad_norm": 3.0753328841429783, + "learning_rate": 9.977891801699335e-05, + "loss": 3.8643, + "step": 2363 + }, + { + "epoch": 1.5234488315874295, + "grad_norm": 2.296486618031004, + "learning_rate": 9.977872859181468e-05, + "loss": 3.826, + "step": 2364 + }, + { + "epoch": 1.5240934730056406, + "grad_norm": 1.9490089400583024, + "learning_rate": 9.977853908570187e-05, + "loss": 3.8175, + "step": 2365 + }, + { + "epoch": 1.5247381144238519, + "grad_norm": 2.3468947967418137, + "learning_rate": 9.977834949865524e-05, + "loss": 3.7966, + "step": 2366 + }, + { + "epoch": 1.5253827558420627, + "grad_norm": 1.9846118812880051, + "learning_rate": 9.977815983067511e-05, + "loss": 3.8694, + "step": 2367 + }, + { + "epoch": 1.526027397260274, + "grad_norm": 2.3885657514968885, + "learning_rate": 9.977797008176176e-05, + "loss": 4.2061, + "step": 2368 + }, + { + "epoch": 1.526672038678485, + "grad_norm": 2.268197203850748, + "learning_rate": 9.977778025191554e-05, + "loss": 3.868, + "step": 2369 + }, + { + "epoch": 1.5273166800966962, + "grad_norm": 2.0593157715585124, + "learning_rate": 9.977759034113675e-05, + "loss": 4.0341, + "step": 2370 + }, + { + "epoch": 1.5279613215149075, + "grad_norm": 2.4543049408758346, + "learning_rate": 9.977740034942569e-05, + "loss": 4.2268, + "step": 2371 + }, + { + "epoch": 1.5286059629331183, + "grad_norm": 1.7501378013852233, + "learning_rate": 9.977721027678268e-05, + "loss": 4.2669, + "step": 2372 + }, + { + "epoch": 1.5292506043513296, + "grad_norm": 2.7563647889339964, + "learning_rate": 9.977702012320804e-05, + "loss": 3.9896, + "step": 2373 + }, + { + "epoch": 1.5298952457695407, + "grad_norm": 2.392097088676128, + "learning_rate": 9.977682988870206e-05, + "loss": 3.9922, + "step": 2374 + }, + { + "epoch": 1.5305398871877518, + "grad_norm": 2.0286555574023617, + "learning_rate": 9.977663957326509e-05, + "loss": 4.1054, + "step": 2375 + }, + { + "epoch": 1.5311845286059629, + "grad_norm": 2.231291545567087, + "learning_rate": 9.97764491768974e-05, + "loss": 3.8727, + "step": 2376 + }, + { + "epoch": 1.531829170024174, + "grad_norm": 2.3480868128648313, + "learning_rate": 9.977625869959933e-05, + "loss": 4.1835, + "step": 2377 + }, + { + "epoch": 1.5324738114423853, + "grad_norm": 2.189996026628768, + "learning_rate": 9.977606814137118e-05, + "loss": 3.991, + "step": 2378 + }, + { + "epoch": 1.5331184528605963, + "grad_norm": 1.9521981272899822, + "learning_rate": 9.977587750221325e-05, + "loss": 4.315, + "step": 2379 + }, + { + "epoch": 1.5337630942788074, + "grad_norm": 1.7714665985537192, + "learning_rate": 9.977568678212589e-05, + "loss": 4.0579, + "step": 2380 + }, + { + "epoch": 1.5344077356970185, + "grad_norm": 1.7566295195422168, + "learning_rate": 9.977549598110941e-05, + "loss": 3.5463, + "step": 2381 + }, + { + "epoch": 1.5350523771152296, + "grad_norm": 1.9569486933106672, + "learning_rate": 9.97753050991641e-05, + "loss": 3.7931, + "step": 2382 + }, + { + "epoch": 1.5356970185334409, + "grad_norm": 2.0482301942134424, + "learning_rate": 9.977511413629026e-05, + "loss": 4.2937, + "step": 2383 + }, + { + "epoch": 1.5363416599516517, + "grad_norm": 1.5270323153641276, + "learning_rate": 9.977492309248824e-05, + "loss": 4.1901, + "step": 2384 + }, + { + "epoch": 1.536986301369863, + "grad_norm": 1.8274392542517375, + "learning_rate": 9.977473196775833e-05, + "loss": 3.9647, + "step": 2385 + }, + { + "epoch": 1.537630942788074, + "grad_norm": 2.1934308657276778, + "learning_rate": 9.977454076210087e-05, + "loss": 4.2031, + "step": 2386 + }, + { + "epoch": 1.5382755842062852, + "grad_norm": 2.6266148637560516, + "learning_rate": 9.977434947551614e-05, + "loss": 3.4772, + "step": 2387 + }, + { + "epoch": 1.5389202256244965, + "grad_norm": 2.690128340271726, + "learning_rate": 9.977415810800448e-05, + "loss": 4.0142, + "step": 2388 + }, + { + "epoch": 1.5395648670427073, + "grad_norm": 1.723909092147864, + "learning_rate": 9.977396665956618e-05, + "loss": 4.4798, + "step": 2389 + }, + { + "epoch": 1.5402095084609186, + "grad_norm": 2.624839901845, + "learning_rate": 9.977377513020158e-05, + "loss": 4.2924, + "step": 2390 + }, + { + "epoch": 1.5408541498791297, + "grad_norm": 1.2546603931835782, + "learning_rate": 9.9773583519911e-05, + "loss": 4.1292, + "step": 2391 + }, + { + "epoch": 1.5414987912973408, + "grad_norm": 2.0703018409084635, + "learning_rate": 9.97733918286947e-05, + "loss": 4.3739, + "step": 2392 + }, + { + "epoch": 1.542143432715552, + "grad_norm": 1.6462340203958403, + "learning_rate": 9.977320005655306e-05, + "loss": 4.2028, + "step": 2393 + }, + { + "epoch": 1.542788074133763, + "grad_norm": 1.775420960294619, + "learning_rate": 9.977300820348636e-05, + "loss": 3.7188, + "step": 2394 + }, + { + "epoch": 1.5434327155519743, + "grad_norm": 1.7715787745807319, + "learning_rate": 9.977281626949494e-05, + "loss": 3.7647, + "step": 2395 + }, + { + "epoch": 1.5440773569701853, + "grad_norm": 1.4434757791745838, + "learning_rate": 9.977262425457907e-05, + "loss": 4.5595, + "step": 2396 + }, + { + "epoch": 1.5447219983883964, + "grad_norm": 1.380786594610482, + "learning_rate": 9.977243215873911e-05, + "loss": 4.3547, + "step": 2397 + }, + { + "epoch": 1.5453666398066077, + "grad_norm": 1.8037019098584188, + "learning_rate": 9.977223998197537e-05, + "loss": 3.8486, + "step": 2398 + }, + { + "epoch": 1.5460112812248186, + "grad_norm": 2.133193876572135, + "learning_rate": 9.977204772428814e-05, + "loss": 4.0104, + "step": 2399 + }, + { + "epoch": 1.5466559226430299, + "grad_norm": 2.3660222325424436, + "learning_rate": 9.977185538567775e-05, + "loss": 3.9454, + "step": 2400 + }, + { + "epoch": 1.5466559226430299, + "eval_loss": 4.083171844482422, + "eval_runtime": 2.97, + "eval_samples_per_second": 33.67, + "eval_steps_per_second": 4.377, + "step": 2400 + }, + { + "epoch": 1.547300564061241, + "grad_norm": 1.861381282503112, + "learning_rate": 9.977166296614452e-05, + "loss": 4.0087, + "step": 2401 + }, + { + "epoch": 1.547945205479452, + "grad_norm": 1.7758756103576316, + "learning_rate": 9.977147046568876e-05, + "loss": 3.9884, + "step": 2402 + }, + { + "epoch": 1.5485898468976633, + "grad_norm": 2.0814328843445034, + "learning_rate": 9.97712778843108e-05, + "loss": 3.7813, + "step": 2403 + }, + { + "epoch": 1.5492344883158742, + "grad_norm": 1.3723528291702136, + "learning_rate": 9.977108522201094e-05, + "loss": 3.925, + "step": 2404 + }, + { + "epoch": 1.5498791297340855, + "grad_norm": 2.005446247873804, + "learning_rate": 9.97708924787895e-05, + "loss": 3.919, + "step": 2405 + }, + { + "epoch": 1.5505237711522966, + "grad_norm": 2.122971336139549, + "learning_rate": 9.977069965464679e-05, + "loss": 3.8756, + "step": 2406 + }, + { + "epoch": 1.5511684125705076, + "grad_norm": 1.506293929709827, + "learning_rate": 9.977050674958315e-05, + "loss": 3.9301, + "step": 2407 + }, + { + "epoch": 1.551813053988719, + "grad_norm": 1.8474450373028333, + "learning_rate": 9.977031376359887e-05, + "loss": 3.8072, + "step": 2408 + }, + { + "epoch": 1.5524576954069298, + "grad_norm": 1.7625117040196208, + "learning_rate": 9.977012069669427e-05, + "loss": 4.1608, + "step": 2409 + }, + { + "epoch": 1.553102336825141, + "grad_norm": 1.3118021899634407, + "learning_rate": 9.976992754886969e-05, + "loss": 3.9844, + "step": 2410 + }, + { + "epoch": 1.5537469782433522, + "grad_norm": 1.5394512887273715, + "learning_rate": 9.976973432012544e-05, + "loss": 3.9605, + "step": 2411 + }, + { + "epoch": 1.5543916196615633, + "grad_norm": 1.8801920716882503, + "learning_rate": 9.976954101046183e-05, + "loss": 3.7061, + "step": 2412 + }, + { + "epoch": 1.5550362610797743, + "grad_norm": 2.083923665242518, + "learning_rate": 9.976934761987914e-05, + "loss": 3.8448, + "step": 2413 + }, + { + "epoch": 1.5556809024979854, + "grad_norm": 2.092884481941996, + "learning_rate": 9.976915414837776e-05, + "loss": 3.7919, + "step": 2414 + }, + { + "epoch": 1.5563255439161967, + "grad_norm": 1.32072059835236, + "learning_rate": 9.976896059595799e-05, + "loss": 4.3191, + "step": 2415 + }, + { + "epoch": 1.5569701853344078, + "grad_norm": 1.6037614122178836, + "learning_rate": 9.976876696262009e-05, + "loss": 4.2028, + "step": 2416 + }, + { + "epoch": 1.5576148267526189, + "grad_norm": 1.7255276280897904, + "learning_rate": 9.976857324836445e-05, + "loss": 3.9364, + "step": 2417 + }, + { + "epoch": 1.55825946817083, + "grad_norm": 1.9147212120935029, + "learning_rate": 9.976837945319135e-05, + "loss": 3.6743, + "step": 2418 + }, + { + "epoch": 1.558904109589041, + "grad_norm": 2.0663379342594603, + "learning_rate": 9.97681855771011e-05, + "loss": 3.7637, + "step": 2419 + }, + { + "epoch": 1.5595487510072523, + "grad_norm": 1.9953254553191608, + "learning_rate": 9.976799162009404e-05, + "loss": 4.1254, + "step": 2420 + }, + { + "epoch": 1.5601933924254632, + "grad_norm": 1.4754965976630041, + "learning_rate": 9.97677975821705e-05, + "loss": 4.4567, + "step": 2421 + }, + { + "epoch": 1.5608380338436745, + "grad_norm": 1.2069506999200184, + "learning_rate": 9.976760346333076e-05, + "loss": 4.5511, + "step": 2422 + }, + { + "epoch": 1.5614826752618856, + "grad_norm": 1.3453978783018004, + "learning_rate": 9.976740926357517e-05, + "loss": 3.8733, + "step": 2423 + }, + { + "epoch": 1.5621273166800966, + "grad_norm": 1.7617996350861196, + "learning_rate": 9.976721498290403e-05, + "loss": 4.1594, + "step": 2424 + }, + { + "epoch": 1.562771958098308, + "grad_norm": 1.400063717130363, + "learning_rate": 9.976702062131769e-05, + "loss": 3.9343, + "step": 2425 + }, + { + "epoch": 1.5634165995165188, + "grad_norm": 1.3818744269584986, + "learning_rate": 9.976682617881643e-05, + "loss": 3.8332, + "step": 2426 + }, + { + "epoch": 1.56406124093473, + "grad_norm": 1.9307347177393037, + "learning_rate": 9.976663165540059e-05, + "loss": 4.1951, + "step": 2427 + }, + { + "epoch": 1.5647058823529412, + "grad_norm": 1.5895403594506412, + "learning_rate": 9.976643705107048e-05, + "loss": 3.9333, + "step": 2428 + }, + { + "epoch": 1.5653505237711522, + "grad_norm": 1.2946769978420463, + "learning_rate": 9.976624236582642e-05, + "loss": 3.885, + "step": 2429 + }, + { + "epoch": 1.5659951651893635, + "grad_norm": 2.049868565445518, + "learning_rate": 9.976604759966876e-05, + "loss": 3.8981, + "step": 2430 + }, + { + "epoch": 1.5666398066075744, + "grad_norm": 2.1673347782017545, + "learning_rate": 9.976585275259776e-05, + "loss": 3.6806, + "step": 2431 + }, + { + "epoch": 1.5672844480257857, + "grad_norm": 2.2896082041758072, + "learning_rate": 9.97656578246138e-05, + "loss": 4.0347, + "step": 2432 + }, + { + "epoch": 1.5679290894439968, + "grad_norm": 2.7188358763415046, + "learning_rate": 9.976546281571718e-05, + "loss": 4.387, + "step": 2433 + }, + { + "epoch": 1.5685737308622079, + "grad_norm": 2.4030246296066964, + "learning_rate": 9.97652677259082e-05, + "loss": 4.1466, + "step": 2434 + }, + { + "epoch": 1.5692183722804192, + "grad_norm": 2.4210116204087915, + "learning_rate": 9.97650725551872e-05, + "loss": 4.4094, + "step": 2435 + }, + { + "epoch": 1.56986301369863, + "grad_norm": 2.0296381573004716, + "learning_rate": 9.976487730355451e-05, + "loss": 4.2611, + "step": 2436 + }, + { + "epoch": 1.5705076551168413, + "grad_norm": 3.0499824307915615, + "learning_rate": 9.976468197101043e-05, + "loss": 3.8323, + "step": 2437 + }, + { + "epoch": 1.5711522965350524, + "grad_norm": 2.787211094328352, + "learning_rate": 9.976448655755528e-05, + "loss": 4.0066, + "step": 2438 + }, + { + "epoch": 1.5717969379532635, + "grad_norm": 2.132022845455185, + "learning_rate": 9.97642910631894e-05, + "loss": 3.8831, + "step": 2439 + }, + { + "epoch": 1.5724415793714748, + "grad_norm": 2.222763546656547, + "learning_rate": 9.976409548791308e-05, + "loss": 3.539, + "step": 2440 + }, + { + "epoch": 1.5730862207896856, + "grad_norm": 2.034438475256864, + "learning_rate": 9.97638998317267e-05, + "loss": 3.91, + "step": 2441 + }, + { + "epoch": 1.573730862207897, + "grad_norm": 2.4930648192142475, + "learning_rate": 9.97637040946305e-05, + "loss": 4.1353, + "step": 2442 + }, + { + "epoch": 1.574375503626108, + "grad_norm": 2.7766744668182115, + "learning_rate": 9.976350827662488e-05, + "loss": 3.7984, + "step": 2443 + }, + { + "epoch": 1.575020145044319, + "grad_norm": 1.6349182015825297, + "learning_rate": 9.97633123777101e-05, + "loss": 3.9119, + "step": 2444 + }, + { + "epoch": 1.5756647864625302, + "grad_norm": 2.6276765965393185, + "learning_rate": 9.976311639788651e-05, + "loss": 4.0306, + "step": 2445 + }, + { + "epoch": 1.5763094278807412, + "grad_norm": 2.1180081426057757, + "learning_rate": 9.976292033715443e-05, + "loss": 4.224, + "step": 2446 + }, + { + "epoch": 1.5769540692989525, + "grad_norm": 2.6101175235606293, + "learning_rate": 9.97627241955142e-05, + "loss": 4.3007, + "step": 2447 + }, + { + "epoch": 1.5775987107171636, + "grad_norm": 3.2166608117806494, + "learning_rate": 9.97625279729661e-05, + "loss": 3.8383, + "step": 2448 + }, + { + "epoch": 1.5782433521353747, + "grad_norm": 1.697128725757312, + "learning_rate": 9.976233166951048e-05, + "loss": 3.9691, + "step": 2449 + }, + { + "epoch": 1.5788879935535858, + "grad_norm": 3.289438046513567, + "learning_rate": 9.976213528514768e-05, + "loss": 3.5999, + "step": 2450 + }, + { + "epoch": 1.5795326349717969, + "grad_norm": 4.2225743321343705, + "learning_rate": 9.976193881987798e-05, + "loss": 3.8886, + "step": 2451 + }, + { + "epoch": 1.5801772763900082, + "grad_norm": 2.776524822775413, + "learning_rate": 9.976174227370173e-05, + "loss": 4.0198, + "step": 2452 + }, + { + "epoch": 1.580821917808219, + "grad_norm": 2.217901510021136, + "learning_rate": 9.976154564661925e-05, + "loss": 3.3833, + "step": 2453 + }, + { + "epoch": 1.5814665592264303, + "grad_norm": 3.327211067465488, + "learning_rate": 9.976134893863086e-05, + "loss": 3.7409, + "step": 2454 + }, + { + "epoch": 1.5821112006446414, + "grad_norm": 2.875932865295242, + "learning_rate": 9.976115214973686e-05, + "loss": 4.4154, + "step": 2455 + }, + { + "epoch": 1.5827558420628525, + "grad_norm": 1.6602593515445672, + "learning_rate": 9.976095527993761e-05, + "loss": 3.8679, + "step": 2456 + }, + { + "epoch": 1.5834004834810638, + "grad_norm": 2.54396777086534, + "learning_rate": 9.976075832923343e-05, + "loss": 4.242, + "step": 2457 + }, + { + "epoch": 1.5840451248992746, + "grad_norm": 1.7049905291890233, + "learning_rate": 9.976056129762462e-05, + "loss": 3.9772, + "step": 2458 + }, + { + "epoch": 1.584689766317486, + "grad_norm": 1.7608395254431068, + "learning_rate": 9.976036418511153e-05, + "loss": 4.1864, + "step": 2459 + }, + { + "epoch": 1.585334407735697, + "grad_norm": 2.1346836903749122, + "learning_rate": 9.976016699169445e-05, + "loss": 3.9126, + "step": 2460 + }, + { + "epoch": 1.585979049153908, + "grad_norm": 2.768021118318374, + "learning_rate": 9.975996971737372e-05, + "loss": 3.4153, + "step": 2461 + }, + { + "epoch": 1.5866236905721194, + "grad_norm": 1.7868885547163307, + "learning_rate": 9.975977236214969e-05, + "loss": 4.0264, + "step": 2462 + }, + { + "epoch": 1.5872683319903302, + "grad_norm": 2.467513668779401, + "learning_rate": 9.975957492602266e-05, + "loss": 4.1521, + "step": 2463 + }, + { + "epoch": 1.5879129734085415, + "grad_norm": 1.5427755150898477, + "learning_rate": 9.975937740899294e-05, + "loss": 3.9284, + "step": 2464 + }, + { + "epoch": 1.5885576148267526, + "grad_norm": 2.849513865356846, + "learning_rate": 9.975917981106088e-05, + "loss": 3.9006, + "step": 2465 + }, + { + "epoch": 1.5892022562449637, + "grad_norm": 1.8665565102578912, + "learning_rate": 9.975898213222679e-05, + "loss": 4.2267, + "step": 2466 + }, + { + "epoch": 1.589846897663175, + "grad_norm": 2.1693115948876605, + "learning_rate": 9.975878437249099e-05, + "loss": 3.9334, + "step": 2467 + }, + { + "epoch": 1.5904915390813859, + "grad_norm": 2.3765001860051127, + "learning_rate": 9.975858653185384e-05, + "loss": 4.2274, + "step": 2468 + }, + { + "epoch": 1.5911361804995972, + "grad_norm": 1.8307010330960178, + "learning_rate": 9.975838861031563e-05, + "loss": 3.6889, + "step": 2469 + }, + { + "epoch": 1.5917808219178082, + "grad_norm": 2.334557400997604, + "learning_rate": 9.975819060787668e-05, + "loss": 4.004, + "step": 2470 + }, + { + "epoch": 1.5924254633360193, + "grad_norm": 2.064151083233871, + "learning_rate": 9.975799252453733e-05, + "loss": 3.8609, + "step": 2471 + }, + { + "epoch": 1.5930701047542306, + "grad_norm": 1.9514072816184223, + "learning_rate": 9.975779436029792e-05, + "loss": 3.9845, + "step": 2472 + }, + { + "epoch": 1.5937147461724415, + "grad_norm": 1.7882463144999736, + "learning_rate": 9.975759611515875e-05, + "loss": 4.184, + "step": 2473 + }, + { + "epoch": 1.5943593875906528, + "grad_norm": 1.4144048983238648, + "learning_rate": 9.975739778912015e-05, + "loss": 4.1421, + "step": 2474 + }, + { + "epoch": 1.5950040290088638, + "grad_norm": 1.8280512225868817, + "learning_rate": 9.975719938218246e-05, + "loss": 3.8971, + "step": 2475 + }, + { + "epoch": 1.595648670427075, + "grad_norm": 1.5253407905144116, + "learning_rate": 9.975700089434601e-05, + "loss": 3.8423, + "step": 2476 + }, + { + "epoch": 1.5962933118452862, + "grad_norm": 1.4383314739236024, + "learning_rate": 9.97568023256111e-05, + "loss": 3.9116, + "step": 2477 + }, + { + "epoch": 1.596937953263497, + "grad_norm": 1.6594352143123445, + "learning_rate": 9.975660367597806e-05, + "loss": 4.0678, + "step": 2478 + }, + { + "epoch": 1.5975825946817084, + "grad_norm": 1.2800508836029845, + "learning_rate": 9.975640494544725e-05, + "loss": 4.1374, + "step": 2479 + }, + { + "epoch": 1.5982272360999195, + "grad_norm": 1.8231705312826378, + "learning_rate": 9.975620613401896e-05, + "loss": 3.9076, + "step": 2480 + }, + { + "epoch": 1.5988718775181305, + "grad_norm": 1.3023101563124155, + "learning_rate": 9.975600724169352e-05, + "loss": 3.7386, + "step": 2481 + }, + { + "epoch": 1.5995165189363416, + "grad_norm": 1.6620824523778388, + "learning_rate": 9.975580826847127e-05, + "loss": 4.0884, + "step": 2482 + }, + { + "epoch": 1.6001611603545527, + "grad_norm": 2.219280069696187, + "learning_rate": 9.975560921435253e-05, + "loss": 4.1089, + "step": 2483 + }, + { + "epoch": 1.600805801772764, + "grad_norm": 2.2922432708913316, + "learning_rate": 9.975541007933764e-05, + "loss": 4.1653, + "step": 2484 + }, + { + "epoch": 1.601450443190975, + "grad_norm": 1.355420277737847, + "learning_rate": 9.975521086342691e-05, + "loss": 4.5131, + "step": 2485 + }, + { + "epoch": 1.6020950846091861, + "grad_norm": 2.2439441295589866, + "learning_rate": 9.975501156662068e-05, + "loss": 3.8026, + "step": 2486 + }, + { + "epoch": 1.6027397260273972, + "grad_norm": 1.61491216276523, + "learning_rate": 9.975481218891925e-05, + "loss": 4.0856, + "step": 2487 + }, + { + "epoch": 1.6033843674456083, + "grad_norm": 1.325903866952788, + "learning_rate": 9.975461273032299e-05, + "loss": 4.0542, + "step": 2488 + }, + { + "epoch": 1.6040290088638196, + "grad_norm": 1.5862733932756554, + "learning_rate": 9.975441319083218e-05, + "loss": 4.1791, + "step": 2489 + }, + { + "epoch": 1.6046736502820305, + "grad_norm": 1.3490033515626745, + "learning_rate": 9.97542135704472e-05, + "loss": 4.0189, + "step": 2490 + }, + { + "epoch": 1.6053182917002418, + "grad_norm": 1.3378758229555863, + "learning_rate": 9.975401386916834e-05, + "loss": 4.2094, + "step": 2491 + }, + { + "epoch": 1.6059629331184528, + "grad_norm": 1.2098685144960075, + "learning_rate": 9.975381408699594e-05, + "loss": 3.8672, + "step": 2492 + }, + { + "epoch": 1.606607574536664, + "grad_norm": 1.1049938270495239, + "learning_rate": 9.975361422393032e-05, + "loss": 4.2289, + "step": 2493 + }, + { + "epoch": 1.6072522159548752, + "grad_norm": 1.7126292012875897, + "learning_rate": 9.975341427997185e-05, + "loss": 4.0049, + "step": 2494 + }, + { + "epoch": 1.607896857373086, + "grad_norm": 1.6390661887820133, + "learning_rate": 9.975321425512077e-05, + "loss": 4.065, + "step": 2495 + }, + { + "epoch": 1.6085414987912974, + "grad_norm": 1.5175626217101301, + "learning_rate": 9.97530141493775e-05, + "loss": 3.596, + "step": 2496 + }, + { + "epoch": 1.6091861402095085, + "grad_norm": 1.6309233284213505, + "learning_rate": 9.975281396274233e-05, + "loss": 4.0504, + "step": 2497 + }, + { + "epoch": 1.6098307816277195, + "grad_norm": 1.5939426127701577, + "learning_rate": 9.975261369521557e-05, + "loss": 3.9213, + "step": 2498 + }, + { + "epoch": 1.6104754230459308, + "grad_norm": 1.5869229970147303, + "learning_rate": 9.975241334679758e-05, + "loss": 4.0233, + "step": 2499 + }, + { + "epoch": 1.6111200644641417, + "grad_norm": 1.5763810219927388, + "learning_rate": 9.975221291748867e-05, + "loss": 3.9009, + "step": 2500 + }, + { + "epoch": 1.6111200644641417, + "eval_loss": 4.082088470458984, + "eval_runtime": 2.9763, + "eval_samples_per_second": 33.599, + "eval_steps_per_second": 4.368, + "step": 2500 + }, + { + "epoch": 1.611764705882353, + "grad_norm": 1.991052701295644, + "learning_rate": 9.975201240728919e-05, + "loss": 3.9327, + "step": 2501 + }, + { + "epoch": 1.612409347300564, + "grad_norm": 2.209785789826201, + "learning_rate": 9.975181181619944e-05, + "loss": 3.8426, + "step": 2502 + }, + { + "epoch": 1.6130539887187751, + "grad_norm": 2.1513648890544994, + "learning_rate": 9.975161114421979e-05, + "loss": 4.0021, + "step": 2503 + }, + { + "epoch": 1.6136986301369864, + "grad_norm": 1.2134806405533654, + "learning_rate": 9.975141039135053e-05, + "loss": 4.0308, + "step": 2504 + }, + { + "epoch": 1.6143432715551973, + "grad_norm": 2.1678882638921837, + "learning_rate": 9.975120955759202e-05, + "loss": 4.1066, + "step": 2505 + }, + { + "epoch": 1.6149879129734086, + "grad_norm": 2.8400881779564253, + "learning_rate": 9.975100864294454e-05, + "loss": 3.9123, + "step": 2506 + }, + { + "epoch": 1.6156325543916197, + "grad_norm": 2.325408345990496, + "learning_rate": 9.975080764740848e-05, + "loss": 4.1366, + "step": 2507 + }, + { + "epoch": 1.6162771958098308, + "grad_norm": 1.745855072453502, + "learning_rate": 9.975060657098415e-05, + "loss": 4.3246, + "step": 2508 + }, + { + "epoch": 1.616921837228042, + "grad_norm": 2.3264247329375034, + "learning_rate": 9.975040541367187e-05, + "loss": 4.0761, + "step": 2509 + }, + { + "epoch": 1.617566478646253, + "grad_norm": 2.4711598799249326, + "learning_rate": 9.975020417547197e-05, + "loss": 3.8443, + "step": 2510 + }, + { + "epoch": 1.6182111200644642, + "grad_norm": 1.6157287383989931, + "learning_rate": 9.975000285638478e-05, + "loss": 3.9439, + "step": 2511 + }, + { + "epoch": 1.6188557614826753, + "grad_norm": 2.2459418586985413, + "learning_rate": 9.974980145641065e-05, + "loss": 4.2336, + "step": 2512 + }, + { + "epoch": 1.6195004029008864, + "grad_norm": 2.0332027470018406, + "learning_rate": 9.97495999755499e-05, + "loss": 3.7337, + "step": 2513 + }, + { + "epoch": 1.6201450443190974, + "grad_norm": 1.8766124379180462, + "learning_rate": 9.974939841380284e-05, + "loss": 3.8944, + "step": 2514 + }, + { + "epoch": 1.6207896857373085, + "grad_norm": 2.874600696785465, + "learning_rate": 9.974919677116983e-05, + "loss": 3.6339, + "step": 2515 + }, + { + "epoch": 1.6214343271555198, + "grad_norm": 2.53642889350106, + "learning_rate": 9.974899504765119e-05, + "loss": 4.1614, + "step": 2516 + }, + { + "epoch": 1.622078968573731, + "grad_norm": 1.7989383745710268, + "learning_rate": 9.974879324324725e-05, + "loss": 3.8248, + "step": 2517 + }, + { + "epoch": 1.622723609991942, + "grad_norm": 2.744593474481356, + "learning_rate": 9.974859135795835e-05, + "loss": 4.3148, + "step": 2518 + }, + { + "epoch": 1.623368251410153, + "grad_norm": 1.3669165784417405, + "learning_rate": 9.974838939178482e-05, + "loss": 3.5788, + "step": 2519 + }, + { + "epoch": 1.6240128928283641, + "grad_norm": 3.365789674661326, + "learning_rate": 9.974818734472697e-05, + "loss": 3.7284, + "step": 2520 + }, + { + "epoch": 1.6246575342465754, + "grad_norm": 2.4614924833981187, + "learning_rate": 9.974798521678515e-05, + "loss": 4.2133, + "step": 2521 + }, + { + "epoch": 1.6253021756647863, + "grad_norm": 1.8558817690207554, + "learning_rate": 9.974778300795969e-05, + "loss": 3.4353, + "step": 2522 + }, + { + "epoch": 1.6259468170829976, + "grad_norm": 2.604137353934387, + "learning_rate": 9.974758071825094e-05, + "loss": 4.3242, + "step": 2523 + }, + { + "epoch": 1.6265914585012087, + "grad_norm": 1.696436353386384, + "learning_rate": 9.974737834765921e-05, + "loss": 3.7478, + "step": 2524 + }, + { + "epoch": 1.6272360999194198, + "grad_norm": 2.59322780336718, + "learning_rate": 9.974717589618484e-05, + "loss": 3.735, + "step": 2525 + }, + { + "epoch": 1.627880741337631, + "grad_norm": 2.6406480410244497, + "learning_rate": 9.974697336382814e-05, + "loss": 3.9811, + "step": 2526 + }, + { + "epoch": 1.628525382755842, + "grad_norm": 2.908767890260953, + "learning_rate": 9.974677075058948e-05, + "loss": 3.9493, + "step": 2527 + }, + { + "epoch": 1.6291700241740532, + "grad_norm": 2.267024781933221, + "learning_rate": 9.974656805646916e-05, + "loss": 4.3322, + "step": 2528 + }, + { + "epoch": 1.6298146655922643, + "grad_norm": 2.5090410629675324, + "learning_rate": 9.974636528146755e-05, + "loss": 3.9902, + "step": 2529 + }, + { + "epoch": 1.6304593070104754, + "grad_norm": 1.9792558582434312, + "learning_rate": 9.974616242558493e-05, + "loss": 4.0716, + "step": 2530 + }, + { + "epoch": 1.6311039484286867, + "grad_norm": 2.340167392348053, + "learning_rate": 9.974595948882169e-05, + "loss": 3.7903, + "step": 2531 + }, + { + "epoch": 1.6317485898468975, + "grad_norm": 2.15723995062895, + "learning_rate": 9.974575647117813e-05, + "loss": 4.3742, + "step": 2532 + }, + { + "epoch": 1.6323932312651088, + "grad_norm": 2.074479950726298, + "learning_rate": 9.974555337265459e-05, + "loss": 4.0295, + "step": 2533 + }, + { + "epoch": 1.63303787268332, + "grad_norm": 2.6699522478565423, + "learning_rate": 9.97453501932514e-05, + "loss": 4.2266, + "step": 2534 + }, + { + "epoch": 1.633682514101531, + "grad_norm": 2.0961190427755994, + "learning_rate": 9.97451469329689e-05, + "loss": 4.0133, + "step": 2535 + }, + { + "epoch": 1.6343271555197423, + "grad_norm": 2.220153774981141, + "learning_rate": 9.974494359180743e-05, + "loss": 4.3494, + "step": 2536 + }, + { + "epoch": 1.6349717969379531, + "grad_norm": 1.6429982744101308, + "learning_rate": 9.974474016976731e-05, + "loss": 4.0998, + "step": 2537 + }, + { + "epoch": 1.6356164383561644, + "grad_norm": 2.202821274416296, + "learning_rate": 9.974453666684889e-05, + "loss": 3.7898, + "step": 2538 + }, + { + "epoch": 1.6362610797743755, + "grad_norm": 2.0263249809179937, + "learning_rate": 9.974433308305247e-05, + "loss": 3.7628, + "step": 2539 + }, + { + "epoch": 1.6369057211925866, + "grad_norm": 2.2714817368021145, + "learning_rate": 9.974412941837842e-05, + "loss": 4.0179, + "step": 2540 + }, + { + "epoch": 1.637550362610798, + "grad_norm": 1.672856852558253, + "learning_rate": 9.974392567282708e-05, + "loss": 3.9622, + "step": 2541 + }, + { + "epoch": 1.6381950040290088, + "grad_norm": 2.1285312138919092, + "learning_rate": 9.974372184639875e-05, + "loss": 3.8419, + "step": 2542 + }, + { + "epoch": 1.63883964544722, + "grad_norm": 2.6326964526000043, + "learning_rate": 9.974351793909379e-05, + "loss": 3.7232, + "step": 2543 + }, + { + "epoch": 1.6394842868654311, + "grad_norm": 1.497179082847308, + "learning_rate": 9.974331395091252e-05, + "loss": 3.9781, + "step": 2544 + }, + { + "epoch": 1.6401289282836422, + "grad_norm": 2.280401569072673, + "learning_rate": 9.974310988185528e-05, + "loss": 3.8308, + "step": 2545 + }, + { + "epoch": 1.6407735697018535, + "grad_norm": 1.9063930826762314, + "learning_rate": 9.974290573192243e-05, + "loss": 4.3829, + "step": 2546 + }, + { + "epoch": 1.6414182111200644, + "grad_norm": 1.6398168630460912, + "learning_rate": 9.974270150111427e-05, + "loss": 3.8856, + "step": 2547 + }, + { + "epoch": 1.6420628525382757, + "grad_norm": 1.3794590792433277, + "learning_rate": 9.974249718943114e-05, + "loss": 3.7588, + "step": 2548 + }, + { + "epoch": 1.6427074939564867, + "grad_norm": 1.711033806859095, + "learning_rate": 9.974229279687338e-05, + "loss": 4.0988, + "step": 2549 + }, + { + "epoch": 1.6433521353746978, + "grad_norm": 1.4040191615845, + "learning_rate": 9.974208832344136e-05, + "loss": 3.9237, + "step": 2550 + }, + { + "epoch": 1.643996776792909, + "grad_norm": 1.6408227540211127, + "learning_rate": 9.974188376913535e-05, + "loss": 4.3471, + "step": 2551 + }, + { + "epoch": 1.64464141821112, + "grad_norm": 1.926550248611329, + "learning_rate": 9.974167913395573e-05, + "loss": 3.806, + "step": 2552 + }, + { + "epoch": 1.6452860596293313, + "grad_norm": 1.4782654354836227, + "learning_rate": 9.974147441790285e-05, + "loss": 3.9088, + "step": 2553 + }, + { + "epoch": 1.6459307010475424, + "grad_norm": 1.5994390248394723, + "learning_rate": 9.9741269620977e-05, + "loss": 3.8702, + "step": 2554 + }, + { + "epoch": 1.6465753424657534, + "grad_norm": 1.4695650238225555, + "learning_rate": 9.974106474317854e-05, + "loss": 4.3788, + "step": 2555 + }, + { + "epoch": 1.6472199838839645, + "grad_norm": 2.1634158148641185, + "learning_rate": 9.974085978450781e-05, + "loss": 3.6278, + "step": 2556 + }, + { + "epoch": 1.6478646253021756, + "grad_norm": 1.648100265100938, + "learning_rate": 9.974065474496514e-05, + "loss": 4.1484, + "step": 2557 + }, + { + "epoch": 1.648509266720387, + "grad_norm": 1.5010494275898707, + "learning_rate": 9.974044962455089e-05, + "loss": 4.2527, + "step": 2558 + }, + { + "epoch": 1.6491539081385977, + "grad_norm": 1.7553251045273903, + "learning_rate": 9.974024442326536e-05, + "loss": 4.132, + "step": 2559 + }, + { + "epoch": 1.649798549556809, + "grad_norm": 1.4150645218513291, + "learning_rate": 9.974003914110891e-05, + "loss": 4.4266, + "step": 2560 + }, + { + "epoch": 1.6504431909750201, + "grad_norm": 1.621747761587582, + "learning_rate": 9.973983377808186e-05, + "loss": 3.9134, + "step": 2561 + }, + { + "epoch": 1.6510878323932312, + "grad_norm": 1.8486747896971545, + "learning_rate": 9.973962833418457e-05, + "loss": 4.2103, + "step": 2562 + }, + { + "epoch": 1.6517324738114425, + "grad_norm": 1.654130739191428, + "learning_rate": 9.973942280941736e-05, + "loss": 3.9513, + "step": 2563 + }, + { + "epoch": 1.6523771152296534, + "grad_norm": 1.5489058969620797, + "learning_rate": 9.973921720378057e-05, + "loss": 3.4516, + "step": 2564 + }, + { + "epoch": 1.6530217566478647, + "grad_norm": 2.1145731375384726, + "learning_rate": 9.973901151727456e-05, + "loss": 3.9471, + "step": 2565 + }, + { + "epoch": 1.6536663980660757, + "grad_norm": 2.093176357048944, + "learning_rate": 9.973880574989962e-05, + "loss": 4.3668, + "step": 2566 + }, + { + "epoch": 1.6543110394842868, + "grad_norm": 1.7957992460241878, + "learning_rate": 9.973859990165613e-05, + "loss": 3.8293, + "step": 2567 + }, + { + "epoch": 1.6549556809024981, + "grad_norm": 1.5263778890008584, + "learning_rate": 9.973839397254442e-05, + "loss": 4.2785, + "step": 2568 + }, + { + "epoch": 1.655600322320709, + "grad_norm": 1.6342050084402835, + "learning_rate": 9.973818796256484e-05, + "loss": 3.8855, + "step": 2569 + }, + { + "epoch": 1.6562449637389203, + "grad_norm": 1.2599443860517763, + "learning_rate": 9.973798187171766e-05, + "loss": 3.6629, + "step": 2570 + }, + { + "epoch": 1.6568896051571314, + "grad_norm": 1.448696768448241, + "learning_rate": 9.97377757000033e-05, + "loss": 4.0743, + "step": 2571 + }, + { + "epoch": 1.6575342465753424, + "grad_norm": 1.523557804233157, + "learning_rate": 9.973756944742209e-05, + "loss": 4.0219, + "step": 2572 + }, + { + "epoch": 1.6581788879935537, + "grad_norm": 1.2352673950221544, + "learning_rate": 9.973736311397431e-05, + "loss": 3.8826, + "step": 2573 + }, + { + "epoch": 1.6588235294117646, + "grad_norm": 1.0555680940977932, + "learning_rate": 9.973715669966035e-05, + "loss": 4.0684, + "step": 2574 + }, + { + "epoch": 1.6594681708299759, + "grad_norm": 1.30028437342577, + "learning_rate": 9.973695020448054e-05, + "loss": 4.2624, + "step": 2575 + }, + { + "epoch": 1.660112812248187, + "grad_norm": 2.0408016086055247, + "learning_rate": 9.973674362843522e-05, + "loss": 3.8961, + "step": 2576 + }, + { + "epoch": 1.660757453666398, + "grad_norm": 2.866532633884706, + "learning_rate": 9.973653697152471e-05, + "loss": 3.8367, + "step": 2577 + }, + { + "epoch": 1.6614020950846093, + "grad_norm": 2.4410124251936383, + "learning_rate": 9.973633023374936e-05, + "loss": 4.1635, + "step": 2578 + }, + { + "epoch": 1.6620467365028202, + "grad_norm": 1.4704806249702018, + "learning_rate": 9.973612341510951e-05, + "loss": 4.1507, + "step": 2579 + }, + { + "epoch": 1.6626913779210315, + "grad_norm": 2.170283589769529, + "learning_rate": 9.973591651560551e-05, + "loss": 4.0606, + "step": 2580 + }, + { + "epoch": 1.6633360193392426, + "grad_norm": 1.707287060985302, + "learning_rate": 9.973570953523769e-05, + "loss": 3.7282, + "step": 2581 + }, + { + "epoch": 1.6639806607574537, + "grad_norm": 2.46802590935491, + "learning_rate": 9.97355024740064e-05, + "loss": 3.3907, + "step": 2582 + }, + { + "epoch": 1.6646253021756647, + "grad_norm": 2.577922662317232, + "learning_rate": 9.973529533191195e-05, + "loss": 4.2366, + "step": 2583 + }, + { + "epoch": 1.6652699435938758, + "grad_norm": 1.9403616745635288, + "learning_rate": 9.973508810895474e-05, + "loss": 3.927, + "step": 2584 + }, + { + "epoch": 1.6659145850120871, + "grad_norm": 2.7729057654109273, + "learning_rate": 9.973488080513504e-05, + "loss": 3.9824, + "step": 2585 + }, + { + "epoch": 1.6665592264302982, + "grad_norm": 1.9593392615627523, + "learning_rate": 9.973467342045324e-05, + "loss": 4.018, + "step": 2586 + }, + { + "epoch": 1.6672038678485093, + "grad_norm": 2.7172600978368373, + "learning_rate": 9.973446595490966e-05, + "loss": 3.8099, + "step": 2587 + }, + { + "epoch": 1.6678485092667203, + "grad_norm": 2.892925335541771, + "learning_rate": 9.973425840850463e-05, + "loss": 4.1032, + "step": 2588 + }, + { + "epoch": 1.6684931506849314, + "grad_norm": 1.716827224841486, + "learning_rate": 9.973405078123851e-05, + "loss": 3.9919, + "step": 2589 + }, + { + "epoch": 1.6691377921031427, + "grad_norm": 2.4306997403047292, + "learning_rate": 9.973384307311165e-05, + "loss": 3.8405, + "step": 2590 + }, + { + "epoch": 1.6697824335213536, + "grad_norm": 1.6196363795978135, + "learning_rate": 9.973363528412437e-05, + "loss": 3.9406, + "step": 2591 + }, + { + "epoch": 1.6704270749395649, + "grad_norm": 2.731283288785662, + "learning_rate": 9.9733427414277e-05, + "loss": 4.1612, + "step": 2592 + }, + { + "epoch": 1.671071716357776, + "grad_norm": 2.9337253224348774, + "learning_rate": 9.973321946356993e-05, + "loss": 3.8982, + "step": 2593 + }, + { + "epoch": 1.671716357775987, + "grad_norm": 1.8668280433207056, + "learning_rate": 9.973301143200344e-05, + "loss": 4.3097, + "step": 2594 + }, + { + "epoch": 1.6723609991941983, + "grad_norm": 2.783243752775476, + "learning_rate": 9.973280331957794e-05, + "loss": 3.7593, + "step": 2595 + }, + { + "epoch": 1.6730056406124092, + "grad_norm": 1.9497468652801875, + "learning_rate": 9.97325951262937e-05, + "loss": 3.7712, + "step": 2596 + }, + { + "epoch": 1.6736502820306205, + "grad_norm": 1.7105433459350614, + "learning_rate": 9.973238685215113e-05, + "loss": 4.2238, + "step": 2597 + }, + { + "epoch": 1.6742949234488316, + "grad_norm": 1.4472387254766632, + "learning_rate": 9.973217849715052e-05, + "loss": 4.0011, + "step": 2598 + }, + { + "epoch": 1.6749395648670427, + "grad_norm": 1.5907041290299346, + "learning_rate": 9.973197006129222e-05, + "loss": 4.2579, + "step": 2599 + }, + { + "epoch": 1.675584206285254, + "grad_norm": 1.8222857302028637, + "learning_rate": 9.97317615445766e-05, + "loss": 4.0423, + "step": 2600 + }, + { + "epoch": 1.675584206285254, + "eval_loss": 4.072576522827148, + "eval_runtime": 2.9607, + "eval_samples_per_second": 33.776, + "eval_steps_per_second": 4.391, + "step": 2600 + }, + { + "epoch": 1.6762288477034648, + "grad_norm": 1.5666384615181161, + "learning_rate": 9.973155294700399e-05, + "loss": 3.9261, + "step": 2601 + }, + { + "epoch": 1.676873489121676, + "grad_norm": 2.059304557400265, + "learning_rate": 9.97313442685747e-05, + "loss": 3.597, + "step": 2602 + }, + { + "epoch": 1.6775181305398872, + "grad_norm": 2.2538543026702187, + "learning_rate": 9.973113550928913e-05, + "loss": 3.8374, + "step": 2603 + }, + { + "epoch": 1.6781627719580983, + "grad_norm": 1.6309556244514487, + "learning_rate": 9.973092666914758e-05, + "loss": 4.0094, + "step": 2604 + }, + { + "epoch": 1.6788074133763096, + "grad_norm": 1.278641049930893, + "learning_rate": 9.973071774815042e-05, + "loss": 3.9503, + "step": 2605 + }, + { + "epoch": 1.6794520547945204, + "grad_norm": 1.775265376056082, + "learning_rate": 9.973050874629797e-05, + "loss": 3.9069, + "step": 2606 + }, + { + "epoch": 1.6800966962127317, + "grad_norm": 1.4404694358849353, + "learning_rate": 9.973029966359058e-05, + "loss": 3.9336, + "step": 2607 + }, + { + "epoch": 1.6807413376309428, + "grad_norm": 1.40573207447374, + "learning_rate": 9.973009050002861e-05, + "loss": 4.234, + "step": 2608 + }, + { + "epoch": 1.6813859790491539, + "grad_norm": 1.3883023619036885, + "learning_rate": 9.97298812556124e-05, + "loss": 4.2781, + "step": 2609 + }, + { + "epoch": 1.6820306204673652, + "grad_norm": 1.6550111159562697, + "learning_rate": 9.972967193034225e-05, + "loss": 4.2895, + "step": 2610 + }, + { + "epoch": 1.682675261885576, + "grad_norm": 2.083980693638547, + "learning_rate": 9.972946252421856e-05, + "loss": 3.9239, + "step": 2611 + }, + { + "epoch": 1.6833199033037873, + "grad_norm": 1.8011764120662759, + "learning_rate": 9.972925303724164e-05, + "loss": 3.8905, + "step": 2612 + }, + { + "epoch": 1.6839645447219984, + "grad_norm": 1.183236121254592, + "learning_rate": 9.972904346941185e-05, + "loss": 3.8773, + "step": 2613 + }, + { + "epoch": 1.6846091861402095, + "grad_norm": 1.6790841709102833, + "learning_rate": 9.972883382072952e-05, + "loss": 3.7738, + "step": 2614 + }, + { + "epoch": 1.6852538275584208, + "grad_norm": 1.1756992035026155, + "learning_rate": 9.972862409119503e-05, + "loss": 3.7892, + "step": 2615 + }, + { + "epoch": 1.6858984689766316, + "grad_norm": 1.8576501675548687, + "learning_rate": 9.972841428080868e-05, + "loss": 3.9597, + "step": 2616 + }, + { + "epoch": 1.686543110394843, + "grad_norm": 2.001523762134091, + "learning_rate": 9.972820438957085e-05, + "loss": 4.3469, + "step": 2617 + }, + { + "epoch": 1.687187751813054, + "grad_norm": 1.771056194976681, + "learning_rate": 9.972799441748185e-05, + "loss": 3.7595, + "step": 2618 + }, + { + "epoch": 1.687832393231265, + "grad_norm": 2.163289969816121, + "learning_rate": 9.972778436454206e-05, + "loss": 3.7624, + "step": 2619 + }, + { + "epoch": 1.6884770346494762, + "grad_norm": 2.298697072550766, + "learning_rate": 9.972757423075179e-05, + "loss": 3.6674, + "step": 2620 + }, + { + "epoch": 1.6891216760676873, + "grad_norm": 2.0746455536746464, + "learning_rate": 9.97273640161114e-05, + "loss": 4.3533, + "step": 2621 + }, + { + "epoch": 1.6897663174858986, + "grad_norm": 2.0284537490916428, + "learning_rate": 9.972715372062126e-05, + "loss": 3.9748, + "step": 2622 + }, + { + "epoch": 1.6904109589041096, + "grad_norm": 1.8765938092495273, + "learning_rate": 9.972694334428168e-05, + "loss": 4.5287, + "step": 2623 + }, + { + "epoch": 1.6910556003223207, + "grad_norm": 1.7371592635054072, + "learning_rate": 9.972673288709301e-05, + "loss": 4.1809, + "step": 2624 + }, + { + "epoch": 1.6917002417405318, + "grad_norm": 1.8915227249727098, + "learning_rate": 9.972652234905561e-05, + "loss": 3.9394, + "step": 2625 + }, + { + "epoch": 1.6923448831587429, + "grad_norm": 1.7278989111236422, + "learning_rate": 9.972631173016984e-05, + "loss": 4.0186, + "step": 2626 + }, + { + "epoch": 1.6929895245769542, + "grad_norm": 1.9006180356521891, + "learning_rate": 9.9726101030436e-05, + "loss": 3.8476, + "step": 2627 + }, + { + "epoch": 1.693634165995165, + "grad_norm": 2.070410766939091, + "learning_rate": 9.972589024985446e-05, + "loss": 3.9475, + "step": 2628 + }, + { + "epoch": 1.6942788074133763, + "grad_norm": 1.9341558520778006, + "learning_rate": 9.97256793884256e-05, + "loss": 3.8112, + "step": 2629 + }, + { + "epoch": 1.6949234488315874, + "grad_norm": 2.5958392351684036, + "learning_rate": 9.972546844614972e-05, + "loss": 3.8246, + "step": 2630 + }, + { + "epoch": 1.6955680902497985, + "grad_norm": 2.3105696067426447, + "learning_rate": 9.972525742302717e-05, + "loss": 3.97, + "step": 2631 + }, + { + "epoch": 1.6962127316680098, + "grad_norm": 2.055772756553893, + "learning_rate": 9.972504631905833e-05, + "loss": 3.8616, + "step": 2632 + }, + { + "epoch": 1.6968573730862206, + "grad_norm": 2.5462835698570037, + "learning_rate": 9.972483513424352e-05, + "loss": 4.279, + "step": 2633 + }, + { + "epoch": 1.697502014504432, + "grad_norm": 1.8396880882348952, + "learning_rate": 9.972462386858308e-05, + "loss": 4.0129, + "step": 2634 + }, + { + "epoch": 1.698146655922643, + "grad_norm": 2.1269323481118314, + "learning_rate": 9.972441252207738e-05, + "loss": 4.3825, + "step": 2635 + }, + { + "epoch": 1.698791297340854, + "grad_norm": 2.2140729303139723, + "learning_rate": 9.972420109472674e-05, + "loss": 3.7553, + "step": 2636 + }, + { + "epoch": 1.6994359387590654, + "grad_norm": 1.9862076833932767, + "learning_rate": 9.972398958653153e-05, + "loss": 4.0652, + "step": 2637 + }, + { + "epoch": 1.7000805801772763, + "grad_norm": 1.844368105240003, + "learning_rate": 9.972377799749208e-05, + "loss": 3.8797, + "step": 2638 + }, + { + "epoch": 1.7007252215954876, + "grad_norm": 1.8088153151070916, + "learning_rate": 9.972356632760876e-05, + "loss": 3.906, + "step": 2639 + }, + { + "epoch": 1.7013698630136986, + "grad_norm": 1.901272508062528, + "learning_rate": 9.972335457688191e-05, + "loss": 4.0025, + "step": 2640 + }, + { + "epoch": 1.7020145044319097, + "grad_norm": 1.9154194287344393, + "learning_rate": 9.972314274531188e-05, + "loss": 3.9996, + "step": 2641 + }, + { + "epoch": 1.702659145850121, + "grad_norm": 1.5653246459548142, + "learning_rate": 9.972293083289899e-05, + "loss": 4.0844, + "step": 2642 + }, + { + "epoch": 1.7033037872683319, + "grad_norm": 1.7114696412710788, + "learning_rate": 9.972271883964363e-05, + "loss": 3.7916, + "step": 2643 + }, + { + "epoch": 1.7039484286865432, + "grad_norm": 2.0181472824411597, + "learning_rate": 9.97225067655461e-05, + "loss": 3.7852, + "step": 2644 + }, + { + "epoch": 1.7045930701047542, + "grad_norm": 1.3826410762209125, + "learning_rate": 9.97222946106068e-05, + "loss": 4.293, + "step": 2645 + }, + { + "epoch": 1.7052377115229653, + "grad_norm": 2.3900759771288267, + "learning_rate": 9.972208237482604e-05, + "loss": 4.0148, + "step": 2646 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 1.9772768584144174, + "learning_rate": 9.97218700582042e-05, + "loss": 4.09, + "step": 2647 + }, + { + "epoch": 1.7065269943593875, + "grad_norm": 1.580332838166708, + "learning_rate": 9.97216576607416e-05, + "loss": 3.993, + "step": 2648 + }, + { + "epoch": 1.7071716357775988, + "grad_norm": 2.398565582359536, + "learning_rate": 9.972144518243859e-05, + "loss": 4.1708, + "step": 2649 + }, + { + "epoch": 1.7078162771958099, + "grad_norm": 2.39711189474806, + "learning_rate": 9.972123262329553e-05, + "loss": 3.9152, + "step": 2650 + }, + { + "epoch": 1.708460918614021, + "grad_norm": 2.0690751531150564, + "learning_rate": 9.972101998331279e-05, + "loss": 4.0541, + "step": 2651 + }, + { + "epoch": 1.709105560032232, + "grad_norm": 2.2879280859950506, + "learning_rate": 9.972080726249067e-05, + "loss": 4.0192, + "step": 2652 + }, + { + "epoch": 1.709750201450443, + "grad_norm": 2.159891105777734, + "learning_rate": 9.972059446082956e-05, + "loss": 3.8456, + "step": 2653 + }, + { + "epoch": 1.7103948428686544, + "grad_norm": 1.9068682397986285, + "learning_rate": 9.972038157832982e-05, + "loss": 4.0748, + "step": 2654 + }, + { + "epoch": 1.7110394842868655, + "grad_norm": 1.6856270883422229, + "learning_rate": 9.972016861499175e-05, + "loss": 4.3003, + "step": 2655 + }, + { + "epoch": 1.7116841257050766, + "grad_norm": 2.6063951831953105, + "learning_rate": 9.971995557081571e-05, + "loss": 3.8545, + "step": 2656 + }, + { + "epoch": 1.7123287671232876, + "grad_norm": 1.900245015331593, + "learning_rate": 9.971974244580209e-05, + "loss": 3.8613, + "step": 2657 + }, + { + "epoch": 1.7129734085414987, + "grad_norm": 2.2133568582157532, + "learning_rate": 9.971952923995122e-05, + "loss": 4.0084, + "step": 2658 + }, + { + "epoch": 1.71361804995971, + "grad_norm": 1.933702300358696, + "learning_rate": 9.971931595326343e-05, + "loss": 3.805, + "step": 2659 + }, + { + "epoch": 1.7142626913779209, + "grad_norm": 2.619621110462994, + "learning_rate": 9.97191025857391e-05, + "loss": 3.9461, + "step": 2660 + }, + { + "epoch": 1.7149073327961322, + "grad_norm": 2.0026431447678172, + "learning_rate": 9.971888913737855e-05, + "loss": 4.2189, + "step": 2661 + }, + { + "epoch": 1.7155519742143432, + "grad_norm": 2.0019388529233155, + "learning_rate": 9.971867560818216e-05, + "loss": 3.8281, + "step": 2662 + }, + { + "epoch": 1.7161966156325543, + "grad_norm": 2.0165978647816925, + "learning_rate": 9.971846199815026e-05, + "loss": 4.0597, + "step": 2663 + }, + { + "epoch": 1.7168412570507656, + "grad_norm": 1.8861868350353463, + "learning_rate": 9.971824830728322e-05, + "loss": 3.6077, + "step": 2664 + }, + { + "epoch": 1.7174858984689765, + "grad_norm": 2.2663307722291646, + "learning_rate": 9.971803453558137e-05, + "loss": 3.8098, + "step": 2665 + }, + { + "epoch": 1.7181305398871878, + "grad_norm": 2.1830410321260962, + "learning_rate": 9.971782068304508e-05, + "loss": 3.9209, + "step": 2666 + }, + { + "epoch": 1.7187751813053989, + "grad_norm": 1.329657539142339, + "learning_rate": 9.971760674967468e-05, + "loss": 4.0042, + "step": 2667 + }, + { + "epoch": 1.71941982272361, + "grad_norm": 1.9311885852309882, + "learning_rate": 9.971739273547054e-05, + "loss": 4.1698, + "step": 2668 + }, + { + "epoch": 1.7200644641418212, + "grad_norm": 1.6008819500715405, + "learning_rate": 9.971717864043299e-05, + "loss": 4.139, + "step": 2669 + }, + { + "epoch": 1.720709105560032, + "grad_norm": 1.6338475671709194, + "learning_rate": 9.971696446456242e-05, + "loss": 3.9455, + "step": 2670 + }, + { + "epoch": 1.7213537469782434, + "grad_norm": 1.8118259234568226, + "learning_rate": 9.971675020785914e-05, + "loss": 4.3264, + "step": 2671 + }, + { + "epoch": 1.7219983883964545, + "grad_norm": 2.200711404983553, + "learning_rate": 9.971653587032353e-05, + "loss": 4.3927, + "step": 2672 + }, + { + "epoch": 1.7226430298146655, + "grad_norm": 1.978110933837368, + "learning_rate": 9.971632145195592e-05, + "loss": 3.8315, + "step": 2673 + }, + { + "epoch": 1.7232876712328768, + "grad_norm": 1.4482743953642263, + "learning_rate": 9.971610695275669e-05, + "loss": 3.7944, + "step": 2674 + }, + { + "epoch": 1.7239323126510877, + "grad_norm": 1.6254418978357965, + "learning_rate": 9.971589237272619e-05, + "loss": 4.1572, + "step": 2675 + }, + { + "epoch": 1.724576954069299, + "grad_norm": 2.0820648956656784, + "learning_rate": 9.971567771186473e-05, + "loss": 3.872, + "step": 2676 + }, + { + "epoch": 1.72522159548751, + "grad_norm": 1.888614352517129, + "learning_rate": 9.971546297017269e-05, + "loss": 4.0545, + "step": 2677 + }, + { + "epoch": 1.7258662369057212, + "grad_norm": 1.6309308208801323, + "learning_rate": 9.971524814765043e-05, + "loss": 4.1077, + "step": 2678 + }, + { + "epoch": 1.7265108783239325, + "grad_norm": 1.3249638630835303, + "learning_rate": 9.971503324429831e-05, + "loss": 4.3975, + "step": 2679 + }, + { + "epoch": 1.7271555197421433, + "grad_norm": 1.3451791780588134, + "learning_rate": 9.971481826011666e-05, + "loss": 3.9306, + "step": 2680 + }, + { + "epoch": 1.7278001611603546, + "grad_norm": 1.7273878533107543, + "learning_rate": 9.971460319510584e-05, + "loss": 4.0869, + "step": 2681 + }, + { + "epoch": 1.7284448025785657, + "grad_norm": 1.7808952698763, + "learning_rate": 9.971438804926622e-05, + "loss": 4.0563, + "step": 2682 + }, + { + "epoch": 1.7290894439967768, + "grad_norm": 1.9878376168273317, + "learning_rate": 9.971417282259813e-05, + "loss": 4.2192, + "step": 2683 + }, + { + "epoch": 1.729734085414988, + "grad_norm": 2.2576904495163994, + "learning_rate": 9.971395751510193e-05, + "loss": 3.6933, + "step": 2684 + }, + { + "epoch": 1.730378726833199, + "grad_norm": 2.179219522728672, + "learning_rate": 9.971374212677798e-05, + "loss": 4.1836, + "step": 2685 + }, + { + "epoch": 1.7310233682514102, + "grad_norm": 1.9326690520153684, + "learning_rate": 9.971352665762663e-05, + "loss": 4.0492, + "step": 2686 + }, + { + "epoch": 1.7316680096696213, + "grad_norm": 2.0875970430112356, + "learning_rate": 9.971331110764823e-05, + "loss": 4.2531, + "step": 2687 + }, + { + "epoch": 1.7323126510878324, + "grad_norm": 2.1222305953666445, + "learning_rate": 9.971309547684316e-05, + "loss": 3.8206, + "step": 2688 + }, + { + "epoch": 1.7329572925060435, + "grad_norm": 1.9757110670908344, + "learning_rate": 9.971287976521173e-05, + "loss": 3.748, + "step": 2689 + }, + { + "epoch": 1.7336019339242545, + "grad_norm": 1.8175381465431952, + "learning_rate": 9.971266397275432e-05, + "loss": 4.2834, + "step": 2690 + }, + { + "epoch": 1.7342465753424658, + "grad_norm": 1.1534091252859597, + "learning_rate": 9.971244809947129e-05, + "loss": 4.1481, + "step": 2691 + }, + { + "epoch": 1.734891216760677, + "grad_norm": 1.703844600658729, + "learning_rate": 9.971223214536297e-05, + "loss": 4.0363, + "step": 2692 + }, + { + "epoch": 1.735535858178888, + "grad_norm": 1.4107244190235757, + "learning_rate": 9.971201611042975e-05, + "loss": 3.8218, + "step": 2693 + }, + { + "epoch": 1.736180499597099, + "grad_norm": 1.9684158271506043, + "learning_rate": 9.971179999467196e-05, + "loss": 4.1453, + "step": 2694 + }, + { + "epoch": 1.7368251410153102, + "grad_norm": 2.4762684402914306, + "learning_rate": 9.971158379808994e-05, + "loss": 4.2915, + "step": 2695 + }, + { + "epoch": 1.7374697824335215, + "grad_norm": 1.833498739011485, + "learning_rate": 9.97113675206841e-05, + "loss": 4.1403, + "step": 2696 + }, + { + "epoch": 1.7381144238517323, + "grad_norm": 1.4495868302289432, + "learning_rate": 9.971115116245473e-05, + "loss": 4.0286, + "step": 2697 + }, + { + "epoch": 1.7387590652699436, + "grad_norm": 2.3346879192833083, + "learning_rate": 9.971093472340224e-05, + "loss": 4.1723, + "step": 2698 + }, + { + "epoch": 1.7394037066881547, + "grad_norm": 2.000561743826689, + "learning_rate": 9.971071820352693e-05, + "loss": 3.7808, + "step": 2699 + }, + { + "epoch": 1.7400483481063658, + "grad_norm": 2.1281943228105273, + "learning_rate": 9.971050160282921e-05, + "loss": 4.1666, + "step": 2700 + }, + { + "epoch": 1.7400483481063658, + "eval_loss": 4.067024230957031, + "eval_runtime": 2.9708, + "eval_samples_per_second": 33.661, + "eval_steps_per_second": 4.376, + "step": 2700 + }, + { + "epoch": 1.740692989524577, + "grad_norm": 1.7105031115910778, + "learning_rate": 9.971028492130941e-05, + "loss": 3.9457, + "step": 2701 + }, + { + "epoch": 1.741337630942788, + "grad_norm": 1.9300570476995556, + "learning_rate": 9.971006815896788e-05, + "loss": 4.1834, + "step": 2702 + }, + { + "epoch": 1.7419822723609992, + "grad_norm": 1.5766504441600961, + "learning_rate": 9.970985131580499e-05, + "loss": 4.0554, + "step": 2703 + }, + { + "epoch": 1.7426269137792103, + "grad_norm": 2.236882482899728, + "learning_rate": 9.97096343918211e-05, + "loss": 3.9091, + "step": 2704 + }, + { + "epoch": 1.7432715551974214, + "grad_norm": 2.2779811460772477, + "learning_rate": 9.970941738701654e-05, + "loss": 4.0092, + "step": 2705 + }, + { + "epoch": 1.7439161966156327, + "grad_norm": 2.5441566727294886, + "learning_rate": 9.97092003013917e-05, + "loss": 4.0618, + "step": 2706 + }, + { + "epoch": 1.7445608380338435, + "grad_norm": 2.7376875735960273, + "learning_rate": 9.970898313494692e-05, + "loss": 3.6886, + "step": 2707 + }, + { + "epoch": 1.7452054794520548, + "grad_norm": 3.5666007081576065, + "learning_rate": 9.970876588768253e-05, + "loss": 4.2611, + "step": 2708 + }, + { + "epoch": 1.745850120870266, + "grad_norm": 2.636190992670873, + "learning_rate": 9.970854855959895e-05, + "loss": 4.0418, + "step": 2709 + }, + { + "epoch": 1.746494762288477, + "grad_norm": 2.3783791682136384, + "learning_rate": 9.970833115069647e-05, + "loss": 3.891, + "step": 2710 + }, + { + "epoch": 1.7471394037066883, + "grad_norm": 2.358156008986349, + "learning_rate": 9.970811366097549e-05, + "loss": 4.0101, + "step": 2711 + }, + { + "epoch": 1.7477840451248992, + "grad_norm": 1.9355599461861106, + "learning_rate": 9.970789609043636e-05, + "loss": 3.8068, + "step": 2712 + }, + { + "epoch": 1.7484286865431105, + "grad_norm": 3.067906627314466, + "learning_rate": 9.970767843907941e-05, + "loss": 3.7883, + "step": 2713 + }, + { + "epoch": 1.7490733279613215, + "grad_norm": 1.8513411050467328, + "learning_rate": 9.970746070690504e-05, + "loss": 3.9675, + "step": 2714 + }, + { + "epoch": 1.7497179693795326, + "grad_norm": 2.1039526553272214, + "learning_rate": 9.970724289391359e-05, + "loss": 3.8368, + "step": 2715 + }, + { + "epoch": 1.750362610797744, + "grad_norm": 1.816000062355257, + "learning_rate": 9.97070250001054e-05, + "loss": 4.1741, + "step": 2716 + }, + { + "epoch": 1.7510072522159548, + "grad_norm": 1.574742842957754, + "learning_rate": 9.970680702548085e-05, + "loss": 3.967, + "step": 2717 + }, + { + "epoch": 1.751651893634166, + "grad_norm": 1.425957988990152, + "learning_rate": 9.970658897004028e-05, + "loss": 3.9198, + "step": 2718 + }, + { + "epoch": 1.7522965350523771, + "grad_norm": 2.3760776518936892, + "learning_rate": 9.970637083378407e-05, + "loss": 4.1043, + "step": 2719 + }, + { + "epoch": 1.7529411764705882, + "grad_norm": 1.873064873611869, + "learning_rate": 9.970615261671257e-05, + "loss": 3.9166, + "step": 2720 + }, + { + "epoch": 1.7535858178887993, + "grad_norm": 2.046942914341953, + "learning_rate": 9.970593431882613e-05, + "loss": 3.9221, + "step": 2721 + }, + { + "epoch": 1.7542304593070104, + "grad_norm": 1.404531696375765, + "learning_rate": 9.97057159401251e-05, + "loss": 3.8699, + "step": 2722 + }, + { + "epoch": 1.7548751007252217, + "grad_norm": 1.5020655270340346, + "learning_rate": 9.970549748060988e-05, + "loss": 3.7013, + "step": 2723 + }, + { + "epoch": 1.7555197421434328, + "grad_norm": 1.4674005268560397, + "learning_rate": 9.970527894028078e-05, + "loss": 4.0902, + "step": 2724 + }, + { + "epoch": 1.7561643835616438, + "grad_norm": 1.3148397009140904, + "learning_rate": 9.970506031913819e-05, + "loss": 4.1134, + "step": 2725 + }, + { + "epoch": 1.756809024979855, + "grad_norm": 2.0042796196782278, + "learning_rate": 9.970484161718246e-05, + "loss": 3.2488, + "step": 2726 + }, + { + "epoch": 1.757453666398066, + "grad_norm": 1.4172291822011085, + "learning_rate": 9.970462283441395e-05, + "loss": 4.0487, + "step": 2727 + }, + { + "epoch": 1.7580983078162773, + "grad_norm": 1.6487810858786203, + "learning_rate": 9.9704403970833e-05, + "loss": 4.2993, + "step": 2728 + }, + { + "epoch": 1.7587429492344882, + "grad_norm": 1.1101179444315112, + "learning_rate": 9.970418502644e-05, + "loss": 4.5226, + "step": 2729 + }, + { + "epoch": 1.7593875906526995, + "grad_norm": 1.5819969726709338, + "learning_rate": 9.970396600123529e-05, + "loss": 3.7297, + "step": 2730 + }, + { + "epoch": 1.7600322320709105, + "grad_norm": 1.978476037913533, + "learning_rate": 9.970374689521926e-05, + "loss": 3.5519, + "step": 2731 + }, + { + "epoch": 1.7606768734891216, + "grad_norm": 1.636214127387787, + "learning_rate": 9.970352770839222e-05, + "loss": 3.7179, + "step": 2732 + }, + { + "epoch": 1.761321514907333, + "grad_norm": 1.3845309338629983, + "learning_rate": 9.970330844075458e-05, + "loss": 3.9217, + "step": 2733 + }, + { + "epoch": 1.7619661563255438, + "grad_norm": 1.7532635400005079, + "learning_rate": 9.970308909230666e-05, + "loss": 4.085, + "step": 2734 + }, + { + "epoch": 1.762610797743755, + "grad_norm": 1.5696058311089705, + "learning_rate": 9.970286966304882e-05, + "loss": 3.4895, + "step": 2735 + }, + { + "epoch": 1.7632554391619661, + "grad_norm": 1.6354892914426835, + "learning_rate": 9.970265015298148e-05, + "loss": 3.8738, + "step": 2736 + }, + { + "epoch": 1.7639000805801772, + "grad_norm": 1.523265244322984, + "learning_rate": 9.970243056210493e-05, + "loss": 3.7922, + "step": 2737 + }, + { + "epoch": 1.7645447219983885, + "grad_norm": 1.41761557534911, + "learning_rate": 9.970221089041956e-05, + "loss": 3.6689, + "step": 2738 + }, + { + "epoch": 1.7651893634165994, + "grad_norm": 1.4398099738765722, + "learning_rate": 9.970199113792572e-05, + "loss": 3.9727, + "step": 2739 + }, + { + "epoch": 1.7658340048348107, + "grad_norm": 1.6148594728710475, + "learning_rate": 9.970177130462381e-05, + "loss": 4.2511, + "step": 2740 + }, + { + "epoch": 1.7664786462530218, + "grad_norm": 1.7407643793377465, + "learning_rate": 9.970155139051414e-05, + "loss": 3.9019, + "step": 2741 + }, + { + "epoch": 1.7671232876712328, + "grad_norm": 1.48311854556348, + "learning_rate": 9.970133139559708e-05, + "loss": 3.8226, + "step": 2742 + }, + { + "epoch": 1.7677679290894441, + "grad_norm": 1.6683456348845978, + "learning_rate": 9.970111131987302e-05, + "loss": 3.7366, + "step": 2743 + }, + { + "epoch": 1.768412570507655, + "grad_norm": 1.665966321705366, + "learning_rate": 9.97008911633423e-05, + "loss": 3.9452, + "step": 2744 + }, + { + "epoch": 1.7690572119258663, + "grad_norm": 1.793282556074106, + "learning_rate": 9.970067092600529e-05, + "loss": 3.8539, + "step": 2745 + }, + { + "epoch": 1.7697018533440774, + "grad_norm": 1.4115463731994307, + "learning_rate": 9.970045060786235e-05, + "loss": 4.525, + "step": 2746 + }, + { + "epoch": 1.7703464947622884, + "grad_norm": 1.5848984418520016, + "learning_rate": 9.970023020891383e-05, + "loss": 3.871, + "step": 2747 + }, + { + "epoch": 1.7709911361804997, + "grad_norm": 1.5229284710592337, + "learning_rate": 9.970000972916012e-05, + "loss": 3.9411, + "step": 2748 + }, + { + "epoch": 1.7716357775987106, + "grad_norm": 1.3393304018508152, + "learning_rate": 9.969978916860157e-05, + "loss": 3.7748, + "step": 2749 + }, + { + "epoch": 1.772280419016922, + "grad_norm": 1.9840054688033306, + "learning_rate": 9.96995685272385e-05, + "loss": 3.8223, + "step": 2750 + }, + { + "epoch": 1.772925060435133, + "grad_norm": 1.7639907366364118, + "learning_rate": 9.969934780507135e-05, + "loss": 4.0842, + "step": 2751 + }, + { + "epoch": 1.773569701853344, + "grad_norm": 1.312091931611414, + "learning_rate": 9.96991270021004e-05, + "loss": 3.9661, + "step": 2752 + }, + { + "epoch": 1.7742143432715554, + "grad_norm": 1.881066376569964, + "learning_rate": 9.969890611832608e-05, + "loss": 4.1847, + "step": 2753 + }, + { + "epoch": 1.7748589846897662, + "grad_norm": 1.779800471677235, + "learning_rate": 9.969868515374873e-05, + "loss": 4.0169, + "step": 2754 + }, + { + "epoch": 1.7755036261079775, + "grad_norm": 1.450925924035148, + "learning_rate": 9.96984641083687e-05, + "loss": 3.8881, + "step": 2755 + }, + { + "epoch": 1.7761482675261886, + "grad_norm": 1.5950482436612246, + "learning_rate": 9.969824298218635e-05, + "loss": 3.9443, + "step": 2756 + }, + { + "epoch": 1.7767929089443997, + "grad_norm": 1.137247757973018, + "learning_rate": 9.969802177520208e-05, + "loss": 3.8761, + "step": 2757 + }, + { + "epoch": 1.7774375503626108, + "grad_norm": 1.2014869685800373, + "learning_rate": 9.969780048741621e-05, + "loss": 3.7706, + "step": 2758 + }, + { + "epoch": 1.7780821917808218, + "grad_norm": 1.3367185698723056, + "learning_rate": 9.969757911882915e-05, + "loss": 3.836, + "step": 2759 + }, + { + "epoch": 1.7787268331990331, + "grad_norm": 1.7211330873262638, + "learning_rate": 9.96973576694412e-05, + "loss": 4.1193, + "step": 2760 + }, + { + "epoch": 1.7793714746172442, + "grad_norm": 2.5974292757484294, + "learning_rate": 9.969713613925278e-05, + "loss": 4.0935, + "step": 2761 + }, + { + "epoch": 1.7800161160354553, + "grad_norm": 1.9261136864340596, + "learning_rate": 9.969691452826422e-05, + "loss": 3.9751, + "step": 2762 + }, + { + "epoch": 1.7806607574536664, + "grad_norm": 1.6002709828799762, + "learning_rate": 9.969669283647591e-05, + "loss": 3.7051, + "step": 2763 + }, + { + "epoch": 1.7813053988718774, + "grad_norm": 2.5373031068358967, + "learning_rate": 9.96964710638882e-05, + "loss": 4.0159, + "step": 2764 + }, + { + "epoch": 1.7819500402900887, + "grad_norm": 1.7994376575968576, + "learning_rate": 9.969624921050145e-05, + "loss": 3.7766, + "step": 2765 + }, + { + "epoch": 1.7825946817082996, + "grad_norm": 1.7224757982274304, + "learning_rate": 9.969602727631606e-05, + "loss": 4.3435, + "step": 2766 + }, + { + "epoch": 1.783239323126511, + "grad_norm": 2.0170327117044318, + "learning_rate": 9.969580526133233e-05, + "loss": 3.7883, + "step": 2767 + }, + { + "epoch": 1.783883964544722, + "grad_norm": 1.4325903864683172, + "learning_rate": 9.969558316555067e-05, + "loss": 3.8607, + "step": 2768 + }, + { + "epoch": 1.784528605962933, + "grad_norm": 1.6315134922787595, + "learning_rate": 9.969536098897144e-05, + "loss": 3.6745, + "step": 2769 + }, + { + "epoch": 1.7851732473811444, + "grad_norm": 1.6709955038366857, + "learning_rate": 9.969513873159498e-05, + "loss": 4.093, + "step": 2770 + }, + { + "epoch": 1.7858178887993552, + "grad_norm": 2.2555389717796914, + "learning_rate": 9.96949163934217e-05, + "loss": 3.8497, + "step": 2771 + }, + { + "epoch": 1.7864625302175665, + "grad_norm": 2.137930103971139, + "learning_rate": 9.969469397445193e-05, + "loss": 4.2729, + "step": 2772 + }, + { + "epoch": 1.7871071716357776, + "grad_norm": 1.819622248758585, + "learning_rate": 9.969447147468605e-05, + "loss": 4.1156, + "step": 2773 + }, + { + "epoch": 1.7877518130539887, + "grad_norm": 1.8785413761960836, + "learning_rate": 9.96942488941244e-05, + "loss": 3.8932, + "step": 2774 + }, + { + "epoch": 1.7883964544722, + "grad_norm": 2.0580307871920858, + "learning_rate": 9.969402623276738e-05, + "loss": 3.9128, + "step": 2775 + }, + { + "epoch": 1.7890410958904108, + "grad_norm": 2.4966083124810554, + "learning_rate": 9.969380349061533e-05, + "loss": 3.7306, + "step": 2776 + }, + { + "epoch": 1.7896857373086221, + "grad_norm": 1.947457731877862, + "learning_rate": 9.969358066766865e-05, + "loss": 4.2295, + "step": 2777 + }, + { + "epoch": 1.7903303787268332, + "grad_norm": 1.4167875121026003, + "learning_rate": 9.969335776392768e-05, + "loss": 3.9605, + "step": 2778 + }, + { + "epoch": 1.7909750201450443, + "grad_norm": 2.323030086071887, + "learning_rate": 9.969313477939277e-05, + "loss": 3.9045, + "step": 2779 + }, + { + "epoch": 1.7916196615632556, + "grad_norm": 2.24905175717657, + "learning_rate": 9.96929117140643e-05, + "loss": 4.0738, + "step": 2780 + }, + { + "epoch": 1.7922643029814664, + "grad_norm": 1.5042657867832971, + "learning_rate": 9.969268856794267e-05, + "loss": 4.2794, + "step": 2781 + }, + { + "epoch": 1.7929089443996777, + "grad_norm": 2.115308379325484, + "learning_rate": 9.969246534102821e-05, + "loss": 3.6536, + "step": 2782 + }, + { + "epoch": 1.7935535858178888, + "grad_norm": 1.5598004286930764, + "learning_rate": 9.969224203332128e-05, + "loss": 3.7246, + "step": 2783 + }, + { + "epoch": 1.7941982272361, + "grad_norm": 1.6209822315834324, + "learning_rate": 9.969201864482228e-05, + "loss": 3.8243, + "step": 2784 + }, + { + "epoch": 1.7948428686543112, + "grad_norm": 1.9148476331318973, + "learning_rate": 9.969179517553155e-05, + "loss": 3.742, + "step": 2785 + }, + { + "epoch": 1.795487510072522, + "grad_norm": 1.637544413530926, + "learning_rate": 9.969157162544947e-05, + "loss": 4.0482, + "step": 2786 + }, + { + "epoch": 1.7961321514907334, + "grad_norm": 1.9714376388645383, + "learning_rate": 9.969134799457638e-05, + "loss": 3.5717, + "step": 2787 + }, + { + "epoch": 1.7967767929089444, + "grad_norm": 1.8679979587974973, + "learning_rate": 9.96911242829127e-05, + "loss": 3.9006, + "step": 2788 + }, + { + "epoch": 1.7974214343271555, + "grad_norm": 1.6783086068743565, + "learning_rate": 9.969090049045876e-05, + "loss": 3.9399, + "step": 2789 + }, + { + "epoch": 1.7980660757453666, + "grad_norm": 2.250157446864187, + "learning_rate": 9.969067661721492e-05, + "loss": 3.7224, + "step": 2790 + }, + { + "epoch": 1.7987107171635777, + "grad_norm": 1.8256457311787164, + "learning_rate": 9.969045266318157e-05, + "loss": 3.914, + "step": 2791 + }, + { + "epoch": 1.799355358581789, + "grad_norm": 2.0956995890529644, + "learning_rate": 9.969022862835909e-05, + "loss": 3.8477, + "step": 2792 + }, + { + "epoch": 1.8, + "grad_norm": 2.2626372029181776, + "learning_rate": 9.969000451274782e-05, + "loss": 4.1456, + "step": 2793 + }, + { + "epoch": 1.8006446414182111, + "grad_norm": 1.640888680269742, + "learning_rate": 9.968978031634812e-05, + "loss": 4.1239, + "step": 2794 + }, + { + "epoch": 1.8012892828364222, + "grad_norm": 2.0535594583235155, + "learning_rate": 9.968955603916038e-05, + "loss": 4.1508, + "step": 2795 + }, + { + "epoch": 1.8019339242546333, + "grad_norm": 1.8785267215905768, + "learning_rate": 9.968933168118496e-05, + "loss": 3.8113, + "step": 2796 + }, + { + "epoch": 1.8025785656728446, + "grad_norm": 1.4142274872835152, + "learning_rate": 9.968910724242225e-05, + "loss": 4.329, + "step": 2797 + }, + { + "epoch": 1.8032232070910554, + "grad_norm": 1.930011285709244, + "learning_rate": 9.968888272287258e-05, + "loss": 4.3081, + "step": 2798 + }, + { + "epoch": 1.8038678485092667, + "grad_norm": 1.3085779361003085, + "learning_rate": 9.968865812253634e-05, + "loss": 4.0585, + "step": 2799 + }, + { + "epoch": 1.8045124899274778, + "grad_norm": 1.6824741832186478, + "learning_rate": 9.968843344141392e-05, + "loss": 3.7626, + "step": 2800 + }, + { + "epoch": 1.8045124899274778, + "eval_loss": 4.0666279792785645, + "eval_runtime": 2.9727, + "eval_samples_per_second": 33.64, + "eval_steps_per_second": 4.373, + "step": 2800 + }, + { + "epoch": 1.805157131345689, + "grad_norm": 1.5881076331650448, + "learning_rate": 9.968820867950564e-05, + "loss": 4.3546, + "step": 2801 + }, + { + "epoch": 1.8058017727639002, + "grad_norm": 1.6098065920543185, + "learning_rate": 9.968798383681191e-05, + "loss": 4.2176, + "step": 2802 + }, + { + "epoch": 1.806446414182111, + "grad_norm": 1.3563076303494896, + "learning_rate": 9.968775891333308e-05, + "loss": 4.3451, + "step": 2803 + }, + { + "epoch": 1.8070910556003223, + "grad_norm": 1.595560911087322, + "learning_rate": 9.968753390906953e-05, + "loss": 3.4086, + "step": 2804 + }, + { + "epoch": 1.8077356970185334, + "grad_norm": 1.7512088559131345, + "learning_rate": 9.968730882402162e-05, + "loss": 3.7613, + "step": 2805 + }, + { + "epoch": 1.8083803384367445, + "grad_norm": 1.6402154698348652, + "learning_rate": 9.968708365818973e-05, + "loss": 4.0292, + "step": 2806 + }, + { + "epoch": 1.8090249798549558, + "grad_norm": 1.5695246702465078, + "learning_rate": 9.968685841157421e-05, + "loss": 3.8912, + "step": 2807 + }, + { + "epoch": 1.8096696212731667, + "grad_norm": 1.9631446982742462, + "learning_rate": 9.968663308417546e-05, + "loss": 3.9692, + "step": 2808 + }, + { + "epoch": 1.810314262691378, + "grad_norm": 1.6417457434950982, + "learning_rate": 9.968640767599383e-05, + "loss": 3.7737, + "step": 2809 + }, + { + "epoch": 1.810958904109589, + "grad_norm": 1.5718397186106203, + "learning_rate": 9.968618218702968e-05, + "loss": 4.0904, + "step": 2810 + }, + { + "epoch": 1.8116035455278001, + "grad_norm": 1.8961425826267944, + "learning_rate": 9.96859566172834e-05, + "loss": 3.7712, + "step": 2811 + }, + { + "epoch": 1.8122481869460114, + "grad_norm": 1.4004791062322843, + "learning_rate": 9.968573096675536e-05, + "loss": 4.1766, + "step": 2812 + }, + { + "epoch": 1.8128928283642223, + "grad_norm": 2.1187107903765616, + "learning_rate": 9.968550523544593e-05, + "loss": 3.7667, + "step": 2813 + }, + { + "epoch": 1.8135374697824336, + "grad_norm": 1.9326189465429748, + "learning_rate": 9.968527942335548e-05, + "loss": 3.8222, + "step": 2814 + }, + { + "epoch": 1.8141821112006447, + "grad_norm": 1.5961421154590931, + "learning_rate": 9.968505353048436e-05, + "loss": 4.2906, + "step": 2815 + }, + { + "epoch": 1.8148267526188557, + "grad_norm": 1.4666125546512991, + "learning_rate": 9.968482755683297e-05, + "loss": 4.1871, + "step": 2816 + }, + { + "epoch": 1.815471394037067, + "grad_norm": 1.550323820782811, + "learning_rate": 9.968460150240166e-05, + "loss": 4.1799, + "step": 2817 + }, + { + "epoch": 1.8161160354552779, + "grad_norm": 1.4765766541980385, + "learning_rate": 9.968437536719082e-05, + "loss": 4.2304, + "step": 2818 + }, + { + "epoch": 1.8167606768734892, + "grad_norm": 1.6571285436470664, + "learning_rate": 9.96841491512008e-05, + "loss": 3.8589, + "step": 2819 + }, + { + "epoch": 1.8174053182917003, + "grad_norm": 1.6278956382195635, + "learning_rate": 9.968392285443199e-05, + "loss": 4.4031, + "step": 2820 + }, + { + "epoch": 1.8180499597099113, + "grad_norm": 1.9677808364470966, + "learning_rate": 9.968369647688475e-05, + "loss": 4.1402, + "step": 2821 + }, + { + "epoch": 1.8186946011281226, + "grad_norm": 1.9933522177801073, + "learning_rate": 9.968347001855946e-05, + "loss": 3.7058, + "step": 2822 + }, + { + "epoch": 1.8193392425463335, + "grad_norm": 2.153114598679885, + "learning_rate": 9.968324347945649e-05, + "loss": 3.768, + "step": 2823 + }, + { + "epoch": 1.8199838839645448, + "grad_norm": 2.5170490527130998, + "learning_rate": 9.968301685957621e-05, + "loss": 4.1938, + "step": 2824 + }, + { + "epoch": 1.8206285253827559, + "grad_norm": 2.2404527166612875, + "learning_rate": 9.968279015891899e-05, + "loss": 4.2316, + "step": 2825 + }, + { + "epoch": 1.821273166800967, + "grad_norm": 1.3708285091701875, + "learning_rate": 9.968256337748522e-05, + "loss": 4.0969, + "step": 2826 + }, + { + "epoch": 1.821917808219178, + "grad_norm": 1.8159735874184106, + "learning_rate": 9.968233651527524e-05, + "loss": 3.8277, + "step": 2827 + }, + { + "epoch": 1.8225624496373891, + "grad_norm": 1.9816371539970283, + "learning_rate": 9.968210957228945e-05, + "loss": 3.5499, + "step": 2828 + }, + { + "epoch": 1.8232070910556004, + "grad_norm": 1.803866719093065, + "learning_rate": 9.96818825485282e-05, + "loss": 4.1184, + "step": 2829 + }, + { + "epoch": 1.8238517324738115, + "grad_norm": 1.6608405280502267, + "learning_rate": 9.968165544399188e-05, + "loss": 3.9017, + "step": 2830 + }, + { + "epoch": 1.8244963738920226, + "grad_norm": 2.220692956261737, + "learning_rate": 9.968142825868087e-05, + "loss": 4.2593, + "step": 2831 + }, + { + "epoch": 1.8251410153102336, + "grad_norm": 1.7397188095405884, + "learning_rate": 9.968120099259552e-05, + "loss": 4.0569, + "step": 2832 + }, + { + "epoch": 1.8257856567284447, + "grad_norm": 1.4665644023851134, + "learning_rate": 9.968097364573621e-05, + "loss": 4.0013, + "step": 2833 + }, + { + "epoch": 1.826430298146656, + "grad_norm": 1.806050731398707, + "learning_rate": 9.968074621810334e-05, + "loss": 3.9257, + "step": 2834 + }, + { + "epoch": 1.8270749395648669, + "grad_norm": 1.731127815183038, + "learning_rate": 9.968051870969726e-05, + "loss": 3.8394, + "step": 2835 + }, + { + "epoch": 1.8277195809830782, + "grad_norm": 1.759021445181984, + "learning_rate": 9.968029112051834e-05, + "loss": 4.0986, + "step": 2836 + }, + { + "epoch": 1.8283642224012893, + "grad_norm": 1.1184586505907008, + "learning_rate": 9.968006345056695e-05, + "loss": 3.798, + "step": 2837 + }, + { + "epoch": 1.8290088638195003, + "grad_norm": 1.6607212656830344, + "learning_rate": 9.967983569984347e-05, + "loss": 3.9328, + "step": 2838 + }, + { + "epoch": 1.8296535052377116, + "grad_norm": 1.9163340846125065, + "learning_rate": 9.96796078683483e-05, + "loss": 4.1535, + "step": 2839 + }, + { + "epoch": 1.8302981466559225, + "grad_norm": 1.6059895470165022, + "learning_rate": 9.967937995608177e-05, + "loss": 3.9454, + "step": 2840 + }, + { + "epoch": 1.8309427880741338, + "grad_norm": 1.33060903304863, + "learning_rate": 9.967915196304429e-05, + "loss": 4.2588, + "step": 2841 + }, + { + "epoch": 1.8315874294923449, + "grad_norm": 1.5782694569957667, + "learning_rate": 9.967892388923621e-05, + "loss": 4.1161, + "step": 2842 + }, + { + "epoch": 1.832232070910556, + "grad_norm": 1.901786153299746, + "learning_rate": 9.967869573465792e-05, + "loss": 4.2265, + "step": 2843 + }, + { + "epoch": 1.8328767123287673, + "grad_norm": 1.8117505154634757, + "learning_rate": 9.967846749930979e-05, + "loss": 3.7269, + "step": 2844 + }, + { + "epoch": 1.833521353746978, + "grad_norm": 2.0470496458606027, + "learning_rate": 9.96782391831922e-05, + "loss": 4.164, + "step": 2845 + }, + { + "epoch": 1.8341659951651894, + "grad_norm": 2.3120578365375457, + "learning_rate": 9.967801078630551e-05, + "loss": 3.9065, + "step": 2846 + }, + { + "epoch": 1.8348106365834005, + "grad_norm": 2.249110568290413, + "learning_rate": 9.967778230865011e-05, + "loss": 4.0439, + "step": 2847 + }, + { + "epoch": 1.8354552780016116, + "grad_norm": 2.2230373680544258, + "learning_rate": 9.967755375022638e-05, + "loss": 3.9085, + "step": 2848 + }, + { + "epoch": 1.8360999194198229, + "grad_norm": 2.499266092365138, + "learning_rate": 9.967732511103467e-05, + "loss": 3.9183, + "step": 2849 + }, + { + "epoch": 1.8367445608380337, + "grad_norm": 2.193662509516109, + "learning_rate": 9.967709639107538e-05, + "loss": 4.4094, + "step": 2850 + }, + { + "epoch": 1.837389202256245, + "grad_norm": 2.5402299616891892, + "learning_rate": 9.967686759034886e-05, + "loss": 4.0983, + "step": 2851 + }, + { + "epoch": 1.838033843674456, + "grad_norm": 3.130126842091992, + "learning_rate": 9.967663870885553e-05, + "loss": 3.9935, + "step": 2852 + }, + { + "epoch": 1.8386784850926672, + "grad_norm": 1.5655936811544906, + "learning_rate": 9.967640974659572e-05, + "loss": 4.3932, + "step": 2853 + }, + { + "epoch": 1.8393231265108785, + "grad_norm": 2.405354751193837, + "learning_rate": 9.967618070356982e-05, + "loss": 3.7889, + "step": 2854 + }, + { + "epoch": 1.8399677679290893, + "grad_norm": 2.4866681942093214, + "learning_rate": 9.967595157977821e-05, + "loss": 4.0817, + "step": 2855 + }, + { + "epoch": 1.8406124093473006, + "grad_norm": 1.517047053320357, + "learning_rate": 9.967572237522127e-05, + "loss": 3.8972, + "step": 2856 + }, + { + "epoch": 1.8412570507655117, + "grad_norm": 2.528877676956206, + "learning_rate": 9.967549308989938e-05, + "loss": 4.3817, + "step": 2857 + }, + { + "epoch": 1.8419016921837228, + "grad_norm": 1.894528715724563, + "learning_rate": 9.967526372381291e-05, + "loss": 3.9919, + "step": 2858 + }, + { + "epoch": 1.8425463336019339, + "grad_norm": 2.805562442209204, + "learning_rate": 9.967503427696223e-05, + "loss": 4.2248, + "step": 2859 + }, + { + "epoch": 1.843190975020145, + "grad_norm": 1.4954347139012147, + "learning_rate": 9.967480474934772e-05, + "loss": 3.9073, + "step": 2860 + }, + { + "epoch": 1.8438356164383563, + "grad_norm": 2.341082661496895, + "learning_rate": 9.967457514096977e-05, + "loss": 4.161, + "step": 2861 + }, + { + "epoch": 1.8444802578565673, + "grad_norm": 1.211008092085144, + "learning_rate": 9.967434545182874e-05, + "loss": 4.0943, + "step": 2862 + }, + { + "epoch": 1.8451248992747784, + "grad_norm": 1.9601315001077793, + "learning_rate": 9.967411568192501e-05, + "loss": 4.3197, + "step": 2863 + }, + { + "epoch": 1.8457695406929895, + "grad_norm": 1.7141833299079072, + "learning_rate": 9.967388583125898e-05, + "loss": 3.8709, + "step": 2864 + }, + { + "epoch": 1.8464141821112006, + "grad_norm": 1.3948637594329898, + "learning_rate": 9.9673655899831e-05, + "loss": 3.8776, + "step": 2865 + }, + { + "epoch": 1.8470588235294119, + "grad_norm": 1.8674988788638167, + "learning_rate": 9.967342588764146e-05, + "loss": 3.9761, + "step": 2866 + }, + { + "epoch": 1.8477034649476227, + "grad_norm": 2.2294850128914736, + "learning_rate": 9.967319579469073e-05, + "loss": 3.5406, + "step": 2867 + }, + { + "epoch": 1.848348106365834, + "grad_norm": 2.6092289055418503, + "learning_rate": 9.967296562097919e-05, + "loss": 3.763, + "step": 2868 + }, + { + "epoch": 1.848992747784045, + "grad_norm": 2.0944128609771098, + "learning_rate": 9.967273536650723e-05, + "loss": 3.9801, + "step": 2869 + }, + { + "epoch": 1.8496373892022562, + "grad_norm": 1.5980757745887126, + "learning_rate": 9.967250503127521e-05, + "loss": 3.9816, + "step": 2870 + }, + { + "epoch": 1.8502820306204675, + "grad_norm": 1.7099852605444663, + "learning_rate": 9.967227461528352e-05, + "loss": 4.1771, + "step": 2871 + }, + { + "epoch": 1.8509266720386783, + "grad_norm": 1.6821431926032178, + "learning_rate": 9.967204411853253e-05, + "loss": 4.3948, + "step": 2872 + }, + { + "epoch": 1.8515713134568896, + "grad_norm": 1.5445832738936438, + "learning_rate": 9.967181354102264e-05, + "loss": 4.3196, + "step": 2873 + }, + { + "epoch": 1.8522159548751007, + "grad_norm": 1.8216300183409515, + "learning_rate": 9.96715828827542e-05, + "loss": 3.9636, + "step": 2874 + }, + { + "epoch": 1.8528605962933118, + "grad_norm": 1.7184672847534144, + "learning_rate": 9.96713521437276e-05, + "loss": 4.0025, + "step": 2875 + }, + { + "epoch": 1.853505237711523, + "grad_norm": 1.5528408986702953, + "learning_rate": 9.967112132394324e-05, + "loss": 4.1078, + "step": 2876 + }, + { + "epoch": 1.854149879129734, + "grad_norm": 1.610028723948517, + "learning_rate": 9.967089042340146e-05, + "loss": 3.6733, + "step": 2877 + }, + { + "epoch": 1.8547945205479452, + "grad_norm": 1.899342150231661, + "learning_rate": 9.967065944210266e-05, + "loss": 3.8925, + "step": 2878 + }, + { + "epoch": 1.8554391619661563, + "grad_norm": 1.931429741204433, + "learning_rate": 9.967042838004723e-05, + "loss": 3.8282, + "step": 2879 + }, + { + "epoch": 1.8560838033843674, + "grad_norm": 1.752121773854244, + "learning_rate": 9.967019723723554e-05, + "loss": 3.9584, + "step": 2880 + }, + { + "epoch": 1.8567284448025787, + "grad_norm": 1.7380929614961111, + "learning_rate": 9.966996601366794e-05, + "loss": 4.078, + "step": 2881 + }, + { + "epoch": 1.8573730862207896, + "grad_norm": 1.9059676630868305, + "learning_rate": 9.966973470934486e-05, + "loss": 4.2261, + "step": 2882 + }, + { + "epoch": 1.8580177276390009, + "grad_norm": 1.4469155784674572, + "learning_rate": 9.966950332426665e-05, + "loss": 4.0491, + "step": 2883 + }, + { + "epoch": 1.858662369057212, + "grad_norm": 1.943787552609593, + "learning_rate": 9.96692718584337e-05, + "loss": 3.907, + "step": 2884 + }, + { + "epoch": 1.859307010475423, + "grad_norm": 1.545050035293602, + "learning_rate": 9.96690403118464e-05, + "loss": 3.8361, + "step": 2885 + }, + { + "epoch": 1.8599516518936343, + "grad_norm": 2.227704267201435, + "learning_rate": 9.96688086845051e-05, + "loss": 4.0418, + "step": 2886 + }, + { + "epoch": 1.8605962933118452, + "grad_norm": 2.7148104937379256, + "learning_rate": 9.96685769764102e-05, + "loss": 3.7304, + "step": 2887 + }, + { + "epoch": 1.8612409347300565, + "grad_norm": 2.0101896150900784, + "learning_rate": 9.966834518756208e-05, + "loss": 3.8494, + "step": 2888 + }, + { + "epoch": 1.8618855761482676, + "grad_norm": 1.8917431061589798, + "learning_rate": 9.966811331796112e-05, + "loss": 3.9401, + "step": 2889 + }, + { + "epoch": 1.8625302175664786, + "grad_norm": 1.5666182238212443, + "learning_rate": 9.96678813676077e-05, + "loss": 3.9194, + "step": 2890 + }, + { + "epoch": 1.86317485898469, + "grad_norm": 1.6075956721899007, + "learning_rate": 9.96676493365022e-05, + "loss": 3.9845, + "step": 2891 + }, + { + "epoch": 1.8638195004029008, + "grad_norm": 1.619070837299383, + "learning_rate": 9.9667417224645e-05, + "loss": 4.1355, + "step": 2892 + }, + { + "epoch": 1.864464141821112, + "grad_norm": 1.3267647123410111, + "learning_rate": 9.96671850320365e-05, + "loss": 3.8732, + "step": 2893 + }, + { + "epoch": 1.8651087832393232, + "grad_norm": 2.124048018063243, + "learning_rate": 9.966695275867705e-05, + "loss": 3.8322, + "step": 2894 + }, + { + "epoch": 1.8657534246575342, + "grad_norm": 1.7966924942986127, + "learning_rate": 9.966672040456703e-05, + "loss": 3.9708, + "step": 2895 + }, + { + "epoch": 1.8663980660757453, + "grad_norm": 1.8720660150662947, + "learning_rate": 9.966648796970686e-05, + "loss": 4.144, + "step": 2896 + }, + { + "epoch": 1.8670427074939564, + "grad_norm": 1.7256800554640672, + "learning_rate": 9.96662554540969e-05, + "loss": 3.6866, + "step": 2897 + }, + { + "epoch": 1.8676873489121677, + "grad_norm": 1.6189757700031717, + "learning_rate": 9.966602285773753e-05, + "loss": 3.6705, + "step": 2898 + }, + { + "epoch": 1.8683319903303788, + "grad_norm": 2.229128893252128, + "learning_rate": 9.966579018062913e-05, + "loss": 3.8231, + "step": 2899 + }, + { + "epoch": 1.8689766317485899, + "grad_norm": 2.366378500715918, + "learning_rate": 9.966555742277209e-05, + "loss": 3.682, + "step": 2900 + }, + { + "epoch": 1.8689766317485899, + "eval_loss": 4.058013916015625, + "eval_runtime": 2.972, + "eval_samples_per_second": 33.648, + "eval_steps_per_second": 4.374, + "step": 2900 + }, + { + "epoch": 1.869621273166801, + "grad_norm": 1.9052442022703262, + "learning_rate": 9.966532458416677e-05, + "loss": 4.4018, + "step": 2901 + }, + { + "epoch": 1.870265914585012, + "grad_norm": 1.6538274826996542, + "learning_rate": 9.966509166481359e-05, + "loss": 4.18, + "step": 2902 + }, + { + "epoch": 1.8709105560032233, + "grad_norm": 1.7352803418932143, + "learning_rate": 9.96648586647129e-05, + "loss": 3.8538, + "step": 2903 + }, + { + "epoch": 1.8715551974214342, + "grad_norm": 2.049083041599123, + "learning_rate": 9.96646255838651e-05, + "loss": 3.9443, + "step": 2904 + }, + { + "epoch": 1.8721998388396455, + "grad_norm": 1.6355023440328655, + "learning_rate": 9.966439242227057e-05, + "loss": 4.0902, + "step": 2905 + }, + { + "epoch": 1.8728444802578565, + "grad_norm": 1.4363961412021164, + "learning_rate": 9.96641591799297e-05, + "loss": 3.9731, + "step": 2906 + }, + { + "epoch": 1.8734891216760676, + "grad_norm": 1.312290948258132, + "learning_rate": 9.966392585684284e-05, + "loss": 3.8203, + "step": 2907 + }, + { + "epoch": 1.874133763094279, + "grad_norm": 1.5227039294568243, + "learning_rate": 9.966369245301042e-05, + "loss": 3.7104, + "step": 2908 + }, + { + "epoch": 1.8747784045124898, + "grad_norm": 1.6450067230888847, + "learning_rate": 9.966345896843278e-05, + "loss": 3.7028, + "step": 2909 + }, + { + "epoch": 1.875423045930701, + "grad_norm": 1.9672585635033637, + "learning_rate": 9.966322540311035e-05, + "loss": 4.0966, + "step": 2910 + }, + { + "epoch": 1.8760676873489122, + "grad_norm": 1.458163643531426, + "learning_rate": 9.966299175704347e-05, + "loss": 4.1049, + "step": 2911 + }, + { + "epoch": 1.8767123287671232, + "grad_norm": 1.30762105864508, + "learning_rate": 9.966275803023253e-05, + "loss": 3.7518, + "step": 2912 + }, + { + "epoch": 1.8773569701853345, + "grad_norm": 1.6186279448089236, + "learning_rate": 9.966252422267797e-05, + "loss": 3.927, + "step": 2913 + }, + { + "epoch": 1.8780016116035454, + "grad_norm": 1.2690318092731032, + "learning_rate": 9.966229033438008e-05, + "loss": 4.1337, + "step": 2914 + }, + { + "epoch": 1.8786462530217567, + "grad_norm": 1.3902436308975938, + "learning_rate": 9.966205636533931e-05, + "loss": 3.9266, + "step": 2915 + }, + { + "epoch": 1.8792908944399678, + "grad_norm": 1.591131587881099, + "learning_rate": 9.966182231555605e-05, + "loss": 4.098, + "step": 2916 + }, + { + "epoch": 1.8799355358581789, + "grad_norm": 1.5423953401372015, + "learning_rate": 9.966158818503064e-05, + "loss": 4.0469, + "step": 2917 + }, + { + "epoch": 1.8805801772763902, + "grad_norm": 2.2590418758285296, + "learning_rate": 9.96613539737635e-05, + "loss": 4.1804, + "step": 2918 + }, + { + "epoch": 1.881224818694601, + "grad_norm": 1.4374760415884085, + "learning_rate": 9.966111968175499e-05, + "loss": 3.6085, + "step": 2919 + }, + { + "epoch": 1.8818694601128123, + "grad_norm": 1.5882026787010706, + "learning_rate": 9.96608853090055e-05, + "loss": 3.8343, + "step": 2920 + }, + { + "epoch": 1.8825141015310234, + "grad_norm": 1.7258158163683597, + "learning_rate": 9.966065085551544e-05, + "loss": 4.3433, + "step": 2921 + }, + { + "epoch": 1.8831587429492345, + "grad_norm": 1.1657659794722057, + "learning_rate": 9.966041632128516e-05, + "loss": 3.9981, + "step": 2922 + }, + { + "epoch": 1.8838033843674458, + "grad_norm": 1.7603404131732145, + "learning_rate": 9.966018170631508e-05, + "loss": 4.3969, + "step": 2923 + }, + { + "epoch": 1.8844480257856566, + "grad_norm": 1.6337914468319585, + "learning_rate": 9.965994701060554e-05, + "loss": 4.1851, + "step": 2924 + }, + { + "epoch": 1.885092667203868, + "grad_norm": 1.5866885617557454, + "learning_rate": 9.965971223415698e-05, + "loss": 4.1675, + "step": 2925 + }, + { + "epoch": 1.885737308622079, + "grad_norm": 1.3766197302454597, + "learning_rate": 9.965947737696973e-05, + "loss": 4.0397, + "step": 2926 + }, + { + "epoch": 1.88638195004029, + "grad_norm": 1.5071568479366235, + "learning_rate": 9.965924243904423e-05, + "loss": 3.7864, + "step": 2927 + }, + { + "epoch": 1.8870265914585012, + "grad_norm": 1.5743665163328806, + "learning_rate": 9.965900742038083e-05, + "loss": 3.9819, + "step": 2928 + }, + { + "epoch": 1.8876712328767122, + "grad_norm": 1.699740322703295, + "learning_rate": 9.965877232097992e-05, + "loss": 4.2211, + "step": 2929 + }, + { + "epoch": 1.8883158742949235, + "grad_norm": 1.38654552691697, + "learning_rate": 9.96585371408419e-05, + "loss": 4.1377, + "step": 2930 + }, + { + "epoch": 1.8889605157131346, + "grad_norm": 1.8854116343713725, + "learning_rate": 9.965830187996714e-05, + "loss": 3.9788, + "step": 2931 + }, + { + "epoch": 1.8896051571313457, + "grad_norm": 2.064993211206099, + "learning_rate": 9.965806653835602e-05, + "loss": 4.062, + "step": 2932 + }, + { + "epoch": 1.8902497985495568, + "grad_norm": 1.72368270347937, + "learning_rate": 9.965783111600895e-05, + "loss": 3.9008, + "step": 2933 + }, + { + "epoch": 1.8908944399677678, + "grad_norm": 1.532470221382637, + "learning_rate": 9.965759561292634e-05, + "loss": 3.9842, + "step": 2934 + }, + { + "epoch": 1.8915390813859791, + "grad_norm": 2.1166833985851947, + "learning_rate": 9.965736002910852e-05, + "loss": 4.2148, + "step": 2935 + }, + { + "epoch": 1.89218372280419, + "grad_norm": 2.374465244906533, + "learning_rate": 9.965712436455588e-05, + "loss": 3.8023, + "step": 2936 + }, + { + "epoch": 1.8928283642224013, + "grad_norm": 1.6100212998862597, + "learning_rate": 9.965688861926886e-05, + "loss": 4.039, + "step": 2937 + }, + { + "epoch": 1.8934730056406124, + "grad_norm": 1.9576035502140339, + "learning_rate": 9.96566527932478e-05, + "loss": 3.8368, + "step": 2938 + }, + { + "epoch": 1.8941176470588235, + "grad_norm": 2.0159617397279552, + "learning_rate": 9.96564168864931e-05, + "loss": 3.9626, + "step": 2939 + }, + { + "epoch": 1.8947622884770348, + "grad_norm": 2.0721649560887756, + "learning_rate": 9.965618089900516e-05, + "loss": 3.6204, + "step": 2940 + }, + { + "epoch": 1.8954069298952456, + "grad_norm": 1.7032288824876305, + "learning_rate": 9.965594483078435e-05, + "loss": 4.2377, + "step": 2941 + }, + { + "epoch": 1.896051571313457, + "grad_norm": 1.7136104736206461, + "learning_rate": 9.965570868183107e-05, + "loss": 4.0917, + "step": 2942 + }, + { + "epoch": 1.896696212731668, + "grad_norm": 2.0833263540870206, + "learning_rate": 9.96554724521457e-05, + "loss": 3.8634, + "step": 2943 + }, + { + "epoch": 1.897340854149879, + "grad_norm": 2.084229437254759, + "learning_rate": 9.965523614172864e-05, + "loss": 4.1171, + "step": 2944 + }, + { + "epoch": 1.8979854955680904, + "grad_norm": 2.1515854687337814, + "learning_rate": 9.965499975058025e-05, + "loss": 4.1523, + "step": 2945 + }, + { + "epoch": 1.8986301369863012, + "grad_norm": 2.3367171791043035, + "learning_rate": 9.965476327870095e-05, + "loss": 3.679, + "step": 2946 + }, + { + "epoch": 1.8992747784045125, + "grad_norm": 1.7007941454548214, + "learning_rate": 9.965452672609112e-05, + "loss": 3.9734, + "step": 2947 + }, + { + "epoch": 1.8999194198227236, + "grad_norm": 1.3381241554431642, + "learning_rate": 9.965429009275114e-05, + "loss": 4.1659, + "step": 2948 + }, + { + "epoch": 1.9005640612409347, + "grad_norm": 1.373450009919943, + "learning_rate": 9.965405337868139e-05, + "loss": 4.4169, + "step": 2949 + }, + { + "epoch": 1.901208702659146, + "grad_norm": 1.4158723185478321, + "learning_rate": 9.965381658388229e-05, + "loss": 4.2458, + "step": 2950 + }, + { + "epoch": 1.9018533440773568, + "grad_norm": 1.4587966837145385, + "learning_rate": 9.96535797083542e-05, + "loss": 4.1363, + "step": 2951 + }, + { + "epoch": 1.9024979854955681, + "grad_norm": 1.7380474677579856, + "learning_rate": 9.965334275209754e-05, + "loss": 4.2434, + "step": 2952 + }, + { + "epoch": 1.9031426269137792, + "grad_norm": 1.4371929103912398, + "learning_rate": 9.965310571511266e-05, + "loss": 4.0087, + "step": 2953 + }, + { + "epoch": 1.9037872683319903, + "grad_norm": 1.923628903661804, + "learning_rate": 9.965286859739995e-05, + "loss": 4.091, + "step": 2954 + }, + { + "epoch": 1.9044319097502016, + "grad_norm": 1.4739399716274848, + "learning_rate": 9.965263139895985e-05, + "loss": 3.9389, + "step": 2955 + }, + { + "epoch": 1.9050765511684125, + "grad_norm": 1.5175464830836147, + "learning_rate": 9.96523941197927e-05, + "loss": 4.4356, + "step": 2956 + }, + { + "epoch": 1.9057211925866238, + "grad_norm": 2.468172169877143, + "learning_rate": 9.965215675989891e-05, + "loss": 3.8923, + "step": 2957 + }, + { + "epoch": 1.9063658340048348, + "grad_norm": 1.7099092271821486, + "learning_rate": 9.965191931927888e-05, + "loss": 3.9707, + "step": 2958 + }, + { + "epoch": 1.907010475423046, + "grad_norm": 1.403070857050818, + "learning_rate": 9.965168179793297e-05, + "loss": 3.8718, + "step": 2959 + }, + { + "epoch": 1.9076551168412572, + "grad_norm": 2.4147762280779554, + "learning_rate": 9.96514441958616e-05, + "loss": 4.0439, + "step": 2960 + }, + { + "epoch": 1.908299758259468, + "grad_norm": 1.7099517294651385, + "learning_rate": 9.965120651306512e-05, + "loss": 4.1026, + "step": 2961 + }, + { + "epoch": 1.9089443996776794, + "grad_norm": 1.8986331997654071, + "learning_rate": 9.965096874954397e-05, + "loss": 4.1917, + "step": 2962 + }, + { + "epoch": 1.9095890410958904, + "grad_norm": 1.9225471455059977, + "learning_rate": 9.965073090529853e-05, + "loss": 3.7528, + "step": 2963 + }, + { + "epoch": 1.9102336825141015, + "grad_norm": 1.4064169117642877, + "learning_rate": 9.965049298032915e-05, + "loss": 3.8285, + "step": 2964 + }, + { + "epoch": 1.9108783239323126, + "grad_norm": 2.1620210843909833, + "learning_rate": 9.965025497463628e-05, + "loss": 3.9586, + "step": 2965 + }, + { + "epoch": 1.9115229653505237, + "grad_norm": 2.5402014462373645, + "learning_rate": 9.965001688822024e-05, + "loss": 3.8736, + "step": 2966 + }, + { + "epoch": 1.912167606768735, + "grad_norm": 1.8901624290171117, + "learning_rate": 9.964977872108148e-05, + "loss": 4.1608, + "step": 2967 + }, + { + "epoch": 1.912812248186946, + "grad_norm": 1.7828166139434403, + "learning_rate": 9.964954047322039e-05, + "loss": 4.2746, + "step": 2968 + }, + { + "epoch": 1.9134568896051571, + "grad_norm": 1.7918098261701525, + "learning_rate": 9.964930214463732e-05, + "loss": 4.0519, + "step": 2969 + }, + { + "epoch": 1.9141015310233682, + "grad_norm": 1.667724048492519, + "learning_rate": 9.96490637353327e-05, + "loss": 3.9892, + "step": 2970 + }, + { + "epoch": 1.9147461724415793, + "grad_norm": 2.6087948518696367, + "learning_rate": 9.96488252453069e-05, + "loss": 3.7358, + "step": 2971 + }, + { + "epoch": 1.9153908138597906, + "grad_norm": 1.7259740233065761, + "learning_rate": 9.964858667456032e-05, + "loss": 3.8765, + "step": 2972 + }, + { + "epoch": 1.9160354552780015, + "grad_norm": 2.2554784557148033, + "learning_rate": 9.964834802309335e-05, + "loss": 3.8702, + "step": 2973 + }, + { + "epoch": 1.9166800966962128, + "grad_norm": 2.8976400948416376, + "learning_rate": 9.964810929090638e-05, + "loss": 3.9612, + "step": 2974 + }, + { + "epoch": 1.9173247381144238, + "grad_norm": 2.4659862700304376, + "learning_rate": 9.964787047799981e-05, + "loss": 3.7137, + "step": 2975 + }, + { + "epoch": 1.917969379532635, + "grad_norm": 1.746539161426026, + "learning_rate": 9.964763158437401e-05, + "loss": 3.9976, + "step": 2976 + }, + { + "epoch": 1.9186140209508462, + "grad_norm": 2.4796056242419575, + "learning_rate": 9.964739261002941e-05, + "loss": 4.1968, + "step": 2977 + }, + { + "epoch": 1.919258662369057, + "grad_norm": 2.8185907220586315, + "learning_rate": 9.964715355496638e-05, + "loss": 3.5574, + "step": 2978 + }, + { + "epoch": 1.9199033037872684, + "grad_norm": 2.4182365971413384, + "learning_rate": 9.964691441918528e-05, + "loss": 4.1515, + "step": 2979 + }, + { + "epoch": 1.9205479452054794, + "grad_norm": 2.1280948481909703, + "learning_rate": 9.964667520268658e-05, + "loss": 4.1447, + "step": 2980 + }, + { + "epoch": 1.9211925866236905, + "grad_norm": 2.8585726053877147, + "learning_rate": 9.964643590547062e-05, + "loss": 4.1046, + "step": 2981 + }, + { + "epoch": 1.9218372280419018, + "grad_norm": 1.9361057516230786, + "learning_rate": 9.964619652753779e-05, + "loss": 4.0365, + "step": 2982 + }, + { + "epoch": 1.9224818694601127, + "grad_norm": 2.213056921877274, + "learning_rate": 9.964595706888851e-05, + "loss": 3.6582, + "step": 2983 + }, + { + "epoch": 1.923126510878324, + "grad_norm": 1.7066598307532908, + "learning_rate": 9.964571752952313e-05, + "loss": 3.8864, + "step": 2984 + }, + { + "epoch": 1.923771152296535, + "grad_norm": 1.8292083962498344, + "learning_rate": 9.96454779094421e-05, + "loss": 3.9774, + "step": 2985 + }, + { + "epoch": 1.9244157937147461, + "grad_norm": 1.4512155214839828, + "learning_rate": 9.964523820864577e-05, + "loss": 3.9207, + "step": 2986 + }, + { + "epoch": 1.9250604351329574, + "grad_norm": 1.4423828093461215, + "learning_rate": 9.964499842713456e-05, + "loss": 3.9841, + "step": 2987 + }, + { + "epoch": 1.9257050765511683, + "grad_norm": 1.3255017166502105, + "learning_rate": 9.964475856490885e-05, + "loss": 4.0279, + "step": 2988 + }, + { + "epoch": 1.9263497179693796, + "grad_norm": 1.3554294433580778, + "learning_rate": 9.964451862196904e-05, + "loss": 3.731, + "step": 2989 + }, + { + "epoch": 1.9269943593875907, + "grad_norm": 1.5485273677940654, + "learning_rate": 9.964427859831552e-05, + "loss": 3.8891, + "step": 2990 + }, + { + "epoch": 1.9276390008058018, + "grad_norm": 1.6711699661589507, + "learning_rate": 9.96440384939487e-05, + "loss": 3.8681, + "step": 2991 + }, + { + "epoch": 1.928283642224013, + "grad_norm": 1.7736966947462318, + "learning_rate": 9.964379830886893e-05, + "loss": 4.0909, + "step": 2992 + }, + { + "epoch": 1.928928283642224, + "grad_norm": 1.8522300823206805, + "learning_rate": 9.964355804307664e-05, + "loss": 4.0593, + "step": 2993 + }, + { + "epoch": 1.9295729250604352, + "grad_norm": 1.7332150804782118, + "learning_rate": 9.964331769657224e-05, + "loss": 4.0558, + "step": 2994 + }, + { + "epoch": 1.9302175664786463, + "grad_norm": 1.2334220278243346, + "learning_rate": 9.964307726935608e-05, + "loss": 3.8181, + "step": 2995 + }, + { + "epoch": 1.9308622078968574, + "grad_norm": 2.2493963563832993, + "learning_rate": 9.96428367614286e-05, + "loss": 4.0088, + "step": 2996 + }, + { + "epoch": 1.9315068493150684, + "grad_norm": 2.0245331783271023, + "learning_rate": 9.964259617279015e-05, + "loss": 4.4333, + "step": 2997 + }, + { + "epoch": 1.9321514907332795, + "grad_norm": 1.832372142314015, + "learning_rate": 9.964235550344117e-05, + "loss": 3.8099, + "step": 2998 + }, + { + "epoch": 1.9327961321514908, + "grad_norm": 2.1449659204573948, + "learning_rate": 9.964211475338203e-05, + "loss": 3.8797, + "step": 2999 + }, + { + "epoch": 1.933440773569702, + "grad_norm": 2.148972505499396, + "learning_rate": 9.964187392261313e-05, + "loss": 4.1189, + "step": 3000 + }, + { + "epoch": 1.933440773569702, + "eval_loss": 4.063990592956543, + "eval_runtime": 2.9574, + "eval_samples_per_second": 33.813, + "eval_steps_per_second": 4.396, + "step": 3000 + }, + { + "epoch": 1.934085414987913, + "grad_norm": 1.5532155279814486, + "learning_rate": 9.964163301113485e-05, + "loss": 3.6232, + "step": 3001 + }, + { + "epoch": 1.934730056406124, + "grad_norm": 2.0823177175583667, + "learning_rate": 9.964139201894761e-05, + "loss": 3.9773, + "step": 3002 + }, + { + "epoch": 1.9353746978243351, + "grad_norm": 1.7825033329863025, + "learning_rate": 9.96411509460518e-05, + "loss": 3.8733, + "step": 3003 + }, + { + "epoch": 1.9360193392425464, + "grad_norm": 1.5342949643636339, + "learning_rate": 9.964090979244781e-05, + "loss": 4.2013, + "step": 3004 + }, + { + "epoch": 1.9366639806607573, + "grad_norm": 1.5669951884315156, + "learning_rate": 9.964066855813605e-05, + "loss": 3.8977, + "step": 3005 + }, + { + "epoch": 1.9373086220789686, + "grad_norm": 1.527169179103903, + "learning_rate": 9.964042724311689e-05, + "loss": 3.9656, + "step": 3006 + }, + { + "epoch": 1.9379532634971797, + "grad_norm": 1.4389303143556487, + "learning_rate": 9.964018584739073e-05, + "loss": 3.9779, + "step": 3007 + }, + { + "epoch": 1.9385979049153907, + "grad_norm": 1.7462421542918916, + "learning_rate": 9.9639944370958e-05, + "loss": 3.9359, + "step": 3008 + }, + { + "epoch": 1.939242546333602, + "grad_norm": 1.5791749785434177, + "learning_rate": 9.963970281381906e-05, + "loss": 4.2335, + "step": 3009 + }, + { + "epoch": 1.939887187751813, + "grad_norm": 1.2954385618999047, + "learning_rate": 9.963946117597432e-05, + "loss": 3.8108, + "step": 3010 + }, + { + "epoch": 1.9405318291700242, + "grad_norm": 1.8139554678048824, + "learning_rate": 9.96392194574242e-05, + "loss": 3.7224, + "step": 3011 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.6057548091578797, + "learning_rate": 9.963897765816905e-05, + "loss": 4.0559, + "step": 3012 + }, + { + "epoch": 1.9418211120064464, + "grad_norm": 1.4665562948650093, + "learning_rate": 9.96387357782093e-05, + "loss": 4.1969, + "step": 3013 + }, + { + "epoch": 1.9424657534246577, + "grad_norm": 1.29961559263397, + "learning_rate": 9.963849381754534e-05, + "loss": 3.9717, + "step": 3014 + }, + { + "epoch": 1.9431103948428685, + "grad_norm": 1.6991559148116977, + "learning_rate": 9.963825177617755e-05, + "loss": 3.9111, + "step": 3015 + }, + { + "epoch": 1.9437550362610798, + "grad_norm": 1.2635436982825718, + "learning_rate": 9.963800965410638e-05, + "loss": 3.8941, + "step": 3016 + }, + { + "epoch": 1.944399677679291, + "grad_norm": 1.7371825424791563, + "learning_rate": 9.963776745133216e-05, + "loss": 4.0633, + "step": 3017 + }, + { + "epoch": 1.945044319097502, + "grad_norm": 1.9014808047982785, + "learning_rate": 9.963752516785534e-05, + "loss": 3.9772, + "step": 3018 + }, + { + "epoch": 1.9456889605157133, + "grad_norm": 1.4292322053514757, + "learning_rate": 9.963728280367629e-05, + "loss": 3.9071, + "step": 3019 + }, + { + "epoch": 1.9463336019339241, + "grad_norm": 1.6121471515541328, + "learning_rate": 9.96370403587954e-05, + "loss": 3.8176, + "step": 3020 + }, + { + "epoch": 1.9469782433521354, + "grad_norm": 2.0037016911889935, + "learning_rate": 9.96367978332131e-05, + "loss": 3.888, + "step": 3021 + }, + { + "epoch": 1.9476228847703465, + "grad_norm": 1.4604876667694395, + "learning_rate": 9.963655522692976e-05, + "loss": 3.9189, + "step": 3022 + }, + { + "epoch": 1.9482675261885576, + "grad_norm": 2.052429847162598, + "learning_rate": 9.963631253994582e-05, + "loss": 3.8924, + "step": 3023 + }, + { + "epoch": 1.9489121676067689, + "grad_norm": 2.3899401885544678, + "learning_rate": 9.963606977226161e-05, + "loss": 3.4628, + "step": 3024 + }, + { + "epoch": 1.9495568090249797, + "grad_norm": 2.1303583690339343, + "learning_rate": 9.963582692387759e-05, + "loss": 4.0376, + "step": 3025 + }, + { + "epoch": 1.950201450443191, + "grad_norm": 2.091648818079748, + "learning_rate": 9.963558399479412e-05, + "loss": 3.9755, + "step": 3026 + }, + { + "epoch": 1.9508460918614021, + "grad_norm": 1.6063594220484538, + "learning_rate": 9.963534098501163e-05, + "loss": 3.7866, + "step": 3027 + }, + { + "epoch": 1.9514907332796132, + "grad_norm": 2.0274614795160115, + "learning_rate": 9.963509789453051e-05, + "loss": 4.0236, + "step": 3028 + }, + { + "epoch": 1.9521353746978245, + "grad_norm": 2.3708011502891324, + "learning_rate": 9.963485472335115e-05, + "loss": 3.9516, + "step": 3029 + }, + { + "epoch": 1.9527800161160354, + "grad_norm": 1.9886719463877085, + "learning_rate": 9.963461147147393e-05, + "loss": 3.7456, + "step": 3030 + }, + { + "epoch": 1.9534246575342467, + "grad_norm": 2.192144848182242, + "learning_rate": 9.963436813889929e-05, + "loss": 4.0065, + "step": 3031 + }, + { + "epoch": 1.9540692989524577, + "grad_norm": 2.1778210498016337, + "learning_rate": 9.963412472562761e-05, + "loss": 3.655, + "step": 3032 + }, + { + "epoch": 1.9547139403706688, + "grad_norm": 1.6496433626534206, + "learning_rate": 9.963388123165931e-05, + "loss": 3.6213, + "step": 3033 + }, + { + "epoch": 1.95535858178888, + "grad_norm": 1.6376283009022732, + "learning_rate": 9.963363765699475e-05, + "loss": 3.9909, + "step": 3034 + }, + { + "epoch": 1.956003223207091, + "grad_norm": 1.724166956172561, + "learning_rate": 9.963339400163436e-05, + "loss": 3.7757, + "step": 3035 + }, + { + "epoch": 1.9566478646253023, + "grad_norm": 1.7013414032130216, + "learning_rate": 9.963315026557853e-05, + "loss": 3.9137, + "step": 3036 + }, + { + "epoch": 1.9572925060435133, + "grad_norm": 1.5887983632893996, + "learning_rate": 9.963290644882766e-05, + "loss": 4.0649, + "step": 3037 + }, + { + "epoch": 1.9579371474617244, + "grad_norm": 1.961904028319742, + "learning_rate": 9.963266255138218e-05, + "loss": 3.8692, + "step": 3038 + }, + { + "epoch": 1.9585817888799355, + "grad_norm": 1.3189514294322642, + "learning_rate": 9.963241857324243e-05, + "loss": 3.9663, + "step": 3039 + }, + { + "epoch": 1.9592264302981466, + "grad_norm": 1.5462653850054564, + "learning_rate": 9.963217451440885e-05, + "loss": 4.2258, + "step": 3040 + }, + { + "epoch": 1.9598710717163579, + "grad_norm": 1.527459266188919, + "learning_rate": 9.963193037488184e-05, + "loss": 4.0607, + "step": 3041 + }, + { + "epoch": 1.9605157131345687, + "grad_norm": 1.7474222228710665, + "learning_rate": 9.963168615466178e-05, + "loss": 3.9514, + "step": 3042 + }, + { + "epoch": 1.96116035455278, + "grad_norm": 1.8984562685023523, + "learning_rate": 9.963144185374911e-05, + "loss": 3.9803, + "step": 3043 + }, + { + "epoch": 1.9618049959709911, + "grad_norm": 1.3709752079466337, + "learning_rate": 9.963119747214421e-05, + "loss": 4.161, + "step": 3044 + }, + { + "epoch": 1.9624496373892022, + "grad_norm": 1.366455693226654, + "learning_rate": 9.963095300984748e-05, + "loss": 3.8918, + "step": 3045 + }, + { + "epoch": 1.9630942788074135, + "grad_norm": 1.6452759078566443, + "learning_rate": 9.96307084668593e-05, + "loss": 3.9736, + "step": 3046 + }, + { + "epoch": 1.9637389202256244, + "grad_norm": 1.24912226369968, + "learning_rate": 9.963046384318011e-05, + "loss": 3.6664, + "step": 3047 + }, + { + "epoch": 1.9643835616438357, + "grad_norm": 1.426126771871605, + "learning_rate": 9.963021913881028e-05, + "loss": 3.8402, + "step": 3048 + }, + { + "epoch": 1.9650282030620467, + "grad_norm": 1.6743880337578732, + "learning_rate": 9.962997435375023e-05, + "loss": 4.0867, + "step": 3049 + }, + { + "epoch": 1.9656728444802578, + "grad_norm": 1.5153006675051583, + "learning_rate": 9.962972948800037e-05, + "loss": 3.7304, + "step": 3050 + }, + { + "epoch": 1.966317485898469, + "grad_norm": 2.0892316808383704, + "learning_rate": 9.962948454156108e-05, + "loss": 3.8042, + "step": 3051 + }, + { + "epoch": 1.96696212731668, + "grad_norm": 1.9804410264473153, + "learning_rate": 9.962923951443277e-05, + "loss": 4.3738, + "step": 3052 + }, + { + "epoch": 1.9676067687348913, + "grad_norm": 1.7526805068859115, + "learning_rate": 9.962899440661586e-05, + "loss": 3.8282, + "step": 3053 + }, + { + "epoch": 1.9682514101531023, + "grad_norm": 2.6427164728282624, + "learning_rate": 9.962874921811074e-05, + "loss": 3.9122, + "step": 3054 + }, + { + "epoch": 1.9688960515713134, + "grad_norm": 2.1924420284586885, + "learning_rate": 9.96285039489178e-05, + "loss": 4.0517, + "step": 3055 + }, + { + "epoch": 1.9695406929895247, + "grad_norm": 1.9063377156578787, + "learning_rate": 9.962825859903745e-05, + "loss": 4.4195, + "step": 3056 + }, + { + "epoch": 1.9701853344077356, + "grad_norm": 1.7783927182143457, + "learning_rate": 9.96280131684701e-05, + "loss": 3.7961, + "step": 3057 + }, + { + "epoch": 1.9708299758259469, + "grad_norm": 1.9062392279724658, + "learning_rate": 9.962776765721616e-05, + "loss": 4.0704, + "step": 3058 + }, + { + "epoch": 1.971474617244158, + "grad_norm": 2.945923074073166, + "learning_rate": 9.9627522065276e-05, + "loss": 3.9132, + "step": 3059 + }, + { + "epoch": 1.972119258662369, + "grad_norm": 2.0529927374107877, + "learning_rate": 9.962727639265006e-05, + "loss": 3.9046, + "step": 3060 + }, + { + "epoch": 1.9727639000805803, + "grad_norm": 2.3472510207267443, + "learning_rate": 9.962703063933874e-05, + "loss": 4.0793, + "step": 3061 + }, + { + "epoch": 1.9734085414987912, + "grad_norm": 2.1680846540434127, + "learning_rate": 9.962678480534242e-05, + "loss": 4.0066, + "step": 3062 + }, + { + "epoch": 1.9740531829170025, + "grad_norm": 2.051836916447928, + "learning_rate": 9.962653889066152e-05, + "loss": 3.9847, + "step": 3063 + }, + { + "epoch": 1.9746978243352136, + "grad_norm": 2.2519901016648864, + "learning_rate": 9.962629289529643e-05, + "loss": 3.6941, + "step": 3064 + }, + { + "epoch": 1.9753424657534246, + "grad_norm": 2.117573580122024, + "learning_rate": 9.962604681924757e-05, + "loss": 4.0141, + "step": 3065 + }, + { + "epoch": 1.9759871071716357, + "grad_norm": 2.859047593364294, + "learning_rate": 9.962580066251536e-05, + "loss": 3.8948, + "step": 3066 + }, + { + "epoch": 1.9766317485898468, + "grad_norm": 2.1039016080506467, + "learning_rate": 9.962555442510016e-05, + "loss": 4.1668, + "step": 3067 + }, + { + "epoch": 1.977276390008058, + "grad_norm": 2.093709771544918, + "learning_rate": 9.96253081070024e-05, + "loss": 3.7713, + "step": 3068 + }, + { + "epoch": 1.9779210314262692, + "grad_norm": 3.0901100945952864, + "learning_rate": 9.962506170822248e-05, + "loss": 3.8004, + "step": 3069 + }, + { + "epoch": 1.9785656728444803, + "grad_norm": 1.8779323171275522, + "learning_rate": 9.962481522876082e-05, + "loss": 3.847, + "step": 3070 + }, + { + "epoch": 1.9792103142626913, + "grad_norm": 1.893633892358618, + "learning_rate": 9.962456866861782e-05, + "loss": 4.0441, + "step": 3071 + }, + { + "epoch": 1.9798549556809024, + "grad_norm": 1.582240321565105, + "learning_rate": 9.962432202779384e-05, + "loss": 4.0094, + "step": 3072 + }, + { + "epoch": 1.9804995970991137, + "grad_norm": 1.6627234993315747, + "learning_rate": 9.962407530628936e-05, + "loss": 3.7459, + "step": 3073 + }, + { + "epoch": 1.9811442385173246, + "grad_norm": 1.2844459391824288, + "learning_rate": 9.962382850410474e-05, + "loss": 4.071, + "step": 3074 + }, + { + "epoch": 1.9817888799355359, + "grad_norm": 1.481981830459369, + "learning_rate": 9.962358162124036e-05, + "loss": 3.7406, + "step": 3075 + }, + { + "epoch": 1.982433521353747, + "grad_norm": 1.3456940641671853, + "learning_rate": 9.962333465769669e-05, + "loss": 4.0311, + "step": 3076 + }, + { + "epoch": 1.983078162771958, + "grad_norm": 1.519358532967883, + "learning_rate": 9.96230876134741e-05, + "loss": 3.7556, + "step": 3077 + }, + { + "epoch": 1.9837228041901693, + "grad_norm": 1.4385371975943073, + "learning_rate": 9.9622840488573e-05, + "loss": 3.745, + "step": 3078 + }, + { + "epoch": 1.9843674456083802, + "grad_norm": 1.424632741065701, + "learning_rate": 9.962259328299378e-05, + "loss": 4.0305, + "step": 3079 + }, + { + "epoch": 1.9850120870265915, + "grad_norm": 1.6201997227926141, + "learning_rate": 9.962234599673688e-05, + "loss": 3.7743, + "step": 3080 + }, + { + "epoch": 1.9856567284448026, + "grad_norm": 1.3099408045798222, + "learning_rate": 9.962209862980268e-05, + "loss": 4.1459, + "step": 3081 + }, + { + "epoch": 1.9863013698630136, + "grad_norm": 1.361384367811755, + "learning_rate": 9.962185118219157e-05, + "loss": 4.1843, + "step": 3082 + }, + { + "epoch": 1.986946011281225, + "grad_norm": 1.3552525010883303, + "learning_rate": 9.962160365390402e-05, + "loss": 3.8729, + "step": 3083 + }, + { + "epoch": 1.9875906526994358, + "grad_norm": 1.5003185111081128, + "learning_rate": 9.962135604494036e-05, + "loss": 3.952, + "step": 3084 + }, + { + "epoch": 1.988235294117647, + "grad_norm": 1.5884952038569127, + "learning_rate": 9.962110835530105e-05, + "loss": 4.226, + "step": 3085 + }, + { + "epoch": 1.9888799355358582, + "grad_norm": 1.9317686960498617, + "learning_rate": 9.962086058498648e-05, + "loss": 3.8741, + "step": 3086 + }, + { + "epoch": 1.9895245769540693, + "grad_norm": 1.3914991886450692, + "learning_rate": 9.962061273399706e-05, + "loss": 4.0957, + "step": 3087 + }, + { + "epoch": 1.9901692183722806, + "grad_norm": 2.5373722768120377, + "learning_rate": 9.962036480233318e-05, + "loss": 4.0364, + "step": 3088 + }, + { + "epoch": 1.9908138597904914, + "grad_norm": 2.4839263894674466, + "learning_rate": 9.962011678999529e-05, + "loss": 3.5082, + "step": 3089 + }, + { + "epoch": 1.9914585012087027, + "grad_norm": 1.7451461079398076, + "learning_rate": 9.961986869698373e-05, + "loss": 3.7052, + "step": 3090 + }, + { + "epoch": 1.9921031426269138, + "grad_norm": 1.907625181812559, + "learning_rate": 9.961962052329897e-05, + "loss": 4.1322, + "step": 3091 + }, + { + "epoch": 1.9927477840451249, + "grad_norm": 1.43733557451856, + "learning_rate": 9.961937226894138e-05, + "loss": 4.0193, + "step": 3092 + }, + { + "epoch": 1.9933924254633362, + "grad_norm": 2.0218507934701404, + "learning_rate": 9.96191239339114e-05, + "loss": 3.9994, + "step": 3093 + }, + { + "epoch": 1.994037066881547, + "grad_norm": 3.141425618113301, + "learning_rate": 9.961887551820941e-05, + "loss": 4.1577, + "step": 3094 + }, + { + "epoch": 1.9946817082997583, + "grad_norm": 2.1451860856792426, + "learning_rate": 9.961862702183583e-05, + "loss": 3.9637, + "step": 3095 + }, + { + "epoch": 1.9953263497179694, + "grad_norm": 1.9990115650503768, + "learning_rate": 9.961837844479106e-05, + "loss": 4.2574, + "step": 3096 + }, + { + "epoch": 1.9959709911361805, + "grad_norm": 2.2932504655376733, + "learning_rate": 9.96181297870755e-05, + "loss": 3.8254, + "step": 3097 + }, + { + "epoch": 1.9966156325543918, + "grad_norm": 1.5478370234673406, + "learning_rate": 9.961788104868959e-05, + "loss": 3.8197, + "step": 3098 + }, + { + "epoch": 1.9972602739726026, + "grad_norm": 1.9179408357660197, + "learning_rate": 9.961763222963372e-05, + "loss": 3.7068, + "step": 3099 + }, + { + "epoch": 1.997904915390814, + "grad_norm": 1.359037883897292, + "learning_rate": 9.961738332990828e-05, + "loss": 4.2017, + "step": 3100 + }, + { + "epoch": 1.997904915390814, + "eval_loss": 4.050742149353027, + "eval_runtime": 2.9693, + "eval_samples_per_second": 33.678, + "eval_steps_per_second": 4.378, + "step": 3100 + }, + { + "epoch": 1.998549556809025, + "grad_norm": 1.7102188385142614, + "learning_rate": 9.961713434951372e-05, + "loss": 3.6826, + "step": 3101 + }, + { + "epoch": 1.999194198227236, + "grad_norm": 1.3973127917581043, + "learning_rate": 9.961688528845042e-05, + "loss": 3.9961, + "step": 3102 + }, + { + "epoch": 1.9998388396454472, + "grad_norm": 1.7514343905717853, + "learning_rate": 9.961663614671879e-05, + "loss": 3.8732, + "step": 3103 + }, + { + "epoch": 2.0, + "grad_norm": 1.7514343905717853, + "learning_rate": 9.961638692431926e-05, + "loss": 1.0391, + "step": 3104 + }, + { + "epoch": 2.0006446414182113, + "grad_norm": 1.5493612543763264, + "learning_rate": 9.961613762125222e-05, + "loss": 3.7962, + "step": 3105 + }, + { + "epoch": 2.001289282836422, + "grad_norm": 1.8925405054002247, + "learning_rate": 9.961588823751807e-05, + "loss": 3.6246, + "step": 3106 + }, + { + "epoch": 2.0019339242546335, + "grad_norm": 1.5963581417179822, + "learning_rate": 9.961563877311723e-05, + "loss": 4.2104, + "step": 3107 + }, + { + "epoch": 2.0025785656728443, + "grad_norm": 2.3119740575479373, + "learning_rate": 9.961538922805013e-05, + "loss": 3.2351, + "step": 3108 + }, + { + "epoch": 2.0032232070910556, + "grad_norm": 2.797052403156984, + "learning_rate": 9.961513960231715e-05, + "loss": 3.425, + "step": 3109 + }, + { + "epoch": 2.003867848509267, + "grad_norm": 3.4414533866563386, + "learning_rate": 9.961488989591871e-05, + "loss": 3.9141, + "step": 3110 + }, + { + "epoch": 2.0045124899274778, + "grad_norm": 2.508562929576956, + "learning_rate": 9.961464010885526e-05, + "loss": 3.888, + "step": 3111 + }, + { + "epoch": 2.005157131345689, + "grad_norm": 1.7124637404050358, + "learning_rate": 9.961439024112713e-05, + "loss": 3.7528, + "step": 3112 + }, + { + "epoch": 2.0058017727639, + "grad_norm": 2.062374596900836, + "learning_rate": 9.96141402927348e-05, + "loss": 3.573, + "step": 3113 + }, + { + "epoch": 2.0064464141821112, + "grad_norm": 2.7855261941494347, + "learning_rate": 9.961389026367862e-05, + "loss": 3.7238, + "step": 3114 + }, + { + "epoch": 2.0070910556003225, + "grad_norm": 2.1592747614556296, + "learning_rate": 9.961364015395907e-05, + "loss": 4.1737, + "step": 3115 + }, + { + "epoch": 2.0077356970185334, + "grad_norm": 1.901376374787019, + "learning_rate": 9.961338996357651e-05, + "loss": 3.762, + "step": 3116 + }, + { + "epoch": 2.0083803384367447, + "grad_norm": 1.9957794384633538, + "learning_rate": 9.961313969253137e-05, + "loss": 3.9975, + "step": 3117 + }, + { + "epoch": 2.0090249798549555, + "grad_norm": 1.3964724892806784, + "learning_rate": 9.961288934082405e-05, + "loss": 4.3769, + "step": 3118 + }, + { + "epoch": 2.009669621273167, + "grad_norm": 1.990150412591947, + "learning_rate": 9.961263890845499e-05, + "loss": 3.7323, + "step": 3119 + }, + { + "epoch": 2.010314262691378, + "grad_norm": 1.9863771171172877, + "learning_rate": 9.961238839542455e-05, + "loss": 4.0484, + "step": 3120 + }, + { + "epoch": 2.010958904109589, + "grad_norm": 2.030603782459638, + "learning_rate": 9.961213780173318e-05, + "loss": 3.8171, + "step": 3121 + }, + { + "epoch": 2.0116035455278003, + "grad_norm": 1.5594669838687991, + "learning_rate": 9.96118871273813e-05, + "loss": 4.2233, + "step": 3122 + }, + { + "epoch": 2.012248186946011, + "grad_norm": 2.0057683430399846, + "learning_rate": 9.961163637236929e-05, + "loss": 3.1637, + "step": 3123 + }, + { + "epoch": 2.0128928283642225, + "grad_norm": 1.5715036212197062, + "learning_rate": 9.961138553669757e-05, + "loss": 3.551, + "step": 3124 + }, + { + "epoch": 2.0135374697824333, + "grad_norm": 2.5310226217828946, + "learning_rate": 9.961113462036656e-05, + "loss": 4.0463, + "step": 3125 + }, + { + "epoch": 2.0141821112006446, + "grad_norm": 1.996377753793291, + "learning_rate": 9.961088362337669e-05, + "loss": 3.668, + "step": 3126 + }, + { + "epoch": 2.014826752618856, + "grad_norm": 1.797366603619829, + "learning_rate": 9.961063254572834e-05, + "loss": 3.6841, + "step": 3127 + }, + { + "epoch": 2.0154713940370668, + "grad_norm": 2.1371304580725337, + "learning_rate": 9.961038138742193e-05, + "loss": 3.7365, + "step": 3128 + }, + { + "epoch": 2.016116035455278, + "grad_norm": 1.7958392312667737, + "learning_rate": 9.961013014845789e-05, + "loss": 3.6748, + "step": 3129 + }, + { + "epoch": 2.016760676873489, + "grad_norm": 2.0827945290991434, + "learning_rate": 9.960987882883661e-05, + "loss": 3.9395, + "step": 3130 + }, + { + "epoch": 2.0174053182917002, + "grad_norm": 1.945071280754476, + "learning_rate": 9.960962742855852e-05, + "loss": 3.6554, + "step": 3131 + }, + { + "epoch": 2.0180499597099115, + "grad_norm": 2.0506000359649836, + "learning_rate": 9.960937594762401e-05, + "loss": 3.7213, + "step": 3132 + }, + { + "epoch": 2.0186946011281224, + "grad_norm": 1.2314255304302382, + "learning_rate": 9.960912438603352e-05, + "loss": 3.763, + "step": 3133 + }, + { + "epoch": 2.0193392425463337, + "grad_norm": 1.784536586009585, + "learning_rate": 9.960887274378745e-05, + "loss": 3.8478, + "step": 3134 + }, + { + "epoch": 2.0199838839645445, + "grad_norm": 1.6502242039036323, + "learning_rate": 9.960862102088623e-05, + "loss": 4.1211, + "step": 3135 + }, + { + "epoch": 2.020628525382756, + "grad_norm": 1.5953942763036864, + "learning_rate": 9.960836921733024e-05, + "loss": 3.8477, + "step": 3136 + }, + { + "epoch": 2.021273166800967, + "grad_norm": 1.4970587402477447, + "learning_rate": 9.960811733311992e-05, + "loss": 3.6495, + "step": 3137 + }, + { + "epoch": 2.021917808219178, + "grad_norm": 1.80733830121271, + "learning_rate": 9.960786536825567e-05, + "loss": 3.7503, + "step": 3138 + }, + { + "epoch": 2.0225624496373893, + "grad_norm": 1.2344404366146104, + "learning_rate": 9.960761332273792e-05, + "loss": 3.9377, + "step": 3139 + }, + { + "epoch": 2.0232070910556, + "grad_norm": 1.715150932145634, + "learning_rate": 9.960736119656706e-05, + "loss": 3.5304, + "step": 3140 + }, + { + "epoch": 2.0238517324738114, + "grad_norm": 1.1862009030888896, + "learning_rate": 9.960710898974352e-05, + "loss": 3.9201, + "step": 3141 + }, + { + "epoch": 2.0244963738920227, + "grad_norm": 1.7370766279016105, + "learning_rate": 9.960685670226771e-05, + "loss": 3.7421, + "step": 3142 + }, + { + "epoch": 2.0251410153102336, + "grad_norm": 1.4014054725402911, + "learning_rate": 9.960660433414008e-05, + "loss": 3.8427, + "step": 3143 + }, + { + "epoch": 2.025785656728445, + "grad_norm": 1.421394191441674, + "learning_rate": 9.960635188536099e-05, + "loss": 3.9895, + "step": 3144 + }, + { + "epoch": 2.0264302981466558, + "grad_norm": 1.5767551099580064, + "learning_rate": 9.960609935593085e-05, + "loss": 3.8549, + "step": 3145 + }, + { + "epoch": 2.027074939564867, + "grad_norm": 1.4697180698895431, + "learning_rate": 9.960584674585013e-05, + "loss": 3.8342, + "step": 3146 + }, + { + "epoch": 2.0277195809830784, + "grad_norm": 1.4740667500750526, + "learning_rate": 9.96055940551192e-05, + "loss": 3.6387, + "step": 3147 + }, + { + "epoch": 2.028364222401289, + "grad_norm": 1.6019871689207206, + "learning_rate": 9.96053412837385e-05, + "loss": 3.7915, + "step": 3148 + }, + { + "epoch": 2.0290088638195005, + "grad_norm": 1.4265299556905842, + "learning_rate": 9.960508843170844e-05, + "loss": 3.9879, + "step": 3149 + }, + { + "epoch": 2.0296535052377114, + "grad_norm": 1.2105964627247034, + "learning_rate": 9.960483549902941e-05, + "loss": 4.0562, + "step": 3150 + }, + { + "epoch": 2.0302981466559227, + "grad_norm": 1.5775320113079148, + "learning_rate": 9.960458248570185e-05, + "loss": 3.7056, + "step": 3151 + }, + { + "epoch": 2.030942788074134, + "grad_norm": 1.1283755525725412, + "learning_rate": 9.960432939172618e-05, + "loss": 3.5561, + "step": 3152 + }, + { + "epoch": 2.031587429492345, + "grad_norm": 1.5922726744149864, + "learning_rate": 9.960407621710281e-05, + "loss": 3.904, + "step": 3153 + }, + { + "epoch": 2.032232070910556, + "grad_norm": 1.688041993773696, + "learning_rate": 9.960382296183215e-05, + "loss": 3.9577, + "step": 3154 + }, + { + "epoch": 2.032876712328767, + "grad_norm": 1.4890852393602279, + "learning_rate": 9.960356962591462e-05, + "loss": 4.1282, + "step": 3155 + }, + { + "epoch": 2.0335213537469783, + "grad_norm": 1.7460856555489834, + "learning_rate": 9.960331620935064e-05, + "loss": 3.7665, + "step": 3156 + }, + { + "epoch": 2.034165995165189, + "grad_norm": 1.82217667530889, + "learning_rate": 9.96030627121406e-05, + "loss": 3.8432, + "step": 3157 + }, + { + "epoch": 2.0348106365834004, + "grad_norm": 1.2647395648768127, + "learning_rate": 9.960280913428496e-05, + "loss": 3.9434, + "step": 3158 + }, + { + "epoch": 2.0354552780016117, + "grad_norm": 1.6673596743173953, + "learning_rate": 9.96025554757841e-05, + "loss": 3.8791, + "step": 3159 + }, + { + "epoch": 2.0360999194198226, + "grad_norm": 2.0274109959733693, + "learning_rate": 9.960230173663847e-05, + "loss": 3.9734, + "step": 3160 + }, + { + "epoch": 2.036744560838034, + "grad_norm": 1.7926129252674028, + "learning_rate": 9.960204791684844e-05, + "loss": 3.7713, + "step": 3161 + }, + { + "epoch": 2.0373892022562448, + "grad_norm": 1.7399139393752194, + "learning_rate": 9.960179401641446e-05, + "loss": 3.9894, + "step": 3162 + }, + { + "epoch": 2.038033843674456, + "grad_norm": 2.5267345542270387, + "learning_rate": 9.960154003533698e-05, + "loss": 4.0132, + "step": 3163 + }, + { + "epoch": 2.0386784850926674, + "grad_norm": 1.6434609686352648, + "learning_rate": 9.960128597361633e-05, + "loss": 3.6518, + "step": 3164 + }, + { + "epoch": 2.039323126510878, + "grad_norm": 2.806025497490414, + "learning_rate": 9.960103183125299e-05, + "loss": 3.664, + "step": 3165 + }, + { + "epoch": 2.0399677679290895, + "grad_norm": 2.4237543598924582, + "learning_rate": 9.960077760824736e-05, + "loss": 3.9845, + "step": 3166 + }, + { + "epoch": 2.0406124093473004, + "grad_norm": 1.9498744416458713, + "learning_rate": 9.960052330459988e-05, + "loss": 4.1108, + "step": 3167 + }, + { + "epoch": 2.0412570507655117, + "grad_norm": 1.8482416463332287, + "learning_rate": 9.960026892031093e-05, + "loss": 4.0194, + "step": 3168 + }, + { + "epoch": 2.041901692183723, + "grad_norm": 1.8450185131649972, + "learning_rate": 9.960001445538094e-05, + "loss": 3.6467, + "step": 3169 + }, + { + "epoch": 2.042546333601934, + "grad_norm": 2.181684399712338, + "learning_rate": 9.959975990981035e-05, + "loss": 3.8778, + "step": 3170 + }, + { + "epoch": 2.043190975020145, + "grad_norm": 1.8486985725677665, + "learning_rate": 9.959950528359955e-05, + "loss": 4.007, + "step": 3171 + }, + { + "epoch": 2.043835616438356, + "grad_norm": 2.281836515754561, + "learning_rate": 9.959925057674899e-05, + "loss": 4.0623, + "step": 3172 + }, + { + "epoch": 2.0444802578565673, + "grad_norm": 1.5206834097005923, + "learning_rate": 9.959899578925905e-05, + "loss": 3.7514, + "step": 3173 + }, + { + "epoch": 2.0451248992747786, + "grad_norm": 2.058035396957572, + "learning_rate": 9.959874092113016e-05, + "loss": 3.6199, + "step": 3174 + }, + { + "epoch": 2.0457695406929894, + "grad_norm": 1.5785744773765664, + "learning_rate": 9.959848597236277e-05, + "loss": 3.6837, + "step": 3175 + }, + { + "epoch": 2.0464141821112007, + "grad_norm": 1.9997842710318592, + "learning_rate": 9.959823094295725e-05, + "loss": 3.7581, + "step": 3176 + }, + { + "epoch": 2.0470588235294116, + "grad_norm": 1.5143247383782161, + "learning_rate": 9.959797583291404e-05, + "loss": 4.123, + "step": 3177 + }, + { + "epoch": 2.047703464947623, + "grad_norm": 2.339931583342558, + "learning_rate": 9.959772064223359e-05, + "loss": 3.7704, + "step": 3178 + }, + { + "epoch": 2.048348106365834, + "grad_norm": 2.8379632711587495, + "learning_rate": 9.959746537091628e-05, + "loss": 3.8354, + "step": 3179 + }, + { + "epoch": 2.048992747784045, + "grad_norm": 1.8495491398355814, + "learning_rate": 9.959721001896255e-05, + "loss": 3.8547, + "step": 3180 + }, + { + "epoch": 2.0496373892022564, + "grad_norm": 2.6435837024569713, + "learning_rate": 9.95969545863728e-05, + "loss": 3.7492, + "step": 3181 + }, + { + "epoch": 2.050282030620467, + "grad_norm": 1.5992114114302916, + "learning_rate": 9.959669907314745e-05, + "loss": 3.6653, + "step": 3182 + }, + { + "epoch": 2.0509266720386785, + "grad_norm": 2.040572603020579, + "learning_rate": 9.959644347928693e-05, + "loss": 3.3825, + "step": 3183 + }, + { + "epoch": 2.05157131345689, + "grad_norm": 1.4651014667728344, + "learning_rate": 9.959618780479167e-05, + "loss": 3.5473, + "step": 3184 + }, + { + "epoch": 2.0522159548751007, + "grad_norm": 1.636450329797329, + "learning_rate": 9.959593204966207e-05, + "loss": 3.9967, + "step": 3185 + }, + { + "epoch": 2.052860596293312, + "grad_norm": 1.7549278299075854, + "learning_rate": 9.959567621389855e-05, + "loss": 3.9514, + "step": 3186 + }, + { + "epoch": 2.053505237711523, + "grad_norm": 1.3090078513792525, + "learning_rate": 9.959542029750157e-05, + "loss": 3.8929, + "step": 3187 + }, + { + "epoch": 2.054149879129734, + "grad_norm": 1.6547665853733196, + "learning_rate": 9.95951643004715e-05, + "loss": 4.0058, + "step": 3188 + }, + { + "epoch": 2.0547945205479454, + "grad_norm": 1.6074256008016585, + "learning_rate": 9.959490822280878e-05, + "loss": 3.978, + "step": 3189 + }, + { + "epoch": 2.0554391619661563, + "grad_norm": 1.2824486793915935, + "learning_rate": 9.959465206451383e-05, + "loss": 4.3685, + "step": 3190 + }, + { + "epoch": 2.0560838033843676, + "grad_norm": 1.233742329312858, + "learning_rate": 9.959439582558707e-05, + "loss": 4.0253, + "step": 3191 + }, + { + "epoch": 2.0567284448025784, + "grad_norm": 1.4099427644031888, + "learning_rate": 9.959413950602892e-05, + "loss": 3.6297, + "step": 3192 + }, + { + "epoch": 2.0573730862207897, + "grad_norm": 1.4073740688269742, + "learning_rate": 9.95938831058398e-05, + "loss": 3.9378, + "step": 3193 + }, + { + "epoch": 2.0580177276390006, + "grad_norm": 1.7226882039748381, + "learning_rate": 9.959362662502014e-05, + "loss": 3.6901, + "step": 3194 + }, + { + "epoch": 2.058662369057212, + "grad_norm": 1.4074119413150707, + "learning_rate": 9.959337006357036e-05, + "loss": 3.8006, + "step": 3195 + }, + { + "epoch": 2.059307010475423, + "grad_norm": 1.4403184632908137, + "learning_rate": 9.959311342149087e-05, + "loss": 4.0808, + "step": 3196 + }, + { + "epoch": 2.059951651893634, + "grad_norm": 1.580285587333905, + "learning_rate": 9.959285669878208e-05, + "loss": 3.7986, + "step": 3197 + }, + { + "epoch": 2.0605962933118454, + "grad_norm": 1.822269275841511, + "learning_rate": 9.959259989544445e-05, + "loss": 3.4774, + "step": 3198 + }, + { + "epoch": 2.061240934730056, + "grad_norm": 1.7398053829188083, + "learning_rate": 9.95923430114784e-05, + "loss": 3.6574, + "step": 3199 + }, + { + "epoch": 2.0618855761482675, + "grad_norm": 2.2434762029380004, + "learning_rate": 9.959208604688432e-05, + "loss": 3.7072, + "step": 3200 + }, + { + "epoch": 2.0618855761482675, + "eval_loss": 4.0688557624816895, + "eval_runtime": 2.9832, + "eval_samples_per_second": 33.521, + "eval_steps_per_second": 4.358, + "step": 3200 + }, + { + "epoch": 2.062530217566479, + "grad_norm": 2.265630692592984, + "learning_rate": 9.959182900166263e-05, + "loss": 3.832, + "step": 3201 + }, + { + "epoch": 2.0631748589846897, + "grad_norm": 1.4982368788941638, + "learning_rate": 9.959157187581377e-05, + "loss": 3.748, + "step": 3202 + }, + { + "epoch": 2.063819500402901, + "grad_norm": 2.387397793153926, + "learning_rate": 9.959131466933817e-05, + "loss": 4.0712, + "step": 3203 + }, + { + "epoch": 2.064464141821112, + "grad_norm": 1.7208589980404259, + "learning_rate": 9.959105738223623e-05, + "loss": 3.6781, + "step": 3204 + }, + { + "epoch": 2.065108783239323, + "grad_norm": 2.0106327761882987, + "learning_rate": 9.95908000145084e-05, + "loss": 3.7655, + "step": 3205 + }, + { + "epoch": 2.0657534246575344, + "grad_norm": 1.9085728665213926, + "learning_rate": 9.959054256615508e-05, + "loss": 3.5411, + "step": 3206 + }, + { + "epoch": 2.0663980660757453, + "grad_norm": 1.8790134509018313, + "learning_rate": 9.95902850371767e-05, + "loss": 3.8985, + "step": 3207 + }, + { + "epoch": 2.0670427074939566, + "grad_norm": 1.823931807012881, + "learning_rate": 9.959002742757368e-05, + "loss": 3.9029, + "step": 3208 + }, + { + "epoch": 2.0676873489121674, + "grad_norm": 1.7712652793226584, + "learning_rate": 9.958976973734646e-05, + "loss": 3.5579, + "step": 3209 + }, + { + "epoch": 2.0683319903303787, + "grad_norm": 2.109918454165808, + "learning_rate": 9.958951196649544e-05, + "loss": 3.7253, + "step": 3210 + }, + { + "epoch": 2.06897663174859, + "grad_norm": 2.017923885840121, + "learning_rate": 9.958925411502104e-05, + "loss": 3.9261, + "step": 3211 + }, + { + "epoch": 2.069621273166801, + "grad_norm": 2.012461370446795, + "learning_rate": 9.95889961829237e-05, + "loss": 3.4058, + "step": 3212 + }, + { + "epoch": 2.070265914585012, + "grad_norm": 2.2452979963151796, + "learning_rate": 9.958873817020387e-05, + "loss": 3.7935, + "step": 3213 + }, + { + "epoch": 2.070910556003223, + "grad_norm": 1.832336103437912, + "learning_rate": 9.958848007686193e-05, + "loss": 3.8906, + "step": 3214 + }, + { + "epoch": 2.0715551974214343, + "grad_norm": 1.7811368868915005, + "learning_rate": 9.95882219028983e-05, + "loss": 3.8684, + "step": 3215 + }, + { + "epoch": 2.0721998388396456, + "grad_norm": 2.022000802388438, + "learning_rate": 9.958796364831343e-05, + "loss": 3.9824, + "step": 3216 + }, + { + "epoch": 2.0728444802578565, + "grad_norm": 1.3615875549225247, + "learning_rate": 9.958770531310774e-05, + "loss": 3.7762, + "step": 3217 + }, + { + "epoch": 2.073489121676068, + "grad_norm": 1.9768636121754524, + "learning_rate": 9.958744689728164e-05, + "loss": 3.6385, + "step": 3218 + }, + { + "epoch": 2.0741337630942787, + "grad_norm": 1.272398896310214, + "learning_rate": 9.958718840083558e-05, + "loss": 3.5863, + "step": 3219 + }, + { + "epoch": 2.07477840451249, + "grad_norm": 1.7291409209900677, + "learning_rate": 9.958692982376996e-05, + "loss": 3.7992, + "step": 3220 + }, + { + "epoch": 2.0754230459307013, + "grad_norm": 1.7388880222923702, + "learning_rate": 9.958667116608522e-05, + "loss": 3.6835, + "step": 3221 + }, + { + "epoch": 2.076067687348912, + "grad_norm": 1.7199569196062388, + "learning_rate": 9.958641242778178e-05, + "loss": 4.053, + "step": 3222 + }, + { + "epoch": 2.0767123287671234, + "grad_norm": 1.7821123623421222, + "learning_rate": 9.958615360886004e-05, + "loss": 3.6237, + "step": 3223 + }, + { + "epoch": 2.0773569701853343, + "grad_norm": 1.6276623010595723, + "learning_rate": 9.958589470932047e-05, + "loss": 3.9101, + "step": 3224 + }, + { + "epoch": 2.0780016116035456, + "grad_norm": 1.2022223832276135, + "learning_rate": 9.958563572916347e-05, + "loss": 3.807, + "step": 3225 + }, + { + "epoch": 2.0786462530217564, + "grad_norm": 1.6130425857202662, + "learning_rate": 9.958537666838948e-05, + "loss": 3.9288, + "step": 3226 + }, + { + "epoch": 2.0792908944399677, + "grad_norm": 1.4736155397718766, + "learning_rate": 9.958511752699889e-05, + "loss": 4.0733, + "step": 3227 + }, + { + "epoch": 2.079935535858179, + "grad_norm": 1.3395722470820837, + "learning_rate": 9.958485830499218e-05, + "loss": 3.8741, + "step": 3228 + }, + { + "epoch": 2.08058017727639, + "grad_norm": 1.6624349759662624, + "learning_rate": 9.958459900236973e-05, + "loss": 3.9753, + "step": 3229 + }, + { + "epoch": 2.081224818694601, + "grad_norm": 1.3410828775328953, + "learning_rate": 9.958433961913197e-05, + "loss": 3.8692, + "step": 3230 + }, + { + "epoch": 2.081869460112812, + "grad_norm": 1.4774379251052547, + "learning_rate": 9.958408015527936e-05, + "loss": 3.9892, + "step": 3231 + }, + { + "epoch": 2.0825141015310233, + "grad_norm": 1.4704899893707168, + "learning_rate": 9.958382061081228e-05, + "loss": 3.7949, + "step": 3232 + }, + { + "epoch": 2.0831587429492346, + "grad_norm": 1.4554520303351899, + "learning_rate": 9.95835609857312e-05, + "loss": 4.1017, + "step": 3233 + }, + { + "epoch": 2.0838033843674455, + "grad_norm": 1.88009740981017, + "learning_rate": 9.958330128003652e-05, + "loss": 3.4942, + "step": 3234 + }, + { + "epoch": 2.084448025785657, + "grad_norm": 1.758275206072601, + "learning_rate": 9.958304149372867e-05, + "loss": 4.107, + "step": 3235 + }, + { + "epoch": 2.0850926672038677, + "grad_norm": 1.166538725967802, + "learning_rate": 9.958278162680809e-05, + "loss": 3.9218, + "step": 3236 + }, + { + "epoch": 2.085737308622079, + "grad_norm": 1.3834616643184474, + "learning_rate": 9.958252167927519e-05, + "loss": 4.0937, + "step": 3237 + }, + { + "epoch": 2.0863819500402903, + "grad_norm": 1.4802340225771222, + "learning_rate": 9.958226165113039e-05, + "loss": 3.9795, + "step": 3238 + }, + { + "epoch": 2.087026591458501, + "grad_norm": 1.6141883438232605, + "learning_rate": 9.958200154237416e-05, + "loss": 3.54, + "step": 3239 + }, + { + "epoch": 2.0876712328767124, + "grad_norm": 1.4441817537717587, + "learning_rate": 9.958174135300688e-05, + "loss": 3.6176, + "step": 3240 + }, + { + "epoch": 2.0883158742949233, + "grad_norm": 1.4601022256186909, + "learning_rate": 9.9581481083029e-05, + "loss": 3.6402, + "step": 3241 + }, + { + "epoch": 2.0889605157131346, + "grad_norm": 1.2824830021771996, + "learning_rate": 9.958122073244093e-05, + "loss": 3.5268, + "step": 3242 + }, + { + "epoch": 2.089605157131346, + "grad_norm": 1.651838902005749, + "learning_rate": 9.958096030124313e-05, + "loss": 3.7123, + "step": 3243 + }, + { + "epoch": 2.0902497985495567, + "grad_norm": 2.1296239874355534, + "learning_rate": 9.9580699789436e-05, + "loss": 3.7914, + "step": 3244 + }, + { + "epoch": 2.090894439967768, + "grad_norm": 2.055151681063791, + "learning_rate": 9.958043919701996e-05, + "loss": 3.844, + "step": 3245 + }, + { + "epoch": 2.091539081385979, + "grad_norm": 1.9074214064025896, + "learning_rate": 9.958017852399546e-05, + "loss": 3.6286, + "step": 3246 + }, + { + "epoch": 2.09218372280419, + "grad_norm": 2.380411127896196, + "learning_rate": 9.957991777036294e-05, + "loss": 3.9338, + "step": 3247 + }, + { + "epoch": 2.0928283642224015, + "grad_norm": 2.905147950441128, + "learning_rate": 9.957965693612281e-05, + "loss": 4.0141, + "step": 3248 + }, + { + "epoch": 2.0934730056406123, + "grad_norm": 2.2338499898693738, + "learning_rate": 9.957939602127548e-05, + "loss": 3.4317, + "step": 3249 + }, + { + "epoch": 2.0941176470588236, + "grad_norm": 1.5000549651977813, + "learning_rate": 9.957913502582142e-05, + "loss": 3.8824, + "step": 3250 + }, + { + "epoch": 2.0947622884770345, + "grad_norm": 1.5972274375255306, + "learning_rate": 9.9578873949761e-05, + "loss": 3.8578, + "step": 3251 + }, + { + "epoch": 2.095406929895246, + "grad_norm": 1.4713698711226166, + "learning_rate": 9.957861279309471e-05, + "loss": 3.804, + "step": 3252 + }, + { + "epoch": 2.096051571313457, + "grad_norm": 1.6012501361127023, + "learning_rate": 9.957835155582295e-05, + "loss": 3.9057, + "step": 3253 + }, + { + "epoch": 2.096696212731668, + "grad_norm": 1.5337058915114417, + "learning_rate": 9.957809023794616e-05, + "loss": 3.9382, + "step": 3254 + }, + { + "epoch": 2.0973408541498793, + "grad_norm": 2.039198424045178, + "learning_rate": 9.957782883946475e-05, + "loss": 3.848, + "step": 3255 + }, + { + "epoch": 2.09798549556809, + "grad_norm": 2.3883663870319993, + "learning_rate": 9.957756736037915e-05, + "loss": 3.5673, + "step": 3256 + }, + { + "epoch": 2.0986301369863014, + "grad_norm": 1.82744435498199, + "learning_rate": 9.957730580068982e-05, + "loss": 3.7631, + "step": 3257 + }, + { + "epoch": 2.0992747784045127, + "grad_norm": 2.3677564174404058, + "learning_rate": 9.957704416039717e-05, + "loss": 3.527, + "step": 3258 + }, + { + "epoch": 2.0999194198227236, + "grad_norm": 2.8240452939220493, + "learning_rate": 9.957678243950163e-05, + "loss": 3.6807, + "step": 3259 + }, + { + "epoch": 2.100564061240935, + "grad_norm": 1.66748199466906, + "learning_rate": 9.957652063800363e-05, + "loss": 3.7191, + "step": 3260 + }, + { + "epoch": 2.1012087026591457, + "grad_norm": 2.4772281374632477, + "learning_rate": 9.957625875590358e-05, + "loss": 3.7831, + "step": 3261 + }, + { + "epoch": 2.101853344077357, + "grad_norm": 2.0478883905822873, + "learning_rate": 9.957599679320196e-05, + "loss": 3.7976, + "step": 3262 + }, + { + "epoch": 2.102497985495568, + "grad_norm": 1.77595131284275, + "learning_rate": 9.957573474989913e-05, + "loss": 3.7888, + "step": 3263 + }, + { + "epoch": 2.103142626913779, + "grad_norm": 2.07870691779586, + "learning_rate": 9.957547262599559e-05, + "loss": 4.1462, + "step": 3264 + }, + { + "epoch": 2.1037872683319905, + "grad_norm": 1.474776478705579, + "learning_rate": 9.957521042149174e-05, + "loss": 3.4148, + "step": 3265 + }, + { + "epoch": 2.1044319097502013, + "grad_norm": 1.8856372912419495, + "learning_rate": 9.9574948136388e-05, + "loss": 3.8753, + "step": 3266 + }, + { + "epoch": 2.1050765511684126, + "grad_norm": 1.515383654974183, + "learning_rate": 9.957468577068483e-05, + "loss": 4.1202, + "step": 3267 + }, + { + "epoch": 2.1057211925866235, + "grad_norm": 1.6183578995227852, + "learning_rate": 9.957442332438262e-05, + "loss": 3.5511, + "step": 3268 + }, + { + "epoch": 2.106365834004835, + "grad_norm": 1.7275624759374315, + "learning_rate": 9.957416079748184e-05, + "loss": 3.7023, + "step": 3269 + }, + { + "epoch": 2.107010475423046, + "grad_norm": 1.571543284613313, + "learning_rate": 9.95738981899829e-05, + "loss": 3.736, + "step": 3270 + }, + { + "epoch": 2.107655116841257, + "grad_norm": 1.475264985583223, + "learning_rate": 9.957363550188623e-05, + "loss": 4.0893, + "step": 3271 + }, + { + "epoch": 2.1082997582594682, + "grad_norm": 1.6155489721681986, + "learning_rate": 9.957337273319228e-05, + "loss": 3.6679, + "step": 3272 + }, + { + "epoch": 2.108944399677679, + "grad_norm": 1.3504029629952892, + "learning_rate": 9.957310988390146e-05, + "loss": 3.9577, + "step": 3273 + }, + { + "epoch": 2.1095890410958904, + "grad_norm": 1.5315376516969037, + "learning_rate": 9.957284695401421e-05, + "loss": 3.5395, + "step": 3274 + }, + { + "epoch": 2.1102336825141017, + "grad_norm": 1.602173667803106, + "learning_rate": 9.957258394353097e-05, + "loss": 3.6559, + "step": 3275 + }, + { + "epoch": 2.1108783239323126, + "grad_norm": 1.8061898280683832, + "learning_rate": 9.957232085245217e-05, + "loss": 3.8291, + "step": 3276 + }, + { + "epoch": 2.111522965350524, + "grad_norm": 1.6303930014519734, + "learning_rate": 9.957205768077822e-05, + "loss": 3.8488, + "step": 3277 + }, + { + "epoch": 2.1121676067687347, + "grad_norm": 1.5475009187912367, + "learning_rate": 9.957179442850959e-05, + "loss": 3.6158, + "step": 3278 + }, + { + "epoch": 2.112812248186946, + "grad_norm": 1.8221508522541872, + "learning_rate": 9.957153109564668e-05, + "loss": 4.1309, + "step": 3279 + }, + { + "epoch": 2.1134568896051573, + "grad_norm": 1.3645586604666475, + "learning_rate": 9.957126768218994e-05, + "loss": 3.8712, + "step": 3280 + }, + { + "epoch": 2.114101531023368, + "grad_norm": 1.8527982036436002, + "learning_rate": 9.957100418813978e-05, + "loss": 3.4315, + "step": 3281 + }, + { + "epoch": 2.1147461724415795, + "grad_norm": 1.5255939353294237, + "learning_rate": 9.957074061349669e-05, + "loss": 3.5427, + "step": 3282 + }, + { + "epoch": 2.1153908138597903, + "grad_norm": 1.8546502176220814, + "learning_rate": 9.957047695826101e-05, + "loss": 3.9492, + "step": 3283 + }, + { + "epoch": 2.1160354552780016, + "grad_norm": 1.909416779670157, + "learning_rate": 9.957021322243326e-05, + "loss": 3.9429, + "step": 3284 + }, + { + "epoch": 2.116680096696213, + "grad_norm": 2.1220127179235106, + "learning_rate": 9.956994940601383e-05, + "loss": 3.8513, + "step": 3285 + }, + { + "epoch": 2.117324738114424, + "grad_norm": 1.5057167327956227, + "learning_rate": 9.956968550900315e-05, + "loss": 3.5829, + "step": 3286 + }, + { + "epoch": 2.117969379532635, + "grad_norm": 1.9763507039853718, + "learning_rate": 9.956942153140168e-05, + "loss": 3.7957, + "step": 3287 + }, + { + "epoch": 2.118614020950846, + "grad_norm": 1.9656246642105266, + "learning_rate": 9.956915747320984e-05, + "loss": 3.7065, + "step": 3288 + }, + { + "epoch": 2.1192586623690572, + "grad_norm": 1.6302053432017998, + "learning_rate": 9.956889333442806e-05, + "loss": 4.1769, + "step": 3289 + }, + { + "epoch": 2.1199033037872685, + "grad_norm": 1.861771500263726, + "learning_rate": 9.956862911505677e-05, + "loss": 4.0717, + "step": 3290 + }, + { + "epoch": 2.1205479452054794, + "grad_norm": 1.5428059867172739, + "learning_rate": 9.956836481509643e-05, + "loss": 3.6993, + "step": 3291 + }, + { + "epoch": 2.1211925866236907, + "grad_norm": 1.664641095605172, + "learning_rate": 9.956810043454742e-05, + "loss": 3.9169, + "step": 3292 + }, + { + "epoch": 2.1218372280419016, + "grad_norm": 1.9094805320014956, + "learning_rate": 9.956783597341024e-05, + "loss": 3.2919, + "step": 3293 + }, + { + "epoch": 2.122481869460113, + "grad_norm": 1.3546413386425216, + "learning_rate": 9.956757143168529e-05, + "loss": 3.6091, + "step": 3294 + }, + { + "epoch": 2.1231265108783237, + "grad_norm": 1.622336679114078, + "learning_rate": 9.9567306809373e-05, + "loss": 3.5715, + "step": 3295 + }, + { + "epoch": 2.123771152296535, + "grad_norm": 1.5865890071610884, + "learning_rate": 9.95670421064738e-05, + "loss": 3.4113, + "step": 3296 + }, + { + "epoch": 2.1244157937147463, + "grad_norm": 1.43920295188358, + "learning_rate": 9.956677732298815e-05, + "loss": 3.8349, + "step": 3297 + }, + { + "epoch": 2.125060435132957, + "grad_norm": 1.2867000826865689, + "learning_rate": 9.956651245891646e-05, + "loss": 3.8157, + "step": 3298 + }, + { + "epoch": 2.1257050765511685, + "grad_norm": 1.451379263300198, + "learning_rate": 9.95662475142592e-05, + "loss": 3.9802, + "step": 3299 + }, + { + "epoch": 2.1263497179693793, + "grad_norm": 1.2562620825284645, + "learning_rate": 9.956598248901676e-05, + "loss": 3.9073, + "step": 3300 + }, + { + "epoch": 2.1263497179693793, + "eval_loss": 4.073439121246338, + "eval_runtime": 2.9687, + "eval_samples_per_second": 33.685, + "eval_steps_per_second": 4.379, + "step": 3300 + }, + { + "epoch": 2.1269943593875906, + "grad_norm": 1.486755838611642, + "learning_rate": 9.95657173831896e-05, + "loss": 3.8233, + "step": 3301 + }, + { + "epoch": 2.127639000805802, + "grad_norm": 1.1790948732927822, + "learning_rate": 9.956545219677815e-05, + "loss": 4.0511, + "step": 3302 + }, + { + "epoch": 2.128283642224013, + "grad_norm": 1.575300931034987, + "learning_rate": 9.956518692978288e-05, + "loss": 3.9677, + "step": 3303 + }, + { + "epoch": 2.128928283642224, + "grad_norm": 1.2887753505167903, + "learning_rate": 9.956492158220416e-05, + "loss": 3.7208, + "step": 3304 + }, + { + "epoch": 2.129572925060435, + "grad_norm": 1.6856449763609613, + "learning_rate": 9.956465615404247e-05, + "loss": 3.5928, + "step": 3305 + }, + { + "epoch": 2.1302175664786462, + "grad_norm": 1.6084112869411793, + "learning_rate": 9.956439064529824e-05, + "loss": 3.8699, + "step": 3306 + }, + { + "epoch": 2.1308622078968575, + "grad_norm": 1.4182484424878172, + "learning_rate": 9.956412505597188e-05, + "loss": 3.6284, + "step": 3307 + }, + { + "epoch": 2.1315068493150684, + "grad_norm": 2.1704712537133997, + "learning_rate": 9.956385938606387e-05, + "loss": 3.3099, + "step": 3308 + }, + { + "epoch": 2.1321514907332797, + "grad_norm": 2.1878947920149274, + "learning_rate": 9.95635936355746e-05, + "loss": 4.0264, + "step": 3309 + }, + { + "epoch": 2.1327961321514906, + "grad_norm": 1.9519988291344652, + "learning_rate": 9.956332780450456e-05, + "loss": 4.0138, + "step": 3310 + }, + { + "epoch": 2.133440773569702, + "grad_norm": 1.957950714744235, + "learning_rate": 9.956306189285413e-05, + "loss": 3.9414, + "step": 3311 + }, + { + "epoch": 2.134085414987913, + "grad_norm": 1.7020105235508423, + "learning_rate": 9.95627959006238e-05, + "loss": 3.974, + "step": 3312 + }, + { + "epoch": 2.134730056406124, + "grad_norm": 1.9533880273714053, + "learning_rate": 9.956252982781398e-05, + "loss": 3.6433, + "step": 3313 + }, + { + "epoch": 2.1353746978243353, + "grad_norm": 1.565982330530687, + "learning_rate": 9.956226367442509e-05, + "loss": 3.8415, + "step": 3314 + }, + { + "epoch": 2.136019339242546, + "grad_norm": 1.5214268234941026, + "learning_rate": 9.95619974404576e-05, + "loss": 3.879, + "step": 3315 + }, + { + "epoch": 2.1366639806607575, + "grad_norm": 1.6736840022808153, + "learning_rate": 9.956173112591192e-05, + "loss": 3.6196, + "step": 3316 + }, + { + "epoch": 2.1373086220789688, + "grad_norm": 1.9267092834210526, + "learning_rate": 9.95614647307885e-05, + "loss": 4.0689, + "step": 3317 + }, + { + "epoch": 2.1379532634971796, + "grad_norm": 1.8678889664219285, + "learning_rate": 9.956119825508778e-05, + "loss": 4.1103, + "step": 3318 + }, + { + "epoch": 2.138597904915391, + "grad_norm": 1.7621558393849128, + "learning_rate": 9.95609316988102e-05, + "loss": 3.8726, + "step": 3319 + }, + { + "epoch": 2.139242546333602, + "grad_norm": 1.4925471138776, + "learning_rate": 9.956066506195619e-05, + "loss": 3.6866, + "step": 3320 + }, + { + "epoch": 2.139887187751813, + "grad_norm": 1.613346756470167, + "learning_rate": 9.956039834452619e-05, + "loss": 3.6721, + "step": 3321 + }, + { + "epoch": 2.1405318291700244, + "grad_norm": 1.783849947247806, + "learning_rate": 9.956013154652063e-05, + "loss": 3.9452, + "step": 3322 + }, + { + "epoch": 2.1411764705882352, + "grad_norm": 1.487187272057393, + "learning_rate": 9.955986466793996e-05, + "loss": 3.8083, + "step": 3323 + }, + { + "epoch": 2.1418211120064465, + "grad_norm": 1.7771800590513747, + "learning_rate": 9.955959770878461e-05, + "loss": 3.9527, + "step": 3324 + }, + { + "epoch": 2.1424657534246574, + "grad_norm": 1.535252574751163, + "learning_rate": 9.955933066905504e-05, + "loss": 4.0026, + "step": 3325 + }, + { + "epoch": 2.1431103948428687, + "grad_norm": 1.5139833609885753, + "learning_rate": 9.955906354875167e-05, + "loss": 3.9271, + "step": 3326 + }, + { + "epoch": 2.14375503626108, + "grad_norm": 1.5653260374155775, + "learning_rate": 9.955879634787494e-05, + "loss": 3.8355, + "step": 3327 + }, + { + "epoch": 2.144399677679291, + "grad_norm": 1.3638776478523664, + "learning_rate": 9.955852906642527e-05, + "loss": 3.5878, + "step": 3328 + }, + { + "epoch": 2.145044319097502, + "grad_norm": 1.1846729673897505, + "learning_rate": 9.955826170440314e-05, + "loss": 4.1109, + "step": 3329 + }, + { + "epoch": 2.145688960515713, + "grad_norm": 1.5811348848129658, + "learning_rate": 9.955799426180896e-05, + "loss": 3.5444, + "step": 3330 + }, + { + "epoch": 2.1463336019339243, + "grad_norm": 1.6089295014827596, + "learning_rate": 9.955772673864318e-05, + "loss": 3.7518, + "step": 3331 + }, + { + "epoch": 2.1469782433521356, + "grad_norm": 1.5606625897343676, + "learning_rate": 9.955745913490621e-05, + "loss": 3.8078, + "step": 3332 + }, + { + "epoch": 2.1476228847703465, + "grad_norm": 1.5903337284461239, + "learning_rate": 9.955719145059855e-05, + "loss": 3.7368, + "step": 3333 + }, + { + "epoch": 2.1482675261885578, + "grad_norm": 1.545526956374363, + "learning_rate": 9.95569236857206e-05, + "loss": 3.8243, + "step": 3334 + }, + { + "epoch": 2.1489121676067686, + "grad_norm": 1.7500551114748886, + "learning_rate": 9.955665584027279e-05, + "loss": 3.9035, + "step": 3335 + }, + { + "epoch": 2.14955680902498, + "grad_norm": 1.7195642421383166, + "learning_rate": 9.955638791425559e-05, + "loss": 4.0276, + "step": 3336 + }, + { + "epoch": 2.1502014504431908, + "grad_norm": 1.6087305804682335, + "learning_rate": 9.955611990766941e-05, + "loss": 3.4825, + "step": 3337 + }, + { + "epoch": 2.150846091861402, + "grad_norm": 1.2831142954925263, + "learning_rate": 9.955585182051471e-05, + "loss": 3.5254, + "step": 3338 + }, + { + "epoch": 2.1514907332796134, + "grad_norm": 1.6664803138482185, + "learning_rate": 9.955558365279194e-05, + "loss": 3.7059, + "step": 3339 + }, + { + "epoch": 2.1521353746978242, + "grad_norm": 1.7777491998148722, + "learning_rate": 9.955531540450151e-05, + "loss": 4.0155, + "step": 3340 + }, + { + "epoch": 2.1527800161160355, + "grad_norm": 1.744240139941933, + "learning_rate": 9.955504707564388e-05, + "loss": 3.5401, + "step": 3341 + }, + { + "epoch": 2.1534246575342464, + "grad_norm": 2.4841549829511953, + "learning_rate": 9.955477866621948e-05, + "loss": 3.6992, + "step": 3342 + }, + { + "epoch": 2.1540692989524577, + "grad_norm": 2.3044178452217845, + "learning_rate": 9.955451017622878e-05, + "loss": 3.9162, + "step": 3343 + }, + { + "epoch": 2.154713940370669, + "grad_norm": 1.7947368670336272, + "learning_rate": 9.955424160567218e-05, + "loss": 3.7789, + "step": 3344 + }, + { + "epoch": 2.15535858178888, + "grad_norm": 2.4682279478499676, + "learning_rate": 9.955397295455015e-05, + "loss": 3.309, + "step": 3345 + }, + { + "epoch": 2.156003223207091, + "grad_norm": 1.7492848618878518, + "learning_rate": 9.95537042228631e-05, + "loss": 4.1644, + "step": 3346 + }, + { + "epoch": 2.156647864625302, + "grad_norm": 1.9148456837132763, + "learning_rate": 9.955343541061152e-05, + "loss": 3.8306, + "step": 3347 + }, + { + "epoch": 2.1572925060435133, + "grad_norm": 1.789023376986163, + "learning_rate": 9.955316651779581e-05, + "loss": 3.7333, + "step": 3348 + }, + { + "epoch": 2.1579371474617246, + "grad_norm": 1.4476031682839678, + "learning_rate": 9.955289754441643e-05, + "loss": 3.8779, + "step": 3349 + }, + { + "epoch": 2.1585817888799355, + "grad_norm": 1.8539672372870237, + "learning_rate": 9.955262849047383e-05, + "loss": 3.9877, + "step": 3350 + }, + { + "epoch": 2.1592264302981468, + "grad_norm": 1.6567912814239667, + "learning_rate": 9.955235935596842e-05, + "loss": 3.7531, + "step": 3351 + }, + { + "epoch": 2.1598710717163576, + "grad_norm": 2.2728524202424647, + "learning_rate": 9.955209014090067e-05, + "loss": 3.6392, + "step": 3352 + }, + { + "epoch": 2.160515713134569, + "grad_norm": 2.558773989011837, + "learning_rate": 9.955182084527101e-05, + "loss": 3.715, + "step": 3353 + }, + { + "epoch": 2.16116035455278, + "grad_norm": 1.6997823669942138, + "learning_rate": 9.955155146907988e-05, + "loss": 3.8941, + "step": 3354 + }, + { + "epoch": 2.161804995970991, + "grad_norm": 1.7775963580620964, + "learning_rate": 9.955128201232773e-05, + "loss": 3.7889, + "step": 3355 + }, + { + "epoch": 2.1624496373892024, + "grad_norm": 1.3139508975870324, + "learning_rate": 9.955101247501502e-05, + "loss": 3.7541, + "step": 3356 + }, + { + "epoch": 2.1630942788074132, + "grad_norm": 1.5285420724650816, + "learning_rate": 9.955074285714214e-05, + "loss": 3.54, + "step": 3357 + }, + { + "epoch": 2.1637389202256245, + "grad_norm": 2.0793621360799293, + "learning_rate": 9.95504731587096e-05, + "loss": 3.6145, + "step": 3358 + }, + { + "epoch": 2.1643835616438354, + "grad_norm": 1.512762360472688, + "learning_rate": 9.95502033797178e-05, + "loss": 3.8565, + "step": 3359 + }, + { + "epoch": 2.1650282030620467, + "grad_norm": 1.9108742844933462, + "learning_rate": 9.954993352016717e-05, + "loss": 3.5389, + "step": 3360 + }, + { + "epoch": 2.165672844480258, + "grad_norm": 1.6904976082419387, + "learning_rate": 9.95496635800582e-05, + "loss": 3.8736, + "step": 3361 + }, + { + "epoch": 2.166317485898469, + "grad_norm": 1.5719773892919302, + "learning_rate": 9.954939355939131e-05, + "loss": 4.0346, + "step": 3362 + }, + { + "epoch": 2.16696212731668, + "grad_norm": 1.5587227493109468, + "learning_rate": 9.954912345816694e-05, + "loss": 3.6229, + "step": 3363 + }, + { + "epoch": 2.167606768734891, + "grad_norm": 1.9597515278535291, + "learning_rate": 9.954885327638552e-05, + "loss": 3.6752, + "step": 3364 + }, + { + "epoch": 2.1682514101531023, + "grad_norm": 1.6896909234415987, + "learning_rate": 9.954858301404751e-05, + "loss": 3.8659, + "step": 3365 + }, + { + "epoch": 2.1688960515713136, + "grad_norm": 1.9893342000331122, + "learning_rate": 9.954831267115338e-05, + "loss": 3.7824, + "step": 3366 + }, + { + "epoch": 2.1695406929895245, + "grad_norm": 2.0494282717686123, + "learning_rate": 9.954804224770352e-05, + "loss": 3.7352, + "step": 3367 + }, + { + "epoch": 2.1701853344077358, + "grad_norm": 1.7396938706775904, + "learning_rate": 9.954777174369842e-05, + "loss": 3.779, + "step": 3368 + }, + { + "epoch": 2.1708299758259466, + "grad_norm": 1.3783039732489786, + "learning_rate": 9.95475011591385e-05, + "loss": 4.0784, + "step": 3369 + }, + { + "epoch": 2.171474617244158, + "grad_norm": 1.7235388618289653, + "learning_rate": 9.954723049402422e-05, + "loss": 3.8652, + "step": 3370 + }, + { + "epoch": 2.172119258662369, + "grad_norm": 1.4124714908622367, + "learning_rate": 9.954695974835598e-05, + "loss": 3.8095, + "step": 3371 + }, + { + "epoch": 2.17276390008058, + "grad_norm": 2.367102991680922, + "learning_rate": 9.954668892213429e-05, + "loss": 3.3323, + "step": 3372 + }, + { + "epoch": 2.1734085414987914, + "grad_norm": 1.8925730799317084, + "learning_rate": 9.954641801535955e-05, + "loss": 3.7652, + "step": 3373 + }, + { + "epoch": 2.1740531829170022, + "grad_norm": 1.6708491294183057, + "learning_rate": 9.954614702803223e-05, + "loss": 3.5852, + "step": 3374 + }, + { + "epoch": 2.1746978243352135, + "grad_norm": 1.944447355026774, + "learning_rate": 9.954587596015277e-05, + "loss": 4.1415, + "step": 3375 + }, + { + "epoch": 2.175342465753425, + "grad_norm": 1.4103586943593354, + "learning_rate": 9.954560481172159e-05, + "loss": 3.7756, + "step": 3376 + }, + { + "epoch": 2.1759871071716357, + "grad_norm": 1.4575430144361157, + "learning_rate": 9.954533358273917e-05, + "loss": 4.1889, + "step": 3377 + }, + { + "epoch": 2.176631748589847, + "grad_norm": 1.3844575509587553, + "learning_rate": 9.954506227320593e-05, + "loss": 3.9073, + "step": 3378 + }, + { + "epoch": 2.177276390008058, + "grad_norm": 1.4995560263683068, + "learning_rate": 9.954479088312235e-05, + "loss": 3.6734, + "step": 3379 + }, + { + "epoch": 2.177921031426269, + "grad_norm": 1.614744031653908, + "learning_rate": 9.954451941248882e-05, + "loss": 3.326, + "step": 3380 + }, + { + "epoch": 2.1785656728444804, + "grad_norm": 1.4186975764015293, + "learning_rate": 9.954424786130584e-05, + "loss": 3.7839, + "step": 3381 + }, + { + "epoch": 2.1792103142626913, + "grad_norm": 1.5308830787790912, + "learning_rate": 9.954397622957383e-05, + "loss": 3.984, + "step": 3382 + }, + { + "epoch": 2.1798549556809026, + "grad_norm": 1.8725040570170675, + "learning_rate": 9.954370451729324e-05, + "loss": 3.9218, + "step": 3383 + }, + { + "epoch": 2.1804995970991135, + "grad_norm": 2.0341000311043653, + "learning_rate": 9.954343272446452e-05, + "loss": 4.1941, + "step": 3384 + }, + { + "epoch": 2.1811442385173248, + "grad_norm": 2.2483976344802805, + "learning_rate": 9.954316085108808e-05, + "loss": 3.7933, + "step": 3385 + }, + { + "epoch": 2.181788879935536, + "grad_norm": 1.5820702083537395, + "learning_rate": 9.954288889716443e-05, + "loss": 3.4159, + "step": 3386 + }, + { + "epoch": 2.182433521353747, + "grad_norm": 1.9698285388568926, + "learning_rate": 9.954261686269399e-05, + "loss": 3.6594, + "step": 3387 + }, + { + "epoch": 2.183078162771958, + "grad_norm": 1.5850288612020895, + "learning_rate": 9.954234474767717e-05, + "loss": 3.9407, + "step": 3388 + }, + { + "epoch": 2.183722804190169, + "grad_norm": 1.869197102830226, + "learning_rate": 9.954207255211448e-05, + "loss": 3.8339, + "step": 3389 + }, + { + "epoch": 2.1843674456083804, + "grad_norm": 2.90910679313411, + "learning_rate": 9.954180027600631e-05, + "loss": 3.8673, + "step": 3390 + }, + { + "epoch": 2.1850120870265917, + "grad_norm": 1.7081351715449498, + "learning_rate": 9.954152791935314e-05, + "loss": 3.9813, + "step": 3391 + }, + { + "epoch": 2.1856567284448025, + "grad_norm": 2.973436919195608, + "learning_rate": 9.954125548215544e-05, + "loss": 4.0326, + "step": 3392 + }, + { + "epoch": 2.186301369863014, + "grad_norm": 2.0167732945062222, + "learning_rate": 9.95409829644136e-05, + "loss": 3.9279, + "step": 3393 + }, + { + "epoch": 2.1869460112812247, + "grad_norm": 2.0912033581227343, + "learning_rate": 9.954071036612812e-05, + "loss": 4.1669, + "step": 3394 + }, + { + "epoch": 2.187590652699436, + "grad_norm": 2.0280404216652426, + "learning_rate": 9.95404376872994e-05, + "loss": 3.9487, + "step": 3395 + }, + { + "epoch": 2.1882352941176473, + "grad_norm": 2.598964642919737, + "learning_rate": 9.954016492792791e-05, + "loss": 4.1602, + "step": 3396 + }, + { + "epoch": 2.188879935535858, + "grad_norm": 1.3255935952938809, + "learning_rate": 9.95398920880141e-05, + "loss": 3.9711, + "step": 3397 + }, + { + "epoch": 2.1895245769540694, + "grad_norm": 2.6095312748268964, + "learning_rate": 9.953961916755843e-05, + "loss": 3.8177, + "step": 3398 + }, + { + "epoch": 2.1901692183722803, + "grad_norm": 1.8529218000388348, + "learning_rate": 9.953934616656131e-05, + "loss": 3.8234, + "step": 3399 + }, + { + "epoch": 2.1908138597904916, + "grad_norm": 1.8462290350069561, + "learning_rate": 9.953907308502325e-05, + "loss": 4.1361, + "step": 3400 + }, + { + "epoch": 2.1908138597904916, + "eval_loss": 4.066194534301758, + "eval_runtime": 2.9834, + "eval_samples_per_second": 33.519, + "eval_steps_per_second": 4.357, + "step": 3400 + }, + { + "epoch": 2.191458501208703, + "grad_norm": 1.4695298010762983, + "learning_rate": 9.953879992294464e-05, + "loss": 4.2556, + "step": 3401 + }, + { + "epoch": 2.1921031426269137, + "grad_norm": 2.0836045899969964, + "learning_rate": 9.953852668032596e-05, + "loss": 3.8872, + "step": 3402 + }, + { + "epoch": 2.192747784045125, + "grad_norm": 2.481942040522632, + "learning_rate": 9.953825335716765e-05, + "loss": 3.6186, + "step": 3403 + }, + { + "epoch": 2.193392425463336, + "grad_norm": 1.357281507230976, + "learning_rate": 9.953797995347016e-05, + "loss": 3.8392, + "step": 3404 + }, + { + "epoch": 2.194037066881547, + "grad_norm": 1.94022229712576, + "learning_rate": 9.953770646923394e-05, + "loss": 3.9914, + "step": 3405 + }, + { + "epoch": 2.194681708299758, + "grad_norm": 2.450698781619511, + "learning_rate": 9.953743290445941e-05, + "loss": 3.7766, + "step": 3406 + }, + { + "epoch": 2.1953263497179694, + "grad_norm": 1.9610373015159046, + "learning_rate": 9.953715925914708e-05, + "loss": 3.5686, + "step": 3407 + }, + { + "epoch": 2.1959709911361807, + "grad_norm": 2.1572240030034804, + "learning_rate": 9.953688553329734e-05, + "loss": 4.0595, + "step": 3408 + }, + { + "epoch": 2.1966156325543915, + "grad_norm": 1.9460426374356004, + "learning_rate": 9.95366117269107e-05, + "loss": 4.0153, + "step": 3409 + }, + { + "epoch": 2.197260273972603, + "grad_norm": 1.8594955664085064, + "learning_rate": 9.953633783998754e-05, + "loss": 3.7948, + "step": 3410 + }, + { + "epoch": 2.1979049153908137, + "grad_norm": 2.5646796235445852, + "learning_rate": 9.953606387252836e-05, + "loss": 3.5358, + "step": 3411 + }, + { + "epoch": 2.198549556809025, + "grad_norm": 1.40792022819274, + "learning_rate": 9.953578982453359e-05, + "loss": 3.8035, + "step": 3412 + }, + { + "epoch": 2.1991941982272363, + "grad_norm": 2.3276091015466522, + "learning_rate": 9.953551569600368e-05, + "loss": 3.7887, + "step": 3413 + }, + { + "epoch": 2.199838839645447, + "grad_norm": 1.6561951228288492, + "learning_rate": 9.95352414869391e-05, + "loss": 3.5982, + "step": 3414 + }, + { + "epoch": 2.2004834810636584, + "grad_norm": 1.917204298257893, + "learning_rate": 9.953496719734028e-05, + "loss": 3.7413, + "step": 3415 + }, + { + "epoch": 2.2011281224818693, + "grad_norm": 1.7224148288474215, + "learning_rate": 9.953469282720766e-05, + "loss": 4.2265, + "step": 3416 + }, + { + "epoch": 2.2017727639000806, + "grad_norm": 1.4098823011362027, + "learning_rate": 9.953441837654172e-05, + "loss": 3.9182, + "step": 3417 + }, + { + "epoch": 2.202417405318292, + "grad_norm": 1.6746281259141143, + "learning_rate": 9.95341438453429e-05, + "loss": 3.9938, + "step": 3418 + }, + { + "epoch": 2.2030620467365027, + "grad_norm": 1.5149332539278104, + "learning_rate": 9.953386923361164e-05, + "loss": 3.9777, + "step": 3419 + }, + { + "epoch": 2.203706688154714, + "grad_norm": 1.4906385806054268, + "learning_rate": 9.95335945413484e-05, + "loss": 3.6752, + "step": 3420 + }, + { + "epoch": 2.204351329572925, + "grad_norm": 1.3204207427909254, + "learning_rate": 9.953331976855365e-05, + "loss": 3.9206, + "step": 3421 + }, + { + "epoch": 2.204995970991136, + "grad_norm": 1.433878843091434, + "learning_rate": 9.95330449152278e-05, + "loss": 4.0447, + "step": 3422 + }, + { + "epoch": 2.2056406124093475, + "grad_norm": 1.4355673028547975, + "learning_rate": 9.953276998137131e-05, + "loss": 3.8879, + "step": 3423 + }, + { + "epoch": 2.2062852538275584, + "grad_norm": 1.2679735057487145, + "learning_rate": 9.953249496698468e-05, + "loss": 3.6723, + "step": 3424 + }, + { + "epoch": 2.2069298952457697, + "grad_norm": 1.7052176089502826, + "learning_rate": 9.953221987206831e-05, + "loss": 3.7071, + "step": 3425 + }, + { + "epoch": 2.2075745366639805, + "grad_norm": 1.2769650992178128, + "learning_rate": 9.953194469662267e-05, + "loss": 3.8213, + "step": 3426 + }, + { + "epoch": 2.208219178082192, + "grad_norm": 1.7361997153308943, + "learning_rate": 9.95316694406482e-05, + "loss": 3.711, + "step": 3427 + }, + { + "epoch": 2.2088638195004027, + "grad_norm": 1.4538086474242207, + "learning_rate": 9.953139410414537e-05, + "loss": 3.7508, + "step": 3428 + }, + { + "epoch": 2.209508460918614, + "grad_norm": 1.4907008040440979, + "learning_rate": 9.953111868711463e-05, + "loss": 3.6978, + "step": 3429 + }, + { + "epoch": 2.2101531023368253, + "grad_norm": 1.156799862285187, + "learning_rate": 9.953084318955642e-05, + "loss": 3.7475, + "step": 3430 + }, + { + "epoch": 2.210797743755036, + "grad_norm": 1.7743568706394737, + "learning_rate": 9.953056761147119e-05, + "loss": 3.6903, + "step": 3431 + }, + { + "epoch": 2.2114423851732474, + "grad_norm": 1.4879895414594229, + "learning_rate": 9.953029195285941e-05, + "loss": 3.9098, + "step": 3432 + }, + { + "epoch": 2.2120870265914583, + "grad_norm": 1.8662782098751596, + "learning_rate": 9.953001621372153e-05, + "loss": 3.8975, + "step": 3433 + }, + { + "epoch": 2.2127316680096696, + "grad_norm": 2.035531427306358, + "learning_rate": 9.9529740394058e-05, + "loss": 4.0001, + "step": 3434 + }, + { + "epoch": 2.213376309427881, + "grad_norm": 1.3382118524837663, + "learning_rate": 9.952946449386925e-05, + "loss": 3.7272, + "step": 3435 + }, + { + "epoch": 2.2140209508460917, + "grad_norm": 1.5199473711452858, + "learning_rate": 9.952918851315578e-05, + "loss": 3.6453, + "step": 3436 + }, + { + "epoch": 2.214665592264303, + "grad_norm": 1.4302180118526868, + "learning_rate": 9.952891245191799e-05, + "loss": 4.0209, + "step": 3437 + }, + { + "epoch": 2.215310233682514, + "grad_norm": 1.6546670101513248, + "learning_rate": 9.952863631015638e-05, + "loss": 3.3322, + "step": 3438 + }, + { + "epoch": 2.215954875100725, + "grad_norm": 1.7818096092273172, + "learning_rate": 9.952836008787137e-05, + "loss": 3.8434, + "step": 3439 + }, + { + "epoch": 2.2165995165189365, + "grad_norm": 1.8824460838645816, + "learning_rate": 9.952808378506343e-05, + "loss": 3.5036, + "step": 3440 + }, + { + "epoch": 2.2172441579371474, + "grad_norm": 1.5800580776649562, + "learning_rate": 9.952780740173301e-05, + "loss": 4.1311, + "step": 3441 + }, + { + "epoch": 2.2178887993553587, + "grad_norm": 1.4931977811002175, + "learning_rate": 9.952753093788057e-05, + "loss": 3.8483, + "step": 3442 + }, + { + "epoch": 2.2185334407735695, + "grad_norm": 1.8942963215401332, + "learning_rate": 9.952725439350657e-05, + "loss": 3.5139, + "step": 3443 + }, + { + "epoch": 2.219178082191781, + "grad_norm": 1.6867498663759595, + "learning_rate": 9.952697776861143e-05, + "loss": 3.668, + "step": 3444 + }, + { + "epoch": 2.219822723609992, + "grad_norm": 1.5561796160429275, + "learning_rate": 9.952670106319565e-05, + "loss": 4.2275, + "step": 3445 + }, + { + "epoch": 2.220467365028203, + "grad_norm": 1.517717863910922, + "learning_rate": 9.952642427725966e-05, + "loss": 3.783, + "step": 3446 + }, + { + "epoch": 2.2211120064464143, + "grad_norm": 1.212681911652548, + "learning_rate": 9.95261474108039e-05, + "loss": 3.7503, + "step": 3447 + }, + { + "epoch": 2.221756647864625, + "grad_norm": 1.8320358383568547, + "learning_rate": 9.952587046382885e-05, + "loss": 4.0099, + "step": 3448 + }, + { + "epoch": 2.2224012892828364, + "grad_norm": 1.7577223498464782, + "learning_rate": 9.952559343633494e-05, + "loss": 4.0296, + "step": 3449 + }, + { + "epoch": 2.2230459307010477, + "grad_norm": 1.4968775724985728, + "learning_rate": 9.952531632832266e-05, + "loss": 3.9553, + "step": 3450 + }, + { + "epoch": 2.2236905721192586, + "grad_norm": 1.544718188111802, + "learning_rate": 9.952503913979243e-05, + "loss": 3.7599, + "step": 3451 + }, + { + "epoch": 2.22433521353747, + "grad_norm": 1.54321237423203, + "learning_rate": 9.952476187074475e-05, + "loss": 3.5474, + "step": 3452 + }, + { + "epoch": 2.2249798549556807, + "grad_norm": 1.5895264194226115, + "learning_rate": 9.952448452118002e-05, + "loss": 4.031, + "step": 3453 + }, + { + "epoch": 2.225624496373892, + "grad_norm": 1.9850024269519266, + "learning_rate": 9.952420709109874e-05, + "loss": 3.787, + "step": 3454 + }, + { + "epoch": 2.2262691377921033, + "grad_norm": 1.5970546122001028, + "learning_rate": 9.952392958050134e-05, + "loss": 3.7181, + "step": 3455 + }, + { + "epoch": 2.226913779210314, + "grad_norm": 1.599362636158866, + "learning_rate": 9.952365198938828e-05, + "loss": 3.9791, + "step": 3456 + }, + { + "epoch": 2.2275584206285255, + "grad_norm": 1.819640782658271, + "learning_rate": 9.952337431776002e-05, + "loss": 3.6293, + "step": 3457 + }, + { + "epoch": 2.2282030620467363, + "grad_norm": 1.784495153805148, + "learning_rate": 9.952309656561704e-05, + "loss": 3.7362, + "step": 3458 + }, + { + "epoch": 2.2288477034649476, + "grad_norm": 1.6586922917955753, + "learning_rate": 9.952281873295974e-05, + "loss": 3.8433, + "step": 3459 + }, + { + "epoch": 2.229492344883159, + "grad_norm": 1.6391815118029855, + "learning_rate": 9.952254081978862e-05, + "loss": 3.3772, + "step": 3460 + }, + { + "epoch": 2.23013698630137, + "grad_norm": 1.802164533703292, + "learning_rate": 9.952226282610412e-05, + "loss": 3.747, + "step": 3461 + }, + { + "epoch": 2.230781627719581, + "grad_norm": 1.4106174753325178, + "learning_rate": 9.95219847519067e-05, + "loss": 3.8304, + "step": 3462 + }, + { + "epoch": 2.231426269137792, + "grad_norm": 1.7534248515022943, + "learning_rate": 9.952170659719683e-05, + "loss": 3.7752, + "step": 3463 + }, + { + "epoch": 2.2320709105560033, + "grad_norm": 1.6145031862729553, + "learning_rate": 9.952142836197494e-05, + "loss": 3.9018, + "step": 3464 + }, + { + "epoch": 2.2327155519742146, + "grad_norm": 1.7746964264916505, + "learning_rate": 9.95211500462415e-05, + "loss": 3.7161, + "step": 3465 + }, + { + "epoch": 2.2333601933924254, + "grad_norm": 1.8917054931512225, + "learning_rate": 9.952087164999698e-05, + "loss": 3.6092, + "step": 3466 + }, + { + "epoch": 2.2340048348106367, + "grad_norm": 1.3633540459778637, + "learning_rate": 9.952059317324184e-05, + "loss": 4.0659, + "step": 3467 + }, + { + "epoch": 2.2346494762288476, + "grad_norm": 1.6820073694066648, + "learning_rate": 9.952031461597651e-05, + "loss": 3.7542, + "step": 3468 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 2.163041339038727, + "learning_rate": 9.952003597820145e-05, + "loss": 3.9324, + "step": 3469 + }, + { + "epoch": 2.23593875906527, + "grad_norm": 1.3821115152458308, + "learning_rate": 9.951975725991715e-05, + "loss": 3.6124, + "step": 3470 + }, + { + "epoch": 2.236583400483481, + "grad_norm": 2.048770976285689, + "learning_rate": 9.951947846112403e-05, + "loss": 3.7136, + "step": 3471 + }, + { + "epoch": 2.2372280419016923, + "grad_norm": 1.6399223513328016, + "learning_rate": 9.951919958182257e-05, + "loss": 3.3819, + "step": 3472 + }, + { + "epoch": 2.237872683319903, + "grad_norm": 1.5787703425785944, + "learning_rate": 9.951892062201321e-05, + "loss": 3.9168, + "step": 3473 + }, + { + "epoch": 2.2385173247381145, + "grad_norm": 1.6929445405142312, + "learning_rate": 9.951864158169644e-05, + "loss": 3.7413, + "step": 3474 + }, + { + "epoch": 2.2391619661563253, + "grad_norm": 1.6122933414497795, + "learning_rate": 9.951836246087268e-05, + "loss": 3.8787, + "step": 3475 + }, + { + "epoch": 2.2398066075745366, + "grad_norm": 1.8111420847229869, + "learning_rate": 9.951808325954241e-05, + "loss": 3.6241, + "step": 3476 + }, + { + "epoch": 2.240451248992748, + "grad_norm": 1.3962240357392848, + "learning_rate": 9.95178039777061e-05, + "loss": 3.7937, + "step": 3477 + }, + { + "epoch": 2.241095890410959, + "grad_norm": 1.8739553507704605, + "learning_rate": 9.951752461536417e-05, + "loss": 3.6392, + "step": 3478 + }, + { + "epoch": 2.24174053182917, + "grad_norm": 1.790676834607365, + "learning_rate": 9.951724517251712e-05, + "loss": 3.5814, + "step": 3479 + }, + { + "epoch": 2.242385173247381, + "grad_norm": 1.6681183471825822, + "learning_rate": 9.95169656491654e-05, + "loss": 3.8525, + "step": 3480 + }, + { + "epoch": 2.2430298146655923, + "grad_norm": 1.556926090609054, + "learning_rate": 9.951668604530944e-05, + "loss": 3.9644, + "step": 3481 + }, + { + "epoch": 2.2436744560838036, + "grad_norm": 1.451704321776283, + "learning_rate": 9.951640636094975e-05, + "loss": 3.7527, + "step": 3482 + }, + { + "epoch": 2.2443190975020144, + "grad_norm": 1.7032707763427295, + "learning_rate": 9.951612659608674e-05, + "loss": 3.7274, + "step": 3483 + }, + { + "epoch": 2.2449637389202257, + "grad_norm": 1.6743339659474465, + "learning_rate": 9.951584675072089e-05, + "loss": 4.1501, + "step": 3484 + }, + { + "epoch": 2.2456083803384366, + "grad_norm": 1.5154987092998742, + "learning_rate": 9.951556682485265e-05, + "loss": 4.1928, + "step": 3485 + }, + { + "epoch": 2.246253021756648, + "grad_norm": 1.5431470830037046, + "learning_rate": 9.951528681848251e-05, + "loss": 3.9838, + "step": 3486 + }, + { + "epoch": 2.246897663174859, + "grad_norm": 1.6065646768563109, + "learning_rate": 9.95150067316109e-05, + "loss": 3.6901, + "step": 3487 + }, + { + "epoch": 2.24754230459307, + "grad_norm": 2.0757385693187196, + "learning_rate": 9.951472656423828e-05, + "loss": 4.075, + "step": 3488 + }, + { + "epoch": 2.2481869460112813, + "grad_norm": 1.5360357660930053, + "learning_rate": 9.951444631636513e-05, + "loss": 3.8574, + "step": 3489 + }, + { + "epoch": 2.248831587429492, + "grad_norm": 1.9604673306654243, + "learning_rate": 9.951416598799188e-05, + "loss": 3.4935, + "step": 3490 + }, + { + "epoch": 2.2494762288477035, + "grad_norm": 2.282853717556584, + "learning_rate": 9.951388557911901e-05, + "loss": 3.7953, + "step": 3491 + }, + { + "epoch": 2.2501208702659143, + "grad_norm": 1.1863332520044116, + "learning_rate": 9.9513605089747e-05, + "loss": 4.1417, + "step": 3492 + }, + { + "epoch": 2.2507655116841256, + "grad_norm": 2.1832817569270224, + "learning_rate": 9.951332451987628e-05, + "loss": 4.3018, + "step": 3493 + }, + { + "epoch": 2.251410153102337, + "grad_norm": 1.5579034053481702, + "learning_rate": 9.951304386950733e-05, + "loss": 3.5314, + "step": 3494 + }, + { + "epoch": 2.252054794520548, + "grad_norm": 2.2700619468367216, + "learning_rate": 9.95127631386406e-05, + "loss": 3.9521, + "step": 3495 + }, + { + "epoch": 2.252699435938759, + "grad_norm": 1.9556118501865685, + "learning_rate": 9.951248232727654e-05, + "loss": 3.8372, + "step": 3496 + }, + { + "epoch": 2.25334407735697, + "grad_norm": 1.9474533761222577, + "learning_rate": 9.951220143541565e-05, + "loss": 3.5923, + "step": 3497 + }, + { + "epoch": 2.2539887187751813, + "grad_norm": 2.2089355717585524, + "learning_rate": 9.951192046305834e-05, + "loss": 4.0423, + "step": 3498 + }, + { + "epoch": 2.2546333601933926, + "grad_norm": 1.5018436581697507, + "learning_rate": 9.95116394102051e-05, + "loss": 3.8655, + "step": 3499 + }, + { + "epoch": 2.2552780016116034, + "grad_norm": 1.6551320577391753, + "learning_rate": 9.95113582768564e-05, + "loss": 3.6107, + "step": 3500 + }, + { + "epoch": 2.2552780016116034, + "eval_loss": 4.0718817710876465, + "eval_runtime": 2.9773, + "eval_samples_per_second": 33.587, + "eval_steps_per_second": 4.366, + "step": 3500 + }, + { + "epoch": 2.2559226430298147, + "grad_norm": 1.3869585115743004, + "learning_rate": 9.951107706301269e-05, + "loss": 4.0652, + "step": 3501 + }, + { + "epoch": 2.2565672844480256, + "grad_norm": 1.6197122924723928, + "learning_rate": 9.951079576867443e-05, + "loss": 4.3053, + "step": 3502 + }, + { + "epoch": 2.257211925866237, + "grad_norm": 1.2594492602960377, + "learning_rate": 9.951051439384208e-05, + "loss": 3.8277, + "step": 3503 + }, + { + "epoch": 2.257856567284448, + "grad_norm": 2.3322413438755656, + "learning_rate": 9.95102329385161e-05, + "loss": 3.3858, + "step": 3504 + }, + { + "epoch": 2.258501208702659, + "grad_norm": 2.3585345760572842, + "learning_rate": 9.950995140269698e-05, + "loss": 3.9089, + "step": 3505 + }, + { + "epoch": 2.2591458501208703, + "grad_norm": 1.621188879266224, + "learning_rate": 9.950966978638514e-05, + "loss": 3.9653, + "step": 3506 + }, + { + "epoch": 2.259790491539081, + "grad_norm": 2.1464832771633335, + "learning_rate": 9.950938808958107e-05, + "loss": 3.5579, + "step": 3507 + }, + { + "epoch": 2.2604351329572925, + "grad_norm": 1.5061908190783986, + "learning_rate": 9.950910631228524e-05, + "loss": 3.8691, + "step": 3508 + }, + { + "epoch": 2.261079774375504, + "grad_norm": 1.7727170542083388, + "learning_rate": 9.950882445449809e-05, + "loss": 3.9356, + "step": 3509 + }, + { + "epoch": 2.2617244157937146, + "grad_norm": 2.0256533313369003, + "learning_rate": 9.950854251622007e-05, + "loss": 4.2912, + "step": 3510 + }, + { + "epoch": 2.262369057211926, + "grad_norm": 1.5233085069953927, + "learning_rate": 9.950826049745169e-05, + "loss": 3.9575, + "step": 3511 + }, + { + "epoch": 2.263013698630137, + "grad_norm": 2.3540424713097035, + "learning_rate": 9.950797839819338e-05, + "loss": 4.1653, + "step": 3512 + }, + { + "epoch": 2.263658340048348, + "grad_norm": 1.5858825844088977, + "learning_rate": 9.950769621844563e-05, + "loss": 3.522, + "step": 3513 + }, + { + "epoch": 2.2643029814665594, + "grad_norm": 2.10873545174134, + "learning_rate": 9.950741395820887e-05, + "loss": 3.728, + "step": 3514 + }, + { + "epoch": 2.2649476228847703, + "grad_norm": 1.6948425983822684, + "learning_rate": 9.950713161748356e-05, + "loss": 3.6206, + "step": 3515 + }, + { + "epoch": 2.2655922643029816, + "grad_norm": 2.319394351752735, + "learning_rate": 9.950684919627021e-05, + "loss": 3.9835, + "step": 3516 + }, + { + "epoch": 2.2662369057211924, + "grad_norm": 2.1522156432898023, + "learning_rate": 9.950656669456925e-05, + "loss": 3.5774, + "step": 3517 + }, + { + "epoch": 2.2668815471394037, + "grad_norm": 2.051358514717645, + "learning_rate": 9.950628411238114e-05, + "loss": 4.0375, + "step": 3518 + }, + { + "epoch": 2.267526188557615, + "grad_norm": 1.5087198261832586, + "learning_rate": 9.950600144970635e-05, + "loss": 3.7314, + "step": 3519 + }, + { + "epoch": 2.268170829975826, + "grad_norm": 2.0697181240433795, + "learning_rate": 9.950571870654538e-05, + "loss": 3.7582, + "step": 3520 + }, + { + "epoch": 2.268815471394037, + "grad_norm": 1.722199934995292, + "learning_rate": 9.950543588289864e-05, + "loss": 3.738, + "step": 3521 + }, + { + "epoch": 2.269460112812248, + "grad_norm": 1.600333864005699, + "learning_rate": 9.950515297876661e-05, + "loss": 3.9352, + "step": 3522 + }, + { + "epoch": 2.2701047542304593, + "grad_norm": 1.6957344297417147, + "learning_rate": 9.950486999414979e-05, + "loss": 3.4943, + "step": 3523 + }, + { + "epoch": 2.2707493956486706, + "grad_norm": 1.85031938087208, + "learning_rate": 9.950458692904858e-05, + "loss": 3.8579, + "step": 3524 + }, + { + "epoch": 2.2713940370668815, + "grad_norm": 1.1864258085727182, + "learning_rate": 9.95043037834635e-05, + "loss": 3.6118, + "step": 3525 + }, + { + "epoch": 2.2720386784850928, + "grad_norm": 1.8837405242231178, + "learning_rate": 9.9504020557395e-05, + "loss": 4.035, + "step": 3526 + }, + { + "epoch": 2.2726833199033036, + "grad_norm": 1.613155412530898, + "learning_rate": 9.950373725084354e-05, + "loss": 3.808, + "step": 3527 + }, + { + "epoch": 2.273327961321515, + "grad_norm": 1.563802563244877, + "learning_rate": 9.950345386380959e-05, + "loss": 3.5702, + "step": 3528 + }, + { + "epoch": 2.2739726027397262, + "grad_norm": 1.9187760577364603, + "learning_rate": 9.95031703962936e-05, + "loss": 4.0486, + "step": 3529 + }, + { + "epoch": 2.274617244157937, + "grad_norm": 1.611776734071255, + "learning_rate": 9.950288684829606e-05, + "loss": 4.0952, + "step": 3530 + }, + { + "epoch": 2.2752618855761484, + "grad_norm": 2.542166997587977, + "learning_rate": 9.950260321981742e-05, + "loss": 4.065, + "step": 3531 + }, + { + "epoch": 2.2759065269943592, + "grad_norm": 2.3310953828228422, + "learning_rate": 9.950231951085814e-05, + "loss": 3.9239, + "step": 3532 + }, + { + "epoch": 2.2765511684125705, + "grad_norm": 2.0060353567806413, + "learning_rate": 9.950203572141871e-05, + "loss": 3.6558, + "step": 3533 + }, + { + "epoch": 2.277195809830782, + "grad_norm": 2.7441439227410696, + "learning_rate": 9.950175185149958e-05, + "loss": 4.0246, + "step": 3534 + }, + { + "epoch": 2.2778404512489927, + "grad_norm": 1.817910867497291, + "learning_rate": 9.950146790110121e-05, + "loss": 4.1433, + "step": 3535 + }, + { + "epoch": 2.278485092667204, + "grad_norm": 2.432021167716349, + "learning_rate": 9.95011838702241e-05, + "loss": 3.2385, + "step": 3536 + }, + { + "epoch": 2.279129734085415, + "grad_norm": 2.136357191102738, + "learning_rate": 9.950089975886865e-05, + "loss": 3.9987, + "step": 3537 + }, + { + "epoch": 2.279774375503626, + "grad_norm": 1.6350718900611745, + "learning_rate": 9.95006155670354e-05, + "loss": 3.8822, + "step": 3538 + }, + { + "epoch": 2.2804190169218375, + "grad_norm": 1.9659644495130912, + "learning_rate": 9.950033129472477e-05, + "loss": 4.0783, + "step": 3539 + }, + { + "epoch": 2.2810636583400483, + "grad_norm": 1.4088001345212475, + "learning_rate": 9.950004694193723e-05, + "loss": 3.5394, + "step": 3540 + }, + { + "epoch": 2.2817082997582596, + "grad_norm": 1.5885433416588468, + "learning_rate": 9.949976250867327e-05, + "loss": 3.6693, + "step": 3541 + }, + { + "epoch": 2.2823529411764705, + "grad_norm": 1.7773535831020046, + "learning_rate": 9.949947799493334e-05, + "loss": 3.8571, + "step": 3542 + }, + { + "epoch": 2.2829975825946818, + "grad_norm": 2.2308642272334476, + "learning_rate": 9.949919340071793e-05, + "loss": 3.6267, + "step": 3543 + }, + { + "epoch": 2.283642224012893, + "grad_norm": 1.3382427134449983, + "learning_rate": 9.949890872602748e-05, + "loss": 3.7111, + "step": 3544 + }, + { + "epoch": 2.284286865431104, + "grad_norm": 1.9299393643712728, + "learning_rate": 9.949862397086246e-05, + "loss": 3.7328, + "step": 3545 + }, + { + "epoch": 2.2849315068493152, + "grad_norm": 1.7238502865032508, + "learning_rate": 9.949833913522335e-05, + "loss": 3.8483, + "step": 3546 + }, + { + "epoch": 2.285576148267526, + "grad_norm": 1.4255079501326113, + "learning_rate": 9.949805421911061e-05, + "loss": 3.7112, + "step": 3547 + }, + { + "epoch": 2.2862207896857374, + "grad_norm": 1.5245954042269618, + "learning_rate": 9.949776922252472e-05, + "loss": 3.8646, + "step": 3548 + }, + { + "epoch": 2.2868654311039482, + "grad_norm": 1.2293158056015137, + "learning_rate": 9.949748414546614e-05, + "loss": 3.8043, + "step": 3549 + }, + { + "epoch": 2.2875100725221595, + "grad_norm": 1.468076229880505, + "learning_rate": 9.949719898793531e-05, + "loss": 3.9776, + "step": 3550 + }, + { + "epoch": 2.288154713940371, + "grad_norm": 1.3901620046873062, + "learning_rate": 9.949691374993276e-05, + "loss": 3.8883, + "step": 3551 + }, + { + "epoch": 2.2887993553585817, + "grad_norm": 1.5152034427781516, + "learning_rate": 9.94966284314589e-05, + "loss": 3.5506, + "step": 3552 + }, + { + "epoch": 2.289443996776793, + "grad_norm": 2.007939070563528, + "learning_rate": 9.949634303251423e-05, + "loss": 3.6509, + "step": 3553 + }, + { + "epoch": 2.290088638195004, + "grad_norm": 1.3740448326617822, + "learning_rate": 9.94960575530992e-05, + "loss": 3.6353, + "step": 3554 + }, + { + "epoch": 2.290733279613215, + "grad_norm": 2.2253042151761018, + "learning_rate": 9.949577199321432e-05, + "loss": 3.9529, + "step": 3555 + }, + { + "epoch": 2.2913779210314265, + "grad_norm": 1.8457831208678255, + "learning_rate": 9.949548635286001e-05, + "loss": 4.0704, + "step": 3556 + }, + { + "epoch": 2.2920225624496373, + "grad_norm": 1.723695992090317, + "learning_rate": 9.949520063203675e-05, + "loss": 3.9318, + "step": 3557 + }, + { + "epoch": 2.2926672038678486, + "grad_norm": 2.082062209900969, + "learning_rate": 9.949491483074503e-05, + "loss": 4.232, + "step": 3558 + }, + { + "epoch": 2.2933118452860595, + "grad_norm": 1.3344742178116602, + "learning_rate": 9.94946289489853e-05, + "loss": 4.0004, + "step": 3559 + }, + { + "epoch": 2.2939564867042708, + "grad_norm": 1.500486680828004, + "learning_rate": 9.949434298675805e-05, + "loss": 3.7908, + "step": 3560 + }, + { + "epoch": 2.2946011281224816, + "grad_norm": 1.4620512146195337, + "learning_rate": 9.949405694406373e-05, + "loss": 3.8911, + "step": 3561 + }, + { + "epoch": 2.295245769540693, + "grad_norm": 1.8702198187545225, + "learning_rate": 9.949377082090281e-05, + "loss": 4.1995, + "step": 3562 + }, + { + "epoch": 2.2958904109589042, + "grad_norm": 1.8929038412763668, + "learning_rate": 9.949348461727577e-05, + "loss": 3.9541, + "step": 3563 + }, + { + "epoch": 2.296535052377115, + "grad_norm": 1.580656249039295, + "learning_rate": 9.949319833318307e-05, + "loss": 4.0388, + "step": 3564 + }, + { + "epoch": 2.2971796937953264, + "grad_norm": 1.688561130605131, + "learning_rate": 9.949291196862519e-05, + "loss": 3.9868, + "step": 3565 + }, + { + "epoch": 2.2978243352135372, + "grad_norm": 1.7531020524516776, + "learning_rate": 9.949262552360259e-05, + "loss": 4.3537, + "step": 3566 + }, + { + "epoch": 2.2984689766317485, + "grad_norm": 2.0611485310539397, + "learning_rate": 9.949233899811577e-05, + "loss": 3.6244, + "step": 3567 + }, + { + "epoch": 2.29911361804996, + "grad_norm": 2.1517280739298963, + "learning_rate": 9.949205239216515e-05, + "loss": 3.7391, + "step": 3568 + }, + { + "epoch": 2.2997582594681707, + "grad_norm": 1.4874446332619886, + "learning_rate": 9.949176570575123e-05, + "loss": 4.0711, + "step": 3569 + }, + { + "epoch": 2.300402900886382, + "grad_norm": 2.0920434440133095, + "learning_rate": 9.949147893887451e-05, + "loss": 3.7254, + "step": 3570 + }, + { + "epoch": 2.301047542304593, + "grad_norm": 1.8217420929867147, + "learning_rate": 9.949119209153539e-05, + "loss": 3.8446, + "step": 3571 + }, + { + "epoch": 2.301692183722804, + "grad_norm": 1.73942743686378, + "learning_rate": 9.949090516373441e-05, + "loss": 3.7536, + "step": 3572 + }, + { + "epoch": 2.3023368251410155, + "grad_norm": 1.6427524449720081, + "learning_rate": 9.949061815547198e-05, + "loss": 3.8648, + "step": 3573 + }, + { + "epoch": 2.3029814665592263, + "grad_norm": 1.8954230603331284, + "learning_rate": 9.949033106674864e-05, + "loss": 4.0671, + "step": 3574 + }, + { + "epoch": 2.3036261079774376, + "grad_norm": 1.531270702623179, + "learning_rate": 9.94900438975648e-05, + "loss": 3.8893, + "step": 3575 + }, + { + "epoch": 2.3042707493956485, + "grad_norm": 1.6359189125223312, + "learning_rate": 9.948975664792096e-05, + "loss": 4.1356, + "step": 3576 + }, + { + "epoch": 2.3049153908138598, + "grad_norm": 2.002175718123425, + "learning_rate": 9.948946931781761e-05, + "loss": 3.9162, + "step": 3577 + }, + { + "epoch": 2.305560032232071, + "grad_norm": 1.2461993013829094, + "learning_rate": 9.948918190725516e-05, + "loss": 3.7579, + "step": 3578 + }, + { + "epoch": 2.306204673650282, + "grad_norm": 1.7672649649994878, + "learning_rate": 9.948889441623417e-05, + "loss": 3.9613, + "step": 3579 + }, + { + "epoch": 2.3068493150684932, + "grad_norm": 1.9720174524263316, + "learning_rate": 9.948860684475502e-05, + "loss": 3.8437, + "step": 3580 + }, + { + "epoch": 2.307493956486704, + "grad_norm": 1.946698677306441, + "learning_rate": 9.948831919281827e-05, + "loss": 4.0581, + "step": 3581 + }, + { + "epoch": 2.3081385979049154, + "grad_norm": 2.1178079624750166, + "learning_rate": 9.948803146042432e-05, + "loss": 3.6222, + "step": 3582 + }, + { + "epoch": 2.3087832393231267, + "grad_norm": 1.9333338580095534, + "learning_rate": 9.948774364757369e-05, + "loss": 3.8128, + "step": 3583 + }, + { + "epoch": 2.3094278807413375, + "grad_norm": 1.6105753297926177, + "learning_rate": 9.948745575426682e-05, + "loss": 4.0172, + "step": 3584 + }, + { + "epoch": 2.310072522159549, + "grad_norm": 1.8157453240631745, + "learning_rate": 9.94871677805042e-05, + "loss": 4.121, + "step": 3585 + }, + { + "epoch": 2.3107171635777597, + "grad_norm": 1.612205950402559, + "learning_rate": 9.94868797262863e-05, + "loss": 3.9041, + "step": 3586 + }, + { + "epoch": 2.311361804995971, + "grad_norm": 2.0653009482090874, + "learning_rate": 9.94865915916136e-05, + "loss": 4.0075, + "step": 3587 + }, + { + "epoch": 2.3120064464141823, + "grad_norm": 1.9806029650734782, + "learning_rate": 9.948630337648656e-05, + "loss": 3.6192, + "step": 3588 + }, + { + "epoch": 2.312651087832393, + "grad_norm": 2.3063578135513785, + "learning_rate": 9.948601508090564e-05, + "loss": 4.1774, + "step": 3589 + }, + { + "epoch": 2.3132957292506044, + "grad_norm": 2.59112095482421, + "learning_rate": 9.948572670487137e-05, + "loss": 3.4986, + "step": 3590 + }, + { + "epoch": 2.3139403706688153, + "grad_norm": 1.7770747513057554, + "learning_rate": 9.948543824838418e-05, + "loss": 3.8503, + "step": 3591 + }, + { + "epoch": 2.3145850120870266, + "grad_norm": 1.8738345915275858, + "learning_rate": 9.948514971144455e-05, + "loss": 4.0817, + "step": 3592 + }, + { + "epoch": 2.315229653505238, + "grad_norm": 1.6794910866347414, + "learning_rate": 9.948486109405295e-05, + "loss": 3.7727, + "step": 3593 + }, + { + "epoch": 2.3158742949234488, + "grad_norm": 2.4542494574916853, + "learning_rate": 9.948457239620986e-05, + "loss": 3.7721, + "step": 3594 + }, + { + "epoch": 2.31651893634166, + "grad_norm": 2.0017186408735315, + "learning_rate": 9.948428361791575e-05, + "loss": 3.9646, + "step": 3595 + }, + { + "epoch": 2.317163577759871, + "grad_norm": 1.9407608228060766, + "learning_rate": 9.948399475917108e-05, + "loss": 3.6207, + "step": 3596 + }, + { + "epoch": 2.317808219178082, + "grad_norm": 1.5923500850954995, + "learning_rate": 9.948370581997637e-05, + "loss": 3.8943, + "step": 3597 + }, + { + "epoch": 2.3184528605962935, + "grad_norm": 1.703650013640598, + "learning_rate": 9.948341680033205e-05, + "loss": 3.5711, + "step": 3598 + }, + { + "epoch": 2.3190975020145044, + "grad_norm": 1.6003968543488745, + "learning_rate": 9.948312770023862e-05, + "loss": 3.6534, + "step": 3599 + }, + { + "epoch": 2.3197421434327157, + "grad_norm": 1.755054842449183, + "learning_rate": 9.948283851969654e-05, + "loss": 3.622, + "step": 3600 + }, + { + "epoch": 2.3197421434327157, + "eval_loss": 4.070638656616211, + "eval_runtime": 2.983, + "eval_samples_per_second": 33.524, + "eval_steps_per_second": 4.358, + "step": 3600 + }, + { + "epoch": 2.3203867848509265, + "grad_norm": 1.599369627738356, + "learning_rate": 9.948254925870629e-05, + "loss": 3.8268, + "step": 3601 + }, + { + "epoch": 2.321031426269138, + "grad_norm": 1.552737147443064, + "learning_rate": 9.948225991726837e-05, + "loss": 3.9542, + "step": 3602 + }, + { + "epoch": 2.321676067687349, + "grad_norm": 1.6712054274682295, + "learning_rate": 9.948197049538319e-05, + "loss": 3.9257, + "step": 3603 + }, + { + "epoch": 2.32232070910556, + "grad_norm": 1.6510400023490437, + "learning_rate": 9.94816809930513e-05, + "loss": 3.537, + "step": 3604 + }, + { + "epoch": 2.3229653505237713, + "grad_norm": 2.2392496244817055, + "learning_rate": 9.948139141027311e-05, + "loss": 3.4928, + "step": 3605 + }, + { + "epoch": 2.323609991941982, + "grad_norm": 1.380122956539465, + "learning_rate": 9.948110174704915e-05, + "loss": 3.7057, + "step": 3606 + }, + { + "epoch": 2.3242546333601934, + "grad_norm": 2.2032480413690463, + "learning_rate": 9.948081200337988e-05, + "loss": 3.3432, + "step": 3607 + }, + { + "epoch": 2.3248992747784047, + "grad_norm": 1.8403107514465997, + "learning_rate": 9.948052217926576e-05, + "loss": 3.9489, + "step": 3608 + }, + { + "epoch": 2.3255439161966156, + "grad_norm": 1.935158180276848, + "learning_rate": 9.948023227470727e-05, + "loss": 4.1597, + "step": 3609 + }, + { + "epoch": 2.326188557614827, + "grad_norm": 1.8886189768843529, + "learning_rate": 9.947994228970488e-05, + "loss": 3.4309, + "step": 3610 + }, + { + "epoch": 2.3268331990330378, + "grad_norm": 1.7500056140932452, + "learning_rate": 9.94796522242591e-05, + "loss": 3.7764, + "step": 3611 + }, + { + "epoch": 2.327477840451249, + "grad_norm": 1.9804248800858242, + "learning_rate": 9.947936207837038e-05, + "loss": 3.552, + "step": 3612 + }, + { + "epoch": 2.3281224818694604, + "grad_norm": 1.6498043506629898, + "learning_rate": 9.947907185203918e-05, + "loss": 3.8329, + "step": 3613 + }, + { + "epoch": 2.328767123287671, + "grad_norm": 1.7025666107932416, + "learning_rate": 9.947878154526602e-05, + "loss": 3.6864, + "step": 3614 + }, + { + "epoch": 2.3294117647058825, + "grad_norm": 1.6930357039467587, + "learning_rate": 9.947849115805134e-05, + "loss": 3.8225, + "step": 3615 + }, + { + "epoch": 2.3300564061240934, + "grad_norm": 1.4531123104432464, + "learning_rate": 9.947820069039564e-05, + "loss": 3.5944, + "step": 3616 + }, + { + "epoch": 2.3307010475423047, + "grad_norm": 1.6649319358816321, + "learning_rate": 9.947791014229939e-05, + "loss": 3.7862, + "step": 3617 + }, + { + "epoch": 2.3313456889605155, + "grad_norm": 1.555939937051078, + "learning_rate": 9.947761951376305e-05, + "loss": 3.6722, + "step": 3618 + }, + { + "epoch": 2.331990330378727, + "grad_norm": 2.0079238649805236, + "learning_rate": 9.947732880478712e-05, + "loss": 3.9844, + "step": 3619 + }, + { + "epoch": 2.332634971796938, + "grad_norm": 1.4189635531890965, + "learning_rate": 9.947703801537207e-05, + "loss": 4.0167, + "step": 3620 + }, + { + "epoch": 2.333279613215149, + "grad_norm": 2.207081787340689, + "learning_rate": 9.947674714551838e-05, + "loss": 3.5735, + "step": 3621 + }, + { + "epoch": 2.3339242546333603, + "grad_norm": 1.9112734556467532, + "learning_rate": 9.947645619522651e-05, + "loss": 4.0219, + "step": 3622 + }, + { + "epoch": 2.334568896051571, + "grad_norm": 1.7111111234499057, + "learning_rate": 9.947616516449698e-05, + "loss": 4.0045, + "step": 3623 + }, + { + "epoch": 2.3352135374697824, + "grad_norm": 1.530771566673289, + "learning_rate": 9.947587405333022e-05, + "loss": 3.7851, + "step": 3624 + }, + { + "epoch": 2.3358581788879937, + "grad_norm": 1.386420910962165, + "learning_rate": 9.947558286172674e-05, + "loss": 4.0679, + "step": 3625 + }, + { + "epoch": 2.3365028203062046, + "grad_norm": 1.356267917626287, + "learning_rate": 9.9475291589687e-05, + "loss": 3.9821, + "step": 3626 + }, + { + "epoch": 2.337147461724416, + "grad_norm": 1.5761401784557216, + "learning_rate": 9.947500023721149e-05, + "loss": 4.2764, + "step": 3627 + }, + { + "epoch": 2.3377921031426268, + "grad_norm": 1.3457412586995416, + "learning_rate": 9.947470880430068e-05, + "loss": 3.7157, + "step": 3628 + }, + { + "epoch": 2.338436744560838, + "grad_norm": 1.3098082153058217, + "learning_rate": 9.947441729095506e-05, + "loss": 4.0052, + "step": 3629 + }, + { + "epoch": 2.339081385979049, + "grad_norm": 1.5184679197218582, + "learning_rate": 9.94741256971751e-05, + "loss": 4.043, + "step": 3630 + }, + { + "epoch": 2.33972602739726, + "grad_norm": 1.335716593812083, + "learning_rate": 9.947383402296128e-05, + "loss": 4.074, + "step": 3631 + }, + { + "epoch": 2.3403706688154715, + "grad_norm": 1.1474698048513154, + "learning_rate": 9.947354226831407e-05, + "loss": 3.6418, + "step": 3632 + }, + { + "epoch": 2.3410153102336824, + "grad_norm": 1.6033805536298296, + "learning_rate": 9.947325043323397e-05, + "loss": 3.4912, + "step": 3633 + }, + { + "epoch": 2.3416599516518937, + "grad_norm": 1.446396869551523, + "learning_rate": 9.947295851772142e-05, + "loss": 3.4948, + "step": 3634 + }, + { + "epoch": 2.3423045930701045, + "grad_norm": 1.6137092198509075, + "learning_rate": 9.947266652177697e-05, + "loss": 3.6089, + "step": 3635 + }, + { + "epoch": 2.342949234488316, + "grad_norm": 1.7891673500870007, + "learning_rate": 9.947237444540104e-05, + "loss": 4.1591, + "step": 3636 + }, + { + "epoch": 2.343593875906527, + "grad_norm": 1.1162726334042905, + "learning_rate": 9.947208228859412e-05, + "loss": 3.9943, + "step": 3637 + }, + { + "epoch": 2.344238517324738, + "grad_norm": 1.7295279822537557, + "learning_rate": 9.947179005135669e-05, + "loss": 4.0752, + "step": 3638 + }, + { + "epoch": 2.3448831587429493, + "grad_norm": 1.4941611025572643, + "learning_rate": 9.947149773368926e-05, + "loss": 3.9652, + "step": 3639 + }, + { + "epoch": 2.34552780016116, + "grad_norm": 1.740570751888269, + "learning_rate": 9.947120533559226e-05, + "loss": 3.3922, + "step": 3640 + }, + { + "epoch": 2.3461724415793714, + "grad_norm": 2.195654990691485, + "learning_rate": 9.947091285706622e-05, + "loss": 4.2149, + "step": 3641 + }, + { + "epoch": 2.3468170829975827, + "grad_norm": 1.5678821359311992, + "learning_rate": 9.947062029811159e-05, + "loss": 3.7073, + "step": 3642 + }, + { + "epoch": 2.3474617244157936, + "grad_norm": 1.6774491788870818, + "learning_rate": 9.947032765872885e-05, + "loss": 3.7586, + "step": 3643 + }, + { + "epoch": 2.348106365834005, + "grad_norm": 1.4835545066186338, + "learning_rate": 9.947003493891849e-05, + "loss": 3.5515, + "step": 3644 + }, + { + "epoch": 2.3487510072522158, + "grad_norm": 1.5603791039073809, + "learning_rate": 9.9469742138681e-05, + "loss": 3.6798, + "step": 3645 + }, + { + "epoch": 2.349395648670427, + "grad_norm": 1.0921572953669174, + "learning_rate": 9.946944925801682e-05, + "loss": 3.8176, + "step": 3646 + }, + { + "epoch": 2.3500402900886384, + "grad_norm": 1.5144716110127179, + "learning_rate": 9.94691562969265e-05, + "loss": 3.9141, + "step": 3647 + }, + { + "epoch": 2.350684931506849, + "grad_norm": 1.3587985964463292, + "learning_rate": 9.946886325541046e-05, + "loss": 4.0865, + "step": 3648 + }, + { + "epoch": 2.3513295729250605, + "grad_norm": 1.5523513367886712, + "learning_rate": 9.946857013346922e-05, + "loss": 3.6072, + "step": 3649 + }, + { + "epoch": 2.3519742143432714, + "grad_norm": 1.5764701512073946, + "learning_rate": 9.946827693110322e-05, + "loss": 3.9468, + "step": 3650 + }, + { + "epoch": 2.3526188557614827, + "grad_norm": 1.6294646735316796, + "learning_rate": 9.946798364831298e-05, + "loss": 3.6545, + "step": 3651 + }, + { + "epoch": 2.353263497179694, + "grad_norm": 1.8385053832684384, + "learning_rate": 9.946769028509897e-05, + "loss": 4.1724, + "step": 3652 + }, + { + "epoch": 2.353908138597905, + "grad_norm": 2.1624709170041636, + "learning_rate": 9.946739684146166e-05, + "loss": 4.0117, + "step": 3653 + }, + { + "epoch": 2.354552780016116, + "grad_norm": 1.4173407093644743, + "learning_rate": 9.946710331740155e-05, + "loss": 3.9757, + "step": 3654 + }, + { + "epoch": 2.355197421434327, + "grad_norm": 1.5436936337285476, + "learning_rate": 9.946680971291911e-05, + "loss": 3.8398, + "step": 3655 + }, + { + "epoch": 2.3558420628525383, + "grad_norm": 1.6069878231547834, + "learning_rate": 9.946651602801483e-05, + "loss": 3.8148, + "step": 3656 + }, + { + "epoch": 2.3564867042707496, + "grad_norm": 1.3856062552325918, + "learning_rate": 9.946622226268918e-05, + "loss": 3.7425, + "step": 3657 + }, + { + "epoch": 2.3571313456889604, + "grad_norm": 1.7309791428199774, + "learning_rate": 9.946592841694266e-05, + "loss": 3.8996, + "step": 3658 + }, + { + "epoch": 2.3577759871071717, + "grad_norm": 1.6345791344316598, + "learning_rate": 9.946563449077573e-05, + "loss": 4.1981, + "step": 3659 + }, + { + "epoch": 2.3584206285253826, + "grad_norm": 1.364366945322773, + "learning_rate": 9.946534048418891e-05, + "loss": 3.7156, + "step": 3660 + }, + { + "epoch": 2.359065269943594, + "grad_norm": 1.5401320793305766, + "learning_rate": 9.946504639718264e-05, + "loss": 3.8592, + "step": 3661 + }, + { + "epoch": 2.359709911361805, + "grad_norm": 1.6657115650393362, + "learning_rate": 9.946475222975743e-05, + "loss": 3.6808, + "step": 3662 + }, + { + "epoch": 2.360354552780016, + "grad_norm": 1.5324070758267494, + "learning_rate": 9.946445798191375e-05, + "loss": 3.8697, + "step": 3663 + }, + { + "epoch": 2.3609991941982273, + "grad_norm": 1.551416830592017, + "learning_rate": 9.946416365365208e-05, + "loss": 3.8889, + "step": 3664 + }, + { + "epoch": 2.361643835616438, + "grad_norm": 1.2199142757776222, + "learning_rate": 9.946386924497292e-05, + "loss": 3.4898, + "step": 3665 + }, + { + "epoch": 2.3622884770346495, + "grad_norm": 1.6777023163707536, + "learning_rate": 9.946357475587676e-05, + "loss": 3.8058, + "step": 3666 + }, + { + "epoch": 2.362933118452861, + "grad_norm": 1.8444079282935566, + "learning_rate": 9.946328018636405e-05, + "loss": 4.0721, + "step": 3667 + }, + { + "epoch": 2.3635777598710717, + "grad_norm": 1.3663804305900613, + "learning_rate": 9.94629855364353e-05, + "loss": 4.1433, + "step": 3668 + }, + { + "epoch": 2.364222401289283, + "grad_norm": 1.6313396080452853, + "learning_rate": 9.946269080609099e-05, + "loss": 4.0077, + "step": 3669 + }, + { + "epoch": 2.364867042707494, + "grad_norm": 1.6833822136194592, + "learning_rate": 9.94623959953316e-05, + "loss": 3.5339, + "step": 3670 + }, + { + "epoch": 2.365511684125705, + "grad_norm": 1.6267258273318153, + "learning_rate": 9.94621011041576e-05, + "loss": 4.1058, + "step": 3671 + }, + { + "epoch": 2.3661563255439164, + "grad_norm": 1.7026831252813646, + "learning_rate": 9.946180613256951e-05, + "loss": 3.9554, + "step": 3672 + }, + { + "epoch": 2.3668009669621273, + "grad_norm": 1.8236713592182265, + "learning_rate": 9.946151108056779e-05, + "loss": 3.7146, + "step": 3673 + }, + { + "epoch": 2.3674456083803386, + "grad_norm": 1.8370966647478124, + "learning_rate": 9.94612159481529e-05, + "loss": 3.8497, + "step": 3674 + }, + { + "epoch": 2.3680902497985494, + "grad_norm": 1.6030699713880134, + "learning_rate": 9.946092073532538e-05, + "loss": 3.7757, + "step": 3675 + }, + { + "epoch": 2.3687348912167607, + "grad_norm": 1.7644447409830921, + "learning_rate": 9.94606254420857e-05, + "loss": 3.6597, + "step": 3676 + }, + { + "epoch": 2.369379532634972, + "grad_norm": 1.7556107401544128, + "learning_rate": 9.946033006843431e-05, + "loss": 3.7832, + "step": 3677 + }, + { + "epoch": 2.370024174053183, + "grad_norm": 1.4090568739681761, + "learning_rate": 9.946003461437174e-05, + "loss": 3.8583, + "step": 3678 + }, + { + "epoch": 2.370668815471394, + "grad_norm": 1.4639059763526605, + "learning_rate": 9.945973907989844e-05, + "loss": 3.2677, + "step": 3679 + }, + { + "epoch": 2.371313456889605, + "grad_norm": 1.7088388551952203, + "learning_rate": 9.94594434650149e-05, + "loss": 3.4956, + "step": 3680 + }, + { + "epoch": 2.3719580983078163, + "grad_norm": 1.8867468810315466, + "learning_rate": 9.945914776972163e-05, + "loss": 3.7192, + "step": 3681 + }, + { + "epoch": 2.3726027397260276, + "grad_norm": 2.130205522670558, + "learning_rate": 9.945885199401907e-05, + "loss": 3.6001, + "step": 3682 + }, + { + "epoch": 2.3732473811442385, + "grad_norm": 1.5315239594161953, + "learning_rate": 9.945855613790775e-05, + "loss": 4.0636, + "step": 3683 + }, + { + "epoch": 2.37389202256245, + "grad_norm": 1.8969966433258167, + "learning_rate": 9.945826020138815e-05, + "loss": 3.6403, + "step": 3684 + }, + { + "epoch": 2.3745366639806607, + "grad_norm": 1.8168846569232542, + "learning_rate": 9.945796418446077e-05, + "loss": 3.9815, + "step": 3685 + }, + { + "epoch": 2.375181305398872, + "grad_norm": 1.6897071117541707, + "learning_rate": 9.945766808712603e-05, + "loss": 3.453, + "step": 3686 + }, + { + "epoch": 2.375825946817083, + "grad_norm": 1.6165516244647524, + "learning_rate": 9.945737190938448e-05, + "loss": 3.6568, + "step": 3687 + }, + { + "epoch": 2.376470588235294, + "grad_norm": 1.683976177795054, + "learning_rate": 9.945707565123658e-05, + "loss": 3.9343, + "step": 3688 + }, + { + "epoch": 2.3771152296535054, + "grad_norm": 1.5900838929883803, + "learning_rate": 9.945677931268283e-05, + "loss": 3.4281, + "step": 3689 + }, + { + "epoch": 2.3777598710717163, + "grad_norm": 1.600339993790976, + "learning_rate": 9.945648289372371e-05, + "loss": 3.5449, + "step": 3690 + }, + { + "epoch": 2.3784045124899276, + "grad_norm": 1.8607269271313873, + "learning_rate": 9.945618639435971e-05, + "loss": 3.9997, + "step": 3691 + }, + { + "epoch": 2.3790491539081384, + "grad_norm": 2.16983652060475, + "learning_rate": 9.94558898145913e-05, + "loss": 3.8453, + "step": 3692 + }, + { + "epoch": 2.3796937953263497, + "grad_norm": 1.798663998222398, + "learning_rate": 9.945559315441899e-05, + "loss": 3.9449, + "step": 3693 + }, + { + "epoch": 2.380338436744561, + "grad_norm": 2.030599134501775, + "learning_rate": 9.945529641384326e-05, + "loss": 3.7957, + "step": 3694 + }, + { + "epoch": 2.380983078162772, + "grad_norm": 1.9560308511382207, + "learning_rate": 9.945499959286459e-05, + "loss": 4.1087, + "step": 3695 + }, + { + "epoch": 2.381627719580983, + "grad_norm": 1.790597481823151, + "learning_rate": 9.945470269148346e-05, + "loss": 3.9442, + "step": 3696 + }, + { + "epoch": 2.382272360999194, + "grad_norm": 2.186767673394763, + "learning_rate": 9.945440570970039e-05, + "loss": 4.0739, + "step": 3697 + }, + { + "epoch": 2.3829170024174053, + "grad_norm": 1.8835199668088873, + "learning_rate": 9.945410864751584e-05, + "loss": 3.78, + "step": 3698 + }, + { + "epoch": 2.383561643835616, + "grad_norm": 1.5618209366429938, + "learning_rate": 9.945381150493032e-05, + "loss": 3.9357, + "step": 3699 + }, + { + "epoch": 2.3842062852538275, + "grad_norm": 1.5678611870047463, + "learning_rate": 9.945351428194429e-05, + "loss": 3.5816, + "step": 3700 + }, + { + "epoch": 2.3842062852538275, + "eval_loss": 4.0675530433654785, + "eval_runtime": 2.9786, + "eval_samples_per_second": 33.573, + "eval_steps_per_second": 4.365, + "step": 3700 + }, + { + "epoch": 2.384850926672039, + "grad_norm": 2.0321181142778113, + "learning_rate": 9.945321697855825e-05, + "loss": 3.7637, + "step": 3701 + }, + { + "epoch": 2.3854955680902497, + "grad_norm": 1.6444174707513817, + "learning_rate": 9.94529195947727e-05, + "loss": 3.6918, + "step": 3702 + }, + { + "epoch": 2.386140209508461, + "grad_norm": 1.5798594041950351, + "learning_rate": 9.94526221305881e-05, + "loss": 3.9413, + "step": 3703 + }, + { + "epoch": 2.386784850926672, + "grad_norm": 1.97491131634022, + "learning_rate": 9.945232458600497e-05, + "loss": 3.4496, + "step": 3704 + }, + { + "epoch": 2.387429492344883, + "grad_norm": 1.8740184213134325, + "learning_rate": 9.94520269610238e-05, + "loss": 3.871, + "step": 3705 + }, + { + "epoch": 2.3880741337630944, + "grad_norm": 2.004550086409456, + "learning_rate": 9.945172925564504e-05, + "loss": 3.8786, + "step": 3706 + }, + { + "epoch": 2.3887187751813053, + "grad_norm": 1.7145609908247894, + "learning_rate": 9.945143146986923e-05, + "loss": 4.0527, + "step": 3707 + }, + { + "epoch": 2.3893634165995166, + "grad_norm": 1.4503642120713962, + "learning_rate": 9.94511336036968e-05, + "loss": 3.6119, + "step": 3708 + }, + { + "epoch": 2.3900080580177274, + "grad_norm": 2.1638057885255444, + "learning_rate": 9.94508356571283e-05, + "loss": 3.865, + "step": 3709 + }, + { + "epoch": 2.3906526994359387, + "grad_norm": 1.5358866032548764, + "learning_rate": 9.945053763016417e-05, + "loss": 3.8067, + "step": 3710 + }, + { + "epoch": 2.39129734085415, + "grad_norm": 1.811108081753932, + "learning_rate": 9.945023952280494e-05, + "loss": 3.832, + "step": 3711 + }, + { + "epoch": 2.391941982272361, + "grad_norm": 1.775338551600745, + "learning_rate": 9.944994133505106e-05, + "loss": 3.583, + "step": 3712 + }, + { + "epoch": 2.392586623690572, + "grad_norm": 1.4867257901623128, + "learning_rate": 9.944964306690306e-05, + "loss": 3.8411, + "step": 3713 + }, + { + "epoch": 2.393231265108783, + "grad_norm": 1.889155721232517, + "learning_rate": 9.94493447183614e-05, + "loss": 3.8977, + "step": 3714 + }, + { + "epoch": 2.3938759065269943, + "grad_norm": 1.6751492032805324, + "learning_rate": 9.944904628942659e-05, + "loss": 3.8453, + "step": 3715 + }, + { + "epoch": 2.3945205479452056, + "grad_norm": 2.1276716201943713, + "learning_rate": 9.94487477800991e-05, + "loss": 3.6844, + "step": 3716 + }, + { + "epoch": 2.3951651893634165, + "grad_norm": 1.3664389047329777, + "learning_rate": 9.944844919037942e-05, + "loss": 4.1317, + "step": 3717 + }, + { + "epoch": 2.395809830781628, + "grad_norm": 2.0511011367308463, + "learning_rate": 9.944815052026808e-05, + "loss": 3.8014, + "step": 3718 + }, + { + "epoch": 2.3964544721998386, + "grad_norm": 1.615514859717097, + "learning_rate": 9.94478517697655e-05, + "loss": 3.9709, + "step": 3719 + }, + { + "epoch": 2.39709911361805, + "grad_norm": 1.808377559071116, + "learning_rate": 9.944755293887226e-05, + "loss": 4.1228, + "step": 3720 + }, + { + "epoch": 2.3977437550362612, + "grad_norm": 1.5622004098502047, + "learning_rate": 9.944725402758877e-05, + "loss": 3.6484, + "step": 3721 + }, + { + "epoch": 2.398388396454472, + "grad_norm": 2.147211058493944, + "learning_rate": 9.944695503591558e-05, + "loss": 4.1447, + "step": 3722 + }, + { + "epoch": 2.3990330378726834, + "grad_norm": 1.4029112608485639, + "learning_rate": 9.944665596385312e-05, + "loss": 4.1122, + "step": 3723 + }, + { + "epoch": 2.3996776792908943, + "grad_norm": 2.128406349369414, + "learning_rate": 9.944635681140194e-05, + "loss": 3.6793, + "step": 3724 + }, + { + "epoch": 2.4003223207091056, + "grad_norm": 2.0594594056060633, + "learning_rate": 9.94460575785625e-05, + "loss": 3.9373, + "step": 3725 + }, + { + "epoch": 2.400966962127317, + "grad_norm": 1.2516361672110765, + "learning_rate": 9.944575826533531e-05, + "loss": 3.6025, + "step": 3726 + }, + { + "epoch": 2.4016116035455277, + "grad_norm": 1.97397906406501, + "learning_rate": 9.944545887172083e-05, + "loss": 3.8935, + "step": 3727 + }, + { + "epoch": 2.402256244963739, + "grad_norm": 1.1079195797550587, + "learning_rate": 9.944515939771959e-05, + "loss": 3.5867, + "step": 3728 + }, + { + "epoch": 2.40290088638195, + "grad_norm": 2.125876234553681, + "learning_rate": 9.944485984333205e-05, + "loss": 4.1118, + "step": 3729 + }, + { + "epoch": 2.403545527800161, + "grad_norm": 1.5579413234864332, + "learning_rate": 9.944456020855872e-05, + "loss": 4.1063, + "step": 3730 + }, + { + "epoch": 2.4041901692183725, + "grad_norm": 1.7650599879501243, + "learning_rate": 9.94442604934001e-05, + "loss": 3.9683, + "step": 3731 + }, + { + "epoch": 2.4048348106365833, + "grad_norm": 1.762191831934799, + "learning_rate": 9.944396069785666e-05, + "loss": 4.0206, + "step": 3732 + }, + { + "epoch": 2.4054794520547946, + "grad_norm": 2.2326308626162823, + "learning_rate": 9.944366082192889e-05, + "loss": 3.5633, + "step": 3733 + }, + { + "epoch": 2.4061240934730055, + "grad_norm": 2.030427995666133, + "learning_rate": 9.944336086561731e-05, + "loss": 4.0577, + "step": 3734 + }, + { + "epoch": 2.406768734891217, + "grad_norm": 1.487724105064622, + "learning_rate": 9.94430608289224e-05, + "loss": 3.5028, + "step": 3735 + }, + { + "epoch": 2.407413376309428, + "grad_norm": 1.7383527640245904, + "learning_rate": 9.944276071184465e-05, + "loss": 3.9124, + "step": 3736 + }, + { + "epoch": 2.408058017727639, + "grad_norm": 1.8062452269821274, + "learning_rate": 9.944246051438455e-05, + "loss": 3.9527, + "step": 3737 + }, + { + "epoch": 2.4087026591458502, + "grad_norm": 2.341711987592862, + "learning_rate": 9.94421602365426e-05, + "loss": 3.7878, + "step": 3738 + }, + { + "epoch": 2.409347300564061, + "grad_norm": 1.5064742473478616, + "learning_rate": 9.944185987831928e-05, + "loss": 3.5875, + "step": 3739 + }, + { + "epoch": 2.4099919419822724, + "grad_norm": 2.1551105768358165, + "learning_rate": 9.94415594397151e-05, + "loss": 4.3452, + "step": 3740 + }, + { + "epoch": 2.4106365834004837, + "grad_norm": 1.2228670668741506, + "learning_rate": 9.944125892073054e-05, + "loss": 3.8509, + "step": 3741 + }, + { + "epoch": 2.4112812248186946, + "grad_norm": 2.075823980492308, + "learning_rate": 9.94409583213661e-05, + "loss": 3.8321, + "step": 3742 + }, + { + "epoch": 2.411925866236906, + "grad_norm": 1.5744426639084246, + "learning_rate": 9.944065764162231e-05, + "loss": 3.6757, + "step": 3743 + }, + { + "epoch": 2.4125705076551167, + "grad_norm": 1.7412326368572406, + "learning_rate": 9.944035688149957e-05, + "loss": 4.1539, + "step": 3744 + }, + { + "epoch": 2.413215149073328, + "grad_norm": 1.9525013374747928, + "learning_rate": 9.944005604099846e-05, + "loss": 4.0909, + "step": 3745 + }, + { + "epoch": 2.4138597904915393, + "grad_norm": 1.4946867363010143, + "learning_rate": 9.943975512011944e-05, + "loss": 3.8299, + "step": 3746 + }, + { + "epoch": 2.41450443190975, + "grad_norm": 1.752498650825556, + "learning_rate": 9.943945411886303e-05, + "loss": 3.8947, + "step": 3747 + }, + { + "epoch": 2.4151490733279615, + "grad_norm": 1.5679299716627357, + "learning_rate": 9.943915303722968e-05, + "loss": 3.4733, + "step": 3748 + }, + { + "epoch": 2.4157937147461723, + "grad_norm": 1.5838370404470867, + "learning_rate": 9.943885187521992e-05, + "loss": 3.6809, + "step": 3749 + }, + { + "epoch": 2.4164383561643836, + "grad_norm": 1.3291390235991092, + "learning_rate": 9.943855063283423e-05, + "loss": 3.4271, + "step": 3750 + }, + { + "epoch": 2.417082997582595, + "grad_norm": 1.6846036159981979, + "learning_rate": 9.943824931007311e-05, + "loss": 3.691, + "step": 3751 + }, + { + "epoch": 2.417727639000806, + "grad_norm": 2.1934455993557984, + "learning_rate": 9.943794790693704e-05, + "loss": 3.8019, + "step": 3752 + }, + { + "epoch": 2.418372280419017, + "grad_norm": 1.3920065673478939, + "learning_rate": 9.943764642342655e-05, + "loss": 3.6626, + "step": 3753 + }, + { + "epoch": 2.419016921837228, + "grad_norm": 1.7981684067238513, + "learning_rate": 9.943734485954208e-05, + "loss": 3.7204, + "step": 3754 + }, + { + "epoch": 2.4196615632554392, + "grad_norm": 1.21334651229931, + "learning_rate": 9.943704321528419e-05, + "loss": 3.873, + "step": 3755 + }, + { + "epoch": 2.42030620467365, + "grad_norm": 1.3924242603371368, + "learning_rate": 9.943674149065333e-05, + "loss": 3.7086, + "step": 3756 + }, + { + "epoch": 2.4209508460918614, + "grad_norm": 1.3141431907655452, + "learning_rate": 9.943643968565002e-05, + "loss": 3.6771, + "step": 3757 + }, + { + "epoch": 2.4215954875100727, + "grad_norm": 1.4323669464370676, + "learning_rate": 9.943613780027473e-05, + "loss": 3.8662, + "step": 3758 + }, + { + "epoch": 2.4222401289282836, + "grad_norm": 1.248152022083437, + "learning_rate": 9.943583583452797e-05, + "loss": 3.7792, + "step": 3759 + }, + { + "epoch": 2.422884770346495, + "grad_norm": 1.4973738513566266, + "learning_rate": 9.943553378841025e-05, + "loss": 3.5724, + "step": 3760 + }, + { + "epoch": 2.4235294117647057, + "grad_norm": 1.6360813663037428, + "learning_rate": 9.943523166192205e-05, + "loss": 3.8993, + "step": 3761 + }, + { + "epoch": 2.424174053182917, + "grad_norm": 1.3028955921997345, + "learning_rate": 9.943492945506387e-05, + "loss": 3.8071, + "step": 3762 + }, + { + "epoch": 2.4248186946011283, + "grad_norm": 1.7273898801652856, + "learning_rate": 9.94346271678362e-05, + "loss": 3.9312, + "step": 3763 + }, + { + "epoch": 2.425463336019339, + "grad_norm": 1.77829932119385, + "learning_rate": 9.943432480023953e-05, + "loss": 3.4698, + "step": 3764 + }, + { + "epoch": 2.4261079774375505, + "grad_norm": 1.4381257620470678, + "learning_rate": 9.943402235227438e-05, + "loss": 3.9855, + "step": 3765 + }, + { + "epoch": 2.4267526188557613, + "grad_norm": 1.583244634745896, + "learning_rate": 9.943371982394127e-05, + "loss": 4.0804, + "step": 3766 + }, + { + "epoch": 2.4273972602739726, + "grad_norm": 1.925007401384334, + "learning_rate": 9.943341721524062e-05, + "loss": 4.0733, + "step": 3767 + }, + { + "epoch": 2.4280419016921835, + "grad_norm": 1.2745973667872554, + "learning_rate": 9.943311452617296e-05, + "loss": 3.4065, + "step": 3768 + }, + { + "epoch": 2.428686543110395, + "grad_norm": 1.8381589361935085, + "learning_rate": 9.94328117567388e-05, + "loss": 3.6713, + "step": 3769 + }, + { + "epoch": 2.429331184528606, + "grad_norm": 1.790519362915826, + "learning_rate": 9.943250890693866e-05, + "loss": 3.7067, + "step": 3770 + }, + { + "epoch": 2.429975825946817, + "grad_norm": 1.3902094968623255, + "learning_rate": 9.943220597677301e-05, + "loss": 4.1929, + "step": 3771 + }, + { + "epoch": 2.4306204673650282, + "grad_norm": 1.4428325906291883, + "learning_rate": 9.943190296624234e-05, + "loss": 3.8136, + "step": 3772 + }, + { + "epoch": 2.431265108783239, + "grad_norm": 1.4785595133691372, + "learning_rate": 9.943159987534714e-05, + "loss": 3.6327, + "step": 3773 + }, + { + "epoch": 2.4319097502014504, + "grad_norm": 1.7726466357043154, + "learning_rate": 9.943129670408794e-05, + "loss": 3.6656, + "step": 3774 + }, + { + "epoch": 2.4325543916196617, + "grad_norm": 1.4492699620580716, + "learning_rate": 9.943099345246522e-05, + "loss": 3.8885, + "step": 3775 + }, + { + "epoch": 2.4331990330378725, + "grad_norm": 1.5362757542410572, + "learning_rate": 9.943069012047947e-05, + "loss": 3.9253, + "step": 3776 + }, + { + "epoch": 2.433843674456084, + "grad_norm": 1.5974888708545623, + "learning_rate": 9.94303867081312e-05, + "loss": 3.8418, + "step": 3777 + }, + { + "epoch": 2.4344883158742947, + "grad_norm": 1.1312744912120893, + "learning_rate": 9.943008321542092e-05, + "loss": 3.9356, + "step": 3778 + }, + { + "epoch": 2.435132957292506, + "grad_norm": 1.910902393702198, + "learning_rate": 9.942977964234909e-05, + "loss": 3.9494, + "step": 3779 + }, + { + "epoch": 2.4357775987107173, + "grad_norm": 1.8071819326409224, + "learning_rate": 9.942947598891628e-05, + "loss": 3.9448, + "step": 3780 + }, + { + "epoch": 2.436422240128928, + "grad_norm": 1.4837675273974966, + "learning_rate": 9.94291722551229e-05, + "loss": 3.8338, + "step": 3781 + }, + { + "epoch": 2.4370668815471395, + "grad_norm": 1.6685342254345181, + "learning_rate": 9.94288684409695e-05, + "loss": 3.6522, + "step": 3782 + }, + { + "epoch": 2.4377115229653503, + "grad_norm": 1.4042270894126474, + "learning_rate": 9.942856454645658e-05, + "loss": 3.6625, + "step": 3783 + }, + { + "epoch": 2.4383561643835616, + "grad_norm": 2.1322879733214006, + "learning_rate": 9.942826057158461e-05, + "loss": 3.7172, + "step": 3784 + }, + { + "epoch": 2.439000805801773, + "grad_norm": 1.6515369456283864, + "learning_rate": 9.942795651635411e-05, + "loss": 3.8106, + "step": 3785 + }, + { + "epoch": 2.4396454472199838, + "grad_norm": 1.579550245588504, + "learning_rate": 9.942765238076559e-05, + "loss": 3.7342, + "step": 3786 + }, + { + "epoch": 2.440290088638195, + "grad_norm": 1.8844969572395833, + "learning_rate": 9.942734816481953e-05, + "loss": 3.9785, + "step": 3787 + }, + { + "epoch": 2.440934730056406, + "grad_norm": 1.7578176784347723, + "learning_rate": 9.942704386851645e-05, + "loss": 3.9645, + "step": 3788 + }, + { + "epoch": 2.4415793714746172, + "grad_norm": 1.6853641033434514, + "learning_rate": 9.942673949185682e-05, + "loss": 3.8975, + "step": 3789 + }, + { + "epoch": 2.4422240128928285, + "grad_norm": 1.4413500462396662, + "learning_rate": 9.942643503484117e-05, + "loss": 4.1141, + "step": 3790 + }, + { + "epoch": 2.4428686543110394, + "grad_norm": 1.4717680171612528, + "learning_rate": 9.942613049746997e-05, + "loss": 3.9373, + "step": 3791 + }, + { + "epoch": 2.4435132957292507, + "grad_norm": 1.581543800293895, + "learning_rate": 9.942582587974375e-05, + "loss": 3.5361, + "step": 3792 + }, + { + "epoch": 2.4441579371474615, + "grad_norm": 1.4005588664678839, + "learning_rate": 9.942552118166299e-05, + "loss": 3.828, + "step": 3793 + }, + { + "epoch": 2.444802578565673, + "grad_norm": 1.859735551599531, + "learning_rate": 9.94252164032282e-05, + "loss": 3.835, + "step": 3794 + }, + { + "epoch": 2.445447219983884, + "grad_norm": 1.869480598121122, + "learning_rate": 9.942491154443987e-05, + "loss": 3.6026, + "step": 3795 + }, + { + "epoch": 2.446091861402095, + "grad_norm": 1.8816680195582853, + "learning_rate": 9.942460660529852e-05, + "loss": 3.9054, + "step": 3796 + }, + { + "epoch": 2.4467365028203063, + "grad_norm": 2.1382513360450894, + "learning_rate": 9.942430158580463e-05, + "loss": 3.5476, + "step": 3797 + }, + { + "epoch": 2.447381144238517, + "grad_norm": 1.9605103763567797, + "learning_rate": 9.942399648595872e-05, + "loss": 3.6849, + "step": 3798 + }, + { + "epoch": 2.4480257856567285, + "grad_norm": 1.4524737396187373, + "learning_rate": 9.942369130576128e-05, + "loss": 3.7043, + "step": 3799 + }, + { + "epoch": 2.4486704270749398, + "grad_norm": 1.7955565768835757, + "learning_rate": 9.94233860452128e-05, + "loss": 3.5167, + "step": 3800 + }, + { + "epoch": 2.4486704270749398, + "eval_loss": 4.0540385246276855, + "eval_runtime": 2.9704, + "eval_samples_per_second": 33.665, + "eval_steps_per_second": 4.376, + "step": 3800 + }, + { + "epoch": 2.4493150684931506, + "grad_norm": 2.0520717941933277, + "learning_rate": 9.94230807043138e-05, + "loss": 3.6715, + "step": 3801 + }, + { + "epoch": 2.449959709911362, + "grad_norm": 1.2312012349988366, + "learning_rate": 9.942277528306478e-05, + "loss": 3.724, + "step": 3802 + }, + { + "epoch": 2.4506043513295728, + "grad_norm": 2.2006913883883255, + "learning_rate": 9.942246978146625e-05, + "loss": 3.464, + "step": 3803 + }, + { + "epoch": 2.451248992747784, + "grad_norm": 1.7084262981913623, + "learning_rate": 9.942216419951867e-05, + "loss": 3.797, + "step": 3804 + }, + { + "epoch": 2.4518936341659954, + "grad_norm": 1.5206429661823295, + "learning_rate": 9.942185853722259e-05, + "loss": 3.7567, + "step": 3805 + }, + { + "epoch": 2.4525382755842062, + "grad_norm": 1.744287101912101, + "learning_rate": 9.942155279457849e-05, + "loss": 3.9103, + "step": 3806 + }, + { + "epoch": 2.4531829170024175, + "grad_norm": 1.6195408123017336, + "learning_rate": 9.942124697158687e-05, + "loss": 3.9102, + "step": 3807 + }, + { + "epoch": 2.4538275584206284, + "grad_norm": 2.6277174719830527, + "learning_rate": 9.942094106824822e-05, + "loss": 3.6579, + "step": 3808 + }, + { + "epoch": 2.4544721998388397, + "grad_norm": 2.0660295640205146, + "learning_rate": 9.94206350845631e-05, + "loss": 4.0141, + "step": 3809 + }, + { + "epoch": 2.455116841257051, + "grad_norm": 1.9835190629063575, + "learning_rate": 9.942032902053195e-05, + "loss": 4.0621, + "step": 3810 + }, + { + "epoch": 2.455761482675262, + "grad_norm": 2.2377457993609675, + "learning_rate": 9.94200228761553e-05, + "loss": 3.9425, + "step": 3811 + }, + { + "epoch": 2.456406124093473, + "grad_norm": 1.777273404990644, + "learning_rate": 9.941971665143362e-05, + "loss": 3.7257, + "step": 3812 + }, + { + "epoch": 2.457050765511684, + "grad_norm": 2.019938304687697, + "learning_rate": 9.941941034636747e-05, + "loss": 3.6147, + "step": 3813 + }, + { + "epoch": 2.4576954069298953, + "grad_norm": 2.3727845686013267, + "learning_rate": 9.941910396095732e-05, + "loss": 3.4968, + "step": 3814 + }, + { + "epoch": 2.4583400483481066, + "grad_norm": 2.570248902846058, + "learning_rate": 9.941879749520368e-05, + "loss": 3.8836, + "step": 3815 + }, + { + "epoch": 2.4589846897663175, + "grad_norm": 2.1686134061156266, + "learning_rate": 9.941849094910704e-05, + "loss": 3.5138, + "step": 3816 + }, + { + "epoch": 2.4596293311845288, + "grad_norm": 2.4353718150806345, + "learning_rate": 9.94181843226679e-05, + "loss": 3.6784, + "step": 3817 + }, + { + "epoch": 2.4602739726027396, + "grad_norm": 2.172274438682317, + "learning_rate": 9.94178776158868e-05, + "loss": 4.0034, + "step": 3818 + }, + { + "epoch": 2.460918614020951, + "grad_norm": 1.945190845986333, + "learning_rate": 9.941757082876422e-05, + "loss": 3.7419, + "step": 3819 + }, + { + "epoch": 2.461563255439162, + "grad_norm": 2.0395689334382983, + "learning_rate": 9.941726396130067e-05, + "loss": 3.6201, + "step": 3820 + }, + { + "epoch": 2.462207896857373, + "grad_norm": 1.585832671479238, + "learning_rate": 9.941695701349662e-05, + "loss": 3.8871, + "step": 3821 + }, + { + "epoch": 2.4628525382755844, + "grad_norm": 1.8909872553827505, + "learning_rate": 9.941664998535262e-05, + "loss": 4.0478, + "step": 3822 + }, + { + "epoch": 2.4634971796937952, + "grad_norm": 2.021168408513019, + "learning_rate": 9.941634287686916e-05, + "loss": 3.8956, + "step": 3823 + }, + { + "epoch": 2.4641418211120065, + "grad_norm": 1.6953534200054947, + "learning_rate": 9.941603568804674e-05, + "loss": 4.0152, + "step": 3824 + }, + { + "epoch": 2.4647864625302174, + "grad_norm": 1.640458410220231, + "learning_rate": 9.941572841888586e-05, + "loss": 3.9992, + "step": 3825 + }, + { + "epoch": 2.4654311039484287, + "grad_norm": 1.5703075138569642, + "learning_rate": 9.941542106938704e-05, + "loss": 3.8739, + "step": 3826 + }, + { + "epoch": 2.46607574536664, + "grad_norm": 1.857589846704698, + "learning_rate": 9.941511363955077e-05, + "loss": 3.4214, + "step": 3827 + }, + { + "epoch": 2.466720386784851, + "grad_norm": 2.1597380991309816, + "learning_rate": 9.941480612937756e-05, + "loss": 3.6457, + "step": 3828 + }, + { + "epoch": 2.467365028203062, + "grad_norm": 1.2705025282012437, + "learning_rate": 9.941449853886792e-05, + "loss": 4.0147, + "step": 3829 + }, + { + "epoch": 2.468009669621273, + "grad_norm": 1.976575807093291, + "learning_rate": 9.941419086802234e-05, + "loss": 3.5323, + "step": 3830 + }, + { + "epoch": 2.4686543110394843, + "grad_norm": 1.259075357459387, + "learning_rate": 9.941388311684134e-05, + "loss": 4.0884, + "step": 3831 + }, + { + "epoch": 2.4692989524576956, + "grad_norm": 1.608168425842585, + "learning_rate": 9.941357528532543e-05, + "loss": 3.7725, + "step": 3832 + }, + { + "epoch": 2.4699435938759065, + "grad_norm": 1.4334792009964297, + "learning_rate": 9.94132673734751e-05, + "loss": 3.788, + "step": 3833 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 1.6753432021867356, + "learning_rate": 9.941295938129087e-05, + "loss": 3.4805, + "step": 3834 + }, + { + "epoch": 2.4712328767123286, + "grad_norm": 1.6435494087268534, + "learning_rate": 9.941265130877322e-05, + "loss": 3.8991, + "step": 3835 + }, + { + "epoch": 2.47187751813054, + "grad_norm": 1.5760505535979301, + "learning_rate": 9.941234315592268e-05, + "loss": 3.5926, + "step": 3836 + }, + { + "epoch": 2.4725221595487508, + "grad_norm": 1.2694992296950522, + "learning_rate": 9.941203492273977e-05, + "loss": 3.8001, + "step": 3837 + }, + { + "epoch": 2.473166800966962, + "grad_norm": 1.2522101505746774, + "learning_rate": 9.941172660922496e-05, + "loss": 3.8373, + "step": 3838 + }, + { + "epoch": 2.4738114423851734, + "grad_norm": 1.0881242285439208, + "learning_rate": 9.941141821537878e-05, + "loss": 3.7529, + "step": 3839 + }, + { + "epoch": 2.474456083803384, + "grad_norm": 1.393207333550562, + "learning_rate": 9.941110974120172e-05, + "loss": 3.9578, + "step": 3840 + }, + { + "epoch": 2.4751007252215955, + "grad_norm": 1.3760594299164373, + "learning_rate": 9.941080118669431e-05, + "loss": 3.851, + "step": 3841 + }, + { + "epoch": 2.4757453666398064, + "grad_norm": 1.438634235503829, + "learning_rate": 9.941049255185703e-05, + "loss": 3.8146, + "step": 3842 + }, + { + "epoch": 2.4763900080580177, + "grad_norm": 2.2734025865600613, + "learning_rate": 9.941018383669042e-05, + "loss": 3.9202, + "step": 3843 + }, + { + "epoch": 2.477034649476229, + "grad_norm": 1.784721740540692, + "learning_rate": 9.940987504119495e-05, + "loss": 4.1336, + "step": 3844 + }, + { + "epoch": 2.47767929089444, + "grad_norm": 1.4851146966963513, + "learning_rate": 9.940956616537113e-05, + "loss": 4.079, + "step": 3845 + }, + { + "epoch": 2.478323932312651, + "grad_norm": 1.9224030341975205, + "learning_rate": 9.94092572092195e-05, + "loss": 3.9069, + "step": 3846 + }, + { + "epoch": 2.478968573730862, + "grad_norm": 1.3121682046676881, + "learning_rate": 9.940894817274055e-05, + "loss": 3.564, + "step": 3847 + }, + { + "epoch": 2.4796132151490733, + "grad_norm": 1.7163397138175758, + "learning_rate": 9.940863905593478e-05, + "loss": 3.8844, + "step": 3848 + }, + { + "epoch": 2.4802578565672846, + "grad_norm": 2.2990672520403383, + "learning_rate": 9.94083298588027e-05, + "loss": 3.9359, + "step": 3849 + }, + { + "epoch": 2.4809024979854954, + "grad_norm": 2.0862446050712666, + "learning_rate": 9.940802058134482e-05, + "loss": 3.7099, + "step": 3850 + }, + { + "epoch": 2.4815471394037067, + "grad_norm": 1.6798470188099057, + "learning_rate": 9.940771122356167e-05, + "loss": 3.4159, + "step": 3851 + }, + { + "epoch": 2.4821917808219176, + "grad_norm": 2.0053348364789763, + "learning_rate": 9.940740178545372e-05, + "loss": 3.9058, + "step": 3852 + }, + { + "epoch": 2.482836422240129, + "grad_norm": 1.448954504704405, + "learning_rate": 9.94070922670215e-05, + "loss": 3.8553, + "step": 3853 + }, + { + "epoch": 2.48348106365834, + "grad_norm": 1.7065342105357277, + "learning_rate": 9.940678266826551e-05, + "loss": 3.5933, + "step": 3854 + }, + { + "epoch": 2.484125705076551, + "grad_norm": 1.7801225189239906, + "learning_rate": 9.94064729891863e-05, + "loss": 3.6318, + "step": 3855 + }, + { + "epoch": 2.4847703464947624, + "grad_norm": 1.849596124026491, + "learning_rate": 9.940616322978429e-05, + "loss": 3.7836, + "step": 3856 + }, + { + "epoch": 2.485414987912973, + "grad_norm": 1.972808552827559, + "learning_rate": 9.940585339006005e-05, + "loss": 3.5499, + "step": 3857 + }, + { + "epoch": 2.4860596293311845, + "grad_norm": 2.0579790644190235, + "learning_rate": 9.940554347001411e-05, + "loss": 3.7454, + "step": 3858 + }, + { + "epoch": 2.486704270749396, + "grad_norm": 1.7074326438122036, + "learning_rate": 9.940523346964694e-05, + "loss": 4.2094, + "step": 3859 + }, + { + "epoch": 2.4873489121676067, + "grad_norm": 1.3082899759122613, + "learning_rate": 9.940492338895903e-05, + "loss": 3.966, + "step": 3860 + }, + { + "epoch": 2.487993553585818, + "grad_norm": 1.7889672008996371, + "learning_rate": 9.940461322795092e-05, + "loss": 4.1087, + "step": 3861 + }, + { + "epoch": 2.488638195004029, + "grad_norm": 1.4456702894210143, + "learning_rate": 9.940430298662314e-05, + "loss": 3.7277, + "step": 3862 + }, + { + "epoch": 2.48928283642224, + "grad_norm": 1.6323198292398018, + "learning_rate": 9.940399266497616e-05, + "loss": 3.6663, + "step": 3863 + }, + { + "epoch": 2.4899274778404514, + "grad_norm": 1.28745006654936, + "learning_rate": 9.940368226301052e-05, + "loss": 3.777, + "step": 3864 + }, + { + "epoch": 2.4905721192586623, + "grad_norm": 1.6046254099905273, + "learning_rate": 9.940337178072668e-05, + "loss": 3.6962, + "step": 3865 + }, + { + "epoch": 2.4912167606768736, + "grad_norm": 1.415593796301356, + "learning_rate": 9.940306121812521e-05, + "loss": 3.7098, + "step": 3866 + }, + { + "epoch": 2.4918614020950844, + "grad_norm": 1.395931558618503, + "learning_rate": 9.94027505752066e-05, + "loss": 3.8348, + "step": 3867 + }, + { + "epoch": 2.4925060435132957, + "grad_norm": 1.636979486321469, + "learning_rate": 9.940243985197134e-05, + "loss": 3.455, + "step": 3868 + }, + { + "epoch": 2.493150684931507, + "grad_norm": 1.230781395462349, + "learning_rate": 9.940212904841996e-05, + "loss": 3.9336, + "step": 3869 + }, + { + "epoch": 2.493795326349718, + "grad_norm": 1.6075445612911332, + "learning_rate": 9.940181816455297e-05, + "loss": 4.1837, + "step": 3870 + }, + { + "epoch": 2.494439967767929, + "grad_norm": 1.4142112082636327, + "learning_rate": 9.940150720037087e-05, + "loss": 3.8684, + "step": 3871 + }, + { + "epoch": 2.49508460918614, + "grad_norm": 1.3759352622793275, + "learning_rate": 9.940119615587418e-05, + "loss": 3.8082, + "step": 3872 + }, + { + "epoch": 2.4957292506043514, + "grad_norm": 1.747025952722025, + "learning_rate": 9.940088503106343e-05, + "loss": 4.0695, + "step": 3873 + }, + { + "epoch": 2.4963738920225627, + "grad_norm": 1.574213202570384, + "learning_rate": 9.940057382593908e-05, + "loss": 3.843, + "step": 3874 + }, + { + "epoch": 2.4970185334407735, + "grad_norm": 1.6973200967215574, + "learning_rate": 9.940026254050169e-05, + "loss": 3.9776, + "step": 3875 + }, + { + "epoch": 2.497663174858985, + "grad_norm": 2.030405563656934, + "learning_rate": 9.939995117475174e-05, + "loss": 3.673, + "step": 3876 + }, + { + "epoch": 2.4983078162771957, + "grad_norm": 1.279230061342313, + "learning_rate": 9.939963972868974e-05, + "loss": 4.0082, + "step": 3877 + }, + { + "epoch": 2.498952457695407, + "grad_norm": 1.467191259959321, + "learning_rate": 9.939932820231624e-05, + "loss": 4.0434, + "step": 3878 + }, + { + "epoch": 2.4995970991136183, + "grad_norm": 1.347080639458755, + "learning_rate": 9.939901659563172e-05, + "loss": 3.9038, + "step": 3879 + }, + { + "epoch": 2.500241740531829, + "grad_norm": 1.366599808548921, + "learning_rate": 9.939870490863668e-05, + "loss": 3.7723, + "step": 3880 + }, + { + "epoch": 2.5008863819500404, + "grad_norm": 1.3656463708661637, + "learning_rate": 9.939839314133166e-05, + "loss": 3.5427, + "step": 3881 + }, + { + "epoch": 2.5015310233682513, + "grad_norm": 1.1898202176139654, + "learning_rate": 9.939808129371716e-05, + "loss": 4.0163, + "step": 3882 + }, + { + "epoch": 2.5021756647864626, + "grad_norm": 2.0604005333195343, + "learning_rate": 9.93977693657937e-05, + "loss": 3.8196, + "step": 3883 + }, + { + "epoch": 2.502820306204674, + "grad_norm": 2.0138119580738167, + "learning_rate": 9.939745735756179e-05, + "loss": 3.7568, + "step": 3884 + }, + { + "epoch": 2.5034649476228847, + "grad_norm": 1.2525087174803518, + "learning_rate": 9.939714526902192e-05, + "loss": 3.7343, + "step": 3885 + }, + { + "epoch": 2.504109589041096, + "grad_norm": 1.9708528438641615, + "learning_rate": 9.939683310017463e-05, + "loss": 3.4556, + "step": 3886 + }, + { + "epoch": 2.504754230459307, + "grad_norm": 1.93816077476318, + "learning_rate": 9.939652085102043e-05, + "loss": 3.4936, + "step": 3887 + }, + { + "epoch": 2.505398871877518, + "grad_norm": 1.4612340610739154, + "learning_rate": 9.939620852155984e-05, + "loss": 4.0936, + "step": 3888 + }, + { + "epoch": 2.5060435132957295, + "grad_norm": 1.7870020986452968, + "learning_rate": 9.939589611179332e-05, + "loss": 3.85, + "step": 3889 + }, + { + "epoch": 2.5066881547139404, + "grad_norm": 1.6122307429343834, + "learning_rate": 9.939558362172145e-05, + "loss": 3.6084, + "step": 3890 + }, + { + "epoch": 2.5073327961321517, + "grad_norm": 1.6983673153452592, + "learning_rate": 9.93952710513447e-05, + "loss": 3.8109, + "step": 3891 + }, + { + "epoch": 2.5079774375503625, + "grad_norm": 1.5544763661437992, + "learning_rate": 9.939495840066362e-05, + "loss": 4.0598, + "step": 3892 + }, + { + "epoch": 2.508622078968574, + "grad_norm": 1.5761676910808216, + "learning_rate": 9.939464566967868e-05, + "loss": 3.6279, + "step": 3893 + }, + { + "epoch": 2.509266720386785, + "grad_norm": 1.471176682404029, + "learning_rate": 9.939433285839043e-05, + "loss": 4.0293, + "step": 3894 + }, + { + "epoch": 2.509911361804996, + "grad_norm": 1.9035967852273192, + "learning_rate": 9.939401996679938e-05, + "loss": 3.7563, + "step": 3895 + }, + { + "epoch": 2.510556003223207, + "grad_norm": 1.2720085491541342, + "learning_rate": 9.939370699490603e-05, + "loss": 3.6838, + "step": 3896 + }, + { + "epoch": 2.511200644641418, + "grad_norm": 1.6725477424104906, + "learning_rate": 9.939339394271088e-05, + "loss": 3.6793, + "step": 3897 + }, + { + "epoch": 2.5118452860596294, + "grad_norm": 1.9849700813090605, + "learning_rate": 9.939308081021448e-05, + "loss": 4.128, + "step": 3898 + }, + { + "epoch": 2.5124899274778407, + "grad_norm": 1.5400098903182844, + "learning_rate": 9.939276759741731e-05, + "loss": 3.729, + "step": 3899 + }, + { + "epoch": 2.5131345688960516, + "grad_norm": 2.3272372572943087, + "learning_rate": 9.939245430431991e-05, + "loss": 3.8571, + "step": 3900 + }, + { + "epoch": 2.5131345688960516, + "eval_loss": 4.04829216003418, + "eval_runtime": 2.9775, + "eval_samples_per_second": 33.585, + "eval_steps_per_second": 4.366, + "step": 3900 + }, + { + "epoch": 2.5137792103142624, + "grad_norm": 1.7261277889499518, + "learning_rate": 9.939214093092279e-05, + "loss": 3.7465, + "step": 3901 + }, + { + "epoch": 2.5144238517324737, + "grad_norm": 2.1123167285781563, + "learning_rate": 9.939182747722645e-05, + "loss": 3.7724, + "step": 3902 + }, + { + "epoch": 2.515068493150685, + "grad_norm": 1.8490701820982176, + "learning_rate": 9.939151394323144e-05, + "loss": 3.9615, + "step": 3903 + }, + { + "epoch": 2.515713134568896, + "grad_norm": 1.7620949323756157, + "learning_rate": 9.939120032893822e-05, + "loss": 4.1589, + "step": 3904 + }, + { + "epoch": 2.516357775987107, + "grad_norm": 1.8490265042497713, + "learning_rate": 9.939088663434734e-05, + "loss": 3.8816, + "step": 3905 + }, + { + "epoch": 2.517002417405318, + "grad_norm": 3.128397097255793, + "learning_rate": 9.939057285945933e-05, + "loss": 3.6174, + "step": 3906 + }, + { + "epoch": 2.5176470588235293, + "grad_norm": 1.3653511701947079, + "learning_rate": 9.939025900427466e-05, + "loss": 3.6532, + "step": 3907 + }, + { + "epoch": 2.5182917002417406, + "grad_norm": 1.8380089086434779, + "learning_rate": 9.938994506879387e-05, + "loss": 3.9972, + "step": 3908 + }, + { + "epoch": 2.5189363416599515, + "grad_norm": 1.3445241443481275, + "learning_rate": 9.93896310530175e-05, + "loss": 3.9307, + "step": 3909 + }, + { + "epoch": 2.519580983078163, + "grad_norm": 1.7060884826210687, + "learning_rate": 9.938931695694603e-05, + "loss": 3.6496, + "step": 3910 + }, + { + "epoch": 2.5202256244963737, + "grad_norm": 1.7187704315728718, + "learning_rate": 9.938900278058e-05, + "loss": 3.9093, + "step": 3911 + }, + { + "epoch": 2.520870265914585, + "grad_norm": 1.3291934706861837, + "learning_rate": 9.93886885239199e-05, + "loss": 3.6127, + "step": 3912 + }, + { + "epoch": 2.5215149073327963, + "grad_norm": 1.8203612051991578, + "learning_rate": 9.938837418696626e-05, + "loss": 3.7811, + "step": 3913 + }, + { + "epoch": 2.522159548751007, + "grad_norm": 1.4264548592704263, + "learning_rate": 9.93880597697196e-05, + "loss": 4.1511, + "step": 3914 + }, + { + "epoch": 2.5228041901692184, + "grad_norm": 1.935045025113345, + "learning_rate": 9.938774527218044e-05, + "loss": 3.9006, + "step": 3915 + }, + { + "epoch": 2.5234488315874293, + "grad_norm": 1.8165613546304982, + "learning_rate": 9.938743069434927e-05, + "loss": 3.8374, + "step": 3916 + }, + { + "epoch": 2.5240934730056406, + "grad_norm": 1.6424732752131739, + "learning_rate": 9.938711603622665e-05, + "loss": 3.7119, + "step": 3917 + }, + { + "epoch": 2.524738114423852, + "grad_norm": 1.6336957712893316, + "learning_rate": 9.938680129781304e-05, + "loss": 3.9022, + "step": 3918 + }, + { + "epoch": 2.5253827558420627, + "grad_norm": 1.4076336723082106, + "learning_rate": 9.938648647910902e-05, + "loss": 3.9595, + "step": 3919 + }, + { + "epoch": 2.526027397260274, + "grad_norm": 1.6901629678649308, + "learning_rate": 9.938617158011508e-05, + "loss": 3.9439, + "step": 3920 + }, + { + "epoch": 2.526672038678485, + "grad_norm": 1.3574682370111515, + "learning_rate": 9.938585660083171e-05, + "loss": 3.7563, + "step": 3921 + }, + { + "epoch": 2.527316680096696, + "grad_norm": 1.434032675195727, + "learning_rate": 9.938554154125947e-05, + "loss": 4.0899, + "step": 3922 + }, + { + "epoch": 2.5279613215149075, + "grad_norm": 1.5214844101242335, + "learning_rate": 9.938522640139885e-05, + "loss": 3.8894, + "step": 3923 + }, + { + "epoch": 2.5286059629331183, + "grad_norm": 1.582296930504612, + "learning_rate": 9.938491118125037e-05, + "loss": 3.8866, + "step": 3924 + }, + { + "epoch": 2.5292506043513296, + "grad_norm": 1.686426727583047, + "learning_rate": 9.938459588081456e-05, + "loss": 3.6753, + "step": 3925 + }, + { + "epoch": 2.5298952457695405, + "grad_norm": 1.482118096879272, + "learning_rate": 9.938428050009194e-05, + "loss": 4.0663, + "step": 3926 + }, + { + "epoch": 2.530539887187752, + "grad_norm": 1.670125596729244, + "learning_rate": 9.938396503908302e-05, + "loss": 3.5527, + "step": 3927 + }, + { + "epoch": 2.531184528605963, + "grad_norm": 2.1327882781431464, + "learning_rate": 9.93836494977883e-05, + "loss": 3.9844, + "step": 3928 + }, + { + "epoch": 2.531829170024174, + "grad_norm": 1.3835498020094026, + "learning_rate": 9.938333387620833e-05, + "loss": 3.9146, + "step": 3929 + }, + { + "epoch": 2.5324738114423853, + "grad_norm": 1.8362197251414605, + "learning_rate": 9.938301817434362e-05, + "loss": 3.7794, + "step": 3930 + }, + { + "epoch": 2.533118452860596, + "grad_norm": 1.7937447729187224, + "learning_rate": 9.938270239219466e-05, + "loss": 3.7639, + "step": 3931 + }, + { + "epoch": 2.5337630942788074, + "grad_norm": 1.3189452375633908, + "learning_rate": 9.9382386529762e-05, + "loss": 3.6076, + "step": 3932 + }, + { + "epoch": 2.5344077356970187, + "grad_norm": 2.4723473706238, + "learning_rate": 9.938207058704617e-05, + "loss": 4.0288, + "step": 3933 + }, + { + "epoch": 2.5350523771152296, + "grad_norm": 1.6639613702969833, + "learning_rate": 9.938175456404767e-05, + "loss": 3.591, + "step": 3934 + }, + { + "epoch": 2.535697018533441, + "grad_norm": 1.3949449113152679, + "learning_rate": 9.938143846076699e-05, + "loss": 4.0461, + "step": 3935 + }, + { + "epoch": 2.5363416599516517, + "grad_norm": 2.0173231913027085, + "learning_rate": 9.93811222772047e-05, + "loss": 3.8272, + "step": 3936 + }, + { + "epoch": 2.536986301369863, + "grad_norm": 1.7591695418309188, + "learning_rate": 9.938080601336128e-05, + "loss": 3.9465, + "step": 3937 + }, + { + "epoch": 2.5376309427880743, + "grad_norm": 2.1164942228741688, + "learning_rate": 9.938048966923727e-05, + "loss": 3.7336, + "step": 3938 + }, + { + "epoch": 2.538275584206285, + "grad_norm": 1.6054956716147653, + "learning_rate": 9.93801732448332e-05, + "loss": 3.7591, + "step": 3939 + }, + { + "epoch": 2.5389202256244965, + "grad_norm": 1.6598537420083033, + "learning_rate": 9.937985674014957e-05, + "loss": 4.1072, + "step": 3940 + }, + { + "epoch": 2.5395648670427073, + "grad_norm": 1.6896772519675975, + "learning_rate": 9.937954015518691e-05, + "loss": 3.5438, + "step": 3941 + }, + { + "epoch": 2.5402095084609186, + "grad_norm": 1.4483736274960888, + "learning_rate": 9.937922348994573e-05, + "loss": 3.6893, + "step": 3942 + }, + { + "epoch": 2.54085414987913, + "grad_norm": 2.4785414335174765, + "learning_rate": 9.937890674442654e-05, + "loss": 3.7884, + "step": 3943 + }, + { + "epoch": 2.541498791297341, + "grad_norm": 1.6864285732731894, + "learning_rate": 9.93785899186299e-05, + "loss": 3.3833, + "step": 3944 + }, + { + "epoch": 2.542143432715552, + "grad_norm": 2.3973971834063907, + "learning_rate": 9.93782730125563e-05, + "loss": 3.8205, + "step": 3945 + }, + { + "epoch": 2.542788074133763, + "grad_norm": 1.7882810623229501, + "learning_rate": 9.937795602620625e-05, + "loss": 3.7782, + "step": 3946 + }, + { + "epoch": 2.5434327155519743, + "grad_norm": 1.5550397129171734, + "learning_rate": 9.937763895958031e-05, + "loss": 3.9414, + "step": 3947 + }, + { + "epoch": 2.5440773569701856, + "grad_norm": 1.5981907350539395, + "learning_rate": 9.937732181267896e-05, + "loss": 3.7341, + "step": 3948 + }, + { + "epoch": 2.5447219983883964, + "grad_norm": 1.5573808909399875, + "learning_rate": 9.937700458550275e-05, + "loss": 3.9646, + "step": 3949 + }, + { + "epoch": 2.5453666398066077, + "grad_norm": 1.4018010659454618, + "learning_rate": 9.937668727805217e-05, + "loss": 4.0519, + "step": 3950 + }, + { + "epoch": 2.5460112812248186, + "grad_norm": 1.9837827232495169, + "learning_rate": 9.937636989032778e-05, + "loss": 4.0218, + "step": 3951 + }, + { + "epoch": 2.54665592264303, + "grad_norm": 1.5323159178135608, + "learning_rate": 9.937605242233007e-05, + "loss": 3.8, + "step": 3952 + }, + { + "epoch": 2.547300564061241, + "grad_norm": 1.6661081945286278, + "learning_rate": 9.937573487405958e-05, + "loss": 3.1445, + "step": 3953 + }, + { + "epoch": 2.547945205479452, + "grad_norm": 2.0145516900957325, + "learning_rate": 9.937541724551683e-05, + "loss": 4.1063, + "step": 3954 + }, + { + "epoch": 2.5485898468976633, + "grad_norm": 1.3280623080080114, + "learning_rate": 9.937509953670233e-05, + "loss": 3.7603, + "step": 3955 + }, + { + "epoch": 2.549234488315874, + "grad_norm": 2.681443806695645, + "learning_rate": 9.93747817476166e-05, + "loss": 3.4135, + "step": 3956 + }, + { + "epoch": 2.5498791297340855, + "grad_norm": 2.235539003973382, + "learning_rate": 9.937446387826019e-05, + "loss": 3.8219, + "step": 3957 + }, + { + "epoch": 2.550523771152297, + "grad_norm": 2.1785014133449474, + "learning_rate": 9.93741459286336e-05, + "loss": 3.978, + "step": 3958 + }, + { + "epoch": 2.5511684125705076, + "grad_norm": 1.7807399178681023, + "learning_rate": 9.937382789873734e-05, + "loss": 4.0294, + "step": 3959 + }, + { + "epoch": 2.551813053988719, + "grad_norm": 2.2328070124263126, + "learning_rate": 9.937350978857196e-05, + "loss": 3.7078, + "step": 3960 + }, + { + "epoch": 2.55245769540693, + "grad_norm": 1.8394440407295545, + "learning_rate": 9.937319159813796e-05, + "loss": 3.9493, + "step": 3961 + }, + { + "epoch": 2.553102336825141, + "grad_norm": 1.823982741493518, + "learning_rate": 9.937287332743589e-05, + "loss": 3.6022, + "step": 3962 + }, + { + "epoch": 2.5537469782433524, + "grad_norm": 2.1682437345534877, + "learning_rate": 9.937255497646624e-05, + "loss": 3.6841, + "step": 3963 + }, + { + "epoch": 2.5543916196615633, + "grad_norm": 1.4848002437543733, + "learning_rate": 9.937223654522954e-05, + "loss": 4.0673, + "step": 3964 + }, + { + "epoch": 2.555036261079774, + "grad_norm": 1.758685326540891, + "learning_rate": 9.937191803372633e-05, + "loss": 3.7444, + "step": 3965 + }, + { + "epoch": 2.5556809024979854, + "grad_norm": 1.4687934548294834, + "learning_rate": 9.937159944195713e-05, + "loss": 3.7943, + "step": 3966 + }, + { + "epoch": 2.5563255439161967, + "grad_norm": 1.8247629635803129, + "learning_rate": 9.937128076992244e-05, + "loss": 3.6224, + "step": 3967 + }, + { + "epoch": 2.556970185334408, + "grad_norm": 1.7063357823535625, + "learning_rate": 9.937096201762281e-05, + "loss": 3.9031, + "step": 3968 + }, + { + "epoch": 2.557614826752619, + "grad_norm": 1.478550922888572, + "learning_rate": 9.937064318505875e-05, + "loss": 3.8838, + "step": 3969 + }, + { + "epoch": 2.5582594681708297, + "grad_norm": 1.4100116212935305, + "learning_rate": 9.937032427223079e-05, + "loss": 3.8317, + "step": 3970 + }, + { + "epoch": 2.558904109589041, + "grad_norm": 1.4725246148865407, + "learning_rate": 9.937000527913945e-05, + "loss": 4.0121, + "step": 3971 + }, + { + "epoch": 2.5595487510072523, + "grad_norm": 1.2042609245311648, + "learning_rate": 9.936968620578526e-05, + "loss": 3.4461, + "step": 3972 + }, + { + "epoch": 2.560193392425463, + "grad_norm": 1.4155626434045947, + "learning_rate": 9.936936705216873e-05, + "loss": 4.0558, + "step": 3973 + }, + { + "epoch": 2.5608380338436745, + "grad_norm": 1.479262987686084, + "learning_rate": 9.936904781829041e-05, + "loss": 3.7286, + "step": 3974 + }, + { + "epoch": 2.5614826752618853, + "grad_norm": 1.6918580486438424, + "learning_rate": 9.93687285041508e-05, + "loss": 3.8367, + "step": 3975 + }, + { + "epoch": 2.5621273166800966, + "grad_norm": 1.6455708999042, + "learning_rate": 9.936840910975043e-05, + "loss": 3.7784, + "step": 3976 + }, + { + "epoch": 2.562771958098308, + "grad_norm": 1.4866491495531131, + "learning_rate": 9.936808963508982e-05, + "loss": 4.1138, + "step": 3977 + }, + { + "epoch": 2.563416599516519, + "grad_norm": 1.2739314443709588, + "learning_rate": 9.936777008016951e-05, + "loss": 3.7867, + "step": 3978 + }, + { + "epoch": 2.56406124093473, + "grad_norm": 1.4976836827888973, + "learning_rate": 9.936745044499002e-05, + "loss": 4.03, + "step": 3979 + }, + { + "epoch": 2.564705882352941, + "grad_norm": 1.9539479860784974, + "learning_rate": 9.936713072955186e-05, + "loss": 3.9066, + "step": 3980 + }, + { + "epoch": 2.5653505237711522, + "grad_norm": 1.5352256088455662, + "learning_rate": 9.936681093385557e-05, + "loss": 3.7836, + "step": 3981 + }, + { + "epoch": 2.5659951651893635, + "grad_norm": 1.7544852475486887, + "learning_rate": 9.936649105790168e-05, + "loss": 3.7729, + "step": 3982 + }, + { + "epoch": 2.5666398066075744, + "grad_norm": 1.8246578774675823, + "learning_rate": 9.936617110169069e-05, + "loss": 3.8366, + "step": 3983 + }, + { + "epoch": 2.5672844480257857, + "grad_norm": 1.5195182523848838, + "learning_rate": 9.936585106522315e-05, + "loss": 3.9924, + "step": 3984 + }, + { + "epoch": 2.5679290894439966, + "grad_norm": 1.5395074812864422, + "learning_rate": 9.936553094849958e-05, + "loss": 3.7102, + "step": 3985 + }, + { + "epoch": 2.568573730862208, + "grad_norm": 1.5482816147842935, + "learning_rate": 9.93652107515205e-05, + "loss": 3.8146, + "step": 3986 + }, + { + "epoch": 2.569218372280419, + "grad_norm": 1.3671601574723522, + "learning_rate": 9.936489047428645e-05, + "loss": 3.813, + "step": 3987 + }, + { + "epoch": 2.56986301369863, + "grad_norm": 1.5378005354808129, + "learning_rate": 9.936457011679793e-05, + "loss": 3.7581, + "step": 3988 + }, + { + "epoch": 2.5705076551168413, + "grad_norm": 1.781478715049122, + "learning_rate": 9.93642496790555e-05, + "loss": 3.7848, + "step": 3989 + }, + { + "epoch": 2.571152296535052, + "grad_norm": 2.015742091782912, + "learning_rate": 9.936392916105966e-05, + "loss": 3.8931, + "step": 3990 + }, + { + "epoch": 2.5717969379532635, + "grad_norm": 1.3147780494695451, + "learning_rate": 9.936360856281093e-05, + "loss": 3.7506, + "step": 3991 + }, + { + "epoch": 2.5724415793714748, + "grad_norm": 2.0766572913794468, + "learning_rate": 9.936328788430986e-05, + "loss": 3.9212, + "step": 3992 + }, + { + "epoch": 2.5730862207896856, + "grad_norm": 1.5655790957923068, + "learning_rate": 9.936296712555697e-05, + "loss": 3.7808, + "step": 3993 + }, + { + "epoch": 2.573730862207897, + "grad_norm": 2.0423496146224984, + "learning_rate": 9.936264628655279e-05, + "loss": 3.8792, + "step": 3994 + }, + { + "epoch": 2.574375503626108, + "grad_norm": 1.9461819853118005, + "learning_rate": 9.936232536729783e-05, + "loss": 3.847, + "step": 3995 + }, + { + "epoch": 2.575020145044319, + "grad_norm": 1.451246899622313, + "learning_rate": 9.936200436779264e-05, + "loss": 3.669, + "step": 3996 + }, + { + "epoch": 2.5756647864625304, + "grad_norm": 1.6925929648789346, + "learning_rate": 9.936168328803774e-05, + "loss": 3.9896, + "step": 3997 + }, + { + "epoch": 2.5763094278807412, + "grad_norm": 1.6494190720856143, + "learning_rate": 9.936136212803363e-05, + "loss": 4.0101, + "step": 3998 + }, + { + "epoch": 2.5769540692989525, + "grad_norm": 1.4491563433406933, + "learning_rate": 9.936104088778087e-05, + "loss": 3.8308, + "step": 3999 + }, + { + "epoch": 2.5775987107171634, + "grad_norm": 1.5396212175890407, + "learning_rate": 9.936071956727999e-05, + "loss": 3.77, + "step": 4000 + }, + { + "epoch": 2.5775987107171634, + "eval_loss": 4.062774181365967, + "eval_runtime": 2.9655, + "eval_samples_per_second": 33.721, + "eval_steps_per_second": 4.384, + "step": 4000 + }, + { + "epoch": 2.5782433521353747, + "grad_norm": 1.6662154889290166, + "learning_rate": 9.936039816653151e-05, + "loss": 3.7718, + "step": 4001 + }, + { + "epoch": 2.578887993553586, + "grad_norm": 1.9950105954150827, + "learning_rate": 9.936007668553593e-05, + "loss": 3.6582, + "step": 4002 + }, + { + "epoch": 2.579532634971797, + "grad_norm": 1.410421823777405, + "learning_rate": 9.935975512429382e-05, + "loss": 3.8288, + "step": 4003 + }, + { + "epoch": 2.580177276390008, + "grad_norm": 1.6973099255200812, + "learning_rate": 9.935943348280568e-05, + "loss": 3.8122, + "step": 4004 + }, + { + "epoch": 2.580821917808219, + "grad_norm": 1.7124700289130113, + "learning_rate": 9.935911176107204e-05, + "loss": 3.7658, + "step": 4005 + }, + { + "epoch": 2.5814665592264303, + "grad_norm": 1.3297487944035766, + "learning_rate": 9.935878995909344e-05, + "loss": 3.8559, + "step": 4006 + }, + { + "epoch": 2.5821112006446416, + "grad_norm": 1.733262941293602, + "learning_rate": 9.935846807687042e-05, + "loss": 3.9273, + "step": 4007 + }, + { + "epoch": 2.5827558420628525, + "grad_norm": 1.2720805288066495, + "learning_rate": 9.935814611440349e-05, + "loss": 3.6599, + "step": 4008 + }, + { + "epoch": 2.5834004834810638, + "grad_norm": 1.5404199988151994, + "learning_rate": 9.935782407169317e-05, + "loss": 3.7364, + "step": 4009 + }, + { + "epoch": 2.5840451248992746, + "grad_norm": 1.3056024311789909, + "learning_rate": 9.935750194874002e-05, + "loss": 3.6621, + "step": 4010 + }, + { + "epoch": 2.584689766317486, + "grad_norm": 1.5629939975360116, + "learning_rate": 9.935717974554453e-05, + "loss": 3.6104, + "step": 4011 + }, + { + "epoch": 2.5853344077356972, + "grad_norm": 1.8585305656817477, + "learning_rate": 9.935685746210726e-05, + "loss": 3.6466, + "step": 4012 + }, + { + "epoch": 2.585979049153908, + "grad_norm": 1.2691458010764551, + "learning_rate": 9.935653509842873e-05, + "loss": 3.8295, + "step": 4013 + }, + { + "epoch": 2.5866236905721194, + "grad_norm": 1.73920468303866, + "learning_rate": 9.935621265450946e-05, + "loss": 3.5748, + "step": 4014 + }, + { + "epoch": 2.5872683319903302, + "grad_norm": 1.4560981657040653, + "learning_rate": 9.935589013034998e-05, + "loss": 3.5614, + "step": 4015 + }, + { + "epoch": 2.5879129734085415, + "grad_norm": 1.4351862702746603, + "learning_rate": 9.935556752595085e-05, + "loss": 3.8471, + "step": 4016 + }, + { + "epoch": 2.588557614826753, + "grad_norm": 1.6786000315949083, + "learning_rate": 9.935524484131256e-05, + "loss": 3.808, + "step": 4017 + }, + { + "epoch": 2.5892022562449637, + "grad_norm": 1.3132988329314523, + "learning_rate": 9.935492207643567e-05, + "loss": 3.6979, + "step": 4018 + }, + { + "epoch": 2.589846897663175, + "grad_norm": 1.4858456653063432, + "learning_rate": 9.93545992313207e-05, + "loss": 3.7557, + "step": 4019 + }, + { + "epoch": 2.590491539081386, + "grad_norm": 1.682450002017209, + "learning_rate": 9.935427630596816e-05, + "loss": 3.8176, + "step": 4020 + }, + { + "epoch": 2.591136180499597, + "grad_norm": 1.411064331401859, + "learning_rate": 9.935395330037863e-05, + "loss": 3.5761, + "step": 4021 + }, + { + "epoch": 2.5917808219178085, + "grad_norm": 1.3977937072902034, + "learning_rate": 9.935363021455258e-05, + "loss": 4.0985, + "step": 4022 + }, + { + "epoch": 2.5924254633360193, + "grad_norm": 1.5200090852967985, + "learning_rate": 9.935330704849056e-05, + "loss": 3.8776, + "step": 4023 + }, + { + "epoch": 2.5930701047542306, + "grad_norm": 1.2871538751510365, + "learning_rate": 9.935298380219312e-05, + "loss": 3.4803, + "step": 4024 + }, + { + "epoch": 2.5937147461724415, + "grad_norm": 1.418130053372887, + "learning_rate": 9.935266047566078e-05, + "loss": 3.871, + "step": 4025 + }, + { + "epoch": 2.5943593875906528, + "grad_norm": 1.3731120512184498, + "learning_rate": 9.93523370688941e-05, + "loss": 4.135, + "step": 4026 + }, + { + "epoch": 2.595004029008864, + "grad_norm": 1.0859868645971973, + "learning_rate": 9.935201358189355e-05, + "loss": 3.8527, + "step": 4027 + }, + { + "epoch": 2.595648670427075, + "grad_norm": 1.29838484445868, + "learning_rate": 9.93516900146597e-05, + "loss": 3.7934, + "step": 4028 + }, + { + "epoch": 2.5962933118452862, + "grad_norm": 1.2893309246307025, + "learning_rate": 9.935136636719308e-05, + "loss": 4.1219, + "step": 4029 + }, + { + "epoch": 2.596937953263497, + "grad_norm": 1.412808345422571, + "learning_rate": 9.935104263949422e-05, + "loss": 3.9459, + "step": 4030 + }, + { + "epoch": 2.5975825946817084, + "grad_norm": 1.576423644120092, + "learning_rate": 9.935071883156367e-05, + "loss": 4.1567, + "step": 4031 + }, + { + "epoch": 2.5982272360999197, + "grad_norm": 1.4274693879770626, + "learning_rate": 9.935039494340192e-05, + "loss": 3.8406, + "step": 4032 + }, + { + "epoch": 2.5988718775181305, + "grad_norm": 1.4663454013991681, + "learning_rate": 9.93500709750095e-05, + "loss": 3.6748, + "step": 4033 + }, + { + "epoch": 2.5995165189363414, + "grad_norm": 1.3781320826658336, + "learning_rate": 9.934974692638699e-05, + "loss": 3.8561, + "step": 4034 + }, + { + "epoch": 2.6001611603545527, + "grad_norm": 1.425250335904857, + "learning_rate": 9.934942279753488e-05, + "loss": 3.6983, + "step": 4035 + }, + { + "epoch": 2.600805801772764, + "grad_norm": 1.3379210040529015, + "learning_rate": 9.934909858845375e-05, + "loss": 4.0177, + "step": 4036 + }, + { + "epoch": 2.6014504431909753, + "grad_norm": 1.3010747882871112, + "learning_rate": 9.934877429914407e-05, + "loss": 3.6203, + "step": 4037 + }, + { + "epoch": 2.602095084609186, + "grad_norm": 2.0083385796543474, + "learning_rate": 9.934844992960643e-05, + "loss": 3.7716, + "step": 4038 + }, + { + "epoch": 2.602739726027397, + "grad_norm": 1.8977272359432338, + "learning_rate": 9.934812547984132e-05, + "loss": 3.9084, + "step": 4039 + }, + { + "epoch": 2.6033843674456083, + "grad_norm": 1.2944540828761457, + "learning_rate": 9.934780094984929e-05, + "loss": 3.4795, + "step": 4040 + }, + { + "epoch": 2.6040290088638196, + "grad_norm": 1.865210198450078, + "learning_rate": 9.934747633963088e-05, + "loss": 4.0811, + "step": 4041 + }, + { + "epoch": 2.6046736502820305, + "grad_norm": 1.1981830284003585, + "learning_rate": 9.93471516491866e-05, + "loss": 3.5212, + "step": 4042 + }, + { + "epoch": 2.6053182917002418, + "grad_norm": 2.068455706590111, + "learning_rate": 9.934682687851701e-05, + "loss": 3.8539, + "step": 4043 + }, + { + "epoch": 2.6059629331184526, + "grad_norm": 2.2457009777938732, + "learning_rate": 9.934650202762265e-05, + "loss": 4.1228, + "step": 4044 + }, + { + "epoch": 2.606607574536664, + "grad_norm": 2.358203881260712, + "learning_rate": 9.934617709650401e-05, + "loss": 3.8052, + "step": 4045 + }, + { + "epoch": 2.607252215954875, + "grad_norm": 2.8529785284838103, + "learning_rate": 9.934585208516166e-05, + "loss": 4.1395, + "step": 4046 + }, + { + "epoch": 2.607896857373086, + "grad_norm": 1.779592703877406, + "learning_rate": 9.934552699359613e-05, + "loss": 3.7999, + "step": 4047 + }, + { + "epoch": 2.6085414987912974, + "grad_norm": 2.257052437627069, + "learning_rate": 9.934520182180794e-05, + "loss": 3.3994, + "step": 4048 + }, + { + "epoch": 2.6091861402095082, + "grad_norm": 1.9280543399647285, + "learning_rate": 9.934487656979763e-05, + "loss": 4.1189, + "step": 4049 + }, + { + "epoch": 2.6098307816277195, + "grad_norm": 1.6494201329022369, + "learning_rate": 9.934455123756576e-05, + "loss": 3.8707, + "step": 4050 + }, + { + "epoch": 2.610475423045931, + "grad_norm": 1.6408095699970624, + "learning_rate": 9.93442258251128e-05, + "loss": 3.9382, + "step": 4051 + }, + { + "epoch": 2.6111200644641417, + "grad_norm": 1.5148605812125853, + "learning_rate": 9.934390033243935e-05, + "loss": 3.8641, + "step": 4052 + }, + { + "epoch": 2.611764705882353, + "grad_norm": 1.6682552228752696, + "learning_rate": 9.934357475954592e-05, + "loss": 3.553, + "step": 4053 + }, + { + "epoch": 2.612409347300564, + "grad_norm": 2.031795565746747, + "learning_rate": 9.934324910643303e-05, + "loss": 3.8698, + "step": 4054 + }, + { + "epoch": 2.613053988718775, + "grad_norm": 2.150717872427461, + "learning_rate": 9.934292337310123e-05, + "loss": 3.9643, + "step": 4055 + }, + { + "epoch": 2.6136986301369864, + "grad_norm": 1.940004101655461, + "learning_rate": 9.934259755955107e-05, + "loss": 3.8693, + "step": 4056 + }, + { + "epoch": 2.6143432715551973, + "grad_norm": 1.9662618500617948, + "learning_rate": 9.934227166578306e-05, + "loss": 3.9102, + "step": 4057 + }, + { + "epoch": 2.6149879129734086, + "grad_norm": 1.7299861087302641, + "learning_rate": 9.934194569179775e-05, + "loss": 4.1532, + "step": 4058 + }, + { + "epoch": 2.6156325543916195, + "grad_norm": 1.8248502304145466, + "learning_rate": 9.934161963759567e-05, + "loss": 3.5639, + "step": 4059 + }, + { + "epoch": 2.6162771958098308, + "grad_norm": 1.7756654476201328, + "learning_rate": 9.934129350317734e-05, + "loss": 3.9304, + "step": 4060 + }, + { + "epoch": 2.616921837228042, + "grad_norm": 2.38415441359218, + "learning_rate": 9.934096728854331e-05, + "loss": 3.8054, + "step": 4061 + }, + { + "epoch": 2.617566478646253, + "grad_norm": 1.8581629644551214, + "learning_rate": 9.934064099369413e-05, + "loss": 3.9502, + "step": 4062 + }, + { + "epoch": 2.618211120064464, + "grad_norm": 2.1032974404258202, + "learning_rate": 9.934031461863031e-05, + "loss": 3.8303, + "step": 4063 + }, + { + "epoch": 2.618855761482675, + "grad_norm": 1.8477522411630873, + "learning_rate": 9.933998816335242e-05, + "loss": 3.9606, + "step": 4064 + }, + { + "epoch": 2.6195004029008864, + "grad_norm": 1.9276339521497818, + "learning_rate": 9.933966162786096e-05, + "loss": 3.929, + "step": 4065 + }, + { + "epoch": 2.6201450443190977, + "grad_norm": 2.2746571782828027, + "learning_rate": 9.933933501215648e-05, + "loss": 3.8508, + "step": 4066 + }, + { + "epoch": 2.6207896857373085, + "grad_norm": 1.459470933305228, + "learning_rate": 9.933900831623951e-05, + "loss": 3.5945, + "step": 4067 + }, + { + "epoch": 2.62143432715552, + "grad_norm": 2.023769939711452, + "learning_rate": 9.93386815401106e-05, + "loss": 4.1826, + "step": 4068 + }, + { + "epoch": 2.6220789685737307, + "grad_norm": 1.6365496293819506, + "learning_rate": 9.933835468377029e-05, + "loss": 3.4408, + "step": 4069 + }, + { + "epoch": 2.622723609991942, + "grad_norm": 1.666340669279689, + "learning_rate": 9.933802774721909e-05, + "loss": 3.7967, + "step": 4070 + }, + { + "epoch": 2.6233682514101533, + "grad_norm": 1.9516014340637005, + "learning_rate": 9.933770073045757e-05, + "loss": 3.7236, + "step": 4071 + }, + { + "epoch": 2.624012892828364, + "grad_norm": 1.8649011721105613, + "learning_rate": 9.933737363348624e-05, + "loss": 4.033, + "step": 4072 + }, + { + "epoch": 2.6246575342465754, + "grad_norm": 1.584596325588885, + "learning_rate": 9.933704645630565e-05, + "loss": 4.113, + "step": 4073 + }, + { + "epoch": 2.6253021756647863, + "grad_norm": 1.9535168617255858, + "learning_rate": 9.933671919891634e-05, + "loss": 3.8034, + "step": 4074 + }, + { + "epoch": 2.6259468170829976, + "grad_norm": 1.5968291288450642, + "learning_rate": 9.933639186131885e-05, + "loss": 4.0001, + "step": 4075 + }, + { + "epoch": 2.626591458501209, + "grad_norm": 1.8068210575586408, + "learning_rate": 9.93360644435137e-05, + "loss": 3.6772, + "step": 4076 + }, + { + "epoch": 2.6272360999194198, + "grad_norm": 1.5554443198522485, + "learning_rate": 9.933573694550145e-05, + "loss": 3.6412, + "step": 4077 + }, + { + "epoch": 2.627880741337631, + "grad_norm": 1.7179535526779985, + "learning_rate": 9.933540936728262e-05, + "loss": 3.9382, + "step": 4078 + }, + { + "epoch": 2.628525382755842, + "grad_norm": 1.4414039510547312, + "learning_rate": 9.933508170885775e-05, + "loss": 4.0103, + "step": 4079 + }, + { + "epoch": 2.629170024174053, + "grad_norm": 1.6164932711827615, + "learning_rate": 9.933475397022739e-05, + "loss": 4.1062, + "step": 4080 + }, + { + "epoch": 2.6298146655922645, + "grad_norm": 1.205076409044804, + "learning_rate": 9.933442615139206e-05, + "loss": 3.6191, + "step": 4081 + }, + { + "epoch": 2.6304593070104754, + "grad_norm": 1.5462336195644346, + "learning_rate": 9.933409825235232e-05, + "loss": 3.6109, + "step": 4082 + }, + { + "epoch": 2.6311039484286867, + "grad_norm": 1.645225541761875, + "learning_rate": 9.933377027310869e-05, + "loss": 3.8663, + "step": 4083 + }, + { + "epoch": 2.6317485898468975, + "grad_norm": 1.4780792930249411, + "learning_rate": 9.93334422136617e-05, + "loss": 3.7481, + "step": 4084 + }, + { + "epoch": 2.632393231265109, + "grad_norm": 1.3481426916220112, + "learning_rate": 9.933311407401193e-05, + "loss": 4.1428, + "step": 4085 + }, + { + "epoch": 2.63303787268332, + "grad_norm": 1.7201287714056837, + "learning_rate": 9.933278585415991e-05, + "loss": 4.0668, + "step": 4086 + }, + { + "epoch": 2.633682514101531, + "grad_norm": 1.3130636656179124, + "learning_rate": 9.933245755410614e-05, + "loss": 3.6584, + "step": 4087 + }, + { + "epoch": 2.6343271555197423, + "grad_norm": 1.992403345859793, + "learning_rate": 9.933212917385118e-05, + "loss": 3.7123, + "step": 4088 + }, + { + "epoch": 2.634971796937953, + "grad_norm": 2.6992328265890198, + "learning_rate": 9.933180071339557e-05, + "loss": 3.5606, + "step": 4089 + }, + { + "epoch": 2.6356164383561644, + "grad_norm": 1.9725419842191096, + "learning_rate": 9.933147217273987e-05, + "loss": 3.9387, + "step": 4090 + }, + { + "epoch": 2.6362610797743757, + "grad_norm": 1.45710634927256, + "learning_rate": 9.933114355188459e-05, + "loss": 3.9915, + "step": 4091 + }, + { + "epoch": 2.6369057211925866, + "grad_norm": 1.543763170833078, + "learning_rate": 9.933081485083028e-05, + "loss": 3.7442, + "step": 4092 + }, + { + "epoch": 2.637550362610798, + "grad_norm": 1.5099348257359908, + "learning_rate": 9.933048606957748e-05, + "loss": 3.9073, + "step": 4093 + }, + { + "epoch": 2.6381950040290088, + "grad_norm": 1.715132398265792, + "learning_rate": 9.933015720812674e-05, + "loss": 3.9925, + "step": 4094 + }, + { + "epoch": 2.63883964544722, + "grad_norm": 1.4245613403718667, + "learning_rate": 9.932982826647858e-05, + "loss": 3.6243, + "step": 4095 + }, + { + "epoch": 2.6394842868654314, + "grad_norm": 1.7188037854209721, + "learning_rate": 9.932949924463355e-05, + "loss": 3.6219, + "step": 4096 + }, + { + "epoch": 2.640128928283642, + "grad_norm": 1.4074022594297906, + "learning_rate": 9.932917014259219e-05, + "loss": 3.879, + "step": 4097 + }, + { + "epoch": 2.6407735697018535, + "grad_norm": 1.520168348002672, + "learning_rate": 9.932884096035505e-05, + "loss": 3.892, + "step": 4098 + }, + { + "epoch": 2.6414182111200644, + "grad_norm": 1.5131599406386838, + "learning_rate": 9.932851169792265e-05, + "loss": 4.3287, + "step": 4099 + }, + { + "epoch": 2.6420628525382757, + "grad_norm": 1.197237160289361, + "learning_rate": 9.932818235529556e-05, + "loss": 3.9512, + "step": 4100 + }, + { + "epoch": 2.6420628525382757, + "eval_loss": 4.040430068969727, + "eval_runtime": 2.9562, + "eval_samples_per_second": 33.827, + "eval_steps_per_second": 4.398, + "step": 4100 + }, + { + "epoch": 2.642707493956487, + "grad_norm": 1.5911279436485153, + "learning_rate": 9.932785293247429e-05, + "loss": 3.9381, + "step": 4101 + }, + { + "epoch": 2.643352135374698, + "grad_norm": 1.1536185002348842, + "learning_rate": 9.93275234294594e-05, + "loss": 3.8043, + "step": 4102 + }, + { + "epoch": 2.6439967767929087, + "grad_norm": 1.6346102590033538, + "learning_rate": 9.932719384625145e-05, + "loss": 3.8521, + "step": 4103 + }, + { + "epoch": 2.64464141821112, + "grad_norm": 1.1811176227187121, + "learning_rate": 9.932686418285092e-05, + "loss": 3.874, + "step": 4104 + }, + { + "epoch": 2.6452860596293313, + "grad_norm": 1.3221091858037843, + "learning_rate": 9.93265344392584e-05, + "loss": 3.5523, + "step": 4105 + }, + { + "epoch": 2.6459307010475426, + "grad_norm": 6.213305577385368, + "learning_rate": 9.932620461547442e-05, + "loss": 4.0137, + "step": 4106 + }, + { + "epoch": 2.6465753424657534, + "grad_norm": 1.296458345138076, + "learning_rate": 9.932587471149954e-05, + "loss": 3.7948, + "step": 4107 + }, + { + "epoch": 2.6472199838839643, + "grad_norm": 1.4596985985468702, + "learning_rate": 9.932554472733426e-05, + "loss": 4.0912, + "step": 4108 + }, + { + "epoch": 2.6478646253021756, + "grad_norm": 1.5136805241693174, + "learning_rate": 9.932521466297917e-05, + "loss": 3.965, + "step": 4109 + }, + { + "epoch": 2.648509266720387, + "grad_norm": 1.6623986539824251, + "learning_rate": 9.932488451843476e-05, + "loss": 3.6677, + "step": 4110 + }, + { + "epoch": 2.6491539081385977, + "grad_norm": 1.802050791396024, + "learning_rate": 9.932455429370162e-05, + "loss": 3.9208, + "step": 4111 + }, + { + "epoch": 2.649798549556809, + "grad_norm": 1.7961280325131292, + "learning_rate": 9.932422398878027e-05, + "loss": 4.0711, + "step": 4112 + }, + { + "epoch": 2.65044319097502, + "grad_norm": 1.3806737186125222, + "learning_rate": 9.932389360367125e-05, + "loss": 3.7604, + "step": 4113 + }, + { + "epoch": 2.651087832393231, + "grad_norm": 1.362998432534421, + "learning_rate": 9.932356313837511e-05, + "loss": 3.9185, + "step": 4114 + }, + { + "epoch": 2.6517324738114425, + "grad_norm": 1.7312336448068126, + "learning_rate": 9.932323259289239e-05, + "loss": 3.9168, + "step": 4115 + }, + { + "epoch": 2.6523771152296534, + "grad_norm": 1.6095890269101345, + "learning_rate": 9.932290196722363e-05, + "loss": 4.0088, + "step": 4116 + }, + { + "epoch": 2.6530217566478647, + "grad_norm": 1.1411018849364645, + "learning_rate": 9.932257126136938e-05, + "loss": 3.9195, + "step": 4117 + }, + { + "epoch": 2.6536663980660755, + "grad_norm": 1.239159782660133, + "learning_rate": 9.932224047533018e-05, + "loss": 3.6671, + "step": 4118 + }, + { + "epoch": 2.654311039484287, + "grad_norm": 1.2789599335312294, + "learning_rate": 9.932190960910658e-05, + "loss": 3.7859, + "step": 4119 + }, + { + "epoch": 2.654955680902498, + "grad_norm": 1.243808575410319, + "learning_rate": 9.93215786626991e-05, + "loss": 4.0614, + "step": 4120 + }, + { + "epoch": 2.655600322320709, + "grad_norm": 1.2912845178461003, + "learning_rate": 9.932124763610832e-05, + "loss": 3.8772, + "step": 4121 + }, + { + "epoch": 2.6562449637389203, + "grad_norm": 1.1605304633952875, + "learning_rate": 9.932091652933474e-05, + "loss": 3.5818, + "step": 4122 + }, + { + "epoch": 2.656889605157131, + "grad_norm": 1.1057078991988922, + "learning_rate": 9.932058534237894e-05, + "loss": 4.1361, + "step": 4123 + }, + { + "epoch": 2.6575342465753424, + "grad_norm": 1.5561043457707506, + "learning_rate": 9.932025407524145e-05, + "loss": 3.6499, + "step": 4124 + }, + { + "epoch": 2.6581788879935537, + "grad_norm": 1.978682156579346, + "learning_rate": 9.931992272792283e-05, + "loss": 3.887, + "step": 4125 + }, + { + "epoch": 2.6588235294117646, + "grad_norm": 1.7573066042463659, + "learning_rate": 9.931959130042358e-05, + "loss": 3.5302, + "step": 4126 + }, + { + "epoch": 2.659468170829976, + "grad_norm": 1.5760928676614945, + "learning_rate": 9.931925979274429e-05, + "loss": 3.9325, + "step": 4127 + }, + { + "epoch": 2.6601128122481867, + "grad_norm": 1.3212998328367285, + "learning_rate": 9.93189282048855e-05, + "loss": 3.7309, + "step": 4128 + }, + { + "epoch": 2.660757453666398, + "grad_norm": 1.3237704526094256, + "learning_rate": 9.931859653684772e-05, + "loss": 3.8679, + "step": 4129 + }, + { + "epoch": 2.6614020950846093, + "grad_norm": 1.3430913694494717, + "learning_rate": 9.931826478863153e-05, + "loss": 3.5629, + "step": 4130 + }, + { + "epoch": 2.66204673650282, + "grad_norm": 1.4534407108500496, + "learning_rate": 9.931793296023746e-05, + "loss": 3.8467, + "step": 4131 + }, + { + "epoch": 2.6626913779210315, + "grad_norm": 1.7545659988675746, + "learning_rate": 9.931760105166605e-05, + "loss": 3.5269, + "step": 4132 + }, + { + "epoch": 2.6633360193392424, + "grad_norm": 1.7836492536862973, + "learning_rate": 9.931726906291787e-05, + "loss": 3.6661, + "step": 4133 + }, + { + "epoch": 2.6639806607574537, + "grad_norm": 1.4951173381788876, + "learning_rate": 9.931693699399343e-05, + "loss": 3.6765, + "step": 4134 + }, + { + "epoch": 2.664625302175665, + "grad_norm": 1.4464631109252621, + "learning_rate": 9.93166048448933e-05, + "loss": 3.7011, + "step": 4135 + }, + { + "epoch": 2.665269943593876, + "grad_norm": 1.7141128398355416, + "learning_rate": 9.931627261561802e-05, + "loss": 3.9799, + "step": 4136 + }, + { + "epoch": 2.665914585012087, + "grad_norm": 1.0998580785379977, + "learning_rate": 9.931594030616813e-05, + "loss": 4.0082, + "step": 4137 + }, + { + "epoch": 2.666559226430298, + "grad_norm": 2.013391152050552, + "learning_rate": 9.931560791654418e-05, + "loss": 4.1191, + "step": 4138 + }, + { + "epoch": 2.6672038678485093, + "grad_norm": 1.7788496132133473, + "learning_rate": 9.931527544674673e-05, + "loss": 3.6003, + "step": 4139 + }, + { + "epoch": 2.6678485092667206, + "grad_norm": 1.7268078174493293, + "learning_rate": 9.93149428967763e-05, + "loss": 3.7304, + "step": 4140 + }, + { + "epoch": 2.6684931506849314, + "grad_norm": 1.4627561512898832, + "learning_rate": 9.931461026663344e-05, + "loss": 3.9186, + "step": 4141 + }, + { + "epoch": 2.6691377921031427, + "grad_norm": 2.0994341940131394, + "learning_rate": 9.931427755631873e-05, + "loss": 3.8587, + "step": 4142 + }, + { + "epoch": 2.6697824335213536, + "grad_norm": 1.3255933206407187, + "learning_rate": 9.931394476583267e-05, + "loss": 3.7745, + "step": 4143 + }, + { + "epoch": 2.670427074939565, + "grad_norm": 1.559292875313032, + "learning_rate": 9.931361189517583e-05, + "loss": 3.9147, + "step": 4144 + }, + { + "epoch": 2.671071716357776, + "grad_norm": 1.8780756727741712, + "learning_rate": 9.931327894434876e-05, + "loss": 4.0795, + "step": 4145 + }, + { + "epoch": 2.671716357775987, + "grad_norm": 1.1815329794028606, + "learning_rate": 9.931294591335199e-05, + "loss": 3.9989, + "step": 4146 + }, + { + "epoch": 2.6723609991941983, + "grad_norm": 1.5385907673401733, + "learning_rate": 9.93126128021861e-05, + "loss": 3.8069, + "step": 4147 + }, + { + "epoch": 2.673005640612409, + "grad_norm": 1.6505698634639276, + "learning_rate": 9.931227961085159e-05, + "loss": 3.8485, + "step": 4148 + }, + { + "epoch": 2.6736502820306205, + "grad_norm": 1.4623838016235875, + "learning_rate": 9.931194633934905e-05, + "loss": 4.1692, + "step": 4149 + }, + { + "epoch": 2.674294923448832, + "grad_norm": 1.7566015182357033, + "learning_rate": 9.931161298767899e-05, + "loss": 3.7957, + "step": 4150 + }, + { + "epoch": 2.6749395648670427, + "grad_norm": 1.7242655177345845, + "learning_rate": 9.9311279555842e-05, + "loss": 3.6821, + "step": 4151 + }, + { + "epoch": 2.675584206285254, + "grad_norm": 1.60599406449387, + "learning_rate": 9.931094604383859e-05, + "loss": 4.1187, + "step": 4152 + }, + { + "epoch": 2.676228847703465, + "grad_norm": 1.7016790233467092, + "learning_rate": 9.931061245166933e-05, + "loss": 4.2092, + "step": 4153 + }, + { + "epoch": 2.676873489121676, + "grad_norm": 1.9056099955342911, + "learning_rate": 9.931027877933476e-05, + "loss": 3.4665, + "step": 4154 + }, + { + "epoch": 2.6775181305398874, + "grad_norm": 1.2553477539945732, + "learning_rate": 9.930994502683543e-05, + "loss": 3.9961, + "step": 4155 + }, + { + "epoch": 2.6781627719580983, + "grad_norm": 1.512034883442915, + "learning_rate": 9.930961119417187e-05, + "loss": 3.8768, + "step": 4156 + }, + { + "epoch": 2.6788074133763096, + "grad_norm": 1.571517927654095, + "learning_rate": 9.930927728134466e-05, + "loss": 4.0342, + "step": 4157 + }, + { + "epoch": 2.6794520547945204, + "grad_norm": 1.3366149310136923, + "learning_rate": 9.930894328835433e-05, + "loss": 3.6872, + "step": 4158 + }, + { + "epoch": 2.6800966962127317, + "grad_norm": 1.7721536313061113, + "learning_rate": 9.930860921520143e-05, + "loss": 4.0785, + "step": 4159 + }, + { + "epoch": 2.680741337630943, + "grad_norm": 1.5807787569846434, + "learning_rate": 9.930827506188652e-05, + "loss": 4.1515, + "step": 4160 + }, + { + "epoch": 2.681385979049154, + "grad_norm": 1.3710373181349575, + "learning_rate": 9.930794082841012e-05, + "loss": 3.7092, + "step": 4161 + }, + { + "epoch": 2.682030620467365, + "grad_norm": 1.2289651943956497, + "learning_rate": 9.930760651477282e-05, + "loss": 3.8256, + "step": 4162 + }, + { + "epoch": 2.682675261885576, + "grad_norm": 1.4855680013318822, + "learning_rate": 9.930727212097514e-05, + "loss": 3.6772, + "step": 4163 + }, + { + "epoch": 2.6833199033037873, + "grad_norm": 1.7477633949551121, + "learning_rate": 9.930693764701763e-05, + "loss": 3.7812, + "step": 4164 + }, + { + "epoch": 2.6839645447219986, + "grad_norm": 1.6988508403034985, + "learning_rate": 9.930660309290084e-05, + "loss": 3.6516, + "step": 4165 + }, + { + "epoch": 2.6846091861402095, + "grad_norm": 1.3398835140790692, + "learning_rate": 9.930626845862534e-05, + "loss": 3.7324, + "step": 4166 + }, + { + "epoch": 2.685253827558421, + "grad_norm": 1.8147857358475128, + "learning_rate": 9.930593374419167e-05, + "loss": 3.8869, + "step": 4167 + }, + { + "epoch": 2.6858984689766316, + "grad_norm": 1.7708058331063439, + "learning_rate": 9.930559894960036e-05, + "loss": 3.8573, + "step": 4168 + }, + { + "epoch": 2.686543110394843, + "grad_norm": 1.3693320898434824, + "learning_rate": 9.930526407485198e-05, + "loss": 3.8339, + "step": 4169 + }, + { + "epoch": 2.6871877518130542, + "grad_norm": 1.3553557499172142, + "learning_rate": 9.930492911994707e-05, + "loss": 3.996, + "step": 4170 + }, + { + "epoch": 2.687832393231265, + "grad_norm": 1.4107016504871455, + "learning_rate": 9.930459408488618e-05, + "loss": 3.9621, + "step": 4171 + }, + { + "epoch": 2.688477034649476, + "grad_norm": 1.3295620842460802, + "learning_rate": 9.930425896966988e-05, + "loss": 3.7174, + "step": 4172 + }, + { + "epoch": 2.6891216760676873, + "grad_norm": 1.990863311982117, + "learning_rate": 9.93039237742987e-05, + "loss": 4.4442, + "step": 4173 + }, + { + "epoch": 2.6897663174858986, + "grad_norm": 1.4876798367402675, + "learning_rate": 9.93035884987732e-05, + "loss": 3.917, + "step": 4174 + }, + { + "epoch": 2.69041095890411, + "grad_norm": 1.4741651872187966, + "learning_rate": 9.930325314309391e-05, + "loss": 3.9467, + "step": 4175 + }, + { + "epoch": 2.6910556003223207, + "grad_norm": 1.9939473503415133, + "learning_rate": 9.930291770726141e-05, + "loss": 3.5926, + "step": 4176 + }, + { + "epoch": 2.6917002417405316, + "grad_norm": 1.7733907605937074, + "learning_rate": 9.930258219127623e-05, + "loss": 3.9761, + "step": 4177 + }, + { + "epoch": 2.692344883158743, + "grad_norm": 1.737108323739256, + "learning_rate": 9.930224659513894e-05, + "loss": 3.4351, + "step": 4178 + }, + { + "epoch": 2.692989524576954, + "grad_norm": 2.271919135047584, + "learning_rate": 9.930191091885008e-05, + "loss": 3.8572, + "step": 4179 + }, + { + "epoch": 2.693634165995165, + "grad_norm": 1.30212979446339, + "learning_rate": 9.930157516241019e-05, + "loss": 3.9808, + "step": 4180 + }, + { + "epoch": 2.6942788074133763, + "grad_norm": 1.8730810703774319, + "learning_rate": 9.930123932581984e-05, + "loss": 4.0108, + "step": 4181 + }, + { + "epoch": 2.694923448831587, + "grad_norm": 1.838073652662221, + "learning_rate": 9.930090340907956e-05, + "loss": 3.7124, + "step": 4182 + }, + { + "epoch": 2.6955680902497985, + "grad_norm": 2.117955218363424, + "learning_rate": 9.930056741218992e-05, + "loss": 3.6206, + "step": 4183 + }, + { + "epoch": 2.69621273166801, + "grad_norm": 1.5666021221254962, + "learning_rate": 9.930023133515149e-05, + "loss": 3.8974, + "step": 4184 + }, + { + "epoch": 2.6968573730862206, + "grad_norm": 2.2953527237008307, + "learning_rate": 9.92998951779648e-05, + "loss": 3.8453, + "step": 4185 + }, + { + "epoch": 2.697502014504432, + "grad_norm": 2.0789073319070686, + "learning_rate": 9.929955894063038e-05, + "loss": 3.4696, + "step": 4186 + }, + { + "epoch": 2.698146655922643, + "grad_norm": 1.6060677958028833, + "learning_rate": 9.929922262314881e-05, + "loss": 3.6181, + "step": 4187 + }, + { + "epoch": 2.698791297340854, + "grad_norm": 1.7192779891503691, + "learning_rate": 9.929888622552065e-05, + "loss": 3.7085, + "step": 4188 + }, + { + "epoch": 2.6994359387590654, + "grad_norm": 1.7468911778648373, + "learning_rate": 9.929854974774643e-05, + "loss": 3.4519, + "step": 4189 + }, + { + "epoch": 2.7000805801772763, + "grad_norm": 1.6309724325755675, + "learning_rate": 9.929821318982671e-05, + "loss": 3.9769, + "step": 4190 + }, + { + "epoch": 2.7007252215954876, + "grad_norm": 1.4833799112873969, + "learning_rate": 9.929787655176205e-05, + "loss": 4.1191, + "step": 4191 + }, + { + "epoch": 2.7013698630136984, + "grad_norm": 1.4147318540811218, + "learning_rate": 9.929753983355299e-05, + "loss": 3.7483, + "step": 4192 + }, + { + "epoch": 2.7020145044319097, + "grad_norm": 1.7628243181873144, + "learning_rate": 9.92972030352001e-05, + "loss": 4.0469, + "step": 4193 + }, + { + "epoch": 2.702659145850121, + "grad_norm": 1.360236732337564, + "learning_rate": 9.92968661567039e-05, + "loss": 3.3913, + "step": 4194 + }, + { + "epoch": 2.703303787268332, + "grad_norm": 1.4658672816438576, + "learning_rate": 9.9296529198065e-05, + "loss": 4.0458, + "step": 4195 + }, + { + "epoch": 2.703948428686543, + "grad_norm": 1.4357362275343402, + "learning_rate": 9.929619215928389e-05, + "loss": 3.7976, + "step": 4196 + }, + { + "epoch": 2.704593070104754, + "grad_norm": 1.5922619230326636, + "learning_rate": 9.929585504036119e-05, + "loss": 3.9219, + "step": 4197 + }, + { + "epoch": 2.7052377115229653, + "grad_norm": 1.7465762232897954, + "learning_rate": 9.929551784129741e-05, + "loss": 3.8569, + "step": 4198 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 1.6955441008425673, + "learning_rate": 9.929518056209309e-05, + "loss": 3.7477, + "step": 4199 + }, + { + "epoch": 2.7065269943593875, + "grad_norm": 2.012774573744627, + "learning_rate": 9.929484320274881e-05, + "loss": 3.534, + "step": 4200 + }, + { + "epoch": 2.7065269943593875, + "eval_loss": 4.043041229248047, + "eval_runtime": 2.9827, + "eval_samples_per_second": 33.526, + "eval_steps_per_second": 4.358, + "step": 4200 + }, + { + "epoch": 2.707171635777599, + "grad_norm": 1.200407826962898, + "learning_rate": 9.929450576326512e-05, + "loss": 3.8258, + "step": 4201 + }, + { + "epoch": 2.7078162771958096, + "grad_norm": 1.5369838012168031, + "learning_rate": 9.929416824364258e-05, + "loss": 4.2175, + "step": 4202 + }, + { + "epoch": 2.708460918614021, + "grad_norm": 1.1728366054409836, + "learning_rate": 9.929383064388173e-05, + "loss": 3.4697, + "step": 4203 + }, + { + "epoch": 2.7091055600322322, + "grad_norm": 1.4259347815322145, + "learning_rate": 9.929349296398314e-05, + "loss": 4.0044, + "step": 4204 + }, + { + "epoch": 2.709750201450443, + "grad_norm": 1.3056191027714348, + "learning_rate": 9.929315520394735e-05, + "loss": 3.7474, + "step": 4205 + }, + { + "epoch": 2.7103948428686544, + "grad_norm": 1.3648388629429855, + "learning_rate": 9.929281736377493e-05, + "loss": 4.1097, + "step": 4206 + }, + { + "epoch": 2.7110394842868653, + "grad_norm": 1.460051559672619, + "learning_rate": 9.929247944346641e-05, + "loss": 3.5434, + "step": 4207 + }, + { + "epoch": 2.7116841257050766, + "grad_norm": 1.347093517034494, + "learning_rate": 9.929214144302239e-05, + "loss": 3.5644, + "step": 4208 + }, + { + "epoch": 2.712328767123288, + "grad_norm": 1.5090914408166989, + "learning_rate": 9.929180336244336e-05, + "loss": 3.9074, + "step": 4209 + }, + { + "epoch": 2.7129734085414987, + "grad_norm": 1.178038228770447, + "learning_rate": 9.929146520172994e-05, + "loss": 4.2162, + "step": 4210 + }, + { + "epoch": 2.71361804995971, + "grad_norm": 1.4616930087430013, + "learning_rate": 9.929112696088267e-05, + "loss": 3.9959, + "step": 4211 + }, + { + "epoch": 2.714262691377921, + "grad_norm": 1.725758102420005, + "learning_rate": 9.929078863990206e-05, + "loss": 4.0013, + "step": 4212 + }, + { + "epoch": 2.714907332796132, + "grad_norm": 1.3301965269467948, + "learning_rate": 9.929045023878871e-05, + "loss": 3.646, + "step": 4213 + }, + { + "epoch": 2.7155519742143435, + "grad_norm": 1.76272274554122, + "learning_rate": 9.929011175754317e-05, + "loss": 4.0978, + "step": 4214 + }, + { + "epoch": 2.7161966156325543, + "grad_norm": 1.6559255569480391, + "learning_rate": 9.928977319616599e-05, + "loss": 4.1302, + "step": 4215 + }, + { + "epoch": 2.7168412570507656, + "grad_norm": 1.274706277264175, + "learning_rate": 9.928943455465771e-05, + "loss": 3.7261, + "step": 4216 + }, + { + "epoch": 2.7174858984689765, + "grad_norm": 1.8824200242154234, + "learning_rate": 9.928909583301893e-05, + "loss": 4.2348, + "step": 4217 + }, + { + "epoch": 2.718130539887188, + "grad_norm": 1.5791960439162944, + "learning_rate": 9.928875703125016e-05, + "loss": 4.0449, + "step": 4218 + }, + { + "epoch": 2.718775181305399, + "grad_norm": 1.4478310176807005, + "learning_rate": 9.928841814935197e-05, + "loss": 3.5461, + "step": 4219 + }, + { + "epoch": 2.71941982272361, + "grad_norm": 1.5343413297061694, + "learning_rate": 9.928807918732493e-05, + "loss": 3.9873, + "step": 4220 + }, + { + "epoch": 2.7200644641418212, + "grad_norm": 1.5715818382431814, + "learning_rate": 9.92877401451696e-05, + "loss": 4.0183, + "step": 4221 + }, + { + "epoch": 2.720709105560032, + "grad_norm": 1.4410508300753577, + "learning_rate": 9.92874010228865e-05, + "loss": 3.7125, + "step": 4222 + }, + { + "epoch": 2.7213537469782434, + "grad_norm": 1.5710533102856192, + "learning_rate": 9.928706182047623e-05, + "loss": 4.0365, + "step": 4223 + }, + { + "epoch": 2.7219983883964547, + "grad_norm": 1.4816240818857727, + "learning_rate": 9.928672253793932e-05, + "loss": 3.781, + "step": 4224 + }, + { + "epoch": 2.7226430298146655, + "grad_norm": 1.83307584936468, + "learning_rate": 9.928638317527635e-05, + "loss": 3.9303, + "step": 4225 + }, + { + "epoch": 2.723287671232877, + "grad_norm": 1.2265646201813627, + "learning_rate": 9.928604373248786e-05, + "loss": 3.9182, + "step": 4226 + }, + { + "epoch": 2.7239323126510877, + "grad_norm": 1.8566383103555884, + "learning_rate": 9.92857042095744e-05, + "loss": 3.7537, + "step": 4227 + }, + { + "epoch": 2.724576954069299, + "grad_norm": 1.5603930454492643, + "learning_rate": 9.928536460653654e-05, + "loss": 3.6503, + "step": 4228 + }, + { + "epoch": 2.7252215954875103, + "grad_norm": 1.2880485894528697, + "learning_rate": 9.928502492337486e-05, + "loss": 3.9219, + "step": 4229 + }, + { + "epoch": 2.725866236905721, + "grad_norm": 1.9688851426026193, + "learning_rate": 9.928468516008987e-05, + "loss": 3.514, + "step": 4230 + }, + { + "epoch": 2.7265108783239325, + "grad_norm": 1.8334284697637555, + "learning_rate": 9.928434531668216e-05, + "loss": 4.0791, + "step": 4231 + }, + { + "epoch": 2.7271555197421433, + "grad_norm": 1.3754275530249704, + "learning_rate": 9.92840053931523e-05, + "loss": 3.8849, + "step": 4232 + }, + { + "epoch": 2.7278001611603546, + "grad_norm": 1.463153478842898, + "learning_rate": 9.928366538950079e-05, + "loss": 4.0395, + "step": 4233 + }, + { + "epoch": 2.728444802578566, + "grad_norm": 1.4051982872075455, + "learning_rate": 9.928332530572824e-05, + "loss": 3.934, + "step": 4234 + }, + { + "epoch": 2.7290894439967768, + "grad_norm": 1.5357434597241306, + "learning_rate": 9.928298514183522e-05, + "loss": 3.6143, + "step": 4235 + }, + { + "epoch": 2.729734085414988, + "grad_norm": 1.4422270806138766, + "learning_rate": 9.928264489782224e-05, + "loss": 4.0521, + "step": 4236 + }, + { + "epoch": 2.730378726833199, + "grad_norm": 1.3917119243273257, + "learning_rate": 9.92823045736899e-05, + "loss": 4.0665, + "step": 4237 + }, + { + "epoch": 2.7310233682514102, + "grad_norm": 1.9652986713431468, + "learning_rate": 9.928196416943873e-05, + "loss": 4.03, + "step": 4238 + }, + { + "epoch": 2.7316680096696215, + "grad_norm": 2.0418017160378503, + "learning_rate": 9.928162368506932e-05, + "loss": 3.8938, + "step": 4239 + }, + { + "epoch": 2.7323126510878324, + "grad_norm": 1.4733515166974793, + "learning_rate": 9.92812831205822e-05, + "loss": 3.7347, + "step": 4240 + }, + { + "epoch": 2.7329572925060432, + "grad_norm": 2.049150359637276, + "learning_rate": 9.928094247597793e-05, + "loss": 3.9971, + "step": 4241 + }, + { + "epoch": 2.7336019339242545, + "grad_norm": 1.7820743843134574, + "learning_rate": 9.928060175125708e-05, + "loss": 3.9277, + "step": 4242 + }, + { + "epoch": 2.734246575342466, + "grad_norm": 1.543777870247601, + "learning_rate": 9.928026094642022e-05, + "loss": 3.9772, + "step": 4243 + }, + { + "epoch": 2.734891216760677, + "grad_norm": 1.8653219911507855, + "learning_rate": 9.927992006146789e-05, + "loss": 4.0912, + "step": 4244 + }, + { + "epoch": 2.735535858178888, + "grad_norm": 1.4221915073615155, + "learning_rate": 9.927957909640067e-05, + "loss": 3.6726, + "step": 4245 + }, + { + "epoch": 2.736180499597099, + "grad_norm": 1.6062696430399548, + "learning_rate": 9.92792380512191e-05, + "loss": 3.7604, + "step": 4246 + }, + { + "epoch": 2.73682514101531, + "grad_norm": 2.2693201980254027, + "learning_rate": 9.927889692592376e-05, + "loss": 3.5471, + "step": 4247 + }, + { + "epoch": 2.7374697824335215, + "grad_norm": 1.4829695130070721, + "learning_rate": 9.92785557205152e-05, + "loss": 3.9362, + "step": 4248 + }, + { + "epoch": 2.7381144238517323, + "grad_norm": 1.3281731679539917, + "learning_rate": 9.927821443499395e-05, + "loss": 4.2042, + "step": 4249 + }, + { + "epoch": 2.7387590652699436, + "grad_norm": 1.5296279011353315, + "learning_rate": 9.927787306936063e-05, + "loss": 4.1503, + "step": 4250 + }, + { + "epoch": 2.7394037066881545, + "grad_norm": 1.6026156888224787, + "learning_rate": 9.927753162361577e-05, + "loss": 3.6237, + "step": 4251 + }, + { + "epoch": 2.7400483481063658, + "grad_norm": 1.3299272474800912, + "learning_rate": 9.927719009775991e-05, + "loss": 4.1917, + "step": 4252 + }, + { + "epoch": 2.740692989524577, + "grad_norm": 1.5126144364403944, + "learning_rate": 9.927684849179365e-05, + "loss": 3.8445, + "step": 4253 + }, + { + "epoch": 2.741337630942788, + "grad_norm": 1.4777166450724317, + "learning_rate": 9.927650680571751e-05, + "loss": 3.6971, + "step": 4254 + }, + { + "epoch": 2.7419822723609992, + "grad_norm": 1.266442932650446, + "learning_rate": 9.92761650395321e-05, + "loss": 3.8689, + "step": 4255 + }, + { + "epoch": 2.74262691377921, + "grad_norm": 1.5706487846322719, + "learning_rate": 9.927582319323795e-05, + "loss": 4.0205, + "step": 4256 + }, + { + "epoch": 2.7432715551974214, + "grad_norm": 1.3282534622215982, + "learning_rate": 9.927548126683563e-05, + "loss": 4.0617, + "step": 4257 + }, + { + "epoch": 2.7439161966156327, + "grad_norm": 1.4800831578504126, + "learning_rate": 9.92751392603257e-05, + "loss": 4.0469, + "step": 4258 + }, + { + "epoch": 2.7445608380338435, + "grad_norm": 2.0503819154345866, + "learning_rate": 9.927479717370872e-05, + "loss": 3.705, + "step": 4259 + }, + { + "epoch": 2.745205479452055, + "grad_norm": 1.5044891355915706, + "learning_rate": 9.927445500698524e-05, + "loss": 3.8145, + "step": 4260 + }, + { + "epoch": 2.7458501208702657, + "grad_norm": 1.4906819318334645, + "learning_rate": 9.927411276015585e-05, + "loss": 3.4099, + "step": 4261 + }, + { + "epoch": 2.746494762288477, + "grad_norm": 1.5654583964923114, + "learning_rate": 9.92737704332211e-05, + "loss": 3.6003, + "step": 4262 + }, + { + "epoch": 2.7471394037066883, + "grad_norm": 1.5161293864043803, + "learning_rate": 9.927342802618152e-05, + "loss": 4.1379, + "step": 4263 + }, + { + "epoch": 2.747784045124899, + "grad_norm": 1.6313400618790934, + "learning_rate": 9.927308553903772e-05, + "loss": 3.7942, + "step": 4264 + }, + { + "epoch": 2.7484286865431105, + "grad_norm": 1.667074428067043, + "learning_rate": 9.927274297179025e-05, + "loss": 3.6164, + "step": 4265 + }, + { + "epoch": 2.7490733279613213, + "grad_norm": 1.6453998591291012, + "learning_rate": 9.927240032443966e-05, + "loss": 3.731, + "step": 4266 + }, + { + "epoch": 2.7497179693795326, + "grad_norm": 1.5411699326161317, + "learning_rate": 9.927205759698653e-05, + "loss": 3.7377, + "step": 4267 + }, + { + "epoch": 2.750362610797744, + "grad_norm": 1.5102324845981625, + "learning_rate": 9.927171478943139e-05, + "loss": 3.9625, + "step": 4268 + }, + { + "epoch": 2.7510072522159548, + "grad_norm": 1.2968065051432343, + "learning_rate": 9.927137190177483e-05, + "loss": 3.8576, + "step": 4269 + }, + { + "epoch": 2.751651893634166, + "grad_norm": 1.6008996476509865, + "learning_rate": 9.927102893401742e-05, + "loss": 3.7267, + "step": 4270 + }, + { + "epoch": 2.752296535052377, + "grad_norm": 1.4138492671491858, + "learning_rate": 9.927068588615971e-05, + "loss": 3.7995, + "step": 4271 + }, + { + "epoch": 2.7529411764705882, + "grad_norm": 1.8739968159417744, + "learning_rate": 9.927034275820225e-05, + "loss": 3.7721, + "step": 4272 + }, + { + "epoch": 2.7535858178887995, + "grad_norm": 2.1158086641257867, + "learning_rate": 9.926999955014563e-05, + "loss": 3.6018, + "step": 4273 + }, + { + "epoch": 2.7542304593070104, + "grad_norm": 1.9158239944800406, + "learning_rate": 9.92696562619904e-05, + "loss": 3.7226, + "step": 4274 + }, + { + "epoch": 2.7548751007252217, + "grad_norm": 2.521070388345981, + "learning_rate": 9.926931289373713e-05, + "loss": 4.0253, + "step": 4275 + }, + { + "epoch": 2.7555197421434325, + "grad_norm": 2.4232915314951544, + "learning_rate": 9.926896944538638e-05, + "loss": 3.931, + "step": 4276 + }, + { + "epoch": 2.756164383561644, + "grad_norm": 2.222838514788053, + "learning_rate": 9.926862591693871e-05, + "loss": 3.7833, + "step": 4277 + }, + { + "epoch": 2.756809024979855, + "grad_norm": 2.507038721386602, + "learning_rate": 9.926828230839469e-05, + "loss": 3.8176, + "step": 4278 + }, + { + "epoch": 2.757453666398066, + "grad_norm": 1.9024506190392594, + "learning_rate": 9.926793861975486e-05, + "loss": 3.7957, + "step": 4279 + }, + { + "epoch": 2.7580983078162773, + "grad_norm": 2.049278781825025, + "learning_rate": 9.926759485101984e-05, + "loss": 4.3147, + "step": 4280 + }, + { + "epoch": 2.758742949234488, + "grad_norm": 1.7985066807731658, + "learning_rate": 9.926725100219014e-05, + "loss": 4.116, + "step": 4281 + }, + { + "epoch": 2.7593875906526995, + "grad_norm": 2.0854050816671457, + "learning_rate": 9.926690707326636e-05, + "loss": 3.8757, + "step": 4282 + }, + { + "epoch": 2.7600322320709108, + "grad_norm": 1.7291809990994342, + "learning_rate": 9.926656306424906e-05, + "loss": 3.7699, + "step": 4283 + }, + { + "epoch": 2.7606768734891216, + "grad_norm": 1.7260217100794395, + "learning_rate": 9.926621897513877e-05, + "loss": 4.0434, + "step": 4284 + }, + { + "epoch": 2.761321514907333, + "grad_norm": 1.8845028906667107, + "learning_rate": 9.926587480593608e-05, + "loss": 3.7662, + "step": 4285 + }, + { + "epoch": 2.7619661563255438, + "grad_norm": 1.5576869568507883, + "learning_rate": 9.926553055664157e-05, + "loss": 4.0051, + "step": 4286 + }, + { + "epoch": 2.762610797743755, + "grad_norm": 2.212879213087522, + "learning_rate": 9.926518622725579e-05, + "loss": 3.7719, + "step": 4287 + }, + { + "epoch": 2.7632554391619664, + "grad_norm": 1.5960268318174293, + "learning_rate": 9.926484181777932e-05, + "loss": 4.013, + "step": 4288 + }, + { + "epoch": 2.763900080580177, + "grad_norm": 2.030417891174118, + "learning_rate": 9.926449732821269e-05, + "loss": 3.8045, + "step": 4289 + }, + { + "epoch": 2.7645447219983885, + "grad_norm": 1.6923014760194979, + "learning_rate": 9.92641527585565e-05, + "loss": 3.5774, + "step": 4290 + }, + { + "epoch": 2.7651893634165994, + "grad_norm": 1.8418922420453558, + "learning_rate": 9.92638081088113e-05, + "loss": 3.8935, + "step": 4291 + }, + { + "epoch": 2.7658340048348107, + "grad_norm": 1.643952310959402, + "learning_rate": 9.926346337897767e-05, + "loss": 3.8843, + "step": 4292 + }, + { + "epoch": 2.766478646253022, + "grad_norm": 1.855116507847385, + "learning_rate": 9.926311856905616e-05, + "loss": 3.5349, + "step": 4293 + }, + { + "epoch": 2.767123287671233, + "grad_norm": 2.0743885670904687, + "learning_rate": 9.926277367904735e-05, + "loss": 3.7388, + "step": 4294 + }, + { + "epoch": 2.767767929089444, + "grad_norm": 1.7212663734606817, + "learning_rate": 9.926242870895179e-05, + "loss": 3.4905, + "step": 4295 + }, + { + "epoch": 2.768412570507655, + "grad_norm": 2.2483006136908337, + "learning_rate": 9.926208365877006e-05, + "loss": 3.8669, + "step": 4296 + }, + { + "epoch": 2.7690572119258663, + "grad_norm": 1.74851003990779, + "learning_rate": 9.926173852850273e-05, + "loss": 3.7175, + "step": 4297 + }, + { + "epoch": 2.7697018533440776, + "grad_norm": 2.2436125890475913, + "learning_rate": 9.926139331815034e-05, + "loss": 3.9045, + "step": 4298 + }, + { + "epoch": 2.7703464947622884, + "grad_norm": 2.502433352840122, + "learning_rate": 9.926104802771349e-05, + "loss": 3.5236, + "step": 4299 + }, + { + "epoch": 2.7709911361804997, + "grad_norm": 1.9366430455248294, + "learning_rate": 9.926070265719275e-05, + "loss": 4.0126, + "step": 4300 + }, + { + "epoch": 2.7709911361804997, + "eval_loss": 4.055446624755859, + "eval_runtime": 2.9765, + "eval_samples_per_second": 33.596, + "eval_steps_per_second": 4.367, + "step": 4300 + }, + { + "epoch": 2.7716357775987106, + "grad_norm": 2.4314855751396482, + "learning_rate": 9.926035720658864e-05, + "loss": 3.5442, + "step": 4301 + }, + { + "epoch": 2.772280419016922, + "grad_norm": 1.5583776153343092, + "learning_rate": 9.926001167590178e-05, + "loss": 3.8455, + "step": 4302 + }, + { + "epoch": 2.772925060435133, + "grad_norm": 2.140977755345046, + "learning_rate": 9.92596660651327e-05, + "loss": 4.0284, + "step": 4303 + }, + { + "epoch": 2.773569701853344, + "grad_norm": 1.7359156154442057, + "learning_rate": 9.9259320374282e-05, + "loss": 3.5285, + "step": 4304 + }, + { + "epoch": 2.7742143432715554, + "grad_norm": 2.0809708301127885, + "learning_rate": 9.925897460335022e-05, + "loss": 4.029, + "step": 4305 + }, + { + "epoch": 2.774858984689766, + "grad_norm": 1.8607559961302136, + "learning_rate": 9.925862875233795e-05, + "loss": 3.8301, + "step": 4306 + }, + { + "epoch": 2.7755036261079775, + "grad_norm": 2.434416289594967, + "learning_rate": 9.925828282124572e-05, + "loss": 3.6907, + "step": 4307 + }, + { + "epoch": 2.776148267526189, + "grad_norm": 2.3679697558938537, + "learning_rate": 9.925793681007415e-05, + "loss": 3.8176, + "step": 4308 + }, + { + "epoch": 2.7767929089443997, + "grad_norm": 2.2140285660565895, + "learning_rate": 9.925759071882378e-05, + "loss": 3.9183, + "step": 4309 + }, + { + "epoch": 2.7774375503626105, + "grad_norm": 2.310445601375679, + "learning_rate": 9.925724454749517e-05, + "loss": 3.4078, + "step": 4310 + }, + { + "epoch": 2.778082191780822, + "grad_norm": 2.4121097484565066, + "learning_rate": 9.925689829608892e-05, + "loss": 3.5782, + "step": 4311 + }, + { + "epoch": 2.778726833199033, + "grad_norm": 2.2566488568291834, + "learning_rate": 9.925655196460556e-05, + "loss": 3.6246, + "step": 4312 + }, + { + "epoch": 2.7793714746172444, + "grad_norm": 2.096871231432969, + "learning_rate": 9.92562055530457e-05, + "loss": 4.058, + "step": 4313 + }, + { + "epoch": 2.7800161160354553, + "grad_norm": 2.27138303248035, + "learning_rate": 9.925585906140987e-05, + "loss": 4.1752, + "step": 4314 + }, + { + "epoch": 2.780660757453666, + "grad_norm": 1.8782863902107272, + "learning_rate": 9.925551248969865e-05, + "loss": 3.452, + "step": 4315 + }, + { + "epoch": 2.7813053988718774, + "grad_norm": 1.487756893628571, + "learning_rate": 9.925516583791263e-05, + "loss": 3.6446, + "step": 4316 + }, + { + "epoch": 2.7819500402900887, + "grad_norm": 1.871990544333039, + "learning_rate": 9.925481910605235e-05, + "loss": 3.5485, + "step": 4317 + }, + { + "epoch": 2.7825946817082996, + "grad_norm": 1.288117631846512, + "learning_rate": 9.925447229411841e-05, + "loss": 3.605, + "step": 4318 + }, + { + "epoch": 2.783239323126511, + "grad_norm": 1.854308484335437, + "learning_rate": 9.925412540211135e-05, + "loss": 4.0412, + "step": 4319 + }, + { + "epoch": 2.7838839645447218, + "grad_norm": 1.9674913605453714, + "learning_rate": 9.925377843003177e-05, + "loss": 3.9436, + "step": 4320 + }, + { + "epoch": 2.784528605962933, + "grad_norm": 1.7927418227490926, + "learning_rate": 9.925343137788023e-05, + "loss": 3.5449, + "step": 4321 + }, + { + "epoch": 2.7851732473811444, + "grad_norm": 1.458853858570859, + "learning_rate": 9.925308424565725e-05, + "loss": 4.1879, + "step": 4322 + }, + { + "epoch": 2.785817888799355, + "grad_norm": 1.501313869205455, + "learning_rate": 9.925273703336349e-05, + "loss": 3.9439, + "step": 4323 + }, + { + "epoch": 2.7864625302175665, + "grad_norm": 1.1258876652762828, + "learning_rate": 9.925238974099945e-05, + "loss": 3.8293, + "step": 4324 + }, + { + "epoch": 2.7871071716357774, + "grad_norm": 1.3582998949288612, + "learning_rate": 9.925204236856572e-05, + "loss": 3.9517, + "step": 4325 + }, + { + "epoch": 2.7877518130539887, + "grad_norm": 1.2772624317930967, + "learning_rate": 9.925169491606289e-05, + "loss": 3.6165, + "step": 4326 + }, + { + "epoch": 2.7883964544722, + "grad_norm": 1.4003180384423202, + "learning_rate": 9.925134738349151e-05, + "loss": 3.8219, + "step": 4327 + }, + { + "epoch": 2.789041095890411, + "grad_norm": 1.4008415488033636, + "learning_rate": 9.925099977085214e-05, + "loss": 3.9516, + "step": 4328 + }, + { + "epoch": 2.789685737308622, + "grad_norm": 1.3939285699153496, + "learning_rate": 9.925065207814537e-05, + "loss": 3.9858, + "step": 4329 + }, + { + "epoch": 2.790330378726833, + "grad_norm": 1.1119879618696502, + "learning_rate": 9.925030430537178e-05, + "loss": 3.5025, + "step": 4330 + }, + { + "epoch": 2.7909750201450443, + "grad_norm": 1.2826017198532067, + "learning_rate": 9.924995645253195e-05, + "loss": 3.9051, + "step": 4331 + }, + { + "epoch": 2.7916196615632556, + "grad_norm": 1.6513575926610806, + "learning_rate": 9.92496085196264e-05, + "loss": 3.7343, + "step": 4332 + }, + { + "epoch": 2.7922643029814664, + "grad_norm": 1.5029135402757006, + "learning_rate": 9.924926050665573e-05, + "loss": 3.994, + "step": 4333 + }, + { + "epoch": 2.7929089443996777, + "grad_norm": 1.6060795538346524, + "learning_rate": 9.924891241362052e-05, + "loss": 4.1659, + "step": 4334 + }, + { + "epoch": 2.7935535858178886, + "grad_norm": 1.2551374645195663, + "learning_rate": 9.924856424052134e-05, + "loss": 3.6093, + "step": 4335 + }, + { + "epoch": 2.7941982272361, + "grad_norm": 1.5774577745489338, + "learning_rate": 9.924821598735874e-05, + "loss": 3.7737, + "step": 4336 + }, + { + "epoch": 2.794842868654311, + "grad_norm": 1.527922900924166, + "learning_rate": 9.924786765413334e-05, + "loss": 3.8825, + "step": 4337 + }, + { + "epoch": 2.795487510072522, + "grad_norm": 1.9492817408068008, + "learning_rate": 9.924751924084564e-05, + "loss": 3.9221, + "step": 4338 + }, + { + "epoch": 2.7961321514907334, + "grad_norm": 1.6182765344478622, + "learning_rate": 9.924717074749628e-05, + "loss": 3.9485, + "step": 4339 + }, + { + "epoch": 2.796776792908944, + "grad_norm": 1.57583352167431, + "learning_rate": 9.924682217408582e-05, + "loss": 3.7096, + "step": 4340 + }, + { + "epoch": 2.7974214343271555, + "grad_norm": 1.8340901795895046, + "learning_rate": 9.924647352061479e-05, + "loss": 3.7993, + "step": 4341 + }, + { + "epoch": 2.798066075745367, + "grad_norm": 1.1330521295759275, + "learning_rate": 9.924612478708379e-05, + "loss": 3.9024, + "step": 4342 + }, + { + "epoch": 2.7987107171635777, + "grad_norm": 1.4509855838821506, + "learning_rate": 9.924577597349342e-05, + "loss": 3.5692, + "step": 4343 + }, + { + "epoch": 2.799355358581789, + "grad_norm": 1.2932497451363536, + "learning_rate": 9.924542707984419e-05, + "loss": 3.874, + "step": 4344 + }, + { + "epoch": 2.8, + "grad_norm": 1.3954547990050703, + "learning_rate": 9.924507810613674e-05, + "loss": 3.8064, + "step": 4345 + }, + { + "epoch": 2.800644641418211, + "grad_norm": 1.2096916189671767, + "learning_rate": 9.92447290523716e-05, + "loss": 3.9964, + "step": 4346 + }, + { + "epoch": 2.8012892828364224, + "grad_norm": 1.4152963918019221, + "learning_rate": 9.924437991854935e-05, + "loss": 3.8095, + "step": 4347 + }, + { + "epoch": 2.8019339242546333, + "grad_norm": 1.2282684802420782, + "learning_rate": 9.924403070467058e-05, + "loss": 3.7493, + "step": 4348 + }, + { + "epoch": 2.8025785656728446, + "grad_norm": 1.3276205791868858, + "learning_rate": 9.924368141073585e-05, + "loss": 3.9459, + "step": 4349 + }, + { + "epoch": 2.8032232070910554, + "grad_norm": 1.4854182903415598, + "learning_rate": 9.924333203674573e-05, + "loss": 3.8074, + "step": 4350 + }, + { + "epoch": 2.8038678485092667, + "grad_norm": 1.333421041852796, + "learning_rate": 9.92429825827008e-05, + "loss": 4.0061, + "step": 4351 + }, + { + "epoch": 2.804512489927478, + "grad_norm": 1.3811689456185579, + "learning_rate": 9.924263304860164e-05, + "loss": 3.8885, + "step": 4352 + }, + { + "epoch": 2.805157131345689, + "grad_norm": 1.4376335691787279, + "learning_rate": 9.924228343444881e-05, + "loss": 4.0706, + "step": 4353 + }, + { + "epoch": 2.8058017727639, + "grad_norm": 1.3515215742428286, + "learning_rate": 9.924193374024292e-05, + "loss": 3.55, + "step": 4354 + }, + { + "epoch": 2.806446414182111, + "grad_norm": 1.8370291211018008, + "learning_rate": 9.924158396598448e-05, + "loss": 3.6814, + "step": 4355 + }, + { + "epoch": 2.8070910556003223, + "grad_norm": 1.599167567950136, + "learning_rate": 9.92412341116741e-05, + "loss": 3.8169, + "step": 4356 + }, + { + "epoch": 2.8077356970185336, + "grad_norm": 1.533536529887646, + "learning_rate": 9.924088417731239e-05, + "loss": 4.0474, + "step": 4357 + }, + { + "epoch": 2.8083803384367445, + "grad_norm": 1.6023821061344037, + "learning_rate": 9.924053416289987e-05, + "loss": 3.9454, + "step": 4358 + }, + { + "epoch": 2.809024979854956, + "grad_norm": 1.5665092708341133, + "learning_rate": 9.924018406843712e-05, + "loss": 4.009, + "step": 4359 + }, + { + "epoch": 2.8096696212731667, + "grad_norm": 1.6473630247888331, + "learning_rate": 9.923983389392475e-05, + "loss": 3.6839, + "step": 4360 + }, + { + "epoch": 2.810314262691378, + "grad_norm": 1.606993518044603, + "learning_rate": 9.923948363936332e-05, + "loss": 3.6044, + "step": 4361 + }, + { + "epoch": 2.8109589041095893, + "grad_norm": 1.5022125362401422, + "learning_rate": 9.923913330475339e-05, + "loss": 3.8234, + "step": 4362 + }, + { + "epoch": 2.8116035455278, + "grad_norm": 1.5310639323282589, + "learning_rate": 9.923878289009555e-05, + "loss": 3.9262, + "step": 4363 + }, + { + "epoch": 2.8122481869460114, + "grad_norm": 1.5323673614067477, + "learning_rate": 9.923843239539038e-05, + "loss": 3.6551, + "step": 4364 + }, + { + "epoch": 2.8128928283642223, + "grad_norm": 1.1899906695295863, + "learning_rate": 9.923808182063844e-05, + "loss": 3.7935, + "step": 4365 + }, + { + "epoch": 2.8135374697824336, + "grad_norm": 1.522766932663456, + "learning_rate": 9.92377311658403e-05, + "loss": 3.7226, + "step": 4366 + }, + { + "epoch": 2.814182111200645, + "grad_norm": 1.7805103900886519, + "learning_rate": 9.923738043099656e-05, + "loss": 3.943, + "step": 4367 + }, + { + "epoch": 2.8148267526188557, + "grad_norm": 1.565072122923306, + "learning_rate": 9.92370296161078e-05, + "loss": 3.9441, + "step": 4368 + }, + { + "epoch": 2.815471394037067, + "grad_norm": 1.4677003027892368, + "learning_rate": 9.923667872117454e-05, + "loss": 3.6834, + "step": 4369 + }, + { + "epoch": 2.816116035455278, + "grad_norm": 2.0380847906502457, + "learning_rate": 9.923632774619743e-05, + "loss": 4.0099, + "step": 4370 + }, + { + "epoch": 2.816760676873489, + "grad_norm": 1.7264353156381005, + "learning_rate": 9.9235976691177e-05, + "loss": 3.7761, + "step": 4371 + }, + { + "epoch": 2.8174053182917005, + "grad_norm": 1.7790882852943386, + "learning_rate": 9.923562555611385e-05, + "loss": 3.8784, + "step": 4372 + }, + { + "epoch": 2.8180499597099113, + "grad_norm": 2.1522399637833387, + "learning_rate": 9.923527434100855e-05, + "loss": 3.8093, + "step": 4373 + }, + { + "epoch": 2.8186946011281226, + "grad_norm": 1.712429341924036, + "learning_rate": 9.923492304586165e-05, + "loss": 3.8176, + "step": 4374 + }, + { + "epoch": 2.8193392425463335, + "grad_norm": 2.1280977503644687, + "learning_rate": 9.923457167067378e-05, + "loss": 4.0651, + "step": 4375 + }, + { + "epoch": 2.819983883964545, + "grad_norm": 1.4192358078975336, + "learning_rate": 9.923422021544548e-05, + "loss": 4.1666, + "step": 4376 + }, + { + "epoch": 2.820628525382756, + "grad_norm": 1.5483003687273518, + "learning_rate": 9.923386868017733e-05, + "loss": 3.8776, + "step": 4377 + }, + { + "epoch": 2.821273166800967, + "grad_norm": 1.490109418357633, + "learning_rate": 9.923351706486991e-05, + "loss": 3.7855, + "step": 4378 + }, + { + "epoch": 2.821917808219178, + "grad_norm": 1.412129021849041, + "learning_rate": 9.92331653695238e-05, + "loss": 4.0291, + "step": 4379 + }, + { + "epoch": 2.822562449637389, + "grad_norm": 1.6833647684768347, + "learning_rate": 9.923281359413957e-05, + "loss": 4.0312, + "step": 4380 + }, + { + "epoch": 2.8232070910556004, + "grad_norm": 1.5536646449827751, + "learning_rate": 9.923246173871782e-05, + "loss": 3.382, + "step": 4381 + }, + { + "epoch": 2.8238517324738117, + "grad_norm": 1.1876771187979316, + "learning_rate": 9.923210980325911e-05, + "loss": 3.5765, + "step": 4382 + }, + { + "epoch": 2.8244963738920226, + "grad_norm": 1.4955868111368957, + "learning_rate": 9.923175778776402e-05, + "loss": 3.8234, + "step": 4383 + }, + { + "epoch": 2.8251410153102334, + "grad_norm": 1.621229278294363, + "learning_rate": 9.923140569223313e-05, + "loss": 4.006, + "step": 4384 + }, + { + "epoch": 2.8257856567284447, + "grad_norm": 1.374188887329286, + "learning_rate": 9.923105351666702e-05, + "loss": 3.7244, + "step": 4385 + }, + { + "epoch": 2.826430298146656, + "grad_norm": 1.8660242088249996, + "learning_rate": 9.923070126106626e-05, + "loss": 3.6403, + "step": 4386 + }, + { + "epoch": 2.827074939564867, + "grad_norm": 1.7253595687213068, + "learning_rate": 9.923034892543143e-05, + "loss": 4.0016, + "step": 4387 + }, + { + "epoch": 2.827719580983078, + "grad_norm": 1.3831857772356047, + "learning_rate": 9.922999650976313e-05, + "loss": 3.7829, + "step": 4388 + }, + { + "epoch": 2.828364222401289, + "grad_norm": 1.5077108811944826, + "learning_rate": 9.922964401406192e-05, + "loss": 3.9995, + "step": 4389 + }, + { + "epoch": 2.8290088638195003, + "grad_norm": 1.8748242542841607, + "learning_rate": 9.922929143832835e-05, + "loss": 3.7319, + "step": 4390 + }, + { + "epoch": 2.8296535052377116, + "grad_norm": 1.3055740463264291, + "learning_rate": 9.922893878256306e-05, + "loss": 3.8297, + "step": 4391 + }, + { + "epoch": 2.8302981466559225, + "grad_norm": 1.8566272600625817, + "learning_rate": 9.92285860467666e-05, + "loss": 4.0867, + "step": 4392 + }, + { + "epoch": 2.830942788074134, + "grad_norm": 1.7882448255783552, + "learning_rate": 9.922823323093953e-05, + "loss": 4.0549, + "step": 4393 + }, + { + "epoch": 2.8315874294923447, + "grad_norm": 1.4165833398411574, + "learning_rate": 9.922788033508245e-05, + "loss": 3.7539, + "step": 4394 + }, + { + "epoch": 2.832232070910556, + "grad_norm": 1.802373792702418, + "learning_rate": 9.922752735919596e-05, + "loss": 3.9021, + "step": 4395 + }, + { + "epoch": 2.8328767123287673, + "grad_norm": 1.536225921218292, + "learning_rate": 9.92271743032806e-05, + "loss": 4.0061, + "step": 4396 + }, + { + "epoch": 2.833521353746978, + "grad_norm": 1.425705158861146, + "learning_rate": 9.922682116733697e-05, + "loss": 4.2128, + "step": 4397 + }, + { + "epoch": 2.8341659951651894, + "grad_norm": 1.6363296638037637, + "learning_rate": 9.922646795136565e-05, + "loss": 3.9337, + "step": 4398 + }, + { + "epoch": 2.8348106365834003, + "grad_norm": 1.6360611137786027, + "learning_rate": 9.922611465536719e-05, + "loss": 3.4575, + "step": 4399 + }, + { + "epoch": 2.8354552780016116, + "grad_norm": 2.2749610566664353, + "learning_rate": 9.922576127934224e-05, + "loss": 3.8226, + "step": 4400 + }, + { + "epoch": 2.8354552780016116, + "eval_loss": 4.0348944664001465, + "eval_runtime": 2.9796, + "eval_samples_per_second": 33.562, + "eval_steps_per_second": 4.363, + "step": 4400 + }, + { + "epoch": 2.836099919419823, + "grad_norm": 1.6251613497737185, + "learning_rate": 9.922540782329132e-05, + "loss": 3.6907, + "step": 4401 + }, + { + "epoch": 2.8367445608380337, + "grad_norm": 2.0507369516224805, + "learning_rate": 9.922505428721504e-05, + "loss": 3.8881, + "step": 4402 + }, + { + "epoch": 2.837389202256245, + "grad_norm": 1.6683334812111388, + "learning_rate": 9.922470067111394e-05, + "loss": 3.6263, + "step": 4403 + }, + { + "epoch": 2.838033843674456, + "grad_norm": 1.8423238543810863, + "learning_rate": 9.922434697498864e-05, + "loss": 3.5607, + "step": 4404 + }, + { + "epoch": 2.838678485092667, + "grad_norm": 2.0867477147163203, + "learning_rate": 9.922399319883972e-05, + "loss": 3.8149, + "step": 4405 + }, + { + "epoch": 2.8393231265108785, + "grad_norm": 1.4103721690699218, + "learning_rate": 9.922363934266776e-05, + "loss": 4.0869, + "step": 4406 + }, + { + "epoch": 2.8399677679290893, + "grad_norm": 1.8041626780603899, + "learning_rate": 9.922328540647333e-05, + "loss": 3.9588, + "step": 4407 + }, + { + "epoch": 2.8406124093473006, + "grad_norm": 1.2644636676657215, + "learning_rate": 9.9222931390257e-05, + "loss": 3.9246, + "step": 4408 + }, + { + "epoch": 2.8412570507655115, + "grad_norm": 1.8073575167723945, + "learning_rate": 9.922257729401936e-05, + "loss": 3.8104, + "step": 4409 + }, + { + "epoch": 2.841901692183723, + "grad_norm": 1.5253487255104954, + "learning_rate": 9.922222311776101e-05, + "loss": 3.9065, + "step": 4410 + }, + { + "epoch": 2.842546333601934, + "grad_norm": 1.6687863379940053, + "learning_rate": 9.922186886148251e-05, + "loss": 4.0218, + "step": 4411 + }, + { + "epoch": 2.843190975020145, + "grad_norm": 1.5850425485777744, + "learning_rate": 9.922151452518447e-05, + "loss": 3.9642, + "step": 4412 + }, + { + "epoch": 2.8438356164383563, + "grad_norm": 1.3478086524107162, + "learning_rate": 9.922116010886744e-05, + "loss": 3.5884, + "step": 4413 + }, + { + "epoch": 2.844480257856567, + "grad_norm": 1.3338618996083893, + "learning_rate": 9.922080561253201e-05, + "loss": 3.6473, + "step": 4414 + }, + { + "epoch": 2.8451248992747784, + "grad_norm": 1.4244387255683695, + "learning_rate": 9.922045103617877e-05, + "loss": 3.9114, + "step": 4415 + }, + { + "epoch": 2.8457695406929897, + "grad_norm": 1.5627301099184658, + "learning_rate": 9.92200963798083e-05, + "loss": 3.9228, + "step": 4416 + }, + { + "epoch": 2.8464141821112006, + "grad_norm": 1.6512093357602098, + "learning_rate": 9.921974164342118e-05, + "loss": 3.8444, + "step": 4417 + }, + { + "epoch": 2.847058823529412, + "grad_norm": 1.469401062986088, + "learning_rate": 9.9219386827018e-05, + "loss": 4.1401, + "step": 4418 + }, + { + "epoch": 2.8477034649476227, + "grad_norm": 1.4939530310600846, + "learning_rate": 9.921903193059933e-05, + "loss": 4.0352, + "step": 4419 + }, + { + "epoch": 2.848348106365834, + "grad_norm": 1.4381417688646918, + "learning_rate": 9.921867695416578e-05, + "loss": 3.853, + "step": 4420 + }, + { + "epoch": 2.8489927477840453, + "grad_norm": 1.8947858045556565, + "learning_rate": 9.92183218977179e-05, + "loss": 3.9113, + "step": 4421 + }, + { + "epoch": 2.849637389202256, + "grad_norm": 1.3771310288483856, + "learning_rate": 9.921796676125628e-05, + "loss": 3.8131, + "step": 4422 + }, + { + "epoch": 2.8502820306204675, + "grad_norm": 1.631357154868409, + "learning_rate": 9.92176115447815e-05, + "loss": 3.7876, + "step": 4423 + }, + { + "epoch": 2.8509266720386783, + "grad_norm": 1.289502351519129, + "learning_rate": 9.921725624829417e-05, + "loss": 3.6933, + "step": 4424 + }, + { + "epoch": 2.8515713134568896, + "grad_norm": 1.6860465249550625, + "learning_rate": 9.921690087179485e-05, + "loss": 3.9579, + "step": 4425 + }, + { + "epoch": 2.852215954875101, + "grad_norm": 1.6107661134743636, + "learning_rate": 9.921654541528414e-05, + "loss": 3.8146, + "step": 4426 + }, + { + "epoch": 2.852860596293312, + "grad_norm": 1.4920731401005765, + "learning_rate": 9.92161898787626e-05, + "loss": 4.0516, + "step": 4427 + }, + { + "epoch": 2.853505237711523, + "grad_norm": 1.613494003977371, + "learning_rate": 9.921583426223082e-05, + "loss": 3.811, + "step": 4428 + }, + { + "epoch": 2.854149879129734, + "grad_norm": 1.2843766741536116, + "learning_rate": 9.92154785656894e-05, + "loss": 3.7007, + "step": 4429 + }, + { + "epoch": 2.8547945205479452, + "grad_norm": 1.6219908099811968, + "learning_rate": 9.921512278913894e-05, + "loss": 3.8419, + "step": 4430 + }, + { + "epoch": 2.8554391619661565, + "grad_norm": 1.4912700976451432, + "learning_rate": 9.921476693257996e-05, + "loss": 3.3953, + "step": 4431 + }, + { + "epoch": 2.8560838033843674, + "grad_norm": 1.5148971225445278, + "learning_rate": 9.921441099601312e-05, + "loss": 3.8324, + "step": 4432 + }, + { + "epoch": 2.8567284448025787, + "grad_norm": 1.2506027704030298, + "learning_rate": 9.921405497943896e-05, + "loss": 3.5254, + "step": 4433 + }, + { + "epoch": 2.8573730862207896, + "grad_norm": 1.304854308729455, + "learning_rate": 9.921369888285805e-05, + "loss": 3.7995, + "step": 4434 + }, + { + "epoch": 2.858017727639001, + "grad_norm": 1.6427771190585285, + "learning_rate": 9.921334270627102e-05, + "loss": 3.8215, + "step": 4435 + }, + { + "epoch": 2.858662369057212, + "grad_norm": 1.406077439050512, + "learning_rate": 9.921298644967844e-05, + "loss": 3.7764, + "step": 4436 + }, + { + "epoch": 2.859307010475423, + "grad_norm": 1.5063213203935055, + "learning_rate": 9.921263011308088e-05, + "loss": 4.1846, + "step": 4437 + }, + { + "epoch": 2.8599516518936343, + "grad_norm": 1.5056775068769876, + "learning_rate": 9.921227369647893e-05, + "loss": 4.0637, + "step": 4438 + }, + { + "epoch": 2.860596293311845, + "grad_norm": 1.4212706497269971, + "learning_rate": 9.921191719987318e-05, + "loss": 3.8459, + "step": 4439 + }, + { + "epoch": 2.8612409347300565, + "grad_norm": 1.624342694245189, + "learning_rate": 9.921156062326421e-05, + "loss": 3.9942, + "step": 4440 + }, + { + "epoch": 2.8618855761482678, + "grad_norm": 1.5941185217087288, + "learning_rate": 9.921120396665262e-05, + "loss": 3.7209, + "step": 4441 + }, + { + "epoch": 2.8625302175664786, + "grad_norm": 2.1662706401567715, + "learning_rate": 9.921084723003898e-05, + "loss": 3.8868, + "step": 4442 + }, + { + "epoch": 2.86317485898469, + "grad_norm": 1.6108639393915365, + "learning_rate": 9.921049041342388e-05, + "loss": 3.7563, + "step": 4443 + }, + { + "epoch": 2.863819500402901, + "grad_norm": 1.636641727796395, + "learning_rate": 9.921013351680794e-05, + "loss": 3.7107, + "step": 4444 + }, + { + "epoch": 2.864464141821112, + "grad_norm": 2.3960078094048245, + "learning_rate": 9.920977654019168e-05, + "loss": 4.0426, + "step": 4445 + }, + { + "epoch": 2.8651087832393234, + "grad_norm": 1.2206795164411852, + "learning_rate": 9.920941948357573e-05, + "loss": 3.7344, + "step": 4446 + }, + { + "epoch": 2.8657534246575342, + "grad_norm": 1.6605001731284397, + "learning_rate": 9.920906234696067e-05, + "loss": 4.075, + "step": 4447 + }, + { + "epoch": 2.866398066075745, + "grad_norm": 1.15581547985726, + "learning_rate": 9.920870513034709e-05, + "loss": 3.6771, + "step": 4448 + }, + { + "epoch": 2.8670427074939564, + "grad_norm": 1.4881911744882763, + "learning_rate": 9.920834783373555e-05, + "loss": 4.0145, + "step": 4449 + }, + { + "epoch": 2.8676873489121677, + "grad_norm": 1.208079202977052, + "learning_rate": 9.920799045712665e-05, + "loss": 3.5857, + "step": 4450 + }, + { + "epoch": 2.868331990330379, + "grad_norm": 1.353869380882891, + "learning_rate": 9.9207633000521e-05, + "loss": 3.9051, + "step": 4451 + }, + { + "epoch": 2.86897663174859, + "grad_norm": 1.4839875797066757, + "learning_rate": 9.920727546391919e-05, + "loss": 3.5491, + "step": 4452 + }, + { + "epoch": 2.8696212731668007, + "grad_norm": 1.2674069079298458, + "learning_rate": 9.920691784732177e-05, + "loss": 3.7261, + "step": 4453 + }, + { + "epoch": 2.870265914585012, + "grad_norm": 1.4033832414562042, + "learning_rate": 9.920656015072935e-05, + "loss": 4.2387, + "step": 4454 + }, + { + "epoch": 2.8709105560032233, + "grad_norm": 1.5527622424148104, + "learning_rate": 9.920620237414251e-05, + "loss": 3.6946, + "step": 4455 + }, + { + "epoch": 2.871555197421434, + "grad_norm": 1.6941293486455364, + "learning_rate": 9.920584451756185e-05, + "loss": 4.0817, + "step": 4456 + }, + { + "epoch": 2.8721998388396455, + "grad_norm": 1.2408074715823278, + "learning_rate": 9.920548658098794e-05, + "loss": 3.6783, + "step": 4457 + }, + { + "epoch": 2.8728444802578563, + "grad_norm": 2.0273219317817803, + "learning_rate": 9.920512856442137e-05, + "loss": 3.8296, + "step": 4458 + }, + { + "epoch": 2.8734891216760676, + "grad_norm": 1.7968710541689823, + "learning_rate": 9.920477046786274e-05, + "loss": 4.2584, + "step": 4459 + }, + { + "epoch": 2.874133763094279, + "grad_norm": 1.2943797827488668, + "learning_rate": 9.920441229131263e-05, + "loss": 3.4945, + "step": 4460 + }, + { + "epoch": 2.87477840451249, + "grad_norm": 1.544961919283553, + "learning_rate": 9.920405403477165e-05, + "loss": 4.3317, + "step": 4461 + }, + { + "epoch": 2.875423045930701, + "grad_norm": 1.3145822048891291, + "learning_rate": 9.920369569824035e-05, + "loss": 4.1057, + "step": 4462 + }, + { + "epoch": 2.876067687348912, + "grad_norm": 1.3804878457847276, + "learning_rate": 9.920333728171935e-05, + "loss": 3.574, + "step": 4463 + }, + { + "epoch": 2.8767123287671232, + "grad_norm": 1.4769405593334337, + "learning_rate": 9.920297878520922e-05, + "loss": 3.7153, + "step": 4464 + }, + { + "epoch": 2.8773569701853345, + "grad_norm": 1.232054465007174, + "learning_rate": 9.920262020871054e-05, + "loss": 3.8502, + "step": 4465 + }, + { + "epoch": 2.8780016116035454, + "grad_norm": 1.2164068162854857, + "learning_rate": 9.920226155222393e-05, + "loss": 3.815, + "step": 4466 + }, + { + "epoch": 2.8786462530217567, + "grad_norm": 1.223391542431204, + "learning_rate": 9.920190281574996e-05, + "loss": 3.809, + "step": 4467 + }, + { + "epoch": 2.8792908944399676, + "grad_norm": 1.1530340665816883, + "learning_rate": 9.920154399928922e-05, + "loss": 3.852, + "step": 4468 + }, + { + "epoch": 2.879935535858179, + "grad_norm": 1.2332214789696865, + "learning_rate": 9.92011851028423e-05, + "loss": 4.0388, + "step": 4469 + }, + { + "epoch": 2.88058017727639, + "grad_norm": 1.2393134783825983, + "learning_rate": 9.92008261264098e-05, + "loss": 4.0646, + "step": 4470 + }, + { + "epoch": 2.881224818694601, + "grad_norm": 1.5175768502862346, + "learning_rate": 9.920046706999231e-05, + "loss": 4.0323, + "step": 4471 + }, + { + "epoch": 2.8818694601128123, + "grad_norm": 1.5130104135407216, + "learning_rate": 9.92001079335904e-05, + "loss": 3.6412, + "step": 4472 + }, + { + "epoch": 2.882514101531023, + "grad_norm": 1.7338664439062443, + "learning_rate": 9.919974871720466e-05, + "loss": 3.9135, + "step": 4473 + }, + { + "epoch": 2.8831587429492345, + "grad_norm": 1.212371603204769, + "learning_rate": 9.91993894208357e-05, + "loss": 3.9502, + "step": 4474 + }, + { + "epoch": 2.8838033843674458, + "grad_norm": 1.1788502202956603, + "learning_rate": 9.919903004448408e-05, + "loss": 3.9882, + "step": 4475 + }, + { + "epoch": 2.8844480257856566, + "grad_norm": 1.4522448848818836, + "learning_rate": 9.919867058815043e-05, + "loss": 3.9527, + "step": 4476 + }, + { + "epoch": 2.885092667203868, + "grad_norm": 1.5877746809012845, + "learning_rate": 9.919831105183534e-05, + "loss": 4.1867, + "step": 4477 + }, + { + "epoch": 2.885737308622079, + "grad_norm": 1.5162239011127894, + "learning_rate": 9.919795143553934e-05, + "loss": 3.6807, + "step": 4478 + }, + { + "epoch": 2.88638195004029, + "grad_norm": 1.8003219321009964, + "learning_rate": 9.919759173926309e-05, + "loss": 3.5458, + "step": 4479 + }, + { + "epoch": 2.8870265914585014, + "grad_norm": 1.5131933212938347, + "learning_rate": 9.919723196300713e-05, + "loss": 4.0011, + "step": 4480 + }, + { + "epoch": 2.8876712328767122, + "grad_norm": 1.4719101468212523, + "learning_rate": 9.919687210677211e-05, + "loss": 3.4494, + "step": 4481 + }, + { + "epoch": 2.8883158742949235, + "grad_norm": 1.9792648490144036, + "learning_rate": 9.919651217055856e-05, + "loss": 3.6513, + "step": 4482 + }, + { + "epoch": 2.8889605157131344, + "grad_norm": 1.5157139102252073, + "learning_rate": 9.919615215436711e-05, + "loss": 3.8405, + "step": 4483 + }, + { + "epoch": 2.8896051571313457, + "grad_norm": 1.3844016937604473, + "learning_rate": 9.919579205819831e-05, + "loss": 3.7786, + "step": 4484 + }, + { + "epoch": 2.890249798549557, + "grad_norm": 1.695559520942929, + "learning_rate": 9.919543188205279e-05, + "loss": 3.5021, + "step": 4485 + }, + { + "epoch": 2.890894439967768, + "grad_norm": 1.326555233384077, + "learning_rate": 9.919507162593114e-05, + "loss": 3.5989, + "step": 4486 + }, + { + "epoch": 2.891539081385979, + "grad_norm": 1.551968433043887, + "learning_rate": 9.919471128983394e-05, + "loss": 3.9016, + "step": 4487 + }, + { + "epoch": 2.89218372280419, + "grad_norm": 1.8803635393019196, + "learning_rate": 9.919435087376178e-05, + "loss": 3.9721, + "step": 4488 + }, + { + "epoch": 2.8928283642224013, + "grad_norm": 1.4093666612657918, + "learning_rate": 9.919399037771525e-05, + "loss": 4.0035, + "step": 4489 + }, + { + "epoch": 2.8934730056406126, + "grad_norm": 1.6001067486747083, + "learning_rate": 9.919362980169495e-05, + "loss": 3.9039, + "step": 4490 + }, + { + "epoch": 2.8941176470588235, + "grad_norm": 1.4260707855434178, + "learning_rate": 9.919326914570148e-05, + "loss": 3.5656, + "step": 4491 + }, + { + "epoch": 2.8947622884770348, + "grad_norm": 1.4227104119064504, + "learning_rate": 9.919290840973541e-05, + "loss": 3.9236, + "step": 4492 + }, + { + "epoch": 2.8954069298952456, + "grad_norm": 1.8243142350350876, + "learning_rate": 9.919254759379734e-05, + "loss": 3.7337, + "step": 4493 + }, + { + "epoch": 2.896051571313457, + "grad_norm": 1.6964276614169775, + "learning_rate": 9.919218669788788e-05, + "loss": 4.0703, + "step": 4494 + }, + { + "epoch": 2.896696212731668, + "grad_norm": 1.8624486113016354, + "learning_rate": 9.919182572200761e-05, + "loss": 3.7448, + "step": 4495 + }, + { + "epoch": 2.897340854149879, + "grad_norm": 2.1202510786905426, + "learning_rate": 9.919146466615711e-05, + "loss": 3.569, + "step": 4496 + }, + { + "epoch": 2.8979854955680904, + "grad_norm": 2.037971245200058, + "learning_rate": 9.919110353033699e-05, + "loss": 3.7171, + "step": 4497 + }, + { + "epoch": 2.8986301369863012, + "grad_norm": 1.8279448973696426, + "learning_rate": 9.919074231454786e-05, + "loss": 3.8441, + "step": 4498 + }, + { + "epoch": 2.8992747784045125, + "grad_norm": 1.6811581081042204, + "learning_rate": 9.919038101879027e-05, + "loss": 3.914, + "step": 4499 + }, + { + "epoch": 2.899919419822724, + "grad_norm": 1.1601419226385845, + "learning_rate": 9.919001964306483e-05, + "loss": 3.8653, + "step": 4500 + }, + { + "epoch": 2.899919419822724, + "eval_loss": 4.026595592498779, + "eval_runtime": 2.9704, + "eval_samples_per_second": 33.665, + "eval_steps_per_second": 4.376, + "step": 4500 + }, + { + "epoch": 2.9005640612409347, + "grad_norm": 1.6437687863839117, + "learning_rate": 9.918965818737217e-05, + "loss": 3.7781, + "step": 4501 + }, + { + "epoch": 2.901208702659146, + "grad_norm": 1.4143024447150472, + "learning_rate": 9.918929665171281e-05, + "loss": 3.7182, + "step": 4502 + }, + { + "epoch": 2.901853344077357, + "grad_norm": 1.8811375200221325, + "learning_rate": 9.918893503608742e-05, + "loss": 3.7486, + "step": 4503 + }, + { + "epoch": 2.902497985495568, + "grad_norm": 1.3063732358828668, + "learning_rate": 9.918857334049655e-05, + "loss": 3.8951, + "step": 4504 + }, + { + "epoch": 2.9031426269137794, + "grad_norm": 1.7421624680334835, + "learning_rate": 9.91882115649408e-05, + "loss": 3.8781, + "step": 4505 + }, + { + "epoch": 2.9037872683319903, + "grad_norm": 1.7621624642711464, + "learning_rate": 9.918784970942077e-05, + "loss": 3.462, + "step": 4506 + }, + { + "epoch": 2.9044319097502016, + "grad_norm": 1.559002632931317, + "learning_rate": 9.918748777393706e-05, + "loss": 3.5896, + "step": 4507 + }, + { + "epoch": 2.9050765511684125, + "grad_norm": 1.46042779794042, + "learning_rate": 9.918712575849023e-05, + "loss": 3.7136, + "step": 4508 + }, + { + "epoch": 2.9057211925866238, + "grad_norm": 1.5172322152400395, + "learning_rate": 9.918676366308094e-05, + "loss": 3.8117, + "step": 4509 + }, + { + "epoch": 2.906365834004835, + "grad_norm": 1.0742254877395818, + "learning_rate": 9.918640148770973e-05, + "loss": 3.8259, + "step": 4510 + }, + { + "epoch": 2.907010475423046, + "grad_norm": 1.36601539557734, + "learning_rate": 9.918603923237722e-05, + "loss": 3.9393, + "step": 4511 + }, + { + "epoch": 2.907655116841257, + "grad_norm": 1.1280734603220042, + "learning_rate": 9.918567689708398e-05, + "loss": 4.0492, + "step": 4512 + }, + { + "epoch": 2.908299758259468, + "grad_norm": 1.5920382146073786, + "learning_rate": 9.918531448183064e-05, + "loss": 3.9382, + "step": 4513 + }, + { + "epoch": 2.9089443996776794, + "grad_norm": 1.5234811458147441, + "learning_rate": 9.918495198661774e-05, + "loss": 4.1063, + "step": 4514 + }, + { + "epoch": 2.9095890410958907, + "grad_norm": 1.4542389663878295, + "learning_rate": 9.918458941144597e-05, + "loss": 4.1455, + "step": 4515 + }, + { + "epoch": 2.9102336825141015, + "grad_norm": 1.6878928901437884, + "learning_rate": 9.918422675631581e-05, + "loss": 3.692, + "step": 4516 + }, + { + "epoch": 2.9108783239323124, + "grad_norm": 1.6190904508579933, + "learning_rate": 9.918386402122795e-05, + "loss": 3.7741, + "step": 4517 + }, + { + "epoch": 2.9115229653505237, + "grad_norm": 1.6981738757556244, + "learning_rate": 9.918350120618292e-05, + "loss": 3.7446, + "step": 4518 + }, + { + "epoch": 2.912167606768735, + "grad_norm": 1.6173690970664447, + "learning_rate": 9.918313831118139e-05, + "loss": 3.8064, + "step": 4519 + }, + { + "epoch": 2.9128122481869463, + "grad_norm": 1.4994103779441441, + "learning_rate": 9.918277533622387e-05, + "loss": 3.6739, + "step": 4520 + }, + { + "epoch": 2.913456889605157, + "grad_norm": 1.6993506879908689, + "learning_rate": 9.9182412281311e-05, + "loss": 3.6934, + "step": 4521 + }, + { + "epoch": 2.914101531023368, + "grad_norm": 1.4698042569066352, + "learning_rate": 9.918204914644338e-05, + "loss": 3.8554, + "step": 4522 + }, + { + "epoch": 2.9147461724415793, + "grad_norm": 1.9318655651535448, + "learning_rate": 9.918168593162161e-05, + "loss": 3.5183, + "step": 4523 + }, + { + "epoch": 2.9153908138597906, + "grad_norm": 1.346079051715497, + "learning_rate": 9.918132263684626e-05, + "loss": 3.7418, + "step": 4524 + }, + { + "epoch": 2.9160354552780015, + "grad_norm": 2.2990758119091654, + "learning_rate": 9.918095926211795e-05, + "loss": 3.9009, + "step": 4525 + }, + { + "epoch": 2.9166800966962128, + "grad_norm": 1.3442377531317946, + "learning_rate": 9.918059580743728e-05, + "loss": 3.7615, + "step": 4526 + }, + { + "epoch": 2.9173247381144236, + "grad_norm": 2.3769357064731627, + "learning_rate": 9.918023227280481e-05, + "loss": 3.6043, + "step": 4527 + }, + { + "epoch": 2.917969379532635, + "grad_norm": 1.8888631816063974, + "learning_rate": 9.917986865822118e-05, + "loss": 4.2224, + "step": 4528 + }, + { + "epoch": 2.918614020950846, + "grad_norm": 1.7148878005252246, + "learning_rate": 9.917950496368697e-05, + "loss": 3.5936, + "step": 4529 + }, + { + "epoch": 2.919258662369057, + "grad_norm": 1.9751145193547774, + "learning_rate": 9.917914118920279e-05, + "loss": 4.1239, + "step": 4530 + }, + { + "epoch": 2.9199033037872684, + "grad_norm": 1.8814372347163184, + "learning_rate": 9.917877733476922e-05, + "loss": 3.8239, + "step": 4531 + }, + { + "epoch": 2.9205479452054792, + "grad_norm": 1.476778179916311, + "learning_rate": 9.917841340038684e-05, + "loss": 3.8125, + "step": 4532 + }, + { + "epoch": 2.9211925866236905, + "grad_norm": 1.7761845149516362, + "learning_rate": 9.91780493860563e-05, + "loss": 3.7629, + "step": 4533 + }, + { + "epoch": 2.921837228041902, + "grad_norm": 1.4458725256196323, + "learning_rate": 9.917768529177816e-05, + "loss": 3.8473, + "step": 4534 + }, + { + "epoch": 2.9224818694601127, + "grad_norm": 1.6233659372549833, + "learning_rate": 9.917732111755302e-05, + "loss": 3.8814, + "step": 4535 + }, + { + "epoch": 2.923126510878324, + "grad_norm": 1.621656841286354, + "learning_rate": 9.917695686338147e-05, + "loss": 3.9041, + "step": 4536 + }, + { + "epoch": 2.923771152296535, + "grad_norm": 1.7279063354584925, + "learning_rate": 9.917659252926414e-05, + "loss": 3.3661, + "step": 4537 + }, + { + "epoch": 2.924415793714746, + "grad_norm": 1.5934466958121163, + "learning_rate": 9.917622811520161e-05, + "loss": 3.6441, + "step": 4538 + }, + { + "epoch": 2.9250604351329574, + "grad_norm": 1.5529740964497118, + "learning_rate": 9.91758636211945e-05, + "loss": 3.9309, + "step": 4539 + }, + { + "epoch": 2.9257050765511683, + "grad_norm": 1.6859388208916708, + "learning_rate": 9.917549904724336e-05, + "loss": 3.7301, + "step": 4540 + }, + { + "epoch": 2.9263497179693796, + "grad_norm": 1.7060840257634566, + "learning_rate": 9.917513439334884e-05, + "loss": 4.0132, + "step": 4541 + }, + { + "epoch": 2.9269943593875904, + "grad_norm": 1.3288207550255555, + "learning_rate": 9.91747696595115e-05, + "loss": 3.977, + "step": 4542 + }, + { + "epoch": 2.9276390008058018, + "grad_norm": 1.6882458014601973, + "learning_rate": 9.917440484573195e-05, + "loss": 3.8668, + "step": 4543 + }, + { + "epoch": 2.928283642224013, + "grad_norm": 1.2393953607835118, + "learning_rate": 9.917403995201081e-05, + "loss": 3.3348, + "step": 4544 + }, + { + "epoch": 2.928928283642224, + "grad_norm": 1.6603706396032203, + "learning_rate": 9.917367497834865e-05, + "loss": 3.891, + "step": 4545 + }, + { + "epoch": 2.929572925060435, + "grad_norm": 1.285953514809116, + "learning_rate": 9.91733099247461e-05, + "loss": 4.0599, + "step": 4546 + }, + { + "epoch": 2.930217566478646, + "grad_norm": 1.4840827538960346, + "learning_rate": 9.917294479120372e-05, + "loss": 4.0881, + "step": 4547 + }, + { + "epoch": 2.9308622078968574, + "grad_norm": 1.300196857045252, + "learning_rate": 9.917257957772214e-05, + "loss": 3.9576, + "step": 4548 + }, + { + "epoch": 2.9315068493150687, + "grad_norm": 2.280689944771613, + "learning_rate": 9.917221428430195e-05, + "loss": 3.9667, + "step": 4549 + }, + { + "epoch": 2.9321514907332795, + "grad_norm": 1.6009814422646633, + "learning_rate": 9.917184891094377e-05, + "loss": 3.8429, + "step": 4550 + }, + { + "epoch": 2.932796132151491, + "grad_norm": 1.7648001489041842, + "learning_rate": 9.917148345764816e-05, + "loss": 3.8855, + "step": 4551 + }, + { + "epoch": 2.9334407735697017, + "grad_norm": 1.196326599686696, + "learning_rate": 9.917111792441577e-05, + "loss": 3.7761, + "step": 4552 + }, + { + "epoch": 2.934085414987913, + "grad_norm": 2.042949446180039, + "learning_rate": 9.917075231124714e-05, + "loss": 3.7675, + "step": 4553 + }, + { + "epoch": 2.9347300564061243, + "grad_norm": 1.7349695296975916, + "learning_rate": 9.917038661814292e-05, + "loss": 3.9278, + "step": 4554 + }, + { + "epoch": 2.935374697824335, + "grad_norm": 1.9010525186875338, + "learning_rate": 9.917002084510368e-05, + "loss": 3.8535, + "step": 4555 + }, + { + "epoch": 2.9360193392425464, + "grad_norm": 1.999496074729169, + "learning_rate": 9.916965499213006e-05, + "loss": 3.5674, + "step": 4556 + }, + { + "epoch": 2.9366639806607573, + "grad_norm": 1.5993698786553978, + "learning_rate": 9.916928905922263e-05, + "loss": 3.9321, + "step": 4557 + }, + { + "epoch": 2.9373086220789686, + "grad_norm": 1.7158815885230074, + "learning_rate": 9.916892304638197e-05, + "loss": 3.8977, + "step": 4558 + }, + { + "epoch": 2.93795326349718, + "grad_norm": 1.4239890672311097, + "learning_rate": 9.916855695360873e-05, + "loss": 4.1309, + "step": 4559 + }, + { + "epoch": 2.9385979049153907, + "grad_norm": 1.502320016264602, + "learning_rate": 9.916819078090348e-05, + "loss": 4.0147, + "step": 4560 + }, + { + "epoch": 2.939242546333602, + "grad_norm": 2.0070569295303193, + "learning_rate": 9.916782452826684e-05, + "loss": 3.3426, + "step": 4561 + }, + { + "epoch": 2.939887187751813, + "grad_norm": 1.62882706608973, + "learning_rate": 9.91674581956994e-05, + "loss": 3.7847, + "step": 4562 + }, + { + "epoch": 2.940531829170024, + "grad_norm": 1.848351746722522, + "learning_rate": 9.916709178320176e-05, + "loss": 3.7995, + "step": 4563 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 1.4219443052911305, + "learning_rate": 9.916672529077452e-05, + "loss": 3.7252, + "step": 4564 + }, + { + "epoch": 2.9418211120064464, + "grad_norm": 1.638548746648029, + "learning_rate": 9.91663587184183e-05, + "loss": 3.799, + "step": 4565 + }, + { + "epoch": 2.9424657534246577, + "grad_norm": 1.3620494909304268, + "learning_rate": 9.916599206613367e-05, + "loss": 4.1262, + "step": 4566 + }, + { + "epoch": 2.9431103948428685, + "grad_norm": 1.635439602081081, + "learning_rate": 9.916562533392126e-05, + "loss": 3.4277, + "step": 4567 + }, + { + "epoch": 2.94375503626108, + "grad_norm": 1.5052407192607544, + "learning_rate": 9.916525852178168e-05, + "loss": 3.7789, + "step": 4568 + }, + { + "epoch": 2.944399677679291, + "grad_norm": 1.4271687678958886, + "learning_rate": 9.91648916297155e-05, + "loss": 3.9975, + "step": 4569 + }, + { + "epoch": 2.945044319097502, + "grad_norm": 1.5750337720403043, + "learning_rate": 9.916452465772333e-05, + "loss": 3.4461, + "step": 4570 + }, + { + "epoch": 2.9456889605157133, + "grad_norm": 1.6487679719576986, + "learning_rate": 9.916415760580578e-05, + "loss": 3.7718, + "step": 4571 + }, + { + "epoch": 2.946333601933924, + "grad_norm": 1.4729084214943944, + "learning_rate": 9.916379047396348e-05, + "loss": 3.6448, + "step": 4572 + }, + { + "epoch": 2.9469782433521354, + "grad_norm": 1.6113008480197404, + "learning_rate": 9.916342326219699e-05, + "loss": 4.0743, + "step": 4573 + }, + { + "epoch": 2.9476228847703467, + "grad_norm": 1.588505940896003, + "learning_rate": 9.916305597050693e-05, + "loss": 3.6689, + "step": 4574 + }, + { + "epoch": 2.9482675261885576, + "grad_norm": 1.4982337355779725, + "learning_rate": 9.91626885988939e-05, + "loss": 3.9175, + "step": 4575 + }, + { + "epoch": 2.948912167606769, + "grad_norm": 1.3628283550302063, + "learning_rate": 9.91623211473585e-05, + "loss": 3.7474, + "step": 4576 + }, + { + "epoch": 2.9495568090249797, + "grad_norm": 1.5294976668170057, + "learning_rate": 9.916195361590137e-05, + "loss": 3.9301, + "step": 4577 + }, + { + "epoch": 2.950201450443191, + "grad_norm": 1.7662304028683953, + "learning_rate": 9.916158600452307e-05, + "loss": 3.8169, + "step": 4578 + }, + { + "epoch": 2.9508460918614023, + "grad_norm": 1.295754998086489, + "learning_rate": 9.91612183132242e-05, + "loss": 3.7948, + "step": 4579 + }, + { + "epoch": 2.951490733279613, + "grad_norm": 1.5284481224547424, + "learning_rate": 9.91608505420054e-05, + "loss": 3.9085, + "step": 4580 + }, + { + "epoch": 2.9521353746978245, + "grad_norm": 1.3967997168056892, + "learning_rate": 9.916048269086725e-05, + "loss": 3.9804, + "step": 4581 + }, + { + "epoch": 2.9527800161160354, + "grad_norm": 1.1036477471790627, + "learning_rate": 9.916011475981037e-05, + "loss": 3.6906, + "step": 4582 + }, + { + "epoch": 2.9534246575342467, + "grad_norm": 1.540567281765222, + "learning_rate": 9.915974674883534e-05, + "loss": 3.7787, + "step": 4583 + }, + { + "epoch": 2.954069298952458, + "grad_norm": 1.2068599349082874, + "learning_rate": 9.915937865794279e-05, + "loss": 3.7638, + "step": 4584 + }, + { + "epoch": 2.954713940370669, + "grad_norm": 1.2314565530014931, + "learning_rate": 9.91590104871333e-05, + "loss": 3.6505, + "step": 4585 + }, + { + "epoch": 2.9553585817888797, + "grad_norm": 1.3579474271476477, + "learning_rate": 9.915864223640749e-05, + "loss": 3.943, + "step": 4586 + }, + { + "epoch": 2.956003223207091, + "grad_norm": 1.417748425517747, + "learning_rate": 9.915827390576596e-05, + "loss": 3.7983, + "step": 4587 + }, + { + "epoch": 2.9566478646253023, + "grad_norm": 1.299590197647659, + "learning_rate": 9.915790549520933e-05, + "loss": 3.9421, + "step": 4588 + }, + { + "epoch": 2.9572925060435136, + "grad_norm": 1.683239772292692, + "learning_rate": 9.915753700473819e-05, + "loss": 3.9245, + "step": 4589 + }, + { + "epoch": 2.9579371474617244, + "grad_norm": 1.481679070079368, + "learning_rate": 9.915716843435315e-05, + "loss": 3.7824, + "step": 4590 + }, + { + "epoch": 2.9585817888799353, + "grad_norm": 1.6273780023318662, + "learning_rate": 9.91567997840548e-05, + "loss": 3.8287, + "step": 4591 + }, + { + "epoch": 2.9592264302981466, + "grad_norm": 1.4676608034466239, + "learning_rate": 9.915643105384377e-05, + "loss": 4.0752, + "step": 4592 + }, + { + "epoch": 2.959871071716358, + "grad_norm": 1.408320701358444, + "learning_rate": 9.915606224372063e-05, + "loss": 3.9013, + "step": 4593 + }, + { + "epoch": 2.9605157131345687, + "grad_norm": 1.2351736816723995, + "learning_rate": 9.915569335368604e-05, + "loss": 3.6496, + "step": 4594 + }, + { + "epoch": 2.96116035455278, + "grad_norm": 1.7253933514912276, + "learning_rate": 9.915532438374057e-05, + "loss": 3.7852, + "step": 4595 + }, + { + "epoch": 2.961804995970991, + "grad_norm": 1.5398290980335918, + "learning_rate": 9.915495533388481e-05, + "loss": 4.1444, + "step": 4596 + }, + { + "epoch": 2.962449637389202, + "grad_norm": 1.4335110444157924, + "learning_rate": 9.915458620411942e-05, + "loss": 4.0423, + "step": 4597 + }, + { + "epoch": 2.9630942788074135, + "grad_norm": 1.3413964847355089, + "learning_rate": 9.915421699444495e-05, + "loss": 3.6371, + "step": 4598 + }, + { + "epoch": 2.9637389202256244, + "grad_norm": 1.6575536672934135, + "learning_rate": 9.915384770486204e-05, + "loss": 3.5925, + "step": 4599 + }, + { + "epoch": 2.9643835616438357, + "grad_norm": 1.2213231231872392, + "learning_rate": 9.915347833537129e-05, + "loss": 3.4649, + "step": 4600 + }, + { + "epoch": 2.9643835616438357, + "eval_loss": 4.031943321228027, + "eval_runtime": 2.9702, + "eval_samples_per_second": 33.668, + "eval_steps_per_second": 4.377, + "step": 4600 + }, + { + "epoch": 2.9650282030620465, + "grad_norm": 1.584099581057168, + "learning_rate": 9.91531088859733e-05, + "loss": 3.5329, + "step": 4601 + }, + { + "epoch": 2.965672844480258, + "grad_norm": 1.1162085385191405, + "learning_rate": 9.915273935666866e-05, + "loss": 4.0904, + "step": 4602 + }, + { + "epoch": 2.966317485898469, + "grad_norm": 1.3840688809656894, + "learning_rate": 9.915236974745802e-05, + "loss": 3.7993, + "step": 4603 + }, + { + "epoch": 2.96696212731668, + "grad_norm": 1.3055380366697624, + "learning_rate": 9.915200005834196e-05, + "loss": 3.5419, + "step": 4604 + }, + { + "epoch": 2.9676067687348913, + "grad_norm": 1.487318228290224, + "learning_rate": 9.91516302893211e-05, + "loss": 3.9267, + "step": 4605 + }, + { + "epoch": 2.968251410153102, + "grad_norm": 1.5730061340528125, + "learning_rate": 9.915126044039602e-05, + "loss": 3.9543, + "step": 4606 + }, + { + "epoch": 2.9688960515713134, + "grad_norm": 1.171179370139283, + "learning_rate": 9.915089051156737e-05, + "loss": 3.8691, + "step": 4607 + }, + { + "epoch": 2.9695406929895247, + "grad_norm": 1.4980826202109394, + "learning_rate": 9.915052050283571e-05, + "loss": 3.7785, + "step": 4608 + }, + { + "epoch": 2.9701853344077356, + "grad_norm": 1.7922020405580759, + "learning_rate": 9.915015041420168e-05, + "loss": 3.4331, + "step": 4609 + }, + { + "epoch": 2.970829975825947, + "grad_norm": 1.3257669414419653, + "learning_rate": 9.914978024566588e-05, + "loss": 3.8924, + "step": 4610 + }, + { + "epoch": 2.9714746172441577, + "grad_norm": 1.280155305806249, + "learning_rate": 9.914940999722891e-05, + "loss": 3.8702, + "step": 4611 + }, + { + "epoch": 2.972119258662369, + "grad_norm": 1.325766955159087, + "learning_rate": 9.914903966889139e-05, + "loss": 3.6403, + "step": 4612 + }, + { + "epoch": 2.9727639000805803, + "grad_norm": 1.2234771569211582, + "learning_rate": 9.914866926065394e-05, + "loss": 3.9254, + "step": 4613 + }, + { + "epoch": 2.973408541498791, + "grad_norm": 1.6628317076807118, + "learning_rate": 9.914829877251714e-05, + "loss": 3.5871, + "step": 4614 + }, + { + "epoch": 2.9740531829170025, + "grad_norm": 1.738301065229928, + "learning_rate": 9.91479282044816e-05, + "loss": 3.3431, + "step": 4615 + }, + { + "epoch": 2.9746978243352133, + "grad_norm": 1.5891260192166379, + "learning_rate": 9.914755755654793e-05, + "loss": 3.4394, + "step": 4616 + }, + { + "epoch": 2.9753424657534246, + "grad_norm": 1.5234225589919406, + "learning_rate": 9.914718682871678e-05, + "loss": 3.8628, + "step": 4617 + }, + { + "epoch": 2.975987107171636, + "grad_norm": 1.4949441729153017, + "learning_rate": 9.914681602098871e-05, + "loss": 3.4454, + "step": 4618 + }, + { + "epoch": 2.976631748589847, + "grad_norm": 1.522278003403628, + "learning_rate": 9.914644513336433e-05, + "loss": 3.6217, + "step": 4619 + }, + { + "epoch": 2.977276390008058, + "grad_norm": 1.80418748066383, + "learning_rate": 9.91460741658443e-05, + "loss": 3.8589, + "step": 4620 + }, + { + "epoch": 2.977921031426269, + "grad_norm": 2.3989244597480237, + "learning_rate": 9.914570311842916e-05, + "loss": 3.9623, + "step": 4621 + }, + { + "epoch": 2.9785656728444803, + "grad_norm": 1.644968134334905, + "learning_rate": 9.914533199111957e-05, + "loss": 3.6161, + "step": 4622 + }, + { + "epoch": 2.9792103142626916, + "grad_norm": 1.5962230611188124, + "learning_rate": 9.914496078391612e-05, + "loss": 4.3439, + "step": 4623 + }, + { + "epoch": 2.9798549556809024, + "grad_norm": 1.596304223129717, + "learning_rate": 9.914458949681944e-05, + "loss": 3.7974, + "step": 4624 + }, + { + "epoch": 2.9804995970991137, + "grad_norm": 1.4690826050838157, + "learning_rate": 9.914421812983009e-05, + "loss": 3.833, + "step": 4625 + }, + { + "epoch": 2.9811442385173246, + "grad_norm": 1.9440457505416635, + "learning_rate": 9.914384668294872e-05, + "loss": 4.0069, + "step": 4626 + }, + { + "epoch": 2.981788879935536, + "grad_norm": 1.4836290328228015, + "learning_rate": 9.914347515617594e-05, + "loss": 3.8127, + "step": 4627 + }, + { + "epoch": 2.982433521353747, + "grad_norm": 1.3085410836974212, + "learning_rate": 9.914310354951236e-05, + "loss": 3.7833, + "step": 4628 + }, + { + "epoch": 2.983078162771958, + "grad_norm": 1.9504122477937944, + "learning_rate": 9.914273186295858e-05, + "loss": 3.7554, + "step": 4629 + }, + { + "epoch": 2.9837228041901693, + "grad_norm": 1.2408531968018517, + "learning_rate": 9.91423600965152e-05, + "loss": 4.0895, + "step": 4630 + }, + { + "epoch": 2.98436744560838, + "grad_norm": 1.9917648860673391, + "learning_rate": 9.914198825018285e-05, + "loss": 3.8922, + "step": 4631 + }, + { + "epoch": 2.9850120870265915, + "grad_norm": 1.4482505465039828, + "learning_rate": 9.914161632396214e-05, + "loss": 3.8075, + "step": 4632 + }, + { + "epoch": 2.985656728444803, + "grad_norm": 2.252079741184563, + "learning_rate": 9.914124431785367e-05, + "loss": 3.8192, + "step": 4633 + }, + { + "epoch": 2.9863013698630136, + "grad_norm": 2.1296125326045257, + "learning_rate": 9.914087223185806e-05, + "loss": 3.5911, + "step": 4634 + }, + { + "epoch": 2.986946011281225, + "grad_norm": 1.3275747952005448, + "learning_rate": 9.914050006597592e-05, + "loss": 3.7037, + "step": 4635 + }, + { + "epoch": 2.987590652699436, + "grad_norm": 2.370792103060939, + "learning_rate": 9.914012782020784e-05, + "loss": 3.6625, + "step": 4636 + }, + { + "epoch": 2.988235294117647, + "grad_norm": 1.2251299190823017, + "learning_rate": 9.913975549455446e-05, + "loss": 4.1966, + "step": 4637 + }, + { + "epoch": 2.9888799355358584, + "grad_norm": 2.7970851442761933, + "learning_rate": 9.91393830890164e-05, + "loss": 3.8188, + "step": 4638 + }, + { + "epoch": 2.9895245769540693, + "grad_norm": 1.6362602922847076, + "learning_rate": 9.913901060359423e-05, + "loss": 3.303, + "step": 4639 + }, + { + "epoch": 2.9901692183722806, + "grad_norm": 2.2908509115322118, + "learning_rate": 9.91386380382886e-05, + "loss": 3.9179, + "step": 4640 + }, + { + "epoch": 2.9908138597904914, + "grad_norm": 1.7068641530683855, + "learning_rate": 9.91382653931001e-05, + "loss": 3.4479, + "step": 4641 + }, + { + "epoch": 2.9914585012087027, + "grad_norm": 2.0437733730419088, + "learning_rate": 9.913789266802934e-05, + "loss": 4.201, + "step": 4642 + }, + { + "epoch": 2.992103142626914, + "grad_norm": 2.194843501281569, + "learning_rate": 9.913751986307697e-05, + "loss": 3.7348, + "step": 4643 + }, + { + "epoch": 2.992747784045125, + "grad_norm": 2.230107013642439, + "learning_rate": 9.913714697824355e-05, + "loss": 3.9312, + "step": 4644 + }, + { + "epoch": 2.993392425463336, + "grad_norm": 2.513407698969423, + "learning_rate": 9.913677401352972e-05, + "loss": 3.9671, + "step": 4645 + }, + { + "epoch": 2.994037066881547, + "grad_norm": 1.855524884715391, + "learning_rate": 9.913640096893608e-05, + "loss": 3.6075, + "step": 4646 + }, + { + "epoch": 2.9946817082997583, + "grad_norm": 2.3729518895500283, + "learning_rate": 9.913602784446326e-05, + "loss": 3.7786, + "step": 4647 + }, + { + "epoch": 2.9953263497179696, + "grad_norm": 2.092033140453471, + "learning_rate": 9.913565464011186e-05, + "loss": 3.8643, + "step": 4648 + }, + { + "epoch": 2.9959709911361805, + "grad_norm": 1.5317072457590832, + "learning_rate": 9.91352813558825e-05, + "loss": 4.0495, + "step": 4649 + }, + { + "epoch": 2.996615632554392, + "grad_norm": 1.816636491737236, + "learning_rate": 9.913490799177579e-05, + "loss": 3.8544, + "step": 4650 + }, + { + "epoch": 2.9972602739726026, + "grad_norm": 1.6859293294760553, + "learning_rate": 9.913453454779235e-05, + "loss": 4.1096, + "step": 4651 + }, + { + "epoch": 2.997904915390814, + "grad_norm": 1.3310025556682097, + "learning_rate": 9.913416102393277e-05, + "loss": 3.7224, + "step": 4652 + }, + { + "epoch": 2.9985495568090252, + "grad_norm": 1.9004345655162038, + "learning_rate": 9.913378742019768e-05, + "loss": 3.5345, + "step": 4653 + }, + { + "epoch": 2.999194198227236, + "grad_norm": 1.3698441162997446, + "learning_rate": 9.91334137365877e-05, + "loss": 3.8042, + "step": 4654 + }, + { + "epoch": 2.999838839645447, + "grad_norm": 1.9800601179200388, + "learning_rate": 9.913303997310344e-05, + "loss": 3.6987, + "step": 4655 + }, + { + "epoch": 3.0, + "grad_norm": 1.9800601179200388, + "learning_rate": 9.91326661297455e-05, + "loss": 1.0456, + "step": 4656 + }, + { + "epoch": 3.0006446414182113, + "grad_norm": 1.2629317276441034, + "learning_rate": 9.913229220651453e-05, + "loss": 3.3758, + "step": 4657 + }, + { + "epoch": 3.001289282836422, + "grad_norm": 2.141735110357083, + "learning_rate": 9.91319182034111e-05, + "loss": 3.545, + "step": 4658 + }, + { + "epoch": 3.0019339242546335, + "grad_norm": 1.4944013100401972, + "learning_rate": 9.913154412043584e-05, + "loss": 3.4192, + "step": 4659 + }, + { + "epoch": 3.0025785656728443, + "grad_norm": 2.2018269907918455, + "learning_rate": 9.913116995758938e-05, + "loss": 3.5923, + "step": 4660 + }, + { + "epoch": 3.0032232070910556, + "grad_norm": 2.3009184637980815, + "learning_rate": 9.913079571487231e-05, + "loss": 3.33, + "step": 4661 + }, + { + "epoch": 3.003867848509267, + "grad_norm": 1.3376098829091514, + "learning_rate": 9.913042139228525e-05, + "loss": 3.6228, + "step": 4662 + }, + { + "epoch": 3.0045124899274778, + "grad_norm": 1.9285090477351299, + "learning_rate": 9.913004698982884e-05, + "loss": 3.4525, + "step": 4663 + }, + { + "epoch": 3.005157131345689, + "grad_norm": 3.0237183671418815, + "learning_rate": 9.912967250750367e-05, + "loss": 3.5893, + "step": 4664 + }, + { + "epoch": 3.0058017727639, + "grad_norm": 1.973246412735191, + "learning_rate": 9.912929794531036e-05, + "loss": 3.2088, + "step": 4665 + }, + { + "epoch": 3.0064464141821112, + "grad_norm": 2.1376005977060832, + "learning_rate": 9.912892330324953e-05, + "loss": 3.6792, + "step": 4666 + }, + { + "epoch": 3.0070910556003225, + "grad_norm": 1.9884248260491173, + "learning_rate": 9.91285485813218e-05, + "loss": 3.4795, + "step": 4667 + }, + { + "epoch": 3.0077356970185334, + "grad_norm": 2.2719676615391267, + "learning_rate": 9.912817377952777e-05, + "loss": 3.3358, + "step": 4668 + }, + { + "epoch": 3.0083803384367447, + "grad_norm": 1.654823788763778, + "learning_rate": 9.912779889786805e-05, + "loss": 3.5301, + "step": 4669 + }, + { + "epoch": 3.0090249798549555, + "grad_norm": 1.6477898572715783, + "learning_rate": 9.91274239363433e-05, + "loss": 3.4499, + "step": 4670 + }, + { + "epoch": 3.009669621273167, + "grad_norm": 1.6046423567221493, + "learning_rate": 9.912704889495407e-05, + "loss": 3.4769, + "step": 4671 + }, + { + "epoch": 3.010314262691378, + "grad_norm": 1.5990409233447305, + "learning_rate": 9.912667377370103e-05, + "loss": 3.4902, + "step": 4672 + }, + { + "epoch": 3.010958904109589, + "grad_norm": 1.7713953931414756, + "learning_rate": 9.912629857258477e-05, + "loss": 3.4647, + "step": 4673 + }, + { + "epoch": 3.0116035455278003, + "grad_norm": 1.4852012627480125, + "learning_rate": 9.91259232916059e-05, + "loss": 3.5732, + "step": 4674 + }, + { + "epoch": 3.012248186946011, + "grad_norm": 1.42041209918025, + "learning_rate": 9.912554793076507e-05, + "loss": 3.4635, + "step": 4675 + }, + { + "epoch": 3.0128928283642225, + "grad_norm": 1.9018669972368702, + "learning_rate": 9.912517249006288e-05, + "loss": 3.3698, + "step": 4676 + }, + { + "epoch": 3.0135374697824333, + "grad_norm": 1.4557659612449905, + "learning_rate": 9.912479696949992e-05, + "loss": 3.2235, + "step": 4677 + }, + { + "epoch": 3.0141821112006446, + "grad_norm": 1.8975633518114057, + "learning_rate": 9.912442136907683e-05, + "loss": 3.2459, + "step": 4678 + }, + { + "epoch": 3.014826752618856, + "grad_norm": 1.853650161632955, + "learning_rate": 9.912404568879425e-05, + "loss": 3.3796, + "step": 4679 + }, + { + "epoch": 3.0154713940370668, + "grad_norm": 1.9937928994901504, + "learning_rate": 9.912366992865276e-05, + "loss": 3.696, + "step": 4680 + }, + { + "epoch": 3.016116035455278, + "grad_norm": 2.3082838663380065, + "learning_rate": 9.9123294088653e-05, + "loss": 3.5759, + "step": 4681 + }, + { + "epoch": 3.016760676873489, + "grad_norm": 1.3621574192112058, + "learning_rate": 9.912291816879557e-05, + "loss": 3.4931, + "step": 4682 + }, + { + "epoch": 3.0174053182917002, + "grad_norm": 2.2217295612375625, + "learning_rate": 9.912254216908108e-05, + "loss": 3.6877, + "step": 4683 + }, + { + "epoch": 3.0180499597099115, + "grad_norm": 1.517984276313879, + "learning_rate": 9.912216608951018e-05, + "loss": 3.5603, + "step": 4684 + }, + { + "epoch": 3.0186946011281224, + "grad_norm": 2.019302281574766, + "learning_rate": 9.912178993008344e-05, + "loss": 3.3747, + "step": 4685 + }, + { + "epoch": 3.0193392425463337, + "grad_norm": 1.952233751457859, + "learning_rate": 9.912141369080155e-05, + "loss": 3.7219, + "step": 4686 + }, + { + "epoch": 3.0199838839645445, + "grad_norm": 1.8111186513768625, + "learning_rate": 9.912103737166506e-05, + "loss": 3.2733, + "step": 4687 + }, + { + "epoch": 3.020628525382756, + "grad_norm": 2.1378379256673306, + "learning_rate": 9.912066097267462e-05, + "loss": 3.6363, + "step": 4688 + }, + { + "epoch": 3.021273166800967, + "grad_norm": 1.7862051536162191, + "learning_rate": 9.912028449383084e-05, + "loss": 3.5383, + "step": 4689 + }, + { + "epoch": 3.021917808219178, + "grad_norm": 1.5902099802869032, + "learning_rate": 9.911990793513435e-05, + "loss": 3.2152, + "step": 4690 + }, + { + "epoch": 3.0225624496373893, + "grad_norm": 2.322633885413984, + "learning_rate": 9.911953129658576e-05, + "loss": 3.6439, + "step": 4691 + }, + { + "epoch": 3.0232070910556, + "grad_norm": 2.2905693636087863, + "learning_rate": 9.911915457818567e-05, + "loss": 3.7477, + "step": 4692 + }, + { + "epoch": 3.0238517324738114, + "grad_norm": 1.9407813592951695, + "learning_rate": 9.911877777993472e-05, + "loss": 3.2024, + "step": 4693 + }, + { + "epoch": 3.0244963738920227, + "grad_norm": 1.4882506700962979, + "learning_rate": 9.911840090183354e-05, + "loss": 3.7014, + "step": 4694 + }, + { + "epoch": 3.0251410153102336, + "grad_norm": 1.5963275557905083, + "learning_rate": 9.91180239438827e-05, + "loss": 3.2486, + "step": 4695 + }, + { + "epoch": 3.025785656728445, + "grad_norm": 1.4541204452429255, + "learning_rate": 9.911764690608288e-05, + "loss": 3.7055, + "step": 4696 + }, + { + "epoch": 3.0264302981466558, + "grad_norm": 2.022643067951106, + "learning_rate": 9.911726978843467e-05, + "loss": 3.6308, + "step": 4697 + }, + { + "epoch": 3.027074939564867, + "grad_norm": 1.2643032849898734, + "learning_rate": 9.911689259093869e-05, + "loss": 3.7976, + "step": 4698 + }, + { + "epoch": 3.0277195809830784, + "grad_norm": 1.989289960677123, + "learning_rate": 9.911651531359556e-05, + "loss": 3.3076, + "step": 4699 + }, + { + "epoch": 3.028364222401289, + "grad_norm": 1.4695834750612462, + "learning_rate": 9.911613795640589e-05, + "loss": 3.4443, + "step": 4700 + }, + { + "epoch": 3.028364222401289, + "eval_loss": 4.135890483856201, + "eval_runtime": 2.9775, + "eval_samples_per_second": 33.586, + "eval_steps_per_second": 4.366, + "step": 4700 + }, + { + "epoch": 3.0290088638195005, + "grad_norm": 1.6707863277714718, + "learning_rate": 9.911576051937033e-05, + "loss": 3.6518, + "step": 4701 + }, + { + "epoch": 3.0296535052377114, + "grad_norm": 1.472873570120378, + "learning_rate": 9.911538300248948e-05, + "loss": 3.7644, + "step": 4702 + }, + { + "epoch": 3.0302981466559227, + "grad_norm": 1.4892051843294922, + "learning_rate": 9.911500540576395e-05, + "loss": 3.7459, + "step": 4703 + }, + { + "epoch": 3.030942788074134, + "grad_norm": 1.4578417613692982, + "learning_rate": 9.911462772919436e-05, + "loss": 3.4577, + "step": 4704 + }, + { + "epoch": 3.031587429492345, + "grad_norm": 2.009207166871006, + "learning_rate": 9.911424997278135e-05, + "loss": 3.2759, + "step": 4705 + }, + { + "epoch": 3.032232070910556, + "grad_norm": 1.4113885062337486, + "learning_rate": 9.911387213652554e-05, + "loss": 3.5884, + "step": 4706 + }, + { + "epoch": 3.032876712328767, + "grad_norm": 1.6607724544601197, + "learning_rate": 9.911349422042753e-05, + "loss": 3.5377, + "step": 4707 + }, + { + "epoch": 3.0335213537469783, + "grad_norm": 1.450401050202119, + "learning_rate": 9.911311622448797e-05, + "loss": 3.5313, + "step": 4708 + }, + { + "epoch": 3.034165995165189, + "grad_norm": 1.866355424366525, + "learning_rate": 9.911273814870745e-05, + "loss": 3.4648, + "step": 4709 + }, + { + "epoch": 3.0348106365834004, + "grad_norm": 1.6758761296820206, + "learning_rate": 9.91123599930866e-05, + "loss": 3.404, + "step": 4710 + }, + { + "epoch": 3.0354552780016117, + "grad_norm": 1.6442200185553781, + "learning_rate": 9.911198175762605e-05, + "loss": 3.4137, + "step": 4711 + }, + { + "epoch": 3.0360999194198226, + "grad_norm": 1.6654830373041722, + "learning_rate": 9.911160344232643e-05, + "loss": 3.2446, + "step": 4712 + }, + { + "epoch": 3.036744560838034, + "grad_norm": 1.6400127790624466, + "learning_rate": 9.911122504718833e-05, + "loss": 3.4774, + "step": 4713 + }, + { + "epoch": 3.0373892022562448, + "grad_norm": 1.272307754845399, + "learning_rate": 9.911084657221239e-05, + "loss": 3.6245, + "step": 4714 + }, + { + "epoch": 3.038033843674456, + "grad_norm": 1.6887376071717755, + "learning_rate": 9.911046801739923e-05, + "loss": 3.623, + "step": 4715 + }, + { + "epoch": 3.0386784850926674, + "grad_norm": 1.447192522498651, + "learning_rate": 9.911008938274948e-05, + "loss": 3.5447, + "step": 4716 + }, + { + "epoch": 3.039323126510878, + "grad_norm": 1.6395627638528332, + "learning_rate": 9.910971066826376e-05, + "loss": 3.3601, + "step": 4717 + }, + { + "epoch": 3.0399677679290895, + "grad_norm": 1.3681050333564182, + "learning_rate": 9.910933187394268e-05, + "loss": 3.6808, + "step": 4718 + }, + { + "epoch": 3.0406124093473004, + "grad_norm": 1.5365850366610885, + "learning_rate": 9.910895299978686e-05, + "loss": 3.684, + "step": 4719 + }, + { + "epoch": 3.0412570507655117, + "grad_norm": 1.4401550880357261, + "learning_rate": 9.910857404579693e-05, + "loss": 3.7967, + "step": 4720 + }, + { + "epoch": 3.041901692183723, + "grad_norm": 1.2200939149205787, + "learning_rate": 9.910819501197353e-05, + "loss": 3.4498, + "step": 4721 + }, + { + "epoch": 3.042546333601934, + "grad_norm": 1.723897614926556, + "learning_rate": 9.910781589831726e-05, + "loss": 3.9253, + "step": 4722 + }, + { + "epoch": 3.043190975020145, + "grad_norm": 1.5738207372302946, + "learning_rate": 9.910743670482872e-05, + "loss": 3.7139, + "step": 4723 + }, + { + "epoch": 3.043835616438356, + "grad_norm": 2.028068730326324, + "learning_rate": 9.91070574315086e-05, + "loss": 3.7573, + "step": 4724 + }, + { + "epoch": 3.0444802578565673, + "grad_norm": 1.9205509850973306, + "learning_rate": 9.910667807835746e-05, + "loss": 3.5123, + "step": 4725 + }, + { + "epoch": 3.0451248992747786, + "grad_norm": 1.5932459348517714, + "learning_rate": 9.910629864537596e-05, + "loss": 3.4401, + "step": 4726 + }, + { + "epoch": 3.0457695406929894, + "grad_norm": 1.934189204662561, + "learning_rate": 9.91059191325647e-05, + "loss": 3.4128, + "step": 4727 + }, + { + "epoch": 3.0464141821112007, + "grad_norm": 1.3680988889538555, + "learning_rate": 9.910553953992433e-05, + "loss": 3.597, + "step": 4728 + }, + { + "epoch": 3.0470588235294116, + "grad_norm": 1.7215461532373522, + "learning_rate": 9.910515986745545e-05, + "loss": 3.4903, + "step": 4729 + }, + { + "epoch": 3.047703464947623, + "grad_norm": 1.683139976320457, + "learning_rate": 9.910478011515867e-05, + "loss": 3.7042, + "step": 4730 + }, + { + "epoch": 3.048348106365834, + "grad_norm": 1.63949242015198, + "learning_rate": 9.910440028303466e-05, + "loss": 3.7294, + "step": 4731 + }, + { + "epoch": 3.048992747784045, + "grad_norm": 1.3491925254633201, + "learning_rate": 9.9104020371084e-05, + "loss": 3.308, + "step": 4732 + }, + { + "epoch": 3.0496373892022564, + "grad_norm": 1.983703862533528, + "learning_rate": 9.910364037930735e-05, + "loss": 3.4261, + "step": 4733 + }, + { + "epoch": 3.050282030620467, + "grad_norm": 1.9297727722409646, + "learning_rate": 9.91032603077053e-05, + "loss": 3.9016, + "step": 4734 + }, + { + "epoch": 3.0509266720386785, + "grad_norm": 1.4091209830558455, + "learning_rate": 9.910288015627851e-05, + "loss": 3.4662, + "step": 4735 + }, + { + "epoch": 3.05157131345689, + "grad_norm": 1.8693837802248283, + "learning_rate": 9.910249992502756e-05, + "loss": 3.5053, + "step": 4736 + }, + { + "epoch": 3.0522159548751007, + "grad_norm": 1.6597839747887375, + "learning_rate": 9.910211961395313e-05, + "loss": 3.18, + "step": 4737 + }, + { + "epoch": 3.052860596293312, + "grad_norm": 1.8251086351766885, + "learning_rate": 9.910173922305579e-05, + "loss": 3.3827, + "step": 4738 + }, + { + "epoch": 3.053505237711523, + "grad_norm": 1.3932029827522279, + "learning_rate": 9.910135875233619e-05, + "loss": 3.7216, + "step": 4739 + }, + { + "epoch": 3.054149879129734, + "grad_norm": 1.5839967899618834, + "learning_rate": 9.910097820179495e-05, + "loss": 3.5843, + "step": 4740 + }, + { + "epoch": 3.0547945205479454, + "grad_norm": 1.9495605053218947, + "learning_rate": 9.910059757143271e-05, + "loss": 3.3791, + "step": 4741 + }, + { + "epoch": 3.0554391619661563, + "grad_norm": 1.5240181089463773, + "learning_rate": 9.910021686125008e-05, + "loss": 3.6257, + "step": 4742 + }, + { + "epoch": 3.0560838033843676, + "grad_norm": 1.6356298218124998, + "learning_rate": 9.90998360712477e-05, + "loss": 3.8093, + "step": 4743 + }, + { + "epoch": 3.0567284448025784, + "grad_norm": 1.7009736083776328, + "learning_rate": 9.909945520142616e-05, + "loss": 3.631, + "step": 4744 + }, + { + "epoch": 3.0573730862207897, + "grad_norm": 1.360150053543603, + "learning_rate": 9.909907425178611e-05, + "loss": 3.7086, + "step": 4745 + }, + { + "epoch": 3.0580177276390006, + "grad_norm": 1.384312831182662, + "learning_rate": 9.909869322232819e-05, + "loss": 3.7214, + "step": 4746 + }, + { + "epoch": 3.058662369057212, + "grad_norm": 1.2478964771681103, + "learning_rate": 9.909831211305302e-05, + "loss": 3.6058, + "step": 4747 + }, + { + "epoch": 3.059307010475423, + "grad_norm": 1.3863564412445581, + "learning_rate": 9.909793092396118e-05, + "loss": 3.3676, + "step": 4748 + }, + { + "epoch": 3.059951651893634, + "grad_norm": 1.3683202197566784, + "learning_rate": 9.909754965505337e-05, + "loss": 3.333, + "step": 4749 + }, + { + "epoch": 3.0605962933118454, + "grad_norm": 1.465637862924072, + "learning_rate": 9.909716830633018e-05, + "loss": 3.4362, + "step": 4750 + }, + { + "epoch": 3.061240934730056, + "grad_norm": 1.548462829872405, + "learning_rate": 9.909678687779221e-05, + "loss": 3.7258, + "step": 4751 + }, + { + "epoch": 3.0618855761482675, + "grad_norm": 1.4033334803942397, + "learning_rate": 9.909640536944012e-05, + "loss": 3.7692, + "step": 4752 + }, + { + "epoch": 3.062530217566479, + "grad_norm": 1.4885796690807944, + "learning_rate": 9.909602378127454e-05, + "loss": 3.5735, + "step": 4753 + }, + { + "epoch": 3.0631748589846897, + "grad_norm": 1.3394788174266024, + "learning_rate": 9.909564211329609e-05, + "loss": 3.4781, + "step": 4754 + }, + { + "epoch": 3.063819500402901, + "grad_norm": 1.6952427202324603, + "learning_rate": 9.909526036550537e-05, + "loss": 3.4754, + "step": 4755 + }, + { + "epoch": 3.064464141821112, + "grad_norm": 1.7644699472595824, + "learning_rate": 9.909487853790304e-05, + "loss": 3.2623, + "step": 4756 + }, + { + "epoch": 3.065108783239323, + "grad_norm": 1.6187804382010014, + "learning_rate": 9.909449663048971e-05, + "loss": 3.2552, + "step": 4757 + }, + { + "epoch": 3.0657534246575344, + "grad_norm": 2.1917614844668947, + "learning_rate": 9.909411464326603e-05, + "loss": 3.567, + "step": 4758 + }, + { + "epoch": 3.0663980660757453, + "grad_norm": 1.957204378334581, + "learning_rate": 9.90937325762326e-05, + "loss": 3.695, + "step": 4759 + }, + { + "epoch": 3.0670427074939566, + "grad_norm": 1.425927816067063, + "learning_rate": 9.909335042939006e-05, + "loss": 3.4454, + "step": 4760 + }, + { + "epoch": 3.0676873489121674, + "grad_norm": 2.173102061441222, + "learning_rate": 9.909296820273904e-05, + "loss": 3.5531, + "step": 4761 + }, + { + "epoch": 3.0683319903303787, + "grad_norm": 1.1347397163626338, + "learning_rate": 9.909258589628016e-05, + "loss": 3.4345, + "step": 4762 + }, + { + "epoch": 3.06897663174859, + "grad_norm": 2.035122263121055, + "learning_rate": 9.909220351001405e-05, + "loss": 3.4046, + "step": 4763 + }, + { + "epoch": 3.069621273166801, + "grad_norm": 1.4895873185701471, + "learning_rate": 9.909182104394133e-05, + "loss": 3.6423, + "step": 4764 + }, + { + "epoch": 3.070265914585012, + "grad_norm": 2.331114094152823, + "learning_rate": 9.909143849806265e-05, + "loss": 3.5468, + "step": 4765 + }, + { + "epoch": 3.070910556003223, + "grad_norm": 1.7755659635176568, + "learning_rate": 9.909105587237864e-05, + "loss": 3.4785, + "step": 4766 + }, + { + "epoch": 3.0715551974214343, + "grad_norm": 1.923371706944064, + "learning_rate": 9.90906731668899e-05, + "loss": 3.6964, + "step": 4767 + }, + { + "epoch": 3.0721998388396456, + "grad_norm": 2.12650385681847, + "learning_rate": 9.909029038159707e-05, + "loss": 3.6123, + "step": 4768 + }, + { + "epoch": 3.0728444802578565, + "grad_norm": 1.4186280303161527, + "learning_rate": 9.908990751650076e-05, + "loss": 3.4776, + "step": 4769 + }, + { + "epoch": 3.073489121676068, + "grad_norm": 1.7544584780211498, + "learning_rate": 9.908952457160165e-05, + "loss": 3.6039, + "step": 4770 + }, + { + "epoch": 3.0741337630942787, + "grad_norm": 1.459846024788456, + "learning_rate": 9.908914154690035e-05, + "loss": 3.5467, + "step": 4771 + }, + { + "epoch": 3.07477840451249, + "grad_norm": 1.6903768589632302, + "learning_rate": 9.908875844239745e-05, + "loss": 3.492, + "step": 4772 + }, + { + "epoch": 3.0754230459307013, + "grad_norm": 1.9201475166132085, + "learning_rate": 9.908837525809363e-05, + "loss": 3.6411, + "step": 4773 + }, + { + "epoch": 3.076067687348912, + "grad_norm": 1.2911712981909305, + "learning_rate": 9.908799199398948e-05, + "loss": 3.6544, + "step": 4774 + }, + { + "epoch": 3.0767123287671234, + "grad_norm": 1.5843545570450928, + "learning_rate": 9.908760865008566e-05, + "loss": 3.4889, + "step": 4775 + }, + { + "epoch": 3.0773569701853343, + "grad_norm": 1.5256440636606965, + "learning_rate": 9.908722522638277e-05, + "loss": 3.5898, + "step": 4776 + }, + { + "epoch": 3.0780016116035456, + "grad_norm": 1.5942564994121562, + "learning_rate": 9.908684172288148e-05, + "loss": 3.4636, + "step": 4777 + }, + { + "epoch": 3.0786462530217564, + "grad_norm": 1.6349722000773654, + "learning_rate": 9.908645813958238e-05, + "loss": 3.7888, + "step": 4778 + }, + { + "epoch": 3.0792908944399677, + "grad_norm": 1.6260983632307726, + "learning_rate": 9.908607447648613e-05, + "loss": 3.5712, + "step": 4779 + }, + { + "epoch": 3.079935535858179, + "grad_norm": 1.5787136339328394, + "learning_rate": 9.908569073359333e-05, + "loss": 3.7065, + "step": 4780 + }, + { + "epoch": 3.08058017727639, + "grad_norm": 1.687670091357351, + "learning_rate": 9.908530691090463e-05, + "loss": 3.6218, + "step": 4781 + }, + { + "epoch": 3.081224818694601, + "grad_norm": 1.7715917822523224, + "learning_rate": 9.908492300842063e-05, + "loss": 3.5358, + "step": 4782 + }, + { + "epoch": 3.081869460112812, + "grad_norm": 1.7551946388981403, + "learning_rate": 9.908453902614201e-05, + "loss": 3.4732, + "step": 4783 + }, + { + "epoch": 3.0825141015310233, + "grad_norm": 1.5771705901318678, + "learning_rate": 9.90841549640694e-05, + "loss": 3.8708, + "step": 4784 + }, + { + "epoch": 3.0831587429492346, + "grad_norm": 2.1306505932807394, + "learning_rate": 9.908377082220338e-05, + "loss": 3.1533, + "step": 4785 + }, + { + "epoch": 3.0838033843674455, + "grad_norm": 1.5663900136554776, + "learning_rate": 9.908338660054463e-05, + "loss": 3.7156, + "step": 4786 + }, + { + "epoch": 3.084448025785657, + "grad_norm": 1.792553426645421, + "learning_rate": 9.908300229909375e-05, + "loss": 3.6285, + "step": 4787 + }, + { + "epoch": 3.0850926672038677, + "grad_norm": 1.7326143446064013, + "learning_rate": 9.908261791785138e-05, + "loss": 3.6457, + "step": 4788 + }, + { + "epoch": 3.085737308622079, + "grad_norm": 1.5664569823041063, + "learning_rate": 9.908223345681814e-05, + "loss": 3.5955, + "step": 4789 + }, + { + "epoch": 3.0863819500402903, + "grad_norm": 2.042629747666713, + "learning_rate": 9.908184891599469e-05, + "loss": 3.8848, + "step": 4790 + }, + { + "epoch": 3.087026591458501, + "grad_norm": 1.4750248121322562, + "learning_rate": 9.908146429538164e-05, + "loss": 3.7293, + "step": 4791 + }, + { + "epoch": 3.0876712328767124, + "grad_norm": 1.778489746933121, + "learning_rate": 9.908107959497964e-05, + "loss": 3.5356, + "step": 4792 + }, + { + "epoch": 3.0883158742949233, + "grad_norm": 1.9768236406692252, + "learning_rate": 9.908069481478929e-05, + "loss": 3.8801, + "step": 4793 + }, + { + "epoch": 3.0889605157131346, + "grad_norm": 1.6064590725701555, + "learning_rate": 9.908030995481125e-05, + "loss": 3.3024, + "step": 4794 + }, + { + "epoch": 3.089605157131346, + "grad_norm": 1.7391071105078126, + "learning_rate": 9.907992501504615e-05, + "loss": 3.6743, + "step": 4795 + }, + { + "epoch": 3.0902497985495567, + "grad_norm": 1.7957753691733562, + "learning_rate": 9.90795399954946e-05, + "loss": 3.5578, + "step": 4796 + }, + { + "epoch": 3.090894439967768, + "grad_norm": 1.437635628643977, + "learning_rate": 9.907915489615727e-05, + "loss": 3.3927, + "step": 4797 + }, + { + "epoch": 3.091539081385979, + "grad_norm": 1.7214469771352439, + "learning_rate": 9.907876971703475e-05, + "loss": 3.8037, + "step": 4798 + }, + { + "epoch": 3.09218372280419, + "grad_norm": 1.374456359652849, + "learning_rate": 9.907838445812772e-05, + "loss": 3.4453, + "step": 4799 + }, + { + "epoch": 3.0928283642224015, + "grad_norm": 1.714178342495029, + "learning_rate": 9.907799911943676e-05, + "loss": 2.9386, + "step": 4800 + }, + { + "epoch": 3.0928283642224015, + "eval_loss": 4.132169246673584, + "eval_runtime": 2.9694, + "eval_samples_per_second": 33.677, + "eval_steps_per_second": 4.378, + "step": 4800 + }, + { + "epoch": 3.0934730056406123, + "grad_norm": 1.560844113644303, + "learning_rate": 9.907761370096253e-05, + "loss": 3.8315, + "step": 4801 + }, + { + "epoch": 3.0941176470588236, + "grad_norm": 1.952993805342655, + "learning_rate": 9.907722820270566e-05, + "loss": 3.7596, + "step": 4802 + }, + { + "epoch": 3.0947622884770345, + "grad_norm": 2.007039878028542, + "learning_rate": 9.90768426246668e-05, + "loss": 3.4889, + "step": 4803 + }, + { + "epoch": 3.095406929895246, + "grad_norm": 1.7793247796014624, + "learning_rate": 9.907645696684655e-05, + "loss": 3.3816, + "step": 4804 + }, + { + "epoch": 3.096051571313457, + "grad_norm": 1.7768609094771228, + "learning_rate": 9.907607122924558e-05, + "loss": 3.5031, + "step": 4805 + }, + { + "epoch": 3.096696212731668, + "grad_norm": 1.657465851375977, + "learning_rate": 9.90756854118645e-05, + "loss": 3.4861, + "step": 4806 + }, + { + "epoch": 3.0973408541498793, + "grad_norm": 1.7497297585131095, + "learning_rate": 9.907529951470393e-05, + "loss": 3.7863, + "step": 4807 + }, + { + "epoch": 3.09798549556809, + "grad_norm": 2.0728800860249583, + "learning_rate": 9.907491353776455e-05, + "loss": 3.3571, + "step": 4808 + }, + { + "epoch": 3.0986301369863014, + "grad_norm": 1.5947347087388304, + "learning_rate": 9.907452748104695e-05, + "loss": 3.4916, + "step": 4809 + }, + { + "epoch": 3.0992747784045127, + "grad_norm": 1.8172379000730667, + "learning_rate": 9.907414134455178e-05, + "loss": 3.8303, + "step": 4810 + }, + { + "epoch": 3.0999194198227236, + "grad_norm": 1.7266530302548513, + "learning_rate": 9.90737551282797e-05, + "loss": 3.4504, + "step": 4811 + }, + { + "epoch": 3.100564061240935, + "grad_norm": 2.0379363815828833, + "learning_rate": 9.907336883223128e-05, + "loss": 3.7253, + "step": 4812 + }, + { + "epoch": 3.1012087026591457, + "grad_norm": 1.6518330014757734, + "learning_rate": 9.90729824564072e-05, + "loss": 3.4374, + "step": 4813 + }, + { + "epoch": 3.101853344077357, + "grad_norm": 1.731782580656484, + "learning_rate": 9.907259600080812e-05, + "loss": 3.3722, + "step": 4814 + }, + { + "epoch": 3.102497985495568, + "grad_norm": 1.777728636832511, + "learning_rate": 9.907220946543461e-05, + "loss": 3.7032, + "step": 4815 + }, + { + "epoch": 3.103142626913779, + "grad_norm": 1.814378894809954, + "learning_rate": 9.907182285028734e-05, + "loss": 4.056, + "step": 4816 + }, + { + "epoch": 3.1037872683319905, + "grad_norm": 1.270783897551042, + "learning_rate": 9.907143615536696e-05, + "loss": 3.7154, + "step": 4817 + }, + { + "epoch": 3.1044319097502013, + "grad_norm": 1.523824585627452, + "learning_rate": 9.907104938067408e-05, + "loss": 3.4595, + "step": 4818 + }, + { + "epoch": 3.1050765511684126, + "grad_norm": 1.4278562283747585, + "learning_rate": 9.907066252620933e-05, + "loss": 3.4476, + "step": 4819 + }, + { + "epoch": 3.1057211925866235, + "grad_norm": 1.6486715285471794, + "learning_rate": 9.907027559197337e-05, + "loss": 3.5076, + "step": 4820 + }, + { + "epoch": 3.106365834004835, + "grad_norm": 1.429817780035904, + "learning_rate": 9.906988857796683e-05, + "loss": 3.7033, + "step": 4821 + }, + { + "epoch": 3.107010475423046, + "grad_norm": 1.539520535089535, + "learning_rate": 9.906950148419032e-05, + "loss": 3.4015, + "step": 4822 + }, + { + "epoch": 3.107655116841257, + "grad_norm": 1.6645325275094822, + "learning_rate": 9.906911431064452e-05, + "loss": 3.4112, + "step": 4823 + }, + { + "epoch": 3.1082997582594682, + "grad_norm": 1.505888873958323, + "learning_rate": 9.906872705733003e-05, + "loss": 3.6217, + "step": 4824 + }, + { + "epoch": 3.108944399677679, + "grad_norm": 1.4677574260991757, + "learning_rate": 9.906833972424748e-05, + "loss": 3.7994, + "step": 4825 + }, + { + "epoch": 3.1095890410958904, + "grad_norm": 1.3665603087450653, + "learning_rate": 9.906795231139755e-05, + "loss": 3.4258, + "step": 4826 + }, + { + "epoch": 3.1102336825141017, + "grad_norm": 1.5720685845441307, + "learning_rate": 9.906756481878084e-05, + "loss": 3.5941, + "step": 4827 + }, + { + "epoch": 3.1108783239323126, + "grad_norm": 1.7190012319294263, + "learning_rate": 9.906717724639798e-05, + "loss": 3.3629, + "step": 4828 + }, + { + "epoch": 3.111522965350524, + "grad_norm": 1.3287320653250239, + "learning_rate": 9.906678959424963e-05, + "loss": 3.5191, + "step": 4829 + }, + { + "epoch": 3.1121676067687347, + "grad_norm": 1.620345727888524, + "learning_rate": 9.906640186233643e-05, + "loss": 3.5899, + "step": 4830 + }, + { + "epoch": 3.112812248186946, + "grad_norm": 1.7151557079452326, + "learning_rate": 9.906601405065901e-05, + "loss": 3.8901, + "step": 4831 + }, + { + "epoch": 3.1134568896051573, + "grad_norm": 1.4586864806488034, + "learning_rate": 9.906562615921797e-05, + "loss": 3.5297, + "step": 4832 + }, + { + "epoch": 3.114101531023368, + "grad_norm": 1.5005695303563273, + "learning_rate": 9.906523818801402e-05, + "loss": 3.7336, + "step": 4833 + }, + { + "epoch": 3.1147461724415795, + "grad_norm": 1.7345153889879743, + "learning_rate": 9.906485013704773e-05, + "loss": 3.2692, + "step": 4834 + }, + { + "epoch": 3.1153908138597903, + "grad_norm": 1.7997910136495388, + "learning_rate": 9.906446200631977e-05, + "loss": 3.7941, + "step": 4835 + }, + { + "epoch": 3.1160354552780016, + "grad_norm": 2.4355279432159858, + "learning_rate": 9.906407379583079e-05, + "loss": 3.5906, + "step": 4836 + }, + { + "epoch": 3.116680096696213, + "grad_norm": 1.7393212863290584, + "learning_rate": 9.906368550558137e-05, + "loss": 3.6153, + "step": 4837 + }, + { + "epoch": 3.117324738114424, + "grad_norm": 1.6854172660449154, + "learning_rate": 9.906329713557222e-05, + "loss": 3.5852, + "step": 4838 + }, + { + "epoch": 3.117969379532635, + "grad_norm": 1.6322721547138115, + "learning_rate": 9.906290868580392e-05, + "loss": 3.5993, + "step": 4839 + }, + { + "epoch": 3.118614020950846, + "grad_norm": 1.606192468823587, + "learning_rate": 9.906252015627716e-05, + "loss": 3.6386, + "step": 4840 + }, + { + "epoch": 3.1192586623690572, + "grad_norm": 2.005474603275486, + "learning_rate": 9.906213154699253e-05, + "loss": 3.4484, + "step": 4841 + }, + { + "epoch": 3.1199033037872685, + "grad_norm": 1.4812037680605756, + "learning_rate": 9.90617428579507e-05, + "loss": 3.4767, + "step": 4842 + }, + { + "epoch": 3.1205479452054794, + "grad_norm": 1.3817972943770662, + "learning_rate": 9.90613540891523e-05, + "loss": 3.5896, + "step": 4843 + }, + { + "epoch": 3.1211925866236907, + "grad_norm": 1.3813192238352379, + "learning_rate": 9.906096524059796e-05, + "loss": 3.5664, + "step": 4844 + }, + { + "epoch": 3.1218372280419016, + "grad_norm": 1.4263952221112983, + "learning_rate": 9.906057631228831e-05, + "loss": 3.475, + "step": 4845 + }, + { + "epoch": 3.122481869460113, + "grad_norm": 2.0622328997190116, + "learning_rate": 9.906018730422403e-05, + "loss": 3.7797, + "step": 4846 + }, + { + "epoch": 3.1231265108783237, + "grad_norm": 1.4151985520541281, + "learning_rate": 9.905979821640573e-05, + "loss": 3.3207, + "step": 4847 + }, + { + "epoch": 3.123771152296535, + "grad_norm": 1.8291815569807641, + "learning_rate": 9.905940904883404e-05, + "loss": 3.4346, + "step": 4848 + }, + { + "epoch": 3.1244157937147463, + "grad_norm": 1.595151893481068, + "learning_rate": 9.90590198015096e-05, + "loss": 3.6927, + "step": 4849 + }, + { + "epoch": 3.125060435132957, + "grad_norm": 1.9079510198159222, + "learning_rate": 9.905863047443308e-05, + "loss": 3.5992, + "step": 4850 + }, + { + "epoch": 3.1257050765511685, + "grad_norm": 1.6645760669495475, + "learning_rate": 9.905824106760509e-05, + "loss": 3.5912, + "step": 4851 + }, + { + "epoch": 3.1263497179693793, + "grad_norm": 1.6968472640817567, + "learning_rate": 9.905785158102627e-05, + "loss": 3.5831, + "step": 4852 + }, + { + "epoch": 3.1269943593875906, + "grad_norm": 1.7185706678200174, + "learning_rate": 9.90574620146973e-05, + "loss": 3.5025, + "step": 4853 + }, + { + "epoch": 3.127639000805802, + "grad_norm": 1.6406142190005812, + "learning_rate": 9.905707236861876e-05, + "loss": 3.3952, + "step": 4854 + }, + { + "epoch": 3.128283642224013, + "grad_norm": 2.1589858578823278, + "learning_rate": 9.905668264279132e-05, + "loss": 3.502, + "step": 4855 + }, + { + "epoch": 3.128928283642224, + "grad_norm": 1.3697733371639804, + "learning_rate": 9.905629283721563e-05, + "loss": 3.4002, + "step": 4856 + }, + { + "epoch": 3.129572925060435, + "grad_norm": 1.6776996354824258, + "learning_rate": 9.90559029518923e-05, + "loss": 3.6461, + "step": 4857 + }, + { + "epoch": 3.1302175664786462, + "grad_norm": 1.380530944536107, + "learning_rate": 9.905551298682201e-05, + "loss": 3.5812, + "step": 4858 + }, + { + "epoch": 3.1308622078968575, + "grad_norm": 1.4151317581813727, + "learning_rate": 9.905512294200536e-05, + "loss": 3.4256, + "step": 4859 + }, + { + "epoch": 3.1315068493150684, + "grad_norm": 1.4273418212909064, + "learning_rate": 9.905473281744302e-05, + "loss": 3.4934, + "step": 4860 + }, + { + "epoch": 3.1321514907332797, + "grad_norm": 1.7019011139933926, + "learning_rate": 9.905434261313562e-05, + "loss": 3.7154, + "step": 4861 + }, + { + "epoch": 3.1327961321514906, + "grad_norm": 1.349001983499361, + "learning_rate": 9.90539523290838e-05, + "loss": 3.4016, + "step": 4862 + }, + { + "epoch": 3.133440773569702, + "grad_norm": 1.8111250629152498, + "learning_rate": 9.905356196528822e-05, + "loss": 3.9582, + "step": 4863 + }, + { + "epoch": 3.134085414987913, + "grad_norm": 1.59523235245692, + "learning_rate": 9.905317152174948e-05, + "loss": 3.4356, + "step": 4864 + }, + { + "epoch": 3.134730056406124, + "grad_norm": 1.8244412867395952, + "learning_rate": 9.905278099846825e-05, + "loss": 3.3998, + "step": 4865 + }, + { + "epoch": 3.1353746978243353, + "grad_norm": 2.309908127116147, + "learning_rate": 9.905239039544518e-05, + "loss": 3.7154, + "step": 4866 + }, + { + "epoch": 3.136019339242546, + "grad_norm": 1.5484063752716541, + "learning_rate": 9.905199971268088e-05, + "loss": 3.3931, + "step": 4867 + }, + { + "epoch": 3.1366639806607575, + "grad_norm": 1.9026667736649714, + "learning_rate": 9.905160895017601e-05, + "loss": 3.6498, + "step": 4868 + }, + { + "epoch": 3.1373086220789688, + "grad_norm": 1.8601357947087944, + "learning_rate": 9.905121810793122e-05, + "loss": 3.686, + "step": 4869 + }, + { + "epoch": 3.1379532634971796, + "grad_norm": 1.8635160849789498, + "learning_rate": 9.905082718594713e-05, + "loss": 3.6883, + "step": 4870 + }, + { + "epoch": 3.138597904915391, + "grad_norm": 1.3363069741667637, + "learning_rate": 9.905043618422441e-05, + "loss": 3.8025, + "step": 4871 + }, + { + "epoch": 3.139242546333602, + "grad_norm": 2.0628256562196015, + "learning_rate": 9.905004510276368e-05, + "loss": 3.7302, + "step": 4872 + }, + { + "epoch": 3.139887187751813, + "grad_norm": 1.7962806852224387, + "learning_rate": 9.90496539415656e-05, + "loss": 3.2499, + "step": 4873 + }, + { + "epoch": 3.1405318291700244, + "grad_norm": 1.2314890443901225, + "learning_rate": 9.90492627006308e-05, + "loss": 3.0665, + "step": 4874 + }, + { + "epoch": 3.1411764705882352, + "grad_norm": 1.6982250439991793, + "learning_rate": 9.90488713799599e-05, + "loss": 3.5483, + "step": 4875 + }, + { + "epoch": 3.1418211120064465, + "grad_norm": 1.2534176112791449, + "learning_rate": 9.90484799795536e-05, + "loss": 3.6819, + "step": 4876 + }, + { + "epoch": 3.1424657534246574, + "grad_norm": 1.5716904969116654, + "learning_rate": 9.90480884994125e-05, + "loss": 3.7844, + "step": 4877 + }, + { + "epoch": 3.1431103948428687, + "grad_norm": 1.2777414132841407, + "learning_rate": 9.904769693953724e-05, + "loss": 3.5616, + "step": 4878 + }, + { + "epoch": 3.14375503626108, + "grad_norm": 1.6561670120810894, + "learning_rate": 9.904730529992849e-05, + "loss": 3.6875, + "step": 4879 + }, + { + "epoch": 3.144399677679291, + "grad_norm": 1.748256281932543, + "learning_rate": 9.904691358058687e-05, + "loss": 3.2739, + "step": 4880 + }, + { + "epoch": 3.145044319097502, + "grad_norm": 2.1702201967509347, + "learning_rate": 9.904652178151303e-05, + "loss": 3.3553, + "step": 4881 + }, + { + "epoch": 3.145688960515713, + "grad_norm": 1.786674466866127, + "learning_rate": 9.904612990270762e-05, + "loss": 3.8674, + "step": 4882 + }, + { + "epoch": 3.1463336019339243, + "grad_norm": 1.895882696875769, + "learning_rate": 9.90457379441713e-05, + "loss": 3.8022, + "step": 4883 + }, + { + "epoch": 3.1469782433521356, + "grad_norm": 1.8589100402274261, + "learning_rate": 9.904534590590468e-05, + "loss": 3.7238, + "step": 4884 + }, + { + "epoch": 3.1476228847703465, + "grad_norm": 1.5083439954150526, + "learning_rate": 9.90449537879084e-05, + "loss": 3.3027, + "step": 4885 + }, + { + "epoch": 3.1482675261885578, + "grad_norm": 1.8802555730552346, + "learning_rate": 9.904456159018315e-05, + "loss": 3.585, + "step": 4886 + }, + { + "epoch": 3.1489121676067686, + "grad_norm": 1.4534456623999759, + "learning_rate": 9.904416931272952e-05, + "loss": 3.5954, + "step": 4887 + }, + { + "epoch": 3.14955680902498, + "grad_norm": 1.776283757761067, + "learning_rate": 9.904377695554819e-05, + "loss": 3.8276, + "step": 4888 + }, + { + "epoch": 3.1502014504431908, + "grad_norm": 1.4902835934777743, + "learning_rate": 9.904338451863981e-05, + "loss": 3.7379, + "step": 4889 + }, + { + "epoch": 3.150846091861402, + "grad_norm": 1.5132924579830398, + "learning_rate": 9.904299200200499e-05, + "loss": 3.5108, + "step": 4890 + }, + { + "epoch": 3.1514907332796134, + "grad_norm": 1.2789335891994018, + "learning_rate": 9.904259940564439e-05, + "loss": 3.4167, + "step": 4891 + }, + { + "epoch": 3.1521353746978242, + "grad_norm": 1.6190169192190085, + "learning_rate": 9.904220672955867e-05, + "loss": 3.3381, + "step": 4892 + }, + { + "epoch": 3.1527800161160355, + "grad_norm": 1.2271430660942064, + "learning_rate": 9.904181397374846e-05, + "loss": 3.2495, + "step": 4893 + }, + { + "epoch": 3.1534246575342464, + "grad_norm": 1.5123411702583784, + "learning_rate": 9.90414211382144e-05, + "loss": 3.3791, + "step": 4894 + }, + { + "epoch": 3.1540692989524577, + "grad_norm": 1.3259314651672016, + "learning_rate": 9.904102822295715e-05, + "loss": 3.7997, + "step": 4895 + }, + { + "epoch": 3.154713940370669, + "grad_norm": 1.379306006003097, + "learning_rate": 9.904063522797736e-05, + "loss": 3.8753, + "step": 4896 + }, + { + "epoch": 3.15535858178888, + "grad_norm": 1.2767161581993425, + "learning_rate": 9.904024215327564e-05, + "loss": 3.433, + "step": 4897 + }, + { + "epoch": 3.156003223207091, + "grad_norm": 1.2728611975516277, + "learning_rate": 9.903984899885268e-05, + "loss": 3.7242, + "step": 4898 + }, + { + "epoch": 3.156647864625302, + "grad_norm": 1.319475762644227, + "learning_rate": 9.903945576470909e-05, + "loss": 3.681, + "step": 4899 + }, + { + "epoch": 3.1572925060435133, + "grad_norm": 1.4740467198642344, + "learning_rate": 9.903906245084555e-05, + "loss": 3.6647, + "step": 4900 + }, + { + "epoch": 3.1572925060435133, + "eval_loss": 4.117143630981445, + "eval_runtime": 2.9841, + "eval_samples_per_second": 33.511, + "eval_steps_per_second": 4.356, + "step": 4900 + }, + { + "epoch": 3.1579371474617246, + "grad_norm": 1.368414328343931, + "learning_rate": 9.903866905726267e-05, + "loss": 3.5341, + "step": 4901 + }, + { + "epoch": 3.1585817888799355, + "grad_norm": 1.5780527852212083, + "learning_rate": 9.903827558396111e-05, + "loss": 3.5259, + "step": 4902 + }, + { + "epoch": 3.1592264302981468, + "grad_norm": 1.4506186982628706, + "learning_rate": 9.903788203094154e-05, + "loss": 3.4496, + "step": 4903 + }, + { + "epoch": 3.1598710717163576, + "grad_norm": 1.6299375564410659, + "learning_rate": 9.903748839820458e-05, + "loss": 3.71, + "step": 4904 + }, + { + "epoch": 3.160515713134569, + "grad_norm": 1.630565859022396, + "learning_rate": 9.903709468575088e-05, + "loss": 3.5398, + "step": 4905 + }, + { + "epoch": 3.16116035455278, + "grad_norm": 1.5912658828715167, + "learning_rate": 9.903670089358108e-05, + "loss": 3.4494, + "step": 4906 + }, + { + "epoch": 3.161804995970991, + "grad_norm": 1.462502025820688, + "learning_rate": 9.903630702169585e-05, + "loss": 3.6745, + "step": 4907 + }, + { + "epoch": 3.1624496373892024, + "grad_norm": 1.5578984732390897, + "learning_rate": 9.903591307009581e-05, + "loss": 3.5117, + "step": 4908 + }, + { + "epoch": 3.1630942788074132, + "grad_norm": 1.7754488765011147, + "learning_rate": 9.903551903878162e-05, + "loss": 3.7316, + "step": 4909 + }, + { + "epoch": 3.1637389202256245, + "grad_norm": 1.7944268258655158, + "learning_rate": 9.903512492775394e-05, + "loss": 4.0702, + "step": 4910 + }, + { + "epoch": 3.1643835616438354, + "grad_norm": 1.5263277346250645, + "learning_rate": 9.903473073701338e-05, + "loss": 3.3648, + "step": 4911 + }, + { + "epoch": 3.1650282030620467, + "grad_norm": 1.9895112481538613, + "learning_rate": 9.903433646656064e-05, + "loss": 3.5237, + "step": 4912 + }, + { + "epoch": 3.165672844480258, + "grad_norm": 1.5676269139738155, + "learning_rate": 9.903394211639633e-05, + "loss": 3.4271, + "step": 4913 + }, + { + "epoch": 3.166317485898469, + "grad_norm": 1.6843082306796702, + "learning_rate": 9.903354768652109e-05, + "loss": 3.616, + "step": 4914 + }, + { + "epoch": 3.16696212731668, + "grad_norm": 1.9423430185045043, + "learning_rate": 9.90331531769356e-05, + "loss": 3.8267, + "step": 4915 + }, + { + "epoch": 3.167606768734891, + "grad_norm": 1.8723538273882785, + "learning_rate": 9.90327585876405e-05, + "loss": 3.5505, + "step": 4916 + }, + { + "epoch": 3.1682514101531023, + "grad_norm": 1.7036124884463502, + "learning_rate": 9.903236391863642e-05, + "loss": 3.7394, + "step": 4917 + }, + { + "epoch": 3.1688960515713136, + "grad_norm": 1.402078421270448, + "learning_rate": 9.903196916992402e-05, + "loss": 3.9937, + "step": 4918 + }, + { + "epoch": 3.1695406929895245, + "grad_norm": 1.5721428275497304, + "learning_rate": 9.903157434150394e-05, + "loss": 3.4681, + "step": 4919 + }, + { + "epoch": 3.1701853344077358, + "grad_norm": 1.484841237053584, + "learning_rate": 9.903117943337686e-05, + "loss": 3.449, + "step": 4920 + }, + { + "epoch": 3.1708299758259466, + "grad_norm": 1.68767768630214, + "learning_rate": 9.903078444554337e-05, + "loss": 3.5837, + "step": 4921 + }, + { + "epoch": 3.171474617244158, + "grad_norm": 1.4248812289083514, + "learning_rate": 9.903038937800416e-05, + "loss": 3.5332, + "step": 4922 + }, + { + "epoch": 3.172119258662369, + "grad_norm": 1.5401174748171096, + "learning_rate": 9.902999423075989e-05, + "loss": 3.5027, + "step": 4923 + }, + { + "epoch": 3.17276390008058, + "grad_norm": 1.7869044331127644, + "learning_rate": 9.902959900381118e-05, + "loss": 3.3995, + "step": 4924 + }, + { + "epoch": 3.1734085414987914, + "grad_norm": 1.347795705514912, + "learning_rate": 9.902920369715869e-05, + "loss": 3.8026, + "step": 4925 + }, + { + "epoch": 3.1740531829170022, + "grad_norm": 1.6997922511532249, + "learning_rate": 9.902880831080307e-05, + "loss": 3.9276, + "step": 4926 + }, + { + "epoch": 3.1746978243352135, + "grad_norm": 1.3791754929446223, + "learning_rate": 9.902841284474495e-05, + "loss": 3.7787, + "step": 4927 + }, + { + "epoch": 3.175342465753425, + "grad_norm": 1.6453865795019336, + "learning_rate": 9.902801729898504e-05, + "loss": 3.6586, + "step": 4928 + }, + { + "epoch": 3.1759871071716357, + "grad_norm": 1.984391529546756, + "learning_rate": 9.902762167352392e-05, + "loss": 3.3474, + "step": 4929 + }, + { + "epoch": 3.176631748589847, + "grad_norm": 1.7835859106172647, + "learning_rate": 9.902722596836225e-05, + "loss": 3.4466, + "step": 4930 + }, + { + "epoch": 3.177276390008058, + "grad_norm": 1.4663233306853134, + "learning_rate": 9.902683018350073e-05, + "loss": 3.7539, + "step": 4931 + }, + { + "epoch": 3.177921031426269, + "grad_norm": 1.4878203105820087, + "learning_rate": 9.902643431893997e-05, + "loss": 4.006, + "step": 4932 + }, + { + "epoch": 3.1785656728444804, + "grad_norm": 1.290640319188829, + "learning_rate": 9.902603837468061e-05, + "loss": 3.839, + "step": 4933 + }, + { + "epoch": 3.1792103142626913, + "grad_norm": 1.6640656117401122, + "learning_rate": 9.902564235072333e-05, + "loss": 3.5602, + "step": 4934 + }, + { + "epoch": 3.1798549556809026, + "grad_norm": 1.3669358013246282, + "learning_rate": 9.902524624706876e-05, + "loss": 3.1995, + "step": 4935 + }, + { + "epoch": 3.1804995970991135, + "grad_norm": 1.3680178327179875, + "learning_rate": 9.902485006371758e-05, + "loss": 3.406, + "step": 4936 + }, + { + "epoch": 3.1811442385173248, + "grad_norm": 1.3430085671454328, + "learning_rate": 9.90244538006704e-05, + "loss": 3.8152, + "step": 4937 + }, + { + "epoch": 3.181788879935536, + "grad_norm": 1.3762929715782652, + "learning_rate": 9.90240574579279e-05, + "loss": 3.3314, + "step": 4938 + }, + { + "epoch": 3.182433521353747, + "grad_norm": 1.5216229366033933, + "learning_rate": 9.902366103549074e-05, + "loss": 3.7291, + "step": 4939 + }, + { + "epoch": 3.183078162771958, + "grad_norm": 1.6402172973692808, + "learning_rate": 9.902326453335952e-05, + "loss": 3.5835, + "step": 4940 + }, + { + "epoch": 3.183722804190169, + "grad_norm": 1.212290407093522, + "learning_rate": 9.902286795153493e-05, + "loss": 3.4814, + "step": 4941 + }, + { + "epoch": 3.1843674456083804, + "grad_norm": 1.3397613082545574, + "learning_rate": 9.90224712900176e-05, + "loss": 3.547, + "step": 4942 + }, + { + "epoch": 3.1850120870265917, + "grad_norm": 1.2146912409845496, + "learning_rate": 9.902207454880822e-05, + "loss": 3.3151, + "step": 4943 + }, + { + "epoch": 3.1856567284448025, + "grad_norm": 1.2507256364028139, + "learning_rate": 9.902167772790742e-05, + "loss": 3.2685, + "step": 4944 + }, + { + "epoch": 3.186301369863014, + "grad_norm": 1.2408418495717979, + "learning_rate": 9.902128082731581e-05, + "loss": 3.7457, + "step": 4945 + }, + { + "epoch": 3.1869460112812247, + "grad_norm": 1.5221203005847956, + "learning_rate": 9.902088384703415e-05, + "loss": 3.7693, + "step": 4946 + }, + { + "epoch": 3.187590652699436, + "grad_norm": 1.6022694118389693, + "learning_rate": 9.902048678706297e-05, + "loss": 3.4062, + "step": 4947 + }, + { + "epoch": 3.1882352941176473, + "grad_norm": 1.45391832237102, + "learning_rate": 9.9020089647403e-05, + "loss": 3.3549, + "step": 4948 + }, + { + "epoch": 3.188879935535858, + "grad_norm": 1.746680939809846, + "learning_rate": 9.901969242805483e-05, + "loss": 3.4165, + "step": 4949 + }, + { + "epoch": 3.1895245769540694, + "grad_norm": 1.5351157022854447, + "learning_rate": 9.90192951290192e-05, + "loss": 3.6325, + "step": 4950 + }, + { + "epoch": 3.1901692183722803, + "grad_norm": 1.8016316438111328, + "learning_rate": 9.901889775029668e-05, + "loss": 3.2729, + "step": 4951 + }, + { + "epoch": 3.1908138597904916, + "grad_norm": 1.3521887026745871, + "learning_rate": 9.901850029188797e-05, + "loss": 3.3835, + "step": 4952 + }, + { + "epoch": 3.191458501208703, + "grad_norm": 1.8724323885272363, + "learning_rate": 9.901810275379371e-05, + "loss": 3.732, + "step": 4953 + }, + { + "epoch": 3.1921031426269137, + "grad_norm": 1.5326406526011132, + "learning_rate": 9.901770513601454e-05, + "loss": 3.4423, + "step": 4954 + }, + { + "epoch": 3.192747784045125, + "grad_norm": 1.4060215591843583, + "learning_rate": 9.901730743855113e-05, + "loss": 3.4227, + "step": 4955 + }, + { + "epoch": 3.193392425463336, + "grad_norm": 1.6245413667282365, + "learning_rate": 9.901690966140412e-05, + "loss": 3.5735, + "step": 4956 + }, + { + "epoch": 3.194037066881547, + "grad_norm": 1.8086284635974093, + "learning_rate": 9.901651180457416e-05, + "loss": 3.4974, + "step": 4957 + }, + { + "epoch": 3.194681708299758, + "grad_norm": 1.4874062084615287, + "learning_rate": 9.901611386806193e-05, + "loss": 3.5562, + "step": 4958 + }, + { + "epoch": 3.1953263497179694, + "grad_norm": 1.7392801803092435, + "learning_rate": 9.901571585186808e-05, + "loss": 3.5336, + "step": 4959 + }, + { + "epoch": 3.1959709911361807, + "grad_norm": 1.672981789044058, + "learning_rate": 9.901531775599323e-05, + "loss": 3.3775, + "step": 4960 + }, + { + "epoch": 3.1966156325543915, + "grad_norm": 1.3950366468507842, + "learning_rate": 9.901491958043807e-05, + "loss": 3.6184, + "step": 4961 + }, + { + "epoch": 3.197260273972603, + "grad_norm": 2.085547652873677, + "learning_rate": 9.901452132520323e-05, + "loss": 3.5432, + "step": 4962 + }, + { + "epoch": 3.1979049153908137, + "grad_norm": 1.3418874160170189, + "learning_rate": 9.901412299028938e-05, + "loss": 3.6893, + "step": 4963 + }, + { + "epoch": 3.198549556809025, + "grad_norm": 1.8534221319546977, + "learning_rate": 9.901372457569717e-05, + "loss": 3.5689, + "step": 4964 + }, + { + "epoch": 3.1991941982272363, + "grad_norm": 1.7766921299913807, + "learning_rate": 9.901332608142724e-05, + "loss": 3.6145, + "step": 4965 + }, + { + "epoch": 3.199838839645447, + "grad_norm": 1.443912992399375, + "learning_rate": 9.901292750748026e-05, + "loss": 3.1861, + "step": 4966 + }, + { + "epoch": 3.2004834810636584, + "grad_norm": 1.7206980450196372, + "learning_rate": 9.901252885385687e-05, + "loss": 3.4263, + "step": 4967 + }, + { + "epoch": 3.2011281224818693, + "grad_norm": 1.3930147520597644, + "learning_rate": 9.901213012055774e-05, + "loss": 3.6046, + "step": 4968 + }, + { + "epoch": 3.2017727639000806, + "grad_norm": 1.9999612334681376, + "learning_rate": 9.901173130758353e-05, + "loss": 3.431, + "step": 4969 + }, + { + "epoch": 3.202417405318292, + "grad_norm": 1.4172708030468861, + "learning_rate": 9.901133241493488e-05, + "loss": 3.6637, + "step": 4970 + }, + { + "epoch": 3.2030620467365027, + "grad_norm": 1.8887596631164683, + "learning_rate": 9.901093344261245e-05, + "loss": 3.2911, + "step": 4971 + }, + { + "epoch": 3.203706688154714, + "grad_norm": 1.5724373388088644, + "learning_rate": 9.901053439061691e-05, + "loss": 3.8177, + "step": 4972 + }, + { + "epoch": 3.204351329572925, + "grad_norm": 1.5933704918112952, + "learning_rate": 9.901013525894887e-05, + "loss": 3.577, + "step": 4973 + }, + { + "epoch": 3.204995970991136, + "grad_norm": 1.684599682264057, + "learning_rate": 9.900973604760905e-05, + "loss": 3.2636, + "step": 4974 + }, + { + "epoch": 3.2056406124093475, + "grad_norm": 1.5852199051890719, + "learning_rate": 9.900933675659804e-05, + "loss": 3.1351, + "step": 4975 + }, + { + "epoch": 3.2062852538275584, + "grad_norm": 1.9657541821968714, + "learning_rate": 9.900893738591654e-05, + "loss": 3.6998, + "step": 4976 + }, + { + "epoch": 3.2069298952457697, + "grad_norm": 1.9277459915265314, + "learning_rate": 9.900853793556522e-05, + "loss": 3.2651, + "step": 4977 + }, + { + "epoch": 3.2075745366639805, + "grad_norm": 1.8568013914698516, + "learning_rate": 9.900813840554467e-05, + "loss": 3.434, + "step": 4978 + }, + { + "epoch": 3.208219178082192, + "grad_norm": 1.5570398487479926, + "learning_rate": 9.90077387958556e-05, + "loss": 3.3705, + "step": 4979 + }, + { + "epoch": 3.2088638195004027, + "grad_norm": 2.3126422934130684, + "learning_rate": 9.900733910649866e-05, + "loss": 3.5327, + "step": 4980 + }, + { + "epoch": 3.209508460918614, + "grad_norm": 1.6445416981957208, + "learning_rate": 9.90069393374745e-05, + "loss": 3.3448, + "step": 4981 + }, + { + "epoch": 3.2101531023368253, + "grad_norm": 2.318123774390251, + "learning_rate": 9.900653948878377e-05, + "loss": 3.4128, + "step": 4982 + }, + { + "epoch": 3.210797743755036, + "grad_norm": 6.919318153111589, + "learning_rate": 9.900613956042713e-05, + "loss": 3.5447, + "step": 4983 + }, + { + "epoch": 3.2114423851732474, + "grad_norm": 2.0421381130586407, + "learning_rate": 9.900573955240524e-05, + "loss": 4.0471, + "step": 4984 + }, + { + "epoch": 3.2120870265914583, + "grad_norm": 6.465981180044247, + "learning_rate": 9.900533946471876e-05, + "loss": 3.5147, + "step": 4985 + }, + { + "epoch": 3.2127316680096696, + "grad_norm": 1.9455505523906222, + "learning_rate": 9.900493929736834e-05, + "loss": 3.5872, + "step": 4986 + }, + { + "epoch": 3.213376309427881, + "grad_norm": 2.5549158437170987, + "learning_rate": 9.900453905035462e-05, + "loss": 3.5093, + "step": 4987 + }, + { + "epoch": 3.2140209508460917, + "grad_norm": 1.7715983980012548, + "learning_rate": 9.900413872367831e-05, + "loss": 3.6915, + "step": 4988 + }, + { + "epoch": 3.214665592264303, + "grad_norm": 1.8394471396549326, + "learning_rate": 9.900373831734003e-05, + "loss": 3.6354, + "step": 4989 + }, + { + "epoch": 3.215310233682514, + "grad_norm": 1.6133240641336488, + "learning_rate": 9.900333783134043e-05, + "loss": 3.1836, + "step": 4990 + }, + { + "epoch": 3.215954875100725, + "grad_norm": 1.975327769556873, + "learning_rate": 9.90029372656802e-05, + "loss": 3.782, + "step": 4991 + }, + { + "epoch": 3.2165995165189365, + "grad_norm": 1.6005482992182507, + "learning_rate": 9.900253662035995e-05, + "loss": 3.4687, + "step": 4992 + }, + { + "epoch": 3.2172441579371474, + "grad_norm": 1.6558090450311225, + "learning_rate": 9.900213589538038e-05, + "loss": 3.5769, + "step": 4993 + }, + { + "epoch": 3.2178887993553587, + "grad_norm": 1.950257799182276, + "learning_rate": 9.900173509074214e-05, + "loss": 3.7811, + "step": 4994 + }, + { + "epoch": 3.2185334407735695, + "grad_norm": 2.1233107484992555, + "learning_rate": 9.900133420644587e-05, + "loss": 3.4481, + "step": 4995 + }, + { + "epoch": 3.219178082191781, + "grad_norm": 1.1591277108581761, + "learning_rate": 9.900093324249226e-05, + "loss": 3.5603, + "step": 4996 + }, + { + "epoch": 3.219822723609992, + "grad_norm": 1.4399852121256833, + "learning_rate": 9.900053219888192e-05, + "loss": 3.6141, + "step": 4997 + }, + { + "epoch": 3.220467365028203, + "grad_norm": 1.5392051453406688, + "learning_rate": 9.900013107561558e-05, + "loss": 3.4581, + "step": 4998 + }, + { + "epoch": 3.2211120064464143, + "grad_norm": 1.5658468670209578, + "learning_rate": 9.899972987269384e-05, + "loss": 3.538, + "step": 4999 + }, + { + "epoch": 3.221756647864625, + "grad_norm": 2.1958314431418557, + "learning_rate": 9.899932859011736e-05, + "loss": 3.6868, + "step": 5000 + }, + { + "epoch": 3.221756647864625, + "eval_loss": 4.147977828979492, + "eval_runtime": 2.9709, + "eval_samples_per_second": 33.659, + "eval_steps_per_second": 4.376, + "step": 5000 + }, + { + "epoch": 3.2224012892828364, + "grad_norm": 1.5391063563502811, + "learning_rate": 9.899892722788681e-05, + "loss": 3.7475, + "step": 5001 + }, + { + "epoch": 3.2230459307010477, + "grad_norm": 1.8166346876376307, + "learning_rate": 9.899852578600289e-05, + "loss": 3.5811, + "step": 5002 + }, + { + "epoch": 3.2236905721192586, + "grad_norm": 1.8878559927045353, + "learning_rate": 9.899812426446618e-05, + "loss": 3.6838, + "step": 5003 + }, + { + "epoch": 3.22433521353747, + "grad_norm": 1.4748269493208828, + "learning_rate": 9.899772266327742e-05, + "loss": 3.7726, + "step": 5004 + }, + { + "epoch": 3.2249798549556807, + "grad_norm": 1.7862549875125358, + "learning_rate": 9.899732098243722e-05, + "loss": 3.5694, + "step": 5005 + }, + { + "epoch": 3.225624496373892, + "grad_norm": 1.5225181914187167, + "learning_rate": 9.899691922194625e-05, + "loss": 3.6068, + "step": 5006 + }, + { + "epoch": 3.2262691377921033, + "grad_norm": 1.6490571348100704, + "learning_rate": 9.899651738180516e-05, + "loss": 3.7113, + "step": 5007 + }, + { + "epoch": 3.226913779210314, + "grad_norm": 1.4483969016006157, + "learning_rate": 9.899611546201465e-05, + "loss": 3.6875, + "step": 5008 + }, + { + "epoch": 3.2275584206285255, + "grad_norm": 1.4574714369905943, + "learning_rate": 9.899571346257534e-05, + "loss": 3.8374, + "step": 5009 + }, + { + "epoch": 3.2282030620467363, + "grad_norm": 1.2548362004588858, + "learning_rate": 9.899531138348789e-05, + "loss": 3.6549, + "step": 5010 + }, + { + "epoch": 3.2288477034649476, + "grad_norm": 1.6888773218905586, + "learning_rate": 9.899490922475299e-05, + "loss": 3.5001, + "step": 5011 + }, + { + "epoch": 3.229492344883159, + "grad_norm": 1.3399918939458693, + "learning_rate": 9.899450698637127e-05, + "loss": 3.6758, + "step": 5012 + }, + { + "epoch": 3.23013698630137, + "grad_norm": 1.4631842617682291, + "learning_rate": 9.899410466834342e-05, + "loss": 3.4909, + "step": 5013 + }, + { + "epoch": 3.230781627719581, + "grad_norm": 1.798025044844807, + "learning_rate": 9.899370227067006e-05, + "loss": 3.8361, + "step": 5014 + }, + { + "epoch": 3.231426269137792, + "grad_norm": 1.7318743974355093, + "learning_rate": 9.89932997933519e-05, + "loss": 3.1442, + "step": 5015 + }, + { + "epoch": 3.2320709105560033, + "grad_norm": 1.719588033464007, + "learning_rate": 9.899289723638957e-05, + "loss": 3.5196, + "step": 5016 + }, + { + "epoch": 3.2327155519742146, + "grad_norm": 1.6848273674349379, + "learning_rate": 9.899249459978374e-05, + "loss": 3.7261, + "step": 5017 + }, + { + "epoch": 3.2333601933924254, + "grad_norm": 1.3175690533161306, + "learning_rate": 9.899209188353506e-05, + "loss": 3.884, + "step": 5018 + }, + { + "epoch": 3.2340048348106367, + "grad_norm": 2.426001934930617, + "learning_rate": 9.89916890876442e-05, + "loss": 3.6571, + "step": 5019 + }, + { + "epoch": 3.2346494762288476, + "grad_norm": 1.5901178704040937, + "learning_rate": 9.899128621211183e-05, + "loss": 3.4793, + "step": 5020 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 1.7316891980879185, + "learning_rate": 9.899088325693859e-05, + "loss": 3.355, + "step": 5021 + }, + { + "epoch": 3.23593875906527, + "grad_norm": 1.8654393355901189, + "learning_rate": 9.899048022212517e-05, + "loss": 3.594, + "step": 5022 + }, + { + "epoch": 3.236583400483481, + "grad_norm": 2.1315381277866265, + "learning_rate": 9.899007710767222e-05, + "loss": 3.7425, + "step": 5023 + }, + { + "epoch": 3.2372280419016923, + "grad_norm": 1.5443090268281978, + "learning_rate": 9.898967391358039e-05, + "loss": 3.4455, + "step": 5024 + }, + { + "epoch": 3.237872683319903, + "grad_norm": 1.7183273535589099, + "learning_rate": 9.898927063985036e-05, + "loss": 3.5654, + "step": 5025 + }, + { + "epoch": 3.2385173247381145, + "grad_norm": 5.1684968330741645, + "learning_rate": 9.898886728648277e-05, + "loss": 3.3771, + "step": 5026 + }, + { + "epoch": 3.2391619661563253, + "grad_norm": 1.619120100415065, + "learning_rate": 9.898846385347831e-05, + "loss": 3.7893, + "step": 5027 + }, + { + "epoch": 3.2398066075745366, + "grad_norm": 1.7056660173693092, + "learning_rate": 9.898806034083761e-05, + "loss": 3.771, + "step": 5028 + }, + { + "epoch": 3.240451248992748, + "grad_norm": 2.579941124721234, + "learning_rate": 9.898765674856137e-05, + "loss": 3.6395, + "step": 5029 + }, + { + "epoch": 3.241095890410959, + "grad_norm": 1.5808880148626228, + "learning_rate": 9.898725307665023e-05, + "loss": 3.4546, + "step": 5030 + }, + { + "epoch": 3.24174053182917, + "grad_norm": 2.376558713993752, + "learning_rate": 9.898684932510485e-05, + "loss": 3.6613, + "step": 5031 + }, + { + "epoch": 3.242385173247381, + "grad_norm": 1.4398121373129684, + "learning_rate": 9.898644549392592e-05, + "loss": 3.3938, + "step": 5032 + }, + { + "epoch": 3.2430298146655923, + "grad_norm": 2.0763971923424362, + "learning_rate": 9.898604158311407e-05, + "loss": 3.9351, + "step": 5033 + }, + { + "epoch": 3.2436744560838036, + "grad_norm": 1.6699922159217746, + "learning_rate": 9.898563759266999e-05, + "loss": 3.7361, + "step": 5034 + }, + { + "epoch": 3.2443190975020144, + "grad_norm": 1.7548730637037615, + "learning_rate": 9.898523352259432e-05, + "loss": 3.4756, + "step": 5035 + }, + { + "epoch": 3.2449637389202257, + "grad_norm": 1.829684050243124, + "learning_rate": 9.898482937288774e-05, + "loss": 3.7282, + "step": 5036 + }, + { + "epoch": 3.2456083803384366, + "grad_norm": 1.475215682496196, + "learning_rate": 9.89844251435509e-05, + "loss": 3.4874, + "step": 5037 + }, + { + "epoch": 3.246253021756648, + "grad_norm": 1.668003360522706, + "learning_rate": 9.898402083458446e-05, + "loss": 3.5297, + "step": 5038 + }, + { + "epoch": 3.246897663174859, + "grad_norm": 1.7551287899583725, + "learning_rate": 9.898361644598911e-05, + "loss": 3.6709, + "step": 5039 + }, + { + "epoch": 3.24754230459307, + "grad_norm": 1.638605011331411, + "learning_rate": 9.898321197776551e-05, + "loss": 3.5815, + "step": 5040 + }, + { + "epoch": 3.2481869460112813, + "grad_norm": 1.4989401193086302, + "learning_rate": 9.898280742991429e-05, + "loss": 3.8139, + "step": 5041 + }, + { + "epoch": 3.248831587429492, + "grad_norm": 1.4339820948692128, + "learning_rate": 9.898240280243616e-05, + "loss": 3.7467, + "step": 5042 + }, + { + "epoch": 3.2494762288477035, + "grad_norm": 1.4560296003808513, + "learning_rate": 9.898199809533177e-05, + "loss": 3.7114, + "step": 5043 + }, + { + "epoch": 3.2501208702659143, + "grad_norm": 1.4686390143499681, + "learning_rate": 9.898159330860176e-05, + "loss": 3.7697, + "step": 5044 + }, + { + "epoch": 3.2507655116841256, + "grad_norm": 1.5821071377583995, + "learning_rate": 9.898118844224683e-05, + "loss": 3.5265, + "step": 5045 + }, + { + "epoch": 3.251410153102337, + "grad_norm": 1.6046578644986587, + "learning_rate": 9.898078349626762e-05, + "loss": 3.5014, + "step": 5046 + }, + { + "epoch": 3.252054794520548, + "grad_norm": 1.5355495109995387, + "learning_rate": 9.89803784706648e-05, + "loss": 4.0868, + "step": 5047 + }, + { + "epoch": 3.252699435938759, + "grad_norm": 1.2201870575978984, + "learning_rate": 9.897997336543903e-05, + "loss": 3.6421, + "step": 5048 + }, + { + "epoch": 3.25334407735697, + "grad_norm": 1.385238130424063, + "learning_rate": 9.8979568180591e-05, + "loss": 3.6639, + "step": 5049 + }, + { + "epoch": 3.2539887187751813, + "grad_norm": 1.4803887479601128, + "learning_rate": 9.897916291612135e-05, + "loss": 3.415, + "step": 5050 + }, + { + "epoch": 3.2546333601933926, + "grad_norm": 1.2234214257649374, + "learning_rate": 9.897875757203076e-05, + "loss": 3.6612, + "step": 5051 + }, + { + "epoch": 3.2552780016116034, + "grad_norm": 1.6436058238812148, + "learning_rate": 9.89783521483199e-05, + "loss": 3.6616, + "step": 5052 + }, + { + "epoch": 3.2559226430298147, + "grad_norm": 1.2165653126221576, + "learning_rate": 9.89779466449894e-05, + "loss": 3.6867, + "step": 5053 + }, + { + "epoch": 3.2565672844480256, + "grad_norm": 1.6464323564537695, + "learning_rate": 9.897754106203996e-05, + "loss": 3.4961, + "step": 5054 + }, + { + "epoch": 3.257211925866237, + "grad_norm": 1.4574154863198143, + "learning_rate": 9.897713539947224e-05, + "loss": 3.3319, + "step": 5055 + }, + { + "epoch": 3.257856567284448, + "grad_norm": 1.2218960714801974, + "learning_rate": 9.897672965728691e-05, + "loss": 3.6437, + "step": 5056 + }, + { + "epoch": 3.258501208702659, + "grad_norm": 1.4265077739639795, + "learning_rate": 9.897632383548464e-05, + "loss": 3.3244, + "step": 5057 + }, + { + "epoch": 3.2591458501208703, + "grad_norm": 1.3697296268714492, + "learning_rate": 9.897591793406607e-05, + "loss": 3.3958, + "step": 5058 + }, + { + "epoch": 3.259790491539081, + "grad_norm": 1.5018787758705059, + "learning_rate": 9.897551195303191e-05, + "loss": 3.5905, + "step": 5059 + }, + { + "epoch": 3.2604351329572925, + "grad_norm": 1.4700167502676753, + "learning_rate": 9.897510589238277e-05, + "loss": 3.3732, + "step": 5060 + }, + { + "epoch": 3.261079774375504, + "grad_norm": 1.5165283931417088, + "learning_rate": 9.897469975211936e-05, + "loss": 3.6052, + "step": 5061 + }, + { + "epoch": 3.2617244157937146, + "grad_norm": 1.1605130180843044, + "learning_rate": 9.897429353224234e-05, + "loss": 3.8977, + "step": 5062 + }, + { + "epoch": 3.262369057211926, + "grad_norm": 1.413381920516655, + "learning_rate": 9.897388723275236e-05, + "loss": 3.4726, + "step": 5063 + }, + { + "epoch": 3.263013698630137, + "grad_norm": 1.207686337671047, + "learning_rate": 9.897348085365011e-05, + "loss": 3.4638, + "step": 5064 + }, + { + "epoch": 3.263658340048348, + "grad_norm": 1.6297290296356957, + "learning_rate": 9.897307439493623e-05, + "loss": 3.0904, + "step": 5065 + }, + { + "epoch": 3.2643029814665594, + "grad_norm": 1.3058972503979085, + "learning_rate": 9.897266785661143e-05, + "loss": 3.2903, + "step": 5066 + }, + { + "epoch": 3.2649476228847703, + "grad_norm": 1.272409833312352, + "learning_rate": 9.897226123867635e-05, + "loss": 3.2246, + "step": 5067 + }, + { + "epoch": 3.2655922643029816, + "grad_norm": 1.3404831376941257, + "learning_rate": 9.897185454113163e-05, + "loss": 3.8265, + "step": 5068 + }, + { + "epoch": 3.2662369057211924, + "grad_norm": 1.333913440045542, + "learning_rate": 9.8971447763978e-05, + "loss": 3.2705, + "step": 5069 + }, + { + "epoch": 3.2668815471394037, + "grad_norm": 1.4619015460078835, + "learning_rate": 9.897104090721609e-05, + "loss": 3.4399, + "step": 5070 + }, + { + "epoch": 3.267526188557615, + "grad_norm": 1.6612726543561245, + "learning_rate": 9.897063397084656e-05, + "loss": 3.4115, + "step": 5071 + }, + { + "epoch": 3.268170829975826, + "grad_norm": 1.376836464108808, + "learning_rate": 9.897022695487011e-05, + "loss": 3.4794, + "step": 5072 + }, + { + "epoch": 3.268815471394037, + "grad_norm": 1.5813058864172054, + "learning_rate": 9.896981985928737e-05, + "loss": 3.5306, + "step": 5073 + }, + { + "epoch": 3.269460112812248, + "grad_norm": 1.6281549570844116, + "learning_rate": 9.896941268409903e-05, + "loss": 3.5717, + "step": 5074 + }, + { + "epoch": 3.2701047542304593, + "grad_norm": 1.380700082355793, + "learning_rate": 9.896900542930579e-05, + "loss": 3.5514, + "step": 5075 + }, + { + "epoch": 3.2707493956486706, + "grad_norm": 2.153960091873715, + "learning_rate": 9.896859809490825e-05, + "loss": 3.4174, + "step": 5076 + }, + { + "epoch": 3.2713940370668815, + "grad_norm": 1.3607065239013847, + "learning_rate": 9.896819068090713e-05, + "loss": 3.7374, + "step": 5077 + }, + { + "epoch": 3.2720386784850928, + "grad_norm": 1.6463542086547802, + "learning_rate": 9.896778318730309e-05, + "loss": 3.6037, + "step": 5078 + }, + { + "epoch": 3.2726833199033036, + "grad_norm": 1.8451639062483647, + "learning_rate": 9.89673756140968e-05, + "loss": 3.5166, + "step": 5079 + }, + { + "epoch": 3.273327961321515, + "grad_norm": 1.403237119017231, + "learning_rate": 9.896696796128891e-05, + "loss": 3.3511, + "step": 5080 + }, + { + "epoch": 3.2739726027397262, + "grad_norm": 1.7984759203251945, + "learning_rate": 9.89665602288801e-05, + "loss": 3.5789, + "step": 5081 + }, + { + "epoch": 3.274617244157937, + "grad_norm": 1.4790429846891706, + "learning_rate": 9.896615241687106e-05, + "loss": 3.3944, + "step": 5082 + }, + { + "epoch": 3.2752618855761484, + "grad_norm": 1.5888105872825038, + "learning_rate": 9.896574452526244e-05, + "loss": 3.5445, + "step": 5083 + }, + { + "epoch": 3.2759065269943592, + "grad_norm": 1.5338064058917078, + "learning_rate": 9.89653365540549e-05, + "loss": 3.8761, + "step": 5084 + }, + { + "epoch": 3.2765511684125705, + "grad_norm": 1.5577918661239636, + "learning_rate": 9.896492850324914e-05, + "loss": 3.7754, + "step": 5085 + }, + { + "epoch": 3.277195809830782, + "grad_norm": 1.2021685644002587, + "learning_rate": 9.896452037284579e-05, + "loss": 3.5003, + "step": 5086 + }, + { + "epoch": 3.2778404512489927, + "grad_norm": 1.71565034670987, + "learning_rate": 9.896411216284555e-05, + "loss": 3.507, + "step": 5087 + }, + { + "epoch": 3.278485092667204, + "grad_norm": 1.5143275822637812, + "learning_rate": 9.896370387324911e-05, + "loss": 3.7688, + "step": 5088 + }, + { + "epoch": 3.279129734085415, + "grad_norm": 1.1929685913297612, + "learning_rate": 9.896329550405708e-05, + "loss": 3.5087, + "step": 5089 + }, + { + "epoch": 3.279774375503626, + "grad_norm": 1.2897914743412586, + "learning_rate": 9.896288705527019e-05, + "loss": 3.4796, + "step": 5090 + }, + { + "epoch": 3.2804190169218375, + "grad_norm": 1.2364363975771042, + "learning_rate": 9.896247852688907e-05, + "loss": 3.6841, + "step": 5091 + }, + { + "epoch": 3.2810636583400483, + "grad_norm": 1.4686940551327927, + "learning_rate": 9.89620699189144e-05, + "loss": 3.5672, + "step": 5092 + }, + { + "epoch": 3.2817082997582596, + "grad_norm": 1.7972562099665141, + "learning_rate": 9.896166123134687e-05, + "loss": 3.6001, + "step": 5093 + }, + { + "epoch": 3.2823529411764705, + "grad_norm": 1.2291215807304148, + "learning_rate": 9.896125246418713e-05, + "loss": 3.6371, + "step": 5094 + }, + { + "epoch": 3.2829975825946818, + "grad_norm": 1.5545897170475622, + "learning_rate": 9.896084361743588e-05, + "loss": 3.9069, + "step": 5095 + }, + { + "epoch": 3.283642224012893, + "grad_norm": 1.5758002917200813, + "learning_rate": 9.896043469109375e-05, + "loss": 3.5878, + "step": 5096 + }, + { + "epoch": 3.284286865431104, + "grad_norm": 1.3034296440132775, + "learning_rate": 9.896002568516145e-05, + "loss": 3.1061, + "step": 5097 + }, + { + "epoch": 3.2849315068493152, + "grad_norm": 1.581109671640475, + "learning_rate": 9.895961659963963e-05, + "loss": 3.6126, + "step": 5098 + }, + { + "epoch": 3.285576148267526, + "grad_norm": 1.3901809458084788, + "learning_rate": 9.895920743452896e-05, + "loss": 3.5992, + "step": 5099 + }, + { + "epoch": 3.2862207896857374, + "grad_norm": 1.9103454353163238, + "learning_rate": 9.895879818983012e-05, + "loss": 3.7533, + "step": 5100 + }, + { + "epoch": 3.2862207896857374, + "eval_loss": 4.0673112869262695, + "eval_runtime": 2.9733, + "eval_samples_per_second": 33.633, + "eval_steps_per_second": 4.372, + "step": 5100 + }, + { + "epoch": 3.2868654311039482, + "grad_norm": 1.4856677912390714, + "learning_rate": 9.89583888655438e-05, + "loss": 3.6364, + "step": 5101 + }, + { + "epoch": 3.2875100725221595, + "grad_norm": 1.7051509751009577, + "learning_rate": 9.895797946167062e-05, + "loss": 3.3366, + "step": 5102 + }, + { + "epoch": 3.288154713940371, + "grad_norm": 1.668354506509251, + "learning_rate": 9.895756997821131e-05, + "loss": 3.6353, + "step": 5103 + }, + { + "epoch": 3.2887993553585817, + "grad_norm": 1.4083777293129376, + "learning_rate": 9.895716041516651e-05, + "loss": 3.5527, + "step": 5104 + }, + { + "epoch": 3.289443996776793, + "grad_norm": 1.9201134324102624, + "learning_rate": 9.89567507725369e-05, + "loss": 3.5933, + "step": 5105 + }, + { + "epoch": 3.290088638195004, + "grad_norm": 1.5580877175448982, + "learning_rate": 9.895634105032316e-05, + "loss": 3.8122, + "step": 5106 + }, + { + "epoch": 3.290733279613215, + "grad_norm": 1.5039163530698958, + "learning_rate": 9.895593124852597e-05, + "loss": 3.6387, + "step": 5107 + }, + { + "epoch": 3.2913779210314265, + "grad_norm": 1.6262539207460436, + "learning_rate": 9.895552136714597e-05, + "loss": 3.4602, + "step": 5108 + }, + { + "epoch": 3.2920225624496373, + "grad_norm": 1.4782647116100416, + "learning_rate": 9.895511140618387e-05, + "loss": 3.7971, + "step": 5109 + }, + { + "epoch": 3.2926672038678486, + "grad_norm": 2.022204322957579, + "learning_rate": 9.89547013656403e-05, + "loss": 3.5185, + "step": 5110 + }, + { + "epoch": 3.2933118452860595, + "grad_norm": 1.6167288774439041, + "learning_rate": 9.895429124551597e-05, + "loss": 3.8074, + "step": 5111 + }, + { + "epoch": 3.2939564867042708, + "grad_norm": 1.6124542884490343, + "learning_rate": 9.895388104581156e-05, + "loss": 3.3234, + "step": 5112 + }, + { + "epoch": 3.2946011281224816, + "grad_norm": 1.9412485737701397, + "learning_rate": 9.89534707665277e-05, + "loss": 3.3433, + "step": 5113 + }, + { + "epoch": 3.295245769540693, + "grad_norm": 1.8779688676082855, + "learning_rate": 9.895306040766512e-05, + "loss": 3.6408, + "step": 5114 + }, + { + "epoch": 3.2958904109589042, + "grad_norm": 1.7853613274086473, + "learning_rate": 9.895264996922445e-05, + "loss": 3.9099, + "step": 5115 + }, + { + "epoch": 3.296535052377115, + "grad_norm": 1.6420971550731769, + "learning_rate": 9.895223945120638e-05, + "loss": 3.6426, + "step": 5116 + }, + { + "epoch": 3.2971796937953264, + "grad_norm": 1.4667974257151635, + "learning_rate": 9.895182885361157e-05, + "loss": 3.8243, + "step": 5117 + }, + { + "epoch": 3.2978243352135372, + "grad_norm": 1.317936024474231, + "learning_rate": 9.895141817644071e-05, + "loss": 3.4837, + "step": 5118 + }, + { + "epoch": 3.2984689766317485, + "grad_norm": 1.5658717126334263, + "learning_rate": 9.895100741969449e-05, + "loss": 3.6594, + "step": 5119 + }, + { + "epoch": 3.29911361804996, + "grad_norm": 1.22824914105833, + "learning_rate": 9.895059658337356e-05, + "loss": 3.7596, + "step": 5120 + }, + { + "epoch": 3.2997582594681707, + "grad_norm": 1.3520682368911805, + "learning_rate": 9.89501856674786e-05, + "loss": 3.5314, + "step": 5121 + }, + { + "epoch": 3.300402900886382, + "grad_norm": 1.5389956919239836, + "learning_rate": 9.894977467201027e-05, + "loss": 3.2447, + "step": 5122 + }, + { + "epoch": 3.301047542304593, + "grad_norm": 1.61875052880202, + "learning_rate": 9.89493635969693e-05, + "loss": 3.2664, + "step": 5123 + }, + { + "epoch": 3.301692183722804, + "grad_norm": 1.5952574137428621, + "learning_rate": 9.89489524423563e-05, + "loss": 3.417, + "step": 5124 + }, + { + "epoch": 3.3023368251410155, + "grad_norm": 1.5511402402191194, + "learning_rate": 9.894854120817196e-05, + "loss": 3.8416, + "step": 5125 + }, + { + "epoch": 3.3029814665592263, + "grad_norm": 1.8160408313079748, + "learning_rate": 9.894812989441699e-05, + "loss": 3.4029, + "step": 5126 + }, + { + "epoch": 3.3036261079774376, + "grad_norm": 1.5324841527217166, + "learning_rate": 9.894771850109202e-05, + "loss": 3.8227, + "step": 5127 + }, + { + "epoch": 3.3042707493956485, + "grad_norm": 2.0176454548110945, + "learning_rate": 9.894730702819777e-05, + "loss": 3.5929, + "step": 5128 + }, + { + "epoch": 3.3049153908138598, + "grad_norm": 1.4144046382838837, + "learning_rate": 9.894689547573489e-05, + "loss": 3.5985, + "step": 5129 + }, + { + "epoch": 3.305560032232071, + "grad_norm": 2.1969148393239744, + "learning_rate": 9.894648384370405e-05, + "loss": 3.5065, + "step": 5130 + }, + { + "epoch": 3.306204673650282, + "grad_norm": 2.008550002691802, + "learning_rate": 9.894607213210595e-05, + "loss": 3.7311, + "step": 5131 + }, + { + "epoch": 3.3068493150684932, + "grad_norm": 1.622157408406542, + "learning_rate": 9.894566034094124e-05, + "loss": 3.8301, + "step": 5132 + }, + { + "epoch": 3.307493956486704, + "grad_norm": 1.6636059026698624, + "learning_rate": 9.894524847021062e-05, + "loss": 3.7529, + "step": 5133 + }, + { + "epoch": 3.3081385979049154, + "grad_norm": 1.7590450993100022, + "learning_rate": 9.894483651991475e-05, + "loss": 3.2706, + "step": 5134 + }, + { + "epoch": 3.3087832393231267, + "grad_norm": 1.6010568069798887, + "learning_rate": 9.894442449005429e-05, + "loss": 3.521, + "step": 5135 + }, + { + "epoch": 3.3094278807413375, + "grad_norm": 1.8248135856273386, + "learning_rate": 9.894401238062996e-05, + "loss": 3.688, + "step": 5136 + }, + { + "epoch": 3.310072522159549, + "grad_norm": 1.8478073438493512, + "learning_rate": 9.894360019164242e-05, + "loss": 3.9506, + "step": 5137 + }, + { + "epoch": 3.3107171635777597, + "grad_norm": 1.4120886802955497, + "learning_rate": 9.894318792309232e-05, + "loss": 3.6351, + "step": 5138 + }, + { + "epoch": 3.311361804995971, + "grad_norm": 1.598162809329037, + "learning_rate": 9.894277557498038e-05, + "loss": 3.3591, + "step": 5139 + }, + { + "epoch": 3.3120064464141823, + "grad_norm": 1.753455461140192, + "learning_rate": 9.894236314730724e-05, + "loss": 2.8933, + "step": 5140 + }, + { + "epoch": 3.312651087832393, + "grad_norm": 1.8096774420126391, + "learning_rate": 9.89419506400736e-05, + "loss": 3.5612, + "step": 5141 + }, + { + "epoch": 3.3132957292506044, + "grad_norm": 1.776704959581468, + "learning_rate": 9.894153805328015e-05, + "loss": 3.3921, + "step": 5142 + }, + { + "epoch": 3.3139403706688153, + "grad_norm": 2.1615904454000683, + "learning_rate": 9.894112538692753e-05, + "loss": 3.5772, + "step": 5143 + }, + { + "epoch": 3.3145850120870266, + "grad_norm": 1.2881507568937076, + "learning_rate": 9.894071264101644e-05, + "loss": 3.7829, + "step": 5144 + }, + { + "epoch": 3.315229653505238, + "grad_norm": 1.961062494272619, + "learning_rate": 9.894029981554755e-05, + "loss": 3.1817, + "step": 5145 + }, + { + "epoch": 3.3158742949234488, + "grad_norm": 1.408877737612926, + "learning_rate": 9.893988691052156e-05, + "loss": 3.9485, + "step": 5146 + }, + { + "epoch": 3.31651893634166, + "grad_norm": 2.1088878206789023, + "learning_rate": 9.893947392593911e-05, + "loss": 3.4627, + "step": 5147 + }, + { + "epoch": 3.317163577759871, + "grad_norm": 1.576723776527728, + "learning_rate": 9.89390608618009e-05, + "loss": 3.0771, + "step": 5148 + }, + { + "epoch": 3.317808219178082, + "grad_norm": 2.0913630018700595, + "learning_rate": 9.893864771810762e-05, + "loss": 4.0168, + "step": 5149 + }, + { + "epoch": 3.3184528605962935, + "grad_norm": 1.6203677127551959, + "learning_rate": 9.893823449485992e-05, + "loss": 3.3812, + "step": 5150 + }, + { + "epoch": 3.3190975020145044, + "grad_norm": 1.5964092637959704, + "learning_rate": 9.89378211920585e-05, + "loss": 3.2636, + "step": 5151 + }, + { + "epoch": 3.3197421434327157, + "grad_norm": 1.1723744679089982, + "learning_rate": 9.893740780970405e-05, + "loss": 3.8711, + "step": 5152 + }, + { + "epoch": 3.3203867848509265, + "grad_norm": 1.5780270391498568, + "learning_rate": 9.893699434779722e-05, + "loss": 3.7965, + "step": 5153 + }, + { + "epoch": 3.321031426269138, + "grad_norm": 1.198212781989322, + "learning_rate": 9.893658080633868e-05, + "loss": 3.8565, + "step": 5154 + }, + { + "epoch": 3.321676067687349, + "grad_norm": 1.5442311767189747, + "learning_rate": 9.893616718532916e-05, + "loss": 3.891, + "step": 5155 + }, + { + "epoch": 3.32232070910556, + "grad_norm": 1.4642025977924649, + "learning_rate": 9.893575348476929e-05, + "loss": 3.549, + "step": 5156 + }, + { + "epoch": 3.3229653505237713, + "grad_norm": 1.6153656712432203, + "learning_rate": 9.893533970465977e-05, + "loss": 3.6604, + "step": 5157 + }, + { + "epoch": 3.323609991941982, + "grad_norm": 1.3371985845140835, + "learning_rate": 9.893492584500132e-05, + "loss": 3.9834, + "step": 5158 + }, + { + "epoch": 3.3242546333601934, + "grad_norm": 1.3875250168612057, + "learning_rate": 9.893451190579453e-05, + "loss": 3.6713, + "step": 5159 + }, + { + "epoch": 3.3248992747784047, + "grad_norm": 1.1618114391597953, + "learning_rate": 9.893409788704016e-05, + "loss": 3.977, + "step": 5160 + }, + { + "epoch": 3.3255439161966156, + "grad_norm": 1.4151170439197083, + "learning_rate": 9.893368378873884e-05, + "loss": 3.3109, + "step": 5161 + }, + { + "epoch": 3.326188557614827, + "grad_norm": 1.4666399601560045, + "learning_rate": 9.893326961089127e-05, + "loss": 3.689, + "step": 5162 + }, + { + "epoch": 3.3268331990330378, + "grad_norm": 1.894406790359556, + "learning_rate": 9.893285535349812e-05, + "loss": 3.6038, + "step": 5163 + }, + { + "epoch": 3.327477840451249, + "grad_norm": 1.6788931728117504, + "learning_rate": 9.893244101656011e-05, + "loss": 3.4214, + "step": 5164 + }, + { + "epoch": 3.3281224818694604, + "grad_norm": 1.4035744739951332, + "learning_rate": 9.893202660007785e-05, + "loss": 3.4449, + "step": 5165 + }, + { + "epoch": 3.328767123287671, + "grad_norm": 1.7192789667042356, + "learning_rate": 9.893161210405209e-05, + "loss": 3.4271, + "step": 5166 + }, + { + "epoch": 3.3294117647058825, + "grad_norm": 1.1510854753878454, + "learning_rate": 9.893119752848347e-05, + "loss": 3.3604, + "step": 5167 + }, + { + "epoch": 3.3300564061240934, + "grad_norm": 1.8774709010080106, + "learning_rate": 9.893078287337267e-05, + "loss": 3.8748, + "step": 5168 + }, + { + "epoch": 3.3307010475423047, + "grad_norm": 1.3801240487648985, + "learning_rate": 9.893036813872041e-05, + "loss": 3.6317, + "step": 5169 + }, + { + "epoch": 3.3313456889605155, + "grad_norm": 1.8724757282887656, + "learning_rate": 9.892995332452731e-05, + "loss": 3.7811, + "step": 5170 + }, + { + "epoch": 3.331990330378727, + "grad_norm": 1.328611602931251, + "learning_rate": 9.89295384307941e-05, + "loss": 3.5289, + "step": 5171 + }, + { + "epoch": 3.332634971796938, + "grad_norm": 1.6335275553575477, + "learning_rate": 9.892912345752147e-05, + "loss": 3.8023, + "step": 5172 + }, + { + "epoch": 3.333279613215149, + "grad_norm": 1.6180623585192486, + "learning_rate": 9.892870840471004e-05, + "loss": 3.5124, + "step": 5173 + }, + { + "epoch": 3.3339242546333603, + "grad_norm": 1.4697164977275439, + "learning_rate": 9.892829327236056e-05, + "loss": 4.0802, + "step": 5174 + }, + { + "epoch": 3.334568896051571, + "grad_norm": 1.5860468501062968, + "learning_rate": 9.892787806047366e-05, + "loss": 3.1093, + "step": 5175 + }, + { + "epoch": 3.3352135374697824, + "grad_norm": 1.319695151942975, + "learning_rate": 9.892746276905007e-05, + "loss": 3.3472, + "step": 5176 + }, + { + "epoch": 3.3358581788879937, + "grad_norm": 1.6424347728743753, + "learning_rate": 9.892704739809043e-05, + "loss": 3.337, + "step": 5177 + }, + { + "epoch": 3.3365028203062046, + "grad_norm": 1.3146398566257156, + "learning_rate": 9.892663194759544e-05, + "loss": 3.7039, + "step": 5178 + }, + { + "epoch": 3.337147461724416, + "grad_norm": 1.3442438530953331, + "learning_rate": 9.892621641756579e-05, + "loss": 3.9441, + "step": 5179 + }, + { + "epoch": 3.3377921031426268, + "grad_norm": 1.1655205607747359, + "learning_rate": 9.892580080800214e-05, + "loss": 3.3405, + "step": 5180 + }, + { + "epoch": 3.338436744560838, + "grad_norm": 1.6153648138268542, + "learning_rate": 9.892538511890518e-05, + "loss": 3.603, + "step": 5181 + }, + { + "epoch": 3.339081385979049, + "grad_norm": 1.4965341363788085, + "learning_rate": 9.89249693502756e-05, + "loss": 3.5658, + "step": 5182 + }, + { + "epoch": 3.33972602739726, + "grad_norm": 1.7007033308185162, + "learning_rate": 9.892455350211409e-05, + "loss": 3.2932, + "step": 5183 + }, + { + "epoch": 3.3403706688154715, + "grad_norm": 1.4898205537753353, + "learning_rate": 9.89241375744213e-05, + "loss": 3.5972, + "step": 5184 + }, + { + "epoch": 3.3410153102336824, + "grad_norm": 1.5364902114756305, + "learning_rate": 9.892372156719797e-05, + "loss": 3.7799, + "step": 5185 + }, + { + "epoch": 3.3416599516518937, + "grad_norm": 1.3157313379268054, + "learning_rate": 9.892330548044473e-05, + "loss": 3.6635, + "step": 5186 + }, + { + "epoch": 3.3423045930701045, + "grad_norm": 1.313963373236741, + "learning_rate": 9.89228893141623e-05, + "loss": 3.3411, + "step": 5187 + }, + { + "epoch": 3.342949234488316, + "grad_norm": 1.5951544371943804, + "learning_rate": 9.892247306835132e-05, + "loss": 3.581, + "step": 5188 + }, + { + "epoch": 3.343593875906527, + "grad_norm": 1.5202098726192976, + "learning_rate": 9.892205674301251e-05, + "loss": 3.8486, + "step": 5189 + }, + { + "epoch": 3.344238517324738, + "grad_norm": 1.3927894397424991, + "learning_rate": 9.892164033814656e-05, + "loss": 3.7279, + "step": 5190 + }, + { + "epoch": 3.3448831587429493, + "grad_norm": 1.7471862226593224, + "learning_rate": 9.892122385375412e-05, + "loss": 3.5633, + "step": 5191 + }, + { + "epoch": 3.34552780016116, + "grad_norm": 2.06647997516481, + "learning_rate": 9.892080728983591e-05, + "loss": 3.3277, + "step": 5192 + }, + { + "epoch": 3.3461724415793714, + "grad_norm": 1.8394472046976167, + "learning_rate": 9.892039064639259e-05, + "loss": 4.0901, + "step": 5193 + }, + { + "epoch": 3.3468170829975827, + "grad_norm": 1.3511175654787426, + "learning_rate": 9.891997392342486e-05, + "loss": 3.7068, + "step": 5194 + }, + { + "epoch": 3.3474617244157936, + "grad_norm": 1.6419514772989918, + "learning_rate": 9.891955712093338e-05, + "loss": 3.6054, + "step": 5195 + }, + { + "epoch": 3.348106365834005, + "grad_norm": 1.607372740477171, + "learning_rate": 9.891914023891885e-05, + "loss": 3.7296, + "step": 5196 + }, + { + "epoch": 3.3487510072522158, + "grad_norm": 1.4624458736976584, + "learning_rate": 9.891872327738196e-05, + "loss": 3.424, + "step": 5197 + }, + { + "epoch": 3.349395648670427, + "grad_norm": 1.8311454016793398, + "learning_rate": 9.891830623632339e-05, + "loss": 3.6283, + "step": 5198 + }, + { + "epoch": 3.3500402900886384, + "grad_norm": 1.327298553607268, + "learning_rate": 9.891788911574383e-05, + "loss": 3.7036, + "step": 5199 + }, + { + "epoch": 3.350684931506849, + "grad_norm": 1.6134021375418113, + "learning_rate": 9.891747191564395e-05, + "loss": 3.5526, + "step": 5200 + }, + { + "epoch": 3.350684931506849, + "eval_loss": 4.040228843688965, + "eval_runtime": 2.9803, + "eval_samples_per_second": 33.554, + "eval_steps_per_second": 4.362, + "step": 5200 + }, + { + "epoch": 3.3513295729250605, + "grad_norm": 1.4755703985573996, + "learning_rate": 9.891705463602443e-05, + "loss": 3.4909, + "step": 5201 + }, + { + "epoch": 3.3519742143432714, + "grad_norm": 1.7106375539040521, + "learning_rate": 9.891663727688599e-05, + "loss": 3.6764, + "step": 5202 + }, + { + "epoch": 3.3526188557614827, + "grad_norm": 1.862350018020734, + "learning_rate": 9.891621983822929e-05, + "loss": 3.4913, + "step": 5203 + }, + { + "epoch": 3.353263497179694, + "grad_norm": 1.3632685020730226, + "learning_rate": 9.891580232005502e-05, + "loss": 3.6067, + "step": 5204 + }, + { + "epoch": 3.353908138597905, + "grad_norm": 1.837424086213715, + "learning_rate": 9.891538472236387e-05, + "loss": 3.2272, + "step": 5205 + }, + { + "epoch": 3.354552780016116, + "grad_norm": 1.7153737042099308, + "learning_rate": 9.891496704515651e-05, + "loss": 3.6456, + "step": 5206 + }, + { + "epoch": 3.355197421434327, + "grad_norm": 2.0801654591574525, + "learning_rate": 9.891454928843366e-05, + "loss": 3.4083, + "step": 5207 + }, + { + "epoch": 3.3558420628525383, + "grad_norm": 1.5860842190679623, + "learning_rate": 9.891413145219597e-05, + "loss": 3.9773, + "step": 5208 + }, + { + "epoch": 3.3564867042707496, + "grad_norm": 1.546031946151899, + "learning_rate": 9.891371353644414e-05, + "loss": 3.8751, + "step": 5209 + }, + { + "epoch": 3.3571313456889604, + "grad_norm": 1.9854676212122522, + "learning_rate": 9.891329554117886e-05, + "loss": 4.0784, + "step": 5210 + }, + { + "epoch": 3.3577759871071717, + "grad_norm": 1.354873383739495, + "learning_rate": 9.891287746640081e-05, + "loss": 3.6944, + "step": 5211 + }, + { + "epoch": 3.3584206285253826, + "grad_norm": 1.8036987650048593, + "learning_rate": 9.891245931211069e-05, + "loss": 3.5313, + "step": 5212 + }, + { + "epoch": 3.359065269943594, + "grad_norm": 1.9491327705425343, + "learning_rate": 9.891204107830917e-05, + "loss": 3.6019, + "step": 5213 + }, + { + "epoch": 3.359709911361805, + "grad_norm": 1.545169198229476, + "learning_rate": 9.891162276499694e-05, + "loss": 4.0369, + "step": 5214 + }, + { + "epoch": 3.360354552780016, + "grad_norm": 1.7317480055259766, + "learning_rate": 9.89112043721747e-05, + "loss": 3.6333, + "step": 5215 + }, + { + "epoch": 3.3609991941982273, + "grad_norm": 1.794757282410069, + "learning_rate": 9.891078589984313e-05, + "loss": 3.7917, + "step": 5216 + }, + { + "epoch": 3.361643835616438, + "grad_norm": 1.4161091738457363, + "learning_rate": 9.891036734800291e-05, + "loss": 3.5766, + "step": 5217 + }, + { + "epoch": 3.3622884770346495, + "grad_norm": 1.6277464832505308, + "learning_rate": 9.890994871665473e-05, + "loss": 3.2434, + "step": 5218 + }, + { + "epoch": 3.362933118452861, + "grad_norm": 1.5029042432024458, + "learning_rate": 9.890953000579928e-05, + "loss": 3.6526, + "step": 5219 + }, + { + "epoch": 3.3635777598710717, + "grad_norm": 1.4488475401999361, + "learning_rate": 9.890911121543725e-05, + "loss": 3.2099, + "step": 5220 + }, + { + "epoch": 3.364222401289283, + "grad_norm": 1.5385540867292198, + "learning_rate": 9.890869234556933e-05, + "loss": 3.6547, + "step": 5221 + }, + { + "epoch": 3.364867042707494, + "grad_norm": 1.4270793299116618, + "learning_rate": 9.89082733961962e-05, + "loss": 3.4847, + "step": 5222 + }, + { + "epoch": 3.365511684125705, + "grad_norm": 1.3608376783902005, + "learning_rate": 9.890785436731856e-05, + "loss": 3.6058, + "step": 5223 + }, + { + "epoch": 3.3661563255439164, + "grad_norm": 1.417672567753657, + "learning_rate": 9.890743525893707e-05, + "loss": 3.9984, + "step": 5224 + }, + { + "epoch": 3.3668009669621273, + "grad_norm": 1.4269366852492715, + "learning_rate": 9.890701607105245e-05, + "loss": 3.4244, + "step": 5225 + }, + { + "epoch": 3.3674456083803386, + "grad_norm": 1.4559067774075334, + "learning_rate": 9.890659680366537e-05, + "loss": 3.5748, + "step": 5226 + }, + { + "epoch": 3.3680902497985494, + "grad_norm": 1.4762087252154128, + "learning_rate": 9.890617745677655e-05, + "loss": 3.8727, + "step": 5227 + }, + { + "epoch": 3.3687348912167607, + "grad_norm": 1.5759036203043146, + "learning_rate": 9.890575803038663e-05, + "loss": 3.7587, + "step": 5228 + }, + { + "epoch": 3.369379532634972, + "grad_norm": 1.3464924812428283, + "learning_rate": 9.890533852449633e-05, + "loss": 3.3947, + "step": 5229 + }, + { + "epoch": 3.370024174053183, + "grad_norm": 1.5196083263312956, + "learning_rate": 9.890491893910634e-05, + "loss": 3.4972, + "step": 5230 + }, + { + "epoch": 3.370668815471394, + "grad_norm": 1.6805031350961113, + "learning_rate": 9.890449927421732e-05, + "loss": 3.3667, + "step": 5231 + }, + { + "epoch": 3.371313456889605, + "grad_norm": 1.47833753581709, + "learning_rate": 9.890407952983e-05, + "loss": 3.4803, + "step": 5232 + }, + { + "epoch": 3.3719580983078163, + "grad_norm": 1.5460778963564128, + "learning_rate": 9.890365970594504e-05, + "loss": 3.4442, + "step": 5233 + }, + { + "epoch": 3.3726027397260276, + "grad_norm": 1.4368390261122002, + "learning_rate": 9.890323980256314e-05, + "loss": 3.7061, + "step": 5234 + }, + { + "epoch": 3.3732473811442385, + "grad_norm": 2.2473679481035624, + "learning_rate": 9.8902819819685e-05, + "loss": 3.795, + "step": 5235 + }, + { + "epoch": 3.37389202256245, + "grad_norm": 1.6053547420499863, + "learning_rate": 9.890239975731129e-05, + "loss": 3.7788, + "step": 5236 + }, + { + "epoch": 3.3745366639806607, + "grad_norm": 2.158304147870326, + "learning_rate": 9.89019796154427e-05, + "loss": 3.4373, + "step": 5237 + }, + { + "epoch": 3.375181305398872, + "grad_norm": 1.53770760811663, + "learning_rate": 9.890155939407992e-05, + "loss": 3.6283, + "step": 5238 + }, + { + "epoch": 3.375825946817083, + "grad_norm": 1.6861789920669221, + "learning_rate": 9.890113909322368e-05, + "loss": 3.3482, + "step": 5239 + }, + { + "epoch": 3.376470588235294, + "grad_norm": 1.8033764488218296, + "learning_rate": 9.890071871287463e-05, + "loss": 3.6733, + "step": 5240 + }, + { + "epoch": 3.3771152296535054, + "grad_norm": 1.7036361471343788, + "learning_rate": 9.890029825303345e-05, + "loss": 3.5432, + "step": 5241 + }, + { + "epoch": 3.3777598710717163, + "grad_norm": 1.383179984279932, + "learning_rate": 9.889987771370088e-05, + "loss": 3.5372, + "step": 5242 + }, + { + "epoch": 3.3784045124899276, + "grad_norm": 1.694434042288888, + "learning_rate": 9.889945709487756e-05, + "loss": 3.6515, + "step": 5243 + }, + { + "epoch": 3.3790491539081384, + "grad_norm": 1.6526113282832442, + "learning_rate": 9.88990363965642e-05, + "loss": 4.0145, + "step": 5244 + }, + { + "epoch": 3.3796937953263497, + "grad_norm": 1.5856133250801998, + "learning_rate": 9.889861561876151e-05, + "loss": 3.4411, + "step": 5245 + }, + { + "epoch": 3.380338436744561, + "grad_norm": 1.470841673632583, + "learning_rate": 9.889819476147014e-05, + "loss": 3.707, + "step": 5246 + }, + { + "epoch": 3.380983078162772, + "grad_norm": 1.6121057751821088, + "learning_rate": 9.889777382469082e-05, + "loss": 3.455, + "step": 5247 + }, + { + "epoch": 3.381627719580983, + "grad_norm": 1.2722487527355495, + "learning_rate": 9.889735280842422e-05, + "loss": 3.8504, + "step": 5248 + }, + { + "epoch": 3.382272360999194, + "grad_norm": 1.4472727835079886, + "learning_rate": 9.889693171267104e-05, + "loss": 3.5234, + "step": 5249 + }, + { + "epoch": 3.3829170024174053, + "grad_norm": 1.724379794510634, + "learning_rate": 9.889651053743197e-05, + "loss": 3.3872, + "step": 5250 + }, + { + "epoch": 3.383561643835616, + "grad_norm": 1.6490559446466844, + "learning_rate": 9.88960892827077e-05, + "loss": 3.5502, + "step": 5251 + }, + { + "epoch": 3.3842062852538275, + "grad_norm": 1.4453651666681286, + "learning_rate": 9.889566794849894e-05, + "loss": 3.9086, + "step": 5252 + }, + { + "epoch": 3.384850926672039, + "grad_norm": 1.4110844348943352, + "learning_rate": 9.889524653480633e-05, + "loss": 3.4078, + "step": 5253 + }, + { + "epoch": 3.3854955680902497, + "grad_norm": 1.4581919036850115, + "learning_rate": 9.889482504163063e-05, + "loss": 3.6881, + "step": 5254 + }, + { + "epoch": 3.386140209508461, + "grad_norm": 1.3296166358432266, + "learning_rate": 9.889440346897247e-05, + "loss": 4.064, + "step": 5255 + }, + { + "epoch": 3.386784850926672, + "grad_norm": 1.234812916234829, + "learning_rate": 9.889398181683259e-05, + "loss": 3.6041, + "step": 5256 + }, + { + "epoch": 3.387429492344883, + "grad_norm": 1.70874873829662, + "learning_rate": 9.889356008521164e-05, + "loss": 3.6762, + "step": 5257 + }, + { + "epoch": 3.3880741337630944, + "grad_norm": 1.3807473968164694, + "learning_rate": 9.889313827411037e-05, + "loss": 3.6103, + "step": 5258 + }, + { + "epoch": 3.3887187751813053, + "grad_norm": 1.630214868255609, + "learning_rate": 9.889271638352943e-05, + "loss": 3.7774, + "step": 5259 + }, + { + "epoch": 3.3893634165995166, + "grad_norm": 1.397494810438936, + "learning_rate": 9.889229441346952e-05, + "loss": 3.4861, + "step": 5260 + }, + { + "epoch": 3.3900080580177274, + "grad_norm": 1.405838766470712, + "learning_rate": 9.889187236393132e-05, + "loss": 3.2071, + "step": 5261 + }, + { + "epoch": 3.3906526994359387, + "grad_norm": 1.4714783410378935, + "learning_rate": 9.889145023491555e-05, + "loss": 3.7276, + "step": 5262 + }, + { + "epoch": 3.39129734085415, + "grad_norm": 1.6848052367601485, + "learning_rate": 9.88910280264229e-05, + "loss": 3.4896, + "step": 5263 + }, + { + "epoch": 3.391941982272361, + "grad_norm": 1.3880988382383181, + "learning_rate": 9.889060573845406e-05, + "loss": 3.6752, + "step": 5264 + }, + { + "epoch": 3.392586623690572, + "grad_norm": 1.600841616961636, + "learning_rate": 9.88901833710097e-05, + "loss": 3.3689, + "step": 5265 + }, + { + "epoch": 3.393231265108783, + "grad_norm": 1.6795839057392243, + "learning_rate": 9.888976092409052e-05, + "loss": 3.8304, + "step": 5266 + }, + { + "epoch": 3.3938759065269943, + "grad_norm": 1.3781807271776871, + "learning_rate": 9.888933839769727e-05, + "loss": 3.477, + "step": 5267 + }, + { + "epoch": 3.3945205479452056, + "grad_norm": 2.2619331749273806, + "learning_rate": 9.888891579183056e-05, + "loss": 3.7698, + "step": 5268 + }, + { + "epoch": 3.3951651893634165, + "grad_norm": 1.245135777831872, + "learning_rate": 9.888849310649114e-05, + "loss": 3.6008, + "step": 5269 + }, + { + "epoch": 3.395809830781628, + "grad_norm": 2.428541030700998, + "learning_rate": 9.888807034167968e-05, + "loss": 3.6206, + "step": 5270 + }, + { + "epoch": 3.3964544721998386, + "grad_norm": 1.6748419449255465, + "learning_rate": 9.88876474973969e-05, + "loss": 3.6408, + "step": 5271 + }, + { + "epoch": 3.39709911361805, + "grad_norm": 1.7639142728665143, + "learning_rate": 9.888722457364347e-05, + "loss": 3.467, + "step": 5272 + }, + { + "epoch": 3.3977437550362612, + "grad_norm": 1.8410783051041761, + "learning_rate": 9.888680157042008e-05, + "loss": 3.4923, + "step": 5273 + }, + { + "epoch": 3.398388396454472, + "grad_norm": 1.401221919327131, + "learning_rate": 9.888637848772745e-05, + "loss": 3.5052, + "step": 5274 + }, + { + "epoch": 3.3990330378726834, + "grad_norm": 1.7747818510131719, + "learning_rate": 9.888595532556627e-05, + "loss": 3.5726, + "step": 5275 + }, + { + "epoch": 3.3996776792908943, + "grad_norm": 1.3964281926186217, + "learning_rate": 9.888553208393721e-05, + "loss": 3.7886, + "step": 5276 + }, + { + "epoch": 3.4003223207091056, + "grad_norm": 2.1904430819430396, + "learning_rate": 9.888510876284098e-05, + "loss": 3.4412, + "step": 5277 + }, + { + "epoch": 3.400966962127317, + "grad_norm": 1.3065693533428795, + "learning_rate": 9.888468536227828e-05, + "loss": 3.2969, + "step": 5278 + }, + { + "epoch": 3.4016116035455277, + "grad_norm": 2.7812375747726787, + "learning_rate": 9.888426188224981e-05, + "loss": 3.1946, + "step": 5279 + }, + { + "epoch": 3.402256244963739, + "grad_norm": 1.9444979948077958, + "learning_rate": 9.888383832275626e-05, + "loss": 4.2147, + "step": 5280 + }, + { + "epoch": 3.40290088638195, + "grad_norm": 1.9726597421285985, + "learning_rate": 9.888341468379831e-05, + "loss": 3.4371, + "step": 5281 + }, + { + "epoch": 3.403545527800161, + "grad_norm": 2.02994736748691, + "learning_rate": 9.888299096537669e-05, + "loss": 3.6051, + "step": 5282 + }, + { + "epoch": 3.4041901692183725, + "grad_norm": 1.5763283650413178, + "learning_rate": 9.888256716749204e-05, + "loss": 3.9108, + "step": 5283 + }, + { + "epoch": 3.4048348106365833, + "grad_norm": 1.7329133773945213, + "learning_rate": 9.888214329014512e-05, + "loss": 3.6726, + "step": 5284 + }, + { + "epoch": 3.4054794520547946, + "grad_norm": 1.8140072517251034, + "learning_rate": 9.888171933333658e-05, + "loss": 3.4583, + "step": 5285 + }, + { + "epoch": 3.4061240934730055, + "grad_norm": 1.8156106156040726, + "learning_rate": 9.888129529706714e-05, + "loss": 3.7639, + "step": 5286 + }, + { + "epoch": 3.406768734891217, + "grad_norm": 1.527077361771665, + "learning_rate": 9.88808711813375e-05, + "loss": 3.3497, + "step": 5287 + }, + { + "epoch": 3.407413376309428, + "grad_norm": 1.4445718385355502, + "learning_rate": 9.888044698614835e-05, + "loss": 3.562, + "step": 5288 + }, + { + "epoch": 3.408058017727639, + "grad_norm": 1.531480591833028, + "learning_rate": 9.888002271150037e-05, + "loss": 3.7077, + "step": 5289 + }, + { + "epoch": 3.4087026591458502, + "grad_norm": 1.2970545759609848, + "learning_rate": 9.887959835739427e-05, + "loss": 3.5806, + "step": 5290 + }, + { + "epoch": 3.409347300564061, + "grad_norm": 1.5652936170019296, + "learning_rate": 9.887917392383075e-05, + "loss": 3.776, + "step": 5291 + }, + { + "epoch": 3.4099919419822724, + "grad_norm": 1.2794176568201248, + "learning_rate": 9.887874941081052e-05, + "loss": 3.6666, + "step": 5292 + }, + { + "epoch": 3.4106365834004837, + "grad_norm": 1.4060516715524838, + "learning_rate": 9.887832481833423e-05, + "loss": 3.4825, + "step": 5293 + }, + { + "epoch": 3.4112812248186946, + "grad_norm": 1.3489354315988067, + "learning_rate": 9.887790014640262e-05, + "loss": 3.683, + "step": 5294 + }, + { + "epoch": 3.411925866236906, + "grad_norm": 1.651850534036196, + "learning_rate": 9.887747539501637e-05, + "loss": 3.4076, + "step": 5295 + }, + { + "epoch": 3.4125705076551167, + "grad_norm": 1.6130279756285284, + "learning_rate": 9.887705056417621e-05, + "loss": 3.3735, + "step": 5296 + }, + { + "epoch": 3.413215149073328, + "grad_norm": 1.6187424625869964, + "learning_rate": 9.887662565388278e-05, + "loss": 3.7451, + "step": 5297 + }, + { + "epoch": 3.4138597904915393, + "grad_norm": 1.4518550822917855, + "learning_rate": 9.887620066413682e-05, + "loss": 3.414, + "step": 5298 + }, + { + "epoch": 3.41450443190975, + "grad_norm": 1.503664166087311, + "learning_rate": 9.887577559493904e-05, + "loss": 3.6067, + "step": 5299 + }, + { + "epoch": 3.4151490733279615, + "grad_norm": 1.5489717163003158, + "learning_rate": 9.887535044629009e-05, + "loss": 3.3366, + "step": 5300 + }, + { + "epoch": 3.4151490733279615, + "eval_loss": 4.063844680786133, + "eval_runtime": 3.0256, + "eval_samples_per_second": 33.051, + "eval_steps_per_second": 4.297, + "step": 5300 + }, + { + "epoch": 3.4157937147461723, + "grad_norm": 1.6349448241734594, + "learning_rate": 9.887492521819069e-05, + "loss": 3.8948, + "step": 5301 + }, + { + "epoch": 3.4164383561643836, + "grad_norm": 1.4879270148176695, + "learning_rate": 9.887449991064154e-05, + "loss": 3.6832, + "step": 5302 + }, + { + "epoch": 3.417082997582595, + "grad_norm": 1.6774871192793956, + "learning_rate": 9.887407452364336e-05, + "loss": 3.6853, + "step": 5303 + }, + { + "epoch": 3.417727639000806, + "grad_norm": 1.6685847727321272, + "learning_rate": 9.887364905719682e-05, + "loss": 3.7463, + "step": 5304 + }, + { + "epoch": 3.418372280419017, + "grad_norm": 2.0001460703499374, + "learning_rate": 9.887322351130263e-05, + "loss": 3.6776, + "step": 5305 + }, + { + "epoch": 3.419016921837228, + "grad_norm": 1.7217005284281706, + "learning_rate": 9.88727978859615e-05, + "loss": 3.5021, + "step": 5306 + }, + { + "epoch": 3.4196615632554392, + "grad_norm": 2.023376190029713, + "learning_rate": 9.88723721811741e-05, + "loss": 3.6884, + "step": 5307 + }, + { + "epoch": 3.42030620467365, + "grad_norm": 1.458903380701255, + "learning_rate": 9.887194639694116e-05, + "loss": 3.5278, + "step": 5308 + }, + { + "epoch": 3.4209508460918614, + "grad_norm": 1.8878974140730187, + "learning_rate": 9.887152053326334e-05, + "loss": 3.6554, + "step": 5309 + }, + { + "epoch": 3.4215954875100727, + "grad_norm": 1.5754639928727274, + "learning_rate": 9.887109459014138e-05, + "loss": 3.5777, + "step": 5310 + }, + { + "epoch": 3.4222401289282836, + "grad_norm": 1.5572368025092136, + "learning_rate": 9.887066856757597e-05, + "loss": 3.762, + "step": 5311 + }, + { + "epoch": 3.422884770346495, + "grad_norm": 1.6201635593189085, + "learning_rate": 9.887024246556777e-05, + "loss": 3.6072, + "step": 5312 + }, + { + "epoch": 3.4235294117647057, + "grad_norm": 1.4942334225539242, + "learning_rate": 9.886981628411755e-05, + "loss": 3.6848, + "step": 5313 + }, + { + "epoch": 3.424174053182917, + "grad_norm": 2.315153246477206, + "learning_rate": 9.886939002322593e-05, + "loss": 3.8008, + "step": 5314 + }, + { + "epoch": 3.4248186946011283, + "grad_norm": 1.4841714296355517, + "learning_rate": 9.88689636828937e-05, + "loss": 3.8458, + "step": 5315 + }, + { + "epoch": 3.425463336019339, + "grad_norm": 1.8704634131465547, + "learning_rate": 9.88685372631215e-05, + "loss": 3.3755, + "step": 5316 + }, + { + "epoch": 3.4261079774375505, + "grad_norm": 1.6177479254299743, + "learning_rate": 9.886811076391002e-05, + "loss": 3.7566, + "step": 5317 + }, + { + "epoch": 3.4267526188557613, + "grad_norm": 1.622234758912864, + "learning_rate": 9.886768418526002e-05, + "loss": 3.6848, + "step": 5318 + }, + { + "epoch": 3.4273972602739726, + "grad_norm": 1.5764191198307331, + "learning_rate": 9.886725752717213e-05, + "loss": 3.6751, + "step": 5319 + }, + { + "epoch": 3.4280419016921835, + "grad_norm": 1.3993847803662522, + "learning_rate": 9.886683078964712e-05, + "loss": 3.5261, + "step": 5320 + }, + { + "epoch": 3.428686543110395, + "grad_norm": 1.5224270671006002, + "learning_rate": 9.886640397268562e-05, + "loss": 3.6911, + "step": 5321 + }, + { + "epoch": 3.429331184528606, + "grad_norm": 1.2956327292774825, + "learning_rate": 9.886597707628839e-05, + "loss": 3.3715, + "step": 5322 + }, + { + "epoch": 3.429975825946817, + "grad_norm": 1.548038022436915, + "learning_rate": 9.88655501004561e-05, + "loss": 3.6155, + "step": 5323 + }, + { + "epoch": 3.4306204673650282, + "grad_norm": 1.339843027447737, + "learning_rate": 9.886512304518946e-05, + "loss": 3.9222, + "step": 5324 + }, + { + "epoch": 3.431265108783239, + "grad_norm": 2.2090127782708993, + "learning_rate": 9.886469591048917e-05, + "loss": 3.6685, + "step": 5325 + }, + { + "epoch": 3.4319097502014504, + "grad_norm": 1.1833346209714195, + "learning_rate": 9.886426869635593e-05, + "loss": 3.7833, + "step": 5326 + }, + { + "epoch": 3.4325543916196617, + "grad_norm": 1.9402623282237823, + "learning_rate": 9.886384140279046e-05, + "loss": 3.8936, + "step": 5327 + }, + { + "epoch": 3.4331990330378725, + "grad_norm": 1.7779700980844126, + "learning_rate": 9.886341402979343e-05, + "loss": 3.6076, + "step": 5328 + }, + { + "epoch": 3.433843674456084, + "grad_norm": 1.5237172201694713, + "learning_rate": 9.886298657736556e-05, + "loss": 3.974, + "step": 5329 + }, + { + "epoch": 3.4344883158742947, + "grad_norm": 1.4533590404059955, + "learning_rate": 9.886255904550755e-05, + "loss": 3.6968, + "step": 5330 + }, + { + "epoch": 3.435132957292506, + "grad_norm": 2.0507729177582985, + "learning_rate": 9.886213143422012e-05, + "loss": 3.7326, + "step": 5331 + }, + { + "epoch": 3.4357775987107173, + "grad_norm": 1.9993310599844116, + "learning_rate": 9.886170374350392e-05, + "loss": 3.9134, + "step": 5332 + }, + { + "epoch": 3.436422240128928, + "grad_norm": 1.2761512927108025, + "learning_rate": 9.886127597335971e-05, + "loss": 3.5501, + "step": 5333 + }, + { + "epoch": 3.4370668815471395, + "grad_norm": 1.9038854470324635, + "learning_rate": 9.886084812378817e-05, + "loss": 3.4096, + "step": 5334 + }, + { + "epoch": 3.4377115229653503, + "grad_norm": 1.4139925352892453, + "learning_rate": 9.886042019479e-05, + "loss": 3.5087, + "step": 5335 + }, + { + "epoch": 3.4383561643835616, + "grad_norm": 1.3493455976219686, + "learning_rate": 9.885999218636589e-05, + "loss": 3.4649, + "step": 5336 + }, + { + "epoch": 3.439000805801773, + "grad_norm": 1.5974611424154341, + "learning_rate": 9.885956409851657e-05, + "loss": 3.4083, + "step": 5337 + }, + { + "epoch": 3.4396454472199838, + "grad_norm": 1.6848275102890788, + "learning_rate": 9.885913593124274e-05, + "loss": 3.8208, + "step": 5338 + }, + { + "epoch": 3.440290088638195, + "grad_norm": 1.2462376594961821, + "learning_rate": 9.885870768454507e-05, + "loss": 3.5567, + "step": 5339 + }, + { + "epoch": 3.440934730056406, + "grad_norm": 1.6762706346546836, + "learning_rate": 9.88582793584243e-05, + "loss": 3.3661, + "step": 5340 + }, + { + "epoch": 3.4415793714746172, + "grad_norm": 1.941162537638197, + "learning_rate": 9.885785095288114e-05, + "loss": 3.851, + "step": 5341 + }, + { + "epoch": 3.4422240128928285, + "grad_norm": 1.340765278066314, + "learning_rate": 9.885742246791623e-05, + "loss": 3.7914, + "step": 5342 + }, + { + "epoch": 3.4428686543110394, + "grad_norm": 1.6595114962096855, + "learning_rate": 9.885699390353035e-05, + "loss": 3.7386, + "step": 5343 + }, + { + "epoch": 3.4435132957292507, + "grad_norm": 1.2812252018644699, + "learning_rate": 9.885656525972416e-05, + "loss": 3.5585, + "step": 5344 + }, + { + "epoch": 3.4441579371474615, + "grad_norm": 1.5566848661891741, + "learning_rate": 9.88561365364984e-05, + "loss": 3.9011, + "step": 5345 + }, + { + "epoch": 3.444802578565673, + "grad_norm": 1.7856036793300967, + "learning_rate": 9.885570773385371e-05, + "loss": 3.7387, + "step": 5346 + }, + { + "epoch": 3.445447219983884, + "grad_norm": 1.4731806111642778, + "learning_rate": 9.885527885179086e-05, + "loss": 3.6297, + "step": 5347 + }, + { + "epoch": 3.446091861402095, + "grad_norm": 1.7052582642501033, + "learning_rate": 9.885484989031054e-05, + "loss": 3.8273, + "step": 5348 + }, + { + "epoch": 3.4467365028203063, + "grad_norm": 1.411596658732839, + "learning_rate": 9.885442084941344e-05, + "loss": 3.9729, + "step": 5349 + }, + { + "epoch": 3.447381144238517, + "grad_norm": 1.477167800688154, + "learning_rate": 9.885399172910025e-05, + "loss": 3.8324, + "step": 5350 + }, + { + "epoch": 3.4480257856567285, + "grad_norm": 1.3175408260569441, + "learning_rate": 9.885356252937169e-05, + "loss": 3.4058, + "step": 5351 + }, + { + "epoch": 3.4486704270749398, + "grad_norm": 1.574769994741785, + "learning_rate": 9.885313325022848e-05, + "loss": 3.4102, + "step": 5352 + }, + { + "epoch": 3.4493150684931506, + "grad_norm": 1.518497635813938, + "learning_rate": 9.88527038916713e-05, + "loss": 3.1486, + "step": 5353 + }, + { + "epoch": 3.449959709911362, + "grad_norm": 1.6523838815592804, + "learning_rate": 9.885227445370088e-05, + "loss": 3.613, + "step": 5354 + }, + { + "epoch": 3.4506043513295728, + "grad_norm": 1.5691101027366567, + "learning_rate": 9.88518449363179e-05, + "loss": 3.6049, + "step": 5355 + }, + { + "epoch": 3.451248992747784, + "grad_norm": 1.4901232441110146, + "learning_rate": 9.885141533952309e-05, + "loss": 3.6154, + "step": 5356 + }, + { + "epoch": 3.4518936341659954, + "grad_norm": 1.6381977935226086, + "learning_rate": 9.885098566331715e-05, + "loss": 3.8365, + "step": 5357 + }, + { + "epoch": 3.4525382755842062, + "grad_norm": 1.6393761251752634, + "learning_rate": 9.885055590770078e-05, + "loss": 3.9699, + "step": 5358 + }, + { + "epoch": 3.4531829170024175, + "grad_norm": 1.7597562114668508, + "learning_rate": 9.885012607267467e-05, + "loss": 3.5909, + "step": 5359 + }, + { + "epoch": 3.4538275584206284, + "grad_norm": 1.5204850930562575, + "learning_rate": 9.884969615823956e-05, + "loss": 3.6622, + "step": 5360 + }, + { + "epoch": 3.4544721998388397, + "grad_norm": 1.701280828892806, + "learning_rate": 9.884926616439612e-05, + "loss": 3.7517, + "step": 5361 + }, + { + "epoch": 3.455116841257051, + "grad_norm": 1.5184510246425775, + "learning_rate": 9.884883609114507e-05, + "loss": 3.5133, + "step": 5362 + }, + { + "epoch": 3.455761482675262, + "grad_norm": 1.5146683838699917, + "learning_rate": 9.884840593848714e-05, + "loss": 3.2943, + "step": 5363 + }, + { + "epoch": 3.456406124093473, + "grad_norm": 1.6834830472156501, + "learning_rate": 9.8847975706423e-05, + "loss": 3.8415, + "step": 5364 + }, + { + "epoch": 3.457050765511684, + "grad_norm": 1.4768374851589683, + "learning_rate": 9.884754539495337e-05, + "loss": 3.6667, + "step": 5365 + }, + { + "epoch": 3.4576954069298953, + "grad_norm": 2.160408267254558, + "learning_rate": 9.884711500407898e-05, + "loss": 3.6014, + "step": 5366 + }, + { + "epoch": 3.4583400483481066, + "grad_norm": 1.3130079678316744, + "learning_rate": 9.88466845338005e-05, + "loss": 3.7023, + "step": 5367 + }, + { + "epoch": 3.4589846897663175, + "grad_norm": 1.7227893902533942, + "learning_rate": 9.884625398411867e-05, + "loss": 3.7597, + "step": 5368 + }, + { + "epoch": 3.4596293311845288, + "grad_norm": 1.4659216742426961, + "learning_rate": 9.884582335503417e-05, + "loss": 3.3447, + "step": 5369 + }, + { + "epoch": 3.4602739726027396, + "grad_norm": 1.7369864598151405, + "learning_rate": 9.884539264654771e-05, + "loss": 3.6216, + "step": 5370 + }, + { + "epoch": 3.460918614020951, + "grad_norm": 2.046244387976165, + "learning_rate": 9.884496185866003e-05, + "loss": 3.5716, + "step": 5371 + }, + { + "epoch": 3.461563255439162, + "grad_norm": 1.56925187148102, + "learning_rate": 9.88445309913718e-05, + "loss": 3.3165, + "step": 5372 + }, + { + "epoch": 3.462207896857373, + "grad_norm": 1.957265374039299, + "learning_rate": 9.884410004468372e-05, + "loss": 3.5664, + "step": 5373 + }, + { + "epoch": 3.4628525382755844, + "grad_norm": 1.3591242951230726, + "learning_rate": 9.884366901859655e-05, + "loss": 3.672, + "step": 5374 + }, + { + "epoch": 3.4634971796937952, + "grad_norm": 1.5957286052841073, + "learning_rate": 9.884323791311095e-05, + "loss": 3.6526, + "step": 5375 + }, + { + "epoch": 3.4641418211120065, + "grad_norm": 1.1220879656063623, + "learning_rate": 9.884280672822765e-05, + "loss": 4.3411, + "step": 5376 + }, + { + "epoch": 3.4647864625302174, + "grad_norm": 1.9819390591935682, + "learning_rate": 9.884237546394735e-05, + "loss": 3.8404, + "step": 5377 + }, + { + "epoch": 3.4654311039484287, + "grad_norm": 1.5994155851536922, + "learning_rate": 9.884194412027077e-05, + "loss": 3.6236, + "step": 5378 + }, + { + "epoch": 3.46607574536664, + "grad_norm": 1.51939020517787, + "learning_rate": 9.88415126971986e-05, + "loss": 3.3485, + "step": 5379 + }, + { + "epoch": 3.466720386784851, + "grad_norm": 1.3262576934462513, + "learning_rate": 9.884108119473156e-05, + "loss": 3.5328, + "step": 5380 + }, + { + "epoch": 3.467365028203062, + "grad_norm": 1.682334833391114, + "learning_rate": 9.884064961287035e-05, + "loss": 3.6904, + "step": 5381 + }, + { + "epoch": 3.468009669621273, + "grad_norm": 1.4073519675258936, + "learning_rate": 9.88402179516157e-05, + "loss": 3.4902, + "step": 5382 + }, + { + "epoch": 3.4686543110394843, + "grad_norm": 1.725792813029431, + "learning_rate": 9.88397862109683e-05, + "loss": 3.4188, + "step": 5383 + }, + { + "epoch": 3.4692989524576956, + "grad_norm": 1.5236717039575405, + "learning_rate": 9.883935439092885e-05, + "loss": 3.6779, + "step": 5384 + }, + { + "epoch": 3.4699435938759065, + "grad_norm": 1.5759463687977668, + "learning_rate": 9.883892249149807e-05, + "loss": 3.6684, + "step": 5385 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 1.5284055230329403, + "learning_rate": 9.883849051267669e-05, + "loss": 3.828, + "step": 5386 + }, + { + "epoch": 3.4712328767123286, + "grad_norm": 1.7391625682913738, + "learning_rate": 9.883805845446541e-05, + "loss": 3.7577, + "step": 5387 + }, + { + "epoch": 3.47187751813054, + "grad_norm": 1.1222469349984319, + "learning_rate": 9.88376263168649e-05, + "loss": 3.6526, + "step": 5388 + }, + { + "epoch": 3.4725221595487508, + "grad_norm": 1.5650763208777665, + "learning_rate": 9.883719409987593e-05, + "loss": 3.7025, + "step": 5389 + }, + { + "epoch": 3.473166800966962, + "grad_norm": 1.3803101921060472, + "learning_rate": 9.883676180349916e-05, + "loss": 3.766, + "step": 5390 + }, + { + "epoch": 3.4738114423851734, + "grad_norm": 1.6452559187660087, + "learning_rate": 9.883632942773534e-05, + "loss": 3.8353, + "step": 5391 + }, + { + "epoch": 3.474456083803384, + "grad_norm": 1.25407899040569, + "learning_rate": 9.883589697258514e-05, + "loss": 3.3419, + "step": 5392 + }, + { + "epoch": 3.4751007252215955, + "grad_norm": 1.5293904268417484, + "learning_rate": 9.88354644380493e-05, + "loss": 3.7948, + "step": 5393 + }, + { + "epoch": 3.4757453666398064, + "grad_norm": 1.4171519531463013, + "learning_rate": 9.883503182412852e-05, + "loss": 3.9137, + "step": 5394 + }, + { + "epoch": 3.4763900080580177, + "grad_norm": 1.1925224346847836, + "learning_rate": 9.883459913082351e-05, + "loss": 3.7592, + "step": 5395 + }, + { + "epoch": 3.477034649476229, + "grad_norm": 1.5635597888258321, + "learning_rate": 9.8834166358135e-05, + "loss": 3.6806, + "step": 5396 + }, + { + "epoch": 3.47767929089444, + "grad_norm": 1.3942891431898932, + "learning_rate": 9.883373350606366e-05, + "loss": 3.9387, + "step": 5397 + }, + { + "epoch": 3.478323932312651, + "grad_norm": 1.8296318169361652, + "learning_rate": 9.883330057461024e-05, + "loss": 3.6827, + "step": 5398 + }, + { + "epoch": 3.478968573730862, + "grad_norm": 1.6258148272479847, + "learning_rate": 9.883286756377542e-05, + "loss": 3.526, + "step": 5399 + }, + { + "epoch": 3.4796132151490733, + "grad_norm": 1.5107486344209213, + "learning_rate": 9.883243447355993e-05, + "loss": 3.3734, + "step": 5400 + }, + { + "epoch": 3.4796132151490733, + "eval_loss": 4.063033580780029, + "eval_runtime": 2.9649, + "eval_samples_per_second": 33.728, + "eval_steps_per_second": 4.385, + "step": 5400 + }, + { + "epoch": 3.4802578565672846, + "grad_norm": 1.774218069241768, + "learning_rate": 9.88320013039645e-05, + "loss": 3.6407, + "step": 5401 + }, + { + "epoch": 3.4809024979854954, + "grad_norm": 1.4225953866441576, + "learning_rate": 9.883156805498978e-05, + "loss": 3.5647, + "step": 5402 + }, + { + "epoch": 3.4815471394037067, + "grad_norm": 1.4764816801318617, + "learning_rate": 9.883113472663655e-05, + "loss": 3.6086, + "step": 5403 + }, + { + "epoch": 3.4821917808219176, + "grad_norm": 1.2984629738347435, + "learning_rate": 9.88307013189055e-05, + "loss": 3.5385, + "step": 5404 + }, + { + "epoch": 3.482836422240129, + "grad_norm": 1.5089948703079945, + "learning_rate": 9.883026783179732e-05, + "loss": 3.7271, + "step": 5405 + }, + { + "epoch": 3.48348106365834, + "grad_norm": 1.488947954936311, + "learning_rate": 9.882983426531273e-05, + "loss": 3.4661, + "step": 5406 + }, + { + "epoch": 3.484125705076551, + "grad_norm": 1.2586965964750536, + "learning_rate": 9.882940061945245e-05, + "loss": 3.5489, + "step": 5407 + }, + { + "epoch": 3.4847703464947624, + "grad_norm": 1.7877389477454684, + "learning_rate": 9.88289668942172e-05, + "loss": 3.9068, + "step": 5408 + }, + { + "epoch": 3.485414987912973, + "grad_norm": 1.3273845686332888, + "learning_rate": 9.882853308960768e-05, + "loss": 3.6562, + "step": 5409 + }, + { + "epoch": 3.4860596293311845, + "grad_norm": 1.3268246303621638, + "learning_rate": 9.88280992056246e-05, + "loss": 3.5906, + "step": 5410 + }, + { + "epoch": 3.486704270749396, + "grad_norm": 1.456220128813178, + "learning_rate": 9.882766524226868e-05, + "loss": 3.5516, + "step": 5411 + }, + { + "epoch": 3.4873489121676067, + "grad_norm": 1.2278730607362232, + "learning_rate": 9.882723119954064e-05, + "loss": 3.4346, + "step": 5412 + }, + { + "epoch": 3.487993553585818, + "grad_norm": 1.5492190124840446, + "learning_rate": 9.882679707744117e-05, + "loss": 3.97, + "step": 5413 + }, + { + "epoch": 3.488638195004029, + "grad_norm": 1.3783671450787605, + "learning_rate": 9.8826362875971e-05, + "loss": 3.6612, + "step": 5414 + }, + { + "epoch": 3.48928283642224, + "grad_norm": 1.3257548530934606, + "learning_rate": 9.882592859513083e-05, + "loss": 3.4947, + "step": 5415 + }, + { + "epoch": 3.4899274778404514, + "grad_norm": 1.535619599056015, + "learning_rate": 9.88254942349214e-05, + "loss": 3.4746, + "step": 5416 + }, + { + "epoch": 3.4905721192586623, + "grad_norm": 1.5671082383832968, + "learning_rate": 9.88250597953434e-05, + "loss": 3.5589, + "step": 5417 + }, + { + "epoch": 3.4912167606768736, + "grad_norm": 1.5363694955465128, + "learning_rate": 9.882462527639755e-05, + "loss": 3.3722, + "step": 5418 + }, + { + "epoch": 3.4918614020950844, + "grad_norm": 1.549817346213886, + "learning_rate": 9.882419067808456e-05, + "loss": 3.2868, + "step": 5419 + }, + { + "epoch": 3.4925060435132957, + "grad_norm": 1.6942029694964431, + "learning_rate": 9.882375600040515e-05, + "loss": 3.4783, + "step": 5420 + }, + { + "epoch": 3.493150684931507, + "grad_norm": 1.3816983467691353, + "learning_rate": 9.882332124336004e-05, + "loss": 3.859, + "step": 5421 + }, + { + "epoch": 3.493795326349718, + "grad_norm": 1.701800093122689, + "learning_rate": 9.882288640694993e-05, + "loss": 3.5135, + "step": 5422 + }, + { + "epoch": 3.494439967767929, + "grad_norm": 1.5308496810406569, + "learning_rate": 9.882245149117551e-05, + "loss": 3.9384, + "step": 5423 + }, + { + "epoch": 3.49508460918614, + "grad_norm": 1.6692410922258232, + "learning_rate": 9.882201649603758e-05, + "loss": 3.5734, + "step": 5424 + }, + { + "epoch": 3.4957292506043514, + "grad_norm": 1.8638634309738267, + "learning_rate": 9.882158142153675e-05, + "loss": 3.3656, + "step": 5425 + }, + { + "epoch": 3.4963738920225627, + "grad_norm": 1.7647978495894932, + "learning_rate": 9.88211462676738e-05, + "loss": 3.7427, + "step": 5426 + }, + { + "epoch": 3.4970185334407735, + "grad_norm": 1.9601566721169408, + "learning_rate": 9.882071103444944e-05, + "loss": 3.6949, + "step": 5427 + }, + { + "epoch": 3.497663174858985, + "grad_norm": 1.5321342987298643, + "learning_rate": 9.882027572186435e-05, + "loss": 3.6886, + "step": 5428 + }, + { + "epoch": 3.4983078162771957, + "grad_norm": 1.9632566676659178, + "learning_rate": 9.881984032991927e-05, + "loss": 3.6966, + "step": 5429 + }, + { + "epoch": 3.498952457695407, + "grad_norm": 1.6332779832668187, + "learning_rate": 9.881940485861493e-05, + "loss": 3.9882, + "step": 5430 + }, + { + "epoch": 3.4995970991136183, + "grad_norm": 1.6606503959276182, + "learning_rate": 9.8818969307952e-05, + "loss": 3.7266, + "step": 5431 + }, + { + "epoch": 3.500241740531829, + "grad_norm": 1.7271331833725283, + "learning_rate": 9.881853367793125e-05, + "loss": 3.413, + "step": 5432 + }, + { + "epoch": 3.5008863819500404, + "grad_norm": 1.2814300552574847, + "learning_rate": 9.881809796855334e-05, + "loss": 3.4994, + "step": 5433 + }, + { + "epoch": 3.5015310233682513, + "grad_norm": 1.4707719859233404, + "learning_rate": 9.881766217981903e-05, + "loss": 3.8345, + "step": 5434 + }, + { + "epoch": 3.5021756647864626, + "grad_norm": 1.2017909211866913, + "learning_rate": 9.881722631172901e-05, + "loss": 3.8627, + "step": 5435 + }, + { + "epoch": 3.502820306204674, + "grad_norm": 1.5560489138395277, + "learning_rate": 9.881679036428401e-05, + "loss": 3.4308, + "step": 5436 + }, + { + "epoch": 3.5034649476228847, + "grad_norm": 1.1711684530104869, + "learning_rate": 9.881635433748477e-05, + "loss": 3.5406, + "step": 5437 + }, + { + "epoch": 3.504109589041096, + "grad_norm": 1.4450690572761957, + "learning_rate": 9.881591823133195e-05, + "loss": 3.4572, + "step": 5438 + }, + { + "epoch": 3.504754230459307, + "grad_norm": 1.4418737312083116, + "learning_rate": 9.881548204582629e-05, + "loss": 3.6389, + "step": 5439 + }, + { + "epoch": 3.505398871877518, + "grad_norm": 1.6324429296146372, + "learning_rate": 9.881504578096852e-05, + "loss": 3.7281, + "step": 5440 + }, + { + "epoch": 3.5060435132957295, + "grad_norm": 1.399610000640876, + "learning_rate": 9.881460943675934e-05, + "loss": 3.8993, + "step": 5441 + }, + { + "epoch": 3.5066881547139404, + "grad_norm": 1.3139480017299456, + "learning_rate": 9.881417301319948e-05, + "loss": 3.7207, + "step": 5442 + }, + { + "epoch": 3.5073327961321517, + "grad_norm": 1.3173362697898556, + "learning_rate": 9.881373651028964e-05, + "loss": 3.6252, + "step": 5443 + }, + { + "epoch": 3.5079774375503625, + "grad_norm": 1.4798562901326568, + "learning_rate": 9.881329992803056e-05, + "loss": 3.5911, + "step": 5444 + }, + { + "epoch": 3.508622078968574, + "grad_norm": 1.3706638640571607, + "learning_rate": 9.881286326642293e-05, + "loss": 3.8122, + "step": 5445 + }, + { + "epoch": 3.509266720386785, + "grad_norm": 1.4743164029407863, + "learning_rate": 9.88124265254675e-05, + "loss": 4.1094, + "step": 5446 + }, + { + "epoch": 3.509911361804996, + "grad_norm": 1.4916160410220898, + "learning_rate": 9.881198970516494e-05, + "loss": 3.7435, + "step": 5447 + }, + { + "epoch": 3.510556003223207, + "grad_norm": 1.1401362778128827, + "learning_rate": 9.881155280551602e-05, + "loss": 3.6306, + "step": 5448 + }, + { + "epoch": 3.511200644641418, + "grad_norm": 1.4135599587499774, + "learning_rate": 9.881111582652142e-05, + "loss": 3.8706, + "step": 5449 + }, + { + "epoch": 3.5118452860596294, + "grad_norm": 1.1893543941185991, + "learning_rate": 9.881067876818188e-05, + "loss": 3.85, + "step": 5450 + }, + { + "epoch": 3.5124899274778407, + "grad_norm": 1.3471235447312329, + "learning_rate": 9.88102416304981e-05, + "loss": 3.4865, + "step": 5451 + }, + { + "epoch": 3.5131345688960516, + "grad_norm": 1.210058908029274, + "learning_rate": 9.88098044134708e-05, + "loss": 3.5374, + "step": 5452 + }, + { + "epoch": 3.5137792103142624, + "grad_norm": 1.6586655643691925, + "learning_rate": 9.880936711710073e-05, + "loss": 3.7382, + "step": 5453 + }, + { + "epoch": 3.5144238517324737, + "grad_norm": 1.490576752103487, + "learning_rate": 9.880892974138857e-05, + "loss": 3.7253, + "step": 5454 + }, + { + "epoch": 3.515068493150685, + "grad_norm": 1.4096966796435306, + "learning_rate": 9.880849228633503e-05, + "loss": 3.2994, + "step": 5455 + }, + { + "epoch": 3.515713134568896, + "grad_norm": 1.3893395906921844, + "learning_rate": 9.880805475194088e-05, + "loss": 3.8699, + "step": 5456 + }, + { + "epoch": 3.516357775987107, + "grad_norm": 1.8545417823091068, + "learning_rate": 9.88076171382068e-05, + "loss": 3.4597, + "step": 5457 + }, + { + "epoch": 3.517002417405318, + "grad_norm": 1.4870677059571336, + "learning_rate": 9.880717944513351e-05, + "loss": 3.3054, + "step": 5458 + }, + { + "epoch": 3.5176470588235293, + "grad_norm": 1.5544688093206436, + "learning_rate": 9.880674167272175e-05, + "loss": 4.0955, + "step": 5459 + }, + { + "epoch": 3.5182917002417406, + "grad_norm": 1.3732668056403956, + "learning_rate": 9.880630382097221e-05, + "loss": 3.7417, + "step": 5460 + }, + { + "epoch": 3.5189363416599515, + "grad_norm": 1.3805166797054085, + "learning_rate": 9.880586588988563e-05, + "loss": 3.116, + "step": 5461 + }, + { + "epoch": 3.519580983078163, + "grad_norm": 1.5071270181928684, + "learning_rate": 9.880542787946272e-05, + "loss": 3.5571, + "step": 5462 + }, + { + "epoch": 3.5202256244963737, + "grad_norm": 1.2896917079121952, + "learning_rate": 9.880498978970422e-05, + "loss": 3.4268, + "step": 5463 + }, + { + "epoch": 3.520870265914585, + "grad_norm": 1.4107399739625468, + "learning_rate": 9.880455162061081e-05, + "loss": 3.7077, + "step": 5464 + }, + { + "epoch": 3.5215149073327963, + "grad_norm": 1.7744833736746526, + "learning_rate": 9.880411337218325e-05, + "loss": 3.3136, + "step": 5465 + }, + { + "epoch": 3.522159548751007, + "grad_norm": 1.390926780751181, + "learning_rate": 9.880367504442223e-05, + "loss": 3.7841, + "step": 5466 + }, + { + "epoch": 3.5228041901692184, + "grad_norm": 1.5675983581519595, + "learning_rate": 9.880323663732849e-05, + "loss": 3.6478, + "step": 5467 + }, + { + "epoch": 3.5234488315874293, + "grad_norm": 1.1761224562894614, + "learning_rate": 9.880279815090272e-05, + "loss": 3.6743, + "step": 5468 + }, + { + "epoch": 3.5240934730056406, + "grad_norm": 1.562740891895073, + "learning_rate": 9.880235958514567e-05, + "loss": 4.04, + "step": 5469 + }, + { + "epoch": 3.524738114423852, + "grad_norm": 1.6859824322809422, + "learning_rate": 9.880192094005806e-05, + "loss": 3.5711, + "step": 5470 + }, + { + "epoch": 3.5253827558420627, + "grad_norm": 1.710738084768158, + "learning_rate": 9.88014822156406e-05, + "loss": 3.6432, + "step": 5471 + }, + { + "epoch": 3.526027397260274, + "grad_norm": 1.6616070116577755, + "learning_rate": 9.880104341189403e-05, + "loss": 3.7787, + "step": 5472 + }, + { + "epoch": 3.526672038678485, + "grad_norm": 2.482850446611119, + "learning_rate": 9.880060452881903e-05, + "loss": 3.6101, + "step": 5473 + }, + { + "epoch": 3.527316680096696, + "grad_norm": 1.2639405250925015, + "learning_rate": 9.880016556641636e-05, + "loss": 3.8715, + "step": 5474 + }, + { + "epoch": 3.5279613215149075, + "grad_norm": 2.0876510551743697, + "learning_rate": 9.879972652468671e-05, + "loss": 3.5095, + "step": 5475 + }, + { + "epoch": 3.5286059629331183, + "grad_norm": 1.3851398008759837, + "learning_rate": 9.879928740363083e-05, + "loss": 3.7815, + "step": 5476 + }, + { + "epoch": 3.5292506043513296, + "grad_norm": 1.5296264227496024, + "learning_rate": 9.87988482032494e-05, + "loss": 3.518, + "step": 5477 + }, + { + "epoch": 3.5298952457695405, + "grad_norm": 1.4419150140968204, + "learning_rate": 9.87984089235432e-05, + "loss": 3.6157, + "step": 5478 + }, + { + "epoch": 3.530539887187752, + "grad_norm": 1.412219615572282, + "learning_rate": 9.879796956451293e-05, + "loss": 3.7903, + "step": 5479 + }, + { + "epoch": 3.531184528605963, + "grad_norm": 1.4079540733005602, + "learning_rate": 9.879753012615928e-05, + "loss": 3.4728, + "step": 5480 + }, + { + "epoch": 3.531829170024174, + "grad_norm": 1.3486749102049065, + "learning_rate": 9.879709060848299e-05, + "loss": 3.6029, + "step": 5481 + }, + { + "epoch": 3.5324738114423853, + "grad_norm": 1.6888804529400072, + "learning_rate": 9.879665101148479e-05, + "loss": 3.6922, + "step": 5482 + }, + { + "epoch": 3.533118452860596, + "grad_norm": 1.7023692136361304, + "learning_rate": 9.879621133516541e-05, + "loss": 3.6357, + "step": 5483 + }, + { + "epoch": 3.5337630942788074, + "grad_norm": 1.15348305155439, + "learning_rate": 9.879577157952554e-05, + "loss": 3.8901, + "step": 5484 + }, + { + "epoch": 3.5344077356970187, + "grad_norm": 1.4941148656235441, + "learning_rate": 9.879533174456593e-05, + "loss": 3.8241, + "step": 5485 + }, + { + "epoch": 3.5350523771152296, + "grad_norm": 1.2730410084083368, + "learning_rate": 9.879489183028729e-05, + "loss": 3.5342, + "step": 5486 + }, + { + "epoch": 3.535697018533441, + "grad_norm": 1.5609891727141305, + "learning_rate": 9.879445183669037e-05, + "loss": 3.6969, + "step": 5487 + }, + { + "epoch": 3.5363416599516517, + "grad_norm": 1.43480160704503, + "learning_rate": 9.879401176377583e-05, + "loss": 3.5441, + "step": 5488 + }, + { + "epoch": 3.536986301369863, + "grad_norm": 1.6410670326103052, + "learning_rate": 9.879357161154446e-05, + "loss": 3.469, + "step": 5489 + }, + { + "epoch": 3.5376309427880743, + "grad_norm": 1.5392835173701447, + "learning_rate": 9.879313137999696e-05, + "loss": 3.6428, + "step": 5490 + }, + { + "epoch": 3.538275584206285, + "grad_norm": 1.4358721279686646, + "learning_rate": 9.879269106913403e-05, + "loss": 3.7189, + "step": 5491 + }, + { + "epoch": 3.5389202256244965, + "grad_norm": 1.387934505348463, + "learning_rate": 9.87922506789564e-05, + "loss": 3.5515, + "step": 5492 + }, + { + "epoch": 3.5395648670427073, + "grad_norm": 1.3188839978446607, + "learning_rate": 9.879181020946484e-05, + "loss": 3.7048, + "step": 5493 + }, + { + "epoch": 3.5402095084609186, + "grad_norm": 1.3269382022979381, + "learning_rate": 9.879136966066003e-05, + "loss": 3.8354, + "step": 5494 + }, + { + "epoch": 3.54085414987913, + "grad_norm": 1.6519000905127361, + "learning_rate": 9.879092903254269e-05, + "loss": 3.7587, + "step": 5495 + }, + { + "epoch": 3.541498791297341, + "grad_norm": 1.3945170091580579, + "learning_rate": 9.879048832511356e-05, + "loss": 3.5549, + "step": 5496 + }, + { + "epoch": 3.542143432715552, + "grad_norm": 1.765964398529369, + "learning_rate": 9.879004753837335e-05, + "loss": 3.8377, + "step": 5497 + }, + { + "epoch": 3.542788074133763, + "grad_norm": 1.180046918795195, + "learning_rate": 9.87896066723228e-05, + "loss": 3.5222, + "step": 5498 + }, + { + "epoch": 3.5434327155519743, + "grad_norm": 1.5065207085107315, + "learning_rate": 9.878916572696264e-05, + "loss": 3.6114, + "step": 5499 + }, + { + "epoch": 3.5440773569701856, + "grad_norm": 1.377920882044314, + "learning_rate": 9.878872470229355e-05, + "loss": 3.7024, + "step": 5500 + }, + { + "epoch": 3.5440773569701856, + "eval_loss": 4.049370765686035, + "eval_runtime": 2.9855, + "eval_samples_per_second": 33.495, + "eval_steps_per_second": 4.354, + "step": 5500 + }, + { + "epoch": 3.5447219983883964, + "grad_norm": 1.212192106256778, + "learning_rate": 9.878828359831632e-05, + "loss": 3.5696, + "step": 5501 + }, + { + "epoch": 3.5453666398066077, + "grad_norm": 1.3624060997994545, + "learning_rate": 9.878784241503161e-05, + "loss": 3.4719, + "step": 5502 + }, + { + "epoch": 3.5460112812248186, + "grad_norm": 1.7129986595822548, + "learning_rate": 9.878740115244019e-05, + "loss": 3.6554, + "step": 5503 + }, + { + "epoch": 3.54665592264303, + "grad_norm": 1.7157966666690674, + "learning_rate": 9.878695981054278e-05, + "loss": 3.4126, + "step": 5504 + }, + { + "epoch": 3.547300564061241, + "grad_norm": 1.3642438381536024, + "learning_rate": 9.878651838934008e-05, + "loss": 3.3501, + "step": 5505 + }, + { + "epoch": 3.547945205479452, + "grad_norm": 1.7121634519463085, + "learning_rate": 9.878607688883284e-05, + "loss": 3.6865, + "step": 5506 + }, + { + "epoch": 3.5485898468976633, + "grad_norm": 1.342477125768469, + "learning_rate": 9.878563530902175e-05, + "loss": 3.738, + "step": 5507 + }, + { + "epoch": 3.549234488315874, + "grad_norm": 1.8546877165906641, + "learning_rate": 9.878519364990758e-05, + "loss": 3.5334, + "step": 5508 + }, + { + "epoch": 3.5498791297340855, + "grad_norm": 1.5889163311051304, + "learning_rate": 9.878475191149102e-05, + "loss": 3.3544, + "step": 5509 + }, + { + "epoch": 3.550523771152297, + "grad_norm": 1.7330596115371384, + "learning_rate": 9.878431009377284e-05, + "loss": 3.3188, + "step": 5510 + }, + { + "epoch": 3.5511684125705076, + "grad_norm": 1.548070474728092, + "learning_rate": 9.87838681967537e-05, + "loss": 3.5489, + "step": 5511 + }, + { + "epoch": 3.551813053988719, + "grad_norm": 1.6831524093500863, + "learning_rate": 9.878342622043439e-05, + "loss": 3.2903, + "step": 5512 + }, + { + "epoch": 3.55245769540693, + "grad_norm": 1.3529553432384056, + "learning_rate": 9.878298416481559e-05, + "loss": 3.6513, + "step": 5513 + }, + { + "epoch": 3.553102336825141, + "grad_norm": 1.6804294883309834, + "learning_rate": 9.878254202989805e-05, + "loss": 3.4941, + "step": 5514 + }, + { + "epoch": 3.5537469782433524, + "grad_norm": 1.4760887823864968, + "learning_rate": 9.87820998156825e-05, + "loss": 3.7653, + "step": 5515 + }, + { + "epoch": 3.5543916196615633, + "grad_norm": 1.9868718127023144, + "learning_rate": 9.878165752216963e-05, + "loss": 3.2546, + "step": 5516 + }, + { + "epoch": 3.555036261079774, + "grad_norm": 1.3095021673274096, + "learning_rate": 9.87812151493602e-05, + "loss": 3.7436, + "step": 5517 + }, + { + "epoch": 3.5556809024979854, + "grad_norm": 1.5801875703422117, + "learning_rate": 9.878077269725495e-05, + "loss": 3.4361, + "step": 5518 + }, + { + "epoch": 3.5563255439161967, + "grad_norm": 1.421635355340136, + "learning_rate": 9.878033016585458e-05, + "loss": 3.3457, + "step": 5519 + }, + { + "epoch": 3.556970185334408, + "grad_norm": 1.505984652724436, + "learning_rate": 9.877988755515982e-05, + "loss": 3.6641, + "step": 5520 + }, + { + "epoch": 3.557614826752619, + "grad_norm": 1.3871189073489067, + "learning_rate": 9.877944486517138e-05, + "loss": 3.6489, + "step": 5521 + }, + { + "epoch": 3.5582594681708297, + "grad_norm": 1.945560516394117, + "learning_rate": 9.877900209589001e-05, + "loss": 3.3463, + "step": 5522 + }, + { + "epoch": 3.558904109589041, + "grad_norm": 2.254673651268028, + "learning_rate": 9.877855924731646e-05, + "loss": 3.7181, + "step": 5523 + }, + { + "epoch": 3.5595487510072523, + "grad_norm": 1.6838967145087116, + "learning_rate": 9.877811631945142e-05, + "loss": 3.5253, + "step": 5524 + }, + { + "epoch": 3.560193392425463, + "grad_norm": 1.8174351107387994, + "learning_rate": 9.877767331229561e-05, + "loss": 3.6379, + "step": 5525 + }, + { + "epoch": 3.5608380338436745, + "grad_norm": 1.494125993206337, + "learning_rate": 9.87772302258498e-05, + "loss": 3.5776, + "step": 5526 + }, + { + "epoch": 3.5614826752618853, + "grad_norm": 1.8110898868138003, + "learning_rate": 9.877678706011468e-05, + "loss": 3.5029, + "step": 5527 + }, + { + "epoch": 3.5621273166800966, + "grad_norm": 1.7512481683266417, + "learning_rate": 9.8776343815091e-05, + "loss": 3.726, + "step": 5528 + }, + { + "epoch": 3.562771958098308, + "grad_norm": 1.5881680733727779, + "learning_rate": 9.877590049077948e-05, + "loss": 3.7931, + "step": 5529 + }, + { + "epoch": 3.563416599516519, + "grad_norm": 2.0120485348984394, + "learning_rate": 9.877545708718084e-05, + "loss": 3.7167, + "step": 5530 + }, + { + "epoch": 3.56406124093473, + "grad_norm": 1.3821522150248642, + "learning_rate": 9.877501360429582e-05, + "loss": 3.5397, + "step": 5531 + }, + { + "epoch": 3.564705882352941, + "grad_norm": 1.7439322327669315, + "learning_rate": 9.877457004212513e-05, + "loss": 3.9445, + "step": 5532 + }, + { + "epoch": 3.5653505237711522, + "grad_norm": 1.4355313173127116, + "learning_rate": 9.877412640066954e-05, + "loss": 3.7177, + "step": 5533 + }, + { + "epoch": 3.5659951651893635, + "grad_norm": 1.4930899855962125, + "learning_rate": 9.877368267992974e-05, + "loss": 4.1222, + "step": 5534 + }, + { + "epoch": 3.5666398066075744, + "grad_norm": 1.6303782657377714, + "learning_rate": 9.877323887990647e-05, + "loss": 3.7263, + "step": 5535 + }, + { + "epoch": 3.5672844480257857, + "grad_norm": 1.5957707896624451, + "learning_rate": 9.877279500060046e-05, + "loss": 3.4528, + "step": 5536 + }, + { + "epoch": 3.5679290894439966, + "grad_norm": 1.2998024008856306, + "learning_rate": 9.877235104201242e-05, + "loss": 3.7267, + "step": 5537 + }, + { + "epoch": 3.568573730862208, + "grad_norm": 1.54380924095078, + "learning_rate": 9.877190700414313e-05, + "loss": 3.4287, + "step": 5538 + }, + { + "epoch": 3.569218372280419, + "grad_norm": 1.7375500617486412, + "learning_rate": 9.877146288699326e-05, + "loss": 3.7715, + "step": 5539 + }, + { + "epoch": 3.56986301369863, + "grad_norm": 1.666148127033315, + "learning_rate": 9.877101869056358e-05, + "loss": 3.5252, + "step": 5540 + }, + { + "epoch": 3.5705076551168413, + "grad_norm": 1.6561693401264215, + "learning_rate": 9.87705744148548e-05, + "loss": 3.6518, + "step": 5541 + }, + { + "epoch": 3.571152296535052, + "grad_norm": 1.7381821152312156, + "learning_rate": 9.877013005986765e-05, + "loss": 3.5742, + "step": 5542 + }, + { + "epoch": 3.5717969379532635, + "grad_norm": 1.746708223924028, + "learning_rate": 9.876968562560287e-05, + "loss": 3.6765, + "step": 5543 + }, + { + "epoch": 3.5724415793714748, + "grad_norm": 1.617950012923037, + "learning_rate": 9.87692411120612e-05, + "loss": 3.6939, + "step": 5544 + }, + { + "epoch": 3.5730862207896856, + "grad_norm": 1.9614812053930812, + "learning_rate": 9.876879651924332e-05, + "loss": 3.7055, + "step": 5545 + }, + { + "epoch": 3.573730862207897, + "grad_norm": 1.2829724782205056, + "learning_rate": 9.876835184715002e-05, + "loss": 4.2079, + "step": 5546 + }, + { + "epoch": 3.574375503626108, + "grad_norm": 1.5499384949645414, + "learning_rate": 9.8767907095782e-05, + "loss": 4.0808, + "step": 5547 + }, + { + "epoch": 3.575020145044319, + "grad_norm": 1.3520429117118118, + "learning_rate": 9.876746226514001e-05, + "loss": 3.8785, + "step": 5548 + }, + { + "epoch": 3.5756647864625304, + "grad_norm": 1.5869666079411733, + "learning_rate": 9.876701735522475e-05, + "loss": 3.843, + "step": 5549 + }, + { + "epoch": 3.5763094278807412, + "grad_norm": 1.557645047691047, + "learning_rate": 9.876657236603697e-05, + "loss": 3.4776, + "step": 5550 + }, + { + "epoch": 3.5769540692989525, + "grad_norm": 2.165765758474283, + "learning_rate": 9.87661272975774e-05, + "loss": 3.8229, + "step": 5551 + }, + { + "epoch": 3.5775987107171634, + "grad_norm": 1.8048592270943085, + "learning_rate": 9.876568214984676e-05, + "loss": 3.7683, + "step": 5552 + }, + { + "epoch": 3.5782433521353747, + "grad_norm": 1.8659879990692192, + "learning_rate": 9.87652369228458e-05, + "loss": 3.6022, + "step": 5553 + }, + { + "epoch": 3.578887993553586, + "grad_norm": 1.9825242151425988, + "learning_rate": 9.876479161657523e-05, + "loss": 3.3066, + "step": 5554 + }, + { + "epoch": 3.579532634971797, + "grad_norm": 1.3030521162938462, + "learning_rate": 9.87643462310358e-05, + "loss": 3.5141, + "step": 5555 + }, + { + "epoch": 3.580177276390008, + "grad_norm": 2.1476687688055613, + "learning_rate": 9.876390076622825e-05, + "loss": 3.8787, + "step": 5556 + }, + { + "epoch": 3.580821917808219, + "grad_norm": 1.6971976997454976, + "learning_rate": 9.876345522215328e-05, + "loss": 3.1457, + "step": 5557 + }, + { + "epoch": 3.5814665592264303, + "grad_norm": 1.89293827299349, + "learning_rate": 9.876300959881164e-05, + "loss": 3.9082, + "step": 5558 + }, + { + "epoch": 3.5821112006446416, + "grad_norm": 1.3427154586157377, + "learning_rate": 9.876256389620405e-05, + "loss": 3.5437, + "step": 5559 + }, + { + "epoch": 3.5827558420628525, + "grad_norm": 1.5704630799799235, + "learning_rate": 9.876211811433126e-05, + "loss": 3.6312, + "step": 5560 + }, + { + "epoch": 3.5834004834810638, + "grad_norm": 1.440704262259888, + "learning_rate": 9.876167225319399e-05, + "loss": 3.4908, + "step": 5561 + }, + { + "epoch": 3.5840451248992746, + "grad_norm": 1.983839036033773, + "learning_rate": 9.876122631279298e-05, + "loss": 3.6159, + "step": 5562 + }, + { + "epoch": 3.584689766317486, + "grad_norm": 1.5523893849332466, + "learning_rate": 9.876078029312896e-05, + "loss": 3.8812, + "step": 5563 + }, + { + "epoch": 3.5853344077356972, + "grad_norm": 1.4590813662184063, + "learning_rate": 9.876033419420265e-05, + "loss": 3.7743, + "step": 5564 + }, + { + "epoch": 3.585979049153908, + "grad_norm": 1.4420652737669533, + "learning_rate": 9.875988801601481e-05, + "loss": 3.5084, + "step": 5565 + }, + { + "epoch": 3.5866236905721194, + "grad_norm": 1.3319306859751476, + "learning_rate": 9.875944175856614e-05, + "loss": 3.4148, + "step": 5566 + }, + { + "epoch": 3.5872683319903302, + "grad_norm": 1.4247653561909643, + "learning_rate": 9.875899542185741e-05, + "loss": 3.3924, + "step": 5567 + }, + { + "epoch": 3.5879129734085415, + "grad_norm": 1.2494017939356936, + "learning_rate": 9.875854900588931e-05, + "loss": 3.649, + "step": 5568 + }, + { + "epoch": 3.588557614826753, + "grad_norm": 1.3912620998232454, + "learning_rate": 9.87581025106626e-05, + "loss": 3.3504, + "step": 5569 + }, + { + "epoch": 3.5892022562449637, + "grad_norm": 1.3149760974245603, + "learning_rate": 9.875765593617802e-05, + "loss": 3.6469, + "step": 5570 + }, + { + "epoch": 3.589846897663175, + "grad_norm": 1.4402967059947545, + "learning_rate": 9.875720928243629e-05, + "loss": 3.4509, + "step": 5571 + }, + { + "epoch": 3.590491539081386, + "grad_norm": 1.3008531011295832, + "learning_rate": 9.875676254943815e-05, + "loss": 3.5544, + "step": 5572 + }, + { + "epoch": 3.591136180499597, + "grad_norm": 1.5005006100932423, + "learning_rate": 9.875631573718431e-05, + "loss": 3.7712, + "step": 5573 + }, + { + "epoch": 3.5917808219178085, + "grad_norm": 1.7165030837667634, + "learning_rate": 9.875586884567553e-05, + "loss": 3.8987, + "step": 5574 + }, + { + "epoch": 3.5924254633360193, + "grad_norm": 2.0720368475281385, + "learning_rate": 9.875542187491254e-05, + "loss": 3.9061, + "step": 5575 + }, + { + "epoch": 3.5930701047542306, + "grad_norm": 1.8570469849834892, + "learning_rate": 9.875497482489607e-05, + "loss": 3.8103, + "step": 5576 + }, + { + "epoch": 3.5937147461724415, + "grad_norm": 1.5689375864119093, + "learning_rate": 9.875452769562686e-05, + "loss": 3.4301, + "step": 5577 + }, + { + "epoch": 3.5943593875906528, + "grad_norm": 1.402584401203995, + "learning_rate": 9.875408048710564e-05, + "loss": 3.688, + "step": 5578 + }, + { + "epoch": 3.595004029008864, + "grad_norm": 2.1665991303273335, + "learning_rate": 9.875363319933313e-05, + "loss": 3.6207, + "step": 5579 + }, + { + "epoch": 3.595648670427075, + "grad_norm": 2.152858010695399, + "learning_rate": 9.87531858323101e-05, + "loss": 3.8756, + "step": 5580 + }, + { + "epoch": 3.5962933118452862, + "grad_norm": 1.4256722096383663, + "learning_rate": 9.875273838603726e-05, + "loss": 3.8863, + "step": 5581 + }, + { + "epoch": 3.596937953263497, + "grad_norm": 2.266858082403613, + "learning_rate": 9.875229086051534e-05, + "loss": 3.5646, + "step": 5582 + }, + { + "epoch": 3.5975825946817084, + "grad_norm": 1.497103697738028, + "learning_rate": 9.875184325574507e-05, + "loss": 3.8391, + "step": 5583 + }, + { + "epoch": 3.5982272360999197, + "grad_norm": 2.151708695853291, + "learning_rate": 9.875139557172723e-05, + "loss": 3.7157, + "step": 5584 + }, + { + "epoch": 3.5988718775181305, + "grad_norm": 1.839196697028174, + "learning_rate": 9.875094780846251e-05, + "loss": 3.4625, + "step": 5585 + }, + { + "epoch": 3.5995165189363414, + "grad_norm": 1.7981951586894274, + "learning_rate": 9.875049996595166e-05, + "loss": 3.5226, + "step": 5586 + }, + { + "epoch": 3.6001611603545527, + "grad_norm": 1.6878019287473847, + "learning_rate": 9.875005204419541e-05, + "loss": 3.6777, + "step": 5587 + }, + { + "epoch": 3.600805801772764, + "grad_norm": 1.6407218626295512, + "learning_rate": 9.87496040431945e-05, + "loss": 3.5767, + "step": 5588 + }, + { + "epoch": 3.6014504431909753, + "grad_norm": 1.5890216762585883, + "learning_rate": 9.874915596294967e-05, + "loss": 3.7631, + "step": 5589 + }, + { + "epoch": 3.602095084609186, + "grad_norm": 1.7939577592427949, + "learning_rate": 9.874870780346164e-05, + "loss": 3.506, + "step": 5590 + }, + { + "epoch": 3.602739726027397, + "grad_norm": 1.4647711134066945, + "learning_rate": 9.874825956473118e-05, + "loss": 3.6523, + "step": 5591 + }, + { + "epoch": 3.6033843674456083, + "grad_norm": 1.4870671328065492, + "learning_rate": 9.8747811246759e-05, + "loss": 3.6381, + "step": 5592 + }, + { + "epoch": 3.6040290088638196, + "grad_norm": 1.4464240060160531, + "learning_rate": 9.874736284954583e-05, + "loss": 3.636, + "step": 5593 + }, + { + "epoch": 3.6046736502820305, + "grad_norm": 1.6499457259229195, + "learning_rate": 9.874691437309244e-05, + "loss": 3.5049, + "step": 5594 + }, + { + "epoch": 3.6053182917002418, + "grad_norm": 1.5430768789405476, + "learning_rate": 9.874646581739951e-05, + "loss": 3.9008, + "step": 5595 + }, + { + "epoch": 3.6059629331184526, + "grad_norm": 27.095344164834803, + "learning_rate": 9.874601718246783e-05, + "loss": 3.7806, + "step": 5596 + }, + { + "epoch": 3.606607574536664, + "grad_norm": 1.4733940023640126, + "learning_rate": 9.874556846829811e-05, + "loss": 3.6307, + "step": 5597 + }, + { + "epoch": 3.607252215954875, + "grad_norm": 1.3710004494548185, + "learning_rate": 9.874511967489111e-05, + "loss": 3.4225, + "step": 5598 + }, + { + "epoch": 3.607896857373086, + "grad_norm": 1.889032791531572, + "learning_rate": 9.874467080224753e-05, + "loss": 3.4909, + "step": 5599 + }, + { + "epoch": 3.6085414987912974, + "grad_norm": 1.2582980921438003, + "learning_rate": 9.874422185036814e-05, + "loss": 3.6494, + "step": 5600 + }, + { + "epoch": 3.6085414987912974, + "eval_loss": 4.055093288421631, + "eval_runtime": 2.9797, + "eval_samples_per_second": 33.561, + "eval_steps_per_second": 4.363, + "step": 5600 + }, + { + "epoch": 3.6091861402095082, + "grad_norm": 2.2393818537194954, + "learning_rate": 9.874377281925366e-05, + "loss": 3.358, + "step": 5601 + }, + { + "epoch": 3.6098307816277195, + "grad_norm": 1.567414557237977, + "learning_rate": 9.874332370890484e-05, + "loss": 3.5688, + "step": 5602 + }, + { + "epoch": 3.610475423045931, + "grad_norm": 1.5943529202614257, + "learning_rate": 9.874287451932242e-05, + "loss": 3.6605, + "step": 5603 + }, + { + "epoch": 3.6111200644641417, + "grad_norm": 1.4969178257943447, + "learning_rate": 9.874242525050711e-05, + "loss": 3.7356, + "step": 5604 + }, + { + "epoch": 3.611764705882353, + "grad_norm": 1.530042093702225, + "learning_rate": 9.874197590245967e-05, + "loss": 3.8511, + "step": 5605 + }, + { + "epoch": 3.612409347300564, + "grad_norm": 1.3827393936833425, + "learning_rate": 9.874152647518084e-05, + "loss": 3.3797, + "step": 5606 + }, + { + "epoch": 3.613053988718775, + "grad_norm": 1.4527459156666307, + "learning_rate": 9.874107696867137e-05, + "loss": 3.6478, + "step": 5607 + }, + { + "epoch": 3.6136986301369864, + "grad_norm": 1.540976199333686, + "learning_rate": 9.874062738293196e-05, + "loss": 3.7869, + "step": 5608 + }, + { + "epoch": 3.6143432715551973, + "grad_norm": 1.8431356797196354, + "learning_rate": 9.874017771796337e-05, + "loss": 3.6486, + "step": 5609 + }, + { + "epoch": 3.6149879129734086, + "grad_norm": 1.4531467895610146, + "learning_rate": 9.873972797376634e-05, + "loss": 3.6421, + "step": 5610 + }, + { + "epoch": 3.6156325543916195, + "grad_norm": 1.3389316085042977, + "learning_rate": 9.873927815034159e-05, + "loss": 3.5643, + "step": 5611 + }, + { + "epoch": 3.6162771958098308, + "grad_norm": 1.153806158097495, + "learning_rate": 9.87388282476899e-05, + "loss": 3.8806, + "step": 5612 + }, + { + "epoch": 3.616921837228042, + "grad_norm": 1.2939420185678976, + "learning_rate": 9.873837826581199e-05, + "loss": 3.6825, + "step": 5613 + }, + { + "epoch": 3.617566478646253, + "grad_norm": 1.4436930718690457, + "learning_rate": 9.873792820470856e-05, + "loss": 3.764, + "step": 5614 + }, + { + "epoch": 3.618211120064464, + "grad_norm": 1.5554101249472485, + "learning_rate": 9.87374780643804e-05, + "loss": 3.4873, + "step": 5615 + }, + { + "epoch": 3.618855761482675, + "grad_norm": 1.3675720682703556, + "learning_rate": 9.873702784482824e-05, + "loss": 3.9013, + "step": 5616 + }, + { + "epoch": 3.6195004029008864, + "grad_norm": 1.874825036218731, + "learning_rate": 9.873657754605281e-05, + "loss": 3.8818, + "step": 5617 + }, + { + "epoch": 3.6201450443190977, + "grad_norm": 1.2359014711419316, + "learning_rate": 9.873612716805485e-05, + "loss": 4.0023, + "step": 5618 + }, + { + "epoch": 3.6207896857373085, + "grad_norm": 1.7297225101253852, + "learning_rate": 9.87356767108351e-05, + "loss": 3.6497, + "step": 5619 + }, + { + "epoch": 3.62143432715552, + "grad_norm": 1.5019330970702556, + "learning_rate": 9.873522617439429e-05, + "loss": 3.8751, + "step": 5620 + }, + { + "epoch": 3.6220789685737307, + "grad_norm": 2.05706909896923, + "learning_rate": 9.873477555873317e-05, + "loss": 3.5636, + "step": 5621 + }, + { + "epoch": 3.622723609991942, + "grad_norm": 1.4529017686358587, + "learning_rate": 9.87343248638525e-05, + "loss": 3.6517, + "step": 5622 + }, + { + "epoch": 3.6233682514101533, + "grad_norm": 1.7404948557863973, + "learning_rate": 9.873387408975299e-05, + "loss": 3.5671, + "step": 5623 + }, + { + "epoch": 3.624012892828364, + "grad_norm": 1.5100587085769945, + "learning_rate": 9.873342323643538e-05, + "loss": 3.8362, + "step": 5624 + }, + { + "epoch": 3.6246575342465754, + "grad_norm": 2.0553301347592887, + "learning_rate": 9.873297230390041e-05, + "loss": 3.6214, + "step": 5625 + }, + { + "epoch": 3.6253021756647863, + "grad_norm": 1.539435092373431, + "learning_rate": 9.873252129214887e-05, + "loss": 3.6314, + "step": 5626 + }, + { + "epoch": 3.6259468170829976, + "grad_norm": 1.459073949817598, + "learning_rate": 9.873207020118145e-05, + "loss": 3.3535, + "step": 5627 + }, + { + "epoch": 3.626591458501209, + "grad_norm": 1.4555755675189033, + "learning_rate": 9.873161903099889e-05, + "loss": 3.6866, + "step": 5628 + }, + { + "epoch": 3.6272360999194198, + "grad_norm": 1.3766954325127128, + "learning_rate": 9.873116778160195e-05, + "loss": 3.6642, + "step": 5629 + }, + { + "epoch": 3.627880741337631, + "grad_norm": 1.5958051857906148, + "learning_rate": 9.873071645299137e-05, + "loss": 3.6678, + "step": 5630 + }, + { + "epoch": 3.628525382755842, + "grad_norm": 1.3854878950878438, + "learning_rate": 9.873026504516787e-05, + "loss": 3.494, + "step": 5631 + }, + { + "epoch": 3.629170024174053, + "grad_norm": 1.6707248398555448, + "learning_rate": 9.872981355813224e-05, + "loss": 3.7485, + "step": 5632 + }, + { + "epoch": 3.6298146655922645, + "grad_norm": 1.3261932337070914, + "learning_rate": 9.872936199188517e-05, + "loss": 3.2918, + "step": 5633 + }, + { + "epoch": 3.6304593070104754, + "grad_norm": 1.38715319448389, + "learning_rate": 9.872891034642741e-05, + "loss": 4.0964, + "step": 5634 + }, + { + "epoch": 3.6311039484286867, + "grad_norm": 1.7152695627647785, + "learning_rate": 9.872845862175973e-05, + "loss": 3.9004, + "step": 5635 + }, + { + "epoch": 3.6317485898468975, + "grad_norm": 1.4000799237792017, + "learning_rate": 9.872800681788285e-05, + "loss": 4.1824, + "step": 5636 + }, + { + "epoch": 3.632393231265109, + "grad_norm": 1.929806302015124, + "learning_rate": 9.872755493479753e-05, + "loss": 3.8403, + "step": 5637 + }, + { + "epoch": 3.63303787268332, + "grad_norm": 1.6683526047520647, + "learning_rate": 9.87271029725045e-05, + "loss": 3.6803, + "step": 5638 + }, + { + "epoch": 3.633682514101531, + "grad_norm": 1.406478285669166, + "learning_rate": 9.872665093100447e-05, + "loss": 3.5066, + "step": 5639 + }, + { + "epoch": 3.6343271555197423, + "grad_norm": 1.9058297263797936, + "learning_rate": 9.872619881029824e-05, + "loss": 3.6706, + "step": 5640 + }, + { + "epoch": 3.634971796937953, + "grad_norm": 1.2678176001204484, + "learning_rate": 9.872574661038651e-05, + "loss": 3.7052, + "step": 5641 + }, + { + "epoch": 3.6356164383561644, + "grad_norm": 1.7507651771200832, + "learning_rate": 9.872529433127005e-05, + "loss": 3.484, + "step": 5642 + }, + { + "epoch": 3.6362610797743757, + "grad_norm": 1.3880098498637587, + "learning_rate": 9.87248419729496e-05, + "loss": 3.8042, + "step": 5643 + }, + { + "epoch": 3.6369057211925866, + "grad_norm": 1.3580243860061725, + "learning_rate": 9.872438953542589e-05, + "loss": 3.3946, + "step": 5644 + }, + { + "epoch": 3.637550362610798, + "grad_norm": 1.4996660576048704, + "learning_rate": 9.872393701869966e-05, + "loss": 3.995, + "step": 5645 + }, + { + "epoch": 3.6381950040290088, + "grad_norm": 1.6558436879984701, + "learning_rate": 9.872348442277167e-05, + "loss": 3.638, + "step": 5646 + }, + { + "epoch": 3.63883964544722, + "grad_norm": 1.3798408272207574, + "learning_rate": 9.872303174764265e-05, + "loss": 3.5995, + "step": 5647 + }, + { + "epoch": 3.6394842868654314, + "grad_norm": 1.7400224787655099, + "learning_rate": 9.872257899331333e-05, + "loss": 3.5908, + "step": 5648 + }, + { + "epoch": 3.640128928283642, + "grad_norm": 1.4634017579750198, + "learning_rate": 9.87221261597845e-05, + "loss": 3.796, + "step": 5649 + }, + { + "epoch": 3.6407735697018535, + "grad_norm": 1.8358569236588589, + "learning_rate": 9.872167324705686e-05, + "loss": 3.7083, + "step": 5650 + }, + { + "epoch": 3.6414182111200644, + "grad_norm": 1.2966260494762902, + "learning_rate": 9.872122025513117e-05, + "loss": 3.4948, + "step": 5651 + }, + { + "epoch": 3.6420628525382757, + "grad_norm": 2.040139391909689, + "learning_rate": 9.872076718400819e-05, + "loss": 3.414, + "step": 5652 + }, + { + "epoch": 3.642707493956487, + "grad_norm": 1.6665639686576958, + "learning_rate": 9.872031403368864e-05, + "loss": 3.8002, + "step": 5653 + }, + { + "epoch": 3.643352135374698, + "grad_norm": 1.985817330584961, + "learning_rate": 9.871986080417326e-05, + "loss": 3.3736, + "step": 5654 + }, + { + "epoch": 3.6439967767929087, + "grad_norm": 1.7567222873243402, + "learning_rate": 9.871940749546282e-05, + "loss": 3.5976, + "step": 5655 + }, + { + "epoch": 3.64464141821112, + "grad_norm": 2.0599486488497525, + "learning_rate": 9.871895410755803e-05, + "loss": 3.36, + "step": 5656 + }, + { + "epoch": 3.6452860596293313, + "grad_norm": 1.5232393293643924, + "learning_rate": 9.871850064045967e-05, + "loss": 3.3832, + "step": 5657 + }, + { + "epoch": 3.6459307010475426, + "grad_norm": 2.1571068738231696, + "learning_rate": 9.871804709416845e-05, + "loss": 3.9459, + "step": 5658 + }, + { + "epoch": 3.6465753424657534, + "grad_norm": 1.32943362652239, + "learning_rate": 9.871759346868516e-05, + "loss": 3.8265, + "step": 5659 + }, + { + "epoch": 3.6472199838839643, + "grad_norm": 1.9241730000922033, + "learning_rate": 9.871713976401053e-05, + "loss": 3.6574, + "step": 5660 + }, + { + "epoch": 3.6478646253021756, + "grad_norm": 1.6267536855776468, + "learning_rate": 9.871668598014527e-05, + "loss": 3.439, + "step": 5661 + }, + { + "epoch": 3.648509266720387, + "grad_norm": 2.0036282907958562, + "learning_rate": 9.871623211709015e-05, + "loss": 3.7134, + "step": 5662 + }, + { + "epoch": 3.6491539081385977, + "grad_norm": 1.817253849146205, + "learning_rate": 9.871577817484592e-05, + "loss": 3.685, + "step": 5663 + }, + { + "epoch": 3.649798549556809, + "grad_norm": 1.3983903658104828, + "learning_rate": 9.871532415341332e-05, + "loss": 3.9188, + "step": 5664 + }, + { + "epoch": 3.65044319097502, + "grad_norm": 1.5270178421660963, + "learning_rate": 9.871487005279311e-05, + "loss": 3.587, + "step": 5665 + }, + { + "epoch": 3.651087832393231, + "grad_norm": 1.3911426642752511, + "learning_rate": 9.871441587298601e-05, + "loss": 3.694, + "step": 5666 + }, + { + "epoch": 3.6517324738114425, + "grad_norm": 1.4908723420222436, + "learning_rate": 9.871396161399278e-05, + "loss": 3.57, + "step": 5667 + }, + { + "epoch": 3.6523771152296534, + "grad_norm": 1.5806435457023695, + "learning_rate": 9.871350727581417e-05, + "loss": 3.801, + "step": 5668 + }, + { + "epoch": 3.6530217566478647, + "grad_norm": 1.3338547511014147, + "learning_rate": 9.871305285845091e-05, + "loss": 3.4077, + "step": 5669 + }, + { + "epoch": 3.6536663980660755, + "grad_norm": 1.760824730282153, + "learning_rate": 9.871259836190377e-05, + "loss": 3.2549, + "step": 5670 + }, + { + "epoch": 3.654311039484287, + "grad_norm": 1.970776405614221, + "learning_rate": 9.871214378617348e-05, + "loss": 3.6963, + "step": 5671 + }, + { + "epoch": 3.654955680902498, + "grad_norm": 1.445935283167858, + "learning_rate": 9.871168913126077e-05, + "loss": 3.6982, + "step": 5672 + }, + { + "epoch": 3.655600322320709, + "grad_norm": 1.5418996425027867, + "learning_rate": 9.871123439716641e-05, + "loss": 3.9544, + "step": 5673 + }, + { + "epoch": 3.6562449637389203, + "grad_norm": 1.6049319558657995, + "learning_rate": 9.871077958389117e-05, + "loss": 3.9268, + "step": 5674 + }, + { + "epoch": 3.656889605157131, + "grad_norm": 1.5708325939517869, + "learning_rate": 9.871032469143576e-05, + "loss": 3.5373, + "step": 5675 + }, + { + "epoch": 3.6575342465753424, + "grad_norm": 1.3475651326522389, + "learning_rate": 9.870986971980094e-05, + "loss": 3.5022, + "step": 5676 + }, + { + "epoch": 3.6581788879935537, + "grad_norm": 1.504212509976851, + "learning_rate": 9.870941466898744e-05, + "loss": 3.5697, + "step": 5677 + }, + { + "epoch": 3.6588235294117646, + "grad_norm": 1.3599618946693592, + "learning_rate": 9.870895953899605e-05, + "loss": 3.6718, + "step": 5678 + }, + { + "epoch": 3.659468170829976, + "grad_norm": 1.3126221242766205, + "learning_rate": 9.870850432982746e-05, + "loss": 3.288, + "step": 5679 + }, + { + "epoch": 3.6601128122481867, + "grad_norm": 1.2904846205296991, + "learning_rate": 9.870804904148245e-05, + "loss": 3.4402, + "step": 5680 + }, + { + "epoch": 3.660757453666398, + "grad_norm": 1.2175472108167107, + "learning_rate": 9.870759367396179e-05, + "loss": 3.2876, + "step": 5681 + }, + { + "epoch": 3.6614020950846093, + "grad_norm": 1.2321384218451776, + "learning_rate": 9.870713822726618e-05, + "loss": 3.7005, + "step": 5682 + }, + { + "epoch": 3.66204673650282, + "grad_norm": 1.2502997579470367, + "learning_rate": 9.87066827013964e-05, + "loss": 3.5427, + "step": 5683 + }, + { + "epoch": 3.6626913779210315, + "grad_norm": 1.4707352825794602, + "learning_rate": 9.87062270963532e-05, + "loss": 3.9867, + "step": 5684 + }, + { + "epoch": 3.6633360193392424, + "grad_norm": 1.364225642871411, + "learning_rate": 9.87057714121373e-05, + "loss": 3.7091, + "step": 5685 + }, + { + "epoch": 3.6639806607574537, + "grad_norm": 1.2728528875799778, + "learning_rate": 9.870531564874946e-05, + "loss": 3.1137, + "step": 5686 + }, + { + "epoch": 3.664625302175665, + "grad_norm": 1.7480281893200984, + "learning_rate": 9.870485980619045e-05, + "loss": 3.629, + "step": 5687 + }, + { + "epoch": 3.665269943593876, + "grad_norm": 1.757840255741376, + "learning_rate": 9.8704403884461e-05, + "loss": 3.9379, + "step": 5688 + }, + { + "epoch": 3.665914585012087, + "grad_norm": 1.6320545235474586, + "learning_rate": 9.870394788356188e-05, + "loss": 3.3326, + "step": 5689 + }, + { + "epoch": 3.666559226430298, + "grad_norm": 1.5839879848446183, + "learning_rate": 9.870349180349378e-05, + "loss": 3.497, + "step": 5690 + }, + { + "epoch": 3.6672038678485093, + "grad_norm": 1.875995431511149, + "learning_rate": 9.870303564425752e-05, + "loss": 3.7544, + "step": 5691 + }, + { + "epoch": 3.6678485092667206, + "grad_norm": 1.642092446276887, + "learning_rate": 9.870257940585381e-05, + "loss": 3.6178, + "step": 5692 + }, + { + "epoch": 3.6684931506849314, + "grad_norm": 1.4379022747093761, + "learning_rate": 9.870212308828341e-05, + "loss": 3.5147, + "step": 5693 + }, + { + "epoch": 3.6691377921031427, + "grad_norm": 1.8041840352023164, + "learning_rate": 9.870166669154708e-05, + "loss": 3.8912, + "step": 5694 + }, + { + "epoch": 3.6697824335213536, + "grad_norm": 1.2338510510495586, + "learning_rate": 9.870121021564554e-05, + "loss": 3.625, + "step": 5695 + }, + { + "epoch": 3.670427074939565, + "grad_norm": 1.9725436091089794, + "learning_rate": 9.870075366057956e-05, + "loss": 3.7678, + "step": 5696 + }, + { + "epoch": 3.671071716357776, + "grad_norm": 1.2971779906212566, + "learning_rate": 9.870029702634989e-05, + "loss": 3.5321, + "step": 5697 + }, + { + "epoch": 3.671716357775987, + "grad_norm": 1.5302714189804794, + "learning_rate": 9.869984031295726e-05, + "loss": 3.7186, + "step": 5698 + }, + { + "epoch": 3.6723609991941983, + "grad_norm": 1.619802222225746, + "learning_rate": 9.869938352040247e-05, + "loss": 3.9342, + "step": 5699 + }, + { + "epoch": 3.673005640612409, + "grad_norm": 1.7441410575251073, + "learning_rate": 9.869892664868622e-05, + "loss": 3.9161, + "step": 5700 + }, + { + "epoch": 3.673005640612409, + "eval_loss": 4.049792766571045, + "eval_runtime": 2.9752, + "eval_samples_per_second": 33.611, + "eval_steps_per_second": 4.369, + "step": 5700 + }, + { + "epoch": 3.6736502820306205, + "grad_norm": 1.6760411327629368, + "learning_rate": 9.869846969780927e-05, + "loss": 3.6299, + "step": 5701 + }, + { + "epoch": 3.674294923448832, + "grad_norm": 1.2095246629249023, + "learning_rate": 9.86980126677724e-05, + "loss": 3.9894, + "step": 5702 + }, + { + "epoch": 3.6749395648670427, + "grad_norm": 1.5061706509158301, + "learning_rate": 9.869755555857632e-05, + "loss": 3.723, + "step": 5703 + }, + { + "epoch": 3.675584206285254, + "grad_norm": 1.310504877760035, + "learning_rate": 9.86970983702218e-05, + "loss": 3.5413, + "step": 5704 + }, + { + "epoch": 3.676228847703465, + "grad_norm": 1.250955723851139, + "learning_rate": 9.86966411027096e-05, + "loss": 3.6318, + "step": 5705 + }, + { + "epoch": 3.676873489121676, + "grad_norm": 1.409368445735034, + "learning_rate": 9.869618375604046e-05, + "loss": 3.6941, + "step": 5706 + }, + { + "epoch": 3.6775181305398874, + "grad_norm": 1.3556555117924882, + "learning_rate": 9.869572633021512e-05, + "loss": 3.6467, + "step": 5707 + }, + { + "epoch": 3.6781627719580983, + "grad_norm": 1.937316986646986, + "learning_rate": 9.869526882523436e-05, + "loss": 3.4524, + "step": 5708 + }, + { + "epoch": 3.6788074133763096, + "grad_norm": 1.2414323032322625, + "learning_rate": 9.86948112410989e-05, + "loss": 3.8827, + "step": 5709 + }, + { + "epoch": 3.6794520547945204, + "grad_norm": 1.3448483039846404, + "learning_rate": 9.869435357780951e-05, + "loss": 3.7244, + "step": 5710 + }, + { + "epoch": 3.6800966962127317, + "grad_norm": 1.4340548295456448, + "learning_rate": 9.869389583536696e-05, + "loss": 3.657, + "step": 5711 + }, + { + "epoch": 3.680741337630943, + "grad_norm": 1.3388801886814143, + "learning_rate": 9.869343801377195e-05, + "loss": 3.5137, + "step": 5712 + }, + { + "epoch": 3.681385979049154, + "grad_norm": 1.4107355144689053, + "learning_rate": 9.86929801130253e-05, + "loss": 3.9179, + "step": 5713 + }, + { + "epoch": 3.682030620467365, + "grad_norm": 1.4644807361928944, + "learning_rate": 9.869252213312768e-05, + "loss": 3.5946, + "step": 5714 + }, + { + "epoch": 3.682675261885576, + "grad_norm": 1.3361965720455844, + "learning_rate": 9.869206407407991e-05, + "loss": 3.6467, + "step": 5715 + }, + { + "epoch": 3.6833199033037873, + "grad_norm": 1.6016505296148071, + "learning_rate": 9.869160593588271e-05, + "loss": 3.8463, + "step": 5716 + }, + { + "epoch": 3.6839645447219986, + "grad_norm": 1.438785734644395, + "learning_rate": 9.869114771853685e-05, + "loss": 3.744, + "step": 5717 + }, + { + "epoch": 3.6846091861402095, + "grad_norm": 1.5152439254547234, + "learning_rate": 9.869068942204306e-05, + "loss": 3.7811, + "step": 5718 + }, + { + "epoch": 3.685253827558421, + "grad_norm": 1.6685005864375155, + "learning_rate": 9.869023104640212e-05, + "loss": 3.9182, + "step": 5719 + }, + { + "epoch": 3.6858984689766316, + "grad_norm": 1.3731815025836402, + "learning_rate": 9.868977259161475e-05, + "loss": 3.6194, + "step": 5720 + }, + { + "epoch": 3.686543110394843, + "grad_norm": 1.6593827311274507, + "learning_rate": 9.868931405768173e-05, + "loss": 3.7802, + "step": 5721 + }, + { + "epoch": 3.6871877518130542, + "grad_norm": 1.2218628382295227, + "learning_rate": 9.868885544460383e-05, + "loss": 3.6074, + "step": 5722 + }, + { + "epoch": 3.687832393231265, + "grad_norm": 1.6339128630703041, + "learning_rate": 9.868839675238175e-05, + "loss": 3.4016, + "step": 5723 + }, + { + "epoch": 3.688477034649476, + "grad_norm": 1.7382191453696971, + "learning_rate": 9.868793798101628e-05, + "loss": 3.8698, + "step": 5724 + }, + { + "epoch": 3.6891216760676873, + "grad_norm": 1.564920375113647, + "learning_rate": 9.868747913050816e-05, + "loss": 3.1362, + "step": 5725 + }, + { + "epoch": 3.6897663174858986, + "grad_norm": 1.4869005457593463, + "learning_rate": 9.868702020085816e-05, + "loss": 3.9742, + "step": 5726 + }, + { + "epoch": 3.69041095890411, + "grad_norm": 1.645192463316018, + "learning_rate": 9.868656119206702e-05, + "loss": 4.1567, + "step": 5727 + }, + { + "epoch": 3.6910556003223207, + "grad_norm": 1.3776060058814539, + "learning_rate": 9.868610210413549e-05, + "loss": 3.7057, + "step": 5728 + }, + { + "epoch": 3.6917002417405316, + "grad_norm": 1.4258287604482331, + "learning_rate": 9.868564293706434e-05, + "loss": 3.3993, + "step": 5729 + }, + { + "epoch": 3.692344883158743, + "grad_norm": 1.2906901299144975, + "learning_rate": 9.86851836908543e-05, + "loss": 3.7291, + "step": 5730 + }, + { + "epoch": 3.692989524576954, + "grad_norm": 1.248086068722541, + "learning_rate": 9.868472436550614e-05, + "loss": 3.5994, + "step": 5731 + }, + { + "epoch": 3.693634165995165, + "grad_norm": 1.3977088842206058, + "learning_rate": 9.868426496102063e-05, + "loss": 3.8071, + "step": 5732 + }, + { + "epoch": 3.6942788074133763, + "grad_norm": 1.2062537661094523, + "learning_rate": 9.868380547739849e-05, + "loss": 3.8163, + "step": 5733 + }, + { + "epoch": 3.694923448831587, + "grad_norm": 1.4512876607210716, + "learning_rate": 9.86833459146405e-05, + "loss": 3.6847, + "step": 5734 + }, + { + "epoch": 3.6955680902497985, + "grad_norm": 1.4110990941576398, + "learning_rate": 9.86828862727474e-05, + "loss": 3.7987, + "step": 5735 + }, + { + "epoch": 3.69621273166801, + "grad_norm": 1.2173282696096182, + "learning_rate": 9.868242655171996e-05, + "loss": 3.9557, + "step": 5736 + }, + { + "epoch": 3.6968573730862206, + "grad_norm": 1.7161962064933383, + "learning_rate": 9.868196675155893e-05, + "loss": 3.3035, + "step": 5737 + }, + { + "epoch": 3.697502014504432, + "grad_norm": 1.607332507172891, + "learning_rate": 9.868150687226507e-05, + "loss": 3.4042, + "step": 5738 + }, + { + "epoch": 3.698146655922643, + "grad_norm": 1.3925600343263738, + "learning_rate": 9.868104691383909e-05, + "loss": 3.4487, + "step": 5739 + }, + { + "epoch": 3.698791297340854, + "grad_norm": 1.4080861466369534, + "learning_rate": 9.868058687628182e-05, + "loss": 3.3432, + "step": 5740 + }, + { + "epoch": 3.6994359387590654, + "grad_norm": 1.6207179228065858, + "learning_rate": 9.868012675959396e-05, + "loss": 3.3383, + "step": 5741 + }, + { + "epoch": 3.7000805801772763, + "grad_norm": 1.6053778351758967, + "learning_rate": 9.86796665637763e-05, + "loss": 3.6859, + "step": 5742 + }, + { + "epoch": 3.7007252215954876, + "grad_norm": 1.5401778384225562, + "learning_rate": 9.867920628882956e-05, + "loss": 3.5213, + "step": 5743 + }, + { + "epoch": 3.7013698630136984, + "grad_norm": 1.9168358039714983, + "learning_rate": 9.867874593475452e-05, + "loss": 3.5261, + "step": 5744 + }, + { + "epoch": 3.7020145044319097, + "grad_norm": 1.5573074143024053, + "learning_rate": 9.867828550155194e-05, + "loss": 3.4218, + "step": 5745 + }, + { + "epoch": 3.702659145850121, + "grad_norm": 1.4242604573325275, + "learning_rate": 9.867782498922257e-05, + "loss": 3.3745, + "step": 5746 + }, + { + "epoch": 3.703303787268332, + "grad_norm": 1.4495803788796289, + "learning_rate": 9.867736439776716e-05, + "loss": 3.7368, + "step": 5747 + }, + { + "epoch": 3.703948428686543, + "grad_norm": 1.6100232868884976, + "learning_rate": 9.867690372718647e-05, + "loss": 3.6548, + "step": 5748 + }, + { + "epoch": 3.704593070104754, + "grad_norm": 1.4897812819768366, + "learning_rate": 9.867644297748125e-05, + "loss": 3.531, + "step": 5749 + }, + { + "epoch": 3.7052377115229653, + "grad_norm": 1.3352449090943102, + "learning_rate": 9.867598214865228e-05, + "loss": 3.5801, + "step": 5750 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 1.3423335815137962, + "learning_rate": 9.86755212407003e-05, + "loss": 3.503, + "step": 5751 + }, + { + "epoch": 3.7065269943593875, + "grad_norm": 1.358240690672046, + "learning_rate": 9.867506025362606e-05, + "loss": 3.6783, + "step": 5752 + }, + { + "epoch": 3.707171635777599, + "grad_norm": 1.6369786913972313, + "learning_rate": 9.867459918743031e-05, + "loss": 3.4546, + "step": 5753 + }, + { + "epoch": 3.7078162771958096, + "grad_norm": 1.6963286315907895, + "learning_rate": 9.867413804211386e-05, + "loss": 3.8234, + "step": 5754 + }, + { + "epoch": 3.708460918614021, + "grad_norm": 1.7240349198955907, + "learning_rate": 9.867367681767739e-05, + "loss": 3.5529, + "step": 5755 + }, + { + "epoch": 3.7091055600322322, + "grad_norm": 1.346889891301299, + "learning_rate": 9.867321551412173e-05, + "loss": 3.3542, + "step": 5756 + }, + { + "epoch": 3.709750201450443, + "grad_norm": 1.884045286083293, + "learning_rate": 9.86727541314476e-05, + "loss": 3.7242, + "step": 5757 + }, + { + "epoch": 3.7103948428686544, + "grad_norm": 1.34734833093751, + "learning_rate": 9.867229266965574e-05, + "loss": 3.3717, + "step": 5758 + }, + { + "epoch": 3.7110394842868653, + "grad_norm": 1.560980977894319, + "learning_rate": 9.867183112874694e-05, + "loss": 3.682, + "step": 5759 + }, + { + "epoch": 3.7116841257050766, + "grad_norm": 1.3887887994717714, + "learning_rate": 9.867136950872195e-05, + "loss": 3.634, + "step": 5760 + }, + { + "epoch": 3.712328767123288, + "grad_norm": 1.6641084063228953, + "learning_rate": 9.867090780958154e-05, + "loss": 3.5647, + "step": 5761 + }, + { + "epoch": 3.7129734085414987, + "grad_norm": 1.4435245220194441, + "learning_rate": 9.867044603132645e-05, + "loss": 4.1046, + "step": 5762 + }, + { + "epoch": 3.71361804995971, + "grad_norm": 1.4233249056999864, + "learning_rate": 9.866998417395743e-05, + "loss": 3.7211, + "step": 5763 + }, + { + "epoch": 3.714262691377921, + "grad_norm": 1.6906559052397572, + "learning_rate": 9.866952223747527e-05, + "loss": 3.8686, + "step": 5764 + }, + { + "epoch": 3.714907332796132, + "grad_norm": 1.4949960350256162, + "learning_rate": 9.866906022188069e-05, + "loss": 3.4924, + "step": 5765 + }, + { + "epoch": 3.7155519742143435, + "grad_norm": 1.3875297578508288, + "learning_rate": 9.866859812717447e-05, + "loss": 3.8481, + "step": 5766 + }, + { + "epoch": 3.7161966156325543, + "grad_norm": 1.543177800890074, + "learning_rate": 9.866813595335739e-05, + "loss": 3.8067, + "step": 5767 + }, + { + "epoch": 3.7168412570507656, + "grad_norm": 1.2459900004494715, + "learning_rate": 9.866767370043017e-05, + "loss": 3.6437, + "step": 5768 + }, + { + "epoch": 3.7174858984689765, + "grad_norm": 1.5625581725711697, + "learning_rate": 9.866721136839361e-05, + "loss": 3.7248, + "step": 5769 + }, + { + "epoch": 3.718130539887188, + "grad_norm": 1.2044275640124071, + "learning_rate": 9.866674895724841e-05, + "loss": 3.5911, + "step": 5770 + }, + { + "epoch": 3.718775181305399, + "grad_norm": 1.4656154442068647, + "learning_rate": 9.866628646699538e-05, + "loss": 3.4948, + "step": 5771 + }, + { + "epoch": 3.71941982272361, + "grad_norm": 1.331760566746482, + "learning_rate": 9.866582389763527e-05, + "loss": 3.9524, + "step": 5772 + }, + { + "epoch": 3.7200644641418212, + "grad_norm": 1.413937425769586, + "learning_rate": 9.866536124916884e-05, + "loss": 3.9473, + "step": 5773 + }, + { + "epoch": 3.720709105560032, + "grad_norm": 1.2293126668688104, + "learning_rate": 9.866489852159685e-05, + "loss": 3.7605, + "step": 5774 + }, + { + "epoch": 3.7213537469782434, + "grad_norm": 1.7141223078495094, + "learning_rate": 9.866443571492004e-05, + "loss": 3.8777, + "step": 5775 + }, + { + "epoch": 3.7219983883964547, + "grad_norm": 1.3815665013416387, + "learning_rate": 9.866397282913919e-05, + "loss": 3.9022, + "step": 5776 + }, + { + "epoch": 3.7226430298146655, + "grad_norm": 1.1692579271896861, + "learning_rate": 9.866350986425504e-05, + "loss": 3.7882, + "step": 5777 + }, + { + "epoch": 3.723287671232877, + "grad_norm": 1.3313598062697434, + "learning_rate": 9.866304682026838e-05, + "loss": 3.7714, + "step": 5778 + }, + { + "epoch": 3.7239323126510877, + "grad_norm": 1.2110861010179983, + "learning_rate": 9.866258369717997e-05, + "loss": 3.4446, + "step": 5779 + }, + { + "epoch": 3.724576954069299, + "grad_norm": 1.1845900783262742, + "learning_rate": 9.866212049499053e-05, + "loss": 3.7564, + "step": 5780 + }, + { + "epoch": 3.7252215954875103, + "grad_norm": 1.7395906812728061, + "learning_rate": 9.866165721370086e-05, + "loss": 3.7861, + "step": 5781 + }, + { + "epoch": 3.725866236905721, + "grad_norm": 1.4639799587618485, + "learning_rate": 9.86611938533117e-05, + "loss": 3.6805, + "step": 5782 + }, + { + "epoch": 3.7265108783239325, + "grad_norm": 1.473138517215584, + "learning_rate": 9.866073041382383e-05, + "loss": 3.7085, + "step": 5783 + }, + { + "epoch": 3.7271555197421433, + "grad_norm": 1.411062466379032, + "learning_rate": 9.866026689523799e-05, + "loss": 3.6945, + "step": 5784 + }, + { + "epoch": 3.7278001611603546, + "grad_norm": 1.2007078565239524, + "learning_rate": 9.865980329755496e-05, + "loss": 3.5543, + "step": 5785 + }, + { + "epoch": 3.728444802578566, + "grad_norm": 1.5431042421017478, + "learning_rate": 9.865933962077548e-05, + "loss": 3.7285, + "step": 5786 + }, + { + "epoch": 3.7290894439967768, + "grad_norm": 1.487347462229813, + "learning_rate": 9.865887586490033e-05, + "loss": 4.0299, + "step": 5787 + }, + { + "epoch": 3.729734085414988, + "grad_norm": 1.6140292455377137, + "learning_rate": 9.865841202993026e-05, + "loss": 3.5664, + "step": 5788 + }, + { + "epoch": 3.730378726833199, + "grad_norm": 1.5253417274087702, + "learning_rate": 9.865794811586606e-05, + "loss": 3.4442, + "step": 5789 + }, + { + "epoch": 3.7310233682514102, + "grad_norm": 1.6816486972863343, + "learning_rate": 9.865748412270846e-05, + "loss": 3.9012, + "step": 5790 + }, + { + "epoch": 3.7316680096696215, + "grad_norm": 1.5469079926175524, + "learning_rate": 9.865702005045823e-05, + "loss": 3.6946, + "step": 5791 + }, + { + "epoch": 3.7323126510878324, + "grad_norm": 2.218562814395504, + "learning_rate": 9.865655589911613e-05, + "loss": 3.9585, + "step": 5792 + }, + { + "epoch": 3.7329572925060432, + "grad_norm": 1.5692389146157464, + "learning_rate": 9.865609166868292e-05, + "loss": 3.5014, + "step": 5793 + }, + { + "epoch": 3.7336019339242545, + "grad_norm": 2.4353567491444, + "learning_rate": 9.865562735915937e-05, + "loss": 4.0865, + "step": 5794 + }, + { + "epoch": 3.734246575342466, + "grad_norm": 1.21732947013978, + "learning_rate": 9.865516297054626e-05, + "loss": 3.7895, + "step": 5795 + }, + { + "epoch": 3.734891216760677, + "grad_norm": 2.523124461946675, + "learning_rate": 9.865469850284432e-05, + "loss": 3.2781, + "step": 5796 + }, + { + "epoch": 3.735535858178888, + "grad_norm": 1.9750032076182746, + "learning_rate": 9.865423395605433e-05, + "loss": 4.0053, + "step": 5797 + }, + { + "epoch": 3.736180499597099, + "grad_norm": 1.8987993711173683, + "learning_rate": 9.865376933017704e-05, + "loss": 3.7648, + "step": 5798 + }, + { + "epoch": 3.73682514101531, + "grad_norm": 1.8457127844035437, + "learning_rate": 9.865330462521323e-05, + "loss": 3.6738, + "step": 5799 + }, + { + "epoch": 3.7374697824335215, + "grad_norm": 1.837077138357606, + "learning_rate": 9.865283984116367e-05, + "loss": 3.8117, + "step": 5800 + }, + { + "epoch": 3.7374697824335215, + "eval_loss": 4.054846286773682, + "eval_runtime": 2.973, + "eval_samples_per_second": 33.636, + "eval_steps_per_second": 4.373, + "step": 5800 + }, + { + "epoch": 3.7381144238517323, + "grad_norm": 2.128925972839067, + "learning_rate": 9.86523749780291e-05, + "loss": 3.6039, + "step": 5801 + }, + { + "epoch": 3.7387590652699436, + "grad_norm": 1.8448298487895958, + "learning_rate": 9.865191003581027e-05, + "loss": 3.7547, + "step": 5802 + }, + { + "epoch": 3.7394037066881545, + "grad_norm": 2.5309640937582576, + "learning_rate": 9.865144501450799e-05, + "loss": 3.8916, + "step": 5803 + }, + { + "epoch": 3.7400483481063658, + "grad_norm": 1.387367368659842, + "learning_rate": 9.8650979914123e-05, + "loss": 3.4845, + "step": 5804 + }, + { + "epoch": 3.740692989524577, + "grad_norm": 2.013816446174902, + "learning_rate": 9.865051473465605e-05, + "loss": 3.8931, + "step": 5805 + }, + { + "epoch": 3.741337630942788, + "grad_norm": 1.5662206803124479, + "learning_rate": 9.865004947610793e-05, + "loss": 3.4365, + "step": 5806 + }, + { + "epoch": 3.7419822723609992, + "grad_norm": 1.8845967644620745, + "learning_rate": 9.86495841384794e-05, + "loss": 3.8645, + "step": 5807 + }, + { + "epoch": 3.74262691377921, + "grad_norm": 1.7656293843459392, + "learning_rate": 9.864911872177121e-05, + "loss": 3.6768, + "step": 5808 + }, + { + "epoch": 3.7432715551974214, + "grad_norm": 2.056133792815763, + "learning_rate": 9.864865322598412e-05, + "loss": 3.7929, + "step": 5809 + }, + { + "epoch": 3.7439161966156327, + "grad_norm": 1.7716952751751647, + "learning_rate": 9.864818765111892e-05, + "loss": 3.6132, + "step": 5810 + }, + { + "epoch": 3.7445608380338435, + "grad_norm": 2.1909000443431017, + "learning_rate": 9.864772199717635e-05, + "loss": 3.891, + "step": 5811 + }, + { + "epoch": 3.745205479452055, + "grad_norm": 1.5573995063118393, + "learning_rate": 9.86472562641572e-05, + "loss": 3.8721, + "step": 5812 + }, + { + "epoch": 3.7458501208702657, + "grad_norm": 1.8269098780302753, + "learning_rate": 9.86467904520622e-05, + "loss": 3.6841, + "step": 5813 + }, + { + "epoch": 3.746494762288477, + "grad_norm": 1.541568878555879, + "learning_rate": 9.864632456089214e-05, + "loss": 3.7223, + "step": 5814 + }, + { + "epoch": 3.7471394037066883, + "grad_norm": 1.5454221169286886, + "learning_rate": 9.864585859064779e-05, + "loss": 3.6593, + "step": 5815 + }, + { + "epoch": 3.747784045124899, + "grad_norm": 1.7912129191712687, + "learning_rate": 9.86453925413299e-05, + "loss": 3.5168, + "step": 5816 + }, + { + "epoch": 3.7484286865431105, + "grad_norm": 1.7591477841299175, + "learning_rate": 9.864492641293923e-05, + "loss": 3.795, + "step": 5817 + }, + { + "epoch": 3.7490733279613213, + "grad_norm": 1.6689633825609482, + "learning_rate": 9.864446020547656e-05, + "loss": 3.5028, + "step": 5818 + }, + { + "epoch": 3.7497179693795326, + "grad_norm": 2.017802003503971, + "learning_rate": 9.864399391894267e-05, + "loss": 3.4599, + "step": 5819 + }, + { + "epoch": 3.750362610797744, + "grad_norm": 1.8364104379020487, + "learning_rate": 9.864352755333829e-05, + "loss": 4.0759, + "step": 5820 + }, + { + "epoch": 3.7510072522159548, + "grad_norm": 1.70891193179125, + "learning_rate": 9.86430611086642e-05, + "loss": 3.8484, + "step": 5821 + }, + { + "epoch": 3.751651893634166, + "grad_norm": 1.7412634874623778, + "learning_rate": 9.864259458492118e-05, + "loss": 3.6059, + "step": 5822 + }, + { + "epoch": 3.752296535052377, + "grad_norm": 1.323180491229288, + "learning_rate": 9.864212798210999e-05, + "loss": 3.3568, + "step": 5823 + }, + { + "epoch": 3.7529411764705882, + "grad_norm": 1.8087697624331447, + "learning_rate": 9.864166130023139e-05, + "loss": 3.4551, + "step": 5824 + }, + { + "epoch": 3.7535858178887995, + "grad_norm": 1.8399424087579546, + "learning_rate": 9.864119453928614e-05, + "loss": 3.9598, + "step": 5825 + }, + { + "epoch": 3.7542304593070104, + "grad_norm": 1.3589650919441, + "learning_rate": 9.864072769927502e-05, + "loss": 3.7586, + "step": 5826 + }, + { + "epoch": 3.7548751007252217, + "grad_norm": 1.9062486561598635, + "learning_rate": 9.864026078019879e-05, + "loss": 3.7366, + "step": 5827 + }, + { + "epoch": 3.7555197421434325, + "grad_norm": 1.2269065867090772, + "learning_rate": 9.863979378205822e-05, + "loss": 3.8048, + "step": 5828 + }, + { + "epoch": 3.756164383561644, + "grad_norm": 1.6289067835065174, + "learning_rate": 9.863932670485409e-05, + "loss": 3.896, + "step": 5829 + }, + { + "epoch": 3.756809024979855, + "grad_norm": 1.1492940930648456, + "learning_rate": 9.863885954858713e-05, + "loss": 3.5722, + "step": 5830 + }, + { + "epoch": 3.757453666398066, + "grad_norm": 1.445369956677279, + "learning_rate": 9.863839231325815e-05, + "loss": 3.9668, + "step": 5831 + }, + { + "epoch": 3.7580983078162773, + "grad_norm": 1.4060363924032206, + "learning_rate": 9.863792499886788e-05, + "loss": 3.5591, + "step": 5832 + }, + { + "epoch": 3.758742949234488, + "grad_norm": 1.3882000389725921, + "learning_rate": 9.86374576054171e-05, + "loss": 3.6149, + "step": 5833 + }, + { + "epoch": 3.7593875906526995, + "grad_norm": 1.4049384052662055, + "learning_rate": 9.863699013290661e-05, + "loss": 3.7582, + "step": 5834 + }, + { + "epoch": 3.7600322320709108, + "grad_norm": 1.2491356842949524, + "learning_rate": 9.863652258133713e-05, + "loss": 3.6294, + "step": 5835 + }, + { + "epoch": 3.7606768734891216, + "grad_norm": 1.260648877582725, + "learning_rate": 9.863605495070945e-05, + "loss": 3.2583, + "step": 5836 + }, + { + "epoch": 3.761321514907333, + "grad_norm": 1.2535408787573714, + "learning_rate": 9.863558724102436e-05, + "loss": 3.5495, + "step": 5837 + }, + { + "epoch": 3.7619661563255438, + "grad_norm": 1.2985818056847744, + "learning_rate": 9.863511945228257e-05, + "loss": 3.647, + "step": 5838 + }, + { + "epoch": 3.762610797743755, + "grad_norm": 1.3631460371430069, + "learning_rate": 9.863465158448491e-05, + "loss": 3.8219, + "step": 5839 + }, + { + "epoch": 3.7632554391619664, + "grad_norm": 1.5298516148015557, + "learning_rate": 9.863418363763211e-05, + "loss": 3.5238, + "step": 5840 + }, + { + "epoch": 3.763900080580177, + "grad_norm": 1.3030448428294192, + "learning_rate": 9.863371561172495e-05, + "loss": 3.7209, + "step": 5841 + }, + { + "epoch": 3.7645447219983885, + "grad_norm": 1.2192704747985204, + "learning_rate": 9.863324750676419e-05, + "loss": 3.5724, + "step": 5842 + }, + { + "epoch": 3.7651893634165994, + "grad_norm": 1.3505124275014575, + "learning_rate": 9.863277932275062e-05, + "loss": 3.8931, + "step": 5843 + }, + { + "epoch": 3.7658340048348107, + "grad_norm": 1.2081398565014705, + "learning_rate": 9.8632311059685e-05, + "loss": 3.5913, + "step": 5844 + }, + { + "epoch": 3.766478646253022, + "grad_norm": 1.1278932011555742, + "learning_rate": 9.863184271756808e-05, + "loss": 3.9653, + "step": 5845 + }, + { + "epoch": 3.767123287671233, + "grad_norm": 1.4243980600680881, + "learning_rate": 9.863137429640066e-05, + "loss": 3.7877, + "step": 5846 + }, + { + "epoch": 3.767767929089444, + "grad_norm": 1.6117835078822582, + "learning_rate": 9.86309057961835e-05, + "loss": 3.2437, + "step": 5847 + }, + { + "epoch": 3.768412570507655, + "grad_norm": 1.8381806494256305, + "learning_rate": 9.863043721691733e-05, + "loss": 3.5249, + "step": 5848 + }, + { + "epoch": 3.7690572119258663, + "grad_norm": 1.671219821571838, + "learning_rate": 9.862996855860297e-05, + "loss": 3.7689, + "step": 5849 + }, + { + "epoch": 3.7697018533440776, + "grad_norm": 1.7732477906267388, + "learning_rate": 9.862949982124118e-05, + "loss": 3.5891, + "step": 5850 + }, + { + "epoch": 3.7703464947622884, + "grad_norm": 1.6543354576548612, + "learning_rate": 9.86290310048327e-05, + "loss": 3.6101, + "step": 5851 + }, + { + "epoch": 3.7709911361804997, + "grad_norm": 1.4208050424043186, + "learning_rate": 9.862856210937836e-05, + "loss": 3.4759, + "step": 5852 + }, + { + "epoch": 3.7716357775987106, + "grad_norm": 1.4876348872039153, + "learning_rate": 9.862809313487887e-05, + "loss": 3.5285, + "step": 5853 + }, + { + "epoch": 3.772280419016922, + "grad_norm": 1.5706695189853819, + "learning_rate": 9.862762408133502e-05, + "loss": 3.9719, + "step": 5854 + }, + { + "epoch": 3.772925060435133, + "grad_norm": 1.2430471347765735, + "learning_rate": 9.86271549487476e-05, + "loss": 3.703, + "step": 5855 + }, + { + "epoch": 3.773569701853344, + "grad_norm": 1.4317780078379485, + "learning_rate": 9.862668573711736e-05, + "loss": 3.4694, + "step": 5856 + }, + { + "epoch": 3.7742143432715554, + "grad_norm": 1.6026126881118756, + "learning_rate": 9.862621644644505e-05, + "loss": 3.6712, + "step": 5857 + }, + { + "epoch": 3.774858984689766, + "grad_norm": 1.291774467009232, + "learning_rate": 9.862574707673149e-05, + "loss": 3.6173, + "step": 5858 + }, + { + "epoch": 3.7755036261079775, + "grad_norm": 1.3150863723393489, + "learning_rate": 9.862527762797741e-05, + "loss": 4.0245, + "step": 5859 + }, + { + "epoch": 3.776148267526189, + "grad_norm": 1.325439576947203, + "learning_rate": 9.862480810018361e-05, + "loss": 3.6664, + "step": 5860 + }, + { + "epoch": 3.7767929089443997, + "grad_norm": 1.3826915817469736, + "learning_rate": 9.862433849335084e-05, + "loss": 3.6672, + "step": 5861 + }, + { + "epoch": 3.7774375503626105, + "grad_norm": 1.2780470058057658, + "learning_rate": 9.862386880747988e-05, + "loss": 3.4784, + "step": 5862 + }, + { + "epoch": 3.778082191780822, + "grad_norm": 1.3259223286865998, + "learning_rate": 9.86233990425715e-05, + "loss": 3.5826, + "step": 5863 + }, + { + "epoch": 3.778726833199033, + "grad_norm": 1.5122500950847608, + "learning_rate": 9.862292919862648e-05, + "loss": 3.8667, + "step": 5864 + }, + { + "epoch": 3.7793714746172444, + "grad_norm": 1.3304081302055366, + "learning_rate": 9.862245927564557e-05, + "loss": 3.7398, + "step": 5865 + }, + { + "epoch": 3.7800161160354553, + "grad_norm": 1.5181355018960958, + "learning_rate": 9.862198927362957e-05, + "loss": 3.7827, + "step": 5866 + }, + { + "epoch": 3.780660757453666, + "grad_norm": 1.2618255245121497, + "learning_rate": 9.862151919257923e-05, + "loss": 3.6612, + "step": 5867 + }, + { + "epoch": 3.7813053988718774, + "grad_norm": 1.6114673552711134, + "learning_rate": 9.862104903249534e-05, + "loss": 3.4073, + "step": 5868 + }, + { + "epoch": 3.7819500402900887, + "grad_norm": 1.292988428896257, + "learning_rate": 9.862057879337865e-05, + "loss": 3.8247, + "step": 5869 + }, + { + "epoch": 3.7825946817082996, + "grad_norm": 1.4732309338555052, + "learning_rate": 9.862010847522994e-05, + "loss": 3.659, + "step": 5870 + }, + { + "epoch": 3.783239323126511, + "grad_norm": 1.2875394079464477, + "learning_rate": 9.861963807804999e-05, + "loss": 3.7257, + "step": 5871 + }, + { + "epoch": 3.7838839645447218, + "grad_norm": 1.547473968227625, + "learning_rate": 9.861916760183959e-05, + "loss": 3.6342, + "step": 5872 + }, + { + "epoch": 3.784528605962933, + "grad_norm": 1.3571838799402158, + "learning_rate": 9.861869704659946e-05, + "loss": 4.1632, + "step": 5873 + }, + { + "epoch": 3.7851732473811444, + "grad_norm": 1.655866740644342, + "learning_rate": 9.861822641233042e-05, + "loss": 3.4072, + "step": 5874 + }, + { + "epoch": 3.785817888799355, + "grad_norm": 1.614555940056868, + "learning_rate": 9.861775569903324e-05, + "loss": 3.7277, + "step": 5875 + }, + { + "epoch": 3.7864625302175665, + "grad_norm": 1.601977197589586, + "learning_rate": 9.861728490670865e-05, + "loss": 3.5986, + "step": 5876 + }, + { + "epoch": 3.7871071716357774, + "grad_norm": 1.657675482882959, + "learning_rate": 9.861681403535749e-05, + "loss": 3.768, + "step": 5877 + }, + { + "epoch": 3.7877518130539887, + "grad_norm": 1.8564636827729006, + "learning_rate": 9.861634308498046e-05, + "loss": 3.5002, + "step": 5878 + }, + { + "epoch": 3.7883964544722, + "grad_norm": 1.9844415441782297, + "learning_rate": 9.86158720555784e-05, + "loss": 3.761, + "step": 5879 + }, + { + "epoch": 3.789041095890411, + "grad_norm": 1.878317039027783, + "learning_rate": 9.861540094715204e-05, + "loss": 3.8265, + "step": 5880 + }, + { + "epoch": 3.789685737308622, + "grad_norm": 1.1429524094092032, + "learning_rate": 9.861492975970217e-05, + "loss": 3.746, + "step": 5881 + }, + { + "epoch": 3.790330378726833, + "grad_norm": 1.7996567344532912, + "learning_rate": 9.861445849322956e-05, + "loss": 3.4432, + "step": 5882 + }, + { + "epoch": 3.7909750201450443, + "grad_norm": 1.1717788364621302, + "learning_rate": 9.8613987147735e-05, + "loss": 3.9155, + "step": 5883 + }, + { + "epoch": 3.7916196615632556, + "grad_norm": 1.5326808480808198, + "learning_rate": 9.861351572321923e-05, + "loss": 3.4141, + "step": 5884 + }, + { + "epoch": 3.7922643029814664, + "grad_norm": 1.1284472060904673, + "learning_rate": 9.861304421968308e-05, + "loss": 3.8526, + "step": 5885 + }, + { + "epoch": 3.7929089443996777, + "grad_norm": 1.3195219362986317, + "learning_rate": 9.861257263712725e-05, + "loss": 3.7773, + "step": 5886 + }, + { + "epoch": 3.7935535858178886, + "grad_norm": 1.3285378881806253, + "learning_rate": 9.861210097555257e-05, + "loss": 3.6022, + "step": 5887 + }, + { + "epoch": 3.7941982272361, + "grad_norm": 1.8220018536551936, + "learning_rate": 9.861162923495979e-05, + "loss": 3.4611, + "step": 5888 + }, + { + "epoch": 3.794842868654311, + "grad_norm": 1.4794881769593635, + "learning_rate": 9.86111574153497e-05, + "loss": 3.2945, + "step": 5889 + }, + { + "epoch": 3.795487510072522, + "grad_norm": 1.6855428310965939, + "learning_rate": 9.861068551672306e-05, + "loss": 3.7821, + "step": 5890 + }, + { + "epoch": 3.7961321514907334, + "grad_norm": 1.167114870553028, + "learning_rate": 9.861021353908067e-05, + "loss": 3.7486, + "step": 5891 + }, + { + "epoch": 3.796776792908944, + "grad_norm": 1.575576586532042, + "learning_rate": 9.860974148242326e-05, + "loss": 3.3992, + "step": 5892 + }, + { + "epoch": 3.7974214343271555, + "grad_norm": 1.3270023382690685, + "learning_rate": 9.860926934675164e-05, + "loss": 3.775, + "step": 5893 + }, + { + "epoch": 3.798066075745367, + "grad_norm": 1.225970807127294, + "learning_rate": 9.860879713206659e-05, + "loss": 3.6561, + "step": 5894 + }, + { + "epoch": 3.7987107171635777, + "grad_norm": 1.577884906185689, + "learning_rate": 9.860832483836887e-05, + "loss": 3.8083, + "step": 5895 + }, + { + "epoch": 3.799355358581789, + "grad_norm": 1.5041372028856521, + "learning_rate": 9.860785246565927e-05, + "loss": 3.7183, + "step": 5896 + }, + { + "epoch": 3.8, + "grad_norm": 1.4358369377513287, + "learning_rate": 9.860738001393854e-05, + "loss": 3.7275, + "step": 5897 + }, + { + "epoch": 3.800644641418211, + "grad_norm": 1.9241327224749007, + "learning_rate": 9.860690748320747e-05, + "loss": 3.7009, + "step": 5898 + }, + { + "epoch": 3.8012892828364224, + "grad_norm": 1.5145563187527533, + "learning_rate": 9.860643487346685e-05, + "loss": 4.0218, + "step": 5899 + }, + { + "epoch": 3.8019339242546333, + "grad_norm": 1.3851277294333566, + "learning_rate": 9.860596218471744e-05, + "loss": 3.8477, + "step": 5900 + }, + { + "epoch": 3.8019339242546333, + "eval_loss": 4.034081935882568, + "eval_runtime": 2.977, + "eval_samples_per_second": 33.591, + "eval_steps_per_second": 4.367, + "step": 5900 + }, + { + "epoch": 3.8025785656728446, + "grad_norm": 1.3853760410281029, + "learning_rate": 9.860548941696002e-05, + "loss": 3.7637, + "step": 5901 + }, + { + "epoch": 3.8032232070910554, + "grad_norm": 1.4438006596923125, + "learning_rate": 9.860501657019538e-05, + "loss": 3.6688, + "step": 5902 + }, + { + "epoch": 3.8038678485092667, + "grad_norm": 1.1790743025500907, + "learning_rate": 9.860454364442424e-05, + "loss": 3.7727, + "step": 5903 + }, + { + "epoch": 3.804512489927478, + "grad_norm": 1.5516700457842119, + "learning_rate": 9.860407063964746e-05, + "loss": 3.872, + "step": 5904 + }, + { + "epoch": 3.805157131345689, + "grad_norm": 1.3769237518435204, + "learning_rate": 9.860359755586576e-05, + "loss": 3.4866, + "step": 5905 + }, + { + "epoch": 3.8058017727639, + "grad_norm": 1.337021592601854, + "learning_rate": 9.860312439307993e-05, + "loss": 3.8628, + "step": 5906 + }, + { + "epoch": 3.806446414182111, + "grad_norm": 1.4049668084473077, + "learning_rate": 9.860265115129077e-05, + "loss": 3.9945, + "step": 5907 + }, + { + "epoch": 3.8070910556003223, + "grad_norm": 1.2089747487976767, + "learning_rate": 9.860217783049903e-05, + "loss": 3.7522, + "step": 5908 + }, + { + "epoch": 3.8077356970185336, + "grad_norm": 1.1060727737421936, + "learning_rate": 9.860170443070549e-05, + "loss": 3.6031, + "step": 5909 + }, + { + "epoch": 3.8083803384367445, + "grad_norm": 1.4251941601591753, + "learning_rate": 9.860123095191094e-05, + "loss": 3.51, + "step": 5910 + }, + { + "epoch": 3.809024979854956, + "grad_norm": 1.5484658310478894, + "learning_rate": 9.860075739411615e-05, + "loss": 3.2802, + "step": 5911 + }, + { + "epoch": 3.8096696212731667, + "grad_norm": 1.528373948750025, + "learning_rate": 9.860028375732189e-05, + "loss": 3.5197, + "step": 5912 + }, + { + "epoch": 3.810314262691378, + "grad_norm": 1.29411423950175, + "learning_rate": 9.859981004152897e-05, + "loss": 3.9662, + "step": 5913 + }, + { + "epoch": 3.8109589041095893, + "grad_norm": 1.6885413210734048, + "learning_rate": 9.85993362467381e-05, + "loss": 3.7866, + "step": 5914 + }, + { + "epoch": 3.8116035455278, + "grad_norm": 1.375050263984345, + "learning_rate": 9.859886237295013e-05, + "loss": 3.546, + "step": 5915 + }, + { + "epoch": 3.8122481869460114, + "grad_norm": 1.58336791178992, + "learning_rate": 9.859838842016582e-05, + "loss": 3.5184, + "step": 5916 + }, + { + "epoch": 3.8128928283642223, + "grad_norm": 1.330080499507435, + "learning_rate": 9.859791438838591e-05, + "loss": 4.1223, + "step": 5917 + }, + { + "epoch": 3.8135374697824336, + "grad_norm": 1.4667278223381262, + "learning_rate": 9.859744027761124e-05, + "loss": 3.7868, + "step": 5918 + }, + { + "epoch": 3.814182111200645, + "grad_norm": 1.313438803881905, + "learning_rate": 9.859696608784254e-05, + "loss": 3.3575, + "step": 5919 + }, + { + "epoch": 3.8148267526188557, + "grad_norm": 1.4958250179621588, + "learning_rate": 9.859649181908061e-05, + "loss": 3.2911, + "step": 5920 + }, + { + "epoch": 3.815471394037067, + "grad_norm": 1.7543382101651441, + "learning_rate": 9.859601747132621e-05, + "loss": 3.8949, + "step": 5921 + }, + { + "epoch": 3.816116035455278, + "grad_norm": 1.446615586386178, + "learning_rate": 9.859554304458014e-05, + "loss": 3.622, + "step": 5922 + }, + { + "epoch": 3.816760676873489, + "grad_norm": 1.6378688961198833, + "learning_rate": 9.859506853884317e-05, + "loss": 3.7205, + "step": 5923 + }, + { + "epoch": 3.8174053182917005, + "grad_norm": 1.535696956641445, + "learning_rate": 9.859459395411609e-05, + "loss": 3.6258, + "step": 5924 + }, + { + "epoch": 3.8180499597099113, + "grad_norm": 1.3957669957195777, + "learning_rate": 9.859411929039967e-05, + "loss": 3.7116, + "step": 5925 + }, + { + "epoch": 3.8186946011281226, + "grad_norm": 1.7404912519898195, + "learning_rate": 9.859364454769469e-05, + "loss": 3.9774, + "step": 5926 + }, + { + "epoch": 3.8193392425463335, + "grad_norm": 1.371454919081753, + "learning_rate": 9.859316972600193e-05, + "loss": 3.6028, + "step": 5927 + }, + { + "epoch": 3.819983883964545, + "grad_norm": 1.3287489078191734, + "learning_rate": 9.859269482532218e-05, + "loss": 3.3863, + "step": 5928 + }, + { + "epoch": 3.820628525382756, + "grad_norm": 1.3194898628083453, + "learning_rate": 9.859221984565617e-05, + "loss": 3.9114, + "step": 5929 + }, + { + "epoch": 3.821273166800967, + "grad_norm": 1.2209510617359476, + "learning_rate": 9.859174478700477e-05, + "loss": 3.4586, + "step": 5930 + }, + { + "epoch": 3.821917808219178, + "grad_norm": 1.1846985426084882, + "learning_rate": 9.859126964936868e-05, + "loss": 3.7852, + "step": 5931 + }, + { + "epoch": 3.822562449637389, + "grad_norm": 1.1694553240160952, + "learning_rate": 9.859079443274871e-05, + "loss": 3.5344, + "step": 5932 + }, + { + "epoch": 3.8232070910556004, + "grad_norm": 1.445950335426683, + "learning_rate": 9.859031913714565e-05, + "loss": 3.9789, + "step": 5933 + }, + { + "epoch": 3.8238517324738117, + "grad_norm": 1.421771312130879, + "learning_rate": 9.858984376256028e-05, + "loss": 3.8802, + "step": 5934 + }, + { + "epoch": 3.8244963738920226, + "grad_norm": 1.500239152978125, + "learning_rate": 9.858936830899337e-05, + "loss": 3.7655, + "step": 5935 + }, + { + "epoch": 3.8251410153102334, + "grad_norm": 1.4343107450901706, + "learning_rate": 9.858889277644569e-05, + "loss": 3.8203, + "step": 5936 + }, + { + "epoch": 3.8257856567284447, + "grad_norm": 1.4251999447449106, + "learning_rate": 9.858841716491805e-05, + "loss": 4.1274, + "step": 5937 + }, + { + "epoch": 3.826430298146656, + "grad_norm": 1.4347796860045074, + "learning_rate": 9.85879414744112e-05, + "loss": 4.0317, + "step": 5938 + }, + { + "epoch": 3.827074939564867, + "grad_norm": 1.2906100034412498, + "learning_rate": 9.858746570492594e-05, + "loss": 3.6478, + "step": 5939 + }, + { + "epoch": 3.827719580983078, + "grad_norm": 1.4906116455432383, + "learning_rate": 9.858698985646307e-05, + "loss": 3.6472, + "step": 5940 + }, + { + "epoch": 3.828364222401289, + "grad_norm": 1.3530063795232872, + "learning_rate": 9.858651392902333e-05, + "loss": 3.4702, + "step": 5941 + }, + { + "epoch": 3.8290088638195003, + "grad_norm": 1.5315628983532414, + "learning_rate": 9.858603792260753e-05, + "loss": 3.7362, + "step": 5942 + }, + { + "epoch": 3.8296535052377116, + "grad_norm": 1.7177026036462477, + "learning_rate": 9.858556183721643e-05, + "loss": 3.4025, + "step": 5943 + }, + { + "epoch": 3.8302981466559225, + "grad_norm": 1.8335544524443166, + "learning_rate": 9.858508567285083e-05, + "loss": 3.7441, + "step": 5944 + }, + { + "epoch": 3.830942788074134, + "grad_norm": 1.2994033921896435, + "learning_rate": 9.858460942951152e-05, + "loss": 3.8829, + "step": 5945 + }, + { + "epoch": 3.8315874294923447, + "grad_norm": 1.1945560933077821, + "learning_rate": 9.858413310719924e-05, + "loss": 3.9548, + "step": 5946 + }, + { + "epoch": 3.832232070910556, + "grad_norm": 1.182630635868447, + "learning_rate": 9.858365670591483e-05, + "loss": 4.0227, + "step": 5947 + }, + { + "epoch": 3.8328767123287673, + "grad_norm": 1.3946288883165219, + "learning_rate": 9.858318022565905e-05, + "loss": 3.7575, + "step": 5948 + }, + { + "epoch": 3.833521353746978, + "grad_norm": 1.3601268799278738, + "learning_rate": 9.858270366643266e-05, + "loss": 3.5508, + "step": 5949 + }, + { + "epoch": 3.8341659951651894, + "grad_norm": 1.7000578411215412, + "learning_rate": 9.858222702823646e-05, + "loss": 3.7599, + "step": 5950 + }, + { + "epoch": 3.8348106365834003, + "grad_norm": 1.7577708867317554, + "learning_rate": 9.858175031107122e-05, + "loss": 3.5763, + "step": 5951 + }, + { + "epoch": 3.8354552780016116, + "grad_norm": 2.071282434548208, + "learning_rate": 9.858127351493776e-05, + "loss": 3.1833, + "step": 5952 + }, + { + "epoch": 3.836099919419823, + "grad_norm": 2.4212834331399473, + "learning_rate": 9.858079663983682e-05, + "loss": 3.7355, + "step": 5953 + }, + { + "epoch": 3.8367445608380337, + "grad_norm": 1.487844497839605, + "learning_rate": 9.858031968576921e-05, + "loss": 3.6516, + "step": 5954 + }, + { + "epoch": 3.837389202256245, + "grad_norm": 2.1732203579458043, + "learning_rate": 9.85798426527357e-05, + "loss": 4.0128, + "step": 5955 + }, + { + "epoch": 3.838033843674456, + "grad_norm": 1.7286038644372264, + "learning_rate": 9.857936554073708e-05, + "loss": 3.6537, + "step": 5956 + }, + { + "epoch": 3.838678485092667, + "grad_norm": 2.0138794657932597, + "learning_rate": 9.857888834977413e-05, + "loss": 3.3571, + "step": 5957 + }, + { + "epoch": 3.8393231265108785, + "grad_norm": 2.05295111902606, + "learning_rate": 9.857841107984764e-05, + "loss": 3.6757, + "step": 5958 + }, + { + "epoch": 3.8399677679290893, + "grad_norm": 1.8004423692664746, + "learning_rate": 9.857793373095839e-05, + "loss": 3.6687, + "step": 5959 + }, + { + "epoch": 3.8406124093473006, + "grad_norm": 1.9116725960498733, + "learning_rate": 9.857745630310716e-05, + "loss": 3.6388, + "step": 5960 + }, + { + "epoch": 3.8412570507655115, + "grad_norm": 1.8435413522503483, + "learning_rate": 9.857697879629473e-05, + "loss": 3.5397, + "step": 5961 + }, + { + "epoch": 3.841901692183723, + "grad_norm": 1.5520350008384685, + "learning_rate": 9.85765012105219e-05, + "loss": 3.6143, + "step": 5962 + }, + { + "epoch": 3.842546333601934, + "grad_norm": 1.6715660456308503, + "learning_rate": 9.857602354578945e-05, + "loss": 3.5027, + "step": 5963 + }, + { + "epoch": 3.843190975020145, + "grad_norm": 2.004241126749341, + "learning_rate": 9.857554580209815e-05, + "loss": 3.7495, + "step": 5964 + }, + { + "epoch": 3.8438356164383563, + "grad_norm": 1.3053643885177455, + "learning_rate": 9.85750679794488e-05, + "loss": 3.5865, + "step": 5965 + }, + { + "epoch": 3.844480257856567, + "grad_norm": 1.837994767632569, + "learning_rate": 9.857459007784217e-05, + "loss": 3.6755, + "step": 5966 + }, + { + "epoch": 3.8451248992747784, + "grad_norm": 1.5791952857875091, + "learning_rate": 9.857411209727909e-05, + "loss": 3.9692, + "step": 5967 + }, + { + "epoch": 3.8457695406929897, + "grad_norm": 2.068489127455079, + "learning_rate": 9.857363403776028e-05, + "loss": 3.6934, + "step": 5968 + }, + { + "epoch": 3.8464141821112006, + "grad_norm": 1.5606043292130833, + "learning_rate": 9.857315589928656e-05, + "loss": 3.4962, + "step": 5969 + }, + { + "epoch": 3.847058823529412, + "grad_norm": 2.631570577774557, + "learning_rate": 9.85726776818587e-05, + "loss": 3.6223, + "step": 5970 + }, + { + "epoch": 3.8477034649476227, + "grad_norm": 1.5447754198274823, + "learning_rate": 9.857219938547752e-05, + "loss": 3.8811, + "step": 5971 + }, + { + "epoch": 3.848348106365834, + "grad_norm": 2.569724543388624, + "learning_rate": 9.857172101014376e-05, + "loss": 3.7487, + "step": 5972 + }, + { + "epoch": 3.8489927477840453, + "grad_norm": 1.7269028387602816, + "learning_rate": 9.857124255585824e-05, + "loss": 3.4277, + "step": 5973 + }, + { + "epoch": 3.849637389202256, + "grad_norm": 2.7702999929118772, + "learning_rate": 9.857076402262173e-05, + "loss": 3.8228, + "step": 5974 + }, + { + "epoch": 3.8502820306204675, + "grad_norm": 1.2739787442615362, + "learning_rate": 9.857028541043502e-05, + "loss": 3.5994, + "step": 5975 + }, + { + "epoch": 3.8509266720386783, + "grad_norm": 2.135727784100517, + "learning_rate": 9.856980671929889e-05, + "loss": 3.4756, + "step": 5976 + }, + { + "epoch": 3.8515713134568896, + "grad_norm": 1.660804122136777, + "learning_rate": 9.856932794921413e-05, + "loss": 3.4808, + "step": 5977 + }, + { + "epoch": 3.852215954875101, + "grad_norm": 2.305267679284918, + "learning_rate": 9.856884910018153e-05, + "loss": 3.7954, + "step": 5978 + }, + { + "epoch": 3.852860596293312, + "grad_norm": 1.4825088679116145, + "learning_rate": 9.856837017220187e-05, + "loss": 3.4032, + "step": 5979 + }, + { + "epoch": 3.853505237711523, + "grad_norm": 2.2162730769398524, + "learning_rate": 9.856789116527594e-05, + "loss": 3.4663, + "step": 5980 + }, + { + "epoch": 3.854149879129734, + "grad_norm": 1.8579795882326133, + "learning_rate": 9.856741207940453e-05, + "loss": 3.8394, + "step": 5981 + }, + { + "epoch": 3.8547945205479452, + "grad_norm": 1.815308854476595, + "learning_rate": 9.856693291458841e-05, + "loss": 3.7469, + "step": 5982 + }, + { + "epoch": 3.8554391619661565, + "grad_norm": 1.7337822371006042, + "learning_rate": 9.856645367082842e-05, + "loss": 3.6764, + "step": 5983 + }, + { + "epoch": 3.8560838033843674, + "grad_norm": 1.6114687426647099, + "learning_rate": 9.856597434812527e-05, + "loss": 3.6988, + "step": 5984 + }, + { + "epoch": 3.8567284448025787, + "grad_norm": 1.2716519913530633, + "learning_rate": 9.85654949464798e-05, + "loss": 3.4952, + "step": 5985 + }, + { + "epoch": 3.8573730862207896, + "grad_norm": 1.4494999707770715, + "learning_rate": 9.856501546589277e-05, + "loss": 3.6669, + "step": 5986 + }, + { + "epoch": 3.858017727639001, + "grad_norm": 1.4675812183400119, + "learning_rate": 9.856453590636498e-05, + "loss": 3.8259, + "step": 5987 + }, + { + "epoch": 3.858662369057212, + "grad_norm": 1.3399891318000385, + "learning_rate": 9.856405626789724e-05, + "loss": 3.3745, + "step": 5988 + }, + { + "epoch": 3.859307010475423, + "grad_norm": 1.7267827803180695, + "learning_rate": 9.85635765504903e-05, + "loss": 3.7575, + "step": 5989 + }, + { + "epoch": 3.8599516518936343, + "grad_norm": 1.314689808411903, + "learning_rate": 9.856309675414496e-05, + "loss": 4.0816, + "step": 5990 + }, + { + "epoch": 3.860596293311845, + "grad_norm": 1.6814083212427144, + "learning_rate": 9.856261687886203e-05, + "loss": 3.6465, + "step": 5991 + }, + { + "epoch": 3.8612409347300565, + "grad_norm": 1.4047185271676583, + "learning_rate": 9.856213692464226e-05, + "loss": 3.569, + "step": 5992 + }, + { + "epoch": 3.8618855761482678, + "grad_norm": 1.7231809668237221, + "learning_rate": 9.856165689148647e-05, + "loss": 3.3422, + "step": 5993 + }, + { + "epoch": 3.8625302175664786, + "grad_norm": 1.364526314115813, + "learning_rate": 9.856117677939542e-05, + "loss": 3.6104, + "step": 5994 + }, + { + "epoch": 3.86317485898469, + "grad_norm": 1.831211166819141, + "learning_rate": 9.856069658836993e-05, + "loss": 3.4163, + "step": 5995 + }, + { + "epoch": 3.863819500402901, + "grad_norm": 1.2565347428710738, + "learning_rate": 9.856021631841077e-05, + "loss": 3.6491, + "step": 5996 + }, + { + "epoch": 3.864464141821112, + "grad_norm": 1.455941963089679, + "learning_rate": 9.855973596951872e-05, + "loss": 3.8102, + "step": 5997 + }, + { + "epoch": 3.8651087832393234, + "grad_norm": 1.2688553833289176, + "learning_rate": 9.855925554169459e-05, + "loss": 3.6097, + "step": 5998 + }, + { + "epoch": 3.8657534246575342, + "grad_norm": 1.5698115278186233, + "learning_rate": 9.855877503493917e-05, + "loss": 3.8569, + "step": 5999 + }, + { + "epoch": 3.866398066075745, + "grad_norm": 1.4941607852277061, + "learning_rate": 9.855829444925322e-05, + "loss": 3.8886, + "step": 6000 + }, + { + "epoch": 3.866398066075745, + "eval_loss": 4.0444769859313965, + "eval_runtime": 2.9726, + "eval_samples_per_second": 33.641, + "eval_steps_per_second": 4.373, + "step": 6000 + }, + { + "epoch": 3.8670427074939564, + "grad_norm": 1.1848867760810364, + "learning_rate": 9.855781378463755e-05, + "loss": 3.367, + "step": 6001 + }, + { + "epoch": 3.8676873489121677, + "grad_norm": 1.485904267994971, + "learning_rate": 9.855733304109297e-05, + "loss": 3.7498, + "step": 6002 + }, + { + "epoch": 3.868331990330379, + "grad_norm": 1.9019566921810525, + "learning_rate": 9.855685221862023e-05, + "loss": 3.662, + "step": 6003 + }, + { + "epoch": 3.86897663174859, + "grad_norm": 1.3134720958935497, + "learning_rate": 9.855637131722015e-05, + "loss": 3.7503, + "step": 6004 + }, + { + "epoch": 3.8696212731668007, + "grad_norm": 1.5300138469473203, + "learning_rate": 9.855589033689349e-05, + "loss": 3.7471, + "step": 6005 + }, + { + "epoch": 3.870265914585012, + "grad_norm": 1.395545668831909, + "learning_rate": 9.855540927764106e-05, + "loss": 3.6059, + "step": 6006 + }, + { + "epoch": 3.8709105560032233, + "grad_norm": 70.52716062928022, + "learning_rate": 9.855492813946366e-05, + "loss": 3.5105, + "step": 6007 + }, + { + "epoch": 3.871555197421434, + "grad_norm": 1.9804558342354033, + "learning_rate": 9.855444692236205e-05, + "loss": 3.6797, + "step": 6008 + }, + { + "epoch": 3.8721998388396455, + "grad_norm": 1.3985946784992298, + "learning_rate": 9.855396562633705e-05, + "loss": 3.5405, + "step": 6009 + }, + { + "epoch": 3.8728444802578563, + "grad_norm": 1.3569550057611683, + "learning_rate": 9.855348425138941e-05, + "loss": 3.656, + "step": 6010 + }, + { + "epoch": 3.8734891216760676, + "grad_norm": 1.4743467880493506, + "learning_rate": 9.855300279751998e-05, + "loss": 3.6365, + "step": 6011 + }, + { + "epoch": 3.874133763094279, + "grad_norm": 1.4816742706678208, + "learning_rate": 9.855252126472952e-05, + "loss": 3.6917, + "step": 6012 + }, + { + "epoch": 3.87477840451249, + "grad_norm": 1.4070445231138127, + "learning_rate": 9.855203965301881e-05, + "loss": 3.7598, + "step": 6013 + }, + { + "epoch": 3.875423045930701, + "grad_norm": 1.2144980839381967, + "learning_rate": 9.855155796238864e-05, + "loss": 3.7952, + "step": 6014 + }, + { + "epoch": 3.876067687348912, + "grad_norm": 1.6166122605036362, + "learning_rate": 9.855107619283984e-05, + "loss": 3.2789, + "step": 6015 + }, + { + "epoch": 3.8767123287671232, + "grad_norm": 2.179793499596964, + "learning_rate": 9.855059434437315e-05, + "loss": 3.5854, + "step": 6016 + }, + { + "epoch": 3.8773569701853345, + "grad_norm": 1.5785059946178002, + "learning_rate": 9.85501124169894e-05, + "loss": 3.5519, + "step": 6017 + }, + { + "epoch": 3.8780016116035454, + "grad_norm": 1.342353541589148, + "learning_rate": 9.854963041068936e-05, + "loss": 3.4764, + "step": 6018 + }, + { + "epoch": 3.8786462530217567, + "grad_norm": 1.619631310005825, + "learning_rate": 9.85491483254738e-05, + "loss": 3.9094, + "step": 6019 + }, + { + "epoch": 3.8792908944399676, + "grad_norm": 1.218327515760784, + "learning_rate": 9.854866616134358e-05, + "loss": 3.6665, + "step": 6020 + }, + { + "epoch": 3.879935535858179, + "grad_norm": 1.6359176822072912, + "learning_rate": 9.854818391829943e-05, + "loss": 3.8696, + "step": 6021 + }, + { + "epoch": 3.88058017727639, + "grad_norm": 1.329732875964443, + "learning_rate": 9.854770159634217e-05, + "loss": 3.4662, + "step": 6022 + }, + { + "epoch": 3.881224818694601, + "grad_norm": 1.5778886888979335, + "learning_rate": 9.85472191954726e-05, + "loss": 3.5803, + "step": 6023 + }, + { + "epoch": 3.8818694601128123, + "grad_norm": 1.7374029168233547, + "learning_rate": 9.854673671569148e-05, + "loss": 3.7092, + "step": 6024 + }, + { + "epoch": 3.882514101531023, + "grad_norm": 1.4205715385833098, + "learning_rate": 9.854625415699963e-05, + "loss": 3.7607, + "step": 6025 + }, + { + "epoch": 3.8831587429492345, + "grad_norm": 1.8286622777527908, + "learning_rate": 9.854577151939782e-05, + "loss": 3.7058, + "step": 6026 + }, + { + "epoch": 3.8838033843674458, + "grad_norm": 1.4921724627715256, + "learning_rate": 9.854528880288686e-05, + "loss": 3.4968, + "step": 6027 + }, + { + "epoch": 3.8844480257856566, + "grad_norm": 1.8235856411039644, + "learning_rate": 9.854480600746757e-05, + "loss": 3.3734, + "step": 6028 + }, + { + "epoch": 3.885092667203868, + "grad_norm": 1.3432800722513183, + "learning_rate": 9.854432313314066e-05, + "loss": 3.9712, + "step": 6029 + }, + { + "epoch": 3.885737308622079, + "grad_norm": 1.5079111741577442, + "learning_rate": 9.8543840179907e-05, + "loss": 3.6306, + "step": 6030 + }, + { + "epoch": 3.88638195004029, + "grad_norm": 1.4681094765631222, + "learning_rate": 9.854335714776736e-05, + "loss": 3.7012, + "step": 6031 + }, + { + "epoch": 3.8870265914585014, + "grad_norm": 1.3955887890291345, + "learning_rate": 9.854287403672252e-05, + "loss": 3.5704, + "step": 6032 + }, + { + "epoch": 3.8876712328767122, + "grad_norm": 1.4442194774302552, + "learning_rate": 9.854239084677329e-05, + "loss": 3.3963, + "step": 6033 + }, + { + "epoch": 3.8883158742949235, + "grad_norm": 1.7388655146821976, + "learning_rate": 9.854190757792047e-05, + "loss": 3.8019, + "step": 6034 + }, + { + "epoch": 3.8889605157131344, + "grad_norm": 1.28710259852592, + "learning_rate": 9.854142423016484e-05, + "loss": 3.7068, + "step": 6035 + }, + { + "epoch": 3.8896051571313457, + "grad_norm": 1.69331426953556, + "learning_rate": 9.854094080350718e-05, + "loss": 3.6211, + "step": 6036 + }, + { + "epoch": 3.890249798549557, + "grad_norm": 1.242368408613759, + "learning_rate": 9.854045729794829e-05, + "loss": 3.8474, + "step": 6037 + }, + { + "epoch": 3.890894439967768, + "grad_norm": 1.6648545572440716, + "learning_rate": 9.8539973713489e-05, + "loss": 3.7464, + "step": 6038 + }, + { + "epoch": 3.891539081385979, + "grad_norm": 1.5098375560434207, + "learning_rate": 9.853949005013007e-05, + "loss": 3.672, + "step": 6039 + }, + { + "epoch": 3.89218372280419, + "grad_norm": 1.4806313551511807, + "learning_rate": 9.85390063078723e-05, + "loss": 3.7068, + "step": 6040 + }, + { + "epoch": 3.8928283642224013, + "grad_norm": 1.2811537368363897, + "learning_rate": 9.853852248671649e-05, + "loss": 3.4745, + "step": 6041 + }, + { + "epoch": 3.8934730056406126, + "grad_norm": 2.0916928608696446, + "learning_rate": 9.853803858666343e-05, + "loss": 3.6865, + "step": 6042 + }, + { + "epoch": 3.8941176470588235, + "grad_norm": 1.55699209997959, + "learning_rate": 9.853755460771392e-05, + "loss": 3.7934, + "step": 6043 + }, + { + "epoch": 3.8947622884770348, + "grad_norm": 1.2961518084191177, + "learning_rate": 9.853707054986873e-05, + "loss": 3.6987, + "step": 6044 + }, + { + "epoch": 3.8954069298952456, + "grad_norm": 1.362526905416283, + "learning_rate": 9.85365864131287e-05, + "loss": 3.6197, + "step": 6045 + }, + { + "epoch": 3.896051571313457, + "grad_norm": 1.3421568327476823, + "learning_rate": 9.85361021974946e-05, + "loss": 3.5829, + "step": 6046 + }, + { + "epoch": 3.896696212731668, + "grad_norm": 1.2007412148980803, + "learning_rate": 9.85356179029672e-05, + "loss": 3.9754, + "step": 6047 + }, + { + "epoch": 3.897340854149879, + "grad_norm": 1.2548860195790243, + "learning_rate": 9.853513352954734e-05, + "loss": 3.7483, + "step": 6048 + }, + { + "epoch": 3.8979854955680904, + "grad_norm": 1.7470814082261128, + "learning_rate": 9.853464907723581e-05, + "loss": 3.7941, + "step": 6049 + }, + { + "epoch": 3.8986301369863012, + "grad_norm": 1.6393175105597542, + "learning_rate": 9.853416454603336e-05, + "loss": 3.7831, + "step": 6050 + }, + { + "epoch": 3.8992747784045125, + "grad_norm": 1.482487131988384, + "learning_rate": 9.853367993594086e-05, + "loss": 3.9123, + "step": 6051 + }, + { + "epoch": 3.899919419822724, + "grad_norm": 1.3221951507001521, + "learning_rate": 9.853319524695905e-05, + "loss": 3.6046, + "step": 6052 + }, + { + "epoch": 3.9005640612409347, + "grad_norm": 1.8582154975180345, + "learning_rate": 9.853271047908873e-05, + "loss": 3.5356, + "step": 6053 + }, + { + "epoch": 3.901208702659146, + "grad_norm": 1.248716125525672, + "learning_rate": 9.853222563233072e-05, + "loss": 3.5885, + "step": 6054 + }, + { + "epoch": 3.901853344077357, + "grad_norm": 1.4661077508904885, + "learning_rate": 9.853174070668579e-05, + "loss": 3.3913, + "step": 6055 + }, + { + "epoch": 3.902497985495568, + "grad_norm": 1.6630478454787174, + "learning_rate": 9.853125570215473e-05, + "loss": 3.7835, + "step": 6056 + }, + { + "epoch": 3.9031426269137794, + "grad_norm": 1.2329551041626239, + "learning_rate": 9.853077061873838e-05, + "loss": 3.6741, + "step": 6057 + }, + { + "epoch": 3.9037872683319903, + "grad_norm": 1.9189005074895067, + "learning_rate": 9.853028545643753e-05, + "loss": 3.6682, + "step": 6058 + }, + { + "epoch": 3.9044319097502016, + "grad_norm": 1.471154133593654, + "learning_rate": 9.852980021525294e-05, + "loss": 3.9459, + "step": 6059 + }, + { + "epoch": 3.9050765511684125, + "grad_norm": 1.4423518496612666, + "learning_rate": 9.852931489518543e-05, + "loss": 3.8598, + "step": 6060 + }, + { + "epoch": 3.9057211925866238, + "grad_norm": 1.5152053917383985, + "learning_rate": 9.85288294962358e-05, + "loss": 3.777, + "step": 6061 + }, + { + "epoch": 3.906365834004835, + "grad_norm": 1.1877730452661979, + "learning_rate": 9.852834401840482e-05, + "loss": 3.8236, + "step": 6062 + }, + { + "epoch": 3.907010475423046, + "grad_norm": 1.6898777327917172, + "learning_rate": 9.852785846169332e-05, + "loss": 3.6219, + "step": 6063 + }, + { + "epoch": 3.907655116841257, + "grad_norm": 1.4818460393897308, + "learning_rate": 9.852737282610209e-05, + "loss": 3.9906, + "step": 6064 + }, + { + "epoch": 3.908299758259468, + "grad_norm": 2.4943191520251293, + "learning_rate": 9.852688711163191e-05, + "loss": 3.7613, + "step": 6065 + }, + { + "epoch": 3.9089443996776794, + "grad_norm": 1.4418679635654852, + "learning_rate": 9.852640131828362e-05, + "loss": 3.9001, + "step": 6066 + }, + { + "epoch": 3.9095890410958907, + "grad_norm": 1.0412681155271082, + "learning_rate": 9.852591544605797e-05, + "loss": 3.7969, + "step": 6067 + }, + { + "epoch": 3.9102336825141015, + "grad_norm": 1.3122850750794892, + "learning_rate": 9.852542949495576e-05, + "loss": 3.503, + "step": 6068 + }, + { + "epoch": 3.9108783239323124, + "grad_norm": 1.318353042650765, + "learning_rate": 9.852494346497784e-05, + "loss": 3.4835, + "step": 6069 + }, + { + "epoch": 3.9115229653505237, + "grad_norm": 1.7803539815791027, + "learning_rate": 9.852445735612496e-05, + "loss": 3.8776, + "step": 6070 + }, + { + "epoch": 3.912167606768735, + "grad_norm": 1.527691947842779, + "learning_rate": 9.852397116839792e-05, + "loss": 3.5559, + "step": 6071 + }, + { + "epoch": 3.9128122481869463, + "grad_norm": 1.2836236206654648, + "learning_rate": 9.852348490179754e-05, + "loss": 3.57, + "step": 6072 + }, + { + "epoch": 3.913456889605157, + "grad_norm": 1.5499632091551216, + "learning_rate": 9.85229985563246e-05, + "loss": 3.7614, + "step": 6073 + }, + { + "epoch": 3.914101531023368, + "grad_norm": 1.4010743831595047, + "learning_rate": 9.852251213197991e-05, + "loss": 3.7131, + "step": 6074 + }, + { + "epoch": 3.9147461724415793, + "grad_norm": 1.4254748190571398, + "learning_rate": 9.852202562876426e-05, + "loss": 3.9481, + "step": 6075 + }, + { + "epoch": 3.9153908138597906, + "grad_norm": 1.4616109921492109, + "learning_rate": 9.852153904667848e-05, + "loss": 3.3449, + "step": 6076 + }, + { + "epoch": 3.9160354552780015, + "grad_norm": 1.6174570765058374, + "learning_rate": 9.852105238572332e-05, + "loss": 3.8365, + "step": 6077 + }, + { + "epoch": 3.9166800966962128, + "grad_norm": 1.361213943429873, + "learning_rate": 9.852056564589961e-05, + "loss": 3.7522, + "step": 6078 + }, + { + "epoch": 3.9173247381144236, + "grad_norm": 1.4734249024810144, + "learning_rate": 9.852007882720815e-05, + "loss": 3.4757, + "step": 6079 + }, + { + "epoch": 3.917969379532635, + "grad_norm": 1.5224738450296398, + "learning_rate": 9.851959192964973e-05, + "loss": 3.5943, + "step": 6080 + }, + { + "epoch": 3.918614020950846, + "grad_norm": 1.5215498742426992, + "learning_rate": 9.851910495322514e-05, + "loss": 3.7786, + "step": 6081 + }, + { + "epoch": 3.919258662369057, + "grad_norm": 1.1710822974123614, + "learning_rate": 9.851861789793522e-05, + "loss": 3.784, + "step": 6082 + }, + { + "epoch": 3.9199033037872684, + "grad_norm": 1.484059586089119, + "learning_rate": 9.851813076378071e-05, + "loss": 3.6081, + "step": 6083 + }, + { + "epoch": 3.9205479452054792, + "grad_norm": 1.2225448836249537, + "learning_rate": 9.851764355076248e-05, + "loss": 3.7788, + "step": 6084 + }, + { + "epoch": 3.9211925866236905, + "grad_norm": 1.4461946471564422, + "learning_rate": 9.851715625888126e-05, + "loss": 3.7485, + "step": 6085 + }, + { + "epoch": 3.921837228041902, + "grad_norm": 1.3599363858202338, + "learning_rate": 9.85166688881379e-05, + "loss": 3.9006, + "step": 6086 + }, + { + "epoch": 3.9224818694601127, + "grad_norm": 1.4681525710197363, + "learning_rate": 9.851618143853316e-05, + "loss": 3.673, + "step": 6087 + }, + { + "epoch": 3.923126510878324, + "grad_norm": 1.3800059092898789, + "learning_rate": 9.851569391006789e-05, + "loss": 3.6853, + "step": 6088 + }, + { + "epoch": 3.923771152296535, + "grad_norm": 1.3355163316012566, + "learning_rate": 9.851520630274285e-05, + "loss": 3.9974, + "step": 6089 + }, + { + "epoch": 3.924415793714746, + "grad_norm": 1.1730071016713064, + "learning_rate": 9.851471861655886e-05, + "loss": 3.7272, + "step": 6090 + }, + { + "epoch": 3.9250604351329574, + "grad_norm": 1.375719279601959, + "learning_rate": 9.851423085151672e-05, + "loss": 3.8179, + "step": 6091 + }, + { + "epoch": 3.9257050765511683, + "grad_norm": 1.3264665246790857, + "learning_rate": 9.851374300761722e-05, + "loss": 3.6291, + "step": 6092 + }, + { + "epoch": 3.9263497179693796, + "grad_norm": 1.3633878570805689, + "learning_rate": 9.851325508486116e-05, + "loss": 3.944, + "step": 6093 + }, + { + "epoch": 3.9269943593875904, + "grad_norm": 1.3758983043002662, + "learning_rate": 9.851276708324937e-05, + "loss": 3.4334, + "step": 6094 + }, + { + "epoch": 3.9276390008058018, + "grad_norm": 1.6103465415814253, + "learning_rate": 9.851227900278262e-05, + "loss": 3.551, + "step": 6095 + }, + { + "epoch": 3.928283642224013, + "grad_norm": 1.421993390829655, + "learning_rate": 9.851179084346173e-05, + "loss": 3.5629, + "step": 6096 + }, + { + "epoch": 3.928928283642224, + "grad_norm": 1.4341506733931635, + "learning_rate": 9.851130260528749e-05, + "loss": 3.585, + "step": 6097 + }, + { + "epoch": 3.929572925060435, + "grad_norm": 1.5006274721770112, + "learning_rate": 9.85108142882607e-05, + "loss": 3.4256, + "step": 6098 + }, + { + "epoch": 3.930217566478646, + "grad_norm": 1.3911131338753098, + "learning_rate": 9.851032589238218e-05, + "loss": 3.875, + "step": 6099 + }, + { + "epoch": 3.9308622078968574, + "grad_norm": 1.6423869955700108, + "learning_rate": 9.85098374176527e-05, + "loss": 3.6153, + "step": 6100 + }, + { + "epoch": 3.9308622078968574, + "eval_loss": 4.009616851806641, + "eval_runtime": 2.9832, + "eval_samples_per_second": 33.521, + "eval_steps_per_second": 4.358, + "step": 6100 + }, + { + "epoch": 3.9315068493150687, + "grad_norm": 1.1708520957171165, + "learning_rate": 9.850934886407312e-05, + "loss": 3.6203, + "step": 6101 + }, + { + "epoch": 3.9321514907332795, + "grad_norm": 2.261471520451014, + "learning_rate": 9.850886023164418e-05, + "loss": 3.8372, + "step": 6102 + }, + { + "epoch": 3.932796132151491, + "grad_norm": 1.7261069184129694, + "learning_rate": 9.85083715203667e-05, + "loss": 4.0181, + "step": 6103 + }, + { + "epoch": 3.9334407735697017, + "grad_norm": 1.764016724428091, + "learning_rate": 9.850788273024152e-05, + "loss": 3.9913, + "step": 6104 + }, + { + "epoch": 3.934085414987913, + "grad_norm": 1.4064729093475976, + "learning_rate": 9.850739386126939e-05, + "loss": 3.7706, + "step": 6105 + }, + { + "epoch": 3.9347300564061243, + "grad_norm": 3.242158501444657, + "learning_rate": 9.850690491345113e-05, + "loss": 3.3276, + "step": 6106 + }, + { + "epoch": 3.935374697824335, + "grad_norm": 1.747327144474494, + "learning_rate": 9.850641588678757e-05, + "loss": 3.7714, + "step": 6107 + }, + { + "epoch": 3.9360193392425464, + "grad_norm": 1.4284224011137336, + "learning_rate": 9.85059267812795e-05, + "loss": 3.8596, + "step": 6108 + }, + { + "epoch": 3.9366639806607573, + "grad_norm": 1.7733580579101202, + "learning_rate": 9.850543759692768e-05, + "loss": 3.4328, + "step": 6109 + }, + { + "epoch": 3.9373086220789686, + "grad_norm": 2.4071566004536558, + "learning_rate": 9.850494833373298e-05, + "loss": 3.6953, + "step": 6110 + }, + { + "epoch": 3.93795326349718, + "grad_norm": 1.7900169677137983, + "learning_rate": 9.850445899169615e-05, + "loss": 3.782, + "step": 6111 + }, + { + "epoch": 3.9385979049153907, + "grad_norm": 2.1178880277689034, + "learning_rate": 9.850396957081803e-05, + "loss": 3.8371, + "step": 6112 + }, + { + "epoch": 3.939242546333602, + "grad_norm": 1.8460768726914247, + "learning_rate": 9.850348007109941e-05, + "loss": 3.6863, + "step": 6113 + }, + { + "epoch": 3.939887187751813, + "grad_norm": 2.0729979302875736, + "learning_rate": 9.85029904925411e-05, + "loss": 3.4885, + "step": 6114 + }, + { + "epoch": 3.940531829170024, + "grad_norm": 1.9589431508153923, + "learning_rate": 9.850250083514388e-05, + "loss": 3.9348, + "step": 6115 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.616116742965927, + "learning_rate": 9.850201109890858e-05, + "loss": 3.246, + "step": 6116 + }, + { + "epoch": 3.9418211120064464, + "grad_norm": 1.9327597821057092, + "learning_rate": 9.8501521283836e-05, + "loss": 3.8643, + "step": 6117 + }, + { + "epoch": 3.9424657534246577, + "grad_norm": 1.5001766869069442, + "learning_rate": 9.850103138992695e-05, + "loss": 3.6784, + "step": 6118 + }, + { + "epoch": 3.9431103948428685, + "grad_norm": 1.9916677924456079, + "learning_rate": 9.850054141718221e-05, + "loss": 3.5449, + "step": 6119 + }, + { + "epoch": 3.94375503626108, + "grad_norm": 1.3417823762663257, + "learning_rate": 9.85000513656026e-05, + "loss": 3.4618, + "step": 6120 + }, + { + "epoch": 3.944399677679291, + "grad_norm": 1.7610631892703468, + "learning_rate": 9.849956123518894e-05, + "loss": 3.4661, + "step": 6121 + }, + { + "epoch": 3.945044319097502, + "grad_norm": 2.5509373957362182, + "learning_rate": 9.8499071025942e-05, + "loss": 3.6573, + "step": 6122 + }, + { + "epoch": 3.9456889605157133, + "grad_norm": 1.5700299190445761, + "learning_rate": 9.849858073786262e-05, + "loss": 3.874, + "step": 6123 + }, + { + "epoch": 3.946333601933924, + "grad_norm": 1.5670649020459204, + "learning_rate": 9.849809037095158e-05, + "loss": 3.7652, + "step": 6124 + }, + { + "epoch": 3.9469782433521354, + "grad_norm": 1.5797132092554433, + "learning_rate": 9.849759992520972e-05, + "loss": 3.7468, + "step": 6125 + }, + { + "epoch": 3.9476228847703467, + "grad_norm": 1.7712367794435522, + "learning_rate": 9.849710940063782e-05, + "loss": 3.1732, + "step": 6126 + }, + { + "epoch": 3.9482675261885576, + "grad_norm": 1.8335453555656884, + "learning_rate": 9.849661879723665e-05, + "loss": 3.6656, + "step": 6127 + }, + { + "epoch": 3.948912167606769, + "grad_norm": 1.3804307583580409, + "learning_rate": 9.849612811500707e-05, + "loss": 3.825, + "step": 6128 + }, + { + "epoch": 3.9495568090249797, + "grad_norm": 1.5445120111362467, + "learning_rate": 9.849563735394989e-05, + "loss": 4.1363, + "step": 6129 + }, + { + "epoch": 3.950201450443191, + "grad_norm": 1.2492052321934228, + "learning_rate": 9.849514651406588e-05, + "loss": 3.6602, + "step": 6130 + }, + { + "epoch": 3.9508460918614023, + "grad_norm": 1.516809664973437, + "learning_rate": 9.849465559535586e-05, + "loss": 3.3478, + "step": 6131 + }, + { + "epoch": 3.951490733279613, + "grad_norm": 1.4760089284427473, + "learning_rate": 9.849416459782063e-05, + "loss": 3.8217, + "step": 6132 + }, + { + "epoch": 3.9521353746978245, + "grad_norm": 1.4260247968937534, + "learning_rate": 9.849367352146102e-05, + "loss": 3.5949, + "step": 6133 + }, + { + "epoch": 3.9527800161160354, + "grad_norm": 1.538027862852471, + "learning_rate": 9.84931823662778e-05, + "loss": 3.4179, + "step": 6134 + }, + { + "epoch": 3.9534246575342467, + "grad_norm": 1.2445369908292752, + "learning_rate": 9.84926911322718e-05, + "loss": 3.9984, + "step": 6135 + }, + { + "epoch": 3.954069298952458, + "grad_norm": 1.3317541874913492, + "learning_rate": 9.849219981944385e-05, + "loss": 3.4674, + "step": 6136 + }, + { + "epoch": 3.954713940370669, + "grad_norm": 1.3447562288129402, + "learning_rate": 9.84917084277947e-05, + "loss": 3.4133, + "step": 6137 + }, + { + "epoch": 3.9553585817888797, + "grad_norm": 1.3321262486873118, + "learning_rate": 9.84912169573252e-05, + "loss": 3.9171, + "step": 6138 + }, + { + "epoch": 3.956003223207091, + "grad_norm": 1.4713934114559797, + "learning_rate": 9.849072540803614e-05, + "loss": 3.8809, + "step": 6139 + }, + { + "epoch": 3.9566478646253023, + "grad_norm": 1.4317507013394026, + "learning_rate": 9.849023377992835e-05, + "loss": 3.6685, + "step": 6140 + }, + { + "epoch": 3.9572925060435136, + "grad_norm": 1.3548873891248743, + "learning_rate": 9.84897420730026e-05, + "loss": 3.7232, + "step": 6141 + }, + { + "epoch": 3.9579371474617244, + "grad_norm": 1.6599108183157187, + "learning_rate": 9.84892502872597e-05, + "loss": 3.8915, + "step": 6142 + }, + { + "epoch": 3.9585817888799353, + "grad_norm": 1.4118279239921527, + "learning_rate": 9.84887584227005e-05, + "loss": 3.5174, + "step": 6143 + }, + { + "epoch": 3.9592264302981466, + "grad_norm": 1.5941012812855977, + "learning_rate": 9.848826647932578e-05, + "loss": 3.6389, + "step": 6144 + }, + { + "epoch": 3.959871071716358, + "grad_norm": 1.8135905270541377, + "learning_rate": 9.848777445713637e-05, + "loss": 3.4926, + "step": 6145 + }, + { + "epoch": 3.9605157131345687, + "grad_norm": 1.1525798735554282, + "learning_rate": 9.848728235613303e-05, + "loss": 3.7074, + "step": 6146 + }, + { + "epoch": 3.96116035455278, + "grad_norm": 1.591011601406954, + "learning_rate": 9.848679017631661e-05, + "loss": 3.9589, + "step": 6147 + }, + { + "epoch": 3.961804995970991, + "grad_norm": 1.3892234804769341, + "learning_rate": 9.84862979176879e-05, + "loss": 3.8825, + "step": 6148 + }, + { + "epoch": 3.962449637389202, + "grad_norm": 1.7062846372943532, + "learning_rate": 9.848580558024771e-05, + "loss": 3.9438, + "step": 6149 + }, + { + "epoch": 3.9630942788074135, + "grad_norm": 1.257479819569349, + "learning_rate": 9.848531316399687e-05, + "loss": 3.8187, + "step": 6150 + }, + { + "epoch": 3.9637389202256244, + "grad_norm": 1.3807159452116162, + "learning_rate": 9.848482066893614e-05, + "loss": 3.7185, + "step": 6151 + }, + { + "epoch": 3.9643835616438357, + "grad_norm": 1.1833585763418415, + "learning_rate": 9.848432809506638e-05, + "loss": 3.5873, + "step": 6152 + }, + { + "epoch": 3.9650282030620465, + "grad_norm": 1.6282772169589994, + "learning_rate": 9.848383544238838e-05, + "loss": 4.1418, + "step": 6153 + }, + { + "epoch": 3.965672844480258, + "grad_norm": 1.1614094662852468, + "learning_rate": 9.848334271090295e-05, + "loss": 3.7939, + "step": 6154 + }, + { + "epoch": 3.966317485898469, + "grad_norm": 1.4481711330572227, + "learning_rate": 9.848284990061089e-05, + "loss": 3.9616, + "step": 6155 + }, + { + "epoch": 3.96696212731668, + "grad_norm": 1.5424178550283454, + "learning_rate": 9.848235701151301e-05, + "loss": 3.4746, + "step": 6156 + }, + { + "epoch": 3.9676067687348913, + "grad_norm": 1.6037047358617311, + "learning_rate": 9.848186404361013e-05, + "loss": 3.7573, + "step": 6157 + }, + { + "epoch": 3.968251410153102, + "grad_norm": 1.4584537246252542, + "learning_rate": 9.848137099690306e-05, + "loss": 3.6976, + "step": 6158 + }, + { + "epoch": 3.9688960515713134, + "grad_norm": 1.6451744160015687, + "learning_rate": 9.848087787139262e-05, + "loss": 3.5671, + "step": 6159 + }, + { + "epoch": 3.9695406929895247, + "grad_norm": 1.2014159361279342, + "learning_rate": 9.848038466707958e-05, + "loss": 3.5931, + "step": 6160 + }, + { + "epoch": 3.9701853344077356, + "grad_norm": 1.484835875315241, + "learning_rate": 9.84798913839648e-05, + "loss": 3.496, + "step": 6161 + }, + { + "epoch": 3.970829975825947, + "grad_norm": 1.4373088431377687, + "learning_rate": 9.847939802204904e-05, + "loss": 3.5526, + "step": 6162 + }, + { + "epoch": 3.9714746172441577, + "grad_norm": 1.333560622105719, + "learning_rate": 9.847890458133314e-05, + "loss": 3.7002, + "step": 6163 + }, + { + "epoch": 3.972119258662369, + "grad_norm": 1.5306869143476092, + "learning_rate": 9.847841106181793e-05, + "loss": 4.0866, + "step": 6164 + }, + { + "epoch": 3.9727639000805803, + "grad_norm": 1.5443862082206905, + "learning_rate": 9.847791746350416e-05, + "loss": 3.7411, + "step": 6165 + }, + { + "epoch": 3.973408541498791, + "grad_norm": 1.4585756987327443, + "learning_rate": 9.847742378639271e-05, + "loss": 3.6083, + "step": 6166 + }, + { + "epoch": 3.9740531829170025, + "grad_norm": 1.6219321413597632, + "learning_rate": 9.847693003048434e-05, + "loss": 3.8401, + "step": 6167 + }, + { + "epoch": 3.9746978243352133, + "grad_norm": 1.5014051082127609, + "learning_rate": 9.847643619577989e-05, + "loss": 3.7491, + "step": 6168 + }, + { + "epoch": 3.9753424657534246, + "grad_norm": 1.617442653369783, + "learning_rate": 9.847594228228015e-05, + "loss": 3.3985, + "step": 6169 + }, + { + "epoch": 3.975987107171636, + "grad_norm": 1.617973665554024, + "learning_rate": 9.847544828998597e-05, + "loss": 3.8321, + "step": 6170 + }, + { + "epoch": 3.976631748589847, + "grad_norm": 1.5869314561698644, + "learning_rate": 9.847495421889809e-05, + "loss": 3.6639, + "step": 6171 + }, + { + "epoch": 3.977276390008058, + "grad_norm": 1.405398324944923, + "learning_rate": 9.847446006901738e-05, + "loss": 3.6007, + "step": 6172 + }, + { + "epoch": 3.977921031426269, + "grad_norm": 1.4878785901417861, + "learning_rate": 9.847396584034465e-05, + "loss": 3.7313, + "step": 6173 + }, + { + "epoch": 3.9785656728444803, + "grad_norm": 1.561543257338446, + "learning_rate": 9.847347153288068e-05, + "loss": 3.6951, + "step": 6174 + }, + { + "epoch": 3.9792103142626916, + "grad_norm": 1.5282063168593856, + "learning_rate": 9.847297714662631e-05, + "loss": 3.2995, + "step": 6175 + }, + { + "epoch": 3.9798549556809024, + "grad_norm": 1.8578236620127984, + "learning_rate": 9.847248268158234e-05, + "loss": 3.4851, + "step": 6176 + }, + { + "epoch": 3.9804995970991137, + "grad_norm": 1.743789833026136, + "learning_rate": 9.847198813774959e-05, + "loss": 3.6393, + "step": 6177 + }, + { + "epoch": 3.9811442385173246, + "grad_norm": 1.5118265947474292, + "learning_rate": 9.847149351512886e-05, + "loss": 3.9823, + "step": 6178 + }, + { + "epoch": 3.981788879935536, + "grad_norm": 1.3916722149648144, + "learning_rate": 9.847099881372097e-05, + "loss": 3.5565, + "step": 6179 + }, + { + "epoch": 3.982433521353747, + "grad_norm": 1.7082874729186739, + "learning_rate": 9.847050403352672e-05, + "loss": 3.9264, + "step": 6180 + }, + { + "epoch": 3.983078162771958, + "grad_norm": 1.193236606748939, + "learning_rate": 9.847000917454693e-05, + "loss": 3.6551, + "step": 6181 + }, + { + "epoch": 3.9837228041901693, + "grad_norm": 1.3829554344946067, + "learning_rate": 9.846951423678242e-05, + "loss": 3.4757, + "step": 6182 + }, + { + "epoch": 3.98436744560838, + "grad_norm": 1.269342604992164, + "learning_rate": 9.8469019220234e-05, + "loss": 3.7877, + "step": 6183 + }, + { + "epoch": 3.9850120870265915, + "grad_norm": 1.5023644580367592, + "learning_rate": 9.846852412490249e-05, + "loss": 3.6703, + "step": 6184 + }, + { + "epoch": 3.985656728444803, + "grad_norm": 1.557091282451335, + "learning_rate": 9.84680289507887e-05, + "loss": 3.5678, + "step": 6185 + }, + { + "epoch": 3.9863013698630136, + "grad_norm": 1.731113715768024, + "learning_rate": 9.846753369789342e-05, + "loss": 3.0124, + "step": 6186 + }, + { + "epoch": 3.986946011281225, + "grad_norm": 2.3199391656407418, + "learning_rate": 9.846703836621749e-05, + "loss": 3.9188, + "step": 6187 + }, + { + "epoch": 3.987590652699436, + "grad_norm": 1.3513192796376159, + "learning_rate": 9.846654295576172e-05, + "loss": 3.5103, + "step": 6188 + }, + { + "epoch": 3.988235294117647, + "grad_norm": 1.7149999459999918, + "learning_rate": 9.846604746652691e-05, + "loss": 3.5646, + "step": 6189 + }, + { + "epoch": 3.9888799355358584, + "grad_norm": 1.4493152137902163, + "learning_rate": 9.846555189851388e-05, + "loss": 3.9939, + "step": 6190 + }, + { + "epoch": 3.9895245769540693, + "grad_norm": 1.6504294884414756, + "learning_rate": 9.846505625172347e-05, + "loss": 3.6243, + "step": 6191 + }, + { + "epoch": 3.9901692183722806, + "grad_norm": 1.6626616778276642, + "learning_rate": 9.846456052615644e-05, + "loss": 3.492, + "step": 6192 + }, + { + "epoch": 3.9908138597904914, + "grad_norm": 1.2440891129149159, + "learning_rate": 9.846406472181365e-05, + "loss": 3.9456, + "step": 6193 + }, + { + "epoch": 3.9914585012087027, + "grad_norm": 1.7268013250811787, + "learning_rate": 9.84635688386959e-05, + "loss": 3.7089, + "step": 6194 + }, + { + "epoch": 3.992103142626914, + "grad_norm": 1.5761128823414032, + "learning_rate": 9.846307287680398e-05, + "loss": 3.7955, + "step": 6195 + }, + { + "epoch": 3.992747784045125, + "grad_norm": 1.4021843302764674, + "learning_rate": 9.846257683613876e-05, + "loss": 3.5889, + "step": 6196 + }, + { + "epoch": 3.993392425463336, + "grad_norm": 1.3880032962371978, + "learning_rate": 9.846208071670102e-05, + "loss": 3.7078, + "step": 6197 + }, + { + "epoch": 3.994037066881547, + "grad_norm": 1.5754674184349433, + "learning_rate": 9.846158451849156e-05, + "loss": 3.5171, + "step": 6198 + }, + { + "epoch": 3.9946817082997583, + "grad_norm": 1.191568174600014, + "learning_rate": 9.846108824151122e-05, + "loss": 3.2821, + "step": 6199 + }, + { + "epoch": 3.9953263497179696, + "grad_norm": 1.8975904300223465, + "learning_rate": 9.846059188576081e-05, + "loss": 3.8232, + "step": 6200 + }, + { + "epoch": 3.9953263497179696, + "eval_loss": 4.030847072601318, + "eval_runtime": 2.9777, + "eval_samples_per_second": 33.583, + "eval_steps_per_second": 4.366, + "step": 6200 + }, + { + "epoch": 3.9959709911361805, + "grad_norm": 1.1827184465441893, + "learning_rate": 9.846009545124112e-05, + "loss": 3.9454, + "step": 6201 + }, + { + "epoch": 3.996615632554392, + "grad_norm": 1.577474061883615, + "learning_rate": 9.845959893795301e-05, + "loss": 3.9145, + "step": 6202 + }, + { + "epoch": 3.9972602739726026, + "grad_norm": 1.5350711990709247, + "learning_rate": 9.845910234589727e-05, + "loss": 3.6085, + "step": 6203 + }, + { + "epoch": 3.997904915390814, + "grad_norm": 1.4448399118980215, + "learning_rate": 9.845860567507472e-05, + "loss": 3.8487, + "step": 6204 + }, + { + "epoch": 3.9985495568090252, + "grad_norm": 1.538942169924735, + "learning_rate": 9.845810892548617e-05, + "loss": 3.6299, + "step": 6205 + }, + { + "epoch": 3.999194198227236, + "grad_norm": 1.3815189749105434, + "learning_rate": 9.845761209713244e-05, + "loss": 3.3809, + "step": 6206 + }, + { + "epoch": 3.999838839645447, + "grad_norm": 1.3373817878866998, + "learning_rate": 9.845711519001437e-05, + "loss": 4.0961, + "step": 6207 + }, + { + "epoch": 4.0, + "grad_norm": 1.5951414159594335, + "learning_rate": 9.845661820413272e-05, + "loss": 0.9913, + "step": 6208 + }, + { + "epoch": 4.000644641418211, + "grad_norm": 2.138748667527694, + "learning_rate": 9.845612113948835e-05, + "loss": 3.3596, + "step": 6209 + }, + { + "epoch": 4.001289282836423, + "grad_norm": 1.677310228469739, + "learning_rate": 9.845562399608208e-05, + "loss": 3.3858, + "step": 6210 + }, + { + "epoch": 4.001933924254633, + "grad_norm": 3.065121954737844, + "learning_rate": 9.84551267739147e-05, + "loss": 3.423, + "step": 6211 + }, + { + "epoch": 4.002578565672844, + "grad_norm": 1.4859998635768161, + "learning_rate": 9.845462947298703e-05, + "loss": 3.2444, + "step": 6212 + }, + { + "epoch": 4.003223207091056, + "grad_norm": 1.86292578120615, + "learning_rate": 9.84541320932999e-05, + "loss": 3.1834, + "step": 6213 + }, + { + "epoch": 4.003867848509267, + "grad_norm": 1.9574607700931035, + "learning_rate": 9.845363463485414e-05, + "loss": 3.1774, + "step": 6214 + }, + { + "epoch": 4.004512489927478, + "grad_norm": 1.6092007117612863, + "learning_rate": 9.845313709765052e-05, + "loss": 2.8917, + "step": 6215 + }, + { + "epoch": 4.005157131345689, + "grad_norm": 1.4915833443217885, + "learning_rate": 9.845263948168992e-05, + "loss": 3.1338, + "step": 6216 + }, + { + "epoch": 4.0058017727639, + "grad_norm": 1.8854769080187581, + "learning_rate": 9.84521417869731e-05, + "loss": 3.4739, + "step": 6217 + }, + { + "epoch": 4.006446414182111, + "grad_norm": 2.073303742867963, + "learning_rate": 9.84516440135009e-05, + "loss": 3.3667, + "step": 6218 + }, + { + "epoch": 4.0070910556003225, + "grad_norm": 1.3597075428880032, + "learning_rate": 9.845114616127413e-05, + "loss": 3.5228, + "step": 6219 + }, + { + "epoch": 4.007735697018534, + "grad_norm": 1.652061924114162, + "learning_rate": 9.845064823029364e-05, + "loss": 3.0122, + "step": 6220 + }, + { + "epoch": 4.008380338436744, + "grad_norm": 1.5181716165789936, + "learning_rate": 9.845015022056021e-05, + "loss": 3.3321, + "step": 6221 + }, + { + "epoch": 4.0090249798549555, + "grad_norm": 1.461896205636779, + "learning_rate": 9.844965213207467e-05, + "loss": 3.2519, + "step": 6222 + }, + { + "epoch": 4.009669621273167, + "grad_norm": 1.767140475568899, + "learning_rate": 9.844915396483785e-05, + "loss": 3.339, + "step": 6223 + }, + { + "epoch": 4.010314262691378, + "grad_norm": 1.7221651537378324, + "learning_rate": 9.844865571885053e-05, + "loss": 3.2271, + "step": 6224 + }, + { + "epoch": 4.010958904109589, + "grad_norm": 1.4001296200679965, + "learning_rate": 9.844815739411359e-05, + "loss": 3.4847, + "step": 6225 + }, + { + "epoch": 4.0116035455278, + "grad_norm": 1.6807813699932028, + "learning_rate": 9.844765899062781e-05, + "loss": 3.5872, + "step": 6226 + }, + { + "epoch": 4.012248186946011, + "grad_norm": 1.5960569976278374, + "learning_rate": 9.844716050839401e-05, + "loss": 3.0053, + "step": 6227 + }, + { + "epoch": 4.0128928283642225, + "grad_norm": 1.519134605808332, + "learning_rate": 9.844666194741299e-05, + "loss": 3.4513, + "step": 6228 + }, + { + "epoch": 4.013537469782434, + "grad_norm": 1.4479241702164334, + "learning_rate": 9.844616330768561e-05, + "loss": 3.0406, + "step": 6229 + }, + { + "epoch": 4.014182111200645, + "grad_norm": 2.1971649757571923, + "learning_rate": 9.844566458921267e-05, + "loss": 3.2089, + "step": 6230 + }, + { + "epoch": 4.0148267526188555, + "grad_norm": 1.4094107375893214, + "learning_rate": 9.844516579199498e-05, + "loss": 3.6336, + "step": 6231 + }, + { + "epoch": 4.015471394037067, + "grad_norm": 1.9998712757604504, + "learning_rate": 9.844466691603339e-05, + "loss": 3.198, + "step": 6232 + }, + { + "epoch": 4.016116035455278, + "grad_norm": 1.8936923938307075, + "learning_rate": 9.844416796132866e-05, + "loss": 3.1175, + "step": 6233 + }, + { + "epoch": 4.016760676873489, + "grad_norm": 1.4014918814506074, + "learning_rate": 9.844366892788168e-05, + "loss": 3.0922, + "step": 6234 + }, + { + "epoch": 4.017405318291701, + "grad_norm": 1.8200410111833247, + "learning_rate": 9.844316981569324e-05, + "loss": 3.4904, + "step": 6235 + }, + { + "epoch": 4.018049959709911, + "grad_norm": 1.4516053511808549, + "learning_rate": 9.844267062476413e-05, + "loss": 3.4476, + "step": 6236 + }, + { + "epoch": 4.018694601128122, + "grad_norm": 1.460675542351015, + "learning_rate": 9.844217135509521e-05, + "loss": 3.3561, + "step": 6237 + }, + { + "epoch": 4.019339242546334, + "grad_norm": 1.5379084150810949, + "learning_rate": 9.844167200668728e-05, + "loss": 2.9188, + "step": 6238 + }, + { + "epoch": 4.019983883964545, + "grad_norm": 1.8682243898911444, + "learning_rate": 9.844117257954117e-05, + "loss": 3.2662, + "step": 6239 + }, + { + "epoch": 4.020628525382756, + "grad_norm": 1.6962353324350625, + "learning_rate": 9.844067307365771e-05, + "loss": 3.2381, + "step": 6240 + }, + { + "epoch": 4.021273166800967, + "grad_norm": 1.909341008078696, + "learning_rate": 9.844017348903768e-05, + "loss": 3.18, + "step": 6241 + }, + { + "epoch": 4.021917808219178, + "grad_norm": 1.5449705359199464, + "learning_rate": 9.843967382568194e-05, + "loss": 3.1876, + "step": 6242 + }, + { + "epoch": 4.022562449637389, + "grad_norm": 2.56119325569683, + "learning_rate": 9.84391740835913e-05, + "loss": 3.4121, + "step": 6243 + }, + { + "epoch": 4.023207091055601, + "grad_norm": 1.3730039638433584, + "learning_rate": 9.843867426276657e-05, + "loss": 3.4679, + "step": 6244 + }, + { + "epoch": 4.023851732473811, + "grad_norm": 2.363061557490876, + "learning_rate": 9.843817436320859e-05, + "loss": 3.3851, + "step": 6245 + }, + { + "epoch": 4.024496373892022, + "grad_norm": 1.6477730120062075, + "learning_rate": 9.843767438491817e-05, + "loss": 3.3629, + "step": 6246 + }, + { + "epoch": 4.025141015310234, + "grad_norm": 1.589597393075451, + "learning_rate": 9.843717432789612e-05, + "loss": 3.1891, + "step": 6247 + }, + { + "epoch": 4.025785656728445, + "grad_norm": 1.5379999370635062, + "learning_rate": 9.843667419214328e-05, + "loss": 3.1146, + "step": 6248 + }, + { + "epoch": 4.026430298146656, + "grad_norm": 1.548387411417338, + "learning_rate": 9.843617397766048e-05, + "loss": 3.1825, + "step": 6249 + }, + { + "epoch": 4.027074939564867, + "grad_norm": 1.7988245492271382, + "learning_rate": 9.843567368444849e-05, + "loss": 3.0488, + "step": 6250 + }, + { + "epoch": 4.027719580983078, + "grad_norm": 1.3817228016334986, + "learning_rate": 9.84351733125082e-05, + "loss": 3.3702, + "step": 6251 + }, + { + "epoch": 4.028364222401289, + "grad_norm": 1.720060186336293, + "learning_rate": 9.843467286184038e-05, + "loss": 3.3068, + "step": 6252 + }, + { + "epoch": 4.0290088638195005, + "grad_norm": 1.4904096738307377, + "learning_rate": 9.843417233244588e-05, + "loss": 3.3412, + "step": 6253 + }, + { + "epoch": 4.029653505237712, + "grad_norm": 1.7119201647677493, + "learning_rate": 9.84336717243255e-05, + "loss": 3.3652, + "step": 6254 + }, + { + "epoch": 4.030298146655922, + "grad_norm": 1.7088501681936148, + "learning_rate": 9.84331710374801e-05, + "loss": 3.0028, + "step": 6255 + }, + { + "epoch": 4.0309427880741335, + "grad_norm": 1.435184470301724, + "learning_rate": 9.843267027191047e-05, + "loss": 3.3103, + "step": 6256 + }, + { + "epoch": 4.031587429492345, + "grad_norm": 2.0316504252063345, + "learning_rate": 9.843216942761744e-05, + "loss": 3.2316, + "step": 6257 + }, + { + "epoch": 4.032232070910556, + "grad_norm": 1.71649598847237, + "learning_rate": 9.843166850460182e-05, + "loss": 3.3671, + "step": 6258 + }, + { + "epoch": 4.032876712328767, + "grad_norm": 1.8391150295151535, + "learning_rate": 9.843116750286446e-05, + "loss": 3.2926, + "step": 6259 + }, + { + "epoch": 4.033521353746978, + "grad_norm": 1.691552014788736, + "learning_rate": 9.843066642240616e-05, + "loss": 3.349, + "step": 6260 + }, + { + "epoch": 4.034165995165189, + "grad_norm": 2.430748675908019, + "learning_rate": 9.843016526322776e-05, + "loss": 2.8903, + "step": 6261 + }, + { + "epoch": 4.0348106365834004, + "grad_norm": 2.5157051910556554, + "learning_rate": 9.842966402533007e-05, + "loss": 3.0744, + "step": 6262 + }, + { + "epoch": 4.035455278001612, + "grad_norm": 1.9876102835886233, + "learning_rate": 9.842916270871392e-05, + "loss": 3.2446, + "step": 6263 + }, + { + "epoch": 4.036099919419823, + "grad_norm": 2.5500021826744934, + "learning_rate": 9.842866131338012e-05, + "loss": 3.0212, + "step": 6264 + }, + { + "epoch": 4.0367445608380335, + "grad_norm": 1.467250246117523, + "learning_rate": 9.842815983932952e-05, + "loss": 3.2932, + "step": 6265 + }, + { + "epoch": 4.037389202256245, + "grad_norm": 1.9827766788127048, + "learning_rate": 9.842765828656291e-05, + "loss": 3.3173, + "step": 6266 + }, + { + "epoch": 4.038033843674456, + "grad_norm": 1.4934213043325633, + "learning_rate": 9.842715665508114e-05, + "loss": 3.2676, + "step": 6267 + }, + { + "epoch": 4.038678485092667, + "grad_norm": 1.6706003716589937, + "learning_rate": 9.842665494488504e-05, + "loss": 3.1619, + "step": 6268 + }, + { + "epoch": 4.039323126510879, + "grad_norm": 1.6200207449754445, + "learning_rate": 9.84261531559754e-05, + "loss": 2.9522, + "step": 6269 + }, + { + "epoch": 4.039967767929089, + "grad_norm": 1.365459078354166, + "learning_rate": 9.842565128835308e-05, + "loss": 3.385, + "step": 6270 + }, + { + "epoch": 4.0406124093473, + "grad_norm": 1.5911859694348627, + "learning_rate": 9.842514934201889e-05, + "loss": 2.8725, + "step": 6271 + }, + { + "epoch": 4.041257050765512, + "grad_norm": 1.7478385686574553, + "learning_rate": 9.842464731697363e-05, + "loss": 3.3543, + "step": 6272 + }, + { + "epoch": 4.041901692183723, + "grad_norm": 1.58871221810988, + "learning_rate": 9.842414521321815e-05, + "loss": 2.8407, + "step": 6273 + }, + { + "epoch": 4.042546333601934, + "grad_norm": 1.716726953301924, + "learning_rate": 9.842364303075328e-05, + "loss": 3.2035, + "step": 6274 + }, + { + "epoch": 4.043190975020145, + "grad_norm": 1.485091345984951, + "learning_rate": 9.842314076957985e-05, + "loss": 3.1675, + "step": 6275 + }, + { + "epoch": 4.043835616438356, + "grad_norm": 1.6230514157834481, + "learning_rate": 9.842263842969866e-05, + "loss": 3.2781, + "step": 6276 + }, + { + "epoch": 4.044480257856567, + "grad_norm": 1.4912438070784721, + "learning_rate": 9.842213601111054e-05, + "loss": 3.1296, + "step": 6277 + }, + { + "epoch": 4.045124899274779, + "grad_norm": 1.4960341653395446, + "learning_rate": 9.842163351381633e-05, + "loss": 3.2762, + "step": 6278 + }, + { + "epoch": 4.04576954069299, + "grad_norm": 1.2798869135645288, + "learning_rate": 9.842113093781683e-05, + "loss": 3.0043, + "step": 6279 + }, + { + "epoch": 4.0464141821112, + "grad_norm": 1.3090685915943723, + "learning_rate": 9.842062828311291e-05, + "loss": 3.3944, + "step": 6280 + }, + { + "epoch": 4.047058823529412, + "grad_norm": 1.366283119246797, + "learning_rate": 9.842012554970534e-05, + "loss": 3.1852, + "step": 6281 + }, + { + "epoch": 4.047703464947623, + "grad_norm": 1.3819926517681174, + "learning_rate": 9.841962273759499e-05, + "loss": 3.4016, + "step": 6282 + }, + { + "epoch": 4.048348106365834, + "grad_norm": 1.3967531833294833, + "learning_rate": 9.841911984678267e-05, + "loss": 3.0628, + "step": 6283 + }, + { + "epoch": 4.0489927477840455, + "grad_norm": 1.4777224364938173, + "learning_rate": 9.84186168772692e-05, + "loss": 3.3023, + "step": 6284 + }, + { + "epoch": 4.049637389202256, + "grad_norm": 1.3609974963863956, + "learning_rate": 9.841811382905541e-05, + "loss": 3.306, + "step": 6285 + }, + { + "epoch": 4.050282030620467, + "grad_norm": 1.4928397179619357, + "learning_rate": 9.841761070214214e-05, + "loss": 3.1515, + "step": 6286 + }, + { + "epoch": 4.0509266720386785, + "grad_norm": 1.7429902766751344, + "learning_rate": 9.84171074965302e-05, + "loss": 3.1331, + "step": 6287 + }, + { + "epoch": 4.05157131345689, + "grad_norm": 1.5167947012517744, + "learning_rate": 9.841660421222039e-05, + "loss": 2.992, + "step": 6288 + }, + { + "epoch": 4.052215954875101, + "grad_norm": 2.02895041543239, + "learning_rate": 9.84161008492136e-05, + "loss": 3.3197, + "step": 6289 + }, + { + "epoch": 4.0528605962933115, + "grad_norm": 1.7197946620407332, + "learning_rate": 9.841559740751061e-05, + "loss": 3.2296, + "step": 6290 + }, + { + "epoch": 4.053505237711523, + "grad_norm": 1.7983665048522914, + "learning_rate": 9.841509388711228e-05, + "loss": 3.0449, + "step": 6291 + }, + { + "epoch": 4.054149879129734, + "grad_norm": 1.8591903212827816, + "learning_rate": 9.84145902880194e-05, + "loss": 3.1947, + "step": 6292 + }, + { + "epoch": 4.054794520547945, + "grad_norm": 1.5356799571061668, + "learning_rate": 9.841408661023282e-05, + "loss": 3.4166, + "step": 6293 + }, + { + "epoch": 4.055439161966157, + "grad_norm": 1.8444236591484238, + "learning_rate": 9.841358285375335e-05, + "loss": 3.2257, + "step": 6294 + }, + { + "epoch": 4.056083803384367, + "grad_norm": 1.5588264959675893, + "learning_rate": 9.841307901858183e-05, + "loss": 3.1422, + "step": 6295 + }, + { + "epoch": 4.056728444802578, + "grad_norm": 1.5082510407300023, + "learning_rate": 9.84125751047191e-05, + "loss": 3.5953, + "step": 6296 + }, + { + "epoch": 4.05737308622079, + "grad_norm": 1.881437560432274, + "learning_rate": 9.841207111216597e-05, + "loss": 3.3023, + "step": 6297 + }, + { + "epoch": 4.058017727639001, + "grad_norm": 1.7042768196997502, + "learning_rate": 9.841156704092327e-05, + "loss": 3.1972, + "step": 6298 + }, + { + "epoch": 4.058662369057212, + "grad_norm": 1.7902622102612629, + "learning_rate": 9.841106289099183e-05, + "loss": 3.185, + "step": 6299 + }, + { + "epoch": 4.059307010475423, + "grad_norm": 1.595812953154173, + "learning_rate": 9.841055866237248e-05, + "loss": 3.3185, + "step": 6300 + }, + { + "epoch": 4.059307010475423, + "eval_loss": 4.2397780418396, + "eval_runtime": 2.9829, + "eval_samples_per_second": 33.524, + "eval_steps_per_second": 4.358, + "step": 6300 + }, + { + "epoch": 4.059951651893634, + "grad_norm": 1.4906882502866312, + "learning_rate": 9.841005435506604e-05, + "loss": 2.8607, + "step": 6301 + }, + { + "epoch": 4.060596293311845, + "grad_norm": 1.7895134405241526, + "learning_rate": 9.840954996907333e-05, + "loss": 3.1292, + "step": 6302 + }, + { + "epoch": 4.061240934730057, + "grad_norm": 1.84122671323097, + "learning_rate": 9.840904550439523e-05, + "loss": 3.3527, + "step": 6303 + }, + { + "epoch": 4.061885576148268, + "grad_norm": 1.8083562234210395, + "learning_rate": 9.84085409610325e-05, + "loss": 3.2849, + "step": 6304 + }, + { + "epoch": 4.062530217566478, + "grad_norm": 1.5001176493403483, + "learning_rate": 9.8408036338986e-05, + "loss": 3.1255, + "step": 6305 + }, + { + "epoch": 4.06317485898469, + "grad_norm": 1.8229702378629966, + "learning_rate": 9.840753163825657e-05, + "loss": 3.4537, + "step": 6306 + }, + { + "epoch": 4.063819500402901, + "grad_norm": 1.7395711860387695, + "learning_rate": 9.840702685884504e-05, + "loss": 3.5775, + "step": 6307 + }, + { + "epoch": 4.064464141821112, + "grad_norm": 1.630408649942803, + "learning_rate": 9.840652200075221e-05, + "loss": 3.0318, + "step": 6308 + }, + { + "epoch": 4.065108783239323, + "grad_norm": 1.6981421817773696, + "learning_rate": 9.840601706397893e-05, + "loss": 3.2055, + "step": 6309 + }, + { + "epoch": 4.065753424657534, + "grad_norm": 1.3815093446903826, + "learning_rate": 9.840551204852602e-05, + "loss": 3.3289, + "step": 6310 + }, + { + "epoch": 4.066398066075745, + "grad_norm": 1.9474503742611455, + "learning_rate": 9.840500695439431e-05, + "loss": 3.2171, + "step": 6311 + }, + { + "epoch": 4.067042707493957, + "grad_norm": 1.8146310410714146, + "learning_rate": 9.840450178158464e-05, + "loss": 3.1185, + "step": 6312 + }, + { + "epoch": 4.067687348912168, + "grad_norm": 1.88610064125699, + "learning_rate": 9.840399653009785e-05, + "loss": 3.732, + "step": 6313 + }, + { + "epoch": 4.068331990330378, + "grad_norm": 1.516661552647394, + "learning_rate": 9.840349119993473e-05, + "loss": 3.0866, + "step": 6314 + }, + { + "epoch": 4.06897663174859, + "grad_norm": 1.8849845706479949, + "learning_rate": 9.840298579109614e-05, + "loss": 3.4591, + "step": 6315 + }, + { + "epoch": 4.069621273166801, + "grad_norm": 2.360473559197621, + "learning_rate": 9.84024803035829e-05, + "loss": 3.1408, + "step": 6316 + }, + { + "epoch": 4.070265914585012, + "grad_norm": 1.7349842855998305, + "learning_rate": 9.840197473739586e-05, + "loss": 3.3744, + "step": 6317 + }, + { + "epoch": 4.0709105560032235, + "grad_norm": 2.3534706122269693, + "learning_rate": 9.840146909253582e-05, + "loss": 3.0386, + "step": 6318 + }, + { + "epoch": 4.071555197421434, + "grad_norm": 1.6153490021047474, + "learning_rate": 9.840096336900362e-05, + "loss": 2.8665, + "step": 6319 + }, + { + "epoch": 4.072199838839645, + "grad_norm": 1.756259677923804, + "learning_rate": 9.84004575668001e-05, + "loss": 3.0405, + "step": 6320 + }, + { + "epoch": 4.0728444802578565, + "grad_norm": 1.5506709782926573, + "learning_rate": 9.839995168592609e-05, + "loss": 3.3145, + "step": 6321 + }, + { + "epoch": 4.073489121676068, + "grad_norm": 1.9430809266515177, + "learning_rate": 9.83994457263824e-05, + "loss": 3.4457, + "step": 6322 + }, + { + "epoch": 4.074133763094279, + "grad_norm": 2.0563433768578507, + "learning_rate": 9.83989396881699e-05, + "loss": 3.126, + "step": 6323 + }, + { + "epoch": 4.0747784045124895, + "grad_norm": 1.7183807898028356, + "learning_rate": 9.83984335712894e-05, + "loss": 3.2766, + "step": 6324 + }, + { + "epoch": 4.075423045930701, + "grad_norm": 1.6578754211045967, + "learning_rate": 9.839792737574171e-05, + "loss": 3.4109, + "step": 6325 + }, + { + "epoch": 4.076067687348912, + "grad_norm": 1.6885513517999955, + "learning_rate": 9.839742110152768e-05, + "loss": 3.3362, + "step": 6326 + }, + { + "epoch": 4.076712328767123, + "grad_norm": 1.501732567685128, + "learning_rate": 9.839691474864816e-05, + "loss": 3.2832, + "step": 6327 + }, + { + "epoch": 4.077356970185335, + "grad_norm": 1.6905061924924762, + "learning_rate": 9.839640831710396e-05, + "loss": 3.3358, + "step": 6328 + }, + { + "epoch": 4.078001611603545, + "grad_norm": 1.6587011909165006, + "learning_rate": 9.83959018068959e-05, + "loss": 3.5954, + "step": 6329 + }, + { + "epoch": 4.078646253021756, + "grad_norm": 1.4118889602873814, + "learning_rate": 9.839539521802486e-05, + "loss": 3.1279, + "step": 6330 + }, + { + "epoch": 4.079290894439968, + "grad_norm": 1.5265187257648976, + "learning_rate": 9.839488855049163e-05, + "loss": 2.9199, + "step": 6331 + }, + { + "epoch": 4.079935535858179, + "grad_norm": 1.644908232206094, + "learning_rate": 9.839438180429703e-05, + "loss": 3.4272, + "step": 6332 + }, + { + "epoch": 4.08058017727639, + "grad_norm": 1.5299741822157877, + "learning_rate": 9.839387497944192e-05, + "loss": 2.9901, + "step": 6333 + }, + { + "epoch": 4.081224818694601, + "grad_norm": 1.4087626443514907, + "learning_rate": 9.839336807592714e-05, + "loss": 3.1166, + "step": 6334 + }, + { + "epoch": 4.081869460112812, + "grad_norm": 1.8864561389241175, + "learning_rate": 9.839286109375351e-05, + "loss": 3.1665, + "step": 6335 + }, + { + "epoch": 4.082514101531023, + "grad_norm": 1.4335244596487176, + "learning_rate": 9.839235403292185e-05, + "loss": 3.537, + "step": 6336 + }, + { + "epoch": 4.083158742949235, + "grad_norm": 2.1064590045610156, + "learning_rate": 9.8391846893433e-05, + "loss": 3.6307, + "step": 6337 + }, + { + "epoch": 4.083803384367446, + "grad_norm": 1.5118247936921272, + "learning_rate": 9.83913396752878e-05, + "loss": 3.2647, + "step": 6338 + }, + { + "epoch": 4.084448025785656, + "grad_norm": 1.7223208186504029, + "learning_rate": 9.839083237848711e-05, + "loss": 2.9742, + "step": 6339 + }, + { + "epoch": 4.085092667203868, + "grad_norm": 1.92678665066432, + "learning_rate": 9.839032500303171e-05, + "loss": 3.0545, + "step": 6340 + }, + { + "epoch": 4.085737308622079, + "grad_norm": 1.376458081473057, + "learning_rate": 9.838981754892246e-05, + "loss": 3.0598, + "step": 6341 + }, + { + "epoch": 4.08638195004029, + "grad_norm": 1.6590001632248623, + "learning_rate": 9.838931001616017e-05, + "loss": 3.3128, + "step": 6342 + }, + { + "epoch": 4.087026591458502, + "grad_norm": 1.460723730705444, + "learning_rate": 9.838880240474572e-05, + "loss": 3.3042, + "step": 6343 + }, + { + "epoch": 4.087671232876712, + "grad_norm": 1.6911896952299312, + "learning_rate": 9.838829471467991e-05, + "loss": 2.8365, + "step": 6344 + }, + { + "epoch": 4.088315874294923, + "grad_norm": 1.4762928682429783, + "learning_rate": 9.83877869459636e-05, + "loss": 3.3587, + "step": 6345 + }, + { + "epoch": 4.088960515713135, + "grad_norm": 1.6832494441727555, + "learning_rate": 9.838727909859759e-05, + "loss": 3.2523, + "step": 6346 + }, + { + "epoch": 4.089605157131346, + "grad_norm": 1.4871328763385263, + "learning_rate": 9.838677117258273e-05, + "loss": 3.1915, + "step": 6347 + }, + { + "epoch": 4.090249798549557, + "grad_norm": 1.8245005765077145, + "learning_rate": 9.838626316791983e-05, + "loss": 3.2397, + "step": 6348 + }, + { + "epoch": 4.090894439967768, + "grad_norm": 1.5590273088723423, + "learning_rate": 9.838575508460977e-05, + "loss": 3.0795, + "step": 6349 + }, + { + "epoch": 4.091539081385979, + "grad_norm": 2.601337949965237, + "learning_rate": 9.838524692265336e-05, + "loss": 3.4564, + "step": 6350 + }, + { + "epoch": 4.09218372280419, + "grad_norm": 2.1090268073844203, + "learning_rate": 9.838473868205145e-05, + "loss": 3.2342, + "step": 6351 + }, + { + "epoch": 4.0928283642224015, + "grad_norm": 2.2104433503028997, + "learning_rate": 9.838423036280485e-05, + "loss": 3.0407, + "step": 6352 + }, + { + "epoch": 4.093473005640613, + "grad_norm": 1.7099883068620452, + "learning_rate": 9.838372196491442e-05, + "loss": 3.167, + "step": 6353 + }, + { + "epoch": 4.094117647058823, + "grad_norm": 1.9399276219838555, + "learning_rate": 9.838321348838097e-05, + "loss": 3.3788, + "step": 6354 + }, + { + "epoch": 4.0947622884770345, + "grad_norm": 1.6605187541634054, + "learning_rate": 9.838270493320535e-05, + "loss": 2.9925, + "step": 6355 + }, + { + "epoch": 4.095406929895246, + "grad_norm": 1.6269228614151072, + "learning_rate": 9.838219629938837e-05, + "loss": 3.2763, + "step": 6356 + }, + { + "epoch": 4.096051571313457, + "grad_norm": 1.823063185369545, + "learning_rate": 9.838168758693092e-05, + "loss": 2.91, + "step": 6357 + }, + { + "epoch": 4.096696212731668, + "grad_norm": 1.7230676167955905, + "learning_rate": 9.838117879583379e-05, + "loss": 3.2038, + "step": 6358 + }, + { + "epoch": 4.097340854149879, + "grad_norm": 1.5629966116965304, + "learning_rate": 9.838066992609782e-05, + "loss": 3.3872, + "step": 6359 + }, + { + "epoch": 4.09798549556809, + "grad_norm": 2.226990488764917, + "learning_rate": 9.838016097772387e-05, + "loss": 2.8643, + "step": 6360 + }, + { + "epoch": 4.098630136986301, + "grad_norm": 1.5602009312521696, + "learning_rate": 9.837965195071275e-05, + "loss": 3.132, + "step": 6361 + }, + { + "epoch": 4.099274778404513, + "grad_norm": 1.7747177062031023, + "learning_rate": 9.837914284506531e-05, + "loss": 3.0594, + "step": 6362 + }, + { + "epoch": 4.099919419822724, + "grad_norm": 1.6302338671695806, + "learning_rate": 9.837863366078239e-05, + "loss": 3.2984, + "step": 6363 + }, + { + "epoch": 4.100564061240934, + "grad_norm": 1.6552964190133865, + "learning_rate": 9.83781243978648e-05, + "loss": 3.5723, + "step": 6364 + }, + { + "epoch": 4.101208702659146, + "grad_norm": 1.3509773677126848, + "learning_rate": 9.837761505631341e-05, + "loss": 3.0875, + "step": 6365 + }, + { + "epoch": 4.101853344077357, + "grad_norm": 1.8726632321161552, + "learning_rate": 9.837710563612905e-05, + "loss": 3.4193, + "step": 6366 + }, + { + "epoch": 4.102497985495568, + "grad_norm": 1.7196638201097705, + "learning_rate": 9.837659613731251e-05, + "loss": 3.186, + "step": 6367 + }, + { + "epoch": 4.10314262691378, + "grad_norm": 1.6330178090184557, + "learning_rate": 9.83760865598647e-05, + "loss": 3.3035, + "step": 6368 + }, + { + "epoch": 4.10378726833199, + "grad_norm": 1.9411370817963065, + "learning_rate": 9.83755769037864e-05, + "loss": 3.1958, + "step": 6369 + }, + { + "epoch": 4.104431909750201, + "grad_norm": 1.5209842076266031, + "learning_rate": 9.837506716907849e-05, + "loss": 3.2627, + "step": 6370 + }, + { + "epoch": 4.105076551168413, + "grad_norm": 1.6295875894667906, + "learning_rate": 9.837455735574179e-05, + "loss": 3.3112, + "step": 6371 + }, + { + "epoch": 4.105721192586624, + "grad_norm": 1.6357344248061871, + "learning_rate": 9.837404746377711e-05, + "loss": 3.276, + "step": 6372 + }, + { + "epoch": 4.106365834004835, + "grad_norm": 1.461227421448411, + "learning_rate": 9.837353749318532e-05, + "loss": 2.9203, + "step": 6373 + }, + { + "epoch": 4.107010475423046, + "grad_norm": 1.646540574394212, + "learning_rate": 9.837302744396725e-05, + "loss": 3.1018, + "step": 6374 + }, + { + "epoch": 4.107655116841257, + "grad_norm": 1.4965188091317967, + "learning_rate": 9.837251731612373e-05, + "loss": 3.2366, + "step": 6375 + }, + { + "epoch": 4.108299758259468, + "grad_norm": 1.7032961852467983, + "learning_rate": 9.837200710965562e-05, + "loss": 3.3845, + "step": 6376 + }, + { + "epoch": 4.1089443996776795, + "grad_norm": 1.7237849060435106, + "learning_rate": 9.837149682456373e-05, + "loss": 3.1639, + "step": 6377 + }, + { + "epoch": 4.109589041095891, + "grad_norm": 1.2624359559934144, + "learning_rate": 9.837098646084893e-05, + "loss": 2.7805, + "step": 6378 + }, + { + "epoch": 4.110233682514101, + "grad_norm": 1.5775324405966575, + "learning_rate": 9.837047601851201e-05, + "loss": 2.8663, + "step": 6379 + }, + { + "epoch": 4.110878323932313, + "grad_norm": 1.3284183170153583, + "learning_rate": 9.836996549755385e-05, + "loss": 3.1684, + "step": 6380 + }, + { + "epoch": 4.111522965350524, + "grad_norm": 1.41864366270017, + "learning_rate": 9.836945489797525e-05, + "loss": 3.3401, + "step": 6381 + }, + { + "epoch": 4.112167606768735, + "grad_norm": 1.53301322491405, + "learning_rate": 9.836894421977712e-05, + "loss": 3.3868, + "step": 6382 + }, + { + "epoch": 4.1128122481869465, + "grad_norm": 1.7685920452028718, + "learning_rate": 9.836843346296022e-05, + "loss": 3.2901, + "step": 6383 + }, + { + "epoch": 4.113456889605157, + "grad_norm": 1.5669550861382382, + "learning_rate": 9.836792262752543e-05, + "loss": 3.0018, + "step": 6384 + }, + { + "epoch": 4.114101531023368, + "grad_norm": 1.8881744207123634, + "learning_rate": 9.836741171347359e-05, + "loss": 3.415, + "step": 6385 + }, + { + "epoch": 4.1147461724415795, + "grad_norm": 1.5555260821231625, + "learning_rate": 9.83669007208055e-05, + "loss": 3.1747, + "step": 6386 + }, + { + "epoch": 4.115390813859791, + "grad_norm": 1.6462088799796402, + "learning_rate": 9.836638964952207e-05, + "loss": 3.3904, + "step": 6387 + }, + { + "epoch": 4.116035455278001, + "grad_norm": 1.5487679999417128, + "learning_rate": 9.836587849962406e-05, + "loss": 3.0496, + "step": 6388 + }, + { + "epoch": 4.1166800966962125, + "grad_norm": 1.7067557276076788, + "learning_rate": 9.836536727111236e-05, + "loss": 3.365, + "step": 6389 + }, + { + "epoch": 4.117324738114424, + "grad_norm": 1.6211451101017085, + "learning_rate": 9.836485596398781e-05, + "loss": 3.5476, + "step": 6390 + }, + { + "epoch": 4.117969379532635, + "grad_norm": 1.7346508575798163, + "learning_rate": 9.836434457825122e-05, + "loss": 3.2266, + "step": 6391 + }, + { + "epoch": 4.118614020950846, + "grad_norm": 1.6944386222528303, + "learning_rate": 9.836383311390346e-05, + "loss": 3.3298, + "step": 6392 + }, + { + "epoch": 4.119258662369057, + "grad_norm": 1.326777168262789, + "learning_rate": 9.836332157094536e-05, + "loss": 3.1142, + "step": 6393 + }, + { + "epoch": 4.119903303787268, + "grad_norm": 1.85631579098964, + "learning_rate": 9.836280994937773e-05, + "loss": 3.4867, + "step": 6394 + }, + { + "epoch": 4.120547945205479, + "grad_norm": 1.9206401588218853, + "learning_rate": 9.836229824920146e-05, + "loss": 2.9924, + "step": 6395 + }, + { + "epoch": 4.121192586623691, + "grad_norm": 1.2791535168074035, + "learning_rate": 9.836178647041736e-05, + "loss": 3.0025, + "step": 6396 + }, + { + "epoch": 4.121837228041902, + "grad_norm": 1.7114915885990218, + "learning_rate": 9.836127461302628e-05, + "loss": 3.4046, + "step": 6397 + }, + { + "epoch": 4.122481869460112, + "grad_norm": 1.5992989797638293, + "learning_rate": 9.836076267702905e-05, + "loss": 3.0282, + "step": 6398 + }, + { + "epoch": 4.123126510878324, + "grad_norm": 1.6304616850723783, + "learning_rate": 9.836025066242654e-05, + "loss": 3.2812, + "step": 6399 + }, + { + "epoch": 4.123771152296535, + "grad_norm": 1.9538541992445533, + "learning_rate": 9.835973856921955e-05, + "loss": 3.3825, + "step": 6400 + }, + { + "epoch": 4.123771152296535, + "eval_loss": 4.242188930511475, + "eval_runtime": 2.9834, + "eval_samples_per_second": 33.519, + "eval_steps_per_second": 4.357, + "step": 6400 + }, + { + "epoch": 4.124415793714746, + "grad_norm": 1.5328615727934232, + "learning_rate": 9.835922639740896e-05, + "loss": 3.1894, + "step": 6401 + }, + { + "epoch": 4.125060435132958, + "grad_norm": 1.9058561208131555, + "learning_rate": 9.835871414699559e-05, + "loss": 3.0742, + "step": 6402 + }, + { + "epoch": 4.125705076551168, + "grad_norm": 1.6258371076364033, + "learning_rate": 9.835820181798027e-05, + "loss": 3.2922, + "step": 6403 + }, + { + "epoch": 4.126349717969379, + "grad_norm": 2.1373357719295445, + "learning_rate": 9.835768941036386e-05, + "loss": 3.3832, + "step": 6404 + }, + { + "epoch": 4.126994359387591, + "grad_norm": 1.4685887406018758, + "learning_rate": 9.83571769241472e-05, + "loss": 3.5131, + "step": 6405 + }, + { + "epoch": 4.127639000805802, + "grad_norm": 2.0174068866103667, + "learning_rate": 9.835666435933113e-05, + "loss": 3.2661, + "step": 6406 + }, + { + "epoch": 4.128283642224013, + "grad_norm": 1.3231012482667819, + "learning_rate": 9.835615171591649e-05, + "loss": 3.1005, + "step": 6407 + }, + { + "epoch": 4.128928283642224, + "grad_norm": 1.8698984529642875, + "learning_rate": 9.835563899390412e-05, + "loss": 3.5966, + "step": 6408 + }, + { + "epoch": 4.129572925060435, + "grad_norm": 1.3414622054149035, + "learning_rate": 9.835512619329487e-05, + "loss": 2.9866, + "step": 6409 + }, + { + "epoch": 4.130217566478646, + "grad_norm": 1.7087212318190268, + "learning_rate": 9.835461331408955e-05, + "loss": 3.0091, + "step": 6410 + }, + { + "epoch": 4.1308622078968575, + "grad_norm": 1.5219446109554158, + "learning_rate": 9.835410035628906e-05, + "loss": 3.4093, + "step": 6411 + }, + { + "epoch": 4.131506849315069, + "grad_norm": 1.6949606871268714, + "learning_rate": 9.835358731989419e-05, + "loss": 3.23, + "step": 6412 + }, + { + "epoch": 4.132151490733279, + "grad_norm": 1.4136991323067907, + "learning_rate": 9.835307420490582e-05, + "loss": 3.8186, + "step": 6413 + }, + { + "epoch": 4.1327961321514906, + "grad_norm": 1.721133905166282, + "learning_rate": 9.835256101132478e-05, + "loss": 3.2777, + "step": 6414 + }, + { + "epoch": 4.133440773569702, + "grad_norm": 1.3873049129154351, + "learning_rate": 9.83520477391519e-05, + "loss": 3.2742, + "step": 6415 + }, + { + "epoch": 4.134085414987913, + "grad_norm": 1.8878159630697229, + "learning_rate": 9.835153438838804e-05, + "loss": 2.9774, + "step": 6416 + }, + { + "epoch": 4.1347300564061245, + "grad_norm": 1.4459365340853851, + "learning_rate": 9.835102095903404e-05, + "loss": 2.9943, + "step": 6417 + }, + { + "epoch": 4.135374697824335, + "grad_norm": 1.939492227418958, + "learning_rate": 9.835050745109073e-05, + "loss": 3.2754, + "step": 6418 + }, + { + "epoch": 4.136019339242546, + "grad_norm": 1.5487200624143542, + "learning_rate": 9.834999386455896e-05, + "loss": 3.313, + "step": 6419 + }, + { + "epoch": 4.1366639806607575, + "grad_norm": 1.7965574946551692, + "learning_rate": 9.834948019943958e-05, + "loss": 3.3366, + "step": 6420 + }, + { + "epoch": 4.137308622078969, + "grad_norm": 1.6755067843844862, + "learning_rate": 9.834896645573344e-05, + "loss": 3.0383, + "step": 6421 + }, + { + "epoch": 4.13795326349718, + "grad_norm": 1.5713854497543536, + "learning_rate": 9.834845263344136e-05, + "loss": 3.3066, + "step": 6422 + }, + { + "epoch": 4.1385979049153905, + "grad_norm": 1.5352713561016786, + "learning_rate": 9.834793873256419e-05, + "loss": 3.3979, + "step": 6423 + }, + { + "epoch": 4.139242546333602, + "grad_norm": 1.7000991066448086, + "learning_rate": 9.834742475310279e-05, + "loss": 3.1974, + "step": 6424 + }, + { + "epoch": 4.139887187751813, + "grad_norm": 1.59437226749242, + "learning_rate": 9.8346910695058e-05, + "loss": 3.2911, + "step": 6425 + }, + { + "epoch": 4.140531829170024, + "grad_norm": 1.471493754128757, + "learning_rate": 9.834639655843065e-05, + "loss": 3.0504, + "step": 6426 + }, + { + "epoch": 4.141176470588236, + "grad_norm": 1.7350552139190696, + "learning_rate": 9.83458823432216e-05, + "loss": 3.3614, + "step": 6427 + }, + { + "epoch": 4.141821112006446, + "grad_norm": 1.652702021638134, + "learning_rate": 9.834536804943169e-05, + "loss": 3.4526, + "step": 6428 + }, + { + "epoch": 4.142465753424657, + "grad_norm": 1.8234070374797524, + "learning_rate": 9.834485367706176e-05, + "loss": 3.2949, + "step": 6429 + }, + { + "epoch": 4.143110394842869, + "grad_norm": 1.76307625204579, + "learning_rate": 9.834433922611266e-05, + "loss": 2.7933, + "step": 6430 + }, + { + "epoch": 4.14375503626108, + "grad_norm": 1.5313883329874172, + "learning_rate": 9.834382469658522e-05, + "loss": 2.9891, + "step": 6431 + }, + { + "epoch": 4.144399677679291, + "grad_norm": 1.7585808255950817, + "learning_rate": 9.834331008848033e-05, + "loss": 3.2805, + "step": 6432 + }, + { + "epoch": 4.145044319097502, + "grad_norm": 1.869551112763148, + "learning_rate": 9.834279540179878e-05, + "loss": 3.192, + "step": 6433 + }, + { + "epoch": 4.145688960515713, + "grad_norm": 1.752172078330383, + "learning_rate": 9.834228063654145e-05, + "loss": 3.0999, + "step": 6434 + }, + { + "epoch": 4.146333601933924, + "grad_norm": 1.410689670109155, + "learning_rate": 9.834176579270915e-05, + "loss": 2.9658, + "step": 6435 + }, + { + "epoch": 4.146978243352136, + "grad_norm": 1.8028705229219888, + "learning_rate": 9.834125087030278e-05, + "loss": 3.2605, + "step": 6436 + }, + { + "epoch": 4.147622884770347, + "grad_norm": 1.6350626166058069, + "learning_rate": 9.834073586932313e-05, + "loss": 3.4418, + "step": 6437 + }, + { + "epoch": 4.148267526188557, + "grad_norm": 1.3248395809136078, + "learning_rate": 9.834022078977109e-05, + "loss": 3.332, + "step": 6438 + }, + { + "epoch": 4.148912167606769, + "grad_norm": 1.8588450940205463, + "learning_rate": 9.833970563164748e-05, + "loss": 3.2043, + "step": 6439 + }, + { + "epoch": 4.14955680902498, + "grad_norm": 1.2225947300655027, + "learning_rate": 9.833919039495313e-05, + "loss": 3.259, + "step": 6440 + }, + { + "epoch": 4.150201450443191, + "grad_norm": 1.926950174674398, + "learning_rate": 9.833867507968895e-05, + "loss": 3.7437, + "step": 6441 + }, + { + "epoch": 4.1508460918614025, + "grad_norm": 1.4532783328544625, + "learning_rate": 9.833815968585573e-05, + "loss": 3.2939, + "step": 6442 + }, + { + "epoch": 4.151490733279613, + "grad_norm": 1.8304775179258237, + "learning_rate": 9.833764421345433e-05, + "loss": 3.4917, + "step": 6443 + }, + { + "epoch": 4.152135374697824, + "grad_norm": 1.60470365982539, + "learning_rate": 9.833712866248559e-05, + "loss": 3.4024, + "step": 6444 + }, + { + "epoch": 4.1527800161160355, + "grad_norm": 1.5257149465184188, + "learning_rate": 9.833661303295039e-05, + "loss": 3.3397, + "step": 6445 + }, + { + "epoch": 4.153424657534247, + "grad_norm": 1.38101922818797, + "learning_rate": 9.833609732484952e-05, + "loss": 3.3317, + "step": 6446 + }, + { + "epoch": 4.154069298952457, + "grad_norm": 1.882627609769109, + "learning_rate": 9.833558153818389e-05, + "loss": 3.0324, + "step": 6447 + }, + { + "epoch": 4.1547139403706685, + "grad_norm": 1.6453231718125487, + "learning_rate": 9.833506567295428e-05, + "loss": 3.3141, + "step": 6448 + }, + { + "epoch": 4.15535858178888, + "grad_norm": 1.5331414400747991, + "learning_rate": 9.83345497291616e-05, + "loss": 3.3555, + "step": 6449 + }, + { + "epoch": 4.156003223207091, + "grad_norm": 1.6234443712064208, + "learning_rate": 9.833403370680667e-05, + "loss": 3.1492, + "step": 6450 + }, + { + "epoch": 4.1566478646253024, + "grad_norm": 1.3268952088844805, + "learning_rate": 9.833351760589034e-05, + "loss": 3.2798, + "step": 6451 + }, + { + "epoch": 4.157292506043513, + "grad_norm": 1.478945934234305, + "learning_rate": 9.833300142641345e-05, + "loss": 2.9833, + "step": 6452 + }, + { + "epoch": 4.157937147461724, + "grad_norm": 1.5178949799221013, + "learning_rate": 9.833248516837685e-05, + "loss": 3.2257, + "step": 6453 + }, + { + "epoch": 4.1585817888799355, + "grad_norm": 1.655360971002452, + "learning_rate": 9.833196883178139e-05, + "loss": 3.0287, + "step": 6454 + }, + { + "epoch": 4.159226430298147, + "grad_norm": 1.475404569307715, + "learning_rate": 9.833145241662793e-05, + "loss": 3.2105, + "step": 6455 + }, + { + "epoch": 4.159871071716358, + "grad_norm": 1.5375187198511262, + "learning_rate": 9.83309359229173e-05, + "loss": 3.1251, + "step": 6456 + }, + { + "epoch": 4.1605157131345685, + "grad_norm": 1.609780312669881, + "learning_rate": 9.833041935065036e-05, + "loss": 3.3503, + "step": 6457 + }, + { + "epoch": 4.16116035455278, + "grad_norm": 1.472928822483713, + "learning_rate": 9.832990269982796e-05, + "loss": 3.2144, + "step": 6458 + }, + { + "epoch": 4.161804995970991, + "grad_norm": 1.448954258335045, + "learning_rate": 9.832938597045094e-05, + "loss": 3.345, + "step": 6459 + }, + { + "epoch": 4.162449637389202, + "grad_norm": 1.3130478567679467, + "learning_rate": 9.832886916252015e-05, + "loss": 3.3855, + "step": 6460 + }, + { + "epoch": 4.163094278807414, + "grad_norm": 1.317468611875551, + "learning_rate": 9.832835227603644e-05, + "loss": 2.9889, + "step": 6461 + }, + { + "epoch": 4.163738920225624, + "grad_norm": 1.4396123954036528, + "learning_rate": 9.832783531100066e-05, + "loss": 3.1183, + "step": 6462 + }, + { + "epoch": 4.164383561643835, + "grad_norm": 1.715289346226137, + "learning_rate": 9.832731826741365e-05, + "loss": 3.3366, + "step": 6463 + }, + { + "epoch": 4.165028203062047, + "grad_norm": 1.2953466898666952, + "learning_rate": 9.832680114527628e-05, + "loss": 3.4564, + "step": 6464 + }, + { + "epoch": 4.165672844480258, + "grad_norm": 1.8955677002423366, + "learning_rate": 9.832628394458938e-05, + "loss": 3.3809, + "step": 6465 + }, + { + "epoch": 4.166317485898469, + "grad_norm": 1.4942463491143263, + "learning_rate": 9.832576666535382e-05, + "loss": 3.4297, + "step": 6466 + }, + { + "epoch": 4.16696212731668, + "grad_norm": 1.926625837900246, + "learning_rate": 9.832524930757042e-05, + "loss": 3.2214, + "step": 6467 + }, + { + "epoch": 4.167606768734891, + "grad_norm": 2.003982368362077, + "learning_rate": 9.832473187124005e-05, + "loss": 3.1127, + "step": 6468 + }, + { + "epoch": 4.168251410153102, + "grad_norm": 1.4851599594544498, + "learning_rate": 9.832421435636355e-05, + "loss": 3.4207, + "step": 6469 + }, + { + "epoch": 4.168896051571314, + "grad_norm": 1.980734732559906, + "learning_rate": 9.832369676294179e-05, + "loss": 3.171, + "step": 6470 + }, + { + "epoch": 4.169540692989525, + "grad_norm": 1.4713051557222547, + "learning_rate": 9.83231790909756e-05, + "loss": 3.3771, + "step": 6471 + }, + { + "epoch": 4.170185334407735, + "grad_norm": 2.0062704854412066, + "learning_rate": 9.832266134046584e-05, + "loss": 3.3405, + "step": 6472 + }, + { + "epoch": 4.170829975825947, + "grad_norm": 1.4427060169175216, + "learning_rate": 9.832214351141335e-05, + "loss": 3.192, + "step": 6473 + }, + { + "epoch": 4.171474617244158, + "grad_norm": 1.677536370834164, + "learning_rate": 9.832162560381899e-05, + "loss": 3.4258, + "step": 6474 + }, + { + "epoch": 4.172119258662369, + "grad_norm": 1.3639015691213798, + "learning_rate": 9.83211076176836e-05, + "loss": 3.3445, + "step": 6475 + }, + { + "epoch": 4.1727639000805805, + "grad_norm": 1.4029846674220172, + "learning_rate": 9.832058955300804e-05, + "loss": 3.4832, + "step": 6476 + }, + { + "epoch": 4.173408541498791, + "grad_norm": 1.403632639653016, + "learning_rate": 9.832007140979315e-05, + "loss": 3.2683, + "step": 6477 + }, + { + "epoch": 4.174053182917002, + "grad_norm": 1.608961985428998, + "learning_rate": 9.831955318803981e-05, + "loss": 3.2401, + "step": 6478 + }, + { + "epoch": 4.1746978243352135, + "grad_norm": 1.5902040454221646, + "learning_rate": 9.831903488774884e-05, + "loss": 3.1665, + "step": 6479 + }, + { + "epoch": 4.175342465753425, + "grad_norm": 1.4658360618193296, + "learning_rate": 9.831851650892111e-05, + "loss": 3.5802, + "step": 6480 + }, + { + "epoch": 4.175987107171636, + "grad_norm": 1.53725610260801, + "learning_rate": 9.831799805155746e-05, + "loss": 2.9681, + "step": 6481 + }, + { + "epoch": 4.1766317485898465, + "grad_norm": 1.4200524133762744, + "learning_rate": 9.831747951565873e-05, + "loss": 2.975, + "step": 6482 + }, + { + "epoch": 4.177276390008058, + "grad_norm": 1.4951057956486051, + "learning_rate": 9.83169609012258e-05, + "loss": 3.2234, + "step": 6483 + }, + { + "epoch": 4.177921031426269, + "grad_norm": 1.733405882124199, + "learning_rate": 9.831644220825952e-05, + "loss": 3.1663, + "step": 6484 + }, + { + "epoch": 4.17856567284448, + "grad_norm": 1.4283338811999495, + "learning_rate": 9.831592343676072e-05, + "loss": 3.3377, + "step": 6485 + }, + { + "epoch": 4.179210314262692, + "grad_norm": 1.5205558867049778, + "learning_rate": 9.831540458673026e-05, + "loss": 3.3708, + "step": 6486 + }, + { + "epoch": 4.179854955680902, + "grad_norm": 1.3767639438424155, + "learning_rate": 9.831488565816899e-05, + "loss": 3.3663, + "step": 6487 + }, + { + "epoch": 4.1804995970991135, + "grad_norm": 1.5643428558810744, + "learning_rate": 9.831436665107777e-05, + "loss": 3.3732, + "step": 6488 + }, + { + "epoch": 4.181144238517325, + "grad_norm": 1.3966410927339106, + "learning_rate": 9.831384756545745e-05, + "loss": 3.5567, + "step": 6489 + }, + { + "epoch": 4.181788879935536, + "grad_norm": 1.4983790235692998, + "learning_rate": 9.831332840130888e-05, + "loss": 3.1565, + "step": 6490 + }, + { + "epoch": 4.182433521353747, + "grad_norm": 1.8608521158250249, + "learning_rate": 9.831280915863291e-05, + "loss": 3.489, + "step": 6491 + }, + { + "epoch": 4.183078162771958, + "grad_norm": 1.4196623837892477, + "learning_rate": 9.831228983743043e-05, + "loss": 3.0074, + "step": 6492 + }, + { + "epoch": 4.183722804190169, + "grad_norm": 1.786211472748744, + "learning_rate": 9.831177043770222e-05, + "loss": 3.069, + "step": 6493 + }, + { + "epoch": 4.18436744560838, + "grad_norm": 1.4931741158634855, + "learning_rate": 9.831125095944919e-05, + "loss": 3.3094, + "step": 6494 + }, + { + "epoch": 4.185012087026592, + "grad_norm": 1.6842813832639225, + "learning_rate": 9.831073140267216e-05, + "loss": 2.9604, + "step": 6495 + }, + { + "epoch": 4.185656728444803, + "grad_norm": 1.9189173535621447, + "learning_rate": 9.831021176737203e-05, + "loss": 3.5417, + "step": 6496 + }, + { + "epoch": 4.186301369863013, + "grad_norm": 1.7508917826319896, + "learning_rate": 9.830969205354961e-05, + "loss": 3.3964, + "step": 6497 + }, + { + "epoch": 4.186946011281225, + "grad_norm": 1.7888781228784585, + "learning_rate": 9.830917226120574e-05, + "loss": 3.223, + "step": 6498 + }, + { + "epoch": 4.187590652699436, + "grad_norm": 1.4685112460365897, + "learning_rate": 9.830865239034134e-05, + "loss": 3.3694, + "step": 6499 + }, + { + "epoch": 4.188235294117647, + "grad_norm": 1.8533501972048176, + "learning_rate": 9.83081324409572e-05, + "loss": 3.4338, + "step": 6500 + }, + { + "epoch": 4.188235294117647, + "eval_loss": 4.244458198547363, + "eval_runtime": 2.9909, + "eval_samples_per_second": 33.435, + "eval_steps_per_second": 4.347, + "step": 6500 + }, + { + "epoch": 4.188879935535859, + "grad_norm": 1.6260041357162371, + "learning_rate": 9.830761241305422e-05, + "loss": 3.2943, + "step": 6501 + }, + { + "epoch": 4.189524576954069, + "grad_norm": 1.802333142587033, + "learning_rate": 9.830709230663321e-05, + "loss": 3.7324, + "step": 6502 + }, + { + "epoch": 4.19016921837228, + "grad_norm": 1.6938118179838793, + "learning_rate": 9.830657212169506e-05, + "loss": 3.5772, + "step": 6503 + }, + { + "epoch": 4.190813859790492, + "grad_norm": 1.863564356528444, + "learning_rate": 9.83060518582406e-05, + "loss": 3.4422, + "step": 6504 + }, + { + "epoch": 4.191458501208703, + "grad_norm": 1.6330590067529493, + "learning_rate": 9.83055315162707e-05, + "loss": 3.3057, + "step": 6505 + }, + { + "epoch": 4.192103142626914, + "grad_norm": 1.4984455568859727, + "learning_rate": 9.83050110957862e-05, + "loss": 3.3002, + "step": 6506 + }, + { + "epoch": 4.192747784045125, + "grad_norm": 1.8780693656308693, + "learning_rate": 9.830449059678798e-05, + "loss": 3.0307, + "step": 6507 + }, + { + "epoch": 4.193392425463336, + "grad_norm": 1.6361336516191833, + "learning_rate": 9.830397001927687e-05, + "loss": 3.4148, + "step": 6508 + }, + { + "epoch": 4.194037066881547, + "grad_norm": 1.7482824028539627, + "learning_rate": 9.830344936325374e-05, + "loss": 3.4686, + "step": 6509 + }, + { + "epoch": 4.1946817082997585, + "grad_norm": 1.9117712123350377, + "learning_rate": 9.830292862871945e-05, + "loss": 3.0187, + "step": 6510 + }, + { + "epoch": 4.19532634971797, + "grad_norm": 1.5990973132599606, + "learning_rate": 9.830240781567482e-05, + "loss": 3.3416, + "step": 6511 + }, + { + "epoch": 4.19597099113618, + "grad_norm": 2.012714248375571, + "learning_rate": 9.830188692412074e-05, + "loss": 3.3629, + "step": 6512 + }, + { + "epoch": 4.1966156325543915, + "grad_norm": 1.707911482087105, + "learning_rate": 9.830136595405805e-05, + "loss": 3.2958, + "step": 6513 + }, + { + "epoch": 4.197260273972603, + "grad_norm": 1.640850692240952, + "learning_rate": 9.830084490548763e-05, + "loss": 3.2881, + "step": 6514 + }, + { + "epoch": 4.197904915390814, + "grad_norm": 1.3933142594125454, + "learning_rate": 9.830032377841031e-05, + "loss": 3.0727, + "step": 6515 + }, + { + "epoch": 4.198549556809025, + "grad_norm": 1.8493127316178708, + "learning_rate": 9.829980257282695e-05, + "loss": 3.1112, + "step": 6516 + }, + { + "epoch": 4.199194198227236, + "grad_norm": 1.5603541444401123, + "learning_rate": 9.82992812887384e-05, + "loss": 3.074, + "step": 6517 + }, + { + "epoch": 4.199838839645447, + "grad_norm": 1.731725313141627, + "learning_rate": 9.829875992614553e-05, + "loss": 3.3111, + "step": 6518 + }, + { + "epoch": 4.200483481063658, + "grad_norm": 1.6925010886396905, + "learning_rate": 9.82982384850492e-05, + "loss": 3.529, + "step": 6519 + }, + { + "epoch": 4.20112812248187, + "grad_norm": 1.6417144601585079, + "learning_rate": 9.829771696545025e-05, + "loss": 2.8407, + "step": 6520 + }, + { + "epoch": 4.201772763900081, + "grad_norm": 1.933689342982341, + "learning_rate": 9.829719536734955e-05, + "loss": 3.5119, + "step": 6521 + }, + { + "epoch": 4.202417405318291, + "grad_norm": 1.6701858584739704, + "learning_rate": 9.829667369074794e-05, + "loss": 3.2341, + "step": 6522 + }, + { + "epoch": 4.203062046736503, + "grad_norm": 2.343940977101888, + "learning_rate": 9.829615193564628e-05, + "loss": 3.1754, + "step": 6523 + }, + { + "epoch": 4.203706688154714, + "grad_norm": 1.6126085336524196, + "learning_rate": 9.829563010204547e-05, + "loss": 3.714, + "step": 6524 + }, + { + "epoch": 4.204351329572925, + "grad_norm": 1.7201790391170426, + "learning_rate": 9.829510818994631e-05, + "loss": 3.1505, + "step": 6525 + }, + { + "epoch": 4.204995970991136, + "grad_norm": 1.821318429932843, + "learning_rate": 9.829458619934969e-05, + "loss": 3.4996, + "step": 6526 + }, + { + "epoch": 4.205640612409347, + "grad_norm": 1.7630912259821732, + "learning_rate": 9.829406413025643e-05, + "loss": 3.2951, + "step": 6527 + }, + { + "epoch": 4.206285253827558, + "grad_norm": 1.9393116734741809, + "learning_rate": 9.829354198266743e-05, + "loss": 3.3578, + "step": 6528 + }, + { + "epoch": 4.20692989524577, + "grad_norm": 2.045721294289673, + "learning_rate": 9.829301975658354e-05, + "loss": 2.892, + "step": 6529 + }, + { + "epoch": 4.207574536663981, + "grad_norm": 1.8245976245475763, + "learning_rate": 9.82924974520056e-05, + "loss": 2.9266, + "step": 6530 + }, + { + "epoch": 4.208219178082191, + "grad_norm": 1.8153474800637899, + "learning_rate": 9.829197506893446e-05, + "loss": 3.3415, + "step": 6531 + }, + { + "epoch": 4.208863819500403, + "grad_norm": 1.4942315136065707, + "learning_rate": 9.829145260737101e-05, + "loss": 3.3792, + "step": 6532 + }, + { + "epoch": 4.209508460918614, + "grad_norm": 1.6695491778796883, + "learning_rate": 9.829093006731609e-05, + "loss": 3.4488, + "step": 6533 + }, + { + "epoch": 4.210153102336825, + "grad_norm": 1.444238471953953, + "learning_rate": 9.829040744877057e-05, + "loss": 3.3698, + "step": 6534 + }, + { + "epoch": 4.210797743755037, + "grad_norm": 1.6587217564695276, + "learning_rate": 9.82898847517353e-05, + "loss": 3.3837, + "step": 6535 + }, + { + "epoch": 4.211442385173247, + "grad_norm": 1.781950776409235, + "learning_rate": 9.828936197621112e-05, + "loss": 3.1677, + "step": 6536 + }, + { + "epoch": 4.212087026591458, + "grad_norm": 1.440391337454448, + "learning_rate": 9.828883912219892e-05, + "loss": 3.4296, + "step": 6537 + }, + { + "epoch": 4.21273166800967, + "grad_norm": 1.762226078667667, + "learning_rate": 9.828831618969954e-05, + "loss": 3.1645, + "step": 6538 + }, + { + "epoch": 4.213376309427881, + "grad_norm": 1.3133677380526, + "learning_rate": 9.828779317871385e-05, + "loss": 3.1778, + "step": 6539 + }, + { + "epoch": 4.214020950846092, + "grad_norm": 1.6908004227985345, + "learning_rate": 9.82872700892427e-05, + "loss": 2.9276, + "step": 6540 + }, + { + "epoch": 4.214665592264303, + "grad_norm": 1.7879078344908608, + "learning_rate": 9.828674692128695e-05, + "loss": 3.3121, + "step": 6541 + }, + { + "epoch": 4.215310233682514, + "grad_norm": 1.6416141471186592, + "learning_rate": 9.828622367484746e-05, + "loss": 3.3929, + "step": 6542 + }, + { + "epoch": 4.215954875100725, + "grad_norm": 2.1739584522752016, + "learning_rate": 9.828570034992508e-05, + "loss": 2.9933, + "step": 6543 + }, + { + "epoch": 4.2165995165189365, + "grad_norm": 1.7131234741879717, + "learning_rate": 9.82851769465207e-05, + "loss": 3.291, + "step": 6544 + }, + { + "epoch": 4.217244157937148, + "grad_norm": 1.9513821987993842, + "learning_rate": 9.828465346463514e-05, + "loss": 2.9653, + "step": 6545 + }, + { + "epoch": 4.217888799355358, + "grad_norm": 1.765096000626883, + "learning_rate": 9.828412990426929e-05, + "loss": 3.0699, + "step": 6546 + }, + { + "epoch": 4.2185334407735695, + "grad_norm": 1.9768539351485246, + "learning_rate": 9.828360626542399e-05, + "loss": 3.1149, + "step": 6547 + }, + { + "epoch": 4.219178082191781, + "grad_norm": 1.6887905148338462, + "learning_rate": 9.828308254810013e-05, + "loss": 3.2536, + "step": 6548 + }, + { + "epoch": 4.219822723609992, + "grad_norm": 2.2727537364247747, + "learning_rate": 9.828255875229853e-05, + "loss": 3.3903, + "step": 6549 + }, + { + "epoch": 4.220467365028203, + "grad_norm": 1.4933853453495158, + "learning_rate": 9.828203487802009e-05, + "loss": 2.9616, + "step": 6550 + }, + { + "epoch": 4.221112006446414, + "grad_norm": 2.4767146991454205, + "learning_rate": 9.828151092526563e-05, + "loss": 2.9753, + "step": 6551 + }, + { + "epoch": 4.221756647864625, + "grad_norm": 1.776505458174225, + "learning_rate": 9.828098689403604e-05, + "loss": 3.4454, + "step": 6552 + }, + { + "epoch": 4.222401289282836, + "grad_norm": 2.3564858604756083, + "learning_rate": 9.828046278433218e-05, + "loss": 3.2159, + "step": 6553 + }, + { + "epoch": 4.223045930701048, + "grad_norm": 1.9118833995112754, + "learning_rate": 9.827993859615487e-05, + "loss": 3.2063, + "step": 6554 + }, + { + "epoch": 4.223690572119259, + "grad_norm": 2.3607817377015596, + "learning_rate": 9.827941432950504e-05, + "loss": 3.2355, + "step": 6555 + }, + { + "epoch": 4.224335213537469, + "grad_norm": 2.0525578647532794, + "learning_rate": 9.827888998438348e-05, + "loss": 3.2111, + "step": 6556 + }, + { + "epoch": 4.224979854955681, + "grad_norm": 2.080283133093149, + "learning_rate": 9.827836556079111e-05, + "loss": 3.4617, + "step": 6557 + }, + { + "epoch": 4.225624496373892, + "grad_norm": 2.067761298842895, + "learning_rate": 9.827784105872876e-05, + "loss": 3.3463, + "step": 6558 + }, + { + "epoch": 4.226269137792103, + "grad_norm": 1.9520269884237267, + "learning_rate": 9.827731647819731e-05, + "loss": 3.3593, + "step": 6559 + }, + { + "epoch": 4.226913779210315, + "grad_norm": 1.64901459734639, + "learning_rate": 9.827679181919759e-05, + "loss": 3.2814, + "step": 6560 + }, + { + "epoch": 4.227558420628525, + "grad_norm": 2.0782476590399326, + "learning_rate": 9.827626708173049e-05, + "loss": 2.8433, + "step": 6561 + }, + { + "epoch": 4.228203062046736, + "grad_norm": 1.583940413707889, + "learning_rate": 9.827574226579687e-05, + "loss": 3.1016, + "step": 6562 + }, + { + "epoch": 4.228847703464948, + "grad_norm": 1.8565349119699084, + "learning_rate": 9.827521737139756e-05, + "loss": 3.4717, + "step": 6563 + }, + { + "epoch": 4.229492344883159, + "grad_norm": 1.5456047831665902, + "learning_rate": 9.827469239853346e-05, + "loss": 3.5775, + "step": 6564 + }, + { + "epoch": 4.23013698630137, + "grad_norm": 1.7103180502887654, + "learning_rate": 9.827416734720543e-05, + "loss": 3.0757, + "step": 6565 + }, + { + "epoch": 4.230781627719581, + "grad_norm": 1.8021034805223615, + "learning_rate": 9.827364221741431e-05, + "loss": 3.2138, + "step": 6566 + }, + { + "epoch": 4.231426269137792, + "grad_norm": 1.5325449580774275, + "learning_rate": 9.827311700916097e-05, + "loss": 3.3137, + "step": 6567 + }, + { + "epoch": 4.232070910556003, + "grad_norm": 1.650475939541151, + "learning_rate": 9.827259172244631e-05, + "loss": 3.3531, + "step": 6568 + }, + { + "epoch": 4.232715551974215, + "grad_norm": 1.7098530578196465, + "learning_rate": 9.827206635727112e-05, + "loss": 3.2804, + "step": 6569 + }, + { + "epoch": 4.233360193392426, + "grad_norm": 1.3205778558663133, + "learning_rate": 9.82715409136363e-05, + "loss": 3.2122, + "step": 6570 + }, + { + "epoch": 4.234004834810636, + "grad_norm": 1.8413714410892, + "learning_rate": 9.827101539154276e-05, + "loss": 3.4945, + "step": 6571 + }, + { + "epoch": 4.234649476228848, + "grad_norm": 1.632967971321622, + "learning_rate": 9.827048979099127e-05, + "loss": 3.083, + "step": 6572 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 1.627814739660012, + "learning_rate": 9.826996411198276e-05, + "loss": 3.0786, + "step": 6573 + }, + { + "epoch": 4.23593875906527, + "grad_norm": 1.320971645090511, + "learning_rate": 9.826943835451807e-05, + "loss": 3.3834, + "step": 6574 + }, + { + "epoch": 4.2365834004834815, + "grad_norm": 1.6891068879798938, + "learning_rate": 9.826891251859808e-05, + "loss": 3.338, + "step": 6575 + }, + { + "epoch": 4.237228041901692, + "grad_norm": 1.4730594850487353, + "learning_rate": 9.826838660422364e-05, + "loss": 3.3464, + "step": 6576 + }, + { + "epoch": 4.237872683319903, + "grad_norm": 1.349586165083921, + "learning_rate": 9.826786061139561e-05, + "loss": 3.2673, + "step": 6577 + }, + { + "epoch": 4.2385173247381145, + "grad_norm": 1.3654885976591196, + "learning_rate": 9.826733454011486e-05, + "loss": 3.4554, + "step": 6578 + }, + { + "epoch": 4.239161966156326, + "grad_norm": 1.4270990730271427, + "learning_rate": 9.826680839038226e-05, + "loss": 3.6721, + "step": 6579 + }, + { + "epoch": 4.239806607574537, + "grad_norm": 1.3920752864546986, + "learning_rate": 9.826628216219865e-05, + "loss": 3.2025, + "step": 6580 + }, + { + "epoch": 4.2404512489927475, + "grad_norm": 1.9104871306706261, + "learning_rate": 9.826575585556493e-05, + "loss": 3.0687, + "step": 6581 + }, + { + "epoch": 4.241095890410959, + "grad_norm": 1.6551788889455674, + "learning_rate": 9.826522947048195e-05, + "loss": 3.3481, + "step": 6582 + }, + { + "epoch": 4.24174053182917, + "grad_norm": 1.9640387700276518, + "learning_rate": 9.826470300695054e-05, + "loss": 3.4169, + "step": 6583 + }, + { + "epoch": 4.242385173247381, + "grad_norm": 1.8329482118036786, + "learning_rate": 9.826417646497162e-05, + "loss": 3.4619, + "step": 6584 + }, + { + "epoch": 4.243029814665592, + "grad_norm": 1.4047169396754093, + "learning_rate": 9.826364984454603e-05, + "loss": 3.2202, + "step": 6585 + }, + { + "epoch": 4.243674456083803, + "grad_norm": 1.7416269442775758, + "learning_rate": 9.826312314567464e-05, + "loss": 3.0859, + "step": 6586 + }, + { + "epoch": 4.244319097502014, + "grad_norm": 1.6571839968584217, + "learning_rate": 9.826259636835829e-05, + "loss": 3.0523, + "step": 6587 + }, + { + "epoch": 4.244963738920226, + "grad_norm": 1.7452270793014375, + "learning_rate": 9.826206951259788e-05, + "loss": 3.2098, + "step": 6588 + }, + { + "epoch": 4.245608380338437, + "grad_norm": 1.839753559221245, + "learning_rate": 9.826154257839426e-05, + "loss": 3.2725, + "step": 6589 + }, + { + "epoch": 4.246253021756647, + "grad_norm": 1.884175278193502, + "learning_rate": 9.82610155657483e-05, + "loss": 3.3616, + "step": 6590 + }, + { + "epoch": 4.246897663174859, + "grad_norm": 1.5245646321741264, + "learning_rate": 9.826048847466085e-05, + "loss": 3.3699, + "step": 6591 + }, + { + "epoch": 4.24754230459307, + "grad_norm": 1.4387873253979568, + "learning_rate": 9.82599613051328e-05, + "loss": 3.0141, + "step": 6592 + }, + { + "epoch": 4.248186946011281, + "grad_norm": 1.4933813898257602, + "learning_rate": 9.825943405716499e-05, + "loss": 3.1978, + "step": 6593 + }, + { + "epoch": 4.248831587429493, + "grad_norm": 1.5755652077314972, + "learning_rate": 9.82589067307583e-05, + "loss": 3.4804, + "step": 6594 + }, + { + "epoch": 4.249476228847703, + "grad_norm": 1.7235881360185386, + "learning_rate": 9.82583793259136e-05, + "loss": 2.9224, + "step": 6595 + }, + { + "epoch": 4.250120870265914, + "grad_norm": 1.9702550418944, + "learning_rate": 9.825785184263174e-05, + "loss": 3.3076, + "step": 6596 + }, + { + "epoch": 4.250765511684126, + "grad_norm": 1.8675547104470345, + "learning_rate": 9.825732428091364e-05, + "loss": 3.7668, + "step": 6597 + }, + { + "epoch": 4.251410153102337, + "grad_norm": 2.0236489753083275, + "learning_rate": 9.825679664076008e-05, + "loss": 3.2463, + "step": 6598 + }, + { + "epoch": 4.252054794520548, + "grad_norm": 1.517485352417023, + "learning_rate": 9.8256268922172e-05, + "loss": 3.4969, + "step": 6599 + }, + { + "epoch": 4.252699435938759, + "grad_norm": 1.9570367572849101, + "learning_rate": 9.82557411251502e-05, + "loss": 3.4867, + "step": 6600 + }, + { + "epoch": 4.252699435938759, + "eval_loss": 4.224485874176025, + "eval_runtime": 2.9641, + "eval_samples_per_second": 33.737, + "eval_steps_per_second": 4.386, + "step": 6600 + }, + { + "epoch": 4.25334407735697, + "grad_norm": 1.7281261116778357, + "learning_rate": 9.82552132496956e-05, + "loss": 3.1067, + "step": 6601 + }, + { + "epoch": 4.253988718775181, + "grad_norm": 1.6436120414274789, + "learning_rate": 9.825468529580907e-05, + "loss": 3.1436, + "step": 6602 + }, + { + "epoch": 4.2546333601933926, + "grad_norm": 1.445329027677594, + "learning_rate": 9.825415726349143e-05, + "loss": 3.401, + "step": 6603 + }, + { + "epoch": 4.255278001611604, + "grad_norm": 2.238334021792848, + "learning_rate": 9.82536291527436e-05, + "loss": 3.3839, + "step": 6604 + }, + { + "epoch": 4.255922643029814, + "grad_norm": 1.7419912209392492, + "learning_rate": 9.825310096356641e-05, + "loss": 3.5095, + "step": 6605 + }, + { + "epoch": 4.256567284448026, + "grad_norm": 3.8825961381236853, + "learning_rate": 9.825257269596073e-05, + "loss": 3.0217, + "step": 6606 + }, + { + "epoch": 4.257211925866237, + "grad_norm": 1.729795169859526, + "learning_rate": 9.825204434992747e-05, + "loss": 3.2563, + "step": 6607 + }, + { + "epoch": 4.257856567284448, + "grad_norm": 1.9606003810165302, + "learning_rate": 9.825151592546745e-05, + "loss": 3.4915, + "step": 6608 + }, + { + "epoch": 4.2585012087026595, + "grad_norm": 1.4771760313672697, + "learning_rate": 9.825098742258156e-05, + "loss": 3.1918, + "step": 6609 + }, + { + "epoch": 4.25914585012087, + "grad_norm": 1.7008012722528953, + "learning_rate": 9.825045884127065e-05, + "loss": 3.298, + "step": 6610 + }, + { + "epoch": 4.259790491539081, + "grad_norm": 1.4790940027639499, + "learning_rate": 9.824993018153558e-05, + "loss": 3.2619, + "step": 6611 + }, + { + "epoch": 4.2604351329572925, + "grad_norm": 1.5274691577280488, + "learning_rate": 9.824940144337727e-05, + "loss": 3.3232, + "step": 6612 + }, + { + "epoch": 4.261079774375504, + "grad_norm": 1.3519777031162659, + "learning_rate": 9.824887262679654e-05, + "loss": 3.2408, + "step": 6613 + }, + { + "epoch": 4.261724415793715, + "grad_norm": 1.390012381917473, + "learning_rate": 9.824834373179429e-05, + "loss": 3.2047, + "step": 6614 + }, + { + "epoch": 4.2623690572119255, + "grad_norm": 1.4342088989543251, + "learning_rate": 9.824781475837136e-05, + "loss": 3.2114, + "step": 6615 + }, + { + "epoch": 4.263013698630137, + "grad_norm": 1.636599631915386, + "learning_rate": 9.824728570652865e-05, + "loss": 3.3018, + "step": 6616 + }, + { + "epoch": 4.263658340048348, + "grad_norm": 1.2771330565888133, + "learning_rate": 9.8246756576267e-05, + "loss": 3.2144, + "step": 6617 + }, + { + "epoch": 4.264302981466559, + "grad_norm": 1.6046930099623489, + "learning_rate": 9.824622736758729e-05, + "loss": 3.2473, + "step": 6618 + }, + { + "epoch": 4.264947622884771, + "grad_norm": 1.550640208680244, + "learning_rate": 9.824569808049038e-05, + "loss": 3.3584, + "step": 6619 + }, + { + "epoch": 4.265592264302981, + "grad_norm": 1.5255755599270246, + "learning_rate": 9.824516871497715e-05, + "loss": 3.1005, + "step": 6620 + }, + { + "epoch": 4.266236905721192, + "grad_norm": 1.5680312646193098, + "learning_rate": 9.824463927104848e-05, + "loss": 3.1227, + "step": 6621 + }, + { + "epoch": 4.266881547139404, + "grad_norm": 1.4619917044504995, + "learning_rate": 9.824410974870522e-05, + "loss": 3.1802, + "step": 6622 + }, + { + "epoch": 4.267526188557615, + "grad_norm": 1.7314870860825733, + "learning_rate": 9.824358014794825e-05, + "loss": 3.4048, + "step": 6623 + }, + { + "epoch": 4.268170829975826, + "grad_norm": 1.3360746771679737, + "learning_rate": 9.824305046877844e-05, + "loss": 3.4066, + "step": 6624 + }, + { + "epoch": 4.268815471394037, + "grad_norm": 1.8511315348327235, + "learning_rate": 9.824252071119665e-05, + "loss": 3.2798, + "step": 6625 + }, + { + "epoch": 4.269460112812248, + "grad_norm": 1.7040774588524636, + "learning_rate": 9.824199087520377e-05, + "loss": 3.3448, + "step": 6626 + }, + { + "epoch": 4.270104754230459, + "grad_norm": 1.6622238444331148, + "learning_rate": 9.824146096080064e-05, + "loss": 3.4796, + "step": 6627 + }, + { + "epoch": 4.270749395648671, + "grad_norm": 1.943184873704491, + "learning_rate": 9.824093096798816e-05, + "loss": 3.3585, + "step": 6628 + }, + { + "epoch": 4.271394037066882, + "grad_norm": 1.6720946820028713, + "learning_rate": 9.824040089676718e-05, + "loss": 3.3203, + "step": 6629 + }, + { + "epoch": 4.272038678485092, + "grad_norm": 2.1268518208866247, + "learning_rate": 9.823987074713858e-05, + "loss": 2.9599, + "step": 6630 + }, + { + "epoch": 4.272683319903304, + "grad_norm": 1.9912353512842784, + "learning_rate": 9.823934051910324e-05, + "loss": 3.4932, + "step": 6631 + }, + { + "epoch": 4.273327961321515, + "grad_norm": 1.7226068177347733, + "learning_rate": 9.823881021266202e-05, + "loss": 3.5348, + "step": 6632 + }, + { + "epoch": 4.273972602739726, + "grad_norm": 1.6892610519250773, + "learning_rate": 9.823827982781578e-05, + "loss": 3.4319, + "step": 6633 + }, + { + "epoch": 4.2746172441579375, + "grad_norm": 1.507453370284961, + "learning_rate": 9.82377493645654e-05, + "loss": 3.1008, + "step": 6634 + }, + { + "epoch": 4.275261885576148, + "grad_norm": 1.5609324850967505, + "learning_rate": 9.823721882291176e-05, + "loss": 3.3953, + "step": 6635 + }, + { + "epoch": 4.275906526994359, + "grad_norm": 1.5931755100798106, + "learning_rate": 9.823668820285573e-05, + "loss": 3.1946, + "step": 6636 + }, + { + "epoch": 4.2765511684125705, + "grad_norm": 1.571830678557992, + "learning_rate": 9.823615750439817e-05, + "loss": 3.523, + "step": 6637 + }, + { + "epoch": 4.277195809830782, + "grad_norm": 1.5523274261629765, + "learning_rate": 9.823562672753995e-05, + "loss": 3.0023, + "step": 6638 + }, + { + "epoch": 4.277840451248993, + "grad_norm": 1.5439140011369734, + "learning_rate": 9.823509587228197e-05, + "loss": 3.1457, + "step": 6639 + }, + { + "epoch": 4.278485092667204, + "grad_norm": 1.3671994306828492, + "learning_rate": 9.823456493862508e-05, + "loss": 3.5133, + "step": 6640 + }, + { + "epoch": 4.279129734085415, + "grad_norm": 1.5803057359076789, + "learning_rate": 9.823403392657013e-05, + "loss": 3.344, + "step": 6641 + }, + { + "epoch": 4.279774375503626, + "grad_norm": 1.2681410259197372, + "learning_rate": 9.823350283611802e-05, + "loss": 3.2402, + "step": 6642 + }, + { + "epoch": 4.2804190169218375, + "grad_norm": 1.9767529443939076, + "learning_rate": 9.823297166726964e-05, + "loss": 3.1251, + "step": 6643 + }, + { + "epoch": 4.281063658340049, + "grad_norm": 1.7358954917220708, + "learning_rate": 9.823244042002581e-05, + "loss": 3.5864, + "step": 6644 + }, + { + "epoch": 4.281708299758259, + "grad_norm": 1.7922198885050427, + "learning_rate": 9.823190909438746e-05, + "loss": 3.4358, + "step": 6645 + }, + { + "epoch": 4.2823529411764705, + "grad_norm": 1.8907318895043785, + "learning_rate": 9.823137769035542e-05, + "loss": 3.2169, + "step": 6646 + }, + { + "epoch": 4.282997582594682, + "grad_norm": 1.978951618655281, + "learning_rate": 9.823084620793059e-05, + "loss": 3.3197, + "step": 6647 + }, + { + "epoch": 4.283642224012893, + "grad_norm": 1.8253121504047252, + "learning_rate": 9.823031464711382e-05, + "loss": 3.1895, + "step": 6648 + }, + { + "epoch": 4.284286865431104, + "grad_norm": 1.7083406433090047, + "learning_rate": 9.8229783007906e-05, + "loss": 3.355, + "step": 6649 + }, + { + "epoch": 4.284931506849315, + "grad_norm": 1.871740450752689, + "learning_rate": 9.822925129030799e-05, + "loss": 2.8536, + "step": 6650 + }, + { + "epoch": 4.285576148267526, + "grad_norm": 1.6452026766215353, + "learning_rate": 9.822871949432067e-05, + "loss": 3.2826, + "step": 6651 + }, + { + "epoch": 4.286220789685737, + "grad_norm": 1.7693559717903116, + "learning_rate": 9.822818761994494e-05, + "loss": 2.9954, + "step": 6652 + }, + { + "epoch": 4.286865431103949, + "grad_norm": 1.5565786575369576, + "learning_rate": 9.822765566718162e-05, + "loss": 3.4774, + "step": 6653 + }, + { + "epoch": 4.28751007252216, + "grad_norm": 1.7707757968227444, + "learning_rate": 9.822712363603161e-05, + "loss": 2.8146, + "step": 6654 + }, + { + "epoch": 4.28815471394037, + "grad_norm": 1.6033632828399118, + "learning_rate": 9.82265915264958e-05, + "loss": 3.1846, + "step": 6655 + }, + { + "epoch": 4.288799355358582, + "grad_norm": 1.7122186613980943, + "learning_rate": 9.822605933857503e-05, + "loss": 3.3944, + "step": 6656 + }, + { + "epoch": 4.289443996776793, + "grad_norm": 1.6643485886330083, + "learning_rate": 9.822552707227022e-05, + "loss": 3.2791, + "step": 6657 + }, + { + "epoch": 4.290088638195004, + "grad_norm": 1.6355347314365682, + "learning_rate": 9.822499472758219e-05, + "loss": 3.161, + "step": 6658 + }, + { + "epoch": 4.290733279613216, + "grad_norm": 2.009947880729177, + "learning_rate": 9.822446230451187e-05, + "loss": 3.1922, + "step": 6659 + }, + { + "epoch": 4.291377921031426, + "grad_norm": 1.3509879151850024, + "learning_rate": 9.822392980306008e-05, + "loss": 3.3469, + "step": 6660 + }, + { + "epoch": 4.292022562449637, + "grad_norm": 1.8388855400885562, + "learning_rate": 9.822339722322774e-05, + "loss": 3.2743, + "step": 6661 + }, + { + "epoch": 4.292667203867849, + "grad_norm": 1.736515445272142, + "learning_rate": 9.822286456501568e-05, + "loss": 3.1131, + "step": 6662 + }, + { + "epoch": 4.29331184528606, + "grad_norm": 1.869901322446008, + "learning_rate": 9.822233182842481e-05, + "loss": 3.1624, + "step": 6663 + }, + { + "epoch": 4.293956486704271, + "grad_norm": 1.67165067201763, + "learning_rate": 9.8221799013456e-05, + "loss": 3.2091, + "step": 6664 + }, + { + "epoch": 4.294601128122482, + "grad_norm": 2.0973972010113084, + "learning_rate": 9.822126612011013e-05, + "loss": 3.3239, + "step": 6665 + }, + { + "epoch": 4.295245769540693, + "grad_norm": 1.8221862941316092, + "learning_rate": 9.822073314838805e-05, + "loss": 3.2482, + "step": 6666 + }, + { + "epoch": 4.295890410958904, + "grad_norm": 1.9861219373480075, + "learning_rate": 9.822020009829064e-05, + "loss": 3.2924, + "step": 6667 + }, + { + "epoch": 4.2965350523771155, + "grad_norm": 1.6275003753705017, + "learning_rate": 9.821966696981881e-05, + "loss": 3.0806, + "step": 6668 + }, + { + "epoch": 4.297179693795326, + "grad_norm": 2.26940274471585, + "learning_rate": 9.82191337629734e-05, + "loss": 3.4387, + "step": 6669 + }, + { + "epoch": 4.297824335213537, + "grad_norm": 1.6785452314366969, + "learning_rate": 9.82186004777553e-05, + "loss": 3.6324, + "step": 6670 + }, + { + "epoch": 4.2984689766317485, + "grad_norm": 2.0477436166588094, + "learning_rate": 9.82180671141654e-05, + "loss": 3.1054, + "step": 6671 + }, + { + "epoch": 4.29911361804996, + "grad_norm": 1.3901391535176155, + "learning_rate": 9.821753367220453e-05, + "loss": 2.9151, + "step": 6672 + }, + { + "epoch": 4.299758259468171, + "grad_norm": 2.1047987260407535, + "learning_rate": 9.821700015187362e-05, + "loss": 3.2986, + "step": 6673 + }, + { + "epoch": 4.3004029008863816, + "grad_norm": 2.1785729941100054, + "learning_rate": 9.821646655317349e-05, + "loss": 3.4103, + "step": 6674 + }, + { + "epoch": 4.301047542304593, + "grad_norm": 1.649322529506706, + "learning_rate": 9.821593287610507e-05, + "loss": 3.1148, + "step": 6675 + }, + { + "epoch": 4.301692183722804, + "grad_norm": 1.7519121051532298, + "learning_rate": 9.821539912066921e-05, + "loss": 3.1465, + "step": 6676 + }, + { + "epoch": 4.3023368251410155, + "grad_norm": 2.1189767729621987, + "learning_rate": 9.82148652868668e-05, + "loss": 3.3067, + "step": 6677 + }, + { + "epoch": 4.302981466559227, + "grad_norm": 1.7148571243480855, + "learning_rate": 9.82143313746987e-05, + "loss": 3.3436, + "step": 6678 + }, + { + "epoch": 4.303626107977437, + "grad_norm": 1.9655767810235996, + "learning_rate": 9.821379738416578e-05, + "loss": 3.1684, + "step": 6679 + }, + { + "epoch": 4.3042707493956485, + "grad_norm": 1.6105180043351757, + "learning_rate": 9.821326331526895e-05, + "loss": 3.1205, + "step": 6680 + }, + { + "epoch": 4.30491539081386, + "grad_norm": 1.7345851921209505, + "learning_rate": 9.821272916800907e-05, + "loss": 3.4445, + "step": 6681 + }, + { + "epoch": 4.305560032232071, + "grad_norm": 1.5085738248792977, + "learning_rate": 9.8212194942387e-05, + "loss": 3.2194, + "step": 6682 + }, + { + "epoch": 4.306204673650282, + "grad_norm": 1.7041955026426296, + "learning_rate": 9.821166063840364e-05, + "loss": 3.2208, + "step": 6683 + }, + { + "epoch": 4.306849315068493, + "grad_norm": 1.7408246348869194, + "learning_rate": 9.821112625605987e-05, + "loss": 3.3669, + "step": 6684 + }, + { + "epoch": 4.307493956486704, + "grad_norm": 1.606792503424339, + "learning_rate": 9.821059179535656e-05, + "loss": 3.2576, + "step": 6685 + }, + { + "epoch": 4.308138597904915, + "grad_norm": 1.4089005015281248, + "learning_rate": 9.821005725629458e-05, + "loss": 3.1811, + "step": 6686 + }, + { + "epoch": 4.308783239323127, + "grad_norm": 1.7855000091282216, + "learning_rate": 9.820952263887481e-05, + "loss": 3.3235, + "step": 6687 + }, + { + "epoch": 4.309427880741338, + "grad_norm": 1.3808900340264267, + "learning_rate": 9.820898794309813e-05, + "loss": 3.2391, + "step": 6688 + }, + { + "epoch": 4.310072522159548, + "grad_norm": 1.2667715090248675, + "learning_rate": 9.820845316896543e-05, + "loss": 3.1663, + "step": 6689 + }, + { + "epoch": 4.31071716357776, + "grad_norm": 1.5726041550599488, + "learning_rate": 9.820791831647757e-05, + "loss": 3.2712, + "step": 6690 + }, + { + "epoch": 4.311361804995971, + "grad_norm": 2.0069095810484088, + "learning_rate": 9.820738338563543e-05, + "loss": 2.7305, + "step": 6691 + }, + { + "epoch": 4.312006446414182, + "grad_norm": 1.3552714107892616, + "learning_rate": 9.820684837643991e-05, + "loss": 3.4546, + "step": 6692 + }, + { + "epoch": 4.312651087832394, + "grad_norm": 2.1347913333579775, + "learning_rate": 9.820631328889187e-05, + "loss": 3.409, + "step": 6693 + }, + { + "epoch": 4.313295729250604, + "grad_norm": 1.4779346405841383, + "learning_rate": 9.82057781229922e-05, + "loss": 3.3294, + "step": 6694 + }, + { + "epoch": 4.313940370668815, + "grad_norm": 2.2014975940310295, + "learning_rate": 9.820524287874176e-05, + "loss": 3.4864, + "step": 6695 + }, + { + "epoch": 4.314585012087027, + "grad_norm": 1.573823175739995, + "learning_rate": 9.820470755614147e-05, + "loss": 3.379, + "step": 6696 + }, + { + "epoch": 4.315229653505238, + "grad_norm": 1.8818922681280355, + "learning_rate": 9.820417215519216e-05, + "loss": 3.171, + "step": 6697 + }, + { + "epoch": 4.315874294923449, + "grad_norm": 1.5659677003381085, + "learning_rate": 9.820363667589471e-05, + "loss": 3.4392, + "step": 6698 + }, + { + "epoch": 4.31651893634166, + "grad_norm": 2.0434581182521385, + "learning_rate": 9.820310111825005e-05, + "loss": 3.4505, + "step": 6699 + }, + { + "epoch": 4.317163577759871, + "grad_norm": 1.9207485417249583, + "learning_rate": 9.8202565482259e-05, + "loss": 3.2934, + "step": 6700 + }, + { + "epoch": 4.317163577759871, + "eval_loss": 4.229304313659668, + "eval_runtime": 2.9875, + "eval_samples_per_second": 33.473, + "eval_steps_per_second": 4.352, + "step": 6700 + }, + { + "epoch": 4.317808219178082, + "grad_norm": 2.1424794241464533, + "learning_rate": 9.82020297679225e-05, + "loss": 3.056, + "step": 6701 + }, + { + "epoch": 4.3184528605962935, + "grad_norm": 1.6486131711796466, + "learning_rate": 9.820149397524138e-05, + "loss": 3.3442, + "step": 6702 + }, + { + "epoch": 4.319097502014505, + "grad_norm": 2.007687318554893, + "learning_rate": 9.820095810421656e-05, + "loss": 3.5327, + "step": 6703 + }, + { + "epoch": 4.319742143432715, + "grad_norm": 1.5777342598547712, + "learning_rate": 9.820042215484887e-05, + "loss": 3.2874, + "step": 6704 + }, + { + "epoch": 4.3203867848509265, + "grad_norm": 1.8643402867544023, + "learning_rate": 9.819988612713924e-05, + "loss": 3.4499, + "step": 6705 + }, + { + "epoch": 4.321031426269138, + "grad_norm": 1.5178721489697886, + "learning_rate": 9.819935002108852e-05, + "loss": 3.4875, + "step": 6706 + }, + { + "epoch": 4.321676067687349, + "grad_norm": 2.0814306468430024, + "learning_rate": 9.819881383669759e-05, + "loss": 3.2122, + "step": 6707 + }, + { + "epoch": 4.32232070910556, + "grad_norm": 1.4583484721480187, + "learning_rate": 9.819827757396735e-05, + "loss": 3.5204, + "step": 6708 + }, + { + "epoch": 4.322965350523771, + "grad_norm": 1.6226930078342527, + "learning_rate": 9.819774123289866e-05, + "loss": 3.3507, + "step": 6709 + }, + { + "epoch": 4.323609991941982, + "grad_norm": 2.225505739008342, + "learning_rate": 9.819720481349243e-05, + "loss": 3.4526, + "step": 6710 + }, + { + "epoch": 4.3242546333601934, + "grad_norm": 1.9247014052702578, + "learning_rate": 9.819666831574951e-05, + "loss": 3.0822, + "step": 6711 + }, + { + "epoch": 4.324899274778405, + "grad_norm": 1.4612374208811312, + "learning_rate": 9.819613173967079e-05, + "loss": 3.3144, + "step": 6712 + }, + { + "epoch": 4.325543916196616, + "grad_norm": 1.511505606754708, + "learning_rate": 9.819559508525717e-05, + "loss": 3.3177, + "step": 6713 + }, + { + "epoch": 4.3261885576148265, + "grad_norm": 1.4928869753571279, + "learning_rate": 9.81950583525095e-05, + "loss": 3.436, + "step": 6714 + }, + { + "epoch": 4.326833199033038, + "grad_norm": 1.4293902035625228, + "learning_rate": 9.81945215414287e-05, + "loss": 3.3864, + "step": 6715 + }, + { + "epoch": 4.327477840451249, + "grad_norm": 1.6011972022725647, + "learning_rate": 9.819398465201562e-05, + "loss": 3.5066, + "step": 6716 + }, + { + "epoch": 4.32812248186946, + "grad_norm": 1.5863867987988383, + "learning_rate": 9.819344768427115e-05, + "loss": 3.4943, + "step": 6717 + }, + { + "epoch": 4.328767123287671, + "grad_norm": 1.4552533918338273, + "learning_rate": 9.819291063819617e-05, + "loss": 3.3616, + "step": 6718 + }, + { + "epoch": 4.329411764705882, + "grad_norm": 1.419541382930855, + "learning_rate": 9.819237351379156e-05, + "loss": 3.359, + "step": 6719 + }, + { + "epoch": 4.330056406124093, + "grad_norm": 1.6167527429821869, + "learning_rate": 9.819183631105822e-05, + "loss": 3.351, + "step": 6720 + }, + { + "epoch": 4.330701047542305, + "grad_norm": 1.6664593174969624, + "learning_rate": 9.819129902999702e-05, + "loss": 3.3754, + "step": 6721 + }, + { + "epoch": 4.331345688960516, + "grad_norm": 1.429138672387302, + "learning_rate": 9.819076167060883e-05, + "loss": 3.3598, + "step": 6722 + }, + { + "epoch": 4.331990330378726, + "grad_norm": 1.717954417864415, + "learning_rate": 9.819022423289456e-05, + "loss": 3.2675, + "step": 6723 + }, + { + "epoch": 4.332634971796938, + "grad_norm": 1.580203473826872, + "learning_rate": 9.818968671685507e-05, + "loss": 3.4182, + "step": 6724 + }, + { + "epoch": 4.333279613215149, + "grad_norm": 1.3037942825318225, + "learning_rate": 9.818914912249124e-05, + "loss": 3.249, + "step": 6725 + }, + { + "epoch": 4.33392425463336, + "grad_norm": 1.9114591223682043, + "learning_rate": 9.818861144980399e-05, + "loss": 3.2935, + "step": 6726 + }, + { + "epoch": 4.334568896051572, + "grad_norm": 1.5796693188930788, + "learning_rate": 9.818807369879415e-05, + "loss": 3.3175, + "step": 6727 + }, + { + "epoch": 4.335213537469782, + "grad_norm": 1.6736160208613664, + "learning_rate": 9.818753586946263e-05, + "loss": 3.335, + "step": 6728 + }, + { + "epoch": 4.335858178887993, + "grad_norm": 1.7281637982669955, + "learning_rate": 9.818699796181033e-05, + "loss": 3.6689, + "step": 6729 + }, + { + "epoch": 4.336502820306205, + "grad_norm": 1.399469681049217, + "learning_rate": 9.818645997583812e-05, + "loss": 3.3284, + "step": 6730 + }, + { + "epoch": 4.337147461724416, + "grad_norm": 1.649813562482317, + "learning_rate": 9.818592191154687e-05, + "loss": 3.2104, + "step": 6731 + }, + { + "epoch": 4.337792103142627, + "grad_norm": 1.2480020586541274, + "learning_rate": 9.818538376893747e-05, + "loss": 3.038, + "step": 6732 + }, + { + "epoch": 4.338436744560838, + "grad_norm": 1.8979580170998818, + "learning_rate": 9.818484554801082e-05, + "loss": 3.2594, + "step": 6733 + }, + { + "epoch": 4.339081385979049, + "grad_norm": 1.6637907623213568, + "learning_rate": 9.818430724876778e-05, + "loss": 3.2823, + "step": 6734 + }, + { + "epoch": 4.33972602739726, + "grad_norm": 1.8834148557900368, + "learning_rate": 9.818376887120923e-05, + "loss": 3.4067, + "step": 6735 + }, + { + "epoch": 4.3403706688154715, + "grad_norm": 1.3717678435980607, + "learning_rate": 9.81832304153361e-05, + "loss": 3.1165, + "step": 6736 + }, + { + "epoch": 4.341015310233683, + "grad_norm": 1.7166877770653972, + "learning_rate": 9.818269188114922e-05, + "loss": 3.3085, + "step": 6737 + }, + { + "epoch": 4.341659951651893, + "grad_norm": 1.4011134138082504, + "learning_rate": 9.81821532686495e-05, + "loss": 3.5958, + "step": 6738 + }, + { + "epoch": 4.3423045930701045, + "grad_norm": 2.1017544075226873, + "learning_rate": 9.818161457783784e-05, + "loss": 3.2534, + "step": 6739 + }, + { + "epoch": 4.342949234488316, + "grad_norm": 1.7748739317551494, + "learning_rate": 9.81810758087151e-05, + "loss": 3.1124, + "step": 6740 + }, + { + "epoch": 4.343593875906527, + "grad_norm": 1.7558390575995977, + "learning_rate": 9.818053696128217e-05, + "loss": 3.5234, + "step": 6741 + }, + { + "epoch": 4.344238517324738, + "grad_norm": 2.2972649677011674, + "learning_rate": 9.817999803553994e-05, + "loss": 3.3514, + "step": 6742 + }, + { + "epoch": 4.344883158742949, + "grad_norm": 1.8182923048073787, + "learning_rate": 9.817945903148929e-05, + "loss": 3.3797, + "step": 6743 + }, + { + "epoch": 4.34552780016116, + "grad_norm": 1.4744257921139836, + "learning_rate": 9.817891994913111e-05, + "loss": 3.3595, + "step": 6744 + }, + { + "epoch": 4.346172441579371, + "grad_norm": 1.5806497385246752, + "learning_rate": 9.817838078846629e-05, + "loss": 3.7036, + "step": 6745 + }, + { + "epoch": 4.346817082997583, + "grad_norm": 1.4775738800896756, + "learning_rate": 9.81778415494957e-05, + "loss": 3.3292, + "step": 6746 + }, + { + "epoch": 4.347461724415794, + "grad_norm": 1.7471714309288917, + "learning_rate": 9.817730223222024e-05, + "loss": 3.1695, + "step": 6747 + }, + { + "epoch": 4.3481063658340044, + "grad_norm": 1.666848802064422, + "learning_rate": 9.81767628366408e-05, + "loss": 3.1231, + "step": 6748 + }, + { + "epoch": 4.348751007252216, + "grad_norm": 1.683291248195062, + "learning_rate": 9.817622336275822e-05, + "loss": 3.2361, + "step": 6749 + }, + { + "epoch": 4.349395648670427, + "grad_norm": 1.7647975318542295, + "learning_rate": 9.817568381057345e-05, + "loss": 3.1486, + "step": 6750 + }, + { + "epoch": 4.350040290088638, + "grad_norm": 1.745736970747674, + "learning_rate": 9.817514418008736e-05, + "loss": 3.1944, + "step": 6751 + }, + { + "epoch": 4.35068493150685, + "grad_norm": 2.0871995843858078, + "learning_rate": 9.817460447130078e-05, + "loss": 3.2257, + "step": 6752 + }, + { + "epoch": 4.35132957292506, + "grad_norm": 1.5280159855578157, + "learning_rate": 9.817406468421469e-05, + "loss": 3.4742, + "step": 6753 + }, + { + "epoch": 4.351974214343271, + "grad_norm": 1.8078254061021803, + "learning_rate": 9.817352481882991e-05, + "loss": 3.3843, + "step": 6754 + }, + { + "epoch": 4.352618855761483, + "grad_norm": 1.466626031744718, + "learning_rate": 9.817298487514732e-05, + "loss": 3.4538, + "step": 6755 + }, + { + "epoch": 4.353263497179694, + "grad_norm": 1.9446062185350852, + "learning_rate": 9.817244485316786e-05, + "loss": 3.5021, + "step": 6756 + }, + { + "epoch": 4.353908138597905, + "grad_norm": 1.436025121043199, + "learning_rate": 9.817190475289239e-05, + "loss": 3.4803, + "step": 6757 + }, + { + "epoch": 4.354552780016116, + "grad_norm": 1.4349901707379304, + "learning_rate": 9.817136457432179e-05, + "loss": 3.3567, + "step": 6758 + }, + { + "epoch": 4.355197421434327, + "grad_norm": 1.643694747105608, + "learning_rate": 9.817082431745693e-05, + "loss": 3.5437, + "step": 6759 + }, + { + "epoch": 4.355842062852538, + "grad_norm": 1.3368448855013237, + "learning_rate": 9.817028398229875e-05, + "loss": 3.232, + "step": 6760 + }, + { + "epoch": 4.35648670427075, + "grad_norm": 1.4301008255839343, + "learning_rate": 9.81697435688481e-05, + "loss": 3.3251, + "step": 6761 + }, + { + "epoch": 4.357131345688961, + "grad_norm": 1.361027176713304, + "learning_rate": 9.816920307710586e-05, + "loss": 3.2133, + "step": 6762 + }, + { + "epoch": 4.357775987107171, + "grad_norm": 1.3925798010861337, + "learning_rate": 9.816866250707295e-05, + "loss": 3.3059, + "step": 6763 + }, + { + "epoch": 4.358420628525383, + "grad_norm": 1.2317521643055358, + "learning_rate": 9.816812185875022e-05, + "loss": 3.3831, + "step": 6764 + }, + { + "epoch": 4.359065269943594, + "grad_norm": 1.4700807550343495, + "learning_rate": 9.816758113213858e-05, + "loss": 3.5577, + "step": 6765 + }, + { + "epoch": 4.359709911361805, + "grad_norm": 1.3825233519184899, + "learning_rate": 9.816704032723894e-05, + "loss": 3.6411, + "step": 6766 + }, + { + "epoch": 4.3603545527800165, + "grad_norm": 1.4026864267921786, + "learning_rate": 9.816649944405215e-05, + "loss": 3.556, + "step": 6767 + }, + { + "epoch": 4.360999194198227, + "grad_norm": 1.6490213174595607, + "learning_rate": 9.816595848257912e-05, + "loss": 3.3138, + "step": 6768 + }, + { + "epoch": 4.361643835616438, + "grad_norm": 1.4262357125157468, + "learning_rate": 9.81654174428207e-05, + "loss": 3.6242, + "step": 6769 + }, + { + "epoch": 4.3622884770346495, + "grad_norm": 1.5760338240421028, + "learning_rate": 9.816487632477783e-05, + "loss": 3.4616, + "step": 6770 + }, + { + "epoch": 4.362933118452861, + "grad_norm": 1.5556454393990469, + "learning_rate": 9.816433512845139e-05, + "loss": 3.3136, + "step": 6771 + }, + { + "epoch": 4.363577759871072, + "grad_norm": 1.9302979443348505, + "learning_rate": 9.816379385384223e-05, + "loss": 3.4662, + "step": 6772 + }, + { + "epoch": 4.3642224012892825, + "grad_norm": 1.599288453023659, + "learning_rate": 9.81632525009513e-05, + "loss": 3.159, + "step": 6773 + }, + { + "epoch": 4.364867042707494, + "grad_norm": 1.8583183716564273, + "learning_rate": 9.816271106977943e-05, + "loss": 3.0363, + "step": 6774 + }, + { + "epoch": 4.365511684125705, + "grad_norm": 1.5886686498123264, + "learning_rate": 9.816216956032754e-05, + "loss": 2.9247, + "step": 6775 + }, + { + "epoch": 4.366156325543916, + "grad_norm": 1.6935937208900491, + "learning_rate": 9.816162797259653e-05, + "loss": 3.4867, + "step": 6776 + }, + { + "epoch": 4.366800966962128, + "grad_norm": 1.7146188643922609, + "learning_rate": 9.816108630658725e-05, + "loss": 3.074, + "step": 6777 + }, + { + "epoch": 4.367445608380338, + "grad_norm": 1.4411962870055666, + "learning_rate": 9.816054456230063e-05, + "loss": 3.2318, + "step": 6778 + }, + { + "epoch": 4.368090249798549, + "grad_norm": 1.6158057010733264, + "learning_rate": 9.816000273973754e-05, + "loss": 3.491, + "step": 6779 + }, + { + "epoch": 4.368734891216761, + "grad_norm": 1.523928929903416, + "learning_rate": 9.815946083889887e-05, + "loss": 3.245, + "step": 6780 + }, + { + "epoch": 4.369379532634972, + "grad_norm": 1.5767635080823945, + "learning_rate": 9.81589188597855e-05, + "loss": 3.3305, + "step": 6781 + }, + { + "epoch": 4.370024174053183, + "grad_norm": 1.6295098189838477, + "learning_rate": 9.815837680239836e-05, + "loss": 3.5759, + "step": 6782 + }, + { + "epoch": 4.370668815471394, + "grad_norm": 1.460621084896054, + "learning_rate": 9.815783466673827e-05, + "loss": 3.5156, + "step": 6783 + }, + { + "epoch": 4.371313456889605, + "grad_norm": 1.603790528177317, + "learning_rate": 9.815729245280621e-05, + "loss": 3.207, + "step": 6784 + }, + { + "epoch": 4.371958098307816, + "grad_norm": 1.5435998355542893, + "learning_rate": 9.815675016060299e-05, + "loss": 3.1713, + "step": 6785 + }, + { + "epoch": 4.372602739726028, + "grad_norm": 1.5809582249190066, + "learning_rate": 9.815620779012956e-05, + "loss": 3.0085, + "step": 6786 + }, + { + "epoch": 4.373247381144239, + "grad_norm": 1.6575211559420293, + "learning_rate": 9.815566534138677e-05, + "loss": 3.2807, + "step": 6787 + }, + { + "epoch": 4.373892022562449, + "grad_norm": 1.6623883356494005, + "learning_rate": 9.815512281437553e-05, + "loss": 3.1322, + "step": 6788 + }, + { + "epoch": 4.374536663980661, + "grad_norm": 1.4284289845839477, + "learning_rate": 9.815458020909672e-05, + "loss": 3.4578, + "step": 6789 + }, + { + "epoch": 4.375181305398872, + "grad_norm": 1.528506288518509, + "learning_rate": 9.815403752555125e-05, + "loss": 3.364, + "step": 6790 + }, + { + "epoch": 4.375825946817083, + "grad_norm": 1.6804495749034336, + "learning_rate": 9.815349476374e-05, + "loss": 3.4727, + "step": 6791 + }, + { + "epoch": 4.376470588235295, + "grad_norm": 1.6099610265255662, + "learning_rate": 9.815295192366385e-05, + "loss": 3.4215, + "step": 6792 + }, + { + "epoch": 4.377115229653505, + "grad_norm": 1.5570232671437725, + "learning_rate": 9.815240900532371e-05, + "loss": 3.392, + "step": 6793 + }, + { + "epoch": 4.377759871071716, + "grad_norm": 1.9379861171608528, + "learning_rate": 9.815186600872047e-05, + "loss": 3.0918, + "step": 6794 + }, + { + "epoch": 4.378404512489928, + "grad_norm": 1.626809475445904, + "learning_rate": 9.8151322933855e-05, + "loss": 3.5346, + "step": 6795 + }, + { + "epoch": 4.379049153908139, + "grad_norm": 1.8387992377830764, + "learning_rate": 9.815077978072822e-05, + "loss": 3.0235, + "step": 6796 + }, + { + "epoch": 4.37969379532635, + "grad_norm": 2.1447641845686314, + "learning_rate": 9.8150236549341e-05, + "loss": 3.3466, + "step": 6797 + }, + { + "epoch": 4.380338436744561, + "grad_norm": 1.4107404598453175, + "learning_rate": 9.814969323969426e-05, + "loss": 2.9636, + "step": 6798 + }, + { + "epoch": 4.380983078162772, + "grad_norm": 1.8188665513372533, + "learning_rate": 9.814914985178885e-05, + "loss": 3.3756, + "step": 6799 + }, + { + "epoch": 4.381627719580983, + "grad_norm": 1.5420753358868626, + "learning_rate": 9.81486063856257e-05, + "loss": 3.0767, + "step": 6800 + }, + { + "epoch": 4.381627719580983, + "eval_loss": 4.2057342529296875, + "eval_runtime": 2.9985, + "eval_samples_per_second": 33.35, + "eval_steps_per_second": 4.336, + "step": 6800 + }, + { + "epoch": 4.3822723609991945, + "grad_norm": 1.9141975015952768, + "learning_rate": 9.81480628412057e-05, + "loss": 3.4606, + "step": 6801 + }, + { + "epoch": 4.382917002417406, + "grad_norm": 1.79122613588859, + "learning_rate": 9.814751921852971e-05, + "loss": 3.3703, + "step": 6802 + }, + { + "epoch": 4.383561643835616, + "grad_norm": 2.317957757465462, + "learning_rate": 9.814697551759867e-05, + "loss": 3.2976, + "step": 6803 + }, + { + "epoch": 4.3842062852538275, + "grad_norm": 1.4844987500925306, + "learning_rate": 9.814643173841343e-05, + "loss": 3.3109, + "step": 6804 + }, + { + "epoch": 4.384850926672039, + "grad_norm": 2.096303311013823, + "learning_rate": 9.814588788097491e-05, + "loss": 3.4981, + "step": 6805 + }, + { + "epoch": 4.38549556809025, + "grad_norm": 1.5992289696165973, + "learning_rate": 9.8145343945284e-05, + "loss": 3.1129, + "step": 6806 + }, + { + "epoch": 4.3861402095084605, + "grad_norm": 2.084285075401708, + "learning_rate": 9.814479993134157e-05, + "loss": 3.222, + "step": 6807 + }, + { + "epoch": 4.386784850926672, + "grad_norm": 1.461897474980229, + "learning_rate": 9.814425583914855e-05, + "loss": 3.2965, + "step": 6808 + }, + { + "epoch": 4.387429492344883, + "grad_norm": 1.8643050396424607, + "learning_rate": 9.814371166870579e-05, + "loss": 3.3173, + "step": 6809 + }, + { + "epoch": 4.388074133763094, + "grad_norm": 1.6644635656297944, + "learning_rate": 9.814316742001424e-05, + "loss": 3.2034, + "step": 6810 + }, + { + "epoch": 4.388718775181306, + "grad_norm": 2.0097227691904136, + "learning_rate": 9.814262309307475e-05, + "loss": 3.2177, + "step": 6811 + }, + { + "epoch": 4.389363416599516, + "grad_norm": 1.7323803098734967, + "learning_rate": 9.814207868788822e-05, + "loss": 3.3376, + "step": 6812 + }, + { + "epoch": 4.390008058017727, + "grad_norm": 1.9447689680667966, + "learning_rate": 9.814153420445554e-05, + "loss": 3.0537, + "step": 6813 + }, + { + "epoch": 4.390652699435939, + "grad_norm": 1.5664691913645206, + "learning_rate": 9.814098964277763e-05, + "loss": 3.3068, + "step": 6814 + }, + { + "epoch": 4.39129734085415, + "grad_norm": 1.8564464201264057, + "learning_rate": 9.814044500285537e-05, + "loss": 3.4991, + "step": 6815 + }, + { + "epoch": 4.391941982272361, + "grad_norm": 1.6552124666930086, + "learning_rate": 9.813990028468964e-05, + "loss": 3.6848, + "step": 6816 + }, + { + "epoch": 4.392586623690572, + "grad_norm": 1.9825949018636226, + "learning_rate": 9.813935548828137e-05, + "loss": 3.2947, + "step": 6817 + }, + { + "epoch": 4.393231265108783, + "grad_norm": 1.781747383744115, + "learning_rate": 9.813881061363142e-05, + "loss": 3.2033, + "step": 6818 + }, + { + "epoch": 4.393875906526994, + "grad_norm": 1.759740568890916, + "learning_rate": 9.81382656607407e-05, + "loss": 3.7118, + "step": 6819 + }, + { + "epoch": 4.394520547945206, + "grad_norm": 1.8538786276266703, + "learning_rate": 9.81377206296101e-05, + "loss": 3.5172, + "step": 6820 + }, + { + "epoch": 4.395165189363417, + "grad_norm": 2.04745579974079, + "learning_rate": 9.813717552024053e-05, + "loss": 3.5217, + "step": 6821 + }, + { + "epoch": 4.395809830781627, + "grad_norm": 1.4241416397393083, + "learning_rate": 9.813663033263286e-05, + "loss": 3.2288, + "step": 6822 + }, + { + "epoch": 4.396454472199839, + "grad_norm": 1.4417727757749277, + "learning_rate": 9.813608506678802e-05, + "loss": 3.29, + "step": 6823 + }, + { + "epoch": 4.39709911361805, + "grad_norm": 1.4985073889223224, + "learning_rate": 9.813553972270684e-05, + "loss": 3.3412, + "step": 6824 + }, + { + "epoch": 4.397743755036261, + "grad_norm": 1.6176618580717834, + "learning_rate": 9.81349943003903e-05, + "loss": 3.3595, + "step": 6825 + }, + { + "epoch": 4.3983883964544725, + "grad_norm": 1.3897882072140477, + "learning_rate": 9.813444879983926e-05, + "loss": 3.2688, + "step": 6826 + }, + { + "epoch": 4.399033037872683, + "grad_norm": 1.4372466799913244, + "learning_rate": 9.813390322105458e-05, + "loss": 3.2752, + "step": 6827 + }, + { + "epoch": 4.399677679290894, + "grad_norm": 1.2499418208032502, + "learning_rate": 9.813335756403721e-05, + "loss": 3.2313, + "step": 6828 + }, + { + "epoch": 4.400322320709106, + "grad_norm": 1.6095396176717631, + "learning_rate": 9.813281182878801e-05, + "loss": 3.7213, + "step": 6829 + }, + { + "epoch": 4.400966962127317, + "grad_norm": 1.5635258384270188, + "learning_rate": 9.81322660153079e-05, + "loss": 3.6378, + "step": 6830 + }, + { + "epoch": 4.401611603545528, + "grad_norm": 1.5277381136882675, + "learning_rate": 9.813172012359777e-05, + "loss": 3.0807, + "step": 6831 + }, + { + "epoch": 4.402256244963739, + "grad_norm": 1.8360986066300975, + "learning_rate": 9.81311741536585e-05, + "loss": 3.628, + "step": 6832 + }, + { + "epoch": 4.40290088638195, + "grad_norm": 1.341777163378123, + "learning_rate": 9.813062810549101e-05, + "loss": 3.2734, + "step": 6833 + }, + { + "epoch": 4.403545527800161, + "grad_norm": 1.606171096979181, + "learning_rate": 9.813008197909618e-05, + "loss": 3.3906, + "step": 6834 + }, + { + "epoch": 4.4041901692183725, + "grad_norm": 1.2729654022353494, + "learning_rate": 9.812953577447491e-05, + "loss": 3.514, + "step": 6835 + }, + { + "epoch": 4.404834810636584, + "grad_norm": 1.301614931725479, + "learning_rate": 9.812898949162811e-05, + "loss": 3.3251, + "step": 6836 + }, + { + "epoch": 4.405479452054794, + "grad_norm": 1.4436093494765714, + "learning_rate": 9.812844313055667e-05, + "loss": 3.2859, + "step": 6837 + }, + { + "epoch": 4.4061240934730055, + "grad_norm": 1.4394209587016975, + "learning_rate": 9.812789669126149e-05, + "loss": 3.1845, + "step": 6838 + }, + { + "epoch": 4.406768734891217, + "grad_norm": 1.4639305243514151, + "learning_rate": 9.812735017374344e-05, + "loss": 3.3672, + "step": 6839 + }, + { + "epoch": 4.407413376309428, + "grad_norm": 1.5320734712333874, + "learning_rate": 9.812680357800346e-05, + "loss": 3.4106, + "step": 6840 + }, + { + "epoch": 4.408058017727639, + "grad_norm": 1.5593052239917207, + "learning_rate": 9.812625690404242e-05, + "loss": 3.4004, + "step": 6841 + }, + { + "epoch": 4.40870265914585, + "grad_norm": 1.4196324179382258, + "learning_rate": 9.812571015186123e-05, + "loss": 3.272, + "step": 6842 + }, + { + "epoch": 4.409347300564061, + "grad_norm": 1.829562353299768, + "learning_rate": 9.812516332146078e-05, + "loss": 3.4446, + "step": 6843 + }, + { + "epoch": 4.409991941982272, + "grad_norm": 1.579232487093613, + "learning_rate": 9.812461641284198e-05, + "loss": 3.4546, + "step": 6844 + }, + { + "epoch": 4.410636583400484, + "grad_norm": 1.739380763830941, + "learning_rate": 9.81240694260057e-05, + "loss": 3.6894, + "step": 6845 + }, + { + "epoch": 4.411281224818695, + "grad_norm": 1.6779150567420564, + "learning_rate": 9.812352236095288e-05, + "loss": 3.2009, + "step": 6846 + }, + { + "epoch": 4.411925866236905, + "grad_norm": 1.7399830227711492, + "learning_rate": 9.812297521768439e-05, + "loss": 3.0518, + "step": 6847 + }, + { + "epoch": 4.412570507655117, + "grad_norm": 1.6042615987552915, + "learning_rate": 9.812242799620114e-05, + "loss": 3.0752, + "step": 6848 + }, + { + "epoch": 4.413215149073328, + "grad_norm": 1.766860590156316, + "learning_rate": 9.812188069650401e-05, + "loss": 2.9684, + "step": 6849 + }, + { + "epoch": 4.413859790491539, + "grad_norm": 1.4115566953583425, + "learning_rate": 9.812133331859394e-05, + "loss": 3.3035, + "step": 6850 + }, + { + "epoch": 4.414504431909751, + "grad_norm": 1.8292371791836124, + "learning_rate": 9.812078586247178e-05, + "loss": 3.0666, + "step": 6851 + }, + { + "epoch": 4.415149073327961, + "grad_norm": 1.2586593367624028, + "learning_rate": 9.812023832813845e-05, + "loss": 3.3342, + "step": 6852 + }, + { + "epoch": 4.415793714746172, + "grad_norm": 1.69409034958004, + "learning_rate": 9.811969071559485e-05, + "loss": 3.1447, + "step": 6853 + }, + { + "epoch": 4.416438356164384, + "grad_norm": 1.6663547957944866, + "learning_rate": 9.811914302484189e-05, + "loss": 3.5089, + "step": 6854 + }, + { + "epoch": 4.417082997582595, + "grad_norm": 1.550466876869761, + "learning_rate": 9.811859525588045e-05, + "loss": 3.4252, + "step": 6855 + }, + { + "epoch": 4.417727639000805, + "grad_norm": 1.7150540239114878, + "learning_rate": 9.811804740871145e-05, + "loss": 3.3224, + "step": 6856 + }, + { + "epoch": 4.418372280419017, + "grad_norm": 1.833849806262637, + "learning_rate": 9.811749948333578e-05, + "loss": 3.3249, + "step": 6857 + }, + { + "epoch": 4.419016921837228, + "grad_norm": 1.5267148597436155, + "learning_rate": 9.811695147975433e-05, + "loss": 3.1791, + "step": 6858 + }, + { + "epoch": 4.419661563255439, + "grad_norm": 1.7599247703425516, + "learning_rate": 9.8116403397968e-05, + "loss": 3.2714, + "step": 6859 + }, + { + "epoch": 4.4203062046736505, + "grad_norm": 1.559499315099855, + "learning_rate": 9.811585523797772e-05, + "loss": 3.3077, + "step": 6860 + }, + { + "epoch": 4.420950846091861, + "grad_norm": 1.3507793127969452, + "learning_rate": 9.811530699978435e-05, + "loss": 3.2659, + "step": 6861 + }, + { + "epoch": 4.421595487510072, + "grad_norm": 1.5012455999022825, + "learning_rate": 9.811475868338883e-05, + "loss": 3.3109, + "step": 6862 + }, + { + "epoch": 4.4222401289282836, + "grad_norm": 1.5404844120507648, + "learning_rate": 9.811421028879202e-05, + "loss": 3.4879, + "step": 6863 + }, + { + "epoch": 4.422884770346495, + "grad_norm": 1.5654720484663542, + "learning_rate": 9.811366181599487e-05, + "loss": 3.2548, + "step": 6864 + }, + { + "epoch": 4.423529411764706, + "grad_norm": 1.4559219947516349, + "learning_rate": 9.811311326499822e-05, + "loss": 3.3349, + "step": 6865 + }, + { + "epoch": 4.424174053182917, + "grad_norm": 1.4667256626927851, + "learning_rate": 9.811256463580302e-05, + "loss": 3.2005, + "step": 6866 + }, + { + "epoch": 4.424818694601128, + "grad_norm": 1.4273303326207956, + "learning_rate": 9.811201592841015e-05, + "loss": 3.3884, + "step": 6867 + }, + { + "epoch": 4.425463336019339, + "grad_norm": 1.303609850785996, + "learning_rate": 9.811146714282053e-05, + "loss": 3.3349, + "step": 6868 + }, + { + "epoch": 4.4261079774375505, + "grad_norm": 1.611339105875222, + "learning_rate": 9.811091827903501e-05, + "loss": 3.107, + "step": 6869 + }, + { + "epoch": 4.426752618855762, + "grad_norm": 1.3198533947997004, + "learning_rate": 9.811036933705456e-05, + "loss": 3.3898, + "step": 6870 + }, + { + "epoch": 4.427397260273972, + "grad_norm": 1.6648615108360438, + "learning_rate": 9.810982031688004e-05, + "loss": 3.414, + "step": 6871 + }, + { + "epoch": 4.4280419016921835, + "grad_norm": 1.4053249666103727, + "learning_rate": 9.810927121851235e-05, + "loss": 3.3613, + "step": 6872 + }, + { + "epoch": 4.428686543110395, + "grad_norm": 1.7362171813087535, + "learning_rate": 9.810872204195241e-05, + "loss": 3.5497, + "step": 6873 + }, + { + "epoch": 4.429331184528606, + "grad_norm": 1.557530850532916, + "learning_rate": 9.810817278720111e-05, + "loss": 3.3281, + "step": 6874 + }, + { + "epoch": 4.429975825946817, + "grad_norm": 1.5154640973494764, + "learning_rate": 9.810762345425937e-05, + "loss": 3.6247, + "step": 6875 + }, + { + "epoch": 4.430620467365028, + "grad_norm": 1.5544510691521525, + "learning_rate": 9.810707404312806e-05, + "loss": 3.3091, + "step": 6876 + }, + { + "epoch": 4.431265108783239, + "grad_norm": 1.5511050911866104, + "learning_rate": 9.81065245538081e-05, + "loss": 3.5445, + "step": 6877 + }, + { + "epoch": 4.43190975020145, + "grad_norm": 1.5188044256985855, + "learning_rate": 9.810597498630042e-05, + "loss": 3.3638, + "step": 6878 + }, + { + "epoch": 4.432554391619662, + "grad_norm": 1.5872967046935276, + "learning_rate": 9.810542534060589e-05, + "loss": 3.1585, + "step": 6879 + }, + { + "epoch": 4.433199033037873, + "grad_norm": 1.6820601522532905, + "learning_rate": 9.81048756167254e-05, + "loss": 2.8764, + "step": 6880 + }, + { + "epoch": 4.433843674456083, + "grad_norm": 1.5184218895329424, + "learning_rate": 9.810432581465989e-05, + "loss": 3.5994, + "step": 6881 + }, + { + "epoch": 4.434488315874295, + "grad_norm": 1.457103390269983, + "learning_rate": 9.810377593441024e-05, + "loss": 3.2164, + "step": 6882 + }, + { + "epoch": 4.435132957292506, + "grad_norm": 1.4558046180313926, + "learning_rate": 9.810322597597736e-05, + "loss": 3.4044, + "step": 6883 + }, + { + "epoch": 4.435777598710717, + "grad_norm": 1.6656761061869818, + "learning_rate": 9.810267593936213e-05, + "loss": 3.1221, + "step": 6884 + }, + { + "epoch": 4.436422240128929, + "grad_norm": 1.6384217629177062, + "learning_rate": 9.810212582456549e-05, + "loss": 3.4882, + "step": 6885 + }, + { + "epoch": 4.437066881547139, + "grad_norm": 1.5672162709572675, + "learning_rate": 9.810157563158834e-05, + "loss": 3.3362, + "step": 6886 + }, + { + "epoch": 4.43771152296535, + "grad_norm": 1.720016419557279, + "learning_rate": 9.810102536043157e-05, + "loss": 3.2036, + "step": 6887 + }, + { + "epoch": 4.438356164383562, + "grad_norm": 1.4546457110922493, + "learning_rate": 9.810047501109606e-05, + "loss": 3.4511, + "step": 6888 + }, + { + "epoch": 4.439000805801773, + "grad_norm": 1.6155970765514938, + "learning_rate": 9.809992458358276e-05, + "loss": 3.1462, + "step": 6889 + }, + { + "epoch": 4.439645447219984, + "grad_norm": 1.4647403016854665, + "learning_rate": 9.809937407789256e-05, + "loss": 3.4998, + "step": 6890 + }, + { + "epoch": 4.440290088638195, + "grad_norm": 1.7168169724529199, + "learning_rate": 9.809882349402633e-05, + "loss": 3.3052, + "step": 6891 + }, + { + "epoch": 4.440934730056406, + "grad_norm": 1.444996585552516, + "learning_rate": 9.809827283198503e-05, + "loss": 3.4491, + "step": 6892 + }, + { + "epoch": 4.441579371474617, + "grad_norm": 1.667247508148862, + "learning_rate": 9.809772209176952e-05, + "loss": 3.4693, + "step": 6893 + }, + { + "epoch": 4.4422240128928285, + "grad_norm": 1.553403713157894, + "learning_rate": 9.809717127338072e-05, + "loss": 3.5773, + "step": 6894 + }, + { + "epoch": 4.44286865431104, + "grad_norm": 1.3445236017647457, + "learning_rate": 9.809662037681955e-05, + "loss": 3.3615, + "step": 6895 + }, + { + "epoch": 4.44351329572925, + "grad_norm": 1.670664015232265, + "learning_rate": 9.809606940208689e-05, + "loss": 3.0431, + "step": 6896 + }, + { + "epoch": 4.4441579371474615, + "grad_norm": 1.6548912041879422, + "learning_rate": 9.809551834918366e-05, + "loss": 3.4671, + "step": 6897 + }, + { + "epoch": 4.444802578565673, + "grad_norm": 1.7988428240491376, + "learning_rate": 9.809496721811075e-05, + "loss": 3.211, + "step": 6898 + }, + { + "epoch": 4.445447219983884, + "grad_norm": 1.8462638393422834, + "learning_rate": 9.809441600886909e-05, + "loss": 3.3939, + "step": 6899 + }, + { + "epoch": 4.4460918614020954, + "grad_norm": 1.5014430771226444, + "learning_rate": 9.809386472145955e-05, + "loss": 3.3334, + "step": 6900 + }, + { + "epoch": 4.4460918614020954, + "eval_loss": 4.190564155578613, + "eval_runtime": 2.9801, + "eval_samples_per_second": 33.556, + "eval_steps_per_second": 4.362, + "step": 6900 + }, + { + "epoch": 4.446736502820306, + "grad_norm": 1.7266813579183686, + "learning_rate": 9.809331335588308e-05, + "loss": 3.4037, + "step": 6901 + }, + { + "epoch": 4.447381144238517, + "grad_norm": 1.3468459967654514, + "learning_rate": 9.809276191214054e-05, + "loss": 3.0675, + "step": 6902 + }, + { + "epoch": 4.4480257856567285, + "grad_norm": 1.8209287094854532, + "learning_rate": 9.809221039023286e-05, + "loss": 3.4019, + "step": 6903 + }, + { + "epoch": 4.44867042707494, + "grad_norm": 1.44974748073688, + "learning_rate": 9.809165879016097e-05, + "loss": 3.3943, + "step": 6904 + }, + { + "epoch": 4.449315068493151, + "grad_norm": 1.9490655146175113, + "learning_rate": 9.80911071119257e-05, + "loss": 3.2822, + "step": 6905 + }, + { + "epoch": 4.4499597099113615, + "grad_norm": 1.6698371154447358, + "learning_rate": 9.809055535552805e-05, + "loss": 3.0842, + "step": 6906 + }, + { + "epoch": 4.450604351329573, + "grad_norm": 1.9606370489955351, + "learning_rate": 9.809000352096887e-05, + "loss": 3.4585, + "step": 6907 + }, + { + "epoch": 4.451248992747784, + "grad_norm": 2.122916397955482, + "learning_rate": 9.808945160824908e-05, + "loss": 3.2926, + "step": 6908 + }, + { + "epoch": 4.451893634165995, + "grad_norm": 1.8461108954682242, + "learning_rate": 9.808889961736957e-05, + "loss": 3.4953, + "step": 6909 + }, + { + "epoch": 4.452538275584207, + "grad_norm": 2.120550646204594, + "learning_rate": 9.808834754833126e-05, + "loss": 3.2341, + "step": 6910 + }, + { + "epoch": 4.453182917002417, + "grad_norm": 1.9197807448840232, + "learning_rate": 9.808779540113506e-05, + "loss": 3.0056, + "step": 6911 + }, + { + "epoch": 4.453827558420628, + "grad_norm": 2.0262364021442085, + "learning_rate": 9.808724317578189e-05, + "loss": 3.2301, + "step": 6912 + }, + { + "epoch": 4.45447219983884, + "grad_norm": 1.4758315793148324, + "learning_rate": 9.808669087227262e-05, + "loss": 3.0947, + "step": 6913 + }, + { + "epoch": 4.455116841257051, + "grad_norm": 1.6454651837210381, + "learning_rate": 9.808613849060819e-05, + "loss": 3.4172, + "step": 6914 + }, + { + "epoch": 4.455761482675262, + "grad_norm": 1.6673827192552333, + "learning_rate": 9.80855860307895e-05, + "loss": 3.2899, + "step": 6915 + }, + { + "epoch": 4.456406124093473, + "grad_norm": 1.8475603532938416, + "learning_rate": 9.808503349281743e-05, + "loss": 3.2896, + "step": 6916 + }, + { + "epoch": 4.457050765511684, + "grad_norm": 1.5322354685894464, + "learning_rate": 9.808448087669292e-05, + "loss": 3.3236, + "step": 6917 + }, + { + "epoch": 4.457695406929895, + "grad_norm": 1.7505585911615131, + "learning_rate": 9.808392818241689e-05, + "loss": 3.4228, + "step": 6918 + }, + { + "epoch": 4.458340048348107, + "grad_norm": 1.8625484858052381, + "learning_rate": 9.80833754099902e-05, + "loss": 3.2624, + "step": 6919 + }, + { + "epoch": 4.458984689766318, + "grad_norm": 2.300550482240659, + "learning_rate": 9.808282255941378e-05, + "loss": 3.375, + "step": 6920 + }, + { + "epoch": 4.459629331184528, + "grad_norm": 1.5579075166834961, + "learning_rate": 9.808226963068856e-05, + "loss": 3.3013, + "step": 6921 + }, + { + "epoch": 4.46027397260274, + "grad_norm": 1.9443890068815355, + "learning_rate": 9.808171662381542e-05, + "loss": 3.5278, + "step": 6922 + }, + { + "epoch": 4.460918614020951, + "grad_norm": 1.5918705579548527, + "learning_rate": 9.808116353879528e-05, + "loss": 3.5806, + "step": 6923 + }, + { + "epoch": 4.461563255439162, + "grad_norm": 1.8378992085885404, + "learning_rate": 9.808061037562905e-05, + "loss": 3.4009, + "step": 6924 + }, + { + "epoch": 4.4622078968573735, + "grad_norm": 1.4798537731274612, + "learning_rate": 9.808005713431763e-05, + "loss": 3.1858, + "step": 6925 + }, + { + "epoch": 4.462852538275584, + "grad_norm": 1.850248388032762, + "learning_rate": 9.807950381486193e-05, + "loss": 3.4212, + "step": 6926 + }, + { + "epoch": 4.463497179693795, + "grad_norm": 1.5293309558393413, + "learning_rate": 9.807895041726287e-05, + "loss": 3.1039, + "step": 6927 + }, + { + "epoch": 4.4641418211120065, + "grad_norm": 1.6065605215749221, + "learning_rate": 9.807839694152135e-05, + "loss": 3.2977, + "step": 6928 + }, + { + "epoch": 4.464786462530218, + "grad_norm": 1.707853364241195, + "learning_rate": 9.807784338763827e-05, + "loss": 3.621, + "step": 6929 + }, + { + "epoch": 4.465431103948429, + "grad_norm": 1.2670677058966537, + "learning_rate": 9.807728975561457e-05, + "loss": 3.6139, + "step": 6930 + }, + { + "epoch": 4.4660757453666395, + "grad_norm": 1.905892827647803, + "learning_rate": 9.807673604545111e-05, + "loss": 3.2092, + "step": 6931 + }, + { + "epoch": 4.466720386784851, + "grad_norm": 1.2645673800525412, + "learning_rate": 9.807618225714885e-05, + "loss": 3.2151, + "step": 6932 + }, + { + "epoch": 4.467365028203062, + "grad_norm": 1.8445255890810694, + "learning_rate": 9.807562839070868e-05, + "loss": 3.2041, + "step": 6933 + }, + { + "epoch": 4.468009669621273, + "grad_norm": 1.6412855060121954, + "learning_rate": 9.807507444613148e-05, + "loss": 3.34, + "step": 6934 + }, + { + "epoch": 4.468654311039485, + "grad_norm": 1.6841514249572695, + "learning_rate": 9.807452042341821e-05, + "loss": 3.2635, + "step": 6935 + }, + { + "epoch": 4.469298952457695, + "grad_norm": 2.1509557414856024, + "learning_rate": 9.807396632256974e-05, + "loss": 3.5574, + "step": 6936 + }, + { + "epoch": 4.4699435938759065, + "grad_norm": 1.5624666595015346, + "learning_rate": 9.807341214358701e-05, + "loss": 3.4318, + "step": 6937 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 1.659462432949892, + "learning_rate": 9.807285788647092e-05, + "loss": 3.5676, + "step": 6938 + }, + { + "epoch": 4.471232876712329, + "grad_norm": 1.6016331954205218, + "learning_rate": 9.807230355122237e-05, + "loss": 3.3293, + "step": 6939 + }, + { + "epoch": 4.47187751813054, + "grad_norm": 1.3424520737878292, + "learning_rate": 9.807174913784227e-05, + "loss": 3.2918, + "step": 6940 + }, + { + "epoch": 4.472522159548751, + "grad_norm": 1.3701587841303917, + "learning_rate": 9.807119464633154e-05, + "loss": 3.3204, + "step": 6941 + }, + { + "epoch": 4.473166800966962, + "grad_norm": 1.5942425708633867, + "learning_rate": 9.807064007669109e-05, + "loss": 3.1964, + "step": 6942 + }, + { + "epoch": 4.473811442385173, + "grad_norm": 1.4475680446746189, + "learning_rate": 9.807008542892183e-05, + "loss": 3.1839, + "step": 6943 + }, + { + "epoch": 4.474456083803385, + "grad_norm": 1.928046438900423, + "learning_rate": 9.806953070302468e-05, + "loss": 3.383, + "step": 6944 + }, + { + "epoch": 4.475100725221595, + "grad_norm": 1.3175994898739267, + "learning_rate": 9.806897589900051e-05, + "loss": 3.664, + "step": 6945 + }, + { + "epoch": 4.475745366639806, + "grad_norm": 2.0740535585481648, + "learning_rate": 9.806842101685031e-05, + "loss": 3.1214, + "step": 6946 + }, + { + "epoch": 4.476390008058018, + "grad_norm": 1.8275712559909505, + "learning_rate": 9.806786605657492e-05, + "loss": 3.5501, + "step": 6947 + }, + { + "epoch": 4.477034649476229, + "grad_norm": 1.7831149373306403, + "learning_rate": 9.806731101817528e-05, + "loss": 3.3883, + "step": 6948 + }, + { + "epoch": 4.47767929089444, + "grad_norm": 1.5504678312112175, + "learning_rate": 9.806675590165227e-05, + "loss": 3.5687, + "step": 6949 + }, + { + "epoch": 4.478323932312651, + "grad_norm": 1.7089777270848057, + "learning_rate": 9.806620070700686e-05, + "loss": 3.3537, + "step": 6950 + }, + { + "epoch": 4.478968573730862, + "grad_norm": 1.249322042145893, + "learning_rate": 9.806564543423993e-05, + "loss": 3.3629, + "step": 6951 + }, + { + "epoch": 4.479613215149073, + "grad_norm": 1.7741270893047296, + "learning_rate": 9.806509008335239e-05, + "loss": 3.593, + "step": 6952 + }, + { + "epoch": 4.480257856567285, + "grad_norm": 1.5925536683029096, + "learning_rate": 9.806453465434516e-05, + "loss": 3.4574, + "step": 6953 + }, + { + "epoch": 4.480902497985496, + "grad_norm": 1.8868381145024644, + "learning_rate": 9.806397914721911e-05, + "loss": 3.2798, + "step": 6954 + }, + { + "epoch": 4.481547139403706, + "grad_norm": 1.7795939845642996, + "learning_rate": 9.806342356197523e-05, + "loss": 3.6427, + "step": 6955 + }, + { + "epoch": 4.482191780821918, + "grad_norm": 1.9925122630497853, + "learning_rate": 9.806286789861438e-05, + "loss": 3.3367, + "step": 6956 + }, + { + "epoch": 4.482836422240129, + "grad_norm": 2.006493502874115, + "learning_rate": 9.806231215713748e-05, + "loss": 3.3428, + "step": 6957 + }, + { + "epoch": 4.48348106365834, + "grad_norm": 1.610977212148472, + "learning_rate": 9.806175633754546e-05, + "loss": 3.2349, + "step": 6958 + }, + { + "epoch": 4.4841257050765515, + "grad_norm": 1.6181164092014633, + "learning_rate": 9.80612004398392e-05, + "loss": 3.2957, + "step": 6959 + }, + { + "epoch": 4.484770346494762, + "grad_norm": 1.407785398472564, + "learning_rate": 9.806064446401965e-05, + "loss": 3.2423, + "step": 6960 + }, + { + "epoch": 4.485414987912973, + "grad_norm": 1.7632213110161061, + "learning_rate": 9.806008841008769e-05, + "loss": 3.4032, + "step": 6961 + }, + { + "epoch": 4.4860596293311845, + "grad_norm": 1.802684347018247, + "learning_rate": 9.805953227804426e-05, + "loss": 3.3719, + "step": 6962 + }, + { + "epoch": 4.486704270749396, + "grad_norm": 1.6130805201204779, + "learning_rate": 9.805897606789025e-05, + "loss": 3.1424, + "step": 6963 + }, + { + "epoch": 4.487348912167607, + "grad_norm": 1.5673604651335307, + "learning_rate": 9.805841977962662e-05, + "loss": 3.3992, + "step": 6964 + }, + { + "epoch": 4.4879935535858175, + "grad_norm": 2.0126483479066946, + "learning_rate": 9.805786341325421e-05, + "loss": 3.0742, + "step": 6965 + }, + { + "epoch": 4.488638195004029, + "grad_norm": 1.2992447794021653, + "learning_rate": 9.8057306968774e-05, + "loss": 3.3796, + "step": 6966 + }, + { + "epoch": 4.48928283642224, + "grad_norm": 1.7566460065406655, + "learning_rate": 9.805675044618685e-05, + "loss": 3.2802, + "step": 6967 + }, + { + "epoch": 4.489927477840451, + "grad_norm": 1.289726330361112, + "learning_rate": 9.805619384549373e-05, + "loss": 2.989, + "step": 6968 + }, + { + "epoch": 4.490572119258663, + "grad_norm": 1.642734386413561, + "learning_rate": 9.805563716669551e-05, + "loss": 3.5162, + "step": 6969 + }, + { + "epoch": 4.491216760676873, + "grad_norm": 1.3871680927583205, + "learning_rate": 9.805508040979312e-05, + "loss": 3.389, + "step": 6970 + }, + { + "epoch": 4.491861402095084, + "grad_norm": 1.48971986182304, + "learning_rate": 9.805452357478748e-05, + "loss": 3.4927, + "step": 6971 + }, + { + "epoch": 4.492506043513296, + "grad_norm": 1.6040249119139844, + "learning_rate": 9.80539666616795e-05, + "loss": 3.3038, + "step": 6972 + }, + { + "epoch": 4.493150684931507, + "grad_norm": 1.8431360657070495, + "learning_rate": 9.805340967047009e-05, + "loss": 3.4188, + "step": 6973 + }, + { + "epoch": 4.493795326349718, + "grad_norm": 1.4799742949645314, + "learning_rate": 9.805285260116017e-05, + "loss": 3.503, + "step": 6974 + }, + { + "epoch": 4.494439967767929, + "grad_norm": 1.6260873571438461, + "learning_rate": 9.805229545375065e-05, + "loss": 3.3327, + "step": 6975 + }, + { + "epoch": 4.49508460918614, + "grad_norm": 1.6605063030140303, + "learning_rate": 9.805173822824245e-05, + "loss": 3.1014, + "step": 6976 + }, + { + "epoch": 4.495729250604351, + "grad_norm": 1.5857869657912682, + "learning_rate": 9.805118092463649e-05, + "loss": 3.3876, + "step": 6977 + }, + { + "epoch": 4.496373892022563, + "grad_norm": 1.5118135997354463, + "learning_rate": 9.805062354293367e-05, + "loss": 2.9517, + "step": 6978 + }, + { + "epoch": 4.497018533440774, + "grad_norm": 1.4713712420519505, + "learning_rate": 9.805006608313491e-05, + "loss": 3.4966, + "step": 6979 + }, + { + "epoch": 4.497663174858984, + "grad_norm": 1.5268581002098014, + "learning_rate": 9.804950854524114e-05, + "loss": 3.3791, + "step": 6980 + }, + { + "epoch": 4.498307816277196, + "grad_norm": 1.502079888874801, + "learning_rate": 9.804895092925326e-05, + "loss": 3.4704, + "step": 6981 + }, + { + "epoch": 4.498952457695407, + "grad_norm": 1.4229430836887182, + "learning_rate": 9.80483932351722e-05, + "loss": 3.2978, + "step": 6982 + }, + { + "epoch": 4.499597099113618, + "grad_norm": 1.4308971146446257, + "learning_rate": 9.804783546299886e-05, + "loss": 3.5884, + "step": 6983 + }, + { + "epoch": 4.500241740531829, + "grad_norm": 1.576739878159721, + "learning_rate": 9.804727761273416e-05, + "loss": 3.4751, + "step": 6984 + }, + { + "epoch": 4.50088638195004, + "grad_norm": 1.4310500378634425, + "learning_rate": 9.804671968437903e-05, + "loss": 3.0248, + "step": 6985 + }, + { + "epoch": 4.501531023368251, + "grad_norm": 1.528128430469118, + "learning_rate": 9.804616167793436e-05, + "loss": 3.1953, + "step": 6986 + }, + { + "epoch": 4.502175664786463, + "grad_norm": 1.5548137176284729, + "learning_rate": 9.804560359340109e-05, + "loss": 3.4182, + "step": 6987 + }, + { + "epoch": 4.502820306204674, + "grad_norm": 1.588946681639425, + "learning_rate": 9.804504543078012e-05, + "loss": 3.7481, + "step": 6988 + }, + { + "epoch": 4.503464947622884, + "grad_norm": 1.512332330936065, + "learning_rate": 9.80444871900724e-05, + "loss": 3.3135, + "step": 6989 + }, + { + "epoch": 4.504109589041096, + "grad_norm": 2.0799227513664458, + "learning_rate": 9.804392887127881e-05, + "loss": 3.4527, + "step": 6990 + }, + { + "epoch": 4.504754230459307, + "grad_norm": 1.5933364905826501, + "learning_rate": 9.804337047440026e-05, + "loss": 3.2927, + "step": 6991 + }, + { + "epoch": 4.505398871877518, + "grad_norm": 1.958945183495913, + "learning_rate": 9.804281199943771e-05, + "loss": 3.4529, + "step": 6992 + }, + { + "epoch": 4.5060435132957295, + "grad_norm": 1.734196060783524, + "learning_rate": 9.804225344639207e-05, + "loss": 3.6399, + "step": 6993 + }, + { + "epoch": 4.50668815471394, + "grad_norm": 1.9366666498264418, + "learning_rate": 9.804169481526421e-05, + "loss": 3.8213, + "step": 6994 + }, + { + "epoch": 4.507332796132151, + "grad_norm": 1.504391118252761, + "learning_rate": 9.804113610605509e-05, + "loss": 3.3688, + "step": 6995 + }, + { + "epoch": 4.5079774375503625, + "grad_norm": 2.0148797371382403, + "learning_rate": 9.80405773187656e-05, + "loss": 3.4827, + "step": 6996 + }, + { + "epoch": 4.508622078968574, + "grad_norm": 1.7259491289169058, + "learning_rate": 9.804001845339669e-05, + "loss": 3.522, + "step": 6997 + }, + { + "epoch": 4.509266720386785, + "grad_norm": 1.5665601377528189, + "learning_rate": 9.803945950994926e-05, + "loss": 3.2073, + "step": 6998 + }, + { + "epoch": 4.5099113618049955, + "grad_norm": 1.5242234114539952, + "learning_rate": 9.803890048842422e-05, + "loss": 3.4298, + "step": 6999 + }, + { + "epoch": 4.510556003223207, + "grad_norm": 1.4017980757036541, + "learning_rate": 9.803834138882251e-05, + "loss": 3.5008, + "step": 7000 + }, + { + "epoch": 4.510556003223207, + "eval_loss": 4.197268962860107, + "eval_runtime": 2.9777, + "eval_samples_per_second": 33.583, + "eval_steps_per_second": 4.366, + "step": 7000 + }, + { + "epoch": 4.511200644641418, + "grad_norm": 1.4752406413653512, + "learning_rate": 9.803778221114504e-05, + "loss": 3.0732, + "step": 7001 + }, + { + "epoch": 4.511845286059629, + "grad_norm": 1.4260701553428645, + "learning_rate": 9.803722295539271e-05, + "loss": 3.5883, + "step": 7002 + }, + { + "epoch": 4.512489927477841, + "grad_norm": 1.5066403514334055, + "learning_rate": 9.803666362156647e-05, + "loss": 3.4445, + "step": 7003 + }, + { + "epoch": 4.513134568896051, + "grad_norm": 1.275857580627955, + "learning_rate": 9.803610420966719e-05, + "loss": 3.0667, + "step": 7004 + }, + { + "epoch": 4.513779210314262, + "grad_norm": 1.409872046439332, + "learning_rate": 9.803554471969584e-05, + "loss": 3.3229, + "step": 7005 + }, + { + "epoch": 4.514423851732474, + "grad_norm": 1.4809662190781638, + "learning_rate": 9.803498515165333e-05, + "loss": 3.0654, + "step": 7006 + }, + { + "epoch": 4.515068493150685, + "grad_norm": 1.5553346717627607, + "learning_rate": 9.803442550554056e-05, + "loss": 3.3285, + "step": 7007 + }, + { + "epoch": 4.515713134568896, + "grad_norm": 1.4918948720322536, + "learning_rate": 9.803386578135846e-05, + "loss": 3.4005, + "step": 7008 + }, + { + "epoch": 4.516357775987107, + "grad_norm": 1.3799904377841636, + "learning_rate": 9.803330597910795e-05, + "loss": 3.4503, + "step": 7009 + }, + { + "epoch": 4.517002417405318, + "grad_norm": 1.3284545242045809, + "learning_rate": 9.803274609878991e-05, + "loss": 3.172, + "step": 7010 + }, + { + "epoch": 4.517647058823529, + "grad_norm": 1.5984462197257365, + "learning_rate": 9.803218614040533e-05, + "loss": 3.4926, + "step": 7011 + }, + { + "epoch": 4.518291700241741, + "grad_norm": 1.8032946231607125, + "learning_rate": 9.80316261039551e-05, + "loss": 3.0183, + "step": 7012 + }, + { + "epoch": 4.518936341659952, + "grad_norm": 1.5224594616867966, + "learning_rate": 9.803106598944013e-05, + "loss": 2.9319, + "step": 7013 + }, + { + "epoch": 4.519580983078162, + "grad_norm": 1.5900257341944202, + "learning_rate": 9.803050579686133e-05, + "loss": 3.4609, + "step": 7014 + }, + { + "epoch": 4.520225624496374, + "grad_norm": 1.6023548107404666, + "learning_rate": 9.802994552621966e-05, + "loss": 3.3802, + "step": 7015 + }, + { + "epoch": 4.520870265914585, + "grad_norm": 1.3449971616081027, + "learning_rate": 9.802938517751597e-05, + "loss": 3.3671, + "step": 7016 + }, + { + "epoch": 4.521514907332796, + "grad_norm": 1.2596816021546666, + "learning_rate": 9.802882475075127e-05, + "loss": 3.6154, + "step": 7017 + }, + { + "epoch": 4.522159548751008, + "grad_norm": 1.4233883688242583, + "learning_rate": 9.802826424592641e-05, + "loss": 3.8235, + "step": 7018 + }, + { + "epoch": 4.522804190169218, + "grad_norm": 1.4580753472573493, + "learning_rate": 9.802770366304237e-05, + "loss": 3.4509, + "step": 7019 + }, + { + "epoch": 4.523448831587429, + "grad_norm": 1.3924063120278922, + "learning_rate": 9.80271430021e-05, + "loss": 3.2816, + "step": 7020 + }, + { + "epoch": 4.524093473005641, + "grad_norm": 1.54479694793717, + "learning_rate": 9.802658226310028e-05, + "loss": 3.2581, + "step": 7021 + }, + { + "epoch": 4.524738114423852, + "grad_norm": 1.4654503243576484, + "learning_rate": 9.80260214460441e-05, + "loss": 3.3347, + "step": 7022 + }, + { + "epoch": 4.525382755842063, + "grad_norm": 1.527872286559322, + "learning_rate": 9.802546055093238e-05, + "loss": 3.2985, + "step": 7023 + }, + { + "epoch": 4.526027397260274, + "grad_norm": 1.7348364481268617, + "learning_rate": 9.802489957776607e-05, + "loss": 3.5528, + "step": 7024 + }, + { + "epoch": 4.526672038678485, + "grad_norm": 1.6917676866862756, + "learning_rate": 9.802433852654606e-05, + "loss": 3.3074, + "step": 7025 + }, + { + "epoch": 4.527316680096696, + "grad_norm": 1.8968178506610698, + "learning_rate": 9.80237773972733e-05, + "loss": 3.3233, + "step": 7026 + }, + { + "epoch": 4.5279613215149075, + "grad_norm": 1.3497659250196357, + "learning_rate": 9.802321618994868e-05, + "loss": 3.6184, + "step": 7027 + }, + { + "epoch": 4.528605962933119, + "grad_norm": 1.6756708940824252, + "learning_rate": 9.802265490457314e-05, + "loss": 3.1775, + "step": 7028 + }, + { + "epoch": 4.529250604351329, + "grad_norm": 1.2420366892642234, + "learning_rate": 9.802209354114761e-05, + "loss": 3.1188, + "step": 7029 + }, + { + "epoch": 4.5298952457695405, + "grad_norm": 1.8309733568862867, + "learning_rate": 9.802153209967299e-05, + "loss": 3.2277, + "step": 7030 + }, + { + "epoch": 4.530539887187752, + "grad_norm": 1.526203562004621, + "learning_rate": 9.802097058015021e-05, + "loss": 3.279, + "step": 7031 + }, + { + "epoch": 4.531184528605963, + "grad_norm": 1.4170155095379726, + "learning_rate": 9.80204089825802e-05, + "loss": 3.3136, + "step": 7032 + }, + { + "epoch": 4.531829170024174, + "grad_norm": 1.5853665855596755, + "learning_rate": 9.801984730696389e-05, + "loss": 3.2217, + "step": 7033 + }, + { + "epoch": 4.532473811442385, + "grad_norm": 1.7326385286905424, + "learning_rate": 9.801928555330218e-05, + "loss": 3.3568, + "step": 7034 + }, + { + "epoch": 4.533118452860596, + "grad_norm": 1.3709510636116025, + "learning_rate": 9.8018723721596e-05, + "loss": 3.5086, + "step": 7035 + }, + { + "epoch": 4.533763094278807, + "grad_norm": 1.6872588318323585, + "learning_rate": 9.801816181184628e-05, + "loss": 3.2662, + "step": 7036 + }, + { + "epoch": 4.534407735697019, + "grad_norm": 1.2258834873227824, + "learning_rate": 9.801759982405393e-05, + "loss": 3.3492, + "step": 7037 + }, + { + "epoch": 4.53505237711523, + "grad_norm": 1.529580039829679, + "learning_rate": 9.801703775821988e-05, + "loss": 3.6054, + "step": 7038 + }, + { + "epoch": 4.53569701853344, + "grad_norm": 1.5406559416874013, + "learning_rate": 9.801647561434506e-05, + "loss": 3.3392, + "step": 7039 + }, + { + "epoch": 4.536341659951652, + "grad_norm": 1.7353567470272364, + "learning_rate": 9.801591339243039e-05, + "loss": 3.2697, + "step": 7040 + }, + { + "epoch": 4.536986301369863, + "grad_norm": 1.5666194101089401, + "learning_rate": 9.801535109247679e-05, + "loss": 3.1035, + "step": 7041 + }, + { + "epoch": 4.537630942788074, + "grad_norm": 1.4371378821912348, + "learning_rate": 9.801478871448519e-05, + "loss": 3.54, + "step": 7042 + }, + { + "epoch": 4.538275584206286, + "grad_norm": 1.6598732579131459, + "learning_rate": 9.801422625845651e-05, + "loss": 3.4099, + "step": 7043 + }, + { + "epoch": 4.538920225624496, + "grad_norm": 1.7050017050826813, + "learning_rate": 9.801366372439168e-05, + "loss": 3.2081, + "step": 7044 + }, + { + "epoch": 4.539564867042707, + "grad_norm": 1.711968108167899, + "learning_rate": 9.801310111229158e-05, + "loss": 3.333, + "step": 7045 + }, + { + "epoch": 4.540209508460919, + "grad_norm": 1.6221149480159491, + "learning_rate": 9.80125384221572e-05, + "loss": 3.7594, + "step": 7046 + }, + { + "epoch": 4.54085414987913, + "grad_norm": 1.6134046249722025, + "learning_rate": 9.801197565398942e-05, + "loss": 3.3362, + "step": 7047 + }, + { + "epoch": 4.541498791297341, + "grad_norm": 1.7032670190821126, + "learning_rate": 9.80114128077892e-05, + "loss": 3.8491, + "step": 7048 + }, + { + "epoch": 4.542143432715552, + "grad_norm": 1.6570882419788262, + "learning_rate": 9.801084988355743e-05, + "loss": 3.5632, + "step": 7049 + }, + { + "epoch": 4.542788074133763, + "grad_norm": 1.7251542129639588, + "learning_rate": 9.801028688129504e-05, + "loss": 3.3806, + "step": 7050 + }, + { + "epoch": 4.543432715551974, + "grad_norm": 1.4048209535002345, + "learning_rate": 9.800972380100296e-05, + "loss": 3.4073, + "step": 7051 + }, + { + "epoch": 4.5440773569701856, + "grad_norm": 1.774026599900524, + "learning_rate": 9.800916064268212e-05, + "loss": 3.4683, + "step": 7052 + }, + { + "epoch": 4.544721998388397, + "grad_norm": 1.3446202206714402, + "learning_rate": 9.800859740633346e-05, + "loss": 3.5724, + "step": 7053 + }, + { + "epoch": 4.545366639806607, + "grad_norm": 2.120553271181087, + "learning_rate": 9.800803409195787e-05, + "loss": 3.2354, + "step": 7054 + }, + { + "epoch": 4.546011281224819, + "grad_norm": 1.424368038415171, + "learning_rate": 9.800747069955629e-05, + "loss": 3.3629, + "step": 7055 + }, + { + "epoch": 4.54665592264303, + "grad_norm": 2.1286421121265993, + "learning_rate": 9.800690722912965e-05, + "loss": 3.4884, + "step": 7056 + }, + { + "epoch": 4.547300564061241, + "grad_norm": 1.7610313303143676, + "learning_rate": 9.800634368067888e-05, + "loss": 3.5041, + "step": 7057 + }, + { + "epoch": 4.5479452054794525, + "grad_norm": 1.8025886840263563, + "learning_rate": 9.80057800542049e-05, + "loss": 3.1133, + "step": 7058 + }, + { + "epoch": 4.548589846897663, + "grad_norm": 2.150123289407005, + "learning_rate": 9.800521634970862e-05, + "loss": 3.6095, + "step": 7059 + }, + { + "epoch": 4.549234488315874, + "grad_norm": 1.555556911284957, + "learning_rate": 9.800465256719099e-05, + "loss": 3.4644, + "step": 7060 + }, + { + "epoch": 4.5498791297340855, + "grad_norm": 1.7770389854214133, + "learning_rate": 9.800408870665292e-05, + "loss": 3.4763, + "step": 7061 + }, + { + "epoch": 4.550523771152297, + "grad_norm": 1.7518133399733056, + "learning_rate": 9.800352476809534e-05, + "loss": 3.4309, + "step": 7062 + }, + { + "epoch": 4.551168412570508, + "grad_norm": 1.570137898873802, + "learning_rate": 9.800296075151918e-05, + "loss": 3.0176, + "step": 7063 + }, + { + "epoch": 4.5518130539887185, + "grad_norm": 1.7197797300031155, + "learning_rate": 9.800239665692538e-05, + "loss": 3.3832, + "step": 7064 + }, + { + "epoch": 4.55245769540693, + "grad_norm": 2.0530377881312396, + "learning_rate": 9.800183248431483e-05, + "loss": 3.7585, + "step": 7065 + }, + { + "epoch": 4.553102336825141, + "grad_norm": 1.9241954851325054, + "learning_rate": 9.800126823368849e-05, + "loss": 3.2975, + "step": 7066 + }, + { + "epoch": 4.553746978243352, + "grad_norm": 1.5413235599596962, + "learning_rate": 9.800070390504727e-05, + "loss": 3.2885, + "step": 7067 + }, + { + "epoch": 4.554391619661564, + "grad_norm": 2.1096918816863055, + "learning_rate": 9.800013949839208e-05, + "loss": 3.3239, + "step": 7068 + }, + { + "epoch": 4.555036261079774, + "grad_norm": 1.5631771626884396, + "learning_rate": 9.79995750137239e-05, + "loss": 3.2299, + "step": 7069 + }, + { + "epoch": 4.555680902497985, + "grad_norm": 1.7416250426670996, + "learning_rate": 9.799901045104362e-05, + "loss": 3.5498, + "step": 7070 + }, + { + "epoch": 4.556325543916197, + "grad_norm": 1.6382394422393316, + "learning_rate": 9.799844581035215e-05, + "loss": 3.3861, + "step": 7071 + }, + { + "epoch": 4.556970185334408, + "grad_norm": 1.7952671059712801, + "learning_rate": 9.799788109165046e-05, + "loss": 3.3117, + "step": 7072 + }, + { + "epoch": 4.557614826752619, + "grad_norm": 1.6587774914966422, + "learning_rate": 9.799731629493946e-05, + "loss": 3.6785, + "step": 7073 + }, + { + "epoch": 4.55825946817083, + "grad_norm": 1.5004382421138183, + "learning_rate": 9.799675142022006e-05, + "loss": 3.3599, + "step": 7074 + }, + { + "epoch": 4.558904109589041, + "grad_norm": 1.7365938324833747, + "learning_rate": 9.799618646749321e-05, + "loss": 3.1698, + "step": 7075 + }, + { + "epoch": 4.559548751007252, + "grad_norm": 1.5532851080988155, + "learning_rate": 9.799562143675984e-05, + "loss": 3.4688, + "step": 7076 + }, + { + "epoch": 4.560193392425464, + "grad_norm": 2.0723402498222776, + "learning_rate": 9.799505632802085e-05, + "loss": 3.4881, + "step": 7077 + }, + { + "epoch": 4.560838033843675, + "grad_norm": 1.6284357885973881, + "learning_rate": 9.799449114127718e-05, + "loss": 3.4041, + "step": 7078 + }, + { + "epoch": 4.561482675261885, + "grad_norm": 1.5831393071673392, + "learning_rate": 9.799392587652979e-05, + "loss": 3.3319, + "step": 7079 + }, + { + "epoch": 4.562127316680097, + "grad_norm": 1.653749548747719, + "learning_rate": 9.799336053377957e-05, + "loss": 3.4145, + "step": 7080 + }, + { + "epoch": 4.562771958098308, + "grad_norm": 1.4166605521010938, + "learning_rate": 9.799279511302745e-05, + "loss": 3.3772, + "step": 7081 + }, + { + "epoch": 4.563416599516519, + "grad_norm": 1.38154250471991, + "learning_rate": 9.799222961427439e-05, + "loss": 3.3623, + "step": 7082 + }, + { + "epoch": 4.5640612409347305, + "grad_norm": 1.5040792009426416, + "learning_rate": 9.799166403752129e-05, + "loss": 3.2524, + "step": 7083 + }, + { + "epoch": 4.564705882352941, + "grad_norm": 1.7369906395099832, + "learning_rate": 9.799109838276909e-05, + "loss": 3.346, + "step": 7084 + }, + { + "epoch": 4.565350523771152, + "grad_norm": 1.3927358204987215, + "learning_rate": 9.799053265001871e-05, + "loss": 3.2537, + "step": 7085 + }, + { + "epoch": 4.5659951651893635, + "grad_norm": 1.5229103357921818, + "learning_rate": 9.798996683927109e-05, + "loss": 3.5682, + "step": 7086 + }, + { + "epoch": 4.566639806607575, + "grad_norm": 1.5695242460643182, + "learning_rate": 9.798940095052716e-05, + "loss": 3.4291, + "step": 7087 + }, + { + "epoch": 4.567284448025786, + "grad_norm": 1.708309715478387, + "learning_rate": 9.798883498378785e-05, + "loss": 3.2855, + "step": 7088 + }, + { + "epoch": 4.567929089443997, + "grad_norm": 1.5436487301384634, + "learning_rate": 9.798826893905408e-05, + "loss": 3.6114, + "step": 7089 + }, + { + "epoch": 4.568573730862208, + "grad_norm": 1.7439433251103027, + "learning_rate": 9.798770281632677e-05, + "loss": 3.5381, + "step": 7090 + }, + { + "epoch": 4.569218372280419, + "grad_norm": 1.822337677026748, + "learning_rate": 9.798713661560687e-05, + "loss": 3.1144, + "step": 7091 + }, + { + "epoch": 4.5698630136986305, + "grad_norm": 1.7854018723681795, + "learning_rate": 9.79865703368953e-05, + "loss": 3.588, + "step": 7092 + }, + { + "epoch": 4.570507655116841, + "grad_norm": 1.5341085814824846, + "learning_rate": 9.798600398019301e-05, + "loss": 3.4172, + "step": 7093 + }, + { + "epoch": 4.571152296535052, + "grad_norm": 1.6300267450917012, + "learning_rate": 9.798543754550091e-05, + "loss": 3.638, + "step": 7094 + }, + { + "epoch": 4.5717969379532635, + "grad_norm": 1.3599350543786513, + "learning_rate": 9.798487103281992e-05, + "loss": 3.8832, + "step": 7095 + }, + { + "epoch": 4.572441579371475, + "grad_norm": 1.6058049579762932, + "learning_rate": 9.798430444215099e-05, + "loss": 3.6356, + "step": 7096 + }, + { + "epoch": 4.573086220789686, + "grad_norm": 1.425883032159409, + "learning_rate": 9.798373777349506e-05, + "loss": 3.5649, + "step": 7097 + }, + { + "epoch": 4.5737308622078965, + "grad_norm": 1.794180493891417, + "learning_rate": 9.798317102685303e-05, + "loss": 3.4787, + "step": 7098 + }, + { + "epoch": 4.574375503626108, + "grad_norm": 1.7340805719309016, + "learning_rate": 9.798260420222585e-05, + "loss": 3.2939, + "step": 7099 + }, + { + "epoch": 4.575020145044319, + "grad_norm": 1.4714068063035444, + "learning_rate": 9.798203729961443e-05, + "loss": 3.1376, + "step": 7100 + }, + { + "epoch": 4.575020145044319, + "eval_loss": 4.1493754386901855, + "eval_runtime": 2.984, + "eval_samples_per_second": 33.512, + "eval_steps_per_second": 4.357, + "step": 7100 + }, + { + "epoch": 4.57566478646253, + "grad_norm": 1.5911200917342725, + "learning_rate": 9.798147031901973e-05, + "loss": 3.6056, + "step": 7101 + }, + { + "epoch": 4.576309427880742, + "grad_norm": 1.6642759952263335, + "learning_rate": 9.798090326044269e-05, + "loss": 3.7047, + "step": 7102 + }, + { + "epoch": 4.576954069298952, + "grad_norm": 1.418022670655498, + "learning_rate": 9.79803361238842e-05, + "loss": 3.2445, + "step": 7103 + }, + { + "epoch": 4.577598710717163, + "grad_norm": 1.4126731577226026, + "learning_rate": 9.797976890934522e-05, + "loss": 3.3807, + "step": 7104 + }, + { + "epoch": 4.578243352135375, + "grad_norm": 1.6906872224410288, + "learning_rate": 9.797920161682666e-05, + "loss": 3.5151, + "step": 7105 + }, + { + "epoch": 4.578887993553586, + "grad_norm": 1.6147937384262077, + "learning_rate": 9.797863424632948e-05, + "loss": 3.3872, + "step": 7106 + }, + { + "epoch": 4.579532634971797, + "grad_norm": 1.9979268581575087, + "learning_rate": 9.797806679785459e-05, + "loss": 3.5978, + "step": 7107 + }, + { + "epoch": 4.580177276390008, + "grad_norm": 1.8257943578542424, + "learning_rate": 9.797749927140293e-05, + "loss": 3.3348, + "step": 7108 + }, + { + "epoch": 4.580821917808219, + "grad_norm": 1.635715220531596, + "learning_rate": 9.797693166697543e-05, + "loss": 3.1711, + "step": 7109 + }, + { + "epoch": 4.58146655922643, + "grad_norm": 2.0202166385404716, + "learning_rate": 9.797636398457304e-05, + "loss": 3.438, + "step": 7110 + }, + { + "epoch": 4.582111200644642, + "grad_norm": 1.377439394504826, + "learning_rate": 9.797579622419667e-05, + "loss": 3.591, + "step": 7111 + }, + { + "epoch": 4.582755842062853, + "grad_norm": 1.6188398962923811, + "learning_rate": 9.797522838584724e-05, + "loss": 3.2316, + "step": 7112 + }, + { + "epoch": 4.583400483481063, + "grad_norm": 1.7706854657218292, + "learning_rate": 9.797466046952573e-05, + "loss": 3.2455, + "step": 7113 + }, + { + "epoch": 4.584045124899275, + "grad_norm": 1.53570735241034, + "learning_rate": 9.797409247523303e-05, + "loss": 3.2153, + "step": 7114 + }, + { + "epoch": 4.584689766317486, + "grad_norm": 1.6940966054970217, + "learning_rate": 9.797352440297008e-05, + "loss": 3.3294, + "step": 7115 + }, + { + "epoch": 4.585334407735697, + "grad_norm": 1.4187879652320639, + "learning_rate": 9.797295625273784e-05, + "loss": 3.5516, + "step": 7116 + }, + { + "epoch": 4.5859790491539085, + "grad_norm": 1.5819243007552635, + "learning_rate": 9.797238802453721e-05, + "loss": 3.3904, + "step": 7117 + }, + { + "epoch": 4.586623690572119, + "grad_norm": 1.822746269780821, + "learning_rate": 9.797181971836913e-05, + "loss": 3.2848, + "step": 7118 + }, + { + "epoch": 4.58726833199033, + "grad_norm": 1.4589455136636906, + "learning_rate": 9.797125133423454e-05, + "loss": 3.3041, + "step": 7119 + }, + { + "epoch": 4.5879129734085415, + "grad_norm": 1.647388227167389, + "learning_rate": 9.79706828721344e-05, + "loss": 3.5695, + "step": 7120 + }, + { + "epoch": 4.588557614826753, + "grad_norm": 1.8759608228076192, + "learning_rate": 9.797011433206957e-05, + "loss": 3.7613, + "step": 7121 + }, + { + "epoch": 4.589202256244963, + "grad_norm": 1.1640763680885184, + "learning_rate": 9.796954571404106e-05, + "loss": 3.4581, + "step": 7122 + }, + { + "epoch": 4.5898468976631746, + "grad_norm": 1.4536468726643326, + "learning_rate": 9.796897701804976e-05, + "loss": 3.4635, + "step": 7123 + }, + { + "epoch": 4.590491539081386, + "grad_norm": 1.2543951629875754, + "learning_rate": 9.796840824409664e-05, + "loss": 3.3732, + "step": 7124 + }, + { + "epoch": 4.591136180499597, + "grad_norm": 1.2897808244402906, + "learning_rate": 9.79678393921826e-05, + "loss": 3.7362, + "step": 7125 + }, + { + "epoch": 4.5917808219178085, + "grad_norm": 1.3442551193713768, + "learning_rate": 9.796727046230859e-05, + "loss": 3.6366, + "step": 7126 + }, + { + "epoch": 4.592425463336019, + "grad_norm": 1.2631441524713196, + "learning_rate": 9.796670145447554e-05, + "loss": 3.4574, + "step": 7127 + }, + { + "epoch": 4.59307010475423, + "grad_norm": 1.3878687963890068, + "learning_rate": 9.796613236868439e-05, + "loss": 3.461, + "step": 7128 + }, + { + "epoch": 4.5937147461724415, + "grad_norm": 1.3871319632215608, + "learning_rate": 9.796556320493605e-05, + "loss": 3.6094, + "step": 7129 + }, + { + "epoch": 4.594359387590653, + "grad_norm": 1.3416462025696658, + "learning_rate": 9.79649939632315e-05, + "loss": 3.3816, + "step": 7130 + }, + { + "epoch": 4.595004029008864, + "grad_norm": 1.4751611618074436, + "learning_rate": 9.796442464357163e-05, + "loss": 3.2376, + "step": 7131 + }, + { + "epoch": 4.5956486704270745, + "grad_norm": 1.5307060209882786, + "learning_rate": 9.79638552459574e-05, + "loss": 3.1339, + "step": 7132 + }, + { + "epoch": 4.596293311845286, + "grad_norm": 1.3147299401603103, + "learning_rate": 9.796328577038976e-05, + "loss": 3.1781, + "step": 7133 + }, + { + "epoch": 4.596937953263497, + "grad_norm": 1.3644489772800408, + "learning_rate": 9.796271621686959e-05, + "loss": 3.368, + "step": 7134 + }, + { + "epoch": 4.597582594681708, + "grad_norm": 1.3313407665007952, + "learning_rate": 9.796214658539788e-05, + "loss": 3.2598, + "step": 7135 + }, + { + "epoch": 4.59822723609992, + "grad_norm": 1.6197283379441234, + "learning_rate": 9.796157687597556e-05, + "loss": 3.437, + "step": 7136 + }, + { + "epoch": 4.59887187751813, + "grad_norm": 1.501882215460088, + "learning_rate": 9.796100708860354e-05, + "loss": 3.6201, + "step": 7137 + }, + { + "epoch": 4.599516518936341, + "grad_norm": 1.3808697022077323, + "learning_rate": 9.796043722328277e-05, + "loss": 3.6964, + "step": 7138 + }, + { + "epoch": 4.600161160354553, + "grad_norm": 1.5961268980128465, + "learning_rate": 9.795986728001418e-05, + "loss": 3.774, + "step": 7139 + }, + { + "epoch": 4.600805801772764, + "grad_norm": 1.3774008385304415, + "learning_rate": 9.795929725879871e-05, + "loss": 3.6509, + "step": 7140 + }, + { + "epoch": 4.601450443190975, + "grad_norm": 1.6017750086905136, + "learning_rate": 9.795872715963729e-05, + "loss": 3.4455, + "step": 7141 + }, + { + "epoch": 4.602095084609186, + "grad_norm": 1.4021130117621283, + "learning_rate": 9.795815698253088e-05, + "loss": 3.2147, + "step": 7142 + }, + { + "epoch": 4.602739726027397, + "grad_norm": 1.291134135741415, + "learning_rate": 9.795758672748039e-05, + "loss": 3.3897, + "step": 7143 + }, + { + "epoch": 4.603384367445608, + "grad_norm": 1.8102053603640154, + "learning_rate": 9.795701639448676e-05, + "loss": 3.2808, + "step": 7144 + }, + { + "epoch": 4.60402900886382, + "grad_norm": 1.4149521418558566, + "learning_rate": 9.795644598355093e-05, + "loss": 3.2004, + "step": 7145 + }, + { + "epoch": 4.604673650282031, + "grad_norm": 1.6595818350806484, + "learning_rate": 9.795587549467385e-05, + "loss": 3.4344, + "step": 7146 + }, + { + "epoch": 4.605318291700241, + "grad_norm": 1.2747682941484921, + "learning_rate": 9.795530492785644e-05, + "loss": 3.3177, + "step": 7147 + }, + { + "epoch": 4.605962933118453, + "grad_norm": 1.499213708819945, + "learning_rate": 9.795473428309965e-05, + "loss": 3.6922, + "step": 7148 + }, + { + "epoch": 4.606607574536664, + "grad_norm": 1.419260151576891, + "learning_rate": 9.79541635604044e-05, + "loss": 3.4812, + "step": 7149 + }, + { + "epoch": 4.607252215954875, + "grad_norm": 1.3605736732295155, + "learning_rate": 9.795359275977163e-05, + "loss": 3.6236, + "step": 7150 + }, + { + "epoch": 4.6078968573730865, + "grad_norm": 1.4623041503377856, + "learning_rate": 9.79530218812023e-05, + "loss": 3.7878, + "step": 7151 + }, + { + "epoch": 4.608541498791297, + "grad_norm": 1.8203574073771802, + "learning_rate": 9.795245092469735e-05, + "loss": 3.4078, + "step": 7152 + }, + { + "epoch": 4.609186140209508, + "grad_norm": 1.5896109666000822, + "learning_rate": 9.795187989025767e-05, + "loss": 3.6899, + "step": 7153 + }, + { + "epoch": 4.6098307816277195, + "grad_norm": 1.6698505473119625, + "learning_rate": 9.795130877788423e-05, + "loss": 3.4876, + "step": 7154 + }, + { + "epoch": 4.610475423045931, + "grad_norm": 1.5083591694214977, + "learning_rate": 9.795073758757798e-05, + "loss": 3.166, + "step": 7155 + }, + { + "epoch": 4.611120064464142, + "grad_norm": 1.4624172768659243, + "learning_rate": 9.795016631933985e-05, + "loss": 3.1963, + "step": 7156 + }, + { + "epoch": 4.6117647058823525, + "grad_norm": 1.2554683135537759, + "learning_rate": 9.794959497317075e-05, + "loss": 3.0469, + "step": 7157 + }, + { + "epoch": 4.612409347300564, + "grad_norm": 1.3339540996687274, + "learning_rate": 9.794902354907166e-05, + "loss": 3.2769, + "step": 7158 + }, + { + "epoch": 4.613053988718775, + "grad_norm": 1.5442577477597763, + "learning_rate": 9.794845204704348e-05, + "loss": 3.4491, + "step": 7159 + }, + { + "epoch": 4.6136986301369864, + "grad_norm": 1.2796793050207025, + "learning_rate": 9.794788046708718e-05, + "loss": 2.8901, + "step": 7160 + }, + { + "epoch": 4.614343271555198, + "grad_norm": 1.7550394782818632, + "learning_rate": 9.79473088092037e-05, + "loss": 3.2235, + "step": 7161 + }, + { + "epoch": 4.614987912973408, + "grad_norm": 1.2027655584953556, + "learning_rate": 9.794673707339395e-05, + "loss": 3.152, + "step": 7162 + }, + { + "epoch": 4.6156325543916195, + "grad_norm": 1.8250459800492602, + "learning_rate": 9.794616525965888e-05, + "loss": 3.4243, + "step": 7163 + }, + { + "epoch": 4.616277195809831, + "grad_norm": 1.6039699380466648, + "learning_rate": 9.794559336799944e-05, + "loss": 3.5025, + "step": 7164 + }, + { + "epoch": 4.616921837228042, + "grad_norm": 1.490898632490818, + "learning_rate": 9.794502139841657e-05, + "loss": 3.0318, + "step": 7165 + }, + { + "epoch": 4.617566478646253, + "grad_norm": 1.5787376963685436, + "learning_rate": 9.79444493509112e-05, + "loss": 3.531, + "step": 7166 + }, + { + "epoch": 4.618211120064464, + "grad_norm": 1.4199520249041295, + "learning_rate": 9.794387722548428e-05, + "loss": 3.3894, + "step": 7167 + }, + { + "epoch": 4.618855761482675, + "grad_norm": 1.6905807254801506, + "learning_rate": 9.79433050221367e-05, + "loss": 3.1003, + "step": 7168 + }, + { + "epoch": 4.619500402900886, + "grad_norm": 1.667680630064425, + "learning_rate": 9.794273274086948e-05, + "loss": 3.1571, + "step": 7169 + }, + { + "epoch": 4.620145044319098, + "grad_norm": 1.920280757685309, + "learning_rate": 9.794216038168352e-05, + "loss": 3.0985, + "step": 7170 + }, + { + "epoch": 4.620789685737309, + "grad_norm": 2.107282113858085, + "learning_rate": 9.794158794457976e-05, + "loss": 3.5423, + "step": 7171 + }, + { + "epoch": 4.621434327155519, + "grad_norm": 1.829167527128166, + "learning_rate": 9.794101542955912e-05, + "loss": 3.0312, + "step": 7172 + }, + { + "epoch": 4.622078968573731, + "grad_norm": 2.01363012701135, + "learning_rate": 9.794044283662258e-05, + "loss": 3.4851, + "step": 7173 + }, + { + "epoch": 4.622723609991942, + "grad_norm": 1.429956053120847, + "learning_rate": 9.793987016577106e-05, + "loss": 3.2127, + "step": 7174 + }, + { + "epoch": 4.623368251410153, + "grad_norm": 2.2222663284006337, + "learning_rate": 9.79392974170055e-05, + "loss": 3.5326, + "step": 7175 + }, + { + "epoch": 4.624012892828365, + "grad_norm": 1.482174181014998, + "learning_rate": 9.793872459032684e-05, + "loss": 3.7067, + "step": 7176 + }, + { + "epoch": 4.624657534246575, + "grad_norm": 1.5247627425961168, + "learning_rate": 9.793815168573603e-05, + "loss": 3.5637, + "step": 7177 + }, + { + "epoch": 4.625302175664786, + "grad_norm": 1.6030629888460257, + "learning_rate": 9.7937578703234e-05, + "loss": 3.1731, + "step": 7178 + }, + { + "epoch": 4.625946817082998, + "grad_norm": 1.7968381406721037, + "learning_rate": 9.793700564282171e-05, + "loss": 3.7344, + "step": 7179 + }, + { + "epoch": 4.626591458501209, + "grad_norm": 1.4470398364558308, + "learning_rate": 9.793643250450006e-05, + "loss": 3.4015, + "step": 7180 + }, + { + "epoch": 4.62723609991942, + "grad_norm": 1.948636476636696, + "learning_rate": 9.793585928827005e-05, + "loss": 3.339, + "step": 7181 + }, + { + "epoch": 4.627880741337631, + "grad_norm": 1.368112239764551, + "learning_rate": 9.793528599413258e-05, + "loss": 3.4084, + "step": 7182 + }, + { + "epoch": 4.628525382755842, + "grad_norm": 2.1020593670332555, + "learning_rate": 9.793471262208859e-05, + "loss": 3.3799, + "step": 7183 + }, + { + "epoch": 4.629170024174053, + "grad_norm": 1.6863905942248045, + "learning_rate": 9.793413917213905e-05, + "loss": 3.7818, + "step": 7184 + }, + { + "epoch": 4.6298146655922645, + "grad_norm": 1.8012123613725977, + "learning_rate": 9.793356564428487e-05, + "loss": 3.297, + "step": 7185 + }, + { + "epoch": 4.630459307010476, + "grad_norm": 1.8103778380475999, + "learning_rate": 9.7932992038527e-05, + "loss": 3.4675, + "step": 7186 + }, + { + "epoch": 4.631103948428686, + "grad_norm": 2.028694824050997, + "learning_rate": 9.793241835486641e-05, + "loss": 3.4024, + "step": 7187 + }, + { + "epoch": 4.6317485898468975, + "grad_norm": 1.6807113053386933, + "learning_rate": 9.793184459330402e-05, + "loss": 3.0978, + "step": 7188 + }, + { + "epoch": 4.632393231265109, + "grad_norm": 1.9129312846134898, + "learning_rate": 9.793127075384077e-05, + "loss": 3.3164, + "step": 7189 + }, + { + "epoch": 4.63303787268332, + "grad_norm": 1.3111718410840094, + "learning_rate": 9.793069683647759e-05, + "loss": 3.6947, + "step": 7190 + }, + { + "epoch": 4.633682514101531, + "grad_norm": 1.7271428085750071, + "learning_rate": 9.793012284121546e-05, + "loss": 3.3992, + "step": 7191 + }, + { + "epoch": 4.634327155519742, + "grad_norm": 1.632208626782183, + "learning_rate": 9.792954876805529e-05, + "loss": 3.2926, + "step": 7192 + }, + { + "epoch": 4.634971796937953, + "grad_norm": 1.8771926370351, + "learning_rate": 9.792897461699804e-05, + "loss": 3.1761, + "step": 7193 + }, + { + "epoch": 4.635616438356164, + "grad_norm": 1.5069829600330313, + "learning_rate": 9.792840038804465e-05, + "loss": 3.4064, + "step": 7194 + }, + { + "epoch": 4.636261079774376, + "grad_norm": 1.7443846897234077, + "learning_rate": 9.792782608119606e-05, + "loss": 3.5356, + "step": 7195 + }, + { + "epoch": 4.636905721192587, + "grad_norm": 1.3343726607982742, + "learning_rate": 9.79272516964532e-05, + "loss": 3.5602, + "step": 7196 + }, + { + "epoch": 4.6375503626107974, + "grad_norm": 1.7970110263463583, + "learning_rate": 9.792667723381704e-05, + "loss": 3.1383, + "step": 7197 + }, + { + "epoch": 4.638195004029009, + "grad_norm": 1.601068488360747, + "learning_rate": 9.792610269328851e-05, + "loss": 3.1581, + "step": 7198 + }, + { + "epoch": 4.63883964544722, + "grad_norm": 1.7501824573527627, + "learning_rate": 9.792552807486855e-05, + "loss": 3.0339, + "step": 7199 + }, + { + "epoch": 4.639484286865431, + "grad_norm": 1.7683022716289702, + "learning_rate": 9.79249533785581e-05, + "loss": 3.3417, + "step": 7200 + }, + { + "epoch": 4.639484286865431, + "eval_loss": 4.170380115509033, + "eval_runtime": 2.9894, + "eval_samples_per_second": 33.451, + "eval_steps_per_second": 4.349, + "step": 7200 + }, + { + "epoch": 4.640128928283643, + "grad_norm": 1.807170330282197, + "learning_rate": 9.792437860435813e-05, + "loss": 3.7144, + "step": 7201 + }, + { + "epoch": 4.640773569701853, + "grad_norm": 1.5499267921029927, + "learning_rate": 9.792380375226956e-05, + "loss": 3.2277, + "step": 7202 + }, + { + "epoch": 4.641418211120064, + "grad_norm": 1.9287839183861224, + "learning_rate": 9.792322882229334e-05, + "loss": 3.4448, + "step": 7203 + }, + { + "epoch": 4.642062852538276, + "grad_norm": 1.4719030243400093, + "learning_rate": 9.79226538144304e-05, + "loss": 3.2911, + "step": 7204 + }, + { + "epoch": 4.642707493956487, + "grad_norm": 1.8307089760853987, + "learning_rate": 9.79220787286817e-05, + "loss": 3.3957, + "step": 7205 + }, + { + "epoch": 4.643352135374698, + "grad_norm": 1.5820417196355878, + "learning_rate": 9.79215035650482e-05, + "loss": 3.2935, + "step": 7206 + }, + { + "epoch": 4.643996776792909, + "grad_norm": 1.6463608315113687, + "learning_rate": 9.792092832353081e-05, + "loss": 3.4718, + "step": 7207 + }, + { + "epoch": 4.64464141821112, + "grad_norm": 1.8340895199465943, + "learning_rate": 9.792035300413051e-05, + "loss": 3.4731, + "step": 7208 + }, + { + "epoch": 4.645286059629331, + "grad_norm": 1.6486153166780733, + "learning_rate": 9.791977760684821e-05, + "loss": 3.3233, + "step": 7209 + }, + { + "epoch": 4.645930701047543, + "grad_norm": 1.8623668854599724, + "learning_rate": 9.791920213168488e-05, + "loss": 3.3269, + "step": 7210 + }, + { + "epoch": 4.646575342465754, + "grad_norm": 1.512381485240537, + "learning_rate": 9.791862657864146e-05, + "loss": 3.5332, + "step": 7211 + }, + { + "epoch": 4.647219983883964, + "grad_norm": 1.9421725295613725, + "learning_rate": 9.791805094771888e-05, + "loss": 3.6917, + "step": 7212 + }, + { + "epoch": 4.647864625302176, + "grad_norm": 1.8013999108673977, + "learning_rate": 9.791747523891812e-05, + "loss": 3.4312, + "step": 7213 + }, + { + "epoch": 4.648509266720387, + "grad_norm": 1.7181833252894307, + "learning_rate": 9.791689945224008e-05, + "loss": 3.1492, + "step": 7214 + }, + { + "epoch": 4.649153908138598, + "grad_norm": 1.6537862333033162, + "learning_rate": 9.791632358768576e-05, + "loss": 3.1819, + "step": 7215 + }, + { + "epoch": 4.6497985495568095, + "grad_norm": 1.6949269850009512, + "learning_rate": 9.791574764525605e-05, + "loss": 3.013, + "step": 7216 + }, + { + "epoch": 4.65044319097502, + "grad_norm": 1.5718447278092276, + "learning_rate": 9.791517162495192e-05, + "loss": 3.533, + "step": 7217 + }, + { + "epoch": 4.651087832393231, + "grad_norm": 1.7184999036003186, + "learning_rate": 9.791459552677432e-05, + "loss": 3.1989, + "step": 7218 + }, + { + "epoch": 4.6517324738114425, + "grad_norm": 1.5121836102047612, + "learning_rate": 9.79140193507242e-05, + "loss": 3.5607, + "step": 7219 + }, + { + "epoch": 4.652377115229654, + "grad_norm": 1.5253478932344748, + "learning_rate": 9.791344309680249e-05, + "loss": 3.3101, + "step": 7220 + }, + { + "epoch": 4.653021756647865, + "grad_norm": 1.282340953640996, + "learning_rate": 9.791286676501016e-05, + "loss": 3.1898, + "step": 7221 + }, + { + "epoch": 4.6536663980660755, + "grad_norm": 1.4678357432139506, + "learning_rate": 9.791229035534813e-05, + "loss": 3.6849, + "step": 7222 + }, + { + "epoch": 4.654311039484287, + "grad_norm": 1.3595164181479524, + "learning_rate": 9.791171386781736e-05, + "loss": 3.4554, + "step": 7223 + }, + { + "epoch": 4.654955680902498, + "grad_norm": 1.463833557545455, + "learning_rate": 9.791113730241882e-05, + "loss": 3.2801, + "step": 7224 + }, + { + "epoch": 4.655600322320709, + "grad_norm": 1.3070641682277704, + "learning_rate": 9.79105606591534e-05, + "loss": 3.2825, + "step": 7225 + }, + { + "epoch": 4.656244963738921, + "grad_norm": 1.3655971850867523, + "learning_rate": 9.790998393802208e-05, + "loss": 3.3586, + "step": 7226 + }, + { + "epoch": 4.656889605157131, + "grad_norm": 1.5568944350628713, + "learning_rate": 9.790940713902584e-05, + "loss": 3.87, + "step": 7227 + }, + { + "epoch": 4.657534246575342, + "grad_norm": 1.7657277774385778, + "learning_rate": 9.790883026216556e-05, + "loss": 3.3713, + "step": 7228 + }, + { + "epoch": 4.658178887993554, + "grad_norm": 1.2303511153586033, + "learning_rate": 9.790825330744223e-05, + "loss": 3.5004, + "step": 7229 + }, + { + "epoch": 4.658823529411765, + "grad_norm": 1.8584600446156188, + "learning_rate": 9.790767627485679e-05, + "loss": 3.6888, + "step": 7230 + }, + { + "epoch": 4.659468170829975, + "grad_norm": 1.4190883190074612, + "learning_rate": 9.79070991644102e-05, + "loss": 3.3208, + "step": 7231 + }, + { + "epoch": 4.660112812248187, + "grad_norm": 1.6230049691465787, + "learning_rate": 9.79065219761034e-05, + "loss": 3.5058, + "step": 7232 + }, + { + "epoch": 4.660757453666398, + "grad_norm": 1.2214333411347573, + "learning_rate": 9.79059447099373e-05, + "loss": 3.3397, + "step": 7233 + }, + { + "epoch": 4.661402095084609, + "grad_norm": 1.7637174272398002, + "learning_rate": 9.790536736591291e-05, + "loss": 3.0793, + "step": 7234 + }, + { + "epoch": 4.662046736502821, + "grad_norm": 1.2796359051567536, + "learning_rate": 9.790478994403114e-05, + "loss": 3.4732, + "step": 7235 + }, + { + "epoch": 4.662691377921031, + "grad_norm": 1.435931235532467, + "learning_rate": 9.790421244429293e-05, + "loss": 3.0558, + "step": 7236 + }, + { + "epoch": 4.663336019339242, + "grad_norm": 1.3669037907378192, + "learning_rate": 9.790363486669928e-05, + "loss": 3.3654, + "step": 7237 + }, + { + "epoch": 4.663980660757454, + "grad_norm": 1.8462859693501175, + "learning_rate": 9.790305721125108e-05, + "loss": 3.1096, + "step": 7238 + }, + { + "epoch": 4.664625302175665, + "grad_norm": 1.9115195490222057, + "learning_rate": 9.790247947794931e-05, + "loss": 3.4493, + "step": 7239 + }, + { + "epoch": 4.665269943593876, + "grad_norm": 1.5005281924010023, + "learning_rate": 9.790190166679491e-05, + "loss": 3.4013, + "step": 7240 + }, + { + "epoch": 4.665914585012087, + "grad_norm": 1.440022042585719, + "learning_rate": 9.790132377778885e-05, + "loss": 3.4382, + "step": 7241 + }, + { + "epoch": 4.666559226430298, + "grad_norm": 1.2538084896661854, + "learning_rate": 9.790074581093204e-05, + "loss": 3.6435, + "step": 7242 + }, + { + "epoch": 4.667203867848509, + "grad_norm": 1.4910439122782937, + "learning_rate": 9.790016776622545e-05, + "loss": 3.3713, + "step": 7243 + }, + { + "epoch": 4.667848509266721, + "grad_norm": 1.463826209721453, + "learning_rate": 9.789958964367004e-05, + "loss": 3.1847, + "step": 7244 + }, + { + "epoch": 4.668493150684932, + "grad_norm": 1.6580257018421405, + "learning_rate": 9.789901144326674e-05, + "loss": 3.7253, + "step": 7245 + }, + { + "epoch": 4.669137792103142, + "grad_norm": 1.78877088608762, + "learning_rate": 9.789843316501651e-05, + "loss": 3.5497, + "step": 7246 + }, + { + "epoch": 4.669782433521354, + "grad_norm": 1.3623205092426176, + "learning_rate": 9.789785480892029e-05, + "loss": 3.2528, + "step": 7247 + }, + { + "epoch": 4.670427074939565, + "grad_norm": 1.3995914513567569, + "learning_rate": 9.789727637497906e-05, + "loss": 3.3827, + "step": 7248 + }, + { + "epoch": 4.671071716357776, + "grad_norm": 1.4424531063397432, + "learning_rate": 9.789669786319373e-05, + "loss": 3.5428, + "step": 7249 + }, + { + "epoch": 4.6717163577759875, + "grad_norm": 1.7779827909935684, + "learning_rate": 9.789611927356525e-05, + "loss": 3.2792, + "step": 7250 + }, + { + "epoch": 4.672360999194198, + "grad_norm": 1.391499176097527, + "learning_rate": 9.789554060609461e-05, + "loss": 3.1522, + "step": 7251 + }, + { + "epoch": 4.673005640612409, + "grad_norm": 1.383858657985127, + "learning_rate": 9.789496186078275e-05, + "loss": 3.3279, + "step": 7252 + }, + { + "epoch": 4.6736502820306205, + "grad_norm": 1.356936129597094, + "learning_rate": 9.78943830376306e-05, + "loss": 3.2467, + "step": 7253 + }, + { + "epoch": 4.674294923448832, + "grad_norm": 1.458490274914813, + "learning_rate": 9.78938041366391e-05, + "loss": 3.4588, + "step": 7254 + }, + { + "epoch": 4.674939564867043, + "grad_norm": 1.529759013574791, + "learning_rate": 9.789322515780925e-05, + "loss": 3.5223, + "step": 7255 + }, + { + "epoch": 4.6755842062852535, + "grad_norm": 1.4884945312603286, + "learning_rate": 9.789264610114195e-05, + "loss": 3.5157, + "step": 7256 + }, + { + "epoch": 4.676228847703465, + "grad_norm": 1.4542642686377931, + "learning_rate": 9.789206696663818e-05, + "loss": 3.4264, + "step": 7257 + }, + { + "epoch": 4.676873489121676, + "grad_norm": 1.6006196813161788, + "learning_rate": 9.789148775429888e-05, + "loss": 3.5132, + "step": 7258 + }, + { + "epoch": 4.677518130539887, + "grad_norm": 1.497666652400072, + "learning_rate": 9.789090846412501e-05, + "loss": 3.0343, + "step": 7259 + }, + { + "epoch": 4.678162771958098, + "grad_norm": 1.5072482763289508, + "learning_rate": 9.789032909611752e-05, + "loss": 3.5391, + "step": 7260 + }, + { + "epoch": 4.678807413376309, + "grad_norm": 1.5256053193454135, + "learning_rate": 9.788974965027734e-05, + "loss": 3.5239, + "step": 7261 + }, + { + "epoch": 4.67945205479452, + "grad_norm": 1.2713320295015162, + "learning_rate": 9.788917012660545e-05, + "loss": 3.712, + "step": 7262 + }, + { + "epoch": 4.680096696212732, + "grad_norm": 1.224566004962834, + "learning_rate": 9.788859052510281e-05, + "loss": 3.3743, + "step": 7263 + }, + { + "epoch": 4.680741337630943, + "grad_norm": 1.2592196401483133, + "learning_rate": 9.788801084577032e-05, + "loss": 3.2158, + "step": 7264 + }, + { + "epoch": 4.681385979049153, + "grad_norm": 1.4342900620384116, + "learning_rate": 9.788743108860899e-05, + "loss": 3.2313, + "step": 7265 + }, + { + "epoch": 4.682030620467365, + "grad_norm": 1.2551976121207271, + "learning_rate": 9.788685125361972e-05, + "loss": 3.4597, + "step": 7266 + }, + { + "epoch": 4.682675261885576, + "grad_norm": 1.5112228453946748, + "learning_rate": 9.788627134080351e-05, + "loss": 3.3159, + "step": 7267 + }, + { + "epoch": 4.683319903303787, + "grad_norm": 1.4270572604553866, + "learning_rate": 9.788569135016128e-05, + "loss": 3.3919, + "step": 7268 + }, + { + "epoch": 4.683964544721999, + "grad_norm": 1.3660344739210397, + "learning_rate": 9.788511128169403e-05, + "loss": 3.5585, + "step": 7269 + }, + { + "epoch": 4.684609186140209, + "grad_norm": 1.613942024250894, + "learning_rate": 9.788453113540264e-05, + "loss": 3.5815, + "step": 7270 + }, + { + "epoch": 4.68525382755842, + "grad_norm": 1.4643974819332952, + "learning_rate": 9.78839509112881e-05, + "loss": 3.1589, + "step": 7271 + }, + { + "epoch": 4.685898468976632, + "grad_norm": 1.67497379103937, + "learning_rate": 9.788337060935139e-05, + "loss": 3.1703, + "step": 7272 + }, + { + "epoch": 4.686543110394843, + "grad_norm": 1.842880088130242, + "learning_rate": 9.788279022959341e-05, + "loss": 3.3912, + "step": 7273 + }, + { + "epoch": 4.687187751813054, + "grad_norm": 5.171330859378235, + "learning_rate": 9.788220977201516e-05, + "loss": 3.4703, + "step": 7274 + }, + { + "epoch": 4.687832393231265, + "grad_norm": 1.6046161131910484, + "learning_rate": 9.788162923661756e-05, + "loss": 2.9731, + "step": 7275 + }, + { + "epoch": 4.688477034649476, + "grad_norm": 1.9732384034096027, + "learning_rate": 9.788104862340158e-05, + "loss": 3.2782, + "step": 7276 + }, + { + "epoch": 4.689121676067687, + "grad_norm": 1.4517810850476318, + "learning_rate": 9.788046793236816e-05, + "loss": 3.1147, + "step": 7277 + }, + { + "epoch": 4.689766317485899, + "grad_norm": 1.646344712711364, + "learning_rate": 9.787988716351828e-05, + "loss": 3.1342, + "step": 7278 + }, + { + "epoch": 4.69041095890411, + "grad_norm": 1.7056949987300845, + "learning_rate": 9.787930631685285e-05, + "loss": 3.1219, + "step": 7279 + }, + { + "epoch": 4.69105560032232, + "grad_norm": 1.5527513862663955, + "learning_rate": 9.787872539237288e-05, + "loss": 3.3653, + "step": 7280 + }, + { + "epoch": 4.691700241740532, + "grad_norm": 1.493219848160238, + "learning_rate": 9.787814439007928e-05, + "loss": 3.5097, + "step": 7281 + }, + { + "epoch": 4.692344883158743, + "grad_norm": 1.635570303169889, + "learning_rate": 9.787756330997302e-05, + "loss": 3.3613, + "step": 7282 + }, + { + "epoch": 4.692989524576954, + "grad_norm": 1.4569888459076716, + "learning_rate": 9.787698215205506e-05, + "loss": 3.2133, + "step": 7283 + }, + { + "epoch": 4.6936341659951655, + "grad_norm": 1.4493545359818376, + "learning_rate": 9.787640091632636e-05, + "loss": 3.0337, + "step": 7284 + }, + { + "epoch": 4.694278807413376, + "grad_norm": 1.9734323259645834, + "learning_rate": 9.787581960278784e-05, + "loss": 3.28, + "step": 7285 + }, + { + "epoch": 4.694923448831587, + "grad_norm": 1.8085197197948801, + "learning_rate": 9.78752382114405e-05, + "loss": 3.3504, + "step": 7286 + }, + { + "epoch": 4.6955680902497985, + "grad_norm": 1.90399900396138, + "learning_rate": 9.787465674228525e-05, + "loss": 3.2617, + "step": 7287 + }, + { + "epoch": 4.69621273166801, + "grad_norm": 1.7779405499379635, + "learning_rate": 9.787407519532309e-05, + "loss": 3.5691, + "step": 7288 + }, + { + "epoch": 4.696857373086221, + "grad_norm": 1.5939100735975233, + "learning_rate": 9.787349357055493e-05, + "loss": 3.198, + "step": 7289 + }, + { + "epoch": 4.6975020145044315, + "grad_norm": 1.5108970236977246, + "learning_rate": 9.787291186798176e-05, + "loss": 3.5671, + "step": 7290 + }, + { + "epoch": 4.698146655922643, + "grad_norm": 1.2684850947684942, + "learning_rate": 9.787233008760452e-05, + "loss": 3.4076, + "step": 7291 + }, + { + "epoch": 4.698791297340854, + "grad_norm": 1.488268513787416, + "learning_rate": 9.787174822942418e-05, + "loss": 3.4938, + "step": 7292 + }, + { + "epoch": 4.699435938759065, + "grad_norm": 1.2571368150163795, + "learning_rate": 9.787116629344166e-05, + "loss": 3.4473, + "step": 7293 + }, + { + "epoch": 4.700080580177277, + "grad_norm": 1.4702457462244662, + "learning_rate": 9.787058427965795e-05, + "loss": 3.5036, + "step": 7294 + }, + { + "epoch": 4.700725221595487, + "grad_norm": 1.4769190454680832, + "learning_rate": 9.7870002188074e-05, + "loss": 3.4684, + "step": 7295 + }, + { + "epoch": 4.701369863013698, + "grad_norm": 1.7161769462303498, + "learning_rate": 9.786942001869077e-05, + "loss": 3.3857, + "step": 7296 + }, + { + "epoch": 4.70201450443191, + "grad_norm": 1.5130052096682238, + "learning_rate": 9.786883777150921e-05, + "loss": 3.5928, + "step": 7297 + }, + { + "epoch": 4.702659145850121, + "grad_norm": 1.5584407349526872, + "learning_rate": 9.786825544653027e-05, + "loss": 3.5306, + "step": 7298 + }, + { + "epoch": 4.703303787268332, + "grad_norm": 1.5780562006200314, + "learning_rate": 9.786767304375489e-05, + "loss": 3.3979, + "step": 7299 + }, + { + "epoch": 4.703948428686543, + "grad_norm": 1.7079709434567787, + "learning_rate": 9.786709056318406e-05, + "loss": 3.369, + "step": 7300 + }, + { + "epoch": 4.703948428686543, + "eval_loss": 4.174077033996582, + "eval_runtime": 2.9687, + "eval_samples_per_second": 33.685, + "eval_steps_per_second": 4.379, + "step": 7300 + }, + { + "epoch": 4.704593070104754, + "grad_norm": 1.4334013112098238, + "learning_rate": 9.786650800481874e-05, + "loss": 3.383, + "step": 7301 + }, + { + "epoch": 4.705237711522965, + "grad_norm": 1.7027809079237937, + "learning_rate": 9.786592536865986e-05, + "loss": 3.2993, + "step": 7302 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 1.5798880171826826, + "learning_rate": 9.786534265470839e-05, + "loss": 3.4206, + "step": 7303 + }, + { + "epoch": 4.706526994359388, + "grad_norm": 1.976140988155453, + "learning_rate": 9.786475986296527e-05, + "loss": 3.4692, + "step": 7304 + }, + { + "epoch": 4.707171635777598, + "grad_norm": 1.8400606002002604, + "learning_rate": 9.786417699343149e-05, + "loss": 3.9124, + "step": 7305 + }, + { + "epoch": 4.70781627719581, + "grad_norm": 1.7111527453822615, + "learning_rate": 9.786359404610798e-05, + "loss": 3.4806, + "step": 7306 + }, + { + "epoch": 4.708460918614021, + "grad_norm": 1.9192769023514509, + "learning_rate": 9.78630110209957e-05, + "loss": 3.4381, + "step": 7307 + }, + { + "epoch": 4.709105560032232, + "grad_norm": 1.5346432276763973, + "learning_rate": 9.786242791809563e-05, + "loss": 3.5169, + "step": 7308 + }, + { + "epoch": 4.7097502014504435, + "grad_norm": 1.612077242864421, + "learning_rate": 9.78618447374087e-05, + "loss": 3.2199, + "step": 7309 + }, + { + "epoch": 4.710394842868654, + "grad_norm": 1.4768015564269297, + "learning_rate": 9.786126147893589e-05, + "loss": 3.0906, + "step": 7310 + }, + { + "epoch": 4.711039484286865, + "grad_norm": 1.8655495838607508, + "learning_rate": 9.786067814267814e-05, + "loss": 3.5329, + "step": 7311 + }, + { + "epoch": 4.7116841257050766, + "grad_norm": 1.4460435084672483, + "learning_rate": 9.786009472863642e-05, + "loss": 3.1018, + "step": 7312 + }, + { + "epoch": 4.712328767123288, + "grad_norm": 1.6573102339028354, + "learning_rate": 9.785951123681169e-05, + "loss": 3.4712, + "step": 7313 + }, + { + "epoch": 4.712973408541499, + "grad_norm": 1.4485701499020154, + "learning_rate": 9.785892766720489e-05, + "loss": 3.6887, + "step": 7314 + }, + { + "epoch": 4.71361804995971, + "grad_norm": 1.4453018950525702, + "learning_rate": 9.7858344019817e-05, + "loss": 3.5646, + "step": 7315 + }, + { + "epoch": 4.714262691377921, + "grad_norm": 1.5372807093000878, + "learning_rate": 9.785776029464894e-05, + "loss": 3.508, + "step": 7316 + }, + { + "epoch": 4.714907332796132, + "grad_norm": 1.5575601621162183, + "learning_rate": 9.785717649170174e-05, + "loss": 3.4621, + "step": 7317 + }, + { + "epoch": 4.7155519742143435, + "grad_norm": 1.4113870349566309, + "learning_rate": 9.785659261097629e-05, + "loss": 3.458, + "step": 7318 + }, + { + "epoch": 4.716196615632555, + "grad_norm": 1.3761242152019477, + "learning_rate": 9.78560086524736e-05, + "loss": 3.4012, + "step": 7319 + }, + { + "epoch": 4.716841257050765, + "grad_norm": 1.366480777786709, + "learning_rate": 9.785542461619459e-05, + "loss": 3.2696, + "step": 7320 + }, + { + "epoch": 4.7174858984689765, + "grad_norm": 1.4332515963526344, + "learning_rate": 9.785484050214022e-05, + "loss": 3.2433, + "step": 7321 + }, + { + "epoch": 4.718130539887188, + "grad_norm": 1.4592021829047574, + "learning_rate": 9.785425631031148e-05, + "loss": 3.4003, + "step": 7322 + }, + { + "epoch": 4.718775181305399, + "grad_norm": 1.6161746600306206, + "learning_rate": 9.785367204070931e-05, + "loss": 3.2484, + "step": 7323 + }, + { + "epoch": 4.71941982272361, + "grad_norm": 1.3870444036603207, + "learning_rate": 9.785308769333466e-05, + "loss": 3.3784, + "step": 7324 + }, + { + "epoch": 4.720064464141821, + "grad_norm": 1.5917905829485446, + "learning_rate": 9.785250326818851e-05, + "loss": 3.3189, + "step": 7325 + }, + { + "epoch": 4.720709105560032, + "grad_norm": 1.2966500500381941, + "learning_rate": 9.785191876527182e-05, + "loss": 3.4852, + "step": 7326 + }, + { + "epoch": 4.721353746978243, + "grad_norm": 1.3970710742676442, + "learning_rate": 9.785133418458556e-05, + "loss": 3.517, + "step": 7327 + }, + { + "epoch": 4.721998388396455, + "grad_norm": 1.6820237102452162, + "learning_rate": 9.785074952613063e-05, + "loss": 3.142, + "step": 7328 + }, + { + "epoch": 4.722643029814666, + "grad_norm": 1.5779832016089417, + "learning_rate": 9.785016478990804e-05, + "loss": 3.4984, + "step": 7329 + }, + { + "epoch": 4.723287671232876, + "grad_norm": 1.5707845227907242, + "learning_rate": 9.784957997591876e-05, + "loss": 3.5735, + "step": 7330 + }, + { + "epoch": 4.723932312651088, + "grad_norm": 1.6507692567318986, + "learning_rate": 9.784899508416373e-05, + "loss": 3.4064, + "step": 7331 + }, + { + "epoch": 4.724576954069299, + "grad_norm": 1.5278418312655642, + "learning_rate": 9.784841011464391e-05, + "loss": 3.225, + "step": 7332 + }, + { + "epoch": 4.72522159548751, + "grad_norm": 1.7103852986541226, + "learning_rate": 9.784782506736026e-05, + "loss": 3.349, + "step": 7333 + }, + { + "epoch": 4.725866236905722, + "grad_norm": 1.4268872440941593, + "learning_rate": 9.784723994231375e-05, + "loss": 3.4671, + "step": 7334 + }, + { + "epoch": 4.726510878323932, + "grad_norm": 1.769241354897618, + "learning_rate": 9.784665473950534e-05, + "loss": 3.3487, + "step": 7335 + }, + { + "epoch": 4.727155519742143, + "grad_norm": 1.8448462369562386, + "learning_rate": 9.784606945893597e-05, + "loss": 3.0936, + "step": 7336 + }, + { + "epoch": 4.727800161160355, + "grad_norm": 1.6342031972125777, + "learning_rate": 9.784548410060664e-05, + "loss": 3.7388, + "step": 7337 + }, + { + "epoch": 4.728444802578566, + "grad_norm": 1.6270030305094714, + "learning_rate": 9.784489866451828e-05, + "loss": 3.7334, + "step": 7338 + }, + { + "epoch": 4.729089443996777, + "grad_norm": 1.5430498009125915, + "learning_rate": 9.784431315067186e-05, + "loss": 3.618, + "step": 7339 + }, + { + "epoch": 4.729734085414988, + "grad_norm": 1.4272962262171232, + "learning_rate": 9.784372755906835e-05, + "loss": 3.5207, + "step": 7340 + }, + { + "epoch": 4.730378726833199, + "grad_norm": 1.82840382663936, + "learning_rate": 9.784314188970871e-05, + "loss": 3.3279, + "step": 7341 + }, + { + "epoch": 4.73102336825141, + "grad_norm": 1.6332890818999433, + "learning_rate": 9.78425561425939e-05, + "loss": 3.3662, + "step": 7342 + }, + { + "epoch": 4.7316680096696215, + "grad_norm": 1.7767473110166543, + "learning_rate": 9.784197031772485e-05, + "loss": 3.2442, + "step": 7343 + }, + { + "epoch": 4.732312651087833, + "grad_norm": 1.6466231174075647, + "learning_rate": 9.784138441510256e-05, + "loss": 3.2114, + "step": 7344 + }, + { + "epoch": 4.732957292506043, + "grad_norm": 1.6335009669064628, + "learning_rate": 9.784079843472799e-05, + "loss": 3.0573, + "step": 7345 + }, + { + "epoch": 4.7336019339242545, + "grad_norm": 1.9131468602579265, + "learning_rate": 9.784021237660209e-05, + "loss": 3.3574, + "step": 7346 + }, + { + "epoch": 4.734246575342466, + "grad_norm": 1.6591475946628695, + "learning_rate": 9.783962624072584e-05, + "loss": 3.4606, + "step": 7347 + }, + { + "epoch": 4.734891216760677, + "grad_norm": 1.595626663136053, + "learning_rate": 9.783904002710019e-05, + "loss": 3.7231, + "step": 7348 + }, + { + "epoch": 4.7355358581788884, + "grad_norm": 2.0779355621514566, + "learning_rate": 9.78384537357261e-05, + "loss": 3.3128, + "step": 7349 + }, + { + "epoch": 4.736180499597099, + "grad_norm": 1.36920444359161, + "learning_rate": 9.783786736660453e-05, + "loss": 3.6275, + "step": 7350 + }, + { + "epoch": 4.73682514101531, + "grad_norm": 1.6163395784834993, + "learning_rate": 9.783728091973645e-05, + "loss": 3.416, + "step": 7351 + }, + { + "epoch": 4.7374697824335215, + "grad_norm": 1.4805854102412792, + "learning_rate": 9.783669439512284e-05, + "loss": 3.2372, + "step": 7352 + }, + { + "epoch": 4.738114423851733, + "grad_norm": 1.9983989359522092, + "learning_rate": 9.783610779276463e-05, + "loss": 3.0892, + "step": 7353 + }, + { + "epoch": 4.738759065269944, + "grad_norm": 1.9049736955190135, + "learning_rate": 9.78355211126628e-05, + "loss": 3.1857, + "step": 7354 + }, + { + "epoch": 4.7394037066881545, + "grad_norm": 1.5598208717471571, + "learning_rate": 9.783493435481831e-05, + "loss": 3.465, + "step": 7355 + }, + { + "epoch": 4.740048348106366, + "grad_norm": 1.6794066612348117, + "learning_rate": 9.783434751923214e-05, + "loss": 3.4256, + "step": 7356 + }, + { + "epoch": 4.740692989524577, + "grad_norm": 1.5979127440357226, + "learning_rate": 9.783376060590523e-05, + "loss": 3.3787, + "step": 7357 + }, + { + "epoch": 4.741337630942788, + "grad_norm": 1.3913762869136954, + "learning_rate": 9.783317361483855e-05, + "loss": 3.3432, + "step": 7358 + }, + { + "epoch": 4.741982272361, + "grad_norm": 1.938929388863372, + "learning_rate": 9.783258654603306e-05, + "loss": 3.3468, + "step": 7359 + }, + { + "epoch": 4.74262691377921, + "grad_norm": 1.9269913020986806, + "learning_rate": 9.783199939948977e-05, + "loss": 3.2234, + "step": 7360 + }, + { + "epoch": 4.743271555197421, + "grad_norm": 1.515642605315628, + "learning_rate": 9.783141217520958e-05, + "loss": 3.1503, + "step": 7361 + }, + { + "epoch": 4.743916196615633, + "grad_norm": 1.71607363430269, + "learning_rate": 9.783082487319349e-05, + "loss": 3.2275, + "step": 7362 + }, + { + "epoch": 4.744560838033844, + "grad_norm": 1.4490849049510066, + "learning_rate": 9.783023749344245e-05, + "loss": 3.1185, + "step": 7363 + }, + { + "epoch": 4.745205479452055, + "grad_norm": 1.7117437758398506, + "learning_rate": 9.782965003595742e-05, + "loss": 3.681, + "step": 7364 + }, + { + "epoch": 4.745850120870266, + "grad_norm": 1.374586014631371, + "learning_rate": 9.78290625007394e-05, + "loss": 3.2216, + "step": 7365 + }, + { + "epoch": 4.746494762288477, + "grad_norm": 1.9706179026680346, + "learning_rate": 9.782847488778932e-05, + "loss": 3.2959, + "step": 7366 + }, + { + "epoch": 4.747139403706688, + "grad_norm": 1.588221787582426, + "learning_rate": 9.782788719710815e-05, + "loss": 3.6982, + "step": 7367 + }, + { + "epoch": 4.7477840451249, + "grad_norm": 1.7117841223894694, + "learning_rate": 9.782729942869687e-05, + "loss": 3.7958, + "step": 7368 + }, + { + "epoch": 4.74842868654311, + "grad_norm": 1.722974150351314, + "learning_rate": 9.782671158255642e-05, + "loss": 3.3614, + "step": 7369 + }, + { + "epoch": 4.749073327961321, + "grad_norm": 1.6658109646223886, + "learning_rate": 9.782612365868779e-05, + "loss": 3.2525, + "step": 7370 + }, + { + "epoch": 4.749717969379533, + "grad_norm": 1.2860356590864355, + "learning_rate": 9.782553565709194e-05, + "loss": 3.3149, + "step": 7371 + }, + { + "epoch": 4.750362610797744, + "grad_norm": 1.7646599006664903, + "learning_rate": 9.782494757776983e-05, + "loss": 3.1137, + "step": 7372 + }, + { + "epoch": 4.751007252215955, + "grad_norm": 1.7850898572055258, + "learning_rate": 9.782435942072244e-05, + "loss": 3.4908, + "step": 7373 + }, + { + "epoch": 4.751651893634166, + "grad_norm": 1.5340773149239186, + "learning_rate": 9.782377118595071e-05, + "loss": 3.2963, + "step": 7374 + }, + { + "epoch": 4.752296535052377, + "grad_norm": 1.9153366714786784, + "learning_rate": 9.782318287345562e-05, + "loss": 3.3654, + "step": 7375 + }, + { + "epoch": 4.752941176470588, + "grad_norm": 1.3532790333247113, + "learning_rate": 9.782259448323815e-05, + "loss": 3.5007, + "step": 7376 + }, + { + "epoch": 4.7535858178887995, + "grad_norm": 1.4193574956547321, + "learning_rate": 9.782200601529923e-05, + "loss": 3.8514, + "step": 7377 + }, + { + "epoch": 4.754230459307011, + "grad_norm": 1.5149038602161058, + "learning_rate": 9.782141746963988e-05, + "loss": 2.9314, + "step": 7378 + }, + { + "epoch": 4.754875100725221, + "grad_norm": 1.7715951906477168, + "learning_rate": 9.782082884626102e-05, + "loss": 3.3787, + "step": 7379 + }, + { + "epoch": 4.7555197421434325, + "grad_norm": 1.3898194403595237, + "learning_rate": 9.782024014516364e-05, + "loss": 3.3943, + "step": 7380 + }, + { + "epoch": 4.756164383561644, + "grad_norm": 1.338580361464613, + "learning_rate": 9.781965136634868e-05, + "loss": 3.6237, + "step": 7381 + }, + { + "epoch": 4.756809024979855, + "grad_norm": 1.4826834902080843, + "learning_rate": 9.781906250981715e-05, + "loss": 3.0709, + "step": 7382 + }, + { + "epoch": 4.757453666398066, + "grad_norm": 1.849308585803247, + "learning_rate": 9.781847357556999e-05, + "loss": 3.1199, + "step": 7383 + }, + { + "epoch": 4.758098307816277, + "grad_norm": 1.3391357497600904, + "learning_rate": 9.781788456360818e-05, + "loss": 3.7758, + "step": 7384 + }, + { + "epoch": 4.758742949234488, + "grad_norm": 1.7280590521298211, + "learning_rate": 9.781729547393266e-05, + "loss": 3.3108, + "step": 7385 + }, + { + "epoch": 4.7593875906526995, + "grad_norm": 1.4807808622862246, + "learning_rate": 9.781670630654442e-05, + "loss": 3.2046, + "step": 7386 + }, + { + "epoch": 4.760032232070911, + "grad_norm": 1.4340625428774336, + "learning_rate": 9.781611706144443e-05, + "loss": 3.4259, + "step": 7387 + }, + { + "epoch": 4.760676873489122, + "grad_norm": 1.5765101891495563, + "learning_rate": 9.781552773863363e-05, + "loss": 3.547, + "step": 7388 + }, + { + "epoch": 4.7613215149073325, + "grad_norm": 1.6020536207874374, + "learning_rate": 9.781493833811304e-05, + "loss": 3.3364, + "step": 7389 + }, + { + "epoch": 4.761966156325544, + "grad_norm": 1.260144163327278, + "learning_rate": 9.781434885988358e-05, + "loss": 3.5591, + "step": 7390 + }, + { + "epoch": 4.762610797743755, + "grad_norm": 1.6493704128994366, + "learning_rate": 9.781375930394624e-05, + "loss": 3.3126, + "step": 7391 + }, + { + "epoch": 4.763255439161966, + "grad_norm": 1.3909923413399738, + "learning_rate": 9.781316967030197e-05, + "loss": 3.5715, + "step": 7392 + }, + { + "epoch": 4.763900080580177, + "grad_norm": 1.4938278214324223, + "learning_rate": 9.781257995895177e-05, + "loss": 3.2469, + "step": 7393 + }, + { + "epoch": 4.764544721998388, + "grad_norm": 1.400224760010658, + "learning_rate": 9.78119901698966e-05, + "loss": 3.4616, + "step": 7394 + }, + { + "epoch": 4.765189363416599, + "grad_norm": 1.421354552496412, + "learning_rate": 9.78114003031374e-05, + "loss": 2.9137, + "step": 7395 + }, + { + "epoch": 4.765834004834811, + "grad_norm": 3.1123814902983553, + "learning_rate": 9.781081035867517e-05, + "loss": 3.6156, + "step": 7396 + }, + { + "epoch": 4.766478646253022, + "grad_norm": 1.535395883367723, + "learning_rate": 9.781022033651086e-05, + "loss": 3.108, + "step": 7397 + }, + { + "epoch": 4.767123287671232, + "grad_norm": 1.3997816934565377, + "learning_rate": 9.780963023664545e-05, + "loss": 3.5662, + "step": 7398 + }, + { + "epoch": 4.767767929089444, + "grad_norm": 1.277991165231854, + "learning_rate": 9.780904005907991e-05, + "loss": 3.1351, + "step": 7399 + }, + { + "epoch": 4.768412570507655, + "grad_norm": 1.4393133619267136, + "learning_rate": 9.780844980381519e-05, + "loss": 3.7684, + "step": 7400 + }, + { + "epoch": 4.768412570507655, + "eval_loss": 4.118045806884766, + "eval_runtime": 2.9929, + "eval_samples_per_second": 33.412, + "eval_steps_per_second": 4.344, + "step": 7400 + }, + { + "epoch": 4.769057211925866, + "grad_norm": 1.3774038340399242, + "learning_rate": 9.78078594708523e-05, + "loss": 3.3599, + "step": 7401 + }, + { + "epoch": 4.769701853344078, + "grad_norm": 1.4254092739372877, + "learning_rate": 9.780726906019217e-05, + "loss": 3.5771, + "step": 7402 + }, + { + "epoch": 4.770346494762288, + "grad_norm": 1.3478836964517646, + "learning_rate": 9.780667857183577e-05, + "loss": 3.4105, + "step": 7403 + }, + { + "epoch": 4.770991136180499, + "grad_norm": 1.4418413426262164, + "learning_rate": 9.780608800578409e-05, + "loss": 3.2801, + "step": 7404 + }, + { + "epoch": 4.771635777598711, + "grad_norm": 1.4259904576686377, + "learning_rate": 9.780549736203809e-05, + "loss": 3.4212, + "step": 7405 + }, + { + "epoch": 4.772280419016922, + "grad_norm": 1.3543529987265817, + "learning_rate": 9.780490664059877e-05, + "loss": 3.4783, + "step": 7406 + }, + { + "epoch": 4.772925060435133, + "grad_norm": 1.934062345703003, + "learning_rate": 9.780431584146705e-05, + "loss": 3.145, + "step": 7407 + }, + { + "epoch": 4.773569701853344, + "grad_norm": 1.6416319364277383, + "learning_rate": 9.780372496464391e-05, + "loss": 3.4175, + "step": 7408 + }, + { + "epoch": 4.774214343271555, + "grad_norm": 1.9539044537317583, + "learning_rate": 9.780313401013034e-05, + "loss": 3.5605, + "step": 7409 + }, + { + "epoch": 4.774858984689766, + "grad_norm": 1.8697784392864691, + "learning_rate": 9.780254297792732e-05, + "loss": 3.0545, + "step": 7410 + }, + { + "epoch": 4.7755036261079775, + "grad_norm": 1.300723346578619, + "learning_rate": 9.78019518680358e-05, + "loss": 3.2597, + "step": 7411 + }, + { + "epoch": 4.776148267526189, + "grad_norm": 1.4448329759455159, + "learning_rate": 9.780136068045678e-05, + "loss": 3.48, + "step": 7412 + }, + { + "epoch": 4.776792908944399, + "grad_norm": 1.5519529953833109, + "learning_rate": 9.780076941519118e-05, + "loss": 3.3143, + "step": 7413 + }, + { + "epoch": 4.7774375503626105, + "grad_norm": 1.4526262644560897, + "learning_rate": 9.780017807224e-05, + "loss": 3.6466, + "step": 7414 + }, + { + "epoch": 4.778082191780822, + "grad_norm": 1.4633474324917817, + "learning_rate": 9.779958665160421e-05, + "loss": 3.2083, + "step": 7415 + }, + { + "epoch": 4.778726833199033, + "grad_norm": 1.5585999237662778, + "learning_rate": 9.779899515328478e-05, + "loss": 3.3946, + "step": 7416 + }, + { + "epoch": 4.779371474617244, + "grad_norm": 1.356218648565389, + "learning_rate": 9.779840357728268e-05, + "loss": 3.2798, + "step": 7417 + }, + { + "epoch": 4.780016116035455, + "grad_norm": 1.316315245957478, + "learning_rate": 9.77978119235989e-05, + "loss": 3.2322, + "step": 7418 + }, + { + "epoch": 4.780660757453666, + "grad_norm": 1.7897060631534876, + "learning_rate": 9.779722019223439e-05, + "loss": 3.241, + "step": 7419 + }, + { + "epoch": 4.781305398871877, + "grad_norm": 1.761893226486683, + "learning_rate": 9.779662838319011e-05, + "loss": 3.6632, + "step": 7420 + }, + { + "epoch": 4.781950040290089, + "grad_norm": 1.5317117699795817, + "learning_rate": 9.779603649646706e-05, + "loss": 3.4288, + "step": 7421 + }, + { + "epoch": 4.7825946817083, + "grad_norm": 3.0339508929602697, + "learning_rate": 9.779544453206621e-05, + "loss": 3.3917, + "step": 7422 + }, + { + "epoch": 4.7832393231265105, + "grad_norm": 1.4535059551716414, + "learning_rate": 9.77948524899885e-05, + "loss": 3.5491, + "step": 7423 + }, + { + "epoch": 4.783883964544722, + "grad_norm": 1.709842489218722, + "learning_rate": 9.779426037023495e-05, + "loss": 3.5374, + "step": 7424 + }, + { + "epoch": 4.784528605962933, + "grad_norm": 1.739578500205449, + "learning_rate": 9.77936681728065e-05, + "loss": 3.5929, + "step": 7425 + }, + { + "epoch": 4.785173247381144, + "grad_norm": 1.6633556838731234, + "learning_rate": 9.779307589770413e-05, + "loss": 3.474, + "step": 7426 + }, + { + "epoch": 4.785817888799356, + "grad_norm": 1.523131723912643, + "learning_rate": 9.779248354492883e-05, + "loss": 3.1355, + "step": 7427 + }, + { + "epoch": 4.786462530217566, + "grad_norm": 1.452393897713782, + "learning_rate": 9.779189111448153e-05, + "loss": 3.5735, + "step": 7428 + }, + { + "epoch": 4.787107171635777, + "grad_norm": 1.55778709768666, + "learning_rate": 9.779129860636324e-05, + "loss": 3.5506, + "step": 7429 + }, + { + "epoch": 4.787751813053989, + "grad_norm": 1.5995968862639742, + "learning_rate": 9.779070602057494e-05, + "loss": 3.1703, + "step": 7430 + }, + { + "epoch": 4.7883964544722, + "grad_norm": 1.5367526861627865, + "learning_rate": 9.779011335711757e-05, + "loss": 3.5793, + "step": 7431 + }, + { + "epoch": 4.789041095890411, + "grad_norm": 1.9483630628796496, + "learning_rate": 9.778952061599212e-05, + "loss": 3.4573, + "step": 7432 + }, + { + "epoch": 4.789685737308622, + "grad_norm": 1.5640868945754893, + "learning_rate": 9.778892779719956e-05, + "loss": 3.7376, + "step": 7433 + }, + { + "epoch": 4.790330378726833, + "grad_norm": 1.5925453663306623, + "learning_rate": 9.778833490074087e-05, + "loss": 2.9983, + "step": 7434 + }, + { + "epoch": 4.790975020145044, + "grad_norm": 1.5674914103241864, + "learning_rate": 9.778774192661702e-05, + "loss": 3.4443, + "step": 7435 + }, + { + "epoch": 4.791619661563256, + "grad_norm": 1.4291977248407812, + "learning_rate": 9.778714887482899e-05, + "loss": 3.4162, + "step": 7436 + }, + { + "epoch": 4.792264302981467, + "grad_norm": 1.9965895088953882, + "learning_rate": 9.778655574537772e-05, + "loss": 3.25, + "step": 7437 + }, + { + "epoch": 4.792908944399677, + "grad_norm": 1.472873241483292, + "learning_rate": 9.778596253826425e-05, + "loss": 3.3559, + "step": 7438 + }, + { + "epoch": 4.793553585817889, + "grad_norm": 1.711895908474121, + "learning_rate": 9.77853692534895e-05, + "loss": 3.2061, + "step": 7439 + }, + { + "epoch": 4.7941982272361, + "grad_norm": 1.8669978586748528, + "learning_rate": 9.778477589105446e-05, + "loss": 3.3047, + "step": 7440 + }, + { + "epoch": 4.794842868654311, + "grad_norm": 1.7292860239041241, + "learning_rate": 9.77841824509601e-05, + "loss": 3.3219, + "step": 7441 + }, + { + "epoch": 4.7954875100725225, + "grad_norm": 1.5541265243748859, + "learning_rate": 9.778358893320743e-05, + "loss": 3.473, + "step": 7442 + }, + { + "epoch": 4.796132151490733, + "grad_norm": 1.6105728652626967, + "learning_rate": 9.778299533779735e-05, + "loss": 3.2913, + "step": 7443 + }, + { + "epoch": 4.796776792908944, + "grad_norm": 1.6326350688208022, + "learning_rate": 9.778240166473091e-05, + "loss": 3.0354, + "step": 7444 + }, + { + "epoch": 4.7974214343271555, + "grad_norm": 1.4624771108634504, + "learning_rate": 9.778180791400905e-05, + "loss": 3.4786, + "step": 7445 + }, + { + "epoch": 4.798066075745367, + "grad_norm": 1.2760092390455324, + "learning_rate": 9.778121408563276e-05, + "loss": 3.4358, + "step": 7446 + }, + { + "epoch": 4.798710717163578, + "grad_norm": 1.3113881815889814, + "learning_rate": 9.778062017960299e-05, + "loss": 3.3531, + "step": 7447 + }, + { + "epoch": 4.7993553585817885, + "grad_norm": 1.3752275871844124, + "learning_rate": 9.778002619592073e-05, + "loss": 3.483, + "step": 7448 + }, + { + "epoch": 4.8, + "grad_norm": 1.5101127196542694, + "learning_rate": 9.777943213458697e-05, + "loss": 3.2693, + "step": 7449 + }, + { + "epoch": 4.800644641418211, + "grad_norm": 1.4413390687849994, + "learning_rate": 9.777883799560266e-05, + "loss": 3.7389, + "step": 7450 + }, + { + "epoch": 4.801289282836422, + "grad_norm": 1.5267487572842464, + "learning_rate": 9.77782437789688e-05, + "loss": 3.5285, + "step": 7451 + }, + { + "epoch": 4.801933924254634, + "grad_norm": 1.4369935700526073, + "learning_rate": 9.777764948468634e-05, + "loss": 3.3309, + "step": 7452 + }, + { + "epoch": 4.802578565672844, + "grad_norm": 1.5091273099483828, + "learning_rate": 9.777705511275627e-05, + "loss": 3.4155, + "step": 7453 + }, + { + "epoch": 4.803223207091055, + "grad_norm": 1.5274472145552562, + "learning_rate": 9.777646066317958e-05, + "loss": 3.3871, + "step": 7454 + }, + { + "epoch": 4.803867848509267, + "grad_norm": 1.8820244312988703, + "learning_rate": 9.777586613595722e-05, + "loss": 3.425, + "step": 7455 + }, + { + "epoch": 4.804512489927478, + "grad_norm": 1.4595929510829386, + "learning_rate": 9.777527153109018e-05, + "loss": 3.5826, + "step": 7456 + }, + { + "epoch": 4.805157131345689, + "grad_norm": 2.086287930968733, + "learning_rate": 9.777467684857944e-05, + "loss": 3.2765, + "step": 7457 + }, + { + "epoch": 4.8058017727639, + "grad_norm": 1.6308128175960033, + "learning_rate": 9.777408208842597e-05, + "loss": 3.6054, + "step": 7458 + }, + { + "epoch": 4.806446414182111, + "grad_norm": 1.3616548712974978, + "learning_rate": 9.777348725063074e-05, + "loss": 3.4221, + "step": 7459 + }, + { + "epoch": 4.807091055600322, + "grad_norm": 1.8283128484589877, + "learning_rate": 9.777289233519475e-05, + "loss": 3.4252, + "step": 7460 + }, + { + "epoch": 4.807735697018534, + "grad_norm": 1.3220594353539967, + "learning_rate": 9.777229734211894e-05, + "loss": 3.1498, + "step": 7461 + }, + { + "epoch": 4.808380338436745, + "grad_norm": 1.9765627561485728, + "learning_rate": 9.777170227140434e-05, + "loss": 3.3572, + "step": 7462 + }, + { + "epoch": 4.809024979854955, + "grad_norm": 1.4797176956792037, + "learning_rate": 9.777110712305188e-05, + "loss": 3.6292, + "step": 7463 + }, + { + "epoch": 4.809669621273167, + "grad_norm": 1.8469837271623297, + "learning_rate": 9.777051189706256e-05, + "loss": 3.4157, + "step": 7464 + }, + { + "epoch": 4.810314262691378, + "grad_norm": 1.344788632410037, + "learning_rate": 9.776991659343735e-05, + "loss": 3.2741, + "step": 7465 + }, + { + "epoch": 4.810958904109589, + "grad_norm": 2.1050374675362424, + "learning_rate": 9.776932121217725e-05, + "loss": 3.242, + "step": 7466 + }, + { + "epoch": 4.811603545527801, + "grad_norm": 1.5665614996197712, + "learning_rate": 9.776872575328317e-05, + "loss": 3.4192, + "step": 7467 + }, + { + "epoch": 4.812248186946011, + "grad_norm": 1.6710227432475486, + "learning_rate": 9.776813021675618e-05, + "loss": 3.423, + "step": 7468 + }, + { + "epoch": 4.812892828364222, + "grad_norm": 1.5802877729493885, + "learning_rate": 9.77675346025972e-05, + "loss": 3.2035, + "step": 7469 + }, + { + "epoch": 4.813537469782434, + "grad_norm": 1.679528264312448, + "learning_rate": 9.776693891080721e-05, + "loss": 3.4293, + "step": 7470 + }, + { + "epoch": 4.814182111200645, + "grad_norm": 1.4823088496586398, + "learning_rate": 9.776634314138722e-05, + "loss": 3.5494, + "step": 7471 + }, + { + "epoch": 4.814826752618856, + "grad_norm": 1.715550397570161, + "learning_rate": 9.776574729433818e-05, + "loss": 3.3159, + "step": 7472 + }, + { + "epoch": 4.815471394037067, + "grad_norm": 1.5338984742777722, + "learning_rate": 9.776515136966108e-05, + "loss": 3.3212, + "step": 7473 + }, + { + "epoch": 4.816116035455278, + "grad_norm": 1.902986166296194, + "learning_rate": 9.776455536735689e-05, + "loss": 3.3561, + "step": 7474 + }, + { + "epoch": 4.816760676873489, + "grad_norm": 1.8662212104669378, + "learning_rate": 9.776395928742661e-05, + "loss": 3.5148, + "step": 7475 + }, + { + "epoch": 4.8174053182917005, + "grad_norm": 1.8570108388501039, + "learning_rate": 9.776336312987119e-05, + "loss": 3.6412, + "step": 7476 + }, + { + "epoch": 4.818049959709912, + "grad_norm": 1.6476730733547233, + "learning_rate": 9.776276689469162e-05, + "loss": 3.4276, + "step": 7477 + }, + { + "epoch": 4.818694601128122, + "grad_norm": 2.0792179435651588, + "learning_rate": 9.77621705818889e-05, + "loss": 3.5217, + "step": 7478 + }, + { + "epoch": 4.8193392425463335, + "grad_norm": 1.430760598545491, + "learning_rate": 9.776157419146398e-05, + "loss": 3.3512, + "step": 7479 + }, + { + "epoch": 4.819983883964545, + "grad_norm": 1.7592907021055688, + "learning_rate": 9.776097772341785e-05, + "loss": 3.6468, + "step": 7480 + }, + { + "epoch": 4.820628525382756, + "grad_norm": 1.5372102524830682, + "learning_rate": 9.77603811777515e-05, + "loss": 3.3795, + "step": 7481 + }, + { + "epoch": 4.821273166800967, + "grad_norm": 1.7570465659929546, + "learning_rate": 9.775978455446589e-05, + "loss": 3.3684, + "step": 7482 + }, + { + "epoch": 4.821917808219178, + "grad_norm": 1.8164368941040574, + "learning_rate": 9.775918785356203e-05, + "loss": 3.411, + "step": 7483 + }, + { + "epoch": 4.822562449637389, + "grad_norm": 1.838531205929678, + "learning_rate": 9.775859107504086e-05, + "loss": 3.5649, + "step": 7484 + }, + { + "epoch": 4.8232070910556, + "grad_norm": 1.8091374374159974, + "learning_rate": 9.775799421890339e-05, + "loss": 3.6063, + "step": 7485 + }, + { + "epoch": 4.823851732473812, + "grad_norm": 1.520141410210944, + "learning_rate": 9.77573972851506e-05, + "loss": 3.3833, + "step": 7486 + }, + { + "epoch": 4.824496373892023, + "grad_norm": 1.952461338858103, + "learning_rate": 9.775680027378343e-05, + "loss": 3.7299, + "step": 7487 + }, + { + "epoch": 4.825141015310233, + "grad_norm": 1.5424124514055368, + "learning_rate": 9.775620318480292e-05, + "loss": 3.1966, + "step": 7488 + }, + { + "epoch": 4.825785656728445, + "grad_norm": 1.6978306048787233, + "learning_rate": 9.775560601821002e-05, + "loss": 3.5093, + "step": 7489 + }, + { + "epoch": 4.826430298146656, + "grad_norm": 1.6275892217064338, + "learning_rate": 9.775500877400571e-05, + "loss": 3.5916, + "step": 7490 + }, + { + "epoch": 4.827074939564867, + "grad_norm": 1.5613372703971728, + "learning_rate": 9.775441145219096e-05, + "loss": 3.4255, + "step": 7491 + }, + { + "epoch": 4.827719580983079, + "grad_norm": 1.6172134474126514, + "learning_rate": 9.775381405276679e-05, + "loss": 3.3028, + "step": 7492 + }, + { + "epoch": 4.828364222401289, + "grad_norm": 1.710373663689209, + "learning_rate": 9.775321657573415e-05, + "loss": 3.5376, + "step": 7493 + }, + { + "epoch": 4.8290088638195, + "grad_norm": 1.5552259726030893, + "learning_rate": 9.775261902109402e-05, + "loss": 3.6046, + "step": 7494 + }, + { + "epoch": 4.829653505237712, + "grad_norm": 1.680174072560133, + "learning_rate": 9.775202138884738e-05, + "loss": 3.7111, + "step": 7495 + }, + { + "epoch": 4.830298146655923, + "grad_norm": 1.5485511438413846, + "learning_rate": 9.775142367899524e-05, + "loss": 3.5183, + "step": 7496 + }, + { + "epoch": 4.830942788074134, + "grad_norm": 1.4730337768642112, + "learning_rate": 9.775082589153855e-05, + "loss": 3.4975, + "step": 7497 + }, + { + "epoch": 4.831587429492345, + "grad_norm": 2.0597765276018656, + "learning_rate": 9.775022802647832e-05, + "loss": 3.3397, + "step": 7498 + }, + { + "epoch": 4.832232070910556, + "grad_norm": 1.494070656314731, + "learning_rate": 9.77496300838155e-05, + "loss": 3.3187, + "step": 7499 + }, + { + "epoch": 4.832876712328767, + "grad_norm": 1.8280373294992807, + "learning_rate": 9.774903206355109e-05, + "loss": 3.1755, + "step": 7500 + }, + { + "epoch": 4.832876712328767, + "eval_loss": 4.121467590332031, + "eval_runtime": 2.9818, + "eval_samples_per_second": 33.537, + "eval_steps_per_second": 4.36, + "step": 7500 + }, + { + "epoch": 4.8335213537469786, + "grad_norm": 1.4066611109225509, + "learning_rate": 9.774843396568607e-05, + "loss": 3.5769, + "step": 7501 + }, + { + "epoch": 4.83416599516519, + "grad_norm": 1.693350302350426, + "learning_rate": 9.774783579022143e-05, + "loss": 3.6078, + "step": 7502 + }, + { + "epoch": 4.8348106365834, + "grad_norm": 1.6770404024805141, + "learning_rate": 9.774723753715815e-05, + "loss": 3.4827, + "step": 7503 + }, + { + "epoch": 4.835455278001612, + "grad_norm": 1.6568727236468326, + "learning_rate": 9.774663920649719e-05, + "loss": 3.6766, + "step": 7504 + }, + { + "epoch": 4.836099919419823, + "grad_norm": 1.6539476996856095, + "learning_rate": 9.774604079823957e-05, + "loss": 3.2584, + "step": 7505 + }, + { + "epoch": 4.836744560838034, + "grad_norm": 1.3406997801924234, + "learning_rate": 9.774544231238624e-05, + "loss": 3.5287, + "step": 7506 + }, + { + "epoch": 4.837389202256245, + "grad_norm": 1.8416589848562963, + "learning_rate": 9.77448437489382e-05, + "loss": 3.3508, + "step": 7507 + }, + { + "epoch": 4.838033843674456, + "grad_norm": 1.486653313463249, + "learning_rate": 9.774424510789643e-05, + "loss": 3.5783, + "step": 7508 + }, + { + "epoch": 4.838678485092667, + "grad_norm": 1.878572012890944, + "learning_rate": 9.774364638926192e-05, + "loss": 3.2867, + "step": 7509 + }, + { + "epoch": 4.8393231265108785, + "grad_norm": 1.2234330679009753, + "learning_rate": 9.774304759303562e-05, + "loss": 3.3341, + "step": 7510 + }, + { + "epoch": 4.83996776792909, + "grad_norm": 1.664367132672184, + "learning_rate": 9.774244871921856e-05, + "loss": 3.4192, + "step": 7511 + }, + { + "epoch": 4.8406124093473, + "grad_norm": 1.4881430942809866, + "learning_rate": 9.77418497678117e-05, + "loss": 3.544, + "step": 7512 + }, + { + "epoch": 4.8412570507655115, + "grad_norm": 1.4132427865322243, + "learning_rate": 9.774125073881602e-05, + "loss": 3.3749, + "step": 7513 + }, + { + "epoch": 4.841901692183723, + "grad_norm": 1.9783728483041112, + "learning_rate": 9.77406516322325e-05, + "loss": 3.2009, + "step": 7514 + }, + { + "epoch": 4.842546333601934, + "grad_norm": 1.5862744669696234, + "learning_rate": 9.774005244806215e-05, + "loss": 3.4347, + "step": 7515 + }, + { + "epoch": 4.843190975020145, + "grad_norm": 1.5996309186452542, + "learning_rate": 9.773945318630594e-05, + "loss": 3.3271, + "step": 7516 + }, + { + "epoch": 4.843835616438356, + "grad_norm": 1.52812021715412, + "learning_rate": 9.773885384696486e-05, + "loss": 3.2975, + "step": 7517 + }, + { + "epoch": 4.844480257856567, + "grad_norm": 1.3602848917063075, + "learning_rate": 9.773825443003987e-05, + "loss": 3.6014, + "step": 7518 + }, + { + "epoch": 4.845124899274778, + "grad_norm": 1.6003478568415646, + "learning_rate": 9.773765493553197e-05, + "loss": 3.5301, + "step": 7519 + }, + { + "epoch": 4.84576954069299, + "grad_norm": 1.692092975495124, + "learning_rate": 9.773705536344216e-05, + "loss": 3.4822, + "step": 7520 + }, + { + "epoch": 4.846414182111201, + "grad_norm": 1.8450105385474018, + "learning_rate": 9.773645571377141e-05, + "loss": 3.4443, + "step": 7521 + }, + { + "epoch": 4.847058823529411, + "grad_norm": 1.4950694935031568, + "learning_rate": 9.773585598652069e-05, + "loss": 3.6136, + "step": 7522 + }, + { + "epoch": 4.847703464947623, + "grad_norm": 1.4523346440883096, + "learning_rate": 9.773525618169101e-05, + "loss": 3.4725, + "step": 7523 + }, + { + "epoch": 4.848348106365834, + "grad_norm": 1.5337464667016654, + "learning_rate": 9.773465629928335e-05, + "loss": 3.3245, + "step": 7524 + }, + { + "epoch": 4.848992747784045, + "grad_norm": 1.490064830863172, + "learning_rate": 9.773405633929869e-05, + "loss": 3.5679, + "step": 7525 + }, + { + "epoch": 4.849637389202257, + "grad_norm": 1.205284570612124, + "learning_rate": 9.7733456301738e-05, + "loss": 3.6685, + "step": 7526 + }, + { + "epoch": 4.850282030620467, + "grad_norm": 1.6110082011532878, + "learning_rate": 9.77328561866023e-05, + "loss": 3.5235, + "step": 7527 + }, + { + "epoch": 4.850926672038678, + "grad_norm": 1.3509347548138135, + "learning_rate": 9.773225599389255e-05, + "loss": 3.2146, + "step": 7528 + }, + { + "epoch": 4.85157131345689, + "grad_norm": 1.4061632672028184, + "learning_rate": 9.773165572360975e-05, + "loss": 3.4308, + "step": 7529 + }, + { + "epoch": 4.852215954875101, + "grad_norm": 1.335078028767644, + "learning_rate": 9.773105537575486e-05, + "loss": 3.5319, + "step": 7530 + }, + { + "epoch": 4.852860596293311, + "grad_norm": 1.414568558162952, + "learning_rate": 9.77304549503289e-05, + "loss": 3.3489, + "step": 7531 + }, + { + "epoch": 4.853505237711523, + "grad_norm": 1.314412013230321, + "learning_rate": 9.772985444733283e-05, + "loss": 3.7592, + "step": 7532 + }, + { + "epoch": 4.854149879129734, + "grad_norm": 1.5084148237493096, + "learning_rate": 9.772925386676767e-05, + "loss": 3.5469, + "step": 7533 + }, + { + "epoch": 4.854794520547945, + "grad_norm": 1.6787476625312518, + "learning_rate": 9.772865320863437e-05, + "loss": 3.2861, + "step": 7534 + }, + { + "epoch": 4.8554391619661565, + "grad_norm": 1.2770972392376199, + "learning_rate": 9.772805247293391e-05, + "loss": 3.1948, + "step": 7535 + }, + { + "epoch": 4.856083803384367, + "grad_norm": 1.824981143411536, + "learning_rate": 9.772745165966733e-05, + "loss": 3.5667, + "step": 7536 + }, + { + "epoch": 4.856728444802578, + "grad_norm": 1.5283002780753492, + "learning_rate": 9.772685076883557e-05, + "loss": 3.2721, + "step": 7537 + }, + { + "epoch": 4.85737308622079, + "grad_norm": 3.6278665362751576, + "learning_rate": 9.772624980043963e-05, + "loss": 3.5255, + "step": 7538 + }, + { + "epoch": 4.858017727639001, + "grad_norm": 1.8168848115493528, + "learning_rate": 9.772564875448049e-05, + "loss": 3.6651, + "step": 7539 + }, + { + "epoch": 4.858662369057212, + "grad_norm": 1.4079294511733944, + "learning_rate": 9.772504763095916e-05, + "loss": 3.7128, + "step": 7540 + }, + { + "epoch": 4.859307010475423, + "grad_norm": 1.6160012957714933, + "learning_rate": 9.772444642987659e-05, + "loss": 3.6955, + "step": 7541 + }, + { + "epoch": 4.859951651893634, + "grad_norm": 1.4352254336246169, + "learning_rate": 9.77238451512338e-05, + "loss": 3.1287, + "step": 7542 + }, + { + "epoch": 4.860596293311845, + "grad_norm": 1.3944352134397602, + "learning_rate": 9.772324379503177e-05, + "loss": 3.1534, + "step": 7543 + }, + { + "epoch": 4.8612409347300565, + "grad_norm": 1.3378724972429368, + "learning_rate": 9.772264236127147e-05, + "loss": 3.4625, + "step": 7544 + }, + { + "epoch": 4.861885576148268, + "grad_norm": 1.519135570461572, + "learning_rate": 9.772204084995392e-05, + "loss": 3.5764, + "step": 7545 + }, + { + "epoch": 4.862530217566478, + "grad_norm": 1.395342497836529, + "learning_rate": 9.77214392610801e-05, + "loss": 3.5185, + "step": 7546 + }, + { + "epoch": 4.8631748589846895, + "grad_norm": 1.6381649288890725, + "learning_rate": 9.772083759465097e-05, + "loss": 3.5599, + "step": 7547 + }, + { + "epoch": 4.863819500402901, + "grad_norm": 1.455154460201652, + "learning_rate": 9.772023585066753e-05, + "loss": 3.5148, + "step": 7548 + }, + { + "epoch": 4.864464141821112, + "grad_norm": 1.6566995728633502, + "learning_rate": 9.77196340291308e-05, + "loss": 3.694, + "step": 7549 + }, + { + "epoch": 4.865108783239323, + "grad_norm": 1.378456687766477, + "learning_rate": 9.771903213004171e-05, + "loss": 3.2018, + "step": 7550 + }, + { + "epoch": 4.865753424657534, + "grad_norm": 1.4798122488257184, + "learning_rate": 9.771843015340129e-05, + "loss": 3.3704, + "step": 7551 + }, + { + "epoch": 4.866398066075745, + "grad_norm": 1.6056613138889337, + "learning_rate": 9.771782809921055e-05, + "loss": 3.3936, + "step": 7552 + }, + { + "epoch": 4.867042707493956, + "grad_norm": 1.4011294322549828, + "learning_rate": 9.771722596747042e-05, + "loss": 3.585, + "step": 7553 + }, + { + "epoch": 4.867687348912168, + "grad_norm": 1.4304903146917134, + "learning_rate": 9.771662375818192e-05, + "loss": 3.4226, + "step": 7554 + }, + { + "epoch": 4.868331990330379, + "grad_norm": 1.4963088764533208, + "learning_rate": 9.771602147134605e-05, + "loss": 3.3808, + "step": 7555 + }, + { + "epoch": 4.868976631748589, + "grad_norm": 1.5770502633691006, + "learning_rate": 9.771541910696378e-05, + "loss": 3.416, + "step": 7556 + }, + { + "epoch": 4.869621273166801, + "grad_norm": 1.6243296762954094, + "learning_rate": 9.771481666503609e-05, + "loss": 3.2207, + "step": 7557 + }, + { + "epoch": 4.870265914585012, + "grad_norm": 1.4459377277415308, + "learning_rate": 9.771421414556401e-05, + "loss": 3.4261, + "step": 7558 + }, + { + "epoch": 4.870910556003223, + "grad_norm": 2.0293345395880307, + "learning_rate": 9.771361154854849e-05, + "loss": 3.4754, + "step": 7559 + }, + { + "epoch": 4.871555197421435, + "grad_norm": 1.3962737320067307, + "learning_rate": 9.771300887399054e-05, + "loss": 3.527, + "step": 7560 + }, + { + "epoch": 4.872199838839645, + "grad_norm": 1.793639407864795, + "learning_rate": 9.771240612189114e-05, + "loss": 3.3426, + "step": 7561 + }, + { + "epoch": 4.872844480257856, + "grad_norm": 1.8028529563060518, + "learning_rate": 9.771180329225128e-05, + "loss": 3.633, + "step": 7562 + }, + { + "epoch": 4.873489121676068, + "grad_norm": 1.5576651982148866, + "learning_rate": 9.771120038507195e-05, + "loss": 3.0574, + "step": 7563 + }, + { + "epoch": 4.874133763094279, + "grad_norm": 1.6238458639935753, + "learning_rate": 9.771059740035416e-05, + "loss": 3.5041, + "step": 7564 + }, + { + "epoch": 4.87477840451249, + "grad_norm": 1.5697048584751379, + "learning_rate": 9.770999433809889e-05, + "loss": 3.4141, + "step": 7565 + }, + { + "epoch": 4.875423045930701, + "grad_norm": 1.599484061121266, + "learning_rate": 9.77093911983071e-05, + "loss": 3.4874, + "step": 7566 + }, + { + "epoch": 4.876067687348912, + "grad_norm": 5.2152727346052465, + "learning_rate": 9.770878798097982e-05, + "loss": 3.2179, + "step": 7567 + }, + { + "epoch": 4.876712328767123, + "grad_norm": 1.4620374608057685, + "learning_rate": 9.770818468611802e-05, + "loss": 3.2005, + "step": 7568 + }, + { + "epoch": 4.8773569701853345, + "grad_norm": 1.684100541959076, + "learning_rate": 9.77075813137227e-05, + "loss": 3.5396, + "step": 7569 + }, + { + "epoch": 4.878001611603546, + "grad_norm": 1.336942024734043, + "learning_rate": 9.770697786379485e-05, + "loss": 3.5482, + "step": 7570 + }, + { + "epoch": 4.878646253021756, + "grad_norm": 1.6574693541901344, + "learning_rate": 9.770637433633546e-05, + "loss": 3.6471, + "step": 7571 + }, + { + "epoch": 4.8792908944399676, + "grad_norm": 1.5423804332576638, + "learning_rate": 9.770577073134551e-05, + "loss": 3.494, + "step": 7572 + }, + { + "epoch": 4.879935535858179, + "grad_norm": 1.590242285027182, + "learning_rate": 9.770516704882601e-05, + "loss": 3.3577, + "step": 7573 + }, + { + "epoch": 4.88058017727639, + "grad_norm": 1.4302768175081109, + "learning_rate": 9.770456328877794e-05, + "loss": 3.2488, + "step": 7574 + }, + { + "epoch": 4.8812248186946015, + "grad_norm": 1.2815283495390923, + "learning_rate": 9.770395945120231e-05, + "loss": 3.4839, + "step": 7575 + }, + { + "epoch": 4.881869460112812, + "grad_norm": 1.4536522807275964, + "learning_rate": 9.770335553610008e-05, + "loss": 3.3033, + "step": 7576 + }, + { + "epoch": 4.882514101531023, + "grad_norm": 1.5619185737361962, + "learning_rate": 9.770275154347226e-05, + "loss": 3.4338, + "step": 7577 + }, + { + "epoch": 4.8831587429492345, + "grad_norm": 1.6952846656163114, + "learning_rate": 9.770214747331983e-05, + "loss": 3.2757, + "step": 7578 + }, + { + "epoch": 4.883803384367446, + "grad_norm": 1.4920116540583654, + "learning_rate": 9.770154332564381e-05, + "loss": 3.2551, + "step": 7579 + }, + { + "epoch": 4.884448025785657, + "grad_norm": 1.4771580015321777, + "learning_rate": 9.770093910044518e-05, + "loss": 3.1827, + "step": 7580 + }, + { + "epoch": 4.8850926672038675, + "grad_norm": 1.3241377420932263, + "learning_rate": 9.77003347977249e-05, + "loss": 3.7168, + "step": 7581 + }, + { + "epoch": 4.885737308622079, + "grad_norm": 1.4444375050946814, + "learning_rate": 9.769973041748401e-05, + "loss": 3.4813, + "step": 7582 + }, + { + "epoch": 4.88638195004029, + "grad_norm": 1.2618505844070191, + "learning_rate": 9.769912595972349e-05, + "loss": 3.4311, + "step": 7583 + }, + { + "epoch": 4.887026591458501, + "grad_norm": 1.848202953130245, + "learning_rate": 9.76985214244443e-05, + "loss": 3.3206, + "step": 7584 + }, + { + "epoch": 4.887671232876713, + "grad_norm": 1.3685154612841757, + "learning_rate": 9.769791681164747e-05, + "loss": 3.1433, + "step": 7585 + }, + { + "epoch": 4.888315874294923, + "grad_norm": 1.6236912340328074, + "learning_rate": 9.769731212133398e-05, + "loss": 3.003, + "step": 7586 + }, + { + "epoch": 4.888960515713134, + "grad_norm": 1.3298975985480803, + "learning_rate": 9.769670735350484e-05, + "loss": 3.4411, + "step": 7587 + }, + { + "epoch": 4.889605157131346, + "grad_norm": 1.4004094780493588, + "learning_rate": 9.7696102508161e-05, + "loss": 3.4882, + "step": 7588 + }, + { + "epoch": 4.890249798549557, + "grad_norm": 1.3817999687594746, + "learning_rate": 9.769549758530349e-05, + "loss": 3.4136, + "step": 7589 + }, + { + "epoch": 4.890894439967768, + "grad_norm": 1.6315438073142567, + "learning_rate": 9.76948925849333e-05, + "loss": 3.2512, + "step": 7590 + }, + { + "epoch": 4.891539081385979, + "grad_norm": 1.3938812370040203, + "learning_rate": 9.769428750705141e-05, + "loss": 3.5385, + "step": 7591 + }, + { + "epoch": 4.89218372280419, + "grad_norm": 1.6331418164164762, + "learning_rate": 9.769368235165883e-05, + "loss": 3.3072, + "step": 7592 + }, + { + "epoch": 4.892828364222401, + "grad_norm": 1.2840922327221393, + "learning_rate": 9.769307711875655e-05, + "loss": 3.3018, + "step": 7593 + }, + { + "epoch": 4.893473005640613, + "grad_norm": 1.7950627745916434, + "learning_rate": 9.769247180834555e-05, + "loss": 3.2879, + "step": 7594 + }, + { + "epoch": 4.894117647058824, + "grad_norm": 1.3460324533597574, + "learning_rate": 9.769186642042683e-05, + "loss": 3.7524, + "step": 7595 + }, + { + "epoch": 4.894762288477034, + "grad_norm": 1.5385241686385034, + "learning_rate": 9.769126095500138e-05, + "loss": 3.3078, + "step": 7596 + }, + { + "epoch": 4.895406929895246, + "grad_norm": 1.5467482219635333, + "learning_rate": 9.769065541207023e-05, + "loss": 3.4047, + "step": 7597 + }, + { + "epoch": 4.896051571313457, + "grad_norm": 1.47410926623413, + "learning_rate": 9.769004979163432e-05, + "loss": 3.5757, + "step": 7598 + }, + { + "epoch": 4.896696212731668, + "grad_norm": 1.3979416030287837, + "learning_rate": 9.768944409369469e-05, + "loss": 3.3026, + "step": 7599 + }, + { + "epoch": 4.8973408541498795, + "grad_norm": 1.334392327897572, + "learning_rate": 9.76888383182523e-05, + "loss": 3.3832, + "step": 7600 + }, + { + "epoch": 4.8973408541498795, + "eval_loss": 4.1092939376831055, + "eval_runtime": 2.9773, + "eval_samples_per_second": 33.588, + "eval_steps_per_second": 4.366, + "step": 7600 + }, + { + "epoch": 4.89798549556809, + "grad_norm": 2.047217960780248, + "learning_rate": 9.768823246530817e-05, + "loss": 3.533, + "step": 7601 + }, + { + "epoch": 4.898630136986301, + "grad_norm": 1.346395432844081, + "learning_rate": 9.768762653486329e-05, + "loss": 3.4373, + "step": 7602 + }, + { + "epoch": 4.8992747784045125, + "grad_norm": 2.1660252284690995, + "learning_rate": 9.768702052691865e-05, + "loss": 3.4989, + "step": 7603 + }, + { + "epoch": 4.899919419822724, + "grad_norm": 1.6475513372204655, + "learning_rate": 9.768641444147524e-05, + "loss": 3.4515, + "step": 7604 + }, + { + "epoch": 4.900564061240935, + "grad_norm": 1.4291598480262717, + "learning_rate": 9.768580827853407e-05, + "loss": 3.3729, + "step": 7605 + }, + { + "epoch": 4.9012087026591455, + "grad_norm": 1.4550083477089548, + "learning_rate": 9.768520203809612e-05, + "loss": 3.2429, + "step": 7606 + }, + { + "epoch": 4.901853344077357, + "grad_norm": 1.372703389460423, + "learning_rate": 9.768459572016239e-05, + "loss": 3.8281, + "step": 7607 + }, + { + "epoch": 4.902497985495568, + "grad_norm": 1.5708262245176965, + "learning_rate": 9.768398932473388e-05, + "loss": 3.1369, + "step": 7608 + }, + { + "epoch": 4.9031426269137794, + "grad_norm": 1.5443717713226812, + "learning_rate": 9.76833828518116e-05, + "loss": 3.4645, + "step": 7609 + }, + { + "epoch": 4.903787268331991, + "grad_norm": 1.8244114177174426, + "learning_rate": 9.76827763013965e-05, + "loss": 3.2075, + "step": 7610 + }, + { + "epoch": 4.904431909750201, + "grad_norm": 1.7165893214471144, + "learning_rate": 9.768216967348963e-05, + "loss": 3.5374, + "step": 7611 + }, + { + "epoch": 4.9050765511684125, + "grad_norm": 2.042933923676594, + "learning_rate": 9.768156296809197e-05, + "loss": 3.5282, + "step": 7612 + }, + { + "epoch": 4.905721192586624, + "grad_norm": 1.6466874997367236, + "learning_rate": 9.768095618520449e-05, + "loss": 3.2005, + "step": 7613 + }, + { + "epoch": 4.906365834004835, + "grad_norm": 1.7321745261361376, + "learning_rate": 9.768034932482822e-05, + "loss": 3.5929, + "step": 7614 + }, + { + "epoch": 4.907010475423046, + "grad_norm": 2.0154562579677266, + "learning_rate": 9.767974238696413e-05, + "loss": 3.0462, + "step": 7615 + }, + { + "epoch": 4.907655116841257, + "grad_norm": 1.531139528133544, + "learning_rate": 9.767913537161325e-05, + "loss": 3.461, + "step": 7616 + }, + { + "epoch": 4.908299758259468, + "grad_norm": 1.878412203679443, + "learning_rate": 9.767852827877655e-05, + "loss": 3.7038, + "step": 7617 + }, + { + "epoch": 4.908944399677679, + "grad_norm": 1.6041137541704444, + "learning_rate": 9.767792110845503e-05, + "loss": 3.5197, + "step": 7618 + }, + { + "epoch": 4.909589041095891, + "grad_norm": 1.7539387799927013, + "learning_rate": 9.767731386064969e-05, + "loss": 3.5483, + "step": 7619 + }, + { + "epoch": 4.910233682514102, + "grad_norm": 1.6129761571914751, + "learning_rate": 9.767670653536153e-05, + "loss": 3.656, + "step": 7620 + }, + { + "epoch": 4.910878323932312, + "grad_norm": 1.5671810158427455, + "learning_rate": 9.767609913259154e-05, + "loss": 3.6915, + "step": 7621 + }, + { + "epoch": 4.911522965350524, + "grad_norm": 1.4675104231600398, + "learning_rate": 9.767549165234075e-05, + "loss": 3.662, + "step": 7622 + }, + { + "epoch": 4.912167606768735, + "grad_norm": 1.5302558346773938, + "learning_rate": 9.76748840946101e-05, + "loss": 3.3397, + "step": 7623 + }, + { + "epoch": 4.912812248186946, + "grad_norm": 1.3796323645676918, + "learning_rate": 9.767427645940062e-05, + "loss": 3.8178, + "step": 7624 + }, + { + "epoch": 4.913456889605158, + "grad_norm": 2.0979002896391226, + "learning_rate": 9.767366874671332e-05, + "loss": 3.4532, + "step": 7625 + }, + { + "epoch": 4.914101531023368, + "grad_norm": 1.4280072705793332, + "learning_rate": 9.767306095654919e-05, + "loss": 3.3261, + "step": 7626 + }, + { + "epoch": 4.914746172441579, + "grad_norm": 1.7067077012467682, + "learning_rate": 9.767245308890921e-05, + "loss": 3.402, + "step": 7627 + }, + { + "epoch": 4.915390813859791, + "grad_norm": 1.3773176970075136, + "learning_rate": 9.76718451437944e-05, + "loss": 3.4612, + "step": 7628 + }, + { + "epoch": 4.916035455278002, + "grad_norm": 1.5855499399892516, + "learning_rate": 9.767123712120575e-05, + "loss": 3.4652, + "step": 7629 + }, + { + "epoch": 4.916680096696213, + "grad_norm": 1.611355638759723, + "learning_rate": 9.767062902114427e-05, + "loss": 2.8968, + "step": 7630 + }, + { + "epoch": 4.917324738114424, + "grad_norm": 1.6667792403282893, + "learning_rate": 9.767002084361093e-05, + "loss": 3.0899, + "step": 7631 + }, + { + "epoch": 4.917969379532635, + "grad_norm": 1.6010946513481972, + "learning_rate": 9.766941258860674e-05, + "loss": 3.4639, + "step": 7632 + }, + { + "epoch": 4.918614020950846, + "grad_norm": 1.7643705672115324, + "learning_rate": 9.766880425613272e-05, + "loss": 3.7274, + "step": 7633 + }, + { + "epoch": 4.9192586623690575, + "grad_norm": 1.4015820116120397, + "learning_rate": 9.766819584618984e-05, + "loss": 3.4779, + "step": 7634 + }, + { + "epoch": 4.919903303787269, + "grad_norm": 1.615573511117536, + "learning_rate": 9.766758735877913e-05, + "loss": 3.5001, + "step": 7635 + }, + { + "epoch": 4.920547945205479, + "grad_norm": 1.4272555325688994, + "learning_rate": 9.766697879390157e-05, + "loss": 3.3916, + "step": 7636 + }, + { + "epoch": 4.9211925866236905, + "grad_norm": 1.8954102109934854, + "learning_rate": 9.766637015155814e-05, + "loss": 3.4959, + "step": 7637 + }, + { + "epoch": 4.921837228041902, + "grad_norm": 2.079861851321097, + "learning_rate": 9.76657614317499e-05, + "loss": 3.5417, + "step": 7638 + }, + { + "epoch": 4.922481869460113, + "grad_norm": 1.2969005753882994, + "learning_rate": 9.766515263447779e-05, + "loss": 3.44, + "step": 7639 + }, + { + "epoch": 4.923126510878324, + "grad_norm": 1.8590244857593123, + "learning_rate": 9.766454375974283e-05, + "loss": 3.489, + "step": 7640 + }, + { + "epoch": 4.923771152296535, + "grad_norm": 1.309771582637967, + "learning_rate": 9.766393480754602e-05, + "loss": 3.3664, + "step": 7641 + }, + { + "epoch": 4.924415793714746, + "grad_norm": 1.5415698188898228, + "learning_rate": 9.766332577788838e-05, + "loss": 3.4325, + "step": 7642 + }, + { + "epoch": 4.925060435132957, + "grad_norm": 1.3174975227095693, + "learning_rate": 9.766271667077088e-05, + "loss": 3.4798, + "step": 7643 + }, + { + "epoch": 4.925705076551169, + "grad_norm": 1.4231059309284344, + "learning_rate": 9.766210748619452e-05, + "loss": 3.2368, + "step": 7644 + }, + { + "epoch": 4.926349717969379, + "grad_norm": 1.3619721361704142, + "learning_rate": 9.766149822416035e-05, + "loss": 3.5917, + "step": 7645 + }, + { + "epoch": 4.9269943593875904, + "grad_norm": 1.5850226280016586, + "learning_rate": 9.76608888846693e-05, + "loss": 3.1728, + "step": 7646 + }, + { + "epoch": 4.927639000805802, + "grad_norm": 1.2644931623324922, + "learning_rate": 9.76602794677224e-05, + "loss": 3.3649, + "step": 7647 + }, + { + "epoch": 4.928283642224013, + "grad_norm": 1.576184604909377, + "learning_rate": 9.765966997332069e-05, + "loss": 3.9221, + "step": 7648 + }, + { + "epoch": 4.928928283642224, + "grad_norm": 1.2566391824812144, + "learning_rate": 9.76590604014651e-05, + "loss": 3.547, + "step": 7649 + }, + { + "epoch": 4.929572925060435, + "grad_norm": 1.4945916411939753, + "learning_rate": 9.765845075215669e-05, + "loss": 3.5263, + "step": 7650 + }, + { + "epoch": 4.930217566478646, + "grad_norm": 1.376291705428167, + "learning_rate": 9.765784102539644e-05, + "loss": 3.4935, + "step": 7651 + }, + { + "epoch": 4.930862207896857, + "grad_norm": 1.3154757480776447, + "learning_rate": 9.765723122118534e-05, + "loss": 3.7593, + "step": 7652 + }, + { + "epoch": 4.931506849315069, + "grad_norm": 1.5616928134403698, + "learning_rate": 9.76566213395244e-05, + "loss": 3.3234, + "step": 7653 + }, + { + "epoch": 4.93215149073328, + "grad_norm": 1.6130989979814092, + "learning_rate": 9.765601138041462e-05, + "loss": 3.5019, + "step": 7654 + }, + { + "epoch": 4.93279613215149, + "grad_norm": 1.8936071739216918, + "learning_rate": 9.7655401343857e-05, + "loss": 3.717, + "step": 7655 + }, + { + "epoch": 4.933440773569702, + "grad_norm": 1.516656578832939, + "learning_rate": 9.765479122985258e-05, + "loss": 3.2003, + "step": 7656 + }, + { + "epoch": 4.934085414987913, + "grad_norm": 1.714107377514972, + "learning_rate": 9.76541810384023e-05, + "loss": 3.4382, + "step": 7657 + }, + { + "epoch": 4.934730056406124, + "grad_norm": 1.5578665748353715, + "learning_rate": 9.765357076950721e-05, + "loss": 2.9338, + "step": 7658 + }, + { + "epoch": 4.935374697824336, + "grad_norm": 1.672282316063632, + "learning_rate": 9.765296042316827e-05, + "loss": 3.5366, + "step": 7659 + }, + { + "epoch": 4.936019339242546, + "grad_norm": 1.4960815184235885, + "learning_rate": 9.765234999938653e-05, + "loss": 3.4611, + "step": 7660 + }, + { + "epoch": 4.936663980660757, + "grad_norm": 1.3042130714256905, + "learning_rate": 9.765173949816296e-05, + "loss": 3.3774, + "step": 7661 + }, + { + "epoch": 4.937308622078969, + "grad_norm": 1.610889366165888, + "learning_rate": 9.765112891949858e-05, + "loss": 3.3783, + "step": 7662 + }, + { + "epoch": 4.93795326349718, + "grad_norm": 1.5298834738711038, + "learning_rate": 9.765051826339436e-05, + "loss": 3.7971, + "step": 7663 + }, + { + "epoch": 4.938597904915391, + "grad_norm": 1.4555144491931824, + "learning_rate": 9.764990752985133e-05, + "loss": 3.3531, + "step": 7664 + }, + { + "epoch": 4.939242546333602, + "grad_norm": 1.7328668592477745, + "learning_rate": 9.76492967188705e-05, + "loss": 3.1403, + "step": 7665 + }, + { + "epoch": 4.939887187751813, + "grad_norm": 1.7084777247020186, + "learning_rate": 9.764868583045288e-05, + "loss": 3.6869, + "step": 7666 + }, + { + "epoch": 4.940531829170024, + "grad_norm": 1.5524754402823828, + "learning_rate": 9.764807486459943e-05, + "loss": 3.1938, + "step": 7667 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 1.5307875516456073, + "learning_rate": 9.764746382131119e-05, + "loss": 3.6316, + "step": 7668 + }, + { + "epoch": 4.941821112006446, + "grad_norm": 1.428501815860682, + "learning_rate": 9.764685270058917e-05, + "loss": 3.4108, + "step": 7669 + }, + { + "epoch": 4.942465753424657, + "grad_norm": 1.4232345296381725, + "learning_rate": 9.764624150243434e-05, + "loss": 3.6258, + "step": 7670 + }, + { + "epoch": 4.9431103948428685, + "grad_norm": 1.439615326204321, + "learning_rate": 9.764563022684771e-05, + "loss": 3.3598, + "step": 7671 + }, + { + "epoch": 4.94375503626108, + "grad_norm": 1.640245335184082, + "learning_rate": 9.76450188738303e-05, + "loss": 3.0985, + "step": 7672 + }, + { + "epoch": 4.944399677679291, + "grad_norm": 1.7466231808903585, + "learning_rate": 9.764440744338313e-05, + "loss": 3.3653, + "step": 7673 + }, + { + "epoch": 4.9450443190975015, + "grad_norm": 1.342493937242462, + "learning_rate": 9.764379593550716e-05, + "loss": 3.6091, + "step": 7674 + }, + { + "epoch": 4.945688960515713, + "grad_norm": 1.5598587303988876, + "learning_rate": 9.764318435020342e-05, + "loss": 3.1288, + "step": 7675 + }, + { + "epoch": 4.946333601933924, + "grad_norm": 1.5555591399653907, + "learning_rate": 9.764257268747292e-05, + "loss": 3.5027, + "step": 7676 + }, + { + "epoch": 4.946978243352135, + "grad_norm": 1.3851264130331975, + "learning_rate": 9.764196094731667e-05, + "loss": 3.5825, + "step": 7677 + }, + { + "epoch": 4.947622884770347, + "grad_norm": 1.5262447091432496, + "learning_rate": 9.764134912973564e-05, + "loss": 3.6509, + "step": 7678 + }, + { + "epoch": 4.948267526188557, + "grad_norm": 1.6750616319797245, + "learning_rate": 9.764073723473084e-05, + "loss": 3.1506, + "step": 7679 + }, + { + "epoch": 4.948912167606768, + "grad_norm": 1.5414056124614983, + "learning_rate": 9.764012526230333e-05, + "loss": 3.5405, + "step": 7680 + }, + { + "epoch": 4.94955680902498, + "grad_norm": 1.6025362901718878, + "learning_rate": 9.763951321245407e-05, + "loss": 3.5942, + "step": 7681 + }, + { + "epoch": 4.950201450443191, + "grad_norm": 1.399312765984018, + "learning_rate": 9.763890108518404e-05, + "loss": 3.2596, + "step": 7682 + }, + { + "epoch": 4.950846091861402, + "grad_norm": 1.654595940757172, + "learning_rate": 9.76382888804943e-05, + "loss": 3.5785, + "step": 7683 + }, + { + "epoch": 4.951490733279613, + "grad_norm": 1.450561965112871, + "learning_rate": 9.763767659838583e-05, + "loss": 3.5893, + "step": 7684 + }, + { + "epoch": 4.952135374697824, + "grad_norm": 1.6587753493565245, + "learning_rate": 9.763706423885963e-05, + "loss": 3.2888, + "step": 7685 + }, + { + "epoch": 4.952780016116035, + "grad_norm": 1.4848226474334185, + "learning_rate": 9.763645180191673e-05, + "loss": 3.4913, + "step": 7686 + }, + { + "epoch": 4.953424657534247, + "grad_norm": 2.1077693526519967, + "learning_rate": 9.76358392875581e-05, + "loss": 3.0132, + "step": 7687 + }, + { + "epoch": 4.954069298952458, + "grad_norm": 1.6097300731418027, + "learning_rate": 9.763522669578478e-05, + "loss": 3.6075, + "step": 7688 + }, + { + "epoch": 4.954713940370668, + "grad_norm": 2.0817324235928334, + "learning_rate": 9.763461402659775e-05, + "loss": 3.5638, + "step": 7689 + }, + { + "epoch": 4.95535858178888, + "grad_norm": 1.6294216649832032, + "learning_rate": 9.763400127999803e-05, + "loss": 3.2432, + "step": 7690 + }, + { + "epoch": 4.956003223207091, + "grad_norm": 1.7661612470297419, + "learning_rate": 9.763338845598663e-05, + "loss": 3.4841, + "step": 7691 + }, + { + "epoch": 4.956647864625302, + "grad_norm": 1.659112593093933, + "learning_rate": 9.763277555456454e-05, + "loss": 3.3325, + "step": 7692 + }, + { + "epoch": 4.957292506043514, + "grad_norm": 1.8621590131364905, + "learning_rate": 9.76321625757328e-05, + "loss": 3.3565, + "step": 7693 + }, + { + "epoch": 4.957937147461724, + "grad_norm": 1.4050882190574887, + "learning_rate": 9.763154951949237e-05, + "loss": 3.5521, + "step": 7694 + }, + { + "epoch": 4.958581788879935, + "grad_norm": 2.1217320006495597, + "learning_rate": 9.76309363858443e-05, + "loss": 3.0738, + "step": 7695 + }, + { + "epoch": 4.959226430298147, + "grad_norm": 1.3901029570419032, + "learning_rate": 9.763032317478955e-05, + "loss": 3.5278, + "step": 7696 + }, + { + "epoch": 4.959871071716358, + "grad_norm": 1.7795247054304406, + "learning_rate": 9.762970988632917e-05, + "loss": 3.3436, + "step": 7697 + }, + { + "epoch": 4.960515713134569, + "grad_norm": 1.2984396067608954, + "learning_rate": 9.762909652046416e-05, + "loss": 3.4596, + "step": 7698 + }, + { + "epoch": 4.96116035455278, + "grad_norm": 1.4403678718543318, + "learning_rate": 9.762848307719552e-05, + "loss": 3.0804, + "step": 7699 + }, + { + "epoch": 4.961804995970991, + "grad_norm": 1.3111009616614653, + "learning_rate": 9.762786955652426e-05, + "loss": 3.0904, + "step": 7700 + }, + { + "epoch": 4.961804995970991, + "eval_loss": 4.083297252655029, + "eval_runtime": 2.9731, + "eval_samples_per_second": 33.635, + "eval_steps_per_second": 4.373, + "step": 7700 + }, + { + "epoch": 4.962449637389202, + "grad_norm": 1.3327912082078848, + "learning_rate": 9.762725595845135e-05, + "loss": 3.2833, + "step": 7701 + }, + { + "epoch": 4.9630942788074135, + "grad_norm": 1.7850698587389078, + "learning_rate": 9.762664228297786e-05, + "loss": 3.4342, + "step": 7702 + }, + { + "epoch": 4.963738920225625, + "grad_norm": 1.6254056015474485, + "learning_rate": 9.762602853010477e-05, + "loss": 3.5962, + "step": 7703 + }, + { + "epoch": 4.964383561643835, + "grad_norm": 1.5797229824644896, + "learning_rate": 9.762541469983308e-05, + "loss": 3.7308, + "step": 7704 + }, + { + "epoch": 4.9650282030620465, + "grad_norm": 1.295138153430926, + "learning_rate": 9.762480079216383e-05, + "loss": 3.484, + "step": 7705 + }, + { + "epoch": 4.965672844480258, + "grad_norm": 2.037882618006325, + "learning_rate": 9.762418680709799e-05, + "loss": 3.6354, + "step": 7706 + }, + { + "epoch": 4.966317485898469, + "grad_norm": 1.3890967531896914, + "learning_rate": 9.762357274463657e-05, + "loss": 3.3936, + "step": 7707 + }, + { + "epoch": 4.96696212731668, + "grad_norm": 1.4756732325220432, + "learning_rate": 9.76229586047806e-05, + "loss": 3.5754, + "step": 7708 + }, + { + "epoch": 4.967606768734891, + "grad_norm": 1.6806936899452012, + "learning_rate": 9.762234438753107e-05, + "loss": 3.2795, + "step": 7709 + }, + { + "epoch": 4.968251410153102, + "grad_norm": 1.4067922033521052, + "learning_rate": 9.762173009288902e-05, + "loss": 3.4597, + "step": 7710 + }, + { + "epoch": 4.968896051571313, + "grad_norm": 1.3949282697503507, + "learning_rate": 9.762111572085543e-05, + "loss": 3.3045, + "step": 7711 + }, + { + "epoch": 4.969540692989525, + "grad_norm": 1.4352704438769817, + "learning_rate": 9.762050127143132e-05, + "loss": 3.6004, + "step": 7712 + }, + { + "epoch": 4.970185334407736, + "grad_norm": 1.532831012542807, + "learning_rate": 9.761988674461768e-05, + "loss": 3.0164, + "step": 7713 + }, + { + "epoch": 4.970829975825946, + "grad_norm": 1.5217011422700395, + "learning_rate": 9.761927214041554e-05, + "loss": 3.5322, + "step": 7714 + }, + { + "epoch": 4.971474617244158, + "grad_norm": 1.4044107924279343, + "learning_rate": 9.761865745882592e-05, + "loss": 3.3446, + "step": 7715 + }, + { + "epoch": 4.972119258662369, + "grad_norm": 1.1199095567792365, + "learning_rate": 9.761804269984981e-05, + "loss": 2.9026, + "step": 7716 + }, + { + "epoch": 4.97276390008058, + "grad_norm": 1.7108542220107552, + "learning_rate": 9.761742786348821e-05, + "loss": 3.209, + "step": 7717 + }, + { + "epoch": 4.973408541498792, + "grad_norm": 1.4255709511739463, + "learning_rate": 9.761681294974216e-05, + "loss": 3.7545, + "step": 7718 + }, + { + "epoch": 4.974053182917002, + "grad_norm": 1.349718053422233, + "learning_rate": 9.761619795861265e-05, + "loss": 3.3032, + "step": 7719 + }, + { + "epoch": 4.974697824335213, + "grad_norm": 1.6969322775425462, + "learning_rate": 9.761558289010068e-05, + "loss": 3.2727, + "step": 7720 + }, + { + "epoch": 4.975342465753425, + "grad_norm": 1.3050653362032603, + "learning_rate": 9.761496774420728e-05, + "loss": 3.487, + "step": 7721 + }, + { + "epoch": 4.975987107171636, + "grad_norm": 1.5925119860386527, + "learning_rate": 9.761435252093346e-05, + "loss": 3.8019, + "step": 7722 + }, + { + "epoch": 4.976631748589847, + "grad_norm": 1.4367395300692658, + "learning_rate": 9.761373722028023e-05, + "loss": 3.6101, + "step": 7723 + }, + { + "epoch": 4.977276390008058, + "grad_norm": 1.5450058089747112, + "learning_rate": 9.761312184224859e-05, + "loss": 3.5536, + "step": 7724 + }, + { + "epoch": 4.977921031426269, + "grad_norm": 1.2942013930380958, + "learning_rate": 9.761250638683955e-05, + "loss": 3.4668, + "step": 7725 + }, + { + "epoch": 4.97856567284448, + "grad_norm": 1.635513904696065, + "learning_rate": 9.761189085405414e-05, + "loss": 3.2968, + "step": 7726 + }, + { + "epoch": 4.979210314262692, + "grad_norm": 1.6633402033003573, + "learning_rate": 9.761127524389334e-05, + "loss": 3.3309, + "step": 7727 + }, + { + "epoch": 4.979854955680903, + "grad_norm": 1.9144431929753163, + "learning_rate": 9.761065955635817e-05, + "loss": 3.2567, + "step": 7728 + }, + { + "epoch": 4.980499597099113, + "grad_norm": 1.8958851663278222, + "learning_rate": 9.76100437914497e-05, + "loss": 3.3389, + "step": 7729 + }, + { + "epoch": 4.981144238517325, + "grad_norm": 1.4657183595650642, + "learning_rate": 9.760942794916883e-05, + "loss": 3.7007, + "step": 7730 + }, + { + "epoch": 4.981788879935536, + "grad_norm": 2.2434861299026463, + "learning_rate": 9.760881202951667e-05, + "loss": 3.622, + "step": 7731 + }, + { + "epoch": 4.982433521353747, + "grad_norm": 1.5321309459969008, + "learning_rate": 9.760819603249418e-05, + "loss": 3.1761, + "step": 7732 + }, + { + "epoch": 4.9830781627719585, + "grad_norm": 1.460108036488777, + "learning_rate": 9.760757995810239e-05, + "loss": 3.5548, + "step": 7733 + }, + { + "epoch": 4.983722804190169, + "grad_norm": 1.2878175512762593, + "learning_rate": 9.760696380634231e-05, + "loss": 3.5015, + "step": 7734 + }, + { + "epoch": 4.98436744560838, + "grad_norm": 2.0051034642212024, + "learning_rate": 9.760634757721493e-05, + "loss": 3.4333, + "step": 7735 + }, + { + "epoch": 4.9850120870265915, + "grad_norm": 1.646332093025767, + "learning_rate": 9.760573127072132e-05, + "loss": 3.246, + "step": 7736 + }, + { + "epoch": 4.985656728444803, + "grad_norm": 2.141777795222853, + "learning_rate": 9.760511488686243e-05, + "loss": 3.4231, + "step": 7737 + }, + { + "epoch": 4.986301369863014, + "grad_norm": 1.7939589262471554, + "learning_rate": 9.760449842563929e-05, + "loss": 3.3803, + "step": 7738 + }, + { + "epoch": 4.9869460112812245, + "grad_norm": 1.460788709368353, + "learning_rate": 9.760388188705293e-05, + "loss": 3.6585, + "step": 7739 + }, + { + "epoch": 4.987590652699436, + "grad_norm": 1.727711174606157, + "learning_rate": 9.760326527110435e-05, + "loss": 3.6148, + "step": 7740 + }, + { + "epoch": 4.988235294117647, + "grad_norm": 1.8708379302758622, + "learning_rate": 9.760264857779457e-05, + "loss": 3.5262, + "step": 7741 + }, + { + "epoch": 4.988879935535858, + "grad_norm": 1.3338357121455064, + "learning_rate": 9.76020318071246e-05, + "loss": 3.4913, + "step": 7742 + }, + { + "epoch": 4.98952457695407, + "grad_norm": 1.787122121956021, + "learning_rate": 9.760141495909542e-05, + "loss": 3.4767, + "step": 7743 + }, + { + "epoch": 4.99016921837228, + "grad_norm": 1.5578912890973178, + "learning_rate": 9.760079803370811e-05, + "loss": 3.2133, + "step": 7744 + }, + { + "epoch": 4.990813859790491, + "grad_norm": 1.5969999407985322, + "learning_rate": 9.760018103096362e-05, + "loss": 3.916, + "step": 7745 + }, + { + "epoch": 4.991458501208703, + "grad_norm": 1.3970652871300782, + "learning_rate": 9.759956395086299e-05, + "loss": 4.0031, + "step": 7746 + }, + { + "epoch": 4.992103142626914, + "grad_norm": 1.6532911958066483, + "learning_rate": 9.759894679340725e-05, + "loss": 3.4042, + "step": 7747 + }, + { + "epoch": 4.992747784045125, + "grad_norm": 1.7304431275711463, + "learning_rate": 9.75983295585974e-05, + "loss": 3.5196, + "step": 7748 + }, + { + "epoch": 4.993392425463336, + "grad_norm": 1.7514098380642946, + "learning_rate": 9.759771224643442e-05, + "loss": 3.6246, + "step": 7749 + }, + { + "epoch": 4.994037066881547, + "grad_norm": 1.4842944981878778, + "learning_rate": 9.759709485691938e-05, + "loss": 3.5813, + "step": 7750 + }, + { + "epoch": 4.994681708299758, + "grad_norm": 1.612714051718011, + "learning_rate": 9.759647739005326e-05, + "loss": 3.7339, + "step": 7751 + }, + { + "epoch": 4.99532634971797, + "grad_norm": 1.536617901365441, + "learning_rate": 9.759585984583709e-05, + "loss": 3.4061, + "step": 7752 + }, + { + "epoch": 4.995970991136181, + "grad_norm": 1.655757985111099, + "learning_rate": 9.759524222427187e-05, + "loss": 3.5339, + "step": 7753 + }, + { + "epoch": 4.996615632554391, + "grad_norm": 1.583974247357161, + "learning_rate": 9.759462452535861e-05, + "loss": 3.3663, + "step": 7754 + }, + { + "epoch": 4.997260273972603, + "grad_norm": 1.7087656677101428, + "learning_rate": 9.759400674909834e-05, + "loss": 3.2665, + "step": 7755 + }, + { + "epoch": 4.997904915390814, + "grad_norm": 1.5837103441320408, + "learning_rate": 9.759338889549208e-05, + "loss": 3.6449, + "step": 7756 + }, + { + "epoch": 4.998549556809025, + "grad_norm": 1.3721519754366123, + "learning_rate": 9.759277096454082e-05, + "loss": 3.3779, + "step": 7757 + }, + { + "epoch": 4.9991941982272365, + "grad_norm": 1.7050895335363834, + "learning_rate": 9.759215295624561e-05, + "loss": 3.1655, + "step": 7758 + }, + { + "epoch": 4.999838839645447, + "grad_norm": 1.4420755011008366, + "learning_rate": 9.759153487060743e-05, + "loss": 3.127, + "step": 7759 + }, + { + "epoch": 5.0, + "grad_norm": 1.4420755011008366, + "learning_rate": 9.759091670762732e-05, + "loss": 0.8053, + "step": 7760 + }, + { + "epoch": 5.000644641418211, + "grad_norm": 2.171347424351551, + "learning_rate": 9.759029846730628e-05, + "loss": 2.8912, + "step": 7761 + }, + { + "epoch": 5.001289282836423, + "grad_norm": 1.8927157454232322, + "learning_rate": 9.758968014964533e-05, + "loss": 2.934, + "step": 7762 + }, + { + "epoch": 5.001933924254633, + "grad_norm": 2.191696867718596, + "learning_rate": 9.758906175464548e-05, + "loss": 3.2875, + "step": 7763 + }, + { + "epoch": 5.002578565672844, + "grad_norm": 2.1802969365860156, + "learning_rate": 9.758844328230775e-05, + "loss": 2.6133, + "step": 7764 + }, + { + "epoch": 5.003223207091056, + "grad_norm": 1.9311073611611826, + "learning_rate": 9.758782473263318e-05, + "loss": 2.9634, + "step": 7765 + }, + { + "epoch": 5.003867848509267, + "grad_norm": 1.9584788678979905, + "learning_rate": 9.758720610562273e-05, + "loss": 2.8626, + "step": 7766 + }, + { + "epoch": 5.004512489927478, + "grad_norm": 1.8965639716330878, + "learning_rate": 9.758658740127748e-05, + "loss": 2.8127, + "step": 7767 + }, + { + "epoch": 5.005157131345689, + "grad_norm": 1.7244380408612132, + "learning_rate": 9.75859686195984e-05, + "loss": 2.7494, + "step": 7768 + }, + { + "epoch": 5.0058017727639, + "grad_norm": 1.6955461767077133, + "learning_rate": 9.758534976058652e-05, + "loss": 2.9813, + "step": 7769 + }, + { + "epoch": 5.006446414182111, + "grad_norm": 1.9830313270715287, + "learning_rate": 9.758473082424287e-05, + "loss": 2.4146, + "step": 7770 + }, + { + "epoch": 5.0070910556003225, + "grad_norm": 1.9370739679467217, + "learning_rate": 9.758411181056845e-05, + "loss": 2.7004, + "step": 7771 + }, + { + "epoch": 5.007735697018534, + "grad_norm": 1.9508132138898067, + "learning_rate": 9.758349271956428e-05, + "loss": 2.963, + "step": 7772 + }, + { + "epoch": 5.008380338436744, + "grad_norm": 1.7799400788025284, + "learning_rate": 9.758287355123135e-05, + "loss": 2.8002, + "step": 7773 + }, + { + "epoch": 5.0090249798549555, + "grad_norm": 1.6727078750374214, + "learning_rate": 9.758225430557076e-05, + "loss": 2.5232, + "step": 7774 + }, + { + "epoch": 5.009669621273167, + "grad_norm": 1.7535028430644222, + "learning_rate": 9.758163498258343e-05, + "loss": 2.8254, + "step": 7775 + }, + { + "epoch": 5.010314262691378, + "grad_norm": 1.711000896101296, + "learning_rate": 9.758101558227043e-05, + "loss": 2.8689, + "step": 7776 + }, + { + "epoch": 5.010958904109589, + "grad_norm": 1.7274758712038867, + "learning_rate": 9.758039610463276e-05, + "loss": 2.8766, + "step": 7777 + }, + { + "epoch": 5.0116035455278, + "grad_norm": 1.6805040247409806, + "learning_rate": 9.757977654967146e-05, + "loss": 2.4379, + "step": 7778 + }, + { + "epoch": 5.012248186946011, + "grad_norm": 1.6064126354644743, + "learning_rate": 9.757915691738753e-05, + "loss": 2.7499, + "step": 7779 + }, + { + "epoch": 5.0128928283642225, + "grad_norm": 1.6987116838803509, + "learning_rate": 9.757853720778197e-05, + "loss": 2.7091, + "step": 7780 + }, + { + "epoch": 5.013537469782434, + "grad_norm": 1.6693600295428317, + "learning_rate": 9.757791742085581e-05, + "loss": 2.7493, + "step": 7781 + }, + { + "epoch": 5.014182111200645, + "grad_norm": 1.828976898518578, + "learning_rate": 9.757729755661012e-05, + "loss": 2.9719, + "step": 7782 + }, + { + "epoch": 5.0148267526188555, + "grad_norm": 1.5945494481110045, + "learning_rate": 9.757667761504584e-05, + "loss": 2.8065, + "step": 7783 + }, + { + "epoch": 5.015471394037067, + "grad_norm": 1.5467363935575404, + "learning_rate": 9.757605759616403e-05, + "loss": 2.8434, + "step": 7784 + }, + { + "epoch": 5.016116035455278, + "grad_norm": 1.6216743197336996, + "learning_rate": 9.757543749996569e-05, + "loss": 2.6936, + "step": 7785 + }, + { + "epoch": 5.016760676873489, + "grad_norm": 1.5022707188853306, + "learning_rate": 9.757481732645184e-05, + "loss": 2.5619, + "step": 7786 + }, + { + "epoch": 5.017405318291701, + "grad_norm": 1.7735717067710874, + "learning_rate": 9.757419707562352e-05, + "loss": 2.6095, + "step": 7787 + }, + { + "epoch": 5.018049959709911, + "grad_norm": 1.4702780171236196, + "learning_rate": 9.757357674748175e-05, + "loss": 2.8857, + "step": 7788 + }, + { + "epoch": 5.018694601128122, + "grad_norm": 1.934373939572435, + "learning_rate": 9.757295634202751e-05, + "loss": 2.6172, + "step": 7789 + }, + { + "epoch": 5.019339242546334, + "grad_norm": 1.7287745557767011, + "learning_rate": 9.757233585926184e-05, + "loss": 2.944, + "step": 7790 + }, + { + "epoch": 5.019983883964545, + "grad_norm": 1.7686219643923338, + "learning_rate": 9.757171529918579e-05, + "loss": 2.8582, + "step": 7791 + }, + { + "epoch": 5.020628525382756, + "grad_norm": 1.9268920784512962, + "learning_rate": 9.757109466180034e-05, + "loss": 2.8234, + "step": 7792 + }, + { + "epoch": 5.021273166800967, + "grad_norm": 1.5183149405816836, + "learning_rate": 9.75704739471065e-05, + "loss": 2.704, + "step": 7793 + }, + { + "epoch": 5.021917808219178, + "grad_norm": 1.7137367558058838, + "learning_rate": 9.756985315510532e-05, + "loss": 2.7054, + "step": 7794 + }, + { + "epoch": 5.022562449637389, + "grad_norm": 1.525230796795948, + "learning_rate": 9.75692322857978e-05, + "loss": 2.4748, + "step": 7795 + }, + { + "epoch": 5.023207091055601, + "grad_norm": 1.7942367206333154, + "learning_rate": 9.756861133918499e-05, + "loss": 2.8602, + "step": 7796 + }, + { + "epoch": 5.023851732473811, + "grad_norm": 1.622463925642233, + "learning_rate": 9.756799031526788e-05, + "loss": 2.652, + "step": 7797 + }, + { + "epoch": 5.024496373892022, + "grad_norm": 1.6527004236715706, + "learning_rate": 9.75673692140475e-05, + "loss": 2.5145, + "step": 7798 + }, + { + "epoch": 5.025141015310234, + "grad_norm": 1.651217150368993, + "learning_rate": 9.756674803552486e-05, + "loss": 2.4972, + "step": 7799 + }, + { + "epoch": 5.025785656728445, + "grad_norm": 1.5600370854018992, + "learning_rate": 9.7566126779701e-05, + "loss": 2.898, + "step": 7800 + }, + { + "epoch": 5.025785656728445, + "eval_loss": 4.5614399909973145, + "eval_runtime": 2.967, + "eval_samples_per_second": 33.704, + "eval_steps_per_second": 4.382, + "step": 7800 + }, + { + "epoch": 5.026430298146656, + "grad_norm": 1.6397026735721771, + "learning_rate": 9.756550544657692e-05, + "loss": 2.6171, + "step": 7801 + }, + { + "epoch": 5.027074939564867, + "grad_norm": 1.4955482352618596, + "learning_rate": 9.756488403615366e-05, + "loss": 2.6075, + "step": 7802 + }, + { + "epoch": 5.027719580983078, + "grad_norm": 1.6228608130495925, + "learning_rate": 9.756426254843224e-05, + "loss": 2.5865, + "step": 7803 + }, + { + "epoch": 5.028364222401289, + "grad_norm": 1.749530009923149, + "learning_rate": 9.756364098341365e-05, + "loss": 2.6144, + "step": 7804 + }, + { + "epoch": 5.0290088638195005, + "grad_norm": 1.545031188354635, + "learning_rate": 9.756301934109894e-05, + "loss": 2.7564, + "step": 7805 + }, + { + "epoch": 5.029653505237712, + "grad_norm": 1.730684239202292, + "learning_rate": 9.756239762148912e-05, + "loss": 2.7782, + "step": 7806 + }, + { + "epoch": 5.030298146655922, + "grad_norm": 1.7066452494753725, + "learning_rate": 9.756177582458522e-05, + "loss": 2.9636, + "step": 7807 + }, + { + "epoch": 5.0309427880741335, + "grad_norm": 1.6271074598343, + "learning_rate": 9.756115395038824e-05, + "loss": 3.1625, + "step": 7808 + }, + { + "epoch": 5.031587429492345, + "grad_norm": 5.797658347637937, + "learning_rate": 9.756053199889923e-05, + "loss": 2.7785, + "step": 7809 + }, + { + "epoch": 5.032232070910556, + "grad_norm": 1.7259060882433161, + "learning_rate": 9.755990997011919e-05, + "loss": 2.5996, + "step": 7810 + }, + { + "epoch": 5.032876712328767, + "grad_norm": 1.8448298827663978, + "learning_rate": 9.755928786404916e-05, + "loss": 2.8796, + "step": 7811 + }, + { + "epoch": 5.033521353746978, + "grad_norm": 2.0857493412141297, + "learning_rate": 9.755866568069014e-05, + "loss": 2.4756, + "step": 7812 + }, + { + "epoch": 5.034165995165189, + "grad_norm": 1.6478023951507708, + "learning_rate": 9.755804342004317e-05, + "loss": 2.5629, + "step": 7813 + }, + { + "epoch": 5.0348106365834004, + "grad_norm": 1.7909656304639592, + "learning_rate": 9.755742108210927e-05, + "loss": 2.7285, + "step": 7814 + }, + { + "epoch": 5.035455278001612, + "grad_norm": 1.6655567046743378, + "learning_rate": 9.755679866688944e-05, + "loss": 2.5308, + "step": 7815 + }, + { + "epoch": 5.036099919419823, + "grad_norm": 1.6863121617245285, + "learning_rate": 9.755617617438474e-05, + "loss": 2.6206, + "step": 7816 + }, + { + "epoch": 5.0367445608380335, + "grad_norm": 1.8135814429531922, + "learning_rate": 9.755555360459618e-05, + "loss": 2.7931, + "step": 7817 + }, + { + "epoch": 5.037389202256245, + "grad_norm": 1.5738339897288185, + "learning_rate": 9.755493095752474e-05, + "loss": 2.8061, + "step": 7818 + }, + { + "epoch": 5.038033843674456, + "grad_norm": 1.7176823419621146, + "learning_rate": 9.75543082331715e-05, + "loss": 2.6825, + "step": 7819 + }, + { + "epoch": 5.038678485092667, + "grad_norm": 1.604990462741236, + "learning_rate": 9.755368543153746e-05, + "loss": 2.6772, + "step": 7820 + }, + { + "epoch": 5.039323126510879, + "grad_norm": 1.5933609247020961, + "learning_rate": 9.755306255262364e-05, + "loss": 2.7964, + "step": 7821 + }, + { + "epoch": 5.039967767929089, + "grad_norm": 1.780359300812491, + "learning_rate": 9.755243959643105e-05, + "loss": 2.7149, + "step": 7822 + }, + { + "epoch": 5.0406124093473, + "grad_norm": 1.6877507166255368, + "learning_rate": 9.755181656296075e-05, + "loss": 2.8943, + "step": 7823 + }, + { + "epoch": 5.041257050765512, + "grad_norm": 1.665647770518363, + "learning_rate": 9.755119345221374e-05, + "loss": 2.6339, + "step": 7824 + }, + { + "epoch": 5.041901692183723, + "grad_norm": 1.593325706598919, + "learning_rate": 9.755057026419104e-05, + "loss": 2.9378, + "step": 7825 + }, + { + "epoch": 5.042546333601934, + "grad_norm": 1.8892879063530188, + "learning_rate": 9.754994699889367e-05, + "loss": 2.685, + "step": 7826 + }, + { + "epoch": 5.043190975020145, + "grad_norm": 1.519410417227538, + "learning_rate": 9.754932365632267e-05, + "loss": 2.677, + "step": 7827 + }, + { + "epoch": 5.043835616438356, + "grad_norm": 1.9165787268011663, + "learning_rate": 9.754870023647905e-05, + "loss": 2.7371, + "step": 7828 + }, + { + "epoch": 5.044480257856567, + "grad_norm": 1.664422633193906, + "learning_rate": 9.754807673936385e-05, + "loss": 2.7695, + "step": 7829 + }, + { + "epoch": 5.045124899274779, + "grad_norm": 1.5984434508836098, + "learning_rate": 9.75474531649781e-05, + "loss": 2.6206, + "step": 7830 + }, + { + "epoch": 5.04576954069299, + "grad_norm": 1.6172984584304761, + "learning_rate": 9.754682951332278e-05, + "loss": 2.8104, + "step": 7831 + }, + { + "epoch": 5.0464141821112, + "grad_norm": 1.7595577525599777, + "learning_rate": 9.754620578439895e-05, + "loss": 2.6523, + "step": 7832 + }, + { + "epoch": 5.047058823529412, + "grad_norm": 1.7956005015650198, + "learning_rate": 9.754558197820764e-05, + "loss": 2.9677, + "step": 7833 + }, + { + "epoch": 5.047703464947623, + "grad_norm": 1.789633549781397, + "learning_rate": 9.754495809474984e-05, + "loss": 2.6398, + "step": 7834 + }, + { + "epoch": 5.048348106365834, + "grad_norm": 1.7675771045706994, + "learning_rate": 9.754433413402661e-05, + "loss": 2.54, + "step": 7835 + }, + { + "epoch": 5.0489927477840455, + "grad_norm": 1.7962040626647775, + "learning_rate": 9.754371009603896e-05, + "loss": 2.8464, + "step": 7836 + }, + { + "epoch": 5.049637389202256, + "grad_norm": 1.7622830464066253, + "learning_rate": 9.754308598078792e-05, + "loss": 2.7691, + "step": 7837 + }, + { + "epoch": 5.050282030620467, + "grad_norm": 1.9457037030602522, + "learning_rate": 9.754246178827451e-05, + "loss": 2.9552, + "step": 7838 + }, + { + "epoch": 5.0509266720386785, + "grad_norm": 2.378673944684884, + "learning_rate": 9.754183751849975e-05, + "loss": 2.6966, + "step": 7839 + }, + { + "epoch": 5.05157131345689, + "grad_norm": 2.0253271175642364, + "learning_rate": 9.754121317146467e-05, + "loss": 2.7274, + "step": 7840 + }, + { + "epoch": 5.052215954875101, + "grad_norm": 2.4556856677406578, + "learning_rate": 9.75405887471703e-05, + "loss": 2.6141, + "step": 7841 + }, + { + "epoch": 5.0528605962933115, + "grad_norm": 2.871902060958796, + "learning_rate": 9.753996424561767e-05, + "loss": 2.5347, + "step": 7842 + }, + { + "epoch": 5.053505237711523, + "grad_norm": 1.6635148482749753, + "learning_rate": 9.753933966680779e-05, + "loss": 2.8515, + "step": 7843 + }, + { + "epoch": 5.054149879129734, + "grad_norm": 2.0683618664350556, + "learning_rate": 9.753871501074168e-05, + "loss": 2.8097, + "step": 7844 + }, + { + "epoch": 5.054794520547945, + "grad_norm": 1.5808900003133592, + "learning_rate": 9.753809027742041e-05, + "loss": 2.8422, + "step": 7845 + }, + { + "epoch": 5.055439161966157, + "grad_norm": 1.9278906911876321, + "learning_rate": 9.753746546684494e-05, + "loss": 2.6456, + "step": 7846 + }, + { + "epoch": 5.056083803384367, + "grad_norm": 1.4707364411117745, + "learning_rate": 9.753684057901635e-05, + "loss": 2.6077, + "step": 7847 + }, + { + "epoch": 5.056728444802578, + "grad_norm": 1.8811918234676448, + "learning_rate": 9.753621561393566e-05, + "loss": 2.5464, + "step": 7848 + }, + { + "epoch": 5.05737308622079, + "grad_norm": 1.6110707542654672, + "learning_rate": 9.753559057160388e-05, + "loss": 2.7651, + "step": 7849 + }, + { + "epoch": 5.058017727639001, + "grad_norm": 2.1067443927363523, + "learning_rate": 9.753496545202203e-05, + "loss": 3.0417, + "step": 7850 + }, + { + "epoch": 5.058662369057212, + "grad_norm": 1.727476464874995, + "learning_rate": 9.753434025519116e-05, + "loss": 2.5599, + "step": 7851 + }, + { + "epoch": 5.059307010475423, + "grad_norm": 2.017499579233829, + "learning_rate": 9.753371498111227e-05, + "loss": 3.0169, + "step": 7852 + }, + { + "epoch": 5.059951651893634, + "grad_norm": 1.7062278398922988, + "learning_rate": 9.753308962978641e-05, + "loss": 2.8702, + "step": 7853 + }, + { + "epoch": 5.060596293311845, + "grad_norm": 1.9776062408290687, + "learning_rate": 9.75324642012146e-05, + "loss": 2.6652, + "step": 7854 + }, + { + "epoch": 5.061240934730057, + "grad_norm": 1.8424737430864955, + "learning_rate": 9.753183869539787e-05, + "loss": 2.8436, + "step": 7855 + }, + { + "epoch": 5.061885576148268, + "grad_norm": 2.121437153331136, + "learning_rate": 9.753121311233724e-05, + "loss": 2.5532, + "step": 7856 + }, + { + "epoch": 5.062530217566478, + "grad_norm": 1.4584825093790335, + "learning_rate": 9.753058745203374e-05, + "loss": 2.5845, + "step": 7857 + }, + { + "epoch": 5.06317485898469, + "grad_norm": 2.0599204562218003, + "learning_rate": 9.752996171448838e-05, + "loss": 2.5811, + "step": 7858 + }, + { + "epoch": 5.063819500402901, + "grad_norm": 1.765826310508067, + "learning_rate": 9.752933589970224e-05, + "loss": 2.4094, + "step": 7859 + }, + { + "epoch": 5.064464141821112, + "grad_norm": 2.2787278313053903, + "learning_rate": 9.752871000767629e-05, + "loss": 2.8877, + "step": 7860 + }, + { + "epoch": 5.065108783239323, + "grad_norm": 1.857537915772621, + "learning_rate": 9.75280840384116e-05, + "loss": 2.6847, + "step": 7861 + }, + { + "epoch": 5.065753424657534, + "grad_norm": 1.7774783065317423, + "learning_rate": 9.752745799190916e-05, + "loss": 2.5338, + "step": 7862 + }, + { + "epoch": 5.066398066075745, + "grad_norm": 1.8356539096998026, + "learning_rate": 9.752683186817003e-05, + "loss": 2.8872, + "step": 7863 + }, + { + "epoch": 5.067042707493957, + "grad_norm": 1.5371368367573726, + "learning_rate": 9.752620566719523e-05, + "loss": 2.4372, + "step": 7864 + }, + { + "epoch": 5.067687348912168, + "grad_norm": 1.6799710711626636, + "learning_rate": 9.752557938898575e-05, + "loss": 2.6801, + "step": 7865 + }, + { + "epoch": 5.068331990330378, + "grad_norm": 1.6510670202511613, + "learning_rate": 9.75249530335427e-05, + "loss": 2.9475, + "step": 7866 + }, + { + "epoch": 5.06897663174859, + "grad_norm": 1.6811838154287284, + "learning_rate": 9.752432660086703e-05, + "loss": 2.8584, + "step": 7867 + }, + { + "epoch": 5.069621273166801, + "grad_norm": 1.6556236583890693, + "learning_rate": 9.752370009095982e-05, + "loss": 2.8392, + "step": 7868 + }, + { + "epoch": 5.070265914585012, + "grad_norm": 1.46060014079966, + "learning_rate": 9.752307350382207e-05, + "loss": 2.8397, + "step": 7869 + }, + { + "epoch": 5.0709105560032235, + "grad_norm": 1.5010794126410902, + "learning_rate": 9.752244683945482e-05, + "loss": 2.7743, + "step": 7870 + }, + { + "epoch": 5.071555197421434, + "grad_norm": 1.4134245945805233, + "learning_rate": 9.75218200978591e-05, + "loss": 2.9283, + "step": 7871 + }, + { + "epoch": 5.072199838839645, + "grad_norm": 1.7641466133160562, + "learning_rate": 9.752119327903594e-05, + "loss": 2.8534, + "step": 7872 + }, + { + "epoch": 5.0728444802578565, + "grad_norm": 1.7332791915750998, + "learning_rate": 9.752056638298637e-05, + "loss": 2.7176, + "step": 7873 + }, + { + "epoch": 5.073489121676068, + "grad_norm": 1.5190045094296887, + "learning_rate": 9.75199394097114e-05, + "loss": 2.7495, + "step": 7874 + }, + { + "epoch": 5.074133763094279, + "grad_norm": 1.9994600785365366, + "learning_rate": 9.75193123592121e-05, + "loss": 2.9287, + "step": 7875 + }, + { + "epoch": 5.0747784045124895, + "grad_norm": 1.674425562127454, + "learning_rate": 9.751868523148945e-05, + "loss": 2.7673, + "step": 7876 + }, + { + "epoch": 5.075423045930701, + "grad_norm": 2.2676885647440783, + "learning_rate": 9.751805802654452e-05, + "loss": 3.0894, + "step": 7877 + }, + { + "epoch": 5.076067687348912, + "grad_norm": 1.6545158550021846, + "learning_rate": 9.751743074437834e-05, + "loss": 2.9478, + "step": 7878 + }, + { + "epoch": 5.076712328767123, + "grad_norm": 2.0010965160071357, + "learning_rate": 9.751680338499191e-05, + "loss": 2.2948, + "step": 7879 + }, + { + "epoch": 5.077356970185335, + "grad_norm": 1.6912797723280562, + "learning_rate": 9.751617594838627e-05, + "loss": 2.5768, + "step": 7880 + }, + { + "epoch": 5.078001611603545, + "grad_norm": 2.0907864471446573, + "learning_rate": 9.751554843456247e-05, + "loss": 2.8644, + "step": 7881 + }, + { + "epoch": 5.078646253021756, + "grad_norm": 1.7980841067880242, + "learning_rate": 9.751492084352151e-05, + "loss": 2.8203, + "step": 7882 + }, + { + "epoch": 5.079290894439968, + "grad_norm": 1.655985196313232, + "learning_rate": 9.751429317526448e-05, + "loss": 2.5294, + "step": 7883 + }, + { + "epoch": 5.079935535858179, + "grad_norm": 1.7145488957784798, + "learning_rate": 9.751366542979232e-05, + "loss": 2.6252, + "step": 7884 + }, + { + "epoch": 5.08058017727639, + "grad_norm": 1.7439238288704255, + "learning_rate": 9.751303760710615e-05, + "loss": 3.0297, + "step": 7885 + }, + { + "epoch": 5.081224818694601, + "grad_norm": 1.6391291350432202, + "learning_rate": 9.751240970720693e-05, + "loss": 2.9324, + "step": 7886 + }, + { + "epoch": 5.081869460112812, + "grad_norm": 1.9737196107007924, + "learning_rate": 9.751178173009574e-05, + "loss": 2.5399, + "step": 7887 + }, + { + "epoch": 5.082514101531023, + "grad_norm": 1.6435330910340156, + "learning_rate": 9.751115367577359e-05, + "loss": 2.8336, + "step": 7888 + }, + { + "epoch": 5.083158742949235, + "grad_norm": 1.6821679694774805, + "learning_rate": 9.751052554424153e-05, + "loss": 2.7261, + "step": 7889 + }, + { + "epoch": 5.083803384367446, + "grad_norm": 1.8094201828537522, + "learning_rate": 9.750989733550056e-05, + "loss": 2.7658, + "step": 7890 + }, + { + "epoch": 5.084448025785656, + "grad_norm": 2.0029932726102926, + "learning_rate": 9.750926904955173e-05, + "loss": 2.6539, + "step": 7891 + }, + { + "epoch": 5.085092667203868, + "grad_norm": 1.8051920554428547, + "learning_rate": 9.750864068639608e-05, + "loss": 2.76, + "step": 7892 + }, + { + "epoch": 5.085737308622079, + "grad_norm": 1.9409995031973366, + "learning_rate": 9.750801224603462e-05, + "loss": 2.6391, + "step": 7893 + }, + { + "epoch": 5.08638195004029, + "grad_norm": 2.042008521123735, + "learning_rate": 9.750738372846841e-05, + "loss": 2.9019, + "step": 7894 + }, + { + "epoch": 5.087026591458502, + "grad_norm": 1.8018404463323392, + "learning_rate": 9.750675513369846e-05, + "loss": 2.7099, + "step": 7895 + }, + { + "epoch": 5.087671232876712, + "grad_norm": 2.0451029474900047, + "learning_rate": 9.750612646172581e-05, + "loss": 2.9822, + "step": 7896 + }, + { + "epoch": 5.088315874294923, + "grad_norm": 1.6490168564809609, + "learning_rate": 9.75054977125515e-05, + "loss": 2.7812, + "step": 7897 + }, + { + "epoch": 5.088960515713135, + "grad_norm": 1.7765675782835526, + "learning_rate": 9.750486888617654e-05, + "loss": 2.7443, + "step": 7898 + }, + { + "epoch": 5.089605157131346, + "grad_norm": 1.6992231683357941, + "learning_rate": 9.750423998260198e-05, + "loss": 2.8302, + "step": 7899 + }, + { + "epoch": 5.090249798549557, + "grad_norm": 1.8025471474317094, + "learning_rate": 9.750361100182885e-05, + "loss": 3.0506, + "step": 7900 + }, + { + "epoch": 5.090249798549557, + "eval_loss": 4.6372833251953125, + "eval_runtime": 2.9783, + "eval_samples_per_second": 33.576, + "eval_steps_per_second": 4.365, + "step": 7900 + }, + { + "epoch": 5.090894439967768, + "grad_norm": 1.8512500789205226, + "learning_rate": 9.75029819438582e-05, + "loss": 2.5999, + "step": 7901 + }, + { + "epoch": 5.091539081385979, + "grad_norm": 1.7068815835151974, + "learning_rate": 9.750235280869104e-05, + "loss": 2.9495, + "step": 7902 + }, + { + "epoch": 5.09218372280419, + "grad_norm": 1.8255802817488587, + "learning_rate": 9.75017235963284e-05, + "loss": 2.8876, + "step": 7903 + }, + { + "epoch": 5.0928283642224015, + "grad_norm": 1.7990622844914481, + "learning_rate": 9.750109430677134e-05, + "loss": 2.8084, + "step": 7904 + }, + { + "epoch": 5.093473005640613, + "grad_norm": 2.112476616369418, + "learning_rate": 9.750046494002086e-05, + "loss": 2.8842, + "step": 7905 + }, + { + "epoch": 5.094117647058823, + "grad_norm": 1.6123010224065826, + "learning_rate": 9.749983549607801e-05, + "loss": 3.0053, + "step": 7906 + }, + { + "epoch": 5.0947622884770345, + "grad_norm": 1.848677414591614, + "learning_rate": 9.749920597494384e-05, + "loss": 2.7031, + "step": 7907 + }, + { + "epoch": 5.095406929895246, + "grad_norm": 1.86731162934549, + "learning_rate": 9.749857637661936e-05, + "loss": 2.5699, + "step": 7908 + }, + { + "epoch": 5.096051571313457, + "grad_norm": 1.9775998959926133, + "learning_rate": 9.749794670110562e-05, + "loss": 2.7244, + "step": 7909 + }, + { + "epoch": 5.096696212731668, + "grad_norm": 1.6284534971836149, + "learning_rate": 9.749731694840363e-05, + "loss": 2.7052, + "step": 7910 + }, + { + "epoch": 5.097340854149879, + "grad_norm": 1.748661563039541, + "learning_rate": 9.749668711851445e-05, + "loss": 2.6225, + "step": 7911 + }, + { + "epoch": 5.09798549556809, + "grad_norm": 1.6961535485128754, + "learning_rate": 9.749605721143911e-05, + "loss": 2.7158, + "step": 7912 + }, + { + "epoch": 5.098630136986301, + "grad_norm": 1.698180339375473, + "learning_rate": 9.749542722717864e-05, + "loss": 2.5637, + "step": 7913 + }, + { + "epoch": 5.099274778404513, + "grad_norm": 1.5828100546610597, + "learning_rate": 9.749479716573407e-05, + "loss": 2.8616, + "step": 7914 + }, + { + "epoch": 5.099919419822724, + "grad_norm": 1.8422855360261428, + "learning_rate": 9.749416702710643e-05, + "loss": 2.7166, + "step": 7915 + }, + { + "epoch": 5.100564061240934, + "grad_norm": 1.7871511121367574, + "learning_rate": 9.749353681129677e-05, + "loss": 2.6295, + "step": 7916 + }, + { + "epoch": 5.101208702659146, + "grad_norm": 1.8474887734075698, + "learning_rate": 9.749290651830612e-05, + "loss": 2.6965, + "step": 7917 + }, + { + "epoch": 5.101853344077357, + "grad_norm": 1.5756595749718267, + "learning_rate": 9.749227614813553e-05, + "loss": 2.8005, + "step": 7918 + }, + { + "epoch": 5.102497985495568, + "grad_norm": 2.155972130331878, + "learning_rate": 9.7491645700786e-05, + "loss": 2.9391, + "step": 7919 + }, + { + "epoch": 5.10314262691378, + "grad_norm": 1.907529151127008, + "learning_rate": 9.74910151762586e-05, + "loss": 2.8098, + "step": 7920 + }, + { + "epoch": 5.10378726833199, + "grad_norm": 2.0372714985855307, + "learning_rate": 9.749038457455433e-05, + "loss": 2.5674, + "step": 7921 + }, + { + "epoch": 5.104431909750201, + "grad_norm": 1.888322969096294, + "learning_rate": 9.748975389567427e-05, + "loss": 2.6875, + "step": 7922 + }, + { + "epoch": 5.105076551168413, + "grad_norm": 2.1231495535817597, + "learning_rate": 9.74891231396194e-05, + "loss": 2.6638, + "step": 7923 + }, + { + "epoch": 5.105721192586624, + "grad_norm": 1.9423342105231913, + "learning_rate": 9.748849230639082e-05, + "loss": 2.8579, + "step": 7924 + }, + { + "epoch": 5.106365834004835, + "grad_norm": 2.1358406495500875, + "learning_rate": 9.748786139598953e-05, + "loss": 2.8946, + "step": 7925 + }, + { + "epoch": 5.107010475423046, + "grad_norm": 1.8716948226734724, + "learning_rate": 9.748723040841656e-05, + "loss": 2.7783, + "step": 7926 + }, + { + "epoch": 5.107655116841257, + "grad_norm": 2.3050323938127337, + "learning_rate": 9.748659934367297e-05, + "loss": 2.7199, + "step": 7927 + }, + { + "epoch": 5.108299758259468, + "grad_norm": 1.914339468008234, + "learning_rate": 9.748596820175977e-05, + "loss": 2.7213, + "step": 7928 + }, + { + "epoch": 5.1089443996776795, + "grad_norm": 2.3937630746953382, + "learning_rate": 9.748533698267803e-05, + "loss": 2.7815, + "step": 7929 + }, + { + "epoch": 5.109589041095891, + "grad_norm": 2.197725825393771, + "learning_rate": 9.748470568642873e-05, + "loss": 2.7466, + "step": 7930 + }, + { + "epoch": 5.110233682514101, + "grad_norm": 1.920511267096566, + "learning_rate": 9.748407431301297e-05, + "loss": 2.6531, + "step": 7931 + }, + { + "epoch": 5.110878323932313, + "grad_norm": 1.9794456362920512, + "learning_rate": 9.748344286243175e-05, + "loss": 2.8275, + "step": 7932 + }, + { + "epoch": 5.111522965350524, + "grad_norm": 1.8674963374628109, + "learning_rate": 9.748281133468612e-05, + "loss": 2.8557, + "step": 7933 + }, + { + "epoch": 5.112167606768735, + "grad_norm": 2.1570207781462862, + "learning_rate": 9.748217972977712e-05, + "loss": 2.8783, + "step": 7934 + }, + { + "epoch": 5.1128122481869465, + "grad_norm": 1.9160920806967958, + "learning_rate": 9.748154804770578e-05, + "loss": 2.6666, + "step": 7935 + }, + { + "epoch": 5.113456889605157, + "grad_norm": 1.765095300773666, + "learning_rate": 9.748091628847314e-05, + "loss": 3.0577, + "step": 7936 + }, + { + "epoch": 5.114101531023368, + "grad_norm": 1.9367472337042224, + "learning_rate": 9.748028445208023e-05, + "loss": 2.5625, + "step": 7937 + }, + { + "epoch": 5.1147461724415795, + "grad_norm": 1.8641095088357889, + "learning_rate": 9.747965253852809e-05, + "loss": 2.938, + "step": 7938 + }, + { + "epoch": 5.115390813859791, + "grad_norm": 1.7546221069192527, + "learning_rate": 9.747902054781777e-05, + "loss": 2.9484, + "step": 7939 + }, + { + "epoch": 5.116035455278001, + "grad_norm": 1.8180962696332075, + "learning_rate": 9.74783884799503e-05, + "loss": 3.1285, + "step": 7940 + }, + { + "epoch": 5.1166800966962125, + "grad_norm": 1.762898778740835, + "learning_rate": 9.747775633492672e-05, + "loss": 2.6839, + "step": 7941 + }, + { + "epoch": 5.117324738114424, + "grad_norm": 1.8243517544519, + "learning_rate": 9.747712411274808e-05, + "loss": 2.7201, + "step": 7942 + }, + { + "epoch": 5.117969379532635, + "grad_norm": 1.8391969898360916, + "learning_rate": 9.747649181341538e-05, + "loss": 2.8496, + "step": 7943 + }, + { + "epoch": 5.118614020950846, + "grad_norm": 1.6922159436085766, + "learning_rate": 9.747585943692969e-05, + "loss": 3.0392, + "step": 7944 + }, + { + "epoch": 5.119258662369057, + "grad_norm": 2.1179194289279266, + "learning_rate": 9.747522698329205e-05, + "loss": 2.6416, + "step": 7945 + }, + { + "epoch": 5.119903303787268, + "grad_norm": 1.671328591411771, + "learning_rate": 9.74745944525035e-05, + "loss": 2.6567, + "step": 7946 + }, + { + "epoch": 5.120547945205479, + "grad_norm": 2.1885660794905246, + "learning_rate": 9.747396184456503e-05, + "loss": 3.1407, + "step": 7947 + }, + { + "epoch": 5.121192586623691, + "grad_norm": 1.759277979664218, + "learning_rate": 9.747332915947777e-05, + "loss": 3.2502, + "step": 7948 + }, + { + "epoch": 5.121837228041902, + "grad_norm": 2.148391549055482, + "learning_rate": 9.747269639724267e-05, + "loss": 3.0483, + "step": 7949 + }, + { + "epoch": 5.122481869460112, + "grad_norm": 1.6762793003387635, + "learning_rate": 9.747206355786082e-05, + "loss": 2.6492, + "step": 7950 + }, + { + "epoch": 5.123126510878324, + "grad_norm": 2.0382634677327216, + "learning_rate": 9.747143064133325e-05, + "loss": 2.7162, + "step": 7951 + }, + { + "epoch": 5.123771152296535, + "grad_norm": 1.7104176379494258, + "learning_rate": 9.747079764766099e-05, + "loss": 2.7924, + "step": 7952 + }, + { + "epoch": 5.124415793714746, + "grad_norm": 2.1150163660102037, + "learning_rate": 9.747016457684509e-05, + "loss": 2.882, + "step": 7953 + }, + { + "epoch": 5.125060435132958, + "grad_norm": 1.7633429466059076, + "learning_rate": 9.746953142888657e-05, + "loss": 2.729, + "step": 7954 + }, + { + "epoch": 5.125705076551168, + "grad_norm": 1.822318922928923, + "learning_rate": 9.74688982037865e-05, + "loss": 2.6259, + "step": 7955 + }, + { + "epoch": 5.126349717969379, + "grad_norm": 1.5848223217903494, + "learning_rate": 9.746826490154591e-05, + "loss": 2.605, + "step": 7956 + }, + { + "epoch": 5.126994359387591, + "grad_norm": 1.8987128587917694, + "learning_rate": 9.746763152216583e-05, + "loss": 2.7393, + "step": 7957 + }, + { + "epoch": 5.127639000805802, + "grad_norm": 1.8773978262467224, + "learning_rate": 9.74669980656473e-05, + "loss": 2.8521, + "step": 7958 + }, + { + "epoch": 5.128283642224013, + "grad_norm": 1.929570727808158, + "learning_rate": 9.746636453199138e-05, + "loss": 2.5966, + "step": 7959 + }, + { + "epoch": 5.128928283642224, + "grad_norm": 1.9215577231402963, + "learning_rate": 9.746573092119908e-05, + "loss": 3.1357, + "step": 7960 + }, + { + "epoch": 5.129572925060435, + "grad_norm": 2.1651672429255537, + "learning_rate": 9.746509723327147e-05, + "loss": 2.6292, + "step": 7961 + }, + { + "epoch": 5.130217566478646, + "grad_norm": 2.266159393717578, + "learning_rate": 9.746446346820956e-05, + "loss": 2.476, + "step": 7962 + }, + { + "epoch": 5.1308622078968575, + "grad_norm": 1.7647470846048647, + "learning_rate": 9.746382962601442e-05, + "loss": 2.6158, + "step": 7963 + }, + { + "epoch": 5.131506849315069, + "grad_norm": 2.2685489549016937, + "learning_rate": 9.746319570668709e-05, + "loss": 2.9895, + "step": 7964 + }, + { + "epoch": 5.132151490733279, + "grad_norm": 1.7363131452859877, + "learning_rate": 9.746256171022859e-05, + "loss": 2.7065, + "step": 7965 + }, + { + "epoch": 5.1327961321514906, + "grad_norm": 2.165704268497245, + "learning_rate": 9.746192763663995e-05, + "loss": 2.7787, + "step": 7966 + }, + { + "epoch": 5.133440773569702, + "grad_norm": 1.7501514462828078, + "learning_rate": 9.746129348592227e-05, + "loss": 2.7556, + "step": 7967 + }, + { + "epoch": 5.134085414987913, + "grad_norm": 2.376272544151617, + "learning_rate": 9.746065925807653e-05, + "loss": 2.6359, + "step": 7968 + }, + { + "epoch": 5.1347300564061245, + "grad_norm": 1.681354373877882, + "learning_rate": 9.746002495310382e-05, + "loss": 2.8671, + "step": 7969 + }, + { + "epoch": 5.135374697824335, + "grad_norm": 2.003223970547526, + "learning_rate": 9.745939057100513e-05, + "loss": 2.8137, + "step": 7970 + }, + { + "epoch": 5.136019339242546, + "grad_norm": 1.873076823480402, + "learning_rate": 9.745875611178156e-05, + "loss": 2.8003, + "step": 7971 + }, + { + "epoch": 5.1366639806607575, + "grad_norm": 2.133665698293191, + "learning_rate": 9.74581215754341e-05, + "loss": 2.7419, + "step": 7972 + }, + { + "epoch": 5.137308622078969, + "grad_norm": 2.0716328781962035, + "learning_rate": 9.745748696196382e-05, + "loss": 2.7977, + "step": 7973 + }, + { + "epoch": 5.13795326349718, + "grad_norm": 1.9588647748816626, + "learning_rate": 9.745685227137177e-05, + "loss": 2.9964, + "step": 7974 + }, + { + "epoch": 5.1385979049153905, + "grad_norm": 1.8343698260737193, + "learning_rate": 9.745621750365895e-05, + "loss": 2.5236, + "step": 7975 + }, + { + "epoch": 5.139242546333602, + "grad_norm": 1.7924670998885013, + "learning_rate": 9.745558265882645e-05, + "loss": 3.0507, + "step": 7976 + }, + { + "epoch": 5.139887187751813, + "grad_norm": 1.8245177865312305, + "learning_rate": 9.745494773687529e-05, + "loss": 2.5739, + "step": 7977 + }, + { + "epoch": 5.140531829170024, + "grad_norm": 1.8295991134334944, + "learning_rate": 9.745431273780654e-05, + "loss": 2.9758, + "step": 7978 + }, + { + "epoch": 5.141176470588236, + "grad_norm": 1.9709505714075628, + "learning_rate": 9.745367766162118e-05, + "loss": 2.8749, + "step": 7979 + }, + { + "epoch": 5.141821112006446, + "grad_norm": 1.8128131991740923, + "learning_rate": 9.745304250832033e-05, + "loss": 2.7912, + "step": 7980 + }, + { + "epoch": 5.142465753424657, + "grad_norm": 1.724252167808803, + "learning_rate": 9.745240727790497e-05, + "loss": 2.7356, + "step": 7981 + }, + { + "epoch": 5.143110394842869, + "grad_norm": 2.0014662767268256, + "learning_rate": 9.745177197037617e-05, + "loss": 2.766, + "step": 7982 + }, + { + "epoch": 5.14375503626108, + "grad_norm": 1.6754625612705065, + "learning_rate": 9.745113658573498e-05, + "loss": 2.4864, + "step": 7983 + }, + { + "epoch": 5.144399677679291, + "grad_norm": 1.6459599373461489, + "learning_rate": 9.745050112398245e-05, + "loss": 2.9703, + "step": 7984 + }, + { + "epoch": 5.145044319097502, + "grad_norm": 2.0853769862632863, + "learning_rate": 9.74498655851196e-05, + "loss": 2.8172, + "step": 7985 + }, + { + "epoch": 5.145688960515713, + "grad_norm": 1.8464162592549611, + "learning_rate": 9.744922996914747e-05, + "loss": 2.7935, + "step": 7986 + }, + { + "epoch": 5.146333601933924, + "grad_norm": 1.8233394044761693, + "learning_rate": 9.744859427606712e-05, + "loss": 3.0541, + "step": 7987 + }, + { + "epoch": 5.146978243352136, + "grad_norm": 1.6535812592832264, + "learning_rate": 9.744795850587961e-05, + "loss": 2.842, + "step": 7988 + }, + { + "epoch": 5.147622884770347, + "grad_norm": 1.6046500915594162, + "learning_rate": 9.744732265858596e-05, + "loss": 2.8427, + "step": 7989 + }, + { + "epoch": 5.148267526188557, + "grad_norm": 1.6898209885627895, + "learning_rate": 9.744668673418722e-05, + "loss": 2.7196, + "step": 7990 + }, + { + "epoch": 5.148912167606769, + "grad_norm": 1.5954193530680552, + "learning_rate": 9.744605073268443e-05, + "loss": 2.6149, + "step": 7991 + }, + { + "epoch": 5.14955680902498, + "grad_norm": 1.9864280263692478, + "learning_rate": 9.744541465407864e-05, + "loss": 2.9291, + "step": 7992 + }, + { + "epoch": 5.150201450443191, + "grad_norm": 1.6335566373069172, + "learning_rate": 9.74447784983709e-05, + "loss": 2.7714, + "step": 7993 + }, + { + "epoch": 5.1508460918614025, + "grad_norm": 2.0196060505691826, + "learning_rate": 9.744414226556223e-05, + "loss": 2.6711, + "step": 7994 + }, + { + "epoch": 5.151490733279613, + "grad_norm": 1.7630970527052279, + "learning_rate": 9.744350595565371e-05, + "loss": 2.8085, + "step": 7995 + }, + { + "epoch": 5.152135374697824, + "grad_norm": 1.7114384694411113, + "learning_rate": 9.744286956864636e-05, + "loss": 2.9453, + "step": 7996 + }, + { + "epoch": 5.1527800161160355, + "grad_norm": 1.813775452114667, + "learning_rate": 9.744223310454124e-05, + "loss": 2.5656, + "step": 7997 + }, + { + "epoch": 5.153424657534247, + "grad_norm": 1.6823362046528552, + "learning_rate": 9.744159656333938e-05, + "loss": 2.7728, + "step": 7998 + }, + { + "epoch": 5.154069298952457, + "grad_norm": 1.8531668264864172, + "learning_rate": 9.744095994504185e-05, + "loss": 2.6996, + "step": 7999 + }, + { + "epoch": 5.1547139403706685, + "grad_norm": 1.7479517125212714, + "learning_rate": 9.744032324964967e-05, + "loss": 2.9621, + "step": 8000 + }, + { + "epoch": 5.1547139403706685, + "eval_loss": 4.628431797027588, + "eval_runtime": 2.9971, + "eval_samples_per_second": 33.366, + "eval_steps_per_second": 4.338, + "step": 8000 + }, + { + "epoch": 5.15535858178888, + "grad_norm": 1.7349012278458382, + "learning_rate": 9.743968647716389e-05, + "loss": 2.9185, + "step": 8001 + }, + { + "epoch": 5.156003223207091, + "grad_norm": 1.547018286456073, + "learning_rate": 9.743904962758558e-05, + "loss": 2.8913, + "step": 8002 + }, + { + "epoch": 5.1566478646253024, + "grad_norm": 1.6548057274830106, + "learning_rate": 9.743841270091574e-05, + "loss": 2.8496, + "step": 8003 + }, + { + "epoch": 5.157292506043513, + "grad_norm": 1.535664691544257, + "learning_rate": 9.743777569715547e-05, + "loss": 2.9248, + "step": 8004 + }, + { + "epoch": 5.157937147461724, + "grad_norm": 1.768359214777369, + "learning_rate": 9.74371386163058e-05, + "loss": 2.7469, + "step": 8005 + }, + { + "epoch": 5.1585817888799355, + "grad_norm": 1.6427673326759862, + "learning_rate": 9.743650145836773e-05, + "loss": 2.7502, + "step": 8006 + }, + { + "epoch": 5.159226430298147, + "grad_norm": 1.7899859066150552, + "learning_rate": 9.743586422334236e-05, + "loss": 3.1742, + "step": 8007 + }, + { + "epoch": 5.159871071716358, + "grad_norm": 1.5387948313814266, + "learning_rate": 9.743522691123072e-05, + "loss": 2.7872, + "step": 8008 + }, + { + "epoch": 5.1605157131345685, + "grad_norm": 1.8007155612817907, + "learning_rate": 9.743458952203385e-05, + "loss": 3.0044, + "step": 8009 + }, + { + "epoch": 5.16116035455278, + "grad_norm": 1.586247725747852, + "learning_rate": 9.743395205575282e-05, + "loss": 2.6903, + "step": 8010 + }, + { + "epoch": 5.161804995970991, + "grad_norm": 1.6536417452216534, + "learning_rate": 9.743331451238864e-05, + "loss": 2.8217, + "step": 8011 + }, + { + "epoch": 5.162449637389202, + "grad_norm": 1.5276057363145592, + "learning_rate": 9.743267689194238e-05, + "loss": 2.7187, + "step": 8012 + }, + { + "epoch": 5.163094278807414, + "grad_norm": 1.7879105885977338, + "learning_rate": 9.743203919441509e-05, + "loss": 2.944, + "step": 8013 + }, + { + "epoch": 5.163738920225624, + "grad_norm": 1.5668873047906338, + "learning_rate": 9.74314014198078e-05, + "loss": 2.8443, + "step": 8014 + }, + { + "epoch": 5.164383561643835, + "grad_norm": 1.81086186397235, + "learning_rate": 9.74307635681216e-05, + "loss": 3.0699, + "step": 8015 + }, + { + "epoch": 5.165028203062047, + "grad_norm": 1.6187942710474144, + "learning_rate": 9.743012563935747e-05, + "loss": 2.7298, + "step": 8016 + }, + { + "epoch": 5.165672844480258, + "grad_norm": 1.8844751312868833, + "learning_rate": 9.742948763351651e-05, + "loss": 2.6658, + "step": 8017 + }, + { + "epoch": 5.166317485898469, + "grad_norm": 1.4515828148220582, + "learning_rate": 9.742884955059977e-05, + "loss": 2.8084, + "step": 8018 + }, + { + "epoch": 5.16696212731668, + "grad_norm": 2.0772895868891283, + "learning_rate": 9.742821139060825e-05, + "loss": 2.9308, + "step": 8019 + }, + { + "epoch": 5.167606768734891, + "grad_norm": 1.6855390936367238, + "learning_rate": 9.742757315354303e-05, + "loss": 2.9307, + "step": 8020 + }, + { + "epoch": 5.168251410153102, + "grad_norm": 1.6533105990525168, + "learning_rate": 9.742693483940517e-05, + "loss": 2.7002, + "step": 8021 + }, + { + "epoch": 5.168896051571314, + "grad_norm": 1.7833539058727177, + "learning_rate": 9.74262964481957e-05, + "loss": 2.6859, + "step": 8022 + }, + { + "epoch": 5.169540692989525, + "grad_norm": 1.6225055217174684, + "learning_rate": 9.742565797991568e-05, + "loss": 2.8707, + "step": 8023 + }, + { + "epoch": 5.170185334407735, + "grad_norm": 1.7267763994379663, + "learning_rate": 9.742501943456614e-05, + "loss": 2.7958, + "step": 8024 + }, + { + "epoch": 5.170829975825947, + "grad_norm": 1.7644923702162465, + "learning_rate": 9.742438081214815e-05, + "loss": 2.6504, + "step": 8025 + }, + { + "epoch": 5.171474617244158, + "grad_norm": 1.6179548494684368, + "learning_rate": 9.742374211266275e-05, + "loss": 2.6779, + "step": 8026 + }, + { + "epoch": 5.172119258662369, + "grad_norm": 2.0158640556877, + "learning_rate": 9.742310333611097e-05, + "loss": 2.7723, + "step": 8027 + }, + { + "epoch": 5.1727639000805805, + "grad_norm": 1.721667111343044, + "learning_rate": 9.74224644824939e-05, + "loss": 2.875, + "step": 8028 + }, + { + "epoch": 5.173408541498791, + "grad_norm": 2.1823079787169295, + "learning_rate": 9.742182555181254e-05, + "loss": 2.6316, + "step": 8029 + }, + { + "epoch": 5.174053182917002, + "grad_norm": 2.2189209970768995, + "learning_rate": 9.7421186544068e-05, + "loss": 2.3856, + "step": 8030 + }, + { + "epoch": 5.1746978243352135, + "grad_norm": 1.8732002856496464, + "learning_rate": 9.742054745926127e-05, + "loss": 2.4503, + "step": 8031 + }, + { + "epoch": 5.175342465753425, + "grad_norm": 2.2043934945643726, + "learning_rate": 9.741990829739343e-05, + "loss": 3.0425, + "step": 8032 + }, + { + "epoch": 5.175987107171636, + "grad_norm": 1.89916375864722, + "learning_rate": 9.741926905846554e-05, + "loss": 2.9469, + "step": 8033 + }, + { + "epoch": 5.1766317485898465, + "grad_norm": 2.2982917785796277, + "learning_rate": 9.74186297424786e-05, + "loss": 3.1061, + "step": 8034 + }, + { + "epoch": 5.177276390008058, + "grad_norm": 1.9822127174659232, + "learning_rate": 9.741799034943375e-05, + "loss": 2.5965, + "step": 8035 + }, + { + "epoch": 5.177921031426269, + "grad_norm": 2.2986255172844294, + "learning_rate": 9.741735087933194e-05, + "loss": 2.875, + "step": 8036 + }, + { + "epoch": 5.17856567284448, + "grad_norm": 2.165521605658416, + "learning_rate": 9.741671133217427e-05, + "loss": 2.6676, + "step": 8037 + }, + { + "epoch": 5.179210314262692, + "grad_norm": 1.9552591562633053, + "learning_rate": 9.74160717079618e-05, + "loss": 2.7127, + "step": 8038 + }, + { + "epoch": 5.179854955680902, + "grad_norm": 2.0637386171332284, + "learning_rate": 9.741543200669557e-05, + "loss": 2.8285, + "step": 8039 + }, + { + "epoch": 5.1804995970991135, + "grad_norm": 1.794401786034989, + "learning_rate": 9.741479222837661e-05, + "loss": 2.8772, + "step": 8040 + }, + { + "epoch": 5.181144238517325, + "grad_norm": 1.8433598685145938, + "learning_rate": 9.741415237300599e-05, + "loss": 2.4684, + "step": 8041 + }, + { + "epoch": 5.181788879935536, + "grad_norm": 1.7948735882970834, + "learning_rate": 9.741351244058475e-05, + "loss": 3.0244, + "step": 8042 + }, + { + "epoch": 5.182433521353747, + "grad_norm": 1.845728335026638, + "learning_rate": 9.741287243111396e-05, + "loss": 2.8251, + "step": 8043 + }, + { + "epoch": 5.183078162771958, + "grad_norm": 1.7662163192552158, + "learning_rate": 9.741223234459466e-05, + "loss": 2.8635, + "step": 8044 + }, + { + "epoch": 5.183722804190169, + "grad_norm": 2.0980704403474695, + "learning_rate": 9.741159218102791e-05, + "loss": 2.9752, + "step": 8045 + }, + { + "epoch": 5.18436744560838, + "grad_norm": 1.604326576556212, + "learning_rate": 9.741095194041475e-05, + "loss": 2.7794, + "step": 8046 + }, + { + "epoch": 5.185012087026592, + "grad_norm": 1.9830968742177928, + "learning_rate": 9.741031162275622e-05, + "loss": 2.7652, + "step": 8047 + }, + { + "epoch": 5.185656728444803, + "grad_norm": 1.6884423136682878, + "learning_rate": 9.74096712280534e-05, + "loss": 2.7763, + "step": 8048 + }, + { + "epoch": 5.186301369863013, + "grad_norm": 1.8643113661537423, + "learning_rate": 9.740903075630733e-05, + "loss": 2.7791, + "step": 8049 + }, + { + "epoch": 5.186946011281225, + "grad_norm": 1.7895669951191804, + "learning_rate": 9.740839020751905e-05, + "loss": 2.5876, + "step": 8050 + }, + { + "epoch": 5.187590652699436, + "grad_norm": 2.143561654020472, + "learning_rate": 9.740774958168962e-05, + "loss": 2.8909, + "step": 8051 + }, + { + "epoch": 5.188235294117647, + "grad_norm": 1.9130482701208882, + "learning_rate": 9.74071088788201e-05, + "loss": 2.9324, + "step": 8052 + }, + { + "epoch": 5.188879935535859, + "grad_norm": 1.8492515551352273, + "learning_rate": 9.740646809891154e-05, + "loss": 2.8988, + "step": 8053 + }, + { + "epoch": 5.189524576954069, + "grad_norm": 1.7952437909296695, + "learning_rate": 9.740582724196499e-05, + "loss": 2.9723, + "step": 8054 + }, + { + "epoch": 5.19016921837228, + "grad_norm": 1.6106985580721866, + "learning_rate": 9.740518630798149e-05, + "loss": 2.7306, + "step": 8055 + }, + { + "epoch": 5.190813859790492, + "grad_norm": 1.7438093409112256, + "learning_rate": 9.740454529696212e-05, + "loss": 2.5956, + "step": 8056 + }, + { + "epoch": 5.191458501208703, + "grad_norm": 1.6537555505261, + "learning_rate": 9.74039042089079e-05, + "loss": 2.8642, + "step": 8057 + }, + { + "epoch": 5.192103142626914, + "grad_norm": 1.8431817295965358, + "learning_rate": 9.740326304381991e-05, + "loss": 2.7058, + "step": 8058 + }, + { + "epoch": 5.192747784045125, + "grad_norm": 1.5607193178473786, + "learning_rate": 9.740262180169917e-05, + "loss": 2.6669, + "step": 8059 + }, + { + "epoch": 5.193392425463336, + "grad_norm": 1.727310456928799, + "learning_rate": 9.740198048254678e-05, + "loss": 2.8947, + "step": 8060 + }, + { + "epoch": 5.194037066881547, + "grad_norm": 1.8080008212598386, + "learning_rate": 9.740133908636378e-05, + "loss": 2.6354, + "step": 8061 + }, + { + "epoch": 5.1946817082997585, + "grad_norm": 1.6613330310211167, + "learning_rate": 9.740069761315119e-05, + "loss": 3.0566, + "step": 8062 + }, + { + "epoch": 5.19532634971797, + "grad_norm": 1.8006396997316174, + "learning_rate": 9.740005606291009e-05, + "loss": 3.1825, + "step": 8063 + }, + { + "epoch": 5.19597099113618, + "grad_norm": 1.5598768813630466, + "learning_rate": 9.739941443564155e-05, + "loss": 2.8702, + "step": 8064 + }, + { + "epoch": 5.1966156325543915, + "grad_norm": 1.7770212789546878, + "learning_rate": 9.739877273134657e-05, + "loss": 2.6965, + "step": 8065 + }, + { + "epoch": 5.197260273972603, + "grad_norm": 1.6449019652270087, + "learning_rate": 9.739813095002625e-05, + "loss": 2.7135, + "step": 8066 + }, + { + "epoch": 5.197904915390814, + "grad_norm": 1.5861125761860286, + "learning_rate": 9.739748909168164e-05, + "loss": 3.1678, + "step": 8067 + }, + { + "epoch": 5.198549556809025, + "grad_norm": 2.069030841812252, + "learning_rate": 9.739684715631378e-05, + "loss": 2.9529, + "step": 8068 + }, + { + "epoch": 5.199194198227236, + "grad_norm": 1.8766121379518463, + "learning_rate": 9.739620514392373e-05, + "loss": 2.8101, + "step": 8069 + }, + { + "epoch": 5.199838839645447, + "grad_norm": 1.9905631063278109, + "learning_rate": 9.739556305451253e-05, + "loss": 3.0962, + "step": 8070 + }, + { + "epoch": 5.200483481063658, + "grad_norm": 1.8938080016993064, + "learning_rate": 9.739492088808127e-05, + "loss": 2.8294, + "step": 8071 + }, + { + "epoch": 5.20112812248187, + "grad_norm": 1.72597699754953, + "learning_rate": 9.739427864463097e-05, + "loss": 3.0729, + "step": 8072 + }, + { + "epoch": 5.201772763900081, + "grad_norm": 2.1036772807944932, + "learning_rate": 9.739363632416272e-05, + "loss": 2.7469, + "step": 8073 + }, + { + "epoch": 5.202417405318291, + "grad_norm": 1.6023881464918723, + "learning_rate": 9.739299392667753e-05, + "loss": 2.5036, + "step": 8074 + }, + { + "epoch": 5.203062046736503, + "grad_norm": 2.0523989012224018, + "learning_rate": 9.739235145217648e-05, + "loss": 3.0606, + "step": 8075 + }, + { + "epoch": 5.203706688154714, + "grad_norm": 1.5897363819550518, + "learning_rate": 9.739170890066064e-05, + "loss": 2.7998, + "step": 8076 + }, + { + "epoch": 5.204351329572925, + "grad_norm": 1.958165847641136, + "learning_rate": 9.739106627213103e-05, + "loss": 2.8304, + "step": 8077 + }, + { + "epoch": 5.204995970991136, + "grad_norm": 1.8660766216063498, + "learning_rate": 9.739042356658873e-05, + "loss": 2.6756, + "step": 8078 + }, + { + "epoch": 5.205640612409347, + "grad_norm": 1.8026465279356167, + "learning_rate": 9.73897807840348e-05, + "loss": 3.2377, + "step": 8079 + }, + { + "epoch": 5.206285253827558, + "grad_norm": 2.081611521298184, + "learning_rate": 9.738913792447026e-05, + "loss": 2.8732, + "step": 8080 + }, + { + "epoch": 5.20692989524577, + "grad_norm": 1.6239403083765338, + "learning_rate": 9.738849498789622e-05, + "loss": 3.076, + "step": 8081 + }, + { + "epoch": 5.207574536663981, + "grad_norm": 1.9387134737810927, + "learning_rate": 9.73878519743137e-05, + "loss": 2.78, + "step": 8082 + }, + { + "epoch": 5.208219178082191, + "grad_norm": 1.5209342512932864, + "learning_rate": 9.738720888372376e-05, + "loss": 2.791, + "step": 8083 + }, + { + "epoch": 5.208863819500403, + "grad_norm": 2.078382029698186, + "learning_rate": 9.738656571612746e-05, + "loss": 2.9488, + "step": 8084 + }, + { + "epoch": 5.209508460918614, + "grad_norm": 1.7134126848963742, + "learning_rate": 9.738592247152585e-05, + "loss": 2.7105, + "step": 8085 + }, + { + "epoch": 5.210153102336825, + "grad_norm": 1.9102090204295668, + "learning_rate": 9.738527914992e-05, + "loss": 2.8121, + "step": 8086 + }, + { + "epoch": 5.210797743755037, + "grad_norm": 1.8982813975013049, + "learning_rate": 9.738463575131094e-05, + "loss": 2.9184, + "step": 8087 + }, + { + "epoch": 5.211442385173247, + "grad_norm": 1.8386441882148286, + "learning_rate": 9.738399227569978e-05, + "loss": 3.2289, + "step": 8088 + }, + { + "epoch": 5.212087026591458, + "grad_norm": 1.8997753895362997, + "learning_rate": 9.738334872308753e-05, + "loss": 3.0968, + "step": 8089 + }, + { + "epoch": 5.21273166800967, + "grad_norm": 1.834899203598382, + "learning_rate": 9.738270509347525e-05, + "loss": 2.8583, + "step": 8090 + }, + { + "epoch": 5.213376309427881, + "grad_norm": 1.729365295923879, + "learning_rate": 9.738206138686401e-05, + "loss": 2.9112, + "step": 8091 + }, + { + "epoch": 5.214020950846092, + "grad_norm": 1.7555297929674354, + "learning_rate": 9.738141760325485e-05, + "loss": 2.5943, + "step": 8092 + }, + { + "epoch": 5.214665592264303, + "grad_norm": 2.2352918375579667, + "learning_rate": 9.738077374264887e-05, + "loss": 2.8652, + "step": 8093 + }, + { + "epoch": 5.215310233682514, + "grad_norm": 1.6399558893102144, + "learning_rate": 9.738012980504707e-05, + "loss": 2.673, + "step": 8094 + }, + { + "epoch": 5.215954875100725, + "grad_norm": 2.0356743210537385, + "learning_rate": 9.737948579045055e-05, + "loss": 2.6858, + "step": 8095 + }, + { + "epoch": 5.2165995165189365, + "grad_norm": 1.5575487190775463, + "learning_rate": 9.737884169886035e-05, + "loss": 2.8171, + "step": 8096 + }, + { + "epoch": 5.217244157937148, + "grad_norm": 1.7820666938010297, + "learning_rate": 9.737819753027754e-05, + "loss": 2.8598, + "step": 8097 + }, + { + "epoch": 5.217888799355358, + "grad_norm": 1.6997435953436382, + "learning_rate": 9.737755328470317e-05, + "loss": 2.8686, + "step": 8098 + }, + { + "epoch": 5.2185334407735695, + "grad_norm": 2.18877078482046, + "learning_rate": 9.737690896213829e-05, + "loss": 2.6871, + "step": 8099 + }, + { + "epoch": 5.219178082191781, + "grad_norm": 1.8405343404911303, + "learning_rate": 9.737626456258396e-05, + "loss": 2.7628, + "step": 8100 + }, + { + "epoch": 5.219178082191781, + "eval_loss": 4.51017951965332, + "eval_runtime": 2.9701, + "eval_samples_per_second": 33.669, + "eval_steps_per_second": 4.377, + "step": 8100 + }, + { + "epoch": 5.219822723609992, + "grad_norm": 2.293135934145684, + "learning_rate": 9.737562008604125e-05, + "loss": 2.8773, + "step": 8101 + }, + { + "epoch": 5.220467365028203, + "grad_norm": 1.9464708124623888, + "learning_rate": 9.737497553251121e-05, + "loss": 2.7566, + "step": 8102 + }, + { + "epoch": 5.221112006446414, + "grad_norm": 2.3568777552430684, + "learning_rate": 9.737433090199491e-05, + "loss": 2.9131, + "step": 8103 + }, + { + "epoch": 5.221756647864625, + "grad_norm": 1.7612131860859357, + "learning_rate": 9.73736861944934e-05, + "loss": 2.8295, + "step": 8104 + }, + { + "epoch": 5.222401289282836, + "grad_norm": 2.304945965629983, + "learning_rate": 9.737304141000774e-05, + "loss": 2.8773, + "step": 8105 + }, + { + "epoch": 5.223045930701048, + "grad_norm": 1.8915256564639231, + "learning_rate": 9.737239654853898e-05, + "loss": 2.3875, + "step": 8106 + }, + { + "epoch": 5.223690572119259, + "grad_norm": 2.417666240654452, + "learning_rate": 9.737175161008819e-05, + "loss": 2.8119, + "step": 8107 + }, + { + "epoch": 5.224335213537469, + "grad_norm": 1.8059913824336806, + "learning_rate": 9.737110659465643e-05, + "loss": 2.8199, + "step": 8108 + }, + { + "epoch": 5.224979854955681, + "grad_norm": 2.2375474113278018, + "learning_rate": 9.737046150224474e-05, + "loss": 2.9, + "step": 8109 + }, + { + "epoch": 5.225624496373892, + "grad_norm": 1.540431418313584, + "learning_rate": 9.736981633285422e-05, + "loss": 2.6969, + "step": 8110 + }, + { + "epoch": 5.226269137792103, + "grad_norm": 2.1395544413147425, + "learning_rate": 9.736917108648588e-05, + "loss": 2.7563, + "step": 8111 + }, + { + "epoch": 5.226913779210315, + "grad_norm": 1.6526641012602366, + "learning_rate": 9.736852576314082e-05, + "loss": 2.8599, + "step": 8112 + }, + { + "epoch": 5.227558420628525, + "grad_norm": 2.0440125645864393, + "learning_rate": 9.736788036282007e-05, + "loss": 2.6905, + "step": 8113 + }, + { + "epoch": 5.228203062046736, + "grad_norm": 1.9012716275343038, + "learning_rate": 9.736723488552472e-05, + "loss": 2.9258, + "step": 8114 + }, + { + "epoch": 5.228847703464948, + "grad_norm": 1.8682318511017868, + "learning_rate": 9.736658933125581e-05, + "loss": 2.7864, + "step": 8115 + }, + { + "epoch": 5.229492344883159, + "grad_norm": 1.8812860380165806, + "learning_rate": 9.73659437000144e-05, + "loss": 2.9001, + "step": 8116 + }, + { + "epoch": 5.23013698630137, + "grad_norm": 1.6669975500422176, + "learning_rate": 9.736529799180155e-05, + "loss": 2.9945, + "step": 8117 + }, + { + "epoch": 5.230781627719581, + "grad_norm": 1.7658387878406703, + "learning_rate": 9.736465220661834e-05, + "loss": 3.0072, + "step": 8118 + }, + { + "epoch": 5.231426269137792, + "grad_norm": 1.6505805831806941, + "learning_rate": 9.73640063444658e-05, + "loss": 2.5948, + "step": 8119 + }, + { + "epoch": 5.232070910556003, + "grad_norm": 1.6636733713006002, + "learning_rate": 9.736336040534502e-05, + "loss": 3.0832, + "step": 8120 + }, + { + "epoch": 5.232715551974215, + "grad_norm": 1.6962715375532011, + "learning_rate": 9.736271438925704e-05, + "loss": 3.1108, + "step": 8121 + }, + { + "epoch": 5.233360193392426, + "grad_norm": 1.7671120021460742, + "learning_rate": 9.736206829620293e-05, + "loss": 2.9775, + "step": 8122 + }, + { + "epoch": 5.234004834810636, + "grad_norm": 1.5717440230567814, + "learning_rate": 9.736142212618377e-05, + "loss": 2.7684, + "step": 8123 + }, + { + "epoch": 5.234649476228848, + "grad_norm": 1.4811168141846807, + "learning_rate": 9.736077587920057e-05, + "loss": 3.0163, + "step": 8124 + }, + { + "epoch": 5.235294117647059, + "grad_norm": 1.4657922469544364, + "learning_rate": 9.736012955525444e-05, + "loss": 2.8119, + "step": 8125 + }, + { + "epoch": 5.23593875906527, + "grad_norm": 1.581701759711749, + "learning_rate": 9.73594831543464e-05, + "loss": 2.7296, + "step": 8126 + }, + { + "epoch": 5.2365834004834815, + "grad_norm": 1.6504414364764035, + "learning_rate": 9.735883667647757e-05, + "loss": 3.0462, + "step": 8127 + }, + { + "epoch": 5.237228041901692, + "grad_norm": 1.8837139166374757, + "learning_rate": 9.735819012164896e-05, + "loss": 2.7485, + "step": 8128 + }, + { + "epoch": 5.237872683319903, + "grad_norm": 1.5748937869476833, + "learning_rate": 9.735754348986165e-05, + "loss": 3.1877, + "step": 8129 + }, + { + "epoch": 5.2385173247381145, + "grad_norm": 1.5758411065884377, + "learning_rate": 9.73568967811167e-05, + "loss": 2.606, + "step": 8130 + }, + { + "epoch": 5.239161966156326, + "grad_norm": 1.632805050838984, + "learning_rate": 9.735624999541518e-05, + "loss": 2.5746, + "step": 8131 + }, + { + "epoch": 5.239806607574537, + "grad_norm": 1.7430075586851428, + "learning_rate": 9.735560313275814e-05, + "loss": 3.0567, + "step": 8132 + }, + { + "epoch": 5.2404512489927475, + "grad_norm": 1.7682678985293272, + "learning_rate": 9.735495619314664e-05, + "loss": 2.9189, + "step": 8133 + }, + { + "epoch": 5.241095890410959, + "grad_norm": 1.7713820927287727, + "learning_rate": 9.735430917658176e-05, + "loss": 2.9296, + "step": 8134 + }, + { + "epoch": 5.24174053182917, + "grad_norm": 1.7861763218248925, + "learning_rate": 9.735366208306456e-05, + "loss": 3.0806, + "step": 8135 + }, + { + "epoch": 5.242385173247381, + "grad_norm": 2.008211119824268, + "learning_rate": 9.73530149125961e-05, + "loss": 2.7189, + "step": 8136 + }, + { + "epoch": 5.243029814665592, + "grad_norm": 1.7225606853172073, + "learning_rate": 9.735236766517742e-05, + "loss": 2.7064, + "step": 8137 + }, + { + "epoch": 5.243674456083803, + "grad_norm": 1.959597013384406, + "learning_rate": 9.735172034080962e-05, + "loss": 3.1523, + "step": 8138 + }, + { + "epoch": 5.244319097502014, + "grad_norm": 1.5543063556897843, + "learning_rate": 9.735107293949372e-05, + "loss": 2.9856, + "step": 8139 + }, + { + "epoch": 5.244963738920226, + "grad_norm": 1.8391572252357293, + "learning_rate": 9.735042546123082e-05, + "loss": 2.5737, + "step": 8140 + }, + { + "epoch": 5.245608380338437, + "grad_norm": 1.457896980477478, + "learning_rate": 9.734977790602198e-05, + "loss": 2.755, + "step": 8141 + }, + { + "epoch": 5.246253021756647, + "grad_norm": 1.7299151360501288, + "learning_rate": 9.734913027386825e-05, + "loss": 2.9675, + "step": 8142 + }, + { + "epoch": 5.246897663174859, + "grad_norm": 1.4475037027951363, + "learning_rate": 9.73484825647707e-05, + "loss": 2.5454, + "step": 8143 + }, + { + "epoch": 5.24754230459307, + "grad_norm": 1.6784183042658356, + "learning_rate": 9.734783477873038e-05, + "loss": 2.4767, + "step": 8144 + }, + { + "epoch": 5.248186946011281, + "grad_norm": 1.5397077139375825, + "learning_rate": 9.734718691574838e-05, + "loss": 2.7885, + "step": 8145 + }, + { + "epoch": 5.248831587429493, + "grad_norm": 1.8092086543756714, + "learning_rate": 9.734653897582576e-05, + "loss": 3.1842, + "step": 8146 + }, + { + "epoch": 5.249476228847703, + "grad_norm": 1.5961971402842192, + "learning_rate": 9.734589095896356e-05, + "loss": 2.8026, + "step": 8147 + }, + { + "epoch": 5.250120870265914, + "grad_norm": 1.7933129571946407, + "learning_rate": 9.734524286516286e-05, + "loss": 2.9487, + "step": 8148 + }, + { + "epoch": 5.250765511684126, + "grad_norm": 1.5567799817107193, + "learning_rate": 9.734459469442474e-05, + "loss": 2.7031, + "step": 8149 + }, + { + "epoch": 5.251410153102337, + "grad_norm": 1.8816606993610183, + "learning_rate": 9.734394644675022e-05, + "loss": 3.1784, + "step": 8150 + }, + { + "epoch": 5.252054794520548, + "grad_norm": 1.7052616162113876, + "learning_rate": 9.73432981221404e-05, + "loss": 2.8784, + "step": 8151 + }, + { + "epoch": 5.252699435938759, + "grad_norm": 1.6419141421246763, + "learning_rate": 9.734264972059635e-05, + "loss": 3.0178, + "step": 8152 + }, + { + "epoch": 5.25334407735697, + "grad_norm": 1.7975076269768646, + "learning_rate": 9.734200124211912e-05, + "loss": 2.4938, + "step": 8153 + }, + { + "epoch": 5.253988718775181, + "grad_norm": 1.5644666861776688, + "learning_rate": 9.734135268670977e-05, + "loss": 2.9107, + "step": 8154 + }, + { + "epoch": 5.2546333601933926, + "grad_norm": 1.8610901929324215, + "learning_rate": 9.734070405436938e-05, + "loss": 2.8936, + "step": 8155 + }, + { + "epoch": 5.255278001611604, + "grad_norm": 1.5687318810447308, + "learning_rate": 9.7340055345099e-05, + "loss": 2.7031, + "step": 8156 + }, + { + "epoch": 5.255922643029814, + "grad_norm": 2.177197832562818, + "learning_rate": 9.73394065588997e-05, + "loss": 2.7838, + "step": 8157 + }, + { + "epoch": 5.256567284448026, + "grad_norm": 1.725064475943791, + "learning_rate": 9.733875769577256e-05, + "loss": 2.988, + "step": 8158 + }, + { + "epoch": 5.257211925866237, + "grad_norm": 1.8296826520127225, + "learning_rate": 9.733810875571863e-05, + "loss": 3.0385, + "step": 8159 + }, + { + "epoch": 5.257856567284448, + "grad_norm": 1.7275618718741974, + "learning_rate": 9.733745973873899e-05, + "loss": 2.8968, + "step": 8160 + }, + { + "epoch": 5.2585012087026595, + "grad_norm": 2.0104599362485005, + "learning_rate": 9.733681064483468e-05, + "loss": 2.8118, + "step": 8161 + }, + { + "epoch": 5.25914585012087, + "grad_norm": 2.131897265763298, + "learning_rate": 9.733616147400679e-05, + "loss": 2.763, + "step": 8162 + }, + { + "epoch": 5.259790491539081, + "grad_norm": 1.755421222637628, + "learning_rate": 9.733551222625638e-05, + "loss": 2.818, + "step": 8163 + }, + { + "epoch": 5.2604351329572925, + "grad_norm": 2.1488805026715823, + "learning_rate": 9.733486290158452e-05, + "loss": 2.619, + "step": 8164 + }, + { + "epoch": 5.261079774375504, + "grad_norm": 1.6876219856591952, + "learning_rate": 9.733421349999225e-05, + "loss": 2.9164, + "step": 8165 + }, + { + "epoch": 5.261724415793715, + "grad_norm": 2.0339667587758714, + "learning_rate": 9.733356402148067e-05, + "loss": 2.7892, + "step": 8166 + }, + { + "epoch": 5.2623690572119255, + "grad_norm": 2.2946188001948387, + "learning_rate": 9.733291446605085e-05, + "loss": 2.6524, + "step": 8167 + }, + { + "epoch": 5.263013698630137, + "grad_norm": 1.9433831199621956, + "learning_rate": 9.733226483370382e-05, + "loss": 2.7039, + "step": 8168 + }, + { + "epoch": 5.263658340048348, + "grad_norm": 1.7138316744816449, + "learning_rate": 9.733161512444068e-05, + "loss": 2.9442, + "step": 8169 + }, + { + "epoch": 5.264302981466559, + "grad_norm": 2.0026209477438184, + "learning_rate": 9.733096533826247e-05, + "loss": 3.0099, + "step": 8170 + }, + { + "epoch": 5.264947622884771, + "grad_norm": 2.1081874169968464, + "learning_rate": 9.733031547517028e-05, + "loss": 2.5351, + "step": 8171 + }, + { + "epoch": 5.265592264302981, + "grad_norm": 1.7795293303975217, + "learning_rate": 9.732966553516519e-05, + "loss": 2.7862, + "step": 8172 + }, + { + "epoch": 5.266236905721192, + "grad_norm": 2.2566932693996997, + "learning_rate": 9.732901551824823e-05, + "loss": 3.2677, + "step": 8173 + }, + { + "epoch": 5.266881547139404, + "grad_norm": 1.745262803984665, + "learning_rate": 9.732836542442049e-05, + "loss": 2.903, + "step": 8174 + }, + { + "epoch": 5.267526188557615, + "grad_norm": 2.2281102309866125, + "learning_rate": 9.732771525368302e-05, + "loss": 2.7629, + "step": 8175 + }, + { + "epoch": 5.268170829975826, + "grad_norm": 1.7802783194199954, + "learning_rate": 9.732706500603692e-05, + "loss": 2.819, + "step": 8176 + }, + { + "epoch": 5.268815471394037, + "grad_norm": 1.91801093079765, + "learning_rate": 9.732641468148321e-05, + "loss": 2.89, + "step": 8177 + }, + { + "epoch": 5.269460112812248, + "grad_norm": 1.7296200575245433, + "learning_rate": 9.732576428002301e-05, + "loss": 2.5643, + "step": 8178 + }, + { + "epoch": 5.270104754230459, + "grad_norm": 1.829455096082345, + "learning_rate": 9.732511380165737e-05, + "loss": 3.078, + "step": 8179 + }, + { + "epoch": 5.270749395648671, + "grad_norm": 1.6611402530854567, + "learning_rate": 9.732446324638733e-05, + "loss": 2.7513, + "step": 8180 + }, + { + "epoch": 5.271394037066882, + "grad_norm": 1.93764610191293, + "learning_rate": 9.7323812614214e-05, + "loss": 2.6081, + "step": 8181 + }, + { + "epoch": 5.272038678485092, + "grad_norm": 1.7867458442827349, + "learning_rate": 9.732316190513841e-05, + "loss": 2.7502, + "step": 8182 + }, + { + "epoch": 5.272683319903304, + "grad_norm": 1.6284642214847533, + "learning_rate": 9.732251111916167e-05, + "loss": 2.7529, + "step": 8183 + }, + { + "epoch": 5.273327961321515, + "grad_norm": 1.767638146768135, + "learning_rate": 9.732186025628481e-05, + "loss": 2.918, + "step": 8184 + }, + { + "epoch": 5.273972602739726, + "grad_norm": 1.9430495495055973, + "learning_rate": 9.732120931650892e-05, + "loss": 2.6426, + "step": 8185 + }, + { + "epoch": 5.2746172441579375, + "grad_norm": 1.9212921781654129, + "learning_rate": 9.732055829983507e-05, + "loss": 2.627, + "step": 8186 + }, + { + "epoch": 5.275261885576148, + "grad_norm": 2.425989423462475, + "learning_rate": 9.731990720626434e-05, + "loss": 2.7048, + "step": 8187 + }, + { + "epoch": 5.275906526994359, + "grad_norm": 1.7507715390872902, + "learning_rate": 9.731925603579777e-05, + "loss": 2.9737, + "step": 8188 + }, + { + "epoch": 5.2765511684125705, + "grad_norm": 2.4760406606003835, + "learning_rate": 9.731860478843643e-05, + "loss": 3.0384, + "step": 8189 + }, + { + "epoch": 5.277195809830782, + "grad_norm": 1.895109717504345, + "learning_rate": 9.73179534641814e-05, + "loss": 3.0922, + "step": 8190 + }, + { + "epoch": 5.277840451248993, + "grad_norm": 2.061326339918073, + "learning_rate": 9.731730206303378e-05, + "loss": 2.7348, + "step": 8191 + }, + { + "epoch": 5.278485092667204, + "grad_norm": 1.787136588754776, + "learning_rate": 9.731665058499462e-05, + "loss": 3.1005, + "step": 8192 + }, + { + "epoch": 5.279129734085415, + "grad_norm": 2.046460680815917, + "learning_rate": 9.731599903006495e-05, + "loss": 3.1536, + "step": 8193 + }, + { + "epoch": 5.279774375503626, + "grad_norm": 1.6547440472309745, + "learning_rate": 9.731534739824588e-05, + "loss": 2.7911, + "step": 8194 + }, + { + "epoch": 5.2804190169218375, + "grad_norm": 1.848354264207588, + "learning_rate": 9.731469568953849e-05, + "loss": 3.0199, + "step": 8195 + }, + { + "epoch": 5.281063658340049, + "grad_norm": 1.6280411681510487, + "learning_rate": 9.731404390394383e-05, + "loss": 2.8464, + "step": 8196 + }, + { + "epoch": 5.281708299758259, + "grad_norm": 2.0912038510810573, + "learning_rate": 9.731339204146297e-05, + "loss": 2.7941, + "step": 8197 + }, + { + "epoch": 5.2823529411764705, + "grad_norm": 1.7057962671007203, + "learning_rate": 9.731274010209698e-05, + "loss": 2.8204, + "step": 8198 + }, + { + "epoch": 5.282997582594682, + "grad_norm": 2.0083604265107864, + "learning_rate": 9.731208808584693e-05, + "loss": 2.6635, + "step": 8199 + }, + { + "epoch": 5.283642224012893, + "grad_norm": 1.8567516693639565, + "learning_rate": 9.73114359927139e-05, + "loss": 3.106, + "step": 8200 + }, + { + "epoch": 5.283642224012893, + "eval_loss": 4.551726818084717, + "eval_runtime": 2.9904, + "eval_samples_per_second": 33.44, + "eval_steps_per_second": 4.347, + "step": 8200 + }, + { + "epoch": 5.284286865431104, + "grad_norm": 2.1476942831643693, + "learning_rate": 9.731078382269895e-05, + "loss": 3.0692, + "step": 8201 + }, + { + "epoch": 5.284931506849315, + "grad_norm": 1.6684821924151654, + "learning_rate": 9.731013157580319e-05, + "loss": 2.8595, + "step": 8202 + }, + { + "epoch": 5.285576148267526, + "grad_norm": 2.253294916894999, + "learning_rate": 9.730947925202763e-05, + "loss": 3.1352, + "step": 8203 + }, + { + "epoch": 5.286220789685737, + "grad_norm": 2.1011868832048104, + "learning_rate": 9.730882685137338e-05, + "loss": 2.734, + "step": 8204 + }, + { + "epoch": 5.286865431103949, + "grad_norm": 2.271184871958667, + "learning_rate": 9.73081743738415e-05, + "loss": 2.9379, + "step": 8205 + }, + { + "epoch": 5.28751007252216, + "grad_norm": 2.0507624857567386, + "learning_rate": 9.730752181943305e-05, + "loss": 2.7599, + "step": 8206 + }, + { + "epoch": 5.28815471394037, + "grad_norm": 1.9832126047828993, + "learning_rate": 9.730686918814914e-05, + "loss": 2.9095, + "step": 8207 + }, + { + "epoch": 5.288799355358582, + "grad_norm": 1.9265393183744464, + "learning_rate": 9.730621647999081e-05, + "loss": 2.9324, + "step": 8208 + }, + { + "epoch": 5.289443996776793, + "grad_norm": 1.954646764530386, + "learning_rate": 9.730556369495914e-05, + "loss": 2.8948, + "step": 8209 + }, + { + "epoch": 5.290088638195004, + "grad_norm": 1.9973494954263125, + "learning_rate": 9.730491083305518e-05, + "loss": 2.8373, + "step": 8210 + }, + { + "epoch": 5.290733279613216, + "grad_norm": 2.070280038426543, + "learning_rate": 9.730425789428004e-05, + "loss": 2.7708, + "step": 8211 + }, + { + "epoch": 5.291377921031426, + "grad_norm": 1.7040587601780215, + "learning_rate": 9.73036048786348e-05, + "loss": 3.0714, + "step": 8212 + }, + { + "epoch": 5.292022562449637, + "grad_norm": 2.1046201793806847, + "learning_rate": 9.730295178612048e-05, + "loss": 2.6891, + "step": 8213 + }, + { + "epoch": 5.292667203867849, + "grad_norm": 1.5724713323056063, + "learning_rate": 9.730229861673819e-05, + "loss": 2.8903, + "step": 8214 + }, + { + "epoch": 5.29331184528606, + "grad_norm": 2.0410511987698903, + "learning_rate": 9.730164537048898e-05, + "loss": 2.7537, + "step": 8215 + }, + { + "epoch": 5.293956486704271, + "grad_norm": 1.7358965502670174, + "learning_rate": 9.730099204737396e-05, + "loss": 3.0345, + "step": 8216 + }, + { + "epoch": 5.294601128122482, + "grad_norm": 1.962779811324727, + "learning_rate": 9.730033864739416e-05, + "loss": 2.7039, + "step": 8217 + }, + { + "epoch": 5.295245769540693, + "grad_norm": 1.6577905054600202, + "learning_rate": 9.729968517055069e-05, + "loss": 2.7803, + "step": 8218 + }, + { + "epoch": 5.295890410958904, + "grad_norm": 2.42290495822662, + "learning_rate": 9.729903161684459e-05, + "loss": 3.1114, + "step": 8219 + }, + { + "epoch": 5.2965350523771155, + "grad_norm": 1.6066040722517072, + "learning_rate": 9.729837798627695e-05, + "loss": 3.0748, + "step": 8220 + }, + { + "epoch": 5.297179693795326, + "grad_norm": 2.147340228540289, + "learning_rate": 9.729772427884885e-05, + "loss": 2.7077, + "step": 8221 + }, + { + "epoch": 5.297824335213537, + "grad_norm": 1.8180822208703746, + "learning_rate": 9.729707049456136e-05, + "loss": 3.0253, + "step": 8222 + }, + { + "epoch": 5.2984689766317485, + "grad_norm": 1.9747774765692727, + "learning_rate": 9.729641663341555e-05, + "loss": 2.9385, + "step": 8223 + }, + { + "epoch": 5.29911361804996, + "grad_norm": 1.840768947630542, + "learning_rate": 9.729576269541248e-05, + "loss": 2.7858, + "step": 8224 + }, + { + "epoch": 5.299758259468171, + "grad_norm": 1.9571037336189099, + "learning_rate": 9.729510868055327e-05, + "loss": 2.9119, + "step": 8225 + }, + { + "epoch": 5.3004029008863816, + "grad_norm": 1.6049731306874908, + "learning_rate": 9.729445458883894e-05, + "loss": 2.8579, + "step": 8226 + }, + { + "epoch": 5.301047542304593, + "grad_norm": 1.8175443294847917, + "learning_rate": 9.729380042027058e-05, + "loss": 2.99, + "step": 8227 + }, + { + "epoch": 5.301692183722804, + "grad_norm": 1.7645343378754395, + "learning_rate": 9.729314617484929e-05, + "loss": 2.9408, + "step": 8228 + }, + { + "epoch": 5.3023368251410155, + "grad_norm": 1.7167035104815118, + "learning_rate": 9.729249185257611e-05, + "loss": 2.8992, + "step": 8229 + }, + { + "epoch": 5.302981466559227, + "grad_norm": 1.7315028428550152, + "learning_rate": 9.729183745345214e-05, + "loss": 2.5502, + "step": 8230 + }, + { + "epoch": 5.303626107977437, + "grad_norm": 1.819944447213016, + "learning_rate": 9.729118297747844e-05, + "loss": 2.8004, + "step": 8231 + }, + { + "epoch": 5.3042707493956485, + "grad_norm": 1.7327977223139088, + "learning_rate": 9.72905284246561e-05, + "loss": 2.4424, + "step": 8232 + }, + { + "epoch": 5.30491539081386, + "grad_norm": 1.9900041584120185, + "learning_rate": 9.728987379498618e-05, + "loss": 3.015, + "step": 8233 + }, + { + "epoch": 5.305560032232071, + "grad_norm": 1.9268675882022592, + "learning_rate": 9.728921908846975e-05, + "loss": 3.1751, + "step": 8234 + }, + { + "epoch": 5.306204673650282, + "grad_norm": 1.9218614046897602, + "learning_rate": 9.72885643051079e-05, + "loss": 2.7657, + "step": 8235 + }, + { + "epoch": 5.306849315068493, + "grad_norm": 1.9589241583816843, + "learning_rate": 9.72879094449017e-05, + "loss": 2.6799, + "step": 8236 + }, + { + "epoch": 5.307493956486704, + "grad_norm": 2.0204587703620493, + "learning_rate": 9.728725450785223e-05, + "loss": 2.8797, + "step": 8237 + }, + { + "epoch": 5.308138597904915, + "grad_norm": 1.9620668265983128, + "learning_rate": 9.728659949396057e-05, + "loss": 2.9049, + "step": 8238 + }, + { + "epoch": 5.308783239323127, + "grad_norm": 2.1669002963112556, + "learning_rate": 9.728594440322778e-05, + "loss": 3.1248, + "step": 8239 + }, + { + "epoch": 5.309427880741338, + "grad_norm": 1.59798432489124, + "learning_rate": 9.728528923565496e-05, + "loss": 2.8157, + "step": 8240 + }, + { + "epoch": 5.310072522159548, + "grad_norm": 2.2061346560873414, + "learning_rate": 9.728463399124313e-05, + "loss": 2.5393, + "step": 8241 + }, + { + "epoch": 5.31071716357776, + "grad_norm": 1.6703063567730578, + "learning_rate": 9.728397866999343e-05, + "loss": 2.7508, + "step": 8242 + }, + { + "epoch": 5.311361804995971, + "grad_norm": 2.020711595770021, + "learning_rate": 9.728332327190692e-05, + "loss": 2.875, + "step": 8243 + }, + { + "epoch": 5.312006446414182, + "grad_norm": 1.4645929905776616, + "learning_rate": 9.728266779698466e-05, + "loss": 2.4844, + "step": 8244 + }, + { + "epoch": 5.312651087832394, + "grad_norm": 2.028398693016579, + "learning_rate": 9.728201224522772e-05, + "loss": 2.7822, + "step": 8245 + }, + { + "epoch": 5.313295729250604, + "grad_norm": 1.8490379555321754, + "learning_rate": 9.728135661663721e-05, + "loss": 2.7333, + "step": 8246 + }, + { + "epoch": 5.313940370668815, + "grad_norm": 1.8764523314321717, + "learning_rate": 9.72807009112142e-05, + "loss": 3.0791, + "step": 8247 + }, + { + "epoch": 5.314585012087027, + "grad_norm": 1.9685989273257696, + "learning_rate": 9.728004512895975e-05, + "loss": 2.5296, + "step": 8248 + }, + { + "epoch": 5.315229653505238, + "grad_norm": 1.6746668126122144, + "learning_rate": 9.727938926987492e-05, + "loss": 2.6078, + "step": 8249 + }, + { + "epoch": 5.315874294923449, + "grad_norm": 1.907094005856468, + "learning_rate": 9.727873333396083e-05, + "loss": 2.8185, + "step": 8250 + }, + { + "epoch": 5.31651893634166, + "grad_norm": 1.9620397491127446, + "learning_rate": 9.727807732121853e-05, + "loss": 3.3019, + "step": 8251 + }, + { + "epoch": 5.317163577759871, + "grad_norm": 1.9551321227205083, + "learning_rate": 9.727742123164911e-05, + "loss": 3.1645, + "step": 8252 + }, + { + "epoch": 5.317808219178082, + "grad_norm": 1.9382019552393484, + "learning_rate": 9.727676506525364e-05, + "loss": 3.0817, + "step": 8253 + }, + { + "epoch": 5.3184528605962935, + "grad_norm": 2.025515397262446, + "learning_rate": 9.727610882203318e-05, + "loss": 2.7051, + "step": 8254 + }, + { + "epoch": 5.319097502014505, + "grad_norm": 1.937438455987471, + "learning_rate": 9.727545250198885e-05, + "loss": 2.5808, + "step": 8255 + }, + { + "epoch": 5.319742143432715, + "grad_norm": 2.078925067445094, + "learning_rate": 9.72747961051217e-05, + "loss": 2.6321, + "step": 8256 + }, + { + "epoch": 5.3203867848509265, + "grad_norm": 1.7669383019086422, + "learning_rate": 9.727413963143283e-05, + "loss": 2.7164, + "step": 8257 + }, + { + "epoch": 5.321031426269138, + "grad_norm": 1.9056293450692394, + "learning_rate": 9.727348308092327e-05, + "loss": 3.2125, + "step": 8258 + }, + { + "epoch": 5.321676067687349, + "grad_norm": 1.7127811632154903, + "learning_rate": 9.727282645359415e-05, + "loss": 2.9323, + "step": 8259 + }, + { + "epoch": 5.32232070910556, + "grad_norm": 1.6618294318252145, + "learning_rate": 9.727216974944653e-05, + "loss": 2.6392, + "step": 8260 + }, + { + "epoch": 5.322965350523771, + "grad_norm": 1.8461893345023255, + "learning_rate": 9.727151296848146e-05, + "loss": 3.1002, + "step": 8261 + }, + { + "epoch": 5.323609991941982, + "grad_norm": 1.8181696235492952, + "learning_rate": 9.727085611070008e-05, + "loss": 3.174, + "step": 8262 + }, + { + "epoch": 5.3242546333601934, + "grad_norm": 1.9198546037466735, + "learning_rate": 9.727019917610342e-05, + "loss": 2.8307, + "step": 8263 + }, + { + "epoch": 5.324899274778405, + "grad_norm": 2.261155840134655, + "learning_rate": 9.726954216469256e-05, + "loss": 2.9215, + "step": 8264 + }, + { + "epoch": 5.325543916196616, + "grad_norm": 2.269811455190473, + "learning_rate": 9.726888507646862e-05, + "loss": 2.6994, + "step": 8265 + }, + { + "epoch": 5.3261885576148265, + "grad_norm": 1.8165640122302735, + "learning_rate": 9.726822791143264e-05, + "loss": 2.757, + "step": 8266 + }, + { + "epoch": 5.326833199033038, + "grad_norm": 1.922288110562174, + "learning_rate": 9.72675706695857e-05, + "loss": 3.101, + "step": 8267 + }, + { + "epoch": 5.327477840451249, + "grad_norm": 1.7600970244990262, + "learning_rate": 9.72669133509289e-05, + "loss": 2.9479, + "step": 8268 + }, + { + "epoch": 5.32812248186946, + "grad_norm": 1.7698909325771024, + "learning_rate": 9.72662559554633e-05, + "loss": 3.1743, + "step": 8269 + }, + { + "epoch": 5.328767123287671, + "grad_norm": 1.7760944387516198, + "learning_rate": 9.726559848319e-05, + "loss": 2.7971, + "step": 8270 + }, + { + "epoch": 5.329411764705882, + "grad_norm": 1.5588136682688496, + "learning_rate": 9.726494093411007e-05, + "loss": 3.085, + "step": 8271 + }, + { + "epoch": 5.330056406124093, + "grad_norm": 1.7603728471900961, + "learning_rate": 9.726428330822459e-05, + "loss": 2.9316, + "step": 8272 + }, + { + "epoch": 5.330701047542305, + "grad_norm": 1.7171848450881424, + "learning_rate": 9.726362560553464e-05, + "loss": 3.1628, + "step": 8273 + }, + { + "epoch": 5.331345688960516, + "grad_norm": 1.6640624734927292, + "learning_rate": 9.726296782604128e-05, + "loss": 3.0752, + "step": 8274 + }, + { + "epoch": 5.331990330378726, + "grad_norm": 1.6497567282482348, + "learning_rate": 9.726230996974564e-05, + "loss": 2.7634, + "step": 8275 + }, + { + "epoch": 5.332634971796938, + "grad_norm": 1.4757204905749106, + "learning_rate": 9.726165203664876e-05, + "loss": 3.0566, + "step": 8276 + }, + { + "epoch": 5.333279613215149, + "grad_norm": 1.5210209696524484, + "learning_rate": 9.726099402675171e-05, + "loss": 2.9778, + "step": 8277 + }, + { + "epoch": 5.33392425463336, + "grad_norm": 1.4129305321365084, + "learning_rate": 9.726033594005561e-05, + "loss": 2.7831, + "step": 8278 + }, + { + "epoch": 5.334568896051572, + "grad_norm": 1.5326544018132953, + "learning_rate": 9.725967777656152e-05, + "loss": 2.8077, + "step": 8279 + }, + { + "epoch": 5.335213537469782, + "grad_norm": 1.5799219001332099, + "learning_rate": 9.725901953627052e-05, + "loss": 3.2396, + "step": 8280 + }, + { + "epoch": 5.335858178887993, + "grad_norm": 1.6309948508405225, + "learning_rate": 9.72583612191837e-05, + "loss": 3.0363, + "step": 8281 + }, + { + "epoch": 5.336502820306205, + "grad_norm": 1.5497961645856004, + "learning_rate": 9.725770282530213e-05, + "loss": 3.2646, + "step": 8282 + }, + { + "epoch": 5.337147461724416, + "grad_norm": 1.7157030695494946, + "learning_rate": 9.725704435462691e-05, + "loss": 3.0569, + "step": 8283 + }, + { + "epoch": 5.337792103142627, + "grad_norm": 1.8330617862587035, + "learning_rate": 9.72563858071591e-05, + "loss": 2.6973, + "step": 8284 + }, + { + "epoch": 5.338436744560838, + "grad_norm": 1.7097800502790363, + "learning_rate": 9.72557271828998e-05, + "loss": 2.8548, + "step": 8285 + }, + { + "epoch": 5.339081385979049, + "grad_norm": 1.6050001178275741, + "learning_rate": 9.725506848185006e-05, + "loss": 2.9466, + "step": 8286 + }, + { + "epoch": 5.33972602739726, + "grad_norm": 1.767505954511699, + "learning_rate": 9.7254409704011e-05, + "loss": 3.1295, + "step": 8287 + }, + { + "epoch": 5.3403706688154715, + "grad_norm": 1.5978720007740865, + "learning_rate": 9.725375084938367e-05, + "loss": 2.9916, + "step": 8288 + }, + { + "epoch": 5.341015310233683, + "grad_norm": 1.6705245689429644, + "learning_rate": 9.725309191796918e-05, + "loss": 3.1468, + "step": 8289 + }, + { + "epoch": 5.341659951651893, + "grad_norm": 1.482411819068057, + "learning_rate": 9.72524329097686e-05, + "loss": 3.4491, + "step": 8290 + }, + { + "epoch": 5.3423045930701045, + "grad_norm": 1.6302215239717712, + "learning_rate": 9.725177382478301e-05, + "loss": 3.0731, + "step": 8291 + }, + { + "epoch": 5.342949234488316, + "grad_norm": 1.6434407774394901, + "learning_rate": 9.72511146630135e-05, + "loss": 3.0704, + "step": 8292 + }, + { + "epoch": 5.343593875906527, + "grad_norm": 1.547568458873637, + "learning_rate": 9.725045542446114e-05, + "loss": 2.9777, + "step": 8293 + }, + { + "epoch": 5.344238517324738, + "grad_norm": 1.9386274516683464, + "learning_rate": 9.724979610912702e-05, + "loss": 2.9817, + "step": 8294 + }, + { + "epoch": 5.344883158742949, + "grad_norm": 1.5078678160148824, + "learning_rate": 9.724913671701223e-05, + "loss": 2.7983, + "step": 8295 + }, + { + "epoch": 5.34552780016116, + "grad_norm": 1.955100932566124, + "learning_rate": 9.724847724811783e-05, + "loss": 2.8064, + "step": 8296 + }, + { + "epoch": 5.346172441579371, + "grad_norm": 1.5247812657815458, + "learning_rate": 9.724781770244493e-05, + "loss": 2.5616, + "step": 8297 + }, + { + "epoch": 5.346817082997583, + "grad_norm": 1.771928594241986, + "learning_rate": 9.724715807999461e-05, + "loss": 3.002, + "step": 8298 + }, + { + "epoch": 5.347461724415794, + "grad_norm": 1.7729219770949811, + "learning_rate": 9.724649838076793e-05, + "loss": 3.0643, + "step": 8299 + }, + { + "epoch": 5.3481063658340044, + "grad_norm": 1.7849734139386066, + "learning_rate": 9.724583860476599e-05, + "loss": 3.0537, + "step": 8300 + }, + { + "epoch": 5.3481063658340044, + "eval_loss": 4.479726314544678, + "eval_runtime": 2.9895, + "eval_samples_per_second": 33.45, + "eval_steps_per_second": 4.349, + "step": 8300 + }, + { + "epoch": 5.348751007252216, + "grad_norm": 1.924370058023018, + "learning_rate": 9.724517875198988e-05, + "loss": 3.1556, + "step": 8301 + }, + { + "epoch": 5.349395648670427, + "grad_norm": 1.494426081883718, + "learning_rate": 9.724451882244068e-05, + "loss": 2.6672, + "step": 8302 + }, + { + "epoch": 5.350040290088638, + "grad_norm": 1.7660263561226337, + "learning_rate": 9.724385881611946e-05, + "loss": 2.912, + "step": 8303 + }, + { + "epoch": 5.35068493150685, + "grad_norm": 1.5753850237718692, + "learning_rate": 9.724319873302731e-05, + "loss": 2.7517, + "step": 8304 + }, + { + "epoch": 5.35132957292506, + "grad_norm": 1.6344511346737505, + "learning_rate": 9.724253857316531e-05, + "loss": 2.816, + "step": 8305 + }, + { + "epoch": 5.351974214343271, + "grad_norm": 1.7796087635841007, + "learning_rate": 9.72418783365346e-05, + "loss": 2.8255, + "step": 8306 + }, + { + "epoch": 5.352618855761483, + "grad_norm": 1.5512754310183583, + "learning_rate": 9.724121802313618e-05, + "loss": 3.0362, + "step": 8307 + }, + { + "epoch": 5.353263497179694, + "grad_norm": 1.5232711983553258, + "learning_rate": 9.724055763297117e-05, + "loss": 2.9135, + "step": 8308 + }, + { + "epoch": 5.353908138597905, + "grad_norm": 1.7329985830066548, + "learning_rate": 9.723989716604066e-05, + "loss": 2.826, + "step": 8309 + }, + { + "epoch": 5.354552780016116, + "grad_norm": 1.5225516570584379, + "learning_rate": 9.723923662234574e-05, + "loss": 2.9053, + "step": 8310 + }, + { + "epoch": 5.355197421434327, + "grad_norm": 1.5754240063955676, + "learning_rate": 9.723857600188747e-05, + "loss": 2.8185, + "step": 8311 + }, + { + "epoch": 5.355842062852538, + "grad_norm": 1.4744948792422, + "learning_rate": 9.723791530466697e-05, + "loss": 2.9586, + "step": 8312 + }, + { + "epoch": 5.35648670427075, + "grad_norm": 1.7484712693856053, + "learning_rate": 9.723725453068528e-05, + "loss": 2.975, + "step": 8313 + }, + { + "epoch": 5.357131345688961, + "grad_norm": 1.4948002866828463, + "learning_rate": 9.723659367994353e-05, + "loss": 2.8632, + "step": 8314 + }, + { + "epoch": 5.357775987107171, + "grad_norm": 1.7895294232219403, + "learning_rate": 9.723593275244279e-05, + "loss": 3.2244, + "step": 8315 + }, + { + "epoch": 5.358420628525383, + "grad_norm": 1.7559538024222077, + "learning_rate": 9.723527174818414e-05, + "loss": 2.9467, + "step": 8316 + }, + { + "epoch": 5.359065269943594, + "grad_norm": 1.6669791777568894, + "learning_rate": 9.723461066716864e-05, + "loss": 2.8591, + "step": 8317 + }, + { + "epoch": 5.359709911361805, + "grad_norm": 1.846501089343554, + "learning_rate": 9.723394950939743e-05, + "loss": 2.9721, + "step": 8318 + }, + { + "epoch": 5.3603545527800165, + "grad_norm": 1.9672193663114532, + "learning_rate": 9.723328827487155e-05, + "loss": 2.615, + "step": 8319 + }, + { + "epoch": 5.360999194198227, + "grad_norm": 1.7891930168582826, + "learning_rate": 9.723262696359213e-05, + "loss": 3.341, + "step": 8320 + }, + { + "epoch": 5.361643835616438, + "grad_norm": 1.994922867004749, + "learning_rate": 9.723196557556023e-05, + "loss": 2.5592, + "step": 8321 + }, + { + "epoch": 5.3622884770346495, + "grad_norm": 1.499573246054798, + "learning_rate": 9.723130411077692e-05, + "loss": 3.042, + "step": 8322 + }, + { + "epoch": 5.362933118452861, + "grad_norm": 2.189902540918175, + "learning_rate": 9.72306425692433e-05, + "loss": 2.9559, + "step": 8323 + }, + { + "epoch": 5.363577759871072, + "grad_norm": 1.5066875091708198, + "learning_rate": 9.722998095096046e-05, + "loss": 2.9856, + "step": 8324 + }, + { + "epoch": 5.3642224012892825, + "grad_norm": 1.6778269473868774, + "learning_rate": 9.722931925592951e-05, + "loss": 3.2107, + "step": 8325 + }, + { + "epoch": 5.364867042707494, + "grad_norm": 1.6714118418644628, + "learning_rate": 9.722865748415149e-05, + "loss": 2.8541, + "step": 8326 + }, + { + "epoch": 5.365511684125705, + "grad_norm": 1.8412961264125163, + "learning_rate": 9.722799563562753e-05, + "loss": 2.6973, + "step": 8327 + }, + { + "epoch": 5.366156325543916, + "grad_norm": 1.464136981843602, + "learning_rate": 9.722733371035867e-05, + "loss": 3.0554, + "step": 8328 + }, + { + "epoch": 5.366800966962128, + "grad_norm": 1.8212609882509274, + "learning_rate": 9.722667170834604e-05, + "loss": 2.8261, + "step": 8329 + }, + { + "epoch": 5.367445608380338, + "grad_norm": 1.4978551614488977, + "learning_rate": 9.722600962959072e-05, + "loss": 2.8879, + "step": 8330 + }, + { + "epoch": 5.368090249798549, + "grad_norm": 1.8491994599538388, + "learning_rate": 9.722534747409379e-05, + "loss": 2.9702, + "step": 8331 + }, + { + "epoch": 5.368734891216761, + "grad_norm": 1.5534416985939865, + "learning_rate": 9.722468524185632e-05, + "loss": 2.666, + "step": 8332 + }, + { + "epoch": 5.369379532634972, + "grad_norm": 1.516449845514581, + "learning_rate": 9.722402293287942e-05, + "loss": 3.0629, + "step": 8333 + }, + { + "epoch": 5.370024174053183, + "grad_norm": 1.4906251058036373, + "learning_rate": 9.722336054716417e-05, + "loss": 3.1079, + "step": 8334 + }, + { + "epoch": 5.370668815471394, + "grad_norm": 1.431284547371385, + "learning_rate": 9.722269808471167e-05, + "loss": 2.8553, + "step": 8335 + }, + { + "epoch": 5.371313456889605, + "grad_norm": 1.3778758380488698, + "learning_rate": 9.722203554552299e-05, + "loss": 2.8025, + "step": 8336 + }, + { + "epoch": 5.371958098307816, + "grad_norm": 1.5783871938136345, + "learning_rate": 9.722137292959923e-05, + "loss": 3.1893, + "step": 8337 + }, + { + "epoch": 5.372602739726028, + "grad_norm": 1.512885043529392, + "learning_rate": 9.722071023694146e-05, + "loss": 2.6863, + "step": 8338 + }, + { + "epoch": 5.373247381144239, + "grad_norm": 1.4759936244009615, + "learning_rate": 9.722004746755081e-05, + "loss": 2.8056, + "step": 8339 + }, + { + "epoch": 5.373892022562449, + "grad_norm": 1.388457634706935, + "learning_rate": 9.721938462142833e-05, + "loss": 2.8441, + "step": 8340 + }, + { + "epoch": 5.374536663980661, + "grad_norm": 1.5557816181245696, + "learning_rate": 9.72187216985751e-05, + "loss": 2.7749, + "step": 8341 + }, + { + "epoch": 5.375181305398872, + "grad_norm": 1.4364027499603182, + "learning_rate": 9.721805869899226e-05, + "loss": 3.211, + "step": 8342 + }, + { + "epoch": 5.375825946817083, + "grad_norm": 1.5276967837596622, + "learning_rate": 9.721739562268086e-05, + "loss": 2.9842, + "step": 8343 + }, + { + "epoch": 5.376470588235295, + "grad_norm": 1.5976582562212243, + "learning_rate": 9.721673246964197e-05, + "loss": 2.8937, + "step": 8344 + }, + { + "epoch": 5.377115229653505, + "grad_norm": 1.6375887323721015, + "learning_rate": 9.721606923987672e-05, + "loss": 3.0663, + "step": 8345 + }, + { + "epoch": 5.377759871071716, + "grad_norm": 1.4918535234232357, + "learning_rate": 9.72154059333862e-05, + "loss": 3.0452, + "step": 8346 + }, + { + "epoch": 5.378404512489928, + "grad_norm": 1.5274672458735636, + "learning_rate": 9.721474255017146e-05, + "loss": 2.8419, + "step": 8347 + }, + { + "epoch": 5.379049153908139, + "grad_norm": 1.5935272821643143, + "learning_rate": 9.721407909023364e-05, + "loss": 2.7281, + "step": 8348 + }, + { + "epoch": 5.37969379532635, + "grad_norm": 1.5941466465893264, + "learning_rate": 9.721341555357379e-05, + "loss": 3.0287, + "step": 8349 + }, + { + "epoch": 5.380338436744561, + "grad_norm": 1.5782272166798357, + "learning_rate": 9.7212751940193e-05, + "loss": 2.5746, + "step": 8350 + }, + { + "epoch": 5.380983078162772, + "grad_norm": 1.439631054256054, + "learning_rate": 9.72120882500924e-05, + "loss": 2.8985, + "step": 8351 + }, + { + "epoch": 5.381627719580983, + "grad_norm": 1.591774245589868, + "learning_rate": 9.721142448327304e-05, + "loss": 3.0185, + "step": 8352 + }, + { + "epoch": 5.3822723609991945, + "grad_norm": 1.6046330807571667, + "learning_rate": 9.721076063973604e-05, + "loss": 2.8358, + "step": 8353 + }, + { + "epoch": 5.382917002417406, + "grad_norm": 1.786929852831182, + "learning_rate": 9.721009671948244e-05, + "loss": 2.6002, + "step": 8354 + }, + { + "epoch": 5.383561643835616, + "grad_norm": 1.661199681115397, + "learning_rate": 9.720943272251338e-05, + "loss": 3.1788, + "step": 8355 + }, + { + "epoch": 5.3842062852538275, + "grad_norm": 1.5662398344116342, + "learning_rate": 9.720876864882994e-05, + "loss": 2.6995, + "step": 8356 + }, + { + "epoch": 5.384850926672039, + "grad_norm": 1.5100201817187096, + "learning_rate": 9.72081044984332e-05, + "loss": 3.2039, + "step": 8357 + }, + { + "epoch": 5.38549556809025, + "grad_norm": 1.6075242113822743, + "learning_rate": 9.720744027132425e-05, + "loss": 2.7391, + "step": 8358 + }, + { + "epoch": 5.3861402095084605, + "grad_norm": 1.6505193290088764, + "learning_rate": 9.72067759675042e-05, + "loss": 2.7578, + "step": 8359 + }, + { + "epoch": 5.386784850926672, + "grad_norm": 1.8008765600153278, + "learning_rate": 9.720611158697412e-05, + "loss": 2.7317, + "step": 8360 + }, + { + "epoch": 5.387429492344883, + "grad_norm": 1.4887378908246396, + "learning_rate": 9.720544712973512e-05, + "loss": 2.9228, + "step": 8361 + }, + { + "epoch": 5.388074133763094, + "grad_norm": 1.5728000052848383, + "learning_rate": 9.720478259578826e-05, + "loss": 2.9683, + "step": 8362 + }, + { + "epoch": 5.388718775181306, + "grad_norm": 1.479840457255096, + "learning_rate": 9.720411798513468e-05, + "loss": 2.794, + "step": 8363 + }, + { + "epoch": 5.389363416599516, + "grad_norm": 1.7172783617391536, + "learning_rate": 9.720345329777541e-05, + "loss": 2.8515, + "step": 8364 + }, + { + "epoch": 5.390008058017727, + "grad_norm": 1.5234799883016779, + "learning_rate": 9.72027885337116e-05, + "loss": 3.1041, + "step": 8365 + }, + { + "epoch": 5.390652699435939, + "grad_norm": 1.6163795934049028, + "learning_rate": 9.720212369294431e-05, + "loss": 2.9186, + "step": 8366 + }, + { + "epoch": 5.39129734085415, + "grad_norm": 1.614101756804547, + "learning_rate": 9.720145877547463e-05, + "loss": 2.8204, + "step": 8367 + }, + { + "epoch": 5.391941982272361, + "grad_norm": 1.5832669511127448, + "learning_rate": 9.720079378130367e-05, + "loss": 2.747, + "step": 8368 + }, + { + "epoch": 5.392586623690572, + "grad_norm": 1.6925854482240739, + "learning_rate": 9.720012871043252e-05, + "loss": 3.0228, + "step": 8369 + }, + { + "epoch": 5.393231265108783, + "grad_norm": 1.729888769165375, + "learning_rate": 9.719946356286225e-05, + "loss": 2.8658, + "step": 8370 + }, + { + "epoch": 5.393875906526994, + "grad_norm": 1.6077739010398866, + "learning_rate": 9.719879833859398e-05, + "loss": 2.9252, + "step": 8371 + }, + { + "epoch": 5.394520547945206, + "grad_norm": 1.8844869378509692, + "learning_rate": 9.719813303762878e-05, + "loss": 3.2111, + "step": 8372 + }, + { + "epoch": 5.395165189363417, + "grad_norm": 1.6883170468020312, + "learning_rate": 9.719746765996776e-05, + "loss": 3.0912, + "step": 8373 + }, + { + "epoch": 5.395809830781627, + "grad_norm": 1.7557306005340376, + "learning_rate": 9.7196802205612e-05, + "loss": 2.5555, + "step": 8374 + }, + { + "epoch": 5.396454472199839, + "grad_norm": 1.548657599606138, + "learning_rate": 9.719613667456261e-05, + "loss": 2.9339, + "step": 8375 + }, + { + "epoch": 5.39709911361805, + "grad_norm": 2.3724545969848125, + "learning_rate": 9.719547106682065e-05, + "loss": 2.8424, + "step": 8376 + }, + { + "epoch": 5.397743755036261, + "grad_norm": 1.8558758841478056, + "learning_rate": 9.719480538238725e-05, + "loss": 3.211, + "step": 8377 + }, + { + "epoch": 5.3983883964544725, + "grad_norm": 2.1085824862021263, + "learning_rate": 9.719413962126349e-05, + "loss": 2.779, + "step": 8378 + }, + { + "epoch": 5.399033037872683, + "grad_norm": 2.267550296889941, + "learning_rate": 9.719347378345046e-05, + "loss": 2.9805, + "step": 8379 + }, + { + "epoch": 5.399677679290894, + "grad_norm": 1.7242056391703575, + "learning_rate": 9.719280786894926e-05, + "loss": 2.9493, + "step": 8380 + }, + { + "epoch": 5.400322320709106, + "grad_norm": 2.212457387092582, + "learning_rate": 9.719214187776096e-05, + "loss": 2.6607, + "step": 8381 + }, + { + "epoch": 5.400966962127317, + "grad_norm": 1.716435604887175, + "learning_rate": 9.719147580988669e-05, + "loss": 3.0355, + "step": 8382 + }, + { + "epoch": 5.401611603545528, + "grad_norm": 2.592337208034638, + "learning_rate": 9.719080966532754e-05, + "loss": 2.9832, + "step": 8383 + }, + { + "epoch": 5.402256244963739, + "grad_norm": 1.4101806449021657, + "learning_rate": 9.719014344408456e-05, + "loss": 2.59, + "step": 8384 + }, + { + "epoch": 5.40290088638195, + "grad_norm": 2.3036696311803997, + "learning_rate": 9.718947714615888e-05, + "loss": 2.9548, + "step": 8385 + }, + { + "epoch": 5.403545527800161, + "grad_norm": 1.553551881645934, + "learning_rate": 9.718881077155159e-05, + "loss": 2.8669, + "step": 8386 + }, + { + "epoch": 5.4041901692183725, + "grad_norm": 2.60697593512922, + "learning_rate": 9.71881443202638e-05, + "loss": 3.3261, + "step": 8387 + }, + { + "epoch": 5.404834810636584, + "grad_norm": 1.571664872638453, + "learning_rate": 9.718747779229656e-05, + "loss": 3.039, + "step": 8388 + }, + { + "epoch": 5.405479452054794, + "grad_norm": 2.4778623504520634, + "learning_rate": 9.718681118765101e-05, + "loss": 2.6204, + "step": 8389 + }, + { + "epoch": 5.4061240934730055, + "grad_norm": 1.6316238225191184, + "learning_rate": 9.718614450632824e-05, + "loss": 2.8671, + "step": 8390 + }, + { + "epoch": 5.406768734891217, + "grad_norm": 2.1167008634735627, + "learning_rate": 9.718547774832931e-05, + "loss": 2.8389, + "step": 8391 + }, + { + "epoch": 5.407413376309428, + "grad_norm": 1.6888800851177972, + "learning_rate": 9.718481091365535e-05, + "loss": 2.9304, + "step": 8392 + }, + { + "epoch": 5.408058017727639, + "grad_norm": 2.3646431244446444, + "learning_rate": 9.718414400230746e-05, + "loss": 3.0834, + "step": 8393 + }, + { + "epoch": 5.40870265914585, + "grad_norm": 1.6704229695183102, + "learning_rate": 9.718347701428668e-05, + "loss": 2.8642, + "step": 8394 + }, + { + "epoch": 5.409347300564061, + "grad_norm": 2.0705571834432375, + "learning_rate": 9.718280994959416e-05, + "loss": 2.6474, + "step": 8395 + }, + { + "epoch": 5.409991941982272, + "grad_norm": 1.6642279052291948, + "learning_rate": 9.718214280823099e-05, + "loss": 3.0168, + "step": 8396 + }, + { + "epoch": 5.410636583400484, + "grad_norm": 1.8985114627299773, + "learning_rate": 9.718147559019825e-05, + "loss": 2.6556, + "step": 8397 + }, + { + "epoch": 5.411281224818695, + "grad_norm": 1.6914310168797018, + "learning_rate": 9.718080829549704e-05, + "loss": 3.2812, + "step": 8398 + }, + { + "epoch": 5.411925866236905, + "grad_norm": 1.7654309063576592, + "learning_rate": 9.718014092412846e-05, + "loss": 3.2085, + "step": 8399 + }, + { + "epoch": 5.412570507655117, + "grad_norm": 1.7889174930616194, + "learning_rate": 9.717947347609359e-05, + "loss": 3.0235, + "step": 8400 + }, + { + "epoch": 5.412570507655117, + "eval_loss": 4.469461917877197, + "eval_runtime": 2.9706, + "eval_samples_per_second": 33.663, + "eval_steps_per_second": 4.376, + "step": 8400 + }, + { + "epoch": 5.413215149073328, + "grad_norm": 1.6306234320618533, + "learning_rate": 9.717880595139355e-05, + "loss": 3.0538, + "step": 8401 + }, + { + "epoch": 5.413859790491539, + "grad_norm": 1.8697924059191762, + "learning_rate": 9.717813835002941e-05, + "loss": 2.842, + "step": 8402 + }, + { + "epoch": 5.414504431909751, + "grad_norm": 1.6609644773131882, + "learning_rate": 9.717747067200229e-05, + "loss": 2.8019, + "step": 8403 + }, + { + "epoch": 5.415149073327961, + "grad_norm": 1.5754525828589419, + "learning_rate": 9.717680291731328e-05, + "loss": 3.0897, + "step": 8404 + }, + { + "epoch": 5.415793714746172, + "grad_norm": 1.6791068901236608, + "learning_rate": 9.717613508596349e-05, + "loss": 2.7927, + "step": 8405 + }, + { + "epoch": 5.416438356164384, + "grad_norm": 1.71558480363014, + "learning_rate": 9.717546717795397e-05, + "loss": 2.799, + "step": 8406 + }, + { + "epoch": 5.417082997582595, + "grad_norm": 1.626025828271331, + "learning_rate": 9.717479919328588e-05, + "loss": 2.8374, + "step": 8407 + }, + { + "epoch": 5.417727639000805, + "grad_norm": 1.9850594244801363, + "learning_rate": 9.717413113196027e-05, + "loss": 2.7067, + "step": 8408 + }, + { + "epoch": 5.418372280419017, + "grad_norm": 1.690150375157107, + "learning_rate": 9.717346299397827e-05, + "loss": 3.0264, + "step": 8409 + }, + { + "epoch": 5.419016921837228, + "grad_norm": 1.6398557323301646, + "learning_rate": 9.717279477934093e-05, + "loss": 3.0538, + "step": 8410 + }, + { + "epoch": 5.419661563255439, + "grad_norm": 1.5836847112088421, + "learning_rate": 9.717212648804939e-05, + "loss": 2.9303, + "step": 8411 + }, + { + "epoch": 5.4203062046736505, + "grad_norm": 1.7538951374812737, + "learning_rate": 9.717145812010473e-05, + "loss": 2.8514, + "step": 8412 + }, + { + "epoch": 5.420950846091861, + "grad_norm": 1.443637141145392, + "learning_rate": 9.717078967550807e-05, + "loss": 2.5527, + "step": 8413 + }, + { + "epoch": 5.421595487510072, + "grad_norm": 1.5737962627947675, + "learning_rate": 9.717012115426049e-05, + "loss": 2.6368, + "step": 8414 + }, + { + "epoch": 5.4222401289282836, + "grad_norm": 1.4545972982891597, + "learning_rate": 9.716945255636308e-05, + "loss": 3.0627, + "step": 8415 + }, + { + "epoch": 5.422884770346495, + "grad_norm": 1.792297423652421, + "learning_rate": 9.716878388181692e-05, + "loss": 2.8517, + "step": 8416 + }, + { + "epoch": 5.423529411764706, + "grad_norm": 1.4558585821547485, + "learning_rate": 9.716811513062318e-05, + "loss": 2.9008, + "step": 8417 + }, + { + "epoch": 5.424174053182917, + "grad_norm": 1.6781837366545433, + "learning_rate": 9.716744630278291e-05, + "loss": 2.7535, + "step": 8418 + }, + { + "epoch": 5.424818694601128, + "grad_norm": 1.831290740170773, + "learning_rate": 9.716677739829718e-05, + "loss": 2.7813, + "step": 8419 + }, + { + "epoch": 5.425463336019339, + "grad_norm": 2.062769462306306, + "learning_rate": 9.716610841716714e-05, + "loss": 2.8904, + "step": 8420 + }, + { + "epoch": 5.4261079774375505, + "grad_norm": 1.7078681638064153, + "learning_rate": 9.716543935939389e-05, + "loss": 3.21, + "step": 8421 + }, + { + "epoch": 5.426752618855762, + "grad_norm": 2.6142241913680064, + "learning_rate": 9.716477022497846e-05, + "loss": 3.0748, + "step": 8422 + }, + { + "epoch": 5.427397260273972, + "grad_norm": 2.0836307536032934, + "learning_rate": 9.716410101392203e-05, + "loss": 2.8053, + "step": 8423 + }, + { + "epoch": 5.4280419016921835, + "grad_norm": 1.9831332392377754, + "learning_rate": 9.716343172622566e-05, + "loss": 2.7543, + "step": 8424 + }, + { + "epoch": 5.428686543110395, + "grad_norm": 2.18786272827323, + "learning_rate": 9.716276236189046e-05, + "loss": 3.0436, + "step": 8425 + }, + { + "epoch": 5.429331184528606, + "grad_norm": 1.854961541625253, + "learning_rate": 9.716209292091752e-05, + "loss": 2.9847, + "step": 8426 + }, + { + "epoch": 5.429975825946817, + "grad_norm": 1.8011873051813148, + "learning_rate": 9.716142340330795e-05, + "loss": 2.999, + "step": 8427 + }, + { + "epoch": 5.430620467365028, + "grad_norm": 1.7346296738246536, + "learning_rate": 9.716075380906284e-05, + "loss": 2.9504, + "step": 8428 + }, + { + "epoch": 5.431265108783239, + "grad_norm": 2.162681976989563, + "learning_rate": 9.716008413818329e-05, + "loss": 2.7898, + "step": 8429 + }, + { + "epoch": 5.43190975020145, + "grad_norm": 1.9516820834223003, + "learning_rate": 9.71594143906704e-05, + "loss": 3.1325, + "step": 8430 + }, + { + "epoch": 5.432554391619662, + "grad_norm": 2.1179128301082235, + "learning_rate": 9.715874456652529e-05, + "loss": 3.0477, + "step": 8431 + }, + { + "epoch": 5.433199033037873, + "grad_norm": 1.6716836215127375, + "learning_rate": 9.715807466574903e-05, + "loss": 3.0985, + "step": 8432 + }, + { + "epoch": 5.433843674456083, + "grad_norm": 1.8672766895498352, + "learning_rate": 9.715740468834274e-05, + "loss": 2.9586, + "step": 8433 + }, + { + "epoch": 5.434488315874295, + "grad_norm": 1.5990651137322267, + "learning_rate": 9.71567346343075e-05, + "loss": 2.8557, + "step": 8434 + }, + { + "epoch": 5.435132957292506, + "grad_norm": 1.8507210725454404, + "learning_rate": 9.715606450364444e-05, + "loss": 2.4318, + "step": 8435 + }, + { + "epoch": 5.435777598710717, + "grad_norm": 1.5527501150371472, + "learning_rate": 9.715539429635463e-05, + "loss": 2.7208, + "step": 8436 + }, + { + "epoch": 5.436422240128929, + "grad_norm": 1.8622532284009496, + "learning_rate": 9.71547240124392e-05, + "loss": 2.8315, + "step": 8437 + }, + { + "epoch": 5.437066881547139, + "grad_norm": 1.7365906721085171, + "learning_rate": 9.715405365189923e-05, + "loss": 3.1052, + "step": 8438 + }, + { + "epoch": 5.43771152296535, + "grad_norm": 1.6951403919176429, + "learning_rate": 9.715338321473585e-05, + "loss": 2.9613, + "step": 8439 + }, + { + "epoch": 5.438356164383562, + "grad_norm": 1.4798585390042662, + "learning_rate": 9.715271270095012e-05, + "loss": 2.8826, + "step": 8440 + }, + { + "epoch": 5.439000805801773, + "grad_norm": 1.9048579158962344, + "learning_rate": 9.715204211054316e-05, + "loss": 2.9435, + "step": 8441 + }, + { + "epoch": 5.439645447219984, + "grad_norm": 1.4728325565583409, + "learning_rate": 9.715137144351606e-05, + "loss": 3.1627, + "step": 8442 + }, + { + "epoch": 5.440290088638195, + "grad_norm": 1.756551828068619, + "learning_rate": 9.715070069986994e-05, + "loss": 2.9783, + "step": 8443 + }, + { + "epoch": 5.440934730056406, + "grad_norm": 1.578000648922595, + "learning_rate": 9.71500298796059e-05, + "loss": 3.1096, + "step": 8444 + }, + { + "epoch": 5.441579371474617, + "grad_norm": 1.5106561486782113, + "learning_rate": 9.714935898272503e-05, + "loss": 2.9047, + "step": 8445 + }, + { + "epoch": 5.4422240128928285, + "grad_norm": 1.5467903614019407, + "learning_rate": 9.714868800922845e-05, + "loss": 3.128, + "step": 8446 + }, + { + "epoch": 5.44286865431104, + "grad_norm": 1.4733442732764017, + "learning_rate": 9.714801695911726e-05, + "loss": 2.7315, + "step": 8447 + }, + { + "epoch": 5.44351329572925, + "grad_norm": 1.4361796095411685, + "learning_rate": 9.714734583239252e-05, + "loss": 3.0172, + "step": 8448 + }, + { + "epoch": 5.4441579371474615, + "grad_norm": 1.5690021518203554, + "learning_rate": 9.714667462905537e-05, + "loss": 3.1426, + "step": 8449 + }, + { + "epoch": 5.444802578565673, + "grad_norm": 1.700759765402143, + "learning_rate": 9.714600334910693e-05, + "loss": 2.9674, + "step": 8450 + }, + { + "epoch": 5.445447219983884, + "grad_norm": 1.512597316820578, + "learning_rate": 9.714533199254826e-05, + "loss": 2.7142, + "step": 8451 + }, + { + "epoch": 5.4460918614020954, + "grad_norm": 1.7934999568202312, + "learning_rate": 9.71446605593805e-05, + "loss": 3.018, + "step": 8452 + }, + { + "epoch": 5.446736502820306, + "grad_norm": 1.5352409765231694, + "learning_rate": 9.714398904960472e-05, + "loss": 2.7828, + "step": 8453 + }, + { + "epoch": 5.447381144238517, + "grad_norm": 1.589569956421959, + "learning_rate": 9.714331746322203e-05, + "loss": 3.1278, + "step": 8454 + }, + { + "epoch": 5.4480257856567285, + "grad_norm": 1.8549716956519455, + "learning_rate": 9.714264580023356e-05, + "loss": 2.9214, + "step": 8455 + }, + { + "epoch": 5.44867042707494, + "grad_norm": 1.5485601431314466, + "learning_rate": 9.714197406064038e-05, + "loss": 3.0446, + "step": 8456 + }, + { + "epoch": 5.449315068493151, + "grad_norm": 1.6864464171718967, + "learning_rate": 9.714130224444361e-05, + "loss": 2.8334, + "step": 8457 + }, + { + "epoch": 5.4499597099113615, + "grad_norm": 1.6607776151454476, + "learning_rate": 9.714063035164434e-05, + "loss": 2.9074, + "step": 8458 + }, + { + "epoch": 5.450604351329573, + "grad_norm": 1.4462834108566673, + "learning_rate": 9.713995838224372e-05, + "loss": 2.3839, + "step": 8459 + }, + { + "epoch": 5.451248992747784, + "grad_norm": 2.937292109911695, + "learning_rate": 9.713928633624277e-05, + "loss": 2.8015, + "step": 8460 + }, + { + "epoch": 5.451893634165995, + "grad_norm": 1.8762661534931928, + "learning_rate": 9.713861421364269e-05, + "loss": 3.1443, + "step": 8461 + }, + { + "epoch": 5.452538275584207, + "grad_norm": 2.225854876840701, + "learning_rate": 9.713794201444449e-05, + "loss": 2.9697, + "step": 8462 + }, + { + "epoch": 5.453182917002417, + "grad_norm": 1.7347096359793641, + "learning_rate": 9.713726973864935e-05, + "loss": 2.979, + "step": 8463 + }, + { + "epoch": 5.453827558420628, + "grad_norm": 1.8386134588786742, + "learning_rate": 9.713659738625833e-05, + "loss": 2.7545, + "step": 8464 + }, + { + "epoch": 5.45447219983884, + "grad_norm": 1.8405921063082462, + "learning_rate": 9.713592495727254e-05, + "loss": 3.034, + "step": 8465 + }, + { + "epoch": 5.455116841257051, + "grad_norm": 1.9330275014930958, + "learning_rate": 9.71352524516931e-05, + "loss": 3.1415, + "step": 8466 + }, + { + "epoch": 5.455761482675262, + "grad_norm": 1.5119261603161809, + "learning_rate": 9.713457986952112e-05, + "loss": 2.9818, + "step": 8467 + }, + { + "epoch": 5.456406124093473, + "grad_norm": 2.0011727558724552, + "learning_rate": 9.713390721075768e-05, + "loss": 2.8508, + "step": 8468 + }, + { + "epoch": 5.457050765511684, + "grad_norm": 1.8822058412456142, + "learning_rate": 9.713323447540389e-05, + "loss": 3.0283, + "step": 8469 + }, + { + "epoch": 5.457695406929895, + "grad_norm": 1.5423056399239097, + "learning_rate": 9.713256166346086e-05, + "loss": 3.1874, + "step": 8470 + }, + { + "epoch": 5.458340048348107, + "grad_norm": 1.5584655388886226, + "learning_rate": 9.713188877492971e-05, + "loss": 2.7216, + "step": 8471 + }, + { + "epoch": 5.458984689766318, + "grad_norm": 1.6007380635863764, + "learning_rate": 9.713121580981152e-05, + "loss": 2.924, + "step": 8472 + }, + { + "epoch": 5.459629331184528, + "grad_norm": 1.8327592727267696, + "learning_rate": 9.713054276810741e-05, + "loss": 3.0106, + "step": 8473 + }, + { + "epoch": 5.46027397260274, + "grad_norm": 1.7119819874906113, + "learning_rate": 9.712986964981848e-05, + "loss": 2.8951, + "step": 8474 + }, + { + "epoch": 5.460918614020951, + "grad_norm": 1.6597442666961462, + "learning_rate": 9.712919645494584e-05, + "loss": 2.9441, + "step": 8475 + }, + { + "epoch": 5.461563255439162, + "grad_norm": 1.5707130352227372, + "learning_rate": 9.712852318349059e-05, + "loss": 2.6993, + "step": 8476 + }, + { + "epoch": 5.4622078968573735, + "grad_norm": 1.4913978440633604, + "learning_rate": 9.712784983545383e-05, + "loss": 2.9677, + "step": 8477 + }, + { + "epoch": 5.462852538275584, + "grad_norm": 1.5011909677071276, + "learning_rate": 9.71271764108367e-05, + "loss": 2.8891, + "step": 8478 + }, + { + "epoch": 5.463497179693795, + "grad_norm": 1.5050499899101808, + "learning_rate": 9.712650290964025e-05, + "loss": 2.905, + "step": 8479 + }, + { + "epoch": 5.4641418211120065, + "grad_norm": 1.4612361922985662, + "learning_rate": 9.712582933186561e-05, + "loss": 3.147, + "step": 8480 + }, + { + "epoch": 5.464786462530218, + "grad_norm": 1.8368537637282498, + "learning_rate": 9.712515567751392e-05, + "loss": 2.8468, + "step": 8481 + }, + { + "epoch": 5.465431103948429, + "grad_norm": 1.7311605173845899, + "learning_rate": 9.712448194658625e-05, + "loss": 3.2193, + "step": 8482 + }, + { + "epoch": 5.4660757453666395, + "grad_norm": 1.827634077553673, + "learning_rate": 9.712380813908371e-05, + "loss": 3.034, + "step": 8483 + }, + { + "epoch": 5.466720386784851, + "grad_norm": 1.842868847935458, + "learning_rate": 9.712313425500742e-05, + "loss": 2.9188, + "step": 8484 + }, + { + "epoch": 5.467365028203062, + "grad_norm": 1.8467798722992526, + "learning_rate": 9.712246029435849e-05, + "loss": 2.8551, + "step": 8485 + }, + { + "epoch": 5.468009669621273, + "grad_norm": 1.7753648124517547, + "learning_rate": 9.7121786257138e-05, + "loss": 2.9066, + "step": 8486 + }, + { + "epoch": 5.468654311039485, + "grad_norm": 1.7650310733677694, + "learning_rate": 9.712111214334707e-05, + "loss": 2.8721, + "step": 8487 + }, + { + "epoch": 5.469298952457695, + "grad_norm": 1.8844118573755926, + "learning_rate": 9.712043795298682e-05, + "loss": 2.8675, + "step": 8488 + }, + { + "epoch": 5.4699435938759065, + "grad_norm": 1.7527489615920055, + "learning_rate": 9.711976368605834e-05, + "loss": 3.2541, + "step": 8489 + }, + { + "epoch": 5.470588235294118, + "grad_norm": 1.5784347971619381, + "learning_rate": 9.711908934256274e-05, + "loss": 3.2534, + "step": 8490 + }, + { + "epoch": 5.471232876712329, + "grad_norm": 1.5276370984863425, + "learning_rate": 9.711841492250114e-05, + "loss": 3.0895, + "step": 8491 + }, + { + "epoch": 5.47187751813054, + "grad_norm": 1.543217190603449, + "learning_rate": 9.711774042587465e-05, + "loss": 2.7759, + "step": 8492 + }, + { + "epoch": 5.472522159548751, + "grad_norm": 1.744905796606508, + "learning_rate": 9.711706585268435e-05, + "loss": 2.9772, + "step": 8493 + }, + { + "epoch": 5.473166800966962, + "grad_norm": 1.683397076226801, + "learning_rate": 9.711639120293139e-05, + "loss": 2.9794, + "step": 8494 + }, + { + "epoch": 5.473811442385173, + "grad_norm": 1.6882341450480973, + "learning_rate": 9.711571647661684e-05, + "loss": 3.1893, + "step": 8495 + }, + { + "epoch": 5.474456083803385, + "grad_norm": 1.6758230787989936, + "learning_rate": 9.711504167374182e-05, + "loss": 3.2722, + "step": 8496 + }, + { + "epoch": 5.475100725221595, + "grad_norm": 1.7073707533448752, + "learning_rate": 9.711436679430742e-05, + "loss": 3.0503, + "step": 8497 + }, + { + "epoch": 5.475745366639806, + "grad_norm": 1.6049507087927826, + "learning_rate": 9.711369183831478e-05, + "loss": 2.6421, + "step": 8498 + }, + { + "epoch": 5.476390008058018, + "grad_norm": 1.9410319316839297, + "learning_rate": 9.711301680576501e-05, + "loss": 2.9068, + "step": 8499 + }, + { + "epoch": 5.477034649476229, + "grad_norm": 1.9600520588272263, + "learning_rate": 9.711234169665921e-05, + "loss": 2.848, + "step": 8500 + }, + { + "epoch": 5.477034649476229, + "eval_loss": 4.491221904754639, + "eval_runtime": 2.9928, + "eval_samples_per_second": 33.413, + "eval_steps_per_second": 4.344, + "step": 8500 + }, + { + "epoch": 5.47767929089444, + "grad_norm": 1.7327628936334, + "learning_rate": 9.711166651099845e-05, + "loss": 3.1121, + "step": 8501 + }, + { + "epoch": 5.478323932312651, + "grad_norm": 1.846845905209121, + "learning_rate": 9.71109912487839e-05, + "loss": 3.0863, + "step": 8502 + }, + { + "epoch": 5.478968573730862, + "grad_norm": 1.760688652900645, + "learning_rate": 9.711031591001665e-05, + "loss": 3.0768, + "step": 8503 + }, + { + "epoch": 5.479613215149073, + "grad_norm": 1.7033071076687514, + "learning_rate": 9.710964049469779e-05, + "loss": 2.692, + "step": 8504 + }, + { + "epoch": 5.480257856567285, + "grad_norm": 1.6468322067065155, + "learning_rate": 9.710896500282844e-05, + "loss": 2.906, + "step": 8505 + }, + { + "epoch": 5.480902497985496, + "grad_norm": 1.5690252181077518, + "learning_rate": 9.710828943440973e-05, + "loss": 2.8854, + "step": 8506 + }, + { + "epoch": 5.481547139403706, + "grad_norm": 1.7656643116598858, + "learning_rate": 9.710761378944271e-05, + "loss": 3.2346, + "step": 8507 + }, + { + "epoch": 5.482191780821918, + "grad_norm": 1.4681981702044669, + "learning_rate": 9.710693806792855e-05, + "loss": 3.256, + "step": 8508 + }, + { + "epoch": 5.482836422240129, + "grad_norm": 1.705520037701859, + "learning_rate": 9.710626226986834e-05, + "loss": 3.0771, + "step": 8509 + }, + { + "epoch": 5.48348106365834, + "grad_norm": 1.6645044130648246, + "learning_rate": 9.71055863952632e-05, + "loss": 2.7162, + "step": 8510 + }, + { + "epoch": 5.4841257050765515, + "grad_norm": 1.4470910981245586, + "learning_rate": 9.710491044411422e-05, + "loss": 3.0929, + "step": 8511 + }, + { + "epoch": 5.484770346494762, + "grad_norm": 1.9532133052775142, + "learning_rate": 9.710423441642252e-05, + "loss": 2.5826, + "step": 8512 + }, + { + "epoch": 5.485414987912973, + "grad_norm": 1.4230351073410934, + "learning_rate": 9.71035583121892e-05, + "loss": 2.8601, + "step": 8513 + }, + { + "epoch": 5.4860596293311845, + "grad_norm": 1.883683655979059, + "learning_rate": 9.710288213141538e-05, + "loss": 3.1235, + "step": 8514 + }, + { + "epoch": 5.486704270749396, + "grad_norm": 1.6049134337219115, + "learning_rate": 9.71022058741022e-05, + "loss": 3.1784, + "step": 8515 + }, + { + "epoch": 5.487348912167607, + "grad_norm": 1.8866261868491554, + "learning_rate": 9.710152954025071e-05, + "loss": 3.1713, + "step": 8516 + }, + { + "epoch": 5.4879935535858175, + "grad_norm": 1.85912978975498, + "learning_rate": 9.710085312986207e-05, + "loss": 2.8394, + "step": 8517 + }, + { + "epoch": 5.488638195004029, + "grad_norm": 1.5366393959815638, + "learning_rate": 9.710017664293735e-05, + "loss": 2.8846, + "step": 8518 + }, + { + "epoch": 5.48928283642224, + "grad_norm": 1.86091141710068, + "learning_rate": 9.70995000794777e-05, + "loss": 2.9974, + "step": 8519 + }, + { + "epoch": 5.489927477840451, + "grad_norm": 1.6123738883233718, + "learning_rate": 9.709882343948421e-05, + "loss": 2.8721, + "step": 8520 + }, + { + "epoch": 5.490572119258663, + "grad_norm": 1.7781865952107994, + "learning_rate": 9.7098146722958e-05, + "loss": 3.3918, + "step": 8521 + }, + { + "epoch": 5.491216760676873, + "grad_norm": 1.5068506939737516, + "learning_rate": 9.709746992990019e-05, + "loss": 2.7974, + "step": 8522 + }, + { + "epoch": 5.491861402095084, + "grad_norm": 1.5393717565972889, + "learning_rate": 9.709679306031186e-05, + "loss": 2.9244, + "step": 8523 + }, + { + "epoch": 5.492506043513296, + "grad_norm": 1.6339894057346154, + "learning_rate": 9.709611611419414e-05, + "loss": 2.9327, + "step": 8524 + }, + { + "epoch": 5.493150684931507, + "grad_norm": 1.6312468581805657, + "learning_rate": 9.709543909154816e-05, + "loss": 2.9088, + "step": 8525 + }, + { + "epoch": 5.493795326349718, + "grad_norm": 1.7876143479021978, + "learning_rate": 9.709476199237502e-05, + "loss": 3.0264, + "step": 8526 + }, + { + "epoch": 5.494439967767929, + "grad_norm": 1.6016520394035556, + "learning_rate": 9.70940848166758e-05, + "loss": 2.9547, + "step": 8527 + }, + { + "epoch": 5.49508460918614, + "grad_norm": 1.9473706739436265, + "learning_rate": 9.709340756445166e-05, + "loss": 2.7031, + "step": 8528 + }, + { + "epoch": 5.495729250604351, + "grad_norm": 1.8818484754999518, + "learning_rate": 9.709273023570369e-05, + "loss": 2.8666, + "step": 8529 + }, + { + "epoch": 5.496373892022563, + "grad_norm": 1.5584209485139515, + "learning_rate": 9.709205283043298e-05, + "loss": 3.2011, + "step": 8530 + }, + { + "epoch": 5.497018533440774, + "grad_norm": 1.7447994440824033, + "learning_rate": 9.709137534864069e-05, + "loss": 3.0663, + "step": 8531 + }, + { + "epoch": 5.497663174858984, + "grad_norm": 1.9295088007119052, + "learning_rate": 9.70906977903279e-05, + "loss": 2.7521, + "step": 8532 + }, + { + "epoch": 5.498307816277196, + "grad_norm": 1.999581116192203, + "learning_rate": 9.709002015549573e-05, + "loss": 3.4144, + "step": 8533 + }, + { + "epoch": 5.498952457695407, + "grad_norm": 1.6765573505259832, + "learning_rate": 9.70893424441453e-05, + "loss": 3.5506, + "step": 8534 + }, + { + "epoch": 5.499597099113618, + "grad_norm": 1.5918610500367774, + "learning_rate": 9.708866465627772e-05, + "loss": 2.9998, + "step": 8535 + }, + { + "epoch": 5.500241740531829, + "grad_norm": 1.3929058550036315, + "learning_rate": 9.708798679189409e-05, + "loss": 2.9741, + "step": 8536 + }, + { + "epoch": 5.50088638195004, + "grad_norm": 1.5552441774814756, + "learning_rate": 9.708730885099555e-05, + "loss": 3.0174, + "step": 8537 + }, + { + "epoch": 5.501531023368251, + "grad_norm": 1.6446254705406715, + "learning_rate": 9.708663083358319e-05, + "loss": 2.8237, + "step": 8538 + }, + { + "epoch": 5.502175664786463, + "grad_norm": 1.4137093138201327, + "learning_rate": 9.708595273965813e-05, + "loss": 3.0031, + "step": 8539 + }, + { + "epoch": 5.502820306204674, + "grad_norm": 1.5296403239861391, + "learning_rate": 9.708527456922148e-05, + "loss": 2.6119, + "step": 8540 + }, + { + "epoch": 5.503464947622884, + "grad_norm": 1.6543285364263038, + "learning_rate": 9.708459632227436e-05, + "loss": 3.086, + "step": 8541 + }, + { + "epoch": 5.504109589041096, + "grad_norm": 1.6008407406996832, + "learning_rate": 9.708391799881789e-05, + "loss": 2.887, + "step": 8542 + }, + { + "epoch": 5.504754230459307, + "grad_norm": 1.583779822960499, + "learning_rate": 9.708323959885319e-05, + "loss": 2.7865, + "step": 8543 + }, + { + "epoch": 5.505398871877518, + "grad_norm": 1.615542363973502, + "learning_rate": 9.708256112238134e-05, + "loss": 2.7043, + "step": 8544 + }, + { + "epoch": 5.5060435132957295, + "grad_norm": 1.5873870447598948, + "learning_rate": 9.708188256940348e-05, + "loss": 3.1745, + "step": 8545 + }, + { + "epoch": 5.50668815471394, + "grad_norm": 1.7127644242234104, + "learning_rate": 9.708120393992071e-05, + "loss": 2.9838, + "step": 8546 + }, + { + "epoch": 5.507332796132151, + "grad_norm": 1.7073249121594942, + "learning_rate": 9.708052523393418e-05, + "loss": 2.88, + "step": 8547 + }, + { + "epoch": 5.5079774375503625, + "grad_norm": 1.5314866296028287, + "learning_rate": 9.707984645144496e-05, + "loss": 3.0769, + "step": 8548 + }, + { + "epoch": 5.508622078968574, + "grad_norm": 1.5132817111919918, + "learning_rate": 9.70791675924542e-05, + "loss": 3.3023, + "step": 8549 + }, + { + "epoch": 5.509266720386785, + "grad_norm": 1.492888972995497, + "learning_rate": 9.707848865696298e-05, + "loss": 3.1896, + "step": 8550 + }, + { + "epoch": 5.5099113618049955, + "grad_norm": 1.919875667017389, + "learning_rate": 9.707780964497242e-05, + "loss": 2.8641, + "step": 8551 + }, + { + "epoch": 5.510556003223207, + "grad_norm": 1.4487707953367783, + "learning_rate": 9.707713055648368e-05, + "loss": 2.8656, + "step": 8552 + }, + { + "epoch": 5.511200644641418, + "grad_norm": 2.009024194671416, + "learning_rate": 9.707645139149783e-05, + "loss": 3.0411, + "step": 8553 + }, + { + "epoch": 5.511845286059629, + "grad_norm": 1.8740388545359519, + "learning_rate": 9.7075772150016e-05, + "loss": 3.0545, + "step": 8554 + }, + { + "epoch": 5.512489927477841, + "grad_norm": 1.7870081304935255, + "learning_rate": 9.707509283203931e-05, + "loss": 2.892, + "step": 8555 + }, + { + "epoch": 5.513134568896051, + "grad_norm": 2.0021342430401936, + "learning_rate": 9.707441343756886e-05, + "loss": 2.9729, + "step": 8556 + }, + { + "epoch": 5.513779210314262, + "grad_norm": 1.4846929743508195, + "learning_rate": 9.707373396660579e-05, + "loss": 2.8721, + "step": 8557 + }, + { + "epoch": 5.514423851732474, + "grad_norm": 1.9866474219635313, + "learning_rate": 9.707305441915122e-05, + "loss": 3.1859, + "step": 8558 + }, + { + "epoch": 5.515068493150685, + "grad_norm": 1.5199280001913476, + "learning_rate": 9.707237479520622e-05, + "loss": 2.8705, + "step": 8559 + }, + { + "epoch": 5.515713134568896, + "grad_norm": 1.9512319316608018, + "learning_rate": 9.707169509477194e-05, + "loss": 2.9083, + "step": 8560 + }, + { + "epoch": 5.516357775987107, + "grad_norm": 1.7096126068682043, + "learning_rate": 9.70710153178495e-05, + "loss": 3.1662, + "step": 8561 + }, + { + "epoch": 5.517002417405318, + "grad_norm": 1.9759873238460317, + "learning_rate": 9.707033546443999e-05, + "loss": 3.0328, + "step": 8562 + }, + { + "epoch": 5.517647058823529, + "grad_norm": 1.6067636124632005, + "learning_rate": 9.706965553454456e-05, + "loss": 2.9174, + "step": 8563 + }, + { + "epoch": 5.518291700241741, + "grad_norm": 1.8654902492454661, + "learning_rate": 9.70689755281643e-05, + "loss": 3.0271, + "step": 8564 + }, + { + "epoch": 5.518936341659952, + "grad_norm": 1.484423613073976, + "learning_rate": 9.706829544530034e-05, + "loss": 2.9994, + "step": 8565 + }, + { + "epoch": 5.519580983078162, + "grad_norm": 1.7393842692447543, + "learning_rate": 9.706761528595382e-05, + "loss": 3.0045, + "step": 8566 + }, + { + "epoch": 5.520225624496374, + "grad_norm": 1.5899887892364992, + "learning_rate": 9.706693505012581e-05, + "loss": 2.9231, + "step": 8567 + }, + { + "epoch": 5.520870265914585, + "grad_norm": 1.5883311067929498, + "learning_rate": 9.706625473781744e-05, + "loss": 2.8677, + "step": 8568 + }, + { + "epoch": 5.521514907332796, + "grad_norm": 1.520730392350124, + "learning_rate": 9.706557434902986e-05, + "loss": 2.8816, + "step": 8569 + }, + { + "epoch": 5.522159548751008, + "grad_norm": 1.5469893738347185, + "learning_rate": 9.706489388376415e-05, + "loss": 2.9246, + "step": 8570 + }, + { + "epoch": 5.522804190169218, + "grad_norm": 1.5540460747976, + "learning_rate": 9.706421334202144e-05, + "loss": 2.9862, + "step": 8571 + }, + { + "epoch": 5.523448831587429, + "grad_norm": 1.628413797876734, + "learning_rate": 9.706353272380286e-05, + "loss": 3.056, + "step": 8572 + }, + { + "epoch": 5.524093473005641, + "grad_norm": 1.494349024109918, + "learning_rate": 9.70628520291095e-05, + "loss": 2.7416, + "step": 8573 + }, + { + "epoch": 5.524738114423852, + "grad_norm": 1.5060063927206118, + "learning_rate": 9.70621712579425e-05, + "loss": 2.7422, + "step": 8574 + }, + { + "epoch": 5.525382755842063, + "grad_norm": 1.4646376902905194, + "learning_rate": 9.706149041030297e-05, + "loss": 2.9792, + "step": 8575 + }, + { + "epoch": 5.526027397260274, + "grad_norm": 1.612018420904749, + "learning_rate": 9.706080948619205e-05, + "loss": 2.8226, + "step": 8576 + }, + { + "epoch": 5.526672038678485, + "grad_norm": 1.5655709997066356, + "learning_rate": 9.706012848561082e-05, + "loss": 2.9728, + "step": 8577 + }, + { + "epoch": 5.527316680096696, + "grad_norm": 1.602306516733856, + "learning_rate": 9.705944740856042e-05, + "loss": 2.8122, + "step": 8578 + }, + { + "epoch": 5.5279613215149075, + "grad_norm": 1.5688398787261146, + "learning_rate": 9.7058766255042e-05, + "loss": 2.9362, + "step": 8579 + }, + { + "epoch": 5.528605962933119, + "grad_norm": 1.461703472823526, + "learning_rate": 9.70580850250566e-05, + "loss": 2.8748, + "step": 8580 + }, + { + "epoch": 5.529250604351329, + "grad_norm": 1.602192126923254, + "learning_rate": 9.705740371860541e-05, + "loss": 2.7075, + "step": 8581 + }, + { + "epoch": 5.5298952457695405, + "grad_norm": 1.506281656427431, + "learning_rate": 9.705672233568951e-05, + "loss": 2.9388, + "step": 8582 + }, + { + "epoch": 5.530539887187752, + "grad_norm": 1.7788064141526674, + "learning_rate": 9.705604087631004e-05, + "loss": 2.9738, + "step": 8583 + }, + { + "epoch": 5.531184528605963, + "grad_norm": 1.5988435856595025, + "learning_rate": 9.70553593404681e-05, + "loss": 2.9881, + "step": 8584 + }, + { + "epoch": 5.531829170024174, + "grad_norm": 1.6167569904802865, + "learning_rate": 9.705467772816484e-05, + "loss": 3.0109, + "step": 8585 + }, + { + "epoch": 5.532473811442385, + "grad_norm": 1.612425484870296, + "learning_rate": 9.705399603940136e-05, + "loss": 2.9932, + "step": 8586 + }, + { + "epoch": 5.533118452860596, + "grad_norm": 1.471961094063075, + "learning_rate": 9.705331427417876e-05, + "loss": 2.8183, + "step": 8587 + }, + { + "epoch": 5.533763094278807, + "grad_norm": 1.8302047284697098, + "learning_rate": 9.705263243249818e-05, + "loss": 3.1421, + "step": 8588 + }, + { + "epoch": 5.534407735697019, + "grad_norm": 1.7269855850894442, + "learning_rate": 9.705195051436076e-05, + "loss": 2.9109, + "step": 8589 + }, + { + "epoch": 5.53505237711523, + "grad_norm": 1.4523389823514516, + "learning_rate": 9.705126851976758e-05, + "loss": 3.1338, + "step": 8590 + }, + { + "epoch": 5.53569701853344, + "grad_norm": 1.6703546664689262, + "learning_rate": 9.70505864487198e-05, + "loss": 3.0599, + "step": 8591 + }, + { + "epoch": 5.536341659951652, + "grad_norm": 1.5598328168695783, + "learning_rate": 9.70499043012185e-05, + "loss": 3.2526, + "step": 8592 + }, + { + "epoch": 5.536986301369863, + "grad_norm": 1.8787556925741955, + "learning_rate": 9.704922207726482e-05, + "loss": 2.9537, + "step": 8593 + }, + { + "epoch": 5.537630942788074, + "grad_norm": 1.6480739678225287, + "learning_rate": 9.70485397768599e-05, + "loss": 2.8839, + "step": 8594 + }, + { + "epoch": 5.538275584206286, + "grad_norm": 1.8282482835269551, + "learning_rate": 9.704785740000483e-05, + "loss": 2.9447, + "step": 8595 + }, + { + "epoch": 5.538920225624496, + "grad_norm": 1.6644455578727657, + "learning_rate": 9.704717494670074e-05, + "loss": 3.069, + "step": 8596 + }, + { + "epoch": 5.539564867042707, + "grad_norm": 1.7529585508449252, + "learning_rate": 9.704649241694875e-05, + "loss": 3.1919, + "step": 8597 + }, + { + "epoch": 5.540209508460919, + "grad_norm": 1.771263541787228, + "learning_rate": 9.704580981074998e-05, + "loss": 3.2399, + "step": 8598 + }, + { + "epoch": 5.54085414987913, + "grad_norm": 1.4808956975703231, + "learning_rate": 9.704512712810557e-05, + "loss": 3.0754, + "step": 8599 + }, + { + "epoch": 5.541498791297341, + "grad_norm": 1.84032631956564, + "learning_rate": 9.704444436901661e-05, + "loss": 3.2851, + "step": 8600 + }, + { + "epoch": 5.541498791297341, + "eval_loss": 4.439577579498291, + "eval_runtime": 2.9764, + "eval_samples_per_second": 33.597, + "eval_steps_per_second": 4.368, + "step": 8600 + }, + { + "epoch": 5.542143432715552, + "grad_norm": 1.49820332912128, + "learning_rate": 9.704376153348426e-05, + "loss": 3.0109, + "step": 8601 + }, + { + "epoch": 5.542788074133763, + "grad_norm": 1.6170752178287353, + "learning_rate": 9.704307862150958e-05, + "loss": 3.0443, + "step": 8602 + }, + { + "epoch": 5.543432715551974, + "grad_norm": 1.6033356601512025, + "learning_rate": 9.704239563309377e-05, + "loss": 3.187, + "step": 8603 + }, + { + "epoch": 5.5440773569701856, + "grad_norm": 1.70030841130893, + "learning_rate": 9.70417125682379e-05, + "loss": 2.8248, + "step": 8604 + }, + { + "epoch": 5.544721998388397, + "grad_norm": 1.521349180342689, + "learning_rate": 9.70410294269431e-05, + "loss": 3.2475, + "step": 8605 + }, + { + "epoch": 5.545366639806607, + "grad_norm": 1.5105882703539268, + "learning_rate": 9.70403462092105e-05, + "loss": 2.9699, + "step": 8606 + }, + { + "epoch": 5.546011281224819, + "grad_norm": 1.756609472087505, + "learning_rate": 9.703966291504121e-05, + "loss": 2.9903, + "step": 8607 + }, + { + "epoch": 5.54665592264303, + "grad_norm": 2.001948629841641, + "learning_rate": 9.703897954443637e-05, + "loss": 3.188, + "step": 8608 + }, + { + "epoch": 5.547300564061241, + "grad_norm": 1.4869693547137586, + "learning_rate": 9.703829609739709e-05, + "loss": 3.0752, + "step": 8609 + }, + { + "epoch": 5.5479452054794525, + "grad_norm": 1.6398375751342533, + "learning_rate": 9.703761257392447e-05, + "loss": 3.4231, + "step": 8610 + }, + { + "epoch": 5.548589846897663, + "grad_norm": 1.4417350077003097, + "learning_rate": 9.703692897401968e-05, + "loss": 2.6586, + "step": 8611 + }, + { + "epoch": 5.549234488315874, + "grad_norm": 1.851721464751769, + "learning_rate": 9.703624529768384e-05, + "loss": 2.8981, + "step": 8612 + }, + { + "epoch": 5.5498791297340855, + "grad_norm": 1.6305388207050409, + "learning_rate": 9.703556154491802e-05, + "loss": 2.9876, + "step": 8613 + }, + { + "epoch": 5.550523771152297, + "grad_norm": 1.7410536633665217, + "learning_rate": 9.703487771572339e-05, + "loss": 2.9125, + "step": 8614 + }, + { + "epoch": 5.551168412570508, + "grad_norm": 1.6740751102457607, + "learning_rate": 9.703419381010106e-05, + "loss": 2.9122, + "step": 8615 + }, + { + "epoch": 5.5518130539887185, + "grad_norm": 1.6525484976691975, + "learning_rate": 9.703350982805215e-05, + "loss": 2.9185, + "step": 8616 + }, + { + "epoch": 5.55245769540693, + "grad_norm": 1.737349174714696, + "learning_rate": 9.70328257695778e-05, + "loss": 3.3719, + "step": 8617 + }, + { + "epoch": 5.553102336825141, + "grad_norm": 1.6245679726661846, + "learning_rate": 9.70321416346791e-05, + "loss": 2.8776, + "step": 8618 + }, + { + "epoch": 5.553746978243352, + "grad_norm": 1.6973632160649526, + "learning_rate": 9.703145742335718e-05, + "loss": 3.0116, + "step": 8619 + }, + { + "epoch": 5.554391619661564, + "grad_norm": 1.7366447328661827, + "learning_rate": 9.703077313561321e-05, + "loss": 3.261, + "step": 8620 + }, + { + "epoch": 5.555036261079774, + "grad_norm": 1.6137122097916314, + "learning_rate": 9.703008877144825e-05, + "loss": 2.8037, + "step": 8621 + }, + { + "epoch": 5.555680902497985, + "grad_norm": 1.7732821667715106, + "learning_rate": 9.702940433086348e-05, + "loss": 2.7073, + "step": 8622 + }, + { + "epoch": 5.556325543916197, + "grad_norm": 1.7059842481146887, + "learning_rate": 9.702871981385999e-05, + "loss": 3.1238, + "step": 8623 + }, + { + "epoch": 5.556970185334408, + "grad_norm": 1.7279286061974306, + "learning_rate": 9.70280352204389e-05, + "loss": 2.8046, + "step": 8624 + }, + { + "epoch": 5.557614826752619, + "grad_norm": 2.122698660152939, + "learning_rate": 9.702735055060137e-05, + "loss": 2.7625, + "step": 8625 + }, + { + "epoch": 5.55825946817083, + "grad_norm": 1.615295538106217, + "learning_rate": 9.70266658043485e-05, + "loss": 2.9815, + "step": 8626 + }, + { + "epoch": 5.558904109589041, + "grad_norm": 1.6153600877207408, + "learning_rate": 9.70259809816814e-05, + "loss": 3.0755, + "step": 8627 + }, + { + "epoch": 5.559548751007252, + "grad_norm": 1.6223643491897632, + "learning_rate": 9.702529608260121e-05, + "loss": 2.797, + "step": 8628 + }, + { + "epoch": 5.560193392425464, + "grad_norm": 1.602222911252279, + "learning_rate": 9.702461110710907e-05, + "loss": 2.8413, + "step": 8629 + }, + { + "epoch": 5.560838033843675, + "grad_norm": 1.7822840018599062, + "learning_rate": 9.702392605520609e-05, + "loss": 2.6899, + "step": 8630 + }, + { + "epoch": 5.561482675261885, + "grad_norm": 1.6167093812321909, + "learning_rate": 9.70232409268934e-05, + "loss": 2.9857, + "step": 8631 + }, + { + "epoch": 5.562127316680097, + "grad_norm": 1.607582015073865, + "learning_rate": 9.70225557221721e-05, + "loss": 3.0127, + "step": 8632 + }, + { + "epoch": 5.562771958098308, + "grad_norm": 1.7559683934803385, + "learning_rate": 9.702187044104335e-05, + "loss": 2.6909, + "step": 8633 + }, + { + "epoch": 5.563416599516519, + "grad_norm": 1.4835716999839965, + "learning_rate": 9.702118508350826e-05, + "loss": 2.8924, + "step": 8634 + }, + { + "epoch": 5.5640612409347305, + "grad_norm": 1.7434398944086384, + "learning_rate": 9.702049964956797e-05, + "loss": 2.925, + "step": 8635 + }, + { + "epoch": 5.564705882352941, + "grad_norm": 1.7808192906074032, + "learning_rate": 9.701981413922358e-05, + "loss": 2.9352, + "step": 8636 + }, + { + "epoch": 5.565350523771152, + "grad_norm": 2.1145273513227503, + "learning_rate": 9.701912855247624e-05, + "loss": 3.2106, + "step": 8637 + }, + { + "epoch": 5.5659951651893635, + "grad_norm": 2.326751621985127, + "learning_rate": 9.701844288932704e-05, + "loss": 3.0892, + "step": 8638 + }, + { + "epoch": 5.566639806607575, + "grad_norm": 1.7940175227803767, + "learning_rate": 9.701775714977716e-05, + "loss": 3.0977, + "step": 8639 + }, + { + "epoch": 5.567284448025786, + "grad_norm": 2.149649640229096, + "learning_rate": 9.701707133382768e-05, + "loss": 3.1088, + "step": 8640 + }, + { + "epoch": 5.567929089443997, + "grad_norm": 1.744843280178607, + "learning_rate": 9.701638544147977e-05, + "loss": 3.1345, + "step": 8641 + }, + { + "epoch": 5.568573730862208, + "grad_norm": 2.2978450728576556, + "learning_rate": 9.70156994727345e-05, + "loss": 3.3154, + "step": 8642 + }, + { + "epoch": 5.569218372280419, + "grad_norm": 1.8119173606955035, + "learning_rate": 9.701501342759303e-05, + "loss": 3.0794, + "step": 8643 + }, + { + "epoch": 5.5698630136986305, + "grad_norm": 2.175688787564837, + "learning_rate": 9.701432730605649e-05, + "loss": 3.1001, + "step": 8644 + }, + { + "epoch": 5.570507655116841, + "grad_norm": 1.8372191168499776, + "learning_rate": 9.7013641108126e-05, + "loss": 2.8187, + "step": 8645 + }, + { + "epoch": 5.571152296535052, + "grad_norm": 2.0508442153415785, + "learning_rate": 9.701295483380271e-05, + "loss": 2.9257, + "step": 8646 + }, + { + "epoch": 5.5717969379532635, + "grad_norm": 1.986434155488514, + "learning_rate": 9.70122684830877e-05, + "loss": 2.8826, + "step": 8647 + }, + { + "epoch": 5.572441579371475, + "grad_norm": 1.67706039977127, + "learning_rate": 9.701158205598212e-05, + "loss": 3.0508, + "step": 8648 + }, + { + "epoch": 5.573086220789686, + "grad_norm": 1.9516874759439866, + "learning_rate": 9.70108955524871e-05, + "loss": 2.9281, + "step": 8649 + }, + { + "epoch": 5.5737308622078965, + "grad_norm": 1.4865378277015573, + "learning_rate": 9.701020897260378e-05, + "loss": 3.2957, + "step": 8650 + }, + { + "epoch": 5.574375503626108, + "grad_norm": 1.7523635708337089, + "learning_rate": 9.700952231633327e-05, + "loss": 3.23, + "step": 8651 + }, + { + "epoch": 5.575020145044319, + "grad_norm": 1.7527296902079086, + "learning_rate": 9.70088355836767e-05, + "loss": 3.0524, + "step": 8652 + }, + { + "epoch": 5.57566478646253, + "grad_norm": 1.6566592327268526, + "learning_rate": 9.700814877463519e-05, + "loss": 3.2677, + "step": 8653 + }, + { + "epoch": 5.576309427880742, + "grad_norm": 1.4748131978463785, + "learning_rate": 9.700746188920989e-05, + "loss": 2.9454, + "step": 8654 + }, + { + "epoch": 5.576954069298952, + "grad_norm": 1.6550576219511763, + "learning_rate": 9.700677492740194e-05, + "loss": 3.022, + "step": 8655 + }, + { + "epoch": 5.577598710717163, + "grad_norm": 1.7463181489580153, + "learning_rate": 9.70060878892124e-05, + "loss": 2.8172, + "step": 8656 + }, + { + "epoch": 5.578243352135375, + "grad_norm": 1.6329032507766523, + "learning_rate": 9.700540077464246e-05, + "loss": 3.2222, + "step": 8657 + }, + { + "epoch": 5.578887993553586, + "grad_norm": 1.6085212060452883, + "learning_rate": 9.700471358369326e-05, + "loss": 2.8785, + "step": 8658 + }, + { + "epoch": 5.579532634971797, + "grad_norm": 1.6644642898620874, + "learning_rate": 9.700402631636584e-05, + "loss": 3.2578, + "step": 8659 + }, + { + "epoch": 5.580177276390008, + "grad_norm": 1.5179538816858793, + "learning_rate": 9.700333897266144e-05, + "loss": 2.8651, + "step": 8660 + }, + { + "epoch": 5.580821917808219, + "grad_norm": 1.5473952696483193, + "learning_rate": 9.700265155258112e-05, + "loss": 2.9991, + "step": 8661 + }, + { + "epoch": 5.58146655922643, + "grad_norm": 1.5110310354991854, + "learning_rate": 9.700196405612604e-05, + "loss": 2.662, + "step": 8662 + }, + { + "epoch": 5.582111200644642, + "grad_norm": 1.7776458711062182, + "learning_rate": 9.700127648329729e-05, + "loss": 3.269, + "step": 8663 + }, + { + "epoch": 5.582755842062853, + "grad_norm": 1.4632789414475054, + "learning_rate": 9.700058883409604e-05, + "loss": 3.0235, + "step": 8664 + }, + { + "epoch": 5.583400483481063, + "grad_norm": 1.6516309738269745, + "learning_rate": 9.69999011085234e-05, + "loss": 3.1432, + "step": 8665 + }, + { + "epoch": 5.584045124899275, + "grad_norm": 1.4451048777183655, + "learning_rate": 9.699921330658051e-05, + "loss": 2.9368, + "step": 8666 + }, + { + "epoch": 5.584689766317486, + "grad_norm": 1.7117138541566839, + "learning_rate": 9.699852542826849e-05, + "loss": 3.0443, + "step": 8667 + }, + { + "epoch": 5.585334407735697, + "grad_norm": 1.5960030135397845, + "learning_rate": 9.699783747358848e-05, + "loss": 3.0757, + "step": 8668 + }, + { + "epoch": 5.5859790491539085, + "grad_norm": 1.8383553759326916, + "learning_rate": 9.699714944254159e-05, + "loss": 2.921, + "step": 8669 + }, + { + "epoch": 5.586623690572119, + "grad_norm": 1.6655947412616485, + "learning_rate": 9.699646133512897e-05, + "loss": 2.8946, + "step": 8670 + }, + { + "epoch": 5.58726833199033, + "grad_norm": 1.4523045733931101, + "learning_rate": 9.699577315135175e-05, + "loss": 3.2435, + "step": 8671 + }, + { + "epoch": 5.5879129734085415, + "grad_norm": 1.4981450896335327, + "learning_rate": 9.699508489121105e-05, + "loss": 3.0926, + "step": 8672 + }, + { + "epoch": 5.588557614826753, + "grad_norm": 1.623235897007477, + "learning_rate": 9.6994396554708e-05, + "loss": 3.0092, + "step": 8673 + }, + { + "epoch": 5.589202256244963, + "grad_norm": 1.491980195877957, + "learning_rate": 9.699370814184373e-05, + "loss": 2.7427, + "step": 8674 + }, + { + "epoch": 5.5898468976631746, + "grad_norm": 1.6732763680019727, + "learning_rate": 9.699301965261939e-05, + "loss": 2.8346, + "step": 8675 + }, + { + "epoch": 5.590491539081386, + "grad_norm": 1.463728538149246, + "learning_rate": 9.699233108703609e-05, + "loss": 2.9363, + "step": 8676 + }, + { + "epoch": 5.591136180499597, + "grad_norm": 1.7188779859515833, + "learning_rate": 9.699164244509496e-05, + "loss": 3.232, + "step": 8677 + }, + { + "epoch": 5.5917808219178085, + "grad_norm": 1.462671773185325, + "learning_rate": 9.699095372679714e-05, + "loss": 2.85, + "step": 8678 + }, + { + "epoch": 5.592425463336019, + "grad_norm": 1.6653035328865846, + "learning_rate": 9.699026493214376e-05, + "loss": 3.1976, + "step": 8679 + }, + { + "epoch": 5.59307010475423, + "grad_norm": 1.48641534263315, + "learning_rate": 9.698957606113594e-05, + "loss": 3.1203, + "step": 8680 + }, + { + "epoch": 5.5937147461724415, + "grad_norm": 1.5839803869660514, + "learning_rate": 9.698888711377485e-05, + "loss": 3.1688, + "step": 8681 + }, + { + "epoch": 5.594359387590653, + "grad_norm": 1.5588104962900058, + "learning_rate": 9.698819809006157e-05, + "loss": 2.8538, + "step": 8682 + }, + { + "epoch": 5.595004029008864, + "grad_norm": 1.4911436578681387, + "learning_rate": 9.698750898999727e-05, + "loss": 2.8443, + "step": 8683 + }, + { + "epoch": 5.5956486704270745, + "grad_norm": 1.4925123998824716, + "learning_rate": 9.698681981358304e-05, + "loss": 2.9368, + "step": 8684 + }, + { + "epoch": 5.596293311845286, + "grad_norm": 1.5350594088596377, + "learning_rate": 9.698613056082006e-05, + "loss": 2.9949, + "step": 8685 + }, + { + "epoch": 5.596937953263497, + "grad_norm": 1.5926510428270295, + "learning_rate": 9.698544123170943e-05, + "loss": 2.7617, + "step": 8686 + }, + { + "epoch": 5.597582594681708, + "grad_norm": 1.5663580232694292, + "learning_rate": 9.69847518262523e-05, + "loss": 3.1935, + "step": 8687 + }, + { + "epoch": 5.59822723609992, + "grad_norm": 1.4827904625988824, + "learning_rate": 9.698406234444981e-05, + "loss": 2.9428, + "step": 8688 + }, + { + "epoch": 5.59887187751813, + "grad_norm": 1.4605869601202301, + "learning_rate": 9.698337278630305e-05, + "loss": 2.8639, + "step": 8689 + }, + { + "epoch": 5.599516518936341, + "grad_norm": 1.4880981615101088, + "learning_rate": 9.69826831518132e-05, + "loss": 3.0277, + "step": 8690 + }, + { + "epoch": 5.600161160354553, + "grad_norm": 1.531456387977809, + "learning_rate": 9.698199344098136e-05, + "loss": 3.085, + "step": 8691 + }, + { + "epoch": 5.600805801772764, + "grad_norm": 1.4307443716558583, + "learning_rate": 9.698130365380868e-05, + "loss": 3.101, + "step": 8692 + }, + { + "epoch": 5.601450443190975, + "grad_norm": 1.6367082186599238, + "learning_rate": 9.698061379029628e-05, + "loss": 3.1032, + "step": 8693 + }, + { + "epoch": 5.602095084609186, + "grad_norm": 1.4386823425035715, + "learning_rate": 9.69799238504453e-05, + "loss": 3.0637, + "step": 8694 + }, + { + "epoch": 5.602739726027397, + "grad_norm": 1.5240462630313099, + "learning_rate": 9.697923383425688e-05, + "loss": 2.9719, + "step": 8695 + }, + { + "epoch": 5.603384367445608, + "grad_norm": 1.605359425440222, + "learning_rate": 9.697854374173215e-05, + "loss": 3.0271, + "step": 8696 + }, + { + "epoch": 5.60402900886382, + "grad_norm": 1.4530984520985115, + "learning_rate": 9.697785357287223e-05, + "loss": 3.066, + "step": 8697 + }, + { + "epoch": 5.604673650282031, + "grad_norm": 1.6307650326163172, + "learning_rate": 9.697716332767828e-05, + "loss": 3.1292, + "step": 8698 + }, + { + "epoch": 5.605318291700241, + "grad_norm": 1.5802356865677385, + "learning_rate": 9.69764730061514e-05, + "loss": 3.1505, + "step": 8699 + }, + { + "epoch": 5.605962933118453, + "grad_norm": 1.3511598098811612, + "learning_rate": 9.697578260829277e-05, + "loss": 3.0028, + "step": 8700 + }, + { + "epoch": 5.605962933118453, + "eval_loss": 4.434357166290283, + "eval_runtime": 2.9808, + "eval_samples_per_second": 33.548, + "eval_steps_per_second": 4.361, + "step": 8700 + }, + { + "epoch": 5.606607574536664, + "grad_norm": 1.6533987363283333, + "learning_rate": 9.697509213410347e-05, + "loss": 2.8135, + "step": 8701 + }, + { + "epoch": 5.607252215954875, + "grad_norm": 1.631736785821975, + "learning_rate": 9.697440158358467e-05, + "loss": 2.6755, + "step": 8702 + }, + { + "epoch": 5.6078968573730865, + "grad_norm": 1.621627975183472, + "learning_rate": 9.697371095673749e-05, + "loss": 3.0614, + "step": 8703 + }, + { + "epoch": 5.608541498791297, + "grad_norm": 1.3669716562512926, + "learning_rate": 9.697302025356309e-05, + "loss": 2.8503, + "step": 8704 + }, + { + "epoch": 5.609186140209508, + "grad_norm": 1.5678775343208666, + "learning_rate": 9.697232947406254e-05, + "loss": 2.6721, + "step": 8705 + }, + { + "epoch": 5.6098307816277195, + "grad_norm": 1.5265072089927887, + "learning_rate": 9.697163861823704e-05, + "loss": 3.0371, + "step": 8706 + }, + { + "epoch": 5.610475423045931, + "grad_norm": 1.542212055321359, + "learning_rate": 9.697094768608771e-05, + "loss": 2.9312, + "step": 8707 + }, + { + "epoch": 5.611120064464142, + "grad_norm": 1.6049052787782525, + "learning_rate": 9.697025667761566e-05, + "loss": 2.7917, + "step": 8708 + }, + { + "epoch": 5.6117647058823525, + "grad_norm": 1.4140894355634723, + "learning_rate": 9.696956559282205e-05, + "loss": 2.9496, + "step": 8709 + }, + { + "epoch": 5.612409347300564, + "grad_norm": 1.4080240197162794, + "learning_rate": 9.6968874431708e-05, + "loss": 3.0931, + "step": 8710 + }, + { + "epoch": 5.613053988718775, + "grad_norm": 1.4591373630614237, + "learning_rate": 9.696818319427465e-05, + "loss": 2.7955, + "step": 8711 + }, + { + "epoch": 5.6136986301369864, + "grad_norm": 1.4974548214087289, + "learning_rate": 9.696749188052314e-05, + "loss": 2.9925, + "step": 8712 + }, + { + "epoch": 5.614343271555198, + "grad_norm": 1.329097685032986, + "learning_rate": 9.696680049045461e-05, + "loss": 3.1293, + "step": 8713 + }, + { + "epoch": 5.614987912973408, + "grad_norm": 1.3836235559195658, + "learning_rate": 9.69661090240702e-05, + "loss": 2.8344, + "step": 8714 + }, + { + "epoch": 5.6156325543916195, + "grad_norm": 1.5852091727600124, + "learning_rate": 9.6965417481371e-05, + "loss": 3.2335, + "step": 8715 + }, + { + "epoch": 5.616277195809831, + "grad_norm": 1.5443799434232788, + "learning_rate": 9.69647258623582e-05, + "loss": 3.1294, + "step": 8716 + }, + { + "epoch": 5.616921837228042, + "grad_norm": 1.5662361918012895, + "learning_rate": 9.69640341670329e-05, + "loss": 3.069, + "step": 8717 + }, + { + "epoch": 5.617566478646253, + "grad_norm": 1.5940412035089717, + "learning_rate": 9.696334239539626e-05, + "loss": 2.868, + "step": 8718 + }, + { + "epoch": 5.618211120064464, + "grad_norm": 1.6791145243859031, + "learning_rate": 9.69626505474494e-05, + "loss": 2.8676, + "step": 8719 + }, + { + "epoch": 5.618855761482675, + "grad_norm": 1.4929964334360961, + "learning_rate": 9.696195862319349e-05, + "loss": 2.9626, + "step": 8720 + }, + { + "epoch": 5.619500402900886, + "grad_norm": 1.7230891391938743, + "learning_rate": 9.696126662262961e-05, + "loss": 2.9852, + "step": 8721 + }, + { + "epoch": 5.620145044319098, + "grad_norm": 1.7329657655484618, + "learning_rate": 9.696057454575894e-05, + "loss": 3.0319, + "step": 8722 + }, + { + "epoch": 5.620789685737309, + "grad_norm": 1.6243677385931374, + "learning_rate": 9.69598823925826e-05, + "loss": 3.0243, + "step": 8723 + }, + { + "epoch": 5.621434327155519, + "grad_norm": 1.7317554353768227, + "learning_rate": 9.695919016310171e-05, + "loss": 3.0281, + "step": 8724 + }, + { + "epoch": 5.622078968573731, + "grad_norm": 2.1010639759705945, + "learning_rate": 9.695849785731747e-05, + "loss": 2.9028, + "step": 8725 + }, + { + "epoch": 5.622723609991942, + "grad_norm": 1.8758718930157774, + "learning_rate": 9.695780547523095e-05, + "loss": 3.317, + "step": 8726 + }, + { + "epoch": 5.623368251410153, + "grad_norm": 1.6823322874412636, + "learning_rate": 9.695711301684331e-05, + "loss": 3.1305, + "step": 8727 + }, + { + "epoch": 5.624012892828365, + "grad_norm": 1.6375679619619832, + "learning_rate": 9.695642048215568e-05, + "loss": 2.7301, + "step": 8728 + }, + { + "epoch": 5.624657534246575, + "grad_norm": 1.7148638265367235, + "learning_rate": 9.695572787116922e-05, + "loss": 3.0865, + "step": 8729 + }, + { + "epoch": 5.625302175664786, + "grad_norm": 1.7110352912215925, + "learning_rate": 9.695503518388503e-05, + "loss": 2.7128, + "step": 8730 + }, + { + "epoch": 5.625946817082998, + "grad_norm": 1.6564016872253802, + "learning_rate": 9.69543424203043e-05, + "loss": 3.1807, + "step": 8731 + }, + { + "epoch": 5.626591458501209, + "grad_norm": 1.6635444626726352, + "learning_rate": 9.695364958042813e-05, + "loss": 3.0556, + "step": 8732 + }, + { + "epoch": 5.62723609991942, + "grad_norm": 1.559287647687551, + "learning_rate": 9.695295666425766e-05, + "loss": 2.8395, + "step": 8733 + }, + { + "epoch": 5.627880741337631, + "grad_norm": 1.617932533405809, + "learning_rate": 9.695226367179404e-05, + "loss": 2.8644, + "step": 8734 + }, + { + "epoch": 5.628525382755842, + "grad_norm": 1.4927652548826187, + "learning_rate": 9.69515706030384e-05, + "loss": 2.7916, + "step": 8735 + }, + { + "epoch": 5.629170024174053, + "grad_norm": 1.625606726407081, + "learning_rate": 9.695087745799187e-05, + "loss": 3.2294, + "step": 8736 + }, + { + "epoch": 5.6298146655922645, + "grad_norm": 1.7571348923878418, + "learning_rate": 9.69501842366556e-05, + "loss": 3.1146, + "step": 8737 + }, + { + "epoch": 5.630459307010476, + "grad_norm": 1.8487506430713596, + "learning_rate": 9.694949093903076e-05, + "loss": 3.1588, + "step": 8738 + }, + { + "epoch": 5.631103948428686, + "grad_norm": 1.627384260055456, + "learning_rate": 9.694879756511842e-05, + "loss": 2.7685, + "step": 8739 + }, + { + "epoch": 5.6317485898468975, + "grad_norm": 1.7112523810884714, + "learning_rate": 9.694810411491976e-05, + "loss": 3.1819, + "step": 8740 + }, + { + "epoch": 5.632393231265109, + "grad_norm": 1.729266406686807, + "learning_rate": 9.694741058843592e-05, + "loss": 3.1303, + "step": 8741 + }, + { + "epoch": 5.63303787268332, + "grad_norm": 1.9864268688647162, + "learning_rate": 9.694671698566805e-05, + "loss": 2.8699, + "step": 8742 + }, + { + "epoch": 5.633682514101531, + "grad_norm": 1.5293465752673543, + "learning_rate": 9.694602330661725e-05, + "loss": 2.7237, + "step": 8743 + }, + { + "epoch": 5.634327155519742, + "grad_norm": 1.7497264387495282, + "learning_rate": 9.694532955128469e-05, + "loss": 3.0204, + "step": 8744 + }, + { + "epoch": 5.634971796937953, + "grad_norm": 1.6204763146456387, + "learning_rate": 9.694463571967151e-05, + "loss": 3.0046, + "step": 8745 + }, + { + "epoch": 5.635616438356164, + "grad_norm": 1.779950823715102, + "learning_rate": 9.694394181177882e-05, + "loss": 3.1259, + "step": 8746 + }, + { + "epoch": 5.636261079774376, + "grad_norm": 1.897925394871309, + "learning_rate": 9.694324782760778e-05, + "loss": 2.9278, + "step": 8747 + }, + { + "epoch": 5.636905721192587, + "grad_norm": 1.5833002445794506, + "learning_rate": 9.694255376715955e-05, + "loss": 3.3687, + "step": 8748 + }, + { + "epoch": 5.6375503626107974, + "grad_norm": 1.720036875034755, + "learning_rate": 9.694185963043524e-05, + "loss": 3.1498, + "step": 8749 + }, + { + "epoch": 5.638195004029009, + "grad_norm": 1.7501651945858046, + "learning_rate": 9.694116541743601e-05, + "loss": 2.8461, + "step": 8750 + }, + { + "epoch": 5.63883964544722, + "grad_norm": 1.540227909494638, + "learning_rate": 9.694047112816297e-05, + "loss": 3.0617, + "step": 8751 + }, + { + "epoch": 5.639484286865431, + "grad_norm": 1.7611046292229984, + "learning_rate": 9.69397767626173e-05, + "loss": 3.1261, + "step": 8752 + }, + { + "epoch": 5.640128928283643, + "grad_norm": 1.5953344330774912, + "learning_rate": 9.693908232080011e-05, + "loss": 3.184, + "step": 8753 + }, + { + "epoch": 5.640773569701853, + "grad_norm": 1.4818309665820535, + "learning_rate": 9.693838780271254e-05, + "loss": 2.912, + "step": 8754 + }, + { + "epoch": 5.641418211120064, + "grad_norm": 1.602500124562854, + "learning_rate": 9.693769320835576e-05, + "loss": 2.9155, + "step": 8755 + }, + { + "epoch": 5.642062852538276, + "grad_norm": 1.4699595301500266, + "learning_rate": 9.693699853773088e-05, + "loss": 3.004, + "step": 8756 + }, + { + "epoch": 5.642707493956487, + "grad_norm": 1.4456638384239167, + "learning_rate": 9.693630379083905e-05, + "loss": 3.094, + "step": 8757 + }, + { + "epoch": 5.643352135374698, + "grad_norm": 1.3815609944811398, + "learning_rate": 9.693560896768145e-05, + "loss": 2.9815, + "step": 8758 + }, + { + "epoch": 5.643996776792909, + "grad_norm": 1.9013266993684408, + "learning_rate": 9.693491406825914e-05, + "loss": 2.857, + "step": 8759 + }, + { + "epoch": 5.64464141821112, + "grad_norm": 1.5871133195992566, + "learning_rate": 9.693421909257334e-05, + "loss": 2.8127, + "step": 8760 + }, + { + "epoch": 5.645286059629331, + "grad_norm": 1.5890155924556912, + "learning_rate": 9.693352404062513e-05, + "loss": 3.314, + "step": 8761 + }, + { + "epoch": 5.645930701047543, + "grad_norm": 1.7010661183568128, + "learning_rate": 9.693282891241572e-05, + "loss": 3.1713, + "step": 8762 + }, + { + "epoch": 5.646575342465754, + "grad_norm": 1.333660831535659, + "learning_rate": 9.693213370794618e-05, + "loss": 2.9027, + "step": 8763 + }, + { + "epoch": 5.647219983883964, + "grad_norm": 1.672203045268186, + "learning_rate": 9.693143842721769e-05, + "loss": 3.3388, + "step": 8764 + }, + { + "epoch": 5.647864625302176, + "grad_norm": 1.4797154132726111, + "learning_rate": 9.693074307023137e-05, + "loss": 3.0838, + "step": 8765 + }, + { + "epoch": 5.648509266720387, + "grad_norm": 1.7061679098309097, + "learning_rate": 9.693004763698841e-05, + "loss": 3.023, + "step": 8766 + }, + { + "epoch": 5.649153908138598, + "grad_norm": 1.3636069169011535, + "learning_rate": 9.69293521274899e-05, + "loss": 2.9033, + "step": 8767 + }, + { + "epoch": 5.6497985495568095, + "grad_norm": 2.417775969315179, + "learning_rate": 9.692865654173701e-05, + "loss": 3.0163, + "step": 8768 + }, + { + "epoch": 5.65044319097502, + "grad_norm": 1.6463638400302272, + "learning_rate": 9.692796087973087e-05, + "loss": 3.066, + "step": 8769 + }, + { + "epoch": 5.651087832393231, + "grad_norm": 1.9220320667635793, + "learning_rate": 9.692726514147262e-05, + "loss": 2.9221, + "step": 8770 + }, + { + "epoch": 5.6517324738114425, + "grad_norm": 1.9087580302049516, + "learning_rate": 9.692656932696341e-05, + "loss": 2.8324, + "step": 8771 + }, + { + "epoch": 5.652377115229654, + "grad_norm": 1.52544859053527, + "learning_rate": 9.69258734362044e-05, + "loss": 2.9643, + "step": 8772 + }, + { + "epoch": 5.653021756647865, + "grad_norm": 2.2099657477722414, + "learning_rate": 9.69251774691967e-05, + "loss": 3.1193, + "step": 8773 + }, + { + "epoch": 5.6536663980660755, + "grad_norm": 1.5228639567528242, + "learning_rate": 9.692448142594147e-05, + "loss": 3.0355, + "step": 8774 + }, + { + "epoch": 5.654311039484287, + "grad_norm": 1.9462584212850236, + "learning_rate": 9.692378530643985e-05, + "loss": 2.9352, + "step": 8775 + }, + { + "epoch": 5.654955680902498, + "grad_norm": 1.6248087228485513, + "learning_rate": 9.6923089110693e-05, + "loss": 3.0326, + "step": 8776 + }, + { + "epoch": 5.655600322320709, + "grad_norm": 1.7946136573176978, + "learning_rate": 9.692239283870203e-05, + "loss": 3.1051, + "step": 8777 + }, + { + "epoch": 5.656244963738921, + "grad_norm": 1.506035678077555, + "learning_rate": 9.69216964904681e-05, + "loss": 2.8922, + "step": 8778 + }, + { + "epoch": 5.656889605157131, + "grad_norm": 1.7606195594181, + "learning_rate": 9.692100006599237e-05, + "loss": 3.0152, + "step": 8779 + }, + { + "epoch": 5.657534246575342, + "grad_norm": 1.5832211752798258, + "learning_rate": 9.692030356527594e-05, + "loss": 2.8874, + "step": 8780 + }, + { + "epoch": 5.658178887993554, + "grad_norm": 1.778715661301746, + "learning_rate": 9.691960698832001e-05, + "loss": 2.8013, + "step": 8781 + }, + { + "epoch": 5.658823529411765, + "grad_norm": 1.6560979785446415, + "learning_rate": 9.691891033512569e-05, + "loss": 3.0218, + "step": 8782 + }, + { + "epoch": 5.659468170829975, + "grad_norm": 1.728558153494665, + "learning_rate": 9.691821360569414e-05, + "loss": 3.0001, + "step": 8783 + }, + { + "epoch": 5.660112812248187, + "grad_norm": 1.5219204556191712, + "learning_rate": 9.691751680002649e-05, + "loss": 3.1235, + "step": 8784 + }, + { + "epoch": 5.660757453666398, + "grad_norm": 1.5565170036005609, + "learning_rate": 9.691681991812388e-05, + "loss": 3.2823, + "step": 8785 + }, + { + "epoch": 5.661402095084609, + "grad_norm": 1.4358636839552497, + "learning_rate": 9.691612295998748e-05, + "loss": 3.0478, + "step": 8786 + }, + { + "epoch": 5.662046736502821, + "grad_norm": 1.5380299700888056, + "learning_rate": 9.691542592561841e-05, + "loss": 3.2797, + "step": 8787 + }, + { + "epoch": 5.662691377921031, + "grad_norm": 1.6173691232343272, + "learning_rate": 9.69147288150178e-05, + "loss": 2.9556, + "step": 8788 + }, + { + "epoch": 5.663336019339242, + "grad_norm": 1.6675569343279224, + "learning_rate": 9.691403162818686e-05, + "loss": 2.8281, + "step": 8789 + }, + { + "epoch": 5.663980660757454, + "grad_norm": 1.5094257293888467, + "learning_rate": 9.691333436512669e-05, + "loss": 3.0583, + "step": 8790 + }, + { + "epoch": 5.664625302175665, + "grad_norm": 1.6340136044899554, + "learning_rate": 9.691263702583842e-05, + "loss": 3.1363, + "step": 8791 + }, + { + "epoch": 5.665269943593876, + "grad_norm": 1.6525522579304217, + "learning_rate": 9.691193961032322e-05, + "loss": 3.0826, + "step": 8792 + }, + { + "epoch": 5.665914585012087, + "grad_norm": 1.485591353520573, + "learning_rate": 9.691124211858223e-05, + "loss": 2.9405, + "step": 8793 + }, + { + "epoch": 5.666559226430298, + "grad_norm": 1.7723257835076534, + "learning_rate": 9.691054455061659e-05, + "loss": 2.9198, + "step": 8794 + }, + { + "epoch": 5.667203867848509, + "grad_norm": 1.6882221515021565, + "learning_rate": 9.690984690642746e-05, + "loss": 2.8154, + "step": 8795 + }, + { + "epoch": 5.667848509266721, + "grad_norm": 1.4978831016194738, + "learning_rate": 9.690914918601597e-05, + "loss": 2.9735, + "step": 8796 + }, + { + "epoch": 5.668493150684932, + "grad_norm": 1.6814415224666606, + "learning_rate": 9.690845138938328e-05, + "loss": 2.8185, + "step": 8797 + }, + { + "epoch": 5.669137792103142, + "grad_norm": 1.7476311150331147, + "learning_rate": 9.690775351653053e-05, + "loss": 2.6946, + "step": 8798 + }, + { + "epoch": 5.669782433521354, + "grad_norm": 1.3207823837854857, + "learning_rate": 9.690705556745885e-05, + "loss": 2.9748, + "step": 8799 + }, + { + "epoch": 5.670427074939565, + "grad_norm": 1.76623491126253, + "learning_rate": 9.690635754216941e-05, + "loss": 3.1963, + "step": 8800 + }, + { + "epoch": 5.670427074939565, + "eval_loss": 4.381584167480469, + "eval_runtime": 2.9876, + "eval_samples_per_second": 33.472, + "eval_steps_per_second": 4.351, + "step": 8800 + }, + { + "epoch": 5.671071716357776, + "grad_norm": 1.4537154076808316, + "learning_rate": 9.690565944066335e-05, + "loss": 3.0438, + "step": 8801 + }, + { + "epoch": 5.6717163577759875, + "grad_norm": 1.7030285855578833, + "learning_rate": 9.690496126294182e-05, + "loss": 2.7875, + "step": 8802 + }, + { + "epoch": 5.672360999194198, + "grad_norm": 1.6867739283315006, + "learning_rate": 9.690426300900595e-05, + "loss": 3.1486, + "step": 8803 + }, + { + "epoch": 5.673005640612409, + "grad_norm": 1.5450357918027569, + "learning_rate": 9.69035646788569e-05, + "loss": 3.1711, + "step": 8804 + }, + { + "epoch": 5.6736502820306205, + "grad_norm": 1.5902012137642227, + "learning_rate": 9.690286627249581e-05, + "loss": 3.1355, + "step": 8805 + }, + { + "epoch": 5.674294923448832, + "grad_norm": 1.6460046731577769, + "learning_rate": 9.690216778992384e-05, + "loss": 3.1337, + "step": 8806 + }, + { + "epoch": 5.674939564867043, + "grad_norm": 1.5376376813453072, + "learning_rate": 9.690146923114213e-05, + "loss": 2.9714, + "step": 8807 + }, + { + "epoch": 5.6755842062852535, + "grad_norm": 1.4471019910581608, + "learning_rate": 9.690077059615183e-05, + "loss": 2.8232, + "step": 8808 + }, + { + "epoch": 5.676228847703465, + "grad_norm": 1.5838240619701642, + "learning_rate": 9.690007188495407e-05, + "loss": 2.7732, + "step": 8809 + }, + { + "epoch": 5.676873489121676, + "grad_norm": 1.6357681748970454, + "learning_rate": 9.689937309755e-05, + "loss": 3.1194, + "step": 8810 + }, + { + "epoch": 5.677518130539887, + "grad_norm": 1.6272552308036474, + "learning_rate": 9.68986742339408e-05, + "loss": 2.8096, + "step": 8811 + }, + { + "epoch": 5.678162771958098, + "grad_norm": 1.4961700128630258, + "learning_rate": 9.68979752941276e-05, + "loss": 2.8702, + "step": 8812 + }, + { + "epoch": 5.678807413376309, + "grad_norm": 1.6823055490067287, + "learning_rate": 9.689727627811153e-05, + "loss": 3.105, + "step": 8813 + }, + { + "epoch": 5.67945205479452, + "grad_norm": 1.6119881261379545, + "learning_rate": 9.689657718589375e-05, + "loss": 3.2718, + "step": 8814 + }, + { + "epoch": 5.680096696212732, + "grad_norm": 1.6232484261613433, + "learning_rate": 9.689587801747542e-05, + "loss": 2.9912, + "step": 8815 + }, + { + "epoch": 5.680741337630943, + "grad_norm": 1.5150454085888327, + "learning_rate": 9.689517877285767e-05, + "loss": 2.8292, + "step": 8816 + }, + { + "epoch": 5.681385979049153, + "grad_norm": 1.5133170550813708, + "learning_rate": 9.689447945204168e-05, + "loss": 3.0667, + "step": 8817 + }, + { + "epoch": 5.682030620467365, + "grad_norm": 1.5175756873638413, + "learning_rate": 9.689378005502856e-05, + "loss": 2.9807, + "step": 8818 + }, + { + "epoch": 5.682675261885576, + "grad_norm": 1.4732355880493586, + "learning_rate": 9.689308058181948e-05, + "loss": 2.9319, + "step": 8819 + }, + { + "epoch": 5.683319903303787, + "grad_norm": 1.4920085759864476, + "learning_rate": 9.689238103241557e-05, + "loss": 2.8946, + "step": 8820 + }, + { + "epoch": 5.683964544721999, + "grad_norm": 1.4739627758197595, + "learning_rate": 9.689168140681799e-05, + "loss": 2.6883, + "step": 8821 + }, + { + "epoch": 5.684609186140209, + "grad_norm": 1.5989824772938668, + "learning_rate": 9.68909817050279e-05, + "loss": 3.1543, + "step": 8822 + }, + { + "epoch": 5.68525382755842, + "grad_norm": 1.5183677437537528, + "learning_rate": 9.689028192704646e-05, + "loss": 2.8501, + "step": 8823 + }, + { + "epoch": 5.685898468976632, + "grad_norm": 1.8050842741013755, + "learning_rate": 9.688958207287478e-05, + "loss": 3.153, + "step": 8824 + }, + { + "epoch": 5.686543110394843, + "grad_norm": 13.980881662714294, + "learning_rate": 9.688888214251402e-05, + "loss": 2.9197, + "step": 8825 + }, + { + "epoch": 5.687187751813054, + "grad_norm": 1.7715995446994939, + "learning_rate": 9.688818213596535e-05, + "loss": 2.9688, + "step": 8826 + }, + { + "epoch": 5.687832393231265, + "grad_norm": 1.7427583849245667, + "learning_rate": 9.688748205322992e-05, + "loss": 2.7615, + "step": 8827 + }, + { + "epoch": 5.688477034649476, + "grad_norm": 1.7926547828970247, + "learning_rate": 9.688678189430884e-05, + "loss": 3.2954, + "step": 8828 + }, + { + "epoch": 5.689121676067687, + "grad_norm": 1.6771444249875906, + "learning_rate": 9.68860816592033e-05, + "loss": 3.1431, + "step": 8829 + }, + { + "epoch": 5.689766317485899, + "grad_norm": 1.6462549664744262, + "learning_rate": 9.688538134791445e-05, + "loss": 2.9922, + "step": 8830 + }, + { + "epoch": 5.69041095890411, + "grad_norm": 1.7344584301152, + "learning_rate": 9.688468096044343e-05, + "loss": 3.2382, + "step": 8831 + }, + { + "epoch": 5.69105560032232, + "grad_norm": 1.570793415624968, + "learning_rate": 9.688398049679136e-05, + "loss": 3.0126, + "step": 8832 + }, + { + "epoch": 5.691700241740532, + "grad_norm": 1.6106746974472235, + "learning_rate": 9.688327995695944e-05, + "loss": 2.8242, + "step": 8833 + }, + { + "epoch": 5.692344883158743, + "grad_norm": 1.5367798271816004, + "learning_rate": 9.688257934094882e-05, + "loss": 2.6949, + "step": 8834 + }, + { + "epoch": 5.692989524576954, + "grad_norm": 1.5636860406621074, + "learning_rate": 9.68818786487606e-05, + "loss": 3.2645, + "step": 8835 + }, + { + "epoch": 5.6936341659951655, + "grad_norm": 1.5203533085666703, + "learning_rate": 9.688117788039597e-05, + "loss": 3.2556, + "step": 8836 + }, + { + "epoch": 5.694278807413376, + "grad_norm": 1.5076377626422057, + "learning_rate": 9.688047703585607e-05, + "loss": 2.9679, + "step": 8837 + }, + { + "epoch": 5.694923448831587, + "grad_norm": 1.4614367953204186, + "learning_rate": 9.687977611514205e-05, + "loss": 3.0061, + "step": 8838 + }, + { + "epoch": 5.6955680902497985, + "grad_norm": 1.3613609413250372, + "learning_rate": 9.687907511825506e-05, + "loss": 2.967, + "step": 8839 + }, + { + "epoch": 5.69621273166801, + "grad_norm": 1.434923557386486, + "learning_rate": 9.687837404519627e-05, + "loss": 3.0131, + "step": 8840 + }, + { + "epoch": 5.696857373086221, + "grad_norm": 1.5646000401130136, + "learning_rate": 9.687767289596681e-05, + "loss": 3.0248, + "step": 8841 + }, + { + "epoch": 5.6975020145044315, + "grad_norm": 1.595343648353129, + "learning_rate": 9.687697167056784e-05, + "loss": 3.1088, + "step": 8842 + }, + { + "epoch": 5.698146655922643, + "grad_norm": 1.5610635769242154, + "learning_rate": 9.687627036900052e-05, + "loss": 2.9412, + "step": 8843 + }, + { + "epoch": 5.698791297340854, + "grad_norm": 1.5249385159135338, + "learning_rate": 9.687556899126598e-05, + "loss": 2.9678, + "step": 8844 + }, + { + "epoch": 5.699435938759065, + "grad_norm": 1.6915832910612245, + "learning_rate": 9.687486753736537e-05, + "loss": 2.791, + "step": 8845 + }, + { + "epoch": 5.700080580177277, + "grad_norm": 1.8100409318522463, + "learning_rate": 9.687416600729987e-05, + "loss": 2.9942, + "step": 8846 + }, + { + "epoch": 5.700725221595487, + "grad_norm": 1.5947743321890906, + "learning_rate": 9.687346440107064e-05, + "loss": 3.0364, + "step": 8847 + }, + { + "epoch": 5.701369863013698, + "grad_norm": 1.4931069446353111, + "learning_rate": 9.687276271867877e-05, + "loss": 2.8415, + "step": 8848 + }, + { + "epoch": 5.70201450443191, + "grad_norm": 2.21251120951008, + "learning_rate": 9.687206096012549e-05, + "loss": 2.8618, + "step": 8849 + }, + { + "epoch": 5.702659145850121, + "grad_norm": 1.4897102523259764, + "learning_rate": 9.687135912541189e-05, + "loss": 3.2211, + "step": 8850 + }, + { + "epoch": 5.703303787268332, + "grad_norm": 1.8510446050552456, + "learning_rate": 9.687065721453915e-05, + "loss": 3.0478, + "step": 8851 + }, + { + "epoch": 5.703948428686543, + "grad_norm": 1.7952185000695822, + "learning_rate": 9.686995522750843e-05, + "loss": 3.2908, + "step": 8852 + }, + { + "epoch": 5.704593070104754, + "grad_norm": 1.6584806979022455, + "learning_rate": 9.686925316432087e-05, + "loss": 3.0727, + "step": 8853 + }, + { + "epoch": 5.705237711522965, + "grad_norm": 1.4688256615319528, + "learning_rate": 9.686855102497762e-05, + "loss": 3.3878, + "step": 8854 + }, + { + "epoch": 5.705882352941177, + "grad_norm": 1.9362915015843425, + "learning_rate": 9.686784880947985e-05, + "loss": 2.894, + "step": 8855 + }, + { + "epoch": 5.706526994359388, + "grad_norm": 1.9542922708509134, + "learning_rate": 9.68671465178287e-05, + "loss": 3.1327, + "step": 8856 + }, + { + "epoch": 5.707171635777598, + "grad_norm": 1.8262277254832202, + "learning_rate": 9.686644415002534e-05, + "loss": 2.991, + "step": 8857 + }, + { + "epoch": 5.70781627719581, + "grad_norm": 1.6362536907793241, + "learning_rate": 9.686574170607089e-05, + "loss": 3.0316, + "step": 8858 + }, + { + "epoch": 5.708460918614021, + "grad_norm": 1.487790345613031, + "learning_rate": 9.686503918596654e-05, + "loss": 3.1135, + "step": 8859 + }, + { + "epoch": 5.709105560032232, + "grad_norm": 1.9339198227642502, + "learning_rate": 9.686433658971342e-05, + "loss": 2.8, + "step": 8860 + }, + { + "epoch": 5.7097502014504435, + "grad_norm": 1.4731468286799738, + "learning_rate": 9.68636339173127e-05, + "loss": 2.8956, + "step": 8861 + }, + { + "epoch": 5.710394842868654, + "grad_norm": 1.7278547161806384, + "learning_rate": 9.68629311687655e-05, + "loss": 2.9173, + "step": 8862 + }, + { + "epoch": 5.711039484286865, + "grad_norm": 1.472253098677865, + "learning_rate": 9.686222834407303e-05, + "loss": 2.9161, + "step": 8863 + }, + { + "epoch": 5.7116841257050766, + "grad_norm": 1.4565934802934155, + "learning_rate": 9.68615254432364e-05, + "loss": 2.9822, + "step": 8864 + }, + { + "epoch": 5.712328767123288, + "grad_norm": 1.5127357169174227, + "learning_rate": 9.686082246625678e-05, + "loss": 3.2745, + "step": 8865 + }, + { + "epoch": 5.712973408541499, + "grad_norm": 1.5787279919936013, + "learning_rate": 9.686011941313532e-05, + "loss": 2.937, + "step": 8866 + }, + { + "epoch": 5.71361804995971, + "grad_norm": 1.4763633452612823, + "learning_rate": 9.685941628387318e-05, + "loss": 3.2086, + "step": 8867 + }, + { + "epoch": 5.714262691377921, + "grad_norm": 1.4560113472003393, + "learning_rate": 9.685871307847151e-05, + "loss": 2.8693, + "step": 8868 + }, + { + "epoch": 5.714907332796132, + "grad_norm": 1.51883311926217, + "learning_rate": 9.685800979693147e-05, + "loss": 3.3343, + "step": 8869 + }, + { + "epoch": 5.7155519742143435, + "grad_norm": 1.4947754362519687, + "learning_rate": 9.685730643925422e-05, + "loss": 3.2938, + "step": 8870 + }, + { + "epoch": 5.716196615632555, + "grad_norm": 1.7220764871319705, + "learning_rate": 9.68566030054409e-05, + "loss": 2.9171, + "step": 8871 + }, + { + "epoch": 5.716841257050765, + "grad_norm": 1.3909440606799681, + "learning_rate": 9.685589949549268e-05, + "loss": 3.0016, + "step": 8872 + }, + { + "epoch": 5.7174858984689765, + "grad_norm": 1.907357676538295, + "learning_rate": 9.68551959094107e-05, + "loss": 3.2697, + "step": 8873 + }, + { + "epoch": 5.718130539887188, + "grad_norm": 1.6648170159817184, + "learning_rate": 9.685449224719612e-05, + "loss": 2.9671, + "step": 8874 + }, + { + "epoch": 5.718775181305399, + "grad_norm": 2.296204134876387, + "learning_rate": 9.685378850885011e-05, + "loss": 3.2501, + "step": 8875 + }, + { + "epoch": 5.71941982272361, + "grad_norm": 1.6821391856760417, + "learning_rate": 9.68530846943738e-05, + "loss": 3.2664, + "step": 8876 + }, + { + "epoch": 5.720064464141821, + "grad_norm": 1.9109380107328433, + "learning_rate": 9.685238080376839e-05, + "loss": 3.1378, + "step": 8877 + }, + { + "epoch": 5.720709105560032, + "grad_norm": 1.6043070557431536, + "learning_rate": 9.685167683703498e-05, + "loss": 3.466, + "step": 8878 + }, + { + "epoch": 5.721353746978243, + "grad_norm": 1.878343358530872, + "learning_rate": 9.685097279417477e-05, + "loss": 3.1061, + "step": 8879 + }, + { + "epoch": 5.721998388396455, + "grad_norm": 1.625861319015192, + "learning_rate": 9.68502686751889e-05, + "loss": 2.9766, + "step": 8880 + }, + { + "epoch": 5.722643029814666, + "grad_norm": 2.0845544817731168, + "learning_rate": 9.684956448007853e-05, + "loss": 2.8976, + "step": 8881 + }, + { + "epoch": 5.723287671232876, + "grad_norm": 1.7230821981036422, + "learning_rate": 9.684886020884481e-05, + "loss": 3.1518, + "step": 8882 + }, + { + "epoch": 5.723932312651088, + "grad_norm": 1.9620315575586555, + "learning_rate": 9.684815586148888e-05, + "loss": 3.3789, + "step": 8883 + }, + { + "epoch": 5.724576954069299, + "grad_norm": 1.4398550293405115, + "learning_rate": 9.684745143801195e-05, + "loss": 2.9164, + "step": 8884 + }, + { + "epoch": 5.72522159548751, + "grad_norm": 1.3964106150420765, + "learning_rate": 9.684674693841512e-05, + "loss": 3.2469, + "step": 8885 + }, + { + "epoch": 5.725866236905722, + "grad_norm": 1.632060138984354, + "learning_rate": 9.684604236269956e-05, + "loss": 3.1306, + "step": 8886 + }, + { + "epoch": 5.726510878323932, + "grad_norm": 1.4819503683964308, + "learning_rate": 9.684533771086647e-05, + "loss": 2.9815, + "step": 8887 + }, + { + "epoch": 5.727155519742143, + "grad_norm": 1.5796875505137267, + "learning_rate": 9.684463298291695e-05, + "loss": 3.0801, + "step": 8888 + }, + { + "epoch": 5.727800161160355, + "grad_norm": 1.3808761909664866, + "learning_rate": 9.684392817885219e-05, + "loss": 2.9639, + "step": 8889 + }, + { + "epoch": 5.728444802578566, + "grad_norm": 1.9647361104774952, + "learning_rate": 9.684322329867334e-05, + "loss": 3.1791, + "step": 8890 + }, + { + "epoch": 5.729089443996777, + "grad_norm": 1.4833703697452003, + "learning_rate": 9.684251834238156e-05, + "loss": 3.2842, + "step": 8891 + }, + { + "epoch": 5.729734085414988, + "grad_norm": 1.7167212972439567, + "learning_rate": 9.684181330997802e-05, + "loss": 3.104, + "step": 8892 + }, + { + "epoch": 5.730378726833199, + "grad_norm": 1.6162283133288138, + "learning_rate": 9.684110820146382e-05, + "loss": 3.0061, + "step": 8893 + }, + { + "epoch": 5.73102336825141, + "grad_norm": 1.6153193138579305, + "learning_rate": 9.68404030168402e-05, + "loss": 3.2279, + "step": 8894 + }, + { + "epoch": 5.7316680096696215, + "grad_norm": 1.6270995206854726, + "learning_rate": 9.683969775610826e-05, + "loss": 3.1976, + "step": 8895 + }, + { + "epoch": 5.732312651087833, + "grad_norm": 1.4944606218797758, + "learning_rate": 9.68389924192692e-05, + "loss": 2.9869, + "step": 8896 + }, + { + "epoch": 5.732957292506043, + "grad_norm": 1.6751927320421112, + "learning_rate": 9.683828700632414e-05, + "loss": 3.1503, + "step": 8897 + }, + { + "epoch": 5.7336019339242545, + "grad_norm": 1.7150204911074791, + "learning_rate": 9.683758151727426e-05, + "loss": 3.2341, + "step": 8898 + }, + { + "epoch": 5.734246575342466, + "grad_norm": 1.8291201179437548, + "learning_rate": 9.683687595212071e-05, + "loss": 3.1018, + "step": 8899 + }, + { + "epoch": 5.734891216760677, + "grad_norm": 1.4906461505866555, + "learning_rate": 9.683617031086466e-05, + "loss": 2.9938, + "step": 8900 + }, + { + "epoch": 5.734891216760677, + "eval_loss": 4.379035472869873, + "eval_runtime": 2.9693, + "eval_samples_per_second": 33.679, + "eval_steps_per_second": 4.378, + "step": 8900 + }, + { + "epoch": 5.7355358581788884, + "grad_norm": 1.9608303257248973, + "learning_rate": 9.683546459350724e-05, + "loss": 2.767, + "step": 8901 + }, + { + "epoch": 5.736180499597099, + "grad_norm": 1.7369164199360123, + "learning_rate": 9.683475880004965e-05, + "loss": 3.1792, + "step": 8902 + }, + { + "epoch": 5.73682514101531, + "grad_norm": 1.5881406041698207, + "learning_rate": 9.683405293049303e-05, + "loss": 2.8849, + "step": 8903 + }, + { + "epoch": 5.7374697824335215, + "grad_norm": 1.8715344714998179, + "learning_rate": 9.683334698483853e-05, + "loss": 2.8759, + "step": 8904 + }, + { + "epoch": 5.738114423851733, + "grad_norm": 1.6380480534171633, + "learning_rate": 9.683264096308732e-05, + "loss": 2.9427, + "step": 8905 + }, + { + "epoch": 5.738759065269944, + "grad_norm": 1.44154664706887, + "learning_rate": 9.683193486524055e-05, + "loss": 2.9835, + "step": 8906 + }, + { + "epoch": 5.7394037066881545, + "grad_norm": 1.5991578441150338, + "learning_rate": 9.68312286912994e-05, + "loss": 3.0458, + "step": 8907 + }, + { + "epoch": 5.740048348106366, + "grad_norm": 1.6233973882574877, + "learning_rate": 9.683052244126502e-05, + "loss": 2.9603, + "step": 8908 + }, + { + "epoch": 5.740692989524577, + "grad_norm": 1.6278282547328216, + "learning_rate": 9.682981611513857e-05, + "loss": 3.1378, + "step": 8909 + }, + { + "epoch": 5.741337630942788, + "grad_norm": 1.725385600928875, + "learning_rate": 9.682910971292119e-05, + "loss": 2.6812, + "step": 8910 + }, + { + "epoch": 5.741982272361, + "grad_norm": 1.5841465227537788, + "learning_rate": 9.682840323461406e-05, + "loss": 2.7362, + "step": 8911 + }, + { + "epoch": 5.74262691377921, + "grad_norm": 1.590241159714689, + "learning_rate": 9.682769668021834e-05, + "loss": 3.037, + "step": 8912 + }, + { + "epoch": 5.743271555197421, + "grad_norm": 1.8600765152955456, + "learning_rate": 9.68269900497352e-05, + "loss": 2.8566, + "step": 8913 + }, + { + "epoch": 5.743916196615633, + "grad_norm": 1.5303266335795251, + "learning_rate": 9.682628334316578e-05, + "loss": 3.2946, + "step": 8914 + }, + { + "epoch": 5.744560838033844, + "grad_norm": 1.9577681059315348, + "learning_rate": 9.682557656051124e-05, + "loss": 3.0939, + "step": 8915 + }, + { + "epoch": 5.745205479452055, + "grad_norm": 1.6224006362171592, + "learning_rate": 9.682486970177276e-05, + "loss": 3.0169, + "step": 8916 + }, + { + "epoch": 5.745850120870266, + "grad_norm": 1.721035542120715, + "learning_rate": 9.68241627669515e-05, + "loss": 2.9036, + "step": 8917 + }, + { + "epoch": 5.746494762288477, + "grad_norm": 1.535959537816797, + "learning_rate": 9.68234557560486e-05, + "loss": 3.1135, + "step": 8918 + }, + { + "epoch": 5.747139403706688, + "grad_norm": 1.6579384830988195, + "learning_rate": 9.682274866906524e-05, + "loss": 3.1634, + "step": 8919 + }, + { + "epoch": 5.7477840451249, + "grad_norm": 1.6170552866173418, + "learning_rate": 9.682204150600258e-05, + "loss": 3.1469, + "step": 8920 + }, + { + "epoch": 5.74842868654311, + "grad_norm": 1.4512155936444624, + "learning_rate": 9.682133426686175e-05, + "loss": 3.0399, + "step": 8921 + }, + { + "epoch": 5.749073327961321, + "grad_norm": 1.9087410360501413, + "learning_rate": 9.682062695164396e-05, + "loss": 2.8884, + "step": 8922 + }, + { + "epoch": 5.749717969379533, + "grad_norm": 1.6171196202275717, + "learning_rate": 9.681991956035033e-05, + "loss": 2.8067, + "step": 8923 + }, + { + "epoch": 5.750362610797744, + "grad_norm": 1.8640530203634218, + "learning_rate": 9.681921209298208e-05, + "loss": 3.0909, + "step": 8924 + }, + { + "epoch": 5.751007252215955, + "grad_norm": 1.611442770970786, + "learning_rate": 9.68185045495403e-05, + "loss": 3.1337, + "step": 8925 + }, + { + "epoch": 5.751651893634166, + "grad_norm": 1.7075842820280596, + "learning_rate": 9.68177969300262e-05, + "loss": 2.976, + "step": 8926 + }, + { + "epoch": 5.752296535052377, + "grad_norm": 1.5952613916920089, + "learning_rate": 9.681708923444091e-05, + "loss": 2.8985, + "step": 8927 + }, + { + "epoch": 5.752941176470588, + "grad_norm": 1.5688329374786596, + "learning_rate": 9.681638146278562e-05, + "loss": 3.1545, + "step": 8928 + }, + { + "epoch": 5.7535858178887995, + "grad_norm": 1.6842161856736366, + "learning_rate": 9.681567361506149e-05, + "loss": 3.3382, + "step": 8929 + }, + { + "epoch": 5.754230459307011, + "grad_norm": 1.4962040040997113, + "learning_rate": 9.681496569126965e-05, + "loss": 3.0606, + "step": 8930 + }, + { + "epoch": 5.754875100725221, + "grad_norm": 1.727500099006228, + "learning_rate": 9.681425769141131e-05, + "loss": 2.9804, + "step": 8931 + }, + { + "epoch": 5.7555197421434325, + "grad_norm": 1.4110880566546342, + "learning_rate": 9.681354961548761e-05, + "loss": 2.9066, + "step": 8932 + }, + { + "epoch": 5.756164383561644, + "grad_norm": 1.3876117355199766, + "learning_rate": 9.681284146349972e-05, + "loss": 2.9788, + "step": 8933 + }, + { + "epoch": 5.756809024979855, + "grad_norm": 1.606443345585666, + "learning_rate": 9.681213323544876e-05, + "loss": 2.844, + "step": 8934 + }, + { + "epoch": 5.757453666398066, + "grad_norm": 1.7043796430219675, + "learning_rate": 9.681142493133596e-05, + "loss": 3.008, + "step": 8935 + }, + { + "epoch": 5.758098307816277, + "grad_norm": 1.4228074350065785, + "learning_rate": 9.681071655116244e-05, + "loss": 2.8365, + "step": 8936 + }, + { + "epoch": 5.758742949234488, + "grad_norm": 1.6096542924594706, + "learning_rate": 9.681000809492939e-05, + "loss": 2.9753, + "step": 8937 + }, + { + "epoch": 5.7593875906526995, + "grad_norm": 1.4366116767924342, + "learning_rate": 9.680929956263795e-05, + "loss": 3.0008, + "step": 8938 + }, + { + "epoch": 5.760032232070911, + "grad_norm": 1.6792930935377264, + "learning_rate": 9.680859095428929e-05, + "loss": 3.2371, + "step": 8939 + }, + { + "epoch": 5.760676873489122, + "grad_norm": 7.545669112366709, + "learning_rate": 9.680788226988459e-05, + "loss": 2.9476, + "step": 8940 + }, + { + "epoch": 5.7613215149073325, + "grad_norm": 1.3669365409242182, + "learning_rate": 9.680717350942498e-05, + "loss": 2.7804, + "step": 8941 + }, + { + "epoch": 5.761966156325544, + "grad_norm": 1.4992299544535308, + "learning_rate": 9.680646467291166e-05, + "loss": 2.8729, + "step": 8942 + }, + { + "epoch": 5.762610797743755, + "grad_norm": 1.5890802385289198, + "learning_rate": 9.680575576034578e-05, + "loss": 3.046, + "step": 8943 + }, + { + "epoch": 5.763255439161966, + "grad_norm": 1.688719858693091, + "learning_rate": 9.680504677172848e-05, + "loss": 2.7244, + "step": 8944 + }, + { + "epoch": 5.763900080580177, + "grad_norm": 1.5058423624084705, + "learning_rate": 9.680433770706097e-05, + "loss": 3.2279, + "step": 8945 + }, + { + "epoch": 5.764544721998388, + "grad_norm": 1.7001891737638242, + "learning_rate": 9.680362856634439e-05, + "loss": 3.117, + "step": 8946 + }, + { + "epoch": 5.765189363416599, + "grad_norm": 1.389908430643668, + "learning_rate": 9.68029193495799e-05, + "loss": 3.0861, + "step": 8947 + }, + { + "epoch": 5.765834004834811, + "grad_norm": 1.6411724185442762, + "learning_rate": 9.680221005676868e-05, + "loss": 3.2484, + "step": 8948 + }, + { + "epoch": 5.766478646253022, + "grad_norm": 1.4740523169738087, + "learning_rate": 9.680150068791189e-05, + "loss": 2.8601, + "step": 8949 + }, + { + "epoch": 5.767123287671232, + "grad_norm": 1.4588846168307987, + "learning_rate": 9.680079124301068e-05, + "loss": 3.1175, + "step": 8950 + }, + { + "epoch": 5.767767929089444, + "grad_norm": 1.7396893469293153, + "learning_rate": 9.680008172206624e-05, + "loss": 3.1685, + "step": 8951 + }, + { + "epoch": 5.768412570507655, + "grad_norm": 1.692734885614243, + "learning_rate": 9.679937212507971e-05, + "loss": 3.0899, + "step": 8952 + }, + { + "epoch": 5.769057211925866, + "grad_norm": 1.620041448833612, + "learning_rate": 9.679866245205227e-05, + "loss": 3.123, + "step": 8953 + }, + { + "epoch": 5.769701853344078, + "grad_norm": 1.719266260639209, + "learning_rate": 9.679795270298508e-05, + "loss": 2.9863, + "step": 8954 + }, + { + "epoch": 5.770346494762288, + "grad_norm": 1.5694212377616164, + "learning_rate": 9.679724287787932e-05, + "loss": 3.2063, + "step": 8955 + }, + { + "epoch": 5.770991136180499, + "grad_norm": 1.375971147282348, + "learning_rate": 9.679653297673615e-05, + "loss": 2.845, + "step": 8956 + }, + { + "epoch": 5.771635777598711, + "grad_norm": 1.511808574619987, + "learning_rate": 9.679582299955671e-05, + "loss": 3.0582, + "step": 8957 + }, + { + "epoch": 5.772280419016922, + "grad_norm": 1.4459864563241234, + "learning_rate": 9.67951129463422e-05, + "loss": 3.385, + "step": 8958 + }, + { + "epoch": 5.772925060435133, + "grad_norm": 1.608507979707795, + "learning_rate": 9.679440281709376e-05, + "loss": 3.1193, + "step": 8959 + }, + { + "epoch": 5.773569701853344, + "grad_norm": 1.5857504056218565, + "learning_rate": 9.679369261181258e-05, + "loss": 3.2723, + "step": 8960 + }, + { + "epoch": 5.774214343271555, + "grad_norm": 1.4163329226388506, + "learning_rate": 9.679298233049981e-05, + "loss": 3.2592, + "step": 8961 + }, + { + "epoch": 5.774858984689766, + "grad_norm": 1.5754872603622558, + "learning_rate": 9.679227197315663e-05, + "loss": 2.8735, + "step": 8962 + }, + { + "epoch": 5.7755036261079775, + "grad_norm": 1.4298678398981406, + "learning_rate": 9.67915615397842e-05, + "loss": 2.8859, + "step": 8963 + }, + { + "epoch": 5.776148267526189, + "grad_norm": 1.5703373465824062, + "learning_rate": 9.679085103038368e-05, + "loss": 2.8507, + "step": 8964 + }, + { + "epoch": 5.776792908944399, + "grad_norm": 1.735025868601722, + "learning_rate": 9.679014044495624e-05, + "loss": 2.9984, + "step": 8965 + }, + { + "epoch": 5.7774375503626105, + "grad_norm": 1.726381679880028, + "learning_rate": 9.678942978350306e-05, + "loss": 3.3846, + "step": 8966 + }, + { + "epoch": 5.778082191780822, + "grad_norm": 1.626984535155322, + "learning_rate": 9.678871904602529e-05, + "loss": 3.123, + "step": 8967 + }, + { + "epoch": 5.778726833199033, + "grad_norm": 1.9074351509981144, + "learning_rate": 9.67880082325241e-05, + "loss": 2.9996, + "step": 8968 + }, + { + "epoch": 5.779371474617244, + "grad_norm": 1.3309485333407567, + "learning_rate": 9.678729734300067e-05, + "loss": 3.1435, + "step": 8969 + }, + { + "epoch": 5.780016116035455, + "grad_norm": 1.5365948880546616, + "learning_rate": 9.678658637745614e-05, + "loss": 3.1791, + "step": 8970 + }, + { + "epoch": 5.780660757453666, + "grad_norm": 1.3997256544570573, + "learning_rate": 9.678587533589172e-05, + "loss": 2.9659, + "step": 8971 + }, + { + "epoch": 5.781305398871877, + "grad_norm": 1.6520078370923292, + "learning_rate": 9.678516421830855e-05, + "loss": 3.1355, + "step": 8972 + }, + { + "epoch": 5.781950040290089, + "grad_norm": 1.573554812930636, + "learning_rate": 9.67844530247078e-05, + "loss": 3.1891, + "step": 8973 + }, + { + "epoch": 5.7825946817083, + "grad_norm": 1.8450878164001214, + "learning_rate": 9.678374175509063e-05, + "loss": 3.0566, + "step": 8974 + }, + { + "epoch": 5.7832393231265105, + "grad_norm": 1.566775898782021, + "learning_rate": 9.678303040945822e-05, + "loss": 3.0293, + "step": 8975 + }, + { + "epoch": 5.783883964544722, + "grad_norm": 1.5834970940767892, + "learning_rate": 9.678231898781175e-05, + "loss": 2.8361, + "step": 8976 + }, + { + "epoch": 5.784528605962933, + "grad_norm": 1.5049860610500487, + "learning_rate": 9.678160749015237e-05, + "loss": 2.997, + "step": 8977 + }, + { + "epoch": 5.785173247381144, + "grad_norm": 1.6852530424647119, + "learning_rate": 9.678089591648127e-05, + "loss": 3.0421, + "step": 8978 + }, + { + "epoch": 5.785817888799356, + "grad_norm": 1.6363154216575424, + "learning_rate": 9.678018426679956e-05, + "loss": 3.1803, + "step": 8979 + }, + { + "epoch": 5.786462530217566, + "grad_norm": 1.8102771204681851, + "learning_rate": 9.677947254110847e-05, + "loss": 2.866, + "step": 8980 + }, + { + "epoch": 5.787107171635777, + "grad_norm": 1.8295555842916444, + "learning_rate": 9.677876073940916e-05, + "loss": 3.2982, + "step": 8981 + }, + { + "epoch": 5.787751813053989, + "grad_norm": 1.5167494326185011, + "learning_rate": 9.677804886170278e-05, + "loss": 2.95, + "step": 8982 + }, + { + "epoch": 5.7883964544722, + "grad_norm": 1.627842091240976, + "learning_rate": 9.677733690799052e-05, + "loss": 2.8695, + "step": 8983 + }, + { + "epoch": 5.789041095890411, + "grad_norm": 1.6091125600583844, + "learning_rate": 9.677662487827355e-05, + "loss": 3.1083, + "step": 8984 + }, + { + "epoch": 5.789685737308622, + "grad_norm": 1.469519425607945, + "learning_rate": 9.6775912772553e-05, + "loss": 3.0708, + "step": 8985 + }, + { + "epoch": 5.790330378726833, + "grad_norm": 1.533748999441618, + "learning_rate": 9.677520059083009e-05, + "loss": 3.0811, + "step": 8986 + }, + { + "epoch": 5.790975020145044, + "grad_norm": 1.6803701845766394, + "learning_rate": 9.677448833310595e-05, + "loss": 3.2071, + "step": 8987 + }, + { + "epoch": 5.791619661563256, + "grad_norm": 1.5480125137613936, + "learning_rate": 9.677377599938177e-05, + "loss": 2.996, + "step": 8988 + }, + { + "epoch": 5.792264302981467, + "grad_norm": 1.7240695661314949, + "learning_rate": 9.67730635896587e-05, + "loss": 3.068, + "step": 8989 + }, + { + "epoch": 5.792908944399677, + "grad_norm": 1.396020708807561, + "learning_rate": 9.677235110393794e-05, + "loss": 2.8673, + "step": 8990 + }, + { + "epoch": 5.793553585817889, + "grad_norm": 1.7436785011243625, + "learning_rate": 9.677163854222066e-05, + "loss": 2.9936, + "step": 8991 + }, + { + "epoch": 5.7941982272361, + "grad_norm": 1.7742028225021342, + "learning_rate": 9.6770925904508e-05, + "loss": 2.8001, + "step": 8992 + }, + { + "epoch": 5.794842868654311, + "grad_norm": 1.5268111941140878, + "learning_rate": 9.677021319080115e-05, + "loss": 2.6939, + "step": 8993 + }, + { + "epoch": 5.7954875100725225, + "grad_norm": 1.909917158061143, + "learning_rate": 9.676950040110127e-05, + "loss": 2.8213, + "step": 8994 + }, + { + "epoch": 5.796132151490733, + "grad_norm": 1.7604284909768193, + "learning_rate": 9.676878753540956e-05, + "loss": 3.0468, + "step": 8995 + }, + { + "epoch": 5.796776792908944, + "grad_norm": 1.7780403801001092, + "learning_rate": 9.676807459372714e-05, + "loss": 2.8195, + "step": 8996 + }, + { + "epoch": 5.7974214343271555, + "grad_norm": 1.735650179326416, + "learning_rate": 9.676736157605523e-05, + "loss": 2.8805, + "step": 8997 + }, + { + "epoch": 5.798066075745367, + "grad_norm": 1.8163831860500796, + "learning_rate": 9.676664848239498e-05, + "loss": 3.1676, + "step": 8998 + }, + { + "epoch": 5.798710717163578, + "grad_norm": 1.5772603362094182, + "learning_rate": 9.676593531274755e-05, + "loss": 2.6401, + "step": 8999 + }, + { + "epoch": 5.7993553585817885, + "grad_norm": 1.7933872079601945, + "learning_rate": 9.676522206711414e-05, + "loss": 3.0492, + "step": 9000 + }, + { + "epoch": 5.7993553585817885, + "eval_loss": 4.368810653686523, + "eval_runtime": 2.9719, + "eval_samples_per_second": 33.649, + "eval_steps_per_second": 4.374, + "step": 9000 + }, + { + "epoch": 5.8, + "grad_norm": 1.5650780204911037, + "learning_rate": 9.67645087454959e-05, + "loss": 2.8603, + "step": 9001 + }, + { + "epoch": 5.800644641418211, + "grad_norm": 1.8373324367180144, + "learning_rate": 9.6763795347894e-05, + "loss": 3.2548, + "step": 9002 + }, + { + "epoch": 5.801289282836422, + "grad_norm": 1.42770837084802, + "learning_rate": 9.676308187430963e-05, + "loss": 3.1929, + "step": 9003 + }, + { + "epoch": 5.801933924254634, + "grad_norm": 1.8480348708840397, + "learning_rate": 9.676236832474395e-05, + "loss": 3.0908, + "step": 9004 + }, + { + "epoch": 5.802578565672844, + "grad_norm": 1.4985556353469938, + "learning_rate": 9.676165469919811e-05, + "loss": 3.0833, + "step": 9005 + }, + { + "epoch": 5.803223207091055, + "grad_norm": 1.8359376092664674, + "learning_rate": 9.676094099767333e-05, + "loss": 3.245, + "step": 9006 + }, + { + "epoch": 5.803867848509267, + "grad_norm": 1.5191607489732084, + "learning_rate": 9.676022722017073e-05, + "loss": 2.8484, + "step": 9007 + }, + { + "epoch": 5.804512489927478, + "grad_norm": 1.7812712998482398, + "learning_rate": 9.675951336669152e-05, + "loss": 2.9966, + "step": 9008 + }, + { + "epoch": 5.805157131345689, + "grad_norm": 1.458713013121349, + "learning_rate": 9.675879943723687e-05, + "loss": 3.3866, + "step": 9009 + }, + { + "epoch": 5.8058017727639, + "grad_norm": 1.6130172740891182, + "learning_rate": 9.675808543180792e-05, + "loss": 3.1741, + "step": 9010 + }, + { + "epoch": 5.806446414182111, + "grad_norm": 1.564995038018554, + "learning_rate": 9.675737135040588e-05, + "loss": 2.6326, + "step": 9011 + }, + { + "epoch": 5.807091055600322, + "grad_norm": 1.5562529704993873, + "learning_rate": 9.675665719303192e-05, + "loss": 3.4032, + "step": 9012 + }, + { + "epoch": 5.807735697018534, + "grad_norm": 1.4872922422605903, + "learning_rate": 9.67559429596872e-05, + "loss": 3.127, + "step": 9013 + }, + { + "epoch": 5.808380338436745, + "grad_norm": 1.5684619388150012, + "learning_rate": 9.675522865037288e-05, + "loss": 2.8313, + "step": 9014 + }, + { + "epoch": 5.809024979854955, + "grad_norm": 1.398593515335392, + "learning_rate": 9.675451426509016e-05, + "loss": 3.0691, + "step": 9015 + }, + { + "epoch": 5.809669621273167, + "grad_norm": 1.6173893451806824, + "learning_rate": 9.675379980384018e-05, + "loss": 3.0388, + "step": 9016 + }, + { + "epoch": 5.810314262691378, + "grad_norm": 1.5712009243309013, + "learning_rate": 9.675308526662415e-05, + "loss": 2.9646, + "step": 9017 + }, + { + "epoch": 5.810958904109589, + "grad_norm": 1.8587331159846312, + "learning_rate": 9.675237065344323e-05, + "loss": 3.1008, + "step": 9018 + }, + { + "epoch": 5.811603545527801, + "grad_norm": 1.6684452761059054, + "learning_rate": 9.67516559642986e-05, + "loss": 3.4054, + "step": 9019 + }, + { + "epoch": 5.812248186946011, + "grad_norm": 1.516853801329951, + "learning_rate": 9.67509411991914e-05, + "loss": 3.1673, + "step": 9020 + }, + { + "epoch": 5.812892828364222, + "grad_norm": 1.6225929583761225, + "learning_rate": 9.675022635812286e-05, + "loss": 3.2415, + "step": 9021 + }, + { + "epoch": 5.813537469782434, + "grad_norm": 1.4160205948287952, + "learning_rate": 9.67495114410941e-05, + "loss": 2.7341, + "step": 9022 + }, + { + "epoch": 5.814182111200645, + "grad_norm": 1.566416819837783, + "learning_rate": 9.674879644810632e-05, + "loss": 3.0629, + "step": 9023 + }, + { + "epoch": 5.814826752618856, + "grad_norm": 1.3325436383034452, + "learning_rate": 9.674808137916071e-05, + "loss": 2.8885, + "step": 9024 + }, + { + "epoch": 5.815471394037067, + "grad_norm": 1.3703970906742449, + "learning_rate": 9.67473662342584e-05, + "loss": 3.2308, + "step": 9025 + }, + { + "epoch": 5.816116035455278, + "grad_norm": 1.4041043390805912, + "learning_rate": 9.674665101340062e-05, + "loss": 2.9692, + "step": 9026 + }, + { + "epoch": 5.816760676873489, + "grad_norm": 1.3315689940540671, + "learning_rate": 9.674593571658849e-05, + "loss": 2.9616, + "step": 9027 + }, + { + "epoch": 5.8174053182917005, + "grad_norm": 1.2810288070814615, + "learning_rate": 9.674522034382323e-05, + "loss": 2.7505, + "step": 9028 + }, + { + "epoch": 5.818049959709912, + "grad_norm": 1.3971281753520222, + "learning_rate": 9.674450489510597e-05, + "loss": 2.9546, + "step": 9029 + }, + { + "epoch": 5.818694601128122, + "grad_norm": 1.4512859501788202, + "learning_rate": 9.674378937043793e-05, + "loss": 3.1265, + "step": 9030 + }, + { + "epoch": 5.8193392425463335, + "grad_norm": 1.4473775237129995, + "learning_rate": 9.674307376982027e-05, + "loss": 3.0873, + "step": 9031 + }, + { + "epoch": 5.819983883964545, + "grad_norm": 1.6051908049742605, + "learning_rate": 9.674235809325415e-05, + "loss": 2.7666, + "step": 9032 + }, + { + "epoch": 5.820628525382756, + "grad_norm": 1.5786335534744391, + "learning_rate": 9.674164234074077e-05, + "loss": 2.8624, + "step": 9033 + }, + { + "epoch": 5.821273166800967, + "grad_norm": 1.680927067255202, + "learning_rate": 9.674092651228126e-05, + "loss": 2.8266, + "step": 9034 + }, + { + "epoch": 5.821917808219178, + "grad_norm": 1.7844109429684636, + "learning_rate": 9.674021060787684e-05, + "loss": 2.9525, + "step": 9035 + }, + { + "epoch": 5.822562449637389, + "grad_norm": 1.6468522320012593, + "learning_rate": 9.67394946275287e-05, + "loss": 2.9636, + "step": 9036 + }, + { + "epoch": 5.8232070910556, + "grad_norm": 1.5919612470636908, + "learning_rate": 9.673877857123797e-05, + "loss": 3.1113, + "step": 9037 + }, + { + "epoch": 5.823851732473812, + "grad_norm": 1.4259627969324093, + "learning_rate": 9.673806243900584e-05, + "loss": 3.0428, + "step": 9038 + }, + { + "epoch": 5.824496373892023, + "grad_norm": 1.4302804324039178, + "learning_rate": 9.673734623083349e-05, + "loss": 3.0525, + "step": 9039 + }, + { + "epoch": 5.825141015310233, + "grad_norm": 1.572958548292172, + "learning_rate": 9.67366299467221e-05, + "loss": 3.1604, + "step": 9040 + }, + { + "epoch": 5.825785656728445, + "grad_norm": 1.4597540162808111, + "learning_rate": 9.673591358667285e-05, + "loss": 2.8082, + "step": 9041 + }, + { + "epoch": 5.826430298146656, + "grad_norm": 1.5570130391850046, + "learning_rate": 9.67351971506869e-05, + "loss": 3.202, + "step": 9042 + }, + { + "epoch": 5.827074939564867, + "grad_norm": 1.6683917287911063, + "learning_rate": 9.673448063876544e-05, + "loss": 2.8821, + "step": 9043 + }, + { + "epoch": 5.827719580983079, + "grad_norm": 1.609803306544445, + "learning_rate": 9.673376405090967e-05, + "loss": 3.0939, + "step": 9044 + }, + { + "epoch": 5.828364222401289, + "grad_norm": 1.948042575863521, + "learning_rate": 9.67330473871207e-05, + "loss": 3.1976, + "step": 9045 + }, + { + "epoch": 5.8290088638195, + "grad_norm": 1.6292766448244813, + "learning_rate": 9.673233064739977e-05, + "loss": 2.9681, + "step": 9046 + }, + { + "epoch": 5.829653505237712, + "grad_norm": 1.4946273932047764, + "learning_rate": 9.673161383174804e-05, + "loss": 3.3468, + "step": 9047 + }, + { + "epoch": 5.830298146655923, + "grad_norm": 1.5651524334523372, + "learning_rate": 9.673089694016666e-05, + "loss": 2.9144, + "step": 9048 + }, + { + "epoch": 5.830942788074134, + "grad_norm": 1.6464444876851636, + "learning_rate": 9.673017997265686e-05, + "loss": 3.5028, + "step": 9049 + }, + { + "epoch": 5.831587429492345, + "grad_norm": 1.4794846835510718, + "learning_rate": 9.672946292921977e-05, + "loss": 3.2859, + "step": 9050 + }, + { + "epoch": 5.832232070910556, + "grad_norm": 1.7325650524489669, + "learning_rate": 9.672874580985658e-05, + "loss": 3.2525, + "step": 9051 + }, + { + "epoch": 5.832876712328767, + "grad_norm": 1.501077087618765, + "learning_rate": 9.672802861456848e-05, + "loss": 2.977, + "step": 9052 + }, + { + "epoch": 5.8335213537469786, + "grad_norm": 1.5061093310080451, + "learning_rate": 9.672731134335662e-05, + "loss": 3.1049, + "step": 9053 + }, + { + "epoch": 5.83416599516519, + "grad_norm": 1.4819772270100757, + "learning_rate": 9.672659399622221e-05, + "loss": 2.9753, + "step": 9054 + }, + { + "epoch": 5.8348106365834, + "grad_norm": 1.6632780693017308, + "learning_rate": 9.672587657316644e-05, + "loss": 2.7081, + "step": 9055 + }, + { + "epoch": 5.835455278001612, + "grad_norm": 1.353011690993719, + "learning_rate": 9.672515907419044e-05, + "loss": 3.1509, + "step": 9056 + }, + { + "epoch": 5.836099919419823, + "grad_norm": 1.7000598092988919, + "learning_rate": 9.672444149929542e-05, + "loss": 3.0983, + "step": 9057 + }, + { + "epoch": 5.836744560838034, + "grad_norm": 1.4223920719590764, + "learning_rate": 9.672372384848257e-05, + "loss": 2.7605, + "step": 9058 + }, + { + "epoch": 5.837389202256245, + "grad_norm": 1.6501691308793551, + "learning_rate": 9.672300612175302e-05, + "loss": 2.6556, + "step": 9059 + }, + { + "epoch": 5.838033843674456, + "grad_norm": 1.5713159463638053, + "learning_rate": 9.6722288319108e-05, + "loss": 2.9805, + "step": 9060 + }, + { + "epoch": 5.838678485092667, + "grad_norm": 1.714808725415758, + "learning_rate": 9.672157044054865e-05, + "loss": 3.081, + "step": 9061 + }, + { + "epoch": 5.8393231265108785, + "grad_norm": 1.6929796449389016, + "learning_rate": 9.672085248607619e-05, + "loss": 2.6712, + "step": 9062 + }, + { + "epoch": 5.83996776792909, + "grad_norm": 1.5653516245249455, + "learning_rate": 9.672013445569176e-05, + "loss": 3.1181, + "step": 9063 + }, + { + "epoch": 5.8406124093473, + "grad_norm": 2.066438348565457, + "learning_rate": 9.671941634939655e-05, + "loss": 3.0254, + "step": 9064 + }, + { + "epoch": 5.8412570507655115, + "grad_norm": 1.554946063922369, + "learning_rate": 9.671869816719176e-05, + "loss": 2.9379, + "step": 9065 + }, + { + "epoch": 5.841901692183723, + "grad_norm": 1.5892910017303992, + "learning_rate": 9.671797990907855e-05, + "loss": 3.3564, + "step": 9066 + }, + { + "epoch": 5.842546333601934, + "grad_norm": 1.5044946018346619, + "learning_rate": 9.67172615750581e-05, + "loss": 3.0164, + "step": 9067 + }, + { + "epoch": 5.843190975020145, + "grad_norm": 1.7395711498469495, + "learning_rate": 9.671654316513159e-05, + "loss": 2.9771, + "step": 9068 + }, + { + "epoch": 5.843835616438356, + "grad_norm": 1.7513467450379228, + "learning_rate": 9.671582467930022e-05, + "loss": 3.0975, + "step": 9069 + }, + { + "epoch": 5.844480257856567, + "grad_norm": 1.844947313592595, + "learning_rate": 9.671510611756515e-05, + "loss": 3.2162, + "step": 9070 + }, + { + "epoch": 5.845124899274778, + "grad_norm": 1.5122780361173096, + "learning_rate": 9.671438747992756e-05, + "loss": 3.0427, + "step": 9071 + }, + { + "epoch": 5.84576954069299, + "grad_norm": 1.708739640409182, + "learning_rate": 9.671366876638863e-05, + "loss": 3.0176, + "step": 9072 + }, + { + "epoch": 5.846414182111201, + "grad_norm": 1.3434835911185958, + "learning_rate": 9.671294997694955e-05, + "loss": 3.0239, + "step": 9073 + }, + { + "epoch": 5.847058823529411, + "grad_norm": 1.9593045946877539, + "learning_rate": 9.671223111161149e-05, + "loss": 3.1615, + "step": 9074 + }, + { + "epoch": 5.847703464947623, + "grad_norm": 1.4246362654269709, + "learning_rate": 9.671151217037564e-05, + "loss": 2.8352, + "step": 9075 + }, + { + "epoch": 5.848348106365834, + "grad_norm": 1.8366692239264937, + "learning_rate": 9.671079315324317e-05, + "loss": 3.3065, + "step": 9076 + }, + { + "epoch": 5.848992747784045, + "grad_norm": 1.3087434260241486, + "learning_rate": 9.671007406021528e-05, + "loss": 3.1063, + "step": 9077 + }, + { + "epoch": 5.849637389202257, + "grad_norm": 1.8125524849311552, + "learning_rate": 9.670935489129312e-05, + "loss": 2.9455, + "step": 9078 + }, + { + "epoch": 5.850282030620467, + "grad_norm": 1.5611897016881704, + "learning_rate": 9.670863564647791e-05, + "loss": 3.1079, + "step": 9079 + }, + { + "epoch": 5.850926672038678, + "grad_norm": 1.8489826951675965, + "learning_rate": 9.67079163257708e-05, + "loss": 3.1763, + "step": 9080 + }, + { + "epoch": 5.85157131345689, + "grad_norm": 1.367851180634743, + "learning_rate": 9.6707196929173e-05, + "loss": 3.0214, + "step": 9081 + }, + { + "epoch": 5.852215954875101, + "grad_norm": 1.9913103430986003, + "learning_rate": 9.670647745668566e-05, + "loss": 3.1992, + "step": 9082 + }, + { + "epoch": 5.852860596293311, + "grad_norm": 1.658302086319094, + "learning_rate": 9.670575790830995e-05, + "loss": 2.9477, + "step": 9083 + }, + { + "epoch": 5.853505237711523, + "grad_norm": 1.6111902272075567, + "learning_rate": 9.67050382840471e-05, + "loss": 2.9841, + "step": 9084 + }, + { + "epoch": 5.854149879129734, + "grad_norm": 1.5858391808738181, + "learning_rate": 9.670431858389829e-05, + "loss": 3.0802, + "step": 9085 + }, + { + "epoch": 5.854794520547945, + "grad_norm": 1.484714369555005, + "learning_rate": 9.670359880786466e-05, + "loss": 2.8673, + "step": 9086 + }, + { + "epoch": 5.8554391619661565, + "grad_norm": 1.7244028105691676, + "learning_rate": 9.670287895594742e-05, + "loss": 3.146, + "step": 9087 + }, + { + "epoch": 5.856083803384367, + "grad_norm": 1.7447542766652684, + "learning_rate": 9.670215902814773e-05, + "loss": 3.0515, + "step": 9088 + }, + { + "epoch": 5.856728444802578, + "grad_norm": 1.6164646674527243, + "learning_rate": 9.670143902446681e-05, + "loss": 3.1431, + "step": 9089 + }, + { + "epoch": 5.85737308622079, + "grad_norm": 1.6078675507751548, + "learning_rate": 9.670071894490581e-05, + "loss": 3.2203, + "step": 9090 + }, + { + "epoch": 5.858017727639001, + "grad_norm": 1.7356750340086908, + "learning_rate": 9.669999878946592e-05, + "loss": 3.1257, + "step": 9091 + }, + { + "epoch": 5.858662369057212, + "grad_norm": 1.5436153893792264, + "learning_rate": 9.669927855814833e-05, + "loss": 3.3083, + "step": 9092 + }, + { + "epoch": 5.859307010475423, + "grad_norm": 1.5647951312368076, + "learning_rate": 9.669855825095421e-05, + "loss": 3.2288, + "step": 9093 + }, + { + "epoch": 5.859951651893634, + "grad_norm": 1.8235097486234983, + "learning_rate": 9.669783786788477e-05, + "loss": 2.8127, + "step": 9094 + }, + { + "epoch": 5.860596293311845, + "grad_norm": 1.4929320885686856, + "learning_rate": 9.669711740894116e-05, + "loss": 3.2345, + "step": 9095 + }, + { + "epoch": 5.8612409347300565, + "grad_norm": 1.7251048735121832, + "learning_rate": 9.669639687412458e-05, + "loss": 2.9903, + "step": 9096 + }, + { + "epoch": 5.861885576148268, + "grad_norm": 1.704115292196688, + "learning_rate": 9.669567626343622e-05, + "loss": 2.7215, + "step": 9097 + }, + { + "epoch": 5.862530217566478, + "grad_norm": 1.6666954328962276, + "learning_rate": 9.669495557687725e-05, + "loss": 3.0601, + "step": 9098 + }, + { + "epoch": 5.8631748589846895, + "grad_norm": 1.8631126057587846, + "learning_rate": 9.669423481444886e-05, + "loss": 2.7987, + "step": 9099 + }, + { + "epoch": 5.863819500402901, + "grad_norm": 1.563266716222286, + "learning_rate": 9.669351397615222e-05, + "loss": 3.1088, + "step": 9100 + }, + { + "epoch": 5.863819500402901, + "eval_loss": 4.353607177734375, + "eval_runtime": 2.9824, + "eval_samples_per_second": 33.53, + "eval_steps_per_second": 4.359, + "step": 9100 + }, + { + "epoch": 5.864464141821112, + "grad_norm": 1.704576394377233, + "learning_rate": 9.669279306198855e-05, + "loss": 2.9874, + "step": 9101 + }, + { + "epoch": 5.865108783239323, + "grad_norm": 1.5672832694425145, + "learning_rate": 9.6692072071959e-05, + "loss": 2.9799, + "step": 9102 + }, + { + "epoch": 5.865753424657534, + "grad_norm": 1.5421077121700892, + "learning_rate": 9.669135100606476e-05, + "loss": 3.4219, + "step": 9103 + }, + { + "epoch": 5.866398066075745, + "grad_norm": 1.763649814719926, + "learning_rate": 9.669062986430703e-05, + "loss": 3.0551, + "step": 9104 + }, + { + "epoch": 5.867042707493956, + "grad_norm": 1.3485797841664962, + "learning_rate": 9.668990864668697e-05, + "loss": 3.2689, + "step": 9105 + }, + { + "epoch": 5.867687348912168, + "grad_norm": 1.6152524110429458, + "learning_rate": 9.668918735320577e-05, + "loss": 3.1509, + "step": 9106 + }, + { + "epoch": 5.868331990330379, + "grad_norm": 1.7884432206306728, + "learning_rate": 9.668846598386463e-05, + "loss": 3.0714, + "step": 9107 + }, + { + "epoch": 5.868976631748589, + "grad_norm": 1.556640921883582, + "learning_rate": 9.668774453866473e-05, + "loss": 3.1401, + "step": 9108 + }, + { + "epoch": 5.869621273166801, + "grad_norm": 1.6907366820034442, + "learning_rate": 9.668702301760726e-05, + "loss": 3.0677, + "step": 9109 + }, + { + "epoch": 5.870265914585012, + "grad_norm": 1.5482214705705202, + "learning_rate": 9.668630142069338e-05, + "loss": 2.9285, + "step": 9110 + }, + { + "epoch": 5.870910556003223, + "grad_norm": 1.6349813967123068, + "learning_rate": 9.668557974792431e-05, + "loss": 2.7325, + "step": 9111 + }, + { + "epoch": 5.871555197421435, + "grad_norm": 1.4330128610866577, + "learning_rate": 9.668485799930122e-05, + "loss": 3.3307, + "step": 9112 + }, + { + "epoch": 5.872199838839645, + "grad_norm": 1.6511801931315448, + "learning_rate": 9.668413617482527e-05, + "loss": 3.1078, + "step": 9113 + }, + { + "epoch": 5.872844480257856, + "grad_norm": 1.5166326121821518, + "learning_rate": 9.668341427449769e-05, + "loss": 2.7059, + "step": 9114 + }, + { + "epoch": 5.873489121676068, + "grad_norm": 1.669509760698563, + "learning_rate": 9.668269229831963e-05, + "loss": 3.1286, + "step": 9115 + }, + { + "epoch": 5.874133763094279, + "grad_norm": 1.5619303452715032, + "learning_rate": 9.66819702462923e-05, + "loss": 2.9129, + "step": 9116 + }, + { + "epoch": 5.87477840451249, + "grad_norm": 1.8767401323475101, + "learning_rate": 9.668124811841686e-05, + "loss": 3.3417, + "step": 9117 + }, + { + "epoch": 5.875423045930701, + "grad_norm": 1.6266927165328664, + "learning_rate": 9.668052591469454e-05, + "loss": 3.1098, + "step": 9118 + }, + { + "epoch": 5.876067687348912, + "grad_norm": 1.718900777554648, + "learning_rate": 9.667980363512648e-05, + "loss": 3.3409, + "step": 9119 + }, + { + "epoch": 5.876712328767123, + "grad_norm": 1.6610220695791076, + "learning_rate": 9.667908127971386e-05, + "loss": 2.9939, + "step": 9120 + }, + { + "epoch": 5.8773569701853345, + "grad_norm": 1.7595669721817866, + "learning_rate": 9.667835884845793e-05, + "loss": 3.3265, + "step": 9121 + }, + { + "epoch": 5.878001611603546, + "grad_norm": 1.827785287122743, + "learning_rate": 9.667763634135982e-05, + "loss": 2.9542, + "step": 9122 + }, + { + "epoch": 5.878646253021756, + "grad_norm": 1.40089650973729, + "learning_rate": 9.667691375842073e-05, + "loss": 2.9756, + "step": 9123 + }, + { + "epoch": 5.8792908944399676, + "grad_norm": 1.5531947476238386, + "learning_rate": 9.667619109964186e-05, + "loss": 3.1248, + "step": 9124 + }, + { + "epoch": 5.879935535858179, + "grad_norm": 2.198675879146414, + "learning_rate": 9.667546836502437e-05, + "loss": 2.965, + "step": 9125 + }, + { + "epoch": 5.88058017727639, + "grad_norm": 2.073432702224969, + "learning_rate": 9.667474555456947e-05, + "loss": 3.1994, + "step": 9126 + }, + { + "epoch": 5.8812248186946015, + "grad_norm": 1.598506228614952, + "learning_rate": 9.667402266827835e-05, + "loss": 2.8643, + "step": 9127 + }, + { + "epoch": 5.881869460112812, + "grad_norm": 1.6982257525999684, + "learning_rate": 9.667329970615219e-05, + "loss": 2.9649, + "step": 9128 + }, + { + "epoch": 5.882514101531023, + "grad_norm": 1.8131533438570273, + "learning_rate": 9.667257666819217e-05, + "loss": 3.0461, + "step": 9129 + }, + { + "epoch": 5.8831587429492345, + "grad_norm": 1.5827395289557187, + "learning_rate": 9.667185355439948e-05, + "loss": 3.2005, + "step": 9130 + }, + { + "epoch": 5.883803384367446, + "grad_norm": 1.7959801494046603, + "learning_rate": 9.667113036477531e-05, + "loss": 3.0756, + "step": 9131 + }, + { + "epoch": 5.884448025785657, + "grad_norm": 3.917915792474288, + "learning_rate": 9.667040709932086e-05, + "loss": 3.2095, + "step": 9132 + }, + { + "epoch": 5.8850926672038675, + "grad_norm": 1.7821835380451796, + "learning_rate": 9.666968375803731e-05, + "loss": 3.1496, + "step": 9133 + }, + { + "epoch": 5.885737308622079, + "grad_norm": 1.6593017123651437, + "learning_rate": 9.666896034092584e-05, + "loss": 2.7889, + "step": 9134 + }, + { + "epoch": 5.88638195004029, + "grad_norm": 1.7057872482636194, + "learning_rate": 9.666823684798762e-05, + "loss": 3.3207, + "step": 9135 + }, + { + "epoch": 5.887026591458501, + "grad_norm": 1.7370142391157628, + "learning_rate": 9.666751327922388e-05, + "loss": 3.4761, + "step": 9136 + }, + { + "epoch": 5.887671232876713, + "grad_norm": 1.7580822927601178, + "learning_rate": 9.66667896346358e-05, + "loss": 3.1889, + "step": 9137 + }, + { + "epoch": 5.888315874294923, + "grad_norm": 2.270965016145323, + "learning_rate": 9.666606591422454e-05, + "loss": 3.1243, + "step": 9138 + }, + { + "epoch": 5.888960515713134, + "grad_norm": 1.586276176595133, + "learning_rate": 9.666534211799131e-05, + "loss": 2.9507, + "step": 9139 + }, + { + "epoch": 5.889605157131346, + "grad_norm": 1.5051575359413074, + "learning_rate": 9.666461824593731e-05, + "loss": 3.0994, + "step": 9140 + }, + { + "epoch": 5.890249798549557, + "grad_norm": 1.8438156563071801, + "learning_rate": 9.666389429806368e-05, + "loss": 2.7021, + "step": 9141 + }, + { + "epoch": 5.890894439967768, + "grad_norm": 1.8560831898387715, + "learning_rate": 9.666317027437169e-05, + "loss": 3.085, + "step": 9142 + }, + { + "epoch": 5.891539081385979, + "grad_norm": 1.7224051378738645, + "learning_rate": 9.666244617486244e-05, + "loss": 3.2469, + "step": 9143 + }, + { + "epoch": 5.89218372280419, + "grad_norm": 1.8933458567242711, + "learning_rate": 9.666172199953718e-05, + "loss": 2.8691, + "step": 9144 + }, + { + "epoch": 5.892828364222401, + "grad_norm": 1.7814634616907652, + "learning_rate": 9.666099774839707e-05, + "loss": 2.6563, + "step": 9145 + }, + { + "epoch": 5.893473005640613, + "grad_norm": 2.3439686476294184, + "learning_rate": 9.666027342144332e-05, + "loss": 3.2541, + "step": 9146 + }, + { + "epoch": 5.894117647058824, + "grad_norm": 1.5105110895917675, + "learning_rate": 9.66595490186771e-05, + "loss": 2.9584, + "step": 9147 + }, + { + "epoch": 5.894762288477034, + "grad_norm": 1.709856876707582, + "learning_rate": 9.665882454009963e-05, + "loss": 3.2975, + "step": 9148 + }, + { + "epoch": 5.895406929895246, + "grad_norm": 1.8241610624258064, + "learning_rate": 9.665809998571207e-05, + "loss": 3.1313, + "step": 9149 + }, + { + "epoch": 5.896051571313457, + "grad_norm": 1.8160094817218426, + "learning_rate": 9.665737535551561e-05, + "loss": 3.195, + "step": 9150 + }, + { + "epoch": 5.896696212731668, + "grad_norm": 1.8438688568212627, + "learning_rate": 9.665665064951147e-05, + "loss": 3.1727, + "step": 9151 + }, + { + "epoch": 5.8973408541498795, + "grad_norm": 1.9126125155727722, + "learning_rate": 9.66559258677008e-05, + "loss": 3.1584, + "step": 9152 + }, + { + "epoch": 5.89798549556809, + "grad_norm": 1.86500414245684, + "learning_rate": 9.665520101008481e-05, + "loss": 3.2284, + "step": 9153 + }, + { + "epoch": 5.898630136986301, + "grad_norm": 1.857216842569338, + "learning_rate": 9.665447607666472e-05, + "loss": 3.2746, + "step": 9154 + }, + { + "epoch": 5.8992747784045125, + "grad_norm": 1.7347887066250578, + "learning_rate": 9.665375106744165e-05, + "loss": 3.1506, + "step": 9155 + }, + { + "epoch": 5.899919419822724, + "grad_norm": 2.1751771444818075, + "learning_rate": 9.665302598241686e-05, + "loss": 3.1836, + "step": 9156 + }, + { + "epoch": 5.900564061240935, + "grad_norm": 1.5451715341957968, + "learning_rate": 9.66523008215915e-05, + "loss": 3.0735, + "step": 9157 + }, + { + "epoch": 5.9012087026591455, + "grad_norm": 2.0129441380985815, + "learning_rate": 9.665157558496678e-05, + "loss": 3.0439, + "step": 9158 + }, + { + "epoch": 5.901853344077357, + "grad_norm": 1.8170388416388104, + "learning_rate": 9.665085027254387e-05, + "loss": 3.2121, + "step": 9159 + }, + { + "epoch": 5.902497985495568, + "grad_norm": 1.951588166804897, + "learning_rate": 9.665012488432398e-05, + "loss": 2.9237, + "step": 9160 + }, + { + "epoch": 5.9031426269137794, + "grad_norm": 1.938848275008709, + "learning_rate": 9.664939942030831e-05, + "loss": 3.4531, + "step": 9161 + }, + { + "epoch": 5.903787268331991, + "grad_norm": 1.7313915978814889, + "learning_rate": 9.664867388049805e-05, + "loss": 3.294, + "step": 9162 + }, + { + "epoch": 5.904431909750201, + "grad_norm": 1.526434294954577, + "learning_rate": 9.664794826489437e-05, + "loss": 3.4601, + "step": 9163 + }, + { + "epoch": 5.9050765511684125, + "grad_norm": 1.607622908524914, + "learning_rate": 9.664722257349845e-05, + "loss": 3.2849, + "step": 9164 + }, + { + "epoch": 5.905721192586624, + "grad_norm": 1.6204405413853442, + "learning_rate": 9.664649680631154e-05, + "loss": 3.2399, + "step": 9165 + }, + { + "epoch": 5.906365834004835, + "grad_norm": 1.6700295734032793, + "learning_rate": 9.664577096333477e-05, + "loss": 3.0438, + "step": 9166 + }, + { + "epoch": 5.907010475423046, + "grad_norm": 1.5910670996434346, + "learning_rate": 9.664504504456936e-05, + "loss": 2.8701, + "step": 9167 + }, + { + "epoch": 5.907655116841257, + "grad_norm": 1.7156155483500597, + "learning_rate": 9.664431905001651e-05, + "loss": 3.0621, + "step": 9168 + }, + { + "epoch": 5.908299758259468, + "grad_norm": 1.5047425973882627, + "learning_rate": 9.664359297967741e-05, + "loss": 3.0685, + "step": 9169 + }, + { + "epoch": 5.908944399677679, + "grad_norm": 1.6218295075992617, + "learning_rate": 9.664286683355323e-05, + "loss": 3.022, + "step": 9170 + }, + { + "epoch": 5.909589041095891, + "grad_norm": 1.5659750209870194, + "learning_rate": 9.66421406116452e-05, + "loss": 3.2348, + "step": 9171 + }, + { + "epoch": 5.910233682514102, + "grad_norm": 1.7754361922999573, + "learning_rate": 9.664141431395447e-05, + "loss": 3.175, + "step": 9172 + }, + { + "epoch": 5.910878323932312, + "grad_norm": 1.5486189957530543, + "learning_rate": 9.664068794048225e-05, + "loss": 3.1033, + "step": 9173 + }, + { + "epoch": 5.911522965350524, + "grad_norm": 1.5199861198006974, + "learning_rate": 9.663996149122974e-05, + "loss": 3.1312, + "step": 9174 + }, + { + "epoch": 5.912167606768735, + "grad_norm": 1.8190445528434251, + "learning_rate": 9.663923496619813e-05, + "loss": 3.1757, + "step": 9175 + }, + { + "epoch": 5.912812248186946, + "grad_norm": 1.4284461782214277, + "learning_rate": 9.663850836538861e-05, + "loss": 3.0013, + "step": 9176 + }, + { + "epoch": 5.913456889605158, + "grad_norm": 1.5709275621535235, + "learning_rate": 9.66377816888024e-05, + "loss": 3.3649, + "step": 9177 + }, + { + "epoch": 5.914101531023368, + "grad_norm": 1.3217256688549222, + "learning_rate": 9.663705493644065e-05, + "loss": 3.2353, + "step": 9178 + }, + { + "epoch": 5.914746172441579, + "grad_norm": 1.5690882708683132, + "learning_rate": 9.663632810830458e-05, + "loss": 3.0089, + "step": 9179 + }, + { + "epoch": 5.915390813859791, + "grad_norm": 1.4724290092232013, + "learning_rate": 9.663560120439538e-05, + "loss": 3.0638, + "step": 9180 + }, + { + "epoch": 5.916035455278002, + "grad_norm": 1.4681842705541412, + "learning_rate": 9.663487422471422e-05, + "loss": 3.1782, + "step": 9181 + }, + { + "epoch": 5.916680096696213, + "grad_norm": 1.7980758710933085, + "learning_rate": 9.663414716926233e-05, + "loss": 2.9671, + "step": 9182 + }, + { + "epoch": 5.917324738114424, + "grad_norm": 1.486024245566749, + "learning_rate": 9.663342003804088e-05, + "loss": 3.3916, + "step": 9183 + }, + { + "epoch": 5.917969379532635, + "grad_norm": 1.692017871263296, + "learning_rate": 9.66326928310511e-05, + "loss": 3.0744, + "step": 9184 + }, + { + "epoch": 5.918614020950846, + "grad_norm": 1.723515682856687, + "learning_rate": 9.663196554829414e-05, + "loss": 2.9903, + "step": 9185 + }, + { + "epoch": 5.9192586623690575, + "grad_norm": 1.5491269602218667, + "learning_rate": 9.663123818977121e-05, + "loss": 3.3242, + "step": 9186 + }, + { + "epoch": 5.919903303787269, + "grad_norm": 1.7208167976027913, + "learning_rate": 9.66305107554835e-05, + "loss": 2.9258, + "step": 9187 + }, + { + "epoch": 5.920547945205479, + "grad_norm": 1.4252318590783521, + "learning_rate": 9.662978324543223e-05, + "loss": 3.1448, + "step": 9188 + }, + { + "epoch": 5.9211925866236905, + "grad_norm": 1.9634952330615263, + "learning_rate": 9.662905565961856e-05, + "loss": 2.7683, + "step": 9189 + }, + { + "epoch": 5.921837228041902, + "grad_norm": 1.338163460981895, + "learning_rate": 9.662832799804371e-05, + "loss": 2.7749, + "step": 9190 + }, + { + "epoch": 5.922481869460113, + "grad_norm": 1.717764517621576, + "learning_rate": 9.662760026070885e-05, + "loss": 2.768, + "step": 9191 + }, + { + "epoch": 5.923126510878324, + "grad_norm": 1.7230906750607218, + "learning_rate": 9.66268724476152e-05, + "loss": 2.7323, + "step": 9192 + }, + { + "epoch": 5.923771152296535, + "grad_norm": 1.8804242041490131, + "learning_rate": 9.662614455876396e-05, + "loss": 3.3867, + "step": 9193 + }, + { + "epoch": 5.924415793714746, + "grad_norm": 1.5710346554238774, + "learning_rate": 9.66254165941563e-05, + "loss": 2.9651, + "step": 9194 + }, + { + "epoch": 5.925060435132957, + "grad_norm": 1.6264872937713293, + "learning_rate": 9.662468855379344e-05, + "loss": 3.189, + "step": 9195 + }, + { + "epoch": 5.925705076551169, + "grad_norm": 1.5091122178166292, + "learning_rate": 9.662396043767655e-05, + "loss": 3.3796, + "step": 9196 + }, + { + "epoch": 5.926349717969379, + "grad_norm": 1.6009611031783806, + "learning_rate": 9.662323224580685e-05, + "loss": 3.0395, + "step": 9197 + }, + { + "epoch": 5.9269943593875904, + "grad_norm": 1.665842790690767, + "learning_rate": 9.66225039781855e-05, + "loss": 3.0271, + "step": 9198 + }, + { + "epoch": 5.927639000805802, + "grad_norm": 1.3622097934350945, + "learning_rate": 9.662177563481375e-05, + "loss": 3.26, + "step": 9199 + }, + { + "epoch": 5.928283642224013, + "grad_norm": 1.56140304433777, + "learning_rate": 9.662104721569276e-05, + "loss": 3.0848, + "step": 9200 + }, + { + "epoch": 5.928283642224013, + "eval_loss": 4.394424915313721, + "eval_runtime": 2.9767, + "eval_samples_per_second": 33.594, + "eval_steps_per_second": 4.367, + "step": 9200 + }, + { + "epoch": 5.928928283642224, + "grad_norm": 1.745649355015695, + "learning_rate": 9.662031872082372e-05, + "loss": 3.5331, + "step": 9201 + }, + { + "epoch": 5.929572925060435, + "grad_norm": 1.6668492310427019, + "learning_rate": 9.661959015020787e-05, + "loss": 2.7866, + "step": 9202 + }, + { + "epoch": 5.930217566478646, + "grad_norm": 1.5932147238382206, + "learning_rate": 9.661886150384634e-05, + "loss": 3.1455, + "step": 9203 + }, + { + "epoch": 5.930862207896857, + "grad_norm": 1.9176673988654664, + "learning_rate": 9.66181327817404e-05, + "loss": 2.9254, + "step": 9204 + }, + { + "epoch": 5.931506849315069, + "grad_norm": 1.746656548568848, + "learning_rate": 9.661740398389118e-05, + "loss": 2.9919, + "step": 9205 + }, + { + "epoch": 5.93215149073328, + "grad_norm": 1.495785880642419, + "learning_rate": 9.661667511029994e-05, + "loss": 3.3047, + "step": 9206 + }, + { + "epoch": 5.93279613215149, + "grad_norm": 1.7053145549418993, + "learning_rate": 9.661594616096782e-05, + "loss": 3.0445, + "step": 9207 + }, + { + "epoch": 5.933440773569702, + "grad_norm": 1.5131128815645838, + "learning_rate": 9.661521713589604e-05, + "loss": 3.3747, + "step": 9208 + }, + { + "epoch": 5.934085414987913, + "grad_norm": 1.9121399856298051, + "learning_rate": 9.661448803508581e-05, + "loss": 3.0121, + "step": 9209 + }, + { + "epoch": 5.934730056406124, + "grad_norm": 1.3989976382196228, + "learning_rate": 9.661375885853832e-05, + "loss": 3.1285, + "step": 9210 + }, + { + "epoch": 5.935374697824336, + "grad_norm": 1.962809890458813, + "learning_rate": 9.661302960625476e-05, + "loss": 3.009, + "step": 9211 + }, + { + "epoch": 5.936019339242546, + "grad_norm": 1.4485538701019167, + "learning_rate": 9.661230027823632e-05, + "loss": 3.2925, + "step": 9212 + }, + { + "epoch": 5.936663980660757, + "grad_norm": 1.836749897692067, + "learning_rate": 9.661157087448423e-05, + "loss": 2.6553, + "step": 9213 + }, + { + "epoch": 5.937308622078969, + "grad_norm": 1.4363224969862727, + "learning_rate": 9.661084139499964e-05, + "loss": 2.6895, + "step": 9214 + }, + { + "epoch": 5.93795326349718, + "grad_norm": 1.79421245379506, + "learning_rate": 9.66101118397838e-05, + "loss": 3.1464, + "step": 9215 + }, + { + "epoch": 5.938597904915391, + "grad_norm": 1.462297160024272, + "learning_rate": 9.660938220883787e-05, + "loss": 3.2325, + "step": 9216 + }, + { + "epoch": 5.939242546333602, + "grad_norm": 1.6946135345474025, + "learning_rate": 9.660865250216308e-05, + "loss": 3.1674, + "step": 9217 + }, + { + "epoch": 5.939887187751813, + "grad_norm": 1.7407507847103545, + "learning_rate": 9.660792271976059e-05, + "loss": 3.2828, + "step": 9218 + }, + { + "epoch": 5.940531829170024, + "grad_norm": 2.0292716767208927, + "learning_rate": 9.660719286163163e-05, + "loss": 3.1741, + "step": 9219 + }, + { + "epoch": 5.9411764705882355, + "grad_norm": 1.7357633586310839, + "learning_rate": 9.660646292777739e-05, + "loss": 2.787, + "step": 9220 + }, + { + "epoch": 5.941821112006446, + "grad_norm": 1.8689096611348173, + "learning_rate": 9.660573291819906e-05, + "loss": 3.1965, + "step": 9221 + }, + { + "epoch": 5.942465753424657, + "grad_norm": 1.7807038587347117, + "learning_rate": 9.660500283289786e-05, + "loss": 3.1661, + "step": 9222 + }, + { + "epoch": 5.9431103948428685, + "grad_norm": 1.6746481312803279, + "learning_rate": 9.660427267187496e-05, + "loss": 3.4881, + "step": 9223 + }, + { + "epoch": 5.94375503626108, + "grad_norm": 1.9156323426538309, + "learning_rate": 9.660354243513157e-05, + "loss": 3.3747, + "step": 9224 + }, + { + "epoch": 5.944399677679291, + "grad_norm": 1.4354094772490182, + "learning_rate": 9.660281212266891e-05, + "loss": 2.7923, + "step": 9225 + }, + { + "epoch": 5.9450443190975015, + "grad_norm": 2.016432935171213, + "learning_rate": 9.660208173448816e-05, + "loss": 3.3629, + "step": 9226 + }, + { + "epoch": 5.945688960515713, + "grad_norm": 1.677081140885617, + "learning_rate": 9.660135127059052e-05, + "loss": 2.9805, + "step": 9227 + }, + { + "epoch": 5.946333601933924, + "grad_norm": 1.8735741808250586, + "learning_rate": 9.660062073097719e-05, + "loss": 3.3431, + "step": 9228 + }, + { + "epoch": 5.946978243352135, + "grad_norm": 1.9333628380927395, + "learning_rate": 9.659989011564936e-05, + "loss": 3.441, + "step": 9229 + }, + { + "epoch": 5.947622884770347, + "grad_norm": 1.5835777895564276, + "learning_rate": 9.659915942460827e-05, + "loss": 3.1477, + "step": 9230 + }, + { + "epoch": 5.948267526188557, + "grad_norm": 1.9883267219540743, + "learning_rate": 9.659842865785506e-05, + "loss": 2.9561, + "step": 9231 + }, + { + "epoch": 5.948912167606768, + "grad_norm": 1.6455486629975224, + "learning_rate": 9.659769781539098e-05, + "loss": 2.9534, + "step": 9232 + }, + { + "epoch": 5.94955680902498, + "grad_norm": 1.744395682891509, + "learning_rate": 9.659696689721721e-05, + "loss": 2.9759, + "step": 9233 + }, + { + "epoch": 5.950201450443191, + "grad_norm": 1.4178551194118294, + "learning_rate": 9.659623590333495e-05, + "loss": 3.085, + "step": 9234 + }, + { + "epoch": 5.950846091861402, + "grad_norm": 2.1879820219777617, + "learning_rate": 9.659550483374541e-05, + "loss": 2.9889, + "step": 9235 + }, + { + "epoch": 5.951490733279613, + "grad_norm": 1.595870887508499, + "learning_rate": 9.659477368844978e-05, + "loss": 3.0603, + "step": 9236 + }, + { + "epoch": 5.952135374697824, + "grad_norm": 1.7298944032199075, + "learning_rate": 9.659404246744927e-05, + "loss": 2.8, + "step": 9237 + }, + { + "epoch": 5.952780016116035, + "grad_norm": 1.6845612751628827, + "learning_rate": 9.659331117074506e-05, + "loss": 2.9977, + "step": 9238 + }, + { + "epoch": 5.953424657534247, + "grad_norm": 1.5232245242435418, + "learning_rate": 9.659257979833838e-05, + "loss": 2.8593, + "step": 9239 + }, + { + "epoch": 5.954069298952458, + "grad_norm": 1.7015858533058168, + "learning_rate": 9.659184835023042e-05, + "loss": 3.5172, + "step": 9240 + }, + { + "epoch": 5.954713940370668, + "grad_norm": 1.3861598223713247, + "learning_rate": 9.659111682642238e-05, + "loss": 3.1742, + "step": 9241 + }, + { + "epoch": 5.95535858178888, + "grad_norm": 1.648432415341718, + "learning_rate": 9.659038522691545e-05, + "loss": 2.9679, + "step": 9242 + }, + { + "epoch": 5.956003223207091, + "grad_norm": 1.7579443759314206, + "learning_rate": 9.658965355171086e-05, + "loss": 3.0813, + "step": 9243 + }, + { + "epoch": 5.956647864625302, + "grad_norm": 1.3643763589008113, + "learning_rate": 9.658892180080977e-05, + "loss": 2.9498, + "step": 9244 + }, + { + "epoch": 5.957292506043514, + "grad_norm": 1.3990218275502226, + "learning_rate": 9.658818997421343e-05, + "loss": 3.1172, + "step": 9245 + }, + { + "epoch": 5.957937147461724, + "grad_norm": 1.735601575825418, + "learning_rate": 9.6587458071923e-05, + "loss": 3.0637, + "step": 9246 + }, + { + "epoch": 5.958581788879935, + "grad_norm": 1.497186299864888, + "learning_rate": 9.658672609393971e-05, + "loss": 3.2058, + "step": 9247 + }, + { + "epoch": 5.959226430298147, + "grad_norm": 1.7089222242429127, + "learning_rate": 9.658599404026474e-05, + "loss": 3.0947, + "step": 9248 + }, + { + "epoch": 5.959871071716358, + "grad_norm": 1.3557682818228223, + "learning_rate": 9.658526191089931e-05, + "loss": 2.9861, + "step": 9249 + }, + { + "epoch": 5.960515713134569, + "grad_norm": 1.5439332006945383, + "learning_rate": 9.658452970584463e-05, + "loss": 2.8968, + "step": 9250 + }, + { + "epoch": 5.96116035455278, + "grad_norm": 1.4751629577370606, + "learning_rate": 9.658379742510187e-05, + "loss": 3.1565, + "step": 9251 + }, + { + "epoch": 5.961804995970991, + "grad_norm": 1.5528376058045528, + "learning_rate": 9.658306506867227e-05, + "loss": 2.929, + "step": 9252 + }, + { + "epoch": 5.962449637389202, + "grad_norm": 1.5967650250406196, + "learning_rate": 9.658233263655702e-05, + "loss": 3.3025, + "step": 9253 + }, + { + "epoch": 5.9630942788074135, + "grad_norm": 1.6449153800833973, + "learning_rate": 9.658160012875729e-05, + "loss": 3.0589, + "step": 9254 + }, + { + "epoch": 5.963738920225625, + "grad_norm": 1.5272286852450658, + "learning_rate": 9.658086754527433e-05, + "loss": 2.9274, + "step": 9255 + }, + { + "epoch": 5.964383561643835, + "grad_norm": 1.5221967721914904, + "learning_rate": 9.658013488610932e-05, + "loss": 3.191, + "step": 9256 + }, + { + "epoch": 5.9650282030620465, + "grad_norm": 1.467950238675879, + "learning_rate": 9.657940215126347e-05, + "loss": 3.0409, + "step": 9257 + }, + { + "epoch": 5.965672844480258, + "grad_norm": 1.5083271927048285, + "learning_rate": 9.657866934073797e-05, + "loss": 2.9658, + "step": 9258 + }, + { + "epoch": 5.966317485898469, + "grad_norm": 1.4630146440263359, + "learning_rate": 9.657793645453406e-05, + "loss": 3.5314, + "step": 9259 + }, + { + "epoch": 5.96696212731668, + "grad_norm": 1.650614202050141, + "learning_rate": 9.65772034926529e-05, + "loss": 3.3925, + "step": 9260 + }, + { + "epoch": 5.967606768734891, + "grad_norm": 1.4418047375624912, + "learning_rate": 9.65764704550957e-05, + "loss": 3.2132, + "step": 9261 + }, + { + "epoch": 5.968251410153102, + "grad_norm": 1.7809808573763664, + "learning_rate": 9.657573734186368e-05, + "loss": 3.0784, + "step": 9262 + }, + { + "epoch": 5.968896051571313, + "grad_norm": 1.5513130263186878, + "learning_rate": 9.657500415295805e-05, + "loss": 3.0886, + "step": 9263 + }, + { + "epoch": 5.969540692989525, + "grad_norm": 1.4929131149455213, + "learning_rate": 9.657427088838e-05, + "loss": 2.8744, + "step": 9264 + }, + { + "epoch": 5.970185334407736, + "grad_norm": 1.6654227728202382, + "learning_rate": 9.657353754813075e-05, + "loss": 3.3602, + "step": 9265 + }, + { + "epoch": 5.970829975825946, + "grad_norm": 1.656159705417308, + "learning_rate": 9.657280413221147e-05, + "loss": 3.3238, + "step": 9266 + }, + { + "epoch": 5.971474617244158, + "grad_norm": 1.5823137318560911, + "learning_rate": 9.65720706406234e-05, + "loss": 3.0586, + "step": 9267 + }, + { + "epoch": 5.972119258662369, + "grad_norm": 1.4892960698049307, + "learning_rate": 9.657133707336775e-05, + "loss": 3.1364, + "step": 9268 + }, + { + "epoch": 5.97276390008058, + "grad_norm": 1.5312090182625524, + "learning_rate": 9.657060343044567e-05, + "loss": 3.2412, + "step": 9269 + }, + { + "epoch": 5.973408541498792, + "grad_norm": 1.7955079773655196, + "learning_rate": 9.656986971185843e-05, + "loss": 2.8034, + "step": 9270 + }, + { + "epoch": 5.974053182917002, + "grad_norm": 1.5160261601574168, + "learning_rate": 9.65691359176072e-05, + "loss": 3.1692, + "step": 9271 + }, + { + "epoch": 5.974697824335213, + "grad_norm": 1.8610568965629009, + "learning_rate": 9.656840204769318e-05, + "loss": 2.909, + "step": 9272 + }, + { + "epoch": 5.975342465753425, + "grad_norm": 1.4413999927135346, + "learning_rate": 9.656766810211759e-05, + "loss": 3.0758, + "step": 9273 + }, + { + "epoch": 5.975987107171636, + "grad_norm": 1.5468484950199648, + "learning_rate": 9.656693408088165e-05, + "loss": 3.1019, + "step": 9274 + }, + { + "epoch": 5.976631748589847, + "grad_norm": 1.3359936652235784, + "learning_rate": 9.656619998398653e-05, + "loss": 3.1328, + "step": 9275 + }, + { + "epoch": 5.977276390008058, + "grad_norm": 1.5057499896857267, + "learning_rate": 9.656546581143345e-05, + "loss": 2.9581, + "step": 9276 + }, + { + "epoch": 5.977921031426269, + "grad_norm": 1.4134795991111162, + "learning_rate": 9.656473156322365e-05, + "loss": 2.9529, + "step": 9277 + }, + { + "epoch": 5.97856567284448, + "grad_norm": 1.5797267553680345, + "learning_rate": 9.656399723935827e-05, + "loss": 3.1128, + "step": 9278 + }, + { + "epoch": 5.979210314262692, + "grad_norm": 1.306226268204563, + "learning_rate": 9.656326283983856e-05, + "loss": 3.0219, + "step": 9279 + }, + { + "epoch": 5.979854955680903, + "grad_norm": 1.4431255122494688, + "learning_rate": 9.656252836466573e-05, + "loss": 3.0818, + "step": 9280 + }, + { + "epoch": 5.980499597099113, + "grad_norm": 1.4294646717601076, + "learning_rate": 9.656179381384097e-05, + "loss": 3.0673, + "step": 9281 + }, + { + "epoch": 5.981144238517325, + "grad_norm": 1.4333123462035395, + "learning_rate": 9.656105918736548e-05, + "loss": 2.8692, + "step": 9282 + }, + { + "epoch": 5.981788879935536, + "grad_norm": 1.564480788040481, + "learning_rate": 9.65603244852405e-05, + "loss": 3.4408, + "step": 9283 + }, + { + "epoch": 5.982433521353747, + "grad_norm": 1.2689222386708516, + "learning_rate": 9.655958970746718e-05, + "loss": 3.1484, + "step": 9284 + }, + { + "epoch": 5.9830781627719585, + "grad_norm": 1.7253678344434586, + "learning_rate": 9.655885485404678e-05, + "loss": 3.0023, + "step": 9285 + }, + { + "epoch": 5.983722804190169, + "grad_norm": 1.4296403700616842, + "learning_rate": 9.655811992498049e-05, + "loss": 3.0536, + "step": 9286 + }, + { + "epoch": 5.98436744560838, + "grad_norm": 1.3846466392322982, + "learning_rate": 9.65573849202695e-05, + "loss": 3.2436, + "step": 9287 + }, + { + "epoch": 5.9850120870265915, + "grad_norm": 1.5702322955185555, + "learning_rate": 9.655664983991505e-05, + "loss": 3.0183, + "step": 9288 + }, + { + "epoch": 5.985656728444803, + "grad_norm": 1.3971124243486273, + "learning_rate": 9.655591468391831e-05, + "loss": 3.3623, + "step": 9289 + }, + { + "epoch": 5.986301369863014, + "grad_norm": 1.3878986597050234, + "learning_rate": 9.655517945228052e-05, + "loss": 3.0185, + "step": 9290 + }, + { + "epoch": 5.9869460112812245, + "grad_norm": 1.6221387801566207, + "learning_rate": 9.655444414500288e-05, + "loss": 3.1899, + "step": 9291 + }, + { + "epoch": 5.987590652699436, + "grad_norm": 1.6582995752734029, + "learning_rate": 9.655370876208657e-05, + "loss": 2.9792, + "step": 9292 + }, + { + "epoch": 5.988235294117647, + "grad_norm": 1.632330521084307, + "learning_rate": 9.655297330353283e-05, + "loss": 3.1858, + "step": 9293 + }, + { + "epoch": 5.988879935535858, + "grad_norm": 1.5099431733354132, + "learning_rate": 9.655223776934285e-05, + "loss": 3.1636, + "step": 9294 + }, + { + "epoch": 5.98952457695407, + "grad_norm": 1.6032025670973045, + "learning_rate": 9.655150215951786e-05, + "loss": 2.977, + "step": 9295 + }, + { + "epoch": 5.99016921837228, + "grad_norm": 1.3462651272302233, + "learning_rate": 9.655076647405903e-05, + "loss": 3.0384, + "step": 9296 + }, + { + "epoch": 5.990813859790491, + "grad_norm": 1.5135651983318978, + "learning_rate": 9.655003071296761e-05, + "loss": 3.0989, + "step": 9297 + }, + { + "epoch": 5.991458501208703, + "grad_norm": 1.577120955323903, + "learning_rate": 9.654929487624479e-05, + "loss": 3.1718, + "step": 9298 + }, + { + "epoch": 5.992103142626914, + "grad_norm": 1.511127936283971, + "learning_rate": 9.654855896389177e-05, + "loss": 3.2229, + "step": 9299 + }, + { + "epoch": 5.992747784045125, + "grad_norm": 1.6226462723227213, + "learning_rate": 9.654782297590976e-05, + "loss": 2.9471, + "step": 9300 + }, + { + "epoch": 5.992747784045125, + "eval_loss": 4.346473217010498, + "eval_runtime": 2.9832, + "eval_samples_per_second": 33.521, + "eval_steps_per_second": 4.358, + "step": 9300 + }, + { + "epoch": 5.993392425463336, + "grad_norm": 1.8503516861599942, + "learning_rate": 9.65470869123e-05, + "loss": 3.1469, + "step": 9301 + }, + { + "epoch": 5.994037066881547, + "grad_norm": 1.6749072810421302, + "learning_rate": 9.654635077306365e-05, + "loss": 2.9177, + "step": 9302 + }, + { + "epoch": 5.994681708299758, + "grad_norm": 1.7249383007542598, + "learning_rate": 9.654561455820194e-05, + "loss": 3.0734, + "step": 9303 + }, + { + "epoch": 5.99532634971797, + "grad_norm": 2.093106645835602, + "learning_rate": 9.654487826771611e-05, + "loss": 3.0025, + "step": 9304 + }, + { + "epoch": 5.995970991136181, + "grad_norm": 1.9312420330877154, + "learning_rate": 9.654414190160733e-05, + "loss": 3.2182, + "step": 9305 + }, + { + "epoch": 5.996615632554391, + "grad_norm": 1.825833349946793, + "learning_rate": 9.654340545987681e-05, + "loss": 2.8678, + "step": 9306 + }, + { + "epoch": 5.997260273972603, + "grad_norm": 2.046193010871372, + "learning_rate": 9.654266894252579e-05, + "loss": 2.898, + "step": 9307 + }, + { + "epoch": 5.997904915390814, + "grad_norm": 1.8715860954096861, + "learning_rate": 9.654193234955545e-05, + "loss": 2.9188, + "step": 9308 + }, + { + "epoch": 5.998549556809025, + "grad_norm": 1.5720792271827722, + "learning_rate": 9.6541195680967e-05, + "loss": 2.8832, + "step": 9309 + }, + { + "epoch": 5.9991941982272365, + "grad_norm": 1.6002599030348121, + "learning_rate": 9.654045893676168e-05, + "loss": 3.0909, + "step": 9310 + }, + { + "epoch": 5.999838839645447, + "grad_norm": 1.6377384080182247, + "learning_rate": 9.653972211694067e-05, + "loss": 3.2863, + "step": 9311 + }, + { + "epoch": 6.0, + "grad_norm": 1.6377384080182247, + "learning_rate": 9.653898522150518e-05, + "loss": 0.7465, + "step": 9312 + }, + { + "epoch": 6.000644641418211, + "grad_norm": 1.832647609915593, + "learning_rate": 9.653824825045645e-05, + "loss": 2.2276, + "step": 9313 + }, + { + "epoch": 6.001289282836423, + "grad_norm": 2.294337151484122, + "learning_rate": 9.653751120379566e-05, + "loss": 2.41, + "step": 9314 + }, + { + "epoch": 6.001933924254633, + "grad_norm": 1.775187011125855, + "learning_rate": 9.653677408152404e-05, + "loss": 2.1857, + "step": 9315 + }, + { + "epoch": 6.002578565672844, + "grad_norm": 2.887157079683197, + "learning_rate": 9.653603688364277e-05, + "loss": 2.3027, + "step": 9316 + }, + { + "epoch": 6.003223207091056, + "grad_norm": 2.298725902483001, + "learning_rate": 9.65352996101531e-05, + "loss": 2.2206, + "step": 9317 + }, + { + "epoch": 6.003867848509267, + "grad_norm": 1.962035460870447, + "learning_rate": 9.653456226105623e-05, + "loss": 2.0741, + "step": 9318 + }, + { + "epoch": 6.004512489927478, + "grad_norm": 1.9443628311999546, + "learning_rate": 9.653382483635335e-05, + "loss": 2.0146, + "step": 9319 + }, + { + "epoch": 6.005157131345689, + "grad_norm": 1.7229790286882851, + "learning_rate": 9.65330873360457e-05, + "loss": 2.0571, + "step": 9320 + }, + { + "epoch": 6.0058017727639, + "grad_norm": 1.8941018805035168, + "learning_rate": 9.653234976013447e-05, + "loss": 1.9932, + "step": 9321 + }, + { + "epoch": 6.006446414182111, + "grad_norm": 1.8019037997378449, + "learning_rate": 9.653161210862088e-05, + "loss": 2.1856, + "step": 9322 + }, + { + "epoch": 6.0070910556003225, + "grad_norm": 1.6795573789933549, + "learning_rate": 9.653087438150614e-05, + "loss": 2.2965, + "step": 9323 + }, + { + "epoch": 6.007735697018534, + "grad_norm": 1.9328289638569325, + "learning_rate": 9.653013657879146e-05, + "loss": 1.9277, + "step": 9324 + }, + { + "epoch": 6.008380338436744, + "grad_norm": 1.9017488426784646, + "learning_rate": 9.652939870047805e-05, + "loss": 1.956, + "step": 9325 + }, + { + "epoch": 6.0090249798549555, + "grad_norm": 1.9891691934503513, + "learning_rate": 9.652866074656713e-05, + "loss": 2.1708, + "step": 9326 + }, + { + "epoch": 6.009669621273167, + "grad_norm": 1.8478931719484246, + "learning_rate": 9.65279227170599e-05, + "loss": 2.1995, + "step": 9327 + }, + { + "epoch": 6.010314262691378, + "grad_norm": 1.8567142897164912, + "learning_rate": 9.652718461195759e-05, + "loss": 2.08, + "step": 9328 + }, + { + "epoch": 6.010958904109589, + "grad_norm": 1.9283759893816557, + "learning_rate": 9.65264464312614e-05, + "loss": 2.3366, + "step": 9329 + }, + { + "epoch": 6.0116035455278, + "grad_norm": 1.9916559010368582, + "learning_rate": 9.652570817497252e-05, + "loss": 1.9872, + "step": 9330 + }, + { + "epoch": 6.012248186946011, + "grad_norm": 1.9793030545280144, + "learning_rate": 9.652496984309221e-05, + "loss": 2.3718, + "step": 9331 + }, + { + "epoch": 6.0128928283642225, + "grad_norm": 2.378179913268776, + "learning_rate": 9.652423143562166e-05, + "loss": 2.0864, + "step": 9332 + }, + { + "epoch": 6.013537469782434, + "grad_norm": 1.7707423597079917, + "learning_rate": 9.652349295256208e-05, + "loss": 2.0062, + "step": 9333 + }, + { + "epoch": 6.014182111200645, + "grad_norm": 2.378757823660111, + "learning_rate": 9.652275439391468e-05, + "loss": 2.0886, + "step": 9334 + }, + { + "epoch": 6.0148267526188555, + "grad_norm": 2.051521029365628, + "learning_rate": 9.652201575968069e-05, + "loss": 2.043, + "step": 9335 + }, + { + "epoch": 6.015471394037067, + "grad_norm": 1.903981122235901, + "learning_rate": 9.65212770498613e-05, + "loss": 2.2341, + "step": 9336 + }, + { + "epoch": 6.016116035455278, + "grad_norm": 1.9538532445521322, + "learning_rate": 9.652053826445772e-05, + "loss": 2.0997, + "step": 9337 + }, + { + "epoch": 6.016760676873489, + "grad_norm": 1.7458262300121918, + "learning_rate": 9.65197994034712e-05, + "loss": 2.1381, + "step": 9338 + }, + { + "epoch": 6.017405318291701, + "grad_norm": 2.0380201318252698, + "learning_rate": 9.651906046690292e-05, + "loss": 2.1276, + "step": 9339 + }, + { + "epoch": 6.018049959709911, + "grad_norm": 1.6894247927556076, + "learning_rate": 9.651832145475412e-05, + "loss": 2.2275, + "step": 9340 + }, + { + "epoch": 6.018694601128122, + "grad_norm": 2.197771936932619, + "learning_rate": 9.651758236702597e-05, + "loss": 2.2266, + "step": 9341 + }, + { + "epoch": 6.019339242546334, + "grad_norm": 1.9725329383635846, + "learning_rate": 9.651684320371975e-05, + "loss": 2.2352, + "step": 9342 + }, + { + "epoch": 6.019983883964545, + "grad_norm": 1.8918007988010586, + "learning_rate": 9.65161039648366e-05, + "loss": 1.9731, + "step": 9343 + }, + { + "epoch": 6.020628525382756, + "grad_norm": 1.8825005419555252, + "learning_rate": 9.651536465037778e-05, + "loss": 2.3126, + "step": 9344 + }, + { + "epoch": 6.021273166800967, + "grad_norm": 1.9233990220468662, + "learning_rate": 9.65146252603445e-05, + "loss": 2.3079, + "step": 9345 + }, + { + "epoch": 6.021917808219178, + "grad_norm": 1.8724021907760022, + "learning_rate": 9.651388579473796e-05, + "loss": 1.9345, + "step": 9346 + }, + { + "epoch": 6.022562449637389, + "grad_norm": 2.0439546000421247, + "learning_rate": 9.65131462535594e-05, + "loss": 2.1223, + "step": 9347 + }, + { + "epoch": 6.023207091055601, + "grad_norm": 2.0211509692920546, + "learning_rate": 9.651240663681e-05, + "loss": 2.1633, + "step": 9348 + }, + { + "epoch": 6.023851732473811, + "grad_norm": 1.9538857367006306, + "learning_rate": 9.651166694449099e-05, + "loss": 1.9781, + "step": 9349 + }, + { + "epoch": 6.024496373892022, + "grad_norm": 1.9095922140945212, + "learning_rate": 9.651092717660361e-05, + "loss": 1.966, + "step": 9350 + }, + { + "epoch": 6.025141015310234, + "grad_norm": 2.0066386343052387, + "learning_rate": 9.651018733314904e-05, + "loss": 2.2863, + "step": 9351 + }, + { + "epoch": 6.025785656728445, + "grad_norm": 2.5107952348898226, + "learning_rate": 9.650944741412851e-05, + "loss": 2.1459, + "step": 9352 + }, + { + "epoch": 6.026430298146656, + "grad_norm": 1.844404027822601, + "learning_rate": 9.650870741954322e-05, + "loss": 1.9338, + "step": 9353 + }, + { + "epoch": 6.027074939564867, + "grad_norm": 2.3537162102080647, + "learning_rate": 9.65079673493944e-05, + "loss": 1.8962, + "step": 9354 + }, + { + "epoch": 6.027719580983078, + "grad_norm": 1.6289126815282287, + "learning_rate": 9.650722720368328e-05, + "loss": 2.1718, + "step": 9355 + }, + { + "epoch": 6.028364222401289, + "grad_norm": 1.8773235113309483, + "learning_rate": 9.650648698241105e-05, + "loss": 2.1123, + "step": 9356 + }, + { + "epoch": 6.0290088638195005, + "grad_norm": 1.8319872624582, + "learning_rate": 9.650574668557894e-05, + "loss": 2.0791, + "step": 9357 + }, + { + "epoch": 6.029653505237712, + "grad_norm": 2.144155101959192, + "learning_rate": 9.650500631318816e-05, + "loss": 2.1459, + "step": 9358 + }, + { + "epoch": 6.030298146655922, + "grad_norm": 1.738344881610973, + "learning_rate": 9.650426586523992e-05, + "loss": 2.3053, + "step": 9359 + }, + { + "epoch": 6.0309427880741335, + "grad_norm": 2.1486874529524527, + "learning_rate": 9.650352534173544e-05, + "loss": 2.1497, + "step": 9360 + }, + { + "epoch": 6.031587429492345, + "grad_norm": 1.8768194823328712, + "learning_rate": 9.650278474267594e-05, + "loss": 2.0904, + "step": 9361 + }, + { + "epoch": 6.032232070910556, + "grad_norm": 2.0853808734953625, + "learning_rate": 9.650204406806264e-05, + "loss": 1.8009, + "step": 9362 + }, + { + "epoch": 6.032876712328767, + "grad_norm": 1.7881561293044366, + "learning_rate": 9.650130331789675e-05, + "loss": 2.2201, + "step": 9363 + }, + { + "epoch": 6.033521353746978, + "grad_norm": 2.2444518102129196, + "learning_rate": 9.65005624921795e-05, + "loss": 1.9506, + "step": 9364 + }, + { + "epoch": 6.034165995165189, + "grad_norm": 1.9558812585163565, + "learning_rate": 9.649982159091208e-05, + "loss": 2.1201, + "step": 9365 + }, + { + "epoch": 6.0348106365834004, + "grad_norm": 2.5341817240115856, + "learning_rate": 9.649908061409573e-05, + "loss": 2.1745, + "step": 9366 + }, + { + "epoch": 6.035455278001612, + "grad_norm": 1.962900728616776, + "learning_rate": 9.649833956173165e-05, + "loss": 1.9667, + "step": 9367 + }, + { + "epoch": 6.036099919419823, + "grad_norm": 1.7571215740567243, + "learning_rate": 9.649759843382107e-05, + "loss": 1.9895, + "step": 9368 + }, + { + "epoch": 6.0367445608380335, + "grad_norm": 2.2297023631373714, + "learning_rate": 9.64968572303652e-05, + "loss": 2.1411, + "step": 9369 + }, + { + "epoch": 6.037389202256245, + "grad_norm": 1.929214441358688, + "learning_rate": 9.649611595136527e-05, + "loss": 2.2911, + "step": 9370 + }, + { + "epoch": 6.038033843674456, + "grad_norm": 2.2363579497855106, + "learning_rate": 9.649537459682248e-05, + "loss": 2.0881, + "step": 9371 + }, + { + "epoch": 6.038678485092667, + "grad_norm": 2.0493418082308366, + "learning_rate": 9.649463316673806e-05, + "loss": 2.2613, + "step": 9372 + }, + { + "epoch": 6.039323126510879, + "grad_norm": 2.1234679975229995, + "learning_rate": 9.649389166111322e-05, + "loss": 2.0747, + "step": 9373 + }, + { + "epoch": 6.039967767929089, + "grad_norm": 1.9414833980458495, + "learning_rate": 9.649315007994917e-05, + "loss": 2.0097, + "step": 9374 + }, + { + "epoch": 6.0406124093473, + "grad_norm": 1.7872265630114292, + "learning_rate": 9.649240842324716e-05, + "loss": 2.2438, + "step": 9375 + }, + { + "epoch": 6.041257050765512, + "grad_norm": 2.008876488223172, + "learning_rate": 9.649166669100837e-05, + "loss": 2.1864, + "step": 9376 + }, + { + "epoch": 6.041901692183723, + "grad_norm": 1.9786124219756858, + "learning_rate": 9.649092488323405e-05, + "loss": 2.1283, + "step": 9377 + }, + { + "epoch": 6.042546333601934, + "grad_norm": 1.9934668583789403, + "learning_rate": 9.64901829999254e-05, + "loss": 2.3698, + "step": 9378 + }, + { + "epoch": 6.043190975020145, + "grad_norm": 1.9776194273183596, + "learning_rate": 9.648944104108364e-05, + "loss": 1.734, + "step": 9379 + }, + { + "epoch": 6.043835616438356, + "grad_norm": 1.8344210085557981, + "learning_rate": 9.648869900670998e-05, + "loss": 2.1073, + "step": 9380 + }, + { + "epoch": 6.044480257856567, + "grad_norm": 2.0513343724107984, + "learning_rate": 9.648795689680566e-05, + "loss": 2.346, + "step": 9381 + }, + { + "epoch": 6.045124899274779, + "grad_norm": 1.7738803302424775, + "learning_rate": 9.648721471137189e-05, + "loss": 2.0363, + "step": 9382 + }, + { + "epoch": 6.04576954069299, + "grad_norm": 2.1344940635885266, + "learning_rate": 9.648647245040989e-05, + "loss": 2.1918, + "step": 9383 + }, + { + "epoch": 6.0464141821112, + "grad_norm": 1.753378233482513, + "learning_rate": 9.648573011392087e-05, + "loss": 2.039, + "step": 9384 + }, + { + "epoch": 6.047058823529412, + "grad_norm": 2.098814652401981, + "learning_rate": 9.648498770190606e-05, + "loss": 2.0677, + "step": 9385 + }, + { + "epoch": 6.047703464947623, + "grad_norm": 1.7656093939527917, + "learning_rate": 9.648424521436667e-05, + "loss": 1.806, + "step": 9386 + }, + { + "epoch": 6.048348106365834, + "grad_norm": 1.8567658807270482, + "learning_rate": 9.648350265130393e-05, + "loss": 2.156, + "step": 9387 + }, + { + "epoch": 6.0489927477840455, + "grad_norm": 1.8330145426012119, + "learning_rate": 9.648276001271905e-05, + "loss": 2.2297, + "step": 9388 + }, + { + "epoch": 6.049637389202256, + "grad_norm": 2.1021707454055454, + "learning_rate": 9.648201729861324e-05, + "loss": 2.2713, + "step": 9389 + }, + { + "epoch": 6.050282030620467, + "grad_norm": 2.3648545673841666, + "learning_rate": 9.648127450898774e-05, + "loss": 2.2826, + "step": 9390 + }, + { + "epoch": 6.0509266720386785, + "grad_norm": 1.93984024304124, + "learning_rate": 9.648053164384377e-05, + "loss": 2.1049, + "step": 9391 + }, + { + "epoch": 6.05157131345689, + "grad_norm": 1.8503382285906056, + "learning_rate": 9.647978870318254e-05, + "loss": 2.0927, + "step": 9392 + }, + { + "epoch": 6.052215954875101, + "grad_norm": 2.10413411167401, + "learning_rate": 9.647904568700528e-05, + "loss": 2.1272, + "step": 9393 + }, + { + "epoch": 6.0528605962933115, + "grad_norm": 2.0537733220723458, + "learning_rate": 9.647830259531319e-05, + "loss": 2.0967, + "step": 9394 + }, + { + "epoch": 6.053505237711523, + "grad_norm": 1.9976962631051298, + "learning_rate": 9.647755942810751e-05, + "loss": 2.2763, + "step": 9395 + }, + { + "epoch": 6.054149879129734, + "grad_norm": 2.0391814124504832, + "learning_rate": 9.647681618538946e-05, + "loss": 2.2474, + "step": 9396 + }, + { + "epoch": 6.054794520547945, + "grad_norm": 2.1498003066863576, + "learning_rate": 9.647607286716025e-05, + "loss": 1.8145, + "step": 9397 + }, + { + "epoch": 6.055439161966157, + "grad_norm": 2.165063162840576, + "learning_rate": 9.647532947342112e-05, + "loss": 1.9192, + "step": 9398 + }, + { + "epoch": 6.056083803384367, + "grad_norm": 1.7667059060328345, + "learning_rate": 9.647458600417326e-05, + "loss": 1.9897, + "step": 9399 + }, + { + "epoch": 6.056728444802578, + "grad_norm": 1.8450603502008902, + "learning_rate": 9.647384245941791e-05, + "loss": 2.1653, + "step": 9400 + }, + { + "epoch": 6.056728444802578, + "eval_loss": 5.122704982757568, + "eval_runtime": 2.9682, + "eval_samples_per_second": 33.69, + "eval_steps_per_second": 4.38, + "step": 9400 + }, + { + "epoch": 6.05737308622079, + "grad_norm": 2.1818302649773593, + "learning_rate": 9.64730988391563e-05, + "loss": 2.2337, + "step": 9401 + }, + { + "epoch": 6.058017727639001, + "grad_norm": 2.1522971482887034, + "learning_rate": 9.647235514338963e-05, + "loss": 2.0353, + "step": 9402 + }, + { + "epoch": 6.058662369057212, + "grad_norm": 2.2738755153078, + "learning_rate": 9.647161137211913e-05, + "loss": 1.9337, + "step": 9403 + }, + { + "epoch": 6.059307010475423, + "grad_norm": 1.9399559454130686, + "learning_rate": 9.647086752534604e-05, + "loss": 2.0227, + "step": 9404 + }, + { + "epoch": 6.059951651893634, + "grad_norm": 2.059130601338045, + "learning_rate": 9.647012360307155e-05, + "loss": 2.1236, + "step": 9405 + }, + { + "epoch": 6.060596293311845, + "grad_norm": 2.0842587347871655, + "learning_rate": 9.64693796052969e-05, + "loss": 2.1026, + "step": 9406 + }, + { + "epoch": 6.061240934730057, + "grad_norm": 2.4490703321269645, + "learning_rate": 9.646863553202331e-05, + "loss": 2.1926, + "step": 9407 + }, + { + "epoch": 6.061885576148268, + "grad_norm": 1.802811885955606, + "learning_rate": 9.646789138325201e-05, + "loss": 1.9892, + "step": 9408 + }, + { + "epoch": 6.062530217566478, + "grad_norm": 2.5919843097901842, + "learning_rate": 9.64671471589842e-05, + "loss": 1.9573, + "step": 9409 + }, + { + "epoch": 6.06317485898469, + "grad_norm": 1.7110721202343249, + "learning_rate": 9.646640285922112e-05, + "loss": 2.0937, + "step": 9410 + }, + { + "epoch": 6.063819500402901, + "grad_norm": 2.315785670908861, + "learning_rate": 9.646565848396399e-05, + "loss": 2.1472, + "step": 9411 + }, + { + "epoch": 6.064464141821112, + "grad_norm": 1.6943674822046733, + "learning_rate": 9.646491403321403e-05, + "loss": 2.2604, + "step": 9412 + }, + { + "epoch": 6.065108783239323, + "grad_norm": 2.157209780606232, + "learning_rate": 9.646416950697245e-05, + "loss": 2.1677, + "step": 9413 + }, + { + "epoch": 6.065753424657534, + "grad_norm": 1.887517624509885, + "learning_rate": 9.646342490524051e-05, + "loss": 2.2044, + "step": 9414 + }, + { + "epoch": 6.066398066075745, + "grad_norm": 2.32327631174108, + "learning_rate": 9.646268022801939e-05, + "loss": 2.2693, + "step": 9415 + }, + { + "epoch": 6.067042707493957, + "grad_norm": 2.3267224331326952, + "learning_rate": 9.646193547531034e-05, + "loss": 2.2292, + "step": 9416 + }, + { + "epoch": 6.067687348912168, + "grad_norm": 1.9655427297105201, + "learning_rate": 9.646119064711456e-05, + "loss": 2.2216, + "step": 9417 + }, + { + "epoch": 6.068331990330378, + "grad_norm": 2.2636905699487553, + "learning_rate": 9.646044574343332e-05, + "loss": 2.3739, + "step": 9418 + }, + { + "epoch": 6.06897663174859, + "grad_norm": 2.065845859364558, + "learning_rate": 9.645970076426779e-05, + "loss": 2.189, + "step": 9419 + }, + { + "epoch": 6.069621273166801, + "grad_norm": 2.010979619715805, + "learning_rate": 9.645895570961922e-05, + "loss": 2.3876, + "step": 9420 + }, + { + "epoch": 6.070265914585012, + "grad_norm": 2.2061162110739967, + "learning_rate": 9.645821057948883e-05, + "loss": 2.0624, + "step": 9421 + }, + { + "epoch": 6.0709105560032235, + "grad_norm": 2.058443169449981, + "learning_rate": 9.645746537387784e-05, + "loss": 2.231, + "step": 9422 + }, + { + "epoch": 6.071555197421434, + "grad_norm": 2.072141144350254, + "learning_rate": 9.645672009278748e-05, + "loss": 2.3223, + "step": 9423 + }, + { + "epoch": 6.072199838839645, + "grad_norm": 1.941026928169514, + "learning_rate": 9.645597473621896e-05, + "loss": 2.0806, + "step": 9424 + }, + { + "epoch": 6.0728444802578565, + "grad_norm": 1.7518803633834703, + "learning_rate": 9.645522930417355e-05, + "loss": 1.9923, + "step": 9425 + }, + { + "epoch": 6.073489121676068, + "grad_norm": 2.0303235149066117, + "learning_rate": 9.645448379665239e-05, + "loss": 2.166, + "step": 9426 + }, + { + "epoch": 6.074133763094279, + "grad_norm": 1.794946766926081, + "learning_rate": 9.645373821365677e-05, + "loss": 1.9376, + "step": 9427 + }, + { + "epoch": 6.0747784045124895, + "grad_norm": 1.74901860395381, + "learning_rate": 9.645299255518792e-05, + "loss": 2.0884, + "step": 9428 + }, + { + "epoch": 6.075423045930701, + "grad_norm": 1.8633136991309343, + "learning_rate": 9.645224682124703e-05, + "loss": 2.1635, + "step": 9429 + }, + { + "epoch": 6.076067687348912, + "grad_norm": 2.0294598810902635, + "learning_rate": 9.645150101183532e-05, + "loss": 2.3215, + "step": 9430 + }, + { + "epoch": 6.076712328767123, + "grad_norm": 1.7710214214278028, + "learning_rate": 9.645075512695405e-05, + "loss": 2.2511, + "step": 9431 + }, + { + "epoch": 6.077356970185335, + "grad_norm": 1.7938631464228585, + "learning_rate": 9.645000916660443e-05, + "loss": 2.2911, + "step": 9432 + }, + { + "epoch": 6.078001611603545, + "grad_norm": 1.7436658238853313, + "learning_rate": 9.644926313078766e-05, + "loss": 2.0399, + "step": 9433 + }, + { + "epoch": 6.078646253021756, + "grad_norm": 1.7221208020555865, + "learning_rate": 9.644851701950502e-05, + "loss": 2.014, + "step": 9434 + }, + { + "epoch": 6.079290894439968, + "grad_norm": 1.8036034656407927, + "learning_rate": 9.644777083275768e-05, + "loss": 2.3233, + "step": 9435 + }, + { + "epoch": 6.079935535858179, + "grad_norm": 1.6618580667579983, + "learning_rate": 9.64470245705469e-05, + "loss": 2.1073, + "step": 9436 + }, + { + "epoch": 6.08058017727639, + "grad_norm": 2.0844963750983556, + "learning_rate": 9.644627823287389e-05, + "loss": 2.1969, + "step": 9437 + }, + { + "epoch": 6.081224818694601, + "grad_norm": 1.8573383595691266, + "learning_rate": 9.644553181973987e-05, + "loss": 2.121, + "step": 9438 + }, + { + "epoch": 6.081869460112812, + "grad_norm": 1.8805780652335138, + "learning_rate": 9.644478533114609e-05, + "loss": 2.0948, + "step": 9439 + }, + { + "epoch": 6.082514101531023, + "grad_norm": 1.7009750343589982, + "learning_rate": 9.644403876709377e-05, + "loss": 2.0846, + "step": 9440 + }, + { + "epoch": 6.083158742949235, + "grad_norm": 1.9991536288844554, + "learning_rate": 9.644329212758411e-05, + "loss": 2.0315, + "step": 9441 + }, + { + "epoch": 6.083803384367446, + "grad_norm": 1.9022982499498227, + "learning_rate": 9.644254541261837e-05, + "loss": 2.1032, + "step": 9442 + }, + { + "epoch": 6.084448025785656, + "grad_norm": 1.8594031464676966, + "learning_rate": 9.644179862219774e-05, + "loss": 2.091, + "step": 9443 + }, + { + "epoch": 6.085092667203868, + "grad_norm": 1.8674820449430531, + "learning_rate": 9.644105175632349e-05, + "loss": 1.9924, + "step": 9444 + }, + { + "epoch": 6.085737308622079, + "grad_norm": 1.8498532812013448, + "learning_rate": 9.64403048149968e-05, + "loss": 2.0818, + "step": 9445 + }, + { + "epoch": 6.08638195004029, + "grad_norm": 1.8285346670830958, + "learning_rate": 9.643955779821894e-05, + "loss": 2.0141, + "step": 9446 + }, + { + "epoch": 6.087026591458502, + "grad_norm": 1.850861161077166, + "learning_rate": 9.643881070599111e-05, + "loss": 2.2074, + "step": 9447 + }, + { + "epoch": 6.087671232876712, + "grad_norm": 1.7591710554834956, + "learning_rate": 9.643806353831456e-05, + "loss": 2.0984, + "step": 9448 + }, + { + "epoch": 6.088315874294923, + "grad_norm": 1.7852890855610188, + "learning_rate": 9.643731629519048e-05, + "loss": 2.1522, + "step": 9449 + }, + { + "epoch": 6.088960515713135, + "grad_norm": 1.6050889250310654, + "learning_rate": 9.643656897662013e-05, + "loss": 2.0371, + "step": 9450 + }, + { + "epoch": 6.089605157131346, + "grad_norm": 1.9023761735844698, + "learning_rate": 9.643582158260472e-05, + "loss": 2.3233, + "step": 9451 + }, + { + "epoch": 6.090249798549557, + "grad_norm": 1.975109466690356, + "learning_rate": 9.643507411314548e-05, + "loss": 2.4611, + "step": 9452 + }, + { + "epoch": 6.090894439967768, + "grad_norm": 1.7071757912798884, + "learning_rate": 9.643432656824366e-05, + "loss": 2.0754, + "step": 9453 + }, + { + "epoch": 6.091539081385979, + "grad_norm": 1.7718669646425123, + "learning_rate": 9.643357894790046e-05, + "loss": 2.1063, + "step": 9454 + }, + { + "epoch": 6.09218372280419, + "grad_norm": 1.906110315278373, + "learning_rate": 9.643283125211711e-05, + "loss": 2.1546, + "step": 9455 + }, + { + "epoch": 6.0928283642224015, + "grad_norm": 1.7791606220258678, + "learning_rate": 9.643208348089485e-05, + "loss": 2.3037, + "step": 9456 + }, + { + "epoch": 6.093473005640613, + "grad_norm": 2.0804762259536056, + "learning_rate": 9.643133563423488e-05, + "loss": 1.997, + "step": 9457 + }, + { + "epoch": 6.094117647058823, + "grad_norm": 2.0399996615750817, + "learning_rate": 9.64305877121385e-05, + "loss": 2.0926, + "step": 9458 + }, + { + "epoch": 6.0947622884770345, + "grad_norm": 1.9098346464208802, + "learning_rate": 9.642983971460686e-05, + "loss": 2.2847, + "step": 9459 + }, + { + "epoch": 6.095406929895246, + "grad_norm": 2.089774431456851, + "learning_rate": 9.642909164164122e-05, + "loss": 2.3363, + "step": 9460 + }, + { + "epoch": 6.096051571313457, + "grad_norm": 1.9083517318594938, + "learning_rate": 9.642834349324281e-05, + "loss": 2.3224, + "step": 9461 + }, + { + "epoch": 6.096696212731668, + "grad_norm": 2.120349237074274, + "learning_rate": 9.642759526941285e-05, + "loss": 2.2219, + "step": 9462 + }, + { + "epoch": 6.097340854149879, + "grad_norm": 1.8408036828645848, + "learning_rate": 9.642684697015258e-05, + "loss": 2.2881, + "step": 9463 + }, + { + "epoch": 6.09798549556809, + "grad_norm": 2.324545466396152, + "learning_rate": 9.642609859546322e-05, + "loss": 2.3891, + "step": 9464 + }, + { + "epoch": 6.098630136986301, + "grad_norm": 1.840366394833409, + "learning_rate": 9.642535014534601e-05, + "loss": 2.3063, + "step": 9465 + }, + { + "epoch": 6.099274778404513, + "grad_norm": 1.908262266134166, + "learning_rate": 9.642460161980218e-05, + "loss": 2.2082, + "step": 9466 + }, + { + "epoch": 6.099919419822724, + "grad_norm": 2.156473385012173, + "learning_rate": 9.642385301883292e-05, + "loss": 2.0798, + "step": 9467 + }, + { + "epoch": 6.100564061240934, + "grad_norm": 1.9921257317089018, + "learning_rate": 9.642310434243952e-05, + "loss": 2.057, + "step": 9468 + }, + { + "epoch": 6.101208702659146, + "grad_norm": 2.055234763352899, + "learning_rate": 9.642235559062317e-05, + "loss": 2.169, + "step": 9469 + }, + { + "epoch": 6.101853344077357, + "grad_norm": 1.9176133874937165, + "learning_rate": 9.642160676338512e-05, + "loss": 2.3242, + "step": 9470 + }, + { + "epoch": 6.102497985495568, + "grad_norm": 1.8757102900108529, + "learning_rate": 9.642085786072656e-05, + "loss": 2.3488, + "step": 9471 + }, + { + "epoch": 6.10314262691378, + "grad_norm": 1.8847770749360673, + "learning_rate": 9.642010888264879e-05, + "loss": 2.0729, + "step": 9472 + }, + { + "epoch": 6.10378726833199, + "grad_norm": 1.7334428134492557, + "learning_rate": 9.641935982915296e-05, + "loss": 1.9642, + "step": 9473 + }, + { + "epoch": 6.104431909750201, + "grad_norm": 2.0456726104300333, + "learning_rate": 9.641861070024038e-05, + "loss": 2.1428, + "step": 9474 + }, + { + "epoch": 6.105076551168413, + "grad_norm": 1.9955187964651246, + "learning_rate": 9.641786149591221e-05, + "loss": 2.1684, + "step": 9475 + }, + { + "epoch": 6.105721192586624, + "grad_norm": 1.9633803694553509, + "learning_rate": 9.641711221616974e-05, + "loss": 2.2688, + "step": 9476 + }, + { + "epoch": 6.106365834004835, + "grad_norm": 1.9705512583322928, + "learning_rate": 9.641636286101414e-05, + "loss": 2.1135, + "step": 9477 + }, + { + "epoch": 6.107010475423046, + "grad_norm": 1.8835484059940022, + "learning_rate": 9.641561343044669e-05, + "loss": 2.2951, + "step": 9478 + }, + { + "epoch": 6.107655116841257, + "grad_norm": 6.992478437356377, + "learning_rate": 9.641486392446859e-05, + "loss": 2.1735, + "step": 9479 + }, + { + "epoch": 6.108299758259468, + "grad_norm": 1.9959567845667747, + "learning_rate": 9.641411434308111e-05, + "loss": 2.2067, + "step": 9480 + }, + { + "epoch": 6.1089443996776795, + "grad_norm": 1.6813320798140097, + "learning_rate": 9.641336468628543e-05, + "loss": 2.204, + "step": 9481 + }, + { + "epoch": 6.109589041095891, + "grad_norm": 1.965619079255091, + "learning_rate": 9.64126149540828e-05, + "loss": 2.29, + "step": 9482 + }, + { + "epoch": 6.110233682514101, + "grad_norm": 2.1829757711612854, + "learning_rate": 9.641186514647449e-05, + "loss": 1.6962, + "step": 9483 + }, + { + "epoch": 6.110878323932313, + "grad_norm": 1.8808478847952341, + "learning_rate": 9.641111526346168e-05, + "loss": 2.2269, + "step": 9484 + }, + { + "epoch": 6.111522965350524, + "grad_norm": 2.172688162975914, + "learning_rate": 9.641036530504562e-05, + "loss": 2.2957, + "step": 9485 + }, + { + "epoch": 6.112167606768735, + "grad_norm": 1.9302008097037338, + "learning_rate": 9.640961527122755e-05, + "loss": 2.044, + "step": 9486 + }, + { + "epoch": 6.1128122481869465, + "grad_norm": 2.4626510888062754, + "learning_rate": 9.640886516200869e-05, + "loss": 2.1413, + "step": 9487 + }, + { + "epoch": 6.113456889605157, + "grad_norm": 1.965627802604761, + "learning_rate": 9.640811497739027e-05, + "loss": 2.2388, + "step": 9488 + }, + { + "epoch": 6.114101531023368, + "grad_norm": 2.5365003596775204, + "learning_rate": 9.640736471737354e-05, + "loss": 2.2277, + "step": 9489 + }, + { + "epoch": 6.1147461724415795, + "grad_norm": 2.163668516933174, + "learning_rate": 9.640661438195972e-05, + "loss": 2.0782, + "step": 9490 + }, + { + "epoch": 6.115390813859791, + "grad_norm": 2.2115345196935574, + "learning_rate": 9.640586397115003e-05, + "loss": 2.3075, + "step": 9491 + }, + { + "epoch": 6.116035455278001, + "grad_norm": 2.6458061994749666, + "learning_rate": 9.640511348494573e-05, + "loss": 2.0045, + "step": 9492 + }, + { + "epoch": 6.1166800966962125, + "grad_norm": 2.167020779442053, + "learning_rate": 9.640436292334804e-05, + "loss": 1.9453, + "step": 9493 + }, + { + "epoch": 6.117324738114424, + "grad_norm": 2.2844744359499067, + "learning_rate": 9.640361228635818e-05, + "loss": 2.256, + "step": 9494 + }, + { + "epoch": 6.117969379532635, + "grad_norm": 1.9175294683651924, + "learning_rate": 9.64028615739774e-05, + "loss": 2.4711, + "step": 9495 + }, + { + "epoch": 6.118614020950846, + "grad_norm": 2.113930115917052, + "learning_rate": 9.640211078620691e-05, + "loss": 2.102, + "step": 9496 + }, + { + "epoch": 6.119258662369057, + "grad_norm": 1.8939610457950031, + "learning_rate": 9.640135992304799e-05, + "loss": 2.2565, + "step": 9497 + }, + { + "epoch": 6.119903303787268, + "grad_norm": 2.0906035535311083, + "learning_rate": 9.640060898450182e-05, + "loss": 2.2693, + "step": 9498 + }, + { + "epoch": 6.120547945205479, + "grad_norm": 1.7910673525496152, + "learning_rate": 9.639985797056967e-05, + "loss": 2.3567, + "step": 9499 + }, + { + "epoch": 6.121192586623691, + "grad_norm": 2.010314749768041, + "learning_rate": 9.639910688125275e-05, + "loss": 2.0595, + "step": 9500 + }, + { + "epoch": 6.121192586623691, + "eval_loss": 5.108989238739014, + "eval_runtime": 2.9786, + "eval_samples_per_second": 33.573, + "eval_steps_per_second": 4.364, + "step": 9500 + }, + { + "epoch": 6.121837228041902, + "grad_norm": 1.7649586155414063, + "learning_rate": 9.639835571655229e-05, + "loss": 2.2129, + "step": 9501 + }, + { + "epoch": 6.122481869460112, + "grad_norm": 2.080779214408741, + "learning_rate": 9.639760447646956e-05, + "loss": 2.4119, + "step": 9502 + }, + { + "epoch": 6.123126510878324, + "grad_norm": 1.8371260421277709, + "learning_rate": 9.639685316100577e-05, + "loss": 1.9325, + "step": 9503 + }, + { + "epoch": 6.123771152296535, + "grad_norm": 1.9184861053700113, + "learning_rate": 9.639610177016214e-05, + "loss": 2.2939, + "step": 9504 + }, + { + "epoch": 6.124415793714746, + "grad_norm": 1.8069490264490267, + "learning_rate": 9.639535030393992e-05, + "loss": 2.2458, + "step": 9505 + }, + { + "epoch": 6.125060435132958, + "grad_norm": 1.801971253504582, + "learning_rate": 9.639459876234035e-05, + "loss": 2.0744, + "step": 9506 + }, + { + "epoch": 6.125705076551168, + "grad_norm": 1.8209782389741394, + "learning_rate": 9.639384714536465e-05, + "loss": 2.1757, + "step": 9507 + }, + { + "epoch": 6.126349717969379, + "grad_norm": 2.0032469597883313, + "learning_rate": 9.639309545301406e-05, + "loss": 2.3297, + "step": 9508 + }, + { + "epoch": 6.126994359387591, + "grad_norm": 1.839753169572518, + "learning_rate": 9.639234368528983e-05, + "loss": 2.1398, + "step": 9509 + }, + { + "epoch": 6.127639000805802, + "grad_norm": 1.7070507181251753, + "learning_rate": 9.639159184219318e-05, + "loss": 1.9543, + "step": 9510 + }, + { + "epoch": 6.128283642224013, + "grad_norm": 1.9773816361254677, + "learning_rate": 9.639083992372533e-05, + "loss": 2.1945, + "step": 9511 + }, + { + "epoch": 6.128928283642224, + "grad_norm": 1.7454056023557862, + "learning_rate": 9.639008792988755e-05, + "loss": 2.2186, + "step": 9512 + }, + { + "epoch": 6.129572925060435, + "grad_norm": 1.9032373009461934, + "learning_rate": 9.638933586068104e-05, + "loss": 2.2225, + "step": 9513 + }, + { + "epoch": 6.130217566478646, + "grad_norm": 1.8202674557855458, + "learning_rate": 9.638858371610705e-05, + "loss": 2.1676, + "step": 9514 + }, + { + "epoch": 6.1308622078968575, + "grad_norm": 1.7495234452722312, + "learning_rate": 9.638783149616684e-05, + "loss": 2.2231, + "step": 9515 + }, + { + "epoch": 6.131506849315069, + "grad_norm": 1.7490332920181217, + "learning_rate": 9.638707920086159e-05, + "loss": 2.3444, + "step": 9516 + }, + { + "epoch": 6.132151490733279, + "grad_norm": 1.7158435645969166, + "learning_rate": 9.638632683019258e-05, + "loss": 2.0971, + "step": 9517 + }, + { + "epoch": 6.1327961321514906, + "grad_norm": 1.5738546139053624, + "learning_rate": 9.638557438416105e-05, + "loss": 1.8066, + "step": 9518 + }, + { + "epoch": 6.133440773569702, + "grad_norm": 1.7199176598356145, + "learning_rate": 9.63848218627682e-05, + "loss": 2.1342, + "step": 9519 + }, + { + "epoch": 6.134085414987913, + "grad_norm": 1.7068199248591294, + "learning_rate": 9.638406926601529e-05, + "loss": 1.9827, + "step": 9520 + }, + { + "epoch": 6.1347300564061245, + "grad_norm": 1.5737692848616953, + "learning_rate": 9.638331659390353e-05, + "loss": 2.0318, + "step": 9521 + }, + { + "epoch": 6.135374697824335, + "grad_norm": 1.7057947142707557, + "learning_rate": 9.638256384643421e-05, + "loss": 2.4459, + "step": 9522 + }, + { + "epoch": 6.136019339242546, + "grad_norm": 1.7749322082842387, + "learning_rate": 9.638181102360851e-05, + "loss": 2.0955, + "step": 9523 + }, + { + "epoch": 6.1366639806607575, + "grad_norm": 1.6735995175337184, + "learning_rate": 9.63810581254277e-05, + "loss": 2.2317, + "step": 9524 + }, + { + "epoch": 6.137308622078969, + "grad_norm": 1.7207155735101687, + "learning_rate": 9.638030515189301e-05, + "loss": 2.2987, + "step": 9525 + }, + { + "epoch": 6.13795326349718, + "grad_norm": 1.7328196786615442, + "learning_rate": 9.637955210300567e-05, + "loss": 2.2749, + "step": 9526 + }, + { + "epoch": 6.1385979049153905, + "grad_norm": 1.9071766658921085, + "learning_rate": 9.63787989787669e-05, + "loss": 2.1249, + "step": 9527 + }, + { + "epoch": 6.139242546333602, + "grad_norm": 1.5413087672488006, + "learning_rate": 9.637804577917798e-05, + "loss": 2.1587, + "step": 9528 + }, + { + "epoch": 6.139887187751813, + "grad_norm": 1.6258791075502042, + "learning_rate": 9.637729250424011e-05, + "loss": 2.1408, + "step": 9529 + }, + { + "epoch": 6.140531829170024, + "grad_norm": 1.602040893352149, + "learning_rate": 9.637653915395454e-05, + "loss": 1.9507, + "step": 9530 + }, + { + "epoch": 6.141176470588236, + "grad_norm": 1.9130656656828022, + "learning_rate": 9.637578572832252e-05, + "loss": 2.0705, + "step": 9531 + }, + { + "epoch": 6.141821112006446, + "grad_norm": 1.712292207365965, + "learning_rate": 9.637503222734526e-05, + "loss": 2.059, + "step": 9532 + }, + { + "epoch": 6.142465753424657, + "grad_norm": 1.8637627794121485, + "learning_rate": 9.637427865102403e-05, + "loss": 2.2658, + "step": 9533 + }, + { + "epoch": 6.143110394842869, + "grad_norm": 1.8562897817599269, + "learning_rate": 9.637352499936005e-05, + "loss": 2.0725, + "step": 9534 + }, + { + "epoch": 6.14375503626108, + "grad_norm": 1.9370160773095788, + "learning_rate": 9.637277127235454e-05, + "loss": 2.1459, + "step": 9535 + }, + { + "epoch": 6.144399677679291, + "grad_norm": 1.8268320295645346, + "learning_rate": 9.637201747000878e-05, + "loss": 2.3287, + "step": 9536 + }, + { + "epoch": 6.145044319097502, + "grad_norm": 1.8502726321415224, + "learning_rate": 9.637126359232397e-05, + "loss": 2.3088, + "step": 9537 + }, + { + "epoch": 6.145688960515713, + "grad_norm": 1.7163948564699545, + "learning_rate": 9.637050963930134e-05, + "loss": 1.9994, + "step": 9538 + }, + { + "epoch": 6.146333601933924, + "grad_norm": 1.7819707764025816, + "learning_rate": 9.63697556109422e-05, + "loss": 2.3465, + "step": 9539 + }, + { + "epoch": 6.146978243352136, + "grad_norm": 1.920269411050331, + "learning_rate": 9.636900150724771e-05, + "loss": 2.193, + "step": 9540 + }, + { + "epoch": 6.147622884770347, + "grad_norm": 1.7476020476180063, + "learning_rate": 9.636824732821913e-05, + "loss": 2.3313, + "step": 9541 + }, + { + "epoch": 6.148267526188557, + "grad_norm": 1.8253333818838837, + "learning_rate": 9.636749307385771e-05, + "loss": 2.2485, + "step": 9542 + }, + { + "epoch": 6.148912167606769, + "grad_norm": 1.730978841400496, + "learning_rate": 9.636673874416468e-05, + "loss": 2.3719, + "step": 9543 + }, + { + "epoch": 6.14955680902498, + "grad_norm": 1.8659718570546406, + "learning_rate": 9.63659843391413e-05, + "loss": 2.4186, + "step": 9544 + }, + { + "epoch": 6.150201450443191, + "grad_norm": 1.7776056788180405, + "learning_rate": 9.636522985878878e-05, + "loss": 2.2317, + "step": 9545 + }, + { + "epoch": 6.1508460918614025, + "grad_norm": 1.8567546747630816, + "learning_rate": 9.63644753031084e-05, + "loss": 2.0028, + "step": 9546 + }, + { + "epoch": 6.151490733279613, + "grad_norm": 1.8891097117429547, + "learning_rate": 9.636372067210134e-05, + "loss": 2.0246, + "step": 9547 + }, + { + "epoch": 6.152135374697824, + "grad_norm": 1.616527220516261, + "learning_rate": 9.636296596576888e-05, + "loss": 2.2051, + "step": 9548 + }, + { + "epoch": 6.1527800161160355, + "grad_norm": 1.9181094573719129, + "learning_rate": 9.636221118411227e-05, + "loss": 2.3736, + "step": 9549 + }, + { + "epoch": 6.153424657534247, + "grad_norm": 1.737090175212225, + "learning_rate": 9.636145632713268e-05, + "loss": 2.0994, + "step": 9550 + }, + { + "epoch": 6.154069298952457, + "grad_norm": 1.7758791951301733, + "learning_rate": 9.636070139483145e-05, + "loss": 2.1024, + "step": 9551 + }, + { + "epoch": 6.1547139403706685, + "grad_norm": 1.717184367150368, + "learning_rate": 9.635994638720975e-05, + "loss": 2.3496, + "step": 9552 + }, + { + "epoch": 6.15535858178888, + "grad_norm": 1.7271314082803282, + "learning_rate": 9.635919130426883e-05, + "loss": 2.1331, + "step": 9553 + }, + { + "epoch": 6.156003223207091, + "grad_norm": 1.9634445988765181, + "learning_rate": 9.635843614600995e-05, + "loss": 2.3876, + "step": 9554 + }, + { + "epoch": 6.1566478646253024, + "grad_norm": 1.8819725509946168, + "learning_rate": 9.635768091243433e-05, + "loss": 2.4578, + "step": 9555 + }, + { + "epoch": 6.157292506043513, + "grad_norm": 2.0355259506167633, + "learning_rate": 9.635692560354323e-05, + "loss": 2.217, + "step": 9556 + }, + { + "epoch": 6.157937147461724, + "grad_norm": 1.89603597861276, + "learning_rate": 9.635617021933788e-05, + "loss": 2.4713, + "step": 9557 + }, + { + "epoch": 6.1585817888799355, + "grad_norm": 2.2666173110623626, + "learning_rate": 9.635541475981952e-05, + "loss": 2.4507, + "step": 9558 + }, + { + "epoch": 6.159226430298147, + "grad_norm": 2.180486668654438, + "learning_rate": 9.63546592249894e-05, + "loss": 2.2821, + "step": 9559 + }, + { + "epoch": 6.159871071716358, + "grad_norm": 2.236112245927566, + "learning_rate": 9.635390361484874e-05, + "loss": 2.2327, + "step": 9560 + }, + { + "epoch": 6.1605157131345685, + "grad_norm": 1.987674048458093, + "learning_rate": 9.63531479293988e-05, + "loss": 2.3667, + "step": 9561 + }, + { + "epoch": 6.16116035455278, + "grad_norm": 2.5486262004500633, + "learning_rate": 9.63523921686408e-05, + "loss": 2.3627, + "step": 9562 + }, + { + "epoch": 6.161804995970991, + "grad_norm": 1.8246267256752025, + "learning_rate": 9.635163633257601e-05, + "loss": 2.343, + "step": 9563 + }, + { + "epoch": 6.162449637389202, + "grad_norm": 2.296113097789829, + "learning_rate": 9.635088042120566e-05, + "loss": 2.2169, + "step": 9564 + }, + { + "epoch": 6.163094278807414, + "grad_norm": 1.8947030113303305, + "learning_rate": 9.6350124434531e-05, + "loss": 2.3355, + "step": 9565 + }, + { + "epoch": 6.163738920225624, + "grad_norm": 2.220562877406976, + "learning_rate": 9.634936837255324e-05, + "loss": 2.3747, + "step": 9566 + }, + { + "epoch": 6.164383561643835, + "grad_norm": 2.16114411346823, + "learning_rate": 9.634861223527365e-05, + "loss": 2.4018, + "step": 9567 + }, + { + "epoch": 6.165028203062047, + "grad_norm": 1.990277876135404, + "learning_rate": 9.634785602269346e-05, + "loss": 2.2275, + "step": 9568 + }, + { + "epoch": 6.165672844480258, + "grad_norm": 2.1038404297397295, + "learning_rate": 9.634709973481392e-05, + "loss": 2.3294, + "step": 9569 + }, + { + "epoch": 6.166317485898469, + "grad_norm": 1.9159324506672195, + "learning_rate": 9.634634337163626e-05, + "loss": 2.0904, + "step": 9570 + }, + { + "epoch": 6.16696212731668, + "grad_norm": 2.1292894061020293, + "learning_rate": 9.634558693316174e-05, + "loss": 2.3031, + "step": 9571 + }, + { + "epoch": 6.167606768734891, + "grad_norm": 2.037233397716541, + "learning_rate": 9.63448304193916e-05, + "loss": 1.9274, + "step": 9572 + }, + { + "epoch": 6.168251410153102, + "grad_norm": 1.8789103550376247, + "learning_rate": 9.634407383032706e-05, + "loss": 2.0718, + "step": 9573 + }, + { + "epoch": 6.168896051571314, + "grad_norm": 1.9562323682590614, + "learning_rate": 9.634331716596939e-05, + "loss": 2.3848, + "step": 9574 + }, + { + "epoch": 6.169540692989525, + "grad_norm": 1.9705073260458184, + "learning_rate": 9.634256042631982e-05, + "loss": 2.1567, + "step": 9575 + }, + { + "epoch": 6.170185334407735, + "grad_norm": 1.838913420296094, + "learning_rate": 9.63418036113796e-05, + "loss": 2.0903, + "step": 9576 + }, + { + "epoch": 6.170829975825947, + "grad_norm": 2.1088229499575557, + "learning_rate": 9.634104672114994e-05, + "loss": 2.27, + "step": 9577 + }, + { + "epoch": 6.171474617244158, + "grad_norm": 1.8926300898178483, + "learning_rate": 9.634028975563212e-05, + "loss": 2.2313, + "step": 9578 + }, + { + "epoch": 6.172119258662369, + "grad_norm": 1.837214650749672, + "learning_rate": 9.633953271482738e-05, + "loss": 2.2672, + "step": 9579 + }, + { + "epoch": 6.1727639000805805, + "grad_norm": 1.9851058601726426, + "learning_rate": 9.633877559873695e-05, + "loss": 2.4026, + "step": 9580 + }, + { + "epoch": 6.173408541498791, + "grad_norm": 1.9367885933787015, + "learning_rate": 9.63380184073621e-05, + "loss": 2.2985, + "step": 9581 + }, + { + "epoch": 6.174053182917002, + "grad_norm": 1.8473789312348872, + "learning_rate": 9.633726114070403e-05, + "loss": 2.1733, + "step": 9582 + }, + { + "epoch": 6.1746978243352135, + "grad_norm": 1.9631248494759954, + "learning_rate": 9.633650379876401e-05, + "loss": 2.1219, + "step": 9583 + }, + { + "epoch": 6.175342465753425, + "grad_norm": 1.7806432693327523, + "learning_rate": 9.633574638154329e-05, + "loss": 2.2569, + "step": 9584 + }, + { + "epoch": 6.175987107171636, + "grad_norm": 1.9930246767964777, + "learning_rate": 9.63349888890431e-05, + "loss": 2.3006, + "step": 9585 + }, + { + "epoch": 6.1766317485898465, + "grad_norm": 1.9857216279201755, + "learning_rate": 9.63342313212647e-05, + "loss": 2.4255, + "step": 9586 + }, + { + "epoch": 6.177276390008058, + "grad_norm": 2.1176145912666113, + "learning_rate": 9.63334736782093e-05, + "loss": 2.4716, + "step": 9587 + }, + { + "epoch": 6.177921031426269, + "grad_norm": 2.093004926545416, + "learning_rate": 9.633271595987819e-05, + "loss": 2.1927, + "step": 9588 + }, + { + "epoch": 6.17856567284448, + "grad_norm": 1.8523470254553116, + "learning_rate": 9.633195816627258e-05, + "loss": 2.2815, + "step": 9589 + }, + { + "epoch": 6.179210314262692, + "grad_norm": 2.182113210262553, + "learning_rate": 9.633120029739373e-05, + "loss": 2.5866, + "step": 9590 + }, + { + "epoch": 6.179854955680902, + "grad_norm": 1.8348489156363312, + "learning_rate": 9.633044235324287e-05, + "loss": 2.1213, + "step": 9591 + }, + { + "epoch": 6.1804995970991135, + "grad_norm": 1.9012034368277533, + "learning_rate": 9.632968433382126e-05, + "loss": 2.3911, + "step": 9592 + }, + { + "epoch": 6.181144238517325, + "grad_norm": 1.9420169473858775, + "learning_rate": 9.632892623913016e-05, + "loss": 2.2404, + "step": 9593 + }, + { + "epoch": 6.181788879935536, + "grad_norm": 2.0418338433084, + "learning_rate": 9.632816806917078e-05, + "loss": 2.6208, + "step": 9594 + }, + { + "epoch": 6.182433521353747, + "grad_norm": 1.97578287380582, + "learning_rate": 9.632740982394437e-05, + "loss": 2.2054, + "step": 9595 + }, + { + "epoch": 6.183078162771958, + "grad_norm": 2.1764778863600975, + "learning_rate": 9.632665150345222e-05, + "loss": 2.3676, + "step": 9596 + }, + { + "epoch": 6.183722804190169, + "grad_norm": 2.052165857712557, + "learning_rate": 9.63258931076955e-05, + "loss": 2.3958, + "step": 9597 + }, + { + "epoch": 6.18436744560838, + "grad_norm": 2.0796461468585203, + "learning_rate": 9.632513463667553e-05, + "loss": 2.5083, + "step": 9598 + }, + { + "epoch": 6.185012087026592, + "grad_norm": 1.9701165375237748, + "learning_rate": 9.632437609039352e-05, + "loss": 2.3761, + "step": 9599 + }, + { + "epoch": 6.185656728444803, + "grad_norm": 1.885549706396662, + "learning_rate": 9.632361746885071e-05, + "loss": 2.2125, + "step": 9600 + }, + { + "epoch": 6.185656728444803, + "eval_loss": 5.1075758934021, + "eval_runtime": 2.974, + "eval_samples_per_second": 33.624, + "eval_steps_per_second": 4.371, + "step": 9600 + }, + { + "epoch": 6.186301369863013, + "grad_norm": 2.1778621485691625, + "learning_rate": 9.632285877204835e-05, + "loss": 2.1205, + "step": 9601 + }, + { + "epoch": 6.186946011281225, + "grad_norm": 1.8315661023216188, + "learning_rate": 9.632209999998772e-05, + "loss": 2.3484, + "step": 9602 + }, + { + "epoch": 6.187590652699436, + "grad_norm": 1.9869036832006461, + "learning_rate": 9.632134115267001e-05, + "loss": 2.3991, + "step": 9603 + }, + { + "epoch": 6.188235294117647, + "grad_norm": 1.7496381334386755, + "learning_rate": 9.632058223009649e-05, + "loss": 2.2455, + "step": 9604 + }, + { + "epoch": 6.188879935535859, + "grad_norm": 1.9977480751457966, + "learning_rate": 9.631982323226843e-05, + "loss": 2.3431, + "step": 9605 + }, + { + "epoch": 6.189524576954069, + "grad_norm": 1.9698554989074188, + "learning_rate": 9.631906415918705e-05, + "loss": 2.1884, + "step": 9606 + }, + { + "epoch": 6.19016921837228, + "grad_norm": 2.059052749313299, + "learning_rate": 9.631830501085358e-05, + "loss": 2.165, + "step": 9607 + }, + { + "epoch": 6.190813859790492, + "grad_norm": 1.9709304189408718, + "learning_rate": 9.631754578726931e-05, + "loss": 2.2209, + "step": 9608 + }, + { + "epoch": 6.191458501208703, + "grad_norm": 1.8838475365555183, + "learning_rate": 9.631678648843547e-05, + "loss": 2.3058, + "step": 9609 + }, + { + "epoch": 6.192103142626914, + "grad_norm": 1.6728269554719677, + "learning_rate": 9.63160271143533e-05, + "loss": 2.1215, + "step": 9610 + }, + { + "epoch": 6.192747784045125, + "grad_norm": 2.194524111986871, + "learning_rate": 9.631526766502407e-05, + "loss": 2.2226, + "step": 9611 + }, + { + "epoch": 6.193392425463336, + "grad_norm": 2.027454170229102, + "learning_rate": 9.631450814044898e-05, + "loss": 2.234, + "step": 9612 + }, + { + "epoch": 6.194037066881547, + "grad_norm": 2.204453488618179, + "learning_rate": 9.631374854062933e-05, + "loss": 2.3545, + "step": 9613 + }, + { + "epoch": 6.1946817082997585, + "grad_norm": 2.0497665951503437, + "learning_rate": 9.631298886556632e-05, + "loss": 2.4181, + "step": 9614 + }, + { + "epoch": 6.19532634971797, + "grad_norm": 1.950906546888677, + "learning_rate": 9.631222911526124e-05, + "loss": 2.2721, + "step": 9615 + }, + { + "epoch": 6.19597099113618, + "grad_norm": 1.9909970506781642, + "learning_rate": 9.63114692897153e-05, + "loss": 2.1366, + "step": 9616 + }, + { + "epoch": 6.1966156325543915, + "grad_norm": 2.0081944276508557, + "learning_rate": 9.631070938892978e-05, + "loss": 2.2273, + "step": 9617 + }, + { + "epoch": 6.197260273972603, + "grad_norm": 2.182681953943495, + "learning_rate": 9.630994941290591e-05, + "loss": 2.2279, + "step": 9618 + }, + { + "epoch": 6.197904915390814, + "grad_norm": 1.962515973028805, + "learning_rate": 9.630918936164495e-05, + "loss": 2.2101, + "step": 9619 + }, + { + "epoch": 6.198549556809025, + "grad_norm": 1.7074019130620686, + "learning_rate": 9.630842923514813e-05, + "loss": 2.2228, + "step": 9620 + }, + { + "epoch": 6.199194198227236, + "grad_norm": 1.8578894893447155, + "learning_rate": 9.630766903341672e-05, + "loss": 2.178, + "step": 9621 + }, + { + "epoch": 6.199838839645447, + "grad_norm": 1.810240543685528, + "learning_rate": 9.630690875645193e-05, + "loss": 2.1324, + "step": 9622 + }, + { + "epoch": 6.200483481063658, + "grad_norm": 1.9625999948036164, + "learning_rate": 9.630614840425508e-05, + "loss": 2.4832, + "step": 9623 + }, + { + "epoch": 6.20112812248187, + "grad_norm": 1.8177642995148555, + "learning_rate": 9.630538797682736e-05, + "loss": 2.2683, + "step": 9624 + }, + { + "epoch": 6.201772763900081, + "grad_norm": 1.8700293595484025, + "learning_rate": 9.630462747417003e-05, + "loss": 2.2128, + "step": 9625 + }, + { + "epoch": 6.202417405318291, + "grad_norm": 1.7433179768937794, + "learning_rate": 9.630386689628436e-05, + "loss": 2.0048, + "step": 9626 + }, + { + "epoch": 6.203062046736503, + "grad_norm": 1.9005144552627773, + "learning_rate": 9.630310624317154e-05, + "loss": 2.0281, + "step": 9627 + }, + { + "epoch": 6.203706688154714, + "grad_norm": 1.7316757867431791, + "learning_rate": 9.63023455148329e-05, + "loss": 2.3186, + "step": 9628 + }, + { + "epoch": 6.204351329572925, + "grad_norm": 1.8252075613363834, + "learning_rate": 9.630158471126965e-05, + "loss": 2.1664, + "step": 9629 + }, + { + "epoch": 6.204995970991136, + "grad_norm": 1.8786427098033565, + "learning_rate": 9.630082383248302e-05, + "loss": 1.9102, + "step": 9630 + }, + { + "epoch": 6.205640612409347, + "grad_norm": 1.6099992332741326, + "learning_rate": 9.63000628784743e-05, + "loss": 2.2775, + "step": 9631 + }, + { + "epoch": 6.206285253827558, + "grad_norm": 1.9592894397561922, + "learning_rate": 9.629930184924472e-05, + "loss": 2.5885, + "step": 9632 + }, + { + "epoch": 6.20692989524577, + "grad_norm": 1.7544395521025589, + "learning_rate": 9.629854074479553e-05, + "loss": 2.1515, + "step": 9633 + }, + { + "epoch": 6.207574536663981, + "grad_norm": 1.8313194306944316, + "learning_rate": 9.629777956512796e-05, + "loss": 2.1197, + "step": 9634 + }, + { + "epoch": 6.208219178082191, + "grad_norm": 1.648140424318947, + "learning_rate": 9.629701831024331e-05, + "loss": 2.6115, + "step": 9635 + }, + { + "epoch": 6.208863819500403, + "grad_norm": 1.8832464620940592, + "learning_rate": 9.629625698014279e-05, + "loss": 2.076, + "step": 9636 + }, + { + "epoch": 6.209508460918614, + "grad_norm": 1.6384139624778133, + "learning_rate": 9.629549557482766e-05, + "loss": 2.1255, + "step": 9637 + }, + { + "epoch": 6.210153102336825, + "grad_norm": 1.8066090635242171, + "learning_rate": 9.629473409429917e-05, + "loss": 2.1806, + "step": 9638 + }, + { + "epoch": 6.210797743755037, + "grad_norm": 1.942829565796675, + "learning_rate": 9.629397253855856e-05, + "loss": 2.1391, + "step": 9639 + }, + { + "epoch": 6.211442385173247, + "grad_norm": 1.7115633043602834, + "learning_rate": 9.629321090760711e-05, + "loss": 2.3176, + "step": 9640 + }, + { + "epoch": 6.212087026591458, + "grad_norm": 1.8764225684880744, + "learning_rate": 9.629244920144605e-05, + "loss": 2.1241, + "step": 9641 + }, + { + "epoch": 6.21273166800967, + "grad_norm": 2.213078120731658, + "learning_rate": 9.629168742007663e-05, + "loss": 2.3844, + "step": 9642 + }, + { + "epoch": 6.213376309427881, + "grad_norm": 1.7054794586984507, + "learning_rate": 9.629092556350011e-05, + "loss": 2.3501, + "step": 9643 + }, + { + "epoch": 6.214020950846092, + "grad_norm": 1.8331931439342455, + "learning_rate": 9.629016363171774e-05, + "loss": 2.2017, + "step": 9644 + }, + { + "epoch": 6.214665592264303, + "grad_norm": 1.6931062847296867, + "learning_rate": 9.628940162473077e-05, + "loss": 2.2764, + "step": 9645 + }, + { + "epoch": 6.215310233682514, + "grad_norm": 1.675550998790943, + "learning_rate": 9.628863954254045e-05, + "loss": 2.0188, + "step": 9646 + }, + { + "epoch": 6.215954875100725, + "grad_norm": 1.7864337300252313, + "learning_rate": 9.628787738514802e-05, + "loss": 2.302, + "step": 9647 + }, + { + "epoch": 6.2165995165189365, + "grad_norm": 1.6650806880262452, + "learning_rate": 9.628711515255476e-05, + "loss": 2.3229, + "step": 9648 + }, + { + "epoch": 6.217244157937148, + "grad_norm": 1.8128304347855528, + "learning_rate": 9.62863528447619e-05, + "loss": 2.1154, + "step": 9649 + }, + { + "epoch": 6.217888799355358, + "grad_norm": 1.6932937884797281, + "learning_rate": 9.628559046177071e-05, + "loss": 2.1901, + "step": 9650 + }, + { + "epoch": 6.2185334407735695, + "grad_norm": 1.8762847753472138, + "learning_rate": 9.62848280035824e-05, + "loss": 2.1206, + "step": 9651 + }, + { + "epoch": 6.219178082191781, + "grad_norm": 1.94674851656481, + "learning_rate": 9.628406547019827e-05, + "loss": 2.3912, + "step": 9652 + }, + { + "epoch": 6.219822723609992, + "grad_norm": 1.8172889630629867, + "learning_rate": 9.628330286161954e-05, + "loss": 2.2469, + "step": 9653 + }, + { + "epoch": 6.220467365028203, + "grad_norm": 2.037487250006675, + "learning_rate": 9.628254017784749e-05, + "loss": 2.3354, + "step": 9654 + }, + { + "epoch": 6.221112006446414, + "grad_norm": 1.906260964898529, + "learning_rate": 9.628177741888337e-05, + "loss": 2.2259, + "step": 9655 + }, + { + "epoch": 6.221756647864625, + "grad_norm": 1.921823558337801, + "learning_rate": 9.62810145847284e-05, + "loss": 2.0241, + "step": 9656 + }, + { + "epoch": 6.222401289282836, + "grad_norm": 1.9083602437240113, + "learning_rate": 9.628025167538388e-05, + "loss": 2.6042, + "step": 9657 + }, + { + "epoch": 6.223045930701048, + "grad_norm": 2.0930702420155747, + "learning_rate": 9.627948869085101e-05, + "loss": 2.2936, + "step": 9658 + }, + { + "epoch": 6.223690572119259, + "grad_norm": 1.9465253922927617, + "learning_rate": 9.627872563113108e-05, + "loss": 2.2172, + "step": 9659 + }, + { + "epoch": 6.224335213537469, + "grad_norm": 2.0530723365677574, + "learning_rate": 9.627796249622535e-05, + "loss": 2.2218, + "step": 9660 + }, + { + "epoch": 6.224979854955681, + "grad_norm": 2.1860147675654016, + "learning_rate": 9.627719928613504e-05, + "loss": 2.4229, + "step": 9661 + }, + { + "epoch": 6.225624496373892, + "grad_norm": 1.8762891296912851, + "learning_rate": 9.627643600086143e-05, + "loss": 2.353, + "step": 9662 + }, + { + "epoch": 6.226269137792103, + "grad_norm": 1.9835450463631181, + "learning_rate": 9.627567264040576e-05, + "loss": 2.4487, + "step": 9663 + }, + { + "epoch": 6.226913779210315, + "grad_norm": 2.0533324185506805, + "learning_rate": 9.627490920476929e-05, + "loss": 2.1596, + "step": 9664 + }, + { + "epoch": 6.227558420628525, + "grad_norm": 2.006169675728871, + "learning_rate": 9.627414569395326e-05, + "loss": 2.1903, + "step": 9665 + }, + { + "epoch": 6.228203062046736, + "grad_norm": 1.9676261756468518, + "learning_rate": 9.627338210795897e-05, + "loss": 2.2012, + "step": 9666 + }, + { + "epoch": 6.228847703464948, + "grad_norm": 2.0332113464072297, + "learning_rate": 9.627261844678761e-05, + "loss": 2.381, + "step": 9667 + }, + { + "epoch": 6.229492344883159, + "grad_norm": 1.834613905221762, + "learning_rate": 9.627185471044047e-05, + "loss": 2.3954, + "step": 9668 + }, + { + "epoch": 6.23013698630137, + "grad_norm": 1.8754399299240232, + "learning_rate": 9.62710908989188e-05, + "loss": 2.1554, + "step": 9669 + }, + { + "epoch": 6.230781627719581, + "grad_norm": 1.6993326791612753, + "learning_rate": 9.627032701222384e-05, + "loss": 2.4191, + "step": 9670 + }, + { + "epoch": 6.231426269137792, + "grad_norm": 2.045950278292271, + "learning_rate": 9.626956305035689e-05, + "loss": 2.2295, + "step": 9671 + }, + { + "epoch": 6.232070910556003, + "grad_norm": 1.583985337174712, + "learning_rate": 9.626879901331915e-05, + "loss": 2.2818, + "step": 9672 + }, + { + "epoch": 6.232715551974215, + "grad_norm": 2.016080167999864, + "learning_rate": 9.626803490111192e-05, + "loss": 2.4748, + "step": 9673 + }, + { + "epoch": 6.233360193392426, + "grad_norm": 1.6673198271750624, + "learning_rate": 9.626727071373641e-05, + "loss": 2.5434, + "step": 9674 + }, + { + "epoch": 6.234004834810636, + "grad_norm": 1.9393298842336957, + "learning_rate": 9.626650645119389e-05, + "loss": 2.4689, + "step": 9675 + }, + { + "epoch": 6.234649476228848, + "grad_norm": 1.6725116906536837, + "learning_rate": 9.626574211348564e-05, + "loss": 2.118, + "step": 9676 + }, + { + "epoch": 6.235294117647059, + "grad_norm": 1.8311805946345057, + "learning_rate": 9.62649777006129e-05, + "loss": 2.3862, + "step": 9677 + }, + { + "epoch": 6.23593875906527, + "grad_norm": 1.956886066655461, + "learning_rate": 9.626421321257692e-05, + "loss": 2.0396, + "step": 9678 + }, + { + "epoch": 6.2365834004834815, + "grad_norm": 1.75776153085669, + "learning_rate": 9.626344864937896e-05, + "loss": 2.4331, + "step": 9679 + }, + { + "epoch": 6.237228041901692, + "grad_norm": 1.9191037134702056, + "learning_rate": 9.626268401102026e-05, + "loss": 2.5151, + "step": 9680 + }, + { + "epoch": 6.237872683319903, + "grad_norm": 1.800568691520507, + "learning_rate": 9.62619192975021e-05, + "loss": 2.1948, + "step": 9681 + }, + { + "epoch": 6.2385173247381145, + "grad_norm": 2.04428643585116, + "learning_rate": 9.626115450882573e-05, + "loss": 2.242, + "step": 9682 + }, + { + "epoch": 6.239161966156326, + "grad_norm": 1.7812476976116527, + "learning_rate": 9.626038964499241e-05, + "loss": 2.4796, + "step": 9683 + }, + { + "epoch": 6.239806607574537, + "grad_norm": 2.161152902420769, + "learning_rate": 9.625962470600337e-05, + "loss": 2.2215, + "step": 9684 + }, + { + "epoch": 6.2404512489927475, + "grad_norm": 1.8355850583327284, + "learning_rate": 9.625885969185991e-05, + "loss": 2.2979, + "step": 9685 + }, + { + "epoch": 6.241095890410959, + "grad_norm": 2.105703060321176, + "learning_rate": 9.625809460256324e-05, + "loss": 1.9774, + "step": 9686 + }, + { + "epoch": 6.24174053182917, + "grad_norm": 1.719356707689515, + "learning_rate": 9.625732943811466e-05, + "loss": 2.3857, + "step": 9687 + }, + { + "epoch": 6.242385173247381, + "grad_norm": 1.906694024437178, + "learning_rate": 9.625656419851538e-05, + "loss": 2.5155, + "step": 9688 + }, + { + "epoch": 6.243029814665592, + "grad_norm": 1.9017007367390721, + "learning_rate": 9.625579888376668e-05, + "loss": 2.2565, + "step": 9689 + }, + { + "epoch": 6.243674456083803, + "grad_norm": 2.010854219356272, + "learning_rate": 9.625503349386983e-05, + "loss": 2.3831, + "step": 9690 + }, + { + "epoch": 6.244319097502014, + "grad_norm": 2.0162027558838393, + "learning_rate": 9.625426802882608e-05, + "loss": 2.0761, + "step": 9691 + }, + { + "epoch": 6.244963738920226, + "grad_norm": 2.06151289067734, + "learning_rate": 9.625350248863668e-05, + "loss": 2.2737, + "step": 9692 + }, + { + "epoch": 6.245608380338437, + "grad_norm": 1.79371479658391, + "learning_rate": 9.625273687330288e-05, + "loss": 2.3845, + "step": 9693 + }, + { + "epoch": 6.246253021756647, + "grad_norm": 1.8787491498877014, + "learning_rate": 9.625197118282596e-05, + "loss": 2.3098, + "step": 9694 + }, + { + "epoch": 6.246897663174859, + "grad_norm": 2.0745883277025405, + "learning_rate": 9.625120541720715e-05, + "loss": 2.4012, + "step": 9695 + }, + { + "epoch": 6.24754230459307, + "grad_norm": 2.190567334156164, + "learning_rate": 9.625043957644775e-05, + "loss": 2.2833, + "step": 9696 + }, + { + "epoch": 6.248186946011281, + "grad_norm": 1.8675079150684786, + "learning_rate": 9.624967366054897e-05, + "loss": 2.1999, + "step": 9697 + }, + { + "epoch": 6.248831587429493, + "grad_norm": 2.053132489462145, + "learning_rate": 9.624890766951209e-05, + "loss": 2.3473, + "step": 9698 + }, + { + "epoch": 6.249476228847703, + "grad_norm": 1.9401639267073816, + "learning_rate": 9.624814160333837e-05, + "loss": 2.1713, + "step": 9699 + }, + { + "epoch": 6.250120870265914, + "grad_norm": 2.146000515361915, + "learning_rate": 9.624737546202906e-05, + "loss": 2.3245, + "step": 9700 + }, + { + "epoch": 6.250120870265914, + "eval_loss": 5.086515426635742, + "eval_runtime": 2.9602, + "eval_samples_per_second": 33.782, + "eval_steps_per_second": 4.392, + "step": 9700 + }, + { + "epoch": 6.250765511684126, + "grad_norm": 1.7672115693382726, + "learning_rate": 9.624660924558543e-05, + "loss": 2.2566, + "step": 9701 + }, + { + "epoch": 6.251410153102337, + "grad_norm": 2.0115403047795923, + "learning_rate": 9.624584295400873e-05, + "loss": 2.1079, + "step": 9702 + }, + { + "epoch": 6.252054794520548, + "grad_norm": 1.749856451241603, + "learning_rate": 9.62450765873002e-05, + "loss": 2.189, + "step": 9703 + }, + { + "epoch": 6.252699435938759, + "grad_norm": 1.8697916048451675, + "learning_rate": 9.624431014546116e-05, + "loss": 2.4891, + "step": 9704 + }, + { + "epoch": 6.25334407735697, + "grad_norm": 1.8337559415562212, + "learning_rate": 9.62435436284928e-05, + "loss": 2.4035, + "step": 9705 + }, + { + "epoch": 6.253988718775181, + "grad_norm": 1.9478779687010204, + "learning_rate": 9.624277703639641e-05, + "loss": 2.3854, + "step": 9706 + }, + { + "epoch": 6.2546333601933926, + "grad_norm": 1.7776341085594274, + "learning_rate": 9.624201036917324e-05, + "loss": 2.3825, + "step": 9707 + }, + { + "epoch": 6.255278001611604, + "grad_norm": 1.8004535192358149, + "learning_rate": 9.624124362682457e-05, + "loss": 2.0221, + "step": 9708 + }, + { + "epoch": 6.255922643029814, + "grad_norm": 1.897164947456096, + "learning_rate": 9.624047680935163e-05, + "loss": 2.1568, + "step": 9709 + }, + { + "epoch": 6.256567284448026, + "grad_norm": 1.6307830222578878, + "learning_rate": 9.623970991675571e-05, + "loss": 2.5711, + "step": 9710 + }, + { + "epoch": 6.257211925866237, + "grad_norm": 1.835966838506758, + "learning_rate": 9.623894294903802e-05, + "loss": 2.4261, + "step": 9711 + }, + { + "epoch": 6.257856567284448, + "grad_norm": 1.7748580120745214, + "learning_rate": 9.623817590619988e-05, + "loss": 2.4144, + "step": 9712 + }, + { + "epoch": 6.2585012087026595, + "grad_norm": 1.9013734096126722, + "learning_rate": 9.623740878824251e-05, + "loss": 2.5175, + "step": 9713 + }, + { + "epoch": 6.25914585012087, + "grad_norm": 1.6714069393403277, + "learning_rate": 9.623664159516719e-05, + "loss": 2.4887, + "step": 9714 + }, + { + "epoch": 6.259790491539081, + "grad_norm": 1.7740424257933431, + "learning_rate": 9.623587432697516e-05, + "loss": 2.1566, + "step": 9715 + }, + { + "epoch": 6.2604351329572925, + "grad_norm": 1.6317865364456399, + "learning_rate": 9.62351069836677e-05, + "loss": 2.4111, + "step": 9716 + }, + { + "epoch": 6.261079774375504, + "grad_norm": 1.8757491315905441, + "learning_rate": 9.623433956524608e-05, + "loss": 2.2574, + "step": 9717 + }, + { + "epoch": 6.261724415793715, + "grad_norm": 1.9812898293006946, + "learning_rate": 9.623357207171152e-05, + "loss": 2.3125, + "step": 9718 + }, + { + "epoch": 6.2623690572119255, + "grad_norm": 1.8763686658674508, + "learning_rate": 9.62328045030653e-05, + "loss": 2.2606, + "step": 9719 + }, + { + "epoch": 6.263013698630137, + "grad_norm": 1.838134742541069, + "learning_rate": 9.623203685930869e-05, + "loss": 2.2708, + "step": 9720 + }, + { + "epoch": 6.263658340048348, + "grad_norm": 1.8208638911694774, + "learning_rate": 9.623126914044294e-05, + "loss": 2.353, + "step": 9721 + }, + { + "epoch": 6.264302981466559, + "grad_norm": 2.186719147734828, + "learning_rate": 9.623050134646934e-05, + "loss": 2.2315, + "step": 9722 + }, + { + "epoch": 6.264947622884771, + "grad_norm": 1.8475577789613131, + "learning_rate": 9.622973347738911e-05, + "loss": 2.4494, + "step": 9723 + }, + { + "epoch": 6.265592264302981, + "grad_norm": 1.7984510468791994, + "learning_rate": 9.622896553320353e-05, + "loss": 2.2727, + "step": 9724 + }, + { + "epoch": 6.266236905721192, + "grad_norm": 1.9614090243419988, + "learning_rate": 9.622819751391385e-05, + "loss": 2.3496, + "step": 9725 + }, + { + "epoch": 6.266881547139404, + "grad_norm": 1.8521256850099683, + "learning_rate": 9.622742941952136e-05, + "loss": 2.3803, + "step": 9726 + }, + { + "epoch": 6.267526188557615, + "grad_norm": 1.9419410530390884, + "learning_rate": 9.622666125002728e-05, + "loss": 2.1295, + "step": 9727 + }, + { + "epoch": 6.268170829975826, + "grad_norm": 1.975151386198486, + "learning_rate": 9.622589300543291e-05, + "loss": 2.4252, + "step": 9728 + }, + { + "epoch": 6.268815471394037, + "grad_norm": 1.9430666972439439, + "learning_rate": 9.622512468573947e-05, + "loss": 2.2385, + "step": 9729 + }, + { + "epoch": 6.269460112812248, + "grad_norm": 1.6934506257291333, + "learning_rate": 9.622435629094827e-05, + "loss": 2.1391, + "step": 9730 + }, + { + "epoch": 6.270104754230459, + "grad_norm": 1.8455003522738878, + "learning_rate": 9.622358782106055e-05, + "loss": 2.4103, + "step": 9731 + }, + { + "epoch": 6.270749395648671, + "grad_norm": 1.8230732017897529, + "learning_rate": 9.622281927607756e-05, + "loss": 2.2876, + "step": 9732 + }, + { + "epoch": 6.271394037066882, + "grad_norm": 2.0475303790236907, + "learning_rate": 9.622205065600058e-05, + "loss": 2.215, + "step": 9733 + }, + { + "epoch": 6.272038678485092, + "grad_norm": 1.9183320338396064, + "learning_rate": 9.622128196083086e-05, + "loss": 2.4273, + "step": 9734 + }, + { + "epoch": 6.272683319903304, + "grad_norm": 1.8526074899215985, + "learning_rate": 9.622051319056967e-05, + "loss": 2.2855, + "step": 9735 + }, + { + "epoch": 6.273327961321515, + "grad_norm": 1.7203601818716536, + "learning_rate": 9.621974434521828e-05, + "loss": 2.3752, + "step": 9736 + }, + { + "epoch": 6.273972602739726, + "grad_norm": 1.7285052376105212, + "learning_rate": 9.621897542477793e-05, + "loss": 2.1782, + "step": 9737 + }, + { + "epoch": 6.2746172441579375, + "grad_norm": 1.875309562722776, + "learning_rate": 9.621820642924991e-05, + "loss": 2.0944, + "step": 9738 + }, + { + "epoch": 6.275261885576148, + "grad_norm": 1.8557997527882588, + "learning_rate": 9.621743735863545e-05, + "loss": 2.4799, + "step": 9739 + }, + { + "epoch": 6.275906526994359, + "grad_norm": 1.9122810362657543, + "learning_rate": 9.621666821293586e-05, + "loss": 2.5765, + "step": 9740 + }, + { + "epoch": 6.2765511684125705, + "grad_norm": 2.028783937796503, + "learning_rate": 9.621589899215235e-05, + "loss": 2.2818, + "step": 9741 + }, + { + "epoch": 6.277195809830782, + "grad_norm": 2.2784540136577016, + "learning_rate": 9.621512969628624e-05, + "loss": 2.4398, + "step": 9742 + }, + { + "epoch": 6.277840451248993, + "grad_norm": 2.4645172664076656, + "learning_rate": 9.621436032533874e-05, + "loss": 2.3745, + "step": 9743 + }, + { + "epoch": 6.278485092667204, + "grad_norm": 2.2029819324810864, + "learning_rate": 9.621359087931113e-05, + "loss": 2.3823, + "step": 9744 + }, + { + "epoch": 6.279129734085415, + "grad_norm": 2.0282463184050683, + "learning_rate": 9.62128213582047e-05, + "loss": 2.3187, + "step": 9745 + }, + { + "epoch": 6.279774375503626, + "grad_norm": 2.14214933320342, + "learning_rate": 9.621205176202067e-05, + "loss": 2.331, + "step": 9746 + }, + { + "epoch": 6.2804190169218375, + "grad_norm": 1.8893069097996569, + "learning_rate": 9.621128209076034e-05, + "loss": 2.4275, + "step": 9747 + }, + { + "epoch": 6.281063658340049, + "grad_norm": 2.3868830962470295, + "learning_rate": 9.621051234442496e-05, + "loss": 2.3538, + "step": 9748 + }, + { + "epoch": 6.281708299758259, + "grad_norm": 1.9647250642350076, + "learning_rate": 9.620974252301581e-05, + "loss": 2.2408, + "step": 9749 + }, + { + "epoch": 6.2823529411764705, + "grad_norm": 2.3099105117617844, + "learning_rate": 9.620897262653412e-05, + "loss": 2.1265, + "step": 9750 + }, + { + "epoch": 6.282997582594682, + "grad_norm": 1.6877931015376637, + "learning_rate": 9.62082026549812e-05, + "loss": 2.5266, + "step": 9751 + }, + { + "epoch": 6.283642224012893, + "grad_norm": 2.1757169838831465, + "learning_rate": 9.620743260835826e-05, + "loss": 2.4417, + "step": 9752 + }, + { + "epoch": 6.284286865431104, + "grad_norm": 1.7367295090381891, + "learning_rate": 9.62066624866666e-05, + "loss": 2.3594, + "step": 9753 + }, + { + "epoch": 6.284931506849315, + "grad_norm": 2.0264078200462916, + "learning_rate": 9.62058922899075e-05, + "loss": 2.2945, + "step": 9754 + }, + { + "epoch": 6.285576148267526, + "grad_norm": 1.7631629748289297, + "learning_rate": 9.620512201808218e-05, + "loss": 2.1943, + "step": 9755 + }, + { + "epoch": 6.286220789685737, + "grad_norm": 1.741113597028502, + "learning_rate": 9.620435167119195e-05, + "loss": 2.1108, + "step": 9756 + }, + { + "epoch": 6.286865431103949, + "grad_norm": 1.8302279627738185, + "learning_rate": 9.620358124923805e-05, + "loss": 2.578, + "step": 9757 + }, + { + "epoch": 6.28751007252216, + "grad_norm": 1.8779989788299891, + "learning_rate": 9.620281075222173e-05, + "loss": 2.5307, + "step": 9758 + }, + { + "epoch": 6.28815471394037, + "grad_norm": 2.0788382067735403, + "learning_rate": 9.620204018014428e-05, + "loss": 2.3788, + "step": 9759 + }, + { + "epoch": 6.288799355358582, + "grad_norm": 1.9296674031048133, + "learning_rate": 9.620126953300697e-05, + "loss": 2.4708, + "step": 9760 + }, + { + "epoch": 6.289443996776793, + "grad_norm": 1.8446475399258893, + "learning_rate": 9.620049881081106e-05, + "loss": 2.297, + "step": 9761 + }, + { + "epoch": 6.290088638195004, + "grad_norm": 1.8949232597484993, + "learning_rate": 9.619972801355781e-05, + "loss": 2.0874, + "step": 9762 + }, + { + "epoch": 6.290733279613216, + "grad_norm": 1.9657322622744957, + "learning_rate": 9.619895714124848e-05, + "loss": 2.6337, + "step": 9763 + }, + { + "epoch": 6.291377921031426, + "grad_norm": 2.125811909475734, + "learning_rate": 9.619818619388434e-05, + "loss": 2.5071, + "step": 9764 + }, + { + "epoch": 6.292022562449637, + "grad_norm": 1.8200307843623782, + "learning_rate": 9.619741517146668e-05, + "loss": 2.5558, + "step": 9765 + }, + { + "epoch": 6.292667203867849, + "grad_norm": 1.89932917808209, + "learning_rate": 9.619664407399673e-05, + "loss": 2.6503, + "step": 9766 + }, + { + "epoch": 6.29331184528606, + "grad_norm": 2.001062415647151, + "learning_rate": 9.619587290147577e-05, + "loss": 2.4976, + "step": 9767 + }, + { + "epoch": 6.293956486704271, + "grad_norm": 1.8083921608943505, + "learning_rate": 9.619510165390507e-05, + "loss": 2.4402, + "step": 9768 + }, + { + "epoch": 6.294601128122482, + "grad_norm": 1.8595220939206913, + "learning_rate": 9.61943303312859e-05, + "loss": 2.4094, + "step": 9769 + }, + { + "epoch": 6.295245769540693, + "grad_norm": 1.99625483227907, + "learning_rate": 9.619355893361954e-05, + "loss": 2.2601, + "step": 9770 + }, + { + "epoch": 6.295890410958904, + "grad_norm": 1.6953366358441109, + "learning_rate": 9.619278746090723e-05, + "loss": 2.4157, + "step": 9771 + }, + { + "epoch": 6.2965350523771155, + "grad_norm": 1.7401329538208357, + "learning_rate": 9.619201591315024e-05, + "loss": 2.3819, + "step": 9772 + }, + { + "epoch": 6.297179693795326, + "grad_norm": 1.7400233284519198, + "learning_rate": 9.619124429034983e-05, + "loss": 2.4004, + "step": 9773 + }, + { + "epoch": 6.297824335213537, + "grad_norm": 1.8245980585180914, + "learning_rate": 9.619047259250732e-05, + "loss": 2.383, + "step": 9774 + }, + { + "epoch": 6.2984689766317485, + "grad_norm": 1.8174893533455918, + "learning_rate": 9.618970081962391e-05, + "loss": 2.3984, + "step": 9775 + }, + { + "epoch": 6.29911361804996, + "grad_norm": 1.792756748669414, + "learning_rate": 9.618892897170092e-05, + "loss": 2.2978, + "step": 9776 + }, + { + "epoch": 6.299758259468171, + "grad_norm": 1.8553049971850732, + "learning_rate": 9.618815704873957e-05, + "loss": 2.4271, + "step": 9777 + }, + { + "epoch": 6.3004029008863816, + "grad_norm": 1.7010754207759176, + "learning_rate": 9.618738505074115e-05, + "loss": 2.1929, + "step": 9778 + }, + { + "epoch": 6.301047542304593, + "grad_norm": 1.8062685965060967, + "learning_rate": 9.618661297770694e-05, + "loss": 2.2386, + "step": 9779 + }, + { + "epoch": 6.301692183722804, + "grad_norm": 1.807201409204094, + "learning_rate": 9.61858408296382e-05, + "loss": 2.5928, + "step": 9780 + }, + { + "epoch": 6.3023368251410155, + "grad_norm": 2.265049750238398, + "learning_rate": 9.618506860653618e-05, + "loss": 2.2065, + "step": 9781 + }, + { + "epoch": 6.302981466559227, + "grad_norm": 2.039281928486056, + "learning_rate": 9.618429630840218e-05, + "loss": 2.353, + "step": 9782 + }, + { + "epoch": 6.303626107977437, + "grad_norm": 1.875133980077986, + "learning_rate": 9.618352393523746e-05, + "loss": 2.3329, + "step": 9783 + }, + { + "epoch": 6.3042707493956485, + "grad_norm": 2.157333621905658, + "learning_rate": 9.618275148704327e-05, + "loss": 1.9921, + "step": 9784 + }, + { + "epoch": 6.30491539081386, + "grad_norm": 1.7539486758912888, + "learning_rate": 9.618197896382089e-05, + "loss": 2.4985, + "step": 9785 + }, + { + "epoch": 6.305560032232071, + "grad_norm": 2.1939388706460923, + "learning_rate": 9.618120636557158e-05, + "loss": 2.5119, + "step": 9786 + }, + { + "epoch": 6.306204673650282, + "grad_norm": 1.5627072707898608, + "learning_rate": 9.618043369229665e-05, + "loss": 2.1946, + "step": 9787 + }, + { + "epoch": 6.306849315068493, + "grad_norm": 2.1137523946507866, + "learning_rate": 9.61796609439973e-05, + "loss": 2.4426, + "step": 9788 + }, + { + "epoch": 6.307493956486704, + "grad_norm": 1.6357706251011193, + "learning_rate": 9.617888812067485e-05, + "loss": 2.4463, + "step": 9789 + }, + { + "epoch": 6.308138597904915, + "grad_norm": 1.8773039840609622, + "learning_rate": 9.617811522233055e-05, + "loss": 2.447, + "step": 9790 + }, + { + "epoch": 6.308783239323127, + "grad_norm": 1.648843481369671, + "learning_rate": 9.617734224896568e-05, + "loss": 2.3924, + "step": 9791 + }, + { + "epoch": 6.309427880741338, + "grad_norm": 2.0053445071313423, + "learning_rate": 9.617656920058151e-05, + "loss": 2.4225, + "step": 9792 + }, + { + "epoch": 6.310072522159548, + "grad_norm": 1.7362571475629591, + "learning_rate": 9.617579607717928e-05, + "loss": 2.2164, + "step": 9793 + }, + { + "epoch": 6.31071716357776, + "grad_norm": 1.5713675398488987, + "learning_rate": 9.617502287876031e-05, + "loss": 2.4034, + "step": 9794 + }, + { + "epoch": 6.311361804995971, + "grad_norm": 1.6866550488757581, + "learning_rate": 9.617424960532583e-05, + "loss": 2.3202, + "step": 9795 + }, + { + "epoch": 6.312006446414182, + "grad_norm": 1.6746286014772096, + "learning_rate": 9.617347625687712e-05, + "loss": 2.5756, + "step": 9796 + }, + { + "epoch": 6.312651087832394, + "grad_norm": 1.7458429976051235, + "learning_rate": 9.617270283341545e-05, + "loss": 2.5369, + "step": 9797 + }, + { + "epoch": 6.313295729250604, + "grad_norm": 1.7191920663551676, + "learning_rate": 9.61719293349421e-05, + "loss": 2.2212, + "step": 9798 + }, + { + "epoch": 6.313940370668815, + "grad_norm": 1.7171378599968004, + "learning_rate": 9.617115576145835e-05, + "loss": 2.2775, + "step": 9799 + }, + { + "epoch": 6.314585012087027, + "grad_norm": 1.6529494720892504, + "learning_rate": 9.617038211296542e-05, + "loss": 2.2674, + "step": 9800 + }, + { + "epoch": 6.314585012087027, + "eval_loss": 4.997007846832275, + "eval_runtime": 2.9821, + "eval_samples_per_second": 33.533, + "eval_steps_per_second": 4.359, + "step": 9800 + }, + { + "epoch": 6.315229653505238, + "grad_norm": 1.71311361191236, + "learning_rate": 9.616960838946464e-05, + "loss": 2.5174, + "step": 9801 + }, + { + "epoch": 6.315874294923449, + "grad_norm": 1.594261835845527, + "learning_rate": 9.616883459095726e-05, + "loss": 2.284, + "step": 9802 + }, + { + "epoch": 6.31651893634166, + "grad_norm": 1.780658882847468, + "learning_rate": 9.616806071744452e-05, + "loss": 2.3357, + "step": 9803 + }, + { + "epoch": 6.317163577759871, + "grad_norm": 1.7036820868634637, + "learning_rate": 9.616728676892774e-05, + "loss": 2.3908, + "step": 9804 + }, + { + "epoch": 6.317808219178082, + "grad_norm": 1.749661968453695, + "learning_rate": 9.616651274540816e-05, + "loss": 2.5418, + "step": 9805 + }, + { + "epoch": 6.3184528605962935, + "grad_norm": 2.078507038931017, + "learning_rate": 9.616573864688706e-05, + "loss": 2.1299, + "step": 9806 + }, + { + "epoch": 6.319097502014505, + "grad_norm": 1.8314941474062354, + "learning_rate": 9.616496447336571e-05, + "loss": 2.4867, + "step": 9807 + }, + { + "epoch": 6.319742143432715, + "grad_norm": 1.8429853177624431, + "learning_rate": 9.616419022484538e-05, + "loss": 2.3181, + "step": 9808 + }, + { + "epoch": 6.3203867848509265, + "grad_norm": 1.6895109953390894, + "learning_rate": 9.616341590132736e-05, + "loss": 2.3328, + "step": 9809 + }, + { + "epoch": 6.321031426269138, + "grad_norm": 1.6895703163616858, + "learning_rate": 9.616264150281288e-05, + "loss": 2.5313, + "step": 9810 + }, + { + "epoch": 6.321676067687349, + "grad_norm": 1.7515653197275467, + "learning_rate": 9.616186702930327e-05, + "loss": 2.2298, + "step": 9811 + }, + { + "epoch": 6.32232070910556, + "grad_norm": 1.8996893501996694, + "learning_rate": 9.616109248079975e-05, + "loss": 2.3613, + "step": 9812 + }, + { + "epoch": 6.322965350523771, + "grad_norm": 2.0477148331703408, + "learning_rate": 9.616031785730362e-05, + "loss": 2.427, + "step": 9813 + }, + { + "epoch": 6.323609991941982, + "grad_norm": 1.691803184612626, + "learning_rate": 9.615954315881614e-05, + "loss": 2.4643, + "step": 9814 + }, + { + "epoch": 6.3242546333601934, + "grad_norm": 1.8920224043556417, + "learning_rate": 9.615876838533859e-05, + "loss": 2.2334, + "step": 9815 + }, + { + "epoch": 6.324899274778405, + "grad_norm": 1.862397565521904, + "learning_rate": 9.615799353687222e-05, + "loss": 2.2164, + "step": 9816 + }, + { + "epoch": 6.325543916196616, + "grad_norm": 2.0384400780773677, + "learning_rate": 9.615721861341834e-05, + "loss": 2.349, + "step": 9817 + }, + { + "epoch": 6.3261885576148265, + "grad_norm": 1.8235966109464967, + "learning_rate": 9.61564436149782e-05, + "loss": 2.367, + "step": 9818 + }, + { + "epoch": 6.326833199033038, + "grad_norm": 2.093971176796739, + "learning_rate": 9.615566854155308e-05, + "loss": 2.4481, + "step": 9819 + }, + { + "epoch": 6.327477840451249, + "grad_norm": 1.9684038807385758, + "learning_rate": 9.615489339314425e-05, + "loss": 2.4748, + "step": 9820 + }, + { + "epoch": 6.32812248186946, + "grad_norm": 1.789786322664632, + "learning_rate": 9.615411816975298e-05, + "loss": 1.9727, + "step": 9821 + }, + { + "epoch": 6.328767123287671, + "grad_norm": 2.2872642539582126, + "learning_rate": 9.615334287138055e-05, + "loss": 2.4324, + "step": 9822 + }, + { + "epoch": 6.329411764705882, + "grad_norm": 2.0113431291626935, + "learning_rate": 9.615256749802821e-05, + "loss": 2.3731, + "step": 9823 + }, + { + "epoch": 6.330056406124093, + "grad_norm": 3.9571146349947757, + "learning_rate": 9.615179204969728e-05, + "loss": 2.2616, + "step": 9824 + }, + { + "epoch": 6.330701047542305, + "grad_norm": 1.9408358587044845, + "learning_rate": 9.615101652638902e-05, + "loss": 2.4262, + "step": 9825 + }, + { + "epoch": 6.331345688960516, + "grad_norm": 1.8548973191471623, + "learning_rate": 9.615024092810465e-05, + "loss": 2.2068, + "step": 9826 + }, + { + "epoch": 6.331990330378726, + "grad_norm": 1.8624500771167378, + "learning_rate": 9.61494652548455e-05, + "loss": 2.1344, + "step": 9827 + }, + { + "epoch": 6.332634971796938, + "grad_norm": 1.8171371671049825, + "learning_rate": 9.614868950661283e-05, + "loss": 2.4349, + "step": 9828 + }, + { + "epoch": 6.333279613215149, + "grad_norm": 2.158906988305934, + "learning_rate": 9.614791368340792e-05, + "loss": 2.1773, + "step": 9829 + }, + { + "epoch": 6.33392425463336, + "grad_norm": 1.745073568204265, + "learning_rate": 9.614713778523204e-05, + "loss": 2.2141, + "step": 9830 + }, + { + "epoch": 6.334568896051572, + "grad_norm": 3.1591338725413536, + "learning_rate": 9.614636181208643e-05, + "loss": 2.4388, + "step": 9831 + }, + { + "epoch": 6.335213537469782, + "grad_norm": 2.1288474067925742, + "learning_rate": 9.614558576397241e-05, + "loss": 2.6272, + "step": 9832 + }, + { + "epoch": 6.335858178887993, + "grad_norm": 2.0488715914050473, + "learning_rate": 9.614480964089126e-05, + "loss": 2.2507, + "step": 9833 + }, + { + "epoch": 6.336502820306205, + "grad_norm": 2.0824206948656356, + "learning_rate": 9.61440334428442e-05, + "loss": 2.2825, + "step": 9834 + }, + { + "epoch": 6.337147461724416, + "grad_norm": 1.9140313339042578, + "learning_rate": 9.614325716983258e-05, + "loss": 2.228, + "step": 9835 + }, + { + "epoch": 6.337792103142627, + "grad_norm": 2.1010761762153582, + "learning_rate": 9.61424808218576e-05, + "loss": 2.4497, + "step": 9836 + }, + { + "epoch": 6.338436744560838, + "grad_norm": 2.00104915980423, + "learning_rate": 9.614170439892058e-05, + "loss": 2.4365, + "step": 9837 + }, + { + "epoch": 6.339081385979049, + "grad_norm": 2.362921728586528, + "learning_rate": 9.61409279010228e-05, + "loss": 2.7111, + "step": 9838 + }, + { + "epoch": 6.33972602739726, + "grad_norm": 1.8216969505175657, + "learning_rate": 9.614015132816549e-05, + "loss": 2.3765, + "step": 9839 + }, + { + "epoch": 6.3403706688154715, + "grad_norm": 1.994049216511458, + "learning_rate": 9.613937468034997e-05, + "loss": 2.3985, + "step": 9840 + }, + { + "epoch": 6.341015310233683, + "grad_norm": 1.999491531047964, + "learning_rate": 9.613859795757753e-05, + "loss": 2.2455, + "step": 9841 + }, + { + "epoch": 6.341659951651893, + "grad_norm": 1.8934525954600323, + "learning_rate": 9.613782115984938e-05, + "loss": 2.4692, + "step": 9842 + }, + { + "epoch": 6.3423045930701045, + "grad_norm": 1.8415648633873531, + "learning_rate": 9.613704428716683e-05, + "loss": 2.3252, + "step": 9843 + }, + { + "epoch": 6.342949234488316, + "grad_norm": 1.9220692762509473, + "learning_rate": 9.613626733953117e-05, + "loss": 2.302, + "step": 9844 + }, + { + "epoch": 6.343593875906527, + "grad_norm": 1.9794189881430815, + "learning_rate": 9.613549031694366e-05, + "loss": 2.337, + "step": 9845 + }, + { + "epoch": 6.344238517324738, + "grad_norm": 1.8409856923389223, + "learning_rate": 9.61347132194056e-05, + "loss": 2.2451, + "step": 9846 + }, + { + "epoch": 6.344883158742949, + "grad_norm": 1.9590199755861575, + "learning_rate": 9.613393604691822e-05, + "loss": 2.3625, + "step": 9847 + }, + { + "epoch": 6.34552780016116, + "grad_norm": 1.7752287121934414, + "learning_rate": 9.613315879948282e-05, + "loss": 2.372, + "step": 9848 + }, + { + "epoch": 6.346172441579371, + "grad_norm": 1.909801240611313, + "learning_rate": 9.61323814771007e-05, + "loss": 2.556, + "step": 9849 + }, + { + "epoch": 6.346817082997583, + "grad_norm": 1.9418722422179717, + "learning_rate": 9.61316040797731e-05, + "loss": 2.4221, + "step": 9850 + }, + { + "epoch": 6.347461724415794, + "grad_norm": 2.0478849939329513, + "learning_rate": 9.613082660750134e-05, + "loss": 2.3231, + "step": 9851 + }, + { + "epoch": 6.3481063658340044, + "grad_norm": 1.8449735103173424, + "learning_rate": 9.613004906028665e-05, + "loss": 2.364, + "step": 9852 + }, + { + "epoch": 6.348751007252216, + "grad_norm": 2.005681368801519, + "learning_rate": 9.612927143813031e-05, + "loss": 2.3919, + "step": 9853 + }, + { + "epoch": 6.349395648670427, + "grad_norm": 2.3373730366522576, + "learning_rate": 9.612849374103364e-05, + "loss": 2.3708, + "step": 9854 + }, + { + "epoch": 6.350040290088638, + "grad_norm": 2.06918848596193, + "learning_rate": 9.612771596899788e-05, + "loss": 2.4754, + "step": 9855 + }, + { + "epoch": 6.35068493150685, + "grad_norm": 1.9542291416294824, + "learning_rate": 9.612693812202434e-05, + "loss": 2.1579, + "step": 9856 + }, + { + "epoch": 6.35132957292506, + "grad_norm": 2.03200536149455, + "learning_rate": 9.612616020011425e-05, + "loss": 2.5046, + "step": 9857 + }, + { + "epoch": 6.351974214343271, + "grad_norm": 1.9334184298205204, + "learning_rate": 9.612538220326892e-05, + "loss": 2.3143, + "step": 9858 + }, + { + "epoch": 6.352618855761483, + "grad_norm": 2.0268314661125686, + "learning_rate": 9.612460413148963e-05, + "loss": 2.382, + "step": 9859 + }, + { + "epoch": 6.353263497179694, + "grad_norm": 2.046236995393878, + "learning_rate": 9.612382598477763e-05, + "loss": 2.4243, + "step": 9860 + }, + { + "epoch": 6.353908138597905, + "grad_norm": 1.9043674635398913, + "learning_rate": 9.612304776313424e-05, + "loss": 2.1468, + "step": 9861 + }, + { + "epoch": 6.354552780016116, + "grad_norm": 1.9303548561614101, + "learning_rate": 9.61222694665607e-05, + "loss": 2.2035, + "step": 9862 + }, + { + "epoch": 6.355197421434327, + "grad_norm": 1.9505253099610593, + "learning_rate": 9.612149109505832e-05, + "loss": 2.4707, + "step": 9863 + }, + { + "epoch": 6.355842062852538, + "grad_norm": 1.9550155160321359, + "learning_rate": 9.612071264862835e-05, + "loss": 2.4093, + "step": 9864 + }, + { + "epoch": 6.35648670427075, + "grad_norm": 2.097900378755158, + "learning_rate": 9.611993412727208e-05, + "loss": 2.3185, + "step": 9865 + }, + { + "epoch": 6.357131345688961, + "grad_norm": 1.8910602896700939, + "learning_rate": 9.61191555309908e-05, + "loss": 2.408, + "step": 9866 + }, + { + "epoch": 6.357775987107171, + "grad_norm": 2.0615181200509882, + "learning_rate": 9.611837685978577e-05, + "loss": 2.1842, + "step": 9867 + }, + { + "epoch": 6.358420628525383, + "grad_norm": 1.8117816484418736, + "learning_rate": 9.611759811365827e-05, + "loss": 2.3607, + "step": 9868 + }, + { + "epoch": 6.359065269943594, + "grad_norm": 2.180675733192426, + "learning_rate": 9.61168192926096e-05, + "loss": 2.5444, + "step": 9869 + }, + { + "epoch": 6.359709911361805, + "grad_norm": 1.8119696957718094, + "learning_rate": 9.611604039664101e-05, + "loss": 2.5839, + "step": 9870 + }, + { + "epoch": 6.3603545527800165, + "grad_norm": 1.8603180575671583, + "learning_rate": 9.611526142575382e-05, + "loss": 2.2139, + "step": 9871 + }, + { + "epoch": 6.360999194198227, + "grad_norm": 1.8107858269783021, + "learning_rate": 9.611448237994925e-05, + "loss": 2.3266, + "step": 9872 + }, + { + "epoch": 6.361643835616438, + "grad_norm": 1.8139588292048823, + "learning_rate": 9.611370325922864e-05, + "loss": 2.5056, + "step": 9873 + }, + { + "epoch": 6.3622884770346495, + "grad_norm": 1.7659749196911496, + "learning_rate": 9.611292406359324e-05, + "loss": 2.5022, + "step": 9874 + }, + { + "epoch": 6.362933118452861, + "grad_norm": 1.6336345318115801, + "learning_rate": 9.611214479304431e-05, + "loss": 2.2998, + "step": 9875 + }, + { + "epoch": 6.363577759871072, + "grad_norm": 1.7538089083510873, + "learning_rate": 9.611136544758318e-05, + "loss": 2.2544, + "step": 9876 + }, + { + "epoch": 6.3642224012892825, + "grad_norm": 1.6120066719778146, + "learning_rate": 9.611058602721109e-05, + "loss": 2.2317, + "step": 9877 + }, + { + "epoch": 6.364867042707494, + "grad_norm": 1.9020791980410632, + "learning_rate": 9.610980653192932e-05, + "loss": 2.4576, + "step": 9878 + }, + { + "epoch": 6.365511684125705, + "grad_norm": 1.7711133506179906, + "learning_rate": 9.61090269617392e-05, + "loss": 2.3062, + "step": 9879 + }, + { + "epoch": 6.366156325543916, + "grad_norm": 1.8193148229024392, + "learning_rate": 9.610824731664193e-05, + "loss": 2.725, + "step": 9880 + }, + { + "epoch": 6.366800966962128, + "grad_norm": 1.7341706911391135, + "learning_rate": 9.610746759663887e-05, + "loss": 2.5368, + "step": 9881 + }, + { + "epoch": 6.367445608380338, + "grad_norm": 1.7431714348935115, + "learning_rate": 9.610668780173124e-05, + "loss": 2.5107, + "step": 9882 + }, + { + "epoch": 6.368090249798549, + "grad_norm": 1.6636453848360138, + "learning_rate": 9.610590793192035e-05, + "loss": 2.5918, + "step": 9883 + }, + { + "epoch": 6.368734891216761, + "grad_norm": 1.6305808425831567, + "learning_rate": 9.610512798720748e-05, + "loss": 2.47, + "step": 9884 + }, + { + "epoch": 6.369379532634972, + "grad_norm": 1.8825918657783276, + "learning_rate": 9.61043479675939e-05, + "loss": 2.3627, + "step": 9885 + }, + { + "epoch": 6.370024174053183, + "grad_norm": 1.7596703808171557, + "learning_rate": 9.610356787308089e-05, + "loss": 2.2762, + "step": 9886 + }, + { + "epoch": 6.370668815471394, + "grad_norm": 1.858323915569899, + "learning_rate": 9.610278770366976e-05, + "loss": 2.4167, + "step": 9887 + }, + { + "epoch": 6.371313456889605, + "grad_norm": 1.709133773673387, + "learning_rate": 9.610200745936176e-05, + "loss": 2.1772, + "step": 9888 + }, + { + "epoch": 6.371958098307816, + "grad_norm": 1.883336359896675, + "learning_rate": 9.610122714015818e-05, + "loss": 2.307, + "step": 9889 + }, + { + "epoch": 6.372602739726028, + "grad_norm": 1.8532468121934937, + "learning_rate": 9.61004467460603e-05, + "loss": 2.5977, + "step": 9890 + }, + { + "epoch": 6.373247381144239, + "grad_norm": 1.897802486163325, + "learning_rate": 9.609966627706942e-05, + "loss": 2.4886, + "step": 9891 + }, + { + "epoch": 6.373892022562449, + "grad_norm": 1.7770841294864923, + "learning_rate": 9.60988857331868e-05, + "loss": 2.3541, + "step": 9892 + }, + { + "epoch": 6.374536663980661, + "grad_norm": 2.01730131934104, + "learning_rate": 9.60981051144137e-05, + "loss": 2.4716, + "step": 9893 + }, + { + "epoch": 6.375181305398872, + "grad_norm": 1.8288228513025788, + "learning_rate": 9.609732442075148e-05, + "loss": 2.4436, + "step": 9894 + }, + { + "epoch": 6.375825946817083, + "grad_norm": 1.8950779305600702, + "learning_rate": 9.609654365220134e-05, + "loss": 2.1825, + "step": 9895 + }, + { + "epoch": 6.376470588235295, + "grad_norm": 1.9198301820781645, + "learning_rate": 9.609576280876461e-05, + "loss": 2.288, + "step": 9896 + }, + { + "epoch": 6.377115229653505, + "grad_norm": 1.9062710491083377, + "learning_rate": 9.609498189044255e-05, + "loss": 2.4444, + "step": 9897 + }, + { + "epoch": 6.377759871071716, + "grad_norm": 1.7949944486635945, + "learning_rate": 9.609420089723645e-05, + "loss": 2.2057, + "step": 9898 + }, + { + "epoch": 6.378404512489928, + "grad_norm": 1.718251190471998, + "learning_rate": 9.60934198291476e-05, + "loss": 2.1398, + "step": 9899 + }, + { + "epoch": 6.379049153908139, + "grad_norm": 1.7234248222812716, + "learning_rate": 9.609263868617727e-05, + "loss": 2.4536, + "step": 9900 + }, + { + "epoch": 6.379049153908139, + "eval_loss": 4.927133560180664, + "eval_runtime": 2.9724, + "eval_samples_per_second": 33.642, + "eval_steps_per_second": 4.374, + "step": 9900 + }, + { + "epoch": 6.37969379532635, + "grad_norm": 1.646865814323767, + "learning_rate": 9.609185746832676e-05, + "loss": 2.4408, + "step": 9901 + }, + { + "epoch": 6.380338436744561, + "grad_norm": 1.6947887206550496, + "learning_rate": 9.609107617559732e-05, + "loss": 2.2251, + "step": 9902 + }, + { + "epoch": 6.380983078162772, + "grad_norm": 1.6581094468908473, + "learning_rate": 9.609029480799027e-05, + "loss": 2.2459, + "step": 9903 + }, + { + "epoch": 6.381627719580983, + "grad_norm": 1.7126462335432493, + "learning_rate": 9.608951336550688e-05, + "loss": 2.4011, + "step": 9904 + }, + { + "epoch": 6.3822723609991945, + "grad_norm": 1.7994161341204742, + "learning_rate": 9.608873184814841e-05, + "loss": 2.5206, + "step": 9905 + }, + { + "epoch": 6.382917002417406, + "grad_norm": 2.0775878445953513, + "learning_rate": 9.608795025591621e-05, + "loss": 2.2953, + "step": 9906 + }, + { + "epoch": 6.383561643835616, + "grad_norm": 1.7413785399423471, + "learning_rate": 9.608716858881148e-05, + "loss": 2.4151, + "step": 9907 + }, + { + "epoch": 6.3842062852538275, + "grad_norm": 1.9814075748350806, + "learning_rate": 9.608638684683556e-05, + "loss": 2.4371, + "step": 9908 + }, + { + "epoch": 6.384850926672039, + "grad_norm": 1.8618636382560543, + "learning_rate": 9.60856050299897e-05, + "loss": 2.4968, + "step": 9909 + }, + { + "epoch": 6.38549556809025, + "grad_norm": 1.904559283265776, + "learning_rate": 9.608482313827522e-05, + "loss": 2.4863, + "step": 9910 + }, + { + "epoch": 6.3861402095084605, + "grad_norm": 1.6473055262729166, + "learning_rate": 9.608404117169337e-05, + "loss": 2.3927, + "step": 9911 + }, + { + "epoch": 6.386784850926672, + "grad_norm": 1.923157226784993, + "learning_rate": 9.608325913024545e-05, + "loss": 2.4938, + "step": 9912 + }, + { + "epoch": 6.387429492344883, + "grad_norm": 1.6349496596143438, + "learning_rate": 9.608247701393275e-05, + "loss": 2.4552, + "step": 9913 + }, + { + "epoch": 6.388074133763094, + "grad_norm": 1.8282875668536889, + "learning_rate": 9.608169482275656e-05, + "loss": 2.5223, + "step": 9914 + }, + { + "epoch": 6.388718775181306, + "grad_norm": 1.761123113817024, + "learning_rate": 9.608091255671811e-05, + "loss": 2.2694, + "step": 9915 + }, + { + "epoch": 6.389363416599516, + "grad_norm": 1.6786507160118047, + "learning_rate": 9.608013021581877e-05, + "loss": 2.5088, + "step": 9916 + }, + { + "epoch": 6.390008058017727, + "grad_norm": 1.8241109057751086, + "learning_rate": 9.607934780005979e-05, + "loss": 2.2166, + "step": 9917 + }, + { + "epoch": 6.390652699435939, + "grad_norm": 2.0431264386074095, + "learning_rate": 9.60785653094424e-05, + "loss": 2.4022, + "step": 9918 + }, + { + "epoch": 6.39129734085415, + "grad_norm": 1.9350303529529826, + "learning_rate": 9.607778274396797e-05, + "loss": 2.5136, + "step": 9919 + }, + { + "epoch": 6.391941982272361, + "grad_norm": 1.6891168302286617, + "learning_rate": 9.607700010363773e-05, + "loss": 2.4212, + "step": 9920 + }, + { + "epoch": 6.392586623690572, + "grad_norm": 2.113874522506474, + "learning_rate": 9.607621738845299e-05, + "loss": 2.2852, + "step": 9921 + }, + { + "epoch": 6.393231265108783, + "grad_norm": 1.6266095934721208, + "learning_rate": 9.607543459841501e-05, + "loss": 2.4844, + "step": 9922 + }, + { + "epoch": 6.393875906526994, + "grad_norm": 1.8037993342370378, + "learning_rate": 9.607465173352513e-05, + "loss": 2.469, + "step": 9923 + }, + { + "epoch": 6.394520547945206, + "grad_norm": 1.8129453699191609, + "learning_rate": 9.607386879378458e-05, + "loss": 2.338, + "step": 9924 + }, + { + "epoch": 6.395165189363417, + "grad_norm": 1.8339744864793959, + "learning_rate": 9.607308577919467e-05, + "loss": 2.2672, + "step": 9925 + }, + { + "epoch": 6.395809830781627, + "grad_norm": 1.9030588115545573, + "learning_rate": 9.60723026897567e-05, + "loss": 2.3876, + "step": 9926 + }, + { + "epoch": 6.396454472199839, + "grad_norm": 2.148729243035117, + "learning_rate": 9.60715195254719e-05, + "loss": 2.4985, + "step": 9927 + }, + { + "epoch": 6.39709911361805, + "grad_norm": 1.7732917353189621, + "learning_rate": 9.607073628634161e-05, + "loss": 2.4836, + "step": 9928 + }, + { + "epoch": 6.397743755036261, + "grad_norm": 1.9970430064181717, + "learning_rate": 9.60699529723671e-05, + "loss": 2.385, + "step": 9929 + }, + { + "epoch": 6.3983883964544725, + "grad_norm": 1.8516770828082332, + "learning_rate": 9.606916958354967e-05, + "loss": 2.496, + "step": 9930 + }, + { + "epoch": 6.399033037872683, + "grad_norm": 1.8629144094258312, + "learning_rate": 9.606838611989058e-05, + "loss": 2.3859, + "step": 9931 + }, + { + "epoch": 6.399677679290894, + "grad_norm": 2.1106944448394, + "learning_rate": 9.606760258139113e-05, + "loss": 2.3041, + "step": 9932 + }, + { + "epoch": 6.400322320709106, + "grad_norm": 1.926678579424492, + "learning_rate": 9.606681896805262e-05, + "loss": 2.64, + "step": 9933 + }, + { + "epoch": 6.400966962127317, + "grad_norm": 2.0896012851824635, + "learning_rate": 9.606603527987632e-05, + "loss": 2.3496, + "step": 9934 + }, + { + "epoch": 6.401611603545528, + "grad_norm": 1.9357832710656382, + "learning_rate": 9.606525151686351e-05, + "loss": 2.4181, + "step": 9935 + }, + { + "epoch": 6.402256244963739, + "grad_norm": 1.9131205578396484, + "learning_rate": 9.60644676790155e-05, + "loss": 2.6402, + "step": 9936 + }, + { + "epoch": 6.40290088638195, + "grad_norm": 1.776010479463409, + "learning_rate": 9.606368376633356e-05, + "loss": 2.5814, + "step": 9937 + }, + { + "epoch": 6.403545527800161, + "grad_norm": 2.2358083159651896, + "learning_rate": 9.606289977881899e-05, + "loss": 2.2157, + "step": 9938 + }, + { + "epoch": 6.4041901692183725, + "grad_norm": 1.7921330109018692, + "learning_rate": 9.606211571647305e-05, + "loss": 2.4794, + "step": 9939 + }, + { + "epoch": 6.404834810636584, + "grad_norm": 1.855939700822755, + "learning_rate": 9.606133157929709e-05, + "loss": 2.4116, + "step": 9940 + }, + { + "epoch": 6.405479452054794, + "grad_norm": 1.7938944470391396, + "learning_rate": 9.606054736729233e-05, + "loss": 2.3903, + "step": 9941 + }, + { + "epoch": 6.4061240934730055, + "grad_norm": 1.800374425017054, + "learning_rate": 9.605976308046009e-05, + "loss": 2.4258, + "step": 9942 + }, + { + "epoch": 6.406768734891217, + "grad_norm": 1.8129242271251036, + "learning_rate": 9.605897871880165e-05, + "loss": 2.1421, + "step": 9943 + }, + { + "epoch": 6.407413376309428, + "grad_norm": 1.648337825098486, + "learning_rate": 9.605819428231831e-05, + "loss": 2.481, + "step": 9944 + }, + { + "epoch": 6.408058017727639, + "grad_norm": 2.1060618026721882, + "learning_rate": 9.605740977101133e-05, + "loss": 2.6356, + "step": 9945 + }, + { + "epoch": 6.40870265914585, + "grad_norm": 1.8741871329190165, + "learning_rate": 9.605662518488203e-05, + "loss": 2.4027, + "step": 9946 + }, + { + "epoch": 6.409347300564061, + "grad_norm": 1.9974991405788853, + "learning_rate": 9.605584052393168e-05, + "loss": 2.5607, + "step": 9947 + }, + { + "epoch": 6.409991941982272, + "grad_norm": 1.9214071930324481, + "learning_rate": 9.605505578816158e-05, + "loss": 2.2451, + "step": 9948 + }, + { + "epoch": 6.410636583400484, + "grad_norm": 1.9117299035988253, + "learning_rate": 9.605427097757301e-05, + "loss": 2.4937, + "step": 9949 + }, + { + "epoch": 6.411281224818695, + "grad_norm": 1.887530239839766, + "learning_rate": 9.605348609216726e-05, + "loss": 2.4772, + "step": 9950 + }, + { + "epoch": 6.411925866236905, + "grad_norm": 1.6954119077513579, + "learning_rate": 9.605270113194564e-05, + "loss": 2.4626, + "step": 9951 + }, + { + "epoch": 6.412570507655117, + "grad_norm": 1.907019872632714, + "learning_rate": 9.605191609690941e-05, + "loss": 2.3921, + "step": 9952 + }, + { + "epoch": 6.413215149073328, + "grad_norm": 1.834029178041008, + "learning_rate": 9.605113098705988e-05, + "loss": 2.4397, + "step": 9953 + }, + { + "epoch": 6.413859790491539, + "grad_norm": 1.8290814880304365, + "learning_rate": 9.605034580239832e-05, + "loss": 2.2901, + "step": 9954 + }, + { + "epoch": 6.414504431909751, + "grad_norm": 2.078203704259606, + "learning_rate": 9.604956054292601e-05, + "loss": 2.6629, + "step": 9955 + }, + { + "epoch": 6.415149073327961, + "grad_norm": 2.0291996906244063, + "learning_rate": 9.604877520864429e-05, + "loss": 2.6199, + "step": 9956 + }, + { + "epoch": 6.415793714746172, + "grad_norm": 1.9746037572571014, + "learning_rate": 9.604798979955441e-05, + "loss": 2.5724, + "step": 9957 + }, + { + "epoch": 6.416438356164384, + "grad_norm": 2.33321722210117, + "learning_rate": 9.604720431565766e-05, + "loss": 2.3855, + "step": 9958 + }, + { + "epoch": 6.417082997582595, + "grad_norm": 1.584041164330518, + "learning_rate": 9.604641875695536e-05, + "loss": 2.4488, + "step": 9959 + }, + { + "epoch": 6.417727639000805, + "grad_norm": 1.950440931886656, + "learning_rate": 9.604563312344875e-05, + "loss": 2.3394, + "step": 9960 + }, + { + "epoch": 6.418372280419017, + "grad_norm": 1.6162666842267, + "learning_rate": 9.604484741513918e-05, + "loss": 2.5246, + "step": 9961 + }, + { + "epoch": 6.419016921837228, + "grad_norm": 1.8068576751327057, + "learning_rate": 9.604406163202788e-05, + "loss": 2.4926, + "step": 9962 + }, + { + "epoch": 6.419661563255439, + "grad_norm": 2.177781370856149, + "learning_rate": 9.604327577411619e-05, + "loss": 2.2979, + "step": 9963 + }, + { + "epoch": 6.4203062046736505, + "grad_norm": 2.428205974585656, + "learning_rate": 9.604248984140537e-05, + "loss": 2.2049, + "step": 9964 + }, + { + "epoch": 6.420950846091861, + "grad_norm": 1.884924643138595, + "learning_rate": 9.604170383389675e-05, + "loss": 2.2791, + "step": 9965 + }, + { + "epoch": 6.421595487510072, + "grad_norm": 2.300905927548337, + "learning_rate": 9.604091775159156e-05, + "loss": 2.2148, + "step": 9966 + }, + { + "epoch": 6.4222401289282836, + "grad_norm": 1.6418472933325134, + "learning_rate": 9.604013159449115e-05, + "loss": 2.4469, + "step": 9967 + }, + { + "epoch": 6.422884770346495, + "grad_norm": 2.0492089118560353, + "learning_rate": 9.603934536259675e-05, + "loss": 2.6199, + "step": 9968 + }, + { + "epoch": 6.423529411764706, + "grad_norm": 1.8735228694612833, + "learning_rate": 9.603855905590971e-05, + "loss": 2.3675, + "step": 9969 + }, + { + "epoch": 6.424174053182917, + "grad_norm": 1.943133128561061, + "learning_rate": 9.603777267443129e-05, + "loss": 2.3806, + "step": 9970 + }, + { + "epoch": 6.424818694601128, + "grad_norm": 1.7424376760242715, + "learning_rate": 9.60369862181628e-05, + "loss": 2.66, + "step": 9971 + }, + { + "epoch": 6.425463336019339, + "grad_norm": 1.8174331135807091, + "learning_rate": 9.603619968710551e-05, + "loss": 2.4916, + "step": 9972 + }, + { + "epoch": 6.4261079774375505, + "grad_norm": 1.7905749785728469, + "learning_rate": 9.603541308126072e-05, + "loss": 2.4665, + "step": 9973 + }, + { + "epoch": 6.426752618855762, + "grad_norm": 2.138041757612764, + "learning_rate": 9.603462640062974e-05, + "loss": 2.4055, + "step": 9974 + }, + { + "epoch": 6.427397260273972, + "grad_norm": 1.7892149980172225, + "learning_rate": 9.603383964521384e-05, + "loss": 2.6819, + "step": 9975 + }, + { + "epoch": 6.4280419016921835, + "grad_norm": 2.045454110858621, + "learning_rate": 9.603305281501432e-05, + "loss": 2.7255, + "step": 9976 + }, + { + "epoch": 6.428686543110395, + "grad_norm": 2.1495059197463657, + "learning_rate": 9.603226591003244e-05, + "loss": 2.6553, + "step": 9977 + }, + { + "epoch": 6.429331184528606, + "grad_norm": 1.958441634127081, + "learning_rate": 9.603147893026958e-05, + "loss": 2.4642, + "step": 9978 + }, + { + "epoch": 6.429975825946817, + "grad_norm": 2.2898739614546564, + "learning_rate": 9.603069187572694e-05, + "loss": 2.1428, + "step": 9979 + }, + { + "epoch": 6.430620467365028, + "grad_norm": 1.847179339621137, + "learning_rate": 9.602990474640586e-05, + "loss": 2.7798, + "step": 9980 + }, + { + "epoch": 6.431265108783239, + "grad_norm": 2.2739573433942724, + "learning_rate": 9.602911754230762e-05, + "loss": 2.4857, + "step": 9981 + }, + { + "epoch": 6.43190975020145, + "grad_norm": 1.9007088788254487, + "learning_rate": 9.602833026343351e-05, + "loss": 2.5083, + "step": 9982 + }, + { + "epoch": 6.432554391619662, + "grad_norm": 1.8353982485118128, + "learning_rate": 9.602754290978483e-05, + "loss": 2.4245, + "step": 9983 + }, + { + "epoch": 6.433199033037873, + "grad_norm": 2.0131005197648126, + "learning_rate": 9.602675548136287e-05, + "loss": 2.3509, + "step": 9984 + }, + { + "epoch": 6.433843674456083, + "grad_norm": 1.9144544875069953, + "learning_rate": 9.602596797816892e-05, + "loss": 2.5653, + "step": 9985 + }, + { + "epoch": 6.434488315874295, + "grad_norm": 2.1426137843815183, + "learning_rate": 9.602518040020427e-05, + "loss": 2.3853, + "step": 9986 + }, + { + "epoch": 6.435132957292506, + "grad_norm": 1.958676170133713, + "learning_rate": 9.602439274747022e-05, + "loss": 2.476, + "step": 9987 + }, + { + "epoch": 6.435777598710717, + "grad_norm": 2.0528716074528566, + "learning_rate": 9.60236050199681e-05, + "loss": 2.2307, + "step": 9988 + }, + { + "epoch": 6.436422240128929, + "grad_norm": 1.8347065679776728, + "learning_rate": 9.602281721769912e-05, + "loss": 2.495, + "step": 9989 + }, + { + "epoch": 6.437066881547139, + "grad_norm": 1.9086011262266478, + "learning_rate": 9.602202934066466e-05, + "loss": 2.4801, + "step": 9990 + }, + { + "epoch": 6.43771152296535, + "grad_norm": 2.0182361213675812, + "learning_rate": 9.602124138886596e-05, + "loss": 2.3862, + "step": 9991 + }, + { + "epoch": 6.438356164383562, + "grad_norm": 1.7768401631283857, + "learning_rate": 9.602045336230433e-05, + "loss": 2.4641, + "step": 9992 + }, + { + "epoch": 6.439000805801773, + "grad_norm": 1.908454280099832, + "learning_rate": 9.601966526098107e-05, + "loss": 2.6702, + "step": 9993 + }, + { + "epoch": 6.439645447219984, + "grad_norm": 1.9926149698793822, + "learning_rate": 9.601887708489746e-05, + "loss": 2.5471, + "step": 9994 + }, + { + "epoch": 6.440290088638195, + "grad_norm": 2.0033895213564366, + "learning_rate": 9.601808883405479e-05, + "loss": 2.3669, + "step": 9995 + }, + { + "epoch": 6.440934730056406, + "grad_norm": 1.9202770685780521, + "learning_rate": 9.601730050845438e-05, + "loss": 2.3147, + "step": 9996 + }, + { + "epoch": 6.441579371474617, + "grad_norm": 1.9563462962361908, + "learning_rate": 9.601651210809753e-05, + "loss": 2.3544, + "step": 9997 + }, + { + "epoch": 6.4422240128928285, + "grad_norm": 1.80860490297048, + "learning_rate": 9.601572363298549e-05, + "loss": 2.6115, + "step": 9998 + }, + { + "epoch": 6.44286865431104, + "grad_norm": 1.9649993388706128, + "learning_rate": 9.60149350831196e-05, + "loss": 2.4511, + "step": 9999 + }, + { + "epoch": 6.44351329572925, + "grad_norm": 1.8703505330235723, + "learning_rate": 9.601414645850114e-05, + "loss": 2.2736, + "step": 10000 + }, + { + "epoch": 6.44351329572925, + "eval_loss": 4.9998321533203125, + "eval_runtime": 2.9744, + "eval_samples_per_second": 33.62, + "eval_steps_per_second": 4.371, + "step": 10000 + }, + { + "epoch": 6.4441579371474615, + "grad_norm": 1.9664438770610524, + "learning_rate": 9.601335775913138e-05, + "loss": 2.5882, + "step": 10001 + }, + { + "epoch": 6.444802578565673, + "grad_norm": 1.646145924539832, + "learning_rate": 9.601256898501166e-05, + "loss": 2.4091, + "step": 10002 + }, + { + "epoch": 6.445447219983884, + "grad_norm": 1.77726841280037, + "learning_rate": 9.601178013614324e-05, + "loss": 2.2261, + "step": 10003 + }, + { + "epoch": 6.4460918614020954, + "grad_norm": 1.6950070643356674, + "learning_rate": 9.601099121252743e-05, + "loss": 2.386, + "step": 10004 + }, + { + "epoch": 6.446736502820306, + "grad_norm": 1.7573943068259879, + "learning_rate": 9.601020221416553e-05, + "loss": 2.3876, + "step": 10005 + }, + { + "epoch": 6.447381144238517, + "grad_norm": 1.673947600022991, + "learning_rate": 9.600941314105882e-05, + "loss": 2.3865, + "step": 10006 + }, + { + "epoch": 6.4480257856567285, + "grad_norm": 1.788882768772208, + "learning_rate": 9.600862399320862e-05, + "loss": 2.3624, + "step": 10007 + }, + { + "epoch": 6.44867042707494, + "grad_norm": 1.913583857967614, + "learning_rate": 9.600783477061622e-05, + "loss": 2.2791, + "step": 10008 + }, + { + "epoch": 6.449315068493151, + "grad_norm": 1.6466928524872173, + "learning_rate": 9.600704547328287e-05, + "loss": 2.475, + "step": 10009 + }, + { + "epoch": 6.4499597099113615, + "grad_norm": 2.115097674882928, + "learning_rate": 9.600625610120994e-05, + "loss": 2.6478, + "step": 10010 + }, + { + "epoch": 6.450604351329573, + "grad_norm": 1.71661015774633, + "learning_rate": 9.600546665439869e-05, + "loss": 2.5401, + "step": 10011 + }, + { + "epoch": 6.451248992747784, + "grad_norm": 1.9248616659597155, + "learning_rate": 9.600467713285041e-05, + "loss": 2.4382, + "step": 10012 + }, + { + "epoch": 6.451893634165995, + "grad_norm": 1.741899226601797, + "learning_rate": 9.60038875365664e-05, + "loss": 2.2033, + "step": 10013 + }, + { + "epoch": 6.452538275584207, + "grad_norm": 1.7429731182252965, + "learning_rate": 9.600309786554796e-05, + "loss": 2.6097, + "step": 10014 + }, + { + "epoch": 6.453182917002417, + "grad_norm": 1.902233984266677, + "learning_rate": 9.60023081197964e-05, + "loss": 2.666, + "step": 10015 + }, + { + "epoch": 6.453827558420628, + "grad_norm": 1.9802238222254005, + "learning_rate": 9.600151829931299e-05, + "loss": 2.1041, + "step": 10016 + }, + { + "epoch": 6.45447219983884, + "grad_norm": 1.9125532536127896, + "learning_rate": 9.600072840409906e-05, + "loss": 2.555, + "step": 10017 + }, + { + "epoch": 6.455116841257051, + "grad_norm": 2.180406054389646, + "learning_rate": 9.599993843415588e-05, + "loss": 2.5954, + "step": 10018 + }, + { + "epoch": 6.455761482675262, + "grad_norm": 1.8794625711492745, + "learning_rate": 9.599914838948475e-05, + "loss": 2.5954, + "step": 10019 + }, + { + "epoch": 6.456406124093473, + "grad_norm": 1.8278143651808152, + "learning_rate": 9.599835827008699e-05, + "loss": 2.7086, + "step": 10020 + }, + { + "epoch": 6.457050765511684, + "grad_norm": 1.6499571514191134, + "learning_rate": 9.599756807596387e-05, + "loss": 2.1959, + "step": 10021 + }, + { + "epoch": 6.457695406929895, + "grad_norm": 2.03053437167555, + "learning_rate": 9.599677780711671e-05, + "loss": 2.6613, + "step": 10022 + }, + { + "epoch": 6.458340048348107, + "grad_norm": 1.7279841326115595, + "learning_rate": 9.59959874635468e-05, + "loss": 2.4461, + "step": 10023 + }, + { + "epoch": 6.458984689766318, + "grad_norm": 2.0862395233755464, + "learning_rate": 9.599519704525542e-05, + "loss": 2.4739, + "step": 10024 + }, + { + "epoch": 6.459629331184528, + "grad_norm": 1.5982406611221305, + "learning_rate": 9.59944065522439e-05, + "loss": 2.176, + "step": 10025 + }, + { + "epoch": 6.46027397260274, + "grad_norm": 1.9653654184470934, + "learning_rate": 9.59936159845135e-05, + "loss": 2.6589, + "step": 10026 + }, + { + "epoch": 6.460918614020951, + "grad_norm": 1.8121903992934785, + "learning_rate": 9.599282534206558e-05, + "loss": 2.3427, + "step": 10027 + }, + { + "epoch": 6.461563255439162, + "grad_norm": 1.7219582738716337, + "learning_rate": 9.599203462490136e-05, + "loss": 2.1912, + "step": 10028 + }, + { + "epoch": 6.4622078968573735, + "grad_norm": 1.5580485897623269, + "learning_rate": 9.599124383302221e-05, + "loss": 2.2887, + "step": 10029 + }, + { + "epoch": 6.462852538275584, + "grad_norm": 1.8552570455520248, + "learning_rate": 9.599045296642937e-05, + "loss": 2.6492, + "step": 10030 + }, + { + "epoch": 6.463497179693795, + "grad_norm": 1.9772541468873654, + "learning_rate": 9.598966202512419e-05, + "loss": 2.3335, + "step": 10031 + }, + { + "epoch": 6.4641418211120065, + "grad_norm": 1.763766308626098, + "learning_rate": 9.598887100910793e-05, + "loss": 2.5856, + "step": 10032 + }, + { + "epoch": 6.464786462530218, + "grad_norm": 2.006553274145058, + "learning_rate": 9.598807991838192e-05, + "loss": 2.2399, + "step": 10033 + }, + { + "epoch": 6.465431103948429, + "grad_norm": 1.5666268016056124, + "learning_rate": 9.598728875294742e-05, + "loss": 2.5137, + "step": 10034 + }, + { + "epoch": 6.4660757453666395, + "grad_norm": 1.814485242033149, + "learning_rate": 9.598649751280577e-05, + "loss": 2.1521, + "step": 10035 + }, + { + "epoch": 6.466720386784851, + "grad_norm": 1.8271525943963132, + "learning_rate": 9.598570619795825e-05, + "loss": 2.3486, + "step": 10036 + }, + { + "epoch": 6.467365028203062, + "grad_norm": 3.252070591692132, + "learning_rate": 9.598491480840616e-05, + "loss": 2.5509, + "step": 10037 + }, + { + "epoch": 6.468009669621273, + "grad_norm": 1.8008055275484165, + "learning_rate": 9.59841233441508e-05, + "loss": 2.3698, + "step": 10038 + }, + { + "epoch": 6.468654311039485, + "grad_norm": 1.9506586141576476, + "learning_rate": 9.598333180519346e-05, + "loss": 2.5659, + "step": 10039 + }, + { + "epoch": 6.469298952457695, + "grad_norm": 1.7721751908033208, + "learning_rate": 9.598254019153547e-05, + "loss": 2.5052, + "step": 10040 + }, + { + "epoch": 6.4699435938759065, + "grad_norm": 1.9758347601712196, + "learning_rate": 9.59817485031781e-05, + "loss": 2.2307, + "step": 10041 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 5.710023504239762, + "learning_rate": 9.598095674012267e-05, + "loss": 2.398, + "step": 10042 + }, + { + "epoch": 6.471232876712329, + "grad_norm": 1.7765057850472243, + "learning_rate": 9.598016490237047e-05, + "loss": 2.5292, + "step": 10043 + }, + { + "epoch": 6.47187751813054, + "grad_norm": 1.7060206148888244, + "learning_rate": 9.597937298992281e-05, + "loss": 2.4603, + "step": 10044 + }, + { + "epoch": 6.472522159548751, + "grad_norm": 1.7181383222196445, + "learning_rate": 9.597858100278097e-05, + "loss": 2.464, + "step": 10045 + }, + { + "epoch": 6.473166800966962, + "grad_norm": 1.608484630628071, + "learning_rate": 9.597778894094627e-05, + "loss": 2.5433, + "step": 10046 + }, + { + "epoch": 6.473811442385173, + "grad_norm": 1.6720366070400445, + "learning_rate": 9.597699680442e-05, + "loss": 2.6257, + "step": 10047 + }, + { + "epoch": 6.474456083803385, + "grad_norm": 1.9133913833866754, + "learning_rate": 9.597620459320348e-05, + "loss": 2.3485, + "step": 10048 + }, + { + "epoch": 6.475100725221595, + "grad_norm": 1.6404428857515398, + "learning_rate": 9.597541230729798e-05, + "loss": 2.5958, + "step": 10049 + }, + { + "epoch": 6.475745366639806, + "grad_norm": 1.750214661097653, + "learning_rate": 9.597461994670484e-05, + "loss": 2.5035, + "step": 10050 + }, + { + "epoch": 6.476390008058018, + "grad_norm": 1.6896377863439953, + "learning_rate": 9.597382751142532e-05, + "loss": 2.6035, + "step": 10051 + }, + { + "epoch": 6.477034649476229, + "grad_norm": 1.8333953849234985, + "learning_rate": 9.597303500146074e-05, + "loss": 2.544, + "step": 10052 + }, + { + "epoch": 6.47767929089444, + "grad_norm": 1.831346392795613, + "learning_rate": 9.597224241681242e-05, + "loss": 2.5278, + "step": 10053 + }, + { + "epoch": 6.478323932312651, + "grad_norm": 1.8402323457962213, + "learning_rate": 9.597144975748163e-05, + "loss": 2.2172, + "step": 10054 + }, + { + "epoch": 6.478968573730862, + "grad_norm": 1.9164386423504085, + "learning_rate": 9.597065702346969e-05, + "loss": 2.469, + "step": 10055 + }, + { + "epoch": 6.479613215149073, + "grad_norm": 1.870903590592564, + "learning_rate": 9.59698642147779e-05, + "loss": 2.555, + "step": 10056 + }, + { + "epoch": 6.480257856567285, + "grad_norm": 2.0331697769991393, + "learning_rate": 9.596907133140757e-05, + "loss": 2.4461, + "step": 10057 + }, + { + "epoch": 6.480902497985496, + "grad_norm": 1.6093793549919502, + "learning_rate": 9.596827837335998e-05, + "loss": 2.4808, + "step": 10058 + }, + { + "epoch": 6.481547139403706, + "grad_norm": 1.8638495015149346, + "learning_rate": 9.596748534063643e-05, + "loss": 2.1552, + "step": 10059 + }, + { + "epoch": 6.482191780821918, + "grad_norm": 1.719009571703793, + "learning_rate": 9.596669223323827e-05, + "loss": 2.4407, + "step": 10060 + }, + { + "epoch": 6.482836422240129, + "grad_norm": 1.9050913944684293, + "learning_rate": 9.596589905116675e-05, + "loss": 2.3365, + "step": 10061 + }, + { + "epoch": 6.48348106365834, + "grad_norm": 1.9717060064976693, + "learning_rate": 9.59651057944232e-05, + "loss": 2.4496, + "step": 10062 + }, + { + "epoch": 6.4841257050765515, + "grad_norm": 1.9396263751443632, + "learning_rate": 9.59643124630089e-05, + "loss": 2.5437, + "step": 10063 + }, + { + "epoch": 6.484770346494762, + "grad_norm": 2.094482518670553, + "learning_rate": 9.596351905692518e-05, + "loss": 2.4977, + "step": 10064 + }, + { + "epoch": 6.485414987912973, + "grad_norm": 1.7788372510890917, + "learning_rate": 9.596272557617334e-05, + "loss": 2.4648, + "step": 10065 + }, + { + "epoch": 6.4860596293311845, + "grad_norm": 2.301830277225739, + "learning_rate": 9.596193202075467e-05, + "loss": 2.3896, + "step": 10066 + }, + { + "epoch": 6.486704270749396, + "grad_norm": 1.9346362534450832, + "learning_rate": 9.596113839067046e-05, + "loss": 2.5683, + "step": 10067 + }, + { + "epoch": 6.487348912167607, + "grad_norm": 2.345536047771939, + "learning_rate": 9.596034468592207e-05, + "loss": 2.5041, + "step": 10068 + }, + { + "epoch": 6.4879935535858175, + "grad_norm": 2.2148345065584145, + "learning_rate": 9.595955090651074e-05, + "loss": 2.5173, + "step": 10069 + }, + { + "epoch": 6.488638195004029, + "grad_norm": 1.9482597438757, + "learning_rate": 9.595875705243779e-05, + "loss": 2.3361, + "step": 10070 + }, + { + "epoch": 6.48928283642224, + "grad_norm": 1.6670005002686001, + "learning_rate": 9.595796312370456e-05, + "loss": 2.6222, + "step": 10071 + }, + { + "epoch": 6.489927477840451, + "grad_norm": 2.151780256310304, + "learning_rate": 9.59571691203123e-05, + "loss": 2.7791, + "step": 10072 + }, + { + "epoch": 6.490572119258663, + "grad_norm": 2.0461414663769997, + "learning_rate": 9.595637504226234e-05, + "loss": 2.5274, + "step": 10073 + }, + { + "epoch": 6.491216760676873, + "grad_norm": 1.932386269940792, + "learning_rate": 9.595558088955599e-05, + "loss": 2.6334, + "step": 10074 + }, + { + "epoch": 6.491861402095084, + "grad_norm": 1.6516725427331445, + "learning_rate": 9.595478666219456e-05, + "loss": 2.4781, + "step": 10075 + }, + { + "epoch": 6.492506043513296, + "grad_norm": 1.8299301353961115, + "learning_rate": 9.595399236017935e-05, + "loss": 2.6843, + "step": 10076 + }, + { + "epoch": 6.493150684931507, + "grad_norm": 1.7040160061584346, + "learning_rate": 9.595319798351165e-05, + "loss": 2.5144, + "step": 10077 + }, + { + "epoch": 6.493795326349718, + "grad_norm": 1.6633886572944854, + "learning_rate": 9.595240353219275e-05, + "loss": 2.9683, + "step": 10078 + }, + { + "epoch": 6.494439967767929, + "grad_norm": 1.5831999067654954, + "learning_rate": 9.595160900622401e-05, + "loss": 2.3598, + "step": 10079 + }, + { + "epoch": 6.49508460918614, + "grad_norm": 1.8902773395933312, + "learning_rate": 9.595081440560669e-05, + "loss": 2.4858, + "step": 10080 + }, + { + "epoch": 6.495729250604351, + "grad_norm": 1.759404989440991, + "learning_rate": 9.595001973034209e-05, + "loss": 2.6027, + "step": 10081 + }, + { + "epoch": 6.496373892022563, + "grad_norm": 1.8007733703050883, + "learning_rate": 9.594922498043154e-05, + "loss": 2.6501, + "step": 10082 + }, + { + "epoch": 6.497018533440774, + "grad_norm": 1.7336772023785791, + "learning_rate": 9.594843015587636e-05, + "loss": 2.5896, + "step": 10083 + }, + { + "epoch": 6.497663174858984, + "grad_norm": 1.8405440834832452, + "learning_rate": 9.594763525667781e-05, + "loss": 2.6441, + "step": 10084 + }, + { + "epoch": 6.498307816277196, + "grad_norm": 1.7279133405199905, + "learning_rate": 9.594684028283723e-05, + "loss": 2.1959, + "step": 10085 + }, + { + "epoch": 6.498952457695407, + "grad_norm": 1.7602620968638683, + "learning_rate": 9.594604523435591e-05, + "loss": 2.4532, + "step": 10086 + }, + { + "epoch": 6.499597099113618, + "grad_norm": 1.945354442524771, + "learning_rate": 9.594525011123515e-05, + "loss": 2.3762, + "step": 10087 + }, + { + "epoch": 6.500241740531829, + "grad_norm": 1.831860288372633, + "learning_rate": 9.594445491347629e-05, + "loss": 2.485, + "step": 10088 + }, + { + "epoch": 6.50088638195004, + "grad_norm": 2.0365504564555543, + "learning_rate": 9.594365964108061e-05, + "loss": 2.4789, + "step": 10089 + }, + { + "epoch": 6.501531023368251, + "grad_norm": 1.8119927938813944, + "learning_rate": 9.59428642940494e-05, + "loss": 2.696, + "step": 10090 + }, + { + "epoch": 6.502175664786463, + "grad_norm": 2.0735149483950415, + "learning_rate": 9.594206887238399e-05, + "loss": 2.2543, + "step": 10091 + }, + { + "epoch": 6.502820306204674, + "grad_norm": 1.662754386792458, + "learning_rate": 9.594127337608568e-05, + "loss": 2.7849, + "step": 10092 + }, + { + "epoch": 6.503464947622884, + "grad_norm": 1.8190821165930664, + "learning_rate": 9.594047780515579e-05, + "loss": 2.3213, + "step": 10093 + }, + { + "epoch": 6.504109589041096, + "grad_norm": 1.6747166814607004, + "learning_rate": 9.593968215959559e-05, + "loss": 2.4921, + "step": 10094 + }, + { + "epoch": 6.504754230459307, + "grad_norm": 2.1020924715352938, + "learning_rate": 9.593888643940643e-05, + "loss": 2.7532, + "step": 10095 + }, + { + "epoch": 6.505398871877518, + "grad_norm": 1.9319284472785254, + "learning_rate": 9.59380906445896e-05, + "loss": 2.2452, + "step": 10096 + }, + { + "epoch": 6.5060435132957295, + "grad_norm": 1.6468448848682722, + "learning_rate": 9.593729477514638e-05, + "loss": 2.3141, + "step": 10097 + }, + { + "epoch": 6.50668815471394, + "grad_norm": 1.9634773824315659, + "learning_rate": 9.593649883107814e-05, + "loss": 2.5287, + "step": 10098 + }, + { + "epoch": 6.507332796132151, + "grad_norm": 1.834188794152246, + "learning_rate": 9.593570281238612e-05, + "loss": 2.4831, + "step": 10099 + }, + { + "epoch": 6.5079774375503625, + "grad_norm": 1.902446640539468, + "learning_rate": 9.593490671907166e-05, + "loss": 2.5456, + "step": 10100 + }, + { + "epoch": 6.5079774375503625, + "eval_loss": 4.959571838378906, + "eval_runtime": 2.9854, + "eval_samples_per_second": 33.496, + "eval_steps_per_second": 4.354, + "step": 10100 + }, + { + "epoch": 6.508622078968574, + "grad_norm": 1.916998287091647, + "learning_rate": 9.593411055113606e-05, + "loss": 2.5878, + "step": 10101 + }, + { + "epoch": 6.509266720386785, + "grad_norm": 1.720894699413659, + "learning_rate": 9.593331430858063e-05, + "loss": 2.4971, + "step": 10102 + }, + { + "epoch": 6.5099113618049955, + "grad_norm": 1.9959013374077417, + "learning_rate": 9.59325179914067e-05, + "loss": 2.4153, + "step": 10103 + }, + { + "epoch": 6.510556003223207, + "grad_norm": 1.7091133433777925, + "learning_rate": 9.593172159961554e-05, + "loss": 2.42, + "step": 10104 + }, + { + "epoch": 6.511200644641418, + "grad_norm": 2.1322344784975753, + "learning_rate": 9.593092513320848e-05, + "loss": 2.5444, + "step": 10105 + }, + { + "epoch": 6.511845286059629, + "grad_norm": 2.0839029501650517, + "learning_rate": 9.593012859218682e-05, + "loss": 2.3364, + "step": 10106 + }, + { + "epoch": 6.512489927477841, + "grad_norm": 1.8852923794801648, + "learning_rate": 9.592933197655188e-05, + "loss": 2.5105, + "step": 10107 + }, + { + "epoch": 6.513134568896051, + "grad_norm": 1.900111982689415, + "learning_rate": 9.592853528630494e-05, + "loss": 2.3712, + "step": 10108 + }, + { + "epoch": 6.513779210314262, + "grad_norm": 1.7816196826369868, + "learning_rate": 9.592773852144736e-05, + "loss": 2.5751, + "step": 10109 + }, + { + "epoch": 6.514423851732474, + "grad_norm": 1.9575339675230448, + "learning_rate": 9.592694168198039e-05, + "loss": 2.1013, + "step": 10110 + }, + { + "epoch": 6.515068493150685, + "grad_norm": 1.7585424865460115, + "learning_rate": 9.592614476790536e-05, + "loss": 2.1339, + "step": 10111 + }, + { + "epoch": 6.515713134568896, + "grad_norm": 1.7107798898512063, + "learning_rate": 9.59253477792236e-05, + "loss": 2.4702, + "step": 10112 + }, + { + "epoch": 6.516357775987107, + "grad_norm": 1.7201678617567242, + "learning_rate": 9.592455071593641e-05, + "loss": 2.385, + "step": 10113 + }, + { + "epoch": 6.517002417405318, + "grad_norm": 1.9164804442755077, + "learning_rate": 9.592375357804507e-05, + "loss": 2.3904, + "step": 10114 + }, + { + "epoch": 6.517647058823529, + "grad_norm": 1.8899490344575698, + "learning_rate": 9.592295636555091e-05, + "loss": 2.1845, + "step": 10115 + }, + { + "epoch": 6.518291700241741, + "grad_norm": 1.68580218672442, + "learning_rate": 9.592215907845526e-05, + "loss": 2.4832, + "step": 10116 + }, + { + "epoch": 6.518936341659952, + "grad_norm": 1.6892831374186603, + "learning_rate": 9.59213617167594e-05, + "loss": 2.5435, + "step": 10117 + }, + { + "epoch": 6.519580983078162, + "grad_norm": 2.077738887285037, + "learning_rate": 9.592056428046467e-05, + "loss": 2.306, + "step": 10118 + }, + { + "epoch": 6.520225624496374, + "grad_norm": 1.846486039077226, + "learning_rate": 9.591976676957234e-05, + "loss": 2.5663, + "step": 10119 + }, + { + "epoch": 6.520870265914585, + "grad_norm": 2.0123407047113226, + "learning_rate": 9.591896918408374e-05, + "loss": 2.2926, + "step": 10120 + }, + { + "epoch": 6.521514907332796, + "grad_norm": 2.0687614662732376, + "learning_rate": 9.591817152400019e-05, + "loss": 2.451, + "step": 10121 + }, + { + "epoch": 6.522159548751008, + "grad_norm": 1.8970433273905207, + "learning_rate": 9.591737378932299e-05, + "loss": 2.3922, + "step": 10122 + }, + { + "epoch": 6.522804190169218, + "grad_norm": 2.523452929975214, + "learning_rate": 9.591657598005343e-05, + "loss": 2.5, + "step": 10123 + }, + { + "epoch": 6.523448831587429, + "grad_norm": 1.617095160741778, + "learning_rate": 9.591577809619284e-05, + "loss": 2.2979, + "step": 10124 + }, + { + "epoch": 6.524093473005641, + "grad_norm": 2.0230397694236335, + "learning_rate": 9.591498013774254e-05, + "loss": 2.5299, + "step": 10125 + }, + { + "epoch": 6.524738114423852, + "grad_norm": 1.7292228998519028, + "learning_rate": 9.591418210470384e-05, + "loss": 2.5995, + "step": 10126 + }, + { + "epoch": 6.525382755842063, + "grad_norm": 2.0211312172536817, + "learning_rate": 9.5913383997078e-05, + "loss": 2.5425, + "step": 10127 + }, + { + "epoch": 6.526027397260274, + "grad_norm": 1.6205996772068978, + "learning_rate": 9.591258581486642e-05, + "loss": 2.5072, + "step": 10128 + }, + { + "epoch": 6.526672038678485, + "grad_norm": 2.039428680886686, + "learning_rate": 9.591178755807035e-05, + "loss": 2.6494, + "step": 10129 + }, + { + "epoch": 6.527316680096696, + "grad_norm": 1.771538286195642, + "learning_rate": 9.59109892266911e-05, + "loss": 2.4496, + "step": 10130 + }, + { + "epoch": 6.5279613215149075, + "grad_norm": 1.8952582812360383, + "learning_rate": 9.591019082073002e-05, + "loss": 2.7011, + "step": 10131 + }, + { + "epoch": 6.528605962933119, + "grad_norm": 1.5406786517432405, + "learning_rate": 9.590939234018839e-05, + "loss": 2.3466, + "step": 10132 + }, + { + "epoch": 6.529250604351329, + "grad_norm": 1.7964783221542493, + "learning_rate": 9.590859378506752e-05, + "loss": 2.4401, + "step": 10133 + }, + { + "epoch": 6.5298952457695405, + "grad_norm": 1.7412836904517337, + "learning_rate": 9.590779515536872e-05, + "loss": 2.3498, + "step": 10134 + }, + { + "epoch": 6.530539887187752, + "grad_norm": 1.6536134618570422, + "learning_rate": 9.590699645109332e-05, + "loss": 2.4438, + "step": 10135 + }, + { + "epoch": 6.531184528605963, + "grad_norm": 1.881104220279974, + "learning_rate": 9.590619767224261e-05, + "loss": 2.4035, + "step": 10136 + }, + { + "epoch": 6.531829170024174, + "grad_norm": 1.6841359325249572, + "learning_rate": 9.590539881881793e-05, + "loss": 2.6859, + "step": 10137 + }, + { + "epoch": 6.532473811442385, + "grad_norm": 1.7720107403305319, + "learning_rate": 9.590459989082058e-05, + "loss": 2.6202, + "step": 10138 + }, + { + "epoch": 6.533118452860596, + "grad_norm": 1.8860202413859495, + "learning_rate": 9.590380088825186e-05, + "loss": 2.2681, + "step": 10139 + }, + { + "epoch": 6.533763094278807, + "grad_norm": 1.5797415339354262, + "learning_rate": 9.59030018111131e-05, + "loss": 2.4074, + "step": 10140 + }, + { + "epoch": 6.534407735697019, + "grad_norm": 1.96687254700298, + "learning_rate": 9.590220265940559e-05, + "loss": 2.3251, + "step": 10141 + }, + { + "epoch": 6.53505237711523, + "grad_norm": 1.681681384285699, + "learning_rate": 9.590140343313068e-05, + "loss": 2.3828, + "step": 10142 + }, + { + "epoch": 6.53569701853344, + "grad_norm": 2.0668102404991466, + "learning_rate": 9.590060413228963e-05, + "loss": 2.6451, + "step": 10143 + }, + { + "epoch": 6.536341659951652, + "grad_norm": 1.730565301148513, + "learning_rate": 9.589980475688381e-05, + "loss": 2.6373, + "step": 10144 + }, + { + "epoch": 6.536986301369863, + "grad_norm": 1.7654985078280518, + "learning_rate": 9.589900530691449e-05, + "loss": 2.4272, + "step": 10145 + }, + { + "epoch": 6.537630942788074, + "grad_norm": 1.901029248002888, + "learning_rate": 9.589820578238299e-05, + "loss": 2.5932, + "step": 10146 + }, + { + "epoch": 6.538275584206286, + "grad_norm": 1.788030103199712, + "learning_rate": 9.589740618329062e-05, + "loss": 2.5729, + "step": 10147 + }, + { + "epoch": 6.538920225624496, + "grad_norm": 1.7538930548862317, + "learning_rate": 9.589660650963873e-05, + "loss": 2.3369, + "step": 10148 + }, + { + "epoch": 6.539564867042707, + "grad_norm": 1.7664339228951955, + "learning_rate": 9.589580676142858e-05, + "loss": 2.5897, + "step": 10149 + }, + { + "epoch": 6.540209508460919, + "grad_norm": 1.8617518487396503, + "learning_rate": 9.589500693866155e-05, + "loss": 2.6622, + "step": 10150 + }, + { + "epoch": 6.54085414987913, + "grad_norm": 1.9583852853277632, + "learning_rate": 9.589420704133888e-05, + "loss": 2.3807, + "step": 10151 + }, + { + "epoch": 6.541498791297341, + "grad_norm": 1.67276316049612, + "learning_rate": 9.589340706946194e-05, + "loss": 2.4371, + "step": 10152 + }, + { + "epoch": 6.542143432715552, + "grad_norm": 1.7786835714845874, + "learning_rate": 9.589260702303201e-05, + "loss": 2.3749, + "step": 10153 + }, + { + "epoch": 6.542788074133763, + "grad_norm": 1.7950940579459838, + "learning_rate": 9.58918069020504e-05, + "loss": 2.5549, + "step": 10154 + }, + { + "epoch": 6.543432715551974, + "grad_norm": 1.5969900127728036, + "learning_rate": 9.589100670651846e-05, + "loss": 2.6594, + "step": 10155 + }, + { + "epoch": 6.5440773569701856, + "grad_norm": 1.7642765814070307, + "learning_rate": 9.589020643643749e-05, + "loss": 2.4376, + "step": 10156 + }, + { + "epoch": 6.544721998388397, + "grad_norm": 1.6695233664741207, + "learning_rate": 9.588940609180877e-05, + "loss": 2.5578, + "step": 10157 + }, + { + "epoch": 6.545366639806607, + "grad_norm": 1.7106499048565589, + "learning_rate": 9.588860567263368e-05, + "loss": 2.2902, + "step": 10158 + }, + { + "epoch": 6.546011281224819, + "grad_norm": 1.6947808510901934, + "learning_rate": 9.588780517891346e-05, + "loss": 2.2957, + "step": 10159 + }, + { + "epoch": 6.54665592264303, + "grad_norm": 1.682125166574984, + "learning_rate": 9.58870046106495e-05, + "loss": 2.4084, + "step": 10160 + }, + { + "epoch": 6.547300564061241, + "grad_norm": 1.7462272239095615, + "learning_rate": 9.588620396784306e-05, + "loss": 2.6435, + "step": 10161 + }, + { + "epoch": 6.5479452054794525, + "grad_norm": 1.7802603562649724, + "learning_rate": 9.588540325049546e-05, + "loss": 2.5874, + "step": 10162 + }, + { + "epoch": 6.548589846897663, + "grad_norm": 1.6596507867507535, + "learning_rate": 9.588460245860804e-05, + "loss": 2.2718, + "step": 10163 + }, + { + "epoch": 6.549234488315874, + "grad_norm": 1.6732245494646787, + "learning_rate": 9.58838015921821e-05, + "loss": 2.3621, + "step": 10164 + }, + { + "epoch": 6.5498791297340855, + "grad_norm": 1.7132537733963735, + "learning_rate": 9.588300065121895e-05, + "loss": 2.5679, + "step": 10165 + }, + { + "epoch": 6.550523771152297, + "grad_norm": 1.7841884281954763, + "learning_rate": 9.588219963571993e-05, + "loss": 2.7024, + "step": 10166 + }, + { + "epoch": 6.551168412570508, + "grad_norm": 2.1534296735142595, + "learning_rate": 9.588139854568632e-05, + "loss": 2.2242, + "step": 10167 + }, + { + "epoch": 6.5518130539887185, + "grad_norm": 1.8743436165499456, + "learning_rate": 9.588059738111945e-05, + "loss": 2.5215, + "step": 10168 + }, + { + "epoch": 6.55245769540693, + "grad_norm": 1.7820036825945025, + "learning_rate": 9.587979614202066e-05, + "loss": 2.398, + "step": 10169 + }, + { + "epoch": 6.553102336825141, + "grad_norm": 1.899567473104815, + "learning_rate": 9.587899482839125e-05, + "loss": 2.127, + "step": 10170 + }, + { + "epoch": 6.553746978243352, + "grad_norm": 1.5428335216622364, + "learning_rate": 9.58781934402325e-05, + "loss": 2.4225, + "step": 10171 + }, + { + "epoch": 6.554391619661564, + "grad_norm": 1.75178618186129, + "learning_rate": 9.587739197754579e-05, + "loss": 2.3369, + "step": 10172 + }, + { + "epoch": 6.555036261079774, + "grad_norm": 1.738700893578496, + "learning_rate": 9.58765904403324e-05, + "loss": 2.3557, + "step": 10173 + }, + { + "epoch": 6.555680902497985, + "grad_norm": 1.7117630414774672, + "learning_rate": 9.587578882859363e-05, + "loss": 2.4361, + "step": 10174 + }, + { + "epoch": 6.556325543916197, + "grad_norm": 1.8963228178525515, + "learning_rate": 9.587498714233083e-05, + "loss": 2.5253, + "step": 10175 + }, + { + "epoch": 6.556970185334408, + "grad_norm": 1.7398306405844306, + "learning_rate": 9.58741853815453e-05, + "loss": 2.4257, + "step": 10176 + }, + { + "epoch": 6.557614826752619, + "grad_norm": 1.7670589737136, + "learning_rate": 9.587338354623836e-05, + "loss": 2.7352, + "step": 10177 + }, + { + "epoch": 6.55825946817083, + "grad_norm": 1.6869302393201882, + "learning_rate": 9.587258163641134e-05, + "loss": 2.7027, + "step": 10178 + }, + { + "epoch": 6.558904109589041, + "grad_norm": 1.792515976667623, + "learning_rate": 9.587177965206555e-05, + "loss": 2.5493, + "step": 10179 + }, + { + "epoch": 6.559548751007252, + "grad_norm": 1.5546290979306132, + "learning_rate": 9.587097759320229e-05, + "loss": 2.5972, + "step": 10180 + }, + { + "epoch": 6.560193392425464, + "grad_norm": 1.8057774490134524, + "learning_rate": 9.587017545982287e-05, + "loss": 2.6479, + "step": 10181 + }, + { + "epoch": 6.560838033843675, + "grad_norm": 1.6188886010205108, + "learning_rate": 9.586937325192866e-05, + "loss": 2.5446, + "step": 10182 + }, + { + "epoch": 6.561482675261885, + "grad_norm": 1.800960702202529, + "learning_rate": 9.586857096952091e-05, + "loss": 2.5555, + "step": 10183 + }, + { + "epoch": 6.562127316680097, + "grad_norm": 1.9438368675860898, + "learning_rate": 9.586776861260101e-05, + "loss": 2.3154, + "step": 10184 + }, + { + "epoch": 6.562771958098308, + "grad_norm": 1.7406019569189042, + "learning_rate": 9.58669661811702e-05, + "loss": 2.5803, + "step": 10185 + }, + { + "epoch": 6.563416599516519, + "grad_norm": 1.7660983875430212, + "learning_rate": 9.586616367522988e-05, + "loss": 2.658, + "step": 10186 + }, + { + "epoch": 6.5640612409347305, + "grad_norm": 1.7692934102580793, + "learning_rate": 9.58653610947813e-05, + "loss": 2.492, + "step": 10187 + }, + { + "epoch": 6.564705882352941, + "grad_norm": 1.5582834691629839, + "learning_rate": 9.58645584398258e-05, + "loss": 2.5668, + "step": 10188 + }, + { + "epoch": 6.565350523771152, + "grad_norm": 1.7252195329267606, + "learning_rate": 9.586375571036473e-05, + "loss": 2.7055, + "step": 10189 + }, + { + "epoch": 6.5659951651893635, + "grad_norm": 1.7119766928243996, + "learning_rate": 9.586295290639935e-05, + "loss": 2.711, + "step": 10190 + }, + { + "epoch": 6.566639806607575, + "grad_norm": 1.6548961441743253, + "learning_rate": 9.586215002793102e-05, + "loss": 2.8385, + "step": 10191 + }, + { + "epoch": 6.567284448025786, + "grad_norm": 1.7269828604886235, + "learning_rate": 9.586134707496105e-05, + "loss": 2.5125, + "step": 10192 + }, + { + "epoch": 6.567929089443997, + "grad_norm": 1.7051729047416955, + "learning_rate": 9.586054404749075e-05, + "loss": 2.5063, + "step": 10193 + }, + { + "epoch": 6.568573730862208, + "grad_norm": 1.878467903096568, + "learning_rate": 9.585974094552144e-05, + "loss": 2.5435, + "step": 10194 + }, + { + "epoch": 6.569218372280419, + "grad_norm": 1.9953660732142395, + "learning_rate": 9.585893776905446e-05, + "loss": 2.4176, + "step": 10195 + }, + { + "epoch": 6.5698630136986305, + "grad_norm": 1.7448880430679674, + "learning_rate": 9.585813451809111e-05, + "loss": 2.5168, + "step": 10196 + }, + { + "epoch": 6.570507655116841, + "grad_norm": 1.8257598898690466, + "learning_rate": 9.58573311926327e-05, + "loss": 2.3407, + "step": 10197 + }, + { + "epoch": 6.571152296535052, + "grad_norm": 1.7905228795926156, + "learning_rate": 9.585652779268057e-05, + "loss": 2.6104, + "step": 10198 + }, + { + "epoch": 6.5717969379532635, + "grad_norm": 1.6913762810584387, + "learning_rate": 9.585572431823603e-05, + "loss": 2.6315, + "step": 10199 + }, + { + "epoch": 6.572441579371475, + "grad_norm": 1.7801177655821914, + "learning_rate": 9.58549207693004e-05, + "loss": 2.6777, + "step": 10200 + }, + { + "epoch": 6.572441579371475, + "eval_loss": 4.946020126342773, + "eval_runtime": 2.9741, + "eval_samples_per_second": 33.623, + "eval_steps_per_second": 4.371, + "step": 10200 + }, + { + "epoch": 6.573086220789686, + "grad_norm": 2.079105534417266, + "learning_rate": 9.585411714587501e-05, + "loss": 2.4817, + "step": 10201 + }, + { + "epoch": 6.5737308622078965, + "grad_norm": 2.056602321708, + "learning_rate": 9.585331344796115e-05, + "loss": 2.7192, + "step": 10202 + }, + { + "epoch": 6.574375503626108, + "grad_norm": 2.1555780428304034, + "learning_rate": 9.585250967556018e-05, + "loss": 2.3701, + "step": 10203 + }, + { + "epoch": 6.575020145044319, + "grad_norm": 1.8710084836355478, + "learning_rate": 9.585170582867338e-05, + "loss": 2.7129, + "step": 10204 + }, + { + "epoch": 6.57566478646253, + "grad_norm": 1.9164921122753473, + "learning_rate": 9.585090190730211e-05, + "loss": 2.6347, + "step": 10205 + }, + { + "epoch": 6.576309427880742, + "grad_norm": 1.9441615758365163, + "learning_rate": 9.585009791144767e-05, + "loss": 2.8389, + "step": 10206 + }, + { + "epoch": 6.576954069298952, + "grad_norm": 1.6935753154229545, + "learning_rate": 9.584929384111139e-05, + "loss": 2.3761, + "step": 10207 + }, + { + "epoch": 6.577598710717163, + "grad_norm": 1.7894747508492324, + "learning_rate": 9.584848969629455e-05, + "loss": 2.4718, + "step": 10208 + }, + { + "epoch": 6.578243352135375, + "grad_norm": 1.6961469011537076, + "learning_rate": 9.584768547699853e-05, + "loss": 2.519, + "step": 10209 + }, + { + "epoch": 6.578887993553586, + "grad_norm": 1.8184869140962465, + "learning_rate": 9.584688118322462e-05, + "loss": 2.5369, + "step": 10210 + }, + { + "epoch": 6.579532634971797, + "grad_norm": 1.6202687063191086, + "learning_rate": 9.584607681497414e-05, + "loss": 2.2825, + "step": 10211 + }, + { + "epoch": 6.580177276390008, + "grad_norm": 1.7219048671556239, + "learning_rate": 9.58452723722484e-05, + "loss": 2.531, + "step": 10212 + }, + { + "epoch": 6.580821917808219, + "grad_norm": 1.9653127905651102, + "learning_rate": 9.584446785504877e-05, + "loss": 2.5647, + "step": 10213 + }, + { + "epoch": 6.58146655922643, + "grad_norm": 1.6488526187540902, + "learning_rate": 9.584366326337651e-05, + "loss": 2.499, + "step": 10214 + }, + { + "epoch": 6.582111200644642, + "grad_norm": 1.9552400066245295, + "learning_rate": 9.584285859723298e-05, + "loss": 2.3819, + "step": 10215 + }, + { + "epoch": 6.582755842062853, + "grad_norm": 1.9111645727208495, + "learning_rate": 9.584205385661949e-05, + "loss": 2.7522, + "step": 10216 + }, + { + "epoch": 6.583400483481063, + "grad_norm": 1.7690799066235503, + "learning_rate": 9.584124904153737e-05, + "loss": 2.2482, + "step": 10217 + }, + { + "epoch": 6.584045124899275, + "grad_norm": 1.6849125597824464, + "learning_rate": 9.584044415198792e-05, + "loss": 2.8478, + "step": 10218 + }, + { + "epoch": 6.584689766317486, + "grad_norm": 1.713045729954229, + "learning_rate": 9.583963918797249e-05, + "loss": 2.4628, + "step": 10219 + }, + { + "epoch": 6.585334407735697, + "grad_norm": 1.6578118022934267, + "learning_rate": 9.58388341494924e-05, + "loss": 2.5528, + "step": 10220 + }, + { + "epoch": 6.5859790491539085, + "grad_norm": 1.6664341068689412, + "learning_rate": 9.583802903654892e-05, + "loss": 2.561, + "step": 10221 + }, + { + "epoch": 6.586623690572119, + "grad_norm": 1.8962597400819223, + "learning_rate": 9.583722384914344e-05, + "loss": 2.7691, + "step": 10222 + }, + { + "epoch": 6.58726833199033, + "grad_norm": 1.778010315606753, + "learning_rate": 9.583641858727725e-05, + "loss": 2.6029, + "step": 10223 + }, + { + "epoch": 6.5879129734085415, + "grad_norm": 1.68124271979706, + "learning_rate": 9.58356132509517e-05, + "loss": 2.6567, + "step": 10224 + }, + { + "epoch": 6.588557614826753, + "grad_norm": 1.6843379654212118, + "learning_rate": 9.583480784016806e-05, + "loss": 2.4809, + "step": 10225 + }, + { + "epoch": 6.589202256244963, + "grad_norm": 1.606136809761741, + "learning_rate": 9.583400235492771e-05, + "loss": 2.6627, + "step": 10226 + }, + { + "epoch": 6.5898468976631746, + "grad_norm": 1.6516804629779915, + "learning_rate": 9.583319679523194e-05, + "loss": 2.5658, + "step": 10227 + }, + { + "epoch": 6.590491539081386, + "grad_norm": 1.6096395430653487, + "learning_rate": 9.583239116108207e-05, + "loss": 2.3835, + "step": 10228 + }, + { + "epoch": 6.591136180499597, + "grad_norm": 1.6550747475947478, + "learning_rate": 9.583158545247942e-05, + "loss": 2.2672, + "step": 10229 + }, + { + "epoch": 6.5917808219178085, + "grad_norm": 1.7924589972642184, + "learning_rate": 9.583077966942534e-05, + "loss": 2.5745, + "step": 10230 + }, + { + "epoch": 6.592425463336019, + "grad_norm": 1.618471289168263, + "learning_rate": 9.582997381192115e-05, + "loss": 2.4933, + "step": 10231 + }, + { + "epoch": 6.59307010475423, + "grad_norm": 1.9566460010521114, + "learning_rate": 9.582916787996815e-05, + "loss": 2.458, + "step": 10232 + }, + { + "epoch": 6.5937147461724415, + "grad_norm": 1.755929124072223, + "learning_rate": 9.582836187356768e-05, + "loss": 2.6261, + "step": 10233 + }, + { + "epoch": 6.594359387590653, + "grad_norm": 2.07610599169915, + "learning_rate": 9.582755579272105e-05, + "loss": 2.31, + "step": 10234 + }, + { + "epoch": 6.595004029008864, + "grad_norm": 2.10280403670911, + "learning_rate": 9.582674963742962e-05, + "loss": 2.3407, + "step": 10235 + }, + { + "epoch": 6.5956486704270745, + "grad_norm": 2.098605982222916, + "learning_rate": 9.582594340769467e-05, + "loss": 2.4152, + "step": 10236 + }, + { + "epoch": 6.596293311845286, + "grad_norm": 1.9936102241111662, + "learning_rate": 9.582513710351754e-05, + "loss": 2.5006, + "step": 10237 + }, + { + "epoch": 6.596937953263497, + "grad_norm": 1.9765509459600445, + "learning_rate": 9.582433072489957e-05, + "loss": 2.6035, + "step": 10238 + }, + { + "epoch": 6.597582594681708, + "grad_norm": 1.8645951202856508, + "learning_rate": 9.582352427184205e-05, + "loss": 2.6723, + "step": 10239 + }, + { + "epoch": 6.59822723609992, + "grad_norm": 2.1013078051511567, + "learning_rate": 9.582271774434634e-05, + "loss": 2.3092, + "step": 10240 + }, + { + "epoch": 6.59887187751813, + "grad_norm": 1.8037879064977587, + "learning_rate": 9.582191114241376e-05, + "loss": 2.7291, + "step": 10241 + }, + { + "epoch": 6.599516518936341, + "grad_norm": 2.0917848434636266, + "learning_rate": 9.582110446604561e-05, + "loss": 2.3289, + "step": 10242 + }, + { + "epoch": 6.600161160354553, + "grad_norm": 2.0855633942599976, + "learning_rate": 9.58202977152432e-05, + "loss": 2.2141, + "step": 10243 + }, + { + "epoch": 6.600805801772764, + "grad_norm": 2.037918224659102, + "learning_rate": 9.581949089000794e-05, + "loss": 2.5086, + "step": 10244 + }, + { + "epoch": 6.601450443190975, + "grad_norm": 2.0981631143832717, + "learning_rate": 9.581868399034107e-05, + "loss": 2.6247, + "step": 10245 + }, + { + "epoch": 6.602095084609186, + "grad_norm": 1.9126273763523054, + "learning_rate": 9.581787701624395e-05, + "loss": 2.6701, + "step": 10246 + }, + { + "epoch": 6.602739726027397, + "grad_norm": 2.0981241499535512, + "learning_rate": 9.58170699677179e-05, + "loss": 2.5204, + "step": 10247 + }, + { + "epoch": 6.603384367445608, + "grad_norm": 1.7701614907199357, + "learning_rate": 9.581626284476426e-05, + "loss": 2.5034, + "step": 10248 + }, + { + "epoch": 6.60402900886382, + "grad_norm": 2.135808796330523, + "learning_rate": 9.581545564738431e-05, + "loss": 2.3967, + "step": 10249 + }, + { + "epoch": 6.604673650282031, + "grad_norm": 2.2063159854430348, + "learning_rate": 9.581464837557942e-05, + "loss": 2.4335, + "step": 10250 + }, + { + "epoch": 6.605318291700241, + "grad_norm": 1.9352494442469146, + "learning_rate": 9.581384102935092e-05, + "loss": 2.7796, + "step": 10251 + }, + { + "epoch": 6.605962933118453, + "grad_norm": 2.1792804622923514, + "learning_rate": 9.581303360870012e-05, + "loss": 2.5184, + "step": 10252 + }, + { + "epoch": 6.606607574536664, + "grad_norm": 1.5739738900518094, + "learning_rate": 9.581222611362832e-05, + "loss": 2.3774, + "step": 10253 + }, + { + "epoch": 6.607252215954875, + "grad_norm": 2.4795242347397175, + "learning_rate": 9.581141854413688e-05, + "loss": 2.5431, + "step": 10254 + }, + { + "epoch": 6.6078968573730865, + "grad_norm": 1.8191599197891377, + "learning_rate": 9.581061090022714e-05, + "loss": 2.6911, + "step": 10255 + }, + { + "epoch": 6.608541498791297, + "grad_norm": 2.300787653741846, + "learning_rate": 9.580980318190037e-05, + "loss": 2.5654, + "step": 10256 + }, + { + "epoch": 6.609186140209508, + "grad_norm": 1.9673718587195617, + "learning_rate": 9.580899538915797e-05, + "loss": 2.6067, + "step": 10257 + }, + { + "epoch": 6.6098307816277195, + "grad_norm": 2.035481273187234, + "learning_rate": 9.58081875220012e-05, + "loss": 2.3528, + "step": 10258 + }, + { + "epoch": 6.610475423045931, + "grad_norm": 1.7948419944794798, + "learning_rate": 9.580737958043142e-05, + "loss": 2.4989, + "step": 10259 + }, + { + "epoch": 6.611120064464142, + "grad_norm": 2.224631110936644, + "learning_rate": 9.580657156444993e-05, + "loss": 2.3536, + "step": 10260 + }, + { + "epoch": 6.6117647058823525, + "grad_norm": 1.6402509227230924, + "learning_rate": 9.580576347405808e-05, + "loss": 2.4688, + "step": 10261 + }, + { + "epoch": 6.612409347300564, + "grad_norm": 1.906144801276774, + "learning_rate": 9.580495530925723e-05, + "loss": 2.5066, + "step": 10262 + }, + { + "epoch": 6.613053988718775, + "grad_norm": 1.7922443897117586, + "learning_rate": 9.580414707004865e-05, + "loss": 2.7919, + "step": 10263 + }, + { + "epoch": 6.6136986301369864, + "grad_norm": 1.9013158406628237, + "learning_rate": 9.580333875643369e-05, + "loss": 2.5447, + "step": 10264 + }, + { + "epoch": 6.614343271555198, + "grad_norm": 1.7845964297332868, + "learning_rate": 9.580253036841369e-05, + "loss": 2.6461, + "step": 10265 + }, + { + "epoch": 6.614987912973408, + "grad_norm": 1.7938485478325557, + "learning_rate": 9.580172190598996e-05, + "loss": 2.5008, + "step": 10266 + }, + { + "epoch": 6.6156325543916195, + "grad_norm": 1.858770361244232, + "learning_rate": 9.580091336916383e-05, + "loss": 2.3881, + "step": 10267 + }, + { + "epoch": 6.616277195809831, + "grad_norm": 1.675776060630807, + "learning_rate": 9.580010475793662e-05, + "loss": 2.3283, + "step": 10268 + }, + { + "epoch": 6.616921837228042, + "grad_norm": 1.8199196219261486, + "learning_rate": 9.579929607230969e-05, + "loss": 2.6426, + "step": 10269 + }, + { + "epoch": 6.617566478646253, + "grad_norm": 1.8543721175765542, + "learning_rate": 9.579848731228434e-05, + "loss": 2.1816, + "step": 10270 + }, + { + "epoch": 6.618211120064464, + "grad_norm": 1.6688649345804045, + "learning_rate": 9.57976784778619e-05, + "loss": 2.4075, + "step": 10271 + }, + { + "epoch": 6.618855761482675, + "grad_norm": 1.929791611568829, + "learning_rate": 9.57968695690437e-05, + "loss": 2.5779, + "step": 10272 + }, + { + "epoch": 6.619500402900886, + "grad_norm": 1.6717963275631285, + "learning_rate": 9.579606058583109e-05, + "loss": 2.4007, + "step": 10273 + }, + { + "epoch": 6.620145044319098, + "grad_norm": 1.8012100711454953, + "learning_rate": 9.579525152822536e-05, + "loss": 2.4178, + "step": 10274 + }, + { + "epoch": 6.620789685737309, + "grad_norm": 1.6681573094980864, + "learning_rate": 9.579444239622787e-05, + "loss": 2.5402, + "step": 10275 + }, + { + "epoch": 6.621434327155519, + "grad_norm": 1.71438529166281, + "learning_rate": 9.579363318983994e-05, + "loss": 2.5473, + "step": 10276 + }, + { + "epoch": 6.622078968573731, + "grad_norm": 1.763025610150768, + "learning_rate": 9.57928239090629e-05, + "loss": 2.3674, + "step": 10277 + }, + { + "epoch": 6.622723609991942, + "grad_norm": 1.71746189972693, + "learning_rate": 9.579201455389807e-05, + "loss": 2.757, + "step": 10278 + }, + { + "epoch": 6.623368251410153, + "grad_norm": 1.7750149046369603, + "learning_rate": 9.579120512434678e-05, + "loss": 2.5745, + "step": 10279 + }, + { + "epoch": 6.624012892828365, + "grad_norm": 1.8015910382622755, + "learning_rate": 9.579039562041039e-05, + "loss": 2.6712, + "step": 10280 + }, + { + "epoch": 6.624657534246575, + "grad_norm": 1.6447415455974528, + "learning_rate": 9.578958604209018e-05, + "loss": 2.8744, + "step": 10281 + }, + { + "epoch": 6.625302175664786, + "grad_norm": 1.74385617685758, + "learning_rate": 9.578877638938753e-05, + "loss": 2.6273, + "step": 10282 + }, + { + "epoch": 6.625946817082998, + "grad_norm": 1.7217129856475593, + "learning_rate": 9.578796666230372e-05, + "loss": 2.8073, + "step": 10283 + }, + { + "epoch": 6.626591458501209, + "grad_norm": 1.7941377928432993, + "learning_rate": 9.578715686084011e-05, + "loss": 2.4168, + "step": 10284 + }, + { + "epoch": 6.62723609991942, + "grad_norm": 1.6971061098293863, + "learning_rate": 9.578634698499805e-05, + "loss": 2.7386, + "step": 10285 + }, + { + "epoch": 6.627880741337631, + "grad_norm": 1.6324622403418891, + "learning_rate": 9.57855370347788e-05, + "loss": 2.2797, + "step": 10286 + }, + { + "epoch": 6.628525382755842, + "grad_norm": 1.5424984375101172, + "learning_rate": 9.578472701018376e-05, + "loss": 2.5105, + "step": 10287 + }, + { + "epoch": 6.629170024174053, + "grad_norm": 1.7290177625440017, + "learning_rate": 9.578391691121422e-05, + "loss": 2.6741, + "step": 10288 + }, + { + "epoch": 6.6298146655922645, + "grad_norm": 1.6511996466594159, + "learning_rate": 9.578310673787155e-05, + "loss": 2.4254, + "step": 10289 + }, + { + "epoch": 6.630459307010476, + "grad_norm": 1.5120214044510338, + "learning_rate": 9.578229649015704e-05, + "loss": 2.5195, + "step": 10290 + }, + { + "epoch": 6.631103948428686, + "grad_norm": 1.5927010697826565, + "learning_rate": 9.578148616807204e-05, + "loss": 2.3249, + "step": 10291 + }, + { + "epoch": 6.6317485898468975, + "grad_norm": 1.7002417967399563, + "learning_rate": 9.578067577161788e-05, + "loss": 2.3642, + "step": 10292 + }, + { + "epoch": 6.632393231265109, + "grad_norm": 1.6052384673527411, + "learning_rate": 9.57798653007959e-05, + "loss": 2.4117, + "step": 10293 + }, + { + "epoch": 6.63303787268332, + "grad_norm": 1.7245750328610443, + "learning_rate": 9.57790547556074e-05, + "loss": 2.3057, + "step": 10294 + }, + { + "epoch": 6.633682514101531, + "grad_norm": 1.744585009572988, + "learning_rate": 9.577824413605374e-05, + "loss": 2.5408, + "step": 10295 + }, + { + "epoch": 6.634327155519742, + "grad_norm": 1.7485987880601221, + "learning_rate": 9.577743344213624e-05, + "loss": 2.456, + "step": 10296 + }, + { + "epoch": 6.634971796937953, + "grad_norm": 1.6554670937980427, + "learning_rate": 9.577662267385623e-05, + "loss": 2.3154, + "step": 10297 + }, + { + "epoch": 6.635616438356164, + "grad_norm": 1.7597131308347556, + "learning_rate": 9.577581183121504e-05, + "loss": 2.7206, + "step": 10298 + }, + { + "epoch": 6.636261079774376, + "grad_norm": 1.5416642680820838, + "learning_rate": 9.577500091421401e-05, + "loss": 2.5087, + "step": 10299 + }, + { + "epoch": 6.636905721192587, + "grad_norm": 1.8031794452711447, + "learning_rate": 9.577418992285448e-05, + "loss": 2.6229, + "step": 10300 + }, + { + "epoch": 6.636905721192587, + "eval_loss": 4.862412929534912, + "eval_runtime": 2.9731, + "eval_samples_per_second": 33.635, + "eval_steps_per_second": 4.372, + "step": 10300 + }, + { + "epoch": 6.6375503626107974, + "grad_norm": 1.6746596380593224, + "learning_rate": 9.577337885713778e-05, + "loss": 2.7452, + "step": 10301 + }, + { + "epoch": 6.638195004029009, + "grad_norm": 1.7570931411383806, + "learning_rate": 9.577256771706522e-05, + "loss": 2.3685, + "step": 10302 + }, + { + "epoch": 6.63883964544722, + "grad_norm": 1.7581295300793673, + "learning_rate": 9.577175650263815e-05, + "loss": 2.3697, + "step": 10303 + }, + { + "epoch": 6.639484286865431, + "grad_norm": 1.8028759997019879, + "learning_rate": 9.577094521385788e-05, + "loss": 2.5944, + "step": 10304 + }, + { + "epoch": 6.640128928283643, + "grad_norm": 1.7573553497273282, + "learning_rate": 9.577013385072578e-05, + "loss": 2.5887, + "step": 10305 + }, + { + "epoch": 6.640773569701853, + "grad_norm": 1.7725779600002654, + "learning_rate": 9.576932241324314e-05, + "loss": 2.4336, + "step": 10306 + }, + { + "epoch": 6.641418211120064, + "grad_norm": 1.6497712450223172, + "learning_rate": 9.576851090141133e-05, + "loss": 2.491, + "step": 10307 + }, + { + "epoch": 6.642062852538276, + "grad_norm": 1.6240036319086555, + "learning_rate": 9.576769931523168e-05, + "loss": 2.7695, + "step": 10308 + }, + { + "epoch": 6.642707493956487, + "grad_norm": 1.6370136006970524, + "learning_rate": 9.57668876547055e-05, + "loss": 2.4792, + "step": 10309 + }, + { + "epoch": 6.643352135374698, + "grad_norm": 1.7519006284586371, + "learning_rate": 9.576607591983412e-05, + "loss": 2.6214, + "step": 10310 + }, + { + "epoch": 6.643996776792909, + "grad_norm": 1.7035233363406954, + "learning_rate": 9.57652641106189e-05, + "loss": 2.297, + "step": 10311 + }, + { + "epoch": 6.64464141821112, + "grad_norm": 1.8495131749466254, + "learning_rate": 9.576445222706116e-05, + "loss": 2.485, + "step": 10312 + }, + { + "epoch": 6.645286059629331, + "grad_norm": 1.5948607437635656, + "learning_rate": 9.576364026916223e-05, + "loss": 2.5308, + "step": 10313 + }, + { + "epoch": 6.645930701047543, + "grad_norm": 1.7547072047151768, + "learning_rate": 9.576282823692346e-05, + "loss": 2.635, + "step": 10314 + }, + { + "epoch": 6.646575342465754, + "grad_norm": 1.6399689397490549, + "learning_rate": 9.576201613034616e-05, + "loss": 2.4633, + "step": 10315 + }, + { + "epoch": 6.647219983883964, + "grad_norm": 1.6583817047265264, + "learning_rate": 9.576120394943168e-05, + "loss": 2.5795, + "step": 10316 + }, + { + "epoch": 6.647864625302176, + "grad_norm": 1.8211665956232743, + "learning_rate": 9.576039169418134e-05, + "loss": 2.3606, + "step": 10317 + }, + { + "epoch": 6.648509266720387, + "grad_norm": 1.905807362013521, + "learning_rate": 9.57595793645965e-05, + "loss": 2.3415, + "step": 10318 + }, + { + "epoch": 6.649153908138598, + "grad_norm": 1.749233915950849, + "learning_rate": 9.575876696067845e-05, + "loss": 2.6736, + "step": 10319 + }, + { + "epoch": 6.6497985495568095, + "grad_norm": 1.764435551534647, + "learning_rate": 9.575795448242856e-05, + "loss": 2.6339, + "step": 10320 + }, + { + "epoch": 6.65044319097502, + "grad_norm": 1.8497746919433136, + "learning_rate": 9.575714192984816e-05, + "loss": 2.3273, + "step": 10321 + }, + { + "epoch": 6.651087832393231, + "grad_norm": 1.9042055912469333, + "learning_rate": 9.575632930293859e-05, + "loss": 2.7328, + "step": 10322 + }, + { + "epoch": 6.6517324738114425, + "grad_norm": 1.7870823432325826, + "learning_rate": 9.575551660170118e-05, + "loss": 2.627, + "step": 10323 + }, + { + "epoch": 6.652377115229654, + "grad_norm": 1.6364610935792183, + "learning_rate": 9.575470382613724e-05, + "loss": 2.3594, + "step": 10324 + }, + { + "epoch": 6.653021756647865, + "grad_norm": 1.831311144449783, + "learning_rate": 9.575389097624812e-05, + "loss": 2.675, + "step": 10325 + }, + { + "epoch": 6.6536663980660755, + "grad_norm": 1.7068013628371677, + "learning_rate": 9.575307805203517e-05, + "loss": 2.5222, + "step": 10326 + }, + { + "epoch": 6.654311039484287, + "grad_norm": 1.8422588803886688, + "learning_rate": 9.575226505349972e-05, + "loss": 2.6463, + "step": 10327 + }, + { + "epoch": 6.654955680902498, + "grad_norm": 1.8872616659485333, + "learning_rate": 9.57514519806431e-05, + "loss": 2.3613, + "step": 10328 + }, + { + "epoch": 6.655600322320709, + "grad_norm": 1.6774161034032593, + "learning_rate": 9.575063883346663e-05, + "loss": 2.4344, + "step": 10329 + }, + { + "epoch": 6.656244963738921, + "grad_norm": 2.0490450585792526, + "learning_rate": 9.574982561197167e-05, + "loss": 2.5526, + "step": 10330 + }, + { + "epoch": 6.656889605157131, + "grad_norm": 1.5937989674867896, + "learning_rate": 9.574901231615953e-05, + "loss": 2.5384, + "step": 10331 + }, + { + "epoch": 6.657534246575342, + "grad_norm": 2.0543447961003793, + "learning_rate": 9.57481989460316e-05, + "loss": 2.2231, + "step": 10332 + }, + { + "epoch": 6.658178887993554, + "grad_norm": 1.686957781701808, + "learning_rate": 9.574738550158916e-05, + "loss": 2.4463, + "step": 10333 + }, + { + "epoch": 6.658823529411765, + "grad_norm": 2.203927830510015, + "learning_rate": 9.574657198283355e-05, + "loss": 2.4896, + "step": 10334 + }, + { + "epoch": 6.659468170829975, + "grad_norm": 1.7613217626793864, + "learning_rate": 9.574575838976613e-05, + "loss": 2.6071, + "step": 10335 + }, + { + "epoch": 6.660112812248187, + "grad_norm": 1.7702554951802008, + "learning_rate": 9.574494472238823e-05, + "loss": 2.5663, + "step": 10336 + }, + { + "epoch": 6.660757453666398, + "grad_norm": 1.7298617178651257, + "learning_rate": 9.574413098070118e-05, + "loss": 2.4471, + "step": 10337 + }, + { + "epoch": 6.661402095084609, + "grad_norm": 2.1171841004983163, + "learning_rate": 9.57433171647063e-05, + "loss": 2.1887, + "step": 10338 + }, + { + "epoch": 6.662046736502821, + "grad_norm": 1.7327887754697502, + "learning_rate": 9.574250327440497e-05, + "loss": 2.4844, + "step": 10339 + }, + { + "epoch": 6.662691377921031, + "grad_norm": 2.0408495828337796, + "learning_rate": 9.57416893097985e-05, + "loss": 2.4538, + "step": 10340 + }, + { + "epoch": 6.663336019339242, + "grad_norm": 1.6506267448233973, + "learning_rate": 9.574087527088823e-05, + "loss": 2.4918, + "step": 10341 + }, + { + "epoch": 6.663980660757454, + "grad_norm": 1.9557347169656218, + "learning_rate": 9.574006115767548e-05, + "loss": 2.4951, + "step": 10342 + }, + { + "epoch": 6.664625302175665, + "grad_norm": 1.7049715597291153, + "learning_rate": 9.57392469701616e-05, + "loss": 2.6023, + "step": 10343 + }, + { + "epoch": 6.665269943593876, + "grad_norm": 2.3562512280154855, + "learning_rate": 9.573843270834796e-05, + "loss": 2.7386, + "step": 10344 + }, + { + "epoch": 6.665914585012087, + "grad_norm": 1.7887207957750324, + "learning_rate": 9.573761837223584e-05, + "loss": 2.6881, + "step": 10345 + }, + { + "epoch": 6.666559226430298, + "grad_norm": 2.3154708976779, + "learning_rate": 9.573680396182663e-05, + "loss": 2.4412, + "step": 10346 + }, + { + "epoch": 6.667203867848509, + "grad_norm": 1.6118304487060855, + "learning_rate": 9.573598947712162e-05, + "loss": 2.4224, + "step": 10347 + }, + { + "epoch": 6.667848509266721, + "grad_norm": 2.11028700496417, + "learning_rate": 9.573517491812219e-05, + "loss": 2.5143, + "step": 10348 + }, + { + "epoch": 6.668493150684932, + "grad_norm": 1.6104862198581225, + "learning_rate": 9.573436028482964e-05, + "loss": 2.5123, + "step": 10349 + }, + { + "epoch": 6.669137792103142, + "grad_norm": 2.0813266512139896, + "learning_rate": 9.573354557724533e-05, + "loss": 2.8138, + "step": 10350 + }, + { + "epoch": 6.669782433521354, + "grad_norm": 1.7939988315319089, + "learning_rate": 9.57327307953706e-05, + "loss": 2.4175, + "step": 10351 + }, + { + "epoch": 6.670427074939565, + "grad_norm": 1.8423838185522934, + "learning_rate": 9.573191593920677e-05, + "loss": 2.6629, + "step": 10352 + }, + { + "epoch": 6.671071716357776, + "grad_norm": 1.72639450467159, + "learning_rate": 9.57311010087552e-05, + "loss": 2.3273, + "step": 10353 + }, + { + "epoch": 6.6717163577759875, + "grad_norm": 1.7115690468762352, + "learning_rate": 9.573028600401723e-05, + "loss": 2.1791, + "step": 10354 + }, + { + "epoch": 6.672360999194198, + "grad_norm": 1.6681263678811185, + "learning_rate": 9.572947092499418e-05, + "loss": 2.5501, + "step": 10355 + }, + { + "epoch": 6.673005640612409, + "grad_norm": 1.793379187271911, + "learning_rate": 9.572865577168739e-05, + "loss": 2.5796, + "step": 10356 + }, + { + "epoch": 6.6736502820306205, + "grad_norm": 1.8153706190230101, + "learning_rate": 9.572784054409819e-05, + "loss": 2.5152, + "step": 10357 + }, + { + "epoch": 6.674294923448832, + "grad_norm": 1.935091927409612, + "learning_rate": 9.572702524222796e-05, + "loss": 2.5121, + "step": 10358 + }, + { + "epoch": 6.674939564867043, + "grad_norm": 1.8639403439310436, + "learning_rate": 9.5726209866078e-05, + "loss": 2.4519, + "step": 10359 + }, + { + "epoch": 6.6755842062852535, + "grad_norm": 1.8382958240619907, + "learning_rate": 9.572539441564969e-05, + "loss": 2.6039, + "step": 10360 + }, + { + "epoch": 6.676228847703465, + "grad_norm": 1.640686761562019, + "learning_rate": 9.57245788909443e-05, + "loss": 2.3956, + "step": 10361 + }, + { + "epoch": 6.676873489121676, + "grad_norm": 1.8106450631953632, + "learning_rate": 9.572376329196324e-05, + "loss": 2.5528, + "step": 10362 + }, + { + "epoch": 6.677518130539887, + "grad_norm": 1.7090819895162128, + "learning_rate": 9.572294761870782e-05, + "loss": 2.5555, + "step": 10363 + }, + { + "epoch": 6.678162771958098, + "grad_norm": 1.77321532889682, + "learning_rate": 9.572213187117934e-05, + "loss": 2.3553, + "step": 10364 + }, + { + "epoch": 6.678807413376309, + "grad_norm": 1.6926078996089597, + "learning_rate": 9.572131604937923e-05, + "loss": 2.6262, + "step": 10365 + }, + { + "epoch": 6.67945205479452, + "grad_norm": 1.7124388500662857, + "learning_rate": 9.572050015330874e-05, + "loss": 2.4807, + "step": 10366 + }, + { + "epoch": 6.680096696212732, + "grad_norm": 1.8544374881817176, + "learning_rate": 9.571968418296928e-05, + "loss": 2.701, + "step": 10367 + }, + { + "epoch": 6.680741337630943, + "grad_norm": 1.6801225886311946, + "learning_rate": 9.571886813836214e-05, + "loss": 2.4075, + "step": 10368 + }, + { + "epoch": 6.681385979049153, + "grad_norm": 1.924950369072606, + "learning_rate": 9.571805201948869e-05, + "loss": 2.3677, + "step": 10369 + }, + { + "epoch": 6.682030620467365, + "grad_norm": 1.8749760399140554, + "learning_rate": 9.571723582635026e-05, + "loss": 2.3921, + "step": 10370 + }, + { + "epoch": 6.682675261885576, + "grad_norm": 1.824631937532197, + "learning_rate": 9.571641955894817e-05, + "loss": 2.685, + "step": 10371 + }, + { + "epoch": 6.683319903303787, + "grad_norm": 1.912125892801743, + "learning_rate": 9.571560321728382e-05, + "loss": 2.6822, + "step": 10372 + }, + { + "epoch": 6.683964544721999, + "grad_norm": 1.6585418281555204, + "learning_rate": 9.57147868013585e-05, + "loss": 2.4802, + "step": 10373 + }, + { + "epoch": 6.684609186140209, + "grad_norm": 1.7573115601919824, + "learning_rate": 9.571397031117356e-05, + "loss": 2.7134, + "step": 10374 + }, + { + "epoch": 6.68525382755842, + "grad_norm": 1.774995989633471, + "learning_rate": 9.571315374673034e-05, + "loss": 2.5643, + "step": 10375 + }, + { + "epoch": 6.685898468976632, + "grad_norm": 1.6872783914004452, + "learning_rate": 9.571233710803018e-05, + "loss": 2.2846, + "step": 10376 + }, + { + "epoch": 6.686543110394843, + "grad_norm": 1.5624236463980417, + "learning_rate": 9.571152039507442e-05, + "loss": 2.4419, + "step": 10377 + }, + { + "epoch": 6.687187751813054, + "grad_norm": 1.6345707077960616, + "learning_rate": 9.571070360786441e-05, + "loss": 2.4915, + "step": 10378 + }, + { + "epoch": 6.687832393231265, + "grad_norm": 1.8298106665016478, + "learning_rate": 9.570988674640151e-05, + "loss": 2.785, + "step": 10379 + }, + { + "epoch": 6.688477034649476, + "grad_norm": 1.6679234336152555, + "learning_rate": 9.570906981068703e-05, + "loss": 2.4116, + "step": 10380 + }, + { + "epoch": 6.689121676067687, + "grad_norm": 1.6623707051572068, + "learning_rate": 9.570825280072232e-05, + "loss": 2.3268, + "step": 10381 + }, + { + "epoch": 6.689766317485899, + "grad_norm": 1.944682795797112, + "learning_rate": 9.570743571650871e-05, + "loss": 2.6058, + "step": 10382 + }, + { + "epoch": 6.69041095890411, + "grad_norm": 1.6303610473635863, + "learning_rate": 9.570661855804759e-05, + "loss": 2.718, + "step": 10383 + }, + { + "epoch": 6.69105560032232, + "grad_norm": 1.7649762861753038, + "learning_rate": 9.570580132534025e-05, + "loss": 2.4183, + "step": 10384 + }, + { + "epoch": 6.691700241740532, + "grad_norm": 1.9484713936498212, + "learning_rate": 9.570498401838804e-05, + "loss": 2.645, + "step": 10385 + }, + { + "epoch": 6.692344883158743, + "grad_norm": 1.737705767362111, + "learning_rate": 9.570416663719234e-05, + "loss": 2.5566, + "step": 10386 + }, + { + "epoch": 6.692989524576954, + "grad_norm": 1.7420307260262904, + "learning_rate": 9.570334918175445e-05, + "loss": 2.2364, + "step": 10387 + }, + { + "epoch": 6.6936341659951655, + "grad_norm": 1.7029145260757714, + "learning_rate": 9.570253165207572e-05, + "loss": 2.4424, + "step": 10388 + }, + { + "epoch": 6.694278807413376, + "grad_norm": 1.666968227163287, + "learning_rate": 9.570171404815753e-05, + "loss": 2.6296, + "step": 10389 + }, + { + "epoch": 6.694923448831587, + "grad_norm": 1.596096782467872, + "learning_rate": 9.570089637000116e-05, + "loss": 2.4484, + "step": 10390 + }, + { + "epoch": 6.6955680902497985, + "grad_norm": 1.7401618955855327, + "learning_rate": 9.5700078617608e-05, + "loss": 2.4341, + "step": 10391 + }, + { + "epoch": 6.69621273166801, + "grad_norm": 1.6290354428341487, + "learning_rate": 9.569926079097937e-05, + "loss": 2.4152, + "step": 10392 + }, + { + "epoch": 6.696857373086221, + "grad_norm": 1.7916538899648575, + "learning_rate": 9.569844289011662e-05, + "loss": 2.433, + "step": 10393 + }, + { + "epoch": 6.6975020145044315, + "grad_norm": 1.8350445679775285, + "learning_rate": 9.569762491502111e-05, + "loss": 2.5259, + "step": 10394 + }, + { + "epoch": 6.698146655922643, + "grad_norm": 1.6617815449103808, + "learning_rate": 9.569680686569417e-05, + "loss": 2.6602, + "step": 10395 + }, + { + "epoch": 6.698791297340854, + "grad_norm": 1.90142330073893, + "learning_rate": 9.569598874213713e-05, + "loss": 2.6473, + "step": 10396 + }, + { + "epoch": 6.699435938759065, + "grad_norm": 1.5434564947968692, + "learning_rate": 9.569517054435137e-05, + "loss": 2.5826, + "step": 10397 + }, + { + "epoch": 6.700080580177277, + "grad_norm": 1.6039121842532935, + "learning_rate": 9.569435227233817e-05, + "loss": 2.3111, + "step": 10398 + }, + { + "epoch": 6.700725221595487, + "grad_norm": 1.7986886148724386, + "learning_rate": 9.569353392609895e-05, + "loss": 2.5817, + "step": 10399 + }, + { + "epoch": 6.701369863013698, + "grad_norm": 1.7442960694260345, + "learning_rate": 9.569271550563501e-05, + "loss": 2.4545, + "step": 10400 + }, + { + "epoch": 6.701369863013698, + "eval_loss": 4.879100322723389, + "eval_runtime": 2.9819, + "eval_samples_per_second": 33.536, + "eval_steps_per_second": 4.36, + "step": 10400 + }, + { + "epoch": 6.70201450443191, + "grad_norm": 1.6148319826582918, + "learning_rate": 9.56918970109477e-05, + "loss": 2.5801, + "step": 10401 + }, + { + "epoch": 6.702659145850121, + "grad_norm": 1.7369369453315464, + "learning_rate": 9.569107844203838e-05, + "loss": 2.2674, + "step": 10402 + }, + { + "epoch": 6.703303787268332, + "grad_norm": 1.668971745780144, + "learning_rate": 9.569025979890835e-05, + "loss": 2.3843, + "step": 10403 + }, + { + "epoch": 6.703948428686543, + "grad_norm": 1.7404202982756949, + "learning_rate": 9.568944108155901e-05, + "loss": 2.4701, + "step": 10404 + }, + { + "epoch": 6.704593070104754, + "grad_norm": 1.6722979338957489, + "learning_rate": 9.568862228999167e-05, + "loss": 2.8344, + "step": 10405 + }, + { + "epoch": 6.705237711522965, + "grad_norm": 1.8294420995718021, + "learning_rate": 9.568780342420768e-05, + "loss": 2.7785, + "step": 10406 + }, + { + "epoch": 6.705882352941177, + "grad_norm": 1.909314029607573, + "learning_rate": 9.56869844842084e-05, + "loss": 2.2808, + "step": 10407 + }, + { + "epoch": 6.706526994359388, + "grad_norm": 1.6869075470824864, + "learning_rate": 9.568616546999517e-05, + "loss": 2.4846, + "step": 10408 + }, + { + "epoch": 6.707171635777598, + "grad_norm": 1.8064425279600662, + "learning_rate": 9.568534638156932e-05, + "loss": 2.6386, + "step": 10409 + }, + { + "epoch": 6.70781627719581, + "grad_norm": 1.8716378235913091, + "learning_rate": 9.568452721893221e-05, + "loss": 2.4931, + "step": 10410 + }, + { + "epoch": 6.708460918614021, + "grad_norm": 1.8169938255427283, + "learning_rate": 9.568370798208517e-05, + "loss": 2.4224, + "step": 10411 + }, + { + "epoch": 6.709105560032232, + "grad_norm": 1.7946060548596048, + "learning_rate": 9.568288867102957e-05, + "loss": 2.6184, + "step": 10412 + }, + { + "epoch": 6.7097502014504435, + "grad_norm": 1.8828327437686827, + "learning_rate": 9.568206928576672e-05, + "loss": 2.5082, + "step": 10413 + }, + { + "epoch": 6.710394842868654, + "grad_norm": 1.8947873101104764, + "learning_rate": 9.568124982629803e-05, + "loss": 2.8323, + "step": 10414 + }, + { + "epoch": 6.711039484286865, + "grad_norm": 2.0867868826763174, + "learning_rate": 9.568043029262477e-05, + "loss": 2.4427, + "step": 10415 + }, + { + "epoch": 6.7116841257050766, + "grad_norm": 1.5243352925148945, + "learning_rate": 9.567961068474834e-05, + "loss": 2.56, + "step": 10416 + }, + { + "epoch": 6.712328767123288, + "grad_norm": 2.049094412938017, + "learning_rate": 9.567879100267004e-05, + "loss": 2.541, + "step": 10417 + }, + { + "epoch": 6.712973408541499, + "grad_norm": 1.6985727989944694, + "learning_rate": 9.567797124639127e-05, + "loss": 2.4564, + "step": 10418 + }, + { + "epoch": 6.71361804995971, + "grad_norm": 2.101721896412036, + "learning_rate": 9.567715141591331e-05, + "loss": 2.8518, + "step": 10419 + }, + { + "epoch": 6.714262691377921, + "grad_norm": 1.8237054006503397, + "learning_rate": 9.567633151123757e-05, + "loss": 2.819, + "step": 10420 + }, + { + "epoch": 6.714907332796132, + "grad_norm": 2.0167545719504987, + "learning_rate": 9.567551153236537e-05, + "loss": 2.6225, + "step": 10421 + }, + { + "epoch": 6.7155519742143435, + "grad_norm": 1.6521977232480154, + "learning_rate": 9.567469147929803e-05, + "loss": 2.5875, + "step": 10422 + }, + { + "epoch": 6.716196615632555, + "grad_norm": 2.199916863777039, + "learning_rate": 9.567387135203697e-05, + "loss": 2.6699, + "step": 10423 + }, + { + "epoch": 6.716841257050765, + "grad_norm": 1.7459839926823355, + "learning_rate": 9.567305115058346e-05, + "loss": 2.5343, + "step": 10424 + }, + { + "epoch": 6.7174858984689765, + "grad_norm": 1.8961545635576662, + "learning_rate": 9.567223087493889e-05, + "loss": 2.6609, + "step": 10425 + }, + { + "epoch": 6.718130539887188, + "grad_norm": 1.756975800280992, + "learning_rate": 9.56714105251046e-05, + "loss": 2.7639, + "step": 10426 + }, + { + "epoch": 6.718775181305399, + "grad_norm": 1.8407975416096691, + "learning_rate": 9.567059010108194e-05, + "loss": 2.5355, + "step": 10427 + }, + { + "epoch": 6.71941982272361, + "grad_norm": 1.7290468949831133, + "learning_rate": 9.566976960287224e-05, + "loss": 2.5068, + "step": 10428 + }, + { + "epoch": 6.720064464141821, + "grad_norm": 1.9180370953147365, + "learning_rate": 9.566894903047685e-05, + "loss": 2.4998, + "step": 10429 + }, + { + "epoch": 6.720709105560032, + "grad_norm": 1.6086454945780717, + "learning_rate": 9.566812838389713e-05, + "loss": 2.7319, + "step": 10430 + }, + { + "epoch": 6.721353746978243, + "grad_norm": 1.7441699995984568, + "learning_rate": 9.566730766313443e-05, + "loss": 2.4729, + "step": 10431 + }, + { + "epoch": 6.721998388396455, + "grad_norm": 1.7605516277202895, + "learning_rate": 9.566648686819008e-05, + "loss": 2.5239, + "step": 10432 + }, + { + "epoch": 6.722643029814666, + "grad_norm": 1.6374129145459875, + "learning_rate": 9.566566599906545e-05, + "loss": 2.3458, + "step": 10433 + }, + { + "epoch": 6.723287671232876, + "grad_norm": 1.7450643853215346, + "learning_rate": 9.566484505576188e-05, + "loss": 2.4738, + "step": 10434 + }, + { + "epoch": 6.723932312651088, + "grad_norm": 1.6682862022305547, + "learning_rate": 9.566402403828072e-05, + "loss": 2.7944, + "step": 10435 + }, + { + "epoch": 6.724576954069299, + "grad_norm": 1.7314943986927007, + "learning_rate": 9.566320294662331e-05, + "loss": 2.4477, + "step": 10436 + }, + { + "epoch": 6.72522159548751, + "grad_norm": 1.5990634481500006, + "learning_rate": 9.566238178079099e-05, + "loss": 2.7999, + "step": 10437 + }, + { + "epoch": 6.725866236905722, + "grad_norm": 1.9130779733962704, + "learning_rate": 9.566156054078514e-05, + "loss": 2.7236, + "step": 10438 + }, + { + "epoch": 6.726510878323932, + "grad_norm": 1.7896150003830056, + "learning_rate": 9.566073922660707e-05, + "loss": 2.8395, + "step": 10439 + }, + { + "epoch": 6.727155519742143, + "grad_norm": 1.8916274277516127, + "learning_rate": 9.565991783825817e-05, + "loss": 2.7292, + "step": 10440 + }, + { + "epoch": 6.727800161160355, + "grad_norm": 4.517784771213163, + "learning_rate": 9.565909637573976e-05, + "loss": 2.56, + "step": 10441 + }, + { + "epoch": 6.728444802578566, + "grad_norm": 1.81179822883781, + "learning_rate": 9.56582748390532e-05, + "loss": 2.521, + "step": 10442 + }, + { + "epoch": 6.729089443996777, + "grad_norm": 1.9484605608743342, + "learning_rate": 9.565745322819984e-05, + "loss": 2.5058, + "step": 10443 + }, + { + "epoch": 6.729734085414988, + "grad_norm": 1.670133158674318, + "learning_rate": 9.565663154318101e-05, + "loss": 2.6857, + "step": 10444 + }, + { + "epoch": 6.730378726833199, + "grad_norm": 1.6478381253505772, + "learning_rate": 9.565580978399808e-05, + "loss": 2.5764, + "step": 10445 + }, + { + "epoch": 6.73102336825141, + "grad_norm": 1.7525423910507147, + "learning_rate": 9.565498795065241e-05, + "loss": 2.5979, + "step": 10446 + }, + { + "epoch": 6.7316680096696215, + "grad_norm": 1.6093187802538327, + "learning_rate": 9.565416604314533e-05, + "loss": 2.5962, + "step": 10447 + }, + { + "epoch": 6.732312651087833, + "grad_norm": 1.8384200308799892, + "learning_rate": 9.56533440614782e-05, + "loss": 2.5254, + "step": 10448 + }, + { + "epoch": 6.732957292506043, + "grad_norm": 1.66262298669999, + "learning_rate": 9.565252200565235e-05, + "loss": 2.7487, + "step": 10449 + }, + { + "epoch": 6.7336019339242545, + "grad_norm": 2.1714486117787577, + "learning_rate": 9.565169987566915e-05, + "loss": 2.712, + "step": 10450 + }, + { + "epoch": 6.734246575342466, + "grad_norm": 2.000301719901469, + "learning_rate": 9.565087767152994e-05, + "loss": 2.4874, + "step": 10451 + }, + { + "epoch": 6.734891216760677, + "grad_norm": 1.9542840318324741, + "learning_rate": 9.56500553932361e-05, + "loss": 2.7284, + "step": 10452 + }, + { + "epoch": 6.7355358581788884, + "grad_norm": 2.0100187498122595, + "learning_rate": 9.564923304078891e-05, + "loss": 2.5506, + "step": 10453 + }, + { + "epoch": 6.736180499597099, + "grad_norm": 1.4965342354872149, + "learning_rate": 9.564841061418982e-05, + "loss": 2.2909, + "step": 10454 + }, + { + "epoch": 6.73682514101531, + "grad_norm": 1.6367541385862525, + "learning_rate": 9.56475881134401e-05, + "loss": 2.4457, + "step": 10455 + }, + { + "epoch": 6.7374697824335215, + "grad_norm": 1.4675204996946056, + "learning_rate": 9.564676553854114e-05, + "loss": 2.5204, + "step": 10456 + }, + { + "epoch": 6.738114423851733, + "grad_norm": 1.7557586126140365, + "learning_rate": 9.564594288949426e-05, + "loss": 2.5929, + "step": 10457 + }, + { + "epoch": 6.738759065269944, + "grad_norm": 1.627653036395786, + "learning_rate": 9.564512016630084e-05, + "loss": 2.68, + "step": 10458 + }, + { + "epoch": 6.7394037066881545, + "grad_norm": 1.7586064784338937, + "learning_rate": 9.564429736896223e-05, + "loss": 2.4819, + "step": 10459 + }, + { + "epoch": 6.740048348106366, + "grad_norm": 1.663492124009382, + "learning_rate": 9.564347449747977e-05, + "loss": 2.6448, + "step": 10460 + }, + { + "epoch": 6.740692989524577, + "grad_norm": 1.8417169230354862, + "learning_rate": 9.564265155185482e-05, + "loss": 2.5607, + "step": 10461 + }, + { + "epoch": 6.741337630942788, + "grad_norm": 1.7493226761493696, + "learning_rate": 9.564182853208872e-05, + "loss": 2.4449, + "step": 10462 + }, + { + "epoch": 6.741982272361, + "grad_norm": 1.8354364196096882, + "learning_rate": 9.564100543818282e-05, + "loss": 2.5081, + "step": 10463 + }, + { + "epoch": 6.74262691377921, + "grad_norm": 1.6244318701220806, + "learning_rate": 9.564018227013849e-05, + "loss": 2.9719, + "step": 10464 + }, + { + "epoch": 6.743271555197421, + "grad_norm": 1.7120773783845162, + "learning_rate": 9.563935902795706e-05, + "loss": 2.4022, + "step": 10465 + }, + { + "epoch": 6.743916196615633, + "grad_norm": 1.8055081054355337, + "learning_rate": 9.563853571163989e-05, + "loss": 2.7534, + "step": 10466 + }, + { + "epoch": 6.744560838033844, + "grad_norm": 1.6958449871388437, + "learning_rate": 9.563771232118835e-05, + "loss": 2.6856, + "step": 10467 + }, + { + "epoch": 6.745205479452055, + "grad_norm": 1.5951040505581775, + "learning_rate": 9.563688885660378e-05, + "loss": 2.4726, + "step": 10468 + }, + { + "epoch": 6.745850120870266, + "grad_norm": 1.6488908392472779, + "learning_rate": 9.563606531788754e-05, + "loss": 2.5574, + "step": 10469 + }, + { + "epoch": 6.746494762288477, + "grad_norm": 1.6993852973064314, + "learning_rate": 9.563524170504094e-05, + "loss": 2.5716, + "step": 10470 + }, + { + "epoch": 6.747139403706688, + "grad_norm": 1.5533543894801167, + "learning_rate": 9.563441801806539e-05, + "loss": 2.5842, + "step": 10471 + }, + { + "epoch": 6.7477840451249, + "grad_norm": 1.777998662237995, + "learning_rate": 9.563359425696223e-05, + "loss": 2.4327, + "step": 10472 + }, + { + "epoch": 6.74842868654311, + "grad_norm": 1.6116668811084693, + "learning_rate": 9.563277042173278e-05, + "loss": 2.7394, + "step": 10473 + }, + { + "epoch": 6.749073327961321, + "grad_norm": 1.621982716391358, + "learning_rate": 9.563194651237842e-05, + "loss": 2.4863, + "step": 10474 + }, + { + "epoch": 6.749717969379533, + "grad_norm": 1.7890030552100944, + "learning_rate": 9.56311225289005e-05, + "loss": 2.3491, + "step": 10475 + }, + { + "epoch": 6.750362610797744, + "grad_norm": 1.7667900765727547, + "learning_rate": 9.563029847130036e-05, + "loss": 2.4875, + "step": 10476 + }, + { + "epoch": 6.751007252215955, + "grad_norm": 1.5173877590453564, + "learning_rate": 9.562947433957939e-05, + "loss": 2.5274, + "step": 10477 + }, + { + "epoch": 6.751651893634166, + "grad_norm": 1.969436198102241, + "learning_rate": 9.562865013373889e-05, + "loss": 2.4265, + "step": 10478 + }, + { + "epoch": 6.752296535052377, + "grad_norm": 1.560705541492583, + "learning_rate": 9.562782585378026e-05, + "loss": 2.3947, + "step": 10479 + }, + { + "epoch": 6.752941176470588, + "grad_norm": 2.02433131964171, + "learning_rate": 9.562700149970483e-05, + "loss": 2.7153, + "step": 10480 + }, + { + "epoch": 6.7535858178887995, + "grad_norm": 1.8972260387096014, + "learning_rate": 9.562617707151395e-05, + "loss": 2.6678, + "step": 10481 + }, + { + "epoch": 6.754230459307011, + "grad_norm": 1.5713785591934502, + "learning_rate": 9.5625352569209e-05, + "loss": 2.8006, + "step": 10482 + }, + { + "epoch": 6.754875100725221, + "grad_norm": 1.895380649687581, + "learning_rate": 9.56245279927913e-05, + "loss": 2.6138, + "step": 10483 + }, + { + "epoch": 6.7555197421434325, + "grad_norm": 1.729867711240985, + "learning_rate": 9.562370334226224e-05, + "loss": 2.6647, + "step": 10484 + }, + { + "epoch": 6.756164383561644, + "grad_norm": 1.6580683121422939, + "learning_rate": 9.562287861762314e-05, + "loss": 2.4864, + "step": 10485 + }, + { + "epoch": 6.756809024979855, + "grad_norm": 1.7955741847298747, + "learning_rate": 9.562205381887538e-05, + "loss": 2.7199, + "step": 10486 + }, + { + "epoch": 6.757453666398066, + "grad_norm": 1.8835281679050617, + "learning_rate": 9.56212289460203e-05, + "loss": 2.5483, + "step": 10487 + }, + { + "epoch": 6.758098307816277, + "grad_norm": 2.0309550559482874, + "learning_rate": 9.562040399905927e-05, + "loss": 2.5886, + "step": 10488 + }, + { + "epoch": 6.758742949234488, + "grad_norm": 1.8445241356782744, + "learning_rate": 9.561957897799363e-05, + "loss": 2.8364, + "step": 10489 + }, + { + "epoch": 6.7593875906526995, + "grad_norm": 1.8145995459260218, + "learning_rate": 9.561875388282473e-05, + "loss": 2.5348, + "step": 10490 + }, + { + "epoch": 6.760032232070911, + "grad_norm": 1.642568303726401, + "learning_rate": 9.561792871355394e-05, + "loss": 2.6657, + "step": 10491 + }, + { + "epoch": 6.760676873489122, + "grad_norm": 1.8559919247234704, + "learning_rate": 9.561710347018262e-05, + "loss": 2.6793, + "step": 10492 + }, + { + "epoch": 6.7613215149073325, + "grad_norm": 1.7155357682840802, + "learning_rate": 9.561627815271211e-05, + "loss": 2.5993, + "step": 10493 + }, + { + "epoch": 6.761966156325544, + "grad_norm": 1.744635542912884, + "learning_rate": 9.561545276114377e-05, + "loss": 2.539, + "step": 10494 + }, + { + "epoch": 6.762610797743755, + "grad_norm": 1.6903050452299653, + "learning_rate": 9.561462729547896e-05, + "loss": 2.7532, + "step": 10495 + }, + { + "epoch": 6.763255439161966, + "grad_norm": 1.590212296286342, + "learning_rate": 9.561380175571905e-05, + "loss": 2.7057, + "step": 10496 + }, + { + "epoch": 6.763900080580177, + "grad_norm": 1.9224234348463811, + "learning_rate": 9.561297614186536e-05, + "loss": 2.4053, + "step": 10497 + }, + { + "epoch": 6.764544721998388, + "grad_norm": 1.6026723670793857, + "learning_rate": 9.561215045391924e-05, + "loss": 2.4406, + "step": 10498 + }, + { + "epoch": 6.765189363416599, + "grad_norm": 1.7239055037727316, + "learning_rate": 9.56113246918821e-05, + "loss": 2.5331, + "step": 10499 + }, + { + "epoch": 6.765834004834811, + "grad_norm": 1.586989784018083, + "learning_rate": 9.561049885575526e-05, + "loss": 2.4256, + "step": 10500 + }, + { + "epoch": 6.765834004834811, + "eval_loss": 4.883433818817139, + "eval_runtime": 2.9938, + "eval_samples_per_second": 33.402, + "eval_steps_per_second": 4.342, + "step": 10500 + }, + { + "epoch": 6.766478646253022, + "grad_norm": 1.7197795044892537, + "learning_rate": 9.560967294554008e-05, + "loss": 2.2789, + "step": 10501 + }, + { + "epoch": 6.767123287671232, + "grad_norm": 1.557733058755237, + "learning_rate": 9.560884696123793e-05, + "loss": 2.6592, + "step": 10502 + }, + { + "epoch": 6.767767929089444, + "grad_norm": 2.0047197113215156, + "learning_rate": 9.560802090285015e-05, + "loss": 2.2967, + "step": 10503 + }, + { + "epoch": 6.768412570507655, + "grad_norm": 1.6511466868534823, + "learning_rate": 9.560719477037808e-05, + "loss": 2.4907, + "step": 10504 + }, + { + "epoch": 6.769057211925866, + "grad_norm": 1.8332853130443436, + "learning_rate": 9.560636856382313e-05, + "loss": 2.8891, + "step": 10505 + }, + { + "epoch": 6.769701853344078, + "grad_norm": 1.8084653082387707, + "learning_rate": 9.560554228318662e-05, + "loss": 2.645, + "step": 10506 + }, + { + "epoch": 6.770346494762288, + "grad_norm": 1.6974634568601938, + "learning_rate": 9.560471592846992e-05, + "loss": 2.5381, + "step": 10507 + }, + { + "epoch": 6.770991136180499, + "grad_norm": 1.8972150844750622, + "learning_rate": 9.560388949967437e-05, + "loss": 2.5043, + "step": 10508 + }, + { + "epoch": 6.771635777598711, + "grad_norm": 1.7684111613326468, + "learning_rate": 9.560306299680132e-05, + "loss": 2.6396, + "step": 10509 + }, + { + "epoch": 6.772280419016922, + "grad_norm": 1.6974410562003615, + "learning_rate": 9.560223641985217e-05, + "loss": 2.817, + "step": 10510 + }, + { + "epoch": 6.772925060435133, + "grad_norm": 1.7147534665769097, + "learning_rate": 9.560140976882825e-05, + "loss": 2.3763, + "step": 10511 + }, + { + "epoch": 6.773569701853344, + "grad_norm": 1.535774828705437, + "learning_rate": 9.560058304373092e-05, + "loss": 2.6858, + "step": 10512 + }, + { + "epoch": 6.774214343271555, + "grad_norm": 1.834289706128217, + "learning_rate": 9.559975624456154e-05, + "loss": 2.8889, + "step": 10513 + }, + { + "epoch": 6.774858984689766, + "grad_norm": 1.7632463670745986, + "learning_rate": 9.559892937132145e-05, + "loss": 2.4862, + "step": 10514 + }, + { + "epoch": 6.7755036261079775, + "grad_norm": 1.7611634444219386, + "learning_rate": 9.559810242401205e-05, + "loss": 2.6362, + "step": 10515 + }, + { + "epoch": 6.776148267526189, + "grad_norm": 1.6109784921577408, + "learning_rate": 9.559727540263466e-05, + "loss": 2.3626, + "step": 10516 + }, + { + "epoch": 6.776792908944399, + "grad_norm": 1.7426946852027174, + "learning_rate": 9.559644830719064e-05, + "loss": 2.7705, + "step": 10517 + }, + { + "epoch": 6.7774375503626105, + "grad_norm": 1.6099105841292467, + "learning_rate": 9.559562113768138e-05, + "loss": 2.3889, + "step": 10518 + }, + { + "epoch": 6.778082191780822, + "grad_norm": 1.703115015996959, + "learning_rate": 9.55947938941082e-05, + "loss": 2.382, + "step": 10519 + }, + { + "epoch": 6.778726833199033, + "grad_norm": 1.6195871941683455, + "learning_rate": 9.55939665764725e-05, + "loss": 2.3396, + "step": 10520 + }, + { + "epoch": 6.779371474617244, + "grad_norm": 1.6524714473273308, + "learning_rate": 9.55931391847756e-05, + "loss": 2.6727, + "step": 10521 + }, + { + "epoch": 6.780016116035455, + "grad_norm": 1.6064959622264818, + "learning_rate": 9.559231171901887e-05, + "loss": 2.2964, + "step": 10522 + }, + { + "epoch": 6.780660757453666, + "grad_norm": 1.6383256017134598, + "learning_rate": 9.559148417920366e-05, + "loss": 2.4899, + "step": 10523 + }, + { + "epoch": 6.781305398871877, + "grad_norm": 1.6199040949930161, + "learning_rate": 9.559065656533137e-05, + "loss": 2.5207, + "step": 10524 + }, + { + "epoch": 6.781950040290089, + "grad_norm": 1.8361802257610829, + "learning_rate": 9.558982887740331e-05, + "loss": 2.4836, + "step": 10525 + }, + { + "epoch": 6.7825946817083, + "grad_norm": 1.7925947035520913, + "learning_rate": 9.558900111542089e-05, + "loss": 2.8746, + "step": 10526 + }, + { + "epoch": 6.7832393231265105, + "grad_norm": 2.0658931135360397, + "learning_rate": 9.558817327938541e-05, + "loss": 2.7319, + "step": 10527 + }, + { + "epoch": 6.783883964544722, + "grad_norm": 1.6990113776635194, + "learning_rate": 9.558734536929827e-05, + "loss": 2.5244, + "step": 10528 + }, + { + "epoch": 6.784528605962933, + "grad_norm": 1.8445186180351112, + "learning_rate": 9.558651738516081e-05, + "loss": 2.4568, + "step": 10529 + }, + { + "epoch": 6.785173247381144, + "grad_norm": 1.6399781458050744, + "learning_rate": 9.558568932697441e-05, + "loss": 2.7555, + "step": 10530 + }, + { + "epoch": 6.785817888799356, + "grad_norm": 1.7358530788923776, + "learning_rate": 9.558486119474041e-05, + "loss": 2.64, + "step": 10531 + }, + { + "epoch": 6.786462530217566, + "grad_norm": 1.6818656541266723, + "learning_rate": 9.558403298846019e-05, + "loss": 2.417, + "step": 10532 + }, + { + "epoch": 6.787107171635777, + "grad_norm": 1.5542691521387222, + "learning_rate": 9.558320470813511e-05, + "loss": 2.9027, + "step": 10533 + }, + { + "epoch": 6.787751813053989, + "grad_norm": 1.7489923411058665, + "learning_rate": 9.55823763537665e-05, + "loss": 2.7737, + "step": 10534 + }, + { + "epoch": 6.7883964544722, + "grad_norm": 1.8426293759947014, + "learning_rate": 9.558154792535575e-05, + "loss": 2.6394, + "step": 10535 + }, + { + "epoch": 6.789041095890411, + "grad_norm": 1.7670376406386965, + "learning_rate": 9.558071942290421e-05, + "loss": 2.3729, + "step": 10536 + }, + { + "epoch": 6.789685737308622, + "grad_norm": 1.5688573779409907, + "learning_rate": 9.557989084641325e-05, + "loss": 2.596, + "step": 10537 + }, + { + "epoch": 6.790330378726833, + "grad_norm": 1.684245032146543, + "learning_rate": 9.557906219588421e-05, + "loss": 2.5528, + "step": 10538 + }, + { + "epoch": 6.790975020145044, + "grad_norm": 1.6828831480919022, + "learning_rate": 9.557823347131848e-05, + "loss": 2.4743, + "step": 10539 + }, + { + "epoch": 6.791619661563256, + "grad_norm": 1.5804260403369257, + "learning_rate": 9.55774046727174e-05, + "loss": 2.4975, + "step": 10540 + }, + { + "epoch": 6.792264302981467, + "grad_norm": 1.6915560582615898, + "learning_rate": 9.557657580008232e-05, + "loss": 2.5276, + "step": 10541 + }, + { + "epoch": 6.792908944399677, + "grad_norm": 1.7996451492525225, + "learning_rate": 9.557574685341462e-05, + "loss": 2.4219, + "step": 10542 + }, + { + "epoch": 6.793553585817889, + "grad_norm": 1.7714008557669143, + "learning_rate": 9.557491783271566e-05, + "loss": 2.5527, + "step": 10543 + }, + { + "epoch": 6.7941982272361, + "grad_norm": 2.0575774096023496, + "learning_rate": 9.557408873798682e-05, + "loss": 2.5313, + "step": 10544 + }, + { + "epoch": 6.794842868654311, + "grad_norm": 1.6077905680204403, + "learning_rate": 9.557325956922942e-05, + "loss": 2.5801, + "step": 10545 + }, + { + "epoch": 6.7954875100725225, + "grad_norm": 1.7680051349972066, + "learning_rate": 9.557243032644485e-05, + "loss": 2.5941, + "step": 10546 + }, + { + "epoch": 6.796132151490733, + "grad_norm": 1.6929647813548605, + "learning_rate": 9.557160100963447e-05, + "loss": 2.7383, + "step": 10547 + }, + { + "epoch": 6.796776792908944, + "grad_norm": 1.7864514271890106, + "learning_rate": 9.557077161879964e-05, + "loss": 2.4279, + "step": 10548 + }, + { + "epoch": 6.7974214343271555, + "grad_norm": 1.7296868329630197, + "learning_rate": 9.55699421539417e-05, + "loss": 2.5757, + "step": 10549 + }, + { + "epoch": 6.798066075745367, + "grad_norm": 1.8795987362260833, + "learning_rate": 9.556911261506206e-05, + "loss": 2.655, + "step": 10550 + }, + { + "epoch": 6.798710717163578, + "grad_norm": 1.7577716006220536, + "learning_rate": 9.556828300216205e-05, + "loss": 2.5231, + "step": 10551 + }, + { + "epoch": 6.7993553585817885, + "grad_norm": 1.804797396042713, + "learning_rate": 9.556745331524301e-05, + "loss": 2.7587, + "step": 10552 + }, + { + "epoch": 6.8, + "grad_norm": 1.6804401472100843, + "learning_rate": 9.556662355430636e-05, + "loss": 2.3628, + "step": 10553 + }, + { + "epoch": 6.800644641418211, + "grad_norm": 1.7429004145689553, + "learning_rate": 9.556579371935341e-05, + "loss": 2.4277, + "step": 10554 + }, + { + "epoch": 6.801289282836422, + "grad_norm": 1.5970415077948608, + "learning_rate": 9.556496381038557e-05, + "loss": 2.3281, + "step": 10555 + }, + { + "epoch": 6.801933924254634, + "grad_norm": 1.9034814146209482, + "learning_rate": 9.556413382740416e-05, + "loss": 2.4979, + "step": 10556 + }, + { + "epoch": 6.802578565672844, + "grad_norm": 1.7623365922420515, + "learning_rate": 9.556330377041057e-05, + "loss": 2.5579, + "step": 10557 + }, + { + "epoch": 6.803223207091055, + "grad_norm": 2.1468316232473366, + "learning_rate": 9.556247363940616e-05, + "loss": 2.4981, + "step": 10558 + }, + { + "epoch": 6.803867848509267, + "grad_norm": 1.7236951329493269, + "learning_rate": 9.556164343439226e-05, + "loss": 2.3768, + "step": 10559 + }, + { + "epoch": 6.804512489927478, + "grad_norm": 1.834490494968536, + "learning_rate": 9.556081315537028e-05, + "loss": 2.6197, + "step": 10560 + }, + { + "epoch": 6.805157131345689, + "grad_norm": 1.8179416556148942, + "learning_rate": 9.555998280234157e-05, + "loss": 2.424, + "step": 10561 + }, + { + "epoch": 6.8058017727639, + "grad_norm": 1.9418755318285938, + "learning_rate": 9.55591523753075e-05, + "loss": 2.6688, + "step": 10562 + }, + { + "epoch": 6.806446414182111, + "grad_norm": 2.00667244806701, + "learning_rate": 9.555832187426941e-05, + "loss": 2.8163, + "step": 10563 + }, + { + "epoch": 6.807091055600322, + "grad_norm": 1.6416812046227427, + "learning_rate": 9.555749129922867e-05, + "loss": 2.5656, + "step": 10564 + }, + { + "epoch": 6.807735697018534, + "grad_norm": 2.0290347610360753, + "learning_rate": 9.555666065018667e-05, + "loss": 2.7668, + "step": 10565 + }, + { + "epoch": 6.808380338436745, + "grad_norm": 1.772773018069502, + "learning_rate": 9.555582992714475e-05, + "loss": 2.4758, + "step": 10566 + }, + { + "epoch": 6.809024979854955, + "grad_norm": 1.9692021715449615, + "learning_rate": 9.555499913010427e-05, + "loss": 2.3418, + "step": 10567 + }, + { + "epoch": 6.809669621273167, + "grad_norm": 1.630133681680224, + "learning_rate": 9.555416825906659e-05, + "loss": 2.8781, + "step": 10568 + }, + { + "epoch": 6.810314262691378, + "grad_norm": 1.697256947118775, + "learning_rate": 9.555333731403311e-05, + "loss": 2.325, + "step": 10569 + }, + { + "epoch": 6.810958904109589, + "grad_norm": 1.6518411180178376, + "learning_rate": 9.555250629500517e-05, + "loss": 2.6021, + "step": 10570 + }, + { + "epoch": 6.811603545527801, + "grad_norm": 1.7201976312484457, + "learning_rate": 9.555167520198415e-05, + "loss": 2.943, + "step": 10571 + }, + { + "epoch": 6.812248186946011, + "grad_norm": 1.6014141840940532, + "learning_rate": 9.555084403497138e-05, + "loss": 2.4686, + "step": 10572 + }, + { + "epoch": 6.812892828364222, + "grad_norm": 1.6603846633541983, + "learning_rate": 9.555001279396828e-05, + "loss": 2.6052, + "step": 10573 + }, + { + "epoch": 6.813537469782434, + "grad_norm": 1.6909206915937331, + "learning_rate": 9.554918147897615e-05, + "loss": 2.5688, + "step": 10574 + }, + { + "epoch": 6.814182111200645, + "grad_norm": 1.5843059921748024, + "learning_rate": 9.554835008999641e-05, + "loss": 2.7569, + "step": 10575 + }, + { + "epoch": 6.814826752618856, + "grad_norm": 1.6641886125005354, + "learning_rate": 9.55475186270304e-05, + "loss": 2.4717, + "step": 10576 + }, + { + "epoch": 6.815471394037067, + "grad_norm": 1.565898692485087, + "learning_rate": 9.554668709007948e-05, + "loss": 2.2635, + "step": 10577 + }, + { + "epoch": 6.816116035455278, + "grad_norm": 1.7353471086284873, + "learning_rate": 9.554585547914504e-05, + "loss": 2.5399, + "step": 10578 + }, + { + "epoch": 6.816760676873489, + "grad_norm": 1.7658908427880637, + "learning_rate": 9.554502379422842e-05, + "loss": 2.6277, + "step": 10579 + }, + { + "epoch": 6.8174053182917005, + "grad_norm": 1.8566567057963055, + "learning_rate": 9.5544192035331e-05, + "loss": 2.798, + "step": 10580 + }, + { + "epoch": 6.818049959709912, + "grad_norm": 1.6069848403846798, + "learning_rate": 9.554336020245416e-05, + "loss": 2.5769, + "step": 10581 + }, + { + "epoch": 6.818694601128122, + "grad_norm": 1.7802646510641926, + "learning_rate": 9.554252829559923e-05, + "loss": 2.6376, + "step": 10582 + }, + { + "epoch": 6.8193392425463335, + "grad_norm": 1.5775649062889472, + "learning_rate": 9.554169631476759e-05, + "loss": 2.5482, + "step": 10583 + }, + { + "epoch": 6.819983883964545, + "grad_norm": 1.99635964463089, + "learning_rate": 9.554086425996063e-05, + "loss": 2.6468, + "step": 10584 + }, + { + "epoch": 6.820628525382756, + "grad_norm": 1.5783977036545767, + "learning_rate": 9.554003213117969e-05, + "loss": 2.6549, + "step": 10585 + }, + { + "epoch": 6.821273166800967, + "grad_norm": 2.0343792762633015, + "learning_rate": 9.553919992842616e-05, + "loss": 2.4539, + "step": 10586 + }, + { + "epoch": 6.821917808219178, + "grad_norm": 1.7280945155853278, + "learning_rate": 9.553836765170137e-05, + "loss": 2.7304, + "step": 10587 + }, + { + "epoch": 6.822562449637389, + "grad_norm": 2.0481385544274233, + "learning_rate": 9.553753530100672e-05, + "loss": 2.0794, + "step": 10588 + }, + { + "epoch": 6.8232070910556, + "grad_norm": 1.7830555813074749, + "learning_rate": 9.553670287634358e-05, + "loss": 2.3889, + "step": 10589 + }, + { + "epoch": 6.823851732473812, + "grad_norm": 1.802537020158496, + "learning_rate": 9.55358703777133e-05, + "loss": 2.7476, + "step": 10590 + }, + { + "epoch": 6.824496373892023, + "grad_norm": 1.7390256118426508, + "learning_rate": 9.553503780511723e-05, + "loss": 2.6691, + "step": 10591 + }, + { + "epoch": 6.825141015310233, + "grad_norm": 1.912698886464437, + "learning_rate": 9.553420515855678e-05, + "loss": 2.7383, + "step": 10592 + }, + { + "epoch": 6.825785656728445, + "grad_norm": 1.7062783534835253, + "learning_rate": 9.553337243803329e-05, + "loss": 2.6855, + "step": 10593 + }, + { + "epoch": 6.826430298146656, + "grad_norm": 1.769908196538734, + "learning_rate": 9.553253964354812e-05, + "loss": 2.5153, + "step": 10594 + }, + { + "epoch": 6.827074939564867, + "grad_norm": 1.7382239206731505, + "learning_rate": 9.553170677510265e-05, + "loss": 2.6698, + "step": 10595 + }, + { + "epoch": 6.827719580983079, + "grad_norm": 1.8452999897407216, + "learning_rate": 9.553087383269826e-05, + "loss": 2.9715, + "step": 10596 + }, + { + "epoch": 6.828364222401289, + "grad_norm": 1.7114967432578811, + "learning_rate": 9.553004081633632e-05, + "loss": 2.4659, + "step": 10597 + }, + { + "epoch": 6.8290088638195, + "grad_norm": 1.9292993898098716, + "learning_rate": 9.552920772601816e-05, + "loss": 2.6316, + "step": 10598 + }, + { + "epoch": 6.829653505237712, + "grad_norm": 1.7952250734107864, + "learning_rate": 9.55283745617452e-05, + "loss": 2.5576, + "step": 10599 + }, + { + "epoch": 6.830298146655923, + "grad_norm": 1.945596592061578, + "learning_rate": 9.552754132351876e-05, + "loss": 2.3626, + "step": 10600 + }, + { + "epoch": 6.830298146655923, + "eval_loss": 4.898785591125488, + "eval_runtime": 2.9703, + "eval_samples_per_second": 33.666, + "eval_steps_per_second": 4.377, + "step": 10600 + }, + { + "epoch": 6.830942788074134, + "grad_norm": 1.893245076258688, + "learning_rate": 9.552670801134024e-05, + "loss": 2.5895, + "step": 10601 + }, + { + "epoch": 6.831587429492345, + "grad_norm": 1.9171535086426301, + "learning_rate": 9.552587462521099e-05, + "loss": 2.5293, + "step": 10602 + }, + { + "epoch": 6.832232070910556, + "grad_norm": 1.7747235653964015, + "learning_rate": 9.552504116513238e-05, + "loss": 2.4135, + "step": 10603 + }, + { + "epoch": 6.832876712328767, + "grad_norm": 1.9827437796931782, + "learning_rate": 9.55242076311058e-05, + "loss": 2.5642, + "step": 10604 + }, + { + "epoch": 6.8335213537469786, + "grad_norm": 1.8038792366561895, + "learning_rate": 9.552337402313262e-05, + "loss": 2.763, + "step": 10605 + }, + { + "epoch": 6.83416599516519, + "grad_norm": 2.1806603955117883, + "learning_rate": 9.552254034121418e-05, + "loss": 2.3857, + "step": 10606 + }, + { + "epoch": 6.8348106365834, + "grad_norm": 1.8766343381834907, + "learning_rate": 9.552170658535186e-05, + "loss": 2.3602, + "step": 10607 + }, + { + "epoch": 6.835455278001612, + "grad_norm": 2.2576114954315116, + "learning_rate": 9.552087275554704e-05, + "loss": 2.8245, + "step": 10608 + }, + { + "epoch": 6.836099919419823, + "grad_norm": 1.9644478151685862, + "learning_rate": 9.552003885180108e-05, + "loss": 2.5702, + "step": 10609 + }, + { + "epoch": 6.836744560838034, + "grad_norm": 2.079095697347591, + "learning_rate": 9.551920487411536e-05, + "loss": 2.5361, + "step": 10610 + }, + { + "epoch": 6.837389202256245, + "grad_norm": 2.0946907224428806, + "learning_rate": 9.551837082249123e-05, + "loss": 2.6716, + "step": 10611 + }, + { + "epoch": 6.838033843674456, + "grad_norm": 1.9088623926037604, + "learning_rate": 9.551753669693008e-05, + "loss": 2.2534, + "step": 10612 + }, + { + "epoch": 6.838678485092667, + "grad_norm": 1.8242278486012995, + "learning_rate": 9.551670249743327e-05, + "loss": 2.455, + "step": 10613 + }, + { + "epoch": 6.8393231265108785, + "grad_norm": 1.928770594980851, + "learning_rate": 9.551586822400216e-05, + "loss": 2.6107, + "step": 10614 + }, + { + "epoch": 6.83996776792909, + "grad_norm": 1.773881152485354, + "learning_rate": 9.551503387663814e-05, + "loss": 2.5169, + "step": 10615 + }, + { + "epoch": 6.8406124093473, + "grad_norm": 1.7070520444253807, + "learning_rate": 9.551419945534256e-05, + "loss": 2.4962, + "step": 10616 + }, + { + "epoch": 6.8412570507655115, + "grad_norm": 1.740533646450984, + "learning_rate": 9.551336496011682e-05, + "loss": 2.3153, + "step": 10617 + }, + { + "epoch": 6.841901692183723, + "grad_norm": 1.9752505804189808, + "learning_rate": 9.551253039096229e-05, + "loss": 2.6755, + "step": 10618 + }, + { + "epoch": 6.842546333601934, + "grad_norm": 1.720579909684287, + "learning_rate": 9.551169574788029e-05, + "loss": 2.7241, + "step": 10619 + }, + { + "epoch": 6.843190975020145, + "grad_norm": 1.9488512068455843, + "learning_rate": 9.551086103087224e-05, + "loss": 2.7009, + "step": 10620 + }, + { + "epoch": 6.843835616438356, + "grad_norm": 1.9525195277987715, + "learning_rate": 9.551002623993949e-05, + "loss": 2.8993, + "step": 10621 + }, + { + "epoch": 6.844480257856567, + "grad_norm": 1.9416136530036514, + "learning_rate": 9.550919137508342e-05, + "loss": 2.5952, + "step": 10622 + }, + { + "epoch": 6.845124899274778, + "grad_norm": 1.9999455019379502, + "learning_rate": 9.550835643630539e-05, + "loss": 2.7878, + "step": 10623 + }, + { + "epoch": 6.84576954069299, + "grad_norm": 2.3572177789141904, + "learning_rate": 9.55075214236068e-05, + "loss": 2.3862, + "step": 10624 + }, + { + "epoch": 6.846414182111201, + "grad_norm": 1.6866737632978555, + "learning_rate": 9.550668633698898e-05, + "loss": 2.3468, + "step": 10625 + }, + { + "epoch": 6.847058823529411, + "grad_norm": 1.984377057980437, + "learning_rate": 9.550585117645334e-05, + "loss": 2.455, + "step": 10626 + }, + { + "epoch": 6.847703464947623, + "grad_norm": 1.783194228254046, + "learning_rate": 9.55050159420012e-05, + "loss": 2.8182, + "step": 10627 + }, + { + "epoch": 6.848348106365834, + "grad_norm": 1.865710356978981, + "learning_rate": 9.550418063363401e-05, + "loss": 2.6891, + "step": 10628 + }, + { + "epoch": 6.848992747784045, + "grad_norm": 1.7922618848417562, + "learning_rate": 9.550334525135307e-05, + "loss": 2.4272, + "step": 10629 + }, + { + "epoch": 6.849637389202257, + "grad_norm": 1.5674041028771255, + "learning_rate": 9.550250979515978e-05, + "loss": 2.6639, + "step": 10630 + }, + { + "epoch": 6.850282030620467, + "grad_norm": 1.9733877549382863, + "learning_rate": 9.550167426505551e-05, + "loss": 2.439, + "step": 10631 + }, + { + "epoch": 6.850926672038678, + "grad_norm": 1.4675958998530638, + "learning_rate": 9.550083866104165e-05, + "loss": 2.7141, + "step": 10632 + }, + { + "epoch": 6.85157131345689, + "grad_norm": 1.8754500633560407, + "learning_rate": 9.550000298311955e-05, + "loss": 2.6464, + "step": 10633 + }, + { + "epoch": 6.852215954875101, + "grad_norm": 1.6616136245522897, + "learning_rate": 9.549916723129057e-05, + "loss": 2.8162, + "step": 10634 + }, + { + "epoch": 6.852860596293311, + "grad_norm": 1.7322961429658028, + "learning_rate": 9.549833140555611e-05, + "loss": 2.4406, + "step": 10635 + }, + { + "epoch": 6.853505237711523, + "grad_norm": 1.435251789312024, + "learning_rate": 9.549749550591756e-05, + "loss": 2.2737, + "step": 10636 + }, + { + "epoch": 6.854149879129734, + "grad_norm": 1.700742933576434, + "learning_rate": 9.549665953237623e-05, + "loss": 2.4515, + "step": 10637 + }, + { + "epoch": 6.854794520547945, + "grad_norm": 1.52479449453031, + "learning_rate": 9.549582348493354e-05, + "loss": 2.6714, + "step": 10638 + }, + { + "epoch": 6.8554391619661565, + "grad_norm": 1.6156173467983024, + "learning_rate": 9.549498736359086e-05, + "loss": 2.4169, + "step": 10639 + }, + { + "epoch": 6.856083803384367, + "grad_norm": 1.4714042102547273, + "learning_rate": 9.549415116834956e-05, + "loss": 2.798, + "step": 10640 + }, + { + "epoch": 6.856728444802578, + "grad_norm": 1.648860261858742, + "learning_rate": 9.549331489921101e-05, + "loss": 2.7353, + "step": 10641 + }, + { + "epoch": 6.85737308622079, + "grad_norm": 1.596896041443441, + "learning_rate": 9.549247855617658e-05, + "loss": 2.7561, + "step": 10642 + }, + { + "epoch": 6.858017727639001, + "grad_norm": 1.6443963882051955, + "learning_rate": 9.549164213924764e-05, + "loss": 2.6475, + "step": 10643 + }, + { + "epoch": 6.858662369057212, + "grad_norm": 1.6555353073020087, + "learning_rate": 9.549080564842556e-05, + "loss": 2.8978, + "step": 10644 + }, + { + "epoch": 6.859307010475423, + "grad_norm": 1.7325797041351367, + "learning_rate": 9.548996908371175e-05, + "loss": 2.2642, + "step": 10645 + }, + { + "epoch": 6.859951651893634, + "grad_norm": 1.6321068073758171, + "learning_rate": 9.548913244510754e-05, + "loss": 2.5358, + "step": 10646 + }, + { + "epoch": 6.860596293311845, + "grad_norm": 1.6356811596261438, + "learning_rate": 9.548829573261433e-05, + "loss": 2.6856, + "step": 10647 + }, + { + "epoch": 6.8612409347300565, + "grad_norm": 1.6424705158780275, + "learning_rate": 9.548745894623349e-05, + "loss": 2.6431, + "step": 10648 + }, + { + "epoch": 6.861885576148268, + "grad_norm": 1.6172410856741821, + "learning_rate": 9.548662208596639e-05, + "loss": 2.6057, + "step": 10649 + }, + { + "epoch": 6.862530217566478, + "grad_norm": 1.5405364126779275, + "learning_rate": 9.548578515181438e-05, + "loss": 2.6468, + "step": 10650 + }, + { + "epoch": 6.8631748589846895, + "grad_norm": 1.605385936074553, + "learning_rate": 9.548494814377889e-05, + "loss": 2.5739, + "step": 10651 + }, + { + "epoch": 6.863819500402901, + "grad_norm": 1.7037527368672383, + "learning_rate": 9.548411106186124e-05, + "loss": 2.7844, + "step": 10652 + }, + { + "epoch": 6.864464141821112, + "grad_norm": 1.6282988381791679, + "learning_rate": 9.548327390606284e-05, + "loss": 2.8635, + "step": 10653 + }, + { + "epoch": 6.865108783239323, + "grad_norm": 2.0289306829814384, + "learning_rate": 9.548243667638507e-05, + "loss": 2.6145, + "step": 10654 + }, + { + "epoch": 6.865753424657534, + "grad_norm": 1.6553295910750945, + "learning_rate": 9.548159937282927e-05, + "loss": 2.5521, + "step": 10655 + }, + { + "epoch": 6.866398066075745, + "grad_norm": 1.9787117123176867, + "learning_rate": 9.548076199539684e-05, + "loss": 2.5809, + "step": 10656 + }, + { + "epoch": 6.867042707493956, + "grad_norm": 1.742642837256717, + "learning_rate": 9.547992454408914e-05, + "loss": 2.6451, + "step": 10657 + }, + { + "epoch": 6.867687348912168, + "grad_norm": 1.7614786614736808, + "learning_rate": 9.547908701890757e-05, + "loss": 2.5963, + "step": 10658 + }, + { + "epoch": 6.868331990330379, + "grad_norm": 1.5403156003111138, + "learning_rate": 9.547824941985349e-05, + "loss": 2.3818, + "step": 10659 + }, + { + "epoch": 6.868976631748589, + "grad_norm": 1.6583558700870609, + "learning_rate": 9.547741174692825e-05, + "loss": 2.4825, + "step": 10660 + }, + { + "epoch": 6.869621273166801, + "grad_norm": 1.7354344728939985, + "learning_rate": 9.547657400013328e-05, + "loss": 2.4578, + "step": 10661 + }, + { + "epoch": 6.870265914585012, + "grad_norm": 1.6730691138241418, + "learning_rate": 9.547573617946993e-05, + "loss": 2.3566, + "step": 10662 + }, + { + "epoch": 6.870910556003223, + "grad_norm": 1.776769378564079, + "learning_rate": 9.547489828493956e-05, + "loss": 2.676, + "step": 10663 + }, + { + "epoch": 6.871555197421435, + "grad_norm": 1.916766889422264, + "learning_rate": 9.547406031654356e-05, + "loss": 2.5786, + "step": 10664 + }, + { + "epoch": 6.872199838839645, + "grad_norm": 1.7815567632828044, + "learning_rate": 9.547322227428331e-05, + "loss": 2.5637, + "step": 10665 + }, + { + "epoch": 6.872844480257856, + "grad_norm": 1.8132047091359698, + "learning_rate": 9.54723841581602e-05, + "loss": 2.6796, + "step": 10666 + }, + { + "epoch": 6.873489121676068, + "grad_norm": 1.748826435463122, + "learning_rate": 9.547154596817556e-05, + "loss": 2.4486, + "step": 10667 + }, + { + "epoch": 6.874133763094279, + "grad_norm": 1.9270669578173396, + "learning_rate": 9.54707077043308e-05, + "loss": 2.5515, + "step": 10668 + }, + { + "epoch": 6.87477840451249, + "grad_norm": 1.9497033553884984, + "learning_rate": 9.54698693666273e-05, + "loss": 2.4588, + "step": 10669 + }, + { + "epoch": 6.875423045930701, + "grad_norm": 1.7488846115884056, + "learning_rate": 9.546903095506644e-05, + "loss": 2.4607, + "step": 10670 + }, + { + "epoch": 6.876067687348912, + "grad_norm": 1.7470387387357003, + "learning_rate": 9.546819246964957e-05, + "loss": 2.5432, + "step": 10671 + }, + { + "epoch": 6.876712328767123, + "grad_norm": 1.8824012620971264, + "learning_rate": 9.546735391037808e-05, + "loss": 2.5669, + "step": 10672 + }, + { + "epoch": 6.8773569701853345, + "grad_norm": 1.7209143693439626, + "learning_rate": 9.546651527725336e-05, + "loss": 2.5831, + "step": 10673 + }, + { + "epoch": 6.878001611603546, + "grad_norm": 1.6707941400172726, + "learning_rate": 9.546567657027677e-05, + "loss": 2.7031, + "step": 10674 + }, + { + "epoch": 6.878646253021756, + "grad_norm": 1.7098853203972113, + "learning_rate": 9.54648377894497e-05, + "loss": 2.6297, + "step": 10675 + }, + { + "epoch": 6.8792908944399676, + "grad_norm": 1.6052210690319202, + "learning_rate": 9.546399893477353e-05, + "loss": 2.7489, + "step": 10676 + }, + { + "epoch": 6.879935535858179, + "grad_norm": 1.628435688484134, + "learning_rate": 9.546316000624963e-05, + "loss": 2.9075, + "step": 10677 + }, + { + "epoch": 6.88058017727639, + "grad_norm": 1.631779147327234, + "learning_rate": 9.546232100387938e-05, + "loss": 2.363, + "step": 10678 + }, + { + "epoch": 6.8812248186946015, + "grad_norm": 1.7592299207453825, + "learning_rate": 9.546148192766413e-05, + "loss": 2.8789, + "step": 10679 + }, + { + "epoch": 6.881869460112812, + "grad_norm": 1.6726225884124575, + "learning_rate": 9.546064277760531e-05, + "loss": 2.626, + "step": 10680 + }, + { + "epoch": 6.882514101531023, + "grad_norm": 1.7870630704182062, + "learning_rate": 9.545980355370427e-05, + "loss": 2.3636, + "step": 10681 + }, + { + "epoch": 6.8831587429492345, + "grad_norm": 1.575748030697134, + "learning_rate": 9.545896425596238e-05, + "loss": 2.6224, + "step": 10682 + }, + { + "epoch": 6.883803384367446, + "grad_norm": 1.7026981745200305, + "learning_rate": 9.545812488438105e-05, + "loss": 2.4237, + "step": 10683 + }, + { + "epoch": 6.884448025785657, + "grad_norm": 1.6675572027657624, + "learning_rate": 9.545728543896162e-05, + "loss": 2.7203, + "step": 10684 + }, + { + "epoch": 6.8850926672038675, + "grad_norm": 1.6651157503464558, + "learning_rate": 9.54564459197055e-05, + "loss": 2.4636, + "step": 10685 + }, + { + "epoch": 6.885737308622079, + "grad_norm": 6.485696638818666, + "learning_rate": 9.545560632661404e-05, + "loss": 2.7034, + "step": 10686 + }, + { + "epoch": 6.88638195004029, + "grad_norm": 1.7097692087995529, + "learning_rate": 9.545476665968864e-05, + "loss": 2.3825, + "step": 10687 + }, + { + "epoch": 6.887026591458501, + "grad_norm": 1.730576563431928, + "learning_rate": 9.545392691893068e-05, + "loss": 2.7612, + "step": 10688 + }, + { + "epoch": 6.887671232876713, + "grad_norm": 1.6620614015380935, + "learning_rate": 9.545308710434153e-05, + "loss": 2.542, + "step": 10689 + }, + { + "epoch": 6.888315874294923, + "grad_norm": 1.74489509608904, + "learning_rate": 9.545224721592257e-05, + "loss": 2.6899, + "step": 10690 + }, + { + "epoch": 6.888960515713134, + "grad_norm": 1.7311249847642354, + "learning_rate": 9.545140725367519e-05, + "loss": 2.7773, + "step": 10691 + }, + { + "epoch": 6.889605157131346, + "grad_norm": 1.8409574763140473, + "learning_rate": 9.545056721760075e-05, + "loss": 2.6279, + "step": 10692 + }, + { + "epoch": 6.890249798549557, + "grad_norm": 1.7079529176011625, + "learning_rate": 9.544972710770064e-05, + "loss": 2.5614, + "step": 10693 + }, + { + "epoch": 6.890894439967768, + "grad_norm": 1.6028650594725595, + "learning_rate": 9.544888692397625e-05, + "loss": 2.4157, + "step": 10694 + }, + { + "epoch": 6.891539081385979, + "grad_norm": 1.8082177422257106, + "learning_rate": 9.544804666642895e-05, + "loss": 2.6406, + "step": 10695 + }, + { + "epoch": 6.89218372280419, + "grad_norm": 1.7245067169588317, + "learning_rate": 9.544720633506012e-05, + "loss": 2.8918, + "step": 10696 + }, + { + "epoch": 6.892828364222401, + "grad_norm": 1.9038165720011595, + "learning_rate": 9.54463659298711e-05, + "loss": 2.5121, + "step": 10697 + }, + { + "epoch": 6.893473005640613, + "grad_norm": 1.8708589865918237, + "learning_rate": 9.544552545086335e-05, + "loss": 2.7887, + "step": 10698 + }, + { + "epoch": 6.894117647058824, + "grad_norm": 1.7148447658424537, + "learning_rate": 9.544468489803822e-05, + "loss": 2.4741, + "step": 10699 + }, + { + "epoch": 6.894762288477034, + "grad_norm": 1.8815204431232415, + "learning_rate": 9.544384427139707e-05, + "loss": 2.5049, + "step": 10700 + }, + { + "epoch": 6.894762288477034, + "eval_loss": 4.777799606323242, + "eval_runtime": 2.9805, + "eval_samples_per_second": 33.552, + "eval_steps_per_second": 4.362, + "step": 10700 + }, + { + "epoch": 6.895406929895246, + "grad_norm": 1.8600885242159262, + "learning_rate": 9.544300357094129e-05, + "loss": 2.7449, + "step": 10701 + }, + { + "epoch": 6.896051571313457, + "grad_norm": 1.752468396391317, + "learning_rate": 9.544216279667227e-05, + "loss": 2.8159, + "step": 10702 + }, + { + "epoch": 6.896696212731668, + "grad_norm": 1.7184003484452697, + "learning_rate": 9.544132194859137e-05, + "loss": 2.6158, + "step": 10703 + }, + { + "epoch": 6.8973408541498795, + "grad_norm": 1.587047641475336, + "learning_rate": 9.544048102669999e-05, + "loss": 2.3738, + "step": 10704 + }, + { + "epoch": 6.89798549556809, + "grad_norm": 1.7148505514539805, + "learning_rate": 9.54396400309995e-05, + "loss": 2.2046, + "step": 10705 + }, + { + "epoch": 6.898630136986301, + "grad_norm": 1.996805130768899, + "learning_rate": 9.54387989614913e-05, + "loss": 2.7247, + "step": 10706 + }, + { + "epoch": 6.8992747784045125, + "grad_norm": 1.642596635164513, + "learning_rate": 9.543795781817677e-05, + "loss": 2.633, + "step": 10707 + }, + { + "epoch": 6.899919419822724, + "grad_norm": 1.7240335828294326, + "learning_rate": 9.543711660105726e-05, + "loss": 2.5246, + "step": 10708 + }, + { + "epoch": 6.900564061240935, + "grad_norm": 1.778593165031121, + "learning_rate": 9.543627531013416e-05, + "loss": 2.5615, + "step": 10709 + }, + { + "epoch": 6.9012087026591455, + "grad_norm": 1.6366310840938683, + "learning_rate": 9.543543394540889e-05, + "loss": 2.5356, + "step": 10710 + }, + { + "epoch": 6.901853344077357, + "grad_norm": 11.879953886214045, + "learning_rate": 9.543459250688279e-05, + "loss": 2.7259, + "step": 10711 + }, + { + "epoch": 6.902497985495568, + "grad_norm": 1.7433459830006095, + "learning_rate": 9.543375099455726e-05, + "loss": 2.8318, + "step": 10712 + }, + { + "epoch": 6.9031426269137794, + "grad_norm": 1.6943917844779957, + "learning_rate": 9.543290940843368e-05, + "loss": 2.7031, + "step": 10713 + }, + { + "epoch": 6.903787268331991, + "grad_norm": 1.852837808090599, + "learning_rate": 9.543206774851343e-05, + "loss": 2.5448, + "step": 10714 + }, + { + "epoch": 6.904431909750201, + "grad_norm": 1.9150288379273486, + "learning_rate": 9.543122601479789e-05, + "loss": 2.6453, + "step": 10715 + }, + { + "epoch": 6.9050765511684125, + "grad_norm": 2.02090128014072, + "learning_rate": 9.543038420728846e-05, + "loss": 2.5452, + "step": 10716 + }, + { + "epoch": 6.905721192586624, + "grad_norm": 2.334445800230615, + "learning_rate": 9.54295423259865e-05, + "loss": 2.6798, + "step": 10717 + }, + { + "epoch": 6.906365834004835, + "grad_norm": 2.0486266592140905, + "learning_rate": 9.54287003708934e-05, + "loss": 2.4939, + "step": 10718 + }, + { + "epoch": 6.907010475423046, + "grad_norm": 1.7121002587753782, + "learning_rate": 9.542785834201054e-05, + "loss": 2.7955, + "step": 10719 + }, + { + "epoch": 6.907655116841257, + "grad_norm": 1.8902656459698846, + "learning_rate": 9.542701623933932e-05, + "loss": 2.5404, + "step": 10720 + }, + { + "epoch": 6.908299758259468, + "grad_norm": 1.795762491641506, + "learning_rate": 9.54261740628811e-05, + "loss": 2.5776, + "step": 10721 + }, + { + "epoch": 6.908944399677679, + "grad_norm": 1.772804899462848, + "learning_rate": 9.542533181263728e-05, + "loss": 2.8115, + "step": 10722 + }, + { + "epoch": 6.909589041095891, + "grad_norm": 1.89450431082389, + "learning_rate": 9.542448948860923e-05, + "loss": 2.4625, + "step": 10723 + }, + { + "epoch": 6.910233682514102, + "grad_norm": 1.8298403116959738, + "learning_rate": 9.542364709079834e-05, + "loss": 2.8811, + "step": 10724 + }, + { + "epoch": 6.910878323932312, + "grad_norm": 1.657462607468151, + "learning_rate": 9.5422804619206e-05, + "loss": 2.6432, + "step": 10725 + }, + { + "epoch": 6.911522965350524, + "grad_norm": 1.7712291679966075, + "learning_rate": 9.542196207383358e-05, + "loss": 2.4185, + "step": 10726 + }, + { + "epoch": 6.912167606768735, + "grad_norm": 1.6924120034703514, + "learning_rate": 9.542111945468247e-05, + "loss": 2.8787, + "step": 10727 + }, + { + "epoch": 6.912812248186946, + "grad_norm": 1.6410399514485519, + "learning_rate": 9.542027676175406e-05, + "loss": 2.6382, + "step": 10728 + }, + { + "epoch": 6.913456889605158, + "grad_norm": 1.6242374358244627, + "learning_rate": 9.541943399504973e-05, + "loss": 2.8641, + "step": 10729 + }, + { + "epoch": 6.914101531023368, + "grad_norm": 1.695699856041165, + "learning_rate": 9.541859115457085e-05, + "loss": 2.6198, + "step": 10730 + }, + { + "epoch": 6.914746172441579, + "grad_norm": 1.5945257252106504, + "learning_rate": 9.541774824031882e-05, + "loss": 2.7021, + "step": 10731 + }, + { + "epoch": 6.915390813859791, + "grad_norm": 1.7115143159899335, + "learning_rate": 9.541690525229504e-05, + "loss": 2.6338, + "step": 10732 + }, + { + "epoch": 6.916035455278002, + "grad_norm": 1.5670235933863266, + "learning_rate": 9.541606219050086e-05, + "loss": 2.7564, + "step": 10733 + }, + { + "epoch": 6.916680096696213, + "grad_norm": 1.707471945416202, + "learning_rate": 9.54152190549377e-05, + "loss": 2.7524, + "step": 10734 + }, + { + "epoch": 6.917324738114424, + "grad_norm": 1.612171424587669, + "learning_rate": 9.541437584560691e-05, + "loss": 2.6304, + "step": 10735 + }, + { + "epoch": 6.917969379532635, + "grad_norm": 1.6727648324545772, + "learning_rate": 9.541353256250988e-05, + "loss": 2.5914, + "step": 10736 + }, + { + "epoch": 6.918614020950846, + "grad_norm": 1.5908408549016557, + "learning_rate": 9.541268920564804e-05, + "loss": 2.659, + "step": 10737 + }, + { + "epoch": 6.9192586623690575, + "grad_norm": 1.5298659514470152, + "learning_rate": 9.541184577502272e-05, + "loss": 2.7557, + "step": 10738 + }, + { + "epoch": 6.919903303787269, + "grad_norm": 1.61798145864766, + "learning_rate": 9.541100227063532e-05, + "loss": 2.8283, + "step": 10739 + }, + { + "epoch": 6.920547945205479, + "grad_norm": 1.7170275884217139, + "learning_rate": 9.541015869248722e-05, + "loss": 2.6796, + "step": 10740 + }, + { + "epoch": 6.9211925866236905, + "grad_norm": 1.6581774967847651, + "learning_rate": 9.540931504057983e-05, + "loss": 2.7396, + "step": 10741 + }, + { + "epoch": 6.921837228041902, + "grad_norm": 1.5382840764910848, + "learning_rate": 9.540847131491452e-05, + "loss": 2.4761, + "step": 10742 + }, + { + "epoch": 6.922481869460113, + "grad_norm": 1.5269302409276033, + "learning_rate": 9.54076275154927e-05, + "loss": 2.4174, + "step": 10743 + }, + { + "epoch": 6.923126510878324, + "grad_norm": 1.5377455307588699, + "learning_rate": 9.540678364231571e-05, + "loss": 2.7476, + "step": 10744 + }, + { + "epoch": 6.923771152296535, + "grad_norm": 1.6589850388342218, + "learning_rate": 9.540593969538497e-05, + "loss": 2.6639, + "step": 10745 + }, + { + "epoch": 6.924415793714746, + "grad_norm": 1.5319417009981835, + "learning_rate": 9.540509567470184e-05, + "loss": 2.7395, + "step": 10746 + }, + { + "epoch": 6.925060435132957, + "grad_norm": 1.704941101331392, + "learning_rate": 9.540425158026774e-05, + "loss": 2.6394, + "step": 10747 + }, + { + "epoch": 6.925705076551169, + "grad_norm": 1.6282788455875241, + "learning_rate": 9.540340741208402e-05, + "loss": 2.7011, + "step": 10748 + }, + { + "epoch": 6.926349717969379, + "grad_norm": 1.5482172477762102, + "learning_rate": 9.540256317015209e-05, + "loss": 2.6303, + "step": 10749 + }, + { + "epoch": 6.9269943593875904, + "grad_norm": 1.6911287778465693, + "learning_rate": 9.540171885447335e-05, + "loss": 2.7086, + "step": 10750 + }, + { + "epoch": 6.927639000805802, + "grad_norm": 1.6181808531732953, + "learning_rate": 9.540087446504915e-05, + "loss": 2.6507, + "step": 10751 + }, + { + "epoch": 6.928283642224013, + "grad_norm": 1.7219305455602252, + "learning_rate": 9.540003000188089e-05, + "loss": 2.6687, + "step": 10752 + }, + { + "epoch": 6.928928283642224, + "grad_norm": 1.6177584857823137, + "learning_rate": 9.539918546496997e-05, + "loss": 2.695, + "step": 10753 + }, + { + "epoch": 6.929572925060435, + "grad_norm": 1.7148206384605997, + "learning_rate": 9.539834085431776e-05, + "loss": 2.7259, + "step": 10754 + }, + { + "epoch": 6.930217566478646, + "grad_norm": 1.5400237623252102, + "learning_rate": 9.539749616992568e-05, + "loss": 2.5839, + "step": 10755 + }, + { + "epoch": 6.930862207896857, + "grad_norm": 1.80474415733962, + "learning_rate": 9.539665141179508e-05, + "loss": 2.8586, + "step": 10756 + }, + { + "epoch": 6.931506849315069, + "grad_norm": 1.638284646929508, + "learning_rate": 9.539580657992735e-05, + "loss": 2.5192, + "step": 10757 + }, + { + "epoch": 6.93215149073328, + "grad_norm": 1.8173156035623317, + "learning_rate": 9.539496167432389e-05, + "loss": 2.5629, + "step": 10758 + }, + { + "epoch": 6.93279613215149, + "grad_norm": 1.7961242589501003, + "learning_rate": 9.53941166949861e-05, + "loss": 2.4219, + "step": 10759 + }, + { + "epoch": 6.933440773569702, + "grad_norm": 1.5194849666087658, + "learning_rate": 9.539327164191535e-05, + "loss": 2.6822, + "step": 10760 + }, + { + "epoch": 6.934085414987913, + "grad_norm": 4.897400621739971, + "learning_rate": 9.539242651511303e-05, + "loss": 2.6049, + "step": 10761 + }, + { + "epoch": 6.934730056406124, + "grad_norm": 1.7795271663851704, + "learning_rate": 9.539158131458052e-05, + "loss": 2.4841, + "step": 10762 + }, + { + "epoch": 6.935374697824336, + "grad_norm": 4.253264471545102, + "learning_rate": 9.539073604031922e-05, + "loss": 2.6687, + "step": 10763 + }, + { + "epoch": 6.936019339242546, + "grad_norm": 1.6284203213726036, + "learning_rate": 9.538989069233054e-05, + "loss": 2.7729, + "step": 10764 + }, + { + "epoch": 6.936663980660757, + "grad_norm": 3.082639540043831, + "learning_rate": 9.538904527061582e-05, + "loss": 2.5067, + "step": 10765 + }, + { + "epoch": 6.937308622078969, + "grad_norm": 1.8484486810744511, + "learning_rate": 9.538819977517647e-05, + "loss": 3.0062, + "step": 10766 + }, + { + "epoch": 6.93795326349718, + "grad_norm": 1.9839795769195934, + "learning_rate": 9.538735420601388e-05, + "loss": 2.6114, + "step": 10767 + }, + { + "epoch": 6.938597904915391, + "grad_norm": 2.0004068116935056, + "learning_rate": 9.538650856312946e-05, + "loss": 2.6816, + "step": 10768 + }, + { + "epoch": 6.939242546333602, + "grad_norm": 1.8483413347634547, + "learning_rate": 9.538566284652457e-05, + "loss": 2.3592, + "step": 10769 + }, + { + "epoch": 6.939887187751813, + "grad_norm": 2.1338638539675463, + "learning_rate": 9.53848170562006e-05, + "loss": 2.3082, + "step": 10770 + }, + { + "epoch": 6.940531829170024, + "grad_norm": 1.9724825976741163, + "learning_rate": 9.538397119215897e-05, + "loss": 2.5891, + "step": 10771 + }, + { + "epoch": 6.9411764705882355, + "grad_norm": 1.6880642561258739, + "learning_rate": 9.538312525440101e-05, + "loss": 2.6352, + "step": 10772 + }, + { + "epoch": 6.941821112006446, + "grad_norm": 2.3445515658186977, + "learning_rate": 9.538227924292817e-05, + "loss": 2.7805, + "step": 10773 + }, + { + "epoch": 6.942465753424657, + "grad_norm": 1.7152347941970854, + "learning_rate": 9.538143315774181e-05, + "loss": 2.6001, + "step": 10774 + }, + { + "epoch": 6.9431103948428685, + "grad_norm": 2.213924399923096, + "learning_rate": 9.538058699884332e-05, + "loss": 2.8451, + "step": 10775 + }, + { + "epoch": 6.94375503626108, + "grad_norm": 2.0315893735397124, + "learning_rate": 9.537974076623409e-05, + "loss": 2.6235, + "step": 10776 + }, + { + "epoch": 6.944399677679291, + "grad_norm": 1.978915317369968, + "learning_rate": 9.537889445991554e-05, + "loss": 2.8696, + "step": 10777 + }, + { + "epoch": 6.9450443190975015, + "grad_norm": 1.874761627448491, + "learning_rate": 9.537804807988901e-05, + "loss": 2.6192, + "step": 10778 + }, + { + "epoch": 6.945688960515713, + "grad_norm": 2.0552015377445683, + "learning_rate": 9.537720162615592e-05, + "loss": 2.6887, + "step": 10779 + }, + { + "epoch": 6.946333601933924, + "grad_norm": 2.0091623395885474, + "learning_rate": 9.537635509871765e-05, + "loss": 2.3925, + "step": 10780 + }, + { + "epoch": 6.946978243352135, + "grad_norm": 1.741191239569472, + "learning_rate": 9.537550849757559e-05, + "loss": 2.9059, + "step": 10781 + }, + { + "epoch": 6.947622884770347, + "grad_norm": 1.916177911404599, + "learning_rate": 9.537466182273117e-05, + "loss": 2.7186, + "step": 10782 + }, + { + "epoch": 6.948267526188557, + "grad_norm": 2.180744608508311, + "learning_rate": 9.53738150741857e-05, + "loss": 2.515, + "step": 10783 + }, + { + "epoch": 6.948912167606768, + "grad_norm": 1.749302748765521, + "learning_rate": 9.537296825194064e-05, + "loss": 2.5647, + "step": 10784 + }, + { + "epoch": 6.94955680902498, + "grad_norm": 2.057734022121657, + "learning_rate": 9.537212135599735e-05, + "loss": 2.9556, + "step": 10785 + }, + { + "epoch": 6.950201450443191, + "grad_norm": 1.6710163379825889, + "learning_rate": 9.537127438635723e-05, + "loss": 2.6979, + "step": 10786 + }, + { + "epoch": 6.950846091861402, + "grad_norm": 2.3120283261473373, + "learning_rate": 9.537042734302168e-05, + "loss": 2.4984, + "step": 10787 + }, + { + "epoch": 6.951490733279613, + "grad_norm": 1.6845964427235713, + "learning_rate": 9.536958022599206e-05, + "loss": 2.4665, + "step": 10788 + }, + { + "epoch": 6.952135374697824, + "grad_norm": 2.1905323576099107, + "learning_rate": 9.536873303526981e-05, + "loss": 2.5035, + "step": 10789 + }, + { + "epoch": 6.952780016116035, + "grad_norm": 1.4957661889077858, + "learning_rate": 9.536788577085626e-05, + "loss": 2.7008, + "step": 10790 + }, + { + "epoch": 6.953424657534247, + "grad_norm": 1.8546102357005099, + "learning_rate": 9.536703843275285e-05, + "loss": 2.6684, + "step": 10791 + }, + { + "epoch": 6.954069298952458, + "grad_norm": 1.5851706861613408, + "learning_rate": 9.536619102096097e-05, + "loss": 2.6926, + "step": 10792 + }, + { + "epoch": 6.954713940370668, + "grad_norm": 1.860498001057461, + "learning_rate": 9.536534353548198e-05, + "loss": 2.6353, + "step": 10793 + }, + { + "epoch": 6.95535858178888, + "grad_norm": 1.6438803534557895, + "learning_rate": 9.536449597631728e-05, + "loss": 2.7762, + "step": 10794 + }, + { + "epoch": 6.956003223207091, + "grad_norm": 1.655173486369097, + "learning_rate": 9.53636483434683e-05, + "loss": 2.5096, + "step": 10795 + }, + { + "epoch": 6.956647864625302, + "grad_norm": 1.9001771941693792, + "learning_rate": 9.536280063693639e-05, + "loss": 2.606, + "step": 10796 + }, + { + "epoch": 6.957292506043514, + "grad_norm": 1.8491812628109452, + "learning_rate": 9.536195285672295e-05, + "loss": 2.7331, + "step": 10797 + }, + { + "epoch": 6.957937147461724, + "grad_norm": 1.8243623483800853, + "learning_rate": 9.536110500282939e-05, + "loss": 2.6074, + "step": 10798 + }, + { + "epoch": 6.958581788879935, + "grad_norm": 1.5945330874779677, + "learning_rate": 9.536025707525709e-05, + "loss": 2.8997, + "step": 10799 + }, + { + "epoch": 6.959226430298147, + "grad_norm": 2.000241446963776, + "learning_rate": 9.535940907400744e-05, + "loss": 2.5966, + "step": 10800 + }, + { + "epoch": 6.959226430298147, + "eval_loss": 4.77731466293335, + "eval_runtime": 2.9705, + "eval_samples_per_second": 33.664, + "eval_steps_per_second": 4.376, + "step": 10800 + } + ], + "logging_steps": 1.0, + "max_steps": 77550, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2333305093816320.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}